This is page 3 of 3. Use http://codebase.md/omgwtfwow/mcp-crawl4ai-ts?page={x} to view the full context.

# Directory Structure

```
├── .env.example
├── .github
│   ├── CI.md
│   ├── copilot-instructions.md
│   └── workflows
│       └── ci.yml
├── .gitignore
├── .prettierignore
├── .prettierrc.json
├── CHANGELOG.md
├── eslint.config.mjs
├── jest.config.cjs
├── jest.setup.cjs
├── LICENSE
├── package-lock.json
├── package.json
├── README.md
├── src
│   ├── __tests__
│   │   ├── crawl.test.ts
│   │   ├── crawl4ai-service.network.test.ts
│   │   ├── crawl4ai-service.test.ts
│   │   ├── handlers
│   │   │   ├── crawl-handlers.test.ts
│   │   │   ├── parameter-combinations.test.ts
│   │   │   ├── screenshot-saving.test.ts
│   │   │   ├── session-handlers.test.ts
│   │   │   └── utility-handlers.test.ts
│   │   ├── index.cli.test.ts
│   │   ├── index.npx.test.ts
│   │   ├── index.server.test.ts
│   │   ├── index.test.ts
│   │   ├── integration
│   │   │   ├── batch-crawl.integration.test.ts
│   │   │   ├── capture-screenshot.integration.test.ts
│   │   │   ├── crawl-advanced.integration.test.ts
│   │   │   ├── crawl-handlers.integration.test.ts
│   │   │   ├── crawl-recursive.integration.test.ts
│   │   │   ├── crawl.integration.test.ts
│   │   │   ├── execute-js.integration.test.ts
│   │   │   ├── extract-links.integration.test.ts
│   │   │   ├── extract-with-llm.integration.test.ts
│   │   │   ├── generate-pdf.integration.test.ts
│   │   │   ├── get-html.integration.test.ts
│   │   │   ├── get-markdown.integration.test.ts
│   │   │   ├── parse-sitemap.integration.test.ts
│   │   │   ├── session-management.integration.test.ts
│   │   │   ├── smart-crawl.integration.test.ts
│   │   │   └── test-utils.ts
│   │   ├── request-handler.test.ts
│   │   ├── schemas
│   │   │   └── validation-edge-cases.test.ts
│   │   ├── types
│   │   │   └── mocks.ts
│   │   └── utils
│   │       └── javascript-validation.test.ts
│   ├── crawl4ai-service.ts
│   ├── handlers
│   │   ├── base-handler.ts
│   │   ├── content-handlers.ts
│   │   ├── crawl-handlers.ts
│   │   ├── session-handlers.ts
│   │   └── utility-handlers.ts
│   ├── index.ts
│   ├── schemas
│   │   ├── helpers.ts
│   │   └── validation-schemas.ts
│   ├── server.ts
│   └── types.ts
├── tsconfig.build.json
└── tsconfig.json
```

# Files

--------------------------------------------------------------------------------
/src/__tests__/integration/crawl.integration.test.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import {
  createTestClient,
  cleanupTestClient,
  generateSessionId,
  expectSuccessfulCrawl,
  expectScreenshot,
  delay,
  TEST_TIMEOUTS,
} from './test-utils.js';

interface ToolResult {
  content: Array<{
    type: string;
    text?: string;
    data?: string;
    mimeType?: string;
  }>;
}

describe('crawl Integration Tests', () => {
  let client: Client;

  beforeAll(async () => {
    client = await createTestClient();
  }, TEST_TIMEOUTS.medium);

  afterAll(async () => {
    if (client) {
      await cleanupTestClient(client);
    }
  });

  describe('Basic Crawling', () => {
    it(
      'should crawl a simple page with basic configuration',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/html',
            cache_mode: 'BYPASS',
            word_count_threshold: 50,
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle invalid URL gracefully',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'not-a-valid-url',
            cache_mode: 'BYPASS',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');
        expect(content[0].text).toContain('Error');
        // Our Zod validation catches this before it reaches the API
        expect(content[0].text).toContain('Invalid parameters for crawl');
        expect(content[0].text).toContain('Invalid url');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle non-existent domain gracefully',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://this-domain-definitely-does-not-exist-12345.com',
            cache_mode: 'BYPASS',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');
        expect(content[0].text).toContain('Error');
        // Could be DNS error, connection error, or "Internal Server Error"
        expect(content[0].text).toMatch(/Failed to crawl|Internal Server Error|DNS|connection/i);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle browser configuration',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/user-agent',
            viewport_width: 1920,
            viewport_height: 1080,
            user_agent: 'MCP Integration Test Bot',
            cache_mode: 'DISABLED',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.short,
    );
  });

  describe('Dynamic Content Tests', () => {
    it(
      'should execute JavaScript on page',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/html',
            js_code: ['return document.querySelectorAll("a").length', 'return document.title'],
            wait_after_js: 1000,
            cache_mode: 'BYPASS',
            word_count_threshold: 10,
          },
        });

        await expectSuccessfulCrawl(result);
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent?.text).toBeTruthy();
        // httpbin.org/html contains links and a title
        expect(textContent?.text?.toLowerCase()).toMatch(/herman|melville|moby/); // Content from the page
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should wait for specific elements',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/delay/2',
            wait_for: 'body',
            wait_for_timeout: 5000,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle virtual scrolling for infinite feeds',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://github.com/trending',
            virtual_scroll_config: {
              container_selector: '.Box-row',
              scroll_count: 3,
              scroll_by: 'container_height',
              wait_after_scroll: 1.0,
            },
            cache_mode: 'BYPASS',
            wait_for: '.Box-row',
            word_count_threshold: 50,
          },
        });

        await expectSuccessfulCrawl(result);
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        // Should have captured multiple trending repos after scrolling
        expect(textContent?.text).toBeTruthy();
        expect(textContent?.text?.length).toBeGreaterThan(1000);
      },
      TEST_TIMEOUTS.long,
    );
  });

  describe('Session Management Tests', () => {
    it(
      'should create and use a session',
      async () => {
        const sessionId = generateSessionId();

        // First crawl with session
        const result1 = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://github.com',
            session_id: sessionId,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result1);

        // Second crawl reusing session
        const result2 = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://github.com/features',
            session_id: sessionId,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result2);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle cookies in session',
      async () => {
        const sessionId = generateSessionId();
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://github.com',
            session_id: sessionId,
            cookies: [
              {
                name: 'test_cookie',
                value: 'test_value',
                domain: '.github.com',
                path: '/',
              },
            ],
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.medium,
    );
  });

  describe('Content Extraction Tests', () => {
    it.skip(
      'should extract content using CSS selectors - SKIPPED: Not supported via REST API',
      async () => {
        // CSS extraction is not supported via the REST API due to Python class serialization limitations
        // This test is kept for documentation purposes but skipped
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://www.google.com',
            extraction_type: 'css',
            css_selectors: {
              title: 'title',
              search_button: 'input[type="submit"]',
              logo: 'img[alt*="Google"]',
            },
            cache_mode: 'BYPASS',
            word_count_threshold: 10,
          },
        });

        await expectSuccessfulCrawl(result);
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent?.text).toBeTruthy();
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should extract content using LLM via extract_with_llm tool',
      async () => {
        // Note: This test requires the Crawl4AI server to have an LLM provider configured
        try {
          const result = await client.callTool({
            name: 'extract_with_llm',
            arguments: {
              url: 'https://httpbin.org/html',
              query: 'Extract the main page title and any author names mentioned',
            },
          });

          expect(result).toBeTruthy();
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();

          // The response should be JSON with an "answer" field
          try {
            const parsed = JSON.parse(textContent?.text || '{}');
            expect(parsed).toHaveProperty('answer');
            expect(typeof parsed.answer).toBe('string');
            expect(parsed.answer.length).toBeGreaterThan(0);
          } catch {
            // If parsing fails, at least check we got text
            expect(textContent?.text?.length || 0).toBeGreaterThan(0);
          }
        } catch (error) {
          // If the server doesn't have LLM configured, it will return an error
          if (error instanceof Error && error.message?.includes('No LLM provider configured')) {
            console.log('⚠️ LLM extraction test skipped: Server needs LLM provider configured');
            return;
          }
          throw error;
        }
      },
      TEST_TIMEOUTS.long,
    );
  });

  describe('Media Handling Tests', () => {
    it(
      'should capture screenshots',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/html',
            screenshot: true,
            screenshot_wait_for: 1.0,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
        await expectScreenshot(result);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should generate PDF',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/html',
            pdf: true,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
        // PDF generation should return some content
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent?.text).toBeTruthy();
        // Should contain some content from the page
        expect(textContent?.text?.toLowerCase()).toContain('herman');
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle image filtering',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://github.com',
            exclude_external_images: true,
            image_description_min_word_threshold: 20,
            image_score_threshold: 5,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.medium,
    );
  });

  describe('Performance & Caching Tests', () => {
    it(
      'should respect cache modes',
      async () => {
        const url = 'https://httpbin.org/html'; // Use a simpler page for cache testing

        // First request - populate cache with ENABLED mode
        const result1 = await client.callTool({
          name: 'crawl',
          arguments: {
            url,
            cache_mode: 'ENABLED',
            word_count_threshold: 10,
          },
        });

        await expectSuccessfulCrawl(result1);
        const content1 = (result1 as ToolResult).content.find((c) => c.type === 'text')?.text;

        // Wait a bit to ensure cache is saved
        await delay(500);

        // Second request - should use cache (ENABLED mode)
        const startTime = Date.now();
        const result2 = await client.callTool({
          name: 'crawl',
          arguments: {
            url,
            cache_mode: 'ENABLED',
            word_count_threshold: 10,
          },
        });
        const cacheTime = Date.now() - startTime;

        await expectSuccessfulCrawl(result2);
        const content2 = (result2 as ToolResult).content.find((c) => c.type === 'text')?.text;

        // Content should be identical if cache was used
        expect(content2).toBe(content1);

        // Third request - bypass cache
        const bypassStartTime = Date.now();
        const result3 = await client.callTool({
          name: 'crawl',
          arguments: {
            url,
            cache_mode: 'BYPASS',
            word_count_threshold: 10,
          },
        });
        const bypassTime = Date.now() - bypassStartTime;

        await expectSuccessfulCrawl(result3);

        // Cache hit should typically be faster, but we'll make this test more lenient
        // Just verify all requests succeeded
        expect(cacheTime).toBeGreaterThan(0);
        expect(bypassTime).toBeGreaterThan(0);

        // Fourth request - DISABLED mode should not use cache
        const result4 = await client.callTool({
          name: 'crawl',
          arguments: {
            url,
            cache_mode: 'DISABLED',
            word_count_threshold: 10,
          },
        });

        await expectSuccessfulCrawl(result4);
      },
      TEST_TIMEOUTS.long,
    );

    it(
      'should handle timeout configuration',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/delay/1',
            timeout: 20000,
            page_timeout: 15000,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.short,
    );
  });

  describe('Content Filtering Tests', () => {
    it(
      'should filter content by tags',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/html', // Simpler page for testing
            excluded_tags: ['script', 'style', 'nav', 'footer', 'header'],
            word_count_threshold: 10,
            cache_mode: 'BYPASS',
            only_text: true, // Force text-only output
            remove_overlay_elements: true,
          },
        });

        await expectSuccessfulCrawl(result);
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent?.text).toBeTruthy();
        // Just verify we got content back - the server's filtering behavior may vary
        // With all the filters applied, content might be minimal
        expect(textContent?.text?.length).toBeGreaterThan(10);
        // Should contain some text from the page
        expect(textContent?.text).toBeTruthy();
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should filter content by selectors',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://github.com',
            excluded_selector: '.footer, .header-nav, [aria-label="Advertisement"]',
            remove_overlay_elements: true,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle link filtering',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://github.com',
            exclude_external_links: true,
            exclude_social_media_links: true,
            exclude_domains: ['twitter.com', 'facebook.com', 'linkedin.com'],
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        // Should not contain social media domains
        expect(textContent?.text).not.toMatch(/twitter\.com|facebook\.com/);
      },
      TEST_TIMEOUTS.medium,
    );
  });

  describe('Bot Detection Avoidance Tests', () => {
    it(
      'should simulate user behavior',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://github.com',
            simulate_user: true,
            override_navigator: true,
            magic: true,
            delay_before_scroll: 1000,
            scroll_delay: 500,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should use custom headers and user agent',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/headers',
            user_agent: 'Mozilla/5.0 (compatible; MCP Test Bot)',
            headers: {
              'Accept-Language': 'en-US,en;q=0.9',
              'Accept-Encoding': 'gzip, deflate, br',
              'X-Custom-Header': 'MCP-Test',
            },
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        // httpbin returns headers in response
        expect(textContent?.text).toContain('MCP Test Bot');
        expect(textContent?.text).toContain('X-Custom-Header');
      },
      TEST_TIMEOUTS.medium,
    );
  });

  describe('Error Handling Tests', () => {
    it(
      'should handle invalid URLs gracefully',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'not-a-valid-url',
            cache_mode: 'BYPASS',
          },
        });

        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent?.text).toContain('Error');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle non-existent domains',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://this-domain-definitely-does-not-exist-123456789.com',
            cache_mode: 'BYPASS',
          },
        });

        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent?.text?.toLowerCase()).toMatch(/error|failed/);
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle JavaScript errors gracefully',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/html',
            js_code: 'throw new Error("Test error")',
            cache_mode: 'BYPASS',
          },
        });

        // Should still return content even if JS fails
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
      },
      TEST_TIMEOUTS.short,
    );
  });

  describe('Advanced Configurations', () => {
    it(
      'should handle complex multi-feature crawl',
      async () => {
        const sessionId = generateSessionId();
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/html',
            // Browser config
            viewport_width: 1920,
            viewport_height: 1080,
            user_agent: 'MCP Advanced Test Bot',
            // Session
            session_id: sessionId,
            // JavaScript
            js_code: 'return document.querySelectorAll("h1").length',
            wait_after_js: 1000,
            // Content filtering
            excluded_tags: ['script', 'style'],
            word_count_threshold: 50,
            remove_overlay_elements: true,
            // Media
            screenshot: true,
            screenshot_wait_for: 1.0,
            // Performance
            cache_mode: 'DISABLED',
            timeout: 60000,
            // Bot detection
            simulate_user: true,
            override_navigator: true,
          },
        });

        await expectSuccessfulCrawl(result);
        // Screenshot might not always be returned in complex multi-feature crawls
        // especially with httpbin.org which is a simple HTML page
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
      },
      TEST_TIMEOUTS.long,
    );

    it(
      'should handle proxy configuration',
      async () => {
        // Test that proxy configuration is accepted, even without a real proxy
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/ip',
            proxy_server: 'http://example-proxy.com:8080',
            proxy_username: 'testuser',
            proxy_password: 'testpass',
            cache_mode: 'BYPASS',
            word_count_threshold: 10,
          },
        });

        // The request should complete (even if proxy doesn't exist, the config should be accepted)
        expect(result).toBeDefined();
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should process iframes',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://www.w3schools.com/html/html_iframe.asp',
            process_iframes: true,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.medium,
    );
  });

  describe('Browser Configuration Tests', () => {
    describe('Cookie handling', () => {
      it(
        'should set and send cookies correctly',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/cookies',
              cookies: [
                {
                  name: 'test_cookie',
                  value: 'test_value',
                  domain: '.httpbin.org',
                  path: '/',
                },
              ],
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // httpbin returns cookies as JSON in the response
          expect(textContent?.text).toContain('test_cookie');
          expect(textContent?.text).toContain('test_value');
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should handle multiple cookies',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/cookies',
              cookies: [
                {
                  name: 'session_id',
                  value: 'abc123',
                  domain: '.httpbin.org',
                  path: '/',
                },
                {
                  name: 'user_pref',
                  value: 'dark_mode',
                  domain: '.httpbin.org',
                  path: '/',
                },
              ],
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // Verify both cookies are present
          expect(textContent?.text).toContain('session_id');
          expect(textContent?.text).toContain('abc123');
          expect(textContent?.text).toContain('user_pref');
          expect(textContent?.text).toContain('dark_mode');
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('Custom headers', () => {
      it(
        'should send custom headers',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/headers',
              headers: {
                'X-Custom-Header': 'test-value',
                'X-Request-ID': '12345',
                'Accept-Language': 'en-US,en;q=0.9',
              },
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // httpbin returns headers in the response
          expect(textContent?.text).toContain('X-Custom-Header');
          expect(textContent?.text).toContain('test-value');
          // Note: Some headers may be filtered by the browser
          // Just verify our custom header got through
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('User-Agent configuration', () => {
      it(
        'should set custom user agent',
        async () => {
          const customUserAgent = 'MCP-Crawl4AI-Test/1.0 (Integration Tests)';
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/user-agent',
              user_agent: customUserAgent,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // httpbin returns the user-agent in the response
          expect(textContent?.text).toContain(customUserAgent);
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('Viewport sizes and screenshots', () => {
      it(
        'should capture screenshot at mobile size (375x667)',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              viewport_width: 375,
              viewport_height: 667,
              screenshot: true,
              screenshot_wait_for: 1,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          await expectScreenshot(result);

          // Check screenshot was captured
          const imageContent = (result as ToolResult).content.find((c) => c.type === 'image');
          expect(imageContent).toBeDefined();
          expect(imageContent?.data).toBeTruthy();

          // Verify reasonable data size for mobile screenshot
          const dataLength = imageContent?.data?.length || 0;
          expect(dataLength).toBeGreaterThan(10000); // At least 10KB
          expect(dataLength).toBeLessThan(3000000); // Less than 3MB for mobile (base64 encoded)
        },
        TEST_TIMEOUTS.medium,
      );

      it(
        'should capture screenshot at tablet size (768x1024)',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              viewport_width: 768,
              viewport_height: 1024,
              screenshot: true,
              screenshot_wait_for: 1,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          await expectScreenshot(result);

          // Check screenshot was captured
          const imageContent = (result as ToolResult).content.find((c) => c.type === 'image');
          expect(imageContent).toBeDefined();
          expect(imageContent?.data).toBeTruthy();

          // Verify reasonable data size for tablet screenshot
          const dataLength = imageContent?.data?.length || 0;
          expect(dataLength).toBeGreaterThan(15000); // At least 15KB
          expect(dataLength).toBeLessThan(3000000); // Less than 3MB for tablet (base64 encoded)
        },
        TEST_TIMEOUTS.medium,
      );

      it(
        'should capture screenshot at HD size (1280x720)',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              viewport_width: 1280,
              viewport_height: 720,
              screenshot: true,
              screenshot_wait_for: 1,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          await expectScreenshot(result);

          // Check screenshot was captured
          const imageContent = (result as ToolResult).content.find((c) => c.type === 'image');
          expect(imageContent).toBeDefined();
          expect(imageContent?.data).toBeTruthy();

          // Verify reasonable data size for HD screenshot
          const dataLength = imageContent?.data?.length || 0;
          expect(dataLength).toBeGreaterThan(20000); // At least 20KB
          expect(dataLength).toBeLessThan(3000000); // Less than 3MB for HD (base64 encoded)
        },
        TEST_TIMEOUTS.medium,
      );

      it(
        'should fail gracefully for very large viewport (1920x1080)',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              viewport_width: 1920,
              viewport_height: 1080,
              screenshot: true,
              screenshot_wait_for: 1,
              cache_mode: 'BYPASS',
            },
          });

          // This should either timeout or return an error based on testing
          // We expect either an error or no screenshot data
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          const imageContent = (result as ToolResult).content.find((c) => c.type === 'image');

          // If we got text but no image, that's expected for large viewports
          if (textContent && !imageContent) {
            expect(textContent).toBeDefined();
          } else if (textContent?.text?.includes('Error') || textContent?.text?.includes('timeout')) {
            // Expected error for large viewport
            expect(textContent.text).toMatch(/Error|timeout/i);
          }
        },
        TEST_TIMEOUTS.long,
      );
    });

    describe('Combined browser configurations', () => {
      it(
        'should handle cookies, headers, and custom viewport together',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/anything',
              viewport_width: 768,
              viewport_height: 1024,
              user_agent: 'MCP-Test-Bot/2.0',
              cookies: [
                {
                  name: 'auth_token',
                  value: 'secret123',
                  domain: '.httpbin.org',
                  path: '/',
                },
              ],
              headers: {
                'X-Test-Header': 'combined-test',
              },
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // httpbin/anything endpoint returns all request data
          // Verify all configurations were applied
          expect(textContent?.text).toContain('MCP-Test-Bot/2.0');
          expect(textContent?.text).toContain('auth_token');
          expect(textContent?.text).toContain('X-Test-Header');
          expect(textContent?.text).toContain('combined-test');
        },
        TEST_TIMEOUTS.medium,
      );
    });
  });

  describe('Crawler Configuration Advanced Tests', () => {
    describe('Content filtering parameters', () => {
      it(
        'should remove forms when remove_forms is true',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/forms/post',
              remove_forms: true,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // Forms should be removed, so no form-related text should appear
          expect(textContent?.text).not.toContain('<form');
          expect(textContent?.text).not.toContain('type="submit"');
          expect(textContent?.text).not.toContain('input type=');
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should keep forms when remove_forms is false',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/forms/post',
              remove_forms: false,
              cache_mode: 'BYPASS',
              word_count_threshold: 10,
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // Forms should be present - check for form-related keywords
          const text = textContent?.text?.toLowerCase() || '';
          // httpbin forms page should have form elements
          expect(text.length).toBeGreaterThan(100);
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should preserve data attributes when keep_data_attributes is true',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://getbootstrap.com/docs/4.0/components/alerts/',
              keep_data_attributes: true,
              cache_mode: 'BYPASS',
              word_count_threshold: 10,
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // Should contain alert content
          expect(textContent?.text).toContain('alert');
        },
        TEST_TIMEOUTS.medium,
      );
    });

    describe('JavaScript execution parameters', () => {
      it(
        'should return only JS results when js_only is true',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              js_code: ['return document.title', 'return document.querySelectorAll("p").length'],
              js_only: true,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // Should contain JS execution results but not the full HTML content
          // The result should be much shorter than full page content
          expect(textContent?.text?.length).toBeLessThan(1000);
          // Should not contain the full Moby Dick text from the page
          expect(textContent?.text).not.toContain('Herman Melville');
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should handle js_only with session_id',
        async () => {
          const sessionId = generateSessionId();
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              session_id: sessionId,
              js_code: 'return window.location.href',
              js_only: true,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('Page visibility parameters', () => {
      it(
        'should extract content when body is hidden and ignore_body_visibility is true',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              js_code: 'document.body.style.visibility = "hidden"; return "body hidden"',
              ignore_body_visibility: true,
              cache_mode: 'BYPASS',
              word_count_threshold: 10,
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // Should still extract content despite hidden body
          expect(textContent?.text).toContain('Herman Melville');
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should respect body visibility when ignore_body_visibility is false',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              js_code: 'document.body.style.visibility = "hidden"; return "body hidden"',
              ignore_body_visibility: false,
              cache_mode: 'BYPASS',
              word_count_threshold: 10,
            },
          });

          await expectSuccessfulCrawl(result);
          // Content extraction behavior may vary when body is hidden
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('Debug and logging parameters', () => {
      it(
        'should capture console logs when log_console is true',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              js_code: [
                'console.log("Test log message 1")',
                'console.warn("Test warning")',
                'console.error("Test error")',
                'return "logs executed"',
              ],
              log_console: true,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          // Note: Console logs may be captured in a separate field or in verbose output
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should provide verbose output when verbose is true',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              verbose: true,
              cache_mode: 'BYPASS',
              word_count_threshold: 50,
            },
          });

          await expectSuccessfulCrawl(result);
          // Verbose output may include additional debugging information
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('Media filtering parameters', () => {
      it(
        'should exclude external images when exclude_external_images is true',
        async () => {
          // First, let's create a page with external images via JS
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              js_code: `
                const img1 = document.createElement('img');
                img1.src = 'https://httpbin.org/image/png';
                img1.alt = 'External PNG';
                document.body.appendChild(img1);
                const img2 = document.createElement('img');
                img2.src = '/local-image.png';
                img2.alt = 'Local image';
                document.body.appendChild(img2);
                return document.images.length;
              `,
              exclude_external_images: true,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // The external image references should be filtered out
        },
        TEST_TIMEOUTS.medium,
      );

      it(
        'should include external images when exclude_external_images is false',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              exclude_external_images: false,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('Combined crawler configuration tests', () => {
      it(
        'should handle multiple filtering options together',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/forms/post',
              remove_forms: true,
              exclude_external_links: true,
              exclude_external_images: true,
              only_text: true,
              word_count_threshold: 10,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // Should have filtered content
          expect(textContent?.text).not.toContain('<form');
          expect(textContent?.text).not.toContain('type="submit"');
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should handle debug options with content extraction',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              verbose: true,
              log_console: true,
              js_code: 'console.log("Debug test"); return document.title',
              keep_data_attributes: true,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('New crawler parameters (0.7.3/0.7.4)', () => {
      it(
        'should accept undetected browser type',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/user-agent',
              browser_type: 'undetected',
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // The undetected browser should mask automation indicators
          // but we can at least verify the request was accepted
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should filter content using css_selector',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://example.com',
              css_selector: 'h1',
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // css_selector returns ONLY the selected element content
          expect(textContent?.text?.toLowerCase()).toContain('example domain');
          // Should NOT contain the paragraph text that's outside the h1
          expect(textContent?.text).not.toContain('use in illustrative examples');
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should include links when include_links is true',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://example.com',
              include_links: true,
            },
          });

          await expectSuccessfulCrawl(result);
          // Check if links section is included
          const hasLinksInfo = (result as ToolResult).content.some(
            (item) => item.type === 'text' && item.text?.includes('Links:'),
          );
          expect(hasLinksInfo).toBe(true);
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should respect delay_before_return_html parameter',
        async () => {
          const startTime = Date.now();
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/delay/1', // 1 second delay from server
              delay_before_return_html: 2, // Additional 2 second delay (in seconds, not ms)
              cache_mode: 'BYPASS',
            },
          });
          const elapsed = Date.now() - startTime;

          await expectSuccessfulCrawl(result);
          // Total time should be at least 3 seconds (1s from endpoint + 2s delay)
          expect(elapsed).toBeGreaterThanOrEqual(3000);
        },
        TEST_TIMEOUTS.medium,
      );

      it(
        'should convert relative URLs when resolve_absolute_urls is true',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://example.com',
              resolve_absolute_urls: true,
              include_links: true,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          // Links should be in a separate content item
          const linksContent = (result as ToolResult).content.find(
            (c) => c.type === 'text' && c.text?.includes('Links:'),
          );
          // With include_links=true, links info should be present
          expect(linksContent).toBeDefined();
          expect(linksContent?.text).toContain('External: 1');
        },
        TEST_TIMEOUTS.short,
      );
    });
  });
});
```

--------------------------------------------------------------------------------
/src/__tests__/index.server.test.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import { jest } from '@jest/globals';
import { describe, it, expect, beforeEach } from '@jest/globals';

// Create mock functions
const mockGetMarkdown = jest.fn();
const mockCaptureScreenshot = jest.fn();
const mockGeneratePDF = jest.fn();
const mockExecuteJS = jest.fn();
const mockGetHTML = jest.fn();
const mockBatchCrawl = jest.fn();
const mockExtractWithLLM = jest.fn();
const mockCrawl = jest.fn();
const mockParseSitemap = jest.fn();

// Mock the Crawl4AIService module
jest.unstable_mockModule('../crawl4ai-service.js', () => ({
  Crawl4AIService: jest.fn().mockImplementation(() => ({
    getMarkdown: mockGetMarkdown,
    captureScreenshot: mockCaptureScreenshot,
    generatePDF: mockGeneratePDF,
    executeJS: mockExecuteJS,
    getHTML: mockGetHTML,
    batchCrawl: mockBatchCrawl,
    extractWithLLM: mockExtractWithLLM,
    crawl: mockCrawl,
    parseSitemap: mockParseSitemap,
  })),
}));

// Mock MCP SDK
const mockSetRequestHandler = jest.fn();
const mockTool = jest.fn();
const mockConnect = jest.fn();

jest.unstable_mockModule('@modelcontextprotocol/sdk/server/index.js', () => ({
  Server: jest.fn().mockImplementation(() => ({
    setRequestHandler: mockSetRequestHandler,
    tool: mockTool,
    connect: mockConnect,
  })),
}));

// Mock the types module that exports the schemas
const CallToolRequestSchema = { method: 'tools/call' };
const ListToolsRequestSchema = { method: 'tools/list' };

jest.unstable_mockModule('@modelcontextprotocol/sdk/types.js', () => ({
  CallToolRequestSchema,
  ListToolsRequestSchema,
}));

jest.unstable_mockModule('@modelcontextprotocol/sdk/server/stdio.js', () => ({
  StdioServerTransport: jest.fn(),
}));

// Mock axios
const mockPost = jest.fn();
const mockGet = jest.fn();
const mockHead = jest.fn();

jest.unstable_mockModule('axios', () => ({
  default: {
    create: jest.fn(() => ({
      post: mockPost,
      get: mockGet,
      head: mockHead,
    })),
    get: mockGet,
  },
}));

// Now dynamically import the modules after mocks are set up
const { Crawl4AIServer } = await import('../server.js');
const {
  GetMarkdownSchema,
  CrawlSchema,
  BatchCrawlSchema,
  CaptureScreenshotSchema: _CaptureScreenshotSchema,
  GeneratePdfSchema: _GeneratePdfSchema,
  ExecuteJsSchema: _ExecuteJsSchema,
  ExtractWithLlmSchema: _ExtractWithLlmSchema,
  SmartCrawlSchema: _SmartCrawlSchema,
  CrawlRecursiveSchema: _CrawlRecursiveSchema,
= await import('../schemas/validation-schemas.js'); const { Crawl4AIService } = await import('../crawl4ai-service.js'); // Import types statically (these are removed at compile time) import type { MarkdownEndpointResponse, ScreenshotEndpointResponse, PDFEndpointResponse, HTMLEndpointResponse, CrawlEndpointResponse, } from '../types.js'; // Define types for test results interface ContentItem { type: string; text?: string; data?: string; resource?: { uri: string; mimeType: string; blob: string; }; } interface ToolResult { content: ContentItem[]; } type RequestHandler = (request: { method: string; params: unknown }) => Promise<ToolResult>; // Removed TestServerMethods interface - no longer needed since we use 'any' type describe('Crawl4AIServer Tool Handlers', () => { let server: any; // eslint-disable-line @typescript-eslint/no-explicit-any let requestHandler: RequestHandler; beforeEach(async () => { jest.clearAllMocks(); // Reset all mock functions mockGetMarkdown.mockReset(); mockCaptureScreenshot.mockReset(); mockGeneratePDF.mockReset(); mockExecuteJS.mockReset(); mockGetHTML.mockReset(); mockBatchCrawl.mockReset(); mockExtractWithLLM.mockReset(); mockCrawl.mockReset(); mockParseSitemap.mockReset(); mockPost.mockReset(); mockGet.mockReset(); mockHead.mockReset(); // Create server instance - the mock will be used automatically server = new Crawl4AIServer( process.env.CRAWL4AI_BASE_URL || 'http://test.example.com', process.env.CRAWL4AI_API_KEY || 'test-api-key', 'test-server', '1.0.0', ); // Start the server to register handlers await server.start(); // Get the request handler for CallToolRequestSchema const handlerCalls = mockSetRequestHandler.mock.calls; // Find the handler for CallToolRequestSchema (tools/call) for (const call of handlerCalls) { const [schema, handler] = call; if (schema && schema.method === 'tools/call') { requestHandler = handler; break; } } // Debug: Check if we found the handler if (!requestHandler) { console.log('Handler calls:', handlerCalls.length); handlerCalls.forEach((call, i) => { console.log(`Call ${i}:`, call[0], typeof call[1]); }); } }); // Add a simple test to verify mocking works it('should use the mocked service', () => { const MockedService = Crawl4AIService as jest.MockedClass<typeof Crawl4AIService>; expect(MockedService).toHaveBeenCalledTimes(1); expect(MockedService).toHaveBeenCalledWith('http://localhost:11235', 'test-api-key'); }); describe('Constructor and setup', () => { it('should initialize with correct configuration', () => { expect(server).toBeDefined(); expect(server.service).toBeDefined(); expect(server.sessions).toBeDefined(); }); it('should set up handlers on construction', () => { expect(mockSetRequestHandler).toHaveBeenCalled(); expect(mockSetRequestHandler.mock.calls.length).toBeGreaterThan(0); }); }); describe('Tool Handler Success Cases', () => { describe('get_markdown', () => { it('should handle successful markdown extraction', async () => { const mockResponse: MarkdownEndpointResponse = { url: 'https://example.com', filter: 'fit', query: null, cache: 'false', markdown: '# Example Page\n\nThis is example content.', success: true, }; mockGetMarkdown.mockResolvedValue(mockResponse); const result: ToolResult = await server.getMarkdown({ url: 'https://example.com', }); expect(result.content).toHaveLength(1); expect(result.content[0].type).toBe('text'); expect(result.content[0].text).toContain('# Example Page'); expect(result.content[0].text).toContain('URL: https://example.com'); expect(result.content[0].text).toContain('Filter: 
fit'); }); it('should handle markdown with query', async () => { const mockResponse: MarkdownEndpointResponse = { url: 'https://example.com', filter: 'bm25', query: 'test query', cache: 'false', markdown: 'Filtered content', success: true, }; mockGetMarkdown.mockResolvedValue(mockResponse); const result: ToolResult = await server.getMarkdown({ url: 'https://example.com', filter: 'bm25', query: 'test query', }); expect(mockGetMarkdown).toHaveBeenCalledWith({ url: 'https://example.com', f: 'bm25', q: 'test query', }); expect(result.content[0].text).toContain('Query: test query'); }); }); describe('capture_screenshot', () => { it('should handle successful screenshot capture', async () => { const mockResponse: ScreenshotEndpointResponse = { success: true, screenshot: 'base64-encoded-screenshot-data', }; mockCaptureScreenshot.mockResolvedValue(mockResponse); const result: ToolResult = await server.captureScreenshot({ url: 'https://example.com', }); expect(result.content).toHaveLength(2); expect(result.content[0].type).toBe('image'); expect(result.content[0].data).toBe('base64-encoded-screenshot-data'); expect(result.content[1].type).toBe('text'); expect(result.content[1].text).toBe('Screenshot captured for: https://example.com'); }); }); describe('generate_pdf', () => { it('should handle successful PDF generation', async () => { const mockResponse: PDFEndpointResponse = { success: true, pdf: 'base64-encoded-pdf-data', }; mockGeneratePDF.mockResolvedValue(mockResponse); const result: ToolResult = await server.generatePDF({ url: 'https://example.com', }); expect(result.content).toHaveLength(2); expect(result.content[0].type).toBe('resource'); expect(result.content[0].resource.blob).toBeDefined(); expect(result.content[1].type).toBe('text'); expect(result.content[1].text).toContain('PDF generated for: https://example.com'); }); }); describe('execute_js', () => { it('should handle successful JS execution', async () => { const mockResponse = { markdown: 'Page content', js_execution_result: { success: true, results: ['Title: Example', 'Link count: 5'], }, }; mockExecuteJS.mockResolvedValue(mockResponse); const result: ToolResult = await server.executeJS({ url: 'https://example.com', scripts: ['return document.title', 'return document.links.length'], }); expect(result.content).toHaveLength(1); expect(result.content[0].type).toBe('text'); expect(result.content[0].text).toContain('JavaScript executed on: https://example.com'); expect(result.content[0].text).toContain('Title: Example'); expect(result.content[0].text).toContain('Link count: 5'); }); it('should handle JS execution without results', async () => { const mockResponse = { markdown: 'Page content', js_execution_result: null, }; mockExecuteJS.mockResolvedValue(mockResponse); const result: ToolResult = await server.executeJS({ url: 'https://example.com', scripts: 'console.log("test")', }); expect(result.content[0].text).toContain('JavaScript executed on: https://example.com'); expect(result.content[0].text).toContain('No results returned'); }); it('should handle JS execution with error status', async () => { const mockResponse = { markdown: 'Page content', js_execution_result: { success: true, results: [ { success: false, error: 'Error: Test error', stack: 'Error: Test error\n at eval (eval at evaluate (:291:30), <anonymous>:4:43)', }, ], }, }; mockExecuteJS.mockResolvedValue(mockResponse); const result: ToolResult = await server.executeJS({ url: 'https://example.com', scripts: 'throw new Error("Test error")', }); 
expect(result.content[0].text).toContain('JavaScript executed on: https://example.com'); expect(result.content[0].text).toContain('Script: throw new Error("Test error")'); expect(result.content[0].text).toContain('Returned: Error: Error: Test error'); }); it('should handle JS execution with no return value', async () => { const mockResponse = { markdown: 'Page content', js_execution_result: { success: true, results: [{ success: true }], }, }; mockExecuteJS.mockResolvedValue(mockResponse); const result: ToolResult = await server.executeJS({ url: 'https://example.com', scripts: 'console.log("hello")', }); expect(result.content[0].text).toContain('JavaScript executed on: https://example.com'); expect(result.content[0].text).toContain('Returned: Executed successfully (no return value)'); }); }); describe('get_html', () => { it('should handle successful HTML retrieval', async () => { const mockResponse: HTMLEndpointResponse = { html: '<html><body><h1>Example</h1></body></html>', url: 'https://example.com', success: true, }; mockGetHTML.mockResolvedValue(mockResponse); const result: ToolResult = await server.getHTML({ url: 'https://example.com', }); expect(result.content).toHaveLength(1); expect(result.content[0].type).toBe('text'); expect(result.content[0].text).toBe('<html><body><h1>Example</h1></body></html>'); }); }); describe('batch_crawl', () => { it('should handle successful batch crawl', async () => { const mockResponse = { results: [ { url: 'https://example1.com', markdown: { raw_markdown: 'Content 1' }, success: true }, { url: 'https://example2.com', markdown: { raw_markdown: 'Content 2' }, success: true }, ], success: true, }; // Mock axios response since batchCrawl uses axiosClient directly mockPost.mockResolvedValue({ data: mockResponse }); const result: ToolResult = await server.batchCrawl({ urls: ['https://example1.com', 'https://example2.com'], }); expect(result.content).toHaveLength(1); expect(result.content[0].text).toContain('Batch crawl completed'); expect(result.content[0].text).toContain('Processed 2 URLs'); }); it('should handle batch crawl with remove_images', async () => { // Mock axios response since batchCrawl uses axiosClient directly mockPost.mockResolvedValue({ data: { results: [] } }); const result: ToolResult = await server.batchCrawl({ urls: ['https://example.com'], remove_images: true, }); expect(mockPost).toHaveBeenCalledWith('/crawl', { urls: ['https://example.com'], crawler_config: { exclude_tags: ['img', 'picture', 'svg'], }, }); expect(result.content[0].text).toContain('Batch crawl completed'); }); }); describe('crawl', () => { it('should handle successful crawl with all options', async () => { const mockResponse: CrawlEndpointResponse = { success: true, results: [ { url: 'https://example.com', html: '<html>...</html>', cleaned_html: '<html>clean</html>', fit_html: '<html>fit</html>', success: true, status_code: 200, response_headers: {}, session_id: 'test-session', metadata: { title: 'Example' }, links: { internal: [], external: [] }, media: { images: [], videos: [], audios: [] }, markdown: { raw_markdown: '# Example', markdown_with_citations: '# Example [1]', references_markdown: '[1]: https://example.com', fit_markdown: '# Example', fit_html: '<h1>Example</h1>', }, tables: [], extracted_content: null, screenshot: 'screenshot-data', pdf: 'pdf-data', mhtml: null, js_execution_result: { success: true, results: ['JS result'] }, downloaded_files: null, network_requests: null, console_messages: ['Console log'], ssl_certificate: null, dispatch_result: null, }, 
], server_processing_time_s: 1.5, server_memory_delta_mb: 10, server_peak_memory_mb: 100, }; mockCrawl.mockResolvedValue(mockResponse); const result: ToolResult = await server.crawl({ url: 'https://example.com', screenshot: true, pdf: true, js_code: 'return document.title', session_id: 'test-session', }); expect(result.content.length).toBeGreaterThan(0); // Multiple content types // Check text content const textContent = result.content.find((c) => c.type === 'text' && c.text?.includes('# Example')); expect(textContent).toBeDefined(); // Check screenshot const screenshotContent = result.content.find((c) => c.type === 'image'); expect(screenshotContent?.data).toBe('screenshot-data'); }); it('should handle crawl with proxy configuration', async () => { const mockResponse: CrawlEndpointResponse = { success: true, results: [ { url: 'https://example.com', markdown: { raw_markdown: 'Proxied content' }, success: true, status_code: 200, }, ], }; mockCrawl.mockResolvedValue(mockResponse); await server.crawl({ url: 'https://example.com', proxy_server: 'http://proxy.example.com:8080', proxy_username: 'user', proxy_password: 'pass', }); expect(mockCrawl).toHaveBeenCalledWith( expect.objectContaining({ browser_config: expect.objectContaining({ proxy_config: { server: 'http://proxy.example.com:8080', username: 'user', password: 'pass', }, }), }), ); }); it('should handle crawl with cookies and headers', async () => { const mockResponse: CrawlEndpointResponse = { success: true, results: [ { url: 'https://example.com', markdown: { raw_markdown: 'Content with auth' }, success: true, status_code: 200, }, ], }; mockCrawl.mockResolvedValue(mockResponse); await server.crawl({ url: 'https://example.com', cookies: [{ name: 'session', value: 'abc123' }], headers: { Authorization: 'Bearer token123' }, }); expect(mockCrawl).toHaveBeenCalledWith( expect.objectContaining({ browser_config: expect.objectContaining({ cookies: [{ name: 'session', value: 'abc123' }], headers: { Authorization: 'Bearer token123' }, }), }), ); }); it('should handle virtual scroll configuration', async () => { const mockResponse: CrawlEndpointResponse = { success: true, results: [ { url: 'https://example.com', markdown: { raw_markdown: 'Scrolled content' }, success: true, status_code: 200, }, ], }; mockCrawl.mockResolvedValue(mockResponse); await server.crawl({ url: 'https://example.com', virtual_scroll_config: { enabled: true, scroll_step: 100, max_scrolls: 10, }, }); expect(mockCrawl).toHaveBeenCalledWith( expect.objectContaining({ crawler_config: expect.objectContaining({ virtual_scroll_config: { enabled: true, scroll_step: 100, max_scrolls: 10, }, }), }), ); }); it('should handle js_code as null error', async () => { await expect( server.crawl({ url: 'https://example.com', js_code: null, }), ).rejects.toThrow('js_code parameter is null'); }); }); describe('extract_with_llm', () => { it('should handle successful LLM extraction', async () => { mockExtractWithLLM.mockResolvedValue({ answer: 'The main topic is JavaScript testing.', }); const result: ToolResult = await server.extractWithLLM({ url: 'https://example.com', query: 'What is the main topic?', }); expect(result.content).toHaveLength(1); expect(result.content[0].text).toBe('The main topic is JavaScript testing.'); }); }); describe('extract_links', () => { it('should extract and categorize links', async () => { mockPost.mockResolvedValue({ data: { results: [ { links: { internal: [ { href: '/page1', text: 'Page 1' }, { href: '/page2', text: 'Page 2' }, ], external: [{ href: 
'https://external.com', text: 'External' }], }, }, ], }, }); const result: ToolResult = await server.extractLinks({ url: 'https://example.com', categorize: true, }); expect(result.content[0].text).toContain('Link analysis for https://example.com:'); expect(result.content[0].text).toContain('internal (2)'); expect(result.content[0].text).toContain('/page1'); expect(result.content[0].text).toContain('external (1)'); }); it('should categorize external links (social, images, scripts)', async () => { mockPost.mockResolvedValue({ data: { results: [ { links: { internal: [], external: [ 'https://facebook.com/profile', 'https://example.com/image.jpg', 'https://cdn.com/script.js', ], }, }, ], }, }); const result: ToolResult = await server.extractLinks({ url: 'https://example.com', categorize: true, }); expect(result.content[0].text).toContain('social (1)'); expect(result.content[0].text).toContain('images (1)'); expect(result.content[0].text).toContain('scripts (1)'); expect(result.content[0].text).toContain('external (0)'); }); }); describe('crawl_recursive', () => { it('should crawl recursively with depth limit', async () => { // Ensure mock is clean before setting up mockPost.mockReset(); mockPost .mockResolvedValueOnce({ data: { results: [ { url: 'https://example.com', links: { internal: [{ href: 'https://example.com/page1', text: 'Page 1' }], }, markdown: { raw_markdown: 'Home page' }, success: true, }, ], }, }) .mockResolvedValueOnce({ data: { results: [ { url: 'https://example.com/page1', links: { internal: [] }, markdown: { raw_markdown: 'Page 1 content' }, success: true, }, ], }, }); const result: ToolResult = await server.crawlRecursive({ url: 'https://example.com', max_depth: 2, }); expect(result.content[0].text).toContain('Recursive crawl completed:'); expect(result.content[0].text).toContain('Pages crawled: 2'); expect(result.content[0].text).toContain('https://example.com'); expect(result.content[0].text).toContain('https://example.com/page1'); }); }); describe('parse_sitemap', () => { it('should parse sitemap successfully', async () => { mockGet.mockResolvedValue({ data: `<?xml version="1.0" encoding="UTF-8"?> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <url><loc>https://example.com/</loc></url> <url><loc>https://example.com/page1</loc></url> <url><loc>https://example.com/page2</loc></url> </urlset>`, }); const result: ToolResult = await server.parseSitemap({ url: 'https://example.com/sitemap.xml', }); expect(result.content[0].text).toContain('Sitemap parsed successfully:'); expect(result.content[0].text).toContain('Total URLs found: 3'); expect(result.content[0].text).toContain('https://example.com/'); expect(result.content[0].text).toContain('https://example.com/page1'); }); }); describe('smart_crawl', () => { it('should handle smart crawl for HTML content', async () => { mockHead.mockResolvedValue({ headers: { 'content-type': 'text/html' }, }); mockPost.mockResolvedValue({ data: { results: [ { markdown: { raw_markdown: 'HTML content' }, links: { internal: [], external: [] }, }, ], }, }); const result: ToolResult = await server.smartCrawl({ url: 'https://example.com', }); expect(result.content[0].text).toContain('Smart crawl detected content type'); // Already contains 'Smart crawl detected content type' }); it('should handle smart crawl for PDF content', async () => { mockHead.mockResolvedValue({ headers: { 'content-type': 'application/pdf' }, }); // Mock the crawl response for PDF mockPost.mockResolvedValue({ data: { results: [ { markdown: { raw_markdown: 'PDF 
content extracted' }, links: { internal: [], external: [] }, }, ], }, }); const result: ToolResult = await server.smartCrawl({ url: 'https://example.com/doc.pdf', }); expect(result.content[0].text).toContain('Smart crawl detected content type'); expect(result.content[0].text).toContain('PDF content extracted'); }); }); }); describe('Tool Handler Error Cases', () => { describe('Service errors', () => { it('should handle service error for get_markdown', async () => { mockGetMarkdown.mockRejectedValue(new Error('Network error')); await expect(server.getMarkdown({ url: 'https://example.com' })).rejects.toThrow( 'Failed to get markdown: Network error', ); }); it('should handle axios error with response detail', async () => { const axiosError = { response: { data: { detail: 'Invalid API key', }, }, }; mockCaptureScreenshot.mockRejectedValue(axiosError); await expect(server.captureScreenshot({ url: 'https://example.com' })).rejects.toThrow( 'Failed to capture screenshot: Invalid API key', ); }); it('should handle missing screenshot data', async () => { mockCaptureScreenshot.mockResolvedValue({ success: false, screenshot: '', }); await expect(server.captureScreenshot({ url: 'https://example.com' })).rejects.toThrow( 'Screenshot capture failed - no screenshot data in response', ); }); it('should handle missing PDF data', async () => { mockGeneratePDF.mockResolvedValue({ success: true, pdf: '', }); await expect(server.generatePDF({ url: 'https://example.com' })).rejects.toThrow( 'PDF generation failed - no PDF data in response', ); }); }); describe('Validation errors', () => { it('should handle missing scripts for execute_js', async () => { await expect( server.executeJS({ url: 'https://example.com', scripts: null as unknown as string }), ).rejects.toThrow('scripts is required'); }); it('should handle empty crawl options', async () => { await expect(server.crawl(null as unknown as Parameters<typeof server.crawl>[0])).rejects.toThrow( 'crawl requires options object with at least a url parameter', ); }); it('should handle crawl_recursive errors', async () => { // Setup the mock to fail - crawlRecursive catches the error internally mockPost.mockRejectedValue(new Error('API error')); const result: ToolResult = await server.crawlRecursive({ url: 'https://example.com' }); // The method catches errors and returns a message about no pages crawled expect(result.content[0].text).toContain('Pages crawled: 0'); expect(result.content[0].text).toContain('No pages could be crawled'); }); it('should handle parse_sitemap errors', async () => { mockGet.mockRejectedValue(new Error('Failed to fetch sitemap')); await expect(server.parseSitemap({ url: 'https://example.com/sitemap.xml' })).rejects.toThrow( 'Failed to parse sitemap: Failed to fetch sitemap', ); }); }); describe('Edge cases', () => { it('should handle batch crawl with no results', async () => { mockPost.mockResolvedValue({ data: { results: [], }, }); const result: ToolResult = await server.batchCrawl({ urls: ['https://example.com'], }); expect(result.content[0].text).toContain('Batch crawl completed'); expect(result.content[0].text).toContain('Processed 0 URLs'); }); it('should handle extract_links with no links', async () => { mockPost.mockResolvedValue({ data: { results: [ { links: { internal: [], external: [], }, }, ], }, }); const result: ToolResult = await server.extractLinks({ url: 'https://example.com', }); expect(result.content[0].text).toContain('All links from https://example.com:'); expect(result.content[0].text).toMatch(/\n\s*$/); }); it('should 
    it('should handle smart crawl with HEAD request failure', async () => {
      mockHead.mockRejectedValue(new Error('HEAD failed'));
      // Fallback to HTML crawl
      mockPost.mockResolvedValue({
        data: {
          results: [
            {
              markdown: { raw_markdown: 'Fallback content' },
              links: { internal: [], external: [] },
            },
          ],
        },
      });

      const result: ToolResult = await server.smartCrawl({
        url: 'https://example.com',
      });

      expect(result.content[0].text).toContain('Smart crawl detected content type');
    });
  });

  describe('ZodError validation tests', () => {
    it('should validate get_markdown parameters', () => {
      // Valid case
      expect(() => {
        GetMarkdownSchema.parse({ url: 'https://example.com' });
      }).not.toThrow();

      // Invalid - missing url
      expect(() => {
        GetMarkdownSchema.parse({ filter: 'fit' });
      }).toThrow();

      // Invalid - bm25 without query
      expect(() => {
        GetMarkdownSchema.parse({ url: 'https://example.com', filter: 'bm25' });
      }).toThrow('Query parameter is required when using bm25 or llm filter');
    });

    it('should validate crawl parameters', () => {
      // Valid case
      expect(() => {
        CrawlSchema.parse({ url: 'https://example.com' });
      }).not.toThrow();

      // Invalid - js_only without session_id
      expect(() => {
        CrawlSchema.parse({ url: 'https://example.com', js_only: true });
      }).toThrow('js_only requires session_id');

      // Invalid - empty js_code array
      expect(() => {
        CrawlSchema.parse({ url: 'https://example.com', js_code: [] });
      }).toThrow('js_code array cannot be empty');
    });

    it('should validate batch_crawl parameters', () => {
      // Valid case
      expect(() => {
        BatchCrawlSchema.parse({ urls: ['https://example.com'] });
      }).not.toThrow();

      // Invalid - not an array
      expect(() => {
        BatchCrawlSchema.parse({ urls: 'not-an-array' });
      }).toThrow();
    });
  });

  describe('Parameter validation edge cases', () => {
    // These tests require proper schema validation which happens at the handler level
    // Skipping direct method calls as they bypass validation
  });
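
  // The cases below feed enriched mock crawl results (media, tables, network data,
  // MHTML, downloads, SSL info) through crawl and assert the markdown body still
  // comes through as the primary text content.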
  describe('Additional coverage tests', () => {
    it('should handle crawl with media extraction', async () => {
      mockCrawl.mockResolvedValue({
        success: true,
        results: [
          {
            url: 'https://example.com',
            markdown: { raw_markdown: 'Content' },
            media: {
              images: [
                { src: 'https://example.com/img1.jpg', alt: 'Image 1' },
                { src: 'https://example.com/img2.jpg', alt: 'Image 2' },
              ],
              videos: [{ src: 'https://example.com/video.mp4', type: 'video/mp4' }],
              audios: [],
            },
            success: true,
            status_code: 200,
          },
        ],
      });

      const result: ToolResult = await server.crawl({
        url: 'https://example.com',
        media_handling: { images: true, videos: true },
      });

      expect(result.content.length).toBeGreaterThan(0);
      expect(result.content[0].type).toBe('text');
      expect(result.content[0].text).toBe('Content');
    });

    it('should handle crawl with tables extraction', async () => {
      mockCrawl.mockResolvedValue({
        success: true,
        results: [
          {
            url: 'https://example.com',
            markdown: { raw_markdown: 'Content' },
            tables: [
              {
                headers: ['Name', 'Age'],
                rows: [
                  ['John', '30'],
                  ['Jane', '25'],
                ],
                markdown: '| Name | Age |\n|------|-----|\n| John | 30 |\n| Jane | 25 |',
              },
            ],
            success: true,
            status_code: 200,
          },
        ],
      });

      const result: ToolResult = await server.crawl({
        url: 'https://example.com',
      });

      expect(result.content.length).toBeGreaterThan(0);
      expect(result.content[0].type).toBe('text');
      expect(result.content[0].text).toBe('Content');
    });

    it('should handle crawl with network_requests', async () => {
      mockCrawl.mockResolvedValue({
        success: true,
        results: [
          {
            url: 'https://example.com',
            markdown: { raw_markdown: 'Content' },
            network_requests: [
              { url: 'https://api.example.com/data', method: 'GET', status: 200 },
              { url: 'https://api.example.com/post', method: 'POST', status: 201 },
            ],
            success: true,
            status_code: 200,
          },
        ],
      });

      const result: ToolResult = await server.crawl({
        url: 'https://example.com',
        network_requests: true,
      });

      expect(result.content.length).toBeGreaterThan(0);
      expect(result.content[0].type).toBe('text');
      expect(result.content[0].text).toBe('Content');
    });

    it('should handle crawl with mhtml output', async () => {
      mockCrawl.mockResolvedValue({
        success: true,
        results: [
          {
            url: 'https://example.com',
            markdown: { raw_markdown: 'Content' },
            mhtml: 'MHTML content here',
            success: true,
            status_code: 200,
          },
        ],
      });

      const result: ToolResult = await server.crawl({
        url: 'https://example.com',
        mhtml: true,
      });

      expect(result.content.length).toBeGreaterThan(0);
      expect(result.content[0].type).toBe('text');
      expect(result.content[0].text).toBe('Content');
    });

    it('should handle crawl with downloaded_files', async () => {
      mockCrawl.mockResolvedValue({
        success: true,
        results: [
          {
            url: 'https://example.com',
            markdown: { raw_markdown: 'Content' },
            downloaded_files: {
              'file1.pdf': 'base64content1',
              'file2.doc': 'base64content2',
            },
            success: true,
            status_code: 200,
          },
        ],
      });

      const result: ToolResult = await server.crawl({
        url: 'https://example.com',
        download_files: true,
      });

      expect(result.content.length).toBeGreaterThan(0);
      expect(result.content[0].type).toBe('text');
      expect(result.content[0].text).toBe('Content');
    });

    it('should handle crawl with ssl_certificate', async () => {
      mockCrawl.mockResolvedValue({
        success: true,
        results: [
          {
            url: 'https://example.com',
            markdown: { raw_markdown: 'Content' },
            ssl_certificate: {
              issuer: "Let's Encrypt",
              subject: '*.example.com',
              validFrom: '2024-01-01',
              validTo: '2024-12-31',
              protocol: 'TLSv1.3',
            },
            success: true,
            status_code: 200,
          },
        ],
      });

      const result: ToolResult = await server.crawl({
        url: 'https://example.com',
        ssl_certificate: true,
      });

      expect(result.content.length).toBeGreaterThan(0);
      expect(result.content[0].type).toBe('text');
      expect(result.content[0].text).toBe('Content');
    });

    it('should handle crawl with wait_for conditions', async () => {
      mockCrawl.mockResolvedValue({
        success: true,
        results: [
          {
            url: 'https://example.com',
            markdown: { raw_markdown: 'Dynamic content loaded' },
            success: true,
            status_code: 200,
          },
        ],
      });

      await server.crawl({
        url: 'https://example.com',
        wait_for: {
          selector: '.dynamic-content',
          timeout: 5000,
        },
      });

      expect(mockCrawl).toHaveBeenCalledWith(
        expect.objectContaining({
          crawler_config: expect.objectContaining({
            wait_for: {
              selector: '.dynamic-content',
              timeout: 5000,
            },
          }),
        }),
      );
    });

    it('should handle crawl error scenarios', async () => {
      mockCrawl.mockResolvedValue({
        success: false,
        results: [
          {
            url: 'https://example.com',
            success: false,
            error: 'Page load timeout',
            status_code: 0,
          },
        ],
      });

      const result: ToolResult = await server.crawl({
        url: 'https://example.com',
      });

      expect(result.content[0].text).toBe('No content extracted');
    });
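
    // With categorize enabled, extract_links groups results into internal, external,
    // social, documents, and images buckets.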
    it('should handle extract_links with categorized output', async () => {
      mockPost.mockResolvedValue({
        data: {
          results: [
            {
              links: {
                internal: [
                  { href: '/page1', text: 'Page 1' },
                  { href: '/page2', text: 'Page 2' },
                ],
                external: [{ href: 'https://external.com', text: 'External' }],
                social: [{ href: 'https://twitter.com/example', text: 'Twitter' }],
                documents: [{ href: '/file.pdf', text: 'PDF Document' }],
                images: [{ href: '/image.jpg', text: 'Image' }],
              },
            },
          ],
        },
      });

      const result: ToolResult = await server.extractLinks({
        url: 'https://example.com',
        categorize: true,
      });

      expect(result.content[0].text).toContain('internal (2)');
      expect(result.content[0].text).toContain('external (1)');
      expect(result.content[0].text).toContain('social (0)'); // No social links in internal/external
      expect(result.content[0].text).toContain('documents (0)'); // No documents in internal/external
      expect(result.content[0].text).toContain('images (0)'); // No images in internal/external
    });
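
    // Content-type detection matrix: each smart_crawl case mocks a different HEAD
    // content-type header and asserts both the detected type and the /crawl payload.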
    it('should handle smart_crawl for sitemap', async () => {
      // Set up axios client mock for the server instance
      const axiosClientMock = {
        head: jest.fn().mockResolvedValue({
          headers: { 'content-type': 'application/xml' },
        }),
        post: jest.fn().mockResolvedValue({
          data: {
            results: [
              {
                url: 'https://example.com/sitemap.xml',
                markdown: { raw_markdown: 'Sitemap content' },
                success: true,
                status_code: 200,
              },
            ],
          },
        }),
      };
      server.axiosClientForTesting = axiosClientMock;

      const result: ToolResult = await server.smartCrawl({
        url: 'https://example.com/sitemap.xml',
      });

      expect(result.content[0].text).toContain('Smart crawl detected content type: sitemap');
      expect(result.content[0].text).toContain('Sitemap content');
      expect(axiosClientMock.post).toHaveBeenCalledWith(
        '/crawl',
        expect.objectContaining({
          urls: ['https://example.com/sitemap.xml'],
          crawler_config: expect.objectContaining({
            cache_mode: 'ENABLED',
          }),
          browser_config: expect.objectContaining({
            headless: true,
            browser_type: 'chromium',
          }),
        }),
      );
    });

    it('should handle smart_crawl for RSS feed', async () => {
      const axiosClientMock = {
        head: jest.fn().mockResolvedValue({
          headers: { 'content-type': 'application/rss+xml' },
        }),
        post: jest.fn().mockResolvedValue({
          data: {
            results: [
              {
                url: 'https://example.com/feed.rss',
                markdown: { raw_markdown: 'RSS feed content' },
                success: true,
                status_code: 200,
              },
            ],
          },
        }),
      };
      server.axiosClientForTesting = axiosClientMock;

      const result: ToolResult = await server.smartCrawl({
        url: 'https://example.com/feed.rss',
      });

      expect(result.content[0].text).toContain('Smart crawl detected content type: rss');
      expect(result.content[0].text).toContain('RSS feed content');
      expect(axiosClientMock.post).toHaveBeenCalledWith(
        '/crawl',
        expect.objectContaining({
          urls: ['https://example.com/feed.rss'],
          crawler_config: expect.objectContaining({
            cache_mode: 'ENABLED',
          }),
          browser_config: expect.objectContaining({
            headless: true,
            browser_type: 'chromium',
          }),
        }),
      );
    });

    it('should handle smart_crawl for JSON content', async () => {
      const axiosClientMock = {
        head: jest.fn().mockResolvedValue({
          headers: { 'content-type': 'application/json' },
        }),
        post: jest.fn().mockResolvedValue({
          data: {
            results: [
              {
                url: 'https://example.com/data.json',
                markdown: { raw_markdown: 'JSON content' },
                success: true,
                status_code: 200,
              },
            ],
          },
        }),
      };
      server.axiosClientForTesting = axiosClientMock;

      const result: ToolResult = await server.smartCrawl({
        url: 'https://example.com/data.json',
      });

      expect(result.content[0].text).toContain('Smart crawl detected content type: json');
      expect(result.content[0].text).toContain('JSON content');
      expect(axiosClientMock.post).toHaveBeenCalledWith(
        '/crawl',
        expect.objectContaining({
          urls: ['https://example.com/data.json'],
          crawler_config: expect.objectContaining({
            cache_mode: 'ENABLED',
          }),
          browser_config: expect.objectContaining({
            headless: true,
            browser_type: 'chromium',
          }),
        }),
      );
    });

    it('should correctly categorize internal documents and images', async () => {
      mockPost.mockResolvedValue({
        data: {
          results: [
            {
              links: {
                internal: [
                  { href: '/page1', text: 'Page 1' },
                  { href: '/docs/manual.pdf', text: 'Manual' },
                  { href: '/images/logo.png', text: 'Logo' },
                  { href: '/assets/style.css', text: 'Styles' },
                ],
                external: [{ href: 'https://example.com/report.pdf', text: 'External Report' }],
              },
            },
          ],
        },
      });

      const result: ToolResult = await server.extractLinks({
        url: 'https://example.com',
        categorize: true,
      });

      expect(result.content[0].text).toContain('internal (1)'); // Only /page1 remains as internal
      expect(result.content[0].text).toContain('external (0)'); // External PDF moved to documents
      expect(result.content[0].text).toContain('documents (2)'); // Both PDFs
      expect(result.content[0].text).toContain('images (1)'); // The PNG
      expect(result.content[0].text).toContain('scripts (1)'); // The CSS
    });

    it('should handle smart_crawl for plain text', async () => {
      const axiosClientMock = {
        head: jest.fn().mockResolvedValue({
          headers: { 'content-type': 'text/plain' },
        }),
        post: jest.fn().mockResolvedValue({
          data: {
            results: [
              {
                url: 'https://example.com/file.txt',
                markdown: { raw_markdown: 'This is plain text content' },
                success: true,
                status_code: 200,
              },
            ],
          },
        }),
      };
      server.axiosClientForTesting = axiosClientMock;

      const result: ToolResult = await server.smartCrawl({
        url: 'https://example.com/file.txt',
      });

      expect(result.content[0].text).toContain('Smart crawl detected content type: text');
      expect(result.content[0].text).toContain('This is plain text content');
      expect(axiosClientMock.post).toHaveBeenCalledWith(
        '/crawl',
        expect.objectContaining({
          urls: ['https://example.com/file.txt'],
          crawler_config: expect.objectContaining({
            cache_mode: 'ENABLED',
          }),
          browser_config: expect.objectContaining({
            headless: true,
            browser_type: 'chromium',
          }),
        }),
      );
    });
  });

  describe('Additional Method Tests', () => {
    it('should handle parse_sitemap', async () => {
      // Mock axios.get to return sitemap XML
      mockGet.mockResolvedValue({
        data: `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/page1</loc></url>
  <url><loc>https://example.com/page2</loc></url>
  <url><loc>https://example.com/page3</loc></url>
</urlset>`,
      });

      const result: ToolResult = await server.parseSitemap({
        url: 'https://example.com/sitemap.xml',
      });

      expect(result.content[0].text).toContain('Sitemap parsed successfully');
      expect(result.content[0].text).toContain('Total URLs found: 3');
    });

    it('should handle parse_sitemap with filter', async () => {
      // Mock axios.get to return sitemap XML with blog URLs
      mockGet.mockResolvedValue({
        data: `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/page1</loc></url>
  <url><loc>https://example.com/blog/post1</loc></url>
  <url><loc>https://example.com/blog/post2</loc></url>
  <url><loc>https://example.com/page2</loc></url>
</urlset>`,
      });

      const result: ToolResult = await server.parseSitemap({
        url: 'https://example.com/sitemap.xml',
        filter_pattern: '.*blog.*',
      });

      expect(result.content[0].text).toContain('Total URLs found: 4');
      expect(result.content[0].text).toContain('Filtered URLs: 2');
    });

    it('should handle crawl_recursive', async () => {
      mockCrawl.mockResolvedValue({
        success: true,
        results: [
          {
            url: 'https://example.com',
            markdown: { raw_markdown: 'Content' },
            links: { internal: [], external: [] },
            success: true,
            status_code: 200,
          },
        ],
      });

      const result: ToolResult = await server.crawlRecursive({
        url: 'https://example.com',
      });

      expect(result.content[0].text).toContain('Recursive crawl completed');
    });
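
    // Failure modes: rejected sitemap fetches and empty crawl responses should
    // surface as thrown errors rather than partial results.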
    it('should handle parse_sitemap error', async () => {
      mockParseSitemap.mockRejectedValue(new Error('Network error'));

      await expect(
        server.parseSitemap({
          url: 'https://example.com/sitemap.xml',
        }),
      ).rejects.toThrow('Failed to parse sitemap');
    });

    it('should handle crawl with error result', async () => {
      mockCrawl.mockResolvedValue({
        success: false,
        results: [],
      });

      await expect(
        server.crawl({
          url: 'https://example.com',
        }),
      ).rejects.toThrow('Invalid response from server');
    });

    it('should handle crawl with metadata and links', async () => {
      mockCrawl.mockResolvedValue({
        success: true,
        results: [
          {
            url: 'https://example.com',
            markdown: { raw_markdown: 'Content' },
            metadata: { title: 'Test Page', description: 'Test' },
            links: { internal: ['/page1'], external: ['https://external.com'] },
            js_execution_result: { results: [42, 'test'] },
            success: true,
            status_code: 200,
          },
        ],
      });

      const result: ToolResult = await server.crawl({
        url: 'https://example.com',
      });

      expect(result.content.length).toBeGreaterThan(1);
      expect(result.content.some((c) => c.text?.includes('Metadata'))).toBe(true);
      expect(result.content.some((c) => c.text?.includes('Links'))).toBe(true);
      expect(result.content.some((c) => c.text?.includes('JavaScript Execution Results'))).toBe(true);
    });

    it('should handle executeJS with no scripts', async () => {
      await expect(
        server.executeJS({
          url: 'https://example.com',
          scripts: null,
        }),
      ).rejects.toThrow('scripts is required');
    });

    it('should handle executeJS with array of scripts', async () => {
      mockExecuteJS.mockResolvedValue({
        content: [{ type: 'text', text: 'JS executed' }],
      });

      const result: ToolResult = await server.executeJS({
        url: 'https://example.com',
        scripts: ['return 1', 'return 2'],
      });

      expect(result.content[0].text).toContain('JavaScript executed on:');
    });

    it('should handle batchCrawl with cache bypass', async () => {
      mockPost.mockResolvedValue({
        data: {
          results: [{ success: true }, { success: false }],
        },
      });

      const result: ToolResult = await server.batchCrawl({
        urls: ['https://example.com/1', 'https://example.com/2'],
        bypass_cache: true,
        remove_images: true,
      });

      expect(result.content[0].text).toContain('Batch crawl completed');
      expect(mockPost).toHaveBeenCalledWith(
        '/crawl',
        expect.objectContaining({
          crawler_config: expect.objectContaining({
            cache_mode: 'BYPASS',
            exclude_tags: ['img', 'picture', 'svg'],
          }),
        }),
      );
    });

    it('should handle smart_crawl with follow_links', async () => {
      const axiosClientMock = {
        head: jest.fn().mockResolvedValue({
          headers: { 'content-type': 'application/xml' },
        }),
        post: jest.fn().mockResolvedValue({
          data: {
            results: [
              {
                url: 'https://example.com/sitemap.xml',
                markdown: { raw_markdown: '<url><loc>https://example.com/page1</loc></url>' },
                success: true,
                status_code: 200,
              },
            ],
          },
        }),
      };
      server.axiosClientForTesting = axiosClientMock;

      const result: ToolResult = await server.smartCrawl({
        url: 'https://example.com/sitemap.xml',
        follow_links: true,
      });

      expect(result.content[0].text).toContain('Smart crawl detected content type: sitemap');
    });

    it('should handle smart_crawl with HEAD request failure', async () => {
      const axiosClientMock = {
        head: jest.fn().mockRejectedValue({ response: { status: 500 } }),
        post: jest.fn().mockResolvedValue({
          data: {
            results: [
              {
                url: 'https://example.com',
                markdown: { raw_markdown: 'Content from crawl' },
                success: true,
                status_code: 200,
              },
            ],
          },
        }),
      };
      server.axiosClientForTesting = axiosClientMock;

      const result: ToolResult = await server.smartCrawl({
        url: 'https://example.com',
      });

      // Should continue despite HEAD failure
      expect(result.content[0].text).toContain('Smart crawl detected content type: html');
      expect(result.content[0].text).toContain('Content from crawl');
    });
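
    // Results that lack a links object: extract_links should still respond, picking
    // links out of the markdown when it can.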
    it('should handle extractLinks with no links', async () => {
      mockPost.mockResolvedValue({
        data: {
          results: [
            {
              markdown: 'Content without links',
            },
          ],
        },
      });

      const result: ToolResult = await server.extractLinks({
        url: 'https://example.com',
        categorize: false,
      });

      expect(result.content[0].text).toContain('All links from');
    });

    it('should handle extractLinks with manually extracted links', async () => {
      mockPost.mockResolvedValue({
        data: {
          results: [
            {
              markdown: 'Check out <a href="/page1">Page 1</a>',
            },
          ],
        },
      });

      const result: ToolResult = await server.extractLinks({
        url: 'https://example.com',
      });

      expect(result.content[0].text).toContain('All links from');
    });

    it('should handle MCP request handler for all tools', async () => {
      // Request handler should be available from beforeEach
      expect(requestHandler).toBeDefined();

      // Test various tools through the request handler
      const tools = [
        { name: 'get_markdown', args: { url: 'https://example.com' } },
        { name: 'capture_screenshot', args: { url: 'https://example.com' } },
        { name: 'generate_pdf', args: { url: 'https://example.com' } },
        { name: 'execute_js', args: { url: 'https://example.com', scripts: 'return 1' } },
        { name: 'batch_crawl', args: { urls: ['https://example.com'] } },
        { name: 'smart_crawl', args: { url: 'https://example.com' } },
        { name: 'get_html', args: { url: 'https://example.com' } },
        { name: 'extract_links', args: { url: 'https://example.com' } },
        { name: 'crawl_recursive', args: { url: 'https://example.com' } },
        { name: 'parse_sitemap', args: { url: 'https://example.com/sitemap.xml' } },
        { name: 'crawl', args: { url: 'https://example.com' } },
        { name: 'manage_session', args: { action: 'create' } },
        { name: 'manage_session', args: { action: 'clear', session_id: 'test' } },
        { name: 'manage_session', args: { action: 'list' } },
        { name: 'extract_with_llm', args: { url: 'https://example.com', prompt: 'test' } },
      ];

      // Mock all service methods to return success
      mockGetMarkdown.mockResolvedValue({ content: [{ type: 'text', text: 'markdown' }] });
      mockCaptureScreenshot.mockResolvedValue({ content: [{ type: 'text', text: 'screenshot' }] });
      mockGeneratePDF.mockResolvedValue({ content: [{ type: 'text', text: 'pdf' }] });
      mockExecuteJS.mockResolvedValue({ content: [{ type: 'text', text: 'js' }] });
      mockBatchCrawl.mockResolvedValue({ content: [{ type: 'text', text: 'batch' }] });
      mockGetHTML.mockResolvedValue({ content: [{ type: 'text', text: 'html' }] });
      mockExtractWithLLM.mockResolvedValue({ content: [{ type: 'text', text: 'llm' }] });
      mockCrawl.mockResolvedValue({
        success: true,
        results: [
          {
            url: 'https://example.com',
            markdown: { raw_markdown: 'content' },
            success: true,
            status_code: 200,
          },
        ],
      });
      mockPost.mockResolvedValue({
        data: {
          results: [
            {
              links: { internal: [], external: [] },
            },
          ],
        },
      });
      mockParseSitemap.mockResolvedValue(['https://example.com/page1']);

      // Test each tool
      for (const tool of tools) {
        const result = await requestHandler({
          method: 'tools/call',
          params: {
            name: tool.name,
            arguments: tool.args,
          },
        });
        expect(result).toBeDefined();
        expect(result.content).toBeDefined();
      }

      // Test unknown tool
      const unknownResult = await requestHandler({
        method: 'tools/call',
        params: {
          name: 'unknown_tool',
          arguments: {},
        },
      });
      expect(unknownResult.content[0].text).toContain('Error: Unknown tool');

      // The handler only handles tools/call requests,
      // so we don't test other methods here
    });
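
    // Every tool should reject malformed arguments through the shared request
    // handler with a consistent "Invalid parameters" error.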
    it('should handle MCP request handler validation errors', async () => {
      expect(requestHandler).toBeDefined();

      // Test validation errors for various tools
      const invalidRequests = [
        { name: 'get_markdown', args: {} }, // missing url
        { name: 'capture_screenshot', args: {} }, // missing url
        { name: 'generate_pdf', args: {} }, // missing url
        { name: 'execute_js', args: { url: 'https://example.com' } }, // missing scripts
        { name: 'batch_crawl', args: {} }, // missing urls
        { name: 'smart_crawl', args: {} }, // missing url
        { name: 'get_html', args: {} }, // missing url
        { name: 'extract_links', args: {} }, // missing url
        { name: 'crawl_recursive', args: {} }, // missing url
        { name: 'parse_sitemap', args: {} }, // missing url
        { name: 'crawl', args: {} }, // missing url
        { name: 'manage_session', args: {} }, // missing action
        { name: 'manage_session', args: { action: 'clear' } }, // missing session_id for clear
        { name: 'manage_session', args: { action: 'invalid' } }, // invalid action
        { name: 'extract_with_llm', args: { url: 'https://example.com' } }, // missing prompt
      ];

      for (const req of invalidRequests) {
        const result = await requestHandler({
          method: 'tools/call',
          params: {
            name: req.name,
            arguments: req.args,
          },
        });
        expect(result.content[0].text).toContain(`Error: Invalid parameters for ${req.name}`);
      }
    });

    it('should handle crawl with all output types', async () => {
      mockCrawl.mockResolvedValue({
        success: true,
        results: [
          {
            url: 'https://example.com',
            extracted_content: { data: 'extracted' },
            screenshot: 'base64screenshot',
            pdf: 'base64pdf',
            success: true,
            status_code: 200,
          },
        ],
      });

      const result: ToolResult = await server.crawl({
        url: 'https://example.com',
        screenshot: true,
        pdf: true,
      });

      expect(result.content.some((c) => c.type === 'text')).toBe(true);
      expect(result.content.some((c) => c.type === 'image')).toBe(true);
      expect(result.content.some((c) => c.type === 'resource' && c.resource?.mimeType === 'application/pdf')).toBe(
        true,
      );
    });
  });

  describe('MCP Protocol Handler Tests', () => {
    it('should handle tools/list request', async () => {
      // Find the tools/list handler
      const toolsListHandler = mockSetRequestHandler.mock.calls.find(
        (call) => (call[0] as any).method === 'tools/list',
      )?.[1];
      expect(toolsListHandler).toBeDefined();

      const result = await (toolsListHandler as any)({ method: 'tools/list', params: {} }); // eslint-disable-line @typescript-eslint/no-explicit-any

      expect(result).toBeDefined();
      expect(result.tools).toBeDefined();
      expect(result.tools.length).toBe(13); // Should have 13 tools
    });

    it('should handle get_markdown query functionality', async () => {
      mockGetMarkdown.mockResolvedValue({
        url: 'https://example.com',
        filter: 'fit',
        query: 'What products are listed?',
        cache: 'false',
        markdown: 'Page content about products',
        success: true,
      });

      const result: ToolResult = await server.getMarkdown({
        url: 'https://example.com',
        query: 'What products are listed?',
      });

      expect(result.content[0].text).toContain('Query: What products are listed?');
      expect(result.content[0].text).toContain('Page content about products');
    });
  });
});
```