This is page 2 of 3. Use http://codebase.md/omgwtfwow/mcp-crawl4ai-ts?page={x} to view the full context.

# Directory Structure

```
├── .env.example
├── .github
│   ├── CI.md
│   ├── copilot-instructions.md
│   └── workflows
│       └── ci.yml
├── .gitignore
├── .prettierignore
├── .prettierrc.json
├── CHANGELOG.md
├── eslint.config.mjs
├── jest.config.cjs
├── jest.setup.cjs
├── LICENSE
├── package-lock.json
├── package.json
├── README.md
├── src
│   ├── __tests__
│   │   ├── crawl.test.ts
│   │   ├── crawl4ai-service.network.test.ts
│   │   ├── crawl4ai-service.test.ts
│   │   ├── handlers
│   │   │   ├── crawl-handlers.test.ts
│   │   │   ├── parameter-combinations.test.ts
│   │   │   ├── screenshot-saving.test.ts
│   │   │   ├── session-handlers.test.ts
│   │   │   └── utility-handlers.test.ts
│   │   ├── index.cli.test.ts
│   │   ├── index.npx.test.ts
│   │   ├── index.server.test.ts
│   │   ├── index.test.ts
│   │   ├── integration
│   │   │   ├── batch-crawl.integration.test.ts
│   │   │   ├── capture-screenshot.integration.test.ts
│   │   │   ├── crawl-advanced.integration.test.ts
│   │   │   ├── crawl-handlers.integration.test.ts
│   │   │   ├── crawl-recursive.integration.test.ts
│   │   │   ├── crawl.integration.test.ts
│   │   │   ├── execute-js.integration.test.ts
│   │   │   ├── extract-links.integration.test.ts
│   │   │   ├── extract-with-llm.integration.test.ts
│   │   │   ├── generate-pdf.integration.test.ts
│   │   │   ├── get-html.integration.test.ts
│   │   │   ├── get-markdown.integration.test.ts
│   │   │   ├── parse-sitemap.integration.test.ts
│   │   │   ├── session-management.integration.test.ts
│   │   │   ├── smart-crawl.integration.test.ts
│   │   │   └── test-utils.ts
│   │   ├── request-handler.test.ts
│   │   ├── schemas
│   │   │   └── validation-edge-cases.test.ts
│   │   ├── types
│   │   │   └── mocks.ts
│   │   └── utils
│   │       └── javascript-validation.test.ts
│   ├── crawl4ai-service.ts
│   ├── handlers
│   │   ├── base-handler.ts
│   │   ├── content-handlers.ts
│   │   ├── crawl-handlers.ts
│   │   ├── session-handlers.ts
│   │   └── utility-handlers.ts
│   ├── index.ts
│   ├── schemas
│   │   ├── helpers.ts
│   │   └── validation-schemas.ts
│   ├── server.ts
│   └── types.ts
├── tsconfig.build.json
└── tsconfig.json
```

# Files

--------------------------------------------------------------------------------
/src/__tests__/request-handler.test.ts:
--------------------------------------------------------------------------------

```typescript
import { jest } from '@jest/globals';

// Mock all dependencies before imports
const mockGetMarkdown = jest.fn();
const mockCaptureScreenshot = jest.fn();
const mockGeneratePDF = jest.fn();
const mockExecuteJS = jest.fn();
const mockGetHTML = jest.fn();
const mockBatchCrawl = jest.fn();
const mockExtractWithLLM = jest.fn();
const mockCrawl = jest.fn();
const mockParseSitemap = jest.fn();

jest.unstable_mockModule('../crawl4ai-service.js', () => ({
  Crawl4AIService: jest.fn().mockImplementation(() => ({
    getMarkdown: mockGetMarkdown,
    captureScreenshot: mockCaptureScreenshot,
    generatePDF: mockGeneratePDF,
    executeJS: mockExecuteJS,
    getHTML: mockGetHTML,
    batchCrawl: mockBatchCrawl,
    extractWithLLM: mockExtractWithLLM,
    crawl: mockCrawl,
    parseSitemap: mockParseSitemap,
  })),
}));

// Mock axios
const mockPost = jest.fn();
const mockAxiosCreate = jest.fn(() => ({
  post: mockPost,
}));

jest.unstable_mockModule('axios', () => ({
  default: {
    create: mockAxiosCreate,
  },
}));

// Mock MCP SDK
const mockSetRequestHandler = jest.fn();
const mockTool = jest.fn();
const mockConnect = jest.fn();

jest.unstable_mockModule('@modelcontextprotocol/sdk/server/index.js', () => ({
  Server: jest.fn().mockImplementation(() => ({
    setRequestHandler: mockSetRequestHandler,
    tool: mockTool,
    connect: mockConnect,
  })),
}));

// Mock the types module that exports the schemas
const CallToolRequestSchema = { method: 'tools/call' };
const ListToolsRequestSchema = { method: 'tools/list' };

jest.unstable_mockModule('@modelcontextprotocol/sdk/types.js', () => ({
  CallToolRequestSchema,
  ListToolsRequestSchema,
}));

jest.unstable_mockModule('@modelcontextprotocol/sdk/server/stdio.js', () => ({
  StdioServerTransport: jest.fn(),
}));

// Now import the server after mocks are set up
const { Crawl4AIServer } = await import('../server.js');

// Removed unused type definitions - using 'any' for test mocks

describe('MCP Request Handler Direct Testing', () => {
  let server: any; // eslint-disable-line @typescript-eslint/no-explicit-any
  let requestHandler: any; // eslint-disable-line @typescript-eslint/no-explicit-any

  beforeEach(async () => {
    jest.clearAllMocks();

    // Set up mock responses
    mockGetMarkdown.mockResolvedValue({ success: true, content: 'markdown content' });
    mockCaptureScreenshot.mockResolvedValue({ success: true, screenshot: 'base64image' });
    mockGeneratePDF.mockResolvedValue({ success: true, pdf: 'base64pdf' });
    mockExecuteJS.mockResolvedValue({ js_execution_result: { results: [42] } });
    mockGetHTML.mockResolvedValue({ success: true, html: '<html></html>' });
    mockExtractWithLLM.mockResolvedValue({ answer: 'extracted answer' });
    mockCrawl.mockResolvedValue({
      success: true,
      results: [
        {
          url: 'https://example.com',
          markdown: { raw_markdown: 'content' },
          success: true,
          status_code: 200,
        },
      ],
    });
    mockParseSitemap.mockResolvedValue(['https://example.com/page1']);
    mockPost.mockResolvedValue({
      data: {
        results: [
          {
            links: { internal: [], external: [] },
            success: true,
          },
        ],
      },
    });

    // Create server
    server = new Crawl4AIServer(
      process.env.CRAWL4AI_BASE_URL || 'http://test.example.com',
      process.env.CRAWL4AI_API_KEY || 'test-api-key',
      'test-server',
      '1.0.0',
    );
    await server.start();

    // Get the request handler for CallToolRequestSchema
    const handlerCalls = mockSetRequestHandler.mock.calls;
    // Find the handler for CallToolRequestSchema (tools/call)
    for (const call of handlerCalls) {
      const [schema, handler] = call;
      if (schema && (schema as any).method === 'tools/call') {
        requestHandler = handler;
        break;
      }
    }
  });

  describe('Tool Handler Coverage', () => {
    it('should handle all valid tool requests', async () => {
      expect(requestHandler).toBeDefined();

      const validRequests = [
        { name: 'get_markdown', arguments: { url: 'https://example.com' } },
        { name: 'capture_screenshot', arguments: { url: 'https://example.com' } },
        { name: 'generate_pdf', arguments: { url: 'https://example.com' } },
        { name: 'execute_js', arguments: { url: 'https://example.com', scripts: 'return 1' } },
        { name: 'batch_crawl', arguments: { urls: ['https://example.com'] } },
        { name: 'smart_crawl', arguments: { url: 'https://example.com' } },
        { name: 'get_html', arguments: { url: 'https://example.com' } },
        { name: 'extract_links', arguments: { url: 'https://example.com' } },
        { name: 'crawl_recursive', arguments: { url: 'https://example.com' } },
        { name: 'parse_sitemap', arguments: { url: 'https://example.com/sitemap.xml' } },
        { name: 'crawl', arguments: { url: 'https://example.com' } },
        { name: 'manage_session', arguments: { action: 'create' } },
        { name: 'manage_session', arguments: { action: 'clear', session_id: 'test' } },
        { name: 'manage_session', arguments: { action: 'list' } },
        { name: 'extract_with_llm', arguments: { url: 'https://example.com', prompt: 'test' } },
      ];

      for (const req of validRequests) {
        const result = await requestHandler({
          method: 'tools/call',
          params: req,
        });
        expect(result).toBeDefined();
        expect(result.content).toBeDefined();
      }
    });

    it('should handle all validation error cases', async () => {
      const invalidRequests = [
        { name: 'get_markdown', arguments: {}, expectedError: 'Invalid parameters for get_markdown' },
        { name: 'capture_screenshot', arguments: {}, expectedError: 'Invalid parameters for capture_screenshot' },
        { name: 'generate_pdf', arguments: {}, expectedError: 'Invalid parameters for generate_pdf' },
        {
          name: 'execute_js',
          arguments: { url: 'https://example.com' },
          expectedError: 'Invalid parameters for execute_js',
        },
        { name: 'batch_crawl', arguments: {}, expectedError: 'Invalid parameters for batch_crawl' },
        { name: 'smart_crawl', arguments: {}, expectedError: 'Invalid parameters for smart_crawl' },
        { name: 'get_html', arguments: {}, expectedError: 'Invalid parameters for get_html' },
        { name: 'extract_links', arguments: {}, expectedError: 'Invalid parameters for extract_links' },
        { name: 'crawl_recursive', arguments: {}, expectedError: 'Invalid parameters for crawl_recursive' },
        { name: 'parse_sitemap', arguments: {}, expectedError: 'Invalid parameters for parse_sitemap' },
        { name: 'crawl', arguments: {}, expectedError: 'Invalid parameters for crawl' },
        { name: 'manage_session', arguments: {}, expectedError: 'Invalid parameters for manage_session' },
        {
          name: 'manage_session',
          arguments: { action: 'clear' },
          expectedError: 'Invalid parameters for manage_session',
        },
        {
          name: 'extract_with_llm',
          arguments: { url: 'https://example.com' },
          expectedError: 'Invalid parameters for extract_with_llm',
        },
      ];

      for (const req of invalidRequests) {
        const result = await requestHandler({
          method: 'tools/call',
          params: req,
        });
        expect(result.content[0].text).toContain(req.expectedError);
      }
    });

    it('should handle unknown tool', async () => {
      const result = await requestHandler({
        method: 'tools/call',
        params: {
          name: 'unknown_tool',
          arguments: {},
        },
      });
      expect(result.content[0].text).toContain('Error: Unknown tool: unknown_tool');
    });

    it('should handle non-ZodError exceptions', async () => {
      // Make the service method throw a non-Zod error
      mockGetMarkdown.mockRejectedValue(new Error('Service error'));

      const result = await requestHandler({
        method: 'tools/call',
        params: {
          name: 'get_markdown',
          arguments: { url: 'https://example.com' },
        },
      });
      expect(result.content[0].text).toContain('Error: Failed to get markdown: Service error');
    });

    it('should handle manage_session with create action', async () => {
      const result = await requestHandler({
        method: 'tools/call',
        params: {
          name: 'manage_session',
          arguments: {
            action: 'create',
            session_id: 'test-session',
            initial_url: 'https://example.com',
          },
        },
      });
      expect(result.content[0].text).toContain('Session created successfully');
      expect(result.content[0].text).toContain('test-session');
    });

    it('should handle manage_session with clear action', async () => {
      // First create a session
      await requestHandler({
        method: 'tools/call',
        params: {
          name: 'manage_session',
          arguments: {
            action: 'create',
            session_id: 'test-to-clear',
          },
        },
      });

      // Then clear it
      const result = await requestHandler({
        method: 'tools/call',
        params: {
          name: 'manage_session',
          arguments: {
            action: 'clear',
            session_id: 'test-to-clear',
          },
        },
      });
      expect(result.content[0].text).toContain('Session cleared successfully');
    });

    it('should handle manage_session with list action', async () => {
      // First create a session
      await requestHandler({
        method: 'tools/call',
        params: {
          name: 'manage_session',
          arguments: {
            action: 'create',
            session_id: 'test-list-session',
          },
        },
      });

      // List sessions
      const result = await requestHandler({
        method: 'tools/call',
        params: {
          name: 'manage_session',
          arguments: { action: 'list' },
        },
      });
      expect(result.content[0].text).toContain('Active sessions');
      expect(result.content[0].text).toContain('test-list-session');
    });
  });
});
```
--------------------------------------------------------------------------------
/src/__tests__/handlers/screenshot-saving.test.ts:
--------------------------------------------------------------------------------

```typescript
import { jest } from '@jest/globals';

// Mock fs/promises
const mockMkdir = jest.fn();
const mockWriteFile = jest.fn();

jest.unstable_mockModule('fs/promises', () => ({
  mkdir: mockMkdir,
  writeFile: mockWriteFile,
}));

// Mock os
const mockHomedir = jest.fn();

jest.unstable_mockModule('os', () => ({
  homedir: mockHomedir,
}));

// Import after mocking
const { ContentHandlers } = await import('../../handlers/content-handlers.js');
const { CrawlHandlers } = await import('../../handlers/crawl-handlers.js');

// Mock the service
const mockService = {
  captureScreenshot: jest.fn(),
  crawl: jest.fn(),
};

// Mock axios client
const mockAxiosClient = {
  post: jest.fn(),
};

describe('Screenshot Local Saving', () => {
  let contentHandlers: InstanceType<typeof ContentHandlers>;
  let crawlHandlers: InstanceType<typeof CrawlHandlers>;
  const testScreenshotBase64 =
    'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=='; // 1x1 red pixel

  beforeEach(() => {
    jest.clearAllMocks();
    contentHandlers = new ContentHandlers(mockService as never, mockAxiosClient as never, new Map());
    crawlHandlers = new CrawlHandlers(mockService as never, mockAxiosClient as never, new Map());

    // Default mock implementations
    mockMkdir.mockResolvedValue(undefined);
    mockWriteFile.mockResolvedValue(undefined);
  });

  describe('ContentHandlers.captureScreenshot', () => {
    it('should save screenshot to local directory when save_to_directory is provided', async () => {
      const mockDate = new Date('2024-01-15T10:30:00Z');
      jest.spyOn(globalThis, 'Date').mockImplementation(() => mockDate as never);

      mockService.captureScreenshot.mockResolvedValue({
        success: true,
        screenshot: testScreenshotBase64,
      });

      const result = await contentHandlers.captureScreenshot({
        url: 'https://example.com',
        save_to_directory: '/tmp/screenshots',
      });

      // Verify directory creation
      expect(mockMkdir).toHaveBeenCalledWith('/tmp/screenshots', { recursive: true });

      // Verify file write
      const expectedFilename = 'example-com-2024-01-15T10-30-00.png';
      const expectedPath = '/tmp/screenshots/' + expectedFilename;
      expect(mockWriteFile).toHaveBeenCalledWith(expectedPath, Buffer.from(testScreenshotBase64, 'base64'));

      // Verify response includes saved path
      expect(result.content[1].text).toContain(`Saved to: ${expectedPath}`);
    });

    it('should handle directory creation failure gracefully', async () => {
      const consoleErrorSpy = jest.spyOn(console, 'error').mockImplementation();
      mockMkdir.mockRejectedValue(new Error('Permission denied'));
      mockService.captureScreenshot.mockResolvedValue({
        success: true,
        screenshot: testScreenshotBase64,
      });

      const result = await contentHandlers.captureScreenshot({
        url: 'https://example.com',
        save_to_directory: '/root/screenshots',
      });

      // Should still return the screenshot
      expect(result.content[0].type).toBe('image');
      expect(result.content[0].data).toBe(testScreenshotBase64);

      // Should not include saved path in text
      expect(result.content[1].text).not.toContain('Saved to:');

      // Should log error
      expect(consoleErrorSpy).toHaveBeenCalledWith('Failed to save screenshot locally:', expect.any(Error));

      consoleErrorSpy.mockRestore();
    });

    it('should handle file path instead of directory path', async () => {
      const mockDate = new Date('2024-01-15T10:30:00Z');
      jest.spyOn(globalThis, 'Date').mockImplementation(() => mockDate as never);
      const consoleWarnSpy = jest.spyOn(console, 'warn').mockImplementation();

      mockService.captureScreenshot.mockResolvedValue({
        success: true,
        screenshot: testScreenshotBase64,
      });

      await contentHandlers.captureScreenshot({
        url: 'https://example.com',
        save_to_directory: '/tmp/screenshots/screenshot.png',
      });

      // Should warn about file path
      expect(consoleWarnSpy).toHaveBeenCalledWith(
        'Warning: save_to_directory should be a directory path, not a file path. Using parent directory.',
      );

      // Should use parent directory
      expect(mockMkdir).toHaveBeenCalledWith('/tmp/screenshots', { recursive: true });

      // Should still generate filename
      const expectedFilename = 'example-com-2024-01-15T10-30-00.png';
      const expectedPath = '/tmp/screenshots/' + expectedFilename;
      expect(mockWriteFile).toHaveBeenCalledWith(expectedPath, Buffer.from(testScreenshotBase64, 'base64'));

      consoleWarnSpy.mockRestore();
    });

    it('should resolve home directory paths', async () => {
      const mockDate = new Date('2024-01-15T10:30:00Z');
      jest.spyOn(globalThis, 'Date').mockImplementation(() => mockDate as never);
      mockHomedir.mockReturnValue('/Users/testuser');

      mockService.captureScreenshot.mockResolvedValue({
        success: true,
        screenshot: testScreenshotBase64,
      });

      await contentHandlers.captureScreenshot({
        url: 'https://example.com',
        save_to_directory: '~/Desktop/screenshots',
      });

      // Should resolve ~ to home directory
      expect(mockMkdir).toHaveBeenCalledWith('/Users/testuser/Desktop/screenshots', { recursive: true });

      const expectedPath = '/Users/testuser/Desktop/screenshots/example-com-2024-01-15T10-30-00.png';
      expect(mockWriteFile).toHaveBeenCalledWith(expectedPath, Buffer.from(testScreenshotBase64, 'base64'));
    });

    it('should not return large screenshots when saved locally', async () => {
      // Create a large fake screenshot (>800KB when decoded)
      const largeBase64 = 'A'.repeat(1200000); // ~900KB when decoded

      mockService.captureScreenshot.mockResolvedValue({
        success: true,
        screenshot: largeBase64,
      });

      const result = await contentHandlers.captureScreenshot({
        url: 'https://example.com',
        save_to_directory: '/tmp',
      });

      // Should not include image in response
      const imageContent = result.content.find((c) => c.type === 'image');
      expect(imageContent).toBeUndefined();

      // Should mention size in text
      const textContent = result.content.find((c) => c.type === 'text');
      expect(textContent?.text).toContain('not returned due to size');
      expect(textContent?.text).toContain('KB');
    });

    it('should sanitize filename for URLs with special characters', async () => {
      const mockDate = new Date('2024-01-15T10:30:00Z');
      jest.spyOn(globalThis, 'Date').mockImplementation(() => mockDate as never);

      mockService.captureScreenshot.mockResolvedValue({
        success: true,
        screenshot: testScreenshotBase64,
      });

      await contentHandlers.captureScreenshot({
        url: 'https://my-site.com:8080/path?query=value',
        save_to_directory: '/tmp/screenshots',
      });

      const expectedFilename = 'my-site-com-2024-01-15T10-30-00.png';
      const expectedPath = '/tmp/screenshots/' + expectedFilename;
      expect(mockWriteFile).toHaveBeenCalledWith(expectedPath, expect.any(Buffer));
    });
  });

  describe('CrawlHandlers.crawl', () => {
    it('should save screenshot to local directory when screenshot_directory is provided', async () => {
      const mockDate = new Date('2024-01-15T10:30:00Z');
      jest.spyOn(globalThis, 'Date').mockImplementation(() => mockDate as never);

      mockService.crawl.mockResolvedValue({
        results: [
          {
            url: 'https://example.com',
            success: true,
            screenshot: testScreenshotBase64,
            markdown: { raw_markdown: 'Test content' },
          },
        ],
      });

      const result = await crawlHandlers.crawl({
        url: 'https://example.com',
        screenshot: true,
        screenshot_directory: '/tmp/crawl-screenshots',
      });

      // Verify directory creation
      expect(mockMkdir).toHaveBeenCalledWith('/tmp/crawl-screenshots', { recursive: true });

      // Verify file write
      const expectedFilename = 'example-com-2024-01-15T10-30-00.png';
      const expectedPath = '/tmp/crawl-screenshots/' + expectedFilename;
      expect(mockWriteFile).toHaveBeenCalledWith(expectedPath, Buffer.from(testScreenshotBase64, 'base64'));

      // Verify response includes saved path
      const textContent = result.content.find(
        (c) => c.type === 'text' && 'text' in c && c.text?.includes('Screenshot saved'),
      );
      expect(textContent?.text).toContain(`Screenshot saved to: ${expectedPath}`);
    });

    it('should handle file save failure gracefully in crawl', async () => {
      const consoleErrorSpy = jest.spyOn(console, 'error').mockImplementation();
      mockMkdir.mockResolvedValue(undefined); // directory creation succeeds
      mockWriteFile.mockRejectedValue(new Error('Disk full')); // but file write fails

      mockService.crawl.mockResolvedValue({
        results: [
          {
            url: 'https://example.com',
            success: true,
            screenshot: testScreenshotBase64,
            markdown: { raw_markdown: 'Test content' },
          },
        ],
      });

      const result = await crawlHandlers.crawl({
        url: 'https://example.com',
        screenshot: true,
        screenshot_directory: '/tmp/crawl-screenshots',
      });

      // Should still return the screenshot as image
      const imageContent = result.content.find((c) => c.type === 'image');
      expect(imageContent?.data).toBe(testScreenshotBase64);

      // Should log error
      expect(consoleErrorSpy).toHaveBeenCalledWith('Failed to save screenshot locally:', expect.any(Error));

      consoleErrorSpy.mockRestore();
    });

    it('should not attempt to save when screenshot_directory is not provided', async () => {
      mockService.crawl.mockResolvedValue({
        results: [
          {
            url: 'https://example.com',
            success: true,
            screenshot: testScreenshotBase64,
            markdown: { raw_markdown: 'Test content' },
          },
        ],
      });

      await crawlHandlers.crawl({
        url: 'https://example.com',
        screenshot: true,
      });

      // Should not call fs methods
      expect(mockMkdir).not.toHaveBeenCalled();
      expect(mockWriteFile).not.toHaveBeenCalled();
    });
  });
});
```
--------------------------------------------------------------------------------
/src/__tests__/crawl4ai-service.network.test.ts:
--------------------------------------------------------------------------------

```typescript
import { jest } from '@jest/globals';

// Mock axios before importing the service
const mockAxiosInstance = {
  get: jest.fn() as jest.Mock,
  post: jest.fn() as jest.Mock,
  interceptors: {
    request: { use: jest.fn() as jest.Mock },
    response: { use: jest.fn() as jest.Mock },
  },
};

jest.unstable_mockModule('axios', () => ({
  default: {
    create: jest.fn(() => mockAxiosInstance),
    isAxiosError: jest.fn((error: any) => error.isAxiosError === true), // eslint-disable-line @typescript-eslint/no-explicit-any
    get: jest.fn(),
    head: jest.fn(),
  },
  isAxiosError: jest.fn((error: any) => error.isAxiosError === true), // eslint-disable-line @typescript-eslint/no-explicit-any
}));

// Import after mocking
const { Crawl4AIService } = await import('../crawl4ai-service.js');

describe('Crawl4AI Service - Network Failures', () => {
  let service: any; // eslint-disable-line @typescript-eslint/no-explicit-any

  interface ErrorWithCode extends Error {
    code?: string;
    response?: {
      status: number;
      data?: any; // eslint-disable-line @typescript-eslint/no-explicit-any
    };
    isAxiosError?: boolean;
  }

  beforeEach(() => {
    jest.clearAllMocks();
    service = new Crawl4AIService('http://localhost:11235', 'test-api-key');
  });

  describe('Network Timeouts', () => {
    it('should handle request timeout', async () => {
      const timeoutError = new Error('timeout of 30000ms exceeded') as ErrorWithCode;
      timeoutError.code = 'ECONNABORTED';
      timeoutError.isAxiosError = true;
      (mockAxiosInstance.post as jest.Mock).mockRejectedValue(timeoutError);

      await expect(service.getMarkdown({ url: 'https://example.com' })).rejects.toThrow('Request timed out');
    });

    it('should handle response timeout', async () => {
      const timeoutError = new Error('timeout of 30000ms exceeded') as ErrorWithCode;
      timeoutError.code = 'ETIMEDOUT';
      timeoutError.isAxiosError = true;
      (mockAxiosInstance.post as jest.Mock).mockRejectedValue(timeoutError);

      await expect(service.getHTML({ url: 'https://example.com' })).rejects.toThrow('Request timeout');
    });
  });

  describe('HTTP Error Responses', () => {
    it('should handle 401 Unauthorized', async () => {
      const error = {
        response: {
          status: 401,
          data: { error: 'Invalid API key' },
        },
        isAxiosError: true,
      };
      (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error);

      await expect(service.crawl({ urls: ['https://example.com'] })).rejects.toThrow(
        'Request failed with status 401: Invalid API key',
      );
    });

    it('should handle 403 Forbidden', async () => {
      const error = {
        response: {
          status: 403,
          data: { error: 'Access denied' },
        },
        isAxiosError: true,
      };
      (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error);

      await expect(service.captureScreenshot({ url: 'https://example.com' })).rejects.toThrow(
        'Request failed with status 403: Access denied',
      );
    });

    it('should handle 404 Not Found', async () => {
      const error = {
        response: {
          status: 404,
          data: { error: 'Endpoint not found' },
        },
        isAxiosError: true,
      };
      (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error);

      await expect(service.generatePDF({ url: 'https://example.com' })).rejects.toThrow(
        'Request failed with status 404: Endpoint not found',
      );
    });

    it('should handle 429 Too Many Requests', async () => {
      const error = {
        response: {
          status: 429,
          data: { error: 'Rate limit exceeded' },
          headers: {
            'retry-after': '60',
          },
        },
        isAxiosError: true,
      };
      (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error);

      await expect(service.executeJS({ url: 'https://example.com', scripts: ['return 1;'] })).rejects.toThrow(
        'Request failed with status 429: Rate limit exceeded',
      );
    });

    it('should handle 500 Internal Server Error', async () => {
      const error = {
        response: {
          status: 500,
          data: { error: 'Internal server error' },
        },
        isAxiosError: true,
      };
      (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error);

      await expect(service.crawl({ urls: ['https://example.com'] })).rejects.toThrow(
        'Request failed with status 500: Internal server error',
      );
    });

    it('should handle 502 Bad Gateway', async () => {
      const error = {
        response: {
          status: 502,
          data: 'Bad Gateway',
        },
        isAxiosError: true,
        message: 'Request failed with status code 502',
      };
      (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error);

      await expect(service.getMarkdown({ url: 'https://example.com' })).rejects.toThrow(
        'Request failed with status 502: Request failed with status code 502',
      );
    });

    it('should handle 503 Service Unavailable', async () => {
      const error = {
        response: {
          status: 503,
          data: { error: 'Service temporarily unavailable' },
        },
        isAxiosError: true,
      };
      (mockAxiosInstance.get as jest.Mock).mockRejectedValue(error);

      await expect(service.extractWithLLM({ url: 'https://example.com', query: 'test' })).rejects.toThrow(
        'Request failed with status 503: Service temporarily unavailable',
      );
    });

    it('should handle 504 Gateway Timeout', async () => {
      const error = {
        response: {
          status: 504,
          data: { error: 'Gateway timeout' },
        },
        isAxiosError: true,
      };
      (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error);

      await expect(service.getHTML({ url: 'https://example.com' })).rejects.toThrow(
        'Request failed with status 504: Gateway timeout',
      );
    });
  });

  describe('Network Connection Failures', () => {
    it('should handle DNS resolution failure', async () => {
      const error = new Error('getaddrinfo ENOTFOUND invalid.domain') as ErrorWithCode;
      error.code = 'ENOTFOUND';
      error.isAxiosError = true;
      (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error);

      await expect(service.getMarkdown({ url: 'https://invalid.domain' })).rejects.toThrow(
        'DNS resolution failed: getaddrinfo ENOTFOUND invalid.domain',
      );
    });

    it('should handle connection refused', async () => {
      const error = new Error('connect ECONNREFUSED 127.0.0.1:11235') as ErrorWithCode;
      error.code = 'ECONNREFUSED';
      error.isAxiosError = true;
      (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error);

      await expect(service.crawl({ urls: ['https://example.com'] })).rejects.toThrow(
        'Connection refused: connect ECONNREFUSED 127.0.0.1:11235',
      );
    });

    it('should handle connection reset', async () => {
      const error = new Error('socket hang up') as ErrorWithCode;
      error.code = 'ECONNRESET';
      error.isAxiosError = true;
      (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error);

      await expect(service.captureScreenshot({ url: 'https://example.com' })).rejects.toThrow(
        'Connection reset: socket hang up',
      );
    });

    it('should handle network unreachable', async () => {
      const error = new Error('connect ENETUNREACH') as ErrorWithCode;
      error.code = 'ENETUNREACH';
      error.isAxiosError = true;
      (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error);

      await expect(service.executeJS({ url: 'https://example.com', scripts: ['return 1;'] })).rejects.toThrow(
        'Network unreachable: connect ENETUNREACH',
      );
    });
  });

  describe('Response Parsing Failures', () => {
    it('should handle invalid JSON response', async () => {
      // This test is not applicable anymore since we handle errors at axios level
      // The service will return whatever axios returns
      (mockAxiosInstance.post as jest.Mock).mockResolvedValue({
        data: '<html>Not JSON</html>',
        headers: { 'content-type': 'text/html' },
      });

      const result = await service.getHTML({ url: 'https://example.com' });
      expect(result).toBe('<html>Not JSON</html>');
    });

    it('should handle empty response', async () => {
      (mockAxiosInstance.post as jest.Mock).mockResolvedValue({
        data: null,
      });

      // The service returns null, which is valid
      const result = await service.crawl({ urls: ['https://example.com'] });
      expect(result).toBeNull();
    });

    it('should handle malformed response structure', async () => {
      (mockAxiosInstance.post as jest.Mock).mockResolvedValue({
        data: { unexpected: 'structure' },
      });

      // The service returns whatever the API returns
      const result = await service.crawl({ urls: ['https://example.com'] });
      expect(result).toEqual({ unexpected: 'structure' });
    });
  });

  describe('Request Configuration Errors', () => {
    it('should handle invalid URL format', async () => {
      await expect(service.getMarkdown({ url: 'not-a-valid-url' })).rejects.toThrow('Invalid URL format');
    });

    it('should handle missing required parameters', async () => {
      await expect(service.batchCrawl({ urls: [] })).rejects.toThrow('URLs array cannot be empty');
    });

    it('should handle oversized request payload', async () => {
      const error = new Error('Request Entity Too Large') as ErrorWithCode;
      error.response = { status: 413 };
      error.isAxiosError = true;
      error.message = 'Request Entity Too Large';
      (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error);

      const hugeScript = 'x'.repeat(10 * 1024 * 1024); // 10MB
      await expect(service.executeJS({ url: 'https://example.com', scripts: [hugeScript] })).rejects.toThrow(
        'Request failed with status 413: Request Entity Too Large',
      );
    });
  });

  describe('Partial Response Handling', () => {
    it('should handle successful response with partial data', async () => {
      (mockAxiosInstance.post as jest.Mock).mockResolvedValue({
        data: {
          results: [
            { success: true, url: 'https://example.com', markdown: 'Content' },
            { success: false, url: 'https://example.com/page2', error: 'Failed' },
          ],
        },
      });

      const result = await service.crawl({ urls: ['https://example.com', 'https://example.com/page2'] });
      expect(result.results).toHaveLength(2);
      expect(result.results[0].success).toBe(true);
      expect(result.results[1].success).toBe(false);
    });

    it('should handle response with missing optional fields', async () => {
      (mockAxiosInstance.post as jest.Mock).mockResolvedValue({
        data: {
          success: true,
          url: 'https://example.com',
          // Missing markdown field
        },
      });

      const result = await service.getMarkdown({ url: 'https://example.com' });
      expect(result.url).toBe('https://example.com');
      expect(result.markdown).toBeUndefined();
    });
  });
});
```
--------------------------------------------------------------------------------
/src/__tests__/handlers/parameter-combinations.test.ts:
--------------------------------------------------------------------------------

```typescript
import { jest } from '@jest/globals';
import { CrawlHandlers } from '../../handlers/crawl-handlers.js';
import { ContentHandlers } from '../../handlers/content-handlers.js';

type MockService = {
  crawl: jest.Mock;
  getMarkdown: jest.Mock;
  captureScreenshot: jest.Mock;
};

type MockAxiosClient = {
  post: jest.Mock;
  get: jest.Mock;
  head: jest.Mock;
};

describe('Optional Parameter Combinations', () => {
  let crawlHandlers: CrawlHandlers;
  let _contentHandlers: ContentHandlers;
  let mockService: MockService;
  let mockAxiosClient: MockAxiosClient;

  beforeEach(() => {
    jest.clearAllMocks();

    mockService = {
      crawl: jest.fn(),
      getMarkdown: jest.fn(),
      captureScreenshot: jest.fn(),
    };

    mockAxiosClient = {
      post: jest.fn(),
      get: jest.fn(),
      head: jest.fn(),
    };

    crawlHandlers = new CrawlHandlers(mockService, mockAxiosClient, new Map());
    _contentHandlers = new ContentHandlers(mockService, mockAxiosClient, new Map());
  });

  describe('Batch Crawl Parameter Combinations', () => {
    const testCases = [
      {
        name: 'default parameters only',
        options: { urls: ['https://example.com'] },
        expectedConfig: undefined,
      },
      {
        name: 'remove_images only',
        options: { urls: ['https://example.com'], remove_images: true },
        expectedConfig: { exclude_tags: ['img', 'picture', 'svg'] },
      },
      {
        name: 'bypass_cache only',
        options: { urls: ['https://example.com'], bypass_cache: true },
        expectedConfig: { cache_mode: 'BYPASS' },
      },
      {
        name: 'both remove_images and bypass_cache',
        options: { urls: ['https://example.com'], remove_images: true, bypass_cache: true },
        expectedConfig: { exclude_tags: ['img', 'picture', 'svg'], cache_mode: 'BYPASS' },
      },
      {
        name: 'with max_concurrent',
        options: { urls: ['https://example.com'], max_concurrent: 5, remove_images: true },
        expectedConfig: { exclude_tags: ['img', 'picture', 'svg'] },
      },
    ];

    testCases.forEach(({ name, options, expectedConfig }) => {
      it(`should handle ${name}`, async () => {
        mockAxiosClient.post.mockResolvedValue({
          data: { results: [{ success: true }] },
        });

        await crawlHandlers.batchCrawl(options);

        expect(mockAxiosClient.post).toHaveBeenCalledWith('/crawl', {
          urls: options.urls,
          max_concurrent: options.max_concurrent,
          crawler_config: expectedConfig,
        });
      });
    });
  });

  describe('Smart Crawl Parameter Combinations', () => {
    const testCases = [
      {
        name: 'minimal configuration',
        options: { url: 'https://example.com' },
        expectedCacheMode: 'ENABLED',
      },
      {
        name: 'with bypass_cache',
        options: { url: 'https://example.com', bypass_cache: true },
        expectedCacheMode: 'BYPASS',
      },
      {
        name: 'with max_depth',
        options: { url: 'https://example.com', max_depth: 5 },
        expectedCacheMode: 'ENABLED',
      },
      {
        name: 'with follow_links and bypass_cache',
        options: { url: 'https://example.com', follow_links: true, bypass_cache: true },
        expectedCacheMode: 'BYPASS',
      },
    ];

    testCases.forEach(({ name, options, expectedCacheMode }) => {
      it(`should handle ${name}`, async () => {
        mockAxiosClient.head.mockResolvedValue({ headers: { 'content-type': 'text/html' } });
        mockAxiosClient.post.mockResolvedValue({
          data: { results: [{ success: true, markdown: { raw_markdown: 'Content' } }] },
        });

        await crawlHandlers.smartCrawl(options);

        expect(mockAxiosClient.post).toHaveBeenCalledWith('/crawl', {
          urls: [options.url],
          crawler_config: {
            cache_mode: expectedCacheMode,
          },
          browser_config: {
            headless: true,
            browser_type: 'chromium',
          },
        });
      });
    });
  });

  describe('Crawl Parameter Combinations', () => {
    // Table-driven tests for various parameter combinations
    const parameterSets = [
      // Browser configuration combinations
      {
        name: 'browser type with viewport',
        params: {
          url: 'https://example.com',
          browser_type: 'firefox',
          viewport_width: 1920,
          viewport_height: 1080,
        },
      },
      {
        name: 'proxy with authentication',
        params: {
          url: 'https://example.com',
          proxy_server: 'http://proxy.example.com:8080',
          proxy_username: 'user',
          proxy_password: 'pass',
        },
      },
      {
        name: 'cookies and headers',
        params: {
          url: 'https://example.com',
          cookies: [{ name: 'session', value: '123', domain: '.example.com' }],
          headers: { 'X-Custom': 'value', Authorization: 'Bearer token' },
        },
      },
      // Content filtering combinations
      {
        name: 'content filtering options',
        params: {
          url: 'https://example.com',
          word_count_threshold: 100,
          excluded_tags: ['script', 'style'],
          remove_overlay_elements: true,
        },
      },
      {
        name: 'text-only with form removal',
        params: {
          url: 'https://example.com',
          only_text: true,
          remove_forms: true,
          keep_data_attributes: false,
        },
      },
      // JavaScript execution combinations
      {
        name: 'js_code with wait conditions',
        params: {
          url: 'https://example.com',
          js_code: ['document.querySelector("button").click()'],
          wait_for: '#result',
          wait_for_timeout: 5000,
        },
      },
      {
        name: 'js_only with session',
        params: {
          url: 'https://example.com',
          js_only: true,
          session_id: 'test-session-123',
        },
      },
      // Dynamic content handling
      {
        name: 'scrolling configuration',
        params: {
          url: 'https://example.com',
          delay_before_scroll: 2000,
          scroll_delay: 500,
          scan_full_page: true,
        },
      },
      {
        name: 'virtual scroll for infinite feeds',
        params: {
          url: 'https://example.com',
          virtual_scroll_config: {
            container_selector: '.feed',
            scroll_count: 10,
            scroll_by: 500,
            wait_after_scroll: 1000,
          },
        },
      },
      // Media handling combinations
      {
        name: 'screenshot with PDF',
        params: {
          url: 'https://example.com',
          screenshot: true,
          screenshot_wait_for: 3,
          pdf: true,
          capture_mhtml: true,
        },
      },
      {
        name: 'image filtering options',
        params: {
          url: 'https://example.com',
          image_description_min_word_threshold: 10,
          image_score_threshold: 0.5,
          exclude_external_images: true,
        },
      },
      // Link filtering combinations
      {
        name: 'link exclusion options',
        params: {
          url: 'https://example.com',
          exclude_social_media_links: true,
          exclude_domains: ['facebook.com', 'twitter.com'],
          exclude_external_links: true,
        },
      },
      // Page interaction combinations
      {
        name: 'stealth mode options',
        params: {
          url: 'https://example.com',
          simulate_user: true,
          override_navigator: true,
          magic: true,
          user_agent: 'Custom Bot 1.0',
        },
      },
      // Complex combinations
      {
        name: 'kitchen sink - many options',
        params: {
          url: 'https://example.com',
          browser_type: 'chromium',
          viewport_width: 1280,
          viewport_height: 720,
          word_count_threshold: 50,
          excluded_tags: ['nav', 'footer'],
          js_code: ['window.scrollTo(0, document.body.scrollHeight)'],
          wait_for: '.loaded',
          screenshot: true,
          exclude_external_links: true,
          session_id: 'complex-session',
          cache_mode: 'BYPASS',
          verbose: true,
        },
      },
    ];

    parameterSets.forEach(({ name, params }) => {
      it(`should correctly process ${name}`, async () => {
        mockService.crawl.mockResolvedValue({
          results: [
            {
              url: params.url,
              success: true,
              markdown: { raw_markdown: 'Test content' },
            },
          ],
        });

        const result = await crawlHandlers.crawl(params);

        // Verify the service was called
        expect(mockService.crawl).toHaveBeenCalled();

        // Verify response structure
        expect(result.content).toBeDefined();
        expect(result.content[0].type).toBe('text');
      });
    });

    // Test parameter validation
    it('should handle invalid parameter combinations', async () => {
      const invalidParams = {
        url: 'https://example.com',
        js_only: true, // Missing required session_id when js_only is true
      };

      await expect(crawlHandlers.crawl(invalidParams)).rejects.toThrow();
    });

    // Test default values
    it('should apply correct defaults when parameters are omitted', async () => {
      mockService.crawl.mockResolvedValue({
        results: [
          {
            url: 'https://example.com',
            success: true,
            markdown: { raw_markdown: 'Content' },
          },
        ],
      });

      await crawlHandlers.crawl({ url: 'https://example.com' });

      const call = mockService.crawl.mock.calls[0][0];

      // Check browser_config defaults
      expect(call.browser_config).toBeDefined();
      expect(call.browser_config.headless).toBe(true);

      // Check that optional configs are not included when not specified
      expect(call.crawler_config.word_count_threshold).toBeUndefined();
      expect(call.crawler_config.excluded_tags).toBeUndefined();
    });
  });

  describe('Parameter Priority and Conflicts', () => {
    it('should handle conflicting cache modes correctly', async () => {
      mockService.crawl.mockResolvedValue({
        results: [{ success: true, markdown: { raw_markdown: 'Content' } }],
      });

      // Test that explicit cache_mode takes precedence
      await crawlHandlers.crawl({
        url: 'https://example.com',
        cache_mode: 'DISABLED', // Even with other params that might suggest caching
        session_id: 'test-session',
      });

      const call = mockService.crawl.mock.calls[0][0];
      expect(call.crawler_config.cache_mode).toBe('DISABLED');
    });

    it('should handle mutually exclusive options', async () => {
      mockService.crawl.mockResolvedValue({
        results: [{ success: true, html: '<p>HTML</p>' }],
      });

      // only_text should override other content options
      await crawlHandlers.crawl({
        url: 'https://example.com',
        only_text: true,
        keep_data_attributes: true, // Should be ignored with only_text
      });

      const call = mockService.crawl.mock.calls[0][0];
      expect(call.crawler_config.only_text).toBe(true);
      expect(call.crawler_config.keep_data_attributes).toBe(true); // Still passed through
    });
  });

  describe('Edge Cases for Optional Parameters', () => {
    it('should handle empty arrays correctly', async () => {
      mockService.crawl.mockResolvedValue({
        results: [{ success: true, markdown: { raw_markdown: 'Content' } }],
      });

      await crawlHandlers.crawl({
        url: 'https://example.com',
        excluded_tags: [], // Empty array
        exclude_domains: [], // Empty array
        cookies: [], // Empty array
      });

      const call = mockService.crawl.mock.calls[0][0];
      expect(call.crawler_config.excluded_tags).toEqual([]);
      expect(call.crawler_config.exclude_domains).toEqual([]);
      expect(call.browser_config.cookies).toEqual([]);
    });

    it('should handle null vs undefined correctly', async () => {
      mockService.crawl.mockResolvedValue({
        results: [{ success: true, markdown: { raw_markdown: 'Content' } }],
      });

      // null js_code should throw error
      await expect(
        crawlHandlers.crawl({
          url: 'https://example.com',
          js_code: null as unknown as string[],
        }),
      ).rejects.toThrow('js_code parameter is null');

      // undefined js_code should be fine
      await crawlHandlers.crawl({
        url: 'https://example.com',
        js_code: undefined,
      });

      expect(mockService.crawl).toHaveBeenCalledTimes(1);
    });

    it('should handle boolean flags in all combinations', async () => {
      const booleanFlags = [
        'remove_overlay_elements',
        'process_iframes',
        'exclude_external_links',
        'screenshot',
        'pdf',
        'verbose',
        'log_console',
        'simulate_user',
        'override_navigator',
        'magic',
      ];

      // Test all flags as true
      const allTrue = booleanFlags.reduce((acc, flag) => ({ ...acc, [flag]: true }), {
        url: 'https://example.com',
      });

      mockService.crawl.mockResolvedValue({
        results: [{ success: true, markdown: { raw_markdown: 'Content' } }],
      });

      await crawlHandlers.crawl(allTrue);

      const call = mockService.crawl.mock.calls[0][0];
      booleanFlags.forEach((flag) => {
        const config = call.crawler_config[flag] || call.browser_config[flag];
        expect(config).toBe(true);
      });
    });
  });
});
```
--------------------------------------------------------------------------------
/src/__tests__/index.test.ts:
--------------------------------------------------------------------------------

```typescript
import { jest } from '@jest/globals';
import { z } from 'zod';

// Mock the MCP SDK
jest.mock('@modelcontextprotocol/sdk/server/index.js');
jest.mock('@modelcontextprotocol/sdk/server/stdio.js');

describe('MCP Server Validation', () => {
  describe('Stateless tool validation', () => {
    // Test the createStatelessSchema helper
    const createStatelessSchema = <T extends z.ZodTypeAny>(schema: T, toolName: string) => {
      // Tool-specific guidance for common scenarios
      const toolGuidance: Record<string, string> = {
        capture_screenshot: 'To capture screenshots with sessions, use crawl(session_id, screenshot: true)',
        generate_pdf: 'To generate PDFs with sessions, use crawl(session_id, pdf: true)',
        execute_js: 'To run JavaScript with sessions, use crawl(session_id, js_code: [...])',
        get_html: 'To get HTML with sessions, use crawl(session_id)',
        extract_with_llm: 'To extract data with sessions, first use crawl(session_id) then extract from the response',
      };

      const message = `${toolName} does not support session_id. This tool is stateless - each call creates a new browser. ${
        toolGuidance[toolName] || 'For persistent operations, use crawl with session_id.'
      }`;

      return z
        .object({
          session_id: z.never({ message }).optional(),
        })
        .passthrough()
        .and(schema)
        .transform((data) => {
          const { session_id, ...rest } = data as Record<string, unknown> & { session_id?: unknown };
          if (session_id !== undefined) {
            throw new Error(message);
          }
          return rest;
        });
    };

    it('should reject session_id for stateless tools', () => {
      const ExecuteJsSchema = createStatelessSchema(
        z.object({
          url: z.string().url(),
          js_code: z.union([z.string(), z.array(z.string())]),
        }),
        'execute_js',
      );

      // Should reject with session_id
      expect(() => {
        ExecuteJsSchema.parse({
          url: 'https://example.com',
          js_code: 'return document.title',
          session_id: 'test-session',
        });
      }).toThrow('execute_js does not support session_id');
    });

    it('should accept valid parameters without session_id', () => {
      const ExecuteJsSchema = createStatelessSchema(
        z.object({
          url: z.string().url(),
          js_code: z.union([z.string(), z.array(z.string())]),
        }),
        'execute_js',
      );

      const result = ExecuteJsSchema.parse({
        url: 'https://example.com',
        js_code: 'return document.title',
      });

      expect(result).toEqual({
        url: 'https://example.com',
        js_code: 'return document.title',
      });
    });

    it('should provide helpful error message when session_id is used', () => {
      const GetMarkdownSchema = createStatelessSchema(
        z.object({
          url: z.string().url(),
        }),
        'get_markdown',
      );

      try {
        GetMarkdownSchema.parse({
          url: 'https://example.com',
          session_id: 'my-session',
        });
      } catch (error) {
        expect(error).toBeInstanceOf(z.ZodError);
        const zodError = error as z.ZodError;
        expect(zodError.errors[0].message).toContain('get_markdown does not support session_id');
        expect(zodError.errors[0].message).toContain('For persistent operations, use crawl');
      }
    });

    it('should provide tool-specific guidance for common tools', () => {
      // Test capture_screenshot guidance
      const CaptureScreenshotSchema = createStatelessSchema(z.object({ url: z.string().url() }), 'capture_screenshot');
      try {
        CaptureScreenshotSchema.parse({ url: 'https://example.com', session_id: 'test' });
      } catch (error) {
        const zodError = error as z.ZodError;
        expect(zodError.errors[0].message).toContain('use crawl(session_id, screenshot: true)');
      }

      // Test generate_pdf guidance
      const GeneratePdfSchema = createStatelessSchema(z.object({ url: z.string().url() }), 'generate_pdf');
      try {
        GeneratePdfSchema.parse({ url: 'https://example.com', session_id: 'test' });
      } catch (error) {
        const zodError = error as z.ZodError;
        expect(zodError.errors[0].message).toContain('use crawl(session_id, pdf: true)');
      }

      // Test execute_js guidance
      const ExecuteJsSchema = createStatelessSchema(z.object({ url: z.string().url() }), 'execute_js');
      try {
        ExecuteJsSchema.parse({ url: 'https://example.com', session_id: 'test' });
      } catch (error) {
        const zodError = error as z.ZodError;
        expect(zodError.errors[0].message).toContain('use crawl(session_id, js_code: [...])');
      }
    });

    it('should validate all stateless tools', () => {
      const statelessTools = [
        'get_markdown',
        'capture_screenshot',
        'generate_pdf',
        'execute_js',
        'batch_crawl',
        'smart_crawl',
        'get_html',
        'extract_links',
        'crawl_recursive',
        'parse_sitemap',
        'extract_with_llm',
      ];

      statelessTools.forEach((toolName) => {
        const schema = createStatelessSchema(
          z.object({
            url: z.string().url(),
          }),
          toolName,
        );

        // Should reject session_id
        expect(() => {
          schema.parse({
            url: 'https://example.com',
            session_id: 'test',
          });
        }).toThrow(`${toolName} does not support session_id`);

        // Should accept without session_id
        const result = schema.parse({
          url: 'https://example.com',
        });
        expect(result).toEqual({
          url: 'https://example.com',
        });
      });
    });
  });

  describe('Extract links tool', () => {
    it('should validate extract_links parameters', () => {
      const ExtractLinksSchema = z.object({
        url: z.string().url(),
        categorize: z.boolean().optional().default(true),
      });

      // Valid input with categorize true
      const result1 = ExtractLinksSchema.parse({
        url: 'https://example.com',
        categorize: true,
      });
      expect(result1.categorize).toBe(true);

      // Valid input with categorize false
      const result2 = ExtractLinksSchema.parse({
        url: 'https://example.com',
        categorize: false,
      });
      expect(result2.categorize).toBe(false);

      // Default categorize should be true
      const result3 = ExtractLinksSchema.parse({
        url: 'https://example.com',
      });
      expect(result3.categorize).toBe(true);
    });
  });

  describe('Session management tools', () => {
    it('should validate create_session parameters', () => {
      const CreateSessionSchema = z.object({
        session_id: z.string(),
        initial_url: z.string().optional(),
        browser_type: z.string().optional(),
      });

      // Valid input
      const result = CreateSessionSchema.parse({
        session_id: 'my-session',
        initial_url: 'https://example.com',
      });
      expect(result.session_id).toBe('my-session');

      // Missing required session_id
      expect(() => {
        CreateSessionSchema.parse({
          initial_url: 'https://example.com',
        });
      }).toThrow();
    });

    it('should validate clear_session parameters', () => {
      const ClearSessionSchema = z.object({
        session_id: z.string(),
      });

      // Valid input
      const result = ClearSessionSchema.parse({
        session_id: 'my-session',
      });
      expect(result.session_id).toBe('my-session');

      // Missing required session_id
      expect(() => {
        ClearSessionSchema.parse({});
      }).toThrow();
    });
  });

  describe('crawl validation', () => {
    it('should accept session_id for crawl', () => {
      const CrawlWithConfigSchema = z.object({
        url: z.string().url(),
        session_id: z.string().optional(),
        js_code: z.union([z.string(), z.array(z.string())]).optional(),
      });

      const result = CrawlWithConfigSchema.parse({
        url: 'https://example.com',
        session_id: 'my-session',
        js_code: 'document.querySelector("button").click()',
      });

      expect(result.session_id).toBe('my-session');
    });

    it('should work without session_id', () => {
      const CrawlWithConfigSchema = z.object({
        url: z.string().url(),
        session_id: z.string().optional(),
      });

      const result = CrawlWithConfigSchema.parse({
        url: 'https://example.com',
      });

      expect(result.session_id).toBeUndefined();
    });

    it('should require js_only when using js_code with session_id WITHOUT output options', () => {
      // Create a schema that mirrors the real one's refinement
      const CrawlWithConfigSchema = z
        .object({
          url: z.string().url(),
          session_id: z.string().optional(),
          js_code: z.union([z.string(), z.array(z.string())]).optional(),
          js_only: z.boolean().optional(),
          screenshot: z.boolean().optional(),
          pdf: z.boolean().optional(),
        })
        .refine(
          (data) => {
            // Only require js_only when using js_code + session_id WITHOUT any output options
            if (data.js_code && data.session_id && !data.js_only && !data.screenshot && !data.pdf) {
              return false;
            }
            return true;
          },
          {
            message:
              'When using js_code with session_id WITHOUT screenshot or pdf, you MUST set js_only: true to prevent server errors. If you want screenshots/PDFs, you can omit js_only. Correct usage: crawl({url, session_id, js_code: [...], js_only: true})',
          },
        );

      // Should fail without js_only when no output options
      expect(() => {
        CrawlWithConfigSchema.parse({
          url: 'https://example.com',
          session_id: 'test-session',
          js_code: ['document.querySelector("button").click()'],
        });
      }).toThrow('When using js_code with session_id WITHOUT screenshot or pdf');

      // Should pass with js_only: true
      const result = CrawlWithConfigSchema.parse({
        url: 'https://example.com',
        session_id: 'test-session',
        js_code: ['document.querySelector("button").click()'],
        js_only: true,
      });
      expect(result.js_only).toBe(true);

      // Should pass with screenshot (no js_only required)
      const result2 = CrawlWithConfigSchema.parse({
        url: 'https://example.com',
        session_id: 'test-session',
        js_code: ['document.querySelector("button").click()'],
        screenshot: true,
      });
      expect(result2.screenshot).toBe(true);
      expect(result2.js_only).toBeUndefined();

      // Should pass with pdf (no js_only required)
      const result3 = CrawlWithConfigSchema.parse({
        url: 'https://example.com',
        session_id: 'test-session',
        js_code: ['document.querySelector("button").click()'],
        pdf: true,
      });
      expect(result3.pdf).toBe(true);
      expect(result3.js_only).toBeUndefined();

      // Should pass without js_code
      const result4 = CrawlWithConfigSchema.parse({
        url: 'https://example.com',
        session_id: 'test-session',
      });
      expect(result4.session_id).toBe('test-session');

      // Should pass without session_id
      const result5 = CrawlWithConfigSchema.parse({
        url: 'https://example.com',
        js_code: ['document.querySelector("button").click()'],
      });
      expect(result5.js_code).toBeDefined();
    });
  });

  describe('JavaScript code validation', () => {
    const validateJavaScriptCode = (code: string): boolean => {
      if (/&quot;|&amp;|&lt;|&gt;|&#\d+;|&\w+;/.test(code)) {
        return false;
      }
      if (/<(!DOCTYPE|html|body|head|script|style)\b/i.test(code)) {
        return false;
      }
      if (/[;})]\s*\\n|\\n\s*[{(/]/.test(code)) {
        return false;
      }
      if (/[;})]\s*\\n\s*\w/.test(code)) {
        return false;
      }
      return true;
    };

    const JsCodeSchema = z.union([
      z.string().refine(validateJavaScriptCode, {
        message:
          'Invalid JavaScript: Contains HTML entities (&quot;), literal \\n outside strings, or HTML tags. Use proper JS syntax with real quotes and newlines.',
      }),
      z.array(
        z.string().refine(validateJavaScriptCode, {
          message:
            'Invalid JavaScript: Contains HTML entities (&quot;), literal \\n outside strings, or HTML tags. Use proper JS syntax with real quotes and newlines.',
        }),
      ),
    ]);

    it('should reject JavaScript with HTML entities', () => {
      expect(() => {
        JsCodeSchema.parse('document.querySelector(&quot;button&quot;).click()');
      }).toThrow('Invalid JavaScript: Contains HTML entities');
    });

    it('should reject JavaScript with literal \\n between statements', () => {
      expect(() => {
        JsCodeSchema.parse('console.log("line1");\\nconsole.log("line2")');
      }).toThrow('Invalid JavaScript: Contains HTML entities');
    });

    it('should accept valid JavaScript with \\n inside strings', () => {
      const result = JsCodeSchema.parse('console.log("line1\\nline2")');
      expect(result).toBe('console.log("line1\\nline2")');
    });

    it('should accept valid multiline JavaScript', () => {
      const code = `// Comment
document.querySelector('button').click();
return true;`;
      const result = JsCodeSchema.parse(code);
      expect(result).toBe(code);
    });

    it('should validate arrays of JavaScript code', () => {
      // Invalid array
      expect(() => {
        JsCodeSchema.parse(['document.querySelector(&quot;input&quot;).value = &quot;test&quot;', 'form.submit()']);
      }).toThrow('Invalid JavaScript: Contains HTML entities');

      // Valid array
      const validArray = ['document.querySelector("input").value = "test"', 'form.submit()'];
      const result = JsCodeSchema.parse(validArray);
      expect(result).toEqual(validArray);
    });
  });
});
```
--------------------------------------------------------------------------------
/src/__tests__/handlers/crawl-handlers.test.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import { jest } from '@jest/globals';
import { AxiosError } from 'axios';
import type { CrawlHandlers as CrawlHandlersType } from '../../handlers/crawl-handlers.js';
import type { Crawl4AIService } from '../../crawl4ai-service.js';

// Mock the service
const mockCrawl = jest.fn();
const mockService = {
  crawl: mockCrawl,
} as unknown as Crawl4AIService;

// Mock axios client
const mockPost = jest.fn() as jest.Mock;
const mockHead = jest.fn() as jest.Mock;
const mockAxiosClient = {
  post: mockPost,
  head: mockHead,
} as any; // eslint-disable-line @typescript-eslint/no-explicit-any

// Mock axios for parseSitemap
const mockAxiosGet = jest.fn();
jest.unstable_mockModule('axios', () => ({
  default: {
    get: mockAxiosGet,
  },
  AxiosError,
}));

// Import after setting up mocks
const { CrawlHandlers: CrawlHandlersClass } = await import('../../handlers/crawl-handlers.js');

describe('CrawlHandlers', () => {
  let handler: CrawlHandlersType;
  let sessions: Map<string, any>; // eslint-disable-line @typescript-eslint/no-explicit-any

  beforeEach(() => {
    jest.clearAllMocks();
    sessions = new Map();
    handler = new CrawlHandlersClass(mockService, mockAxiosClient, sessions);
  });

  describe('batchCrawl', () => {
    it('should handle API errors gracefully', async () => {
      // Mock API error response
      (mockPost as jest.Mock).mockRejectedValue(
        new AxiosError('Request failed with status code 500', 'ERR_BAD_RESPONSE', undefined, undefined, {
          status: 500,
          statusText: 'Internal Server Error',
          data: 'Internal Server Error',
          headers: {},
          config: {} as any, // eslint-disable-line @typescript-eslint/no-explicit-any
        } as any), // eslint-disable-line @typescript-eslint/no-explicit-any
      );

      await expect(
        handler.batchCrawl({
          urls: ['not-a-valid-url', 'https://invalid-domain.com'],
          max_concurrent: 2,
        }),
      ).rejects.toThrow('Failed to batch crawl: Internal Server Error');
    });

    it('should support per-URL configs array', async () => {
      (mockPost as jest.Mock).mockResolvedValue({
        data: {
          results: [
            { url: 'https://example1.com', success: true, markdown: { raw_markdown: 'Test 1' } },
            { url: 'https://example2.com', success: true, markdown: { raw_markdown: 'Test 2' } },
          ],
        },
      });

      const result = await handler.batchCrawl({
        urls: ['https://example1.com', 'https://example2.com'],
        configs: [
          {
            url: 'https://example1.com',
            browser_config: { browser_type: 'chromium' },
            crawler_config: { screenshot: true },
          },
          {
            url: 'https://example2.com',
            browser_config: { browser_type: 'undetected' },
            crawler_config: { pdf: true },
            extraction_strategy: { provider: 'openai' },
          },
        ],
        max_concurrent: 2,
      });

      // Verify the configs array was passed through
      expect(mockPost).toHaveBeenCalledWith(
        '/crawl',
        expect.objectContaining({
          configs: expect.arrayContaining([
            expect.objectContaining({
              url: 'https://example1.com',
              browser_config: { browser_type: 'chromium' },
              crawler_config: { screenshot: true },
            }),
            expect.objectContaining({
              url: 'https://example2.com',
              browser_config: { browser_type: 'undetected' },
              crawler_config: { pdf: true },
              extraction_strategy: { provider: 'openai' },
            }),
          ]),
          max_concurrent: 2,
        }),
      );

      expect(result.content[0].text).toContain('Batch crawl completed');
    });
  });

  describe('smartCrawl', () => {
    it('should detect XML content type from HEAD request', async () => {
      // Mock HEAD response with XML content type
      (mockHead as jest.Mock).mockResolvedValue({
        headers: {
          'content-type': 'application/xml',
        },
      });

      // Mock crawl response
      (mockPost as jest.Mock).mockResolvedValue({
        data: {
          results: [
            {
              success: true,
              markdown: {
                raw_markdown: '<xml>Test content</xml>',
              },
            },
          ],
        },
      });

      const result = await handler.smartCrawl({
        url: 'https://example.com/data.xml',
      });

      expect(result.content[0].text).toContain('Smart crawl detected content type: sitemap');
      expect(result.content[0].text).toContain('<xml>Test content</xml>');
    });

    it('should handle HEAD request failure gracefully', async () => {
      // Mock HEAD request failure
      (mockHead as jest.Mock).mockRejectedValue(new Error('HEAD request failed'));

      // Mock successful crawl
      (mockPost as jest.Mock).mockResolvedValue({
        data: {
          results: [
            {
              success: true,
              markdown: {
                raw_markdown: 'Test content',
              },
            },
          ],
        },
      });

      const result = await handler.smartCrawl({
        url: 'https://example.com',
      });

      expect(result.content[0].text).toContain('Smart crawl detected content type: html');
    });

    it('should follow links from sitemap when follow_links is true', async () => {
      // Mock successful HEAD request
      (mockHead as jest.Mock).mockResolvedValue({
        headers: {
          'content-type': 'application/xml',
        },
      });

      // Mock initial crawl with sitemap content
      (mockPost as jest.Mock).mockResolvedValueOnce({
        data: {
          results: [
            {
              success: true,
              markdown: `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/page1</loc>
  </url>
  <url>
    <loc>https://example.com/page2</loc>
  </url>
</urlset>`,
            },
          ],
        },
      });

      // Mock follow-up crawl
      (mockPost as jest.Mock).mockResolvedValueOnce({
        data: {
          results: [{ success: true }, { success: true }],
        },
      });

      const result = await handler.smartCrawl({
        url: 'https://example.com/sitemap.xml',
        follow_links: true,
        max_depth: 2,
      });

      expect(result.content[0].text).toContain('Smart crawl detected content type: sitemap');
      expect(result.content[0].text).toContain('Followed 2 links:');
      expect(result.content[0].text).toContain('https://example.com/page1');
      expect(result.content[0].text).toContain('https://example.com/page2');
    });

    it('should handle smartCrawl API errors', async () => {
      (mockHead as jest.Mock).mockResolvedValue({ headers: {} });

      // Mock crawl to get empty results first, then error on follow-up
      (mockPost as jest.Mock).mockResolvedValueOnce({
        data: {
          results: [],
        },
      });

      const result = await handler.smartCrawl({
        url: 'https://example.com',
      });

      // With empty results, it should still return a response
      expect(result.content[0].text).toContain('Smart crawl detected content type: html');
      expect(result.content[0].text).toContain('No content extracted');
    });
  });

  describe('crawlRecursive', () => {
    it('should handle max_depth limit correctly', async () => {
      // Mock successful crawl with links
      (mockPost as jest.Mock).mockResolvedValueOnce({
        data: {
          results: [
            {
              success: true,
              markdown: {
                raw_markdown: 'Test content',
              },
              links: {
                internal: [{ href: 'https://example.com/page1' }, { href: 'https://example.com/page2' }],
                external: [],
              },
            },
          ],
        },
      });

      // Mock second crawl for page1
      (mockPost as jest.Mock).mockResolvedValueOnce({
        data: {
          results: [
            {
              success: true,
              markdown: {
                raw_markdown: 'Page 1 content',
              },
              links: {
                internal: [],
                external: [],
              },
            },
          ],
        },
      });

      // Mock third crawl for page2
      (mockPost as jest.Mock).mockResolvedValueOnce({
        data: {
          results: [
            {
              success: true,
              markdown: {
                raw_markdown: 'Page 2 content',
              },
              links: {
                internal: [],
                external: [],
              },
            },
          ],
        },
      });

      const result = await handler.crawlRecursive({
        url: 'https://example.com',
        max_depth: 1, // Should crawl initial URL and one level deep
      });

      expect(result.content[0].text).toContain('Pages crawled: 3'); // Initial + 2 pages at depth 1
      expect(result.content[0].text).toContain('Max depth reached: 1');
      expect(mockPost).toHaveBeenCalledTimes(3); // Initial crawl + two more
    });

    it('should handle invalid URLs in discovered links', async () => {
      // Mock crawl with invalid link
      (mockPost as jest.Mock).mockResolvedValue({
        data: {
          results: [
            {
              success: true,
              markdown: {
                raw_markdown: 'Test content',
              },
              links: {
                internal: [
                  { href: 'javascript:void(0)' }, // Invalid URL
                  { href: 'https://example.com/valid' }, // Valid URL
                ],
                external: [],
              },
            },
          ],
        },
      });

      const result = await handler.crawlRecursive({
        url: 'https://example.com',
        max_depth: 1,
      });

      // Should continue despite invalid URL
      expect(result.content[0].text).toContain('Pages crawled:');
    });

    it('should handle crawl failures during recursion', async () => {
      // First crawl succeeds
      (mockPost as jest.Mock).mockResolvedValueOnce({
        data: {
          results: [
            {
              success: true,
              markdown: {
                raw_markdown: 'Test content',
              },
              links: {
                internal: [{ href: 'https://example.com/page1' }],
                external: [],
              },
            },
          ],
        },
      });

      // Second crawl fails
      (mockPost as jest.Mock).mockRejectedValueOnce(new Error('Crawl failed'));

      const result = await handler.crawlRecursive({
        url: 'https://example.com',
        max_depth: 1,
      });

      // Should continue despite failure
      expect(result.content[0].text).toContain('Pages crawled: 1');
    });

    it('should handle crawlRecursive API errors', async () => {
      (mockPost as jest.Mock).mockRejectedValue(new Error('API Error'));

      const result = await handler.crawlRecursive({
        url: 'https://example.com',
      });

      // When the initial crawl fails, it should return a result with no pages crawled
      expect(result.content[0].text).toContain('Pages crawled: 0');
      expect(result.content[0].text).toContain('No pages could be crawled');
    });
  });

  describe('parseSitemap', () => {
    it('should handle network errors gracefully', async () => {
      // Mock ENOTFOUND error
      const error = new Error('getaddrinfo ENOTFOUND not-a-real-domain-12345.com');
      (error as { code?: string }).code = 'ENOTFOUND';
      mockAxiosGet.mockRejectedValue(error);

      await expect(
        handler.parseSitemap({
          url: 'https://not-a-real-domain-12345.com/sitemap.xml',
        }),
      ).rejects.toThrow('Failed to parse sitemap: getaddrinfo ENOTFOUND not-a-real-domain-12345.com');
    });
  });

  describe('crawl', () => {
    it('should handle word_count_threshold parameter', async () => {
      (mockCrawl as jest.Mock).mockResolvedValue({
        results: [
          {
            success: true,
            markdown: {
              raw_markdown: 'Test content',
            },
          },
        ],
      });

      const result = await handler.crawl({
        url: 'https://example.com',
        word_count_threshold: 100,
      });

      expect(mockCrawl).toHaveBeenCalledWith(
        expect.objectContaining({
          crawler_config: expect.objectContaining({
            word_count_threshold: 100,
          }),
        }),
      );
      expect(result.content[0].text).toBe('Test content');
    });

    it('should update session last_used time when using session_id', async () => {
      const sessionId = 'test-session';
      const session = {
        id: sessionId,
        created_at: new Date(),
        last_used: new Date('2025-08-01'),
      };
      sessions.set(sessionId, session);

      (mockCrawl as jest.Mock).mockResolvedValue({
        results: [
          {
            success: true,
            markdown: {
              raw_markdown: 'Test content',
            },
          },
        ],
      });

      await handler.crawl({
        url: 'https://example.com',
        session_id: sessionId,
      });

      const updatedSession = sessions.get(sessionId) as { last_used: Date };
      expect(updatedSession.last_used.getTime()).toBeGreaterThan(new Date('2025-08-01').getTime());
    });

    it('should handle image description parameters', async () => {
      (mockCrawl as jest.Mock).mockResolvedValue({
        results: [
          {
            success: true,
            markdown: {
              raw_markdown: 'Test content',
            },
          },
        ],
      });

      await handler.crawl({
        url: 'https://example.com',
        image_description_min_word_threshold: 10,
        image_score_threshold: 0.5,
      });

      expect(mockCrawl).toHaveBeenCalledWith(
        expect.objectContaining({
          crawler_config: expect.objectContaining({
            image_description_min_word_threshold: 10,
            image_score_threshold: 0.5,
          }),
        }),
      );
    });

    it('should handle exclude_social_media_links parameter', async () => {
      (mockCrawl as jest.Mock).mockResolvedValue({
        results: [
          {
            success: true,
            markdown: {
              raw_markdown: 'Test content',
            },
          },
        ],
      });

      await handler.crawl({
        url: 'https://example.com',
        exclude_social_media_links: true,
      });

      expect(mockCrawl).toHaveBeenCalledWith(
        expect.objectContaining({
          crawler_config: expect.objectContaining({
            exclude_social_media_links: true,
          }),
        }),
      );
    });

    it('should use extracted_content when available as string', async () => {
      (mockCrawl as jest.Mock).mockResolvedValue({
        results: [
          {
            success: true,
            extracted_content: 'Extracted text content',
          },
        ],
      });

      const result = await handler.crawl({
        url: 'https://example.com',
      });

      expect(result.content[0].text).toBe('Extracted text content');
    });

    it('should handle extracted_content as object', async () => {
      const extractedObj = { title: 'Test', body: 'Content' };
      (mockCrawl as jest.Mock).mockResolvedValue({
        results: [
          {
            success: true,
            extracted_content: extractedObj,
          },
        ],
      });

      const result = await handler.crawl({
        url: 'https://example.com',
      });

      expect(result.content[0].text).toBe(JSON.stringify(extractedObj, null, 2));
    });

    it('should fallback to html when markdown is not available', async () => {
      (mockCrawl as jest.Mock).mockResolvedValue({
        results: [
          {
            success: true,
            html: '<html><body>HTML content</body></html>',
          },
        ],
      });

      const result = await handler.crawl({
        url: 'https://example.com',
      });

      expect(result.content[0].text).toBe('<html><body>HTML content</body></html>');
    });

    it('should fallback to fit_html when neither markdown nor html is available', async () => {
      (mockCrawl as jest.Mock).mockResolvedValue({
        results: [
          {
            success: true,
            fit_html: '<div>Fit HTML
content</div>', }, ], }); const result = await handler.crawl({ url: 'https://example.com', }); expect(result.content[0].text).toBe('<div>Fit HTML content</div>'); }); it('should handle js_code as null error', async () => { await expect( handler.crawl({ url: 'https://example.com', js_code: null, }), ).rejects.toThrow( 'Failed to crawl: js_code parameter is null. Please provide JavaScript code as a string or array of strings.', ); }); }); }); ``` -------------------------------------------------------------------------------- /src/__tests__/crawl.test.ts: -------------------------------------------------------------------------------- ```typescript /* eslint-env jest */ import { jest } from '@jest/globals'; import type { AxiosResponse } from 'axios'; import type { MockAxiosInstance } from './types/mocks.js'; import type { Crawl4AIService as Crawl4AIServiceType } from '../crawl4ai-service.js'; // Manual mock for axios const mockAxios = { create: jest.fn(), }; jest.unstable_mockModule('axios', () => ({ default: mockAxios, })); // Import modules after mocking const { Crawl4AIService } = await import('../crawl4ai-service.js'); // Helper function to create a complete AxiosResponse object function createMockAxiosResponse<T>(data: T): AxiosResponse<T> { return { data, status: 200, statusText: 'OK', headers: {}, config: { url: '', method: 'post', headers: {}, }, } as AxiosResponse<T>; } describe('crawl parameter mapping', () => { let service: Crawl4AIServiceType; let mockAxiosInstance: MockAxiosInstance; beforeEach(() => { mockAxiosInstance = { post: jest.fn(), get: jest.fn(), head: jest.fn(), }; mockAxios.create.mockReturnValue(mockAxiosInstance); service = new Crawl4AIService('http://test.com', 'test-key'); }); afterEach(() => { jest.clearAllMocks(); }); describe('Browser configuration mapping', () => { it('should map all browser config parameters correctly', async () => { const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); await service.crawl({ url: 'https://example.com', browser_config: { browser_type: 'firefox', headless: true, viewport_width: 1920, viewport_height: 1080, user_agent: 'Custom User Agent', proxy_config: { server: 'http://proxy.com:8080', username: 'proxyuser', password: 'proxypass', }, cookies: [{ name: 'session', value: 'abc123', domain: '.example.com', path: '/' }], headers: { 'X-Custom-Header': 'value' }, extra_args: ['--disable-gpu'], }, }); expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { urls: ['https://example.com'], browser_config: { browser_type: 'firefox', headless: true, viewport_width: 1920, viewport_height: 1080, user_agent: 'Custom User Agent', proxy_config: { server: 'http://proxy.com:8080', username: 'proxyuser', password: 'proxypass', }, cookies: [{ name: 'session', value: 'abc123', domain: '.example.com', path: '/' }], headers: { 'X-Custom-Header': 'value' }, extra_args: ['--disable-gpu'], }, crawler_config: {}, }); }); it('should support undetected browser type', async () => { const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); await service.crawl({ url: 'https://example.com', browser_config: { browser_type: 'undetected', headless: true, }, }); expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { urls: ['https://example.com'], browser_config: { browser_type: 'undetected', headless: true, }, crawler_config: {}, }); }); it('should support unified proxy format 
(string)', async () => { const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); await service.crawl({ url: 'https://example.com', browser_config: { proxy: 'http://user:[email protected]:8080', }, }); expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { urls: ['https://example.com'], browser_config: { proxy: 'http://user:[email protected]:8080', }, crawler_config: {}, }); }); it('should support unified proxy format (object)', async () => { const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); await service.crawl({ url: 'https://example.com', browser_config: { proxy: { server: 'http://proxy.example.com:8080', username: 'user', password: 'pass', }, }, }); expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { urls: ['https://example.com'], browser_config: { proxy: { server: 'http://proxy.example.com:8080', username: 'user', password: 'pass', }, }, crawler_config: {}, }); }); }); describe('Crawler configuration mapping', () => { it('should map content filtering parameters', async () => { const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); await service.crawl({ url: 'https://example.com', crawler_config: { word_count_threshold: 150, excluded_tags: ['nav', 'footer', 'aside'], excluded_selector: '#ads, .popup', remove_overlay_elements: true, only_text: true, remove_forms: true, keep_data_attributes: true, }, }); expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { urls: ['https://example.com'], browser_config: undefined, crawler_config: { word_count_threshold: 150, excluded_tags: ['nav', 'footer', 'aside'], excluded_selector: '#ads, .popup', remove_overlay_elements: true, only_text: true, remove_forms: true, keep_data_attributes: true, }, }); }); it('should map JavaScript execution parameters', async () => { const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); await service.crawl({ url: 'https://example.com', crawler_config: { js_code: ['document.querySelector(".load-more").click()', 'window.scrollTo(0, 1000)'], js_only: true, wait_for: '.content-loaded', wait_for_timeout: 10000, }, }); expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { urls: ['https://example.com'], browser_config: undefined, crawler_config: { js_code: ['document.querySelector(".load-more").click()', 'window.scrollTo(0, 1000)'], js_only: true, wait_for: '.content-loaded', wait_for_timeout: 10000, }, }); }); it('should map page navigation and timing parameters', async () => { const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); await service.crawl({ url: 'https://example.com', crawler_config: { wait_until: 'networkidle', page_timeout: 45000, wait_for_images: true, ignore_body_visibility: false, scan_full_page: true, delay_before_scroll: 2000, scroll_delay: 1000, }, }); expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { urls: ['https://example.com'], browser_config: undefined, crawler_config: { wait_until: 'networkidle', page_timeout: 45000, wait_for_images: true, ignore_body_visibility: false, scan_full_page: true, delay_before_scroll: 2000, scroll_delay: 1000, }, }); }); it('should map media handling parameters', async () => { const 
mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); await service.crawl({ url: 'https://example.com', crawler_config: { screenshot: true, screenshot_wait_for: 2.5, pdf: true, capture_mhtml: true, image_description_min_word_threshold: 30, image_score_threshold: 5, exclude_external_images: true, }, }); expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { urls: ['https://example.com'], browser_config: undefined, crawler_config: { screenshot: true, screenshot_wait_for: 2.5, pdf: true, capture_mhtml: true, image_description_min_word_threshold: 30, image_score_threshold: 5, exclude_external_images: true, }, }); }); it('should map link filtering parameters', async () => { const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); await service.crawl({ url: 'https://example.com', crawler_config: { exclude_external_links: true, exclude_social_media_links: true, exclude_domains: ['ads.com', 'tracker.io', 'analytics.com'], }, }); expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { urls: ['https://example.com'], browser_config: undefined, crawler_config: { exclude_external_links: true, exclude_social_media_links: true, exclude_domains: ['ads.com', 'tracker.io', 'analytics.com'], }, }); }); it('should map page interaction parameters', async () => { const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); await service.crawl({ url: 'https://example.com', crawler_config: { simulate_user: true, override_navigator: true, magic: true, process_iframes: true, }, }); expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { urls: ['https://example.com'], browser_config: undefined, crawler_config: { simulate_user: true, override_navigator: true, magic: true, process_iframes: true, }, }); }); it('should map virtual scroll configuration', async () => { const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); await service.crawl({ url: 'https://example.com', crawler_config: { virtual_scroll_config: { container_selector: '#timeline', scroll_count: 20, scroll_by: 'container_height', wait_after_scroll: 1.5, }, }, }); expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { urls: ['https://example.com'], browser_config: undefined, crawler_config: { virtual_scroll_config: { container_selector: '#timeline', scroll_count: 20, scroll_by: 'container_height', wait_after_scroll: 1.5, }, }, }); }); // Note: Extraction strategies removed - not supported via REST API // Use extract_with_llm tool instead for structured data extraction it('should map session and cache parameters', async () => { const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); await service.crawl({ url: 'https://example.com', crawler_config: { session_id: 'test-session-123', cache_mode: 'DISABLED', }, }); expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { urls: ['https://example.com'], browser_config: undefined, crawler_config: { session_id: 'test-session-123', cache_mode: 'DISABLED', }, }); }); it('should map new crawler parameters', async () => { const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); 
await service.crawl({ url: 'https://example.com', crawler_config: { delay_before_return_html: 2000, css_selector: '.main-content', include_links: true, resolve_absolute_urls: true, }, }); expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { urls: ['https://example.com'], browser_config: undefined, crawler_config: { delay_before_return_html: 2000, css_selector: '.main-content', include_links: true, resolve_absolute_urls: true, }, }); }); it('should map performance and debug parameters', async () => { const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); await service.crawl({ url: 'https://example.com', crawler_config: { timeout: 90000, verbose: true, log_console: true, }, }); expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { urls: ['https://example.com'], browser_config: undefined, crawler_config: { timeout: 90000, verbose: true, log_console: true, }, }); }); }); describe('Extraction strategies', () => { it('should support extraction_strategy passthrough', async () => { const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); await service.crawl({ url: 'https://example.com', extraction_strategy: { provider: 'openai', api_key: 'sk-test', model: 'gpt-4', temperature: 0.7, }, }); expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { urls: ['https://example.com'], browser_config: undefined, crawler_config: {}, extraction_strategy: { provider: 'openai', api_key: 'sk-test', model: 'gpt-4', temperature: 0.7, }, }); }); it('should support table_extraction_strategy passthrough', async () => { const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); await service.crawl({ url: 'https://example.com', table_extraction_strategy: { enable_chunking: true, thresholds: { min_rows: 5, max_columns: 20, }, }, }); expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { urls: ['https://example.com'], browser_config: undefined, crawler_config: {}, table_extraction_strategy: { enable_chunking: true, thresholds: { min_rows: 5, max_columns: 20, }, }, }); }); it('should support markdown_generator_options passthrough', async () => { const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); await service.crawl({ url: 'https://example.com', markdown_generator_options: { include_links: true, preserve_formatting: true, }, }); expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { urls: ['https://example.com'], browser_config: undefined, crawler_config: {}, markdown_generator_options: { include_links: true, preserve_formatting: true, }, }); }); }); describe('Combined configurations', () => { it('should handle both browser and crawler configs together', async () => { const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); await service.crawl({ url: 'https://example.com', browser_config: { viewport_width: 1920, viewport_height: 1080, user_agent: 'Custom Bot', }, crawler_config: { word_count_threshold: 100, js_code: 'document.querySelector(".accept").click()', wait_for: '.content', screenshot: true, session_id: 'test-session', cache_mode: 'BYPASS', }, }); expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { urls: ['https://example.com'], 
browser_config: {
          viewport_width: 1920,
          viewport_height: 1080,
          user_agent: 'Custom Bot',
        },
        crawler_config: {
          word_count_threshold: 100,
          js_code: 'document.querySelector(".accept").click()',
          wait_for: '.content',
          screenshot: true,
          session_id: 'test-session',
          cache_mode: 'BYPASS',
        },
      });
    });
  });

  describe('Edge cases', () => {
    it('should handle undefined values correctly', async () => {
      const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] });
      mockAxiosInstance.post.mockResolvedValueOnce(mockResponse);

      await service.crawl({
        url: 'https://example.com',
        crawler_config: {
          word_count_threshold: 0, // Should be included (falsy but defined)
          excluded_tags: undefined, // Should not be included
          remove_overlay_elements: false, // Should be included
          only_text: undefined, // Should not be included
        },
      });

      expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', {
        urls: ['https://example.com'],
        browser_config: undefined,
        crawler_config: {
          word_count_threshold: 0,
          excluded_tags: undefined,
          remove_overlay_elements: false,
          only_text: undefined,
        },
      });
    });

    it('should handle empty arrays correctly', async () => {
      const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] });
      mockAxiosInstance.post.mockResolvedValueOnce(mockResponse);

      await service.crawl({
        url: 'https://example.com',
        crawler_config: {
          excluded_tags: [],
          exclude_domains: [],
        },
      });

      expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', {
        urls: ['https://example.com'],
        browser_config: undefined,
        crawler_config: {
          excluded_tags: [],
          exclude_domains: [],
        },
      });
    });
  });
});
```

--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------

```markdown
# Changelog

## Version 3.0.2 (2025-09-01)

### Bug Fixes

- Fixed manage_session tool schema compatibility with Claude/Anthropic tools
  - Removed oneOf/allOf/anyOf from top-level schema
  - Simplified to plain object schema with enum constraints
  - Maintains all functionality while improving MCP client compatibility

## Version 3.0.1 (2025-08-30)

### Documentation

- Updated README.md to accurately document all new parameters from v3.0.0
- Added documentation for batch_crawl configs array parameter
- Clarified proxy object format support
- Documented all new crawler parameters from Crawl4AI 0.7.3/0.7.4

## Version 3.0.0 (2025-08-30)

### Features

- Added full support for Crawl4AI 0.7.3/0.7.4 features:
  - **'undetected' browser type** - Stealth browser option for anti-bot detection
  - **New crawler parameters**:
    - `delay_before_return_html` - Delay before returning HTML content
    - `css_selector` - Filter content by CSS selector
    - `include_links` - Include extracted links in response
    - `resolve_absolute_urls` - Convert relative URLs to absolute
  - **Extraction strategies** - Support for LLM extraction, table extraction, and markdown generation options
  - **Multi-config batch crawling** - Per-URL configurations in batch_crawl
  - **Unified proxy format** - Support both string and object proxy configurations (both shapes sketched below)
  - **Memory metrics display** - Show server memory usage when available
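For illustration, here is a minimal sketch of the request shapes these features produce when POSTed to the `/crawl` endpoint. The field names mirror this repo's unit tests; the standalone axios client and base URL are assumptions for the example, not part of the published API.

```typescript
import axios from 'axios';

// Assumed local Crawl4AI server; adjust to your deployment.
const client = axios.create({ baseURL: 'http://localhost:11235' });

async function demo(): Promise<void> {
  // Unified proxy format: a single connection string...
  await client.post('/crawl', {
    urls: ['https://example.com'],
    browser_config: { proxy: 'http://user:pass@proxy.example.com:8080' },
    crawler_config: {},
  });

  // ...or an object with separate credentials.
  await client.post('/crawl', {
    urls: ['https://example.com'],
    browser_config: {
      proxy: { server: 'http://proxy.example.com:8080', username: 'user', password: 'pass' },
    },
    crawler_config: {},
  });

  // Multi-config batch crawling: per-URL configs plus the required urls field.
  await client.post('/crawl', {
    urls: ['https://example1.com', 'https://example2.com'],
    configs: [
      { url: 'https://example1.com', crawler_config: { screenshot: true } },
      { url: 'https://example2.com', browser_config: { browser_type: 'undetected' } },
    ],
    max_concurrent: 2,
  });
}

demo().catch(console.error);
```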
### Improvements

- Enhanced error formatting for better debugging
- Better handling of object error responses from API
- Fixed batch_crawl to include required `urls` field when using configs array

### Testing

- Added comprehensive integration tests for all new features
- Fixed TypeScript errors in test files
- All 306 unit tests passing
- All 150 integration tests passing

### Backward Compatibility

- Fully backward compatible with older Crawl4AI servers (before 0.7.4)
- All new features are optional and gracefully degrade

## Version 2.9.0 (2025-08-29)

### Breaking Changes

- Consolidated session management into single `manage_session` tool
  - Replaces `create_session`, `clear_session`, and `list_sessions` tools
  - Uses discriminated union with `action` parameter: 'create', 'clear', or 'list' (see the sketch after this section)
  - Reduces tool count from 15 to 13

### Removed

- Removed `create_session` tool (use `manage_session` with `action: 'create'`)
- Removed `clear_session` tool (use `manage_session` with `action: 'clear'`)
- Removed `list_sessions` tool (use `manage_session` with `action: 'list'`)

### Improvements

- Simplified API surface for better LLM interaction
- Improved type safety with discriminated unions
- Reduced code duplication in session management

### Testing

- Updated all tests to use new `manage_session` tool
- Maintained 100% test coverage
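For illustration, a minimal Zod sketch of the discriminated-union approach described above. The `action` literals come from this changelog entry; the per-variant fields are hypothetical, not the repo's exact schema.

```typescript
import { z } from 'zod';

// Hypothetical shape: one variant per action, discriminated by the `action` tag.
const ManageSessionSchema = z.discriminatedUnion('action', [
  z.object({
    action: z.literal('create'),
    session_id: z.string().optional(), // optional on create, as documented above
  }),
  z.object({
    action: z.literal('clear'),
    session_id: z.string(),
  }),
  z.object({
    action: z.literal('list'),
  }),
]);

type ManageSessionInput = z.infer<typeof ManageSessionSchema>;

// parse() both validates and narrows the payload by its `action` tag.
const input: ManageSessionInput = ManageSessionSchema.parse({ action: 'create' });
console.log(input.action); // 'create'
```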
## Version 2.7.1 (2025-08-30)

### Bug Fixes

- Fixed lint/formatting issues in test files
- Cleaned up trailing whitespace

## Version 2.7.0 (2025-08-30)

### Compatibility Updates

- Verified full compatibility with Crawl4AI version 0.7.4
- All 15 MCP tools tested and working
- 100% integration test pass rate (148 tests)
- Supports new v0.7.3/0.7.4 features including:
  - Undetected browser support with stealth mode
  - Multi-URL configuration system
  - Enhanced table extraction
  - Memory optimization improvements

### Bug Fixes

- Fixed unit test timeout issues in NPX and CLI tests
  - Added proper process cleanup and timeouts
  - Fixed edge case where dotenv was loading during tests
  - Ensured all spawned child processes are properly terminated

### Testing

- Comprehensive testing against Crawl4AI v0.7.4 Docker image
- All integration tests pass with LLM features enabled
- Unit test suite: 308 tests passing
- Integration test suite: 148 tests passing

## Version 2.6.12 (2025-08-05)

### Bug Fixes

- Fixed server startup issue when running via npx
  - Removed complex module detection logic that was preventing server startup
  - Server now always starts when the script is executed (as intended for MCP servers)
- Simplified dotenv loading to only attempt in development when env vars aren't set

## Version 2.6.11 (2025-08-05)

### Bug Fixes

- Fixed environment variable handling when running via npx
  - Only loads .env file if CRAWL4AI_BASE_URL is not already set
  - Prevents issues when env vars are passed via CLI/MCP configuration
  - Ensures package works correctly with Claude Desktop and other MCP clients

## Version 2.6.10 (2025-08-05)

### Bug Fixes

- Fixed unit tests to use correct localhost URL from jest.setup.cjs
- Fixed network error handling tests to not specify request body in nock mocks
- Unit tests always use http://localhost:11235 as configured
- Integration tests get URL from .env file

### Code Quality

- Replaced all 'any' type warnings with proper type assertions in tests
- All tests passing with zero lint warnings

## Version 2.6.9 (2025-08-05)

### Testing Improvements

- Improved crawl4ai-service.ts test coverage from 76% to 84%
- Added comprehensive network error handling tests
- Added URL validation tests for all service methods
- Added tests for optional parameter handling
- Added JavaScript validation edge case tests

### Code Quality

- All tests pass with zero lint errors
- Maintained 100% function coverage for service layer

## Version 2.6.8 (2025-08-05)

### Code Cleanup

- Removed unused mock generation system
- Cleaned up package.json scripts
- Simplified development workflow

### Chores

- Verified alignment between unit tests, integration tests, and implementation
- Confirmed all tests properly mock API interactions

## Version 2.6.7 (2025-08-05)

### Bug Fixes

- Fixed integration tests to use production Crawl4AI server from environment variables
- Fixed child process environment variable loading in test utilities
- Added support for both string and object markdown responses from Crawl4AI API
- Fixed timeout issues in MHTML capture and HTML extraction tests
- Replaced unreliable test URLs (httpbin.org) with stable alternatives
- Added 30-second timeout to session creation to prevent socket hang-ups

### Testing Improvements

- Integration tests now run sequentially (maxWorkers: 1) to avoid rate limiting
- Added proper working directory configuration for child processes
- Fixed all integration tests to pass with production API
- Maintained test coverage at 92.25% with all tests passing

## Version 2.6.6 (2025-08-05)

### Testing

- Improved test coverage from 88.8% to 93.19%
- Added comprehensive CLI entry point tests for signal handling, environment variables, and dotenv loading
- Added network failure tests for axios timeout and HTTP error scenarios
- Added input validation edge case tests for JavaScript code validation
- Added parameter combination tests for optional parameters and edge cases
- Improved branch coverage from 80.76% to 86.12%
- Improved function coverage from 96.41% to 98.92%

## Version 2.6.5 (2025-08-05)

### Features

- Enhanced screenshot handling for better compatibility (see the sketch after this section)
  - Added home directory (`~`) path resolution support
  - Large screenshots (>800KB) are now saved locally without being returned inline to avoid MCP's 1MB response limit
  - Clear indication when screenshots are too large to display inline

### Bug Fixes

- Improved screenshot directory handling
  - Better parameter descriptions clarifying that only directory paths should be provided
  - Added automatic handling when file paths are mistakenly provided instead of directories
  - Warning messages when incorrect path format is detected
  - Ensures compatibility with various LLM usage patterns
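A minimal sketch of the screenshot-path handling described in 2.6.5, mirroring the logic in `crawl-handlers.ts` later in this dump. The helper names are illustrative, not the repo's exports.

```typescript
import * as os from 'os';
import * as path from 'path';

// Resolve a leading `~` to the user's home directory.
function resolveScreenshotDir(dir: string): string {
  return dir.startsWith('~') ? path.join(os.homedir(), dir.slice(1)) : dir;
}

// Inline the image only when the decoded PNG is under ~800KB,
// keeping the full MCP response below the 1MB limit.
function shouldInline(base64Png: string): boolean {
  return Buffer.from(base64Png, 'base64').length < 800 * 1024;
}
```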
## Version 2.6.4 (2025-08-04)

### Features

- Added local screenshot storage support
  - capture_screenshot: New save_to_directory parameter saves screenshots locally while returning as MCP resource
  - crawl: New screenshot_directory parameter saves screenshots when screenshot=true
  - Automatic filename generation using URL hostname and timestamp
  - Creates directories if they don't exist
  - Graceful error handling - failures don't interrupt the crawl operation
- Added comprehensive unit tests for file saving functionality

## Version 2.6.3 (2025-08-04)

### Enhancements

- Improved tool descriptions for better LLM understanding and workflow clarity
  - Added [STATELESS], [SUPPORTS SESSIONS], [SESSION MANAGEMENT] indicators
  - Enhanced get_html description to emphasize selector discovery for automation
  - Added inspect-first workflow patterns to crawl tool description
  - Emphasized element verification in js_code parameter description
  - Added typical workflow guidance to create_session
  - Improved cross-references between related tools
  - Removed problematic one-shot form pattern that assumed element existence

### Bug Fixes

- Fixed crawl_recursive max_depth behavior (see the sketch after this section)
  - max_depth: 0 now correctly crawls only the initial page
  - Previously, max_depth: 0 would crawl pages at depth 0 and depth 1
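A minimal sketch of the corrected `max_depth` semantics: pages carry their depth through a queue, and `max_depth: 0` visits only the start URL. It mirrors the queue-based loop in `crawl-handlers.ts` later in this dump, but the function and its inputs are hypothetical, for illustration only.

```typescript
// links maps each URL to the internal links discovered on it.
function pagesToVisit(startUrl: string, links: Map<string, string[]>, maxDepth: number): string[] {
  const visited = new Set<string>();
  const queue: Array<{ url: string; depth: number }> = [{ url: startUrl, depth: 0 }];
  const out: string[] = [];

  while (queue.length > 0) {
    const { url, depth } = queue.shift()!;
    // Skip already-seen pages and anything beyond the depth limit.
    if (visited.has(url) || depth > maxDepth) continue;
    visited.add(url);
    out.push(url);
    for (const next of links.get(url) ?? []) {
      queue.push({ url: next, depth: depth + 1 });
    }
  }
  return out;
}

// With maxDepth = 0, only the start URL is returned.
console.log(pagesToVisit('https://example.com', new Map([['https://example.com', ['https://example.com/a']]]), 0));
```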
## Version 2.6.2 (2025-08-04)

### Refactoring

- Consolidated error handling in server.ts with validateAndExecute helper
  - Reduced ~90 lines of duplicate code
  - Preserved exact error message format for LLM compatibility
  - Improved maintainability while keeping behavior identical
  - Server.ts coverage improved from ~90% to 98.66%

## Version 2.6.1 (2025-08-04)

### Testing

- Improved crawl-handlers test coverage from 87% to 97%
- Added comprehensive unit tests for all crawl handler methods
  - Test error handling for batchCrawl, smartCrawl, crawlRecursive, parseSitemap
  - Cover edge cases including XML detection, URL validation, depth limits
- Added integration tests for real API behavior validation
  - Test all crawl parameters including word_count_threshold, image thresholds, exclude_social_media_links
  - Properly handle MCP error formatting vs direct handler throws

## Version 2.6.0 (2025-08-04)

### Testing

- Added comprehensive test coverage for error handling paths
  - Session creation with failed initial crawl
  - JavaScript execution error handling with accurate API response formats
  - Extract links manual extraction fallback when API returns empty links
- Improved coverage from 87.23% to 89.71% lines
- Added integration tests for crawl error handling
  - Invalid URL validation
  - Non-existent domain handling
- Added unit tests for utility handlers
  - Manual link extraction from markdown
  - Malformed URL handling
  - Empty results scenarios

### Improvements

- Better error resilience in session creation when initial crawl fails
- More accurate test mocks based on real API responses

## Version 2.5.0 (2025-08-04)

### Refactoring

- Removed backward compatibility exports from index.ts
- Updated test imports to use direct module paths
- Cleaned up index.ts to focus solely on CLI entry point

### Testing

- Updated jest.setup.cjs to load .env for integration tests
  - Unit tests continue using localhost:11235
  - Integration tests now use values from .env file

## Version 2.4.0 (2025-08-04)

### Features

- Replaced Codecov with GitHub Actions-based coverage badge
  - Coverage badge now uses GitHub Gist for storage
  - No external dependencies for coverage tracking
  - Badge updates automatically with each CI run
- Coverage reports published to GitHub Pages
  - Interactive HTML coverage report available at https://omgwtfwow.github.io/mcp-crawl4ai-ts/coverage/

### Bug Fixes

- Fixed smart_crawl implementation to remove unsupported 'strategy' parameter
- Fixed coverage extraction in CI to use lcov.info format
- Added proper URL encoding for Shields.io endpoint badge

### CI/CD Improvements

- Added GitHub Pages deployment for coverage reports
- Added write permissions for GitHub Actions to create gh-pages branch
- Removed Codecov integration completely

### Maintenance

- Removed .codecov.yml configuration file
- Removed CODECOV_TOKEN from repository secrets
- Updated README.md with new coverage badge

## Version 2.3.0 (2025-08-03)

### Refactoring

- Split large 2,366-line index.ts file into modular structure
  - Created handlers/ directory with operation-specific handlers
  - Created schemas/ directory for validation schemas
  - Reduced file sizes to under 1,000 lines each (most under 300)
- Maintained backward compatibility with all exports
- Improved code organization and maintainability

### Testing

- Updated tests to work with new modular structure
- Maintained test coverage at 87.23% (exceeds 86% requirement)
- All 165 unit tests passing

## Version 2.2.0 (2025-08-03)

### Features

- Added comprehensive test coverage infrastructure
  - Set up Jest code coverage with Istanbul
  - Added test:coverage and test:ci npm scripts
  - Configured coverage thresholds (80% for all metrics)
  - Added coverage badge to README
  - Achieved 86.51% line coverage, 82.21% statement coverage

### Testing Improvements

- Added comprehensive unit tests for all tool handlers in index.ts
  - Tests for success cases, error handling, and edge cases
  - Tests for MCP protocol request handling
  - Tests for parameter validation with Zod schemas
- Added unit tests for JavaScript validation function
- Added tests for private methods: parseSitemap and detectContentType
- Fixed integration test reliability issues:
  - Replaced example.com with httpbin.org in execute-js tests
  - Fixed test expectations for JavaScript execution results
  - Fixed MCP request handler test setup

### Bug Fixes

- Fixed parse_sitemap implementation to use axios.get directly instead of non-existent service method
- Fixed TypeScript 'any' warnings in test files (eliminated 90+ warnings)
- Fixed linting errors and formatting issues across the test suite
- Fixed test URL in batch-crawl test (httpbingo.org → httpbin.org)

### CI/CD Improvements

- Updated GitHub Actions workflow to include coverage reporting
- Added Node.js 22.x to the test matrix
- Fixed all failing CI tests

## Version 2.1.2 (2025-08-03)

### Documentation

- Updated Node.js requirement from 16+ to 18+ to reflect actual testing and support
  - Node.js 16 reached End-of-Life in September 2023
  - CI only tests on Node.js 18.x and 20.x
- Added `engines` field to package.json to enforce Node.js 18+ requirement

## Version 2.1.1 (2025-08-03)

### Bug Fixes

- Fixed GitHub homepage README display issue by renaming .github/README.md to CI.md
  - GitHub was showing the CI documentation instead of the main project README

## Version 2.1.0 (2025-08-03)

### Bug Fixes

- Fixed `smart_crawl` bug where markdown object was incorrectly printed as `[object Object]`
  - Now correctly accesses `result.markdown.raw_markdown` for content display
- Fixed integration test timeout issues:
  - Replaced example.com with httpbin.org/html in tests to avoid "domcontentloaded" timeout issues
  - Fixed httpbin.org URLs by adding proper path suffixes (e.g., /links/5/0)
  - Limited Jest parallelization for integration tests to prevent server overload
- Fixed parameter mapping in `get_markdown` tool - now correctly maps schema properties (`filter`, `query`, `cache`) to API parameters (`f`, `q`, `c`) (see the sketch after this list)
- Fixed `smart_crawl` schema to use `follow_links` parameter instead of `remove_images`
- Fixed `extract_links` schema mismatch - corrected schema to use `categorize` parameter as defined in tool
- Fixed `extract_links` implementation to properly handle link objects returned by API
- Fixed `crawl_recursive` schema mismatch - corrected schema to use `include_pattern` and `exclude_pattern` instead of `filter_pattern` and `bypass_cache`
- Fixed `crawl_recursive` implementation to use `/crawl` endpoint instead of `/md` for proper link extraction
- Fixed `crawl_recursive` type issues and improved link handling for recursive crawling
- Fixed `parse_sitemap` implementation to fetch sitemaps directly instead of through Crawl4AI server API
- Fixed `create_session` schema to make `session_id` optional as documented
- Enhanced `create_session` response to include all session parameters for programmatic access
- Implemented proper handling for non-functional server parameters:
  - `batch_crawl`: `remove_images` now uses `exclude_tags` in crawler_config to actually remove images
  - `smart_crawl`: `follow_links` now crawls URLs found in sitemaps/RSS feeds (max 10 URLs)
- Fixed `crawl` and `generate_pdf` tools PDF response to use proper MCP SDK embedded resource format with blob field
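A minimal sketch of the `get_markdown` parameter mapping described above (`filter`/`query`/`cache` → `f`/`q`/`c`). The option type and helper name are hypothetical, for illustration only.

```typescript
interface GetMarkdownOptions {
  filter?: string;
  query?: string;
  cache?: string;
}

// Translate schema-level names to the API's short parameter names.
function toApiParams(opts: GetMarkdownOptions): Record<string, string> {
  const params: Record<string, string> = {};
  if (opts.filter !== undefined) params.f = opts.filter;
  if (opts.query !== undefined) params.q = opts.query;
  if (opts.cache !== undefined) params.c = opts.cache;
  return params;
}

// { f: 'fit', q: 'pricing' }
console.log(toApiParams({ filter: 'fit', query: 'pricing' }));
```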
### Improvements

- Added comprehensive integration tests for `batch_crawl` tool (7 tests)
- Added comprehensive integration tests for `smart_crawl` tool (8 tests)
- Fixed all ESLint formatting issues across the codebase
- Enhanced error handling for empty URL arrays in batch_crawl
- Improved test reliability by replacing problematic test URLs
- Updated tool descriptions to accurately reflect actual behavior
- Added proper TypeScript types for getMarkdown function
- Enhanced test coverage for batch_crawl parameter handling
- Added comprehensive unit and integration tests for `extract_links` tool
- Improved JSON endpoint detection in `extract_links` tool
- Better error handling for `extract_links` with graceful error messages
- Added comprehensive integration tests for `crawl_recursive` tool
- Improved `crawl_recursive` output format to clearly show depth levels and internal link counts
- Enhanced error handling in `crawl_recursive` to continue crawling even if individual pages fail
- Added comprehensive integration tests for `parse_sitemap` tool with various test cases
- Added comprehensive integration tests for session management tools (`create_session`, `clear_session`, `list_sessions`)
- Enhanced integration tests for `extract_with_llm` tool to handle non-deterministic LLM responses
- Installed nock library for future HTTP mocking in unit tests
- Fixed TypeScript lint warnings by replacing `any` types with proper types:
  - Changed error handling to use proper type assertions
  - Updated `unknown[]` for JavaScript execution results
  - Used `Record<string, unknown>` for generic objects
  - Created `LinkItem` interface for better type safety
- Fixed all production code `any` types
- Removed unused legacy `CrawlResult` interface
- Consolidated unit tests to use nock for HTTP mocking:
  - Removed redundant Jest mock test file
  - Removed unused mocks directory
  - Renamed test file for clarity
- Improved unit test performance from 92s to ~1s by removing timeout tests
- Cleaned up test organization and removed test README
- Added GitHub Actions CI workflow:
  - Automatic testing on push to main and pull requests
  - Tests run on Node.js 18.x and 20.x
  - Includes linting, formatting checks, and build verification
- Added mock helper scripts:
  - `npm run generate-mocks`: Generate nock mock code from real API
  - `npm run view-mocks`: View and save API responses for reference
  - Both scripts help maintain accurate test mocks

## Version 2.0.1 (2025-08-02)

Update README

## Version 2.0.0 (2025-08-02)

### Breaking Changes

- Renamed `crawl_with_config` tool to `crawl`

### New Features

- Added comprehensive response types for all endpoints (PDF, screenshot, HTML, markdown)
- Enhanced parameter validation with clearer error messages
- Improved documentation for JavaScript execution patterns
- Added selector strategy guidance for form interaction
- Better distinction between `wait_for` and `wait_until` usage

### Bug Fixes

- Fixed server 500 errors by always including `crawler_config` in requests
- Updated media and links types to match actual server responses
- Corrected validation for `js_only` parameter usage

### Documentation

- Added troubleshooting section with common issues and solutions
- Included practical examples for form filling and multi-step navigation
- Enhanced tool descriptions with clear warnings and recommendations
- Added selector strategy guide for working with dynamic content

### Technical Improvements

- Updated all TypeScript types based on actual server
responses - Improved error handling and user-friendly messages - Enhanced Zod validation schemas with helpful refinements - Added comprehensive integration tests for new features ### Known Issues - `js_only: true` causes server serialization errors - use `screenshot: true` as workaround - Using `wait_for` with elements that already exist can cause timeouts - use `wait_until` instead ## Version 1.0.2 - Initial stable release with full MCP implementation - Support for all Crawl4AI endpoints - Basic session management - Integration with MCP clients ``` -------------------------------------------------------------------------------- /src/handlers/crawl-handlers.ts: -------------------------------------------------------------------------------- ```typescript import { BaseHandler } from './base-handler.js'; import { BatchCrawlOptions, CrawlResultItem, AdvancedCrawlConfig, CrawlEndpointResponse, ExtractionStrategy, TableExtractionStrategy, MarkdownGeneratorOptions, } from '../types.js'; import * as fs from 'fs/promises'; import * as path from 'path'; import * as os from 'os'; export class CrawlHandlers extends BaseHandler { async batchCrawl(options: BatchCrawlOptions) { try { let response; // Check if we have per-URL configs (new in 0.7.3/0.7.4) if (options.configs && options.configs.length > 0) { // Use the new configs array format // Extract URLs from configs for the urls field const urls = options.configs.map((config) => config.url); const requestBody = { urls: urls, configs: options.configs, max_concurrent: options.max_concurrent, }; response = await this.axiosClient.post('/crawl', requestBody); } else { // Use the legacy format with single crawler_config // Build crawler config if needed const crawler_config: Record<string, unknown> = {}; // Handle remove_images by using exclude_tags if (options.remove_images) { crawler_config.exclude_tags = ['img', 'picture', 'svg']; } if (options.bypass_cache) { crawler_config.cache_mode = 'BYPASS'; } response = await this.axiosClient.post('/crawl', { urls: options.urls, max_concurrent: options.max_concurrent, crawler_config: Object.keys(crawler_config).length > 0 ? crawler_config : undefined, }); } const results = response.data.results || []; // Add memory metrics if available let metricsText = ''; const responseData = response.data as CrawlEndpointResponse; if (responseData.server_memory_delta_mb !== undefined || responseData.server_peak_memory_mb !== undefined) { const memoryInfo = []; if (responseData.server_processing_time_s !== undefined) { memoryInfo.push(`Processing time: ${responseData.server_processing_time_s.toFixed(2)}s`); } if (responseData.server_memory_delta_mb !== undefined) { memoryInfo.push(`Memory delta: ${responseData.server_memory_delta_mb.toFixed(1)}MB`); } if (responseData.server_peak_memory_mb !== undefined) { memoryInfo.push(`Peak memory: ${responseData.server_peak_memory_mb.toFixed(1)}MB`); } if (memoryInfo.length > 0) { metricsText = `\n\nServer metrics: ${memoryInfo.join(', ')}`; } } return { content: [ { type: 'text', text: `Batch crawl completed. Processed ${results.length} URLs:\n\n${results .map( (r: CrawlResultItem, i: number) => `${i + 1}. ${options.urls[i]}: ${r.success ? 
'Success' : 'Failed'}`, ) .join('\n')}${metricsText}`, }, ], }; } catch (error) { throw this.formatError(error, 'batch crawl'); } } async smartCrawl(options: { url: string; max_depth?: number; follow_links?: boolean; bypass_cache?: boolean }) { try { // First, try to detect the content type from URL or HEAD request let contentType = ''; try { const headResponse = await this.axiosClient.head(options.url); contentType = headResponse.headers['content-type'] || ''; } catch { // If HEAD request fails, continue anyway - we'll detect from the crawl response console.debug('HEAD request failed, will detect content type from response'); } let detectedType = 'html'; if (options.url.includes('sitemap') || options.url.endsWith('.xml')) { detectedType = 'sitemap'; } else if (options.url.includes('rss') || options.url.includes('feed')) { detectedType = 'rss'; } else if (contentType.includes('text/plain') || options.url.endsWith('.txt')) { detectedType = 'text'; } else if (contentType.includes('application/xml') || contentType.includes('text/xml')) { detectedType = 'xml'; } else if (contentType.includes('application/json')) { detectedType = 'json'; } // Crawl without the unsupported 'strategy' parameter const response = await this.axiosClient.post('/crawl', { urls: [options.url], crawler_config: { cache_mode: options.bypass_cache ? 'BYPASS' : 'ENABLED', }, browser_config: { headless: true, browser_type: 'chromium', }, }); const results = response.data.results || []; const result = results[0] || {}; // Handle follow_links for sitemaps and RSS feeds if (options.follow_links && (detectedType === 'sitemap' || detectedType === 'rss' || detectedType === 'xml')) { // Extract URLs from the content const urlPattern = /<loc>(.*?)<\/loc>|<link[^>]*>(.*?)<\/link>|href=["']([^"']+)["']/gi; const content = result.markdown || result.html || ''; const foundUrls: string[] = []; let match; while ((match = urlPattern.exec(content)) !== null) { const url = match[1] || match[2] || match[3]; if (url && url.startsWith('http')) { foundUrls.push(url); } } if (foundUrls.length > 0) { // Limit to first 10 URLs to avoid overwhelming the system const urlsToFollow = foundUrls.slice(0, Math.min(10, options.max_depth || 10)); // Crawl the found URLs await this.axiosClient.post('/crawl', { urls: urlsToFollow, max_concurrent: 3, bypass_cache: options.bypass_cache, }); return { content: [ { type: 'text', text: `Smart crawl detected content type: ${detectedType}\n\nMain content:\n${result.markdown?.raw_markdown || result.html || 'No content extracted'}\n\n---\nFollowed ${urlsToFollow.length} links:\n${urlsToFollow.map((url, i) => `${i + 1}. ${url}`).join('\n')}`, }, ...(result.metadata ? [ { type: 'text', text: `\n\n---\nMetadata:\n${JSON.stringify(result.metadata, null, 2)}`, }, ] : []), ], }; } } return { content: [ { type: 'text', text: `Smart crawl detected content type: ${detectedType}\n\n${result.markdown?.raw_markdown || result.html || 'No content extracted'}`, }, ...(result.metadata ? 
[ { type: 'text', text: `\n\n---\nMetadata:\n${JSON.stringify(result.metadata, null, 2)}`, }, ] : []), ], }; } catch (error) { throw this.formatError(error, 'smart crawl'); } } async crawlRecursive(options: { url: string; max_depth?: number; max_pages?: number; include_pattern?: string; exclude_pattern?: string; }) { try { const startUrl = new URL(options.url); const visited = new Set<string>(); const toVisit: Array<{ url: string; depth: number }> = [{ url: options.url, depth: 0 }]; const results: Array<{ url: string; content: string; internal_links_found: number; depth: number }> = []; let maxDepthReached = 0; const includeRegex = options.include_pattern ? new RegExp(options.include_pattern) : null; const excludeRegex = options.exclude_pattern ? new RegExp(options.exclude_pattern) : null; const maxDepth = options.max_depth !== undefined ? options.max_depth : 3; const maxPages = options.max_pages || 50; while (toVisit.length > 0 && results.length < maxPages) { const current = toVisit.shift(); if (!current || visited.has(current.url) || current.depth > maxDepth) { continue; } visited.add(current.url); try { // Check URL patterns if (excludeRegex && excludeRegex.test(current.url)) continue; if (includeRegex && !includeRegex.test(current.url)) continue; // Crawl the page using the crawl endpoint to get links const response = await this.axiosClient.post('/crawl', { urls: [current.url], crawler_config: { cache_mode: 'BYPASS', }, }); const crawlResults = response.data.results || [response.data]; const result: CrawlResultItem = crawlResults[0]; if (result && result.success) { const markdownContent = result.markdown?.fit_markdown || result.markdown?.raw_markdown || ''; const internalLinksCount = result.links?.internal?.length || 0; maxDepthReached = Math.max(maxDepthReached, current.depth); results.push({ url: current.url, content: markdownContent, internal_links_found: internalLinksCount, depth: current.depth, }); // Add internal links to crawl queue if (current.depth < maxDepth && result.links?.internal) { for (const linkObj of result.links.internal) { const linkUrl = linkObj.href || linkObj; try { const absoluteUrl = new URL(linkUrl, current.url).toString(); if (!visited.has(absoluteUrl) && new URL(absoluteUrl).hostname === startUrl.hostname) { toVisit.push({ url: absoluteUrl, depth: current.depth + 1 }); } } catch (e) { // Skip invalid URLs console.debug('Invalid URL:', e); } } } } } catch (error) { // Log but continue crawling other pages console.error(`Failed to crawl ${current.url}:`, error instanceof Error ? error.message : error); } } // Prepare the output text let outputText = `Recursive crawl completed:\n\nPages crawled: ${results.length}\nStarting URL: ${options.url}\n`; if (results.length > 0) { outputText += `Max depth reached: ${maxDepthReached} (limit: ${maxDepth})\n\nNote: Only internal links (same domain) are followed during recursive crawling.\n\nPages found:\n${results.map((r) => `- [Depth ${r.depth}] ${r.url}\n Content: ${r.content.length} chars\n Internal links found: ${r.internal_links_found}`).join('\n')}`; } else { outputText += `\nNo pages could be crawled. 
This might be due to:\n- The starting URL returned an error\n- No internal links were found\n- All discovered links were filtered out by include/exclude patterns`; } return { content: [ { type: 'text', text: outputText, }, ], }; } catch (error) { throw this.formatError(error, 'crawl recursively'); } } async parseSitemap(options: { url: string; filter_pattern?: string }) { try { // Fetch the sitemap directly (not through Crawl4AI server) const axios = (await import('axios')).default; const response = await axios.get(options.url, { timeout: 30000, headers: { 'User-Agent': 'Mozilla/5.0 (compatible; MCP-Crawl4AI/1.0)', }, }); const sitemapContent = response.data; // Parse XML content - simple regex approach for basic sitemaps const urlMatches = sitemapContent.match(/<loc>(.*?)<\/loc>/g) || []; const urls = urlMatches.map((match: string) => match.replace(/<\/?loc>/g, '')); // Apply filter if provided let filteredUrls = urls; if (options.filter_pattern) { const filterRegex = new RegExp(options.filter_pattern); filteredUrls = urls.filter((url: string) => filterRegex.test(url)); } return { content: [ { type: 'text', text: `Sitemap parsed successfully:\n\nTotal URLs found: ${urls.length}\nFiltered URLs: ${filteredUrls.length}\n\nURLs:\n${filteredUrls.slice(0, 100).join('\n')}${filteredUrls.length > 100 ? '\n... and ' + (filteredUrls.length - 100) + ' more' : ''}`, }, ], }; } catch (error) { throw this.formatError(error, 'parse sitemap'); } } async crawl(options: Record<string, unknown>) { try { // Ensure options is an object if (!options || typeof options !== 'object') { throw new Error('crawl requires options object with at least a url parameter'); } // Build browser_config const browser_config: Record<string, unknown> = { headless: true, // Always true as noted }; if (options.browser_type) browser_config.browser_type = options.browser_type; if (options.viewport_width) browser_config.viewport_width = options.viewport_width; if (options.viewport_height) browser_config.viewport_height = options.viewport_height; if (options.user_agent) browser_config.user_agent = options.user_agent; if (options.headers) browser_config.headers = options.headers; if (options.cookies) browser_config.cookies = options.cookies; // Handle proxy configuration - support both unified and legacy formats if (options.proxy) { // New unified format (0.7.3/0.7.4) browser_config.proxy = options.proxy; } else if (options.proxy_server) { // Legacy format for backward compatibility browser_config.proxy_config = { server: options.proxy_server, username: options.proxy_username, password: options.proxy_password, }; } // Build crawler_config const crawler_config: Record<string, unknown> = {}; // Content filtering if (options.word_count_threshold !== undefined) crawler_config.word_count_threshold = options.word_count_threshold; if (options.excluded_tags) crawler_config.excluded_tags = options.excluded_tags; if (options.remove_overlay_elements) crawler_config.remove_overlay_elements = options.remove_overlay_elements; // JavaScript execution if (options.js_code !== undefined && options.js_code !== null) { // If js_code is an array, join it with newlines for the server crawler_config.js_code = Array.isArray(options.js_code) ? options.js_code.join('\n') : options.js_code; } else if (options.js_code === null) { // If js_code is explicitly null, throw a helpful error throw new Error('js_code parameter is null. 
Please provide JavaScript code as a string or array of strings.'); } if (options.wait_for) crawler_config.wait_for = options.wait_for; if (options.wait_for_timeout) crawler_config.wait_for_timeout = options.wait_for_timeout; // Dynamic content if (options.delay_before_scroll) crawler_config.delay_before_scroll = options.delay_before_scroll; if (options.scroll_delay) crawler_config.scroll_delay = options.scroll_delay; // Content processing if (options.process_iframes) crawler_config.process_iframes = options.process_iframes; if (options.exclude_external_links) crawler_config.exclude_external_links = options.exclude_external_links; // Export options if (options.screenshot) crawler_config.screenshot = options.screenshot; if (options.pdf) crawler_config.pdf = options.pdf; // Session and cache if (options.session_id) { crawler_config.session_id = options.session_id; // Update session last_used time const session = this.sessions.get(String(options.session_id)); if (session) { session.last_used = new Date(); } } if (options.cache_mode) crawler_config.cache_mode = String(options.cache_mode).toLowerCase(); // Performance if (options.timeout) crawler_config.timeout = options.timeout; if (options.verbose) crawler_config.verbose = options.verbose; // Additional crawler parameters if (options.wait_until) crawler_config.wait_until = options.wait_until; if (options.page_timeout) crawler_config.page_timeout = options.page_timeout; if (options.wait_for_images) crawler_config.wait_for_images = options.wait_for_images; if (options.ignore_body_visibility) crawler_config.ignore_body_visibility = options.ignore_body_visibility; if (options.scan_full_page) crawler_config.scan_full_page = options.scan_full_page; if (options.remove_forms) crawler_config.remove_forms = options.remove_forms; if (options.keep_data_attributes) crawler_config.keep_data_attributes = options.keep_data_attributes; if (options.excluded_selector) crawler_config.excluded_selector = options.excluded_selector; if (options.only_text) crawler_config.only_text = options.only_text; // Media handling if (options.image_description_min_word_threshold !== undefined) crawler_config.image_description_min_word_threshold = options.image_description_min_word_threshold; if (options.image_score_threshold !== undefined) crawler_config.image_score_threshold = options.image_score_threshold; if (options.exclude_external_images) crawler_config.exclude_external_images = options.exclude_external_images; if (options.screenshot_wait_for !== undefined) crawler_config.screenshot_wait_for = options.screenshot_wait_for; // Link filtering if (options.exclude_social_media_links) crawler_config.exclude_social_media_links = options.exclude_social_media_links; if (options.exclude_domains) crawler_config.exclude_domains = options.exclude_domains; // Page interaction if (options.js_only) crawler_config.js_only = options.js_only; if (options.simulate_user) crawler_config.simulate_user = options.simulate_user; if (options.override_navigator) crawler_config.override_navigator = options.override_navigator; if (options.magic) crawler_config.magic = options.magic; // Virtual scroll if (options.virtual_scroll_config) crawler_config.virtual_scroll_config = options.virtual_scroll_config; // Cache control if (options.cache_mode) crawler_config.cache_mode = options.cache_mode; // Other if (options.log_console) crawler_config.log_console = options.log_console; if (options.capture_mhtml) crawler_config.capture_mhtml = options.capture_mhtml; // New parameters from 0.7.3/0.7.4 if 
      // New parameters from 0.7.3/0.7.4
      if (options.delay_before_return_html)
        crawler_config.delay_before_return_html = options.delay_before_return_html;
      if (options.css_selector) crawler_config.css_selector = options.css_selector;
      if (options.include_links !== undefined) crawler_config.include_links = options.include_links;
      if (options.resolve_absolute_urls !== undefined)
        crawler_config.resolve_absolute_urls = options.resolve_absolute_urls;

      // Call service with proper configuration
      const crawlConfig: AdvancedCrawlConfig = {
        url: options.url ? String(options.url) : undefined,
        crawler_config,
      };

      // Add extraction strategy passthrough objects if provided
      if (options.extraction_strategy)
        crawlConfig.extraction_strategy = options.extraction_strategy as ExtractionStrategy;
      if (options.table_extraction_strategy)
        crawlConfig.table_extraction_strategy = options.table_extraction_strategy as TableExtractionStrategy;
      if (options.markdown_generator_options)
        crawlConfig.markdown_generator_options = options.markdown_generator_options as MarkdownGeneratorOptions;

      // Only include browser_config if we're not using a session
      if (!options.session_id) {
        crawlConfig.browser_config = browser_config;
      }

      const response: CrawlEndpointResponse = await this.service.crawl(crawlConfig);

      // Validate response structure
      if (!response || !response.results || response.results.length === 0) {
        throw new Error('Invalid response from server: no results received');
      }

      const result: CrawlResultItem = response.results[0];

      // Build response content
      const content = [];

      // Main content - use markdown.raw_markdown as primary content
      let mainContent = 'No content extracted';
      if (result.extracted_content) {
        // Handle extraction results which might be objects or strings
        if (typeof result.extracted_content === 'string') {
          mainContent = result.extracted_content;
        } else if (typeof result.extracted_content === 'object') {
          mainContent = JSON.stringify(result.extracted_content, null, 2);
        }
      } else if (result.markdown?.raw_markdown) {
        mainContent = result.markdown.raw_markdown;
      } else if (result.html) {
        mainContent = result.html;
      } else if (result.fit_html) {
        mainContent = result.fit_html;
      }

      content.push({
        type: 'text',
        text: mainContent,
      });

      // Screenshot if available
      if (result.screenshot) {
        // Save to local directory if requested
        let savedFilePath: string | undefined;
        if (options.screenshot_directory && typeof options.screenshot_directory === 'string') {
          try {
            // Resolve home directory path
            let screenshotDir = options.screenshot_directory;
            if (screenshotDir.startsWith('~')) {
              const homedir = os.homedir();
              screenshotDir = path.join(homedir, screenshotDir.slice(1));
            }

            // Check if user provided a file path instead of directory
            if (screenshotDir.endsWith('.png') || screenshotDir.endsWith('.jpg')) {
              console.warn(
                `Warning: screenshot_directory should be a directory path, not a file path. Using parent directory.`,
              );
              screenshotDir = path.dirname(screenshotDir);
            }
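
            // Example (hypothetical value): '/tmp/shot.png' trips the check above and is
            // reduced to path.dirname('/tmp/shot.png') === '/tmp'.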
            // Ensure directory exists
            await fs.mkdir(screenshotDir, { recursive: true });

            // Generate filename from URL and timestamp
            const url = new URL(String(options.url));
            const hostname = url.hostname.replace(/[^a-z0-9]/gi, '-');
            const timestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, -5);
            const filename = `${hostname}-${timestamp}.png`;
            savedFilePath = path.join(screenshotDir, filename);

            // Convert base64 to buffer and save
            const buffer = Buffer.from(result.screenshot, 'base64');
            await fs.writeFile(savedFilePath, buffer);
          } catch (saveError) {
            // Log error but don't fail the operation
            console.error('Failed to save screenshot locally:', saveError);
          }
        }

        // If saved locally and screenshot is large (>800KB), don't return the base64 data
        const screenshotSize = Buffer.from(result.screenshot, 'base64').length;
        const shouldReturnImage = !savedFilePath || screenshotSize < 800 * 1024; // 800KB threshold

        if (shouldReturnImage) {
          content.push({
            type: 'image',
            data: result.screenshot,
            mimeType: 'image/png',
          });
        }

        if (savedFilePath) {
          const sizeInfo = !shouldReturnImage
            ? ` (${Math.round(screenshotSize / 1024)}KB - too large to display inline)`
            : '';
          content.push({
            type: 'text',
            text: `\n---\nScreenshot saved to: ${savedFilePath}${sizeInfo}`,
          });
        }
      }

      // PDF if available
      if (result.pdf) {
        content.push({
          type: 'resource',
          resource: {
            uri: `data:application/pdf;name=${encodeURIComponent(new URL(String(options.url)).hostname)}.pdf;base64,${result.pdf}`,
            mimeType: 'application/pdf',
            blob: result.pdf,
          },
        });
      }

      // Metadata
      if (result.metadata) {
        content.push({
          type: 'text',
          text: `\n---\nMetadata: ${JSON.stringify(result.metadata, null, 2)}`,
        });
      }

      // Links
      if (result.links && (result.links.internal.length > 0 || result.links.external.length > 0)) {
        content.push({
          type: 'text',
          text: `\n---\nLinks: Internal: ${result.links.internal.length}, External: ${result.links.external.length}`,
        });
      }

      // JS execution results if available
      if (result.js_execution_result && result.js_execution_result.results.length > 0) {
        const jsResults = result.js_execution_result.results
          .map((res: unknown, idx: number) => {
            return `Result ${idx + 1}: ${JSON.stringify(res, null, 2)}`;
          })
          .join('\n');
        content.push({
          type: 'text',
          text: `\n---\nJavaScript Execution Results:\n${jsResults}`,
        });
      }

      // Add memory metrics if available
      if (response.server_memory_delta_mb !== undefined || response.server_peak_memory_mb !== undefined) {
        const memoryInfo = [];
        if (response.server_processing_time_s !== undefined) {
          memoryInfo.push(`Processing time: ${response.server_processing_time_s.toFixed(2)}s`);
        }
        if (response.server_memory_delta_mb !== undefined) {
          memoryInfo.push(`Memory delta: ${response.server_memory_delta_mb.toFixed(1)}MB`);
        }
        if (response.server_peak_memory_mb !== undefined) {
          memoryInfo.push(`Peak memory: ${response.server_peak_memory_mb.toFixed(1)}MB`);
        }
        if (memoryInfo.length > 0) {
          content.push({
            type: 'text',
            text: `\n---\nServer metrics: ${memoryInfo.join(', ')}`,
          });
        }
      }

      return { content };
    } catch (error) {
      throw this.formatError(error, 'crawl');
    }
  }
}
```

--------------------------------------------------------------------------------
/src/__tests__/crawl4ai-service.test.ts:
--------------------------------------------------------------------------------

```typescript
import nock from 'nock';
import { Crawl4AIService } from '../crawl4ai-service.js';
import type {
  MarkdownEndpointResponse,
  ScreenshotEndpointResponse,
  PDFEndpointResponse,
HTMLEndpointResponse, CrawlEndpointResponse, } from '../types.js'; /** * Unit tests for Crawl4AIService using nock for HTTP mocking * * Mock Maintenance: * - These mocks are maintained manually based on the actual API responses * - When the API changes, update the mock responses to match * - Integration tests validate against the real API */ describe('Crawl4AIService', () => { let service: Crawl4AIService; // Unit tests always use localhost as configured in jest.setup.cjs const baseURL = 'http://localhost:11235'; const apiKey = 'test-api-key'; beforeEach(() => { service = new Crawl4AIService(baseURL, apiKey); // Clean all nock interceptors before each test nock.cleanAll(); }); afterEach(() => { // Clean up any remaining interceptors nock.cleanAll(); }); describe('getMarkdown', () => { it('should fetch markdown with default parameters', async () => { const mockResponse: MarkdownEndpointResponse = { url: 'https://example.com', filter: 'fit', query: null, cache: 'false', markdown: '# Example Page\n\nThis is example content.', success: true, }; // Mock the HTTP request nock(baseURL) .post('/md', { url: 'https://example.com', f: 'fit', q: undefined, c: undefined, }) .matchHeader('x-api-key', apiKey) .reply(200, mockResponse); const result = await service.getMarkdown({ url: 'https://example.com', f: 'fit', }); expect(result).toEqual(mockResponse); }); it('should fetch markdown with all parameters', async () => { const mockResponse: MarkdownEndpointResponse = { url: 'https://example.com', filter: 'bm25', query: 'test query', cache: 'true', markdown: '# Filtered Content\n\nMatching content for test query.', success: true, }; nock(baseURL) .post('/md', { url: 'https://example.com', f: 'bm25', q: 'test query', c: 'true', }) .matchHeader('x-api-key', apiKey) .reply(200, mockResponse); const result = await service.getMarkdown({ url: 'https://example.com', f: 'bm25', q: 'test query', c: 'true', }); expect(result).toEqual(mockResponse); }); it('should handle API errors', async () => { nock(baseURL).post('/md').matchHeader('x-api-key', apiKey).reply(500, { detail: 'Internal server error' }); await expect(service.getMarkdown({ url: 'https://example.com' })).rejects.toThrow( 'Request failed with status 500: Internal server error', ); }); it('should validate URL format', async () => { await expect(service.getMarkdown({ url: 'invalid-url' })).rejects.toThrow('Invalid URL format'); }); it('should handle network errors', async () => { nock(baseURL).post('/md').matchHeader('x-api-key', apiKey).replyWithError('Network error'); await expect(service.getMarkdown({ url: 'https://example.com' })).rejects.toThrow('Network error'); }); }); describe('captureScreenshot', () => { it('should capture screenshot successfully', async () => { const mockResponse: ScreenshotEndpointResponse = { success: true, screenshot: 'base64-encoded-screenshot-data', }; nock(baseURL) .post('/screenshot', { url: 'https://example.com', screenshot_wait_for: 2, }) .matchHeader('x-api-key', apiKey) .reply(200, mockResponse); const result = await service.captureScreenshot({ url: 'https://example.com', screenshot_wait_for: 2, }); expect(result).toEqual(mockResponse); }); it('should validate URL format', async () => { await expect(service.captureScreenshot({ url: 'not-a-url' })).rejects.toThrow('Invalid URL format'); }); }); describe('generatePDF', () => { it('should generate PDF successfully', async () => { const mockResponse: PDFEndpointResponse = { success: true, pdf: 'base64-encoded-pdf-data', }; nock(baseURL) .post('/pdf', { url: 
'https://example.com', }) .matchHeader('x-api-key', apiKey) .reply(200, mockResponse); const result = await service.generatePDF({ url: 'https://example.com', }); expect(result).toEqual(mockResponse); }); it('should validate URL format', async () => { await expect(service.generatePDF({ url: 'not a url' })).rejects.toThrow('Invalid URL format'); }); }); describe('getHTML', () => { it('should fetch HTML successfully', async () => { const mockResponse: HTMLEndpointResponse = { html: '<html><body><h1>Example</h1></body></html>', url: 'https://example.com', success: true, }; nock(baseURL) .post('/html', { url: 'https://example.com', }) .matchHeader('x-api-key', apiKey) .reply(200, mockResponse); const result = await service.getHTML({ url: 'https://example.com', }); expect(result).toEqual(mockResponse); }); it('should validate URL format', async () => { await expect(service.getHTML({ url: 'just text' })).rejects.toThrow('Invalid URL format'); }); }); describe('crawl', () => { it('should crawl with basic configuration', async () => { const mockResponse: CrawlEndpointResponse = { success: true, results: [ { url: 'https://example.com', html: '<html>...</html>', cleaned_html: '<html>...</html>', fit_html: '<html>...</html>', success: true, status_code: 200, response_headers: {}, session_id: null, metadata: {}, links: { internal: [], external: [] }, media: { images: [], videos: [], audios: [] }, markdown: { raw_markdown: '# Example', markdown_with_citations: '# Example [1]', references_markdown: '[1]: https://example.com', fit_markdown: '# Example', fit_html: '<h1>Example</h1>', }, tables: [], extracted_content: null, screenshot: null, pdf: null, mhtml: null, js_execution_result: null, downloaded_files: null, network_requests: null, console_messages: null, ssl_certificate: null, dispatch_result: null, }, ], server_processing_time_s: 1.5, server_memory_delta_mb: 10, server_peak_memory_mb: 100, }; nock(baseURL) .post('/crawl', { urls: ['https://example.com'], browser_config: { headless: true }, crawler_config: { cache_mode: 'ENABLED' }, }) .matchHeader('x-api-key', apiKey) .reply(200, mockResponse); const result = await service.crawl({ urls: ['https://example.com'], browser_config: { headless: true }, crawler_config: { cache_mode: 'ENABLED' }, }); expect(result).toEqual(mockResponse); }); it('should reject invalid JavaScript in crawler_config', async () => { await expect( service.crawl({ url: 'https://example.com', crawler_config: { js_code: 'console.log("test")', }, }), ).rejects.toThrow('Invalid JavaScript: Contains HTML entities'); }); it('should handle js_code as array with invalid script', async () => { await expect( service.crawl({ url: 'https://example.com', crawler_config: { js_code: ['valid code', '<script>alert("test")</script>'], }, }), ).rejects.toThrow('Invalid JavaScript: Contains HTML entities'); }); // Timeout testing is better suited for integration tests // where we can test against real API behavior }); describe('batchCrawl', () => { it('should batch crawl multiple URLs', async () => { const urls = ['https://example1.com', 'https://example2.com']; const mockResponse = { success: true, results: urls.map((url) => ({ url, success: true, markdown: { raw_markdown: `Content from ${url}` }, })), }; nock(baseURL) .post('/crawl', (body) => { return body.urls?.length === 2 && body.urls[0] === urls[0] && body.urls[1] === urls[1]; }) .matchHeader('x-api-key', apiKey) .reply(200, mockResponse); const result = await service.batchCrawl({ urls }); expect(result.success).toBe(true); 
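      // Note: the predicate form of nock's body matcher used above
      // (.post('/crawl', (body) => ...)) matches on shape rather than deep equality,
      // so the mock stays valid if incidental config fields are added later.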
expect(result.results).toHaveLength(2); }); it('should validate empty URLs array', async () => { await expect(service.batchCrawl({ urls: [] })).rejects.toThrow('URLs array cannot be empty'); }); }); describe('executeJS', () => { it('should execute JavaScript successfully', async () => { const mockResponse = { success: true, js_execution_result: { success: true, results: ['Example Title'], }, markdown: '# Example Page', }; nock(baseURL) .post('/execute_js', { url: 'https://example.com', scripts: ['return document.title'], }) .matchHeader('x-api-key', apiKey) .reply(200, mockResponse); const result = await service.executeJS({ url: 'https://example.com', scripts: 'return document.title', }); expect(result).toEqual(mockResponse); }); it('should handle array of scripts', async () => { const scripts = ['return document.title', 'return window.location.href']; const mockResponse = { success: true, js_execution_result: { success: true, results: ['Example Title', 'https://example.com'], }, }; nock(baseURL) .post('/execute_js', { url: 'https://example.com', scripts: scripts, }) .matchHeader('x-api-key', apiKey) .reply(200, mockResponse); const result = await service.executeJS({ url: 'https://example.com', scripts, }); expect(result).toEqual(mockResponse); }); it('should reject scripts with HTML entities', async () => { await expect( service.executeJS({ url: 'https://httpbin.org/html', scripts: 'console.log("test")', }), ).rejects.toThrow('Invalid JavaScript: Contains HTML entities'); }); it('should reject scripts with HTML tags', async () => { await expect( service.executeJS({ url: 'https://httpbin.org/html', scripts: '<script>alert("test")</script>', }), ).rejects.toThrow('Invalid JavaScript: Contains HTML entities'); }); it('should reject scripts with literal \\n', async () => { await expect( service.executeJS({ url: 'https://httpbin.org/html', scripts: 'console.log("test");\\nconsole.log("test2");', }), ).rejects.toThrow('Invalid JavaScript: Contains HTML entities'); }); it('should reject array with invalid scripts', async () => { await expect( service.executeJS({ url: 'https://httpbin.org/html', scripts: ['valid script', 'console.log(&& true)'], }), ).rejects.toThrow('Invalid JavaScript: Contains HTML entities'); }); it('should validate URL format', async () => { await expect(service.executeJS({ url: '//no-protocol', scripts: 'return 1' })).rejects.toThrow( 'Invalid URL format', ); }); it('should reject scripts with escaped backslash-n pattern', async () => { // Test the specific pattern that line 40-41 checks for: })\\nword const scriptWithPattern = 'function test() {}\\nconsole.log("test")'; await expect( service.executeJS({ url: 'https://example.com', scripts: scriptWithPattern, }), ).rejects.toThrow('Invalid JavaScript: Contains HTML entities'); }); it('should allow valid JavaScript with actual newlines', async () => { const validScript = `function test() { console.log("This has real newlines"); return true; }`; const mockResponse = { success: true, js_execution_result: { results: [true] }, }; nock(baseURL).post('/execute_js').matchHeader('x-api-key', apiKey).reply(200, mockResponse); const result = await service.executeJS({ url: 'https://example.com', scripts: validScript, }); expect(result.success).toBe(true); }); }); describe('extractWithLLM', () => { it('should extract content with LLM', async () => { const mockResponse = { answer: 'The main topic of this page is JavaScript testing.', }; nock(baseURL) .get('/llm/https%3A%2F%2Fexample.com?q=What%20is%20the%20main%20topic%3F') 
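        // Sanity check for the encoded path above (illustrative values):
        // encodeURIComponent('https://example.com') === 'https%3A%2F%2Fexample.com'
        // encodeURIComponent('What is the main topic?') === 'What%20is%20the%20main%20topic%3F'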
.matchHeader('x-api-key', apiKey) .reply(200, mockResponse); const result = await service.extractWithLLM({ url: 'https://example.com', query: 'What is the main topic?', }); expect(result).toEqual(mockResponse); }); // Timeout testing moved to integration tests it('should handle missing LLM provider', async () => { nock(baseURL) .get(/\/llm\/.*/) .matchHeader('x-api-key', apiKey) .reply(401, { detail: 'No LLM provider configured' }); await expect( service.extractWithLLM({ url: 'https://example.com', query: 'test', }), ).rejects.toThrow('No LLM provider configured'); }); }); describe('Browser Configuration', () => { it('should send cookies configuration correctly', async () => { const mockResponse: CrawlEndpointResponse = { success: true, results: [ { url: 'https://httpbin.org/cookies', html: '<html>...</html>', cleaned_html: '<html>...</html>', fit_html: '<html>...</html>', success: true, status_code: 200, response_headers: {}, session_id: null, metadata: {}, links: { internal: [], external: [] }, media: { images: [], videos: [], audios: [] }, markdown: { raw_markdown: '{"cookies": {"test": "value"}}', markdown_with_citations: '', references_markdown: '', fit_markdown: '{"cookies": {"test": "value"}}', fit_html: '', }, tables: [], extracted_content: null, screenshot: null, pdf: null, mhtml: null, js_execution_result: null, downloaded_files: null, network_requests: null, console_messages: null, ssl_certificate: null, dispatch_result: null, }, ], server_processing_time_s: 1.0, server_memory_delta_mb: 5, server_peak_memory_mb: 50, }; nock(baseURL) .post('/crawl', { urls: ['https://httpbin.org/cookies'], browser_config: { headless: true, cookies: [ { name: 'test', value: 'value', domain: '.httpbin.org', path: '/', }, ], }, crawler_config: {}, }) .matchHeader('x-api-key', apiKey) .reply(200, mockResponse); const result = await service.crawl({ urls: ['https://httpbin.org/cookies'], browser_config: { headless: true, cookies: [ { name: 'test', value: 'value', domain: '.httpbin.org', path: '/', }, ], }, crawler_config: {}, }); expect(result.success).toBe(true); expect(result.results[0].markdown?.raw_markdown).toContain('cookies'); }); it('should send headers configuration correctly', async () => { const mockResponse: CrawlEndpointResponse = { success: true, results: [ { url: 'https://httpbin.org/headers', html: '<html>...</html>', cleaned_html: '<html>...</html>', fit_html: '<html>...</html>', success: true, status_code: 200, response_headers: {}, session_id: null, metadata: {}, links: { internal: [], external: [] }, media: { images: [], videos: [], audios: [] }, markdown: { raw_markdown: '{"headers": {"X-Custom": "test-value"}}', markdown_with_citations: '', references_markdown: '', fit_markdown: '{"headers": {"X-Custom": "test-value"}}', fit_html: '', }, tables: [], extracted_content: null, screenshot: null, pdf: null, mhtml: null, js_execution_result: null, downloaded_files: null, network_requests: null, console_messages: null, ssl_certificate: null, dispatch_result: null, }, ], server_processing_time_s: 1.0, server_memory_delta_mb: 5, server_peak_memory_mb: 50, }; nock(baseURL) .post('/crawl', { urls: ['https://httpbin.org/headers'], browser_config: { headless: true, headers: { 'X-Custom': 'test-value', 'X-Request-ID': '12345', }, }, crawler_config: {}, }) .matchHeader('x-api-key', apiKey) .reply(200, mockResponse); const result = await service.crawl({ urls: ['https://httpbin.org/headers'], browser_config: { headless: true, headers: { 'X-Custom': 'test-value', 'X-Request-ID': '12345', }, }, 
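          // Custom headers travel inside browser_config (per browser context), not
          // crawler_config - mirroring how the handler's crawl() copies options.headers there.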
crawler_config: {}, }); expect(result.success).toBe(true); expect(result.results[0].markdown?.raw_markdown).toContain('headers'); }); it('should send viewport configuration correctly', async () => { const mockResponse: CrawlEndpointResponse = { success: true, results: [ { url: 'https://example.com', html: '<html>...</html>', cleaned_html: '<html>...</html>', fit_html: '<html>...</html>', success: true, status_code: 200, response_headers: {}, session_id: null, metadata: {}, links: { internal: [], external: [] }, media: { images: [], videos: [], audios: [] }, markdown: { raw_markdown: 'Content', markdown_with_citations: '', references_markdown: '', fit_markdown: 'Content', fit_html: '', }, tables: [], extracted_content: null, screenshot: 'base64-screenshot-data', pdf: null, mhtml: null, js_execution_result: null, downloaded_files: null, network_requests: null, console_messages: null, ssl_certificate: null, dispatch_result: null, }, ], server_processing_time_s: 2.0, server_memory_delta_mb: 10, server_peak_memory_mb: 100, }; nock(baseURL) .post('/crawl', { urls: ['https://example.com'], browser_config: { headless: true, viewport_width: 375, viewport_height: 667, }, crawler_config: { screenshot: true, }, }) .matchHeader('x-api-key', apiKey) .reply(200, mockResponse); const result = await service.crawl({ urls: ['https://example.com'], browser_config: { headless: true, viewport_width: 375, viewport_height: 667, }, crawler_config: { screenshot: true, }, }); expect(result.success).toBe(true); expect(result.results[0].screenshot).toBeTruthy(); }); it('should send user agent configuration correctly', async () => { const mockResponse: CrawlEndpointResponse = { success: true, results: [ { url: 'https://httpbin.org/user-agent', html: '<html>...</html>', cleaned_html: '<html>...</html>', fit_html: '<html>...</html>', success: true, status_code: 200, response_headers: {}, session_id: null, metadata: {}, links: { internal: [], external: [] }, media: { images: [], videos: [], audios: [] }, markdown: { raw_markdown: '{"user-agent": "Custom-Bot/1.0"}', markdown_with_citations: '', references_markdown: '', fit_markdown: '{"user-agent": "Custom-Bot/1.0"}', fit_html: '', }, tables: [], extracted_content: null, screenshot: null, pdf: null, mhtml: null, js_execution_result: null, downloaded_files: null, network_requests: null, console_messages: null, ssl_certificate: null, dispatch_result: null, }, ], server_processing_time_s: 1.0, server_memory_delta_mb: 5, server_peak_memory_mb: 50, }; nock(baseURL) .post('/crawl', { urls: ['https://httpbin.org/user-agent'], browser_config: { headless: true, user_agent: 'Custom-Bot/1.0', }, crawler_config: {}, }) .matchHeader('x-api-key', apiKey) .reply(200, mockResponse); const result = await service.crawl({ urls: ['https://httpbin.org/user-agent'], browser_config: { headless: true, user_agent: 'Custom-Bot/1.0', }, crawler_config: {}, }); expect(result.success).toBe(true); expect(result.results[0].markdown?.raw_markdown).toContain('Custom-Bot/1.0'); }); it('should handle complex browser configuration', async () => { const mockResponse: CrawlEndpointResponse = { success: true, results: [ { url: 'https://httpbin.org/anything', html: '<html>...</html>', cleaned_html: '<html>...</html>', fit_html: '<html>...</html>', success: true, status_code: 200, response_headers: {}, session_id: null, metadata: {}, links: { internal: [], external: [] }, media: { images: [], videos: [], audios: [] }, markdown: { raw_markdown: 'Response with all configs', markdown_with_citations: '', 
references_markdown: '', fit_markdown: 'Response with all configs', fit_html: '', }, tables: [], extracted_content: null, screenshot: null, pdf: null, mhtml: null, js_execution_result: null, downloaded_files: null, network_requests: null, console_messages: null, ssl_certificate: null, dispatch_result: null, }, ], server_processing_time_s: 1.5, server_memory_delta_mb: 8, server_peak_memory_mb: 80, }; const complexConfig = { urls: ['https://httpbin.org/anything'], browser_config: { headless: true, viewport_width: 768, viewport_height: 1024, user_agent: 'Test-Bot/2.0', cookies: [ { name: 'session', value: 'abc123', domain: '.httpbin.org', path: '/', }, ], headers: { 'X-Test': 'value', }, }, crawler_config: { cache_mode: 'BYPASS' as const, }, }; nock(baseURL).post('/crawl', complexConfig).matchHeader('x-api-key', apiKey).reply(200, mockResponse); const result = await service.crawl(complexConfig); expect(result.success).toBe(true); expect(result.results).toHaveLength(1); }); }); describe('Crawler Configuration Advanced Parameters', () => { it('should send content filtering parameters correctly', async () => { const mockResponse: CrawlEndpointResponse = { success: true, results: [ { url: 'https://httpbin.org/forms/post', html: '<html>...</html>', cleaned_html: '<html>...</html>', fit_html: '<html>...</html>', success: true, status_code: 200, response_headers: {}, session_id: null, metadata: {}, links: { internal: [], external: [] }, media: { images: [], videos: [], audios: [] }, markdown: { raw_markdown: 'Form content without forms', markdown_with_citations: '', references_markdown: '', fit_markdown: 'Form content without forms', fit_html: '', }, tables: [], extracted_content: null, screenshot: null, pdf: null, mhtml: null, js_execution_result: null, downloaded_files: null, network_requests: null, console_messages: null, ssl_certificate: null, dispatch_result: null, }, ], server_processing_time_s: 1.0, server_memory_delta_mb: 5, server_peak_memory_mb: 50, }; nock(baseURL) .post('/crawl', { urls: ['https://httpbin.org/forms/post'], browser_config: { headless: true, }, crawler_config: { remove_forms: true, keep_data_attributes: true, exclude_external_images: true, }, }) .matchHeader('x-api-key', apiKey) .reply(200, mockResponse); const result = await service.crawl({ urls: ['https://httpbin.org/forms/post'], browser_config: { headless: true, }, crawler_config: { remove_forms: true, keep_data_attributes: true, exclude_external_images: true, }, }); expect(result.success).toBe(true); }); it('should send js_only parameter correctly', async () => { const mockResponse: CrawlEndpointResponse = { success: true, results: [ { url: 'https://httpbin.org/html', html: '', cleaned_html: '', fit_html: '', success: true, status_code: 200, response_headers: {}, session_id: null, metadata: {}, links: { internal: [], external: [] }, media: { images: [], videos: [], audios: [] }, markdown: { raw_markdown: '', markdown_with_citations: '', references_markdown: '', fit_markdown: '', fit_html: '', }, tables: [], extracted_content: null, screenshot: null, pdf: null, mhtml: null, js_execution_result: { success: true, results: ['Page Title', '5'], }, downloaded_files: null, network_requests: null, console_messages: null, ssl_certificate: null, dispatch_result: null, }, ], server_processing_time_s: 1.0, server_memory_delta_mb: 5, server_peak_memory_mb: 50, }; nock(baseURL) .post('/crawl', { urls: ['https://httpbin.org/html'], browser_config: { headless: true, }, crawler_config: { js_code: ['return document.title', 'return 
document.querySelectorAll("p").length'], js_only: true, }, }) .matchHeader('x-api-key', apiKey) .reply(200, mockResponse); const result = await service.crawl({ urls: ['https://httpbin.org/html'], browser_config: { headless: true, }, crawler_config: { js_code: ['return document.title', 'return document.querySelectorAll("p").length'], js_only: true, }, }); expect(result.success).toBe(true); expect(result.results[0].js_execution_result).toBeDefined(); }); it('should send visibility and debug parameters correctly', async () => { const mockResponse: CrawlEndpointResponse = { success: true, results: [ { url: 'https://httpbin.org/html', html: '<html>...</html>', cleaned_html: '<html>...</html>', fit_html: '<html>...</html>', success: true, status_code: 200, response_headers: {}, session_id: null, metadata: {}, links: { internal: [], external: [] }, media: { images: [], videos: [], audios: [] }, markdown: { raw_markdown: 'Content', markdown_with_citations: '', references_markdown: '', fit_markdown: 'Content', fit_html: '', }, tables: [], extracted_content: null, screenshot: null, pdf: null, mhtml: null, js_execution_result: null, downloaded_files: null, network_requests: null, console_messages: ['Test log message 1', 'Test warning', 'Test error'], ssl_certificate: null, dispatch_result: null, }, ], server_processing_time_s: 1.5, server_memory_delta_mb: 8, server_peak_memory_mb: 80, }; nock(baseURL) .post('/crawl', { urls: ['https://httpbin.org/html'], browser_config: { headless: true, }, crawler_config: { ignore_body_visibility: true, verbose: true, log_console: true, }, }) .matchHeader('x-api-key', apiKey) .reply(200, mockResponse); const result = await service.crawl({ urls: ['https://httpbin.org/html'], browser_config: { headless: true, }, crawler_config: { ignore_body_visibility: true, verbose: true, log_console: true, }, }); expect(result.success).toBe(true); expect(result.results[0].console_messages).toBeDefined(); }); }); describe('parseSitemap', () => { it('should fetch and return sitemap content', async () => { const mockSitemapXML = `<?xml version="1.0" encoding="UTF-8"?> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <url><loc>https://example.com/page1</loc></url> <url><loc>https://example.com/page2</loc></url> </urlset>`; // parseSitemap now uses axios directly without baseURL nock('https://example.com').get('/sitemap.xml').reply(200, mockSitemapXML); const response = await service.parseSitemap('https://example.com/sitemap.xml'); expect(response).toBe(mockSitemapXML); }); it('should handle sitemap fetch errors', async () => { nock('https://example.com').get('/sitemap.xml').reply(404, 'Not Found'); await expect(service.parseSitemap('https://example.com/sitemap.xml')).rejects.toThrow(); }); }); describe('detectContentType', () => { it('should return content type from HEAD request', async () => { // detectContentType now uses axios directly without baseURL nock('https://example.com').head('/document.pdf').reply(200, '', { 'content-type': 'application/pdf' }); const contentType = await service.detectContentType('https://example.com/document.pdf'); expect(contentType).toBe('application/pdf'); }); it('should return empty string when content-type header is missing', async () => { nock('https://example.com').head('/file').reply(200, ''); const contentType = await service.detectContentType('https://example.com/file'); expect(contentType).toBe(''); }); it('should return empty string on HEAD request failure', async () => { nock('https://example.com').head('/file').reply(404, 'Not 
Found'); const contentType = await service.detectContentType('https://example.com/file'); expect(contentType).toBe(''); }); }); describe('Network Error Handling', () => { it('should handle ECONNABORTED error', async () => { const error = new Error('Connection aborted') as Error & { code?: string }; error.code = 'ECONNABORTED'; nock(baseURL).post('/md').matchHeader('x-api-key', apiKey).replyWithError(error); await expect(service.getMarkdown({ url: 'https://example.com' })).rejects.toThrow('Request timed out'); }); it('should handle ETIMEDOUT error', async () => { const error = new Error('Socket timed out') as Error & { code?: string }; error.code = 'ETIMEDOUT'; nock(baseURL).post('/md').matchHeader('x-api-key', apiKey).replyWithError(error); await expect(service.getMarkdown({ url: 'https://example.com' })).rejects.toThrow('Request timeout'); }); it('should handle ENOTFOUND error', async () => { const error = new Error('getaddrinfo ENOTFOUND') as Error & { code?: string }; error.code = 'ENOTFOUND'; nock(baseURL).post('/md').matchHeader('x-api-key', apiKey).replyWithError(error); await expect(service.getMarkdown({ url: 'https://example.com' })).rejects.toThrow('DNS resolution failed'); }); it('should handle ECONNREFUSED error', async () => { const error = new Error('connect ECONNREFUSED') as Error & { code?: string }; error.code = 'ECONNREFUSED'; nock(baseURL).post('/md').matchHeader('x-api-key', apiKey).replyWithError(error); await expect(service.getMarkdown({ url: 'https://example.com' })).rejects.toThrow('Connection refused'); }); it('should handle ECONNRESET error', async () => { const error = new Error('socket hang up') as Error & { code?: string }; error.code = 'ECONNRESET'; nock(baseURL).post('/md').matchHeader('x-api-key', apiKey).replyWithError(error); await expect(service.getMarkdown({ url: 'https://example.com' })).rejects.toThrow('Connection reset'); }); it('should handle ENETUNREACH error', async () => { const error = new Error('Network is unreachable') as Error & { code?: string }; error.code = 'ENETUNREACH'; nock(baseURL).post('/md').matchHeader('x-api-key', apiKey).replyWithError(error); await expect(service.getMarkdown({ url: 'https://example.com' })).rejects.toThrow('Network unreachable'); }); it('should handle generic axios errors', async () => { const error = new Error('Generic error') as Error & { isAxiosError?: boolean }; error.isAxiosError = true; nock(baseURL).post('/md').matchHeader('x-api-key', apiKey).replyWithError(error); await expect(service.getMarkdown({ url: 'https://example.com' })).rejects.toThrow('Generic error'); }); }); describe('Optional Parameter Handling', () => { it('should handle batchCrawl with remove_images option', async () => { const urls = ['https://example.com']; nock(baseURL) .post('/crawl', (body) => { return body.crawler_config?.exclude_tags?.includes('img'); }) .matchHeader('x-api-key', apiKey) .reply(200, { success: true, results: [] }); await service.batchCrawl({ urls, remove_images: true }); }); it('should handle batchCrawl with bypass_cache option', async () => { const urls = ['https://example.com']; nock(baseURL) .post('/crawl', (body) => { return body.crawler_config?.cache_mode === 'BYPASS'; }) .matchHeader('x-api-key', apiKey) .reply(200, { success: true, results: [] }); await service.batchCrawl({ urls, bypass_cache: true }); }); it('should test edge case JavaScript validation pattern', async () => { // Test the specific pattern on line 40-41: })\\nword const scriptWithEdgeCase = 'if (true) {}\\nwindow.alert("test")'; await expect( 
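        // The script below contains a literal backslash-n ('\\n'), not a real newline -
        // the artifact left behind when JS is round-tripped through JSON escaping,
        // which the validator rejects with 'Invalid JavaScript: Contains HTML entities'.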
service.executeJS({ url: 'https://example.com', scripts: scriptWithEdgeCase, }), ).rejects.toThrow('Invalid JavaScript: Contains HTML entities'); }); it('should include memory metrics in crawl response', async () => { const mockResponse: CrawlEndpointResponse = { success: true, results: [ { url: 'https://example.com', html: '<html>Test</html>', cleaned_html: '<html>Test</html>', fit_html: '<html>Test</html>', success: true, status_code: 200, response_headers: {}, session_id: null, metadata: {}, links: { internal: [], external: [] }, media: { images: [], videos: [], audios: [] }, markdown: { raw_markdown: 'Test content', markdown_with_citations: '', references_markdown: '', fit_markdown: 'Test content', fit_html: '', }, tables: [], extracted_content: null, screenshot: null, pdf: null, mhtml: null, js_execution_result: null, downloaded_files: null, network_requests: null, console_messages: null, ssl_certificate: null, dispatch_result: null, }, ], server_processing_time_s: 2.5, server_memory_delta_mb: 15.3, server_peak_memory_mb: 512.7, }; nock(baseURL).post('/crawl').matchHeader('x-api-key', apiKey).reply(200, mockResponse); const result = await service.crawl({ url: 'https://example.com' }); expect(result.server_processing_time_s).toBe(2.5); expect(result.server_memory_delta_mb).toBe(15.3); expect(result.server_peak_memory_mb).toBe(512.7); }); }); }); ``` -------------------------------------------------------------------------------- /src/server.ts: -------------------------------------------------------------------------------- ```typescript import { Server } from '@modelcontextprotocol/sdk/server/index.js'; import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'; import { CallToolRequestSchema, ListToolsRequestSchema } from '@modelcontextprotocol/sdk/types.js'; import axios, { AxiosInstance } from 'axios'; import { z } from 'zod'; import { Crawl4AIService } from './crawl4ai-service.js'; import { SessionInfo } from './handlers/base-handler.js'; import { ContentHandlers } from './handlers/content-handlers.js'; import { SessionHandlers } from './handlers/session-handlers.js'; import { UtilityHandlers } from './handlers/utility-handlers.js'; import { CrawlHandlers } from './handlers/crawl-handlers.js'; import { BatchCrawlOptions } from './types.js'; // Define the tool call result type type ToolCallResult = { content: Array<{ type: string; text?: string; data?: string; mimeType?: string; }>; session_id?: string; browser_type?: string; }; import { GetMarkdownSchema, CaptureScreenshotSchema, GeneratePdfSchema, ExecuteJsSchema, BatchCrawlSchema, SmartCrawlSchema, GetHtmlSchema, ExtractLinksSchema, CrawlRecursiveSchema, ParseSitemapSchema, CrawlSchema, ManageSessionSchema, ExtractWithLlmSchema, } from './schemas/validation-schemas.js'; export class Crawl4AIServer { private server: Server; protected axiosClient: AxiosInstance; protected service: Crawl4AIService; private sessions: Map<string, SessionInfo> = new Map(); private serverName: string; private serverVersion: string; // Handler instances private contentHandlers: ContentHandlers; private sessionHandlers: SessionHandlers; private utilityHandlers: UtilityHandlers; private crawlHandlers: CrawlHandlers; constructor(baseUrl: string, apiKey: string, serverName: string = 'crawl4ai-mcp', serverVersion: string = '1.0.0') { this.serverName = serverName; this.serverVersion = serverVersion; this.server = new Server( { name: serverName, version: serverVersion, }, { capabilities: { tools: {}, }, }, ); // Initialize axios client 
with API key this.axiosClient = axios.create({ baseURL: baseUrl, headers: { 'X-API-Key': apiKey, 'Content-Type': 'application/json', }, timeout: 120000, // 2 minutes timeout }); // Initialize the service this.service = new Crawl4AIService(baseUrl, apiKey); // Initialize handlers this.contentHandlers = new ContentHandlers(this.service, this.axiosClient, this.sessions); this.sessionHandlers = new SessionHandlers(this.service, this.axiosClient, this.sessions); this.utilityHandlers = new UtilityHandlers(this.service, this.axiosClient, this.sessions); this.crawlHandlers = new CrawlHandlers(this.service, this.axiosClient, this.sessions); this.setupHandlers(); } /** * Helper method to validate arguments and execute handler with consistent error formatting * Preserves the exact error message format that LLMs rely on */ private async validateAndExecute<T>( toolName: string, args: unknown, schema: z.ZodSchema<T>, handler: (validatedArgs: T) => Promise<ToolCallResult>, ): Promise<ToolCallResult> { try { const validatedArgs = schema.parse(args); return await handler(validatedArgs); } catch (error) { if (error instanceof z.ZodError) { // EXACT same formatting as before - critical for LLM understanding const details = error.errors .map((e) => (e.path.length > 0 ? `${e.path.join('.')}: ${e.message}` : e.message)) .join(', '); throw new Error(`Invalid parameters for ${toolName}: ${details}`); } throw error; } } private setupHandlers() { // Handle list tools request this.server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: [ { name: 'get_markdown', description: '[STATELESS] Extract content as markdown with filtering options. Supports: raw (full content), fit (optimized, default), bm25 (keyword search), llm (AI-powered extraction). Use bm25/llm with query for specific content. Creates new browser each time. For persistence use create_session + crawl.', inputSchema: { type: 'object', properties: { url: { type: 'string', description: 'The URL to extract markdown from', }, filter: { type: 'string', enum: ['raw', 'fit', 'bm25', 'llm'], description: 'Filter type: raw (full), fit (optimized), bm25 (search), llm (AI extraction)', default: 'fit', }, query: { type: 'string', description: 'Query string for bm25/llm filters. Required when using bm25 or llm filter.', }, cache: { type: 'string', description: 'Cache-bust parameter (use different values to force fresh extraction)', default: '0', }, }, required: ['url'], }, }, { name: 'capture_screenshot', description: "[STATELESS] Capture webpage screenshot. Returns base64-encoded PNG data. Creates new browser each time. Optionally saves screenshot to local directory. IMPORTANT: Chained calls (execute_js then capture_screenshot) will NOT work - the screenshot won't see JS changes! For JS changes + screenshot use create_session + crawl(session_id, js_code, screenshot:true) in ONE call.", inputSchema: { type: 'object', properties: { url: { type: 'string', description: 'The URL to capture', }, screenshot_wait_for: { type: 'number', description: 'Seconds to wait before taking screenshot (allows page loading/animations)', default: 2, }, save_to_directory: { type: 'string', description: "Directory path to save screenshot (e.g., ~/Desktop, /tmp). Do NOT include filename - it will be auto-generated. Large screenshots (>800KB) won't be returned inline when saved.", }, }, required: ['url'], }, }, { name: 'generate_pdf', description: '[STATELESS] Convert webpage to PDF. Returns base64-encoded PDF data. Creates new browser each time. 
Cannot capture form fills or JS changes. For persistent PDFs use create_session + crawl(session_id, pdf:true).', inputSchema: { type: 'object', properties: { url: { type: 'string', description: 'The URL to convert to PDF', }, }, required: ['url'], }, }, { name: 'execute_js', description: '[STATELESS] Execute JavaScript and get return values + page content. Creates new browser each time. Use for: extracting data, triggering dynamic content, checking page state. Scripts with "return" statements return actual values (strings, numbers, objects, arrays). Note: null returns as {"success": true}. Returns values but page state is lost. For persistent JS execution, use crawl with session_id.', inputSchema: { type: 'object', properties: { url: { type: 'string', description: 'The URL to load', }, scripts: { type: ['string', 'array'], items: { type: 'string' }, description: 'JavaScript to execute. Use "return" to get values back! Each string runs separately. Returns appear in results array. Examples: "return document.title", "return document.querySelectorAll(\'a\').length", "return {url: location.href, links: [...document.links].map(a => a.href)}". Use proper JS syntax: real quotes, no HTML entities.', }, }, required: ['url', 'scripts'], }, }, { name: 'batch_crawl', description: '[STATELESS] Crawl multiple URLs concurrently for efficiency. Use when: processing URL lists, comparing multiple pages, or bulk data extraction. Faster than sequential crawling. Max 5 concurrent by default. Each URL gets a fresh browser. Cannot maintain state between URLs. For persistent operations use create_session + crawl.', inputSchema: { type: 'object', properties: { urls: { type: 'array', items: { type: 'string' }, description: 'List of URLs to crawl', }, max_concurrent: { type: 'number', description: 'Parallel request limit. Higher = faster but more resource intensive. Adjust based on server capacity and rate limits', default: 5, }, remove_images: { type: 'boolean', description: 'Remove images from output by excluding img, picture, and svg tags', default: false, }, bypass_cache: { type: 'boolean', description: 'Bypass cache for all URLs', default: false, }, }, required: ['urls'], }, }, { name: 'smart_crawl', description: '[STATELESS] Auto-detect and handle different content types (HTML, sitemap, RSS, text). Use when: URL type is unknown, crawling feeds/sitemaps, or want automatic format handling. Adapts strategy based on content. Creates new browser each time. For persistent operations use create_session + crawl.', inputSchema: { type: 'object', properties: { url: { type: 'string', description: 'The URL to crawl intelligently', }, max_depth: { type: 'number', description: 'Maximum crawl depth for sitemaps', default: 2, }, follow_links: { type: 'boolean', description: 'For sitemaps/RSS: crawl found URLs (max 10). For HTML: no effect', default: false, }, bypass_cache: { type: 'boolean', description: 'Force fresh crawl', default: false, }, }, required: ['url'], }, }, { name: 'get_html', description: '[STATELESS] Get sanitized/processed HTML for inspection and automation planning. Use when: finding form fields/selectors, analyzing page structure before automation, building schemas. Returns cleaned HTML showing element names, IDs, and classes - perfect for identifying selectors for subsequent crawl operations. Commonly used before crawl to find selectors for automation. 
Creates new browser each time.', inputSchema: { type: 'object', properties: { url: { type: 'string', description: 'The URL to extract HTML from', }, }, required: ['url'], }, }, { name: 'extract_links', description: '[STATELESS] Extract and categorize all page links. Use when: building sitemaps, analyzing site structure, finding broken links, or discovering resources. Groups by internal/external/social/documents. Creates new browser each time. For persistent operations use create_session + crawl.', inputSchema: { type: 'object', properties: { url: { type: 'string', description: 'The URL to extract links from', }, categorize: { type: 'boolean', description: 'Group links by type: internal (same domain), external, social media, documents (PDF/DOC), images. Helpful for link analysis', default: true, }, }, required: ['url'], }, }, { name: 'crawl_recursive', description: '[STATELESS] Deep crawl a website following internal links. Use when: mapping entire sites, finding all pages, building comprehensive indexes. Control with max_depth (default 3) and max_pages (default 50). Note: May need JS execution for dynamic sites. Each page gets a fresh browser. For persistent operations use create_session + crawl.', inputSchema: { type: 'object', properties: { url: { type: 'string', description: 'Starting URL to crawl from', }, max_depth: { type: 'number', description: 'Maximum depth to follow links', default: 3, }, max_pages: { type: 'number', description: 'Maximum number of pages to crawl', default: 50, }, include_pattern: { type: 'string', description: 'Regex to match URLs to crawl. Example: ".*\\/blog\\/.*" for blog posts only, ".*\\.html$" for HTML pages', }, exclude_pattern: { type: 'string', description: 'Regex to skip URLs. Example: ".*\\/(login|admin).*" to avoid auth pages, ".*\\.pdf$" to skip PDFs', }, }, required: ['url'], }, }, { name: 'parse_sitemap', description: '[STATELESS] Extract URLs from XML sitemaps. Use when: discovering all site pages, planning crawl strategies, or checking sitemap validity. Supports regex filtering. Try sitemap.xml or robots.txt first. Creates new browser each time.', inputSchema: { type: 'object', properties: { url: { type: 'string', description: 'URL of the sitemap (e.g., https://example.com/sitemap.xml)', }, filter_pattern: { type: 'string', description: 'Optional regex pattern to filter URLs', }, }, required: ['url'], }, }, { name: 'crawl', description: '[SUPPORTS SESSIONS] THE ONLY TOOL WITH BROWSER PERSISTENCE\n\n' + 'RECOMMENDED PATTERNS:\n' + '• Inspect first workflow:\n' + ' 1) get_html(url) → find selectors & verify elements exist\n' + ' 2) create_session() → "session-123"\n' + ' 3) crawl({url, session_id: "session-123", js_code: ["action 1"]})\n' + ' 4) crawl({url: "/page2", session_id: "session-123", js_code: ["action 2"]})\n\n' + '• Multi-step with state:\n' + ' 1) create_session() → "session-123"\n' + ' 2) crawl({url, session_id: "session-123"}) → inspect current state\n' + ' 3) crawl({url, session_id: "session-123", js_code: ["verified actions"]})\n\n' + 'WITH session_id: Maintains browser state (cookies, localStorage, page) across calls\n' + 'WITHOUT session_id: Creates fresh browser each time (like other tools)\n\n' + 'WHEN TO USE SESSIONS vs STATELESS:\n' + '• Need state between calls? → create_session + crawl\n' + '• Just extracting data? → Use stateless tools\n' + '• Filling forms? → Inspect first, then use sessions\n' + '• Taking screenshot after JS? → Must use crawl with session\n' + '• Unsure if elements exist? 
→ Always use get_html first\n\n' + 'CRITICAL FOR js_code:\n' + 'RECOMMENDED: Always use screenshot: true when running js_code\n' + 'This avoids server serialization errors and gives visual confirmation', inputSchema: { type: 'object', properties: { url: { type: 'string', description: 'The URL to crawl', }, session_id: { type: 'string', description: 'ENABLES PERSISTENCE: Use SAME ID across all crawl calls to maintain browser state.\n' + '• First call with ID: Creates persistent browser\n' + '• Subsequent calls with SAME ID: Reuses browser with all state intact\n' + '• Different/no ID: Fresh browser (stateless)\n' + 'WARNING: ONLY works with crawl tool - other tools ignore this parameter', }, // === CORE CONFIGURATION === browser_type: { type: 'string', enum: ['chromium', 'firefox', 'webkit'], description: 'Browser engine for crawling. Chromium offers best compatibility, Firefox for specific use cases, WebKit for Safari-like behavior', default: 'chromium', }, viewport_width: { type: 'number', description: 'Browser window width in pixels. Affects responsive layouts and content visibility', default: 1080, }, viewport_height: { type: 'number', description: 'Browser window height in pixels. Impacts content loading and screenshot dimensions', default: 600, }, user_agent: { type: 'string', description: 'Custom browser identity. Use for: mobile sites (include "Mobile"), avoiding bot detection, or specific browser requirements. Example: "Mozilla/5.0 (iPhone...)"', }, proxy_server: { type: 'string', description: 'Proxy server URL (e.g., "http://proxy.example.com:8080")', }, proxy_username: { type: 'string', description: 'Proxy authentication username', }, proxy_password: { type: 'string', description: 'Proxy authentication password', }, cookies: { type: 'array', items: { type: 'object', properties: { name: { type: 'string', description: 'Cookie name' }, value: { type: 'string', description: 'Cookie value' }, domain: { type: 'string', description: 'Domain where cookie is valid' }, path: { type: 'string', description: 'URL path scope for cookie' }, }, required: ['name', 'value', 'domain'], }, description: 'Pre-set cookies for authentication or personalization', }, headers: { type: 'object', description: 'Custom HTTP headers for API keys, auth tokens, or specific server requirements', }, // === CONTENT PROCESSING === word_count_threshold: { type: 'number', description: 'Min words per text block. Filters out menus, footers, and short snippets. Lower = more content but more noise. Higher = only substantial paragraphs', default: 200, }, excluded_tags: { type: 'array', items: { type: 'string' }, description: 'HTML tags to remove completely. Common: ["nav", "footer", "aside", "script", "style"]. Cleans up content before extraction', }, remove_overlay_elements: { type: 'boolean', description: 'Automatically remove popups, modals, and overlays that obscure content', default: false, }, js_code: { type: ['string', 'array'], items: { type: 'string' }, description: 'JavaScript to execute. Each string runs separately. Use return to get values.\n\n' + 'IMPORTANT: Always verify elements exist before acting on them!\n' + 'Use get_html first to find correct selectors, then:\n' + 'GOOD: ["if (document.querySelector(\'input[name=\\"email\\"]\')) { ... }"]\n' + 'BAD: ["document.querySelector(\'input[name=\\"email\\"]\').value = \'...\'"]\n\n' + 'USAGE PATTERNS:\n' + '1. WITH screenshot/pdf: {js_code: [...], screenshot: true} ✓\n' + '2. 
MULTI-STEP: First {js_code: [...], session_id: "x"}, then {js_only: true, session_id: "x"}\n' + '3. AVOID: {js_code: [...], js_only: true} on first call ✗\n\n' + 'SELECTOR TIPS: Use get_html first to find:\n' + ' • name="..." (best for forms)\n' + ' • id="..." (if unique)\n' + ' • class="..." (careful, may repeat)\n\n' + 'FORM EXAMPLE WITH VERIFICATION: [\n' + ' "const emailInput = document.querySelector(\'input[name=\\"email\\"]\');",\n' + ' "if (emailInput) emailInput.value = \'[email protected]\';",\n' + ' "const submitBtn = document.querySelector(\'button[type=\\"submit\\"]\');",\n' + ' "if (submitBtn) submitBtn.click();"\n' + ']', }, js_only: { type: 'boolean', description: 'FOR SUBSEQUENT CALLS ONLY: Reuse existing session without navigation\n' + 'First call: Use js_code WITHOUT js_only (or with screenshot/pdf)\n' + 'Later calls: Use js_only=true to run more JS in same session\n' + 'ERROR: Using js_only=true on first call causes server errors', default: false, }, wait_for: { type: 'string', description: 'Wait for element that loads AFTER initial page load. Format: "css:.selector" or "js:() => condition"\n\n' + 'WHEN TO USE:\n' + ' • Dynamic content that loads after page (AJAX, lazy load)\n' + ' • Elements that appear after animations/transitions\n' + ' • Content loaded by JavaScript frameworks\n\n' + 'WHEN NOT TO USE:\n' + ' • Elements already in initial HTML (forms, static content)\n' + ' • Standard page elements (just use wait_until: "load")\n' + ' • Can cause timeouts/errors if element already exists!\n\n' + 'SELECTOR TIPS: Use get_html first to check if element exists\n' + 'Examples: "css:.ajax-content", "js:() => document.querySelector(\'.lazy-loaded\')"', }, wait_for_timeout: { type: 'number', description: 'Maximum milliseconds to wait for condition', default: 30000, }, delay_before_scroll: { type: 'number', description: 'Milliseconds to wait before scrolling. Allows initial content to render', default: 1000, }, scroll_delay: { type: 'number', description: 'Milliseconds between scroll steps for lazy-loaded content', default: 500, }, process_iframes: { type: 'boolean', description: 'Extract content from embedded iframes including videos and forms', default: false, }, exclude_external_links: { type: 'boolean', description: 'Remove links pointing to different domains for cleaner content', default: false, }, screenshot: { type: 'boolean', description: 'Capture full-page screenshot as base64 PNG', default: false, }, screenshot_directory: { type: 'string', description: "Directory path to save screenshot (e.g., ~/Desktop, /tmp). Do NOT include filename - it will be auto-generated. Large screenshots (>800KB) won't be returned inline when saved.", }, pdf: { type: 'boolean', description: 'Generate PDF as base64 preserving exact layout', default: false, }, cache_mode: { type: 'string', enum: ['ENABLED', 'BYPASS', 'DISABLED'], description: 'Cache strategy. ENABLED: Use cache if available. BYPASS: Fetch fresh (recommended). DISABLED: No cache', default: 'BYPASS', }, timeout: { type: 'number', description: 'Overall request timeout in milliseconds', default: 60000, }, verbose: { type: 'boolean', description: 'Enable server-side debug logging (not shown in output). Only for troubleshooting. 
Does not affect extraction results', default: false, }, // === DYNAMIC CONTENT HANDLING === wait_until: { type: 'string', enum: ['domcontentloaded', 'networkidle', 'load'], description: 'When to consider page loaded (use INSTEAD of wait_for for initial load):\n' + '• "domcontentloaded" (default): Fast, DOM ready, use for forms/static content\n' + '• "load": All resources loaded, use if you need images\n' + '• "networkidle": Wait for network quiet, use for heavy JS apps\n' + "WARNING: Don't use wait_for for elements in initial HTML!", default: 'domcontentloaded', }, page_timeout: { type: 'number', description: 'Page navigation timeout in milliseconds', default: 60000, }, wait_for_images: { type: 'boolean', description: 'Wait for all images to load before extraction', default: false, }, ignore_body_visibility: { type: 'boolean', description: 'Skip checking if body element is visible', default: true, }, scan_full_page: { type: 'boolean', description: 'Auto-scroll entire page to trigger lazy loading. WARNING: Can be slow on long pages. Avoid combining with wait_until:"networkidle" or CSS extraction on dynamic sites. Better to use virtual_scroll_config for infinite feeds', default: false, }, remove_forms: { type: 'boolean', description: 'Remove all form elements from extracted content', default: false, }, keep_data_attributes: { type: 'boolean', description: 'Preserve data-* attributes in cleaned HTML', default: false, }, excluded_selector: { type: 'string', description: 'CSS selector for elements to remove. Comma-separate multiple selectors.\n\n' + 'SELECTOR STRATEGY: Use get_html first to inspect page structure. Look for:\n' + ' • id attributes (e.g., #cookie-banner)\n' + ' • CSS classes (e.g., .advertisement, .popup)\n' + ' • data-* attributes (e.g., [data-type="ad"])\n' + ' • Element type + attributes (e.g., div[role="banner"])\n\n' + 'Examples: "#cookie-banner, .advertisement, .social-share"', }, only_text: { type: 'boolean', description: 'Extract only text content, no HTML structure', default: false, }, // === OUTPUT OPTIONS === image_description_min_word_threshold: { type: 'number', description: 'Minimum words for image alt text to be considered valid', default: 50, }, image_score_threshold: { type: 'number', description: 'Minimum relevance score for images (filters low-quality images)', default: 3, }, exclude_external_images: { type: 'boolean', description: 'Exclude images from external domains', default: false, }, screenshot_wait_for: { type: 'number', description: 'Extra wait time in seconds before taking screenshot', }, // === LINK & DOMAIN FILTERING === exclude_social_media_links: { type: 'boolean', description: 'Remove links to social media platforms', default: false, }, exclude_domains: { type: 'array', items: { type: 'string' }, description: 'List of domains to exclude from links (e.g., ["ads.com", "tracker.io"])', }, // === PERFORMANCE & ANTI-BOT === simulate_user: { type: 'boolean', description: 'Mimic human behavior with random mouse movements and delays. Helps bypass bot detection on protected sites. 
              // === PERFORMANCE & ANTI-BOT ===
              simulate_user: {
                type: 'boolean',
                description:
                  'Mimic human behavior with random mouse movements and delays. Helps bypass bot detection on protected sites. ' +
                  'Slows crawling but improves success rate',
                default: false,
              },
              override_navigator: {
                type: 'boolean',
                description: 'Override navigator properties for stealth',
                default: false,
              },
              magic: {
                type: 'boolean',
                description:
                  'EXPERIMENTAL: Auto-handles popups, cookies, overlays.\n' +
                  'Use as LAST RESORT - can conflict with wait_for & CSS extraction\n' +
                  'Try first: remove_overlay_elements, excluded_selector\n' +
                  'Avoid with: CSS extraction, precise timing needs',
                default: false,
              },
              // Virtual Scroll Configuration
              virtual_scroll_config: {
                type: 'object',
                description:
                  'For infinite scroll sites that REPLACE content (Twitter/Instagram feeds).\n' +
                  'USE when: Content disappears as you scroll (virtual scrolling)\n' +
                  "DON'T USE when: Content appends (use scan_full_page instead)\n" +
                  'Example: {container_selector: "#timeline", scroll_count: 10, wait_after_scroll: 1}',
                properties: {
                  container_selector: {
                    type: 'string',
                    description:
                      'CSS selector for the scrollable container.\n\n' +
                      'SELECTOR STRATEGY: Use get_html first to inspect page structure. Look for:\n' +
                      ' • id attributes (e.g., #timeline)\n' +
                      ' • role attributes (e.g., [role="feed"])\n' +
                      ' • CSS classes (e.g., .feed, .timeline)\n' +
                      ' • data-* attributes (e.g., [data-testid="primaryColumn"])\n\n' +
                      'Common: "#timeline" (Twitter), "[role=\'feed\']" (generic), ".feed" (Instagram)',
                  },
                  scroll_count: {
                    type: 'number',
                    description: 'How many times to scroll. Each scroll loads new content batch. More = more posts but slower',
                    default: 10,
                  },
                  scroll_by: {
                    type: ['string', 'number'],
                    description: 'Distance per scroll. "container_height": one viewport, "page_height": full page, or pixels like 500',
                    default: 'container_height',
                  },
                  wait_after_scroll: {
                    type: 'number',
                    description: 'Seconds to wait after each scroll',
                    default: 0.5,
                  },
                },
                required: ['container_selector'],
              },
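              // Illustrative virtual-scroll call assembled from the description above
              // (hypothetical arguments; '#timeline' is a placeholder selector):
              //   { url: 'https://example.com/feed',
              //     virtual_scroll_config: { container_selector: '#timeline', scroll_count: 10, wait_after_scroll: 1 } }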
              // Other
              log_console: {
                type: 'boolean',
                description: 'Capture browser console logs for debugging',
                default: false,
              },
            },
            required: ['url'],
          },
        },
        {
          name: 'manage_session',
          description:
            '[SESSION MANAGEMENT] Unified tool for managing browser sessions. Supports three actions:\n\n' +
            '• CREATE: Start a persistent browser session that maintains state across calls\n' +
            '• CLEAR: Remove a session from local tracking\n' +
            '• LIST: Show all active sessions with age and usage info\n\n' +
            'USAGE EXAMPLES:\n' +
            '1. Create session: {action: "create", session_id: "my-session", initial_url: "https://example.com"}\n' +
            '2. Clear session: {action: "clear", session_id: "my-session"}\n' +
            '3. List sessions: {action: "list"}\n\n' +
            'Browser sessions maintain ALL state (cookies, localStorage, page) across multiple crawl calls. Essential for: forms, login flows, multi-step processes, maintaining state across operations.',
          inputSchema: {
            // Anthropic/Claude tools require top-level schemas to be a plain object without oneOf/allOf/anyOf
            type: 'object',
            properties: {
              action: {
                type: 'string',
                description: 'Action to perform: create, clear, or list',
                enum: ['create', 'clear', 'list'],
              },
              session_id: {
                type: 'string',
                description: 'Session identifier. Required for action="clear". Optional for create (auto-generated if omitted).',
              },
              initial_url: {
                type: 'string',
                description: 'URL to load when creating session (action="create").',
              },
              browser_type: {
                type: 'string',
                enum: ['chromium', 'firefox', 'webkit'],
                description: 'Browser engine for the session (action="create").',
                default: 'chromium',
              },
            },
            required: ['action'],
          },
        },
        {
          name: 'extract_with_llm',
          description:
            '[STATELESS] Ask questions about webpage content using AI. Returns natural language answers. ' +
            'Crawls fresh each time. For dynamic content or sessions, use crawl with session_id first.',
          inputSchema: {
            type: 'object',
            properties: {
              url: {
                type: 'string',
                description: 'The URL to extract data from',
              },
              query: {
                type: 'string',
                description:
                  'Your question about the webpage content. Examples: "What is the main topic?", ' +
                  '"List all product prices", "Summarize the key points", "What contact information is available?"',
              },
            },
            required: ['url', 'query'],
          },
        },
      ],
    }));

    // Handle tool calls
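    // Dispatch pattern: every case below calls validateAndExecute (defined elsewhere in
    // this class), which parses `args` against the tool's Zod schema before delegating
    // the validated arguments to the matching handler.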
    this.server.setRequestHandler(CallToolRequestSchema, async (request) => {
      const { name, arguments: args } = request.params;

      try {
        switch (name) {
          case 'get_markdown':
            return await this.validateAndExecute(
              'get_markdown',
              args,
              GetMarkdownSchema as z.ZodSchema<z.infer<typeof GetMarkdownSchema>>,
              async (validatedArgs) => this.contentHandlers.getMarkdown(validatedArgs),
            );
          case 'capture_screenshot':
            return await this.validateAndExecute(
              'capture_screenshot',
              args,
              CaptureScreenshotSchema,
              async (validatedArgs) => this.contentHandlers.captureScreenshot(validatedArgs),
            );
          case 'generate_pdf':
            return await this.validateAndExecute('generate_pdf', args, GeneratePdfSchema, async (validatedArgs) =>
              this.contentHandlers.generatePDF(validatedArgs),
            );
          case 'execute_js':
            return await this.validateAndExecute('execute_js', args, ExecuteJsSchema, async (validatedArgs) =>
              this.utilityHandlers.executeJS(validatedArgs),
            );
          case 'batch_crawl':
            return await this.validateAndExecute('batch_crawl', args, BatchCrawlSchema, async (validatedArgs) =>
              this.crawlHandlers.batchCrawl(validatedArgs as BatchCrawlOptions),
            );
          case 'smart_crawl':
            return await this.validateAndExecute('smart_crawl', args, SmartCrawlSchema, async (validatedArgs) =>
              this.crawlHandlers.smartCrawl(validatedArgs),
            );
          case 'get_html':
            return await this.validateAndExecute('get_html', args, GetHtmlSchema, async (validatedArgs) =>
              this.contentHandlers.getHTML(validatedArgs),
            );
          case 'extract_links':
            return await this.validateAndExecute(
              'extract_links',
              args,
              ExtractLinksSchema as z.ZodSchema<z.infer<typeof ExtractLinksSchema>>,
              async (validatedArgs) => this.utilityHandlers.extractLinks(validatedArgs),
            );
          case 'crawl_recursive':
            return await this.validateAndExecute('crawl_recursive', args, CrawlRecursiveSchema, async (validatedArgs) =>
              this.crawlHandlers.crawlRecursive(validatedArgs),
            );
          case 'parse_sitemap':
            return await this.validateAndExecute('parse_sitemap', args, ParseSitemapSchema, async (validatedArgs) =>
              this.crawlHandlers.parseSitemap(validatedArgs),
            );
          case 'crawl':
            return await this.validateAndExecute('crawl', args, CrawlSchema, async (validatedArgs) =>
              this.crawlHandlers.crawl(validatedArgs),
            );
          case 'manage_session':
            return await this.validateAndExecute('manage_session', args, ManageSessionSchema, async (validatedArgs) =>
              this.sessionHandlers.manageSession(validatedArgs),
            );
          case 'extract_with_llm':
            return await this.validateAndExecute(
              'extract_with_llm',
              args,
              ExtractWithLlmSchema,
              async (validatedArgs) => this.contentHandlers.extractWithLLM(validatedArgs),
            );
          default:
            throw new Error(`Unknown tool: ${name}`);
        }
      } catch (error) {
        return {
          content: [
            {
              type: 'text',
              text: `Error: ${error instanceof Error ? error.message : String(error)}`,
            },
          ],
        };
      }
    });
  }

  // Expose handler methods for testing
  protected async getMarkdown(options: Parameters<ContentHandlers['getMarkdown']>[0]) {
    return this.contentHandlers.getMarkdown(options);
  }

  protected async captureScreenshot(options: Parameters<ContentHandlers['captureScreenshot']>[0]) {
    return this.contentHandlers.captureScreenshot(options);
  }

  protected async generatePDF(options: Parameters<ContentHandlers['generatePDF']>[0]) {
    return this.contentHandlers.generatePDF(options);
  }

  protected async getHTML(options: Parameters<ContentHandlers['getHTML']>[0]) {
    return this.contentHandlers.getHTML(options);
  }

  protected async extractWithLLM(options: Parameters<ContentHandlers['extractWithLLM']>[0]) {
    return this.contentHandlers.extractWithLLM(options);
  }

  protected async executeJS(options: Parameters<UtilityHandlers['executeJS']>[0]) {
    return this.utilityHandlers.executeJS(options);
  }

  protected async extractLinks(options: Parameters<UtilityHandlers['extractLinks']>[0]) {
    return this.utilityHandlers.extractLinks(options);
  }

  protected async batchCrawl(options: Parameters<CrawlHandlers['batchCrawl']>[0]) {
    return this.crawlHandlers.batchCrawl(options);
  }

  protected async smartCrawl(options: Parameters<CrawlHandlers['smartCrawl']>[0]) {
    return this.crawlHandlers.smartCrawl(options);
  }

  protected async crawlRecursive(options: Parameters<CrawlHandlers['crawlRecursive']>[0]) {
    return this.crawlHandlers.crawlRecursive(options);
  }

  protected async parseSitemap(options: Parameters<CrawlHandlers['parseSitemap']>[0]) {
    return this.crawlHandlers.parseSitemap(options);
  }

  protected async crawl(options: Parameters<CrawlHandlers['crawl']>[0]) {
    return this.crawlHandlers.crawl(options);
  }

  // Setter for axiosClient to update all handlers (for testing)
  set axiosClientForTesting(client: AxiosInstance) {
    this.axiosClient = client;
    // Re-initialize handlers with new client
    this.contentHandlers = new ContentHandlers(this.service, client, this.sessions);
    this.sessionHandlers = new SessionHandlers(this.service, client, this.sessions);
    this.utilityHandlers = new UtilityHandlers(this.service, client, this.sessions);
    this.crawlHandlers = new CrawlHandlers(this.service, client, this.sessions);
  }

  /* istanbul ignore next */
  async start() {
    const transport = new StdioServerTransport();
    await this.server.connect(transport);
    console.error(`${this.serverName} v${this.serverVersion} started`);
  }
}
```