This is page 1 of 3. Use http://codebase.md/omgwtfwow/mcp-crawl4ai-ts?page={x} to view the full context.

# Directory Structure

```
├── .env.example
├── .github
│   ├── CI.md
│   ├── copilot-instructions.md
│   └── workflows
│       └── ci.yml
├── .gitignore
├── .prettierignore
├── .prettierrc.json
├── CHANGELOG.md
├── eslint.config.mjs
├── jest.config.cjs
├── jest.setup.cjs
├── LICENSE
├── package-lock.json
├── package.json
├── README.md
├── src
│   ├── __tests__
│   │   ├── crawl.test.ts
│   │   ├── crawl4ai-service.network.test.ts
│   │   ├── crawl4ai-service.test.ts
│   │   ├── handlers
│   │   │   ├── crawl-handlers.test.ts
│   │   │   ├── parameter-combinations.test.ts
│   │   │   ├── screenshot-saving.test.ts
│   │   │   ├── session-handlers.test.ts
│   │   │   └── utility-handlers.test.ts
│   │   ├── index.cli.test.ts
│   │   ├── index.npx.test.ts
│   │   ├── index.server.test.ts
│   │   ├── index.test.ts
│   │   ├── integration
│   │   │   ├── batch-crawl.integration.test.ts
│   │   │   ├── capture-screenshot.integration.test.ts
│   │   │   ├── crawl-advanced.integration.test.ts
│   │   │   ├── crawl-handlers.integration.test.ts
│   │   │   ├── crawl-recursive.integration.test.ts
│   │   │   ├── crawl.integration.test.ts
│   │   │   ├── execute-js.integration.test.ts
│   │   │   ├── extract-links.integration.test.ts
│   │   │   ├── extract-with-llm.integration.test.ts
│   │   │   ├── generate-pdf.integration.test.ts
│   │   │   ├── get-html.integration.test.ts
│   │   │   ├── get-markdown.integration.test.ts
│   │   │   ├── parse-sitemap.integration.test.ts
│   │   │   ├── session-management.integration.test.ts
│   │   │   ├── smart-crawl.integration.test.ts
│   │   │   └── test-utils.ts
│   │   ├── request-handler.test.ts
│   │   ├── schemas
│   │   │   └── validation-edge-cases.test.ts
│   │   ├── types
│   │   │   └── mocks.ts
│   │   └── utils
│   │       └── javascript-validation.test.ts
│   ├── crawl4ai-service.ts
│   ├── handlers
│   │   ├── base-handler.ts
│   │   ├── content-handlers.ts
│   │   ├── crawl-handlers.ts
│   │   ├── session-handlers.ts
│   │   └── utility-handlers.ts
│   ├── index.ts
│   ├── schemas
│   │   ├── helpers.ts
│   │   └── validation-schemas.ts
│   ├── server.ts
│   └── types.ts
├── tsconfig.build.json
└── tsconfig.json
```

# Files

--------------------------------------------------------------------------------
/.prettierignore:
--------------------------------------------------------------------------------

```
dist
node_modules
*.md
*.json
.env
.env.*
coverage
.nyc_output
```

--------------------------------------------------------------------------------
/.prettierrc.json:
--------------------------------------------------------------------------------

```json
{
  "semi": true,
  "trailingComma": "all",
  "singleQuote": true,
  "printWidth": 120,
  "tabWidth": 2,
  "useTabs": false,
  "bracketSpacing": true,
  "arrowParens": "always",
  "endOfLine": "lf"
}
```

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------

```
# Dependencies
node_modules/
npm-debug.log*
yarn-debug.log*
yarn-error.log*

# Build output
dist/
build/
*.js
*.js.map
*.d.ts
*.d.ts.map

# Environment
.env
.env.local
.env.*.local

# IDE
.vscode/
.idea/
*.swp
*.swo
*~

# OS
.DS_Store
Thumbs.db

# Logs
logs/
*.log

# Testing
coverage/
.nyc_output/
src/__tests__/mock-responses.json

# Temporary files
tmp/
temp/

add-to-claude.sh
```

--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------

```
# Required: URL of your Crawl4AI server
CRAWL4AI_BASE_URL=http://localhost:11235

# Optional: API key for authentication (if your server requires it)
CRAWL4AI_API_KEY=

# Optional: Custom server name and version
SERVER_NAME=crawl4ai-mcp
SERVER_VERSION=0.7.4

# Optional: For LLM extraction tests
LLM_PROVIDER=openai/gpt-4o-mini
LLM_API_TOKEN=your-llm-api-key
LLM_BASE_URL=https://api.openai.com/v1  # If using custom endpoint

```

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

```markdown
# MCP Server for Crawl4AI

> **Note:** Tested with Crawl4AI version 0.7.4

[![npm version](https://img.shields.io/npm/v/mcp-crawl4ai-ts.svg)](https://www.npmjs.com/package/mcp-crawl4ai-ts)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Node.js CI](https://img.shields.io/badge/Node.js-18+-green.svg)](https://nodejs.org/)
[![coverage](https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fomgwtfwow%2Fe2abffb0deb25afa2bf9185f440dae81%2Fraw%2Fcoverage.json&cacheSeconds=300)](https://omgwtfwow.github.io/mcp-crawl4ai-ts/coverage/)

TypeScript implementation of an MCP server for Crawl4AI. Provides tools for web crawling, content extraction, and browser automation.

## Table of Contents

- [Prerequisites](#prerequisites)
- [Quick Start](#quick-start)
- [Configuration](#configuration)
- [Client-Specific Instructions](#client-specific-instructions)
- [Available Tools](#available-tools)
  - [1. get_markdown](#1-get_markdown---extract-content-as-markdown-with-filtering)
  - [2. capture_screenshot](#2-capture_screenshot---capture-webpage-screenshot)
  - [3. generate_pdf](#3-generate_pdf---convert-webpage-to-pdf)
  - [4. execute_js](#4-execute_js---execute-javascript-and-get-return-values)
  - [5. batch_crawl](#5-batch_crawl---crawl-multiple-urls-concurrently)
  - [6. smart_crawl](#6-smart_crawl---auto-detect-and-handle-different-content-types)
  - [7. get_html](#7-get_html---get-sanitized-html-for-analysis)
  - [8. extract_links](#8-extract_links---extract-and-categorize-page-links)
  - [9. crawl_recursive](#9-crawl_recursive---deep-crawl-website-following-links)
  - [10. parse_sitemap](#10-parse_sitemap---extract-urls-from-xml-sitemaps)
  - [11. crawl](#11-crawl---advanced-web-crawling-with-full-configuration)
  - [12. manage_session](#12-manage_session---unified-session-management)
  - [13. extract_with_llm](#13-extract_with_llm---extract-structured-data-using-ai)
- [Advanced Configuration](#advanced-configuration)
- [Changelog](#changelog)
- [Development](#development)
- [License](#license)

## Prerequisites

- Node.js 18+ and npm
- A running Crawl4AI server

## Quick Start

### 1. Start the Crawl4AI server (for example, via local Docker)

```bash
docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:0.7.4
```

### 2. Add to your MCP client

This MCP server works with any MCP-compatible client (Claude Desktop, Claude Code, Cursor, LMStudio, etc.).

#### Using npx (Recommended)

```json
{
  "mcpServers": {
    "crawl4ai": {
      "command": "npx",
      "args": ["mcp-crawl4ai-ts"],
      "env": {
        "CRAWL4AI_BASE_URL": "http://localhost:11235"
      }
    }
  }
}
```

#### Using local installation

```json
{
  "mcpServers": {
    "crawl4ai": {
      "command": "node",
      "args": ["/path/to/mcp-crawl4ai-ts/dist/index.js"],
      "env": {
        "CRAWL4AI_BASE_URL": "http://localhost:11235"
      }
    }
  }
}
```

#### With all optional variables

```json
{
  "mcpServers": {
    "crawl4ai": {
      "command": "npx",
      "args": ["mcp-crawl4ai-ts"],
      "env": {
        "CRAWL4AI_BASE_URL": "http://localhost:11235",
        "CRAWL4AI_API_KEY": "your-api-key",
        "SERVER_NAME": "custom-name",
        "SERVER_VERSION": "1.0.0"
      }
    }
  }
}
```

## Configuration

### Environment Variables

```env
# Required
CRAWL4AI_BASE_URL=http://localhost:11235

# Optional - Server Configuration
CRAWL4AI_API_KEY=          # If your server requires auth
SERVER_NAME=crawl4ai-mcp   # Custom name for the MCP server
SERVER_VERSION=1.0.0       # Custom version
```

## Client-Specific Instructions

### Claude Desktop

Add the configuration to `~/Library/Application Support/Claude/claude_desktop_config.json` (macOS).

### Claude Code

```bash
claude mcp add crawl4ai -e CRAWL4AI_BASE_URL=http://localhost:11235 -- npx mcp-crawl4ai-ts
```

### Other MCP Clients

Consult your client's documentation for MCP server configuration. The key details:

- **Command**: `npx mcp-crawl4ai-ts` or `node /path/to/dist/index.js`
- **Required env**: `CRAWL4AI_BASE_URL`
- **Optional env**: `CRAWL4AI_API_KEY`, `SERVER_NAME`, `SERVER_VERSION`

## Available Tools

### 1. `get_markdown` - Extract content as markdown with filtering

```typescript
{ 
  url: string,                              // Required: URL to extract markdown from
  filter?: 'raw'|'fit'|'bm25'|'llm',        // Filter type (default: 'fit')
  query?: string,                           // Query for bm25/llm filters
  cache?: string                            // Cache-bust parameter (default: '0')
}
```

Extracts content as markdown with various filtering options. Use 'bm25' or 'llm' filters with a query for specific content extraction.
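
For example (URLs and query values here are illustrative):

```typescript
// Default 'fit' filter: cleaned-up page content
{ url: 'https://example.com/article' }

// BM25 filter: rank content blocks against a query
{ url: 'https://example.com/docs', filter: 'bm25', query: 'installation steps' }
```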

### 2. `capture_screenshot` - Capture webpage screenshot

```typescript
{ 
  url: string,                   // Required: URL to capture
  screenshot_wait_for?: number   // Seconds to wait before screenshot (default: 2)
}
```

Returns base64-encoded PNG. Note: This is stateless - for screenshots after JS execution, use `crawl` with `screenshot: true`.

### 3. `generate_pdf` - Convert webpage to PDF

```typescript
{ 
  url: string  // Required: URL to convert to PDF
}
```

Returns base64-encoded PDF. Stateless tool - for PDFs after JS execution, use `crawl` with `pdf: true`.

### 4. `execute_js` - Execute JavaScript and get return values

```typescript
{ 
  url: string,                    // Required: URL to load
  scripts: string | string[]      // Required: JavaScript to execute
}
```

Executes JavaScript and returns results. Each script can use 'return' to get values back. Stateless - for persistent JS execution use `crawl` with `js_code`.
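
For example (the page URL and scripts are illustrative):

```typescript
{
  url: 'https://example.com',
  scripts: [
    'return document.title',
    'return document.querySelectorAll("a").length'
  ]
}
```

Each script's `return` value comes back in the matching result entry.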

### 5. `batch_crawl` - Crawl multiple URLs concurrently

```typescript
{ 
  urls: string[],           // Required: List of URLs to crawl
  max_concurrent?: number,  // Parallel request limit (default: 5)
  remove_images?: boolean,  // Remove images from output (default: false)
  bypass_cache?: boolean,   // Bypass cache for all URLs (default: false)
  configs?: Array<{         // Optional: Per-URL configurations (v3.0.0+)
    url: string,
    [key: string]: any      // Any crawl parameters for this specific URL
  }>
}
```

Efficiently crawls multiple URLs in parallel. Each URL gets a fresh browser instance. With `configs` array, you can specify different parameters for each URL.
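
For example, a sketch of per-URL overrides via `configs` (URLs and options are illustrative):

```typescript
{
  urls: ['https://example.com/a', 'https://example.com/b'],
  max_concurrent: 2,
  configs: [
    { url: 'https://example.com/b', remove_images: true }
  ]
}
```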

### 6. `smart_crawl` - Auto-detect and handle different content types

```typescript
{ 
  url: string,            // Required: URL to crawl
  max_depth?: number,     // Maximum depth for recursive crawling (default: 2)
  follow_links?: boolean, // Follow links in content (default: true)
  bypass_cache?: boolean  // Bypass cache (default: false)
}
```

Intelligently detects content type (HTML/sitemap/RSS) and processes accordingly.

### 7. `get_html` - Get sanitized HTML for analysis

```typescript
{ 
  url: string  // Required: URL to extract HTML from
}
```

Returns preprocessed HTML optimized for structure analysis. Use for building schemas or analyzing patterns.

### 8. `extract_links` - Extract and categorize page links

```typescript
{ 
  url: string,          // Required: URL to extract links from
  categorize?: boolean  // Group by type (default: true)
}
```

Extracts all links and groups them by type: internal, external, social media, documents, images.

### 9. `crawl_recursive` - Deep crawl website following links

```typescript
{ 
  url: string,              // Required: Starting URL
  max_depth?: number,       // Maximum depth to crawl (default: 3)
  max_pages?: number,       // Maximum pages to crawl (default: 50)
  include_pattern?: string, // Regex pattern for URLs to include
  exclude_pattern?: string  // Regex pattern for URLs to exclude
}
```

Crawls a website following internal links up to specified depth. Returns content from all discovered pages.
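
For example, crawling only blog pages while skipping archives (patterns are illustrative):

```typescript
{
  url: 'https://example.com',
  max_depth: 2,
  max_pages: 20,
  include_pattern: '.*/blog/.*',
  exclude_pattern: '.*/archive/.*'
}
```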

### 10. `parse_sitemap` - Extract URLs from XML sitemaps

```typescript
{ 
  url: string,              // Required: Sitemap URL (e.g., /sitemap.xml)
  filter_pattern?: string   // Optional: Regex pattern to filter URLs
}
```

Extracts all URLs from XML sitemaps. Supports regex filtering for specific URL patterns.
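
For example, keeping only product pages (pattern is illustrative):

```typescript
{
  url: 'https://example.com/sitemap.xml',
  filter_pattern: '.*/products/.*'
}
```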

### 11. `crawl` - Advanced web crawling with full configuration

```typescript
{
  url: string,                              // URL to crawl
  // Browser Configuration
  browser_type?: 'chromium'|'firefox'|'webkit'|'undetected',  // Browser engine (undetected = stealth mode)
  viewport_width?: number,                  // Browser width (default: 1080)
  viewport_height?: number,                 // Browser height (default: 600)
  user_agent?: string,                      // Custom user agent
  proxy_server?: string | {                 // Proxy URL (string or object format)
    server: string,
    username?: string,
    password?: string
  },
  proxy_username?: string,                  // Proxy auth (if using string format)
  proxy_password?: string,                  // Proxy password (if using string format)
  cookies?: Array<{name, value, domain}>,   // Pre-set cookies
  headers?: Record<string,string>,          // Custom headers
  
  // Crawler Configuration
  word_count_threshold?: number,            // Min words per block (default: 200)
  excluded_tags?: string[],                 // HTML tags to exclude
  remove_overlay_elements?: boolean,        // Remove popups/modals
  js_code?: string | string[],              // JavaScript to execute
  wait_for?: string,                        // Wait condition (selector or JS)
  wait_for_timeout?: number,                // Wait timeout (default: 30000)
  delay_before_scroll?: number,             // Pre-scroll delay
  scroll_delay?: number,                    // Between-scroll delay
  process_iframes?: boolean,                // Include iframe content
  exclude_external_links?: boolean,         // Remove external links
  screenshot?: boolean,                     // Capture screenshot
  pdf?: boolean,                            // Generate PDF
  session_id?: string,                      // Reuse browser session (only works with crawl tool)
  cache_mode?: 'ENABLED'|'BYPASS'|'DISABLED',  // Cache control
  
  // New in v3.0.0 (Crawl4AI 0.7.3/0.7.4)
  css_selector?: string,                    // CSS selector to filter content
  delay_before_return_html?: number,        // Delay in seconds before returning HTML
  include_links?: boolean,                  // Include extracted links in response
  resolve_absolute_urls?: boolean,          // Convert relative URLs to absolute
  
  // LLM Extraction (REST API only supports 'llm' type)
  extraction_type?: 'llm',                  // Only 'llm' extraction is supported via REST API
  extraction_schema?: object,               // Schema for structured extraction
  extraction_instruction?: string,          // Natural language extraction prompt
  extraction_strategy?: {                   // Advanced extraction configuration
    provider?: string,
    api_key?: string,
    model?: string,
    [key: string]: any
  },
  table_extraction_strategy?: {             // Table extraction configuration
    enable_chunking?: boolean,
    thresholds?: object,
    [key: string]: any
  },
  markdown_generator_options?: {            // Markdown generation options
    include_links?: boolean,
    preserve_formatting?: boolean,
    [key: string]: any
  },
  
  timeout?: number,                         // Overall timeout (default: 60000)
  verbose?: boolean                         // Detailed logging
}
```
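
For example, a sketch of a stateful call that reuses a session, clicks through a page, and captures a screenshot (URL, selector, and session name are illustrative):

```typescript
{
  url: 'https://example.com/login',
  session_id: 'my-session',          // created earlier via manage_session
  js_code: ['document.querySelector("#next")?.click()'],
  wait_for: '#dashboard',
  screenshot: true,
  cache_mode: 'BYPASS'
}
```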

### 12. `manage_session` - Unified session management

```typescript
{ 
  action: 'create' | 'clear' | 'list',    // Required: Action to perform
  session_id?: string,                    // For 'create' and 'clear' actions
  initial_url?: string,                   // For 'create' action: URL to load
  browser_type?: 'chromium' | 'firefox' | 'webkit' | 'undetected'  // For 'create' action
}
```

Unified tool for managing browser sessions. Supports three actions:
- **create**: Start a persistent browser session
- **clear**: Remove a session from local tracking
- **list**: Show all active sessions

Examples:
```typescript
// Create a new session
{ action: 'create', session_id: 'my-session', initial_url: 'https://example.com' }

// Clear a session
{ action: 'clear', session_id: 'my-session' }

// List all sessions
{ action: 'list' }
```

### 13. `extract_with_llm` - Extract structured data using AI

```typescript
{ 
  url: string,          // URL to extract data from
  query: string         // Natural language extraction instructions
}
```

Uses AI to extract structured data from webpages. Returns results immediately without any polling or job management. This is the recommended way to extract specific information since CSS/XPath extraction is not supported via the REST API.
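
For example (URL and query are illustrative):

```typescript
{
  url: 'https://example.com/pricing',
  query: 'List each plan name with its monthly price'
}
```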

## Advanced Configuration

For detailed information about all available configuration options, extraction strategies, and advanced features, please refer to the official Crawl4AI documentation:

- [Crawl4AI Documentation](https://docs.crawl4ai.com/)
- [Crawl4AI GitHub Repository](https://github.com/unclecode/crawl4ai)

## Changelog

See [CHANGELOG.md](CHANGELOG.md) for detailed version history and recent updates.

## Development

### Setup

```bash
# 1. Start the Crawl4AI server
docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:latest

# 2. Install MCP server
git clone https://github.com/omgwtfwow/mcp-crawl4ai-ts.git
cd mcp-crawl4ai-ts
npm install
cp .env.example .env

# 3. Development commands
npm run dev    # Development mode
npm test       # Run tests
npm run lint   # Check code quality
npm run build  # Production build

# 4. Add to your MCP client (See "Using local installation")
```

### Running Integration Tests

Integration tests require a running Crawl4AI server. Configure your environment:

```bash
# Required for integration tests
export CRAWL4AI_BASE_URL=http://localhost:11235
export CRAWL4AI_API_KEY=your-api-key  # If authentication is required

# Optional: For LLM extraction tests
export LLM_PROVIDER=openai/gpt-4o-mini
export LLM_API_TOKEN=your-llm-api-key
export LLM_BASE_URL=https://api.openai.com/v1  # If using custom endpoint

# Run integration tests (ALWAYS use the npm script; don't call `jest` directly)
npm run test:integration

# Run a single integration test file
npm run test:integration -- src/__tests__/integration/extract-links.integration.test.ts

```

> **IMPORTANT:** Do NOT run `npx jest` directly for integration tests. The npm script injects `NODE_OPTIONS=--experimental-vm-modules`, which is required for ESM + ts-jest; running Jest directly produces `SyntaxError: Cannot use import statement outside a module` and hangs.

Integration tests cover:

- Dynamic content and JavaScript execution
- Session management and cookies
- Content extraction (LLM-based only)
- Media handling (screenshots, PDFs)
- Performance and caching
- Content filtering
- Bot detection avoidance
- Error handling

### Integration Test Checklist
1. Docker container healthy:
   ```bash
   docker ps --filter name=crawl4ai --format '{{.Names}} {{.Status}}'
   curl -sf http://localhost:11235/health || echo "Health check failed"
   ```
2. Env vars loaded (either exported or in `.env`): `CRAWL4AI_BASE_URL` (required), optional: `CRAWL4AI_API_KEY`, `LLM_PROVIDER`, `LLM_API_TOKEN`, `LLM_BASE_URL`.
3. Use `npm run test:integration` (never raw `jest`).
4. To target one file add it after `--` (see example above).
5. Expect total runtime ~2–3 minutes; longer or immediate hang usually means missing `NODE_OPTIONS` or wrong Jest version.

### Troubleshooting
| Symptom | Likely Cause | Fix |
|---------|--------------|-----|
| `SyntaxError: Cannot use import statement outside a module` | Ran `jest` directly without script flags | Re-run with `npm run test:integration` |
| Hangs on first test (RUNS ...) | Missing experimental VM modules flag | Use npm script / ensure `NODE_OPTIONS=--experimental-vm-modules` |
| Network timeouts | Crawl4AI container not healthy / DNS blocked | Restart container: `docker restart <name>` |
| LLM tests skipped | Missing `LLM_PROVIDER` or `LLM_API_TOKEN` | Export required LLM vars |
| New Jest major upgrade breaks tests | Version mismatch with `ts-jest` | Keep Jest 29.x unless `ts-jest` upgraded accordingly |

### Version Compatibility Note
Current stack: `jest@^29.7.0` + `ts-jest@^29.4.0` + ESM (`"type": "module"`). Updating Jest to 30+ requires upgrading `ts-jest` and revisiting `jest.config.cjs`. Keep versions aligned to avoid parse errors.

## License

MIT
```

--------------------------------------------------------------------------------
/tsconfig.build.json:
--------------------------------------------------------------------------------

```json
{
  "extends": "./tsconfig.json",
  "exclude": [
    "node_modules",
    "dist",
    "src/**/*.test.ts",
    "src/__tests__/**/*"
  ]
}
```

--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------

```json
{
  "compilerOptions": {
    "target": "ES2022",
    "module": "NodeNext",
    "moduleResolution": "NodeNext",
    "lib": ["ES2022"],
    "outDir": "./dist",
    "rootDir": "./src",
    "strict": true,
    "esModuleInterop": true,
    "skipLibCheck": true,
    "forceConsistentCasingInFileNames": true,
    "resolveJsonModule": true,
    "declaration": true,
    "declarationMap": true,
    "sourceMap": true,
    "isolatedModules": true
  },
  "include": ["src/**/*"],
  "exclude": ["node_modules", "dist"]
}
```

--------------------------------------------------------------------------------
/jest.setup.cjs:
--------------------------------------------------------------------------------

```
// Load dotenv for integration tests
const dotenv = require('dotenv');
const path = require('path');

// The npm script sets an env var to identify integration tests
const isIntegrationTest = process.env.JEST_TEST_TYPE === 'integration';

if (isIntegrationTest) {
  // For integration tests, load from .env file
  dotenv.config({ path: path.resolve(__dirname, '.env') });
  
  // For integration tests, we MUST have proper environment variables
  // No fallback to localhost - tests should fail if not configured
} else {
  // For unit tests, always use localhost
  process.env.CRAWL4AI_BASE_URL = 'http://localhost:11235';
  process.env.CRAWL4AI_API_KEY = 'test-api-key';
}
```

--------------------------------------------------------------------------------
/jest.config.cjs:
--------------------------------------------------------------------------------

```
/** @type {import('jest').Config} */
module.exports = {
  preset: 'ts-jest/presets/default-esm',
  testEnvironment: 'node',
  roots: ['<rootDir>/src'],
  testMatch: ['**/__tests__/**/*.test.ts'],
  setupFiles: ['<rootDir>/jest.setup.cjs'],
  collectCoverageFrom: [
    'src/**/*.ts',
    '!src/**/__tests__/**',
    '!src/**/*.test.ts',
    '!src/**/types/**',
  ],
  coverageDirectory: 'coverage',
  coverageReporters: ['text', 'lcov', 'html', 'json'],
  moduleNameMapper: {
    '^(\\.{1,2}/.*)\\.js$': '$1',
  },
  transform: {
    '^.+\\.tsx?$': [
      'ts-jest',
      {
        useESM: true,
      },
    ],
  },
  extensionsToTreatAsEsm: ['.ts'],
  clearMocks: true,
  // Limit parallelization for integration tests to avoid overwhelming the server
  ...(process.env.NODE_ENV === 'test' && process.argv.some(arg => arg.includes('integration')) ? { maxWorkers: 1 } : {}),
};
```

--------------------------------------------------------------------------------
/src/__tests__/types/mocks.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import type { AxiosResponse } from 'axios';

/**
 * Mock axios instance for testing HTTP client behavior
 */
export interface MockAxiosInstance {
  post: jest.Mock<Promise<AxiosResponse>>;
  get: jest.Mock<Promise<AxiosResponse>>;
  head: jest.Mock<Promise<AxiosResponse>>;
  put?: jest.Mock<Promise<AxiosResponse>>;
  delete?: jest.Mock<Promise<AxiosResponse>>;
  patch?: jest.Mock<Promise<AxiosResponse>>;
}

/**
 * Mock function type that returns a promise with content array
 */
type MockFunction = jest.Mock<Promise<{ content: TestContent }>>;

/**
 * Mock server interface for MCP server testing
 */
export interface MockMCPServer {
  listTools: MockFunction;
  callTool: MockFunction;
  listResources?: MockFunction;
  readResource?: MockFunction;
  listPrompts?: MockFunction;
  getPrompt?: MockFunction;
}

/**
 * Type for test content arrays used in MCP responses
 */
export type TestContent = Array<{
  type: string;
  text?: string;
  resource?: {
    uri: string;
    mimeType: string;
    blob?: string;
  };
}>;

/**
 * Generic test response type
 */
export interface TestResponse<T = unknown> {
  content: TestContent;
  data?: T;
  error?: string;
}

```

--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------

```typescript
#!/usr/bin/env node

import { Crawl4AIServer } from './server.js';

// Try to load dotenv only in development
// In production (via npx), env vars come from the MCP client
try {
  // Only try to load dotenv if CRAWL4AI_BASE_URL is not set
  if (!process.env.CRAWL4AI_BASE_URL) {
    const dotenv = await import('dotenv');
    dotenv.config();
  }
} catch {
  // dotenv is not available in production, which is expected
}

const CRAWL4AI_BASE_URL = process.env.CRAWL4AI_BASE_URL;
const CRAWL4AI_API_KEY = process.env.CRAWL4AI_API_KEY || '';
const SERVER_NAME = process.env.SERVER_NAME || 'crawl4ai-mcp';
const SERVER_VERSION = process.env.SERVER_VERSION || '1.0.0';

if (!CRAWL4AI_BASE_URL) {
  console.error('Error: CRAWL4AI_BASE_URL environment variable is required');
  console.error('Please set it to your Crawl4AI server URL (e.g., http://localhost:8080)');
  process.exit(1);
}

// Always start the server when this script is executed
// This script is meant to be run as an MCP server
const server = new Crawl4AIServer(CRAWL4AI_BASE_URL, CRAWL4AI_API_KEY, SERVER_NAME, SERVER_VERSION);
server.start().catch((err) => {
  console.error('Server failed to start:', err);
  process.exit(1);
});

```

--------------------------------------------------------------------------------
/.github/CI.md:
--------------------------------------------------------------------------------

```markdown
# GitHub Actions CI/CD

This project uses GitHub Actions for continuous integration.

## Workflows

### CI (`ci.yml`)
Runs on every push to main and on pull requests:
- Linting (ESLint)
- Code formatting check (Prettier)
- Build (TypeScript compilation)
- Unit tests (with nock mocks)
- Test coverage report

Tests run on Node.js 18.x and 20.x.

## Mock Maintenance

The unit tests use [nock](https://github.com/nock/nock) for HTTP mocking. This provides:
- Fast test execution (~1 second)
- Predictable test results
- No external dependencies during CI

**How to update mocks:**

Option 1 - Generate mock code from real API:
```bash
# This will call the real API and generate nock mock code
CRAWL4AI_API_KEY=your-key npm run generate-mocks
```

Option 2 - View API responses as JSON:
```bash
# This will save responses to mock-responses.json for inspection
CRAWL4AI_API_KEY=your-key npm run view-mocks
```

Option 3 - Manual update:
1. Run integration tests to see current API behavior: `npm run test:integration`
2. Update the mock responses in `src/__tests__/crawl4ai-service.test.ts`
3. Ensure unit tests pass: `npm run test:unit`

The mocks are intentionally simple and focus on testing our code's behavior, not the API's exact responses.

## Running Tests Locally

```bash
# Run all tests
npm test

# Run only unit tests (fast, with mocks)
npm run test:unit

# Run only integration tests (slow, real API)
npm run test:integration

# Run with coverage
npm run test:coverage
```
```

--------------------------------------------------------------------------------
/src/handlers/base-handler.ts:
--------------------------------------------------------------------------------

```typescript
import { Crawl4AIService } from '../crawl4ai-service.js';
import { AxiosInstance } from 'axios';

// Error handling types
export interface ErrorWithResponse {
  response?: {
    data?:
      | {
          detail?: string;
        }
      | string
      | unknown;
  };
  message?: string;
}

export interface SessionInfo {
  id: string;
  created_at: Date;
  last_used: Date;
  initial_url?: string;
  metadata?: Record<string, unknown>;
}

export abstract class BaseHandler {
  protected service: Crawl4AIService;
  protected axiosClient: AxiosInstance;
  protected sessions: Map<string, SessionInfo>;

  constructor(service: Crawl4AIService, axiosClient: AxiosInstance, sessions: Map<string, SessionInfo>) {
    this.service = service;
    this.axiosClient = axiosClient;
    this.sessions = sessions;
  }

  protected formatError(error: unknown, operation: string): Error {
    const errorWithResponse = error as ErrorWithResponse;
    let errorMessage = '';

    const data = errorWithResponse.response?.data;
    if (typeof data === 'object' && data && 'detail' in data) {
      errorMessage = (data as { detail: string }).detail;
    } else if (data) {
      // If data is an object, stringify it
      errorMessage = typeof data === 'object' ? JSON.stringify(data) : String(data);
    } else if (error instanceof Error) {
      errorMessage = error.message;
    } else {
      errorMessage = String(error);
    }

    return new Error(`Failed to ${operation}: ${errorMessage}`);
  }
}

```

--------------------------------------------------------------------------------
/eslint.config.mjs:
--------------------------------------------------------------------------------

```
import eslint from '@eslint/js';
import tseslint from '@typescript-eslint/eslint-plugin';
import tsparser from '@typescript-eslint/parser';
import prettier from 'eslint-config-prettier';
import prettierPlugin from 'eslint-plugin-prettier';

export default [
  eslint.configs.recommended,
  prettier,
  {
    files: ['src/**/*.ts'],
    languageOptions: {
      parser: tsparser,
      parserOptions: {
        project: './tsconfig.json',
        ecmaVersion: 'latest',
        sourceType: 'module',
      },
      globals: {
        console: 'readonly',
        process: 'readonly',
        Buffer: 'readonly',
        __dirname: 'readonly',
        __filename: 'readonly',
        setTimeout: 'readonly',
        clearTimeout: 'readonly',
        setInterval: 'readonly',
        clearInterval: 'readonly',
        URL: 'readonly',
      },
    },
    plugins: {
      '@typescript-eslint': tseslint,
      prettier: prettierPlugin,
    },
    rules: {
      ...tseslint.configs.recommended.rules,
      '@typescript-eslint/explicit-function-return-type': 'off',
      '@typescript-eslint/explicit-module-boundary-types': 'off',
      '@typescript-eslint/no-explicit-any': 'warn',
      '@typescript-eslint/no-unused-vars': [
        'error',
        {
          argsIgnorePattern: '^_',
          varsIgnorePattern: '^_',
        },
      ],
      '@typescript-eslint/no-misused-promises': [
        'error',
        {
          checksVoidReturn: false,
        },
      ],
      'prettier/prettier': 'error',
    },
  },
  {
    files: ['src/**/*.test.ts', 'src/**/*.integration.test.ts', 'src/**/test-utils.ts', 'src/__tests__/types/*.ts'],
    languageOptions: {
      globals: {
        describe: 'readonly',
        it: 'readonly',
        expect: 'readonly',
        beforeEach: 'readonly',
        afterEach: 'readonly',
        beforeAll: 'readonly',
        afterAll: 'readonly',
        jest: 'readonly',
      },
    },
  },
  {
    ignores: ['dist/**', 'node_modules/**', '*.js', '*.mjs', '*.cjs', 'coverage/**'],
  },
];
```

--------------------------------------------------------------------------------
/src/schemas/helpers.ts:
--------------------------------------------------------------------------------

```typescript
import { z } from 'zod';

// Helper to validate JavaScript code
export const validateJavaScriptCode = (code: string): boolean => {
  // Check for common HTML entities that shouldn't be in JS
  if (/&quot;|&amp;|&lt;|&gt;|&#\d+;|&\w+;/.test(code)) {
    return false;
  }

  // Basic check to ensure it's not HTML
  if (/<(!DOCTYPE|html|body|head|script|style)\b/i.test(code)) {
    return false;
  }

  // Check for literal \n, \t, \r outside of strings (common LLM mistake)
  // This is tricky - we'll check if the code has these patterns in a way that suggests
  // they're meant to be actual newlines/tabs rather than escape sequences in strings
  // Look for patterns like: ;\n or }\n or )\n which suggest literal newlines
  if (/[;})]\s*\\n|\\n\s*[{(/]/.test(code)) {
    return false;
  }

  // Check for obvious cases of literal \n between statements
  if (/[;})]\s*\\n\s*\w/.test(code)) {
    return false;
  }

  return true;
};

// Helper to create schema that rejects session_id
export const createStatelessSchema = <T extends z.ZodObject<z.ZodRawShape>>(schema: T, toolName: string) => {
  // Tool-specific guidance for common scenarios
  const toolGuidance: Record<string, string> = {
    capture_screenshot: 'To capture screenshots with sessions, use crawl(session_id, screenshot: true)',
    generate_pdf: 'To generate PDFs with sessions, use crawl(session_id, pdf: true)',
    execute_js: 'To run JavaScript with sessions, use crawl(session_id, js_code: [...])',
    get_html: 'To get HTML with sessions, use crawl(session_id)',
    extract_with_llm: 'To extract data with sessions, first use crawl(session_id) then extract from the response',
  };

  const message = `${toolName} does not support session_id. This tool is stateless - each call creates a new browser. ${
    toolGuidance[toolName] || 'For persistent operations, use crawl with session_id.'
  }`;

  return z
    .object({
      session_id: z.never({ message }).optional(),
    })
    .passthrough()
    .and(schema)
    .transform((data) => {
      const { session_id, ...rest } = data;
      if (session_id !== undefined) {
        throw new Error(message);
      }
      return rest as z.infer<T>;
    });
};

```

--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------

```yaml
name: CI

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

permissions:
  contents: write
  pages: write
  id-token: write

jobs:
  test:
    runs-on: ubuntu-latest
    
    strategy:
      matrix:
        node-version: [18.x, 20.x, 22.x]
    
    steps:
    - uses: actions/checkout@v4
    
    - name: Use Node.js ${{ matrix.node-version }}
      uses: actions/setup-node@v4
      with:
        node-version: ${{ matrix.node-version }}
        cache: 'npm'
    
    - name: Install dependencies
      run: npm ci
    
    - name: Run linter
      run: npm run lint
    
    - name: Check formatting
      run: npm run format:check
    
    - name: Build
      run: npm run build
    
    - name: Run unit tests
      run: npm run test:unit
    
    - name: Generate coverage report
      if: matrix.node-version == '18.x'
      run: npm run test:coverage -- --testPathIgnorePatterns=integration --testPathIgnorePatterns=examples
    
    - name: Upload coverage reports
      if: matrix.node-version == '18.x'
      uses: actions/upload-artifact@v4
      with:
        name: coverage-report
        path: coverage/
    
    - name: Update coverage gist
      if: matrix.node-version == '18.x'
      env:
        GIST_SECRET: ${{ secrets.GIST_SECRET }}
      run: |
        # Extract coverage percentage from lcov.info
        COVERAGE=$(awk -F: '/^SF:/{files++} /^LF:/{lines+=$2} /^LH:/{hits+=$2} END {printf "%.0f", (hits/lines)*100}' coverage/lcov.info)
        
        # Determine color based on coverage
        if [ $COVERAGE -ge 90 ]; then COLOR="brightgreen"
        elif [ $COVERAGE -ge 70 ]; then COLOR="green"
        elif [ $COVERAGE -ge 50 ]; then COLOR="yellow"
        elif [ $COVERAGE -ge 30 ]; then COLOR="orange"
        else COLOR="red"; fi
        
        # Update gist
        echo "{\"schemaVersion\":1,\"label\":\"coverage\",\"message\":\"${COVERAGE}%\",\"color\":\"${COLOR}\"}" > coverage.json
        gh auth login --with-token <<< "$GIST_SECRET"
        gh gist edit e2abffb0deb25afa2bf9185f440dae81 coverage.json
    
    - name: Deploy coverage to GitHub Pages
      if: matrix.node-version == '18.x' && github.ref == 'refs/heads/main'
      uses: peaceiris/actions-gh-pages@v4
      with:
        github_token: ${{ secrets.GITHUB_TOKEN }}
        publish_dir: ./coverage/lcov-report
        destination_dir: coverage
```

--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------

```json
{
  "name": "mcp-crawl4ai-ts",
  "version": "3.0.2",
  "description": "TypeScript MCP server for Crawl4AI - web crawling and content extraction",
  "main": "dist/index.js",
  "bin": {
    "mcp-crawl4ai-ts": "dist/index.js"
  },
  "type": "module",
  "engines": {
    "node": ">=18.0.0"
  },
  "scripts": {
    "build": "tsc -p tsconfig.build.json",
    "start": "node dist/index.js",
    "dev": "tsx src/index.ts",
    "test": "NODE_OPTIONS=--experimental-vm-modules jest",
    "test:watch": "NODE_OPTIONS=--experimental-vm-modules jest --watch",
    "test:coverage": "NODE_OPTIONS=--experimental-vm-modules jest --coverage",
    "test:unit": "NODE_OPTIONS=--experimental-vm-modules jest --testPathIgnorePatterns=integration --testPathIgnorePatterns=examples",
    "test:integration": "JEST_TEST_TYPE=integration NODE_OPTIONS=--experimental-vm-modules jest src/__tests__/integration",
    "test:ci": "NODE_OPTIONS=--experimental-vm-modules jest --coverage --maxWorkers=2",
    "lint": "eslint src --ext .ts",
    "lint:fix": "eslint src --ext .ts --fix",
    "format": "prettier --write \"src/**/*.ts\"",
    "format:check": "prettier --check \"src/**/*.ts\"",
    "check": "npm run lint && npm run format:check && npm run build"
  },
  "keywords": [
    "mcp",
    "crawl4ai",
    "web-scraping",
    "markdown",
    "pdf",
    "screenshot"
  ],
  "author": "Juan González Cano",
  "license": "MIT",
  "repository": {
    "type": "git",
    "url": "git+https://github.com/omgwtfwow/mcp-crawl4ai-ts.git"
  },
  "bugs": {
    "url": "https://github.com/omgwtfwow/mcp-crawl4ai-ts/issues"
  },
  "homepage": "https://github.com/omgwtfwow/mcp-crawl4ai-ts#readme",
  "files": [
    "dist/**/*",
    "README.md",
    "LICENSE"
  ],
  "dependencies": {
    "@modelcontextprotocol/sdk": "^1.0.4",
    "axios": "^1.7.9",
    "dotenv": "^16.4.7",
    "zod": "^3.25.76"
  },
  "devDependencies": {
    "@eslint/js": "^9.32.0",
  "@jest/globals": "^29.7.0",
  "@types/jest": "^29.5.12",
    "@types/nock": "^10.0.3",
    "@types/node": "^22.10.6",
    "@typescript-eslint/eslint-plugin": "^8.38.0",
    "@typescript-eslint/parser": "^8.38.0",
    "diff": "^8.0.2",
    "eslint": "^9.32.0",
    "eslint-config-prettier": "^10.1.8",
    "eslint-plugin-prettier": "^5.5.3",
  "jest": "^29.7.0",
    "nock": "^14.0.8",
    "prettier": "^3.6.2",
    "ts-jest": "^29.4.0",
    "tsx": "^4.19.2",
    "typescript": "^5.7.3"
  }
}

```

--------------------------------------------------------------------------------
/src/__tests__/handlers/session-handlers.test.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import { jest } from '@jest/globals';
import { AxiosError } from 'axios';
import type { SessionHandlers as SessionHandlersType } from '../../handlers/session-handlers.js';

// Mock axios before importing SessionHandlers
const mockPost = jest.fn();
const mockAxiosClient = {
  post: mockPost,
};

// Mock the service
const mockService = {} as unknown;

// Import after setting up mocks
const { SessionHandlers } = await import('../../handlers/session-handlers.js');

describe('SessionHandlers', () => {
  let handler: SessionHandlersType;
  let sessions: Map<string, unknown>;

  beforeEach(() => {
    jest.clearAllMocks();
    sessions = new Map();
    handler = new SessionHandlers(mockService, mockAxiosClient as unknown, sessions);
  });

  describe('createSession', () => {
    it('should handle initial crawl failure gracefully', async () => {
      // Mock failed crawl
      mockPost.mockRejectedValue(
        new AxiosError('Request failed with status code 500', 'ERR_BAD_RESPONSE', undefined, undefined, {
          status: 500,
          statusText: 'Internal Server Error',
          data: 'Internal Server Error',
          headers: {},
          config: {} as unknown,
        } as unknown),
      );

      const options = {
        initial_url: 'https://this-domain-definitely-does-not-exist-12345.com',
        browser_type: 'chromium' as const,
      };

      // Create session with initial_url that will fail
      const result = await handler.createSession(options);

      // Session should still be created
      expect(result.content[0].type).toBe('text');
      expect(result.content[0].text).toContain('Session created successfully');
      expect(result.content[0].text).toContain(
        'Pre-warmed with: https://this-domain-definitely-does-not-exist-12345.com',
      );
      expect(result.session_id).toBeDefined();
      expect(result.browser_type).toBe('chromium');

      // Verify crawl was attempted
      expect(mockPost).toHaveBeenCalledWith(
        '/crawl',
        {
          urls: ['https://this-domain-definitely-does-not-exist-12345.com'],
          browser_config: {
            headless: true,
            browser_type: 'chromium',
          },
          crawler_config: {
            session_id: expect.stringMatching(/^session-/),
            cache_mode: 'BYPASS',
          },
        },
        {
          timeout: 30000,
        },
      );

      // Verify session was stored locally
      expect(sessions.size).toBe(1);
      const session = sessions.get(result.session_id);
      expect(session).toBeDefined();
      expect(session.initial_url).toBe('https://this-domain-definitely-does-not-exist-12345.com');
    });

    it('should not attempt crawl when no initial_url provided', async () => {
      const result = await handler.createSession({});

      // Session should be created without crawl
      expect(result.content[0].text).toContain('Session created successfully');
      expect(result.content[0].text).toContain('Ready for use');
      expect(result.content[0].text).not.toContain('Pre-warmed');

      // Verify no crawl was attempted
      expect(mockPost).not.toHaveBeenCalled();
    });
  });
});

```

--------------------------------------------------------------------------------
/src/__tests__/integration/generate-pdf.integration.test.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js';

interface ToolResult {
  content: Array<{
    type: string;
    text?: string;
    resource?: {
      uri: string;
      mimeType?: string;
      blob?: string;
    };
  }>;
}

describe('generate_pdf Integration Tests', () => {
  let client: Client;

  beforeAll(async () => {
    client = await createTestClient();
  }, TEST_TIMEOUTS.medium);

  afterAll(async () => {
    if (client) {
      await cleanupTestClient(client);
    }
  });

  describe('PDF generation', () => {
    it(
      'should generate PDF from URL',
      async () => {
        const result = await client.callTool({
          name: 'generate_pdf',
          arguments: {
            url: 'https://httpbin.org/html',
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content).toHaveLength(2);

        // First item should be the PDF as embedded resource
        expect(content[0].type).toBe('resource');
        expect(content[0].resource).toBeDefined();
        expect(content[0].resource?.mimeType).toBe('application/pdf');
        expect(content[0].resource?.blob).toBeTruthy();
        expect(content[0].resource?.blob?.length).toBeGreaterThan(1000); // Should be a substantial base64 string
        expect(content[0].resource?.uri).toContain('data:application/pdf');

        // Second item should be text description
        expect(content[1].type).toBe('text');
        expect(content[1].text).toContain('PDF generated for: https://httpbin.org/html');
      },
      TEST_TIMEOUTS.long,
    );

    it(
      'should reject session_id parameter',
      async () => {
        const result = await client.callTool({
          name: 'generate_pdf',
          arguments: {
            url: 'https://httpbin.org/html',
            session_id: 'test-session',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');
        expect(content[0].text).toContain('session_id');
        expect(content[0].text).toContain('does not support');
        expect(content[0].text).toContain('stateless');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle invalid URLs gracefully',
      async () => {
        const result = await client.callTool({
          name: 'generate_pdf',
          arguments: {
            url: 'not-a-valid-url',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');
        expect(content[0].text).toContain('Error');
        expect(content[0].text?.toLowerCase()).toContain('invalid');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle non-existent domains',
      async () => {
        const result = await client.callTool({
          name: 'generate_pdf',
          arguments: {
            url: 'https://this-domain-definitely-does-not-exist-123456789.com',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');
        expect(content[0].text).toContain('Error');
      },
      TEST_TIMEOUTS.short,
    );
  });
});

```

--------------------------------------------------------------------------------
/src/__tests__/integration/session-management.integration.test.ts:
--------------------------------------------------------------------------------

```typescript
import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js';
import { Client } from '@modelcontextprotocol/sdk/client/index.js';

interface ToolResult {
  content: Array<{
    type: string;
    text?: string;
  }>;
  session_id?: string;
  browser_type?: string;
  initial_url?: string;
  created_at?: string;
}

describe('Session Management Integration Tests', () => {
  let client: Client;
  const createdSessions: string[] = [];

  beforeAll(async () => {
    client = await createTestClient();
  }, TEST_TIMEOUTS.medium);

  afterEach(async () => {
    // Clean up any sessions created during tests
    for (const sessionId of createdSessions) {
      try {
        await client.callTool({
          name: 'manage_session',
          arguments: { action: 'clear', session_id: sessionId },
        });
      } catch (e) {
        // Ignore errors during cleanup
        console.debug('Cleanup error:', e);
      }
    }
    createdSessions.length = 0;
  });

  afterAll(async () => {
    if (client) {
      await cleanupTestClient(client);
    }
  });

  describe('manage_session', () => {
    it(
      'should create session with auto-generated ID using manage_session',
      async () => {
        const result = await client.callTool({
          name: 'manage_session',
          arguments: { action: 'create' },
        });

        expect(result).toBeDefined();
        const typedResult = result as ToolResult;
        expect(typedResult.content).toBeDefined();
        expect(Array.isArray(typedResult.content)).toBe(true);

        const textContent = typedResult.content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
        expect(textContent?.text).toContain('Session created successfully');

        // Check returned parameters
        expect(typedResult.session_id).toBeDefined();
        expect(typedResult.session_id).toMatch(/^session-/);
        expect(typedResult.browser_type).toBe('chromium');

        // Track for cleanup
        createdSessions.push(typedResult.session_id!);
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should clear session using manage_session',
      async () => {
        // First create a session
        const createResult = await client.callTool({
          name: 'manage_session',
          arguments: {
            action: 'create',
            session_id: 'test-to-clear',
          },
        });

        const typedCreateResult = createResult as ToolResult;
        createdSessions.push(typedCreateResult.session_id!);

        // Then clear it
        const clearResult = await client.callTool({
          name: 'manage_session',
          arguments: {
            action: 'clear',
            session_id: 'test-to-clear',
          },
        });

        const typedClearResult = clearResult as ToolResult;
        expect(typedClearResult.content[0].text).toContain('Session cleared successfully');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should list sessions using manage_session',
      async () => {
        // Create a session first
        const createResult = await client.callTool({
          name: 'manage_session',
          arguments: {
            action: 'create',
            session_id: 'test-list-session',
          },
        });

        const typedCreateResult = createResult as ToolResult;
        createdSessions.push(typedCreateResult.session_id!);

        // List sessions
        const listResult = await client.callTool({
          name: 'manage_session',
          arguments: { action: 'list' },
        });

        const typedListResult = listResult as ToolResult;
        expect(typedListResult.content[0].text).toContain('Active sessions');
        expect(typedListResult.content[0].text).toContain('test-list-session');
      },
      TEST_TIMEOUTS.short,
    );
  });
});

```

--------------------------------------------------------------------------------
/src/__tests__/integration/get-html.integration.test.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js';

interface ToolResult {
  content: Array<{
    type: string;
    text?: string;
  }>;
}

describe('get_html Integration Tests', () => {
  let client: Client;

  beforeAll(async () => {
    client = await createTestClient();
  }, TEST_TIMEOUTS.medium);

  afterAll(async () => {
    if (client) {
      await cleanupTestClient(client);
    }
  });

  describe('HTML extraction', () => {
    it(
      'should extract HTML from URL',
      async () => {
        const result = await client.callTool({
          name: 'get_html',
          arguments: {
            url: 'https://httpbin.org/html',
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');

        // Should contain processed HTML
        const html = content[0].text || '';
        expect(html).toBeTruthy();
        // The HTML endpoint returns sanitized/processed HTML
        // It might be truncated with "..."
        expect(html.length).toBeGreaterThan(0);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should reject session_id parameter',
      async () => {
        const result = await client.callTool({
          name: 'get_html',
          arguments: {
            url: 'https://example.com',
            session_id: 'test-session',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');
        expect(content[0].text).toContain('session_id');
        expect(content[0].text).toContain('does not support');
        expect(content[0].text).toContain('stateless');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle invalid URLs gracefully',
      async () => {
        const result = await client.callTool({
          name: 'get_html',
          arguments: {
            url: 'not-a-valid-url',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');
        expect(content[0].text).toContain('Error');
        expect(content[0].text?.toLowerCase()).toContain('invalid');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle non-existent domains',
      async () => {
        const result = await client.callTool({
          name: 'get_html',
          arguments: {
            url: 'https://this-domain-definitely-does-not-exist-123456789.com',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');

        // According to spec, returns success: true with empty HTML for invalid URLs
        const html = content[0].text || '';
        // Could be empty or contain an error message
        expect(typeof html).toBe('string');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should ignore extra parameters',
      async () => {
        const result = await client.callTool({
          name: 'get_html',
          arguments: {
            url: 'https://example.com',
            wait_for: '.some-selector', // Should be ignored
            bypass_cache: true, // Should be ignored
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');

        // Should still work, ignoring extra params
        const html = content[0].text || '';
        expect(html.length).toBeGreaterThan(0);
      },
      TEST_TIMEOUTS.long,
    );
  });
});

```

--------------------------------------------------------------------------------
/src/__tests__/integration/capture-screenshot.integration.test.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js';

interface ToolResult {
  content: Array<{
    type: string;
    text?: string;
    data?: string;
    mimeType?: string;
  }>;
}

describe('capture_screenshot Integration Tests', () => {
  let client: Client;

  beforeAll(async () => {
    client = await createTestClient();
  }, TEST_TIMEOUTS.medium);

  afterAll(async () => {
    if (client) {
      await cleanupTestClient(client);
    }
  });

  describe('Screenshot capture', () => {
    it(
      'should capture screenshot with default wait time',
      async () => {
        const result = await client.callTool({
          name: 'capture_screenshot',
          arguments: {
            url: 'https://httpbin.org/html',
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content).toHaveLength(2);

        // First item should be the image
        expect(content[0].type).toBe('image');
        expect(content[0].mimeType).toBe('image/png');
        expect(content[0].data).toBeTruthy();
        expect(content[0].data?.length).toBeGreaterThan(1000); // Should be a substantial base64 string

        // Second item should be text description
        expect(content[1].type).toBe('text');
        expect(content[1].text).toContain('Screenshot captured for: https://httpbin.org/html');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should capture screenshot with custom wait time',
      async () => {
        const result = await client.callTool({
          name: 'capture_screenshot',
          arguments: {
            url: 'https://httpbin.org/html',
            screenshot_wait_for: 0.5, // Reduced from 3 seconds
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content).toHaveLength(2);

        // First item should be the image
        expect(content[0].type).toBe('image');
        expect(content[0].mimeType).toBe('image/png');
        expect(content[0].data).toBeTruthy();

        // Second item should be text description
        expect(content[1].type).toBe('text');
        expect(content[1].text).toContain('Screenshot captured for: https://httpbin.org/html');
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should reject session_id parameter',
      async () => {
        const result = await client.callTool({
          name: 'capture_screenshot',
          arguments: {
            url: 'https://example.com',
            session_id: 'test-session',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');
        expect(content[0].text).toContain('session_id');
        expect(content[0].text).toContain('does not support');
        expect(content[0].text).toContain('stateless');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle invalid URLs gracefully',
      async () => {
        const result = await client.callTool({
          name: 'capture_screenshot',
          arguments: {
            url: 'not-a-valid-url',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');
        expect(content[0].text).toContain('Error');
        expect(content[0].text?.toLowerCase()).toContain('invalid');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle non-existent domains',
      async () => {
        const result = await client.callTool({
          name: 'capture_screenshot',
          arguments: {
            url: 'https://this-domain-definitely-does-not-exist-123456789.com',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');
        expect(content[0].text).toContain('Error');
      },
      TEST_TIMEOUTS.short,
    );
  });
});

```

--------------------------------------------------------------------------------
/src/__tests__/integration/test-utils.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';
import dotenv from 'dotenv';

// Load environment variables
dotenv.config();

export interface IntegrationTestConfig {
  baseUrl: string;
  apiKey: string;
  llmProvider?: string;
  llmApiToken?: string;
  llmBaseUrl?: string;
}

export function getTestConfig(): IntegrationTestConfig {
  const config: IntegrationTestConfig = {
    baseUrl: process.env.CRAWL4AI_BASE_URL || '',
    apiKey: process.env.CRAWL4AI_API_KEY || '',
    llmProvider: process.env.LLM_PROVIDER,
    llmApiToken: process.env.LLM_API_TOKEN,
    llmBaseUrl: process.env.LLM_BASE_URL,
  };

  if (!config.baseUrl) {
    throw new Error(
      'CRAWL4AI_BASE_URL is required for integration tests. Please set it in .env file or environment variable.',
    );
  }

  return config;
}

export function hasLLMConfig(): boolean {
  const config = getTestConfig();
  return !!(config.llmProvider && config.llmApiToken);
}

export async function createTestClient(): Promise<Client> {
  const transport = new StdioClientTransport({
    command: 'tsx',
    args: ['src/index.ts'],
    env: {
      ...process.env,
      NODE_ENV: 'test',
    },
    cwd: process.cwd(), // Ensure the child process runs in the correct directory
  });

  const client = new Client(
    {
      name: 'integration-test-client',
      version: '1.0.0',
    },
    {
      capabilities: {},
    },
  );

  await client.connect(transport);
  return client;
}

export async function cleanupTestClient(client: Client): Promise<void> {
  await client.close();
}

// Test data generators
export function generateSessionId(): string {
  return `test-session-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`;
}

export function generateTestUrl(type: 'simple' | 'dynamic' | 'infinite-scroll' | 'auth' = 'simple'): string {
  const urls = {
    simple: 'https://example.com',
    dynamic: 'https://github.com',
    'infinite-scroll': 'https://twitter.com',
    auth: 'https://github.com/login',
  };
  return urls[type];
}

// Test result types
export interface TestContentItem {
  type: string;
  text?: string;
  data?: string;
  mimeType?: string;
}

export interface TestResult {
  content: TestContentItem[];
}

export interface ToolResult {
  content: TestContentItem[];
  isError?: boolean;
}

// Assertion helpers
export async function expectSuccessfulCrawl(result: unknown): Promise<void> {
  expect(result).toBeDefined();

  // Type guard to check if result has content property
  const typedResult = result as { content?: unknown };
  expect(typedResult.content).toBeDefined();
  expect(typedResult.content).toBeInstanceOf(Array);

  const contentArray = typedResult.content as TestContentItem[];
  expect(contentArray.length).toBeGreaterThan(0);

  const textContent = contentArray.find((c) => c.type === 'text');
  expect(textContent).toBeDefined();
  expect(textContent?.text).toBeTruthy();
}

export async function expectScreenshot(result: unknown): Promise<void> {
  const typedResult = result as { content?: TestContentItem[] };
  expect(typedResult.content).toBeDefined();

  const imageContent = typedResult.content?.find((c) => c.type === 'image');
  expect(imageContent).toBeDefined();
  expect(imageContent?.data).toBeTruthy();
  expect(imageContent?.mimeType).toBe('image/png');
}

export async function expectExtractedData(result: unknown, expectedKeys: string[]): Promise<void> {
  const typedResult = result as { content?: TestContentItem[] };
  expect(typedResult.content).toBeDefined();

  const textContent = typedResult.content?.find((c) => c.type === 'text');
  expect(textContent).toBeDefined();

  // Check if extracted data contains expected keys
  for (const key of expectedKeys) {
    expect(textContent?.text).toContain(key);
  }
}

// Delay helper for tests
export function delay(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

// Rate limiter for integration tests
let lastRequestTime = 0;
export async function rateLimit(minDelayMs: number = 500): Promise<void> {
  const now = Date.now();
  const timeSinceLastRequest = now - lastRequestTime;

  if (timeSinceLastRequest < minDelayMs) {
    await delay(minDelayMs - timeSinceLastRequest);
  }

  lastRequestTime = Date.now();
}
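
// Example usage in a test (illustrative):
//   await rateLimit(1000); // keep at least 1s between requests to the server
//   const result = await client.callTool({ name: 'get_markdown', arguments: { url } });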

// Skip test if condition is not met
export function skipIf(condition: boolean, message: string) {
  if (condition) {
    console.log(`⚠️  Skipping test: ${message}`);
    return true;
  }
  return false;
}
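
// Example usage (illustrative): bail out of a test when LLM vars are absent.
//   if (skipIf(!hasLLMConfig(), 'LLM provider not configured')) return;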

// Test timeout helper
export const TEST_TIMEOUTS = {
  short: 30000, // 30 seconds
  medium: 60000, // 1 minute
  long: 120000, // 2 minutes
  extraLong: 180000, // 3 minutes
};

```

--------------------------------------------------------------------------------
/.github/copilot-instructions.md:
--------------------------------------------------------------------------------

```markdown
# Copilot Instructions: `mcp-crawl4ai-ts`

Concise, project-specific guidance for AI coding agents. Optimize for correctness, safety, and existing test expectations.

## Architecture & Flow
- Entrypoint `src/index.ts`: loads dotenv only if `CRAWL4AI_BASE_URL` unset; fails fast if missing. Passes env + version into `Crawl4AIServer`.
- `src/server.ts`: registers MCP tools, keeps a `Map<string, SessionInfo>` for persistent browser sessions, and uses `validateAndExecute` (Zod parse + invariant error message format; sketched after this list). Do NOT alter error text pattern: `Invalid parameters for <tool>: ...` (tests & LLM reliability depend on it).
- Service layer `src/crawl4ai-service.ts`: pure HTTP wrapper around Crawl4AI endpoints; centralizes axios timeout & error translation (preserve wording like `Request timed out`, `Request failed with status <code>:` — tests rely on these substrings).
- Handlers (`src/handlers/*.ts`): orchestration & response shaping (text content arrays). No direct business logic inside server class beyond wiring.
- Validation schemas (`src/schemas/validation-schemas.ts` + helpers): all tool inputs defined here. Use `createStatelessSchema` for stateless tools; session/persistent tools have discriminated unions.
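
A hedged sketch of the `validateAndExecute` shape (structure assumed; the exact join separator lives in `src/server.ts`, verify before relying on it):

    try {
      const parsed = schema.parse(args);
      return await handler(parsed);
    } catch (error) {
      if (error instanceof z.ZodError) {
        const detail = error.errors.map((e) => `${e.path.join('.')}: ${e.message}`).join(', ');
        throw new Error(`Invalid parameters for ${toolName}: ${detail}`);
      }
      throw error;
    }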

## Tool Model
- Stateless tools (e.g. `get_markdown`, `capture_screenshot`, `execute_js`) spin up a fresh browser each call.
- Session-based operations use `manage_session` (create/list/clear) + `crawl` for persistent state, allowing chained JS + screenshot/pdf in ONE call (example after this list). Never try to chain separate stateless calls to reflect JS mutations.
- Output always returned as base64/text blocks; do not add file system side-effects unless explicitly using a save path param already supported (screenshots: optional local save dir).
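
A single persistent-session call chains everything at once (argument names assumed from the README; verify against `validation-schemas.ts`):

    crawl({
      url: 'https://example.com/app',
      session_id: 'my-session',
      js_code: ['document.querySelector("#load-more").click()'],
      screenshot: true,
    });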

## JS & Input Validation Nuances
- JS code schema rejects: HTML entities (&quot;), literal `\n` tokens outside strings, embedded HTML tags. Reuse `JsCodeSchema`—do not duplicate logic.
- For `get_markdown`: if filter is `bm25` or `llm`, `query` becomes required (enforced via `.refine`). Keep this logic centralized.
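
A minimal sketch of that refine (shape assumed; `GetMarkdownBase` is a placeholder for the real object schema):

    const GetMarkdownSchema = GetMarkdownBase.refine(
      (v) => !(v.filter === 'bm25' || v.filter === 'llm') || Boolean(v.query),
      { message: 'query is required when filter is bm25 or llm', path: ['query'] },
    );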

## Sessions
- `SessionInfo` tracks `created_at` & `last_used`. Update `last_used` whenever a session-based action runs. Don't leak sessions: `clear` must delete map entry.

## Error Handling Pattern
- Handlers wrap service calls; on failure use `this.formatError(error, '<operation>')` (see `BaseHandler`; pattern sketched after this list). Preserve format: `Failed to <operation>: <detail>`.
- Zod validation errors: keep exact join pattern of `path: message` segments.
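
Handler-side shape, mirroring `ContentHandlers.getMarkdown` elsewhere in this repo (`formatted` stands in for the assembled response text):

    try {
      const result = await this.service.getMarkdown(options);
      return { content: [{ type: 'text', text: formatted }] };
    } catch (error) {
      throw this.formatError(error, 'get markdown'); // -> "Failed to get markdown: <detail>"
    }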

## Adding / Modifying a Tool (Checklist)
1. Define or extend schema in `validation-schemas.ts` (prefer composing existing small schemas; wrap with `createStatelessSchema` if ephemeral).
2. Add service method if it maps to a new Crawl4AI endpoint (pure HTTP + validation of URL / JS content; reuse existing validators).
3. Implement handler method (assemble request body, post-process response to `content: [{ type: 'text', text }]`).
4. Register in `setupHandlers()` list (tool description should mirror README style & clarify stateless vs session).
5. Write tests: unit (schema + handler success/failure), integration (happy path with mocked or real endpoint). Place under matching folder in `src/__tests__/`.
6. Update README tool table if user-facing, and CHANGELOG + version bump.

## Commands & Workflows
- Install: `npm install`
- Build: `npm run build` (tsconfig.build.json)
- Dev (watch): `npm run dev`
- Tests: `npm run test` | unit only: `npm run test:unit` | integration: `npm run test:integration` | coverage: `npm run test:coverage`
- Lint/Format: `npm run lint`, `npm run lint:fix`, `npm run format:check`
- Pre-flight composite: `npm run check`

### Testing Invariants
- NEVER invoke `jest` directly for integration tests; rely on `npm run test:integration` (injects `NODE_OPTIONS=--experimental-vm-modules` + `JEST_TEST_TYPE=integration`).
- Unit tests auto-set `CRAWL4AI_BASE_URL` in `jest.setup.cjs`; integration tests require real env vars (`CRAWL4AI_BASE_URL`, optional `CRAWL4AI_API_KEY`, LLM vars) via `.env` or exported.
- To run a single integration file: `npm run test:integration -- path/to/file.test.ts`.
- Jest pinned at 29.x with `ts-jest@29`; do not upgrade one without the other.
- Symptom mapping: import syntax error or hang at first test => you bypassed the npm script.

## Conventions & Invariants
- No `any`; prefer `unknown` + narrowing.
- Keep responses minimal & textual; do not introduce new top-level fields in tool results without updating all tests.
- Timeout remains 120s in axios clients—changing requires test updates.
- Commit style: conventional commits; no emojis, AI signoffs, or verbose bodies.

## References
- README (tools & examples), CLAUDE.md (contrib rules), CHANGELOG (release notes), coverage report for quality gates.

If something is ambiguous, inspect existing handlers first and mirror the closest established pattern before inventing a new one.

```

--------------------------------------------------------------------------------
/src/__tests__/integration/extract-with-llm.integration.test.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js';

interface ToolResult {
  content: Array<{
    type: string;
    text?: string;
  }>;
}

describe('extract_with_llm Integration Tests', () => {
  let client: Client;

  beforeAll(async () => {
    client = await createTestClient();
  }, TEST_TIMEOUTS.medium);

  afterAll(async () => {
    if (client) {
      await cleanupTestClient(client);
    }
  });

  describe('LLM extraction', () => {
    it(
      'should extract information about a webpage',
      async () => {
        const result = await client.callTool({
          name: 'extract_with_llm',
          arguments: {
            url: 'https://httpbin.org/html',
            query: 'What is the main topic of this page?',
          },
        });

        expect(result).toBeTruthy();
        const typedResult = result as ToolResult;
        expect(typedResult.content).toBeDefined();
        expect(typedResult.content.length).toBeGreaterThan(0);

        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent?.text).toBeTruthy();
        // Should return a meaningful response (LLM responses are non-deterministic)
        expect(textContent?.text?.length || 0).toBeGreaterThan(10);
      },
      TEST_TIMEOUTS.long,
    );

    it(
      'should answer specific questions about content',
      async () => {
        const result = await client.callTool({
          name: 'extract_with_llm',
          arguments: {
            url: 'https://httpbin.org/json',
            query: 'What is the slideshow title?',
          },
        });

        expect(result).toBeTruthy();
        expect(result.content).toBeDefined();

        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent?.text).toBeTruthy();
        // Should provide an answer about the content
        expect(textContent?.text?.length || 0).toBeGreaterThan(5);
      },
      TEST_TIMEOUTS.long,
    );

    it(
      'should handle complex queries',
      async () => {
        const result = await client.callTool({
          name: 'extract_with_llm',
          arguments: {
            url: 'https://httpbin.org/html',
            query: 'List all the links found on this page',
          },
        });

        expect(result).toBeTruthy();
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent?.text).toBeTruthy();
        // Should provide a response about links (content may vary)
        expect(textContent?.text?.length || 0).toBeGreaterThan(10);
      },
      TEST_TIMEOUTS.long,
    );
  });

  describe('Error handling', () => {
    it(
      'should handle server without API key configured',
      async () => {
        // Note: This test may pass if the server has OPENAI_API_KEY configured
        // It's here to document the expected behavior
        const result = await client.callTool({
          name: 'extract_with_llm',
          arguments: {
            url: 'https://httpbin.org/status/200',
            query: 'What is on this page?',
          },
        });

        const typedResult = result as ToolResult;
        expect(typedResult.content).toBeDefined();
        expect(typedResult.content.length).toBeGreaterThan(0);

        const text = typedResult.content[0]?.text || '';
        // Either extraction succeeds (the server has an API key configured) or
        // the server returns an error naming the missing LLM provider.
        if (text.includes('Error')) {
          expect(text).toContain('LLM provider');
        } else {
          expect(text.length).toBeGreaterThan(0);
        }
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle invalid URLs',
      async () => {
        const result = await client.callTool({
          name: 'extract_with_llm',
          arguments: {
            url: 'not-a-url',
            query: 'What is this?',
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        const textContent = content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
        expect(textContent?.text).toContain('Error');
        expect(textContent?.text?.toLowerCase()).toContain('invalid');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle empty query gracefully',
      async () => {
        const result = await client.callTool({
          name: 'extract_with_llm',
          arguments: {
            url: 'https://example.com',
            query: '',
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        const textContent = content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
        expect(textContent?.text).toContain('Error');
      },
      TEST_TIMEOUTS.short,
    );
  });
});

```

--------------------------------------------------------------------------------
/src/__tests__/integration/extract-links.integration.test.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js';

interface ToolResult {
  content: Array<{
    type: string;
    text?: string;
  }>;
}

describe('extract_links Integration Tests', () => {
  let client: Client;

  beforeAll(async () => {
    client = await createTestClient();
  }, TEST_TIMEOUTS.medium);

  afterAll(async () => {
    if (client) {
      await cleanupTestClient(client);
    }
  });

  describe('Basic functionality', () => {
    it(
      'should extract links with categorization (default)',
      async () => {
        const result = await client.callTool({
          name: 'extract_links',
          arguments: {
            url: 'https://webscraper.io/test-sites',
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content).toBeDefined();
        expect(Array.isArray(content)).toBe(true);
        expect(content.length).toBeGreaterThan(0);

        const textContent = content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
        expect(textContent?.text).toContain('Link analysis for https://webscraper.io/test-sites');
        // Should show categorized output
        expect(textContent?.text).toMatch(/internal \(\d+\)/);
        expect(textContent?.text).toMatch(/external \(\d+\)/);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should extract links without categorization',
      async () => {
        const result = await client.callTool({
          name: 'extract_links',
          arguments: {
            url: 'https://webscraper.io/test-sites',
            categorize: false,
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content).toBeDefined();
        expect(Array.isArray(content)).toBe(true);
        expect(content.length).toBeGreaterThan(0);

        const textContent = content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
        expect(textContent?.text).toContain('All links from https://webscraper.io/test-sites');
        // Should NOT show categorized output
        expect(textContent?.text).not.toMatch(/internal \(\d+\)/);
        expect(textContent?.text).not.toMatch(/external \(\d+\)/);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle sites with no links',
      async () => {
        // Test with a simple status page
        const result = await client.callTool({
          name: 'extract_links',
          arguments: {
            url: 'https://httpstat.us/200',
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content).toBeDefined();
        const textContent = content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should detect JSON endpoints',
      async () => {
        const result = await client.callTool({
          name: 'extract_links',
          arguments: {
            url: 'https://httpbin.org/json',
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content).toBeDefined();
        const textContent = content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
        // Should show link analysis (even if empty)
        expect(textContent?.text).toContain('Link analysis for https://httpbin.org/json');
      },
      TEST_TIMEOUTS.medium,
    );
  });

  describe('Error handling', () => {
    it(
      'should handle invalid URLs',
      async () => {
        const result = await client.callTool({
          name: 'extract_links',
          arguments: {
            url: 'not-a-url',
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content).toBeDefined();
        const textContent = content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
        expect(textContent?.text).toContain('Error');
        expect(textContent?.text?.toLowerCase()).toContain('invalid');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle non-existent domains',
      async () => {
        const result = await client.callTool({
          name: 'extract_links',
          arguments: {
            url: 'https://this-domain-definitely-does-not-exist-12345.com',
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content).toBeDefined();
        const textContent = content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
        expect(textContent?.text).toContain('Error');
        // Could be various error messages: connection error, DNS error, etc.
        expect(textContent?.text?.toLowerCase()).toMatch(/error|failed/);
      },
      TEST_TIMEOUTS.medium,
    );
  });
});

```

--------------------------------------------------------------------------------
/src/__tests__/integration/smart-crawl.integration.test.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js';

interface ToolResult {
  content: Array<{
    type: string;
    text?: string;
  }>;
}

describe('smart_crawl Integration Tests', () => {
  let client: Client;

  beforeAll(async () => {
    client = await createTestClient();
  }, TEST_TIMEOUTS.medium);

  afterAll(async () => {
    if (client) {
      await cleanupTestClient(client);
    }
  });

  describe('Smart crawling', () => {
    it(
      'should auto-detect HTML content',
      async () => {
        const result = await client.callTool({
          name: 'smart_crawl',
          arguments: {
            url: 'https://httpbin.org/html',
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content.length).toBeGreaterThanOrEqual(1);
        expect(content[0].type).toBe('text');

        const text = content[0].text || '';
        expect(text).toContain('Smart crawl detected content type:');
        expect(text).toContain('html');
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle sitemap URLs',
      async () => {
        const result = await client.callTool({
          name: 'smart_crawl',
          arguments: {
            url: 'https://httpbingo.org/xml',
            max_depth: 1,
          },
        });

        const content = (result as ToolResult).content;
        expect(content.length).toBeGreaterThanOrEqual(1);
        expect(content[0].type).toBe('text');

        const text = content[0].text || '';
        expect(text).toContain('Smart crawl detected content type:');
        expect(text.toLowerCase()).toMatch(/xml|sitemap/);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle follow_links parameter',
      async () => {
        const result = await client.callTool({
          name: 'smart_crawl',
          arguments: {
            url: 'https://httpbingo.org/xml',
            follow_links: true,
            max_depth: 1,
          },
        });

        const content = (result as ToolResult).content;
        expect(content.length).toBeGreaterThanOrEqual(1);
        expect(content[0].type).toBe('text');

        const text = content[0].text || '';
        expect(text).toContain('Smart crawl detected content type:');
      },
      TEST_TIMEOUTS.long,
    );

    it(
      'should detect JSON content',
      async () => {
        const result = await client.callTool({
          name: 'smart_crawl',
          arguments: {
            url: 'https://httpbin.org/json',
          },
        });

        const content = (result as ToolResult).content;
        expect(content.length).toBeGreaterThanOrEqual(1);
        expect(content[0].type).toBe('text');

        const text = content[0].text || '';
        expect(text).toContain('Smart crawl detected content type:');
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should bypass cache when requested',
      async () => {
        const result = await client.callTool({
          name: 'smart_crawl',
          arguments: {
            url: 'https://httpbin.org/html',
            bypass_cache: true,
          },
        });

        const content = (result as ToolResult).content;
        expect(content.length).toBeGreaterThanOrEqual(1);
        expect(content[0].type).toBe('text');

        const text = content[0].text || '';
        expect(text).toContain('Smart crawl detected content type:');
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle invalid URLs gracefully',
      async () => {
        const result = await client.callTool({
          name: 'smart_crawl',
          arguments: {
            url: 'not-a-valid-url',
          },
        });

        const content = (result as ToolResult).content;
        expect(content.length).toBeGreaterThanOrEqual(1);
        expect(content[0].text).toContain('Error');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle non-existent domains',
      async () => {
        const result = await client.callTool({
          name: 'smart_crawl',
          arguments: {
            url: 'https://this-domain-definitely-does-not-exist-123456789.com',
          },
        });

        const content = (result as ToolResult).content;
        expect(content.length).toBeGreaterThanOrEqual(1);
        expect(content[0].type).toBe('text');

        const text = content[0].text || '';
        // Non-existent domains cause 500 errors
        expect(text).toContain('Error');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should reject session_id parameter',
      async () => {
        const result = await client.callTool({
          name: 'smart_crawl',
          arguments: {
            url: 'https://httpbin.org/html',
            session_id: 'test-session',
          },
        });

        const content = (result as ToolResult).content;
        expect(content.length).toBeGreaterThanOrEqual(1);
        expect(content[0].type).toBe('text');
        expect(content[0].text).toContain('session_id');
        expect(content[0].text).toContain('does not support');
        expect(content[0].text).toContain('stateless');
      },
      TEST_TIMEOUTS.short,
    );
  });
});

```

--------------------------------------------------------------------------------
/src/handlers/session-handlers.ts:
--------------------------------------------------------------------------------

```typescript
import { BaseHandler } from './base-handler.js';

export class SessionHandlers extends BaseHandler {
  async manageSession(options: {
    action: 'create' | 'clear' | 'list';
    session_id?: string;
    initial_url?: string;
    browser_type?: string;
  }) {
    switch (options.action) {
      case 'create':
        return this.createSession({
          session_id: options.session_id,
          initial_url: options.initial_url,
          browser_type: options.browser_type,
        });
      case 'clear':
        if (!options.session_id) {
          throw new Error('session_id is required for clear action');
        }
        return this.clearSession({ session_id: options.session_id });
      case 'list':
        return this.listSessions();
      default:
        // This should never happen due to TypeScript types, but handle it for runtime safety
        throw new Error(`Invalid action: ${(options as { action: string }).action}`);
    }
  }

  private async createSession(options: { session_id?: string; initial_url?: string; browser_type?: string }) {
    try {
      // Generate session ID if not provided
      const sessionId = options.session_id || `session-${Date.now()}-${Math.random().toString(36).substring(2, 11)}`;

      // Store session info locally
      this.sessions.set(sessionId, {
        id: sessionId,
        created_at: new Date(),
        last_used: new Date(),
        initial_url: options.initial_url,
        metadata: {
          browser_type: options.browser_type || 'chromium',
        },
      });

      // If initial_url provided, make first crawl to establish session
      if (options.initial_url) {
        try {
          await this.axiosClient.post(
            '/crawl',
            {
              urls: [options.initial_url],
              browser_config: {
                headless: true,
                browser_type: options.browser_type || 'chromium',
              },
              crawler_config: {
                session_id: sessionId,
                cache_mode: 'BYPASS',
              },
            },
            {
              timeout: 30000, // 30 second timeout for initial crawl
            },
          );

          // Update last_used
          const session = this.sessions.get(sessionId);
          if (session) {
            session.last_used = new Date();
          }
        } catch (error) {
          // Session created but initial crawl failed - still return success
          console.error(`Initial crawl failed for session ${sessionId}:`, error);
        }
      }

      return {
        content: [
          {
            type: 'text',
            text: `Session created successfully:\nSession ID: ${sessionId}\nBrowser: ${options.browser_type || 'chromium'}\n${options.initial_url ? `Pre-warmed with: ${options.initial_url}` : 'Ready for use'}\n\nUse this session_id with the crawl tool to maintain state across requests.`,
          },
        ],
        // Include all session parameters for easier programmatic access
        session_id: sessionId,
        browser_type: options.browser_type || 'chromium',
        initial_url: options.initial_url,
        created_at: this.sessions.get(sessionId)?.created_at.toISOString(),
      };
    } catch (error) {
      throw this.formatError(error, 'create session');
    }
  }

  private async clearSession(options: { session_id: string }) {
    try {
      // Remove from local store
      const deleted = this.sessions.delete(options.session_id);

      // Note: The actual browser session in Crawl4AI will be cleaned up
      // automatically after inactivity or when the server restarts

      return {
        content: [
          {
            type: 'text',
            text: deleted
              ? `Session cleared successfully: ${options.session_id}`
              : `Session not found: ${options.session_id}`,
          },
        ],
      };
    } catch (error) {
      throw this.formatError(error, 'clear session');
    }
  }

  private async listSessions() {
    try {
      // Return locally stored sessions
      const sessions = Array.from(this.sessions.entries()).map(([id, info]) => {
        const ageMinutes = Math.floor((Date.now() - info.created_at.getTime()) / 60000);
        const lastUsedMinutes = Math.floor((Date.now() - info.last_used.getTime()) / 60000);

        return {
          session_id: id,
          created_at: info.created_at.toISOString(),
          last_used: info.last_used.toISOString(),
          age_minutes: ageMinutes,
          last_used_minutes_ago: lastUsedMinutes,
          initial_url: info.initial_url,
          browser_type: info.metadata?.browser_type || 'chromium',
        };
      });

      if (sessions.length === 0) {
        return {
          content: [
            {
              type: 'text',
              text: 'No active sessions found.',
            },
          ],
        };
      }

      const sessionList = sessions
        .map(
          (session) =>
            `- ${session.session_id} (${session.browser_type}, created ${session.age_minutes}m ago, last used ${session.last_used_minutes_ago}m ago)`,
        )
        .join('\n');

      return {
        content: [
          {
            type: 'text',
            text: `Active sessions (${sessions.length}):\n${sessionList}`,
          },
        ],
      };
    } catch (error) {
      throw this.formatError(error, 'list sessions');
    }
  }
}
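
// Illustrative usage (not part of the module): construct with the shared service,
// axios client, and session map (see BaseHandler), then drive via manage_session actions.
//   const handlers = new SessionHandlers(service, axiosClient, sessions);
//   await handlers.manageSession({ action: 'create', initial_url: 'https://example.com' });
//   await handlers.manageSession({ action: 'list' });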

```

--------------------------------------------------------------------------------
/src/__tests__/utils/javascript-validation.test.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import { describe, it, expect } from '@jest/globals';
import { validateJavaScriptCode } from '../../schemas/helpers.js';

describe('JavaScript Code Validation', () => {
  describe('Valid JavaScript', () => {
    it('should accept simple JavaScript code', () => {
      expect(validateJavaScriptCode('console.log("Hello world")')).toBe(true);
      expect(validateJavaScriptCode('return document.title')).toBe(true);
      expect(validateJavaScriptCode('const x = 5; return x * 2;')).toBe(true);
    });

    it('should accept JavaScript with real newlines', () => {
      expect(validateJavaScriptCode('console.log("Hello");\nconsole.log("World");')).toBe(true);
      expect(validateJavaScriptCode('function test() {\n  return true;\n}')).toBe(true);
    });

    it('should accept JavaScript with escape sequences in strings', () => {
      expect(validateJavaScriptCode('console.log("Line 1\\nLine 2")')).toBe(true);
      expect(validateJavaScriptCode('const msg = "Tab\\there\\tand\\tthere"')).toBe(true);
      expect(validateJavaScriptCode('return "Quote: \\"Hello\\""')).toBe(true);
    });

    it('should accept complex JavaScript patterns', () => {
      const complexCode = `
        const elements = document.querySelectorAll('.item');
        elements.forEach((el, i) => {
          el.textContent = \`Item \${i + 1}\`;
        });
        return elements.length;
      `;
      expect(validateJavaScriptCode(complexCode)).toBe(true);
    });

    it('should accept JavaScript with regex patterns', () => {
      expect(validateJavaScriptCode('return /test\\d+/.test(str)')).toBe(true);
      expect(validateJavaScriptCode('const pattern = /\\w+@\\w+\\.\\w+/')).toBe(true);
    });
  });

  describe('Invalid JavaScript - HTML Entities', () => {
    it('should reject code with HTML entities', () => {
      expect(validateJavaScriptCode('console.log(&quot;Hello&quot;)')).toBe(false);
      expect(validateJavaScriptCode('const x = &amp;&amp; true')).toBe(false);
      expect(validateJavaScriptCode('if (x &lt; 5) return')).toBe(false);
      expect(validateJavaScriptCode('if (x &gt; 5) return')).toBe(false);
    });

    it('should reject code with numeric HTML entities', () => {
      expect(validateJavaScriptCode('const char = &#65;')).toBe(false);
      // Note: hex entities like &#x41; are not caught by the current regex
    });

    it('should reject code with named HTML entities', () => {
      expect(validateJavaScriptCode('const copy = &copy;')).toBe(false);
      expect(validateJavaScriptCode('const nbsp = &nbsp;')).toBe(false);
    });
  });

  describe('Invalid JavaScript - HTML Tags', () => {
    it('should reject HTML markup', () => {
      expect(validateJavaScriptCode('<!DOCTYPE html>')).toBe(false);
      expect(validateJavaScriptCode('<html><body>test</body></html>')).toBe(false);
      expect(validateJavaScriptCode('<script>alert("test")</script>')).toBe(false);
      expect(validateJavaScriptCode('<style>body { color: red; }</style>')).toBe(false);
    });

    it('should reject mixed HTML and JavaScript', () => {
      expect(validateJavaScriptCode('<head>\nconst x = 5;\n</head>')).toBe(false);
      expect(validateJavaScriptCode('console.log("test");\n<body>')).toBe(false);
    });
  });

  describe('Invalid JavaScript - Literal Escape Sequences', () => {
    it('should reject literal \\n outside of strings', () => {
      expect(validateJavaScriptCode('console.log("Hello");\\nconsole.log("World");')).toBe(false);
      expect(validateJavaScriptCode('const x = 5;\\nreturn x;')).toBe(false);
      expect(validateJavaScriptCode('if (true) {\\n  return;\\n}')).toBe(false);
    });

    it('should reject literal \\n in various positions', () => {
      expect(validateJavaScriptCode('}\\nfunction')).toBe(false);
      expect(validateJavaScriptCode(');\\nconst')).toBe(false);
      expect(validateJavaScriptCode('\\n{')).toBe(false);
      expect(validateJavaScriptCode('\\n(')).toBe(false);
    });

    it('should reject literal \\n between statements', () => {
      expect(validateJavaScriptCode('const x = 5;\\nconst y = 10;')).toBe(false);
      expect(validateJavaScriptCode('doSomething();\\ndoAnother();')).toBe(false);
    });
  });

  describe('Edge Cases', () => {
    it('should handle empty strings', () => {
      expect(validateJavaScriptCode('')).toBe(true);
    });

    it('should handle whitespace-only strings', () => {
      expect(validateJavaScriptCode('   ')).toBe(true);
      expect(validateJavaScriptCode('\n\n\n')).toBe(true);
      expect(validateJavaScriptCode('\t\t')).toBe(true);
    });

    it('should handle single-line comments', () => {
      expect(validateJavaScriptCode('// This is a comment')).toBe(true);
      expect(validateJavaScriptCode('return 5; // Comment here')).toBe(true);
    });

    it('should handle multi-line comments', () => {
      expect(validateJavaScriptCode('/* Multi\nline\ncomment */')).toBe(true);
      expect(validateJavaScriptCode('/* Comment */ return 5;')).toBe(true);
    });

    it('should reject structural HTML tags even inside strings', () => {
      // By design, the validator blocks structural tags (<html>, <head>, <body>,
      // <script>, <style>, <!DOCTYPE>) wherever they appear, even inside string
      // literals, while generic tags such as <div> are allowed.
      expect(validateJavaScriptCode('const html = "<div>Hello</div>"')).toBe(true); // <div> is allowed
      expect(validateJavaScriptCode("return '<style>body{}</style>'")).toBe(false); // <style> is rejected
    });
  });
});

```

--------------------------------------------------------------------------------
/src/__tests__/handlers/utility-handlers.test.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import { jest } from '@jest/globals';
import type { UtilityHandlers } from '../../handlers/utility-handlers.js';
import type { Crawl4AIService } from '../../crawl4ai-service.js';

// Mock the service
const mockCrawl = jest.fn();
const mockService = {
  crawl: mockCrawl,
} as unknown as Crawl4AIService;

// Mock axios client
const mockPost = jest.fn();
const mockAxiosClient = {
  post: mockPost,
} as unknown;

// Import after setting up mocks
const { UtilityHandlers: UtilityHandlersClass } = await import('../../handlers/utility-handlers.js');

describe('UtilityHandlers', () => {
  let handler: UtilityHandlers;
  let sessions: Map<string, unknown>;

  beforeEach(() => {
    jest.clearAllMocks();
    sessions = new Map();
    handler = new UtilityHandlersClass(mockService, mockAxiosClient, sessions);
  });

  describe('extractLinks', () => {
    it('should manually extract links from markdown when API returns empty links', async () => {
      // Mock crawl response with empty links but markdown containing href attributes
      mockPost.mockResolvedValue({
        data: {
          results: [
            {
              success: true,
              links: {
                internal: [],
                external: [],
              },
              markdown: {
                raw_markdown: `
            # Test Page
            
            Here are some links:
            <a href="https://example.com/page1">Internal Link</a>
            <a href="https://external.com/page">External Link</a>
            <a href="/relative/path">Relative Link</a>
            <a href='https://example.com/page2'>Another Internal</a>
          `,
              },
            },
          ],
        },
      });

      const result = await handler.extractLinks({
        url: 'https://example.com',
        categorize: true,
      });

      // Should have manually extracted and categorized links
      expect(result.content[0].type).toBe('text');
      expect(result.content[0].text).toContain('Link analysis for https://example.com');
      expect(result.content[0].text).toContain('internal (3)');
      expect(result.content[0].text).toContain('https://example.com/page1');
      expect(result.content[0].text).toContain('https://example.com/page2');
      expect(result.content[0].text).toContain('https://example.com/relative/path');
      expect(result.content[0].text).toContain('external (1)');
      expect(result.content[0].text).toContain('https://external.com/page');
    });

    it('should handle manual extraction without categorization', async () => {
      // Mock crawl response with empty links
      mockPost.mockResolvedValue({
        data: {
          results: [
            {
              success: true,
              links: {
                internal: [],
                external: [],
              },
              markdown: {
                raw_markdown: `<a href="https://example.com/page1">Link 1</a>
                         <a href="https://external.com/page">Link 2</a>`,
              },
            },
          ],
        },
      });

      const result = await handler.extractLinks({
        url: 'https://example.com',
        categorize: false,
      });

      // Should show all links without categorization
      expect(result.content[0].text).toContain('All links from https://example.com');
      expect(result.content[0].text).toContain('https://example.com/page1');
      expect(result.content[0].text).toContain('https://external.com/page');
      expect(result.content[0].text).not.toContain('Internal links:');
    });

    it('should handle malformed URLs during manual extraction', async () => {
      // Mock crawl response with a malformed URL in href
      mockPost.mockResolvedValue({
        data: {
          results: [
            {
              success: true,
              links: {
                internal: [],
                external: [],
              },
              markdown: {
                raw_markdown: `<a href="javascript:void(0)">JS Link</a>
                         <a href="https://example.com/valid">Valid Link</a>
                         <a href="not-a-url">Invalid URL</a>`,
              },
            },
          ],
        },
      });

      const result = await handler.extractLinks({
        url: 'https://example.com',
        categorize: true,
      });

      // Should handle invalid URLs gracefully
      expect(result.content[0].text).toContain('https://example.com/valid');
      // Invalid URLs should be treated as relative links
      expect(result.content[0].text).toContain('not-a-url');
      expect(result.content[0].text).toContain('javascript:void(0)');
    });

    it('should return empty results when no links found', async () => {
      // Mock crawl response with no links
      mockPost.mockResolvedValue({
        data: {
          results: [
            {
              success: true,
              links: {
                internal: [],
                external: [],
              },
              markdown: {
                raw_markdown: 'Just plain text without any links',
              },
            },
          ],
        },
      });

      const result = await handler.extractLinks({
        url: 'https://example.com',
        categorize: true,
      });

      // Should show empty categories
      expect(result.content[0].text).toContain('Link analysis for https://example.com');
      expect(result.content[0].text).toContain('internal (0)');
      expect(result.content[0].text).toContain('external (0)');
    });
  });
});

```

--------------------------------------------------------------------------------
/src/__tests__/index.cli.test.ts:
--------------------------------------------------------------------------------

```typescript
// import { jest } from '@jest/globals';
import { spawn } from 'child_process';
import * as path from 'path';
import * as url from 'url';

const __dirname = url.fileURLToPath(new URL('.', import.meta.url));

describe('CLI Entry Point', () => {
  const cliPath = path.join(__dirname, '..', '..', 'src', 'index.ts');

  // Helper to run CLI with given env vars
  const runCLI = (
    env: Record<string, string> = {},
  ): Promise<{ code: number | null; stdout: string; stderr: string }> => {
    return new Promise((resolve) => {
      const child = spawn('tsx', [cliPath], {
        env: { ...process.env, ...env },
        stdio: 'pipe',
      });

      let stdout = '';
      let stderr = '';

      child.stdout.on('data', (data) => {
        stdout += data.toString();
      });

      child.stderr.on('data', (data) => {
        stderr += data.toString();
      });

      child.on('close', (code) => {
        resolve({ code, stdout, stderr });
      });

      // Kill after 2 seconds to prevent hanging
      setTimeout(() => {
        child.kill();
      }, 2000);
    });
  };

  describe('Environment Variable Validation', () => {
    it('should exit with code 1 when CRAWL4AI_BASE_URL is missing', async () => {
      const { code, stderr } = await runCLI({
        CRAWL4AI_BASE_URL: '',
      });

      expect(code).toBe(1);
      expect(stderr).toContain('Error: CRAWL4AI_BASE_URL environment variable is required');
      expect(stderr).toContain('Please set it to your Crawl4AI server URL');
    });

    it('should start successfully with valid CRAWL4AI_BASE_URL', async () => {
      const { code, stderr } = await runCLI({
        CRAWL4AI_BASE_URL: 'http://localhost:11235',
        CRAWL4AI_API_KEY: 'test-key',
      });

      // Process should be killed by timeout, not exit with error
      expect(code).not.toBe(1);
      // MCP servers output to stderr
      expect(stderr).toContain('crawl4ai-mcp');
    });

    it('should use default values for optional env vars', async () => {
      const { stderr } = await runCLI({
        CRAWL4AI_BASE_URL: 'http://localhost:11235',
        // No API_KEY, SERVER_NAME, or SERVER_VERSION
      });

      expect(stderr).toContain('crawl4ai-mcp'); // default server name
      expect(stderr).toContain('1.0.0'); // default version
    });

    it('should use custom SERVER_NAME and SERVER_VERSION when provided', async () => {
      const { stderr } = await runCLI({
        CRAWL4AI_BASE_URL: 'http://localhost:11235',
        SERVER_NAME: 'custom-server',
        SERVER_VERSION: '2.0.0',
      });

      expect(stderr).toContain('custom-server');
      expect(stderr).toContain('2.0.0');
    });
  });

  describe('Signal Handling', () => {
    it('should handle SIGTERM gracefully', async () => {
      const child = spawn('tsx', [cliPath], {
        env: {
          ...process.env,
          CRAWL4AI_BASE_URL: 'http://localhost:11235',
        },
        stdio: 'pipe',
      });

      // Wait for startup
      await new Promise((resolve) => setTimeout(resolve, 500));

      // Send SIGTERM
      child.kill('SIGTERM');

      const code = await new Promise<number | null>((resolve, reject) => {
        const timeout = setTimeout(() => {
          child.kill('SIGKILL');
          reject(new Error('Process did not exit in time'));
        }, 5000);

        child.on('close', (exitCode) => {
          clearTimeout(timeout);
          resolve(exitCode);
        });
      });

      // Should exit with signal code
      expect(code).toBe(143); // 128 + 15 (SIGTERM)

      // Ensure cleanup
      child.kill();
    }, 10000);

    it('should handle SIGINT gracefully', async () => {
      const child = spawn('tsx', [cliPath], {
        env: {
          ...process.env,
          CRAWL4AI_BASE_URL: 'http://localhost:11235',
        },
        stdio: 'pipe',
      });

      // Wait for startup
      await new Promise((resolve) => setTimeout(resolve, 500));

      // Send SIGINT (Ctrl+C)
      child.kill('SIGINT');

      const code = await new Promise<number | null>((resolve, reject) => {
        const timeout = setTimeout(() => {
          child.kill('SIGKILL');
          reject(new Error('Process did not exit in time'));
        }, 5000);

        child.on('close', (exitCode) => {
          clearTimeout(timeout);
          resolve(exitCode);
        });
      });

      // Should exit with signal code
      expect(code).toBe(130); // 128 + 2 (SIGINT)

      // Ensure cleanup
      child.kill();
    }, 10000);
  });

  describe('Error Handling', () => {
    it('should handle server startup errors', async () => {
      // This will be tricky to test without mocking, but we can at least
      // verify the process starts and attempts to connect
      const { code, stdout, stderr } = await runCLI({
        CRAWL4AI_BASE_URL: 'http://invalid-host-that-does-not-exist:99999',
      });

      // Should not exit with code 1 (that's for missing env vars)
      expect(code).not.toBe(1);
      // But might log connection errors
      const output = stdout + stderr;
      expect(output).toBeTruthy();
    });
  });

  describe('dotenv Loading', () => {
    it('should load .env file if present', async () => {
      // Create a temporary .env file
      const fs = await import('fs/promises');
      const envPath = path.join(__dirname, '..', '..', '.env.test');

      await fs.writeFile(envPath, 'TEST_ENV_VAR=loaded_from_file\n');

      try {
        const { stderr } = await runCLI({
          CRAWL4AI_BASE_URL: 'http://localhost:11235',
          NODE_ENV: 'test',
          DOTENV_CONFIG_PATH: envPath,
        });

        // Verify the server starts (dotenv loaded successfully)
        expect(stderr).toContain('crawl4ai-mcp');
      } finally {
        // Clean up
        await fs.unlink(envPath).catch(() => {});
      }
    });
  });
});

```

--------------------------------------------------------------------------------
/src/__tests__/integration/crawl-recursive.integration.test.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js';

interface ToolResult {
  content: Array<{
    type: string;
    text?: string;
  }>;
}

describe('crawl_recursive Integration Tests', () => {
  let client: Client;

  beforeAll(async () => {
    client = await createTestClient();
  }, TEST_TIMEOUTS.medium);

  afterAll(async () => {
    if (client) {
      await cleanupTestClient(client);
    }
  });

  describe('Basic functionality', () => {
    it(
      'should crawl a site recursively with default settings',
      async () => {
        const result = await client.callTool({
          name: 'crawl_recursive',
          arguments: {
            url: 'https://httpbin.org/links/5/0',
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content).toBeDefined();
        expect(Array.isArray(content)).toBe(true);
        expect(content.length).toBeGreaterThan(0);

        const textContent = content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
        expect(textContent?.text).toContain('Recursive crawl completed');
        expect(textContent?.text).toContain('Pages crawled:');
        expect(textContent?.text).toContain('Max depth reached:');
        expect(textContent?.text).toContain('Only internal links');
        // Should have found multiple pages since httpbin.org/links/5/0 has internal links
        expect(textContent?.text).toMatch(/Pages crawled: ([2-9]|[1-9][0-9])/);
      },
      TEST_TIMEOUTS.long,
    );

    it(
      'should respect max_depth parameter',
      async () => {
        const result = await client.callTool({
          name: 'crawl_recursive',
          arguments: {
            url: 'https://httpbin.org/links/10/0',
            max_depth: 1,
            max_pages: 5,
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        const textContent = content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
        expect(textContent?.text).toContain('Max depth reached: ');
        expect(textContent?.text).toMatch(/Max depth reached: [0-1] \(limit: 1\)/);
        // With max_depth=1, should find some pages but not go too deep
        expect(textContent?.text).toMatch(/Pages crawled: [1-5]/);
      },
      TEST_TIMEOUTS.long,
    );

    it(
      'should apply include pattern filter',
      async () => {
        const result = await client.callTool({
          name: 'crawl_recursive',
          arguments: {
            url: 'https://httpbin.org/links/10/0',
            max_depth: 1,
            max_pages: 5,
            include_pattern: '.*/links/[0-9]+/[0-4]$', // Only include links ending with 0-4
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        const textContent = content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();

        // Check that we have some results
        expect(textContent?.text).toContain('Pages crawled:');

        // If we crawled pages, they should match our pattern
        if (textContent?.text && textContent.text.includes('Pages found:')) {
          const pagesSection = textContent.text.split('Pages found:')[1];
          if (pagesSection && pagesSection.trim()) {
            // All URLs should end with /0, /1, /2, /3, or /4
            expect(pagesSection).toMatch(/\/[0-4]\b/);
            // Should NOT have URLs ending with /5, /6, /7, /8, /9
            expect(pagesSection).not.toMatch(/\/[5-9]\b/);
          }
        }
      },
      TEST_TIMEOUTS.long,
    );

    it(
      'should apply exclude pattern filter',
      async () => {
        const result = await client.callTool({
          name: 'crawl_recursive',
          arguments: {
            url: 'https://example.com',
            max_depth: 2,
            max_pages: 10,
            exclude_pattern: '.*\\.(pdf|zip|exe)$',
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        const textContent = content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();

        // Should not have crawled any PDF, ZIP, or EXE files
        expect(textContent?.text).not.toMatch(/\.(pdf|zip|exe)/i);
      },
      TEST_TIMEOUTS.long,
    );
  });

  describe('Error handling', () => {
    it(
      'should handle invalid URLs',
      async () => {
        const result = await client.callTool({
          name: 'crawl_recursive',
          arguments: {
            url: 'not-a-url',
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content).toBeDefined();
        const textContent = content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
        expect(textContent?.text).toContain('Error');
        expect(textContent?.text?.toLowerCase()).toContain('invalid');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle sites with internal links',
      async () => {
        const result = await client.callTool({
          name: 'crawl_recursive',
          arguments: {
            url: 'https://httpbin.org/links/5/0',
            max_depth: 2,
            max_pages: 10,
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        const textContent = content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
        expect(textContent?.text).toContain('Pages crawled:');
        // Should crawl multiple pages since httpbin.org/links/5/0 has 5 internal links
        expect(textContent?.text).toMatch(/Pages crawled: (?:[2-9]|1[0-9])/);
        expect(textContent?.text).toContain('Internal links found:');
      },
      TEST_TIMEOUTS.medium,
    );
  });
});

```
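
A note on the pattern filters exercised above: `include_pattern` and `exclude_pattern` are ordinary JavaScript regular expressions matched against each discovered URL. The standalone sketch below (not part of the repo, and assuming the handler applies the pattern with `RegExp.test` against absolute URLs) shows how the test's include pattern partitions httpbin-style link URLs:

```typescript
// Standalone sketch: how the include_pattern from the tests above partitions
// httpbin-style link URLs. Assumes the handler calls RegExp.test per URL.
const includePattern = new RegExp('.*/links/[0-9]+/[0-4]$');

const discovered = [
  'https://httpbin.org/links/10/0', // kept: ends in 0-4
  'https://httpbin.org/links/10/4', // kept
  'https://httpbin.org/links/10/7', // skipped: ends in 5-9
];

for (const url of discovered) {
  console.log(url, includePattern.test(url) ? 'crawl' : 'skip');
}
```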

--------------------------------------------------------------------------------
/src/handlers/content-handlers.ts:
--------------------------------------------------------------------------------

```typescript
import { BaseHandler } from './base-handler.js';
import {
  MarkdownEndpointOptions,
  MarkdownEndpointResponse,
  ScreenshotEndpointOptions,
  ScreenshotEndpointResponse,
  PDFEndpointOptions,
  PDFEndpointResponse,
  HTMLEndpointOptions,
  HTMLEndpointResponse,
  FilterType,
} from '../types.js';
import * as fs from 'fs/promises';
import * as path from 'path';
import * as os from 'os';

export class ContentHandlers extends BaseHandler {
  async getMarkdown(
    options: Omit<MarkdownEndpointOptions, 'f' | 'q' | 'c'> & { filter?: string; query?: string; cache?: string },
  ) {
    try {
      // Map from schema property names to API parameter names
      const result: MarkdownEndpointResponse = await this.service.getMarkdown({
        url: options.url,
        f: options.filter as FilterType | undefined, // Schema provides 'filter', API expects 'f'
        q: options.query, // Schema provides 'query', API expects 'q'
        c: options.cache, // Schema provides 'cache', API expects 'c'
      });

      // Format the response
      let formattedText = `URL: ${result.url}\nFilter: ${result.filter}`;

      if (result.query) {
        formattedText += `\nQuery: ${result.query}`;
      }

      formattedText += `\nCache: ${result.cache}\n\nMarkdown:\n${result.markdown || 'No content found.'}`;

      return {
        content: [
          {
            type: 'text',
            text: formattedText,
          },
        ],
      };
    } catch (error) {
      throw this.formatError(error, 'get markdown');
    }
  }

  async captureScreenshot(options: ScreenshotEndpointOptions) {
    try {
      const result: ScreenshotEndpointResponse = await this.service.captureScreenshot(options);

      // Response has { success: true, screenshot: "base64string" }
      if (!result.success || !result.screenshot) {
        throw new Error('Screenshot capture failed - no screenshot data in response');
      }

      let savedFilePath: string | undefined;

      // Save to local directory if requested
      if (options.save_to_directory) {
        try {
          // Resolve home directory path
          let resolvedPath = options.save_to_directory;
          if (resolvedPath.startsWith('~')) {
            const homedir = os.homedir();
            resolvedPath = path.join(homedir, resolvedPath.slice(1));
          }

          // Check if user provided a file path instead of directory
          if (resolvedPath.endsWith('.png') || resolvedPath.endsWith('.jpg')) {
            console.warn(
              `Warning: save_to_directory should be a directory path, not a file path. Using parent directory.`,
            );
            resolvedPath = path.dirname(resolvedPath);
          }

          // Ensure directory exists
          await fs.mkdir(resolvedPath, { recursive: true });

          // Generate filename from URL and timestamp
          const url = new URL(options.url);
          const hostname = url.hostname.replace(/[^a-z0-9]/gi, '-');
          const timestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, -5);
          const filename = `${hostname}-${timestamp}.png`;

          savedFilePath = path.join(resolvedPath, filename);

          // Convert base64 to buffer and save
          const buffer = Buffer.from(result.screenshot, 'base64');
          await fs.writeFile(savedFilePath, buffer);
        } catch (saveError) {
          // Log error but don't fail the operation
          console.error('Failed to save screenshot locally:', saveError);
        }
      }

      const textContent = savedFilePath
        ? `Screenshot captured for: ${options.url}\nSaved to: ${savedFilePath}`
        : `Screenshot captured for: ${options.url}`;

      // If saved locally and screenshot is large (>800KB), don't return the base64 data
      const screenshotSize = Buffer.from(result.screenshot, 'base64').length;
      const shouldReturnImage = !savedFilePath || screenshotSize < 800 * 1024; // 800KB threshold

      const content = [];

      if (shouldReturnImage) {
        content.push({
          type: 'image',
          data: result.screenshot,
          mimeType: 'image/png',
        });
      }

      content.push({
        type: 'text',
        text: shouldReturnImage
          ? textContent
          : `${textContent}\n\nNote: Screenshot data not returned due to size (${Math.round(screenshotSize / 1024)}KB). View the saved file instead.`,
      });

      return { content };
    } catch (error) {
      throw this.formatError(error, 'capture screenshot');
    }
  }

  async generatePDF(options: PDFEndpointOptions) {
    try {
      const result: PDFEndpointResponse = await this.service.generatePDF(options);

      // Response has { success: true, pdf: "base64string" }
      if (!result.success || !result.pdf) {
        throw new Error('PDF generation failed - no PDF data in response');
      }

      return {
        content: [
          {
            type: 'resource',
            resource: {
              uri: `data:application/pdf;name=${encodeURIComponent(new URL(String(options.url)).hostname)}.pdf;base64,${result.pdf}`,
              mimeType: 'application/pdf',
              blob: result.pdf,
            },
          },
          {
            type: 'text',
            text: `PDF generated for: ${options.url}`,
          },
        ],
      };
    } catch (error) {
      throw this.formatError(error, 'generate PDF');
    }
  }

  async getHTML(options: HTMLEndpointOptions) {
    try {
      const result: HTMLEndpointResponse = await this.service.getHTML(options);

      // Response has { html: string, url: string, success: true }
      return {
        content: [
          {
            type: 'text',
            text: result.html || '',
          },
        ],
      };
    } catch (error) {
      throw this.formatError(error, 'get HTML');
    }
  }

  async extractWithLLM(options: { url: string; query: string }) {
    try {
      const result = await this.service.extractWithLLM(options);

      return {
        content: [
          {
            type: 'text',
            text: result.answer,
          },
        ],
      };
    } catch (error) {
      throw this.formatError(error, 'extract with LLM');
    }
  }
}

```
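
The filename scheme in `captureScreenshot` (sanitized hostname plus a second-resolution ISO timestamp) is easy to verify in isolation. A minimal sketch mirroring the handler's logic; the helper name and example URL are ours:

```typescript
// Minimal sketch of the filename scheme used by captureScreenshot above.
function screenshotFilename(rawUrl: string, now: Date = new Date()): string {
  // Sanitize the hostname: anything outside [a-z0-9] becomes '-'
  const hostname = new URL(rawUrl).hostname.replace(/[^a-z0-9]/gi, '-');
  // "2024-01-02T03:04:05.678Z" -> "2024-01-02T03-04-05" (the slice drops "-678Z")
  const timestamp = now.toISOString().replace(/[:.]/g, '-').slice(0, -5);
  return `${hostname}-${timestamp}.png`;
}

console.log(screenshotFilename('https://example.com/page'));
// e.g. "example-com-2024-01-02T03-04-05.png"
```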

--------------------------------------------------------------------------------
/src/__tests__/integration/get-markdown.integration.test.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js';

interface ToolResult {
  content: Array<{
    type: string;
    text?: string;
  }>;
}

describe('get_markdown Integration Tests', () => {
  let client: Client;

  beforeAll(async () => {
    client = await createTestClient();
  }, TEST_TIMEOUTS.medium);

  afterAll(async () => {
    if (client) {
      await cleanupTestClient(client);
    }
  });

  describe('Markdown extraction', () => {
    it(
      'should extract markdown with default fit filter',
      async () => {
        const result = await client.callTool({
          name: 'get_markdown',
          arguments: {
            url: 'https://httpbin.org/html',
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');

        const text = content[0].text || '';
        expect(text).toContain('URL: https://httpbin.org/html');
        expect(text).toContain('Filter: fit');
        expect(text).toContain('Markdown:');
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should extract markdown with raw filter',
      async () => {
        const result = await client.callTool({
          name: 'get_markdown',
          arguments: {
            url: 'https://httpbin.org/html',
            filter: 'raw',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');

        const text = content[0].text || '';
        expect(text).toContain('Filter: raw');
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should extract markdown with bm25 filter and query',
      async () => {
        const result = await client.callTool({
          name: 'get_markdown',
          arguments: {
            url: 'https://httpbin.org/html',
            filter: 'bm25',
            query: 'Herman Melville',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');

        const text = content[0].text || '';
        expect(text).toContain('Filter: bm25');
        expect(text).toContain('Query: Herman Melville');
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should extract markdown with llm filter and query',
      async () => {
        const result = await client.callTool({
          name: 'get_markdown',
          arguments: {
            url: 'https://httpbin.org/html',
            filter: 'llm',
            query: 'What is this page about?',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');

        const text = content[0].text || '';
        expect(text).toContain('Filter: llm');
        expect(text).toContain('Query: What is this page about?');
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should use cache parameter',
      async () => {
        const result = await client.callTool({
          name: 'get_markdown',
          arguments: {
            url: 'https://httpbin.org/html',
            cache: '1',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');

        const text = content[0].text || '';
        expect(text).toContain('Cache: 1');
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should reject session_id parameter',
      async () => {
        const result = await client.callTool({
          name: 'get_markdown',
          arguments: {
            url: 'https://httpbin.org/html',
            session_id: 'test-session',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');
        expect(content[0].text).toContain('session_id');
        expect(content[0].text).toContain('does not support');
        expect(content[0].text).toContain('stateless');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle invalid URLs gracefully',
      async () => {
        const result = await client.callTool({
          name: 'get_markdown',
          arguments: {
            url: 'not-a-valid-url',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');
        expect(content[0].text).toContain('Error');
        expect(content[0].text?.toLowerCase()).toContain('invalid');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle non-existent domains',
      async () => {
        const result = await client.callTool({
          name: 'get_markdown',
          arguments: {
            url: 'https://this-domain-definitely-does-not-exist-123456789.com',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');

        // As in other error-path tests, this may return success with empty content
        const text = content[0].text || '';
        expect(typeof text).toBe('string');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should ignore extra parameters',
      async () => {
        const result = await client.callTool({
          name: 'get_markdown',
          arguments: {
            url: 'https://httpbin.org/html',
            filter: 'fit',
            // These should be ignored
            remove_images: true,
            bypass_cache: true,
            screenshot: true,
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');

        // Should still work, ignoring extra params
        const text = content[0].text || '';
        expect(text).toContain('Filter: fit');
      },
      TEST_TIMEOUTS.medium,
    );
  });
});

```
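
These tests pass `filter`, `query`, and `cache`, which `ContentHandlers.getMarkdown` (shown earlier in this dump) renames to the endpoint's single-letter parameters. A small pure function makes that mapping explicit; a sketch, with names taken from the handler:

```typescript
// Sketch of the schema -> API parameter mapping performed by getMarkdown above.
interface GetMarkdownArgs {
  url: string;
  filter?: string; // 'raw' | 'fit' | 'bm25' | 'llm' (the values used in the tests)
  query?: string; // consumed by the bm25 and llm filters
  cache?: string;
}

function toEndpointParams(args: GetMarkdownArgs) {
  // Schema provides 'filter'/'query'/'cache'; the API expects 'f'/'q'/'c'.
  return { url: args.url, f: args.filter, q: args.query, c: args.cache };
}

console.log(toEndpointParams({ url: 'https://httpbin.org/html', filter: 'bm25', query: 'Herman Melville' }));
// { url: 'https://httpbin.org/html', f: 'bm25', q: 'Herman Melville', c: undefined }
```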

--------------------------------------------------------------------------------
/src/__tests__/integration/execute-js.integration.test.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js';

interface ToolResult {
  content: Array<{
    type: string;
    text?: string;
  }>;
}

describe('execute_js Integration Tests', () => {
  let client: Client;

  beforeAll(async () => {
    client = await createTestClient();
  }, TEST_TIMEOUTS.medium);

  afterAll(async () => {
    if (client) {
      await cleanupTestClient(client);
    }
  });

  describe('JavaScript execution', () => {
    it(
      'should execute JavaScript and return results',
      async () => {
        const result = await client.callTool({
          name: 'execute_js',
          arguments: {
            url: 'https://httpbin.org/html',
            scripts: ['return document.title', 'return document.querySelectorAll("h1").length'],
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');

        // Should contain JavaScript execution results
        expect(content[0].text).toContain('JavaScript executed on: https://httpbin.org/html');
        expect(content[0].text).toContain('Results:');
        expect(content[0].text).toContain('Script: return document.title');
        expect(content[0].text).toMatch(/Returned: .*/); // The title may be empty, so only assert the label
        expect(content[0].text).toContain('Script: return document.querySelectorAll("h1").length');
        expect(content[0].text).toContain('Returned: 1'); // Should have 1 h1 element
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should execute single script as string',
      async () => {
        const result = await client.callTool({
          name: 'execute_js',
          arguments: {
            url: 'https://httpbin.org/html',
            scripts: 'return window.location.href',
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);

        expect(content[0].text).toContain('JavaScript executed on: https://httpbin.org/html');
        expect(content[0].text).toContain('Script: return window.location.href');
        expect(content[0].text).toContain('Returned: "https://httpbin.org/html');
      },
      TEST_TIMEOUTS.long, // Single-script execution can be slow; allow the long timeout
    );

    it(
      'should reject session_id parameter',
      async () => {
        const result = await client.callTool({
          name: 'execute_js',
          arguments: {
            url: 'https://httpbin.org/html',
            scripts: 'return true',
            session_id: 'test-session',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');
        expect(content[0].text).toContain('session_id');
        expect(content[0].text).toContain('does not support');
        expect(content[0].text).toContain('stateless');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should reject invalid JavaScript with HTML entities',
      async () => {
        const result = await client.callTool({
          name: 'execute_js',
          arguments: {
            url: 'https://httpbin.org/html',
            scripts: 'return &quot;test&quot;',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].text).toContain('Error');
        expect(content[0].text).toContain('Invalid JavaScript');
        expect(content[0].text).toContain('HTML entities');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should accept JavaScript with newlines in strings',
      async () => {
        const result = await client.callTool({
          name: 'execute_js',
          arguments: {
            url: 'https://httpbin.org/html',
            scripts: 'const text = "line1\\nline2"; return text',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].text).toContain('JavaScript executed on: https://httpbin.org/html');
        expect(content[0].text).toContain('Returned: "line1\\nline2"');
      },
      TEST_TIMEOUTS.medium, // Needs more than the short timeout
    );

    it(
      'should handle JavaScript execution errors',
      async () => {
        const result = await client.callTool({
          name: 'execute_js',
          arguments: {
            url: 'https://httpbin.org/html',
            scripts: [
              'return "This works"',
              'throw new Error("This is a test error")',
              'nonExistentVariable.someMethod()',
            ],
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].text).toContain('JavaScript executed on: https://httpbin.org/html');

        // First script should succeed
        expect(content[0].text).toContain('Script: return "This works"');
        expect(content[0].text).toContain('Returned: "This works"');

        // Second script should show error
        expect(content[0].text).toContain('Script: throw new Error("This is a test error")');
        expect(content[0].text).toContain('Returned: Error: Error: This is a test error');

        // Third script should show reference error
        expect(content[0].text).toContain('Script: nonExistentVariable.someMethod()');
        expect(content[0].text).toContain('Returned: Error: ReferenceError: nonExistentVariable is not defined');
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle invalid URLs gracefully',
      async () => {
        const result = await client.callTool({
          name: 'execute_js',
          arguments: {
            url: 'not-a-valid-url',
            scripts: 'return true',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].text).toContain('Error');
        expect(content[0].text?.toLowerCase()).toContain('invalid');
      },
      TEST_TIMEOUTS.short,
    );
  });
});

```
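
The HTML-entity rejection above implies a pre-flight check on scripts before they reach the browser. The real validator lives in `src/utils/` (see `javascript-validation.test.ts` in the tree) and is not shown on this page; the following is only an assumed shape inferred from the error messages the tests expect:

```typescript
// Hypothetical sketch of an HTML-entity guard for execute_js scripts,
// inferred from the test expectations above; not the repo's actual validator.
const HTML_ENTITY = /&(?:quot|amp|lt|gt|#\d+|#x[0-9a-f]+);/i;

function assertValidScript(script: string): void {
  if (HTML_ENTITY.test(script)) {
    throw new Error('Invalid JavaScript: contains HTML entities (e.g. &quot;). Unescape the code first.');
  }
}

assertValidScript('const text = "line1\\nline2"; return text'); // ok: plain JS
// assertValidScript('return &quot;test&quot;'); // would throw
```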

--------------------------------------------------------------------------------
/src/__tests__/integration/batch-crawl.integration.test.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js';

interface ToolResult {
  content: Array<{
    type: string;
    text?: string;
  }>;
}

describe('batch_crawl Integration Tests', () => {
  let client: Client;

  beforeAll(async () => {
    client = await createTestClient();
  }, TEST_TIMEOUTS.medium);

  afterAll(async () => {
    if (client) {
      await cleanupTestClient(client);
    }
  });

  describe('Batch crawling', () => {
    it(
      'should crawl multiple URLs',
      async () => {
        const result = await client.callTool({
          name: 'batch_crawl',
          arguments: {
            urls: ['https://httpbingo.org/html', 'https://httpbingo.org/json'],
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');

        const text = content[0].text || '';
        expect(text).toContain('Batch crawl completed');
        expect(text).toContain('Processed 2 URLs');
        expect(text).toContain('https://httpbingo.org/html: Success');
        expect(text).toContain('https://httpbingo.org/json: Success');
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle max_concurrent parameter',
      async () => {
        const result = await client.callTool({
          name: 'batch_crawl',
          arguments: {
            urls: ['https://httpbingo.org/html', 'https://httpbingo.org/xml', 'https://httpbingo.org/json'],
            max_concurrent: 1,
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');

        const text = content[0].text || '';
        expect(text).toContain('Processed 3 URLs');
        expect(text).toContain(': Success');
      },
      TEST_TIMEOUTS.long,
    );

    it(
      'should remove images when requested',
      async () => {
        const result = await client.callTool({
          name: 'batch_crawl',
          arguments: {
            urls: ['https://httpbingo.org/html'],
            remove_images: true,
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');

        const text = content[0].text || '';
        expect(text).toContain('Batch crawl completed');
        expect(text).toContain('https://httpbingo.org/html: Success');
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should bypass cache when requested',
      async () => {
        const result = await client.callTool({
          name: 'batch_crawl',
          arguments: {
            urls: ['https://httpbingo.org/html'],
            bypass_cache: true,
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');

        const text = content[0].text || '';
        expect(text).toContain('Batch crawl completed');
        expect(text).toContain('https://httpbingo.org/html: Success');
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle mixed content types',
      async () => {
        const result = await client.callTool({
          name: 'batch_crawl',
          arguments: {
            urls: ['https://httpbin.org/html', 'https://httpbin.org/json', 'https://httpbin.org/xml'],
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');

        const text = content[0].text || '';
        expect(text).toContain('Processed 3 URLs');
        expect(text).toContain('https://httpbin.org/html: Success');
        expect(text).toContain('https://httpbin.org/json: Success');
        expect(text).toContain('https://httpbin.org/xml: Success');
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle empty URL list',
      async () => {
        const result = await client.callTool({
          name: 'batch_crawl',
          arguments: {
            urls: [],
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].text).toContain('Error');
        // Just check that it's an error about invalid parameters
        expect(content[0].text?.toLowerCase()).toMatch(/error|invalid|failed/);
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should reject session_id parameter',
      async () => {
        const result = await client.callTool({
          name: 'batch_crawl',
          arguments: {
            urls: ['https://httpbingo.org/html'],
            session_id: 'test-session',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');
        expect(content[0].text).toContain('session_id');
        expect(content[0].text).toContain('does not support');
        expect(content[0].text).toContain('stateless');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle per-URL configs array',
      async () => {
        const result = await client.callTool({
          name: 'batch_crawl',
          arguments: {
            urls: ['https://httpbingo.org/html', 'https://httpbingo.org/json'],
            configs: [
              {
                url: 'https://httpbingo.org/html',
                browser_config: { browser_type: 'chromium' },
                crawler_config: { word_count_threshold: 10 },
              },
              {
                url: 'https://httpbingo.org/json',
                browser_config: { browser_type: 'firefox' },
                crawler_config: { word_count_threshold: 20 },
              },
            ],
            max_concurrent: 2,
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');

        const text = content[0].text || '';
        expect(text).toContain('Batch crawl completed');
        expect(text).toContain('Processed 2 URLs');
        // Both should succeed regardless of different configs
        expect(text).toContain('https://httpbingo.org/html: Success');
        expect(text).toContain('https://httpbingo.org/json: Success');
      },
      TEST_TIMEOUTS.medium,
    );
  });
});

```
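
The per-URL `configs` array in the last test pairs each URL with its own browser and crawler settings. The shape below is inferred solely from that test payload; the authoritative schema lives in `src/schemas/validation-schemas.ts`:

```typescript
// Per-URL config shape, inferred only from the batch_crawl test above.
interface BatchCrawlConfig {
  url: string;
  browser_config?: { browser_type?: string }; // 'chromium' | 'firefox' in the test
  crawler_config?: { word_count_threshold?: number };
}

const configs: BatchCrawlConfig[] = [
  {
    url: 'https://httpbingo.org/html',
    browser_config: { browser_type: 'chromium' },
    crawler_config: { word_count_threshold: 10 },
  },
];
```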

--------------------------------------------------------------------------------
/src/__tests__/integration/parse-sitemap.integration.test.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js';

interface ToolResult {
  content: Array<{
    type: string;
    text?: string;
  }>;
}

describe('parse_sitemap Integration Tests', () => {
  let client: Client;

  beforeAll(async () => {
    client = await createTestClient();
  }, TEST_TIMEOUTS.medium);

  afterAll(async () => {
    if (client) {
      await cleanupTestClient(client);
    }
  });

  describe('Basic functionality', () => {
    it(
      'should parse nodejs.org sitemap successfully',
      async () => {
        const result = await client.callTool({
          name: 'parse_sitemap',
          arguments: {
            url: 'https://nodejs.org/sitemap.xml',
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content).toBeDefined();
        expect(Array.isArray(content)).toBe(true);
        expect(content.length).toBeGreaterThan(0);

        const textContent = content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
        expect(textContent?.text).toContain('Sitemap parsed successfully');
        expect(textContent?.text).toContain('Total URLs found:');
        expect(textContent?.text).toContain('https://nodejs.org');

        // Should find many URLs in the nodejs sitemap
        expect(textContent?.text).toMatch(/Total URLs found: [1-9][0-9]+/);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should filter URLs with regex pattern',
      async () => {
        const result = await client.callTool({
          name: 'parse_sitemap',
          arguments: {
            url: 'https://nodejs.org/sitemap.xml',
            filter_pattern: '.*/learn/.*', // Only URLs containing /learn/
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        const textContent = content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();

        // Check that filtering worked
        expect(textContent?.text).toContain('Filtered URLs:');

        // All URLs in the result should contain /learn/
        const urlsSection = textContent?.text?.split('URLs:\n')[1];
        if (urlsSection) {
          const urls = urlsSection.split('\n').filter((url) => url.trim());
          urls.forEach((url) => {
            if (url && !url.includes('... and')) {
              expect(url).toContain('/learn/');
            }
          });
        }
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle empty sitemaps',
      async () => {
        // Using a URL that returns valid XML but not a sitemap
        const result = await client.callTool({
          name: 'parse_sitemap',
          arguments: {
            url: 'https://www.w3schools.com/xml/note.xml',
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        const textContent = content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
        expect(textContent?.text).toContain('Total URLs found: 0');
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle large sitemaps with truncation',
      async () => {
        const result = await client.callTool({
          name: 'parse_sitemap',
          arguments: {
            url: 'https://nodejs.org/sitemap.xml',
            filter_pattern: '.*', // Match all to test truncation
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        const textContent = content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();

        // Should show max 100 URLs and indicate there are more
        if (textContent?.text && textContent.text.includes('... and')) {
          expect(textContent.text).toMatch(/\.\.\. and \d+ more/);
        }
      },
      TEST_TIMEOUTS.medium,
    );
  });

  describe('Error handling', () => {
    it(
      'should handle invalid URLs',
      async () => {
        const result = await client.callTool({
          name: 'parse_sitemap',
          arguments: {
            url: 'not-a-url',
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content).toBeDefined();
        const textContent = content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
        expect(textContent?.text).toContain('Error');
        expect(textContent?.text?.toLowerCase()).toContain('invalid');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle non-existent URLs',
      async () => {
        const result = await client.callTool({
          name: 'parse_sitemap',
          arguments: {
            url: 'https://this-domain-definitely-does-not-exist-12345.com/sitemap.xml',
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        const textContent = content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
        expect(textContent?.text).toContain('Error');
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle non-XML content',
      async () => {
        const result = await client.callTool({
          name: 'parse_sitemap',
          arguments: {
            url: 'https://example.com', // HTML page, not XML
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        const textContent = content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
        // Should still parse but likely find 0 URLs since it's not a sitemap
        expect(textContent?.text).toContain('Total URLs found:');
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle invalid regex patterns',
      async () => {
        const result = await client.callTool({
          name: 'parse_sitemap',
          arguments: {
            url: 'https://nodejs.org/sitemap.xml',
            filter_pattern: '[invalid(regex', // Invalid regex
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        const textContent = content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
        expect(textContent?.text).toContain('Error');
        expect(textContent?.text?.toLowerCase()).toMatch(/failed|error|invalid/);
      },
      TEST_TIMEOUTS.medium,
    );
  });
});

```
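
The invalid-regex test above depends on `filter_pattern` being compiled at call time. A try/catch around `new RegExp` is the usual way to surface a bad pattern as a tool error; a sketch, assuming the handler reports it that way:

```typescript
// Sketch of call-time filter_pattern compilation, assuming the handler
// converts RegExp syntax errors into a tool error (as the test above expects).
function filterUrls(urls: string[], filterPattern?: string): string[] {
  if (!filterPattern) return urls;
  let re: RegExp;
  try {
    re = new RegExp(filterPattern);
  } catch (err) {
    throw new Error(`Error: invalid filter_pattern: ${(err as Error).message}`);
  }
  return urls.filter((url) => re.test(url));
}

console.log(filterUrls(['https://nodejs.org/learn/a', 'https://nodejs.org/about'], '.*/learn/.*'));
// ['https://nodejs.org/learn/a']
```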

--------------------------------------------------------------------------------
/src/__tests__/integration/crawl-handlers.integration.test.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js';

interface ToolResult {
  content: Array<{
    type: string;
    text?: string;
  }>;
}

describe('Crawl Handlers Integration Tests', () => {
  let client: Client;

  beforeAll(async () => {
    client = await createTestClient();
  }, TEST_TIMEOUTS.medium);

  afterAll(async () => {
    if (client) {
      await cleanupTestClient(client);
    }
  });

  describe('batch_crawl error handling', () => {
    it(
      'should handle batch crawl with invalid URLs',
      async () => {
        const result = await client.callTool({
          name: 'batch_crawl',
          arguments: {
            urls: ['not-a-valid-url', 'https://this-domain-does-not-exist-12345.com'],
            max_concurrent: 2,
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content[0].type).toBe('text');
        // Zod validation will catch the invalid URL format
        expect(content[0].text).toContain('Invalid parameters');
      },
      TEST_TIMEOUTS.medium,
    );
  });

  describe('smart_crawl edge cases', () => {
    it(
      'should detect XML content type for XML URLs',
      async () => {
        const result = await client.callTool({
          name: 'smart_crawl',
          arguments: {
            url: 'https://httpbin.org/xml',
            bypass_cache: true,
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content[0].text).toContain('Smart crawl detected content type:');
        // Detection is based on the content-type header; accept xml or json,
        // since httpbin.org/xml has been observed to return JSON
        expect(content[0].text?.toLowerCase()).toMatch(/xml|json/);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle follow_links with sitemap URLs',
      async () => {
        // Note: Most sites don't have accessible sitemaps, so this tests the logic
        const result = await client.callTool({
          name: 'smart_crawl',
          arguments: {
            url: 'https://example.com/sitemap.xml',
            follow_links: true,
            max_depth: 2,
            bypass_cache: true,
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content[0].text).toContain('Smart crawl detected content type:');
      },
      TEST_TIMEOUTS.long, // Increase timeout for sitemap processing
    );
  });

  describe('crawl_recursive edge cases', () => {
    it(
      'should respect max_depth limit of 0',
      async () => {
        const result = await client.callTool({
          name: 'crawl_recursive',
          arguments: {
            url: 'https://httpbin.org/links/5/0',
            max_depth: 0, // Should only crawl the initial page
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        // The test might show 0 pages if the URL fails, or 1 page if it succeeds
        expect(content[0].text).toMatch(/Pages crawled: [01]/);
        // If pages were crawled, check for max depth message
        if (content[0].text?.includes('Pages crawled: 1')) {
          expect(content[0].text).toContain('Max depth reached: 0');
        }
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle sites with no internal links',
      async () => {
        const result = await client.callTool({
          name: 'crawl_recursive',
          arguments: {
            url: 'https://httpbin.org/json', // JSON endpoint has no links
            max_depth: 2,
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content[0].text).toContain('Pages crawled: 1');
        expect(content[0].text).toContain('Internal links found: 0');
      },
      TEST_TIMEOUTS.medium,
    );
  });

  describe('parse_sitemap error handling', () => {
    it(
      'should handle non-existent sitemap URLs',
      async () => {
        const result = await client.callTool({
          name: 'parse_sitemap',
          arguments: {
            url: 'https://this-domain-does-not-exist-12345.com/sitemap.xml',
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content[0].text).toContain('Error');
        expect(content[0].text?.toLowerCase()).toMatch(/failed|error|not found/);
      },
      TEST_TIMEOUTS.medium,
    );
  });

  describe('crawl method edge cases', () => {
    it(
      'should handle crawl with all image and filtering parameters',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://example.com',
            word_count_threshold: 50,
            image_description_min_word_threshold: 10,
            image_score_threshold: 0.5,
            exclude_social_media_links: true,
            cache_mode: 'BYPASS',
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content[0].type).toBe('text');
        // Should successfully crawl with these parameters
        expect(content[0].text).not.toContain('Error');
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle js_code as null with validation error',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://example.com',
            js_code: null as unknown as string, // Intentionally pass null
          },
        });

        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content[0].text).toContain('Invalid parameters for crawl');
        expect(content[0].text).toContain('js_code');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should work with session_id parameter using manage_session',
      async () => {
        // First create a session using manage_session
        const sessionResult = await client.callTool({
          name: 'manage_session',
          arguments: {
            action: 'create',
            session_id: 'test-crawl-session-new',
          },
        });

        expect(sessionResult).toBeDefined();

        // Then use it for crawling
        const crawlResult = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://example.com',
            session_id: 'test-crawl-session-new',
          },
        });

        expect(crawlResult).toBeDefined();
        const content = (crawlResult as ToolResult).content;
        expect(content[0].type).toBe('text');
        expect(content[0].text).not.toContain('Error');

        // Clean up using manage_session
        await client.callTool({
          name: 'manage_session',
          arguments: {
            action: 'clear',
            session_id: 'test-crawl-session-new',
          },
        });
      },
      TEST_TIMEOUTS.medium,
    );
  });
});

```

--------------------------------------------------------------------------------
/src/__tests__/integration/crawl-advanced.integration.test.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { createTestClient, cleanupTestClient, expectSuccessfulCrawl, TEST_TIMEOUTS } from './test-utils.js';

interface ToolResult {
  content: Array<{
    type: string;
    text?: string;
    data?: string;
    mimeType?: string;
  }>;
}

describe('crawl Advanced Features Integration Tests', () => {
  let client: Client;

  beforeAll(async () => {
    client = await createTestClient();
  }, TEST_TIMEOUTS.medium);

  afterAll(async () => {
    if (client) {
      await cleanupTestClient(client);
    }
  });

  describe('Media and Content Extraction', () => {
    it(
      'should extract images with scoring',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://example.com',
            image_score_threshold: 3,
            exclude_external_images: false,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent?.text).toBeTruthy();
        // Should have extracted content
        expect(textContent?.text).toContain('Example Domain');
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should capture MHTML',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://example.com',
            capture_mhtml: true,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent?.text).toBeTruthy();
        // MHTML should be captured but not in the text output
        expect(textContent?.text).toContain('Example Domain');
      },
      TEST_TIMEOUTS.long,
    );

    it(
      'should extract tables from Wikipedia',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)',
            word_count_threshold: 10,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent?.text).toBeTruthy();
        // Should contain country data
        expect(textContent?.text).toMatch(/China|India|United States/);
      },
      TEST_TIMEOUTS.long,
    );
  });

  describe('Link and Content Filtering', () => {
    it(
      'should exclude social media links',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://www.bbc.com/news',
            exclude_social_media_links: true,
            exclude_domains: ['twitter.com', 'facebook.com', 'instagram.com'],
            cache_mode: 'BYPASS',
            word_count_threshold: 50,
          },
        });

        await expectSuccessfulCrawl(result);
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent?.text).toBeTruthy();
        // Should have news content but no social media references in extracted links
        expect(textContent?.text).toContain('BBC');
      },
      TEST_TIMEOUTS.long,
    );

    it(
      'should remove excluded selectors',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/html',
            excluded_selector: 'div:first-child',
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.medium,
    );
  });

  describe('Page Navigation Options', () => {
    it(
      'should wait for images to load',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/image/png',
            wait_for_images: true,
            wait_until: 'load',
            page_timeout: 30000,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should scan full page',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/html',
            scan_full_page: true,
            delay_before_scroll: 0.5,
            scroll_delay: 0.2,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.medium,
    );
  });

  describe('Stealth and Bot Detection', () => {
    it(
      'should use magic mode',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/headers',
            magic: true,
            simulate_user: true,
            override_navigator: true,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.long,
    );
  });

  describe('Extraction Strategies (0.7.3/0.7.4)', () => {
    it(
      'should accept extraction_strategy parameter',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/html',
            extraction_strategy: {
              type: 'custom',
              provider: 'openai',
              api_key: 'test-key',
              model: 'gpt-4',
            },
            cache_mode: 'BYPASS',
          },
        });

        // The parameter should be accepted even if not fully processed
        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should accept table_extraction_strategy parameter',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/html',
            table_extraction_strategy: {
              enable_chunking: true,
              thresholds: {
                min_rows: 5,
                max_columns: 20,
              },
            },
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should accept markdown_generator_options parameter',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/html',
            markdown_generator_options: {
              include_links: true,
              preserve_formatting: true,
            },
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.short,
    );
  });

  describe('Virtual Scroll', () => {
    it(
      'should handle virtual scroll configuration',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/html',
            virtual_scroll_config: {
              container_selector: 'body',
              scroll_count: 3,
              scroll_by: 'container_height',
              wait_after_scroll: 0.5,
            },
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.medium,
    );
  });
});

```
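
The structured parameters exercised here (`virtual_scroll_config`, `table_extraction_strategy`, `markdown_generator_options`) are plain JSON objects. The type below is inferred only from the test payloads above; the server's actual schemas live in `src/schemas/validation-schemas.ts`:

```typescript
// virtual_scroll_config shape as used in the test above (inferred, not authoritative).
interface VirtualScrollConfig {
  container_selector: string;
  scroll_count: number;
  scroll_by: number | 'container_height'; // the test uses the string form
  wait_after_scroll: number; // seconds, matching delay_before_scroll elsewhere
}

const virtualScroll: VirtualScrollConfig = {
  container_selector: 'body',
  scroll_count: 3,
  scroll_by: 'container_height',
  wait_after_scroll: 0.5,
};
```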

--------------------------------------------------------------------------------
/src/__tests__/index.npx.test.ts:
--------------------------------------------------------------------------------

```typescript
import { spawn } from 'child_process';
import * as path from 'path';
import * as url from 'url';
import * as fs from 'fs/promises';

const __dirname = url.fileURLToPath(new URL('.', import.meta.url));

describe('NPX Execution Tests', () => {
  // These tests ensure the package works when installed and run via npx
  // This prevents issues like the one in v2.6.11 where the server wouldn't start

  describe('Simulated NPX execution', () => {
    it('should start server when run from dist/index.js directly', async () => {
      // This simulates how npx runs the built package
      const distIndexPath = path.join(__dirname, '..', '..', 'dist', 'index.js');

      // Check if dist/index.js exists (it should after build)
      try {
        await fs.access(distIndexPath);
      } catch {
        console.warn('Skipping test - dist/index.js not found. Run "npm run build" first.');
        return;
      }

      const child = spawn('node', [distIndexPath], {
        env: {
          ...process.env,
          CRAWL4AI_BASE_URL: 'http://localhost:11235',
          CRAWL4AI_API_KEY: 'test-key',
          // Don't load .env file to simulate production
          NODE_ENV: 'production',
        },
        stdio: 'pipe',
      });

      let stderr = '';
      child.stderr.on('data', (data) => {
        stderr += data.toString();
      });

      // Wait for server to start
      await new Promise<void>((resolve) => {
        const timeout = setTimeout(() => {
          child.kill();
          resolve();
        }, 2000);

        child.stderr.on('data', (data) => {
          const output = data.toString();
          if (output.includes('started')) {
            clearTimeout(timeout);
            child.kill();
            resolve();
          }
        });
      });

      // Server should have started successfully
      expect(stderr).toContain('crawl4ai-mcp');
      expect(stderr).toContain('started');
    });

    it('should start server without dotenv when env vars are provided', async () => {
      // This tests that we don't require dotenv in production
      const distIndexPath = path.join(__dirname, '..', '..', 'dist', 'index.js');

      try {
        await fs.access(distIndexPath);
      } catch {
        console.warn('Skipping test - dist/index.js not found.');
        return;
      }

      // Temporarily rename node_modules/dotenv to simulate it not being available
      const dotenvPath = path.join(__dirname, '..', '..', 'node_modules', 'dotenv');
      const dotenvBackupPath = path.join(__dirname, '..', '..', 'node_modules', 'dotenv.backup');

      let dotenvRenamed = false;
      try {
        // Only rename if dotenv exists
        try {
          await fs.access(dotenvPath);
          await fs.rename(dotenvPath, dotenvBackupPath);
          dotenvRenamed = true;
        } catch {
          // dotenv doesn't exist, which is fine for this test
        }

        const child = spawn('node', [distIndexPath], {
          env: {
            CRAWL4AI_BASE_URL: 'http://localhost:11235',
            CRAWL4AI_API_KEY: 'test-key',
            PATH: process.env.PATH,
          },
          stdio: 'pipe',
        });

        let stderr = '';
        child.stderr.on('data', (data) => {
          stderr += data.toString();
        });

        // Wait for server to start
        await new Promise<void>((resolve) => {
          setTimeout(() => {
            child.kill();
            resolve();
          }, 2000);
        });

        // Server should still start even without dotenv
        expect(stderr).toContain('crawl4ai-mcp');
        expect(stderr).toContain('started');
      } finally {
        // Restore dotenv if we renamed it
        if (dotenvRenamed) {
          await fs.rename(dotenvBackupPath, dotenvPath);
        }
      }
    });

    it('should handle MCP protocol initialization', async () => {
      // This simulates the full MCP handshake that Claude Desktop does
      const distIndexPath = path.join(__dirname, '..', '..', 'dist', 'index.js');

      try {
        await fs.access(distIndexPath);
      } catch {
        console.warn('Skipping test - dist/index.js not found.');
        return;
      }

      const child = spawn('node', [distIndexPath], {
        env: {
          ...process.env,
          CRAWL4AI_BASE_URL: 'http://localhost:11235',
          CRAWL4AI_API_KEY: 'test-key',
        },
        stdio: 'pipe',
      });

      let stdout = '';
      let stderr = '';

      child.stdout.on('data', (data) => {
        stdout += data.toString();
      });

      child.stderr.on('data', (data) => {
        stderr += data.toString();
      });

      // Wait for server to start
      await new Promise((resolve) => setTimeout(resolve, 500));

      // Send MCP initialization request (like Claude Desktop does)
      const initRequest =
        JSON.stringify({
          jsonrpc: '2.0',
          method: 'initialize',
          params: {
            protocolVersion: '2025-06-18',
            capabilities: {},
            clientInfo: {
              name: 'test-client',
              version: '1.0.0',
            },
          },
          id: 1,
        }) + '\n';

      child.stdin.write(initRequest);

      // Wait for response
      await new Promise((resolve) => setTimeout(resolve, 1000));

      // Parse the response
      const response = stdout.trim().split('\n').pop();
      let parsed;
      try {
        parsed = JSON.parse(response || '{}');
      } catch {
        // Response might not be valid JSON yet
        parsed = {};
      }

      child.kill();

      // Should have received an initialization response
      expect(stderr).toContain('started');
      expect(parsed.id).toBe(1);
      expect(parsed.result).toBeDefined();
    });

    it('should fail gracefully when CRAWL4AI_BASE_URL is missing', async () => {
      const distIndexPath = path.join(__dirname, '..', '..', 'dist', 'index.js');

      try {
        await fs.access(distIndexPath);
      } catch {
        console.warn('Skipping test - dist/index.js not found.');
        return;
      }

      const child = spawn('node', [distIndexPath], {
        env: {
          // Explicitly set to empty string to prevent dotenv from loading
          CRAWL4AI_BASE_URL: '',
          PATH: process.env.PATH,
        },
        stdio: 'pipe',
      });

      let stderr = '';
      child.stderr.on('data', (data) => {
        stderr += data.toString();
      });

      const exitCode = await new Promise<number | null>((resolve, reject) => {
        // Add timeout to prevent hanging
        const timeout = setTimeout(() => {
          child.kill('SIGTERM');
          reject(new Error('Process timeout'));
        }, 10000); // 10 second timeout

        child.on('exit', (code) => {
          clearTimeout(timeout);
          resolve(code);
        });

        child.on('error', (err) => {
          clearTimeout(timeout);
          reject(err);
        });
      });

      // Should exit with error code
      expect(exitCode).toBe(1);
      expect(stderr).toContain('CRAWL4AI_BASE_URL environment variable is required');

      // Ensure cleanup
      child.kill();
    }, 15000); // 15 second test timeout
  });

  describe('NPX-specific edge cases', () => {
    it('should work with different Node.js execution paths', async () => {
      // NPX might use different paths for node execution
      const distIndexPath = path.join(__dirname, '..', '..', 'dist', 'index.js');

      try {
        await fs.access(distIndexPath);
      } catch {
        console.warn('Skipping test - dist/index.js not found.');
        return;
      }

      // Test with different argv[1] values that npx might use
      const testPaths = [
        distIndexPath,
        '/tmp/npx-12345/node_modules/.bin/mcp-crawl4ai-ts',
        path.join(process.env.HOME || '', '.npm/_npx/12345/node_modules/mcp-crawl4ai-ts/dist/index.js'),
      ];

      for (const testPath of testPaths) {
        const child = spawn('node', [distIndexPath], {
          env: {
            ...process.env,
            CRAWL4AI_BASE_URL: 'http://localhost:11235',
            // Simulate different execution contexts
            npm_execpath: testPath,
          },
          stdio: 'pipe',
        });

        let started = false;
        child.stderr.on('data', (data) => {
          if (data.toString().includes('started')) {
            started = true;
          }
        });

        // Give it time to start
        await new Promise((resolve) => setTimeout(resolve, 500));
        child.kill();

        expect(started).toBe(true);
      }
    });
  });
});

```

--------------------------------------------------------------------------------
/src/types.ts:
--------------------------------------------------------------------------------

```typescript
export interface CrawlOptions {
  remove_images?: boolean;
  bypass_cache?: boolean;
  filter_mode?: 'blacklist' | 'whitelist';
  filter_list?: string[];
  screenshot?: boolean;
  wait_for?: string;
  timeout?: number;
}

export interface JSExecuteOptions {
  js_code: string | string[];
  // Only url and js_code (scripts) are supported by /execute_js endpoint
}

export interface JSExecuteEndpointOptions {
  url: string;
  scripts: string | string[];
  // Only url and scripts are supported by /execute_js endpoint
}

export interface JSExecuteEndpointResponse {
  success: boolean;
  js_execution_result: {
    success: boolean;
    results: unknown[];
  };
  markdown?: string | CrawlMarkdownResult;
}

export interface ScreenshotEndpointOptions {
  url: string;
  screenshot_wait_for?: number;
  save_to_directory?: string;
  // output_path is omitted to get base64 response
}

export interface ScreenshotEndpointResponse {
  success: boolean;
  screenshot: string; // base64 encoded image
}

export interface PDFEndpointOptions {
  url: string;
  // Only url is supported by /pdf endpoint
}

export interface PDFEndpointResponse {
  success: boolean;
  pdf: string; // base64 encoded PDF
}

export interface HTMLEndpointOptions {
  url: string;
  // Only url is supported by /html endpoint
}

export interface HTMLEndpointResponse {
  html: string;
  url: string;
  success: boolean;
}

export type FilterType = 'raw' | 'fit' | 'bm25' | 'llm';

export interface MarkdownEndpointOptions {
  url: string;
  f?: FilterType; // Filter type: raw, fit (default), bm25, llm
  q?: string; // Query string for bm25/llm filters
  c?: string; // Cache-bust parameter
}

export interface MarkdownEndpointResponse {
  url: string;
  filter: string;
  query: string | null;
  cache: string;
  markdown: string;
  success: boolean;
}

export interface LLMEndpointOptions {
  url: string;
  query: string;
}

export interface LLMEndpointResponse {
  answer: string;
}

export interface BatchCrawlOptions extends CrawlOptions {
  urls: string[];
  max_concurrent?: number;
  // New: Support per-URL configs array (0.7.3/0.7.4)
  configs?: Array<{
    url: string;
    browser_config?: BrowserConfig;
    crawler_config?: CrawlerConfig;
    extraction_strategy?: ExtractionStrategy;
    table_extraction_strategy?: TableExtractionStrategy;
    markdown_generator_options?: MarkdownGeneratorOptions;
    matcher?: string | ((url: string) => boolean);
  }>;
}

// Browser configuration options
export interface BrowserConfig {
  browser_type?: 'chromium' | 'firefox' | 'webkit' | 'undetected';
  headless?: boolean;
  viewport_width?: number;
  viewport_height?: number;
  user_agent?: string;
  // Unified proxy config - accepts string or object format (new in 0.7.3/0.7.4)
  proxy?:
    | string
    | {
        server: string;
        username?: string;
        password?: string;
      };
  // Legacy field kept for backward compatibility
  proxy_config?: {
    server: string;
    username?: string;
    password?: string;
  };
  cookies?: Array<{
    name: string;
    value: string;
    domain: string;
    path?: string;
  }>;
  headers?: Record<string, string>;
  extra_args?: string[];
}

// Virtual scroll configuration for sites like Twitter/Instagram
export interface VirtualScrollConfig {
  container_selector: string;
  scroll_count?: number;
  scroll_by?: string | number;
  wait_after_scroll?: number;
}

// Crawler configuration options
export interface CrawlerConfig {
  // Content filtering
  word_count_threshold?: number;
  excluded_tags?: string[];
  excluded_selector?: string;
  remove_overlay_elements?: boolean;
  only_text?: boolean;
  remove_forms?: boolean;
  keep_data_attributes?: boolean;

  // JavaScript execution
  js_code?: string | string[];
  js_only?: boolean;
  wait_for?: string;
  wait_for_timeout?: number;

  // Page navigation & timing
  wait_until?: 'domcontentloaded' | 'networkidle' | 'load';
  page_timeout?: number;
  wait_for_images?: boolean;
  ignore_body_visibility?: boolean;

  // Dynamic content handling
  delay_before_scroll?: number;
  scroll_delay?: number;
  scan_full_page?: boolean;
  virtual_scroll_config?: VirtualScrollConfig;

  // Content processing
  process_iframes?: boolean;
  exclude_external_links?: boolean;

  // Media handling
  screenshot?: boolean;
  screenshot_wait_for?: number;
  pdf?: boolean;
  capture_mhtml?: boolean;
  image_description_min_word_threshold?: number;
  image_score_threshold?: number;
  exclude_external_images?: boolean;

  // Link filtering
  exclude_social_media_links?: boolean;
  exclude_domains?: string[];

  // Page interaction
  simulate_user?: boolean;
  override_navigator?: boolean;
  magic?: boolean;

  // Session management
  session_id?: string;

  // Cache control
  cache_mode?: 'ENABLED' | 'BYPASS' | 'DISABLED';

  // Performance options
  timeout?: number;
  verbose?: boolean;

  // Debug
  log_console?: boolean;

  // New parameters from 0.7.3/0.7.4
  delay_before_return_html?: number; // Delay in ms before capturing final HTML
  css_selector?: string; // CSS selector to extract specific elements
  include_links?: boolean; // Whether to include links in the response
  resolve_absolute_urls?: boolean; // Convert relative URLs to absolute ones
}

// Extraction strategy passthrough objects (new in 0.7.3/0.7.4)
export interface ExtractionStrategy {
  [key: string]: unknown;
}

export interface TableExtractionStrategy {
  enable_chunking?: boolean;
  thresholds?: Record<string, unknown>;
  [key: string]: unknown;
}

export interface MarkdownGeneratorOptions {
  include_links?: boolean;
  [key: string]: unknown;
}

// Advanced crawl configuration combining browser and crawler configs
export interface AdvancedCrawlConfig {
  url?: string;
  urls?: string[];
  browser_config?: BrowserConfig;
  crawler_config?: CrawlerConfig;
  priority?: number;
  extraction_strategy?: ExtractionStrategy;
  table_extraction_strategy?: TableExtractionStrategy;
  markdown_generator_options?: MarkdownGeneratorOptions;
}

// Session management types (used internally by MCP server)
export interface SessionInfo {
  id: string;
  created_at: Date;
  last_used: Date;
  initial_url?: string;
  metadata?: Record<string, unknown>;
}

// Crawl endpoint types
export interface CrawlEndpointOptions {
  urls: string[];
  browser_config?: BrowserConfig;
  crawler_config?: CrawlerConfig;
}

export interface CrawlMarkdownResult {
  raw_markdown: string;
  markdown_with_citations: string;
  references_markdown: string;
  fit_markdown: string;
  fit_html: string;
}

// Shared shape for images, videos, and audios in crawl results
interface MediaItem {
  src?: string | null;
  data?: string;
  alt?: string | null;
  desc?: string;
  score?: number;
  type?: string;
  group_id?: number;
  format?: string | null;
  width?: number | null;
}

export interface CrawlMediaResult {
  images: MediaItem[];
  videos: MediaItem[];
  audios: MediaItem[];
}

interface LinkItem {
  href: string;
  text: string;
  title: string;
  base_domain?: string | null;
  head_data?: Record<string, unknown> | null;
  head_extraction_status?: string | null;
  head_extraction_error?: string | null;
  intrinsic_score?: number;
  contextual_score?: number | null;
  total_score?: number | null;
}

export interface CrawlLinksResult {
  internal: LinkItem[];
  external: LinkItem[];
}

export interface CrawlResultItem {
  url: string;
  html: string;
  cleaned_html: string;
  fit_html: string;
  success: boolean;
  error_message?: string;
  status_code: number;
  response_headers: Record<string, unknown>;
  redirected_url?: string;
  session_id: string | null;
  metadata: Record<string, unknown>;
  links: CrawlLinksResult;
  media: CrawlMediaResult;
  markdown: CrawlMarkdownResult;
  tables: unknown[];
  extracted_content: unknown | null;
  screenshot: string | null; // base64 PNG when screenshot: true
  pdf: string | null; // base64 PDF when pdf: true
  mhtml: string | null;
  js_execution_result: {
    success: boolean;
    results: unknown[];
  } | null;
  downloaded_files: unknown | null;
  network_requests: unknown | null;
  console_messages: unknown | null;
  ssl_certificate: unknown | null;
  dispatch_result: unknown | null;
}

export interface CrawlEndpointResponse {
  success: boolean;
  results: CrawlResultItem[];
  server_processing_time_s: number;
  server_memory_delta_mb: number;
  server_peak_memory_mb: number;
}

```
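
For orientation, a minimal sketch (not part of the repo) of how these types compose into a typed `/crawl` request; the `axios` call mirrors what the service layer does, and only the type usage is the point:

```typescript
import axios from 'axios';
import type { CrawlEndpointOptions, CrawlEndpointResponse } from './types.js';

// Illustrative only: a typed request body for the /crawl endpoint
const body: CrawlEndpointOptions = {
  urls: ['https://example.com'],
  browser_config: { browser_type: 'chromium', headless: true },
  crawler_config: { cache_mode: 'BYPASS', screenshot: true },
};

async function crawlOnce(baseURL: string): Promise<void> {
  const { data } = await axios.post<CrawlEndpointResponse>(`${baseURL}/crawl`, body);
  for (const item of data.results) {
    // item.screenshot is a base64 PNG string when crawler_config.screenshot is true
    console.log(item.url, item.success, item.screenshot ? 'screenshot captured' : 'no screenshot');
  }
}
```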

--------------------------------------------------------------------------------
/src/schemas/validation-schemas.ts:
--------------------------------------------------------------------------------

```typescript
import { z } from 'zod';
import { validateJavaScriptCode, createStatelessSchema } from './helpers.js';

export const JsCodeSchema = z
  .union([
    z.string().refine(validateJavaScriptCode, {
      message:
        'Invalid JavaScript: Contains HTML entities (&quot;), literal \\n outside strings, or HTML tags. Use proper JS syntax with real quotes and newlines.',
    }),
    z.array(
      z.string().refine(validateJavaScriptCode, {
        message:
          'Invalid JavaScript: Contains HTML entities (&quot;), literal \\n outside strings, or HTML tags. Use proper JS syntax with real quotes and newlines.',
      }),
    ),
  ])
  .describe('JavaScript code as string or array of strings');

export const VirtualScrollConfigSchema = z.object({
  container_selector: z.string(),
  scroll_count: z.number().optional(),
  scroll_by: z.union([z.string(), z.number()]).optional(),
  wait_after_scroll: z.number().optional(),
});

const GetMarkdownBaseSchema = z.object({
  url: z.string().url(),
  filter: z.enum(['raw', 'fit', 'bm25', 'llm']).optional().default('fit'),
  query: z.string().optional(),
  cache: z.string().optional().default('0'),
});

export const GetMarkdownSchema = createStatelessSchema(GetMarkdownBaseSchema, 'get_markdown').refine(
  (data) => {
    // If filter is bm25 or llm, query is required
    if ((data.filter === 'bm25' || data.filter === 'llm') && !data.query) {
      return false;
    }
    return true;
  },
  {
    message: 'Query parameter is required when using bm25 or llm filter',
    path: ['query'],
  },
);

export const ExecuteJsSchema = createStatelessSchema(
  z.object({
    url: z.string().url(),
    scripts: JsCodeSchema,
  }),
  'execute_js',
);

export const GetHtmlSchema = createStatelessSchema(
  z.object({
    url: z.string().url(),
  }),
  'get_html',
);

export const CaptureScreenshotSchema = createStatelessSchema(
  z.object({
    url: z.string().url(),
    screenshot_wait_for: z.number().optional(),
    save_to_directory: z.string().optional().describe('Local directory to save screenshot file'),
    // output_path not exposed as MCP needs base64 data
  }),
  'capture_screenshot',
);

export const GeneratePdfSchema = createStatelessSchema(
  z.object({
    url: z.string().url(),
    // Only url is supported - output_path not exposed as MCP needs base64 data
  }),
  'generate_pdf',
);

export const ExtractWithLlmSchema = createStatelessSchema(
  z.object({
    url: z.string().url(),
    query: z.string(),
  }),
  'extract_with_llm',
);

export const BatchCrawlSchema = createStatelessSchema(
  z.object({
    urls: z.array(z.string().url()),
    max_concurrent: z.number().optional(),
    remove_images: z.boolean().optional(),
    bypass_cache: z.boolean().optional(),
    // New: Support per-URL configs array (0.7.3/0.7.4)
    configs: z
      .array(
        z.object({
          url: z.string().url(),
          browser_config: z.record(z.unknown()).optional(),
          crawler_config: z.record(z.unknown()).optional(),
          extraction_strategy: z.record(z.unknown()).optional(),
          table_extraction_strategy: z.record(z.unknown()).optional(),
          markdown_generator_options: z.record(z.unknown()).optional(),
          matcher: z.union([z.string(), z.function()]).optional(),
        }),
      )
      .optional(),
  }),
  'batch_crawl',
);

export const SmartCrawlSchema = createStatelessSchema(
  z.object({
    url: z.string().url(),
    max_depth: z.number().optional(),
    follow_links: z.boolean().optional(),
    bypass_cache: z.boolean().optional(),
  }),
  'smart_crawl',
);

export const ExtractLinksSchema = createStatelessSchema(
  z.object({
    url: z.string().url(),
    categorize: z.boolean().optional().default(true),
  }),
  'extract_links',
);

export const CrawlRecursiveSchema = createStatelessSchema(
  z.object({
    url: z.string().url(),
    max_depth: z.number().optional(),
    max_pages: z.number().optional(),
    include_pattern: z.string().optional(),
    exclude_pattern: z.string().optional(),
  }),
  'crawl_recursive',
);

export const ParseSitemapSchema = createStatelessSchema(
  z.object({
    url: z.string().url(),
    filter_pattern: z.string().optional(),
  }),
  'parse_sitemap',
);

// Unified session management schema
export const ManageSessionSchema = z.discriminatedUnion('action', [
  z.object({
    action: z.literal('create'),
    session_id: z.string().optional(),
    initial_url: z.string().url().optional(),
    browser_type: z.enum(['chromium', 'firefox', 'webkit']).optional(),
  }),
  z.object({
    action: z.literal('clear'),
    session_id: z.string(),
  }),
  z.object({
    action: z.literal('list'),
  }),
]);
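
// Illustrative: the discriminated union accepts exactly the fields for each action:
//   ManageSessionSchema.parse({ action: 'create', initial_url: 'https://example.com' }); // ok
//   ManageSessionSchema.parse({ action: 'clear', session_id: 'abc' });                   // ok
//   ManageSessionSchema.parse({ action: 'clear' });                                      // throws: session_id is required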

export const CrawlSchema = z
  .object({
    url: z.string().url(),

    // Browser configuration
    browser_type: z.enum(['chromium', 'firefox', 'webkit']).optional(),
    viewport_width: z.number().optional(),
    viewport_height: z.number().optional(),
    user_agent: z.string().optional(),
    proxy_server: z.string().optional(),
    proxy_username: z.string().optional(),
    proxy_password: z.string().optional(),
    cookies: z
      .array(
        z.object({
          name: z.string(),
          value: z.string(),
          domain: z.string(),
          path: z.string().optional(),
        }),
      )
      .optional(),
    headers: z.record(z.string()).optional(),
    extra_args: z.array(z.string()).optional(),

    // Content filtering
    word_count_threshold: z.number().optional(),
    excluded_tags: z.array(z.string()).optional(),
    excluded_selector: z.string().optional(),
    remove_overlay_elements: z.boolean().optional(),
    only_text: z.boolean().optional(),
    remove_forms: z.boolean().optional(),
    keep_data_attributes: z.boolean().optional(),

    // JavaScript execution
    js_code: JsCodeSchema.optional(),
    js_only: z.boolean().optional(),
    wait_for: z.string().optional(),
    wait_for_timeout: z.number().optional(),

    // Page navigation & timing
    wait_until: z.enum(['domcontentloaded', 'networkidle', 'load']).optional(),
    page_timeout: z.number().optional(),
    wait_for_images: z.boolean().optional(),
    ignore_body_visibility: z.boolean().optional(),

    // Dynamic content
    delay_before_scroll: z.number().optional(),
    scroll_delay: z.number().optional(),
    scan_full_page: z.boolean().optional(),
    virtual_scroll_config: VirtualScrollConfigSchema.optional(),

    // Content processing
    process_iframes: z.boolean().optional(),
    exclude_external_links: z.boolean().optional(),

    // Media handling
    screenshot: z.boolean().optional(),
    screenshot_wait_for: z.number().optional(),
    screenshot_directory: z
      .string()
      .optional()
      .describe('Local directory to save screenshot file when screenshot=true'),
    pdf: z.boolean().optional(),
    capture_mhtml: z.boolean().optional(),
    image_description_min_word_threshold: z.number().optional(),
    image_score_threshold: z.number().optional(),
    exclude_external_images: z.boolean().optional(),

    // Link filtering
    exclude_social_media_links: z.boolean().optional(),
    exclude_domains: z.array(z.string()).optional(),

    // Page interaction
    simulate_user: z.boolean().optional(),
    override_navigator: z.boolean().optional(),
    magic: z.boolean().optional(),

    // Session and cache
    session_id: z.string().optional(),
    cache_mode: z.enum(['ENABLED', 'BYPASS', 'DISABLED']).optional(),

    // Performance options
    timeout: z.number().optional(),
    verbose: z.boolean().optional(),

    // Debug
    log_console: z.boolean().optional(),

    // New parameters from 0.7.3/0.7.4
    delay_before_return_html: z.number().optional(),
    css_selector: z.string().optional(),
    include_links: z.boolean().optional(),
    resolve_absolute_urls: z.boolean().optional(),
  })
  .refine(
    (data) => {
      // js_only is for subsequent calls in same session, not first call
      // Using it incorrectly causes server errors
      if (data.js_only && !data.session_id) {
        return false;
      }
      return true;
    },
    {
      message:
        "Error: js_only requires session_id (it's for continuing existing sessions).\n" +
        'For first call with js_code, use: {js_code: [...], screenshot: true}\n' +
        'For multi-step: First {js_code: [...], session_id: "x"}, then {js_only: true, session_id: "x"}',
    },
  )
  .refine(
    (data) => {
      // Empty js_code array is not allowed
      if (Array.isArray(data.js_code) && data.js_code.length === 0) {
        return false;
      }
      return true;
    },
    {
      message:
        'Error: js_code array cannot be empty. Either provide JavaScript code to execute or remove the js_code parameter entirely.',
    },
  );

// Re-export types we need
export type { z };

```
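
A quick sketch (illustrative, assuming `createStatelessSchema` from helpers.js wraps the base object without altering its fields) of how the cross-field refinements behave at runtime:

```typescript
import { GetMarkdownSchema, CrawlSchema } from './validation-schemas.js';

// bm25/llm filters require a query, enforced by the .refine() above
const missingQuery = GetMarkdownSchema.safeParse({ url: 'https://example.com', filter: 'bm25' });
console.log(missingQuery.success); // false: "Query parameter is required when using bm25 or llm filter"

// js_only without a session_id is rejected before the request ever reaches the server
const badJsOnly = CrawlSchema.safeParse({ url: 'https://example.com', js_only: true });
console.log(badJsOnly.success); // false

// A minimal valid request needs only a URL
console.log(CrawlSchema.safeParse({ url: 'https://example.com' }).success); // true
```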

--------------------------------------------------------------------------------
/src/__tests__/schemas/validation-edge-cases.test.ts:
--------------------------------------------------------------------------------

```typescript
import { validateJavaScriptCode } from '../../schemas/helpers.js';
import { JsCodeSchema, CrawlSchema } from '../../schemas/validation-schemas.js';

describe('JavaScript Validation Edge Cases', () => {
  describe('validateJavaScriptCode', () => {
    describe('Valid JavaScript that might look suspicious', () => {
      it('should accept strings containing HTML-like syntax in string literals', () => {
        const validCases = [
          `const html = '<div class="test">Hello</div>';`,
          `const template = \`<button onclick="alert('test')">Click</button>\`;`,
          `const regex = /<div[^>]*>/g;`,
          `const arrow = () => { return '<span>Arrow</span>'; }`,
          `const className = 'container';`,
        ];

        validCases.forEach((code) => {
          expect(validateJavaScriptCode(code)).toBe(true);
        });
      });

      it('should accept legitimate escape sequences', () => {
        const validCases = [
          `const str = "Line 1\\nLine 2";`, // Real newline escape
          `const tab = "Col1\\tCol2";`,
          `const quote = "He said \\"Hello\\"";`,
          `const unicode = "\\u0048\\u0065\\u006C\\u006C\\u006F";`,
          `const template = \`Multi
line
string\`;`, // Real newlines in template literals
        ];

        validCases.forEach((code) => {
          expect(validateJavaScriptCode(code)).toBe(true);
        });
      });

      it('should accept complex but valid JavaScript patterns', () => {
        const validCases = [
          // Nested template literals
          `const nested = \`Outer \${inner ? \`Inner: \${value}\` : 'None'}\`;`,
          // Regular expressions that might look like HTML
          `const htmlTag = /<([a-z]+)([^>]*)>/gi;`,
          // JSON strings without HTML entities
          `const json = '{"name": "Test", "value": "Some data"}';`,
          // Function with HTML in comments
          `function render() {
            // This creates div content
            return document.createElement('div');
          }`,
          // Complex string concatenation
          `const result = '<div' + ' class="' + className + '">' + content + '</div>';`,
        ];

        validCases.forEach((code) => {
          expect(validateJavaScriptCode(code)).toBe(true);
        });
      });

      it('should accept Unicode and special characters', () => {
        const validCases = [
          `const emoji = "Hello 👋 World 🌍";`,
          `const chinese = "你好世界";`,
          `const arabic = "مرحبا بالعالم";`,
          `const special = "©2024 Company™";`,
          `const math = "∑(n=1 to ∞) = π²/6";`,
        ];

        validCases.forEach((code) => {
          expect(validateJavaScriptCode(code)).toBe(true);
        });
      });
    });

    describe('Invalid JavaScript that should be rejected', () => {
      it('should reject HTML entities outside string literals', () => {
        const invalidCases = [
          `const value = &quot;test&quot;;`, // HTML entities as code
          `const text = &amp;&amp; true;`,
          `if (a &lt; b) { }`,
          `const escaped = &nbsp;`,
          `return &apos;hello&apos;;`,
        ];

        invalidCases.forEach((code) => {
          expect(validateJavaScriptCode(code)).toBe(false);
        });
      });

      it('should reject literal backslash-n outside strings', () => {
        const invalidCases = [
          `const text = "Hello";\\nconst world = "World";`, // Literal \n between statements
          `console.log("test");\\nconsole.log("more");`,
          `return value;\\nreturn other;`,
        ];

        invalidCases.forEach((code) => {
          expect(validateJavaScriptCode(code)).toBe(false);
        });
      });

      it('should reject HTML tags outside string literals', () => {
        const invalidCases = [
          `<script>alert('test')</script>`,
          `<!DOCTYPE html>`,
          `<html><body>test</body></html>`,
          `<style>body { color: red; }</style>`,
        ];

        invalidCases.forEach((code) => {
          expect(validateJavaScriptCode(code)).toBe(false);
        });
      });
    });

    describe('Edge cases and boundaries', () => {
      it('should handle empty and whitespace-only input', () => {
        expect(validateJavaScriptCode('')).toBe(true);
        expect(validateJavaScriptCode('   ')).toBe(true);
        expect(validateJavaScriptCode('\n\n\n')).toBe(true);
        expect(validateJavaScriptCode('\t')).toBe(true);
      });

      it('should handle very long valid strings', () => {
        const longString = 'const x = "' + 'a'.repeat(10000) + '";';
        expect(validateJavaScriptCode(longString)).toBe(true);
      });

      it('should handle nested quotes correctly', () => {
        const validCases = [
          `const x = "She said \\"Hello\\" to me";`,
          `const y = 'It\\'s a nice day';`,
          `const z = \`Template with "quotes" and 'apostrophes'\`;`,
        ];

        validCases.forEach((code) => {
          expect(validateJavaScriptCode(code)).toBe(true);
        });
      });

      it('should handle multiline strings correctly', () => {
        const multiline = `
const longText = \`
  This is a multiline
  template literal with
  multiple lines
\`;`;
        expect(validateJavaScriptCode(multiline)).toBe(true);
      });
    });
  });

  describe('Schema Validation Edge Cases', () => {
    describe('JsCodeSchema', () => {
      it('should accept both string and array of strings', () => {
        expect(() => JsCodeSchema.parse('return 1;')).not.toThrow();
        expect(() => JsCodeSchema.parse(['return 1;', 'return 2;'])).not.toThrow();
      });

      it('should reject invalid JavaScript in arrays', () => {
        expect(() => JsCodeSchema.parse(['valid();', '&quot;invalid&quot;'])).toThrow();
      });

      it('should handle empty arrays', () => {
        expect(() => JsCodeSchema.parse([])).not.toThrow();
      });
    });

    describe('CrawlSchema edge cases', () => {
      it('should handle all optional parameters', () => {
        const minimal = { url: 'https://example.com' };
        expect(() => CrawlSchema.parse(minimal)).not.toThrow();
      });

      it('should validate js_only requires session_id', () => {
        const invalid = {
          url: 'https://example.com',
          js_only: true,
          // Missing session_id
        };
        expect(() => CrawlSchema.parse(invalid)).toThrow();
      });

      it('should reject empty js_code array', () => {
        const invalid = {
          url: 'https://example.com',
          js_code: [],
        };
        expect(() => CrawlSchema.parse(invalid)).toThrow();
      });

      it('should accept all valid cache modes', () => {
        const validModes = ['ENABLED', 'BYPASS', 'DISABLED'];
        validModes.forEach((mode) => {
          const config = { url: 'https://example.com', cache_mode: mode };
          expect(() => CrawlSchema.parse(config)).not.toThrow();
        });
      });

      it('should validate viewport dimensions', () => {
        const validViewport = {
          url: 'https://example.com',
          viewport_width: 1920,
          viewport_height: 1080,
        };
        expect(() => CrawlSchema.parse(validViewport)).not.toThrow();
      });

      it('should validate complex configurations', () => {
        const complex = {
          url: 'https://example.com',
          browser_type: 'chromium',
          viewport_width: 1280,
          viewport_height: 720,
          user_agent: 'Custom User Agent',
          headers: { 'X-Custom': 'value' },
          cookies: [{ name: 'session', value: '123', domain: '.example.com' }],
          js_code: ['document.querySelector("button").click()'],
          wait_for: '#loaded',
          screenshot: true,
          pdf: true,
          session_id: 'test-session',
          cache_mode: 'BYPASS',
        };
        expect(() => CrawlSchema.parse(complex)).not.toThrow();
      });
    });
  });

  describe('Property-based testing for regex patterns', () => {
    // Generate random valid JavaScript-like strings
    const generateValidJS = () => {
      const templates = [
        () => `const x = ${Math.random()};`,
        () => `function test() { return "${Math.random()}"; }`,
        () => `if (${Math.random() > 0.5}) { console.log("test"); }`,
        () => `const arr = [${Math.random()}, ${Math.random()}];`,
        () => `// Comment with ${Math.random()}`,
      ];
      return templates[Math.floor(Math.random() * templates.length)]();
    };

    it('should consistently validate generated valid JavaScript', () => {
      for (let i = 0; i < 100; i++) {
        const code = generateValidJS();
        expect(validateJavaScriptCode(code)).toBe(true);
      }
    });

    // Test boundary conditions with special characters
    const specialChars = ['<', '>', '&', '"', "'", '\\', '\n', '\r', '\t'];

    it('should handle special characters in string contexts correctly', () => {
      specialChars.forEach((char) => {
        const inString = `const x = "${char}";`;
        const inTemplate = `const y = \`${char}\`;`;

        // These should be valid (special chars inside strings)
        expect(validateJavaScriptCode(inString)).toBe(true);
        expect(validateJavaScriptCode(inTemplate)).toBe(true);
      });
    });
  });
});

```

--------------------------------------------------------------------------------
/src/handlers/utility-handlers.ts:
--------------------------------------------------------------------------------

```typescript
import { BaseHandler } from './base-handler.js';
import { JSExecuteEndpointOptions, JSExecuteEndpointResponse, CrawlResultItem } from '../types.js';

export class UtilityHandlers extends BaseHandler {
  async executeJS(options: JSExecuteEndpointOptions) {
    try {
      // Check that scripts is provided (null/undefined/empty string all fail this check)
      if (!options.scripts) {
        throw new Error(
          'scripts is required. Please provide JavaScript code to execute. Use "return" statements to get values back.',
        );
      }

      const result: JSExecuteEndpointResponse = await this.service.executeJS(options);

      // Extract JavaScript execution results
      const jsResults = result.js_execution_result?.results || [];
      // Ensure scripts is always an array for mapping
      const scripts = Array.isArray(options.scripts) ? options.scripts : [options.scripts];

      // Format results for display
      let formattedResults = '';
      if (jsResults.length > 0) {
        formattedResults = jsResults
          .map((res: unknown, idx: number) => {
            const script = scripts[idx] || 'Script ' + (idx + 1);
            // Handle the actual return value or success/error status
            let resultStr = '';
            if (res && typeof res === 'object' && 'success' in res) {
              // This is a status object (e.g., from null return or execution without return)
              const statusObj = res as { success: unknown; error?: unknown };
              resultStr = statusObj.success
                ? 'Executed successfully (no return value)'
                : `Error: ${statusObj.error || 'Unknown error'}`;
            } else {
              // This is an actual return value
              resultStr = JSON.stringify(res, null, 2);
            }
            return `Script: ${script}\nReturned: ${resultStr}`;
          })
          .join('\n\n');
      } else {
        formattedResults = 'No results returned';
      }

      // Handle markdown content - can be string or object
      let markdownContent = '';
      if (result.markdown) {
        if (typeof result.markdown === 'string') {
          markdownContent = result.markdown;
        } else if (typeof result.markdown === 'object' && result.markdown.raw_markdown) {
          // Use raw_markdown from the object structure
          markdownContent = result.markdown.raw_markdown;
        }
      }

      return {
        content: [
          {
            type: 'text',
            text: `JavaScript executed on: ${options.url}\n\nResults:\n${formattedResults}${markdownContent ? `\n\nPage Content After Execution:\n${markdownContent}` : ''}`,
          },
        ],
      };
    } catch (error) {
      throw this.formatError(error, 'execute JavaScript');
    }
  }

  async extractLinks(options: { url: string; categorize?: boolean }) {
    try {
      // Use crawl endpoint instead of md to get full link data
      const response = await this.axiosClient.post('/crawl', {
        urls: [options.url],
        crawler_config: {
          cache_mode: 'bypass',
        },
      });

      const results = response.data.results || [response.data];
      const result: CrawlResultItem = results[0] || {};

      // Variables for manually extracted links
      const manuallyExtractedInternal: string[] = [];
      const manuallyExtractedExternal: string[] = [];
      let hasManuallyExtractedLinks = false;

      // Check if the response is likely JSON or non-HTML content
      if (!result.links || (result.links.internal.length === 0 && result.links.external.length === 0)) {
        // Try to detect if this might be a JSON endpoint
        const markdownContent = result.markdown?.raw_markdown || result.markdown?.fit_markdown || '';
        const htmlContent = result.html || '';

        // Check for JSON indicators
        if (
          // Check URL pattern
          options.url.includes('/api/') ||
          options.url.includes('/api.') ||
          // Check content type (often shown in markdown conversion)
          markdownContent.includes('application/json') ||
          // Check for JSON structure patterns
          (markdownContent.startsWith('{') && markdownContent.endsWith('}')) ||
          (markdownContent.startsWith('[') && markdownContent.endsWith(']')) ||
          // Check HTML for JSON indicators
          htmlContent.includes('application/json') ||
          // Common JSON patterns
          markdownContent.includes('"links"') ||
          markdownContent.includes('"url"') ||
          markdownContent.includes('"data"')
        ) {
          return {
            content: [
              {
                type: 'text',
                text: `Note: ${options.url} appears to return JSON data rather than HTML. The extract_links tool is designed for HTML pages with <a> tags. To extract URLs from JSON, you would need to parse the JSON structure directly.`,
              },
            ],
          };
        }
        // If no links found but it's HTML, let's check the markdown content for href patterns
        if (markdownContent && markdownContent.includes('href=')) {
          // Extract links manually from markdown if server didn't provide them
          const hrefPattern = /href=["']([^"']+)["']/g;
          const foundLinks: string[] = [];
          let match;
          while ((match = hrefPattern.exec(markdownContent)) !== null) {
            foundLinks.push(match[1]);
          }
          if (foundLinks.length > 0) {
            hasManuallyExtractedLinks = true;
            // Categorize found links
            const currentDomain = new URL(options.url).hostname;

            foundLinks.forEach((link) => {
              try {
                const linkUrl = new URL(link, options.url);
                if (linkUrl.hostname === currentDomain) {
                  manuallyExtractedInternal.push(linkUrl.href);
                } else {
                  manuallyExtractedExternal.push(linkUrl.href);
                }
              } catch {
                // Relative link
                manuallyExtractedInternal.push(link);
              }
            });
          }
        }
      }

      // Handle both cases: API-provided links and manually extracted links
      let internalUrls: string[] = [];
      let externalUrls: string[] = [];

      if (result.links && (result.links.internal.length > 0 || result.links.external.length > 0)) {
        // Use API-provided links
        internalUrls = result.links.internal.map((link) => (typeof link === 'string' ? link : link.href));
        externalUrls = result.links.external.map((link) => (typeof link === 'string' ? link : link.href));
      } else if (hasManuallyExtractedLinks) {
        // Use manually extracted links
        internalUrls = manuallyExtractedInternal;
        externalUrls = manuallyExtractedExternal;
      }

      const allUrls = [...internalUrls, ...externalUrls];

      if (!options.categorize) {
        return {
          content: [
            {
              type: 'text',
              text: `All links from ${options.url}:\n${allUrls.join('\n')}`,
            },
          ],
        };
      }

      // Categorize links
      const categorized: Record<string, string[]> = {
        internal: [],
        external: [],
        social: [],
        documents: [],
        images: [],
        scripts: [],
      };

      // Further categorize links
      const socialDomains = ['facebook.com', 'twitter.com', 'linkedin.com', 'instagram.com', 'youtube.com'];
      const docExtensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'];
      const imageExtensions = ['.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp'];
      const scriptExtensions = ['.js', '.css'];

      // Categorize internal URLs
      internalUrls.forEach((href: string) => {
        if (docExtensions.some((ext) => href.toLowerCase().endsWith(ext))) {
          categorized.documents.push(href);
        } else if (imageExtensions.some((ext) => href.toLowerCase().endsWith(ext))) {
          categorized.images.push(href);
        } else if (scriptExtensions.some((ext) => href.toLowerCase().endsWith(ext))) {
          categorized.scripts.push(href);
        } else {
          categorized.internal.push(href);
        }
      });

      // Categorize external URLs
      externalUrls.forEach((href: string) => {
        if (socialDomains.some((domain) => href.includes(domain))) {
          categorized.social.push(href);
        } else if (docExtensions.some((ext) => href.toLowerCase().endsWith(ext))) {
          categorized.documents.push(href);
        } else if (imageExtensions.some((ext) => href.toLowerCase().endsWith(ext))) {
          categorized.images.push(href);
        } else if (scriptExtensions.some((ext) => href.toLowerCase().endsWith(ext))) {
          categorized.scripts.push(href);
        } else {
          categorized.external.push(href);
        }
      });

      // The non-categorized case returned early above, so return the categorized analysis
      return {
        content: [
          {
            type: 'text',
            text: `Link analysis for ${options.url}:\n\n${Object.entries(categorized)
              .map(
                ([category, links]: [string, string[]]) =>
                  `${category} (${links.length}):\n${links.slice(0, 10).join('\n')}${links.length > 10 ? '\n...' : ''}`,
              )
              .join('\n\n')}`,
          },
        ],
      };
    } catch (error) {
      throw this.formatError(error, 'extract links');
    }
  }
}

```
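
The fallback path in `extractLinks` reduces to a regex scan plus `URL`-based categorization. A standalone sketch of the same idea (the helper name is hypothetical, not part of the handler):

```typescript
// Hypothetical helper mirroring the fallback logic in extractLinks
function extractHrefs(content: string, pageUrl: string): { internal: string[]; external: string[] } {
  const internal: string[] = [];
  const external: string[] = [];
  const currentDomain = new URL(pageUrl).hostname;
  const hrefPattern = /href=["']([^"']+)["']/g;
  let match: RegExpExecArray | null;
  while ((match = hrefPattern.exec(content)) !== null) {
    try {
      // Resolve relative links against the page URL, then compare hostnames
      const linkUrl = new URL(match[1], pageUrl);
      (linkUrl.hostname === currentDomain ? internal : external).push(linkUrl.href);
    } catch {
      internal.push(match[1]); // unparseable value: keep as-is, treated as internal
    }
  }
  return { internal, external };
}
```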

--------------------------------------------------------------------------------
/src/crawl4ai-service.ts:
--------------------------------------------------------------------------------

```typescript
import axios, { AxiosInstance, AxiosError } from 'axios';
import {
  BatchCrawlOptions,
  AdvancedCrawlConfig,
  CrawlEndpointOptions,
  CrawlEndpointResponse,
  JSExecuteEndpointOptions,
  JSExecuteEndpointResponse,
  ScreenshotEndpointOptions,
  ScreenshotEndpointResponse,
  PDFEndpointOptions,
  PDFEndpointResponse,
  HTMLEndpointOptions,
  HTMLEndpointResponse,
  MarkdownEndpointOptions,
  MarkdownEndpointResponse,
  LLMEndpointOptions,
  LLMEndpointResponse,
} from './types.js';

// Helper to validate JavaScript code
const validateJavaScriptCode = (code: string): boolean => {
  // Check for common HTML entities that shouldn't be in JS
  if (/&quot;|&amp;|&lt;|&gt;|&#\d+;|&\w+;/.test(code)) {
    return false;
  }

  // Basic check to ensure it's not HTML
  if (/<(!DOCTYPE|html|body|head|script|style)\b/i.test(code)) {
    return false;
  }

  // Check for literal \n, \t, \r outside of strings (common LLM mistake)
  // Look for patterns like: ;\n or }\n or )\n which suggest literal newlines
  if (/[;})]\s*\\n|\\n\s*[{(/]/.test(code)) {
    return false;
  }

  // Check for obvious cases of literal \n between statements
  if (/[;})]\s*\\n\s*\w/.test(code)) {
    return false;
  }

  return true;
};
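
// Illustrative examples of what this heuristic accepts and rejects:
//   validateJavaScriptCode('document.querySelector("a").click();')  // -> true
//   validateJavaScriptCode('const x = &quot;hi&quot;;')             // -> false (HTML entities)
//   validateJavaScriptCode('foo();\\nbar();')                       // -> false (literal \n between statements)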

// Helper to validate URL format
const validateURL = (url: string): boolean => {
  try {
    new URL(url);
    return true;
  } catch {
    return false;
  }
};

// Helper to handle axios errors consistently
const handleAxiosError = (error: unknown): never => {
  if (axios.isAxiosError(error)) {
    const axiosError = error as AxiosError;

    // Handle timeout errors
    if (axiosError.code === 'ECONNABORTED') {
      throw new Error('Request timed out');
    }

    if (axiosError.code === 'ETIMEDOUT') {
      throw new Error('Request timeout');
    }

    // Handle network errors
    if (axiosError.code === 'ENOTFOUND') {
      throw new Error(`DNS resolution failed: ${axiosError.message}`);
    }

    if (axiosError.code === 'ECONNREFUSED') {
      throw new Error(`Connection refused: ${axiosError.message}`);
    }

    if (axiosError.code === 'ECONNRESET') {
      throw new Error(`Connection reset: ${axiosError.message}`);
    }

    if (axiosError.code === 'ENETUNREACH') {
      throw new Error(`Network unreachable: ${axiosError.message}`);
    }

    // Handle HTTP errors
    if (axiosError.response) {
      const status = axiosError.response.status;
      const data = axiosError.response.data as any; // eslint-disable-line @typescript-eslint/no-explicit-any
      const message = data?.error || data?.detail || data?.message || axiosError.message;
      throw new Error(`Request failed with status ${status}: ${message}`);
    }

    // Handle request errors (e.g., invalid URL)
    if (axiosError.request) {
      throw new Error(`Request failed: ${axiosError.message}`);
    }
  }

  // Re-throw unknown errors
  throw error;
};

export class Crawl4AIService {
  private axiosClient: AxiosInstance;

  constructor(baseURL: string, apiKey: string) {
    this.axiosClient = axios.create({
      baseURL,
      headers: {
        'X-API-Key': apiKey,
        'Content-Type': 'application/json',
      },
      timeout: 120000,
    });
  }

  async getMarkdown(options: MarkdownEndpointOptions): Promise<MarkdownEndpointResponse> {
    // Validate URL
    if (!validateURL(options.url)) {
      throw new Error('Invalid URL format');
    }

    try {
      const response = await this.axiosClient.post('/md', {
        url: options.url,
        f: options.f,
        q: options.q,
        c: options.c,
      });

      return response.data;
    } catch (error) {
      return handleAxiosError(error);
    }
  }

  async captureScreenshot(options: ScreenshotEndpointOptions): Promise<ScreenshotEndpointResponse> {
    // Validate URL
    if (!validateURL(options.url)) {
      throw new Error('Invalid URL format');
    }

    try {
      const response = await this.axiosClient.post('/screenshot', {
        url: options.url,
        screenshot_wait_for: options.screenshot_wait_for,
        // output_path is omitted to get base64 response
      });

      return response.data;
    } catch (error) {
      return handleAxiosError(error);
    }
  }

  async generatePDF(options: PDFEndpointOptions): Promise<PDFEndpointResponse> {
    // Validate URL
    if (!validateURL(options.url)) {
      throw new Error('Invalid URL format');
    }

    try {
      const response = await this.axiosClient.post('/pdf', {
        url: options.url,
        // output_path is omitted to get base64 response
      });

      return response.data;
    } catch (error) {
      return handleAxiosError(error);
    }
  }

  async executeJS(options: JSExecuteEndpointOptions): Promise<JSExecuteEndpointResponse> {
    // Validate URL
    if (!validateURL(options.url)) {
      throw new Error('Invalid URL format');
    }

    // Ensure scripts is always an array
    const scripts = Array.isArray(options.scripts) ? options.scripts : [options.scripts];

    // Validate each script
    for (const script of scripts) {
      if (!validateJavaScriptCode(script)) {
        throw new Error(
          'Invalid JavaScript: Contains HTML entities (&quot;), literal \\n outside strings, or HTML tags. Use proper JS syntax with real quotes and newlines.',
        );
      }
    }

    try {
      const response = await this.axiosClient.post('/execute_js', {
        url: options.url,
        scripts: scripts, // Always send as array
        // Only url and scripts are supported by the endpoint
      });

      return response.data;
    } catch (error) {
      return handleAxiosError(error);
    }
  }

  async batchCrawl(options: BatchCrawlOptions) {
    // Validate URLs
    if (!options.urls || options.urls.length === 0) {
      throw new Error('URLs array cannot be empty');
    }

    // Build crawler config if needed
    const crawler_config: Record<string, unknown> = {};

    // Handle remove_images by using exclude_tags
    if (options.remove_images) {
      crawler_config.exclude_tags = ['img', 'picture', 'svg'];
    }

    if (options.bypass_cache) {
      crawler_config.cache_mode = 'BYPASS';
    }

    try {
      const response = await this.axiosClient.post('/crawl', {
        urls: options.urls,
        max_concurrent: options.max_concurrent,
        crawler_config: Object.keys(crawler_config).length > 0 ? crawler_config : undefined,
      });

      return response.data;
    } catch (error) {
      return handleAxiosError(error);
    }
  }

  async getHTML(options: HTMLEndpointOptions): Promise<HTMLEndpointResponse> {
    // Validate URL
    if (!validateURL(options.url)) {
      throw new Error('Invalid URL format');
    }

    try {
      const response = await this.axiosClient.post('/html', {
        url: options.url,
        // Only url is supported by the endpoint
      });

      return response.data;
    } catch (error) {
      return handleAxiosError(error);
    }
  }

  async parseSitemap(url: string) {
    try {
      // Use axios directly without baseURL for fetching external URLs
      const response = await axios.get(url);
      return response.data;
    } catch (error) {
      return handleAxiosError(error);
    }
  }

  async detectContentType(url: string): Promise<string> {
    try {
      // Use axios directly without baseURL for external URLs
      const response = await axios.head(url);
      return response.headers['content-type'] || '';
    } catch {
      return '';
    }
  }

  async crawl(options: AdvancedCrawlConfig): Promise<CrawlEndpointResponse> {
    // Validate JS code if present
    if (options.crawler_config?.js_code) {
      const scripts = Array.isArray(options.crawler_config.js_code)
        ? options.crawler_config.js_code
        : [options.crawler_config.js_code];

      for (const script of scripts) {
        if (!validateJavaScriptCode(script)) {
          throw new Error(
            'Invalid JavaScript: Contains HTML entities (&quot;), literal \\n outside strings, or HTML tags. Use proper JS syntax with real quotes and newlines.',
          );
        }
      }
    }

    // Server only accepts urls array, not url string
    const urls = options.url ? [options.url] : options.urls || [];

    const requestBody: CrawlEndpointOptions & {
      extraction_strategy?: unknown;
      table_extraction_strategy?: unknown;
      markdown_generator_options?: unknown;
    } = {
      urls,
      browser_config: options.browser_config,
      crawler_config: options.crawler_config || {}, // Always include crawler_config, even if empty
    };

    // Add extraction strategy passthrough fields if present
    if (options.extraction_strategy) {
      requestBody.extraction_strategy = options.extraction_strategy;
    }
    if (options.table_extraction_strategy) {
      requestBody.table_extraction_strategy = options.table_extraction_strategy;
    }
    if (options.markdown_generator_options) {
      requestBody.markdown_generator_options = options.markdown_generator_options;
    }

    try {
      const response = await this.axiosClient.post('/crawl', requestBody);
      return response.data;
    } catch (error) {
      return handleAxiosError(error);
    }
  }

  async extractWithLLM(options: LLMEndpointOptions): Promise<LLMEndpointResponse> {
    // Validate URL
    if (!validateURL(options.url)) {
      throw new Error('Invalid URL format');
    }

    try {
      const encodedUrl = encodeURIComponent(options.url);
      const encodedQuery = encodeURIComponent(options.query);
      const response = await this.axiosClient.get(`/llm/${encodedUrl}?q=${encodedQuery}`);
      return response.data;
    } catch (error) {
      // Special handling for LLM-specific errors
      if (axios.isAxiosError(error)) {
        const axiosError = error as AxiosError;
        if (axiosError.code === 'ECONNABORTED' || axiosError.response?.status === 504) {
          throw new Error('LLM extraction timed out. Try a simpler query or different URL.');
        }
        if (axiosError.response?.status === 401) {
          throw new Error(
            'LLM extraction failed: No LLM provider configured on server. Please ensure the server has an API key set.',
          );
        }
      }
      return handleAxiosError(error);
    }
  }
}

```
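
A minimal usage sketch, assuming the service is constructed with the same environment variables the server reads (`CRAWL4AI_BASE_URL`, `CRAWL4AI_API_KEY`); the URLs are illustrative:

```typescript
import { Crawl4AIService } from './crawl4ai-service.js';

async function main(): Promise<void> {
  const service = new Crawl4AIService(
    process.env.CRAWL4AI_BASE_URL ?? 'http://localhost:11235',
    process.env.CRAWL4AI_API_KEY ?? '',
  );

  // Fetch fit-filtered markdown for a page
  const md = await service.getMarkdown({ url: 'https://example.com', f: 'fit' });
  console.log(md.markdown.slice(0, 200));

  // Execute JavaScript on a page; each script must pass validateJavaScriptCode
  const js = await service.executeJS({ url: 'https://example.com', scripts: 'return document.title;' });
  console.log(js.js_execution_result?.results);
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});
```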