# Directory Structure
```
├── .npmignore
├── LICENSE
├── package-lock.json
├── package.json
├── README.md
├── src
│ ├── api-client.ts
│ ├── embeddings.ts
│ ├── handlers
│ │ ├── add-documentation.ts
│ │ ├── base-handler.ts
│ │ ├── list-documentation.ts
│ │ ├── search-documentation.ts
│ │ └── test-embeddings.ts
│ ├── index.ts
│ ├── tools
│ │ ├── add-documentation.ts
│ │ ├── content-fetcher.ts
│ │ ├── list-utils.ts
│ │ ├── qdrant-client.ts
│ │ ├── search-utils.ts
│ │ ├── text-chunker.ts
│ │ └── url-processor.ts
│ ├── types
│ │ └── ollama.d.ts
│ └── types.ts
└── tsconfig.json
```
# Files
--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
```
1 | src/
2 | ref/
3 | .clinecontext
4 | .clinelearn
5 | .clinerules
6 | ragdocs_plan.md
7 | tsconfig.json
8 | .git
9 | .gitignore
10 | node_modules/
11 |
```
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
```markdown
1 | # RagDocs MCP Server
2 |
3 | A Model Context Protocol (MCP) server that provides RAG (Retrieval-Augmented Generation) capabilities using Qdrant vector database and Ollama/OpenAI embeddings. This server enables semantic search and management of documentation through vector similarity.
4 |
5 | ## Features
6 |
7 | - Add documentation with metadata
8 | - Semantic search through documents
9 | - List and organize documentation
10 | - Delete documents
11 | - Support for both Ollama (free) and OpenAI (paid) embeddings
12 | - Automatic text chunking and embedding generation
13 | - Vector storage with Qdrant
14 |
15 | ## Prerequisites
16 |
17 | - Node.js 16 or higher
18 | - One of the following Qdrant setups:
19 | - Local instance using Docker (free)
20 | - Qdrant Cloud account with API key (managed service)
21 | - One of the following for embeddings:
22 | - Ollama running locally (default, free)
23 | - OpenAI API key (optional, paid)
24 |
25 | ## Available Tools
26 |
27 | ### 1. add_document
28 | Add a document to the RAG system.
29 |
30 | Parameters:
31 | - `url` (required): Document URL/identifier
32 | - `content` (required): Document content
33 | - `metadata` (optional): Document metadata
34 | - `title`: Document title
35 | - `contentType`: Content type (e.g., "text/markdown")
36 |
37 | ### 2. search_documents
38 | Search through stored documents using semantic similarity.
39 |
40 | Parameters:
41 | - `query` (required): Natural language search query
42 | - `options` (optional):
43 | - `limit`: Maximum number of results (1-20, default: 5)
44 | - `scoreThreshold`: Minimum similarity score (0-1, default: 0.7)
45 | - `filters`:
46 | - `domain`: Filter by domain
47 | - `hasCode`: Filter for documents containing code
48 | - `after`: Filter for documents after date (ISO format)
49 | - `before`: Filter for documents before date (ISO format)
50 |
51 | ### 3. list_documents
52 | List all stored documents with pagination and grouping options.
53 |
54 | Parameters (all optional):
55 | - `page`: Page number (default: 1)
56 | - `pageSize`: Number of documents per page (1-100, default: 20)
57 | - `groupByDomain`: Group documents by domain (default: false)
58 | - `sortBy`: Sort field ("timestamp", "title", or "domain"; default: "timestamp")
59 | - `sortOrder`: Sort order ("asc" or "desc"; default: "desc")
60 |
61 | ### 4. delete_document
62 | Delete a document from the RAG system.
63 |
64 | Parameters:
65 | - `url` (required): URL of the document to delete
66 |
67 | ## Installation
68 |
69 | ```bash
70 | npm install -g @mcpservers/ragdocs
71 | ```
72 |
73 | ## MCP Server Configuration
74 |
75 | ```json
76 | {
77 | "mcpServers": {
78 | "ragdocs": {
79 | "command": "node",
80 | "args": ["@mcpservers/ragdocs"],
81 | "env": {
82 | "QDRANT_URL": "http://127.0.0.1:6333",
83 | "EMBEDDING_PROVIDER": "ollama"
84 | }
85 | }
86 | }
87 | }
88 | ```
89 |
90 | Using Qdrant Cloud:
91 | ```json
92 | {
93 | "mcpServers": {
94 | "ragdocs": {
95 | "command": "node",
96 | "args": ["@mcpservers/ragdocs"],
97 | "env": {
98 | "QDRANT_URL": "https://your-cluster-url.qdrant.tech",
99 | "QDRANT_API_KEY": "your-qdrant-api-key",
100 | "EMBEDDING_PROVIDER": "ollama"
101 | }
102 | }
103 | }
104 | }
105 | ```
106 |
107 | Using OpenAI:
108 | ```json
109 | {
110 | "mcpServers": {
111 | "ragdocs": {
112 | "command": "node",
113 | "args": ["@mcpservers/ragdocs"],
114 | "env": {
115 | "QDRANT_URL": "http://127.0.0.1:6333",
116 | "EMBEDDING_PROVIDER": "openai",
117 | "OPENAI_API_KEY": "your-api-key"
118 | }
119 | }
120 | }
121 | }
122 | ```
123 |
124 | ## Local Qdrant with Docker
125 |
126 | ```bash
127 | docker run -d --name qdrant -p 6333:6333 -p 6334:6334 qdrant/qdrant
128 | ```
129 |
130 | ## Environment Variables
131 |
132 | - `QDRANT_URL`: URL of your Qdrant instance
133 | - For local: "http://127.0.0.1:6333" (default)
134 | - For cloud: "https://your-cluster-url.qdrant.tech"
135 | - `QDRANT_API_KEY`: API key for Qdrant Cloud (required when using cloud instance)
136 | - `EMBEDDING_PROVIDER`: Choice of embedding provider ("ollama" or "openai", default: "ollama")
137 | - `OPENAI_API_KEY`: OpenAI API key (required if using OpenAI)
138 | - `EMBEDDING_MODEL`: Model to use for embeddings
139 | - For Ollama: defaults to "nomic-embed-text"
140 | - For OpenAI: defaults to "text-embedding-3-small"
141 |
142 | ## License
143 |
144 | Apache License 2.0
145 |
```
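A minimal usage sketch (not part of the package): assuming the `@modelcontextprotocol/sdk` client API (`Client`, `StdioClientTransport`, `callTool`) and the stdio configuration shown in the README, a client could exercise `add_document` and `search_documents` like this:

```typescript
// Hedged sketch: tool names and argument shapes follow the README above;
// the client-side SDK calls are assumptions based on @modelcontextprotocol/sdk.
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';

const transport = new StdioClientTransport({
  command: 'npx',
  args: ['-y', '@mcpservers/ragdocs'],
  env: { QDRANT_URL: 'http://127.0.0.1:6333', EMBEDDING_PROVIDER: 'ollama' },
});

const client = new Client({ name: 'ragdocs-example', version: '1.0.0' });
await client.connect(transport);

// Add a document, then search it semantically.
await client.callTool({
  name: 'add_document',
  arguments: {
    url: 'https://example.com/docs/intro',
    content: 'RagDocs stores documentation as embedded text chunks.',
    metadata: { title: 'Intro', contentType: 'text/markdown' },
  },
});

const result = await client.callTool({
  name: 'search_documents',
  arguments: { query: 'How are documents stored?', options: { limit: 5 } },
});
console.log(result.content);
```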
--------------------------------------------------------------------------------
/src/types/ollama.d.ts:
--------------------------------------------------------------------------------
```typescript
1 | declare module 'ollama' {
2 | export interface EmbeddingsRequest {
3 | model: string;
4 | prompt: string;
5 | options?: Record<string, any>;
6 | }
7 |
8 | export interface EmbeddingsResponse {
9 | embedding: number[];
10 | }
11 |
12 | const ollama: {
13 | embeddings(request: EmbeddingsRequest): Promise<EmbeddingsResponse>;
14 | };
15 |
16 | export default ollama;
17 | }
18 |
```
--------------------------------------------------------------------------------
/src/handlers/base-handler.ts:
--------------------------------------------------------------------------------
```typescript
1 | import { Server } from '@modelcontextprotocol/sdk/server/index.js';
2 | import { ApiClient } from '../api-client.js';
3 | import { ToolResult } from '../types.js';
4 |
5 | export abstract class BaseHandler {
6 | constructor(
7 | protected readonly server: Server,
8 | protected readonly apiClient: ApiClient
9 | ) {}
10 |
11 | abstract handle(args: any): Promise<ToolResult>;
12 | }
13 |
```
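Concrete handlers only need to implement `handle`; the base class exposes `server` and `apiClient` as protected members. As a hypothetical sketch (the real delete handler is not included in this listing), a delete handler could look like this, reusing `removeDocument` from the Qdrant wrapper:

```typescript
// Hypothetical sketch of a concrete handler; not the actual implementation.
// `removeDocument` is the QdrantWrapper method used by the add-documentation
// tool elsewhere in this repository.
import { BaseHandler } from './base-handler.js';
import { ToolResult } from '../types.js';

export class DeleteDocumentHandler extends BaseHandler {
  async handle(args: { url: string }): Promise<ToolResult> {
    if (!args.url) {
      return {
        content: [{ type: 'text', text: 'URL is required' }],
        isError: true,
      };
    }
    await this.apiClient.qdrant.removeDocument(args.url);
    return {
      content: [{ type: 'text', text: `Deleted document: ${args.url}` }],
    };
  }
}
```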
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
```json
1 | {
2 | "compilerOptions": {
3 | "target": "es2022",
4 | "module": "es2022",
5 | "moduleResolution": "node",
6 | "outDir": "build",
7 | "rootDir": "src",
8 | "strict": true,
9 | "esModuleInterop": true,
10 | "skipLibCheck": true,
11 | "forceConsistentCasingInFileNames": true,
12 | "resolveJsonModule": true
13 | },
14 | "include": ["src/**/*"],
15 | "exclude": ["node_modules"]
16 | }
17 |
```
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
```json
1 | {
2 | "name": "@mcpservers/ragdocs",
3 | "version": "1.0.0",
4 | "type": "module",
5 | "main": "build/index.js",
6 | "bin": {
7 | "mcp-ragdocs": "build/index.js"
8 | },
9 | "files": [
10 | "build/**/*"
11 | ],
12 | "scripts": {
13 | "build": "tsc && node -e \"require('fs').chmodSync('build/index.js', '755')\"",
14 | "prepublishOnly": "npm run build",
15 | "test": "echo \"Error: no test specified\" && exit 1"
16 | },
17 | "keywords": [
18 | "mcp",
19 | "rag",
20 | "documentation",
21 | "search",
22 | "embeddings"
23 | ],
24 | "author": "bossying",
25 | "license": "Apache License 2.0",
26 | "description": "MCP server for RAG-based document search and management",
27 | "homepage": "https://github.com/heltonteixeira/ragdocs",
28 | "repository": {
29 | "type": "git",
30 | "url": "git+https://github.com/heltonteixeira/ragdocs.git"
31 | },
32 | "dependencies": {
33 | "@modelcontextprotocol/sdk": "^1.0.4",
34 | "@qdrant/js-client-rest": "^1.12.0",
35 | "axios": "^1.7.9",
36 | "cheerio": "^1.0.0",
37 | "ollama": "^0.5.11",
38 | "openai": "^4.77.0",
39 | "playwright": "^1.49.1"
40 | },
41 | "devDependencies": {
42 | "typescript": "^5.7.2"
43 | }
44 | }
45 |
```
--------------------------------------------------------------------------------
/src/types.ts:
--------------------------------------------------------------------------------
```typescript
1 | import { DocumentMetadata } from './tools/qdrant-client.js';
2 |
3 | export interface Document {
4 | url: string;
5 | content: string;
6 | metadata: Partial<DocumentMetadata>;
7 | }
8 |
9 | export interface DocumentChunk {
10 | text: string;
11 | url: string;
12 | title: string;
13 | timestamp: string;
14 | }
15 |
16 | export interface DocumentPayload extends DocumentChunk {
17 | _type: 'DocumentChunk';
18 | [key: string]: unknown;
19 | }
20 |
21 | export function isDocumentPayload(payload: unknown): payload is DocumentPayload {
22 | if (!payload || typeof payload !== 'object') return false;
23 | const p = payload as Partial<DocumentPayload>;
24 | return (
25 | p._type === 'DocumentChunk' &&
26 | typeof p.text === 'string' &&
27 | typeof p.url === 'string' &&
28 | typeof p.title === 'string' &&
29 | typeof p.timestamp === 'string'
30 | );
31 | }
32 |
33 | export interface SearchOptions {
34 | limit?: number;
35 | scoreThreshold?: number;
36 | filters?: {
37 | domain?: string;
38 | hasCode?: boolean;
39 | after?: string;
40 | before?: string;
41 | };
42 | }
43 |
44 | export interface ToolDefinition {
45 | name: string;
46 | description: string;
47 | inputSchema: {
48 | type: string;
49 | properties: Record<string, any>;
50 | required: string[];
51 | };
52 | }
53 |
54 | export interface ToolResult {
55 | content: Array<{
56 | type: string;
57 | text: string;
58 | }>;
59 | isError?: boolean;
60 | }
61 |
62 | export interface RagDocsConfig {
63 | qdrantUrl: string;
64 | qdrantApiKey?: string;
65 | openaiApiKey: string;
66 | collectionName: string;
67 | }
68 |
```
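A short sketch of the `isDocumentPayload` guard in use: payloads coming back from Qdrant are `unknown`, and the guard narrows them before any field access.

```typescript
// Sketch: narrowing an unknown Qdrant payload with the type guard above.
import { isDocumentPayload } from './types.js';

function describeChunk(payload: unknown): string | null {
  if (!isDocumentPayload(payload)) return null;
  // Within this branch, payload is typed as DocumentPayload.
  return `${payload.title} <${payload.url}>: ${payload.text.length} chars`;
}

console.log(describeChunk({
  _type: 'DocumentChunk',
  text: 'hello world',
  url: 'https://example.com',
  title: 'Example',
  timestamp: new Date().toISOString(),
})); // "Example <https://example.com>: 11 chars"
```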
--------------------------------------------------------------------------------
/src/handlers/list-documentation.ts:
--------------------------------------------------------------------------------
```typescript
1 | import { Server } from '@modelcontextprotocol/sdk/server/index.js';
2 | import { BaseHandler } from './base-handler.js';
3 | import { QdrantWrapper } from '../tools/qdrant-client.js';
4 | import { ListOptions, ListResult, ListUtils } from '../tools/list-utils.js';
5 | import { ToolResult } from '../types.js';
6 | import { ApiClient } from '../api-client.js';
7 |
8 | export class ListDocumentationHandler extends BaseHandler {
9 | // `server` and `apiClient` are already declared as protected readonly
10 | // members on BaseHandler, so redeclaring and reassigning them here
11 | // was redundant; forwarding the arguments to super() is sufficient.
12 |
13 | constructor(server: Server, apiClient: ApiClient) {
14 | super(server, apiClient);
15 | // no additional initialization is needed
16 | }
17 |
18 | async handle(args: ListOptions): Promise<ToolResult> {
19 | try {
20 | // Ensure Qdrant is initialized
21 | await this.apiClient.qdrant.initializeCollection();
22 |
23 | // Set default values
24 | const page = args.page || 1;
25 | const pageSize = args.pageSize || 20;
26 | const sortBy = args.sortBy || 'timestamp';
27 | const sortOrder = args.sortOrder || 'desc';
28 |
29 | // Get documents with pagination
30 | const { total, documents } = await this.apiClient.qdrant.listDocuments({
31 | offset: (page - 1) * pageSize,
32 | limit: pageSize,
33 | sortBy,
34 | sortOrder,
35 | });
36 |
37 | // Calculate pagination details
38 | const { totalPages } = ListUtils.getPaginationDetails(total, page, pageSize);
39 |
40 | // Sort documents if needed
41 | const sortedDocs = ListUtils.sortDocuments(documents, sortBy, sortOrder);
42 |
43 | // Group by domain if requested
44 | const groupedDocs = args.groupByDomain
45 | ? ListUtils.groupByDomain(sortedDocs)
46 | : [{ documents: sortedDocs }];
47 |
48 | // Prepare result
49 | const result: ListResult = {
50 | total,
51 | page,
52 | pageSize,
53 | totalPages,
54 | documents: groupedDocs,
55 | };
56 |
57 | // Format as markdown
58 | const markdown = ListUtils.formatAsMarkdown(result);
59 |
60 | return {
61 | content: [
62 | {
63 | type: 'text',
64 | text: markdown,
65 | },
66 | ],
67 | };
68 | } catch (error) {
69 | return {
70 | content: [
71 | {
72 | type: 'text',
73 | text: `Failed to list documentation: ${(error as Error).message}`,
74 | },
75 | ],
76 | isError: true,
77 | };
78 | }
79 | }
80 | }
81 |
```
--------------------------------------------------------------------------------
/src/handlers/search-documentation.ts:
--------------------------------------------------------------------------------
```typescript
1 | import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js';
2 | import { BaseHandler } from './base-handler.js';
3 | import { QdrantWrapper } from '../tools/qdrant-client.js';
4 | import { EmbeddingService } from '../embeddings.js';
5 | import {
6 | SearchOptions,
7 | SearchResult,
8 | validateSearchOptions,
9 | extractSnippet,
10 | normalizeScore,
11 | formatResultsAsMarkdown,
12 | } from '../tools/search-utils.js';
13 |
14 | interface SearchDocumentationArgs {
15 | query: string;
16 | options?: SearchOptions;
17 | }
18 |
19 | export class SearchDocumentationHandler extends BaseHandler {
20 | private qdrant: QdrantWrapper;
21 | private embeddings: EmbeddingService;
22 |
23 | constructor(
24 | qdrant: QdrantWrapper,
25 | embeddings: EmbeddingService,
26 | ...args: ConstructorParameters<typeof BaseHandler>
27 | ) {
28 | super(...args);
29 | this.qdrant = qdrant;
30 | this.embeddings = embeddings;
31 | }
32 |
33 | async handle(args: SearchDocumentationArgs) {
34 | // Validate input
35 | if (!args.query?.trim()) {
36 | throw new McpError(
37 | ErrorCode.InvalidRequest,
38 | 'Query string is required'
39 | );
40 | }
41 |
42 | // Validate search options if provided
43 | if (args.options) {
44 | validateSearchOptions(args.options);
45 | }
46 |
47 | try {
48 | // Generate embeddings for the query
49 | console.error('Generating embeddings for query:', args.query);
50 | const queryVector = await this.embeddings.generateEmbeddings(args.query);
51 |
52 | // Search for similar documents
53 | console.error('Searching for similar documents...');
54 | const searchResults = await this.qdrant.searchSimilar(queryVector, args.options);
55 |
56 | // Process and format results
57 | const formattedResults: SearchResult[] = searchResults.map(result => ({
58 | url: result.url,
59 | title: result.title,
60 | domain: result.domain,
61 | timestamp: result.timestamp,
62 | score: normalizeScore(result.score),
63 | snippet: extractSnippet(result.content),
64 | metadata: {
65 | contentType: result.contentType,
66 | wordCount: result.wordCount,
67 | hasCode: result.hasCode,
68 | chunkIndex: result.chunkIndex,
69 | totalChunks: result.totalChunks,
70 | },
71 | }));
72 |
73 | // Format results as markdown
74 | const markdown = formatResultsAsMarkdown(formattedResults);
75 |
76 | return {
77 | content: [
78 | {
79 | type: 'text',
80 | text: markdown,
81 | },
82 | ],
83 | };
84 | } catch (error) {
85 | console.error('Search error:', error);
86 | throw new McpError(
87 | ErrorCode.InternalError,
88 | `Failed to search documentation: ${error}`
89 | );
90 | }
91 | }
92 | }
93 |
```
--------------------------------------------------------------------------------
/src/handlers/test-embeddings.ts:
--------------------------------------------------------------------------------
```typescript
1 | import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js';
2 | import { Server } from '@modelcontextprotocol/sdk/server/index.js';
3 | import { BaseHandler } from './base-handler.js';
4 | import { ApiClient } from '../api-client.js';
5 | import { ToolResult } from '../types.js';
6 | import { EmbeddingService } from '../embeddings.js';
7 |
8 | const COLLECTION_NAME = 'documentation';
9 |
10 | export class TestEmbeddingsHandler extends BaseHandler {
11 | constructor(server: Server, apiClient: ApiClient) {
12 | super(server, apiClient);
13 | }
14 |
15 | async handle(args: any): Promise<ToolResult> {
16 | if (!args.text || typeof args.text !== 'string') {
17 | throw new McpError(ErrorCode.InvalidParams, 'Text is required');
18 | }
19 |
20 | try {
21 | // Create a new embedding service instance with the requested configuration
22 | const tempEmbeddingService = EmbeddingService.createFromConfig({
23 | provider: args.provider || 'ollama',
24 | apiKey: args.apiKey,
25 | model: args.model
26 | });
27 |
28 | const embedding = await tempEmbeddingService.generateEmbeddings(args.text);
29 | const provider = args.provider || 'ollama';
30 | const model = args.model || (provider === 'ollama' ? 'nomic-embed-text' : 'text-embedding-3-small');
31 |
32 | // If the test succeeds, build a client with the new embedding config and resize the Qdrant collection to match
33 | const newApiClient = new ApiClient({
34 | embeddingConfig: {
35 | provider: args.provider || 'ollama',
36 | apiKey: args.apiKey,
37 | model: args.model
38 | },
39 | qdrantUrl: process.env.QDRANT_URL,
40 | qdrantApiKey: process.env.QDRANT_API_KEY
41 | });
42 |
43 | // Initialize collection with new vector size
44 | await newApiClient.initCollection(COLLECTION_NAME);
45 |
46 | return {
47 | content: [
48 | {
49 | type: 'text',
50 | text: `Successfully configured ${provider} embeddings (${model}).\nVector size: ${embedding.length}\nQdrant collection updated to match new vector size.`,
51 | },
52 | ],
53 | };
54 | } catch (error) {
55 | return {
56 | content: [
57 | {
58 | type: 'text',
59 | text: `Failed to test embeddings: ${error}`,
60 | },
61 | ],
62 | isError: true,
63 | };
64 | }
65 | }
66 | }
67 |
68 | export const testEmbeddingsSchema = {
69 | type: 'object',
70 | properties: {
71 | text: {
72 | type: 'string',
73 | description: 'Text to generate embeddings for',
74 | },
75 | provider: {
76 | type: 'string',
77 | description: 'Embedding provider to use (ollama or openai)',
78 | enum: ['ollama', 'openai'],
79 | default: 'ollama',
80 | },
81 | apiKey: {
82 | type: 'string',
83 | description: 'OpenAI API key (required if provider is openai)',
84 | },
85 | model: {
86 | type: 'string',
87 | description: 'Model to use for embeddings',
88 | },
89 | },
90 | required: ['text'],
91 | } as const;
92 |
```
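For illustration, here are two hypothetical argument objects matching `testEmbeddingsSchema`: the default Ollama configuration, and an OpenAI configuration that must carry an API key.

```typescript
// Illustrative arguments for the test_embeddings tool, per the schema above.
const ollamaArgs = {
  text: 'probe sentence for embedding',
};

const openaiArgs = {
  text: 'probe sentence for embedding',
  provider: 'openai' as const,
  apiKey: process.env.OPENAI_API_KEY!, // required when provider is openai
  model: 'text-embedding-3-small',
};

console.log(ollamaArgs, openaiArgs);
```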
--------------------------------------------------------------------------------
/src/tools/search-utils.ts:
--------------------------------------------------------------------------------
```typescript
1 | import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js';
2 | import { DocumentMetadata } from './qdrant-client.js';
3 |
4 | export interface SearchResult {
5 | url: string;
6 | title: string;
7 | domain: string;
8 | timestamp: string;
9 | score: number;
10 | snippet: string;
11 | metadata: Partial<DocumentMetadata>;
12 | }
13 |
14 | export interface SearchOptions {
15 | limit?: number;
16 | scoreThreshold?: number;
17 | filters?: {
18 | domain?: string;
19 | hasCode?: boolean;
20 | after?: string;
21 | before?: string;
22 | };
23 | }
24 |
25 | /**
26 | * Extracts a snippet of up to maxLength characters from the middle of the content, breaking on word boundaries
27 | */
28 | export function extractSnippet(content: string, maxLength: number = 300): string {
29 | // If content is shorter than maxLength, return it as is
30 | if (content.length <= maxLength) {
31 | return content;
32 | }
33 |
34 | // Find a good breaking point near the middle
35 | const middle = Math.floor(content.length / 2);
36 | const radius = Math.floor(maxLength / 2);
37 |
38 | let start = Math.max(0, middle - radius);
39 | let end = Math.min(content.length, middle + radius);
40 |
41 | // Adjust to avoid breaking words
42 | while (start > 0 && /\S/.test(content[start - 1])) start--;
43 | while (end < content.length && /\S/.test(content[end])) end++;
44 |
45 | let snippet = content.slice(start, end).trim();
46 |
47 | // Add ellipsis if we're not at the boundaries
48 | if (start > 0) snippet = '...' + snippet;
49 | if (end < content.length) snippet = snippet + '...';
50 |
51 | return snippet;
52 | }
53 |
54 | /**
55 | * Normalizes scores to be between 0 and 1
56 | */
57 | export function normalizeScore(score: number): number {
58 | // Qdrant uses cosine similarity which is already between -1 and 1
59 | // Convert to 0-1 range
60 | return (score + 1) / 2;
61 | }
62 |
63 | /**
64 | * Formats search results as markdown
65 | */
66 | export function formatResultsAsMarkdown(results: SearchResult[]): string {
67 | if (results.length === 0) {
68 | return 'No matching documents found.';
69 | }
70 |
71 | return results
72 | .map((result, index) => {
73 | const score = (result.score * 100).toFixed(1);
74 | return `
75 | ### ${index + 1}. ${result.title} (${score}% match)
76 | **URL:** ${result.url}
77 | **Domain:** ${result.domain}
78 | **Date:** ${new Date(result.timestamp).toLocaleDateString()}
79 |
80 | ${result.snippet}
81 | `;
82 | })
83 | .join('\n---\n');
84 | }
85 |
86 | /**
87 | * Validates search options
88 | */
89 | export function validateSearchOptions(options: SearchOptions): void {
90 | if (options.limit !== undefined && (options.limit < 1 || options.limit > 20)) {
91 | throw new McpError(
92 | ErrorCode.InvalidRequest,
93 | 'Limit must be between 1 and 20'
94 | );
95 | }
96 |
97 | if (
98 | options.scoreThreshold !== undefined &&
99 | (options.scoreThreshold < 0 || options.scoreThreshold > 1)
100 | ) {
101 | throw new McpError(
102 | ErrorCode.InvalidRequest,
103 | 'Score threshold must be between 0 and 1'
104 | );
105 | }
106 |
107 | if (options.filters?.after && isNaN(Date.parse(options.filters.after))) {
108 | throw new McpError(ErrorCode.InvalidRequest, 'Invalid after date format');
109 | }
110 |
111 | if (options.filters?.before && isNaN(Date.parse(options.filters.before))) {
112 | throw new McpError(ErrorCode.InvalidRequest, 'Invalid before date format');
113 | }
114 | }
115 |
```
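A quick sketch of these utilities in isolation; the expected values follow directly from the implementations above.

```typescript
// Sketch: exercising the search utilities directly.
import {
  extractSnippet,
  normalizeScore,
  validateSearchOptions,
} from './search-utils.js';

// Rejected: limit must be between 1 and 20.
try {
  validateSearchOptions({ limit: 50 });
} catch (error) {
  console.error('Invalid options:', (error as Error).message);
}

// Cosine similarity 0.4 maps to a 0.7 normalized score: (0.4 + 1) / 2.
console.log(normalizeScore(0.4)); // 0.7

// Long content is reduced to a ~300-character window around the middle,
// with ellipses marking the trimmed ends.
console.log(extractSnippet('word '.repeat(200), 300));
```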
--------------------------------------------------------------------------------
/src/tools/list-utils.ts:
--------------------------------------------------------------------------------
```typescript
1 | import { DocumentMetadata } from './qdrant-client.js';
2 |
3 | export interface ListOptions {
4 | page?: number;
5 | pageSize?: number;
6 | groupByDomain?: boolean;
7 | sortBy?: 'timestamp' | 'title' | 'domain';
8 | sortOrder?: 'asc' | 'desc';
9 | }
10 |
11 | export interface ListResult {
12 | total: number;
13 | page: number;
14 | pageSize: number;
15 | totalPages: number;
16 | documents: DocumentGroup[];
17 | }
18 |
19 | export interface DocumentGroup {
20 | domain?: string;
21 | documents: DocumentMetadata[];
22 | }
23 |
24 | export class ListUtils {
25 | /**
26 | * Groups documents by domain
27 | */
28 | static groupByDomain(documents: DocumentMetadata[]): DocumentGroup[] {
29 | const groupedMap = new Map<string, DocumentMetadata[]>();
30 |
31 | for (const doc of documents) {
32 | const domain = doc.domain;
33 | if (!groupedMap.has(domain)) {
34 | groupedMap.set(domain, []);
35 | }
36 | groupedMap.get(domain)!.push(doc);
37 | }
38 |
39 | return Array.from(groupedMap.entries()).map(([domain, docs]) => ({
40 | domain,
41 | documents: docs
42 | }));
43 | }
44 |
45 | /**
46 | * Sorts documents based on specified criteria
47 | */
48 | static sortDocuments(
49 | documents: DocumentMetadata[],
50 | sortBy: 'timestamp' | 'title' | 'domain' = 'timestamp',
51 | sortOrder: 'asc' | 'desc' = 'desc'
52 | ): DocumentMetadata[] {
53 | return [...documents].sort((a, b) => {
54 | let comparison: number;
55 | switch (sortBy) {
56 | case 'timestamp':
57 | comparison = new Date(a.timestamp).getTime() - new Date(b.timestamp).getTime();
58 | break;
59 | case 'title':
60 | comparison = a.title.localeCompare(b.title);
61 | break;
62 | case 'domain':
63 | comparison = a.domain.localeCompare(b.domain);
64 | break;
65 | default:
66 | comparison = 0;
67 | }
68 | return sortOrder === 'desc' ? -comparison : comparison;
69 | });
70 | }
71 |
72 | /**
73 | * Formats the list result as markdown
74 | */
75 | static formatAsMarkdown(result: ListResult): string {
76 | const lines: string[] = [];
77 |
78 | // Add header with pagination info
79 | lines.push(`# Documentation List`);
80 | lines.push(`Page ${result.page} of ${result.totalPages} (${result.total} total documents)\n`);
81 |
82 | // Add documents grouped by domain
83 | for (const group of result.documents) {
84 | if (group.domain) {
85 | lines.push(`## ${group.domain}`);
86 | }
87 |
88 | for (const doc of group.documents) {
89 | const date = new Date(doc.timestamp).toLocaleDateString();
90 | lines.push(`- [${doc.title}](${doc.url})`);
91 | lines.push(` - Added: ${date}`);
92 | lines.push(` - Type: ${doc.contentType}`);
93 | lines.push(` - Words: ${doc.wordCount}`);
94 | if (doc.hasCode) {
95 | lines.push(` - Contains code snippets`);
96 | }
97 | lines.push(``);
98 | }
99 | }
100 |
101 | return lines.join('\n');
102 | }
103 |
104 | /**
105 | * Calculates pagination details
106 | */
107 | static getPaginationDetails(
108 | total: number,
109 | page: number = 1,
110 | pageSize: number = 20
111 | ): { offset: number; limit: number; totalPages: number } {
112 | const totalPages = Math.max(1, Math.ceil(total / pageSize)); // at least one page, even when empty
113 | const currentPage = Math.min(Math.max(1, page), totalPages);
114 | const offset = (currentPage - 1) * pageSize;
115 |
116 | return {
117 | offset,
118 | limit: pageSize,
119 | totalPages
120 | };
121 | }
122 | }
123 |
```
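A small sketch of the pagination and grouping helpers; the two sample records below are hypothetical stand-ins for `DocumentMetadata` (defined in `qdrant-client.ts`, which is not shown in this listing).

```typescript
// Sketch: ListUtils with two hypothetical documents.
import { ListUtils } from './list-utils.js';

const docs = [
  { url: 'https://a.dev/x', title: 'X', domain: 'a.dev',
    timestamp: '2024-01-02T00:00:00Z', contentType: 'text/html',
    wordCount: 120, hasCode: false },
  { url: 'https://b.dev/y', title: 'Y', domain: 'b.dev',
    timestamp: '2024-01-01T00:00:00Z', contentType: 'text/html',
    wordCount: 80, hasCode: true },
] as any[]; // stand-in for DocumentMetadata[]

// 2 documents on one page of 20: offset 0, limit 20, 1 total page.
console.log(ListUtils.getPaginationDetails(2, 1, 20)); // { offset: 0, limit: 20, totalPages: 1 }

// Newest first, then grouped per domain.
const sorted = ListUtils.sortDocuments(docs, 'timestamp', 'desc');
console.log(ListUtils.groupByDomain(sorted).map(g => g.domain)); // [ 'a.dev', 'b.dev' ]
```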
--------------------------------------------------------------------------------
/src/embeddings.ts:
--------------------------------------------------------------------------------
```typescript
1 | import ollama from 'ollama';
2 | import OpenAI from 'openai';
3 | import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js';
4 |
5 | export interface EmbeddingProvider {
6 | generateEmbeddings(text: string): Promise<number[]>;
7 | getVectorSize(): number;
8 | }
9 |
10 | export class OllamaProvider implements EmbeddingProvider {
11 | private model: string;
12 |
13 | constructor(model: string = 'nomic-embed-text') {
14 | this.model = model;
15 | }
16 |
17 | async generateEmbeddings(text: string): Promise<number[]> {
18 | try {
19 | console.error('Generating Ollama embeddings for text:', text.substring(0, 50) + '...');
20 | const response = await ollama.embeddings({
21 | model: this.model,
22 | prompt: text
23 | });
24 | console.error('Successfully generated Ollama embeddings with size:', response.embedding.length);
25 | return response.embedding;
26 | } catch (error) {
27 | console.error('Ollama embedding error:', error);
28 | throw new McpError(
29 | ErrorCode.InternalError,
30 | `Failed to generate embeddings with Ollama: ${error}`
31 | );
32 | }
33 | }
34 |
35 | getVectorSize(): number {
36 | // nomic-embed-text produces 768-dimensional vectors
37 | return 768;
38 | }
39 | }
40 |
41 | export class OpenAIProvider implements EmbeddingProvider {
42 | private client: OpenAI;
43 | private model: string;
44 |
45 | constructor(apiKey: string, model: string = 'text-embedding-3-small') {
46 | this.client = new OpenAI({ apiKey });
47 | this.model = model;
48 | }
49 |
50 | async generateEmbeddings(text: string): Promise<number[]> {
51 | try {
52 | console.error('Generating OpenAI embeddings for text:', text.substring(0, 50) + '...');
53 | const response = await this.client.embeddings.create({
54 | model: this.model,
55 | input: text,
56 | });
57 | const embedding = response.data[0].embedding;
58 | console.error('Successfully generated OpenAI embeddings with size:', embedding.length);
59 | return embedding;
60 | } catch (error) {
61 | console.error('OpenAI embedding error:', error);
62 | throw new McpError(
63 | ErrorCode.InternalError,
64 | `Failed to generate embeddings with OpenAI: ${error}`
65 | );
66 | }
67 | }
68 |
69 | getVectorSize(): number {
70 | // text-embedding-3-small produces 1536-dimensional vectors
71 | return 1536;
72 | }
73 | }
74 |
75 | export class EmbeddingService {
76 | private provider: EmbeddingProvider;
77 |
78 | constructor(provider: EmbeddingProvider) {
79 | this.provider = provider;
80 | }
81 |
82 | async generateEmbeddings(text: string): Promise<number[]> {
83 | return this.provider.generateEmbeddings(text);
84 | }
85 |
86 | getVectorSize(): number {
87 | return this.provider.getVectorSize();
88 | }
89 |
90 | static createFromConfig(config: {
91 | provider: 'ollama' | 'openai';
92 | apiKey?: string;
93 | model?: string;
94 | }): EmbeddingService {
95 | switch (config.provider) {
96 | case 'ollama':
97 | return new EmbeddingService(new OllamaProvider(config.model));
98 | case 'openai':
99 | if (!config.apiKey) {
100 | throw new McpError(
101 | ErrorCode.InvalidRequest,
102 | 'OpenAI API key is required'
103 | );
104 | }
105 | return new EmbeddingService(new OpenAIProvider(config.apiKey, config.model));
106 | default:
107 | throw new McpError(
108 | ErrorCode.InvalidRequest,
109 | `Unknown embedding provider: ${config.provider}`
110 | );
111 | }
112 | }
113 | }
114 |
```
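A minimal sketch of the factory in use, assuming a local Ollama instance with the `nomic-embed-text` model already pulled:

```typescript
// Sketch: building a provider-backed embedding service and checking that
// the advertised vector size matches the generated embedding.
import { EmbeddingService } from './embeddings.js';

const service = EmbeddingService.createFromConfig({ provider: 'ollama' });
console.log('Declared vector size:', service.getVectorSize()); // 768

const vector = await service.generateEmbeddings('What is vector search?');
console.log('Generated vector size:', vector.length); // 768 for nomic-embed-text
```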
--------------------------------------------------------------------------------
/src/tools/url-processor.ts:
--------------------------------------------------------------------------------
```typescript
1 | import { URL } from 'url';
2 |
3 | export class URLProcessingError extends Error {
4 | constructor(message: string) {
5 | super(message);
6 | this.name = 'URLProcessingError';
7 | }
8 | }
9 |
10 | export interface ProcessedURL {
11 | originalUrl: string;
12 | normalizedUrl: string;
13 | domain: string;
14 | path: string;
15 | isValid: boolean;
16 | }
17 |
18 | export class URLProcessor {
19 | /**
20 | * Validates and normalizes a URL, extracting key components
21 | * @param urlString The URL string to process
22 | * @returns ProcessedURL object containing normalized URL and metadata
23 | * @throws URLProcessingError if URL is invalid
24 | */
25 | static processURL(urlString: string): ProcessedURL {
26 | try {
27 | // Trim whitespace and normalize
28 | const trimmedUrl = urlString.trim();
29 |
30 | // Add protocol if missing
31 | const urlWithProtocol = /^https?:\/\//i.test(trimmedUrl)
32 | ? trimmedUrl
33 | : `https://${trimmedUrl}`;
34 |
35 | // Parse URL
36 | const url = new URL(urlWithProtocol);
37 |
38 | // Normalize URL
39 | // - Convert to lowercase
40 | // - Remove trailing slashes
41 | // - Remove default ports
42 | // - Sort query parameters
43 | const normalizedUrl = this.normalizeURL(url);
44 |
45 | return {
46 | originalUrl: urlString,
47 | normalizedUrl,
48 | domain: url.hostname.toLowerCase(),
49 | path: url.pathname,
50 | isValid: true,
51 | };
52 | } catch (error) {
53 | throw new URLProcessingError(
54 | `Invalid URL "${urlString}": ${(error as Error).message}`
55 | );
56 | }
57 | }
58 |
59 | /**
60 | * Normalizes a URL to ensure consistent format
61 | * @param url URL object to normalize
62 | * @returns Normalized URL string
63 | */
64 | private static normalizeURL(url: URL): string {
65 | // Convert hostname to lowercase
66 | const hostname = url.hostname.toLowerCase();
67 |
68 | // Remove default ports
69 | const port = url.port === '80' || url.port === '443' ? '' : url.port;
70 |
71 | // Sort query parameters
72 | const searchParams = new URLSearchParams([...url.searchParams].sort());
73 | const search = searchParams.toString();
74 |
75 | // Construct normalized path (remove trailing slash except for root)
76 | let path = url.pathname;
77 | if (path.length > 1 && path.endsWith('/')) {
78 | path = path.slice(0, -1);
79 | }
80 |
81 | // Construct normalized URL
82 | let normalizedUrl = `${url.protocol}//${hostname}`;
83 | if (port) normalizedUrl += `:${port}`;
84 | normalizedUrl += path;
85 | if (search) normalizedUrl += `?${search}`;
86 | if (url.hash) normalizedUrl += url.hash;
87 |
88 | return normalizedUrl;
89 | }
90 |
91 | /**
92 | * Checks if a URL points to a valid web page
93 | * @param urlString URL to validate
94 | * @returns true if URL is valid and accessible
95 | */
96 | static isValidWebPage(urlString: string): boolean {
97 | try {
98 | const { protocol } = new URL(urlString);
99 | return protocol === 'http:' || protocol === 'https:';
100 | } catch {
101 | return false;
102 | }
103 | }
104 |
105 | /**
106 | * Extracts the root domain from a URL
107 | * @param urlString URL to process
108 | * @returns Root domain string
109 | */
110 | static extractRootDomain(urlString: string): string {
111 | try {
112 | const { hostname } = new URL(urlString);
113 | const parts = hostname.split('.');
114 | if (parts.length <= 2) return hostname;
115 |
116 | // Handle special cases like co.uk, com.au
117 | const sld = parts[parts.length - 2];
118 | const tld = parts[parts.length - 1];
119 | if (sld.length <= 3 && tld.length <= 3 && parts.length > 2) {
120 | return parts.slice(-3).join('.');
121 | }
122 |
123 | return parts.slice(-2).join('.');
124 | } catch {
125 | throw new URLProcessingError(`Cannot extract domain from invalid URL: ${urlString}`);
126 | }
127 | }
128 | }
129 |
```
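A sketch of the normalization behavior: protocols are added when missing, hostnames lowercased, default ports dropped, trailing slashes removed, and query parameters sorted.

```typescript
// Sketch: URL normalization and root-domain extraction.
import { URLProcessor } from './url-processor.js';

const processed = URLProcessor.processURL('Example.com/Docs/?b=2&a=1');
console.log(processed.normalizedUrl); // https://example.com/Docs?a=1&b=2
console.log(processed.domain);        // example.com

console.log(URLProcessor.isValidWebPage('ftp://example.com')); // false

console.log(URLProcessor.extractRootDomain('https://docs.example.co.uk/'));
// example.co.uk (the co.uk heuristic keeps three labels)
```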
--------------------------------------------------------------------------------
/src/tools/add-documentation.ts:
--------------------------------------------------------------------------------
```typescript
1 | import OpenAI from 'openai';
2 | import { URLProcessor, URLProcessingError } from './url-processor.js';
3 | import { ContentFetcher, ContentFetchError } from './content-fetcher.js';
4 | import { TextChunker } from './text-chunker.js';
5 | import { QdrantWrapper, QdrantError } from './qdrant-client.js';
6 |
7 | export class AddDocumentationError extends Error {
8 | constructor(message: string, public readonly step: string) {
9 | super(message);
10 | this.name = 'AddDocumentationError';
11 | }
12 | }
13 |
14 | export interface AddDocumentationResult {
15 | url: string;
16 | title: string;
17 | chunks: number;
18 | wordCount: number;
19 | }
20 |
21 | export class AddDocumentationTool {
22 | private openai: OpenAI;
23 | private qdrant: QdrantWrapper;
24 |
25 | constructor(openaiApiKey: string, qdrantUrl?: string) {
26 | if (!openaiApiKey) {
27 | throw new Error('OpenAI API key is required');
28 | }
29 |
30 | this.openai = new OpenAI({
31 | apiKey: openaiApiKey,
32 | });
33 |
34 | this.qdrant = new QdrantWrapper(qdrantUrl);
35 | }
36 |
37 | /**
38 | * Adds a document to the RAG system
39 | * @param url URL of the document to add
40 | * @returns Result of the operation
41 | */
42 | async addDocument(url: string): Promise<AddDocumentationResult> {
43 | try {
44 | // Check Qdrant health
45 | const isHealthy = await this.qdrant.isHealthy();
46 | if (!isHealthy) {
47 | throw new AddDocumentationError(
48 | 'Qdrant server is not available',
49 | 'health_check'
50 | );
51 | }
52 |
53 | // Initialize collection if needed
54 | await this.qdrant.initializeCollection();
55 |
56 | // Process URL
57 | const processedUrl = URLProcessor.processURL(url);
58 | if (!processedUrl.isValid) {
59 | throw new AddDocumentationError('Invalid URL format', 'url_validation');
60 | }
61 |
62 | // Check if document already exists
63 | const exists = await this.qdrant.documentExists(processedUrl.normalizedUrl);
64 | if (exists) {
65 | // Remove existing document before adding new version
66 | await this.qdrant.removeDocument(processedUrl.normalizedUrl);
67 | }
68 |
69 | // Fetch content
70 | const content = await ContentFetcher.fetchContent(processedUrl.normalizedUrl);
71 |
72 | // Chunk content
73 | const chunks = TextChunker.chunkText(content.content, {
74 | maxChunkSize: 1500, // Leave room for metadata in context window
75 | minChunkSize: 100,
76 | overlap: 200,
77 | respectCodeBlocks: true,
78 | });
79 |
80 | // Generate embeddings for each chunk
81 | const embeddings = await this.generateEmbeddings(
82 | chunks.map(chunk => chunk.content)
83 | );
84 |
85 | // Store in Qdrant
86 | await this.qdrant.storeDocumentChunks(chunks, embeddings, {
87 | url: processedUrl.normalizedUrl,
88 | title: content.title,
89 | domain: processedUrl.domain,
90 | timestamp: content.timestamp,
91 | contentType: content.metadata.contentType,
92 | wordCount: content.metadata.wordCount,
93 | hasCode: content.metadata.hasCode,
94 | });
95 |
96 | return {
97 | url: processedUrl.normalizedUrl,
98 | title: content.title,
99 | chunks: chunks.length,
100 | wordCount: content.metadata.wordCount,
101 | };
102 | } catch (error) {
103 | if (
104 | error instanceof URLProcessingError ||
105 | error instanceof ContentFetchError ||
106 | error instanceof QdrantError ||
107 | error instanceof AddDocumentationError
108 | ) {
109 | throw error;
110 | }
111 |
112 | throw new AddDocumentationError(
113 | `Unexpected error: ${(error as Error).message}`,
114 | 'unknown'
115 | );
116 | }
117 | }
118 |
119 | /**
120 | * Generates embeddings for text chunks using OpenAI's API
121 | * @param chunks Array of text chunks
122 | * @returns Array of embeddings
123 | */
124 | private async generateEmbeddings(chunks: string[]): Promise<number[][]> {
125 | try {
126 | const response = await this.openai.embeddings.create({
127 | model: 'text-embedding-ada-002',
128 | input: chunks,
129 | });
130 |
131 | return response.data.map(item => item.embedding);
132 | } catch (error) {
133 | throw new AddDocumentationError(
134 | `Failed to generate embeddings: ${(error as Error).message}`,
135 | 'embedding_generation'
136 | );
137 | }
138 | }
139 | }
140 |
```
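A usage sketch for the standalone pipeline class. Note that, unlike the provider-aware handlers, this class always embeds with OpenAI's `text-embedding-ada-002`, so it needs an OpenAI key and a reachable Qdrant instance:

```typescript
// Sketch: running the full add-documentation pipeline end to end.
// Assumes OPENAI_API_KEY is set and Qdrant is reachable on localhost.
import { AddDocumentationTool, AddDocumentationError } from './add-documentation.js';

const tool = new AddDocumentationTool(process.env.OPENAI_API_KEY!, 'http://127.0.0.1:6333');

try {
  const result = await tool.addDocument('https://example.com/guide');
  console.log(`${result.title}: ${result.chunks} chunks, ${result.wordCount} words`);
} catch (error) {
  if (error instanceof AddDocumentationError) {
    console.error(`Failed at step "${error.step}": ${error.message}`);
  } else {
    throw error;
  }
}
```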
--------------------------------------------------------------------------------
/src/handlers/add-documentation.ts:
--------------------------------------------------------------------------------
```typescript
1 | import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js';
2 | import { Server } from '@modelcontextprotocol/sdk/server/index.js';
3 | import { BaseHandler } from './base-handler.js';
4 | import { ApiClient } from '../api-client.js';
5 | import { DocumentChunk, ToolResult } from '../types.js';
6 | import * as cheerio from 'cheerio';
7 | import crypto from 'crypto';
8 |
9 | const COLLECTION_NAME = 'documentation';
10 | const BATCH_SIZE = 100;
11 |
12 | export class AddDocumentationHandler extends BaseHandler {
13 | constructor(server: Server, apiClient: ApiClient) {
14 | super(server, apiClient);
15 | }
16 |
17 | async handle(args: any): Promise<ToolResult> {
18 | if (!args.url || typeof args.url !== 'string') {
19 | throw new McpError(ErrorCode.InvalidParams, 'URL is required');
20 | }
21 |
22 | try {
23 | const chunks = await this.fetchAndProcessUrl(args.url);
24 |
25 | // Batch process chunks for better performance
26 | for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
27 | const batch = chunks.slice(i, i + BATCH_SIZE);
28 | const points = await Promise.all(
29 | batch.map(async (chunk) => {
30 | const embedding = await this.apiClient.getEmbeddings(chunk.text);
31 | return {
32 | id: this.generatePointId(),
33 | vector: embedding,
34 | payload: {
35 | ...chunk,
36 | _type: 'DocumentChunk' as const,
37 | } as Record<string, unknown>,
38 | };
39 | })
40 | );
41 |
42 | try {
43 | await this.apiClient.qdrantClient.upsert(COLLECTION_NAME, {
44 | wait: true,
45 | points,
46 | });
47 | } catch (error) {
48 | if (error instanceof Error) {
49 | if (error.message.includes('unauthorized')) {
50 | throw new McpError(
51 | ErrorCode.InvalidRequest,
52 | 'Failed to authenticate with Qdrant cloud while adding documents'
53 | );
54 | } else if (error.message.includes('ECONNREFUSED') || error.message.includes('ETIMEDOUT')) {
55 | throw new McpError(
56 | ErrorCode.InternalError,
57 | 'Connection to Qdrant cloud failed while adding documents'
58 | );
59 | }
60 | }
61 | throw error;
62 | }
63 | }
64 |
65 | return {
66 | content: [
67 | {
68 | type: 'text',
69 | text: `Successfully added documentation from ${args.url} (${chunks.length} chunks processed in ${Math.ceil(chunks.length / BATCH_SIZE)} batches)`,
70 | },
71 | ],
72 | };
73 | } catch (error) {
74 | if (error instanceof McpError) {
75 | throw error;
76 | }
77 | return {
78 | content: [
79 | {
80 | type: 'text',
81 | text: `Failed to add documentation: ${error}`,
82 | },
83 | ],
84 | isError: true,
85 | };
86 | }
87 | }
88 |
89 | private async fetchAndProcessUrl(url: string): Promise<DocumentChunk[]> {
90 | await this.apiClient.initBrowser();
91 | const page = await this.apiClient.browser.newPage();
92 |
93 | try {
94 | await page.goto(url, { waitUntil: 'networkidle' });
95 | const content = await page.content();
96 | const $ = cheerio.load(content);
97 |
98 | // Remove script tags, style tags, and comments
99 | $('script').remove();
100 | $('style').remove();
101 | $('noscript').remove();
102 |
103 | // Extract main content
104 | const title = $('title').text() || url;
105 | const mainContent = $('main, article, .content, .documentation').first().text() || $('body').text();
106 |
107 | // Split content into chunks
108 | const chunks = this.chunkText(mainContent, 1000);
109 |
110 | return chunks.map(chunk => ({
111 | text: chunk,
112 | url,
113 | title,
114 | timestamp: new Date().toISOString(),
115 | }));
116 | } catch (error) {
117 | throw new McpError(
118 | ErrorCode.InternalError,
119 | `Failed to fetch URL ${url}: ${error}`
120 | );
121 | } finally {
122 | await page.close();
123 | }
124 | }
125 |
126 | private chunkText(text: string, maxChunkSize: number): string[] {
127 | const words = text.split(/\s+/);
128 | const chunks: string[] = [];
129 | let currentChunk: string[] = [];
130 |
131 | for (const word of words) {
132 | currentChunk.push(word);
133 | const currentLength = currentChunk.join(' ').length;
134 |
135 | if (currentLength >= maxChunkSize) {
136 | chunks.push(currentChunk.join(' '));
137 | currentChunk = [];
138 | }
139 | }
140 |
141 | if (currentChunk.length > 0) {
142 | chunks.push(currentChunk.join(' '));
143 | }
144 |
145 | return chunks;
146 | }
147 |
148 | private generatePointId(): string {
149 | return crypto.randomBytes(16).toString('hex');
150 | }
151 | }
152 |
```
--------------------------------------------------------------------------------
/src/tools/content-fetcher.ts:
--------------------------------------------------------------------------------
```typescript
1 | import axios, { AxiosError } from 'axios';
2 | import * as cheerio from 'cheerio';
3 |
4 | export class ContentFetchError extends Error {
5 | constructor(message: string, public readonly url: string) {
6 | super(message);
7 | this.name = 'ContentFetchError';
8 | }
9 | }
10 |
11 | export interface FetchedContent {
12 | url: string;
13 | title: string;
14 | content: string;
15 | timestamp: string;
16 | metadata: {
17 | domain: string;
18 | contentType: string;
19 | wordCount: number;
20 | hasCode: boolean;
21 | };
22 | }
23 |
24 | export class ContentFetcher {
25 | private static readonly TIMEOUT = 30000; // 30 seconds
26 | private static readonly MAX_RETRIES = 3;
27 | private static readonly RETRY_DELAY = 1000; // 1 second
28 |
29 | /**
30 | * Fetches and processes content from a URL
31 | * @param url URL to fetch content from
32 | * @returns Processed content with metadata
33 | */
34 | static async fetchContent(url: string): Promise<FetchedContent> {
35 | let retries = 0;
36 | let lastError: Error | null = null;
37 |
38 | while (retries < this.MAX_RETRIES) {
39 | try {
40 | const response = await axios.get(url, {
41 | timeout: this.TIMEOUT,
42 | maxRedirects: 5,
43 | headers: {
44 | 'User-Agent': 'Mozilla/5.0 (compatible; RagDocsBot/1.0)',
45 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
46 | 'Accept-Language': 'en-US,en;q=0.5',
47 | },
48 | });
49 |
50 | const contentType = response.headers['content-type'] || '';
51 | if (!contentType.includes('html')) {
52 | throw new ContentFetchError('Unsupported content type: ' + contentType, url);
53 | }
54 |
55 | return this.processHtmlContent(url, response.data);
56 | } catch (error) {
57 | lastError = error as Error;
58 | if (error instanceof AxiosError && error.response?.status === 404) {
59 | throw new ContentFetchError('Page not found', url);
60 | }
61 | retries++;
62 | if (retries < this.MAX_RETRIES) {
63 | await new Promise(resolve => setTimeout(resolve, this.RETRY_DELAY));
64 | }
65 | }
66 | }
67 |
68 | throw new ContentFetchError(
69 | `Failed to fetch content after ${this.MAX_RETRIES} attempts: ${lastError?.message}`,
70 | url
71 | );
72 | }
73 |
74 | /**
75 | * Processes HTML content to extract relevant text and metadata
76 | * @param url Original URL
77 | * @param html Raw HTML content
78 | * @returns Processed content with metadata
79 | */
80 | private static processHtmlContent(url: string, html: string): FetchedContent {
81 | const $ = cheerio.load(html);
82 |
83 | // Remove unwanted elements
84 | this.removeUnwantedElements($);
85 |
86 | // Extract title
87 | const title = $('title').text().trim() ||
88 | $('h1').first().text().trim() ||
89 | 'Untitled Document';
90 |
91 | // Extract main content
92 | const mainContent = this.extractMainContent($);
93 |
94 | // Check for code blocks
95 | const hasCode = $('pre, code').length > 0 ||
96 | mainContent.includes('```') ||
97 | /\`[^\`]+\`/.test(mainContent);
98 |
99 | // Count words
100 | const wordCount = mainContent.split(/\s+/).filter(Boolean).length;
101 |
102 | return {
103 | url,
104 | title,
105 | content: mainContent,
106 | timestamp: new Date().toISOString(),
107 | metadata: {
108 | domain: new URL(url).hostname,
109 | contentType: 'text/html',
110 | wordCount,
111 | hasCode,
112 | },
113 | };
114 | }
115 |
116 | /**
117 | * Removes unwanted elements from the HTML
118 | * @param $ Cheerio instance
119 | */
120 | private static removeUnwantedElements($: cheerio.CheerioAPI): void {
121 | // Remove common non-content elements
122 | const selectorsToRemove = [
123 | 'script',
124 | 'style',
125 | 'nav',
126 | 'header',
127 | 'footer',
128 | 'iframe',
129 | '.advertisement',
130 | '.ads',
131 | '#comments',
132 | '.comments',
133 | '.social-share',
134 | '.related-posts',
135 | 'aside',
136 | ];
137 |
138 | $(selectorsToRemove.join(', ')).remove();
139 | }
140 |
141 | /**
142 | * Extracts main content from the HTML
143 | * @param $ Cheerio instance
144 | * @returns Extracted and cleaned content
145 | */
146 | private static extractMainContent($: cheerio.CheerioAPI): string {
147 | // Try to find main content container
148 | const mainSelectors = [
149 | 'article',
150 | 'main',
151 | '.main-content',
152 | '#main-content',
153 | '.post-content',
154 | '.article-content',
155 | '.entry-content',
156 | ];
157 |
158 | let $content = $();
159 | for (const selector of mainSelectors) {
160 | $content = $(selector);
161 | if ($content.length > 0) break;
162 | }
163 |
164 | // Fallback to body if no main content container found
165 | if ($content.length === 0) {
166 | $content = $('body');
167 | }
168 |
169 | // Extract text content
170 | const text = $content
171 | .find('h1, h2, h3, h4, h5, h6, p, li, pre, code')
172 | .map((_, el) => {
173 | const $el = $(el);
174 | // Preserve code blocks
175 | if ($el.is('pre, code')) {
176 | return '\n```\n' + $el.text() + '\n```\n';
177 | }
178 | return $el.text();
179 | })
180 | .get()
181 | .join('\n')
182 | .trim();
183 |
184 | // Clean up the text
185 | return this.cleanText(text);
186 | }
187 |
188 | /**
189 | * Cleans extracted text content
190 | * @param text Raw text content
191 | * @returns Cleaned text
192 | */
193 | private static cleanText(text: string): string {
194 | return text
195 | .replace(/[\r\n]+/g, '\n') // Normalize line endings
196 | .replace(/\n\s+\n/g, '\n\n') // Remove excess whitespace between paragraphs
197 | .replace(/[^\S\n]+/g, ' ') // Collapse spaces and tabs but keep newlines intact
198 | .split('\n') // Split into lines
199 | .map(line => line.trim()) // Trim each line
200 | .filter(Boolean) // Remove empty lines
201 | .join('\n') // Rejoin with newlines
202 | .trim(); // Final trim
203 | }
204 | }
205 |
```
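A short sketch of the fetcher's error contract: transient failures are retried up to three times, while 404s and non-HTML responses fail fast with a `ContentFetchError` carrying the offending URL.

```typescript
// Sketch: fetching a page and inspecting the extracted metadata.
import { ContentFetcher, ContentFetchError } from './content-fetcher.js';

try {
  const page = await ContentFetcher.fetchContent('https://example.com');
  console.log(page.title);
  console.log(page.metadata.wordCount, 'words; code blocks:', page.metadata.hasCode);
} catch (error) {
  if (error instanceof ContentFetchError) {
    console.error(`Fetch failed for ${error.url}: ${error.message}`);
  } else {
    throw error;
  }
}
```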
--------------------------------------------------------------------------------
/src/tools/text-chunker.ts:
--------------------------------------------------------------------------------
```typescript
1 | export interface ChunkOptions {
2 | maxChunkSize: number;
3 | minChunkSize: number;
4 | overlap: number;
5 | respectCodeBlocks?: boolean;
6 | }
7 |
8 | export interface TextChunk {
9 | content: string;
10 | index: number;
11 | metadata: {
12 | startPosition: number;
13 | endPosition: number;
14 | isCodeBlock?: boolean;
15 | };
16 | }
17 |
18 | export class TextChunker {
19 | private static readonly DEFAULT_OPTIONS: ChunkOptions = {
20 | maxChunkSize: 1000,
21 | minChunkSize: 100,
22 | overlap: 200,
23 | respectCodeBlocks: true,
24 | };
25 |
26 | /**
27 | * Splits text into chunks while preserving context and natural boundaries
28 | * @param text Text to split into chunks
29 | * @param options Chunking options
30 | * @returns Array of text chunks with metadata
31 | */
32 | static chunkText(text: string, options?: Partial<ChunkOptions>): TextChunk[] {
33 | const opts = this.validateOptions(options ?? {}); // merge defaults and validate
34 | const chunks: TextChunk[] = [];
35 |
36 | // First, separate code blocks from regular text
37 | const segments = this.separateCodeBlocks(text);
38 | let currentPosition = 0;
39 | let chunkIndex = 0;
40 |
41 | for (const segment of segments) {
42 | if (segment.isCodeBlock && opts.respectCodeBlocks) {
43 | // Keep code blocks as single chunks if they're not too large
44 | if (segment.content.length <= opts.maxChunkSize * 1.5) {
45 | chunks.push({
46 | content: segment.content,
47 | index: chunkIndex++,
48 | metadata: {
49 | startPosition: currentPosition,
50 | endPosition: currentPosition + segment.content.length,
51 | isCodeBlock: true,
52 | },
53 | });
54 | currentPosition += segment.content.length;
55 | continue;
56 | }
57 | }
58 |
59 | // Process regular text or large code blocks
60 | const segmentChunks = this.chunkSegment(
61 | segment.content,
62 | opts,
63 | currentPosition,
64 | chunkIndex,
65 | segment.isCodeBlock
66 | );
67 |
68 | chunks.push(...segmentChunks);
69 | chunkIndex += segmentChunks.length;
70 | currentPosition += segment.content.length;
71 | }
72 |
73 | return chunks;
74 | }
75 |
76 | /**
77 | * Separates code blocks from regular text
78 | * @param text Input text
79 | * @returns Array of text segments with code block flags
80 | */
81 | private static separateCodeBlocks(text: string): Array<{ content: string; isCodeBlock: boolean }> {
82 | const segments: Array<{ content: string; isCodeBlock: boolean }> = [];
83 | const codeBlockRegex = /```[\s\S]*?```/g;
84 |
85 | let lastIndex = 0;
86 | let match: RegExpExecArray | null;
87 |
88 | while ((match = codeBlockRegex.exec(text)) !== null) {
89 | // Add text before code block
90 | if (match.index > lastIndex) {
91 | segments.push({
92 | content: text.slice(lastIndex, match.index),
93 | isCodeBlock: false,
94 | });
95 | }
96 |
97 | // Add code block
98 | segments.push({
99 | content: match[0],
100 | isCodeBlock: true,
101 | });
102 |
103 | lastIndex = match.index + match[0].length;
104 | }
105 |
106 | // Add remaining text
107 | if (lastIndex < text.length) {
108 | segments.push({
109 | content: text.slice(lastIndex),
110 | isCodeBlock: false,
111 | });
112 | }
113 |
114 | return segments;
115 | }
116 |
117 | /**
118 | * Chunks a single segment of text
119 | * @param text Text segment to chunk
120 | * @param options Chunking options
121 | * @param startPosition Starting position in original text
122 | * @param startIndex Starting chunk index
123 | * @param isCodeBlock Whether this is a code block
124 | * @returns Array of chunks
125 | */
126 | private static chunkSegment(
127 | text: string,
128 | options: ChunkOptions,
129 | startPosition: number,
130 | startIndex: number,
131 | isCodeBlock: boolean
132 | ): TextChunk[] {
133 | const chunks: TextChunk[] = [];
134 | let currentChunk = '';
135 | let currentPosition = 0;
136 |
137 | // Split into sentences/paragraphs first
138 | const blocks = isCodeBlock
139 | ? [text] // Keep code blocks together
140 | : text
141 | .split(/(?<=\.|\?|\!|\n)\s+/)
142 | .filter(Boolean)
143 | .map(block => block.trim());
144 |
145 | for (const block of blocks) {
146 | // If adding this block would exceed max size, start new chunk
147 | if (
148 | currentChunk &&
149 | currentChunk.length + block.length > options.maxChunkSize &&
150 | currentChunk.length >= options.minChunkSize
151 | ) {
152 | chunks.push({
153 | content: currentChunk,
154 | index: startIndex + chunks.length,
155 | metadata: {
156 | startPosition: startPosition + currentPosition - currentChunk.length,
157 | endPosition: startPosition + currentPosition,
158 | isCodeBlock,
159 | },
160 | });
161 |
162 | // Start new chunk with overlap
163 | const words = currentChunk.split(/\s+/);
164 | const overlapWords = words.slice(-Math.ceil(options.overlap / 10)); // Approximate words for overlap
165 | currentChunk = overlapWords.join(' ') + ' ' + block;
166 | } else {
167 | currentChunk = currentChunk
168 | ? currentChunk + ' ' + block
169 | : block;
170 | }
171 |
172 | currentPosition += block.length + 1; // +1 for the space
173 | }
174 |
175 | // Add final chunk if not empty
176 | if (currentChunk) {
177 | chunks.push({
178 | content: currentChunk,
179 | index: startIndex + chunks.length,
180 | metadata: {
181 | startPosition: startPosition + currentPosition - currentChunk.length,
182 | endPosition: startPosition + currentPosition,
183 | isCodeBlock,
184 | },
185 | });
186 | }
187 |
188 | return chunks;
189 | }
190 |
191 | /**
192 | * Validates chunk options and sets defaults
193 | * @param options User-provided options
194 | * @returns Validated options
195 | */
196 | private static validateOptions(options: Partial<ChunkOptions>): ChunkOptions {
197 | const opts = { ...this.DEFAULT_OPTIONS, ...options };
198 |
199 | if (opts.maxChunkSize < opts.minChunkSize) {
200 | throw new Error('maxChunkSize must be at least minChunkSize');
201 | }
202 |
203 | if (opts.overlap >= opts.maxChunkSize) {
204 | throw new Error('overlap must be less than maxChunkSize');
205 | }
206 |
207 | if (opts.minChunkSize <= 0 || opts.maxChunkSize <= 0 || opts.overlap < 0) {
208 | throw new Error('chunk sizes must be positive and overlap must be non-negative');
209 | }
210 |
211 | return opts;
212 | }
213 | }
214 |
```
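A sketch showing the code-block handling: fenced blocks shorter than 1.5x `maxChunkSize` survive as single chunks, while prose is split on sentence boundaries with approximate word overlap.

```typescript
// Sketch: chunking mixed prose and fenced code.
import { TextChunker } from './text-chunker.js';

const text = [
  'First paragraph. '.repeat(20).trim(),
  '```\nconst x = 1;\n```',
  'Second paragraph. '.repeat(20).trim(),
].join('\n');

const chunks = TextChunker.chunkText(text, {
  maxChunkSize: 200,
  minChunkSize: 50,
  overlap: 40,
});

for (const chunk of chunks) {
  console.log(chunk.index, chunk.metadata.isCodeBlock ? 'code' : 'text', chunk.content.length);
}
```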
--------------------------------------------------------------------------------
/src/api-client.ts:
--------------------------------------------------------------------------------
```typescript
1 | import { QdrantClient } from '@qdrant/js-client-rest';
2 | import { chromium, type Browser } from 'playwright';
3 | import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js';
4 | import { EmbeddingService } from './embeddings.js';
5 | import { QdrantWrapper } from './tools/qdrant-client.js';
6 | import { Document } from './types.js';
7 |
8 | export interface QdrantCollectionConfig {
9 | params: {
10 | vectors: {
11 | size: number;
12 | distance: string;
13 | };
14 | };
15 | }
16 |
17 | export interface QdrantCollectionInfo {
18 | config: QdrantCollectionConfig;
19 | }
20 |
21 | export class ApiClient {
22 | qdrantClient: QdrantClient;
23 | private embeddingService: EmbeddingService;
24 | readonly qdrant: QdrantWrapper;
25 | browser!: Browser; // lazily assigned in initBrowser()
26 |
27 | constructor(config: {
28 | embeddingConfig: {
29 | provider: 'ollama' | 'openai';
30 | apiKey?: string;
31 | model?: string;
32 | };
33 | qdrantUrl?: string;
34 | qdrantApiKey?: string;
35 | }) {
36 | this.embeddingService = EmbeddingService.createFromConfig(config.embeddingConfig);
37 |
38 | this.qdrant = new QdrantWrapper(config.qdrantUrl, config.qdrantApiKey);
39 | this.qdrantClient = this.qdrant.client;
40 | }
41 |
42 | async initBrowser() {
43 | if (!this.browser) {
44 | this.browser = await chromium.launch();
45 | }
46 | }
47 |
48 | async cleanup() {
49 | if (this.browser) {
50 | await this.browser.close();
51 | }
52 | }
53 |
54 | async getEmbeddings(text: string): Promise<number[]> {
55 | return this.embeddingService.generateEmbeddings(text);
56 | }
57 |
58 | get embeddings(): EmbeddingService {
59 | return this.embeddingService;
60 | }
61 |
62 | async initCollection(collectionName: string) {
63 | try {
64 | const collections = await this.qdrantClient.getCollections();
65 | const exists = collections.collections.some(c => c.name === collectionName);
66 |
67 | const requiredVectorSize = this.embeddingService.getVectorSize();
68 |
69 | if (!exists) {
70 | console.error(`Creating new collection with vector size ${requiredVectorSize}`);
71 | await this.createCollection(collectionName, requiredVectorSize);
72 | return;
73 | }
74 |
75 | // Verify vector size of existing collection
76 | const collectionInfo = await this.qdrantClient.getCollection(collectionName) as QdrantCollectionInfo;
77 | const currentVectorSize = collectionInfo.config?.params?.vectors?.size;
78 |
79 | if (!currentVectorSize) {
80 | console.error('Could not determine current vector size, recreating collection...');
81 | await this.recreateCollection(collectionName, requiredVectorSize);
82 | return;
83 | }
84 |
85 | if (currentVectorSize !== requiredVectorSize) {
86 | console.error(`Vector size mismatch: collection=${currentVectorSize}, required=${requiredVectorSize}`);
87 | await this.recreateCollection(collectionName, requiredVectorSize);
88 | }
89 | } catch (error) {
90 | if (error instanceof Error) {
91 | if (error.message.includes('unauthorized')) {
92 | throw new McpError(
93 | ErrorCode.InvalidRequest,
94 | 'Failed to authenticate with Qdrant. Please check your API key.'
95 | );
96 | } else if (error.message.includes('ECONNREFUSED') || error.message.includes('ETIMEDOUT')) {
97 | throw new McpError(
98 | ErrorCode.InternalError,
99 | 'Failed to connect to Qdrant. Please check your QDRANT_URL.'
100 | );
101 | }
102 | }
103 | throw new McpError(
104 | ErrorCode.InternalError,
105 | `Failed to initialize Qdrant collection: ${error}`
106 | );
107 | }
108 | }
109 |
110 | private async createCollection(collectionName: string, vectorSize: number) {
111 | await this.qdrantClient.createCollection(collectionName, {
112 | vectors: {
113 | size: vectorSize,
114 | distance: 'Cosine',
115 | },
116 | optimizers_config: {
117 | default_segment_number: 2,
118 | memmap_threshold: 20000,
119 | },
120 | replication_factor: 2,
121 | });
122 |
123 | // Create indexes for efficient filtering
124 | await this.qdrantClient.createPayloadIndex(collectionName, {
125 | field_name: 'url',
126 | field_schema: 'keyword',
127 | });
128 |
129 | await this.qdrantClient.createPayloadIndex(collectionName, {
130 | field_name: 'timestamp',
131 | field_schema: 'datetime',
132 | });
133 | }
134 |
135 | private async recreateCollection(collectionName: string, vectorSize: number) {
136 | try {
137 | console.error('Recreating collection with new vector size...');
138 | await this.qdrantClient.deleteCollection(collectionName);
139 | await this.createCollection(collectionName, vectorSize);
140 | console.error(`Collection recreated with new vector size ${vectorSize}`);
141 | } catch (error) {
142 | throw new McpError(
143 | ErrorCode.InternalError,
144 | `Failed to recreate collection: ${error}`
145 | );
146 | }
147 | }
148 |
149 | async isHealthy(): Promise<boolean> {
150 | try {
151 | await this.qdrantClient.getCollections();
152 | return true;
153 | } catch {
154 | return false;
155 | }
156 | }
157 |
158 | async addDocument(doc: Document): Promise<void> {
159 | try {
160 | // Check if document already exists
161 | if (await this.qdrant.documentExists(doc.url)) {
162 | throw new McpError(
163 | ErrorCode.InvalidRequest,
164 | `Document with URL ${doc.url} already exists`
165 | );
166 | }
167 |
168 | // Generate embeddings for the content
169 | const embedding = await this.embeddingService.generateEmbeddings(doc.content);
170 |
171 | // Store document in Qdrant
172 | await this.qdrant.storeDocumentChunks(
173 | [{
174 | content: doc.content,
175 | index: 0,
176 | metadata: {
177 | startPosition: 0,
178 | endPosition: doc.content.length,
179 | isCodeBlock: /```/.test(doc.content)
180 | }
181 | }],
182 | [embedding],
183 | {
184 | url: doc.url,
185 | title: doc.metadata.title || '',
186 | domain: new URL(doc.url).hostname,
187 | timestamp: new Date().toISOString(),
188 | contentType: doc.metadata.contentType || 'text/plain',
189 | wordCount: doc.content.split(/\s+/).length,
190 | hasCode: /```|\bfunction\b|\bclass\b|\bconst\b|\blet\b|\bvar\b/.test(doc.content),
191 | }
192 | );
193 |     } catch (error) {
194 |       // Re-throw MCP errors as-is so specific codes (e.g. the duplicate
195 |       // URL InvalidRequest above) are not masked as InternalError.
196 |       if (error instanceof McpError) throw error;
197 |       throw new McpError(ErrorCode.InternalError, `Failed to add document: ${error}`);
198 |     }
199 | }
200 |
201 | async deleteDocument(url: string): Promise<void> {
202 | try {
203 | await this.qdrant.removeDocument(url);
204 | } catch (error) {
205 | throw new McpError(
206 | ErrorCode.InternalError,
207 | `Failed to delete document: ${error}`
208 | );
209 | }
210 | }
211 | }
212 |
```
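A minimal usage sketch of the ApiClient above, assuming a local Ollama instance and a local Qdrant (the collection name and document values are illustrative):

```typescript
import { ApiClient } from './api-client.js';

const api = new ApiClient({
  embeddingConfig: { provider: 'ollama', model: 'nomic-embed-text' },
  qdrantUrl: 'http://127.0.0.1:6333',
});

// Create the collection if needed and verify its vector size.
await api.initCollection('documentation');

// Embeddings are generated internally before the chunk is stored.
await api.addDocument({
  url: 'https://example.com/guide',
  content: '# Guide\n\nSome markdown content...',
  metadata: { title: 'Guide', contentType: 'text/markdown' },
});

await api.cleanup();
```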
--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
```typescript
1 | #!/usr/bin/env node
2 | import { Server } from '@modelcontextprotocol/sdk/server/index.js';
3 | import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
4 | import { CallToolRequestSchema, ListToolsRequestSchema, McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js';
5 | import axios from 'axios';
6 | import { ApiClient } from './api-client.js';
7 | import { SearchDocumentationHandler } from './handlers/search-documentation.js';
8 | import { ListDocumentationHandler } from './handlers/list-documentation.js';
9 | import { ListOptions } from './tools/list-utils.js';
10 | import { Document } from './types.js';
11 |
12 | // Force using IP address to avoid hostname resolution issues
13 | const QDRANT_URL = process.env.QDRANT_URL || 'http://127.0.0.1:6333';
14 | const QDRANT_API_KEY = process.env.QDRANT_API_KEY;
15 | const EMBEDDING_PROVIDER = process.env.EMBEDDING_PROVIDER || 'ollama';
16 | const OPENAI_API_KEY = process.env.OPENAI_API_KEY;
17 |
18 | // Test connection with direct axios call first
19 | try {
20 |   const response = await axios.get(`${QDRANT_URL}/collections`, { headers: QDRANT_API_KEY ? { 'api-key': QDRANT_API_KEY } : undefined });
21 | console.error('Successfully connected to Qdrant:', response.data);
22 | } catch (error) {
23 | console.error('Failed to connect to Qdrant:', error);
24 | throw new McpError(
25 | ErrorCode.InternalError,
26 | 'Failed to establish initial connection to Qdrant server'
27 | );
28 | }
29 |
30 | const client = new ApiClient({
31 | qdrantUrl: QDRANT_URL,
32 | qdrantApiKey: QDRANT_API_KEY,
33 | embeddingConfig: {
34 | provider: EMBEDDING_PROVIDER as 'ollama' | 'openai',
35 | apiKey: OPENAI_API_KEY,
36 | model: EMBEDDING_PROVIDER === 'ollama' ? 'nomic-embed-text' : 'text-embedding-3-small'
37 | }
38 | });
39 |
40 | try {
41 | // Initialize Qdrant collection
42 | await client.qdrant.initializeCollection();
43 | console.error('Successfully initialized Qdrant collection');
44 | } catch (error) {
45 | console.error('Failed to initialize Qdrant collection:', error);
46 | throw error;
47 | }
48 |
49 | class RagDocsServer {
50 | private server: Server;
51 |
52 | constructor() {
53 | this.server = new Server(
54 | {
55 | name: 'ragdocs',
56 | version: '0.1.0',
57 | },
58 | {
59 | capabilities: {
60 | tools: {},
61 | },
62 | }
63 | );
64 |
65 | this.setupToolHandlers();
66 | this.server.onerror = (error) => console.error('[MCP Error]', error);
67 | }
68 |
69 | private setupToolHandlers() {
70 | this.server.setRequestHandler(ListToolsRequestSchema, async () => ({
71 | tools: [
72 | {
73 | name: 'add_document',
74 | description: 'Add a document to the RAG system',
75 | inputSchema: {
76 | type: 'object',
77 | properties: {
78 | url: { type: 'string', description: 'Document URL' },
79 | content: { type: 'string', description: 'Document content' },
80 | metadata: {
81 | type: 'object',
82 | properties: {
83 | title: { type: 'string', description: 'Document title' },
84 | contentType: { type: 'string', description: 'Content type (e.g., text/plain, text/markdown)' },
85 | },
86 | additionalProperties: true,
87 | },
88 | },
89 | required: ['url', 'content'],
90 | },
91 | },
92 | {
93 | name: 'search_documents',
94 | description: 'Search for documents using semantic similarity',
95 | inputSchema: {
96 | type: 'object',
97 | properties: {
98 | query: {
99 | type: 'string',
100 | description: 'Natural language search query'
101 | },
102 | options: {
103 | type: 'object',
104 | description: 'Search options',
105 | properties: {
106 | limit: {
107 | type: 'number',
108 | description: 'Maximum number of results (1-20)',
109 | minimum: 1,
110 | maximum: 20
111 | },
112 | scoreThreshold: {
113 | type: 'number',
114 | description: 'Minimum similarity score (0-1)',
115 | minimum: 0,
116 | maximum: 1
117 | },
118 | filters: {
119 | type: 'object',
120 | description: 'Optional filters',
121 | properties: {
122 | domain: {
123 | type: 'string',
124 | description: 'Filter by domain'
125 | },
126 | hasCode: {
127 | type: 'boolean',
128 | description: 'Filter for documents containing code'
129 | },
130 | after: {
131 | type: 'string',
132 | description: 'Filter for documents after date (ISO format)'
133 | },
134 | before: {
135 | type: 'string',
136 | description: 'Filter for documents before date (ISO format)'
137 | }
138 | }
139 | }
140 | }
141 | }
142 | },
143 | required: ['query'],
144 | },
145 | },
146 | {
147 | name: 'delete_document',
148 | description: 'Delete a document from the RAG system',
149 | inputSchema: {
150 | type: 'object',
151 | properties: {
152 | url: { type: 'string', description: 'Document URL to delete' },
153 | },
154 | required: ['url'],
155 | },
156 | },
157 | {
158 | name: 'list_documents',
159 | description: 'List all stored documents with pagination and grouping options',
160 | inputSchema: {
161 | type: 'object',
162 | properties: {
163 | page: {
164 | type: 'number',
165 | description: 'Page number (default: 1)',
166 | minimum: 1
167 | },
168 | pageSize: {
169 | type: 'number',
170 | description: 'Number of documents per page (default: 20)',
171 | minimum: 1,
172 | maximum: 100
173 | },
174 | groupByDomain: {
175 | type: 'boolean',
176 | description: 'Group documents by domain (default: false)'
177 | },
178 | sortBy: {
179 | type: 'string',
180 | description: 'Sort field (default: timestamp)',
181 | enum: ['timestamp', 'title', 'domain']
182 | },
183 | sortOrder: {
184 | type: 'string',
185 | description: 'Sort order (default: desc)',
186 | enum: ['asc', 'desc']
187 | }
188 | }
189 | }
190 | },
191 | ],
192 | }));
193 |
194 | this.server.setRequestHandler(CallToolRequestSchema, async (request) => {
195 | try {
196 | switch (request.params.name) {
197 | case 'add_document': {
198 | const args = request.params.arguments as Record<string, unknown>;
199 | if (!args || typeof args.url !== 'string' || typeof args.content !== 'string') {
200 | throw new Error('Invalid document format: url and content must be strings');
201 | }
202 | const doc: Document = {
203 | url: args.url,
204 | content: args.content,
205 | metadata: (args.metadata as Record<string, unknown>) || {}
206 | };
207 | await client.addDocument(doc);
208 | return {
209 | content: [{ type: 'text', text: `Document ${doc.url} added successfully` }],
210 | };
211 | }
212 |
213 | case 'search_documents': {
214 | const { query, options } = request.params.arguments as {
215 | query: string;
216 | options?: {
217 | limit?: number;
218 | scoreThreshold?: number;
219 | filters?: {
220 | domain?: string;
221 | hasCode?: boolean;
222 | after?: string;
223 | before?: string;
224 | };
225 | };
226 | };
227 |
228 | const searchHandler = new SearchDocumentationHandler(
229 | client.qdrant,
230 | client.embeddings,
231 | this.server,
232 | client
233 | );
234 |
235 | return await searchHandler.handle({ query, options });
236 | }
237 |
238 | case 'delete_document': {
239 | const { url } = request.params.arguments as { url: string };
240 | await client.deleteDocument(url);
241 | return {
242 | content: [{ type: 'text', text: `Document ${url} deleted successfully` }],
243 | };
244 | }
245 |
246 | case 'list_documents': {
247 | const args = request.params.arguments as ListOptions;
248 | const listHandler = new ListDocumentationHandler(this.server, client);
249 | return await listHandler.handle(args || {});
250 | }
251 |
252 | default:
253 | throw new Error(`Unknown tool: ${request.params.name}`);
254 | }
255 | } catch (error) {
256 | const errorMessage = error instanceof Error ? error.message : 'Unknown error occurred';
257 | console.error('[Tool Error]', errorMessage);
258 | return {
259 | content: [{ type: 'text', text: `Error: ${errorMessage}` }],
260 | isError: true,
261 | };
262 | }
263 | });
264 | }
265 |
266 | async run() {
267 | const transport = new StdioServerTransport();
268 | await this.server.connect(transport);
269 | console.error('RagDocs MCP server running on stdio');
270 | }
271 | }
272 |
273 | const server = new RagDocsServer();
274 | server.run().catch(console.error);
275 |
```
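For reference, a call to the search_documents tool that exercises every field of the inputSchema above might carry these parameters (values are illustrative):

```typescript
// Example CallTool parameters mirroring the search_documents inputSchema.
const exampleCall = {
  name: 'search_documents',
  arguments: {
    query: 'how to configure Qdrant authentication',
    options: {
      limit: 5,
      scoreThreshold: 0.7,
      filters: {
        domain: 'qdrant.tech',
        hasCode: true,
        after: '2024-01-01T00:00:00Z',
      },
    },
  },
};
```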
--------------------------------------------------------------------------------
/src/tools/qdrant-client.ts:
--------------------------------------------------------------------------------
```typescript
1 | import { QdrantClient } from '@qdrant/js-client-rest';
2 | import { TextChunk } from './text-chunker.js';
3 |
4 | export interface DocumentMetadata {
5 | url: string;
6 | title: string;
7 | domain: string;
8 | timestamp: string;
9 | contentType: string;
10 | wordCount: number;
11 | hasCode: boolean;
12 | chunkIndex: number;
13 | totalChunks: number;
14 | }
15 |
16 | export class QdrantError extends Error {
17 | constructor(message: string) {
18 | super(message);
19 | this.name = 'QdrantError';
20 | }
21 | }
22 |
23 | export class QdrantWrapper {
24 | public client: QdrantClient;
25 | private readonly collectionName = 'documentation';
26 | private readonly vectorSize = 768; // Ollama nomic-embed-text size
27 |
28 | constructor(url?: string, apiKey?: string) {
29 | this.client = new QdrantClient({
30 |       url: url || 'http://127.0.0.1:6333', // default to a local instance, matching index.ts
31 | apiKey: apiKey,
32 | timeout: 10000 // Add timeout to help debug connection issues
33 | });
34 | }
35 |
36 | /**
37 | * Initializes the Qdrant collection if it doesn't exist
38 | */
39 | async initializeCollection(): Promise<void> {
40 | try {
41 | const collections = await this.client.getCollections();
42 | const exists = collections.collections.some(c => c.name === this.collectionName);
43 |
44 | if (!exists) {
45 | await this.client.createCollection(this.collectionName, {
46 | vectors: {
47 | size: this.vectorSize,
48 | distance: 'Cosine',
49 | },
50 | optimizers_config: {
51 | default_segment_number: 2,
52 | },
53 | replication_factor: 1,
54 | });
55 |
56 | // Create indexes for efficient filtering
57 | await this.client.createPayloadIndex(this.collectionName, {
58 | field_name: 'url',
59 | field_schema: 'keyword',
60 | });
61 |
62 | await this.client.createPayloadIndex(this.collectionName, {
63 | field_name: 'domain',
64 | field_schema: 'keyword',
65 | });
66 |
67 | await this.client.createPayloadIndex(this.collectionName, {
68 | field_name: 'timestamp',
69 | field_schema: 'datetime',
70 | });
71 | }
72 | } catch (error) {
73 | console.error('Qdrant initialization error:', error);
74 | if (error instanceof Error) {
75 | console.error('Error details:', {
76 | name: error.name,
77 | message: error.message,
78 | stack: error.stack
79 | });
80 | }
81 | throw new QdrantError(
82 | `Failed to initialize Qdrant collection: ${error instanceof Error ? error.message : String(error)}`
83 | );
84 | }
85 | }
86 |
87 | /**
88 | * Stores document chunks in the Qdrant collection
89 | * @param chunks Text chunks to store
90 | * @param embeddings Corresponding embeddings for each chunk
91 | * @param metadata Document metadata
92 | */
93 | async storeDocumentChunks(
94 | chunks: TextChunk[],
95 | embeddings: number[][],
96 | metadata: Omit<DocumentMetadata, 'chunkIndex' | 'totalChunks'>
97 | ): Promise<void> {
98 | if (chunks.length !== embeddings.length) {
99 | throw new QdrantError('Number of chunks does not match number of embeddings');
100 | }
101 |
102 | try {
103 | const points = chunks.map((chunk, index) => ({
104 | id: this.generatePointId(metadata.url, chunk.index),
105 | vector: embeddings[index],
106 | payload: {
107 | ...metadata,
108 | content: chunk.content,
109 | chunkIndex: chunk.index,
110 | totalChunks: chunks.length,
111 | chunkMetadata: chunk.metadata,
112 | },
113 | }));
114 |
115 | await this.client.upsert(this.collectionName, {
116 | wait: true,
117 | points,
118 | });
119 | } catch (error) {
120 | throw new QdrantError(
121 | `Failed to store document chunks: ${(error as Error).message}`
122 | );
123 | }
124 | }
125 |
126 | /**
127 | * Checks if a document already exists in the collection
128 | * @param url Document URL
129 | * @returns true if document exists
130 | */
131 | async documentExists(url: string): Promise<boolean> {
132 | try {
133 | const response = await this.client.scroll(this.collectionName, {
134 | filter: {
135 | must: [
136 | {
137 | key: 'url',
138 | match: {
139 | value: url,
140 | },
141 | },
142 | ],
143 | },
144 | limit: 1,
145 | });
146 |
147 | return response.points.length > 0;
148 | } catch (error) {
149 | throw new QdrantError(
150 | `Failed to check document existence: ${(error as Error).message}`
151 | );
152 | }
153 | }
154 |
155 | /**
156 | * Removes a document and all its chunks from the collection
157 | * @param url Document URL
158 | */
159 | async removeDocument(url: string): Promise<void> {
160 | try {
161 | await this.client.delete(this.collectionName, {
162 | filter: {
163 | must: [
164 | {
165 | key: 'url',
166 | match: {
167 | value: url,
168 | },
169 | },
170 | ],
171 | },
172 | wait: true,
173 | });
174 | } catch (error) {
175 | throw new QdrantError(
176 | `Failed to remove document: ${(error as Error).message}`
177 | );
178 | }
179 | }
180 |
181 | /**
182 | * Generates a unique point ID for a chunk
183 | * @param url Document URL
184 | * @param chunkIndex Chunk index
185 | * @returns Unique point ID
186 | */
187 | private generatePointId(url: string, chunkIndex: number): number {
188 |     // Hash URL + chunk index into a 32-bit integer (collisions are possible but unlikely at small scale)
189 | const str = `${url}:${chunkIndex}`;
190 | let hash = 0;
191 | for (let i = 0; i < str.length; i++) {
192 | const char = str.charCodeAt(i);
193 | hash = ((hash << 5) - hash) + char;
194 | hash = hash & hash; // Convert to 32-bit integer
195 | }
196 | return Math.abs(hash);
197 | }
198 |
199 | /**
200 | * Gets the health status of the Qdrant server
201 | * @returns true if server is healthy
202 | */
203 | async isHealthy(): Promise<boolean> {
204 | try {
205 | await this.client.getCollections();
206 | return true;
207 | } catch {
208 | return false;
209 | }
210 | }
211 |
212 | /**
213 | * Lists all documents with pagination support
214 | * @param options Listing options including pagination and filtering
215 | * @returns Array of document metadata with pagination info
216 | */
217 | async listDocuments(options: {
218 | offset?: number;
219 | limit?: number;
220 | domain?: string;
221 | sortBy?: 'timestamp' | 'title' | 'domain';
222 | sortOrder?: 'asc' | 'desc';
223 | } = {}): Promise<{ total: number; documents: DocumentMetadata[] }> {
224 | const filter: any = {
225 | must: [
226 | {
227 | key: 'chunkIndex',
228 | match: { value: 0 }, // Only get first chunk to avoid duplicates
229 | },
230 | ],
231 | };
232 |
233 | if (options.domain) {
234 | filter.must.push({
235 | key: 'domain',
236 | match: { value: options.domain },
237 | });
238 | }
239 |
240 | try {
241 | // Get total count first
242 | const countResponse = await this.client.count(this.collectionName, {
243 | filter,
244 | });
245 |
246 |       // Then get paginated results (note: Qdrant's scroll 'offset' is a point ID to start from, not a count of records to skip)
247 | const response = await this.client.scroll(this.collectionName, {
248 | filter,
249 | limit: options.limit || 20,
250 | offset: options.offset || 0,
251 | with_payload: true,
252 | with_vector: false,
253 | });
254 |
255 | const documents = response.points.map(point => {
256 | const payload = point.payload as any;
257 | return {
258 | url: String(payload.url),
259 | title: String(payload.title),
260 | domain: String(payload.domain),
261 | timestamp: String(payload.timestamp),
262 | contentType: String(payload.contentType),
263 | wordCount: Number(payload.wordCount),
264 | hasCode: Boolean(payload.hasCode),
265 | chunkIndex: Number(payload.chunkIndex),
266 | totalChunks: Number(payload.totalChunks),
267 | };
268 | });
269 |
270 | return {
271 | total: countResponse.count,
272 | documents,
273 | };
274 | } catch (error) {
275 | throw new QdrantError(
276 | `Failed to list documents: ${(error as Error).message}`
277 | );
278 | }
279 | }
280 |
281 | /**
282 | * Performs a semantic search using vector similarity
283 | * @param queryVector Query embedding vector
284 | * @param options Search options
285 | * @returns Array of search results with scores
286 | */
287 | async searchSimilar(
288 | queryVector: number[],
289 | options: {
290 | limit?: number;
291 | scoreThreshold?: number;
292 | filters?: {
293 | domain?: string;
294 | hasCode?: boolean;
295 | after?: string;
296 | before?: string;
297 | };
298 | } = {}
299 | ): Promise<Array<DocumentMetadata & { score: number; content: string }>> {
300 | const limit = options.limit || 5;
301 | const scoreThreshold = options.scoreThreshold || 0.7;
302 | const filter: any = { must: [] };
303 |
304 | // Add filters if specified
305 | if (options.filters?.domain) {
306 | filter.must.push({
307 | key: 'domain',
308 | match: { value: options.filters.domain },
309 | });
310 | }
311 |
312 | if (options.filters?.hasCode !== undefined) {
313 | filter.must.push({
314 | key: 'hasCode',
315 | match: { value: options.filters.hasCode },
316 | });
317 | }
318 |
319 | if (options.filters?.after) {
320 | filter.must.push({
321 | key: 'timestamp',
322 | range: { gte: options.filters.after },
323 | });
324 | }
325 |
326 | if (options.filters?.before) {
327 | filter.must.push({
328 | key: 'timestamp',
329 | range: { lte: options.filters.before },
330 | });
331 | }
332 |
333 | try {
334 | const response = await this.client.search(this.collectionName, {
335 | vector: queryVector,
336 | limit: Math.ceil(limit * 1.5), // Request extra results for post-filtering
337 | score_threshold: scoreThreshold,
338 | filter: filter.must.length > 0 ? filter : undefined,
339 | with_payload: true,
340 | });
341 |
342 | return response
343 | .map(hit => {
344 | const payload = hit.payload as any;
345 | if (!payload || typeof payload !== 'object') {
346 | throw new QdrantError('Invalid payload structure in search result');
347 | }
348 |
349 | // Extract and validate required fields
350 | const result = {
351 | score: hit.score || 0,
352 | url: String(payload.url),
353 | title: String(payload.title),
354 | domain: String(payload.domain),
355 | timestamp: String(payload.timestamp),
356 | contentType: String(payload.contentType),
357 | wordCount: Number(payload.wordCount),
358 | hasCode: Boolean(payload.hasCode),
359 | chunkIndex: Number(payload.chunkIndex),
360 | totalChunks: Number(payload.totalChunks),
361 | content: String(payload.content),
362 | };
363 |
364 | // Validate all fields are present and of correct type
365 | if (Object.values(result).some(v => v === undefined)) {
366 | throw new QdrantError('Missing required fields in search result');
367 | }
368 |
369 | return result;
370 | })
371 | .slice(0, limit); // Return only requested number of results
372 | } catch (error) {
373 | throw new QdrantError(
374 | `Failed to perform search: ${(error as Error).message}`
375 | );
376 | }
377 | }
378 | }
379 |
```
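A minimal sketch of driving QdrantWrapper directly; the zero vector stands in for a real 768-dimension embedding from the EmbeddingService:

```typescript
import { QdrantWrapper } from './tools/qdrant-client.js';

const qdrant = new QdrantWrapper('http://127.0.0.1:6333');
await qdrant.initializeCollection();

if (await qdrant.isHealthy()) {
  // Placeholder query vector; real vectors come from the embedding service.
  const queryVector = new Array(768).fill(0);
  const hits = await qdrant.searchSimilar(queryVector, {
    limit: 3,
    scoreThreshold: 0.5,
    filters: { hasCode: true },
  });
  for (const hit of hits) {
    console.log(`${hit.score.toFixed(2)} ${hit.title} (${hit.url})`);
  }
}
```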