fkesheh/code-context-mcp # codebase.md

# Directory Structure

```
├── .env.example
├── .gitignore
├── claude_desktop_config.example.json
├── config.ts
├── index.ts
├── jest.config.mjs
├── LICENSE
├── package-lock.json
├── package.json
├── README.md
├── SETUP.md
├── start.ts
├── tools
│   ├── embedFiles.ts
│   ├── ingestBranch.ts
│   ├── processFiles.ts
│   └── queryRepo.ts
├── tsconfig.json
└── utils
    ├── codeSplitter.ts
    ├── db.ts
    ├── filePatternMatcher.ts
    ├── ollamaEmbeddings.ts
    ├── repoConfig.ts
    └── types.ts
```

# Files

--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------

```
1 | DATA_DIR=/home/user/.config/Claude/data
2 | REPO_CONFIG_DIR=/home/user/.config/Claude/repos
3 | NODE_ENV=development
4 | 
```

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------

```
 1 | # Node.js
 2 | node_modules/
 3 | npm-debug.log
 4 | yarn-debug.log
 5 | yarn-error.log
 6 | 
 7 | # TypeScript
 8 | dist/
 9 | *.tsbuildinfo
10 | 
11 | # Data directories
12 | data/
13 | cache/
14 | repos/
15 | 
16 | # HuggingFace specific
17 | .transformers/
18 | .cache/
19 | huggingface/
20 | models/
21 | **/temp_test_repos/
22 | 
23 | # Test temporary files
24 | coverage/
25 | .nyc_output/
26 | junit.xml
27 | 
28 | # Database files
29 | *.db
30 | *.sqlite
31 | *.sqlite3
32 | 
33 | # Environment variables
34 | .env
35 | .env.local
36 | .env.development.local
37 | .env.test.local
38 | .env.production.local
39 | 
40 | # Log files
41 | logs/
42 | *.log
43 | 
44 | # Editor directories and files
45 | .idea/
46 | .vscode/
47 | *.swp
48 | *.swo
49 | 
50 | # OS files
51 | .DS_Store
52 | Thumbs.db
53 | 
54 | # Build files
55 | build/
56 | out/
57 | 
```

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

```markdown
  1 | # Code Context MCP Server
  2 | 
  3 | A Model Context Protocol (MCP) server for providing code context from local git repositories. This server allows you to:
  4 | 
  5 | 1. Clone git repositories locally
  6 | 2. Process branches and files
  7 | 3. Generate embeddings for code chunks
  8 | 4. Perform semantic search over code
  9 | 
 10 | ## Features
 11 | 
 12 | - Uses local git repositories instead of GitHub API
 13 | - Stores data in SQLite database
 14 | - Splits code into semantic chunks
 15 | - Generates embeddings for code chunks using Ollama
 16 | - Provides semantic search over code
 17 | 
 18 | ## Prerequisites
 19 | 
 20 | - Node.js (v16+)
 21 | - Git
 22 | - Ollama with an embedding model
 23 | 
 24 | ## Installation
 25 | 
 26 | ```bash
 27 | # Clone the repository
 28 | git clone <repository-url>
 29 | cd code-context-mcp
 30 | 
 31 | # Install dependencies
 32 | npm install
 33 | 
 34 | # Build the project
 35 | npm run build
 36 | ```
 37 | 
 38 | ## Configuration
 39 | 
 40 | Set the following environment variables:
 41 | 
 42 | - `DATA_DIR`: Directory for SQLite database (default: '~/.codeContextMcp/data')
 43 | - `REPO_CONFIG_DIR`: Directory for per-repository configuration files and the repository cache (default: '~/.codeContextMcp/repos')
 44 | 
 45 | ### Using Ollama
 46 | 
 47 | Embeddings are generated through a local Ollama instance. Install Ollama and pull an embedding model:
 48 | 
 49 | ```bash
 50 | # Install Ollama from https://ollama.ai/
 51 | 
 52 | # Pull an embedding model (unclemusclez/jina-embeddings-v2-base-code is recommended)
 53 | ollama pull unclemusclez/jina-embeddings-v2-base-code
 54 | 
 55 | ```
 56 | 
 57 | ## Usage
 58 | 
 59 | ### Using with Claude Desktop
 60 | 
 61 | Add the following configuration to your Claude Desktop configuration file (`claude_desktop_config.json`):
 62 | 
 63 | ```json
 64 | {
 65 |   "mcpServers": {
 66 |     "code-context-mcp": {
 67 |       "command": "/path/to/your/node",
 68 |       "args": ["/path/to/code-context-mcp/dist/index.js"]
 69 |     }
 70 |   }
 71 | }
 72 | ```
 73 | 
 74 | ## Tools
 75 | 
 76 | The server provides the following tool (registered with MCP clients under the name `query_repo`):
 77 | 
 78 | ### queryRepo
 79 | 
 80 | Clones a repository, processes code, and performs semantic search:
 81 | 
 82 | ```json
 83 | {
 84 |   "repoUrl": "https://github.com/username/repo.git",
 85 |   "branch": "main", // Optional - defaults to repository's default branch
 86 |   "query": "Your search query",
 87 |   "keywords": ["keyword1", "keyword2"], // Optional - filter results by keywords
 88 |   "filePatterns": ["**/*.ts", "src/*.js"], // Optional - filter files by glob patterns
 89 |   "excludePatterns": ["**/node_modules/**"], // Optional - exclude files by glob patterns
 90 |   "limit": 10 // Optional - number of results to return, default: 10
 91 | }
 92 | ```
 93 | 
 94 | The `branch` parameter is optional. If not provided, the tool will automatically use the repository's default branch.
 95 | 
 96 | The `keywords` parameter is optional. If provided, the results will be filtered to only include chunks that contain at least one of the specified keywords (case-insensitive matching).
 97 | 
 98 | The `filePatterns` and `excludePatterns` parameters are optional. They allow you to filter which files are processed and searched using glob patterns (e.g., `**/*.ts` for all TypeScript files).
 99 | 
100 | ## Database Schema
101 | 
102 | The server uses SQLite with the following schema:
103 | 
104 | - `repository`: Stores information about repositories
105 | - `branch`: Stores information about branches
106 | - `file`: Stores information about files
107 | - `branch_file_association`: Associates files with branches
108 | - `file_chunk`: Stores code chunks and their embeddings
109 | 
110 | # Debugging
111 | 
112 | ## Mac M-Series (ARM Architecture) Issues
113 | 
114 | When installing better-sqlite3 on Mac M-series chips (ARM architecture), if you encounter errors like "mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64e' or 'arm64')", you need to ensure the binary matches your architecture. Here's how to resolve this issue:
115 | 
116 | ```bash
117 | # Check your Node.js architecture
118 | node -p "process.arch"
119 | 
120 | # If it shows 'arm64', but you're still having issues, try:
121 | npm rebuild better-sqlite3 --build-from-source
122 | 
123 | # Or for a clean install:
124 | npm uninstall better-sqlite3
125 | export npm_config_arch=arm64
126 | export npm_config_target_arch=arm64
127 | npm install better-sqlite3 --build-from-source
128 | ```
129 | 
130 | If you're using Rosetta, make sure your entire environment is consistent: the error above means x86_64 binaries are being built while your system needs arm64.
131 | For a persistent configuration, add the following to your `.zshrc` or `.bashrc`:
132 | 
133 | ```
134 | export npm_config_arch=arm64
135 | export npm_config_target_arch=arm64
136 | ```
137 | 
138 | ## Testing Ollama Embeddings
139 | 
140 | curl http://localhost:11434/api/embed -d '{"model":"unclemusclez/jina-embeddings-v2-base-code","input":"Llamas are members of the camelid family"}'
141 | curl http://127.0.0.1:11434/api/embed -d '{"model":"unclemusclez/jina-embeddings-v2-base-code","input":"Llamas are members of the camelid family"}'
142 | curl http://[::1]:11434/api/embed -d '{"model":"unclemusclez/jina-embeddings-v2-base-code","input":"Llamas are members of the camelid family"}'
143 | 
144 | ## License
145 | 
146 | MIT
147 | 
```

--------------------------------------------------------------------------------
/utils/types.ts:
--------------------------------------------------------------------------------

```typescript
 1 | /**
 2 |  * Common interfaces and types used across the codebase
 3 |  */
 4 | 
 5 | /**
 6 |  * Interface for objects that can send progress notifications
 7 |  */
 8 | export interface ProgressNotifier {
 9 |   sendProgress: (progress: number, total: number) => Promise<void>;
10 | }
11 | 
```

--------------------------------------------------------------------------------
/claude_desktop_config.example.json:
--------------------------------------------------------------------------------

```json
 1 | {
 2 |   "mcpServers": {
 3 |     "code-context": {
 4 |       "command": "node",
 5 |       "args": ["<CLAUDE_CONFIG_DIR>/mcp-servers/code-context-mcp/dist/start.js"],
 6 |       "env": {
 7 |         "DATA_DIR": "<CLAUDE_CONFIG_DIR>/data",
 8 |         "REPO_CONFIG_DIR": "<CLAUDE_CONFIG_DIR>/repos",
 9 |         "NODE_ENV": "development"
10 |       }
11 |     }
12 |   }
13 | }
14 | 
```

--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------

```json
 1 | {
 2 |   "compilerOptions": {
 3 |     "target": "ES2020",
 4 |     "module": "NodeNext",
 5 |     "moduleResolution": "NodeNext",
 6 |     "esModuleInterop": true,
 7 |     "strict": true,
 8 |     "outDir": "./dist",
 9 |     "rootDir": ".",
10 |     "declaration": true,
11 |     "skipLibCheck": true,
12 |     "isolatedModules": true,
13 |     "allowJs": true,
14 |     "resolveJsonModule": true,
15 |     "forceConsistentCasingInFileNames": true,
16 |     "baseUrl": ".",
17 |     "paths": {
18 |       "*": ["*"]
19 |     }
20 |   },
21 |   "include": [
22 |     "./**/*.ts",
23 |     "./**/*.mts",
24 |     "./tests/**/*.ts"
25 |   ],
26 |   "exclude": [
27 |     "node_modules",
28 |     "dist",
29 |     "repos"
30 |   ]
31 | }
32 | 
```

--------------------------------------------------------------------------------
/config.ts:
--------------------------------------------------------------------------------

```typescript
 1 | import path from "path";
 2 | import os from "os";
 3 | 
 4 | // Available models for code embeddings
 5 | export const EMBEDDING_MODELS = {
 6 |   OLLAMA: {
 7 |     model: "unclemusclez/jina-embeddings-v2-base-code",
 8 |     contextSize: 8192,
 9 |     dimensions: 768,
10 |     baseUrl: "http://127.0.0.1:11434",
11 |   },
12 | };
13 | 
14 | 
15 | 
16 | export const codeContextConfig = {
17 |   ENV: process.env.NODE_ENV || "development",
18 |   REPO_CONFIG_DIR:
19 |     process.env.REPO_CONFIG_DIR ||
20 |     path.join(os.homedir(), ".codeContextMcp", "repos"),
21 |   BATCH_SIZE: 100,
22 |   DATA_DIR:
23 |     process.env.DATA_DIR || path.join(os.homedir(), ".codeContextMcp", "data"),
24 |   DB_PATH: process.env.DB_PATH || "code_context.db",
25 |   EMBEDDING_MODEL: EMBEDDING_MODELS.OLLAMA,
26 | };
27 | 
28 | export default codeContextConfig;
29 | 
```

--------------------------------------------------------------------------------
/jest.config.mjs:
--------------------------------------------------------------------------------

```
 1 | export default {
 2 |   preset: 'ts-jest/presets/default-esm',
 3 |   clearMocks: true,
 4 |   coverageDirectory: "coverage",
 5 |   roots: [
 6 |     "./tests"
 7 |   ],
 8 |   moduleNameMapper: {
 9 |     '^(\\.{1,2}/.*)\\.js$': '$1',
10 |   },
11 |   transform: {
12 |     '^.+\\.tsx?$': [
13 |       'ts-jest',
14 |       {
15 |         isolatedModules: true,
16 |         useESM: true,
17 |         tsconfig: './tsconfig.json'
18 |       }
19 |     ]
20 |   },
21 |   testEnvironment: 'node',
22 |   moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node', 'mjs'],
23 |   extensionsToTreatAsEsm: ['.ts', '.mts'],
24 |   transformIgnorePatterns: [
25 |     'node_modules/(?!(@huggingface)/)'
26 |   ],
27 |   testMatch: [
28 |     '**/?(*.)+(spec|test).ts',
29 |     '**/tests/*EmbeddingsTest.ts',
30 |     '**/tests/githubRepoTest.ts'
31 |   ],
32 |   globals: {
33 |     'ts-jest': {
34 |       useESM: true,
35 |     },
36 |   },
37 |   setupFilesAfterEnv: ['<rootDir>/tests/setup.ts'],
38 |   verbose: true
39 | };
40 | 
```

--------------------------------------------------------------------------------
/SETUP.md:
--------------------------------------------------------------------------------

```markdown
 1 | # Code Context MCP Setup
 2 | 
 3 | ## Prerequisites
 4 | 
 5 | ```bash
 6 | ollama pull unclemusclez/jina-embeddings-v2-base-code
 7 | ```
 8 | 
 9 | ## Install
10 | 
11 | ```bash
12 | npm install
13 | npm run build
14 | ```
15 | 
16 | ## Configuration
17 | 
18 | Copy `claude_desktop_config.example.json` to your Claude Desktop config location:
19 | 
20 | **Linux/macOS**: `~/.config/Claude/claude_desktop_config.json`  
21 | **Windows**: `%APPDATA%\Claude\claude_desktop_config.json`
22 | 
23 | Replace `<CLAUDE_CONFIG_DIR>` with your actual path:
24 | - Linux/macOS: `/home/username/.config/Claude`
25 | - Windows: `C:\Users\username\AppData\Roaming\Claude`
26 | 
27 | ## Environment
28 | 
29 | Copy `.env.example` to `.env` and adjust paths if needed.
30 | 
31 | The `repos/` directory stores configuration metadata for repositories, not full clones.
32 | For local repositories (file:// URLs), no cloning occurs - files are accessed directly.
33 | 
34 | ## Test
35 | 
36 | ```bash
37 | npm run start:mcp
38 | ```
39 | 
40 | Restart Claude Desktop.
41 | 
```

--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------

```json
 1 | {
 2 |   "name": "@modelcontextprotocol/server-code-context",
 3 |   "version": "0.1.0",
 4 |   "description": "MCP server for code context from local git repositories",
 5 |   "license": "MIT",
 6 |   "type": "module",
 7 |   "bin": {
 8 |     "mcp-server-code-context": "dist/index.js"
 9 |   },
10 |   "files": [
11 |     "dist"
12 |   ],
13 |   "scripts": {
14 |     "build": "tsc && shx chmod +x dist/*.js",
15 |     "watch": "tsc --watch",
16 |     "start": "node dist/index.js",
17 |     "start:mcp": "node dist/start.js",
18 |     "inspect": "npm run build && npx @modelcontextprotocol/inspector node dist/index.js"
19 |   },
20 |   "dependencies": {
21 |     "@langchain/textsplitters": "^0.1.0",
22 |     "@modelcontextprotocol/sdk": "1.0.1",
23 |     "axios": "^1.8.4",
24 |     "better-sqlite3": "^11.9.1",
25 |     "express": "^4.21.1",
26 |     "simple-git": "^3.20.0",
27 |     "zod": "^3.23.8",
28 |     "zod-to-json-schema": "^3.23.5"
29 |   },
30 |   "devDependencies": {
31 |     "@types/better-sqlite3": "^7.6.4",
32 |     "@types/express": "^5.0.0",
33 |     "@types/jest": "^29.5.14",
34 |     "@types/minimatch": "^5.1.2",
35 |     "@types/node": "^20.10.0",
36 |     "jest": "^29.7.0",
37 |     "jest-environment-node-single-context": "^29.4.0",
38 |     "shx": "^0.3.4",
39 |     "ts-jest": "^29.3.0",
40 |     "tsx": "^4.19.3",
41 |     "typescript": "^5.6.2"
42 |   }
43 | }
44 | 
```

--------------------------------------------------------------------------------
/utils/filePatternMatcher.ts:
--------------------------------------------------------------------------------

```typescript
 1 | /**
 2 |  * Convert a glob pattern to an SQL LIKE pattern
 3 |  */
 4 | export function globToSqlPattern(pattern: string): string {
 5 |   // Handle ** (any depth of directories)
 6 |   let sqlPattern = pattern.replace(/\*\*/g, '%');
 7 |   
 8 |   // Handle * (also mapped to %, which in SQL LIKE matches any characters, including '/')
 9 |   sqlPattern = sqlPattern.replace(/\*/g, '%');
10 |   
11 |   return sqlPattern;
12 | }
13 | 
14 | /**
15 |  * Create SQL WHERE conditions for file pattern filtering. The converted patterns are
16 |  * interpolated directly into the SQL text as LIKE literals (no bound parameters), so only pass trusted patterns
17 |  */
18 | export function createFilePatternCondition(
19 |   includePatterns: string[] | undefined,
20 |   excludePatterns: string[] | undefined
21 | ): string {
22 |   let conditions = '';
23 |   
24 |   // Include patterns (files must match at least one pattern)
25 |   if (includePatterns && includePatterns.length > 0) {
26 |     const includeConditions = includePatterns.map(pattern => {
27 |       const sqlPattern = globToSqlPattern(pattern);
28 |       return `f.path LIKE '${sqlPattern}'`;
29 |     });
30 |     conditions += ` AND (${includeConditions.join(' OR ')})`;
31 |   }
32 |   
33 |   // Exclude patterns (files must not match any pattern)
34 |   if (excludePatterns && excludePatterns.length > 0) {
35 |     const excludeConditions = excludePatterns.map(pattern => {
36 |       const sqlPattern = globToSqlPattern(pattern);
37 |       return `f.path NOT LIKE '${sqlPattern}'`;
38 |     });
39 |     conditions += ` AND (${excludeConditions.join(' AND ')})`;
40 |   }
41 |   
42 |   return conditions;
43 | }
44 | 
```

--------------------------------------------------------------------------------
/start.ts:
--------------------------------------------------------------------------------

```typescript
 1 | #!/usr/bin/env node
 2 | 
 3 | import { spawn } from 'child_process';
 4 | import { existsSync, mkdirSync } from 'fs';
 5 | import { dirname, join } from 'path';
 6 | import { fileURLToPath } from 'url';
 7 | 
 8 | const __dirname = dirname(fileURLToPath(import.meta.url));
 9 | 
10 | const DATA_DIR = process.env.DATA_DIR || join(process.env.HOME!, '.config', 'Claude', 'data');
11 | const REPO_CONFIG_DIR = process.env.REPO_CONFIG_DIR || join(process.env.HOME!, '.config', 'Claude', 'repos');
12 | const NODE_ENV = process.env.NODE_ENV || 'development';
13 | 
14 | [DATA_DIR, REPO_CONFIG_DIR].forEach(dir => {
15 |   if (!existsSync(dir)) {
16 |     mkdirSync(dir, { recursive: true, mode: 0o755 });
17 |   }
18 | });
19 | 
20 | process.stderr.write(`Starting Code Context MCP Server\n`);
21 | process.stderr.write(`Data Directory: ${DATA_DIR}\n`);
22 | process.stderr.write(`Repo Config: ${REPO_CONFIG_DIR}\n`);
23 | process.stderr.write(`Node Environment: ${NODE_ENV}\n\n`);
24 | 
25 | const checkOllama = () => {
26 |   try {
27 |     const result = spawn('pgrep', ['ollama'], { stdio: 'pipe' });
28 |     result.on('exit', (code) => {
29 |       if (code !== 0) {
30 |         process.stderr.write('Starting Ollama...\n');
31 |         spawn('ollama', ['serve'], { detached: true, stdio: 'ignore' }).unref();
32 |         setTimeout(() => startMcpServer(), 3000);
33 |       } else {
34 |         startMcpServer();
35 |       }
36 |     });
37 |   } catch {
38 |     startMcpServer();
39 |   }
40 | };
41 | 
42 | const startMcpServer = () => {
43 |   const serverPath = join(__dirname, 'index.js');
44 |   
45 |   if (!existsSync(serverPath)) {
46 |     process.stderr.write(`Error: MCP server not found at ${serverPath}\n`);
47 |     process.stderr.write('Run: npm run build\n');
48 |     process.exit(1);
49 |   }
50 | 
51 |   process.env.DATA_DIR = DATA_DIR;
52 |   process.env.REPO_CONFIG_DIR = REPO_CONFIG_DIR;
53 |   process.env.NODE_ENV = NODE_ENV;
54 | 
55 |   const server = spawn('node', [serverPath, ...process.argv.slice(2)], {
56 |     stdio: 'inherit',
57 |     cwd: __dirname
58 |   });
59 | 
60 |   server.on('exit', (code) => process.exit(code || 0));
61 | };
62 | 
63 | checkOllama();
64 | 
```

--------------------------------------------------------------------------------
/utils/ollamaEmbeddings.ts:
--------------------------------------------------------------------------------

```typescript
 1 | import axios from "axios";
 2 | import config from "../config.js";
 3 | 
 4 | // Tracks whether the one-time initialization message has already been logged
 5 | let apiInitialized = false;
 6 | 
 7 | /**
 8 |  * Generate embeddings for text using Ollama API
 9 |  * @param texts Array of text strings to embed
10 |  * @param embeddingModel Optional model configuration to use
11 |  * @returns Promise containing array of embeddings
12 |  */
13 | export async function generateOllamaEmbeddings(
14 |   texts: string[],
15 |   embeddingModel: {
16 |     model: string;
17 |     contextSize: number;
18 |     dimensions: number;
19 |     baseUrl?: string;
20 |   } = config.EMBEDDING_MODEL
21 | ): Promise<number[][]> {
22 |   try {
23 |     // Log initialization
24 |     if (!apiInitialized) {
25 |       console.error(
26 |         `Initializing Ollama embeddings with model: ${embeddingModel.model}...`
27 |       );
28 |       apiInitialized = true;
29 |     }
30 | 
31 |     const baseUrl = embeddingModel.baseUrl || "http://127.0.0.1:11434";
32 |     const embeddings: number[][] = [];
33 | 
34 |     // Send texts to Ollama in sequential batches, one request per batch
35 |     console.error(`Generating embeddings for ${texts.length} chunks...`);
36 |     const batchSize = 1000; // Up to 1000 texts per request to avoid overwhelming the API
37 |     for (let i = 0; i < texts.length; i += batchSize) {
38 |       const batch = texts.slice(i, i + batchSize);
39 |       const response = await axios.post(
40 |             `${baseUrl}/api/embed`,
41 |             {
42 |               model: embeddingModel.model,
43 |               input: batch,
44 |               options: {
45 |                 num_ctx: embeddingModel.contextSize,
46 |               },
47 |             },
48 |             {
49 |               headers: {
50 |                 "Content-Type": "application/json",
51 |               },
52 |             }
53 |           );
54 |       // Append this batch's embeddings to the accumulated result
55 |       embeddings.push(...response.data.embeddings);
56 |     }
57 | 
58 |     console.error(`Successfully generated ${embeddings.length} embeddings`);
59 |     return embeddings;
60 |   } catch (error) {
61 |     console.error("Error generating embeddings:", error);
62 | 
63 |     // For testing purposes, return mock embeddings if running in test environment
64 |     if (config.ENV === "test") {
65 |       console.error("Using mock embeddings for testing");
66 |       return texts.map(() => generateMockEmbedding(embeddingModel.dimensions));
67 |     }
68 | 
69 |     throw error;
70 |   }
71 | }
72 | 
73 | /**
74 |  * Generate a simple mock embedding vector for testing
75 |  * @param dimensions The number of dimensions in the embedding vector
76 |  * @returns A normalized random vector of the specified dimensions
77 |  */
78 | function generateMockEmbedding(dimensions: number): number[] {
79 |   // Create a random vector
80 |   const vector = Array.from({ length: dimensions }, () => Math.random() - 0.5);
81 | 
82 |   // Normalize the vector
83 |   const magnitude = Math.sqrt(vector.reduce((sum, val) => sum + val * val, 0));
84 |   return vector.map((val) => val / magnitude);
85 | }
86 | 
```

--------------------------------------------------------------------------------
/utils/repoConfig.ts:
--------------------------------------------------------------------------------

```typescript
  1 | import { existsSync, writeFileSync, readFileSync, mkdirSync } from 'fs';
  2 | import { join, basename } from 'path';
  3 | import { createHash } from 'crypto';
  4 | import config from '../config.js';
  5 | 
  6 | interface RepoConfig {
  7 |   url: string;
  8 |   localPath?: string;
  9 |   lastAccessed: number;
 10 |   type: 'local' | 'remote' | 'cached';
 11 |   branch?: string;
 12 | }
 13 | 
 14 | export class RepositoryConfigManager {
 15 |   private configDir: string;
 16 | 
 17 |   constructor() {
 18 |     this.configDir = config.REPO_CONFIG_DIR;
 19 |     if (!existsSync(this.configDir)) {
 20 |       mkdirSync(this.configDir, { recursive: true });
 21 |     }
 22 |   }
 23 | 
 24 |   private getConfigPath(repoUrl: string): string {
 25 |     const hash = createHash('md5').update(repoUrl).digest('hex');
 26 |     return join(this.configDir, `${hash}.json`);
 27 |   }
 28 | 
 29 |   private sanitizeLocalPath(repoUrl: string): string | null {
 30 |     if (repoUrl.startsWith('file://')) {
 31 |       const localPath = repoUrl.replace('file://', '');
 32 |       return existsSync(localPath) ? localPath : null;
 33 |     }
 34 |     return null;
 35 |   }
 36 | 
 37 |   getRepositoryPath(repoUrl: string, branch?: string): { path: string; config: RepoConfig } {
 38 |     const localPath = this.sanitizeLocalPath(repoUrl);
 39 |     
 40 |     if (localPath) {
 41 |       const repoConfig: RepoConfig = {
 42 |         url: repoUrl,
 43 |         localPath,
 44 |         lastAccessed: Date.now(),
 45 |         type: 'local',
 46 |         branch
 47 |       };
 48 |       
 49 |       this.saveConfig(repoUrl, repoConfig);
 50 |       return { path: localPath, config: repoConfig };
 51 |     }
 52 | 
 53 |     const configPath = this.getConfigPath(repoUrl);
 54 |     let repoConfig: RepoConfig;
 55 | 
 56 |     if (existsSync(configPath)) {
 57 |       try {
 58 |         repoConfig = JSON.parse(readFileSync(configPath, 'utf8'));
 59 |         repoConfig.lastAccessed = Date.now();
 60 |       } catch {
 61 |         repoConfig = this.createRemoteConfig(repoUrl, branch);
 62 |       }
 63 |     } else {
 64 |       repoConfig = this.createRemoteConfig(repoUrl, branch);
 65 |     }
 66 | 
 67 |     this.saveConfig(repoUrl, repoConfig);
 68 |     return { path: repoConfig.localPath || '', config: repoConfig };
 69 |   }
 70 | 
 71 |   private createRemoteConfig(repoUrl: string, branch?: string): RepoConfig {
 72 |     const repoName = basename(repoUrl.replace('.git', ''));
 73 |     const cacheDir = join(this.configDir, 'cache');
 74 |     
 75 |     if (!existsSync(cacheDir)) {
 76 |       mkdirSync(cacheDir, { recursive: true });
 77 |     }
 78 | 
 79 |     return {
 80 |       url: repoUrl,
 81 |       localPath: join(cacheDir, repoName),
 82 |       lastAccessed: Date.now(),
 83 |       type: 'remote',
 84 |       branch
 85 |     };
 86 |   }
 87 | 
 88 |   private saveConfig(repoUrl: string, config: RepoConfig): void {
 89 |     const configPath = this.getConfigPath(repoUrl);
 90 |     writeFileSync(configPath, JSON.stringify(config, null, 2));
 91 |   }
 92 | 
 93 |   isLocalRepository(repoUrl: string): boolean {
 94 |     return repoUrl.startsWith('file://');
 95 |   }
 96 | 
 97 |   needsCloning(repoUrl: string): boolean {
 98 |     if (this.isLocalRepository(repoUrl)) {
 99 |       return false;
100 |     }
101 |     
102 |     const { config } = this.getRepositoryPath(repoUrl);
103 |     return !config.localPath || !existsSync(config.localPath);
104 |   }
105 | 
106 |   getRepoType(repoUrl: string): 'local' | 'remote' {
107 |     return this.isLocalRepository(repoUrl) ? 'local' : 'remote';
108 |   }
109 | }
110 | 
111 | export const repoConfigManager = new RepositoryConfigManager();
112 | 
```

--------------------------------------------------------------------------------
/index.ts:
--------------------------------------------------------------------------------

```typescript
  1 | #!/usr/bin/env node
  2 | import { Server } from "@modelcontextprotocol/sdk/server/index.js";
  3 | import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
  4 | import {
  5 |   CallToolRequestSchema,
  6 |   ErrorCode,
  7 |   ListToolsRequestSchema,
  8 |   McpError,
  9 | } from "@modelcontextprotocol/sdk/types.js";
 10 | import { QueryRepoSchema, queryRepo } from "./tools/queryRepo.js";
 11 | import { zodToJsonSchema } from "zod-to-json-schema";
 12 | import { z } from "zod";
 13 | import { ProgressNotifier } from "utils/types.js";
 14 | 
 15 | enum ToolName {
 16 |   QUERY_REPO = "query_repo",
 17 | }
 18 | 
 19 | class CodeContextServer {
 20 |   private server: Server;
 21 | 
 22 |   constructor() {
 23 |     this.server = new Server(
 24 |       {
 25 |         name: "code-context-mcp",
 26 |         version: "0.1.0",
 27 |       },
 28 |       {
 29 |         capabilities: {
 30 |           tools: {},
 31 |         },
 32 |       }
 33 |     );
 34 | 
 35 |     this.setupToolHandlers();
 36 | 
 37 |     // Error handling
 38 |     this.server.onerror = (error) => console.error("[MCP Error]", error);
 39 |     process.on("SIGINT", async () => {
 40 |       await this.server.close();
 41 |       process.exit(0);
 42 |     });
 43 |   }
 44 | 
 45 |   private setupToolHandlers() {
 46 |     this.server.setRequestHandler(ListToolsRequestSchema, async () => ({
 47 |       tools: [
 48 |         {
 49 |           name: ToolName.QUERY_REPO,
 50 |           description: "Queries a git repository using semantic and keyword search. Use keywords and file patterns if you want to targer specific files or terms",
 51 |           inputSchema: zodToJsonSchema(QueryRepoSchema),
 52 |         },
 53 |       ],
 54 |     }));
 55 | 
 56 |     this.server.setRequestHandler(CallToolRequestSchema, async (request) => {
 57 |       const { name, arguments: input } = request.params;
 58 |       const progressToken = request.params._meta?.progressToken;
 59 | 
 60 |       switch (name) {
 61 |         case ToolName.QUERY_REPO:
 62 |           try {
 63 |             // Create a progress notifier if we have a progress token
 64 |             let progressNotifier: ProgressNotifier | undefined;
 65 |             
 66 |             if (progressToken !== undefined) {
 67 |               progressNotifier = {
 68 |                 sendProgress: async (progress: number, total: number) => {
 69 |                   await this.server.notification({
 70 |                     method: "notifications/progress",
 71 |                     params: {
 72 |                       progress: Math.floor(progress * 100),
 73 |                       total: total * 100,
 74 |                       progressToken,
 75 |                     },
 76 |                   });
 77 |                 },
 78 |               };
 79 |             }
 80 |             
 81 |             // Get the raw result from queryRepo with progress notifications
 82 |             const result = await queryRepo(
 83 |               input as z.infer<typeof QueryRepoSchema>,
 84 |               progressNotifier
 85 |             );
 86 |             
 87 |             // Format the response in Claude's expected structure
 88 |             return {
 89 |               content: [
 90 |                 {
 91 |                   type: "text",
 92 |                   text: JSON.stringify(result),
 93 |                 },
 94 |               ],
 95 |             };
 96 |           } catch (error) {
 97 |             console.error("Error in query_repo:", error);
 98 |             return {
 99 |               content: [
100 |                 {
101 |                   type: "text",
102 |                   text: `Error executing query: ${error instanceof Error ? error.message : String(error)}`,
103 |                 },
104 |               ],
105 |             };
106 |           }
107 |         default:
108 |           throw new McpError(ErrorCode.MethodNotFound, `Unknown tool: ${name}`);
109 |       }
110 |     });
111 |   }
112 | 
113 |   async run() {
114 |     const transport = new StdioServerTransport();
115 |     await this.server.connect(transport);
116 |     console.error("Code Context MCP server running on stdio");
117 |   }
118 | }
119 | 
120 | const server = new CodeContextServer();
121 | server.run().catch(console.error);
122 | 
```

--------------------------------------------------------------------------------
/utils/db.ts:
--------------------------------------------------------------------------------

```typescript
  1 | import Database from "better-sqlite3";
  2 | import fs from "fs";
  3 | import path from "path";
  4 | import config from "../config.js";
  5 | 
  6 | // Ensure the data directory exists
  7 | const DATA_DIR = config.DATA_DIR;
  8 | if (!fs.existsSync(DATA_DIR)) {
  9 |   fs.mkdirSync(DATA_DIR, { recursive: true });
 10 | }
 11 | 
 12 | const DB_PATH = path.join(DATA_DIR, "code_context.db");
 13 | const db = new Database(DB_PATH);
 14 | 
 15 | console.error(`Using db at: ${DB_PATH}`)
 16 | 
 17 | // Enable foreign keys
 18 | db.pragma("foreign_keys = ON");
 19 | 
 20 | // DDL executed by initializeDatabase(): repository, branch, file, branch_file_association and file_chunk tables, each created only if missing.
 21 | export const SCHEMA_SQL = `
 22 | CREATE TABLE IF NOT EXISTS repository (
 23 |   id INTEGER PRIMARY KEY AUTOINCREMENT,
 24 |   name TEXT NOT NULL,
 25 |   path TEXT NOT NULL,
 26 |   last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
 27 |   UNIQUE(path)
 28 | );
 29 | 
 30 | CREATE TABLE IF NOT EXISTS branch (
 31 |   id INTEGER PRIMARY KEY AUTOINCREMENT,
 32 |   name TEXT NOT NULL,
 33 |   repository_id INTEGER NOT NULL,
 34 |   last_commit_sha TEXT NOT NULL,
 35 |   status TEXT CHECK(status IN ('pending', 'files_processed', 'embeddings_generated')) DEFAULT 'pending',
 36 |   created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
 37 |   updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
 38 |   FOREIGN KEY (repository_id) REFERENCES repository(id) ON DELETE CASCADE,
 39 |   UNIQUE(name, repository_id)
 40 | );
 41 | 
 42 | CREATE TABLE IF NOT EXISTS file (
 43 |   id INTEGER PRIMARY KEY AUTOINCREMENT,
 44 |   repository_id INTEGER NOT NULL,
 45 |   path TEXT NOT NULL,
 46 |   name TEXT NOT NULL,
 47 |   sha TEXT NOT NULL,
 48 |   status TEXT CHECK(status IN ('pending', 'fetched', 'ingested', 'done')) DEFAULT 'pending',
 49 |   created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
 50 |   updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
 51 |   FOREIGN KEY (repository_id) REFERENCES repository(id) ON DELETE CASCADE,
 52 |   UNIQUE(repository_id, path, sha)
 53 | );
 54 | 
 55 | CREATE TABLE IF NOT EXISTS branch_file_association (
 56 |   branch_id INTEGER NOT NULL,
 57 |   file_id INTEGER NOT NULL,
 58 |   PRIMARY KEY (branch_id, file_id),
 59 |   FOREIGN KEY (branch_id) REFERENCES branch(id) ON DELETE CASCADE,
 60 |   FOREIGN KEY (file_id) REFERENCES file(id) ON DELETE CASCADE
 61 | );
 62 | 
 63 | CREATE TABLE IF NOT EXISTS file_chunk (
 64 |   id INTEGER PRIMARY KEY AUTOINCREMENT,
 65 |   file_id INTEGER NOT NULL,
 66 |   content TEXT NOT NULL,
 67 |   chunk_number INTEGER NOT NULL,
 68 |   embedding TEXT,
 69 |   model_version TEXT,
 70 |   token_count INTEGER,
 71 |   created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
 72 |   updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
 73 |   FOREIGN KEY (file_id) REFERENCES file(id) ON DELETE CASCADE,
 74 |   UNIQUE(file_id, chunk_number)
 75 | );
 76 | `;
 77 | 
 78 | /**
 79 |  * Creates every table declared in SCHEMA_SQL if it does not exist yet.
 80 |  *
 81 |  * The whole script is handed to `db.exec`, which runs multi-statement SQL,
 82 |  * instead of being split on ";" by hand: a naive split would cut any future
 83 |  * statement containing a semicolon (string literal, trigger body) in two.
 84 |  *
 85 |  * @throws Re-throws whatever error the database raises, after logging it.
 86 |  */
 87 | export const initializeDatabase = () => {
 88 |   try {
 89 |     db.exec(SCHEMA_SQL);
 90 |   } catch (error) {
 91 |     console.error("Error initializing database:", error);
 92 |     throw error;
 93 |   }
 94 | };
 95 | 
 96 | // Prepares `sql`, binds `params` (an empty object when omitted) and executes it,
 97 | // returning whatever the prepared statement's run() returns.
 98 | const run = (sql: string, params: any = {}) =>
 99 |   db.prepare(sql).run(params);
100 | 
101 | // Prepares `sql`, binds `params` (an empty object when omitted) and returns
102 | // whatever the prepared statement's get() yields for it.
103 | const get = (sql: string, params: any = {}) =>
104 |   db.prepare(sql).get(params);
105 | 
106 | // Prepares `sql`, binds `params` (an empty object when omitted) and returns
107 | // whatever the prepared statement's all() yields for it.
108 | const all = (sql: string, params: any = {}) =>
109 |   db.prepare(sql).all(params);
110 | 
111 | // Minimal statement-preparing surface intended for transaction callbacks (note: transaction() below still types its callback argument as any).
112 | export interface DatabaseOperations {
113 |   prepare: (sql: string) => {
114 |     run: (params?: any) => any;
115 |     get: (params?: any) => any;
116 |     all: (params?: any) => any;
117 |   };
118 | }
119 | 
120 | // Wraps `cb` with db.transaction() and invokes the wrapper immediately, passing
121 | // the underlying connection as its only argument; the wrapper's return value
122 | // is handed back to the caller.
123 | const transaction = (cb: (dbOps: any) => any): any =>
124 |   db.transaction(cb)(db);
125 | 
126 | // Public surface of this module: parameterised run/get/all helpers, a transaction wrapper and close().
127 | export interface DatabaseInterface {
128 |   run: (sql: string, params?: any) => any;
129 |   get: (sql: string, params?: any) => any;
130 |   all: (sql: string, params?: any) => any;
131 |   transaction: (cb: (dbOps: any) => any) => any;
132 |   close: () => void;
133 | }
134 | 
135 | // Create the schema as a side effect of importing this module, before the helpers are handed out
136 | initializeDatabase();
137 | 
138 | // Bundle the helpers (plus a close() that closes the shared connection) into the default export
139 | const dbInterface: DatabaseInterface = {
140 |   run,
141 |   get,
142 |   all,
143 |   transaction,
144 |   close: () => db.close(),
145 | };
146 | 
147 | export default dbInterface;
148 | 
```

--------------------------------------------------------------------------------
/tools/embedFiles.ts:
--------------------------------------------------------------------------------

```typescript
  1 | import { z } from "zod";
  2 | import dbInterface from "../utils/db.js";
  3 | import { generateOllamaEmbeddings } from "../utils/ollamaEmbeddings.js";
  4 | import { ProgressNotifier } from "../utils/types.js";
  5 | import config from "../config.js";
  6 | 
  7 | // Input schema for embedFiles: repoLocalPath, numeric branchId, and an optional _meta.progressToken (string or number).
  8 | export const EmbedFilesSchema = z.object({
  9 |   repoLocalPath: z.string().describe("Local path to the cloned repository"),
 10 |   branchId: z.number().describe("Branch ID in the database"),
 11 |   _meta: z
 12 |     .object({
 13 |       progressToken: z.union([z.string(), z.number()]).optional(),
 14 |     })
 15 |     .optional(),
 16 | });
 17 | 
 18 | // Row shape selected for each chunk awaiting an embedding: file_chunk id and content plus the owning file's id.
 19 | interface Chunk {
 20 |   id: number;
 21 |   content: string;
 22 |   file_id: number;
 23 | }
 24 | 
 25 | /**
 26 |  * Flags a branch as having all of its embeddings generated.
 27 |  *
 28 |  * @param branchId - Primary key of the row in the `branch` table.
 29 |  */
 30 | function markEmbeddingsGenerated(branchId: number): void {
 31 |   console.error(`[embedFiles] Setting branch status to 'embeddings_generated'`);
 32 |   dbInterface.run(
 33 |     "UPDATE branch SET status = 'embeddings_generated' WHERE id = ?",
 34 |     branchId
 35 |   );
 36 | }
 37 | 
 38 | /**
 39 |  * Generates and stores embeddings for every chunk of a branch that has none.
 40 |  *
 41 |  * Chunks are embedded in batches of 100; each batch is written back inside a
 42 |  * single transaction together with the configured embedding model name. Once
 43 |  * the loop finishes (or when there was nothing to embed) the branch status is
 44 |  * set to 'embeddings_generated'.
 45 |  *
 46 |  * @param input - Tool input; only `branchId` is read here.
 47 |  * @param progressNotifier - Optional sink that receives progress as a
 48 |  *   fraction of 1 after every batch.
 49 |  * @returns `{ success: true, chunksProcessed }` on success, or
 50 |  *   `{ error: { message } }` when input is missing, the branch is unknown,
 51 |  *   or any step throws.
 52 |  */
 53 | export async function embedFiles(
 54 |   input: z.infer<typeof EmbedFilesSchema>,
 55 |   progressNotifier?: ProgressNotifier
 56 | ) {
 57 |   try {
 58 |     console.error(
 59 |       `[embedFiles] Starting with parameters: ${JSON.stringify(input)}`
 60 |     );
 61 | 
 62 |     // Guard against callers that pass no input at all
 63 |     if (!input) {
 64 |       console.error(`[embedFiles] Error: Input parameters are undefined`);
 65 |       return {
 66 |         error: {
 67 |           message: "Input parameters are required for embedFiles tool",
 68 |         },
 69 |       };
 70 |     }
 71 | 
 72 |     const startTime = Date.now();
 73 |     const { branchId } = input;
 74 | 
 75 |     // The branch must already be registered
 76 |     const branchExists = dbInterface.get(
 77 |       "SELECT id, status FROM branch WHERE id = ?",
 78 |       branchId
 79 |     );
 80 | 
 81 |     if (!branchExists) {
 82 |       console.error(`[embedFiles] Error: Branch with ID ${branchId} does not exist`);
 83 |       return {
 84 |         error: {
 85 |           message: `Branch with ID ${branchId} does not exist`,
 86 |         },
 87 |       };
 88 |     }
 89 | 
 90 |     // A branch without files has nothing to embed but is still marked as done
 91 |     const fileCount = dbInterface.get(
 92 |       "SELECT COUNT(*) as count FROM branch_file_association WHERE branch_id = ?",
 93 |       branchId
 94 |     );
 95 | 
 96 |     if (!fileCount || fileCount.count === 0) {
 97 |       console.error(`[embedFiles] No files found for branch ${branchId}`);
 98 |       markEmbeddingsGenerated(branchId);
 99 |       return { success: true, chunksProcessed: 0 };
100 |     }
101 | 
102 |     // Collect the branch's chunks whose embedding column is still NULL
103 |     console.error(`[embedFiles] Finding chunks that need embeddings for branch ${branchId}`);
104 |     const chunks: Chunk[] = dbInterface.all(
105 |       `SELECT fc.id, fc.content, f.id as file_id
106 |        FROM file_chunk fc
107 |        JOIN file f ON fc.file_id = f.id
108 |        JOIN branch_file_association bfa ON f.id = bfa.file_id
109 |        WHERE bfa.branch_id = ?
110 |        AND fc.embedding IS NULL`,
111 |       branchId
112 |     );
113 | 
114 |     if (chunks.length === 0) {
115 |       console.error(`[embedFiles] No chunks need embeddings, skipping`);
116 |       markEmbeddingsGenerated(branchId);
117 | 
118 |       if (progressNotifier) {
119 |         await progressNotifier.sendProgress(1, 1);
120 |       }
121 |       return { success: true, chunksProcessed: 0 };
122 |     }
123 | 
124 |     console.error(`[embedFiles] Found ${chunks.length} chunks that need embeddings`);
125 | 
126 |     const BATCH_SIZE = 100;
127 |     const totalChunks = chunks.length;
128 |     const totalBatches = Math.ceil(totalChunks / BATCH_SIZE);
129 |     let processedChunks = 0;
130 | 
131 |     // Process chunks in batches of BATCH_SIZE
132 |     for (let i = 0; i < totalChunks; i += BATCH_SIZE) {
133 |       const batch = chunks.slice(i, i + BATCH_SIZE);
134 |       console.error(
135 |         `[embedFiles] Processing batch ${Math.floor(i / BATCH_SIZE) + 1}/${totalBatches}`
136 |       );
137 | 
138 |       // Generate embeddings for the batch
139 |       const chunkContents = batch.map((chunk) => chunk.content);
140 |       console.error(`[embedFiles] Generating embeddings for ${batch.length} chunks`);
141 |       const embeddingStartTime = Date.now();
142 |       const embeddings = await generateOllamaEmbeddings(chunkContents);
143 |       console.error(
144 |         `[embedFiles] Generated embeddings in ${Date.now() - embeddingStartTime}ms`
145 |       );
146 | 
147 |       // Refuse a short result: every chunk in the batch needs its own vector,
148 |       // and a missing one would otherwise reach the UPDATE as `undefined`.
149 |       const received = Array.isArray(embeddings) ? embeddings.length : 0;
150 |       if (received < batch.length) {
151 |         throw new Error(
152 |           `Expected ${batch.length} embeddings but received ${received}`
153 |         );
154 |       }
155 | 
156 |       // Store the batch's embeddings in one transaction
157 |       console.error(`[embedFiles] Storing embeddings`);
158 |       dbInterface.transaction((db) => {
159 |         const updateStmt = db.prepare(
160 |           `UPDATE file_chunk 
161 |            SET embedding = ?, model_version = ? 
162 |            WHERE id = ?`
163 |         );
164 |         batch.forEach((chunk, j) => {
165 |           updateStmt.run(
166 |             JSON.stringify(embeddings[j]),
167 |             config.EMBEDDING_MODEL.model,
168 |             chunk.id
169 |           );
170 |         });
171 |       });
172 | 
173 |       processedChunks += batch.length;
174 | 
175 |       // Report progress as a fraction of all chunks
176 |       if (progressNotifier) {
177 |         await progressNotifier.sendProgress(processedChunks / totalChunks, 1);
178 |       }
179 |     }
180 | 
181 |     markEmbeddingsGenerated(branchId);
182 | 
183 |     console.error(
184 |       `[embedFiles] Processed ${processedChunks} chunks in ${
185 |         Date.now() - startTime
186 |       }ms`
187 |     );
188 | 
189 |     return {
190 |       success: true,
191 |       chunksProcessed: processedChunks,
192 |     };
193 |   } catch (error) {
194 |     console.error(`[embedFiles] Error executing tool:`, error);
195 |     return {
196 |       error: {
197 |         message: `Error executing embedFiles tool: ${
198 |           error instanceof Error ? error.message : String(error)
199 |         }`,
200 |       },
201 |     };
202 |   }
203 | }
181 | 
```

--------------------------------------------------------------------------------
/tools/ingestBranch.ts:
--------------------------------------------------------------------------------

```typescript
  1 | import { z } from "zod";
  2 | import { simpleGit } from "simple-git";
  3 | import path from "path";
  4 | import fs from "fs";
  5 | import dbInterface from "../utils/db.js";
  6 | import { ProgressNotifier } from "../utils/types.js";
  7 | import config from "../config.js";
  8 | import { repoConfigManager } from "../utils/repoConfig.js";
  9 | 
 10 | // Input schema for ingestBranch: required repoUrl, optional branch name, and an optional _meta.progressToken (string or number).
 11 | export const IngestBranchSchema = z.object({
 12 |   repoUrl: z.string().describe("GitHub repository URL"),
 13 |   branch: z
 14 |     .string()
 15 |     .optional()
 16 |     .describe("Branch name to query (defaults to repository's default branch)"),
 17 |   _meta: z
 18 |     .object({
 19 |       progressToken: z.union([z.string(), z.number()]).optional(),
 20 |     })
 21 |     .optional(),
 22 | });
 23 | 
 24 | // Chunk shape (text, ordinal, token count); NOTE(review): nothing else in this file references it.
 25 | interface Chunk {
 26 |   content: string;
 27 |   chunkNumber: number;
 28 |   tokenCount: number;
 29 | }
 30 | 
 31 | 
 32 | /**
 33 |  * Clones `repoUrl` into `<localPath>/<repo name>` unless that directory already exists.
 34 |  *
 35 |  * @returns The directory the repository lives in.
 36 |  */
 37 | const cloneRepository = async (
 38 |   repoUrl: string,
 39 |   localPath: string
 40 | ): Promise<string> => {
 41 |   // The target folder is named after the last URL segment, minus a ".git" suffix.
 42 |   const targetDir = path.join(localPath, path.basename(repoUrl, ".git"));
 43 | 
 44 |   if (!fs.existsSync(targetDir)) {
 45 |     console.error(`Cloning repository ${repoUrl} to ${targetDir}`);
 46 |     await simpleGit().clone(repoUrl, targetDir);
 47 |   } else {
 48 |     console.error(`Repository already exists at ${targetDir}`);
 49 |   }
 50 | 
 51 |   return targetDir;
 52 | };
 53 | 
 54 | /**
 55 |  * Clones a repository while emitting synthetic progress for the clone phase.
 56 |  *
 57 |  * cloneRepository reports nothing while it runs, so a timer nudges the
 58 |  * reported value up by 2% every 1.5 s, capped at 30%; a successful clone then
 59 |  * reports 33%. Timer-driven progress is best-effort: a notifier failure inside
 60 |  * the timer is logged instead of surfacing as an unhandled promise rejection.
 61 |  *
 62 |  * @param repoUrl - Repository URL handed to cloneRepository.
 63 |  * @param reposDir - Parent directory the repository is cloned into.
 64 |  * @param progressNotifier - Optional progress sink; without it no timer runs
 65 |  *   and nothing is reported.
 66 |  * @returns The local path returned by cloneRepository.
 67 |  */
 68 | async function cloneRepositoryWithProgress(
 69 |   repoUrl: string,
 70 |   reposDir: string,
 71 |   progressNotifier?: ProgressNotifier
 72 | ): Promise<string> {
 73 |   // Without a notifier there is nothing to report: just clone.
 74 |   if (!progressNotifier) {
 75 |     return cloneRepository(repoUrl, reposDir);
 76 |   }
 77 | 
 78 |   // Captured in a const so the timer callback sees a definitely-defined notifier.
 79 |   const notifier = progressNotifier;
 80 |   const progressInterval = 1500; // 1.5 seconds between updates
 81 |   const maxProgress = 0.3; // Timer-driven progress stops at 30%
 82 |   const progressStep = 0.02; // Increments of 2%
 83 | 
 84 |   // Send initial progress notification (start of cloning)
 85 |   await notifier.sendProgress(0, 1);
 86 | 
 87 |   let progressPercentage = 0;
 88 |   let isCloning = true;
 89 | 
 90 |   const timer = setInterval(async () => {
 91 |     if (!isCloning || progressPercentage >= maxProgress) return;
 92 | 
 93 |     // Clamp so accumulated floating-point error never overshoots the cap.
 94 |     progressPercentage = Math.min(progressPercentage + progressStep, maxProgress);
 95 |     try {
 96 |       await notifier.sendProgress(progressPercentage, 1);
 97 |     } catch (error) {
 98 |       // Nobody awaits this callback, so a rejection must be handled here.
 99 |       console.error(`[ingestBranch] Failed to send clone progress:`, error);
100 |     }
101 |   }, progressInterval);
102 | 
103 |   try {
104 |     const repoLocalPath = await cloneRepository(repoUrl, reposDir);
105 | 
106 |     // Silence the timer before announcing the end of the clone phase (33%).
107 |     isCloning = false;
108 |     await notifier.sendProgress(0.33, 1);
109 | 
110 |     return repoLocalPath;
111 |   } finally {
112 |     // Runs on success and on failure, so the interval never outlives the clone.
113 |     isCloning = false;
114 |     clearInterval(timer);
115 |   }
116 | }
104 | 
105 | /**
106 |  * Resolves the repository and branch for a request and registers both in the database.
107 |  *
108 |  * Remote repositories are cloned when the repo config manager reports that a
109 |  * clone is needed; local and cached ones are used as they are. The requested
110 |  * branch (or the currently checked-out one when none is given) is checked out
111 |  * and its latest commit is compared with the stored SHA. No files are
112 |  * processed here: the result only tells the caller whether processing is due.
113 |  *
114 |  * @param input - Tool input; `repoUrl` is required, `branch` is optional.
115 |  * @param progressNotifier - Optional progress sink, used only while cloning.
116 |  * @returns Repository and branch identifiers plus `needsUpdate`, or
117 |  *   `{ error: { message } }` when input is missing or any step throws.
118 |  */
119 | export async function ingestBranch(
120 |   input: z.infer<typeof IngestBranchSchema>,
121 |   progressNotifier?: ProgressNotifier
122 | ) {
123 |   try {
124 |     console.error(
125 |       `[ingestBranch] Starting with parameters: ${JSON.stringify(input)}`
126 |     );
127 | 
128 |     // Check if input is defined
129 |     if (!input) {
130 |       console.error(`[ingestBranch] Error: Input parameters are undefined`);
131 |       return {
132 |         error: {
133 |           message: "Input parameters are required for ingestBranch tool",
134 |         },
135 |       };
136 |     }
137 | 
138 |     const startTime = Date.now();
139 |     const { repoUrl, branch } = input;
140 | 
141 |     // Validate required parameters
142 |     if (!repoUrl) {
143 |       console.error(`[ingestBranch] Error: Missing required parameter repoUrl`);
144 |       return {
145 |         error: {
146 |           message: "Required parameter (repoUrl) is missing",
147 |         },
148 |       };
149 |     }
150 | 
151 |     // Get repository path using config manager
152 |     const { path: repoLocalPath, config: repoConfig } = repoConfigManager.getRepositoryPath(repoUrl, branch);
153 |     let actualBranch = branch || "";
154 | 
155 |     console.error(
156 |       `[ingestBranch] Processing repository: ${repoUrl}, type: ${repoConfig.type}, branch: ${actualBranch || 'default'}`
157 |     );
158 | 
159 |     // Local repositories are used in place; remote ones are cloned only when needed
160 |     if (repoConfig.type === 'local') {
161 |       console.error(`[ingestBranch] Using local repository at: ${repoLocalPath}`);
162 |     } else if (repoConfigManager.needsCloning(repoUrl)) {
163 |       console.error(`[ingestBranch] Cloning remote repository to: ${repoLocalPath}`);
164 |       await cloneRepositoryWithProgress(repoUrl, path.dirname(repoLocalPath), progressNotifier);
165 |     } else {
166 |       console.error(`[ingestBranch] Using cached repository at: ${repoLocalPath}`);
167 |     }
168 | 
169 |     // Worded as "ready" because the local and cached paths above never clone
170 |     console.error(
171 |       `[ingestBranch] Repository ready at: ${repoLocalPath} (${
172 |         Date.now() - startTime
173 |       }ms)`
174 |     );
175 | 
176 |     // Initialize git
177 |     const git = simpleGit(repoLocalPath);
178 | 
179 |     // If branch is not specified, use whatever HEAD currently points at
180 |     if (!actualBranch) {
181 |       console.error(`[ingestBranch] Branch not specified, getting default branch`);
182 |       try {
183 |         actualBranch = await git.revparse(['--abbrev-ref', 'HEAD']);
184 |         console.error(`[ingestBranch] Using default branch: ${actualBranch}`);
185 |       } catch (error) {
186 |         console.error(`[ingestBranch] Error getting default branch:`, error);
187 |         // Fallback to 'main' if we can't determine the default branch
188 |         actualBranch = "main";
189 |         console.error(`[ingestBranch] Falling back to branch: ${actualBranch}`);
190 |       }
191 |     }
192 | 
193 |     // Checkout the branch
194 |     console.error(`[ingestBranch] Checking out branch: ${actualBranch}`);
195 |     await git.checkout(actualBranch);
196 |     const latestCommit = await git.revparse([actualBranch]);
197 |     console.error(`[ingestBranch] Latest commit SHA: ${latestCommit}`);
198 | 
199 |     // Extract repo name from URL
200 |     const repoName = path.basename(repoUrl, ".git");
201 | 
202 |     // Check if repo exists in database (matched by name)
203 |     console.error(
204 |       `[ingestBranch] Checking if repo exists in database: ${repoName}`
205 |     );
206 |     const repoExists = dbInterface.get(
207 |       "SELECT id FROM repository WHERE name = ?",
208 |       repoName
209 |     );
210 | 
211 |     let repoId;
212 |     if (repoExists) {
213 |       repoId = repoExists.id;
214 |       console.error(
215 |         `[ingestBranch] Repository found in database with ID: ${repoId}`
216 |       );
217 |     } else {
218 |       // Register repository
219 |       console.error(`[ingestBranch] Registering new repository: ${repoName}`);
220 |       const result = dbInterface.run(
221 |         "INSERT INTO repository (name, path) VALUES (?, ?)",
222 |         [repoName, repoLocalPath]
223 |       );
224 |       repoId = result.lastInsertRowid;
225 |       console.error(`[ingestBranch] Repository registered with ID: ${repoId}`);
226 |     }
227 | 
228 |     // Check if branch exists and has the same commit SHA
229 |     console.error(`[ingestBranch] Checking if branch exists in database`);
230 |     const branchExists = dbInterface.get(
231 |       "SELECT id, last_commit_sha, status FROM branch WHERE name = ? AND repository_id = ?",
232 |       [actualBranch, repoId]
233 |     );
234 | 
235 |     let branchId;
236 |     let needsUpdate = false;
237 | 
238 |     if (branchExists) {
239 |       branchId = branchExists.id;
240 |       console.error(
241 |         `[ingestBranch] Branch found in database with ID: ${branchId}`
242 |       );
243 | 
244 |       // Step 1: a new commit resets the stored SHA and puts the branch back to 'pending'
245 |       if (branchExists.last_commit_sha !== latestCommit) {
246 |         console.error(`[ingestBranch] Commit SHA changed, updating branch: ${branchId}`);
247 |         dbInterface.run(
248 |           "UPDATE branch SET last_commit_sha = ?, status = 'pending' WHERE id = ?",
249 |           [latestCommit, branchId]
250 |         );
251 |         needsUpdate = true;
252 |       }
253 | 
254 |       // Step 2: a status other than 'embeddings_generated' (as read before the update above) also needs processing
255 |       if (branchExists.status !== 'embeddings_generated') {
256 |         console.error(`[ingestBranch] Branch status is "${branchExists.status}" not "embeddings_generated", needs processing`);
257 |         needsUpdate = true;
258 |       }
259 | 
260 |       if (!needsUpdate) {
261 |         console.error(`[ingestBranch] No changes needed, skipping update`);
262 |       }
263 |     } else {
264 |       // Register the branch
265 |       console.error(`[ingestBranch] Registering new branch: ${actualBranch}`);
266 |       const result = dbInterface.run(
267 |         "INSERT INTO branch (name, repository_id, last_commit_sha, status) VALUES (?, ?, ?, 'pending')",
268 |         [actualBranch, repoId, latestCommit]
269 |       );
270 |       branchId = result.lastInsertRowid;
271 |       needsUpdate = true;
272 |       console.error(`[ingestBranch] Branch registered with ID: ${branchId}`);
273 |     }
274 | 
275 |     // We don't process files directly here, just return the state
276 |     // The actual file processing will happen in processFiles.ts
277 |     return {
278 |       repoLocalPath,
279 |       repoId,
280 |       branchId,
281 |       needsUpdate,
282 |       repoName,
283 |       actualBranch,
284 |       latestCommit
285 |     };
286 |   } catch (error) {
287 |     console.error(`[ingestBranch] Error executing tool:`, error);
288 |     return {
289 |       error: {
290 |         message: `Error executing ingestBranch tool: ${
291 |           error instanceof Error ? error.message : String(error)
292 |         }`,
293 |       },
294 |     };
295 |   }
296 | }
288 | 
```

--------------------------------------------------------------------------------
/tools/queryRepo.ts:
--------------------------------------------------------------------------------

```typescript
  1 | import { z } from "zod";
  2 | import dbInterface from "../utils/db.js";
  3 | import { generateOllamaEmbeddings } from "../utils/ollamaEmbeddings.js";
  4 | import { createFilePatternCondition } from "../utils/filePatternMatcher.js";
  5 | import { ProgressNotifier } from "../utils/types.js";
  6 | import { ingestBranch } from "./ingestBranch.js";
  7 | import { processFiles } from "./processFiles.js";
  8 | 
  9 | // Input schema for queryRepo: repo/branch selection, semantic and keyword queries, include/exclude glob filters, a result limit, and an optional progress token.
 10 | export const QueryRepoSchema = z.object({
 11 |   repoUrl: z.string().describe("GitHub repository URL"),
 12 |   branch: z
 13 |     .string()
 14 |     .optional()
 15 |     .describe("Branch name to query (defaults to repository's default branch)"),
 16 |   semanticSearch: z.string().describe("Query for semantic search. This search is not exact, it will try to find the most relevant files, it doesn't accept file: or path: prefixes."),
 17 |   keywordsSearch: z
 18 |     .array(z.string())
 19 |     .describe(
 20 |       "Search to the files that contain at least one of the keywords in this list. Leave empty to disable. This can work in conjunction with the semantic search."
 21 |     ),
 22 |   filePatterns: z
 23 |     .array(z.string())
 24 |     .describe(
 25 |       "Array of glob patterns to filter files (e.g. '**/*.ts', 'src/*.js'). Use it for a more effective search or to target specific files for example 'somefile.tsx'. Leave empty to disable"
 26 |     ),
 27 |   excludePatterns: z
 28 |     .array(z.string())
 29 |     .optional()
 30 |     .describe(
 31 |       "Array of glob patterns to exclude files (e.g. '**/node_modules/**', '**/dist/**'). Use it to exclude files that are not relevant to the search. Leave empty to disable"
 32 |     ),
 33 |   limit: z.number().optional().describe("Maximum number of results to return"),
 34 |   _meta: z
 35 |     .object({
 36 |       progressToken: z.union([z.string(), z.number()]).optional(),
 37 |     })
 38 |     .optional(),
 39 | });
 40 | 
 41 | /**
 42 |  * Wraps a progress notifier so the latest progress value is re-sent periodically.
 43 |  *
 44 |  * Every `heartbeatMs` the wrapper checks how long ago the caller last reported
 45 |  * progress; if at least `heartbeatMs` has elapsed it repeats the last known
 46 |  * progress/max pair. A failing heartbeat is logged and swallowed so that it
 47 |  * cannot become an unhandled promise rejection inside the timer.
 48 |  *
 49 |  * @param originalNotifier - Notifier to forward to; when omitted, a no-op
 50 |  *   notifier is returned and no timer is started.
 51 |  * @param heartbeatMs - Interval between heartbeat checks, in milliseconds.
 52 |  * @returns The wrapping notifier and a function that stops the timer.
 53 |  */
 54 | function createHeartbeatNotifier(originalNotifier?: ProgressNotifier, heartbeatMs: number = 2000): {
 55 |   notifier: ProgressNotifier;
 56 |   stopHeartbeat: () => void;
 57 | } {
 58 |   if (!originalNotifier) {
 59 |     return {
 60 |       notifier: {
 61 |         sendProgress: async () => {} // No-op if no original notifier
 62 |       },
 63 |       stopHeartbeat: () => {}
 64 |     };
 65 |   }
 66 | 
 67 |   // Captured in a const so the closures below see a definitely-defined notifier.
 68 |   const target = originalNotifier;
 69 |   let currentProgress = 0;
 70 |   let currentMax = 1;
 71 |   let isActive = true;
 72 |   let lastUpdate = Date.now();
 73 | 
 74 |   // Heartbeat interval
 75 |   const intervalId = setInterval(async () => {
 76 |     if (!isActive) return;
 77 | 
 78 |     // Only send if it's been at least heartbeatMs since the caller's last update
 79 |     if (Date.now() - lastUpdate < heartbeatMs) return;
 80 | 
 81 |     console.error(`[queryRepo] Heartbeat progress: ${currentProgress}/${currentMax}`);
 82 |     try {
 83 |       await target.sendProgress(currentProgress, currentMax);
 84 |     } catch (error) {
 85 |       // Nobody awaits this callback, so a rejection must be handled here.
 86 |       console.error(`[queryRepo] Heartbeat progress failed:`, error);
 87 |     }
 88 |   }, heartbeatMs);
 89 | 
 90 |   return {
 91 |     notifier: {
 92 |       sendProgress: async (progress: number, max: number) => {
 93 |         // Remember the latest values so the heartbeat can repeat them.
 94 |         currentProgress = progress;
 95 |         currentMax = max;
 96 |         lastUpdate = Date.now();
 97 |         await target.sendProgress(progress, max);
 98 |       }
 99 |     },
100 |     stopHeartbeat: () => {
101 |       isActive = false;
102 |       clearInterval(intervalId);
103 |     }
104 |   };
105 | }
 86 | 
 87 | export async function queryRepo(
 88 |   input: z.infer<typeof QueryRepoSchema>,
 89 |   progressNotifier?: ProgressNotifier
 90 | ) {
 91 |   // Create heartbeat notifier that will send regular updates
 92 |   const { notifier: heartbeatNotifier, stopHeartbeat } = createHeartbeatNotifier(progressNotifier);
 93 |   
 94 |   try {
 95 |     console.error(
 96 |       `[queryRepo] Starting with parameters: ${JSON.stringify(input)}`
 97 |     );
 98 | 
 99 |     // Check if input is defined
100 |     if (!input) {
101 |       console.error(`[queryRepo] Error: Input parameters are undefined`);
102 |       return {
103 |         error: {
104 |           message: "Input parameters are required for queryRepo tool",
105 |         },
106 |       };
107 |     }
108 | 
109 |     const startTime = Date.now();
110 | 
111 |     const {
112 |       repoUrl,
113 |       branch,
114 |       semanticSearch: semanticSearchInput,
115 |       keywordsSearch,
116 |       limit,
117 |       filePatterns,
118 |       excludePatterns,
119 |     } = input;
120 | 
121 |     // Validate required parameters
122 |     if (!repoUrl ||(!semanticSearchInput && !keywordsSearch)) {
123 |       console.error(`[queryRepo] Error: Missing required parameters`);
124 |       return {
125 |         error: {
126 |           message: "Required parameters (repoUrl, semanticSearch or keywordsSearch) are missing",
127 |         },
128 |       };
129 |     }
130 | 
131 |     let semanticSearch = semanticSearchInput;
132 |     if(!semanticSearchInput) {
133 |       semanticSearch = keywordsSearch.join(" ");
134 |     }
135 | 
136 |     // Initialize progress at start
137 |     await heartbeatNotifier.sendProgress(0.05, 1);
138 | 
139 |     // Step 1: Ingest the branch (25% of progress)
140 |     console.error(`[queryRepo] Ingesting branch: ${repoUrl}, ${branch || 'default'}`);
141 |     const branchResult = await ingestBranch(
142 |       { 
143 |         repoUrl, 
144 |         branch
145 |       }, 
146 |       undefined // Don't pass progress notifier to individual tools
147 |     );
148 | 
149 |     // Update progress after branch ingestion
150 |     await heartbeatNotifier.sendProgress(0.25, 1);
151 | 
152 |     // Check for error
153 |     if ('error' in branchResult) {
154 |       console.error(`[queryRepo] Error in ingestBranch:`, branchResult.error);
155 |       return { error: branchResult.error };
156 |     }
157 | 
158 |     const branchData = branchResult;
159 | 
160 |     // Step 2: Process files if needed (50% of progress)
161 |     console.error(`[queryRepo] Processing files for branch: ${branchData.branchId}`);
162 |     const filesResult = await processFiles(
163 |       {
164 |         repoLocalPath: branchData.repoLocalPath,
165 |         repoId: branchData.repoId,
166 |         branchId: branchData.branchId,
167 |         actualBranch: branchData.actualBranch,
168 |         needsUpdate: branchData.needsUpdate
169 |       },
170 |       undefined // Don't pass progress notifier to individual tools
171 |     );
172 | 
173 |     // Update progress after file processing
174 |     await heartbeatNotifier.sendProgress(0.5, 1);
175 | 
176 |     // Check for error
177 |     if ('error' in filesResult) {
178 |       console.error(`[queryRepo] Error in processFiles:`, filesResult.error);
179 |       return { error: filesResult.error };
180 |     }
181 | 
182 |     // Generate embedding for the query
183 |     console.error(`[queryRepo] Generating embedding for query: "${semanticSearch}"`);
184 |     const queryEmbedStart = Date.now();
185 |     const [queryEmbedding] = await generateOllamaEmbeddings([semanticSearch]);
186 |     const queryEmbeddingStr = JSON.stringify(queryEmbedding);
187 |     console.error(
188 |       `[queryRepo] Generated query embedding in ${
189 |         Date.now() - queryEmbedStart
190 |       }ms`
191 |     );
192 | 
193 |     // Update progress after query embedding
194 |     await heartbeatNotifier.sendProgress(0.6, 1);
195 | 
196 |     // Search for similar chunks using SQLite's JSON functions for vector similarity
197 |     console.error(
198 |       `[queryRepo] Searching for similar chunks with limit: ${limit}`
199 |     );
200 |     const searchStart = Date.now();
201 |     // Use a default limit of 10 if undefined
202 |     const effectiveLimit = limit === undefined ? 10 : limit;
203 | 
204 |     // Create SQL condition for file pattern filtering
205 |     const filePatternCondition = createFilePatternCondition(
206 |       filePatterns,
207 |       excludePatterns
208 |     );
209 | 
210 |     const results = dbInterface.all(
211 |       `
212 |       SELECT fc.content, f.path, fc.chunk_number,
213 |              (SELECT  (SELECT SUM(json_extract(value, '$') * json_extract(?, '$[' || key || ']'))
214 |                         FROM json_each(fc.embedding)
215 |                         GROUP BY key IS NOT NULL)
216 |               )/${queryEmbedding.length} as similarity
217 |       FROM file_chunk fc
218 |       JOIN file f ON fc.file_id = f.id
219 |       JOIN branch_file_association bfa ON f.id = bfa.file_id
220 |       WHERE bfa.branch_id = ?
221 |       AND fc.embedding IS NOT NULL
222 |       ${filePatternCondition}
223 |       ORDER BY similarity DESC
224 |       LIMIT ?
225 |     `,
226 |       [queryEmbeddingStr, branchData.branchId, effectiveLimit]
227 |     );
228 |     console.error(
229 |       `[queryRepo] Search completed in ${Date.now() - searchStart}ms, found ${
230 |         results.length
231 |       } results`
232 |     );
233 | 
234 |     // Update progress after initial search
235 |     await heartbeatNotifier.sendProgress(0.7, 1);
236 | 
237 |     // If no results found, check if embeddings need to be generated
238 |     if (results.length === 0) {
239 |       console.error(`[queryRepo] No results found, checking if embeddings need to be generated`);
240 |       
241 |       // Check if there are any chunks without embeddings
242 |       const chunksWithoutEmbeddings = dbInterface.get(
243 |         `SELECT COUNT(*) as count 
244 |          FROM file_chunk fc
245 |          JOIN file f ON fc.file_id = f.id
246 |          JOIN branch_file_association bfa ON f.id = bfa.file_id
247 |          WHERE bfa.branch_id = ?
248 |          AND fc.embedding IS NULL`,
249 |         branchData.branchId
250 |       );
251 | 
252 |       if (chunksWithoutEmbeddings && chunksWithoutEmbeddings.count > 0) {
253 |         console.error(`[queryRepo] Found ${chunksWithoutEmbeddings.count} chunks without embeddings, generating them`);
254 |         
255 |         // Import embedFiles function
256 |         const { embedFiles } = await import('./embedFiles.js');
257 |         
258 |         // Generate embeddings (75-90% of progress)
259 |         await heartbeatNotifier.sendProgress(0.75, 1);
260 |         
261 |         // Generate embeddings
262 |         const embedResult = await embedFiles(
263 |           {
264 |             repoLocalPath: branchData.repoLocalPath,
265 |             branchId: branchData.branchId
266 |           },
267 |           undefined // Don't pass progress notifier to individual tools
268 |         );
269 | 
270 |         // Update progress after embedding generation
271 |         await heartbeatNotifier.sendProgress(0.9, 1);
272 | 
273 |         if ('error' in embedResult) {
274 |           console.error(`[queryRepo] Error generating embeddings:`, embedResult.error);
275 |           return { error: embedResult.error };
276 |         }
277 | 
278 |         // Try searching again after generating embeddings
279 |         console.error(`[queryRepo] Retrying search after generating embeddings`);
280 |         const retryResults = dbInterface.all(
281 |           `
282 |           SELECT fc.content, f.path, fc.chunk_number,
283 |                  (SELECT  (SELECT SUM(json_extract(value, '$') * json_extract(?, '$[' || key || ']'))
284 |                             FROM json_each(fc.embedding)
285 |                             GROUP BY key IS NOT NULL)
286 |                   ) as similarity
287 |           FROM file_chunk fc
288 |           JOIN file f ON fc.file_id = f.id
289 |           JOIN branch_file_association bfa ON f.id = bfa.file_id
290 |           WHERE bfa.branch_id = ?
291 |           AND fc.embedding IS NOT NULL
292 |           ${filePatternCondition}
293 |           ORDER BY similarity DESC
294 |           LIMIT ?
295 |         `,
296 |           [queryEmbeddingStr, branchData.branchId, effectiveLimit]
297 |         );
298 | 
299 |         console.error(
300 |           `[queryRepo] Retry search completed, found ${retryResults.length} results`
301 |         );
302 |         results.push(...retryResults);
303 |       }
304 |     }
305 | 
306 |     // Filter results by keywords if provided
307 |     let filteredResults = results;
308 |     if (keywordsSearch && keywordsSearch.length > 0) {
309 |       console.error(
310 |         `[queryRepo] Filtering results by keywords: ${keywordsSearch.join(", ")}`
311 |       );
312 |       const keywordFilterStart = Date.now();
313 | 
314 |       // Convert keywords to lowercase for case-insensitive matching
315 |       const lowercaseKeywords = keywordsSearch.map((kw) => kw.trim().toLowerCase());
316 | 
317 |       filteredResults = results.filter((result: { content: string }) => {
318 |         const content = result.content.toLowerCase();
319 |         // Check if the content contains at least one of the keywords
320 |         return lowercaseKeywords.some((keyword) => content.includes(keyword));
321 |       });
322 | 
323 |       console.error(
324 |         `[queryRepo] Keyword filtering completed in ${
325 |           Date.now() - keywordFilterStart
326 |         }ms, filtered from ${results.length} to ${
327 |           filteredResults.length
328 |         } results`
329 |       );
330 |     }
331 | 
332 |     // Update progress to completion
333 |     await heartbeatNotifier.sendProgress(1, 1);
334 | 
335 |     const totalTime = Date.now() - startTime;
336 |     console.error(`[queryRepo] Tool completed in ${totalTime}ms`);
337 | 
338 |     return {
339 |       output: {
340 |         success: true,
341 |         repoUrl,
342 |         branch: branchData.actualBranch,
343 |         processingTimeMs: totalTime,
344 |         results: filteredResults.map((result: any) => ({
345 |           filePath: result.path,
346 |           chunkNumber: result.chunk_number,
347 |           content: result.content,
348 |           similarity: result.similarity,
349 |         })),
350 |       },
351 |     };
352 |   } catch (error) {
353 |     console.error(`[queryRepo] Error executing tool:`, error);
354 |     return {
355 |       error: {
356 |         message: `Error executing queryRepo tool: ${
357 |           error instanceof Error ? error.message : String(error)
358 |         }`,
359 |       },
360 |     };
361 |   } finally {
362 |     // Always stop the heartbeat when done
363 |     stopHeartbeat();
364 |   }
365 | } 
```

--------------------------------------------------------------------------------
/tools/processFiles.ts:
--------------------------------------------------------------------------------

```typescript
  1 | import { z } from "zod";
  2 | import dbInterface from "../utils/db.js";
  3 | import { ProgressNotifier } from "../utils/types.js";
  4 | import { simpleGit } from "simple-git";
  5 | import path from "path";
  6 | import { extensionToSplitter, splitDocument } from "../utils/codeSplitter.js";
  7 | import fs from "fs";
  8 | 
  9 | interface RepositoryFile { // one file entry parsed from `git ls-tree -r` output
 10 |   path: string; // path as printed by ls-tree
 11 |   name: string; // basename of `path`
 12 |   sha: string; // object id from the ls-tree entry
 13 | }
 14 | 
 15 | interface RepositoryFilesResult { // return shape of getRepositoryFiles
 16 |   files: RepositoryFile[]; // entries parsed from `git ls-tree -r <branch>`
 17 |   commitSha: string; // result of `git rev-parse <branch>`
 18 | }
 19 | 
 20 | interface PendingFile { // row shape selected by processFileContents
 21 |   id: number; // file.id
 22 |   path: string; // file.path; joined onto the repository path to read the file
 23 |   sha: string; // file.sha (selected, but not read by processFileContents)
 24 | }
 25 | 
 26 | 
 27 | // Input schema for processFiles; every field except `_meta` is required
 28 | export const ProcessFilesSchema = z.object({
 29 |   repoLocalPath: z.string().describe("Local path to the cloned repository"),
 30 |   repoId: z.number().describe("Repository ID in the database"),
 31 |   branchId: z.number().describe("Branch ID in the database"),
 32 |   actualBranch: z.string().describe("Actual branch name"),
 33 |   needsUpdate: z.boolean().describe("Whether the branch needs updating"), // processFiles returns early when false
 34 |   _meta: z // optional envelope carrying an optional progress token
 35 |     .object({
 36 |       progressToken: z.union([z.string(), z.number()]).optional(),
 37 |     })
 38 |     .optional(),
 39 | });
 40 | 
 41 | 
 42 | /**
 43 |  * List the files tracked on a branch together with their git blob SHAs.
 44 |  * Checks the branch out in the working tree at `repoPath` as a side effect.
 45 |  * @param repoPath Path to the repository
 46 |  * @param branchName Name of the branch
 47 |  * @returns Blob entries (path, basename, sha) and the branch's commit SHA
 48 |  */
 49 | export const getRepositoryFiles = async (
 50 |   repoPath: string,
 51 |   branchName: string,
 52 | ): Promise<RepositoryFilesResult> => {
 53 |   const git = simpleGit(repoPath);
 54 | 
 55 |   // Checkout the branch
 56 |   await git.checkout(branchName);
 57 | 
 58 |   // Get the latest commit SHA
 59 |   const latestCommit = await git.revparse([branchName]);
 60 | 
 61 |   const files: RepositoryFile[] = [];
 62 | 
 63 |   // -z gives NUL-terminated entries with unquoted paths, so names containing
 64 |   // non-ASCII or special characters match what is on disk.
 65 |   const result = await git.raw(["ls-tree", "-r", "-z", branchName]);
 66 |   const entries = result.toString().split("\0").filter((entry) => entry !== "");
 67 | 
 68 |   for (const entry of entries) {
 69 |     // Format: <mode> <type> <object>\t<file>; the path follows the first tab
 70 |     const tabIndex = entry.indexOf("\t");
 71 |     if (tabIndex === -1) continue; // malformed entry without a path
 72 |     const [, type, sha] = entry.slice(0, tabIndex).split(" ");
 73 |     const filePath = entry.slice(tabIndex + 1);
 74 |     // Submodules are listed as `commit` entries; only blobs have file content
 75 |     if (type === "blob" && filePath) {
 76 |       files.push({
 77 |         path: filePath,
 78 |         name: path.basename(filePath),
 79 |         sha,
 80 |       });
 81 |     }
 82 |   }
 83 | 
 84 |   return { files, commitSha: latestCommit };
 85 | };
 86 | 
 87 | 
 88 | /**
 89 |  * Chunk every pending file of a branch and store the chunks in `file_chunk`.
 90 |  * Chunked files become 'fetched' and ignored extensions become 'done'; a
 91 |  * per-file error is logged and that file is left 'pending'.
 92 |  * @param branchName Branch name
 93 |  * @param repoPath Repository path
 94 |  * @throws Error if the repository or the branch has no database row
 95 |  */
 96 | export const processFileContents = async (
 97 |   branchName: string,
 98 |   repoPath: string
 99 | ): Promise<void> => {
100 |   const git = simpleGit(repoPath);
101 | 
102 |   // Checkout the branch so the working tree holds the content to read
103 |   await git.checkout(branchName);
104 | 
105 |   // Get repository and branch IDs; fail with a clear message if either is missing
106 |   const repo = dbInterface.get("SELECT id FROM repository WHERE path = ?", repoPath) as { id: number } | undefined;
107 |   if (!repo) throw new Error(`Repository not found in database: ${repoPath}`);
108 |   const branch = dbInterface.get(
109 |     "SELECT id FROM branch WHERE name = ? AND repository_id = ?",
110 |     [branchName, repo.id]
111 |   ) as { id: number } | undefined;
112 |   if (!branch) throw new Error(`Branch not found in database: ${branchName}`);
113 | 
114 |   // Get all pending files for the branch
115 |   const pendingFiles = dbInterface.all(
116 |     `SELECT f.id, f.path, f.sha
117 |      FROM file f
118 |      JOIN branch_file_association bfa ON f.id = bfa.file_id
119 |      WHERE f.status = 'pending' AND bfa.branch_id = ?`,
120 |     branch.id
121 |   ) as PendingFile[];
122 | 
123 |   for (const file of pendingFiles) {
124 |     console.error(`Processing file: ${file.path}`);
125 |     const extension = file.path.split(".").pop()?.toLowerCase();
126 |     const splitType = extension ? extensionToSplitter(extension) : "ignore";
127 | 
128 |     if (splitType !== "ignore") {
129 |       try {
130 |         // Get file content
131 |         const filePath = path.join(repoPath, file.path);
132 | 
133 |         // Skip if file doesn't exist (might have been deleted)
134 |         if (!fs.existsSync(filePath)) {
135 |           console.error(`File ${file.path} doesn't exist, skipping`);
136 |           continue;
137 |         }
138 | 
139 |         // Validate against the raw bytes: a string already read as "utf-8" has
140 |         // had invalid sequences replaced, so it can no longer reveal them.
141 |         const rawBytes = fs.readFileSync(filePath);
142 |         let content: string;
143 |         try {
144 |           content = new TextDecoder("utf-8", { fatal: true, ignoreBOM: true }).decode(rawBytes);
145 |         } catch {
146 |           console.error(`File ${file.path} contains invalid UTF-8 characters. Replacing them.`);
147 |           content = rawBytes.toString("utf-8").replace(/[^\x00-\x7F]/g, ""); // Remove non-ASCII characters
148 |         }
149 | 
150 |         // Check for null bytes in the content
151 |         if (content.includes("\0")) {
152 |           console.error(`File ${file.path} contains null bytes. Removing them.`);
153 |           content = content.replace(/\0/g, "");
154 |         }
155 | 
156 |         // Truncate content if it's too long
157 |         const maxLength = 1000000; // Adjust this value based on your database column size
158 |         if (content.length > maxLength) {
159 |           console.error(`File ${file.path} content is too long. Truncating to ${maxLength} characters.`);
160 |           content = content.substring(0, maxLength);
161 |         }
162 | 
163 |         // Split the document
164 |         const chunks = await splitDocument(file.path, content);
165 | 
166 |         // Store chunks in the database using dbInterface.transaction
167 |         dbInterface.transaction((db) => {
168 |           // A file re-queued after a SHA change still has its old chunks; drop
169 |           // them first, otherwise ON CONFLICT DO NOTHING keeps the stale content.
170 |           db.prepare("DELETE FROM file_chunk WHERE file_id = ?").run(file.id);
171 | 
172 |           const insertChunk = db.prepare(
173 |             `INSERT INTO file_chunk (file_id, content, chunk_number)
174 |              VALUES (?, ?, ?)
175 |              ON CONFLICT(file_id, chunk_number) DO NOTHING`
176 |           );
177 |           for (let i = 0; i < chunks.length; i++) {
178 |             insertChunk.run(file.id, chunks[i].pageContent, i + 1);
179 |           }
180 | 
181 |           // Update file status to 'fetched'
182 |           db.prepare("UPDATE file SET status = ? WHERE id = ?").run("fetched", file.id);
183 |         });
184 |       } catch (error) {
185 |         console.error(`Error processing file ${file.path}:`, error);
186 |       }
187 |     } else {
188 |       // Update file status to 'done' for ignored files
189 |       dbInterface.run("UPDATE file SET status = ? WHERE id = ?", ["done", file.id]);
190 |     }
191 |   }
192 | };
193 | 
194 | /**
195 |  * Reconcile the database's file records for a branch with the branch's
196 |  * current git tree, then chunk the content of the files left pending.
197 |  * @param input Parameters matching ProcessFilesSchema
198 |  * @param progressNotifier Optional notifier; sent 0.33 after the record sync
199 |  *   and 0.66 after chunking succeeds
200 |  * @returns `{ needsUpdate, filesToProcess }` (plus `repoLocalPath` when work
201 |  *   was done), or `{ error }`. A chunking failure is logged, not returned.
202 |  */
203 | export async function processFiles(
204 |   input: z.infer<typeof ProcessFilesSchema>,
205 |   progressNotifier?: ProgressNotifier
206 | ) {
207 |   try {
208 |     console.error(`[processFiles] Starting with parameters: ${JSON.stringify(input)}`);
209 | 
210 |     // Check if input is defined
211 |     if (!input) {
212 |       console.error(`[processFiles] Error: Input parameters are undefined`);
213 |       return { error: { message: "Input parameters are required for processFiles tool" } };
214 |     }
215 | 
216 |     const startTime = Date.now();
217 |     const { repoLocalPath, repoId, branchId, actualBranch, needsUpdate } = input;
218 | 
219 |     // Skip if no update is needed
220 |     if (!needsUpdate) {
221 |       console.error(`[processFiles] No update needed, skipping`);
222 |       return { needsUpdate: false, filesToProcess: [] };
223 |     }
224 | 
225 |     // Process the repository files
226 |     console.error(
227 |       `[processFiles] Processing repository files (${Date.now() - startTime}ms)`
228 |     );
229 |     // Get all files in the repository
230 |     const { files } = await getRepositoryFiles(repoLocalPath, actualBranch);
231 |     console.error(`[processFiles] Found ${files.length} files in repository`);
232 | 
233 |     // Transaction body: bring file rows and branch associations in line with `files`
234 |     console.error(`[processFiles] Starting file database transaction`);
235 |     const syncFileRecords = (db: any) => {
236 |       // Get existing files to compare
237 |       const existingFiles = db
238 |         .prepare(
239 |           `SELECT f.id, f.path, f.sha FROM file f
240 |                JOIN branch_file_association bfa ON f.id = bfa.file_id
241 |                WHERE bfa.branch_id = ?`
242 |         )
243 |         .all(branchId);
244 |       console.error(
245 |         `[processFiles] Found ${existingFiles.length} existing files in database`
246 |       );
247 | 
248 |       const existingFileMap = new Map();
249 |       for (const file of existingFiles) {
250 |         existingFileMap.set(file.path, file);
251 |       }
252 | 
253 |       // Track files that need processing
254 |       const filesToProcess: any[] = [];
255 | 
256 |       // File counters for logging
257 |       let newFiles = 0;
258 |       let updatedFiles = 0;
259 |       let unchangedFiles = 0;
260 |       let removedFiles = 0;
261 | 
262 |       // Process each file
263 |       for (const file of files) {
264 |         const existingFile = existingFileMap.get(file.path);
265 |         existingFileMap.delete(file.path); // Whatever remains in the map afterwards is gone from the branch
266 | 
267 |         if (!existingFile) {
268 |           // New file - but first check if it already exists in the database for another branch
269 |           const existingFileInDB = db.prepare(
270 |             "SELECT id FROM file WHERE repository_id = ? AND path = ? AND sha = ?"
271 |           ).get(repoId, file.path, file.sha);
272 | 
273 |           let fileId;
274 |           if (existingFileInDB) {
275 |             // File exists but not associated with this branch
276 |             console.error(`[processFiles] File exists in DB but not associated with branch: ${file.path}`);
277 |             fileId = existingFileInDB.id;
278 |             
279 |             // Check if the file is already associated with this branch
280 |             const associationExists = db.prepare(
281 |               "SELECT 1 FROM branch_file_association WHERE branch_id = ? AND file_id = ?"
282 |             ).get(branchId, fileId);
283 | 
284 |             if (!associationExists) {
285 |               // Associate existing file with current branch
286 |               db.prepare(
287 |                 "INSERT INTO branch_file_association (branch_id, file_id) VALUES (?, ?)"
288 |               ).run(branchId, fileId);
289 |             }
290 |           } else {
291 |             // Truly new file
292 |             newFiles++;
293 |             const result = db
294 |               .prepare(
295 |                 "INSERT INTO file (repository_id, path, sha, name, status) VALUES (?, ?, ?, ?, 'pending')"
296 |               )
297 |               .run(repoId, file.path, file.sha, file.name);
298 | 
299 |             fileId = result.lastInsertRowid;
300 | 
301 |             // Associate with branch
302 |             db.prepare(
303 |               "INSERT INTO branch_file_association (branch_id, file_id) VALUES (?, ?)"
304 |             ).run(branchId, fileId);
305 |           }
306 | 
307 |           filesToProcess.push({
308 |             id: fileId,
309 |             path: file.path,
310 |             name: file.name,
311 |           });
312 |         } else if (existingFile.sha !== file.sha) {
313 |           // Updated file - SHA changed
314 |           updatedFiles++;
315 |           db.prepare(
316 |             "UPDATE file SET sha = ?, status = 'pending' WHERE id = ?"
317 |           ).run(file.sha, existingFile.id);
318 | 
319 |           filesToProcess.push({
320 |             id: existingFile.id,
321 |             path: file.path,
322 |             name: file.name,
323 |           });
324 |         } else {
325 |           // Unchanged file
326 |           unchangedFiles++;
327 |         }
328 |       }
329 | 
330 |       // Remove files that no longer exist in the branch
331 |       for (const file of existingFileMap.values()) {
332 |         removedFiles++;
333 |         db.prepare(
334 |           "DELETE FROM branch_file_association WHERE branch_id = ? AND file_id = ?"
335 |         ).run(branchId, file.id);
336 | 
337 |         // If no other branches reference this file, delete it and its chunks
338 |         const fileStillInUse = db
339 |           .prepare(
340 |             "SELECT 1 FROM branch_file_association WHERE file_id = ? LIMIT 1"
341 |           )
342 |           .get(file.id);
343 | 
344 |         if (!fileStillInUse) {
345 |           // Delete chunks first
346 |           db.prepare("DELETE FROM file_chunk WHERE file_id = ?").run(file.id);
347 |           // Then delete the file
348 |           db.prepare("DELETE FROM file WHERE id = ?").run(file.id);
349 |         }
350 |       }
351 | 
352 |       console.error(
353 |         `[processFiles] Files summary: ${newFiles} new, ${updatedFiles} updated, ${unchangedFiles} unchanged, ${removedFiles} removed`
354 |       );
355 |       return filesToProcess;
356 |     };
357 | 
358 |     // Execute the transaction
359 |     console.error(`[processFiles] Executing file processing transaction`);
360 |     const filesToProcess = dbInterface.transaction((db) => syncFileRecords(db));
361 |     console.error(
362 |       `[processFiles] Transaction completed, processing ${
363 |         filesToProcess.length
364 |       } files (${Date.now() - startTime}ms)`
365 |     );
366 | 
367 |     // Cap the list that is returned to the caller (chunking below is not capped)
368 |     // This might need adjustment based on actual performance
369 |     const MAX_FILES_TO_PROCESS = 1000000;
370 |     const limitedFiles = filesToProcess.slice(0, MAX_FILES_TO_PROCESS);
371 | 
372 |     if (limitedFiles.length < filesToProcess.length) {
373 |       console.error(
374 |         `[processFiles] WARNING: Processing only ${limitedFiles.length} of ${filesToProcess.length} files to avoid timeout`
375 |       );
376 |     }
377 | 
378 |     // Report progress once the file records are in sync
379 |     if (progressNotifier) {
380 |       await progressNotifier.sendProgress(0.33, 1);
381 |     }
382 | 
383 |     // Chunk the content of every pending file on the branch
384 |     console.error(`[processFiles] Processing file contents for branch: ${actualBranch}`);
385 |     try {
386 |       await processFileContents(actualBranch, repoLocalPath);
387 |       console.error(`[processFiles] File contents processed successfully`);
388 |       
389 |       // Update branch status to files_processed
390 |       dbInterface.run(
391 |         "UPDATE branch SET status = 'files_processed' WHERE id = ?",
392 |         branchId
393 |       );
394 |       
395 |       // Update progress after file content processing
396 |       if (progressNotifier) {
397 |         await progressNotifier.sendProgress(0.66, 1);
398 |       }
399 |     } catch (error) {
400 |       // Best effort: the branch status is left unchanged and the result below is still returned
401 |       console.error(`[processFiles] Error processing file contents:`, error);
402 |     }
403 | 
404 |     return {
405 |       needsUpdate: true,
406 |       filesToProcess: limitedFiles,
407 |       repoLocalPath
408 |     };
409 |   } catch (error) {
410 |     console.error(`[processFiles] Error executing tool:`, error);
411 |     return {
412 |       error: {
413 |         message: `Error executing processFiles tool: ${
414 |           error instanceof Error ? error.message : String(error)
415 |         }`,
416 |       },
417 |     };
418 |   }
419 | }
419 | 
```

--------------------------------------------------------------------------------
/utils/codeSplitter.ts:
--------------------------------------------------------------------------------

```typescript
  1 | import {
  2 |   RecursiveCharacterTextSplitter,
  3 |   TextSplitter,
  4 | } from "@langchain/textsplitters";
  5 | import fs from "fs";
  6 | 
  7 | /** TextSplitter for SQL scripts: one chunk per statement; oversized statements are split by VALUES tuple or by line. */
  8 | class SQLSchemaSplitter extends TextSplitter {
  9 |   private maxCharacters: number;
 10 |   constructor(maxCharacters: number) {
 11 |     super();
 12 |     this.maxCharacters = maxCharacters;
 13 |   }
 14 | 
 15 |   // Split the text after VALUES into top-level "(...)" tuples, tracking quotes and backslash escapes
 16 |   parseValues(valuesPart: string): string[] {
 17 |     let valuesArray: string[] = [];
 18 |     let currentTuple = "";
 19 |     let nestingLevel = 0;
 20 |     let inString: boolean = false;
 21 |     let stringChar = "";
 22 |     let escapeNext = false;
 23 | 
 24 |     for (let i = 0; i < valuesPart.length; i++) {
 25 |       const char = valuesPart[i];
 26 |       currentTuple += char;
 27 | 
 28 |       if (escapeNext) {
 29 |         escapeNext = false;
 30 |       } else if (char === "\\") {
 31 |         escapeNext = true;
 32 |       } else if (char === "'" || char === '"') {
 33 |         if (inString && char === stringChar) {
 34 |           inString = false;
 35 |         } else if (!inString) {
 36 |           inString = true;
 37 |           stringChar = char;
 38 |         }
 39 |       } else if (!inString) {
 40 |         if (char === "(") {
 41 |           nestingLevel += 1;
 42 |         } else if (char === ")") {
 43 |           nestingLevel -= 1;
 44 |           if (nestingLevel === 0) {
 45 |             valuesArray.push(currentTuple.trim());
 46 |             currentTuple = "";
 47 |             // Skip the separators (comma, space, newline) before the next tuple
 48 |             while (
 49 |               i + 1 < valuesPart.length &&
 50 |               (valuesPart[i + 1] === "," ||
 51 |                 valuesPart[i + 1] === " " ||
 52 |                 valuesPart[i + 1] === "\n")
 53 |             ) {
 54 |               i++;
 55 |             }
 56 |           }
 57 |         }
 58 |       }
 59 |     }
 60 |     return valuesArray;
 61 |   }
 62 | 
 63 |   // Split a long INSERT into several INSERTs that repeat the "... VALUES" prefix
 64 |   splitInsertStatement(statement: string): string[] {
 65 |     const insertIndex = statement.toUpperCase().indexOf("VALUES");
 66 |     if (insertIndex === -1) {
 67 |       // Cannot split, return the statement as is
 68 |       return [statement];
 69 |     }
 70 | 
 71 |     const insertIntoPart =
 72 |       statement.slice(0, insertIndex + "VALUES".length) + " ";
 73 |     const valuesPart = statement.slice(insertIndex + "VALUES".length);
 74 | 
 75 |     const valuesArray = this.parseValues(valuesPart);
 76 |     const insertStatements: string[] = [];
 77 | 
 78 |     let currentValues = "";
 79 |     for (const valueTuple of valuesArray) {
 80 |       const newStatementLength =
 81 |         insertIntoPart.length + currentValues.length + valueTuple.length + 1; // +1 for ',' or ';'
 82 | 
 83 |       if (newStatementLength <= this.maxCharacters) {
 84 |         if (currentValues !== "") {
 85 |           currentValues += "," + valueTuple;
 86 |         } else {
 87 |           currentValues = valueTuple;
 88 |         }
 89 |       } else {
 90 |         // Flush the tuples gathered so far. Nothing is pending when the very
 91 |         // first tuple is already over the limit; don't emit an empty INSERT.
 92 |         if (currentValues !== "") insertStatements.push(insertIntoPart + currentValues + ";");
 93 |         currentValues = valueTuple;
 94 |       }
 95 |     }
 96 |     if (currentValues !== "") {
 97 |       const newStatement = insertIntoPart + currentValues + ";";
 98 |       insertStatements.push(newStatement);
 99 |     }
100 |     return insertStatements;
101 |   }
102 | 
103 |   /**
104 |    * Split a SQL script into statements; CREATE FUNCTION/PROCEDURE bodies and BEGIN...END blocks
105 |    * (incl. BBEGI/EEN spellings) are kept together. NOTE(review): the ';' split ignores insideString/insideComment.
106 |    */
107 |   splitSQLStatements(text: string): string[] {
108 |     const statements: string[] = [];
109 |     let currentStatement = "";
110 |     let index = 0;
111 |     let insideString: boolean = false;
112 |     let stringChar = "";
113 |     let insideComment = false;
114 |     let commentType = "";
115 |     let insideFunction = false;
116 |     let insideProcedure = false;
117 |     let insideView = false;
118 |     let insideBlock = false;
119 |     let blockLevel = 0;
120 | 
121 |     const upperText = text.toUpperCase();
122 | 
123 |     // Keyword spellings accepted as block delimiters, matched against the upper-cased text
124 |     const beginKeywords = ["BEGIN", "BBEGI", "BEGINN"];
125 |     const endKeywords = ["END", "EEN"];
126 |     const exceptionKeywords = ["EXCEPTION", "EEXCEPTIO"]; // declared but not referenced in this method
127 | 
128 |     while (index < text.length) {
129 |       const char = text[index];
130 |       const remainingText = upperText.substring(index);
131 |       currentStatement += char;
132 | 
133 |       if (insideString) {
134 |         if (char === stringChar) {
135 |           insideString = false;
136 |         } else if (char === "\\") {
137 |           // Skip escaped characters
138 |           index++;
139 |           if (index < text.length) {
140 |             currentStatement += text[index];
141 |           }
142 |         }
143 |       } else if (insideComment) {
144 |         if (commentType === "--" && (char === "\n" || char === "\r")) {
145 |           insideComment = false;
146 |         } else if (commentType === "/*" && remainingText.startsWith("*/")) {
147 |           insideComment = false;
148 |           currentStatement += "*/";
149 |           index += 1; // Skip '/'
150 |         }
151 |       } else if (char === "'" || char === '"') {
152 |         insideString = true;
153 |         stringChar = char;
154 |       } else if (remainingText.startsWith("/*")) {
155 |         insideComment = true;
156 |         commentType = "/*";
157 |         currentStatement += "/*";
158 |         index += 1; // Skip '*'
159 |       } else if (remainingText.startsWith("--")) {
160 |         insideComment = true;
161 |         commentType = "--";
162 |         currentStatement += "--";
163 |         index += 1; // Skip second '-'
164 |       } else if (
165 |         !insideFunction &&
166 |         !insideProcedure &&
167 |         !insideView &&
168 |         !insideBlock
169 |       ) {
170 |         if (
171 |           remainingText.startsWith("CREATE FUNCTION") ||
172 |           remainingText.startsWith("CREATE OR REPLACE FUNCTION")
173 |         ) {
174 |           insideFunction = true;
175 |           blockLevel = 0;
176 |         } else if (
177 |           remainingText.startsWith("CREATE PROCEDURE") ||
178 |           remainingText.startsWith("CREATE OR REPLACE PROCEDURE")
179 |         ) {
180 |           insideProcedure = true;
181 |           blockLevel = 0;
182 |         } else if (
183 |           remainingText.startsWith("CREATE VIEW") ||
184 |           remainingText.startsWith("CREATE OR REPLACE VIEW")
185 |         ) {
186 |           insideView = true;
187 |         } else if (beginKeywords.some((kw) => remainingText.startsWith(kw))) {
188 |           insideBlock = true;
189 |           blockLevel = 1;
190 |           const matchedBegin = beginKeywords.find((kw) =>
191 |             remainingText.startsWith(kw)
192 |           );
193 |           if (matchedBegin && matchedBegin.length > "BEGIN".length) {
194 |             index += matchedBegin.length - "BEGIN".length;
195 |             currentStatement += matchedBegin.substring("BEGIN".length);
196 |           }
197 |         }
198 |       }
199 | 
200 |       if (insideFunction || insideProcedure || insideBlock) {
201 |         // Check for BEGIN keywords to increase block level
202 |         const matchedBegin = beginKeywords.find((kw) =>
203 |           remainingText.startsWith(kw)
204 |         );
205 |         if (matchedBegin) {
206 |           blockLevel++;
207 |           index += matchedBegin.length - 1;
208 |           currentStatement += matchedBegin.substring(1);
209 |           continue;
210 |         }
211 | 
212 |         // Check for END keywords to decrease block level
213 |         const matchedEnd = endKeywords.find((kw) =>
214 |           remainingText.startsWith(kw)
215 |         );
216 |         if (
217 |           matchedEnd &&
218 |           (matchedEnd.length === "END".length ||
219 |             matchedEnd.length === "END;".length)
220 |         ) {
221 |           blockLevel--;
222 |           index += matchedEnd.length - 1;
223 |           currentStatement += matchedEnd.substring(1);
224 | 
225 |           if (blockLevel === 0) {
226 |             if (insideFunction) {
227 |               insideFunction = false;
228 |               statements.push(currentStatement.trim());
229 |               currentStatement = "";
230 |             } else if (insideProcedure) {
231 |               insideProcedure = false;
232 |               statements.push(currentStatement.trim());
233 |               currentStatement = "";
234 |             } else if (insideBlock) {
235 |               insideBlock = false;
236 |               statements.push(currentStatement.trim());
237 |               currentStatement = "";
238 |             }
239 |           }
240 |           continue;
241 |         }
242 |       } else if (insideView) {
243 |         if (char === ";") {
244 |           insideView = false;
245 |           statements.push(currentStatement.trim());
246 |           currentStatement = "";
247 |         }
248 |       } else if (
249 |         char === ";" &&
250 |         !insideFunction &&
251 |         !insideProcedure &&
252 |         !insideView &&
253 |         !insideBlock
254 |       ) {
255 |         statements.push(currentStatement.trim());
256 |         currentStatement = "";
257 |       }
258 | 
259 |       index++;
260 |     }
261 | 
262 |     if (currentStatement.trim() !== "") {
263 |       statements.push(currentStatement.trim());
264 |     }
265 | 
266 |     return statements;
267 |   }
268 | 
269 |   // Return the first entry of `keywords` that `text` starts with,
270 |   // or null when none matches.
271 |   matchKeyword(text: string, keywords: string[]): string | null {
272 |     for (const keyword of keywords) {
273 |       if (text.startsWith(keyword)) {
274 |         return keyword;
275 |       }
276 |     }
277 |     return null;
278 |   }
279 | 
280 |   async splitText(text: string): Promise<string[]> {
281 |     const statements = this.splitSQLStatements(text);
282 |     const splits: string[] = [];
283 | 
284 |     for (const statement of statements) {
285 |       // Check if the statement is an INSERT statement
286 |       if (
287 |         statement.toUpperCase().includes("INSERT INTO") &&
288 |         statement.toUpperCase().includes("VALUES")
289 |       ) {
290 |         // Split long INSERT statements
291 |         const splitInserts = this.splitInsertStatement(statement);
292 |         splits.push(...splitInserts);
293 |       } else {
294 |         // For other statements, check if they are too long
295 |         if (statement.length <= this.maxCharacters) {
296 |           splits.push(statement);
297 |         } else {
298 |           // Too long: pack whole lines into chunks (a single over-long line is kept as is)
299 |           let currentSplit = "";
300 |           const lines = statement.split("\n");
301 | 
302 |           for (const line of lines) {
303 |             if (currentSplit.length + line.length + 1 <= this.maxCharacters) {
304 |               currentSplit += (currentSplit ? "\n" : "") + line;
305 |             } else {
306 |               if (currentSplit) {
307 |                 splits.push(currentSplit);
308 |               }
309 |               currentSplit = line;
310 |             }
311 |           }
312 | 
313 |           if (currentSplit) {
314 |             splits.push(currentSplit);
315 |           }
316 |         }
317 |       }
318 |     }
319 | 
320 |     return splits;
321 |   }
322 | }
323 | 
/**
 * Splitter name paired with the lower-case extensions that select it.
 * Extensions that are not listed anywhere fall back to "text".
 */
const SPLITTER_EXTENSION_GROUPS: ReadonlyArray<[string, string[]]> = [
  // C/C++ (and Objective-C) extensions
  ["cpp", ["c++", "cpp", "c", "h", "hpp", "m", "mm"]],
  ["go", ["go"]],
  ["java", ["java"]],
  // JavaScript and related
  [
    "js",
    ["js", "ts", "typescript", "tsx", "jsx", "javascript", "json", "pbxproj"],
  ],
  // YAML and related configuration formats
  [
    "text",
    [
      "yaml",
      "yml",
      "toml",
      "ini",
      "cfg",
      "conf",
      "props",
      "env",
      "plist",
      "gemfile",
      "dockerfile",
      "podfile",
      "patch",
    ],
  ],
  // Shell scripts and related
  ["text", ["sh", "bash", "zsh", "fish", "bat", "cmd"]],
  // Properties and XSD
  ["text", ["properties", "xsd"]],
  ["sql", ["sql"]],
  ["php", ["php"]],
  // Protocol buffers
  ["proto", ["proto"]],
  ["python", ["py", "python"]],
  // reStructuredText
  ["rst", ["rst"]],
  ["ruby", ["rb", "ruby"]],
  ["rust", ["rs", "rust"]],
  ["scala", ["scala"]],
  ["swift", ["swift"]],
  ["markdown", ["md", "markdown"]],
  ["latex", ["tex", "latex"]],
  // HTML and related XML-style formats
  [
    "html",
    [
      "html",
      "htm",
      "xml",
      "xsl",
      "xdt",
      "xcworkspacedata",
      "xcprivacy",
      "xcsettings",
      "xcscheme",
    ],
  ],
  // Solidity
  ["sol", ["sol", "solidity"]],
  // Plain text
  ["text", ["text", "txt", "lst", "reg"]],
  // Additional project-file extensions handled as HTML/XML
  ["html", ["jpr", "jws", "iml"]],
  // Lock files, images, binaries and archives are never split
  [
    "ignore",
    [
      "lock",
      "jpg",
      "jpeg",
      "png",
      "gif",
      "bmp",
      "svg",
      "ico",
      "webp",
      "tiff",
      "bin",
      "exe",
      "dll",
      "so",
      "dylib",
      "obj",
      "o",
      "zip",
      "tar",
      "gz",
      "rar",
      "7z",
      "jar",
      "war",
      "ear",
      "class",
    ],
  ],
];

/** Flat extension -> splitter lookup derived from the groups above. */
const SPLITTER_BY_EXTENSION = new Map<string, string>();
for (const [splitterName, extensions] of SPLITTER_EXTENSION_GROUPS) {
  for (const ext of extensions) {
    SPLITTER_BY_EXTENSION.set(ext, splitterName);
  }
}

/**
 * Maps a file extension to the name of the splitter used for that file type.
 *
 * The lookup is case-insensitive. The result is a language identifier such as
 * "cpp", "js" or "python", or "sql", or "ignore" for files that must not be
 * split at all, or "text" as the general fallback.
 *
 * @param extension - File extension without the leading dot.
 * @returns The splitter name; "text" for an empty or unrecognised extension.
 */
export function extensionToSplitter(extension: string): string {
  if (!extension) {
    return "text";
  }
  const splitterName = SPLITTER_BY_EXTENSION.get(extension.toLowerCase());
  return splitterName === undefined ? "text" : splitterName;
}
477 | 
/**
 * Splits a file's contents into documents using a splitter chosen from the
 * file's extension.
 *
 * The extension is passed to `extensionToSplitter`:
 *  - "ignore" produces an empty array without creating a splitter;
 *  - "sql" uses `SQLSchemaSplitter`, which is given the chunk size only;
 *  - "text" uses a plain `RecursiveCharacterTextSplitter`;
 *  - any other value is passed to `RecursiveCharacterTextSplitter.fromLanguage`.
 *
 * Chunk size and overlap are 7000 and 200 tokens, converted to characters at
 * 3.25 characters per token. Each chunk gets a `FILE NAME: ...` header.
 *
 * @param filename - File name or path; only the text after the last "." is used.
 * @param code - Full contents of the file.
 * @returns `[]` for ignored file types, otherwise whatever the splitter's
 *   `createDocuments` returns.
 */
export const splitDocument = (filename: string, code: string) => {
  // Everything after the last "."; the whole name when there is no dot.
  const extension = filename.slice(filename.lastIndexOf(".") + 1);

  const splitType = extensionToSplitter(extension);
  if (splitType === "ignore") {
    return [];
  }

  const CHARACTERS_PER_TOKEN = 3.25;
  const chunkSize = 7000 * CHARACTERS_PER_TOKEN;
  const chunkOverlap = 200 * CHARACTERS_PER_TOKEN;

  // Language identifiers that may reach `fromLanguage`.
  type LanguageSplitType =
    | "cpp"
    | "go"
    | "java"
    | "js"
    | "php"
    | "proto"
    | "python"
    | "rst"
    | "ruby"
    | "rust"
    | "scala"
    | "swift"
    | "markdown"
    | "latex"
    | "html"
    | "sol";

  let splitter;
  switch (splitType) {
    case "sql":
      splitter = new SQLSchemaSplitter(chunkSize);
      break;
    case "text":
      splitter = new RecursiveCharacterTextSplitter({
        chunkSize: chunkSize,
        chunkOverlap: chunkOverlap,
      });
      break;
    default:
      splitter = RecursiveCharacterTextSplitter.fromLanguage(
        splitType as LanguageSplitType,
        {
          chunkSize: chunkSize,
          chunkOverlap: chunkOverlap,
        }
      );
      break;
  }

  return splitter.createDocuments([code], [], {
    chunkHeader: `FILE NAME: ${filename}\n\n---\n\n`,
    appendChunkOverlapHeader: true,
  });
};
531 | 
```