# Directory Structure
```
├── .env.example
├── .gitignore
├── claude_desktop_config.example.json
├── config.ts
├── index.ts
├── jest.config.mjs
├── LICENSE
├── package-lock.json
├── package.json
├── README.md
├── SETUP.md
├── start.ts
├── tools
│ ├── embedFiles.ts
│ ├── ingestBranch.ts
│ ├── processFiles.ts
│ └── queryRepo.ts
├── tsconfig.json
└── utils
    ├── codeSplitter.ts
    ├── db.ts
    ├── filePatternMatcher.ts
    ├── ollamaEmbeddings.ts
    ├── repoConfig.ts
    └── types.ts
```
# Files
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
```
1 | DATA_DIR=/home/user/.config/Claude/data
2 | REPO_CONFIG_DIR=/home/user/.config/Claude/repos
3 | NODE_ENV=development
4 |
```
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
```
1 | # Node.js
2 | node_modules/
3 | npm-debug.log
4 | yarn-debug.log
5 | yarn-error.log
6 |
7 | # TypeScript
8 | dist/
9 | *.tsbuildinfo
10 |
11 | # Data directories
12 | data/
13 | cache/
14 | repos/
15 |
16 | # HuggingFace specific
17 | .transformers/
18 | .cache/
19 | huggingface/
20 | models/
21 | **/temp_test_repos/
22 |
23 | # Test temporary files
24 | coverage/
25 | .nyc_output/
26 | junit.xml
27 |
28 | # Database files
29 | *.db
30 | *.sqlite
31 | *.sqlite3
32 |
33 | # Environment variables
34 | .env
35 | .env.local
36 | .env.development.local
37 | .env.test.local
38 | .env.production.local
39 |
40 | # Log files
41 | logs/
42 | *.log
43 |
44 | # Editor directories and files
45 | .idea/
46 | .vscode/
47 | *.swp
48 | *.swo
49 |
50 | # OS files
51 | .DS_Store
52 | Thumbs.db
53 |
54 | # Build files
55 | build/
56 | out/
57 |
```
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
```markdown
1 | # Code Context MCP Server
2 |
3 | A Model Context Protocol (MCP) server for providing code context from local git repositories. This server allows you to:
4 |
5 | 1. Clone git repositories locally
6 | 2. Process branches and files
7 | 3. Generate embeddings for code chunks
8 | 4. Perform semantic search over code
9 |
10 | ## Features
11 |
12 | - Uses local git repositories instead of GitHub API
13 | - Stores data in SQLite database
14 | - Splits code into semantic chunks
15 | - Generates embeddings for code chunks using Ollama
16 | - Provides semantic search over code
17 |
18 | ## Prerequisites
19 |
20 | - Node.js (v16+)
21 | - Git
22 | - Ollama with an embedding model
23 |
24 | ## Installation
25 |
26 | ```bash
27 | # Clone the repository
28 | git clone <repository-url>
29 | cd code-context-mcp
30 |
31 | # Install dependencies
32 | npm install
33 |
34 | # Build the project
35 | npm run build
36 | ```
37 |
38 | ## Configuration
39 |
40 | Set the following environment variables:
41 |
42 | - `DATA_DIR`: Directory for SQLite database (default: '~/.codeContextMcp/data')
43 | - `REPO_CONFIG_DIR`: Directory for repository configuration metadata and cached clones (default: '~/.codeContextMcp/repos')
44 |
45 | ### Using Ollama
46 |
47 | Embeddings are generated by a local Ollama instance, so install Ollama and pull an embedding model before first use:
48 |
49 | ```bash
50 | # Install Ollama from https://ollama.ai/
51 |
52 | # Pull an embedding model (unclemusclez/jina-embeddings-v2-base-code is recommended)
53 | ollama pull unclemusclez/jina-embeddings-v2-base-code
54 |
55 | ```
56 |
57 | ## Usage
58 |
59 | ### Using with Claude Desktop
60 |
61 | Add the following configuration to your Claude Desktop configuration file (`claude_desktop_config.json`):
62 |
63 | ```json
64 | {
65 | "mcpServers": {
66 | "code-context-mcp": {
67 | "command": "/path/to/your/node",
68 | "args": ["/path/to/code-context-mcp/dist/index.js"]
69 | }
70 | }
71 | }
72 | ```
73 |
74 | ## Tools
75 |
76 | The server provides the following tool:
77 |
78 | ### query_repo
79 |
80 | Clones a repository, processes code, and performs semantic search:
81 |
82 | ```json
83 | {
84 | "repoUrl": "https://github.com/username/repo.git",
85 | "branch": "main", // Optional - defaults to repository's default branch
86 | "query": "Your search query",
87 | "keywords": ["keyword1", "keyword2"], // Optional - filter results by keywords
88 | "filePatterns": ["**/*.ts", "src/*.js"], // Optional - filter files by glob patterns
89 | "excludePatterns": ["**/node_modules/**"], // Optional - exclude files by glob patterns
90 | "limit": 10 // Optional - number of results to return, default: 10
91 | }
92 | ```
93 |
94 | The `branch` parameter is optional. If not provided, the tool will automatically use the repository's default branch.
95 |
96 | The `keywords` parameter is optional. If provided, the results will be filtered to only include chunks that contain at least one of the specified keywords (case-insensitive matching).
97 |
98 | The `filePatterns` and `excludePatterns` parameters are optional. They allow you to filter which files are processed and searched using glob patterns (e.g., `**/*.ts` for all TypeScript files).
99 |
100 | ## Database Schema
101 |
102 | The server uses SQLite with the following schema:
103 |
104 | - `repository`: Stores information about repositories
105 | - `branch`: Stores information about branches
106 | - `file`: Stores information about files
107 | - `branch_file_association`: Associates files with branches
108 | - `file_chunk`: Stores code chunks and their embeddings
109 |
110 | # Debugging
111 |
112 | ## MAC Mx Series - ARM Architecture Issues
113 |
114 | When installing better-sqlite3 on Mac M-series chips (ARM architecture), if you encounter errors like "mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64e' or 'arm64')", you need to ensure the binary matches your architecture. Here's how to resolve this issue:
115 |
116 | ```bash
117 | # Check your Node.js architecture
118 | node -p "process.arch"
119 |
120 | # If it shows 'arm64', but you're still having issues, try:
121 | npm rebuild better-sqlite3 --build-from-source
122 |
123 | # Or for a clean install:
124 | npm uninstall better-sqlite3
125 | export npm_config_arch=arm64
126 | export npm_config_target_arch=arm64
127 | npm install better-sqlite3 --build-from-source
128 | ```
129 |
130 | If you're using Rosetta, make sure your entire environment (shell, Node.js, and npm) runs under the same architecture. The error above means x86_64 binaries were built while the running Node.js process needs arm64.
131 | For persistent configuration, add to your .zshrc or .bashrc:
132 |
133 | ```
134 | export npm_config_arch=arm64
135 | export npm_config_target_arch=arm64
136 | ```
137 |
138 | ## Testing Ollama Embeddings
139 |
140 | curl http://localhost:11434/api/embed -d '{"model":"unclemusclez/jina-embeddings-v2-base-code","input":"Llamas are members of the camelid family"}'
141 | curl http://127.0.0.1:11434/api/embed -d '{"model":"unclemusclez/jina-embeddings-v2-base-code","input":"Llamas are members of the camelid family"}'
142 | curl http://[::1]:11434/api/embed -d '{"model":"unclemusclez/jina-embeddings-v2-base-code","input":"Llamas are members of the camelid family"}'
143 |
144 | ## License
145 |
146 | MIT
147 |
```
--------------------------------------------------------------------------------
/utils/types.ts:
--------------------------------------------------------------------------------
```typescript
1 | /**
2 | * Common interfaces and types used across the codebase
3 | */
4 |
/**
 * Contract for objects that can report the progress of a long-running
 * operation back to whoever requested it.
 */
export interface ProgressNotifier {
  /**
   * Sends one progress update.
   *
   * @param progress Work completed so far (units are chosen by the caller).
   * @param total Total amount of work, in the same units as `progress`.
   * @returns A promise callers can await before continuing.
   */
  sendProgress: (progress: number, total: number) => Promise<void>;
}
11 |
```
--------------------------------------------------------------------------------
/claude_desktop_config.example.json:
--------------------------------------------------------------------------------
```json
1 | {
2 | "mcpServers": {
3 | "code-context": {
4 | "command": "node",
5 | "args": ["<CLAUDE_CONFIG_DIR>/mcp-servers/code-context-mcp/dist/start.js"],
6 | "env": {
7 | "DATA_DIR": "<CLAUDE_CONFIG_DIR>/data",
8 | "REPO_CONFIG_DIR": "<CLAUDE_CONFIG_DIR>/repos",
9 | "NODE_ENV": "development"
10 | }
11 | }
12 | }
13 | }
14 |
```
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
```json
1 | {
2 | "compilerOptions": {
3 | "target": "ES2020",
4 | "module": "NodeNext",
5 | "moduleResolution": "NodeNext",
6 | "esModuleInterop": true,
7 | "strict": true,
8 | "outDir": "./dist",
9 | "rootDir": ".",
10 | "declaration": true,
11 | "skipLibCheck": true,
12 | "isolatedModules": true,
13 | "allowJs": true,
14 | "resolveJsonModule": true,
15 | "forceConsistentCasingInFileNames": true,
16 | "baseUrl": ".",
17 | "paths": {
18 | "*": ["*"]
19 | }
20 | },
21 | "include": [
22 | "./**/*.ts",
23 | "./**/*.mts",
24 | "./tests/**/*.ts"
25 | ],
26 | "exclude": [
27 | "node_modules",
28 | "dist",
29 | "repos"
30 | ]
31 | }
32 |
```
--------------------------------------------------------------------------------
/config.ts:
--------------------------------------------------------------------------------
```typescript
1 | import path from "path";
2 | import os from "os";
3 |
/** Embedding model presets, keyed by provider. */
export const EMBEDDING_MODELS = {
  OLLAMA: {
    model: "unclemusclez/jina-embeddings-v2-base-code",
    contextSize: 8192,
    dimensions: 768,
    baseUrl: "http://127.0.0.1:11434",
  },
};

/**
 * Builds the default location of a directory under `~/.codeContextMcp`.
 * Only evaluated when the matching environment variable is not set.
 */
const defaultHomeSubdir = (name: string): string =>
  path.join(os.homedir(), ".codeContextMcp", name);

/**
 * Runtime configuration. Each environment-driven entry falls back to its
 * default when the variable is unset or empty.
 */
export const codeContextConfig = {
  ENV: process.env.NODE_ENV || "development",
  REPO_CONFIG_DIR: process.env.REPO_CONFIG_DIR || defaultHomeSubdir("repos"),
  BATCH_SIZE: 100,
  DATA_DIR: process.env.DATA_DIR || defaultHomeSubdir("data"),
  DB_PATH: process.env.DB_PATH || "code_context.db",
  EMBEDDING_MODEL: EMBEDDING_MODELS.OLLAMA,
};

export default codeContextConfig;
29 |
```
--------------------------------------------------------------------------------
/jest.config.mjs:
--------------------------------------------------------------------------------
```
/**
 * Jest configuration: runs the TypeScript tests as ES modules through ts-jest.
 */
export default {
  preset: 'ts-jest/presets/default-esm',
  clearMocks: true,
  coverageDirectory: "coverage",
  roots: [
    "./tests"
  ],
  // Sources import siblings with a ".js" suffix (NodeNext resolution); strip
  // it so Jest resolves the corresponding TypeScript file.
  moduleNameMapper: {
    '^(\\.{1,2}/.*)\\.js$': '$1',
  },
  // All ts-jest options live here. The former `globals['ts-jest']` block only
  // repeated `useESM` and used a deprecated configuration location.
  transform: {
    '^.+\\.tsx?$': [
      'ts-jest',
      {
        isolatedModules: true,
        useESM: true,
        tsconfig: './tsconfig.json'
      }
    ]
  },
  testEnvironment: 'node',
  moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node', 'mjs'],
  extensionsToTreatAsEsm: ['.ts', '.mts'],
  transformIgnorePatterns: [
    'node_modules/(?!(@huggingface)/)'
  ],
  testMatch: [
    '**/?(*.)+(spec|test).ts',
    '**/tests/*EmbeddingsTest.ts',
    '**/tests/githubRepoTest.ts'
  ],
  setupFilesAfterEnv: ['<rootDir>/tests/setup.ts'],
  verbose: true
};
40 |
```
--------------------------------------------------------------------------------
/SETUP.md:
--------------------------------------------------------------------------------
```markdown
1 | # Code Context MCP Setup
2 |
3 | ## Prerequisites
4 |
5 | ```bash
6 | ollama pull unclemusclez/jina-embeddings-v2-base-code
7 | ```
8 |
9 | ## Install
10 |
11 | ```bash
12 | npm install
13 | npm run build
14 | ```
15 |
16 | ## Configuration
17 |
18 | Copy `claude_desktop_config.example.json` to your Claude Desktop config location:
19 |
20 | **Linux/macOS**: `~/.config/Claude/claude_desktop_config.json`
21 | **Windows**: `%APPDATA%\Claude\claude_desktop_config.json`
22 |
23 | Replace `<CLAUDE_CONFIG_DIR>` with your actual path:
24 | - Linux/macOS: `/home/username/.config/Claude`
25 | - Windows: `C:\Users\username\AppData\Roaming\Claude`
26 |
27 | ## Environment
28 |
29 | Copy `.env.example` to `.env` and adjust paths if needed.
30 |
31 | The `repos/` directory stores configuration metadata for repositories, not full clones.
32 | For local repositories (file:// URLs), no cloning occurs - files are accessed directly.
33 |
34 | ## Test
35 |
36 | ```bash
37 | npm run start:mcp
38 | ```
39 |
40 | Restart Claude Desktop.
41 |
```
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
```json
1 | {
2 | "name": "@modelcontextprotocol/server-code-context",
3 | "version": "0.1.0",
4 | "description": "MCP server for code context from local git repositories",
5 | "license": "MIT",
6 | "type": "module",
7 | "bin": {
8 | "mcp-server-code-context": "dist/index.js"
9 | },
10 | "files": [
11 | "dist"
12 | ],
13 | "scripts": {
14 | "build": "tsc && shx chmod +x dist/*.js",
15 | "watch": "tsc --watch",
16 | "start": "node dist/index.js",
17 | "start:mcp": "node dist/start.js",
18 | "inspect": "npm run build && npx @modelcontextprotocol/inspector node dist/index.js"
19 | },
20 | "dependencies": {
21 | "@langchain/textsplitters": "^0.1.0",
22 | "@modelcontextprotocol/sdk": "1.0.1",
23 | "axios": "^1.8.4",
24 | "better-sqlite3": "^11.9.1",
25 | "express": "^4.21.1",
26 | "simple-git": "^3.20.0",
27 | "zod": "^3.23.8",
28 | "zod-to-json-schema": "^3.23.5"
29 | },
30 | "devDependencies": {
31 | "@types/better-sqlite3": "^7.6.4",
32 | "@types/express": "^5.0.0",
33 | "@types/jest": "^29.5.14",
34 | "@types/minimatch": "^5.1.2",
35 | "@types/node": "^20.10.0",
36 | "jest": "^29.7.0",
37 | "jest-environment-node-single-context": "^29.4.0",
38 | "shx": "^0.3.4",
39 | "ts-jest": "^29.3.0",
40 | "tsx": "^4.19.3",
41 | "typescript": "^5.6.2"
42 | }
43 | }
44 |
```
--------------------------------------------------------------------------------
/utils/filePatternMatcher.ts:
--------------------------------------------------------------------------------
```typescript
/**
 * Convert a glob pattern to an SQL LIKE pattern.
 *
 * Every `**` and every remaining `*` becomes the LIKE wildcard `%`; all other
 * characters are passed through unchanged.
 *
 * NOTE(review): `_` and `%` already present in the glob keep their LIKE
 * wildcard meaning, so e.g. `my_file.ts` also matches `myXfile.ts`.
 *
 * @param pattern Glob pattern such as `**\/*.ts`.
 * @returns The pattern with glob stars replaced by `%`.
 */
export function globToSqlPattern(pattern: string): string {
  // `**` must be replaced first so it yields one `%` rather than two.
  return pattern.replace(/\*\*/g, '%').replace(/\*/g, '%');
}

/**
 * Wrap a value in single quotes for use as an SQL string literal, doubling
 * embedded single quotes so the value cannot terminate the literal early.
 */
function toSqlStringLiteral(value: string): string {
  return `'${value.replace(/'/g, "''")}'`;
}

/**
 * Create SQL WHERE conditions for file pattern filtering.
 *
 * The result is a fragment meant to be appended to an existing WHERE clause
 * that aliases the file table as `f`. It is either empty or starts with
 * ` AND `. Patterns are inlined as escaped string literals (not bound
 * parameters), so a pattern containing `'` can neither break the statement
 * nor inject SQL.
 *
 * @param includePatterns Globs of which a path must match at least one.
 * @param excludePatterns Globs of which a path must match none.
 * @returns The SQL fragment, or `''` when no patterns are given.
 */
export function createFilePatternCondition(
  includePatterns: string[] | undefined,
  excludePatterns: string[] | undefined
): string {
  let conditions = '';

  // Include patterns (files must match at least one pattern)
  if (includePatterns && includePatterns.length > 0) {
    const includeConditions = includePatterns.map(
      (pattern) => `f.path LIKE ${toSqlStringLiteral(globToSqlPattern(pattern))}`
    );
    conditions += ` AND (${includeConditions.join(' OR ')})`;
  }

  // Exclude patterns (files must not match any pattern)
  if (excludePatterns && excludePatterns.length > 0) {
    const excludeConditions = excludePatterns.map(
      (pattern) => `f.path NOT LIKE ${toSqlStringLiteral(globToSqlPattern(pattern))}`
    );
    conditions += ` AND (${excludeConditions.join(' AND ')})`;
  }

  return conditions;
}
44 |
```
--------------------------------------------------------------------------------
/start.ts:
--------------------------------------------------------------------------------
```typescript
1 | #!/usr/bin/env node
2 |
3 | import { spawn } from 'child_process';
4 | import { existsSync, mkdirSync } from 'fs';
5 | import { dirname, join } from 'path';
6 | import { fileURLToPath } from 'url';
7 |
const __dirname = dirname(fileURLToPath(import.meta.url));

/**
 * Returns the user's home directory from the environment.
 * `HOME` is normally unset on Windows, so `USERPROFILE` is used as a fallback.
 * Exits with a clear message when neither is available.
 */
const resolveHomeDir = (): string => {
  const home = process.env.HOME || process.env.USERPROFILE;
  if (home) {
    return home;
  }
  process.stderr.write(
    'Error: cannot determine the home directory; set DATA_DIR and REPO_CONFIG_DIR explicitly\n'
  );
  return process.exit(1);
};

// The home directory is only looked up when an override is missing.
const DATA_DIR = process.env.DATA_DIR || join(resolveHomeDir(), '.config', 'Claude', 'data');
const REPO_CONFIG_DIR =
  process.env.REPO_CONFIG_DIR || join(resolveHomeDir(), '.config', 'Claude', 'repos');
const NODE_ENV = process.env.NODE_ENV || 'development';

// A recursive mkdir is a no-op for directories that already exist.
for (const dir of [DATA_DIR, REPO_CONFIG_DIR]) {
  mkdirSync(dir, { recursive: true, mode: 0o755 });
}

// All diagnostics go to stderr; the child inherits stdout for the MCP protocol.
process.stderr.write(`Starting Code Context MCP Server\n`);
process.stderr.write(`Data Directory: ${DATA_DIR}\n`);
process.stderr.write(`Repo Config: ${REPO_CONFIG_DIR}\n`);
process.stderr.write(`Node Environment: ${NODE_ENV}\n\n`);

/**
 * Launches the compiled MCP server (`index.js` next to this script) as a child
 * process and mirrors its exit status.
 */
const startMcpServer = (): void => {
  const serverPath = join(__dirname, 'index.js');

  if (!existsSync(serverPath)) {
    process.stderr.write(`Error: MCP server not found at ${serverPath}\n`);
    process.stderr.write('Run: npm run build\n');
    process.exit(1);
  }

  // The child inherits process.env, so publish the resolved values there.
  process.env.DATA_DIR = DATA_DIR;
  process.env.REPO_CONFIG_DIR = REPO_CONFIG_DIR;
  process.env.NODE_ENV = NODE_ENV;

  // Reuse the Node binary running this launcher; `node` may not be on PATH
  // when the launcher was started through an absolute interpreter path.
  const server = spawn(process.execPath, [serverPath, ...process.argv.slice(2)], {
    stdio: 'inherit',
    cwd: __dirname
  });

  server.on('error', (err) => {
    process.stderr.write(`Error: failed to launch MCP server: ${err.message}\n`);
    process.exit(1);
  });
  // A child killed by a signal reports `code === null`; treat that as failure.
  server.on('exit', (code, signal) => process.exit(code ?? (signal ? 1 : 0)));
};

/**
 * Starts the MCP server, first trying to start `ollama serve` when `pgrep`
 * finds no running Ollama process. Spawn failures (e.g. `pgrep` or `ollama`
 * not installed) arrive as 'error' events rather than exceptions, so they are
 * handled explicitly instead of crashing the launcher.
 */
const checkOllama = (): void => {
  let started = false;
  const startOnce = (): void => {
    if (!started) {
      started = true;
      startMcpServer();
    }
  };

  try {
    const probe = spawn('pgrep', ['ollama'], { stdio: 'ignore' });

    // pgrep unavailable: skip the check and start the server anyway.
    probe.on('error', startOnce);

    probe.on('exit', (code) => {
      if (started) {
        return; // 'exit' may follow 'error'; the server is already starting.
      }
      if (code === 0) {
        startOnce();
        return;
      }

      process.stderr.write('Starting Ollama...\n');
      const ollama = spawn('ollama', ['serve'], { detached: true, stdio: 'ignore' });
      ollama.on('error', (err) => {
        process.stderr.write(`Warning: could not start Ollama: ${err.message}\n`);
      });
      ollama.unref();
      // Give Ollama a moment to start listening before the server needs it.
      setTimeout(startOnce, 3000);
    });
  } catch {
    startOnce();
  }
};

checkOllama();
64 |
```
--------------------------------------------------------------------------------
/utils/ollamaEmbeddings.ts:
--------------------------------------------------------------------------------
```typescript
1 | import axios from "axios";
2 | import config from "../config.js";
3 |
// Whether the one-time initialization message has been logged yet.
let apiInitialized = false;

/**
 * Generate embeddings for text using the Ollama `/api/embed` endpoint.
 *
 * Texts are sent sequentially in batches; each request carries the whole
 * batch as `input`. The result has one embedding per input text, in order.
 *
 * @param texts Array of text strings to embed
 * @param embeddingModel Optional model configuration to use
 * @returns Promise containing array of embeddings
 * @throws Rethrows any request or validation error, except when
 *   `config.ENV === "test"`, where mock embeddings are returned instead.
 */
export async function generateOllamaEmbeddings(
  texts: string[],
  embeddingModel: {
    model: string;
    contextSize: number;
    dimensions: number;
    baseUrl?: string;
  } = config.EMBEDDING_MODEL
): Promise<number[][]> {
  try {
    // Log initialization once per process
    if (!apiInitialized) {
      console.error(
        `Initializing Ollama embeddings with model: ${embeddingModel.model}...`
      );
      apiInitialized = true;
    }

    const baseUrl = embeddingModel.baseUrl || "http://127.0.0.1:11434";
    const embeddings: number[][] = [];

    console.error(`Generating embeddings for ${texts.length} chunks...`);
    // Maximum number of texts sent in a single request.
    const batchSize = 1000;
    for (let i = 0; i < texts.length; i += batchSize) {
      const batch = texts.slice(i, i + batchSize);
      const response = await axios.post(
        `${baseUrl}/api/embed`,
        {
          model: embeddingModel.model,
          input: batch,
          options: {
            num_ctx: embeddingModel.contextSize,
          },
        },
        {
          headers: {
            "Content-Type": "application/json",
          },
        }
      );

      // Callers pair embeddings with texts by index, so a missing or
      // short result must fail loudly instead of silently misaligning them.
      const batchEmbeddings: unknown = response.data?.embeddings;
      if (
        !Array.isArray(batchEmbeddings) ||
        batchEmbeddings.length !== batch.length
      ) {
        const received = Array.isArray(batchEmbeddings)
          ? batchEmbeddings.length
          : 0;
        throw new Error(
          `Ollama returned ${received} embeddings for ${batch.length} inputs`
        );
      }
      embeddings.push(...(batchEmbeddings as number[][]));
    }

    console.error(`Successfully generated ${embeddings.length} embeddings`);
    return embeddings;
  } catch (error) {
    console.error("Error generating embeddings:", error);

    // For testing purposes, return mock embeddings if running in test environment
    if (config.ENV === "test") {
      console.error("Using mock embeddings for testing");
      return texts.map(() => generateMockEmbedding(embeddingModel.dimensions));
    }

    throw error;
  }
}
72 |
/**
 * Build a random unit-length vector to stand in for a real embedding in tests.
 * @param dimensions Number of components in the vector
 * @returns Random components in [-0.5, 0.5) scaled to Euclidean length 1
 */
function generateMockEmbedding(dimensions: number): number[] {
  const components = Array.from(
    { length: dimensions },
    () => Math.random() - 0.5
  );

  // Accumulate the squared length, then scale every component by its root.
  let sumOfSquares = 0;
  for (const component of components) {
    sumOfSquares += component * component;
  }
  const norm = Math.sqrt(sumOfSquares);

  return components.map((component) => component / norm);
}
86 |
```
--------------------------------------------------------------------------------
/utils/repoConfig.ts:
--------------------------------------------------------------------------------
```typescript
1 | import { existsSync, writeFileSync, readFileSync, mkdirSync } from 'fs';
2 | import { join, basename } from 'path';
3 | import { createHash } from 'crypto';
4 | import config from '../config.js';
5 |
/** Metadata persisted (as JSON) for every repository URL that has been seen. */
interface RepoConfig {
  url: string;
  localPath?: string;
  lastAccessed: number;
  type: 'local' | 'remote' | 'cached';
  branch?: string;
}

/**
 * Maps repository URLs to local paths and keeps one JSON metadata file per
 * URL inside the configured repo config directory.
 */
export class RepositoryConfigManager {
  private configDir: string;

  constructor() {
    this.configDir = config.REPO_CONFIG_DIR;
    if (!existsSync(this.configDir)) {
      mkdirSync(this.configDir, { recursive: true });
    }
  }

  /** Metadata file for a URL: `<md5 of the URL>.json` inside the config dir. */
  private getConfigPath(repoUrl: string): string {
    const hash = createHash('md5').update(repoUrl).digest('hex');
    return join(this.configDir, `${hash}.json`);
  }

  /**
   * For `file://` URLs, returns the path after the scheme if it exists on
   * disk; returns null for missing paths and for every other URL.
   */
  private sanitizeLocalPath(repoUrl: string): string | null {
    if (repoUrl.startsWith('file://')) {
      const localPath = repoUrl.replace('file://', '');
      return existsSync(localPath) ? localPath : null;
    }
    return null;
  }

  /**
   * Resolves the local path for a repository and records the access in its
   * metadata file.
   *
   * @param repoUrl Repository URL (`file://` for an existing local checkout).
   * @param branch Branch stored in newly created metadata; ignored when
   *   metadata for a remote URL already exists.
   * @returns The local path (empty string if none is recorded) and metadata.
   */
  getRepositoryPath(repoUrl: string, branch?: string): { path: string; config: RepoConfig } {
    const localPath = this.sanitizeLocalPath(repoUrl);

    if (localPath) {
      const repoConfig: RepoConfig = {
        url: repoUrl,
        localPath,
        lastAccessed: Date.now(),
        type: 'local',
        branch
      };

      this.saveConfig(repoUrl, repoConfig);
      return { path: localPath, config: repoConfig };
    }

    const configPath = this.getConfigPath(repoUrl);
    let repoConfig: RepoConfig;

    if (existsSync(configPath)) {
      try {
        repoConfig = JSON.parse(readFileSync(configPath, 'utf8'));
        repoConfig.lastAccessed = Date.now();
      } catch {
        // Unreadable or corrupt metadata: start over with a fresh entry.
        repoConfig = this.createRemoteConfig(repoUrl, branch);
      }
    } else {
      repoConfig = this.createRemoteConfig(repoUrl, branch);
    }

    this.saveConfig(repoUrl, repoConfig);
    return { path: repoConfig.localPath || '', config: repoConfig };
  }

  /**
   * Builds metadata for a remote repository whose clone lives under
   * `<configDir>/cache/<repo name>`.
   *
   * NOTE(review): the directory is named after the last URL segment only, so
   * two repositories with the same name from different owners share a path.
   */
  private createRemoteConfig(repoUrl: string, branch?: string): RepoConfig {
    // Strip trailing slashes, then only a *trailing* ".git". Removing the
    // first ".git" anywhere would corrupt URLs such as
    // "https://www.github.com/u/repo.git" and leave the suffix in place.
    const repoName = basename(repoUrl.replace(/\/+$/, '').replace(/\.git$/, ''));
    const cacheDir = join(this.configDir, 'cache');

    if (!existsSync(cacheDir)) {
      mkdirSync(cacheDir, { recursive: true });
    }

    return {
      url: repoUrl,
      localPath: join(cacheDir, repoName),
      lastAccessed: Date.now(),
      type: 'remote',
      branch
    };
  }

  /** Writes the metadata for a URL as pretty-printed JSON. */
  private saveConfig(repoUrl: string, config: RepoConfig): void {
    const configPath = this.getConfigPath(repoUrl);
    writeFileSync(configPath, JSON.stringify(config, null, 2));
  }

  /** True when the URL uses the `file://` scheme. */
  isLocalRepository(repoUrl: string): boolean {
    return repoUrl.startsWith('file://');
  }

  /**
   * True when a non-local repository has no existing local path yet.
   * Resolving the path also refreshes the repository's metadata file.
   */
  needsCloning(repoUrl: string): boolean {
    if (this.isLocalRepository(repoUrl)) {
      return false;
    }

    const { config } = this.getRepositoryPath(repoUrl);
    return !config.localPath || !existsSync(config.localPath);
  }

  /** Classifies a URL as `'local'` (`file://`) or `'remote'`. */
  getRepoType(repoUrl: string): 'local' | 'remote' {
    return this.isLocalRepository(repoUrl) ? 'local' : 'remote';
  }
}
110 |
111 | export const repoConfigManager = new RepositoryConfigManager();
112 |
```
--------------------------------------------------------------------------------
/index.ts:
--------------------------------------------------------------------------------
```typescript
1 | #!/usr/bin/env node
2 | import { Server } from "@modelcontextprotocol/sdk/server/index.js";
3 | import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
4 | import {
5 | CallToolRequestSchema,
6 | ErrorCode,
7 | ListToolsRequestSchema,
8 | McpError,
9 | } from "@modelcontextprotocol/sdk/types.js";
10 | import { QueryRepoSchema, queryRepo } from "./tools/queryRepo.js";
11 | import { zodToJsonSchema } from "zod-to-json-schema";
12 | import { z } from "zod";
13 | import { ProgressNotifier } from "utils/types.js";
14 |
/** Names of the tools this server registers with MCP clients. */
const ToolName = {
  QUERY_REPO: "query_repo",
} as const;
// Keeps `ToolName` usable as a type as well as a value.
type ToolName = (typeof ToolName)[keyof typeof ToolName];
18 |
/**
 * MCP server that exposes the `query_repo` tool over stdio.
 */
class CodeContextServer {
  private server: Server;

  constructor() {
    this.server = new Server(
      {
        name: "code-context-mcp",
        version: "0.1.0",
      },
      {
        capabilities: {
          tools: {},
        },
      }
    );

    this.setupToolHandlers();

    // Error handling
    this.server.onerror = (error) => console.error("[MCP Error]", error);
    process.on("SIGINT", async () => {
      await this.server.close();
      process.exit(0);
    });
  }

  /**
   * Builds a notifier that forwards progress to the client, or returns
   * undefined when the request carried no progress token.
   */
  private createProgressNotifier(
    progressToken: string | number | undefined
  ): ProgressNotifier | undefined {
    if (progressToken === undefined) {
      return undefined;
    }

    return {
      sendProgress: async (progress: number, total: number) => {
        await this.server.notification({
          method: "notifications/progress",
          params: {
            // Both values are scaled by 100 before being sent.
            progress: Math.floor(progress * 100),
            total: total * 100,
            progressToken,
          },
        });
      },
    };
  }

  /** Registers the tool listing and tool invocation handlers. */
  private setupToolHandlers() {
    this.server.setRequestHandler(ListToolsRequestSchema, async () => ({
      tools: [
        {
          name: ToolName.QUERY_REPO,
          description: "Queries a git repository using semantic and keyword search. Use keywords and file patterns if you want to target specific files or terms",
          inputSchema: zodToJsonSchema(QueryRepoSchema),
        },
      ],
    }));

    this.server.setRequestHandler(CallToolRequestSchema, async (request) => {
      const { name, arguments: input } = request.params;
      const progressToken = request.params._meta?.progressToken;

      switch (name) {
        case ToolName.QUERY_REPO:
          try {
            // NOTE(review): `input` is only type-asserted here, not validated
            // against QueryRepoSchema — confirm queryRepo validates it.
            const result = await queryRepo(
              input as z.infer<typeof QueryRepoSchema>,
              this.createProgressNotifier(progressToken)
            );

            // Format the response in Claude's expected structure
            return {
              content: [
                {
                  type: "text",
                  text: JSON.stringify(result),
                },
              ],
            };
          } catch (error) {
            console.error("Error in query_repo:", error);
            // Flag the result as an error so clients do not mistake the
            // message for a successful tool output.
            return {
              content: [
                {
                  type: "text",
                  text: `Error executing query: ${error instanceof Error ? error.message : String(error)}`,
                },
              ],
              isError: true,
            };
          }
        default:
          throw new McpError(ErrorCode.MethodNotFound, `Unknown tool: ${name}`);
      }
    });
  }

  /** Connects the server to a stdio transport. */
  async run() {
    const transport = new StdioServerTransport();
    await this.server.connect(transport);
    // stderr only: stdout is reserved for the MCP protocol stream.
    console.error("Code Context MCP server running on stdio");
  }
}
119 |
// Start the server; a startup failure must end the process with a non-zero
// status so a supervising launcher can see that the server did not come up.
const server = new CodeContextServer();
server.run().catch((error: unknown) => {
  console.error("Failed to start Code Context MCP server:", error);
  process.exit(1);
});
122 |
```
--------------------------------------------------------------------------------
/utils/db.ts:
--------------------------------------------------------------------------------
```typescript
1 | import Database from "better-sqlite3";
2 | import fs from "fs";
3 | import path from "path";
4 | import config from "../config.js";
5 |
// Ensure the data directory exists
const DATA_DIR = config.DATA_DIR;
if (!fs.existsSync(DATA_DIR)) {
  fs.mkdirSync(DATA_DIR, { recursive: true });
}

// Honor the configured database file instead of a hard-coded name.
// path.resolve keeps a relative DB_PATH inside DATA_DIR and lets an absolute
// DB_PATH win outright.
const DB_PATH = path.resolve(DATA_DIR, config.DB_PATH);
// DB_PATH may point into a subdirectory that does not exist yet.
fs.mkdirSync(path.dirname(DB_PATH), { recursive: true });
const db = new Database(DB_PATH);

console.error(`Using db at: ${DB_PATH}`)

// Enable foreign keys
db.pragma("foreign_keys = ON");
19 |
// SQL schema for the database.
//
// Five tables, each created only if missing:
//   repository              - one row per repository, unique by path
//   branch                  - branches of a repository, with a processing status
//   file                    - files of a repository, unique by (repository, path, sha)
//   branch_file_association - many-to-many link between branches and files
//   file_chunk              - chunks of a file plus their embedding (stored as TEXT)
//
// Every foreign key uses ON DELETE CASCADE, so deleting a parent row removes
// its dependent rows (foreign key enforcement is switched on via a pragma).
export const SCHEMA_SQL = `
CREATE TABLE IF NOT EXISTS repository (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
path TEXT NOT NULL,
last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(path)
);

CREATE TABLE IF NOT EXISTS branch (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
repository_id INTEGER NOT NULL,
last_commit_sha TEXT NOT NULL,
status TEXT CHECK(status IN ('pending', 'files_processed', 'embeddings_generated')) DEFAULT 'pending',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (repository_id) REFERENCES repository(id) ON DELETE CASCADE,
UNIQUE(name, repository_id)
);

CREATE TABLE IF NOT EXISTS file (
id INTEGER PRIMARY KEY AUTOINCREMENT,
repository_id INTEGER NOT NULL,
path TEXT NOT NULL,
name TEXT NOT NULL,
sha TEXT NOT NULL,
status TEXT CHECK(status IN ('pending', 'fetched', 'ingested', 'done')) DEFAULT 'pending',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (repository_id) REFERENCES repository(id) ON DELETE CASCADE,
UNIQUE(repository_id, path, sha)
);

CREATE TABLE IF NOT EXISTS branch_file_association (
branch_id INTEGER NOT NULL,
file_id INTEGER NOT NULL,
PRIMARY KEY (branch_id, file_id),
FOREIGN KEY (branch_id) REFERENCES branch(id) ON DELETE CASCADE,
FOREIGN KEY (file_id) REFERENCES file(id) ON DELETE CASCADE
);

CREATE TABLE IF NOT EXISTS file_chunk (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_id INTEGER NOT NULL,
content TEXT NOT NULL,
chunk_number INTEGER NOT NULL,
embedding TEXT,
model_version TEXT,
token_count INTEGER,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (file_id) REFERENCES file(id) ON DELETE CASCADE,
UNIQUE(file_id, chunk_number)
);
`;
77 |
/**
 * Create all tables declared in SCHEMA_SQL (each is `IF NOT EXISTS`, so the
 * call is safe to repeat).
 *
 * The script is handed to `db.exec` as a whole: it executes multi-statement
 * SQL natively, which avoids splitting on ";" by hand (that would break on
 * any semicolon inside a string literal or comment).
 *
 * @throws Rethrows any database error after logging it.
 */
export const initializeDatabase = () => {
  try {
    db.exec(SCHEMA_SQL);
  } catch (error) {
    console.error("Error initializing database:", error);
    throw error;
  }
};
95 |
/** Prepare `sql` and execute it with `params`, returning the run result. */
function run(sql: string, params: any = {}) {
  const statement = db.prepare(sql);
  return statement.run(params);
}

/** Prepare `sql` and return the first matching row for `params`. */
function get(sql: string, params: any = {}) {
  const statement = db.prepare(sql);
  return statement.get(params);
}

/** Prepare `sql` and return every matching row for `params`. */
function all(sql: string, params: any = {}) {
  const statement = db.prepare(sql);
  return statement.all(params);
}
110 |
/**
 * Minimal shape of the database handle available inside a transaction
 * callback: `prepare` yields a statement offering `run`, `get` and `all`.
 */
export interface DatabaseOperations {
  prepare: (sql: string) => {
    run: (params?: any) => any;
    get: (params?: any) => any;
    all: (params?: any) => any;
  };
}
119 |
/**
 * Run `cb` inside a database transaction and return its result.
 * The callback receives the underlying database handle as its argument.
 */
const transaction = (cb: (dbOps: any) => any): any => db.transaction(cb)(db);
125 |
126 | // Define a public interface for our database module
127 | export interface DatabaseInterface {
128 | run: (sql: string, params?: any) => any;
129 | get: (sql: string, params?: any) => any;
130 | all: (sql: string, params?: any) => any;
131 | transaction: (cb: (dbOps: any) => any) => any;
132 | close: () => void;
133 | }
134 |
135 | // Initialize the database
136 | initializeDatabase();
137 |
138 | // Export the database interface
139 | const dbInterface: DatabaseInterface = {
140 | run,
141 | get,
142 | all,
143 | transaction,
144 | close: () => db.close(),
145 | };
146 |
147 | export default dbInterface;
148 |
```
--------------------------------------------------------------------------------
/tools/embedFiles.ts:
--------------------------------------------------------------------------------
```typescript
1 | import { z } from "zod";
2 | import dbInterface from "../utils/db.js";
3 | import { generateOllamaEmbeddings } from "../utils/ollamaEmbeddings.js";
4 | import { ProgressNotifier } from "../utils/types.js";
5 | import config from "../config.js";
6 |
7 | // Define input schema for embedFiles
8 | export const EmbedFilesSchema = z.object({
9 | repoLocalPath: z.string().describe("Local path to the cloned repository"),
10 | branchId: z.number().describe("Branch ID in the database"),
11 | _meta: z
12 | .object({
13 | progressToken: z.union([z.string(), z.number()]).optional(),
14 | })
15 | .optional(),
16 | });
17 |
18 | // Define chunk interface
19 | interface Chunk {
20 | id: number;
21 | content: string;
22 | file_id: number;
23 | }
24 |
25 | /**
26 |  * Generates and stores embeddings for every chunk of a branch that lacks one.
27 |  *
28 |  * Chunks are embedded in batches; each batch is written through one
29 |  * dbInterface.transaction call and the branch is then marked 'embeddings_generated'.
30 |  *
31 |  * @param input Tool arguments; only `branchId` is read here.
32 |  * @param progressNotifier Optional notifier, called as sendProgress(fraction, 1).
33 |  * @returns `{ success, chunksProcessed }`, or `{ error: { message } }` on failure.
34 |  */
35 | export async function embedFiles(
36 |   input: z.infer<typeof EmbedFilesSchema>,
37 |   progressNotifier?: ProgressNotifier
38 | ) {
39 |   // Every successful exit path flips the same status flag, so do it in one place.
40 |   const markEmbeddingsGenerated = (branchId: number) => {
41 |     console.error(`[embedFiles] Setting branch status to 'embeddings_generated'`);
42 |     dbInterface.run(
43 |       "UPDATE branch SET status = 'embeddings_generated' WHERE id = ?",
44 |       branchId
45 |     );
46 |   };
47 |
48 |   try {
49 |     console.error(
50 |       `[embedFiles] Starting with parameters: ${JSON.stringify(input)}`
51 |     );
52 |
53 |     // Tolerate a missing input object instead of failing on destructuring.
54 |     if (!input) {
55 |       console.error(`[embedFiles] Error: Input parameters are undefined`);
56 |       return {
57 |         error: {
58 |           message: "Input parameters are required for embedFiles tool",
59 |         },
60 |       };
61 |     }
62 |
63 |     const startTime = Date.now();
64 |     const { branchId } = input;
65 |
66 |     // Bail out early when the branch row is missing.
67 |     const branchExists = dbInterface.get(
68 |       "SELECT id, status FROM branch WHERE id = ?",
69 |       branchId
70 |     );
71 |     if (!branchExists) {
72 |       console.error(`[embedFiles] Error: Branch with ID ${branchId} does not exist`);
73 |       return {
74 |         error: {
75 |           message: `Branch with ID ${branchId} does not exist`,
76 |         },
77 |       };
78 |     }
79 |
80 |     // A branch with no associated files has nothing to embed but still counts as done.
81 |     const fileCount = dbInterface.get(
82 |       "SELECT COUNT(*) as count FROM branch_file_association WHERE branch_id = ?",
83 |       branchId
84 |     );
85 |     if (!fileCount || fileCount.count === 0) {
86 |       console.error(`[embedFiles] No files found for branch ${branchId}`);
87 |       markEmbeddingsGenerated(branchId);
88 |       return { success: true, chunksProcessed: 0 };
89 |     }
90 |
91 |     // Only chunks whose embedding column is still NULL are selected.
92 |     console.error(`[embedFiles] Finding chunks that need embeddings for branch ${branchId}`);
93 |     const chunks: Chunk[] = dbInterface.all(
94 |       `SELECT fc.id, fc.content, f.id as file_id
95 |        FROM file_chunk fc
96 |        JOIN file f ON fc.file_id = f.id
97 |        JOIN branch_file_association bfa ON f.id = bfa.file_id
98 |        WHERE bfa.branch_id = ?
99 |        AND fc.embedding IS NULL`,
100 |       branchId
101 |     );
102 |     if (chunks.length === 0) {
103 |       console.error(`[embedFiles] No chunks need embeddings, skipping`);
104 |       markEmbeddingsGenerated(branchId);
105 |       if (progressNotifier) {
106 |         await progressNotifier.sendProgress(1, 1);
107 |       }
108 |       return { success: true, chunksProcessed: 0 };
109 |     }
110 |
111 |     console.error(`[embedFiles] Found ${chunks.length} chunks that need embeddings`);
112 |
113 |     const BATCH_SIZE = 100;
114 |     const totalChunks = chunks.length;
115 |     const totalBatches = Math.ceil(totalChunks / BATCH_SIZE);
116 |     let processedChunks = 0;
117 |
118 |     for (let i = 0; i < totalChunks; i += BATCH_SIZE) {
119 |       const batch = chunks.slice(i, i + BATCH_SIZE);
120 |       console.error(
121 |         `[embedFiles] Processing batch ${Math.floor(i / BATCH_SIZE) + 1}/${totalBatches}`
122 |       );
123 |
124 |       // One embedding request per batch, passing the chunk texts in batch order.
125 |       console.error(`[embedFiles] Generating embeddings for ${batch.length} chunks`);
126 |       const embeddingStartTime = Date.now();
127 |       const embeddings = await generateOllamaEmbeddings(
128 |         batch.map((chunk) => chunk.content)
129 |       );
130 |       console.error(
131 |         `[embedFiles] Generated embeddings in ${Date.now() - embeddingStartTime}ms`
132 |       );
133 |
134 |       // Vectors are matched to chunks by index below, so a response of any other
135 |       // length cannot be stored reliably; fail with a clear message instead.
136 |       if (!Array.isArray(embeddings) || embeddings.length !== batch.length) {
137 |         throw new Error(
138 |           `Expected ${batch.length} embeddings but received ${
139 |             Array.isArray(embeddings) ? embeddings.length : "a non-array result"
140 |           }`
141 |         );
142 |       }
143 |
144 |       // Write the whole batch inside a single dbInterface.transaction call.
145 |       console.error(`[embedFiles] Storing embeddings`);
146 |       dbInterface.transaction((db) => {
147 |         const updateStmt = db.prepare(
148 |           `UPDATE file_chunk
149 |            SET embedding = ?, model_version = ?
150 |            WHERE id = ?`
151 |         );
152 |         batch.forEach((chunk, j) => {
153 |           updateStmt.run(JSON.stringify(embeddings[j]), config.EMBEDDING_MODEL.model, chunk.id);
154 |         });
155 |       });
156 |
157 |       processedChunks += batch.length;
158 |       if (progressNotifier) {
159 |         await progressNotifier.sendProgress(processedChunks / totalChunks, 1);
160 |       }
161 |     }
162 |
163 |     markEmbeddingsGenerated(branchId);
164 |     console.error(
165 |       `[embedFiles] Processed ${processedChunks} chunks in ${
166 |         Date.now() - startTime
167 |       }ms`
168 |     );
169 |     return { success: true, chunksProcessed: processedChunks };
170 |   } catch (error) {
171 |     console.error(`[embedFiles] Error executing tool:`, error);
172 |     return {
173 |       error: {
174 |         message: `Error executing embedFiles tool: ${
175 |           error instanceof Error ? error.message : String(error)
176 |         }`,
177 |       },
178 |     };
179 |   }
180 | }
181 |
```
--------------------------------------------------------------------------------
/tools/ingestBranch.ts:
--------------------------------------------------------------------------------
```typescript
1 | import { z } from "zod";
2 | import { simpleGit } from "simple-git";
3 | import path from "path";
4 | import fs from "fs";
5 | import dbInterface from "../utils/db.js";
6 | import { ProgressNotifier } from "../utils/types.js";
7 | import config from "../config.js";
8 | import { repoConfigManager } from "../utils/repoConfig.js";
9 |
10 | // Define input schema for ingestBranch
11 | export const IngestBranchSchema = z.object({
12 | repoUrl: z.string().describe("GitHub repository URL"),
13 | branch: z
14 | .string()
15 | .optional()
16 | .describe("Branch name to query (defaults to repository's default branch)"),
17 | _meta: z
18 | .object({
19 | progressToken: z.union([z.string(), z.number()]).optional(),
20 | })
21 | .optional(),
22 | });
23 |
24 | // Define chunk interface
25 | interface Chunk {
26 | content: string;
27 | chunkNumber: number;
28 | tokenCount: number;
29 | }
30 |
31 |
32 | // Ensures a working copy of `repoUrl` exists under `localPath`.
33 | // Resolves to the directory path, whether it was just cloned or already there.
34 | const cloneRepository = async (
35 |   repoUrl: string,
36 |   localPath: string
37 | ): Promise<string> => {
38 |   // Target directory: <localPath>/<last URL segment without a ".git" suffix>.
39 |   const targetDir = path.join(localPath, path.basename(repoUrl, ".git"));
40 |
41 |   // An existing directory is reused as-is and no clone is attempted.
42 |   const alreadyPresent = fs.existsSync(targetDir);
43 |   if (alreadyPresent) {
44 |     console.error(`Repository already exists at ${targetDir}`);
45 |     return targetDir;
46 |   }
47 |
48 |   // Otherwise clone into the target directory and hand its path back.
49 |   console.error(`Cloning repository ${repoUrl} to ${targetDir}`);
50 |   await simpleGit().clone(repoUrl, targetDir);
51 |   return targetDir;
52 | };
53 |
54 | // Modified cloneRepository function wrapper that reports progress
55 | /**
56 |  * Clones via cloneRepository while emitting synthetic progress updates.
57 |  * The clone call itself reports nothing here, so a timer advances the reported
58 |  * value in 2% steps up to 30%; 33% is sent once cloneRepository resolves.
59 |  * @param repoUrl Repository URL handed to cloneRepository.
60 |  * @param reposDir Parent directory handed to cloneRepository.
61 |  * @param progressNotifier Optional receiver of sendProgress(value, 1) calls.
62 |  * @returns The path returned by cloneRepository.
63 |  */
64 | async function cloneRepositoryWithProgress(
65 |   repoUrl: string,
66 |   reposDir: string,
67 |   progressNotifier?: ProgressNotifier
68 | ): Promise<string> {
69 |   const progressInterval = 1500; // ms between synthetic updates
70 |   const maxProgress = 0.3; // cap for synthetic updates (33% marks completion)
71 |   const progressStep = 0.02; // each tick advances the reported value by 2%
72 |   let ticks = 0;
73 |   let isCloning = true;
74 |   let timer: NodeJS.Timeout | null = null;
75 |
76 |   if (progressNotifier) {
77 |     const notifier = progressNotifier;
78 |     await notifier.sendProgress(0, 1);
79 |     timer = setInterval(async () => {
80 |       if (!isCloning || ticks * progressStep >= maxProgress) return;
81 |       ticks += 1;
82 |       try {
83 |         // Derived from the tick count and clamped, so float drift cannot overshoot.
84 |         await notifier.sendProgress(Math.min(ticks * progressStep, maxProgress), 1);
85 |       } catch (err) {
86 |         // Nothing awaits this callback; log here to avoid an unhandled rejection.
87 |         console.error("Error sending clone progress:", err);
88 |       }
89 |     }, progressInterval);
90 |   }
91 |
92 |   try {
93 |     const repoLocalPath = await cloneRepository(repoUrl, reposDir);
94 |     isCloning = false;
95 |     if (progressNotifier) {
96 |       await progressNotifier.sendProgress(0.33, 1);
97 |     }
98 |     return repoLocalPath;
99 |   } finally {
100 |     // Stop the timer whether the clone succeeded or threw.
101 |     if (timer) clearInterval(timer);
102 |   }
103 | }
104 |
105 | /**
106 |  * Resolves the repository working copy and records repo/branch state in the DB.
107 |  *
108 |  * No files are read or chunked here; the returned `needsUpdate` flag tells the
109 |  * caller whether the branch still has to be processed.
110 |  *
111 |  * @param input Tool arguments: `repoUrl` (required) and optional `branch`.
112 |  * @param progressNotifier Optional notifier, forwarded only to the clone step.
113 |  * @returns Repository/branch identifiers plus `needsUpdate`, or
114 |  *   `{ error: { message } }` when validation fails or any step throws.
115 |  */
116 | export async function ingestBranch(
117 |   input: z.infer<typeof IngestBranchSchema>,
118 |   progressNotifier?: ProgressNotifier
119 | ) {
120 |   try {
121 |     console.error(
122 |       `[ingestBranch] Starting with parameters: ${JSON.stringify(input)}`
123 |     );
124 |
125 |     // Reject a missing input object before touching its fields.
126 |     if (!input) {
127 |       console.error(`[ingestBranch] Error: Input parameters are undefined`);
128 |       return {
129 |         error: {
130 |           message: "Input parameters are required for ingestBranch tool",
131 |         },
132 |       };
133 |     }
134 |
135 |     const startTime = Date.now();
136 |     const { repoUrl, branch } = input;
137 |
138 |     // repoUrl is the only mandatory field; branch may be resolved later.
139 |     if (!repoUrl) {
140 |       console.error(`[ingestBranch] Error: Missing required parameter repoUrl`);
141 |       return {
142 |         error: {
143 |           message: "Required parameter (repoUrl) is missing",
144 |         },
145 |       };
146 |     }
147 |
148 |     // The config manager supplies the working-copy path and the repository type.
149 |     const resolved = repoConfigManager.getRepositoryPath(repoUrl, branch);
150 |     const repoLocalPath = resolved.path;
151 |     const repoConfig = resolved.config;
152 |     let actualBranch = branch || "";
153 |
154 |     console.error(
155 |       `[ingestBranch] Processing repository: ${repoUrl}, type: ${repoConfig.type}, branch: ${actualBranch || 'default'}`
156 |     );
157 |
158 |     // Local repositories are used in place; remote ones are cloned only if needed.
159 |     if (repoConfig.type === 'local') {
160 |       console.error(`[ingestBranch] Using local repository at: ${repoLocalPath}`);
161 |     } else if (repoConfigManager.needsCloning(repoUrl)) {
162 |       console.error(`[ingestBranch] Cloning remote repository to: ${repoLocalPath}`);
163 |       await cloneRepositoryWithProgress(repoUrl, path.dirname(repoLocalPath), progressNotifier);
164 |     } else {
165 |       console.error(`[ingestBranch] Using cached repository at: ${repoLocalPath}`);
166 |     }
167 |
168 |     console.error(
169 |       `[ingestBranch] Repository cloned to: ${repoLocalPath} (${
170 |         Date.now() - startTime
171 |       }ms)`
172 |     );
173 |
174 |     const git = simpleGit(repoLocalPath);
175 |
176 |     // Without an explicit branch, ask git what HEAD currently refers to.
177 |     if (!actualBranch) {
178 |       console.error(`[ingestBranch] Branch not specified, getting default branch`);
179 |       try {
180 |         actualBranch = await git.revparse(['--abbrev-ref', 'HEAD']);
181 |         console.error(`[ingestBranch] Using default branch: ${actualBranch}`);
182 |       } catch (error) {
183 |         console.error(`[ingestBranch] Error getting default branch:`, error);
184 |         // Last resort when HEAD cannot be resolved.
185 |         actualBranch = "main";
186 |         console.error(`[ingestBranch] Falling back to branch: ${actualBranch}`);
187 |       }
188 |     }
189 |
190 |     console.error(`[ingestBranch] Checking out branch: ${actualBranch}`);
191 |     await git.checkout(actualBranch);
192 |     const latestCommit = await git.revparse([actualBranch]);
193 |     console.error(`[ingestBranch] Latest commit SHA: ${latestCommit}`);
194 |
195 |     // The repository row is looked up by the URL's last segment minus ".git".
196 |     const repoName = path.basename(repoUrl, ".git");
197 |     console.error(
198 |       `[ingestBranch] Checking if repo exists in database: ${repoName}`
199 |     );
200 |     const repoRow = dbInterface.get(
201 |       "SELECT id FROM repository WHERE name = ?",
202 |       repoName
203 |     );
204 |
205 |     let repoId;
206 |     if (!repoRow) {
207 |       console.error(`[ingestBranch] Registering new repository: ${repoName}`);
208 |       const inserted = dbInterface.run(
209 |         "INSERT INTO repository (name, path) VALUES (?, ?)",
210 |         [repoName, repoLocalPath]
211 |       );
212 |       repoId = inserted.lastInsertRowid;
213 |       console.error(`[ingestBranch] Repository registered with ID: ${repoId}`);
214 |     } else {
215 |       repoId = repoRow.id;
216 |       console.error(
217 |         `[ingestBranch] Repository found in database with ID: ${repoId}`
218 |       );
219 |     }
220 |
221 |     console.error(`[ingestBranch] Checking if branch exists in database`);
222 |     const branchRow = dbInterface.get(
223 |       "SELECT id, last_commit_sha, status FROM branch WHERE name = ? AND repository_id = ?",
224 |       [actualBranch, repoId]
225 |     );
226 |
227 |     let branchId;
228 |     let needsUpdate = false;
229 |     if (!branchRow) {
230 |       // First time this branch is seen: insert it with status 'pending'.
231 |       console.error(`[ingestBranch] Registering new branch: ${actualBranch}`);
232 |       const inserted = dbInterface.run(
233 |         "INSERT INTO branch (name, repository_id, last_commit_sha, status) VALUES (?, ?, ?, 'pending')",
234 |         [actualBranch, repoId, latestCommit]
235 |       );
236 |       branchId = inserted.lastInsertRowid;
237 |       needsUpdate = true;
238 |       console.error(`[ingestBranch] Branch registered with ID: ${branchId}`);
239 |     } else {
240 |       branchId = branchRow.id;
241 |       console.error(
242 |         `[ingestBranch] Branch found in database with ID: ${branchId}`
243 |       );
244 |
245 |       // A different head commit resets the stored branch to 'pending'.
246 |       const shaChanged = branchRow.last_commit_sha !== latestCommit;
247 |       if (shaChanged) {
248 |         console.error(`[ingestBranch] Commit SHA changed, updating branch: ${branchId}`);
249 |         dbInterface.run(
250 |           "UPDATE branch SET last_commit_sha = ?, status = 'pending' WHERE id = ?",
251 |           [latestCommit, branchId]
252 |         );
253 |       }
254 |
255 |       // Uses the status value selected above, i.e. from before any update.
256 |       const unfinished = branchRow.status !== 'embeddings_generated';
257 |       if (unfinished) {
258 |         console.error(`[ingestBranch] Branch status is "${branchRow.status}" not "embeddings_generated", needs processing`);
259 |       }
260 |
261 |       needsUpdate = shaChanged || unfinished;
262 |       if (!needsUpdate) {
263 |         console.error(`[ingestBranch] No changes needed, skipping update`);
264 |       }
265 |     }
266 |
267 |     // File processing is left to the caller; only the resolved state is returned.
268 |     return {
269 |       repoLocalPath,
270 |       repoId,
271 |       branchId,
272 |       needsUpdate,
273 |       repoName,
274 |       actualBranch,
275 |       latestCommit
276 |     };
277 |   } catch (error) {
278 |     console.error(`[ingestBranch] Error executing tool:`, error);
279 |     return {
280 |       error: {
281 |         message: `Error executing ingestBranch tool: ${
282 |           error instanceof Error ? error.message : String(error)
283 |         }`,
284 |       },
285 |     };
286 |   }
287 | }
288 |
```
--------------------------------------------------------------------------------
/tools/queryRepo.ts:
--------------------------------------------------------------------------------
```typescript
1 | import { z } from "zod";
2 | import dbInterface from "../utils/db.js";
3 | import { generateOllamaEmbeddings } from "../utils/ollamaEmbeddings.js";
4 | import { createFilePatternCondition } from "../utils/filePatternMatcher.js";
5 | import { ProgressNotifier } from "../utils/types.js";
6 | import { ingestBranch } from "./ingestBranch.js";
7 | import { processFiles } from "./processFiles.js";
8 |
9 | // Define input schemas for tools
10 | export const QueryRepoSchema = z.object({
11 | repoUrl: z.string().describe("GitHub repository URL"),
12 | branch: z
13 | .string()
14 | .optional()
15 | .describe("Branch name to query (defaults to repository's default branch)"),
16 | semanticSearch: z.string().describe("Query for semantic search. This search is not exact, it will try to find the most relevant files, it doesn't accept file: or path: prefixes."),
17 | keywordsSearch: z
18 | .array(z.string())
19 | .describe(
20 | "Search to the files that contain at least one of the keywords in this list. Leave empty to disable. This can work in conjunction with the semantic search."
21 | ),
22 | filePatterns: z
23 | .array(z.string())
24 | .describe(
25 | "Array of glob patterns to filter files (e.g. '**/*.ts', 'src/*.js'). Use it for a more effective search or to target specific files for example 'somefile.tsx'. Leave empty to disable"
26 | ),
27 | excludePatterns: z
28 | .array(z.string())
29 | .optional()
30 | .describe(
31 | "Array of glob patterns to exclude files (e.g. '**/node_modules/**', '**/dist/**'). Use it to exclude files that are not relevant to the search. Leave empty to disable"
32 | ),
33 | limit: z.number().optional().describe("Maximum number of results to return"),
34 | _meta: z
35 | .object({
36 | progressToken: z.union([z.string(), z.number()]).optional(),
37 | })
38 | .optional(),
39 | });
40 |
41 | // Helper function to create a heartbeat progress notifier
42 | // Wraps a notifier so the last reported progress is re-sent whenever no update
43 | // has gone out for `heartbeatMs`; without a notifier both members are no-ops.
44 | function createHeartbeatNotifier(originalNotifier?: ProgressNotifier, heartbeatMs: number = 2000): {
45 |   notifier: ProgressNotifier;
46 |   stopHeartbeat: () => void;
47 | } {
48 |   if (!originalNotifier) {
49 |     // Nothing to forward to: hand back inert stand-ins.
50 |     return { notifier: { sendProgress: async () => {} }, stopHeartbeat: () => {} };
51 |   }
52 |
53 |   // Last values passed through the wrapper; the heartbeat replays them.
54 |   let currentProgress = 0;
55 |   let currentMax = 1;
56 |   let isActive = true;
57 |   let lastUpdate = Date.now();
58 |
59 |   const intervalId = setInterval(async () => {
60 |     // Skip when stopped or when a real update went out within heartbeatMs.
61 |     if (!isActive || Date.now() - lastUpdate < heartbeatMs) return;
62 |     console.error(`[queryRepo] Heartbeat progress: ${currentProgress}/${currentMax}`);
63 |     try {
64 |       await originalNotifier.sendProgress(currentProgress, currentMax);
65 |     } catch (error) {
66 |       // Nothing awaits the timer callback; log to avoid an unhandled rejection.
67 |       console.error(`[queryRepo] Heartbeat progress failed:`, error);
68 |     }
69 |   }, heartbeatMs);
70 |
71 |   return {
72 |     notifier: {
73 |       sendProgress: async (progress: number, max: number) => {
74 |         currentProgress = progress;
75 |         currentMax = max;
76 |         lastUpdate = Date.now();
77 |         await originalNotifier.sendProgress(progress, max);
78 |       }
79 |     },
80 |     stopHeartbeat: () => {
81 |       isActive = false;
82 |       clearInterval(intervalId);
83 |     }
84 |   };
85 | }
86 |
87 | /**
88 |  * Runs a semantic (and optionally keyword-filtered) search over one branch.
89 |  *
90 |  * Steps: ingest the branch, process its files, embed the query, rank stored
91 |  * chunk embeddings against it in SQL, and - only when that returns nothing and
92 |  * chunks without embeddings exist - generate embeddings and search again.
93 |  *
94 |  * @param input Tool arguments; see QueryRepoSchema for the individual fields.
95 |  * @param progressNotifier Optional notifier; it is wrapped in a heartbeat and
96 |  *   is not handed to the nested tools.
97 |  * @returns `{ output: { success, repoUrl, branch, processingTimeMs, results } }`
98 |  *   or `{ error: { message } }`.
99 |  */
100 | export async function queryRepo(
101 |   input: z.infer<typeof QueryRepoSchema>,
102 |   progressNotifier?: ProgressNotifier
103 | ) {
104 |   // Create heartbeat notifier that will send regular updates
105 |   const { notifier: heartbeatNotifier, stopHeartbeat } = createHeartbeatNotifier(progressNotifier);
106 |
107 |   try {
108 |     console.error(
109 |       `[queryRepo] Starting with parameters: ${JSON.stringify(input)}`
110 |     );
111 |
112 |     // Check if input is defined
113 |     if (!input) {
114 |       console.error(`[queryRepo] Error: Input parameters are undefined`);
115 |       return {
116 |         error: {
117 |           message: "Input parameters are required for queryRepo tool",
118 |         },
119 |       };
120 |     }
121 |
122 |     const startTime = Date.now();
123 |
124 |     const {
125 |       repoUrl,
126 |       branch,
127 |       semanticSearch: semanticSearchInput,
128 |       keywordsSearch,
129 |       limit,
130 |       filePatterns,
131 |       excludePatterns,
132 |     } = input;
133 |
134 |     // An empty keyword list cannot stand in for a missing semantic query: it
135 |     // would be joined into "" and embedded as an empty string.
136 |     const hasKeywords = Array.isArray(keywordsSearch) && keywordsSearch.length > 0;
137 |     if (!repoUrl || (!semanticSearchInput && !hasKeywords)) {
138 |       console.error(`[queryRepo] Error: Missing required parameters`);
139 |       return {
140 |         error: {
141 |           message: "Required parameters (repoUrl, semanticSearch or keywordsSearch) are missing",
142 |         },
143 |       };
144 |     }
145 |
146 |     // Fall back to the joined keywords as the semantic query when none was given.
147 |     const semanticSearch = semanticSearchInput || keywordsSearch.join(" ");
148 |
149 |     // Initialize progress at start
150 |     await heartbeatNotifier.sendProgress(0.05, 1);
151 |
152 |     // Step 1: Ingest the branch (25% of progress)
153 |     console.error(`[queryRepo] Ingesting branch: ${repoUrl}, ${branch || 'default'}`);
154 |     const branchResult = await ingestBranch(
155 |       {
156 |         repoUrl,
157 |         branch
158 |       },
159 |       undefined // Don't pass progress notifier to individual tools
160 |     );
161 |
162 |     // Update progress after branch ingestion
163 |     await heartbeatNotifier.sendProgress(0.25, 1);
164 |
165 |     // Check for error
166 |     if ('error' in branchResult) {
167 |       console.error(`[queryRepo] Error in ingestBranch:`, branchResult.error);
168 |       return { error: branchResult.error };
169 |     }
170 |
171 |     const branchData = branchResult;
172 |
173 |     // Step 2: Process files if needed (50% of progress)
174 |     console.error(`[queryRepo] Processing files for branch: ${branchData.branchId}`);
175 |     const filesResult = await processFiles(
176 |       {
177 |         repoLocalPath: branchData.repoLocalPath,
178 |         repoId: branchData.repoId,
179 |         branchId: branchData.branchId,
180 |         actualBranch: branchData.actualBranch,
181 |         needsUpdate: branchData.needsUpdate
182 |       },
183 |       undefined // Don't pass progress notifier to individual tools
184 |     );
185 |
186 |     // Update progress after file processing
187 |     await heartbeatNotifier.sendProgress(0.5, 1);
188 |
189 |     // Check for error
190 |     if ('error' in filesResult) {
191 |       console.error(`[queryRepo] Error in processFiles:`, filesResult.error);
192 |       return { error: filesResult.error };
193 |     }
194 |
195 |     // Generate embedding for the query
196 |     console.error(`[queryRepo] Generating embedding for query: "${semanticSearch}"`);
197 |     const queryEmbedStart = Date.now();
198 |     const [queryEmbedding] = await generateOllamaEmbeddings([semanticSearch]);
199 |     const queryEmbeddingStr = JSON.stringify(queryEmbedding);
200 |     console.error(
201 |       `[queryRepo] Generated query embedding in ${
202 |         Date.now() - queryEmbedStart
203 |       }ms`
204 |     );
205 |
206 |     // Update progress after query embedding
207 |     await heartbeatNotifier.sendProgress(0.6, 1);
208 |
209 |     // Use a default limit of 10 if undefined
210 |     const effectiveLimit = limit === undefined ? 10 : limit;
211 |
212 |     // Create SQL condition for file pattern filtering
213 |     const filePatternCondition = createFilePatternCondition(
214 |       filePatterns,
215 |       excludePatterns
216 |     );
217 |
218 |     // Single definition of the ranking query so the first attempt and the retry
219 |     // score chunks identically: the sum of element-wise products of the stored
220 |     // and query vectors, divided by the query vector's length.
221 |     const searchSimilarChunks = (): any[] =>
222 |       dbInterface.all(
223 |         `
224 |         SELECT fc.content, f.path, fc.chunk_number,
225 |                (SELECT (SELECT SUM(json_extract(value, '$') * json_extract(?, '$[' || key || ']'))
226 |                         FROM json_each(fc.embedding)
227 |                         GROUP BY key IS NOT NULL)
228 |                )/${queryEmbedding.length} as similarity
229 |         FROM file_chunk fc
230 |         JOIN file f ON fc.file_id = f.id
231 |         JOIN branch_file_association bfa ON f.id = bfa.file_id
232 |         WHERE bfa.branch_id = ?
233 |         AND fc.embedding IS NOT NULL
234 |         ${filePatternCondition}
235 |         ORDER BY similarity DESC
236 |         LIMIT ?
237 |         `,
238 |         [queryEmbeddingStr, branchData.branchId, effectiveLimit]
239 |       );
240 |
241 |     console.error(
242 |       `[queryRepo] Searching for similar chunks with limit: ${limit}`
243 |     );
244 |     const searchStart = Date.now();
245 |     const results = searchSimilarChunks();
246 |     console.error(
247 |       `[queryRepo] Search completed in ${Date.now() - searchStart}ms, found ${
248 |         results.length
249 |       } results`
250 |     );
251 |
252 |     // Update progress after initial search
253 |     await heartbeatNotifier.sendProgress(0.7, 1);
254 |
255 |     // If no results found, check if embeddings need to be generated
256 |     // NOTE(review): embedding only runs when the search is empty - confirm that is intended.
257 |     if (results.length === 0) {
258 |       console.error(`[queryRepo] No results found, checking if embeddings need to be generated`);
259 |
260 |       // Check if there are any chunks without embeddings
261 |       const chunksWithoutEmbeddings = dbInterface.get(
262 |         `SELECT COUNT(*) as count
263 |          FROM file_chunk fc
264 |          JOIN file f ON fc.file_id = f.id
265 |          JOIN branch_file_association bfa ON f.id = bfa.file_id
266 |          WHERE bfa.branch_id = ?
267 |          AND fc.embedding IS NULL`,
268 |         branchData.branchId
269 |       );
270 |
271 |       if (chunksWithoutEmbeddings && chunksWithoutEmbeddings.count > 0) {
272 |         console.error(`[queryRepo] Found ${chunksWithoutEmbeddings.count} chunks without embeddings, generating them`);
273 |
274 |         // Loaded lazily, only on this path.
275 |         const { embedFiles } = await import('./embedFiles.js');
276 |
277 |         // Generate embeddings (75-90% of progress)
278 |         await heartbeatNotifier.sendProgress(0.75, 1);
279 |
280 |         const embedResult = await embedFiles(
281 |           {
282 |             repoLocalPath: branchData.repoLocalPath,
283 |             branchId: branchData.branchId
284 |           },
285 |           undefined // Don't pass progress notifier to individual tools
286 |         );
287 |
288 |         // Update progress after embedding generation
289 |         await heartbeatNotifier.sendProgress(0.9, 1);
290 |
291 |         if ('error' in embedResult) {
292 |           console.error(`[queryRepo] Error generating embeddings:`, embedResult.error);
293 |           return { error: embedResult.error };
294 |         }
295 |
296 |         // Try searching again after generating embeddings
297 |         console.error(`[queryRepo] Retrying search after generating embeddings`);
298 |         const retryResults = searchSimilarChunks();
299 |         console.error(
300 |           `[queryRepo] Retry search completed, found ${retryResults.length} results`
301 |         );
302 |         results.push(...retryResults);
303 |       }
304 |     }
305 |
306 |     // Filter results by keywords if provided
307 |     let filteredResults = results;
308 |     if (hasKeywords) {
309 |       console.error(
310 |         `[queryRepo] Filtering results by keywords: ${keywordsSearch.join(", ")}`
311 |       );
312 |       const keywordFilterStart = Date.now();
313 |
314 |       // Convert keywords to lowercase for case-insensitive matching
315 |       const lowercaseKeywords = keywordsSearch.map((kw) => kw.trim().toLowerCase());
316 |
317 |       filteredResults = results.filter((result: { content: string }) => {
318 |         const content = result.content.toLowerCase();
319 |         // Check if the content contains at least one of the keywords
320 |         return lowercaseKeywords.some((keyword) => content.includes(keyword));
321 |       });
322 |
323 |       console.error(
324 |         `[queryRepo] Keyword filtering completed in ${
325 |           Date.now() - keywordFilterStart
326 |         }ms, filtered from ${results.length} to ${
327 |           filteredResults.length
328 |         } results`
329 |       );
330 |     }
331 |
332 |     // Update progress to completion
333 |     await heartbeatNotifier.sendProgress(1, 1);
334 |
335 |     const totalTime = Date.now() - startTime;
336 |     console.error(`[queryRepo] Tool completed in ${totalTime}ms`);
337 |
338 |     return {
339 |       output: {
340 |         success: true,
341 |         repoUrl,
342 |         branch: branchData.actualBranch,
343 |         processingTimeMs: totalTime,
344 |         results: filteredResults.map((result: any) => ({
345 |           filePath: result.path,
346 |           chunkNumber: result.chunk_number,
347 |           content: result.content,
348 |           similarity: result.similarity,
349 |         })),
350 |       },
351 |     };
352 |   } catch (error) {
353 |     console.error(`[queryRepo] Error executing tool:`, error);
354 |     return {
355 |       error: {
356 |         message: `Error executing queryRepo tool: ${
357 |           error instanceof Error ? error.message : String(error)
358 |         }`,
359 |       },
360 |     };
361 |   } finally {
362 |     // Always stop the heartbeat when done
363 |     stopHeartbeat();
364 |   }
365 | }
```
--------------------------------------------------------------------------------
/tools/processFiles.ts:
--------------------------------------------------------------------------------
```typescript
1 | import { z } from "zod";
2 | import dbInterface from "../utils/db.js";
3 | import { ProgressNotifier } from "../utils/types.js";
4 | import { simpleGit } from "simple-git";
5 | import path from "path";
6 | import { extensionToSplitter, splitDocument } from "../utils/codeSplitter.js";
7 | import fs from "fs";
8 |
9 | interface RepositoryFile {
10 | path: string;
11 | name: string;
12 | sha: string;
13 | }
14 |
15 | interface RepositoryFilesResult {
16 | files: RepositoryFile[];
17 | commitSha: string;
18 | }
19 |
20 | interface PendingFile {
21 | id: number;
22 | path: string;
23 | sha: string;
24 | }
25 |
26 |
27 | // Define input schema for processFiles
28 | export const ProcessFilesSchema = z.object({
29 | repoLocalPath: z.string().describe("Local path to the cloned repository"),
30 | repoId: z.number().describe("Repository ID in the database"),
31 | branchId: z.number().describe("Branch ID in the database"),
32 | actualBranch: z.string().describe("Actual branch name"),
33 | needsUpdate: z.boolean().describe("Whether the branch needs updating"),
34 | _meta: z
35 | .object({
36 | progressToken: z.union([z.string(), z.number()]).optional(),
37 | })
38 | .optional(),
39 | });
40 |
41 |
42 | /**
43 | * Get the files in a repository branch
44 | * @param repoPath Path to the repository
45 | * @param branchName Name of the branch
46 | * @returns List of files with their metadata
47 | */
48 | export const getRepositoryFiles = async (
49 |   repoPath: string,
50 |   branchName: string,
51 | ): Promise<RepositoryFilesResult> => {
52 |   const git = simpleGit(repoPath);
53 |
54 |   // Check out the branch and resolve the commit it points at.
55 |   await git.checkout(branchName);
56 |   const latestCommit = await git.revparse([branchName]);
57 |
58 |   // List every tree entry recursively. With -z git terminates entries with NUL
59 |   // and prints paths verbatim; without it, paths containing unusual characters
60 |   // (e.g. non-ASCII bytes under the default core.quotePath) are emitted quoted
61 |   // and escaped, which no longer matches the file name on disk.
62 |   const listing = await git.raw(["ls-tree", "-r", "-z", branchName]);
63 |
64 |   const files: RepositoryFile[] = [];
65 |   for (const entry of listing.toString().split("\0")) {
66 |     // The trailing NUL leaves one empty fragment; skip it.
67 |     if (entry.trim() === "") continue;
68 |
69 |     // Entry format: "<mode> <type> <object>\t<path>". Only the first tab is a
70 |     // separator; the path itself may contain tabs.
71 |     const tabIndex = entry.indexOf("\t");
72 |     if (tabIndex === -1) continue;
73 |     const filePath = entry.slice(tabIndex + 1);
74 |     const [, , sha] = entry.slice(0, tabIndex).split(" ");
75 |     if (!filePath) continue;
76 |
77 |     files.push({
78 |       path: filePath,
79 |       name: path.basename(filePath),
80 |       sha,
81 |     });
82 |   }
83 |
84 |   return { files, commitSha: latestCommit };
85 | };
86 |
87 |
/**
 * Read every file of a branch whose status is 'pending', split its content
 * into chunks and store the chunks in the `file_chunk` table.
 *
 * Per file:
 *  - files whose extension maps to the "ignore" splitter are marked 'done';
 *  - files missing from the working tree are skipped (status left untouched);
 *  - all other files are read, sanitised, chunked and marked 'fetched'.
 * A failure while handling one file is logged and does not stop the others.
 *
 * Side effect: checks out `branchName` in the working tree at `repoPath`.
 *
 * @param branchName Branch name
 * @param repoPath Repository path
 * @throws Error when the repository or the branch has no row in the database
 */
export const processFileContents = async (
  branchName: string,
  repoPath: string
): Promise<void> => {
  const git = simpleGit(repoPath);

  // Contents are read from the working tree, so it must be on the branch.
  await git.checkout(branchName);

  // Get repository and branch IDs
  const repo = dbInterface.get("SELECT id FROM repository WHERE path = ?", repoPath) as
    | { id: number }
    | undefined;
  if (!repo) {
    throw new Error(`Repository not found in database for path: ${repoPath}`);
  }

  const branch = dbInterface.get(
    "SELECT id FROM branch WHERE name = ? AND repository_id = ?",
    [branchName, repo.id]
  ) as { id: number } | undefined;
  if (!branch) {
    throw new Error(
      `Branch "${branchName}" not found in database for repository: ${repoPath}`
    );
  }

  // Get all pending files for the branch
  const pendingFiles = dbInterface.all(
    `SELECT f.id, f.path, f.sha
     FROM file f
     JOIN branch_file_association bfa ON f.id = bfa.file_id
     WHERE f.status = 'pending' AND bfa.branch_id = ?`,
    branch.id
  ) as PendingFile[];

  // Strict decoder: throws on byte sequences that are not valid UTF-8.
  // ignoreBOM keeps a leading BOM in the text, as a plain "utf-8" read does.
  const strictUtf8 = new TextDecoder("utf-8", { fatal: true, ignoreBOM: true });

  // Upper bound on stored content; adjust based on the database column size.
  const maxLength = 1000000;

  for (const file of pendingFiles) {
    console.error(`Processing file: ${file.path}`);
    const extension = file.path.split(".").pop()?.toLowerCase();
    const splitType = extension ? extensionToSplitter(extension) : "ignore";

    if (splitType === "ignore") {
      // Update file status to 'done' for ignored files
      dbInterface.run("UPDATE file SET status = ? WHERE id = ?", ["done", file.id]);
      continue;
    }

    try {
      const filePath = path.join(repoPath, file.path);

      // Skip if file doesn't exist (might have been deleted)
      if (!fs.existsSync(filePath)) {
        console.error(`File ${file.path} doesn't exist, skipping`);
        continue;
      }

      // Validate the raw bytes. Reading with the "utf-8" encoding would
      // silently replace invalid sequences, so a check performed on the
      // resulting string could never fail.
      const rawBytes = fs.readFileSync(filePath);
      let content: string;
      try {
        content = strictUtf8.decode(rawBytes);
      } catch {
        console.error(
          `File ${file.path} contains invalid UTF-8 characters. Replacing them.`
        );
        // Lenient decode, then drop everything outside the ASCII range.
        content = rawBytes.toString("utf-8").replace(/[^\x00-\x7F]/g, "");
      }

      // Check for null bytes in the content
      if (content.includes("\0")) {
        console.error(
          `File ${file.path} contains null bytes. Removing them.`
        );
        content = content.replace(/\0/g, "");
      }

      // Truncate content if it's too long
      if (content.length > maxLength) {
        console.error(
          `File ${file.path} content is too long. Truncating to ${maxLength} characters.`
        );
        content = content.substring(0, maxLength);
      }

      // Split the document
      const chunks = await splitDocument(file.path, content);

      // Replace the file's chunks and mark it fetched atomically.
      dbInterface.transaction((db) => {
        // A file becomes 'pending' again when its SHA changes. Chunks of the
        // previous revision must be removed first, otherwise the
        // ON CONFLICT clause below would keep the stale content.
        db.prepare("DELETE FROM file_chunk WHERE file_id = ?").run(file.id);

        const insertChunk = db.prepare(
          `INSERT INTO file_chunk (file_id, content, chunk_number)
           VALUES (?, ?, ?)
           ON CONFLICT(file_id, chunk_number) DO NOTHING`
        );
        for (let i = 0; i < chunks.length; i++) {
          // chunk_number is 1-based
          insertChunk.run(file.id, chunks[i].pageContent, i + 1);
        }

        // Update file status to 'fetched'
        db.prepare("UPDATE file SET status = ? WHERE id = ?").run(
          "fetched",
          file.id
        );
      });
    } catch (error) {
      // Best effort: the file stays 'pending' and is retried on the next run.
      console.error(`Error processing file ${file.path}:`, error);
    }
  }
};
193 |
/**
 * Tool entry point: synchronise the database's view of a branch with the
 * files currently tracked in git, then chunk the content of pending files.
 *
 * Steps:
 *  1. Return early when `input` is missing or `needsUpdate` is false.
 *  2. List the branch's files with `getRepositoryFiles`.
 *  3. In one transaction, insert new files, re-queue files whose SHA changed,
 *     and detach (and delete, when no branch references them any more) files
 *     that disappeared from the branch.
 *  4. Run `processFileContents`; on success set the branch status to
 *     'files_processed'. A failure in this step is logged, not propagated.
 *
 * @param input Validated tool parameters (see `ProcessFilesSchema`)
 * @param progressNotifier Optional sink for progress updates (0.33 and 0.66)
 * @returns `{ needsUpdate, filesToProcess, repoLocalPath }` on success,
 *          `{ needsUpdate: false, filesToProcess: [] }` when nothing had to
 *          be done, or `{ error: { message } }` on failure
 */
export async function processFiles(
  input: z.infer<typeof ProcessFilesSchema>,
  progressNotifier?: ProgressNotifier
) {
  try {
    console.error(
      `[processFiles] Starting with parameters: ${JSON.stringify(input)}`
    );

    // Check if input is defined
    if (!input) {
      console.error(`[processFiles] Error: Input parameters are undefined`);
      return {
        error: {
          message: "Input parameters are required for processFiles tool",
        },
      };
    }

    const startTime = Date.now();
    const { repoLocalPath, repoId, branchId, actualBranch, needsUpdate } = input;

    // Skip if no update is needed
    if (!needsUpdate) {
      console.error(`[processFiles] No update needed, skipping`);
      return {
        needsUpdate: false,
        filesToProcess: []
      };
    }

    // Process the repository files
    console.error(
      `[processFiles] Processing repository files (${Date.now() - startTime}ms)`
    );
    // Get all files in the repository
    const { files } = await getRepositoryFiles(repoLocalPath, actualBranch);
    console.error(`[processFiles] Found ${files.length} files in repository`);

    // Define transaction function
    console.error(`[processFiles] Starting file database transaction`);
    const syncBranchFiles = (db: any) => {
      // Prepare every statement once instead of once per loop iteration.
      const selectBranchFiles = db.prepare(
        `SELECT f.id, f.path, f.sha FROM file f
             JOIN branch_file_association bfa ON f.id = bfa.file_id
             WHERE bfa.branch_id = ?`
      );
      const selectFileByContent = db.prepare(
        "SELECT id FROM file WHERE repository_id = ? AND path = ? AND sha = ?"
      );
      const selectAssociation = db.prepare(
        "SELECT 1 FROM branch_file_association WHERE branch_id = ? AND file_id = ?"
      );
      const insertAssociation = db.prepare(
        "INSERT INTO branch_file_association (branch_id, file_id) VALUES (?, ?)"
      );
      const insertFile = db.prepare(
        "INSERT INTO file (repository_id, path, sha, name, status) VALUES (?, ?, ?, ?, 'pending')"
      );
      const requeueFile = db.prepare(
        "UPDATE file SET sha = ?, status = 'pending' WHERE id = ?"
      );
      const deleteAssociation = db.prepare(
        "DELETE FROM branch_file_association WHERE branch_id = ? AND file_id = ?"
      );
      const selectAnyAssociation = db.prepare(
        "SELECT 1 FROM branch_file_association WHERE file_id = ? LIMIT 1"
      );
      const deleteChunks = db.prepare("DELETE FROM file_chunk WHERE file_id = ?");
      const deleteFile = db.prepare("DELETE FROM file WHERE id = ?");

      // Get existing files to compare
      const existingFiles = selectBranchFiles.all(branchId);
      console.error(
        `[processFiles] Found ${existingFiles.length} existing files in database`
      );

      // Files known for this branch, keyed by path. Entries are removed as
      // they are matched; whatever remains was deleted from the branch.
      const existingFileMap = new Map();
      for (const file of existingFiles) {
        existingFileMap.set(file.path, file);
      }

      // Track files that need processing
      const filesToProcess: any[] = [];

      // File counters for logging
      let newFiles = 0;
      let updatedFiles = 0;
      let unchangedFiles = 0;
      let removedFiles = 0;

      // Process each file
      for (const file of files) {
        const existingFile = existingFileMap.get(file.path);
        existingFileMap.delete(file.path); // Remove from map to track what's left later

        if (!existingFile) {
          // Not known for this branch. The same path and SHA may already be
          // stored for another branch of the repository.
          const existingFileInDB = selectFileByContent.get(
            repoId,
            file.path,
            file.sha
          );

          let fileId;
          if (existingFileInDB) {
            // File exists but not associated with this branch
            console.error(`[processFiles] File exists in DB but not associated with branch: ${file.path}`);
            fileId = existingFileInDB.id;

            // Guard against inserting a duplicate association
            const associationExists = selectAssociation.get(branchId, fileId);
            if (!associationExists) {
              // Associate existing file with current branch
              insertAssociation.run(branchId, fileId);
            }
          } else {
            // Truly new file
            newFiles++;
            const result = insertFile.run(repoId, file.path, file.sha, file.name);
            fileId = result.lastInsertRowid;

            // Associate with branch
            insertAssociation.run(branchId, fileId);
          }

          filesToProcess.push({
            id: fileId,
            path: file.path,
            name: file.name,
          });
        } else if (existingFile.sha !== file.sha) {
          // Updated file - SHA changed
          updatedFiles++;
          requeueFile.run(file.sha, existingFile.id);

          filesToProcess.push({
            id: existingFile.id,
            path: file.path,
            name: file.name,
          });
        } else {
          // Unchanged file
          unchangedFiles++;
        }
      }

      // Remove files that no longer exist in the branch
      for (const staleFile of existingFileMap.values()) {
        removedFiles++;
        deleteAssociation.run(branchId, staleFile.id);

        // If no other branches reference this file, delete it and its chunks
        const fileStillInUse = selectAnyAssociation.get(staleFile.id);
        if (!fileStillInUse) {
          // Delete chunks first
          deleteChunks.run(staleFile.id);
          // Then delete the file
          deleteFile.run(staleFile.id);
        }
      }

      console.error(
        `[processFiles] Files summary: ${newFiles} new, ${updatedFiles} updated, ${unchangedFiles} unchanged, ${removedFiles} removed`
      );
      return filesToProcess;
    };

    // Execute the transaction
    console.error(`[processFiles] Executing file processing transaction`);
    const filesToProcess = dbInterface.transaction((db) => syncBranchFiles(db));
    console.error(
      `[processFiles] Transaction completed, processing ${
        filesToProcess.length
      } files (${Date.now() - startTime}ms)`
    );

    // Limit the number of files reported back to avoid timeouts
    // This might need adjustment based on actual performance
    const MAX_FILES_TO_PROCESS = 1000000;
    const limitedFiles = filesToProcess.slice(0, MAX_FILES_TO_PROCESS);

    if (limitedFiles.length < filesToProcess.length) {
      console.error(
        `[processFiles] WARNING: Processing only ${limitedFiles.length} of ${filesToProcess.length} files to avoid timeout`
      );
    }

    // Database synchronisation done: report 33% progress
    if (progressNotifier) {
      await progressNotifier.sendProgress(0.33, 1);
    }

    // Process file contents to generate chunks
    console.error(`[processFiles] Processing file contents for branch: ${actualBranch}`);
    try {
      await processFileContents(actualBranch, repoLocalPath);
      console.error(`[processFiles] File contents processed successfully`);

      // Update branch status to files_processed
      dbInterface.run(
        "UPDATE branch SET status = 'files_processed' WHERE id = ?",
        branchId
      );

      // Update progress after file content processing
      if (progressNotifier) {
        await progressNotifier.sendProgress(0.66, 1);
      }
    } catch (error) {
      // Best effort: the branch status is left unchanged and the result below
      // is still returned.
      console.error(`[processFiles] Error processing file contents:`, error);
    }

    return {
      needsUpdate: true,
      filesToProcess: limitedFiles,
      repoLocalPath
    };
  } catch (error) {
    console.error(`[processFiles] Error executing tool:`, error);
    return {
      error: {
        message: `Error executing processFiles tool: ${
          error instanceof Error ? error.message : String(error)
        }`,
      },
    };
  }
}
419 |
```
--------------------------------------------------------------------------------
/utils/codeSplitter.ts:
--------------------------------------------------------------------------------
```typescript
1 | import {
2 | RecursiveCharacterTextSplitter,
3 | TextSplitter,
4 | } from "@langchain/textsplitters";
5 | import fs from "fs";
6 |
/**
 * Text splitter for SQL scripts.
 *
 * The script is first cut into statements (keeping routine bodies and
 * BEGIN ... END blocks together). Statements longer than `maxCharacters` are
 * then broken up: multi-row INSERTs into several smaller INSERTs, anything
 * else along line boundaries.
 */
class SQLSchemaSplitter extends TextSplitter {
  private maxCharacters: number;

  /**
   * @param maxCharacters Target upper bound for the length of one split
   */
  constructor(maxCharacters: number) {
    super();
    this.maxCharacters = maxCharacters;
  }

  /**
   * Extract the parenthesised value tuples from the text that follows the
   * VALUES keyword of an INSERT statement.
   *
   * Quotes (single or double) and backslash escapes are honoured so that
   * parentheses inside string literals do not affect nesting. Text after the
   * last complete tuple (e.g. the terminating `;`) is not returned.
   *
   * @param valuesPart Statement text after `VALUES`
   * @returns Each top-level `( ... )` tuple, trimmed
   */
  parseValues(valuesPart: string): string[] {
    const valuesArray: string[] = [];
    let currentTuple = "";
    let nestingLevel = 0;
    let inString = false;
    let stringChar = "";
    let escapeNext = false;

    for (let i = 0; i < valuesPart.length; i++) {
      const char = valuesPart.charAt(i);
      currentTuple += char;

      if (escapeNext) {
        // The character after a backslash is taken literally.
        escapeNext = false;
      } else if (char === "\\") {
        escapeNext = true;
      } else if (char === "'" || char === '"') {
        if (inString && char === stringChar) {
          inString = false;
        } else if (!inString) {
          inString = true;
          stringChar = char;
        }
      } else if (!inString) {
        if (char === "(") {
          nestingLevel += 1;
        } else if (char === ")") {
          nestingLevel -= 1;
          if (nestingLevel === 0) {
            valuesArray.push(currentTuple.trim());
            currentTuple = "";
            // Skip the separator (commas and any whitespace) before the next tuple
            while (
              i + 1 < valuesPart.length &&
              /[\s,]/.test(valuesPart.charAt(i + 1))
            ) {
              i++;
            }
          }
        }
      }
    }
    return valuesArray;
  }

  /**
   * Break a multi-row INSERT into several INSERT statements that each stay
   * within `maxCharacters` where possible.
   *
   * A statement that already fits, has no VALUES keyword, or whose tuples
   * cannot be parsed is returned unchanged. When a statement is rebuilt,
   * text following the last tuple (such as an ON CONFLICT clause) is not
   * carried over; a single tuple larger than the limit is emitted on its own.
   *
   * @param statement One INSERT statement
   * @returns One or more INSERT statements, each terminated with `;`
   */
  splitInsertStatement(statement: string): string[] {
    // Nothing to do for statements that fit; returning them untouched keeps
    // trailing clauses and the original formatting.
    if (statement.length <= this.maxCharacters) {
      return [statement];
    }

    const valuesIndex = statement.search(/\bVALUES\b/i);
    if (valuesIndex === -1) {
      // Cannot split, return the statement as is
      return [statement];
    }

    const headerEnd = valuesIndex + "VALUES".length;
    const insertIntoPart = statement.slice(0, headerEnd) + " ";
    const valuesArray = this.parseValues(statement.slice(headerEnd));
    if (valuesArray.length === 0) {
      // No tuple recognised (e.g. INSERT ... SELECT mentioning "values"):
      // keep the statement rather than dropping it.
      return [statement];
    }

    const insertStatements: string[] = [];
    let currentValues = "";

    for (const valueTuple of valuesArray) {
      if (currentValues === "") {
        // First tuple of a statement is always accepted; this also avoids
        // emitting an empty "VALUES ;" when a single tuple exceeds the limit.
        currentValues = valueTuple;
        continue;
      }

      // header + tuples so far + ',' + new tuple + ';'
      const candidateLength =
        insertIntoPart.length + currentValues.length + 1 + valueTuple.length + 1;

      if (candidateLength <= this.maxCharacters) {
        currentValues += "," + valueTuple;
      } else {
        insertStatements.push(insertIntoPart + currentValues + ";");
        currentValues = valueTuple;
      }
    }

    if (currentValues !== "") {
      insertStatements.push(insertIntoPart + currentValues + ";");
    }
    return insertStatements;
  }

  /**
   * Split a SQL script into statements.
   *
   * Rules:
   *  - String literals ('...' or "...", backslash escapes the next character)
   *    and comments (`-- ...` to end of line, `/* ... *\/`) are copied
   *    verbatim and never inspected for keywords or `;`.
   *  - Outside any construct, a statement ends at `;`.
   *  - `CREATE [OR REPLACE] VIEW` ends at the next `;`.
   *  - `CREATE [OR REPLACE] FUNCTION|PROCEDURE` and a top-level BEGIN open a
   *    construct that ends at the END matching its outermost BEGIN (a `;`
   *    directly following that END is included).
   *  - `BEGIN;`, `BEGIN TRANSACTION`, `BEGIN WORK` and the SQLite
   *    `BEGIN DEFERRED|IMMEDIATE|EXCLUSIVE` forms start a transaction, not a
   *    block, and are treated as ordinary statements.
   *  - `CASE ... END [CASE]` pairs and `END IF|LOOP|WHILE|REPEAT` do not
   *    change the BEGIN/END nesting level.
   *
   * Keywords are matched case-insensitively as whole words. The aliases
   * BBEGI, BEGINN and EEN are accepted for backward compatibility.
   *
   * Known limitation: a routine whose body contains no BEGIN ... END (for
   * example a single-expression SQL function) is not closed and absorbs the
   * text that follows it.
   *
   * @param text Complete SQL script
   * @returns Trimmed, non-empty statements in source order
   */
  splitSQLStatements(text: string): string[] {
    const statements: string[] = [];

    const beginKeywords = new Set(["BEGIN", "BBEGI", "BEGINN"]);
    const endKeywords = new Set(["END", "EEN"]);
    const endQualifiers = new Set(["IF", "LOOP", "WHILE", "REPEAT"]);
    const transactionWords = new Set([
      "TRANSACTION",
      "WORK",
      "DEFERRED",
      "IMMEDIATE",
      "EXCLUSIVE",
    ]);

    // Sticky patterns: they only match exactly at `lastIndex`.
    const wordPattern = /[A-Za-z0-9_]+/y;
    const createPattern =
      /CREATE\s+(?:OR\s+REPLACE\s+)?(FUNCTION|PROCEDURE|VIEW)\b/iy;

    /** Word starting exactly at `from`, or "" when there is none. */
    const wordAt = (from: number): string => {
      wordPattern.lastIndex = from;
      const match = wordPattern.exec(text);
      return match ? match[0] : "";
    };

    /** Index of the first non-whitespace character at or after `from`. */
    const skipWhitespace = (from: number): number => {
      let i = from;
      while (i < text.length && /\s/.test(text.charAt(i))) {
        i++;
      }
      return i;
    };

    /** True when the BEGIN ending at `afterBegin` starts a transaction. */
    const startsTransaction = (afterBegin: number): boolean => {
      const next = skipWhitespace(afterBegin);
      if (next >= text.length || text.charAt(next) === ";") {
        return true;
      }
      return transactionWords.has(wordAt(next).toUpperCase());
    };

    type Scope = "none" | "routine" | "view" | "block";
    let scope: Scope = "none";
    let blockLevel = 0; // open BEGINs inside the current routine/block
    let caseDepth = 0; // open CASEs, closed by END without touching blockLevel
    let statementStart = 0;
    let index = 0;

    // Statements are sliced from the source so their text is never altered.
    const emit = (end: number): void => {
      const statement = text.slice(statementStart, end).trim();
      if (statement !== "") {
        statements.push(statement);
      }
      statementStart = end;
    };

    while (index < text.length) {
      const char = text.charAt(index);

      // String literal: skip to the closing quote.
      if (char === "'" || char === '"') {
        index++;
        while (index < text.length) {
          const inner = text.charAt(index);
          if (inner === "\\") {
            index += 2; // skip the escaped character
            continue;
          }
          index++;
          if (inner === char) {
            break;
          }
        }
        continue;
      }

      // Line comment: skip to the end of the line.
      if (text.startsWith("--", index)) {
        index += 2;
        while (
          index < text.length &&
          text.charAt(index) !== "\n" &&
          text.charAt(index) !== "\r"
        ) {
          index++;
        }
        continue;
      }

      // Block comment: skip past the closing delimiter.
      if (text.startsWith("/*", index)) {
        const close = text.indexOf("*/", index + 2);
        index = close === -1 ? text.length : close + 2;
        continue;
      }

      // Whole word: the only place where keywords are recognised.
      if (/[A-Za-z0-9_]/.test(char)) {
        const word = wordAt(index);
        const upper = word.toUpperCase();
        const wordEnd = index + word.length;

        if (scope === "none") {
          if (upper === "CREATE") {
            createPattern.lastIndex = index;
            const created = createPattern.exec(text);
            if (created) {
              scope =
                (created[1] ?? "").toUpperCase() === "VIEW" ? "view" : "routine";
              blockLevel = 0;
              caseDepth = 0;
              index = createPattern.lastIndex;
              continue;
            }
          } else if (beginKeywords.has(upper) && !startsTransaction(wordEnd)) {
            scope = "block";
            blockLevel = 1;
            caseDepth = 0;
          }
        } else if (scope === "routine" || scope === "block") {
          if (beginKeywords.has(upper)) {
            blockLevel++;
          } else if (upper === "CASE") {
            caseDepth++;
          } else if (endKeywords.has(upper)) {
            const qualifierStart = skipWhitespace(wordEnd);
            const qualifier = wordAt(qualifierStart);
            const qualifierUpper = qualifier.toUpperCase();

            if (endQualifiers.has(qualifierUpper)) {
              // END IF / END LOOP ... close constructs that never raised
              // the nesting level.
              index = qualifierStart + qualifier.length;
              continue;
            }

            if (caseDepth > 0) {
              // END or END CASE closing a CASE.
              caseDepth--;
              index =
                qualifierUpper === "CASE"
                  ? qualifierStart + qualifier.length
                  : wordEnd;
              continue;
            }

            blockLevel--;
            if (blockLevel === 0) {
              // Outermost END: the construct is complete. Keep a directly
              // following ';' with it instead of emitting a lone ';'.
              const statementEnd =
                text.charAt(qualifierStart) === ";" ? qualifierStart + 1 : wordEnd;
              emit(statementEnd);
              scope = "none";
              index = statementEnd;
              continue;
            }
          }
        }

        index = wordEnd;
        continue;
      }

      if (char === ";" && (scope === "none" || scope === "view")) {
        emit(index + 1);
        scope = "none";
      }

      index++;
    }

    // Whatever follows the last terminator (or an unterminated construct).
    emit(text.length);

    return statements;
  }

  // Helper method to match keywords from a list at the start of the given text.
  // Returns the matched keyword or null.
  matchKeyword(text: string, keywords: string[]): string | null {
    for (const keyword of keywords) {
      if (text.startsWith(keyword)) {
        return keyword;
      }
    }
    return null;
  }

  /**
   * Split an over-long statement along line boundaries. A single line that
   * exceeds the limit on its own is cut into fixed-size slices.
   */
  private splitByLines(statement: string): string[] {
    const limit = Math.max(1, this.maxCharacters);
    const pieces: string[] = [];
    let current = "";

    for (const line of statement.split("\n")) {
      // +1 accounts for the newline that joins the line to the current piece
      if (current.length + line.length + 1 <= limit) {
        current += (current ? "\n" : "") + line;
        continue;
      }

      if (current) {
        pieces.push(current);
      }
      current = line;
      while (current.length > limit) {
        pieces.push(current.slice(0, limit));
        current = current.slice(limit);
      }
    }

    if (current) {
      pieces.push(current);
    }
    return pieces;
  }

  /**
   * Split a SQL script into chunks of at most `maxCharacters` where possible.
   *
   * @param text Complete SQL script
   * @returns The chunks in source order
   */
  async splitText(text: string): Promise<string[]> {
    const splits: string[] = [];

    for (const statement of this.splitSQLStatements(text)) {
      const upperStatement = statement.toUpperCase();

      if (
        upperStatement.includes("INSERT INTO") &&
        upperStatement.includes("VALUES")
      ) {
        // Split long INSERT statements (short ones come back unchanged)
        splits.push(...this.splitInsertStatement(statement));
      } else if (statement.length <= this.maxCharacters) {
        splits.push(statement);
      } else {
        splits.push(...this.splitByLines(statement));
      }
    }

    return splits;
  }
}
323 |
/**
 * Splitter name -> file extensions handled by that splitter.
 * Extensions are listed in lower case; anything not listed falls back to
 * "text" in `extensionToSplitter`.
 */
const SPLITTER_EXTENSION_GROUPS: ReadonlyArray<
  readonly [splitter: string, extensions: readonly string[]]
> = [
  // C/C++ and Objective-C
  ["cpp", ["c++", "cpp", "c", "h", "hpp", "m", "mm"]],
  ["go", ["go"]],
  ["java", ["java"]],
  // JavaScript and related
  ["js", ["js", "ts", "typescript", "tsx", "jsx", "javascript", "json", "pbxproj"]],
  // Configuration formats, shell scripts and plain text
  [
    "text",
    [
      "yaml", "yml", "toml", "ini", "cfg", "conf", "props", "env", "plist",
      "gemfile", "dockerfile", "podfile", "patch",
      "sh", "bash", "zsh", "fish", "bat", "cmd",
      "properties", "xsd",
      "text", "txt", "lst", "reg",
    ],
  ],
  ["sql", ["sql"]],
  ["php", ["php"]],
  ["proto", ["proto"]],
  ["python", ["py", "python"]],
  ["rst", ["rst"]],
  ["ruby", ["rb", "ruby"]],
  ["rust", ["rs", "rust"]],
  ["scala", ["scala"]],
  ["swift", ["swift"]],
  ["markdown", ["md", "markdown"]],
  ["latex", ["tex", "latex"]],
  // HTML, XML and XML-based project files
  [
    "html",
    [
      "html", "htm", "xml", "xsl", "xdt",
      "xcworkspacedata", "xcprivacy", "xcsettings", "xcscheme",
      "jpr", "jws", "iml",
    ],
  ],
  ["sol", ["sol", "solidity"]],
  // Lock files, images, binaries and archives are never split
  [
    "ignore",
    [
      "lock",
      "jpg", "jpeg", "png", "gif", "bmp", "svg", "ico", "webp", "tiff",
      "bin", "exe", "dll", "so", "dylib", "obj", "o",
      "zip", "tar", "gz", "rar", "7z", "jar", "war", "ear", "class",
    ],
  ],
];

/** Flat extension -> splitter lookup derived from the groups above. */
const SPLITTER_BY_EXTENSION: ReadonlyMap<string, string> = new Map(
  SPLITTER_EXTENSION_GROUPS.flatMap(([splitter, extensions]) =>
    extensions.map((ext): [string, string] => [ext, splitter])
  )
);

/**
 * Map a file extension to the name of the splitter used for that file type.
 *
 * @param extension File extension without the leading dot, in any letter case
 * @returns A language splitter name (e.g. "js", "python"), "sql", "text" for
 *          plain-text handling, or "ignore" for files that must not be
 *          split. Empty and unknown extensions yield "text".
 */
export function extensionToSplitter(extension: string): string {
  if (!extension) {
    return "text";
  }
  const known = SPLITTER_BY_EXTENSION.get(extension.toLowerCase());
  return known === undefined ? "text" : known;
}
477 |
/**
 * Split the content of a file into documents sized for embedding.
 *
 * The splitter is chosen from the file name's extension through
 * `extensionToSplitter`: the SQL splitter for "sql", a plain recursive
 * character splitter for "text", and a language-aware recursive splitter for
 * everything else. Each chunk is prefixed with a header naming the file.
 *
 * @param filename File name or path; only the text after the last "." is used
 * @param code Full text content of the file
 * @returns An empty array for ignored file types, otherwise the result of
 *          the splitter's `createDocuments` call
 */
export const splitDocument = (filename: string, code: string) => {
  const splitType = extensionToSplitter(filename.split(".").pop() || "");
  if (splitType === "ignore") {
    return [];
  }

  // Chunk sizes are defined in tokens and converted with an estimate of
  // 3.25 characters per token.
  const CHARACTERS_PER_TOKEN = 3.25;
  const chunkSize = 7000 * CHARACTERS_PER_TOKEN;
  const chunkOverlap = 200 * CHARACTERS_PER_TOKEN;

  const buildSplitter = () => {
    switch (splitType) {
      case "sql":
        return new SQLSchemaSplitter(chunkSize);
      case "text":
        return new RecursiveCharacterTextSplitter({ chunkSize, chunkOverlap });
      default:
        return RecursiveCharacterTextSplitter.fromLanguage(
          splitType as
            | "cpp"
            | "go"
            | "java"
            | "js"
            | "php"
            | "proto"
            | "python"
            | "rst"
            | "ruby"
            | "rust"
            | "scala"
            | "swift"
            | "markdown"
            | "latex"
            | "html"
            | "sol",
          { chunkSize, chunkOverlap }
        );
    }
  };

  return buildSplitter().createDocuments([code], [], {
    chunkHeader: `FILE NAME: ${filename}\n\n---\n\n`,
    appendChunkOverlapHeader: true,
  });
};
531 |
```