# Directory Structure
```
├── .env.example
├── .gitignore
├── claude_desktop_config.example.json
├── config.ts
├── index.ts
├── jest.config.mjs
├── LICENSE
├── package-lock.json
├── package.json
├── README.md
├── SETUP.md
├── start.ts
├── tools
│   ├── embedFiles.ts
│   ├── ingestBranch.ts
│   ├── processFiles.ts
│   └── queryRepo.ts
├── tsconfig.json
└── utils
    ├── codeSplitter.ts
    ├── db.ts
    ├── filePatternMatcher.ts
    ├── ollamaEmbeddings.ts
    ├── repoConfig.ts
    └── types.ts
```
# Files
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
```
DATA_DIR=/home/user/.config/Claude/data
REPO_CONFIG_DIR=/home/user/.config/Claude/repos
NODE_ENV=development
```
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
```
# Node.js
node_modules/
npm-debug.log
yarn-debug.log
yarn-error.log
# TypeScript
dist/
*.tsbuildinfo
# Data directories
data/
cache/
repos/
# HuggingFace specific
.transformers/
.cache/
huggingface/
models/
**/temp_test_repos/
# Test temporary files
coverage/
.nyc_output/
junit.xml
# Database files
*.db
*.sqlite
*.sqlite3
# Environment variables
.env
.env.local
.env.development.local
.env.test.local
.env.production.local
# Log files
logs/
*.log
# Editor directories and files
.idea/
.vscode/
*.swp
*.swo
# OS files
.DS_Store
Thumbs.db
# Build files
build/
out/
```
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
```markdown
# Code Context MCP Server
A Model Context Protocol (MCP) server for providing code context from local git repositories. This server allows you to:
1. Clone git repositories locally
2. Process branches and files
3. Generate embeddings for code chunks
4. Perform semantic search over code
## Features
- Uses local git repositories instead of GitHub API
- Stores data in SQLite database
- Splits code into semantic chunks
- Generates embeddings for code chunks using Ollama
- Provides semantic search over code
## Prerequisites
- Node.js (v16+)
- Git
- Ollama with an embedding model
## Installation
```bash
# Clone the repository
git clone <repository-url>
cd code-context-mcp
# Install dependencies
npm install
# Build the project
npm run build
```
## Configuration
Set the following environment variables:
- `DATA_DIR`: Directory for the SQLite database (default: `~/.codeContextMcp/data`)
- `REPO_CONFIG_DIR`: Directory for repository metadata and cached clones (default: `~/.codeContextMcp/repos`)
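For example, in a shell profile (paths illustrative):
```bash
export DATA_DIR="$HOME/.codeContextMcp/data"
export REPO_CONFIG_DIR="$HOME/.codeContextMcp/repos"
```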
### Using Ollama
The server uses Ollama to generate embeddings:
```bash
# Install Ollama from https://ollama.ai/
# Pull an embedding model (unclemusclez/jina-embeddings-v2-base-code is recommended)
ollama pull unclemusclez/jina-embeddings-v2-base-code
```
## Usage
### Using with Claude Desktop
Add the following configuration to your Claude Desktop configuration file (`claude_desktop_config.json`):
```json
{
"mcpServers": {
"code-context-mcp": {
"command": "/path/to/your/node",
"args": ["/path/to/code-context-mcp/dist/index.js"]
}
}
}
```
## Tools
The server provides the following tool:
### queryRepo
Clones a repository, processes code, and performs semantic search:
```json
{
"repoUrl": "https://github.com/username/repo.git",
"branch": "main", // Optional - defaults to repository's default branch
"semanticSearch": "Your search query",
"keywordsSearch": ["keyword1", "keyword2"], // Filter results by keywords; pass [] to disable
"filePatterns": ["**/*.ts", "src/*.js"], // Filter files by glob patterns; pass [] to disable
"excludePatterns": ["**/node_modules/**"], // Optional - exclude files by glob patterns
"limit": 10 // Optional - number of results to return, default: 10
}
```
The `branch` parameter is optional; if omitted, the tool uses the repository's default branch.
The `keywordsSearch` parameter filters results to chunks that contain at least one of the specified keywords (case-insensitive matching). Pass an empty array to disable it.
The `filePatterns` and `excludePatterns` parameters filter which files are processed and searched using glob patterns (e.g., `**/*.ts` for all TypeScript files). Pass empty arrays to disable them.
## Database Schema
The server uses SQLite with the following schema:
- `repository`: Stores information about repositories
- `branch`: Stores information about branches
- `file`: Stores information about files
- `branch_file_association`: Associates files with branches
- `file_chunk`: Stores code chunks and their embeddings
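For orientation, a minimal sketch (using the `dbInterface` helper from `utils/db.ts`) of how the tables relate; the branch id here is hypothetical:
```typescript
import dbInterface from "./utils/db.js";

// Follow the foreign keys: branch -> branch_file_association -> file -> file_chunk
const chunks = dbInterface.all(
  `SELECT f.path, fc.chunk_number, fc.content
   FROM file_chunk fc
   JOIN file f ON fc.file_id = f.id
   JOIN branch_file_association bfa ON f.id = bfa.file_id
   WHERE bfa.branch_id = ?`,
  1 // hypothetical branch id
);
```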
## Debugging
### Mac M-series (ARM) Architecture Issues
When installing better-sqlite3 on Mac M-series chips (ARM architecture), you may see errors like "mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64e' or 'arm64')". This means the compiled binary does not match your CPU architecture. To resolve it:
```bash
# Check your Node.js architecture
node -p "process.arch"
# If it shows 'arm64', but you're still having issues, try:
npm rebuild better-sqlite3 --build-from-source
# Or for a clean install:
npm uninstall better-sqlite3
export npm_config_arch=arm64
export npm_config_target_arch=arm64
npm install better-sqlite3 --build-from-source
```
If you're using Rosetta, make sure your entire toolchain is consistent: the error above indicates that x86_64 binaries were built while your system needs arm64.
For persistent configuration, add to your .zshrc or .bashrc:
```bash
export npm_config_arch=arm64
export npm_config_target_arch=arm64
```
### Testing Ollama Embeddings
```bash
curl http://localhost:11434/api/embed -d '{"model":"unclemusclez/jina-embeddings-v2-base-code","input":"Llamas are members of the camelid family"}'
curl http://127.0.0.1:11434/api/embed -d '{"model":"unclemusclez/jina-embeddings-v2-base-code","input":"Llamas are members of the camelid family"}'
curl http://[::1]:11434/api/embed -d '{"model":"unclemusclez/jina-embeddings-v2-base-code","input":"Llamas are members of the camelid family"}'
```
## License
MIT
```
--------------------------------------------------------------------------------
/utils/types.ts:
--------------------------------------------------------------------------------
```typescript
/**
* Common interfaces and types used across the codebase
*/
/**
* Interface for objects that can send progress notifications
*/
export interface ProgressNotifier {
sendProgress: (progress: number, total: number) => Promise<void>;
}
```
--------------------------------------------------------------------------------
/claude_desktop_config.example.json:
--------------------------------------------------------------------------------
```json
{
"mcpServers": {
"code-context": {
"command": "node",
"args": ["<CLAUDE_CONFIG_DIR>/mcp-servers/code-context-mcp/dist/start.js"],
"env": {
"DATA_DIR": "<CLAUDE_CONFIG_DIR>/data",
"REPO_CONFIG_DIR": "<CLAUDE_CONFIG_DIR>/repos",
"NODE_ENV": "development"
}
}
}
}
```
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
```json
{
"compilerOptions": {
"target": "ES2020",
"module": "NodeNext",
"moduleResolution": "NodeNext",
"esModuleInterop": true,
"strict": true,
"outDir": "./dist",
"rootDir": ".",
"declaration": true,
"skipLibCheck": true,
"isolatedModules": true,
"allowJs": true,
"resolveJsonModule": true,
"forceConsistentCasingInFileNames": true,
"baseUrl": ".",
"paths": {
"*": ["*"]
}
},
"include": [
"./**/*.ts",
"./**/*.mts",
"./tests/**/*.ts"
],
"exclude": [
"node_modules",
"dist",
"repos"
]
}
```
--------------------------------------------------------------------------------
/config.ts:
--------------------------------------------------------------------------------
```typescript
import path from "path";
import os from "os";
// Available models for code embeddings
export const EMBEDDING_MODELS = {
OLLAMA: {
model: "unclemusclez/jina-embeddings-v2-base-code",
contextSize: 8192,
dimensions: 768,
baseUrl: "http://127.0.0.1:11434",
},
};
export const codeContextConfig = {
ENV: process.env.NODE_ENV || "development",
REPO_CONFIG_DIR:
process.env.REPO_CONFIG_DIR ||
path.join(os.homedir(), ".codeContextMcp", "repos"),
BATCH_SIZE: 100,
DATA_DIR:
process.env.DATA_DIR || path.join(os.homedir(), ".codeContextMcp", "data"),
DB_PATH: process.env.DB_PATH || "code_context.db",
EMBEDDING_MODEL: EMBEDDING_MODELS.OLLAMA,
};
export default codeContextConfig;
```
--------------------------------------------------------------------------------
/jest.config.mjs:
--------------------------------------------------------------------------------
```
export default {
preset: 'ts-jest/presets/default-esm',
clearMocks: true,
coverageDirectory: "coverage",
roots: [
"./tests"
],
moduleNameMapper: {
'^(\\.{1,2}/.*)\\.js$': '$1',
},
transform: {
'^.+\\.tsx?$': [
'ts-jest',
{
isolatedModules: true,
useESM: true,
tsconfig: './tsconfig.json'
}
]
},
testEnvironment: 'node',
moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node', 'mjs'],
extensionsToTreatAsEsm: ['.ts', '.mts'],
transformIgnorePatterns: [
'node_modules/(?!(@huggingface)/)'
],
testMatch: [
'**/?(*.)+(spec|test).ts',
'**/tests/*EmbeddingsTest.ts',
'**/tests/githubRepoTest.ts'
],
globals: {
'ts-jest': {
useESM: true,
},
},
setupFilesAfterEnv: ['<rootDir>/tests/setup.ts'],
verbose: true
};
```
--------------------------------------------------------------------------------
/SETUP.md:
--------------------------------------------------------------------------------
```markdown
# Code Context MCP Setup
## Prerequisites
```bash
ollama pull unclemusclez/jina-embeddings-v2-base-code
```
## Install
```bash
npm install
npm run build
```
## Configuration
Copy `claude_desktop_config.example.json` to your Claude Desktop config location:
**Linux/macOS**: `~/.config/Claude/claude_desktop_config.json`
**Windows**: `%APPDATA%\Claude\claude_desktop_config.json`
Replace `<CLAUDE_CONFIG_DIR>` with your actual path:
- Linux/macOS: `/home/username/.config/Claude`
- Windows: `C:\Users\username\AppData\Roaming\Claude`
## Environment
Copy `.env.example` to `.env` and adjust paths if needed.
The `repos/` directory stores per-repository configuration metadata; remote repositories are cached under `repos/cache/`.
For local repositories (file:// URLs), no cloning occurs - files are accessed directly.
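For example (illustrative values), a `query_repo` call against a local repository might look like:
```json
{
  "repoUrl": "file:///home/username/projects/my-repo",
  "semanticSearch": "database schema definition",
  "keywordsSearch": [],
  "filePatterns": [],
  "excludePatterns": []
}
```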
## Test
```bash
npm run start:mcp
```
Restart Claude Desktop.
```
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
```json
{
"name": "@modelcontextprotocol/server-code-context",
"version": "0.1.0",
"description": "MCP server for code context from local git repositories",
"license": "MIT",
"type": "module",
"bin": {
"mcp-server-code-context": "dist/index.js"
},
"files": [
"dist"
],
"scripts": {
"build": "tsc && shx chmod +x dist/*.js",
"watch": "tsc --watch",
"start": "node dist/index.js",
"start:mcp": "node dist/start.js",
"inspect": "npm run build && npx @modelcontextprotocol/inspector node dist/index.js"
},
"dependencies": {
"@langchain/textsplitters": "^0.1.0",
"@modelcontextprotocol/sdk": "1.0.1",
"axios": "^1.8.4",
"better-sqlite3": "^11.9.1",
"express": "^4.21.1",
"simple-git": "^3.20.0",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.5"
},
"devDependencies": {
"@types/better-sqlite3": "^7.6.4",
"@types/express": "^5.0.0",
"@types/jest": "^29.5.14",
"@types/minimatch": "^5.1.2",
"@types/node": "^20.10.0",
"jest": "^29.7.0",
"jest-environment-node-single-context": "^29.4.0",
"shx": "^0.3.4",
"ts-jest": "^29.3.0",
"tsx": "^4.19.3",
"typescript": "^5.6.2"
}
}
```
--------------------------------------------------------------------------------
/utils/filePatternMatcher.ts:
--------------------------------------------------------------------------------
```typescript
/**
* Convert a glob pattern to an SQL LIKE pattern
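 * e.g. "**/*.ts" becomes "%/%.ts" and "src/*.js" becomes "src/%.js"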
*/
export function globToSqlPattern(pattern: string): string {
// Handle ** (any depth of directories)
let sqlPattern = pattern.replace(/\*\*/g, '%');
// Handle * (any characters within a directory)
sqlPattern = sqlPattern.replace(/\*/g, '%');
return sqlPattern;
}
/**
 * Create SQL WHERE conditions for file pattern filtering.
 * Note: patterns are interpolated directly into the SQL string rather than
 * bound as parameters, so they must come from trusted input.
 */
export function createFilePatternCondition(
includePatterns: string[] | undefined,
excludePatterns: string[] | undefined
): string {
let conditions = '';
// Include patterns (files must match at least one pattern)
if (includePatterns && includePatterns.length > 0) {
const includeConditions = includePatterns.map(pattern => {
const sqlPattern = globToSqlPattern(pattern);
return `f.path LIKE '${sqlPattern}'`;
});
conditions += ` AND (${includeConditions.join(' OR ')})`;
}
// Exclude patterns (files must not match any pattern)
if (excludePatterns && excludePatterns.length > 0) {
const excludeConditions = excludePatterns.map(pattern => {
const sqlPattern = globToSqlPattern(pattern);
return `f.path NOT LIKE '${sqlPattern}'`;
});
conditions += ` AND (${excludeConditions.join(' AND ')})`;
}
return conditions;
}
```
--------------------------------------------------------------------------------
/start.ts:
--------------------------------------------------------------------------------
```typescript
#!/usr/bin/env node
import { spawn } from 'child_process';
import { existsSync, mkdirSync } from 'fs';
import { dirname, join } from 'path';
import { fileURLToPath } from 'url';
const __dirname = dirname(fileURLToPath(import.meta.url));
const DATA_DIR = process.env.DATA_DIR || join(process.env.HOME!, '.config', 'Claude', 'data');
const REPO_CONFIG_DIR = process.env.REPO_CONFIG_DIR || join(process.env.HOME!, '.config', 'Claude', 'repos');
const NODE_ENV = process.env.NODE_ENV || 'development';
[DATA_DIR, REPO_CONFIG_DIR].forEach(dir => {
if (!existsSync(dir)) {
mkdirSync(dir, { recursive: true, mode: 0o755 });
}
});
process.stderr.write(`Starting Code Context MCP Server\n`);
process.stderr.write(`Data Directory: ${DATA_DIR}\n`);
process.stderr.write(`Repo Config: ${REPO_CONFIG_DIR}\n`);
process.stderr.write(`Node Environment: ${NODE_ENV}\n\n`);
const checkOllama = () => {
try {
const result = spawn('pgrep', ['ollama'], { stdio: 'pipe' });
result.on('exit', (code) => {
if (code !== 0) {
process.stderr.write('Starting Ollama...\n');
spawn('ollama', ['serve'], { detached: true, stdio: 'ignore' }).unref();
setTimeout(() => startMcpServer(), 3000);
} else {
startMcpServer();
}
});
} catch {
startMcpServer();
}
};
const startMcpServer = () => {
const serverPath = join(__dirname, 'index.js');
if (!existsSync(serverPath)) {
process.stderr.write(`Error: MCP server not found at ${serverPath}\n`);
process.stderr.write('Run: npm run build\n');
process.exit(1);
}
process.env.DATA_DIR = DATA_DIR;
process.env.REPO_CONFIG_DIR = REPO_CONFIG_DIR;
process.env.NODE_ENV = NODE_ENV;
const server = spawn('node', [serverPath, ...process.argv.slice(2)], {
stdio: 'inherit',
cwd: __dirname
});
server.on('exit', (code) => process.exit(code || 0));
};
checkOllama();
```
--------------------------------------------------------------------------------
/utils/ollamaEmbeddings.ts:
--------------------------------------------------------------------------------
```typescript
import axios from "axios";
import config from "../config.js";
// Tracks whether the initialization message has already been logged
let apiInitialized = false;
/**
* Generate embeddings for text using Ollama API
* @param texts Array of text strings to embed
* @param embeddingModel Optional model configuration to use
* @returns Promise containing array of embeddings
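 * @example
 * const [vector] = await generateOllamaEmbeddings(["function add(a, b) { return a + b; }"]);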
*/
export async function generateOllamaEmbeddings(
texts: string[],
embeddingModel: {
model: string;
contextSize: number;
dimensions: number;
baseUrl?: string;
} = config.EMBEDDING_MODEL
): Promise<number[][]> {
try {
// Log initialization
if (!apiInitialized) {
console.error(
`Initializing Ollama embeddings with model: ${embeddingModel.model}...`
);
apiInitialized = true;
}
const baseUrl = embeddingModel.baseUrl || "http://127.0.0.1:11434";
const embeddings: number[][] = [];
// Process texts in sequential batches to avoid overwhelming the API
console.error(`Generating embeddings for ${texts.length} chunks...`);
const batchSize = 1000; // Number of texts sent per /api/embed request
for (let i = 0; i < texts.length; i += batchSize) {
const batch = texts.slice(i, i + batchSize);
const response = await axios.post(
`${baseUrl}/api/embed`,
{
model: embeddingModel.model,
input: batch,
options: {
num_ctx: embeddingModel.contextSize,
},
},
{
headers: {
"Content-Type": "application/json",
},
}
);
// Collect the embeddings returned for this batch
embeddings.push(...response.data.embeddings);
}
console.error(`Successfully generated ${embeddings.length} embeddings`);
return embeddings;
} catch (error) {
console.error("Error generating embeddings:", error);
// For testing purposes, return mock embeddings if running in test environment
if (config.ENV === "test") {
console.error("Using mock embeddings for testing");
return texts.map(() => generateMockEmbedding(embeddingModel.dimensions));
}
throw error;
}
}
/**
* Generate a simple mock embedding vector for testing
* @param dimensions The number of dimensions in the embedding vector
* @returns A normalized random vector of the specified dimensions
*/
function generateMockEmbedding(dimensions: number): number[] {
// Create a random vector
const vector = Array.from({ length: dimensions }, () => Math.random() - 0.5);
// Normalize the vector
const magnitude = Math.sqrt(vector.reduce((sum, val) => sum + val * val, 0));
return vector.map((val) => val / magnitude);
}
```
--------------------------------------------------------------------------------
/utils/repoConfig.ts:
--------------------------------------------------------------------------------
```typescript
import { existsSync, writeFileSync, readFileSync, mkdirSync } from 'fs';
import { join, basename } from 'path';
import { createHash } from 'crypto';
import config from '../config.js';
interface RepoConfig {
url: string;
localPath?: string;
lastAccessed: number;
type: 'local' | 'remote' | 'cached';
branch?: string;
}
export class RepositoryConfigManager {
private configDir: string;
constructor() {
this.configDir = config.REPO_CONFIG_DIR;
if (!existsSync(this.configDir)) {
mkdirSync(this.configDir, { recursive: true });
}
}
private getConfigPath(repoUrl: string): string {
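// One JSON config file per repository, named by the MD5 hash of its URL: <configDir>/<md5(url)>.json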
const hash = createHash('md5').update(repoUrl).digest('hex');
return join(this.configDir, `${hash}.json`);
}
private sanitizeLocalPath(repoUrl: string): string | null {
if (repoUrl.startsWith('file://')) {
const localPath = repoUrl.replace('file://', '');
return existsSync(localPath) ? localPath : null;
}
return null;
}
getRepositoryPath(repoUrl: string, branch?: string): { path: string; config: RepoConfig } {
const localPath = this.sanitizeLocalPath(repoUrl);
if (localPath) {
const repoConfig: RepoConfig = {
url: repoUrl,
localPath,
lastAccessed: Date.now(),
type: 'local',
branch
};
this.saveConfig(repoUrl, repoConfig);
return { path: localPath, config: repoConfig };
}
const configPath = this.getConfigPath(repoUrl);
let repoConfig: RepoConfig;
if (existsSync(configPath)) {
try {
repoConfig = JSON.parse(readFileSync(configPath, 'utf8'));
repoConfig.lastAccessed = Date.now();
} catch {
repoConfig = this.createRemoteConfig(repoUrl, branch);
}
} else {
repoConfig = this.createRemoteConfig(repoUrl, branch);
}
this.saveConfig(repoUrl, repoConfig);
return { path: repoConfig.localPath || '', config: repoConfig };
}
private createRemoteConfig(repoUrl: string, branch?: string): RepoConfig {
const repoName = basename(repoUrl.replace('.git', ''));
const cacheDir = join(this.configDir, 'cache');
if (!existsSync(cacheDir)) {
mkdirSync(cacheDir, { recursive: true });
}
return {
url: repoUrl,
localPath: join(cacheDir, repoName),
lastAccessed: Date.now(),
type: 'remote',
branch
};
}
private saveConfig(repoUrl: string, config: RepoConfig): void {
const configPath = this.getConfigPath(repoUrl);
writeFileSync(configPath, JSON.stringify(config, null, 2));
}
isLocalRepository(repoUrl: string): boolean {
return repoUrl.startsWith('file://');
}
needsCloning(repoUrl: string): boolean {
if (this.isLocalRepository(repoUrl)) {
return false;
}
const { config } = this.getRepositoryPath(repoUrl);
return !config.localPath || !existsSync(config.localPath);
}
getRepoType(repoUrl: string): 'local' | 'remote' {
return this.isLocalRepository(repoUrl) ? 'local' : 'remote';
}
}
export const repoConfigManager = new RepositoryConfigManager();
```
--------------------------------------------------------------------------------
/index.ts:
--------------------------------------------------------------------------------
```typescript
#!/usr/bin/env node
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import {
CallToolRequestSchema,
ErrorCode,
ListToolsRequestSchema,
McpError,
} from "@modelcontextprotocol/sdk/types.js";
import { QueryRepoSchema, queryRepo } from "./tools/queryRepo.js";
import { zodToJsonSchema } from "zod-to-json-schema";
import { z } from "zod";
import { ProgressNotifier } from "./utils/types.js";
enum ToolName {
QUERY_REPO = "query_repo",
}
class CodeContextServer {
private server: Server;
constructor() {
this.server = new Server(
{
name: "code-context-mcp",
version: "0.1.0",
},
{
capabilities: {
tools: {},
},
}
);
this.setupToolHandlers();
// Error handling
this.server.onerror = (error) => console.error("[MCP Error]", error);
process.on("SIGINT", async () => {
await this.server.close();
process.exit(0);
});
}
private setupToolHandlers() {
this.server.setRequestHandler(ListToolsRequestSchema, async () => ({
tools: [
{
name: ToolName.QUERY_REPO,
description: "Queries a git repository using semantic and keyword search. Use keywords and file patterns if you want to target specific files or terms",
inputSchema: zodToJsonSchema(QueryRepoSchema),
},
],
}));
this.server.setRequestHandler(CallToolRequestSchema, async (request) => {
const { name, arguments: input } = request.params;
const progressToken = request.params._meta?.progressToken;
switch (name) {
case ToolName.QUERY_REPO:
try {
// Create a progress notifier if we have a progress token
let progressNotifier: ProgressNotifier | undefined;
if (progressToken !== undefined) {
progressNotifier = {
sendProgress: async (progress: number, total: number) => {
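// Scale fractional progress (0..1) to integer units out of total * 100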
await this.server.notification({
method: "notifications/progress",
params: {
progress: Math.floor(progress * 100),
total: total * 100,
progressToken,
},
});
},
};
}
// Get the raw result from queryRepo with progress notifications
const result = await queryRepo(
input as z.infer<typeof QueryRepoSchema>,
progressNotifier
);
// Format the response in Claude's expected structure
return {
content: [
{
type: "text",
text: JSON.stringify(result),
},
],
};
} catch (error) {
console.error("Error in query_repo:", error);
return {
content: [
{
type: "text",
text: `Error executing query: ${error instanceof Error ? error.message : String(error)}`,
},
],
};
}
default:
throw new McpError(ErrorCode.MethodNotFound, `Unknown tool: ${name}`);
}
});
}
async run() {
const transport = new StdioServerTransport();
await this.server.connect(transport);
console.error("Code Context MCP server running on stdio");
}
}
const server = new CodeContextServer();
server.run().catch(console.error);
```
--------------------------------------------------------------------------------
/utils/db.ts:
--------------------------------------------------------------------------------
```typescript
import Database from "better-sqlite3";
import fs from "fs";
import path from "path";
import config from "../config.js";
// Ensure the data directory exists
const DATA_DIR = config.DATA_DIR;
if (!fs.existsSync(DATA_DIR)) {
fs.mkdirSync(DATA_DIR, { recursive: true });
}
const DB_PATH = path.join(DATA_DIR, "code_context.db");
const db = new Database(DB_PATH);
console.error(`Using db at: ${DB_PATH}`);
// Enable foreign keys
db.pragma("foreign_keys = ON");
// SQL schema for the database
export const SCHEMA_SQL = `
CREATE TABLE IF NOT EXISTS repository (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
path TEXT NOT NULL,
last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(path)
);
CREATE TABLE IF NOT EXISTS branch (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
repository_id INTEGER NOT NULL,
last_commit_sha TEXT NOT NULL,
status TEXT CHECK(status IN ('pending', 'files_processed', 'embeddings_generated')) DEFAULT 'pending',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (repository_id) REFERENCES repository(id) ON DELETE CASCADE,
UNIQUE(name, repository_id)
);
CREATE TABLE IF NOT EXISTS file (
id INTEGER PRIMARY KEY AUTOINCREMENT,
repository_id INTEGER NOT NULL,
path TEXT NOT NULL,
name TEXT NOT NULL,
sha TEXT NOT NULL,
status TEXT CHECK(status IN ('pending', 'fetched', 'ingested', 'done')) DEFAULT 'pending',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (repository_id) REFERENCES repository(id) ON DELETE CASCADE,
UNIQUE(repository_id, path, sha)
);
CREATE TABLE IF NOT EXISTS branch_file_association (
branch_id INTEGER NOT NULL,
file_id INTEGER NOT NULL,
PRIMARY KEY (branch_id, file_id),
FOREIGN KEY (branch_id) REFERENCES branch(id) ON DELETE CASCADE,
FOREIGN KEY (file_id) REFERENCES file(id) ON DELETE CASCADE
);
CREATE TABLE IF NOT EXISTS file_chunk (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_id INTEGER NOT NULL,
content TEXT NOT NULL,
chunk_number INTEGER NOT NULL,
embedding TEXT,
model_version TEXT,
token_count INTEGER,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (file_id) REFERENCES file(id) ON DELETE CASCADE,
UNIQUE(file_id, chunk_number)
);
`;
// Initialize the database
export const initializeDatabase = () => {
try {
// Split the schema SQL into individual statements
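// (a naive split on ";" is safe here because SCHEMA_SQL contains no semicolons inside string literals)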
const statements = SCHEMA_SQL.split(";").filter(
(stmt) => stmt.trim().length > 0
);
// Execute each statement
for (const statement of statements) {
db.exec(statement + ";");
}
} catch (error) {
console.error("Error initializing database:", error);
throw error;
}
};
// Helper function to run queries with parameters
const run = (sql: string, params: any = {}) => {
return db.prepare(sql).run(params);
};
// Helper function to get a single row
const get = (sql: string, params: any = {}) => {
return db.prepare(sql).get(params);
};
// Helper function to get all rows
const all = (sql: string, params: any = {}) => {
return db.prepare(sql).all(params);
};
// Define a type for the database operations that can be performed in a transaction
export interface DatabaseOperations {
prepare: (sql: string) => {
run: (params?: any) => any;
get: (params?: any) => any;
all: (params?: any) => any;
};
}
// Create a transaction function that's compatible with the existing code
const transaction = (cb: (dbOps: any) => any): any => {
const runTransaction = db.transaction(cb);
return runTransaction(db);
};
// Define a public interface for our database module
export interface DatabaseInterface {
run: (sql: string, params?: any) => any;
get: (sql: string, params?: any) => any;
all: (sql: string, params?: any) => any;
transaction: (cb: (dbOps: any) => any) => any;
close: () => void;
}
// Initialize the database
initializeDatabase();
// Export the database interface
const dbInterface: DatabaseInterface = {
run,
get,
all,
transaction,
close: () => db.close(),
};
export default dbInterface;
```
--------------------------------------------------------------------------------
/tools/embedFiles.ts:
--------------------------------------------------------------------------------
```typescript
import { z } from "zod";
import dbInterface from "../utils/db.js";
import { generateOllamaEmbeddings } from "../utils/ollamaEmbeddings.js";
import { ProgressNotifier } from "../utils/types.js";
import config from "../config.js";
// Define input schema for embedFiles
export const EmbedFilesSchema = z.object({
repoLocalPath: z.string().describe("Local path to the cloned repository"),
branchId: z.number().describe("Branch ID in the database"),
_meta: z
.object({
progressToken: z.union([z.string(), z.number()]).optional(),
})
.optional(),
});
// Define chunk interface
interface Chunk {
id: number;
content: string;
file_id: number;
}
export async function embedFiles(
input: z.infer<typeof EmbedFilesSchema>,
progressNotifier?: ProgressNotifier
) {
try {
console.error(
`[embedFiles] Starting with parameters: ${JSON.stringify(input)}`
);
// Check if input is defined
if (!input) {
console.error(`[embedFiles] Error: Input parameters are undefined`);
return {
error: {
message: "Input parameters are required for embedFiles tool",
},
};
}
const startTime = Date.now();
const { branchId } = input;
// First check if the branch exists
const branchExists = dbInterface.get(
"SELECT id, status FROM branch WHERE id = ?",
branchId
);
if (!branchExists) {
console.error(`[embedFiles] Error: Branch with ID ${branchId} does not exist`);
return {
error: {
message: `Branch with ID ${branchId} does not exist`,
},
};
}
// Check if there are any files associated with this branch
const fileCount = dbInterface.get(
"SELECT COUNT(*) as count FROM branch_file_association WHERE branch_id = ?",
branchId
);
if (!fileCount || fileCount.count === 0) {
console.error(`[embedFiles] No files found for branch ${branchId}`);
// Still update the branch status
console.error(`[embedFiles] Setting branch status to 'embeddings_generated'`);
dbInterface.run(
"UPDATE branch SET status = 'embeddings_generated' WHERE id = ?",
branchId
);
return { success: true, chunksProcessed: 0 };
}
// Get all chunks that need embeddings
console.error(`[embedFiles] Finding chunks that need embeddings for branch ${branchId}`);
const chunks = dbInterface.all(
`SELECT fc.id, fc.content, f.id as file_id
FROM file_chunk fc
JOIN file f ON fc.file_id = f.id
JOIN branch_file_association bfa ON f.id = bfa.file_id
WHERE bfa.branch_id = ?
AND fc.embedding IS NULL`,
branchId
);
if (chunks.length === 0) {
console.error(`[embedFiles] No chunks need embeddings, skipping`);
// Update branch status even when no chunks need embeddings
console.error(`[embedFiles] Setting branch status to 'embeddings_generated'`);
dbInterface.run(
"UPDATE branch SET status = 'embeddings_generated' WHERE id = ?",
branchId
);
if (progressNotifier) {
await progressNotifier.sendProgress(1, 1);
}
return { success: true, chunksProcessed: 0 };
}
console.error(`[embedFiles] Found ${chunks.length} chunks that need embeddings`);
let processedChunks = 0;
const totalChunks = chunks.length;
const BATCH_SIZE = 100;
// Process chunks in batches of BATCH_SIZE
for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
const batch = chunks.slice(i, i + BATCH_SIZE);
console.error(
`[embedFiles] Processing batch ${Math.floor(i/BATCH_SIZE) + 1}/${Math.ceil(totalChunks/BATCH_SIZE)}`
);
// Generate embeddings for the batch
const chunkContents = batch.map((chunk: Chunk) => chunk.content);
console.error(`[embedFiles] Generating embeddings for ${batch.length} chunks`);
const embeddingStartTime = Date.now();
const embeddings = await generateOllamaEmbeddings(chunkContents);
console.error(
`[embedFiles] Generated embeddings in ${Date.now() - embeddingStartTime}ms`
);
// Store embeddings in transaction
console.error(`[embedFiles] Storing embeddings`);
dbInterface.transaction((db) => {
const updateStmt = db.prepare(
`UPDATE file_chunk
SET embedding = ?, model_version = ?
WHERE id = ?`
);
for (let j = 0; j < batch.length; j++) {
const chunk = batch[j];
const embedding = JSON.stringify(embeddings[j]);
updateStmt.run(embedding, config.EMBEDDING_MODEL.model, chunk.id);
}
});
processedChunks += batch.length;
// Update progress
if (progressNotifier) {
const progress = processedChunks / totalChunks;
await progressNotifier.sendProgress(progress, 1);
}
}
// Update branch status
console.error(`[embedFiles] Setting branch status to 'embeddings_generated'`);
dbInterface.run(
"UPDATE branch SET status = 'embeddings_generated' WHERE id = ?",
branchId
);
console.error(
`[embedFiles] Processed ${processedChunks} chunks in ${
Date.now() - startTime
}ms`
);
return {
success: true,
chunksProcessed: processedChunks
};
} catch (error) {
console.error(`[embedFiles] Error executing tool:`, error);
return {
error: {
message: `Error executing embedFiles tool: ${
error instanceof Error ? error.message : String(error)
}`,
},
};
}
}
```
--------------------------------------------------------------------------------
/tools/ingestBranch.ts:
--------------------------------------------------------------------------------
```typescript
import { z } from "zod";
import { simpleGit } from "simple-git";
import path from "path";
import fs from "fs";
import dbInterface from "../utils/db.js";
import { ProgressNotifier } from "../utils/types.js";
import config from "../config.js";
import { repoConfigManager } from "../utils/repoConfig.js";
// Define input schema for ingestBranch
export const IngestBranchSchema = z.object({
repoUrl: z.string().describe("GitHub repository URL"),
branch: z
.string()
.optional()
.describe("Branch name to query (defaults to repository's default branch)"),
_meta: z
.object({
progressToken: z.union([z.string(), z.number()]).optional(),
})
.optional(),
});
// Define chunk interface
interface Chunk {
content: string;
chunkNumber: number;
tokenCount: number;
}
const cloneRepository = async (
repoUrl: string,
localPath: string
): Promise<string> => {
// Extract repository name from URL
const repoName = path.basename(repoUrl, ".git");
const fullPath = path.join(localPath, repoName);
// Check if repository already exists
if (fs.existsSync(fullPath)) {
console.error(`Repository already exists at ${fullPath}`);
return fullPath;
}
// Clone the repository
console.error(`Cloning repository ${repoUrl} to ${fullPath}`);
const git = simpleGit();
await git.clone(repoUrl, fullPath);
return fullPath;
};
// Modified cloneRepository function wrapper that reports progress
async function cloneRepositoryWithProgress(
repoUrl: string,
reposDir: string,
progressNotifier?: ProgressNotifier
): Promise<string> {
// Send initial progress notification (start of cloning - 0% of the 33%)
if (progressNotifier) {
await progressNotifier.sendProgress(0, 1);
}
// Set up a timer to periodically send progress updates
let progressPercentage = 0;
let isCloning = true;
const progressInterval = 1500; // 1.5 seconds between updates
const maxProgress = 0.30; // Progress up to 30% (reserving 3% for completion)
const progressStep = 0.02; // Increments of 2%
// Create an interval that will send progress updates periodically
let timer: NodeJS.Timeout | null = null;
if (progressNotifier) {
timer = setInterval(async () => {
if (isCloning && progressPercentage < maxProgress) {
progressPercentage += progressStep;
await progressNotifier!.sendProgress(progressPercentage, 1);
}
}, progressInterval);
}
try {
// Start cloning operation
const repoLocalPath = await cloneRepository(repoUrl, reposDir);
// Clone completed
isCloning = false;
// Send completion of cloning phase (33% of total progress)
if (progressNotifier) {
await progressNotifier.sendProgress(0.33, 1);
}
return repoLocalPath;
} finally {
// Clean up the timer when done
if (timer) {
clearInterval(timer);
}
}
}
export async function ingestBranch(
input: z.infer<typeof IngestBranchSchema>,
progressNotifier?: ProgressNotifier
) {
try {
console.error(
`[ingestBranch] Starting with parameters: ${JSON.stringify(input)}`
);
// Check if input is defined
if (!input) {
console.error(`[ingestBranch] Error: Input parameters are undefined`);
return {
error: {
message: "Input parameters are required for ingestBranch tool",
},
};
}
const startTime = Date.now();
const { repoUrl, branch } = input;
// Validate required parameters
if (!repoUrl) {
console.error(`[ingestBranch] Error: Missing required parameter repoUrl`);
return {
error: {
message: "Required parameter (repoUrl) is missing",
},
};
}
// Get repository path using config manager
const { path: repoLocalPath, config: repoConfig } = repoConfigManager.getRepositoryPath(repoUrl, branch);
let actualBranch = branch || "";
console.error(
`[ingestBranch] Processing repository: ${repoUrl}, type: ${repoConfig.type}, branch: ${actualBranch || 'default'}`
);
// Handle repository based on type
if (repoConfig.type === 'local') {
console.error(`[ingestBranch] Using local repository at: ${repoLocalPath}`);
} else {
// Only clone if needed
if (repoConfigManager.needsCloning(repoUrl)) {
console.error(`[ingestBranch] Cloning remote repository to: ${repoLocalPath}`);
await cloneRepositoryWithProgress(repoUrl, path.dirname(repoLocalPath), progressNotifier);
} else {
console.error(`[ingestBranch] Using cached repository at: ${repoLocalPath}`);
}
}
console.error(
`[ingestBranch] Repository ready at: ${repoLocalPath} (${
Date.now() - startTime
}ms)`
);
// Initialize git
const git = simpleGit(repoLocalPath);
// If branch is not specified, get the default branch using git
if (!actualBranch) {
console.error(`[ingestBranch] Branch not specified, getting default branch`);
try {
// Get the default branch name
const defaultBranch = await git.revparse(['--abbrev-ref', 'HEAD']);
actualBranch = defaultBranch;
console.error(`[ingestBranch] Using default branch: ${actualBranch}`);
} catch (error) {
console.error(`[ingestBranch] Error getting default branch:`, error);
// Fallback to 'main' if we can't determine the default branch
actualBranch = "main";
console.error(`[ingestBranch] Falling back to branch: ${actualBranch}`);
}
}
// Checkout the branch
console.error(`[ingestBranch] Checking out branch: ${actualBranch}`);
await git.checkout(actualBranch);
const latestCommit = await git.revparse([actualBranch]);
console.error(`[ingestBranch] Latest commit SHA: ${latestCommit}`);
// Extract repo name from URL
const repoName = path.basename(repoUrl, ".git");
// Check if repo exists in database
console.error(
`[ingestBranch] Checking if repo exists in database: ${repoName}`
);
const repoExists = dbInterface.get(
"SELECT id FROM repository WHERE name = ?",
repoName
);
let repoId;
if (repoExists) {
repoId = repoExists.id;
console.error(
`[ingestBranch] Repository found in database with ID: ${repoId}`
);
} else {
// Register repository
console.error(`[ingestBranch] Registering new repository: ${repoName}`);
const result = dbInterface.run(
"INSERT INTO repository (name, path) VALUES (?, ?)",
[repoName, repoLocalPath]
);
repoId = result.lastInsertRowid;
console.error(`[ingestBranch] Repository registered with ID: ${repoId}`);
}
// Check if branch exists and has the same commit SHA
console.error(`[ingestBranch] Checking if branch exists in database`);
const branchExists = dbInterface.get(
"SELECT id, last_commit_sha, status FROM branch WHERE name = ? AND repository_id = ?",
[actualBranch, repoId]
);
let branchId;
let needsUpdate = false;
if (branchExists) {
branchId = branchExists.id;
console.error(
`[ingestBranch] Branch found in database with ID: ${branchId}`
);
// Step 1: Check if SHA changed
if (branchExists.last_commit_sha !== latestCommit) {
console.error(`[ingestBranch] Commit SHA changed, updating branch: ${branchId}`);
// Update branch commit SHA and set status to 'pending'
dbInterface.run(
"UPDATE branch SET last_commit_sha = ?, status = 'pending' WHERE id = ?",
[latestCommit, branchId]
);
needsUpdate = true;
}
// Step 2: Check if status is not embeddings_generated
if (branchExists.status !== 'embeddings_generated') {
console.error(`[ingestBranch] Branch status is "${branchExists.status}" not "embeddings_generated", needs processing`);
needsUpdate = true;
}
if (!needsUpdate) {
console.error(`[ingestBranch] No changes needed, skipping update`);
}
} else {
// Register the branch
console.error(`[ingestBranch] Registering new branch: ${actualBranch}`);
const result = dbInterface.run(
"INSERT INTO branch (name, repository_id, last_commit_sha, status) VALUES (?, ?, ?, 'pending')",
[actualBranch, repoId, latestCommit]
);
branchId = result.lastInsertRowid;
needsUpdate = true;
console.error(`[ingestBranch] Branch registered with ID: ${branchId}`);
}
// We don't process files directly here, just return the state
// The actual file processing will happen in processFiles.ts
return {
repoLocalPath,
repoId,
branchId,
needsUpdate,
repoName,
actualBranch,
latestCommit
};
} catch (error) {
console.error(`[ingestBranch] Error executing tool:`, error);
return {
error: {
message: `Error executing ingestBranch tool: ${
error instanceof Error ? error.message : String(error)
}`,
},
};
}
}
```
--------------------------------------------------------------------------------
/tools/queryRepo.ts:
--------------------------------------------------------------------------------
```typescript
import { z } from "zod";
import dbInterface from "../utils/db.js";
import { generateOllamaEmbeddings } from "../utils/ollamaEmbeddings.js";
import { createFilePatternCondition } from "../utils/filePatternMatcher.js";
import { ProgressNotifier } from "../utils/types.js";
import { ingestBranch } from "./ingestBranch.js";
import { processFiles } from "./processFiles.js";
// Define input schemas for tools
export const QueryRepoSchema = z.object({
repoUrl: z.string().describe("GitHub repository URL"),
branch: z
.string()
.optional()
.describe("Branch name to query (defaults to repository's default branch)"),
semanticSearch: z.string().describe("Query for semantic search. This search is not exact, it will try to find the most relevant files, it doesn't accept file: or path: prefixes."),
keywordsSearch: z
.array(z.string())
.describe(
"Search to the files that contain at least one of the keywords in this list. Leave empty to disable. This can work in conjunction with the semantic search."
),
filePatterns: z
.array(z.string())
.describe(
"Array of glob patterns to filter files (e.g. '**/*.ts', 'src/*.js'). Use it for a more effective search or to target specific files for example 'somefile.tsx'. Leave empty to disable"
),
excludePatterns: z
.array(z.string())
.optional()
.describe(
"Array of glob patterns to exclude files (e.g. '**/node_modules/**', '**/dist/**'). Use it to exclude files that are not relevant to the search. Leave empty to disable"
),
limit: z.number().optional().describe("Maximum number of results to return"),
_meta: z
.object({
progressToken: z.union([z.string(), z.number()]).optional(),
})
.optional(),
});
// Helper function to create a heartbeat progress notifier
function createHeartbeatNotifier(originalNotifier?: ProgressNotifier, heartbeatMs: number = 2000): {
notifier: ProgressNotifier;
stopHeartbeat: () => void;
} {
if (!originalNotifier) {
return {
notifier: {
sendProgress: async () => {} // No-op if no original notifier
},
stopHeartbeat: () => {}
};
}
let currentProgress = 0;
let currentMax = 1;
let isActive = true;
let lastUpdate = Date.now();
// Heartbeat interval
const intervalId = setInterval(async () => {
if (!isActive) return;
// Only send if it's been more than heartbeatMs since the last update
if (Date.now() - lastUpdate >= heartbeatMs) {
console.error(`[queryRepo] Heartbeat progress: ${currentProgress}/${currentMax}`);
await originalNotifier.sendProgress(currentProgress, currentMax);
}
}, heartbeatMs);
return {
notifier: {
sendProgress: async (progress: number, max: number) => {
currentProgress = progress;
currentMax = max;
lastUpdate = Date.now();
await originalNotifier.sendProgress(progress, max);
}
},
stopHeartbeat: () => {
isActive = false;
clearInterval(intervalId);
}
};
}
export async function queryRepo(
input: z.infer<typeof QueryRepoSchema>,
progressNotifier?: ProgressNotifier
) {
// Create heartbeat notifier that will send regular updates
const { notifier: heartbeatNotifier, stopHeartbeat } = createHeartbeatNotifier(progressNotifier);
try {
console.error(
`[queryRepo] Starting with parameters: ${JSON.stringify(input)}`
);
// Check if input is defined
if (!input) {
console.error(`[queryRepo] Error: Input parameters are undefined`);
return {
error: {
message: "Input parameters are required for queryRepo tool",
},
};
}
const startTime = Date.now();
const {
repoUrl,
branch,
semanticSearch: semanticSearchInput,
keywordsSearch,
limit,
filePatterns,
excludePatterns,
} = input;
// Validate required parameters
if (!repoUrl || (!semanticSearchInput && !keywordsSearch)) {
console.error(`[queryRepo] Error: Missing required parameters`);
return {
error: {
message: "Required parameters (repoUrl, semanticSearch or keywordsSearch) are missing",
},
};
}
let semanticSearch = semanticSearchInput;
if (!semanticSearchInput) {
semanticSearch = keywordsSearch.join(" ");
}
// Initialize progress at start
await heartbeatNotifier.sendProgress(0.05, 1);
// Step 1: Ingest the branch (25% of progress)
console.error(`[queryRepo] Ingesting branch: ${repoUrl}, ${branch || 'default'}`);
const branchResult = await ingestBranch(
{
repoUrl,
branch
},
undefined // Don't pass progress notifier to individual tools
);
// Update progress after branch ingestion
await heartbeatNotifier.sendProgress(0.25, 1);
// Check for error
if ('error' in branchResult) {
console.error(`[queryRepo] Error in ingestBranch:`, branchResult.error);
return { error: branchResult.error };
}
const branchData = branchResult;
// Step 2: Process files if needed (50% of progress)
console.error(`[queryRepo] Processing files for branch: ${branchData.branchId}`);
const filesResult = await processFiles(
{
repoLocalPath: branchData.repoLocalPath,
repoId: branchData.repoId,
branchId: branchData.branchId,
actualBranch: branchData.actualBranch,
needsUpdate: branchData.needsUpdate
},
undefined // Don't pass progress notifier to individual tools
);
// Update progress after file processing
await heartbeatNotifier.sendProgress(0.5, 1);
// Check for error
if ('error' in filesResult) {
console.error(`[queryRepo] Error in processFiles:`, filesResult.error);
return { error: filesResult.error };
}
// Generate embedding for the query
console.error(`[queryRepo] Generating embedding for query: "${semanticSearch}"`);
const queryEmbedStart = Date.now();
const [queryEmbedding] = await generateOllamaEmbeddings([semanticSearch]);
const queryEmbeddingStr = JSON.stringify(queryEmbedding);
console.error(
`[queryRepo] Generated query embedding in ${
Date.now() - queryEmbedStart
}ms`
);
// Update progress after query embedding
await heartbeatNotifier.sendProgress(0.6, 1);
// Search for similar chunks using SQLite's JSON functions for vector similarity
console.error(
`[queryRepo] Searching for similar chunks with limit: ${limit}`
);
const searchStart = Date.now();
// Use a default limit of 10 if undefined
const effectiveLimit = limit === undefined ? 10 : limit;
// Create SQL condition for file pattern filtering
const filePatternCondition = createFilePatternCondition(
filePatterns,
excludePatterns
);
const results = dbInterface.all(
`
SELECT fc.content, f.path, fc.chunk_number,
(SELECT (SELECT SUM(json_extract(value, '$') * json_extract(?, '$[' || key || ']'))
FROM json_each(fc.embedding)
GROUP BY key IS NOT NULL)
)/${queryEmbedding.length} as similarity
FROM file_chunk fc
JOIN file f ON fc.file_id = f.id
JOIN branch_file_association bfa ON f.id = bfa.file_id
WHERE bfa.branch_id = ?
AND fc.embedding IS NOT NULL
${filePatternCondition}
ORDER BY similarity DESC
LIMIT ?
`,
[queryEmbeddingStr, branchData.branchId, effectiveLimit]
);
console.error(
`[queryRepo] Search completed in ${Date.now() - searchStart}ms, found ${
results.length
} results`
);
// Update progress after initial search
await heartbeatNotifier.sendProgress(0.7, 1);
// If no results found, check if embeddings need to be generated
if (results.length === 0) {
console.error(`[queryRepo] No results found, checking if embeddings need to be generated`);
// Check if there are any chunks without embeddings
const chunksWithoutEmbeddings = dbInterface.get(
`SELECT COUNT(*) as count
FROM file_chunk fc
JOIN file f ON fc.file_id = f.id
JOIN branch_file_association bfa ON f.id = bfa.file_id
WHERE bfa.branch_id = ?
AND fc.embedding IS NULL`,
branchData.branchId
);
if (chunksWithoutEmbeddings && chunksWithoutEmbeddings.count > 0) {
console.error(`[queryRepo] Found ${chunksWithoutEmbeddings.count} chunks without embeddings, generating them`);
// Import embedFiles function
const { embedFiles } = await import('./embedFiles.js');
// Generate embeddings (75-90% of progress)
await heartbeatNotifier.sendProgress(0.75, 1);
// Generate embeddings
const embedResult = await embedFiles(
{
repoLocalPath: branchData.repoLocalPath,
branchId: branchData.branchId
},
undefined // Don't pass progress notifier to individual tools
);
// Update progress after embedding generation
await heartbeatNotifier.sendProgress(0.9, 1);
if ('error' in embedResult) {
console.error(`[queryRepo] Error generating embeddings:`, embedResult.error);
return { error: embedResult.error };
}
// Try searching again after generating embeddings
console.error(`[queryRepo] Retrying search after generating embeddings`);
const retryResults = dbInterface.all(
`
SELECT fc.content, f.path, fc.chunk_number,
(SELECT (SELECT SUM(json_extract(value, '$') * json_extract(?, '$[' || key || ']'))
FROM json_each(fc.embedding)
GROUP BY key IS NOT NULL)
)/${queryEmbedding.length} as similarity
FROM file_chunk fc
JOIN file f ON fc.file_id = f.id
JOIN branch_file_association bfa ON f.id = bfa.file_id
WHERE bfa.branch_id = ?
AND fc.embedding IS NOT NULL
${filePatternCondition}
ORDER BY similarity DESC
LIMIT ?
`,
[queryEmbeddingStr, branchData.branchId, effectiveLimit]
);
console.error(
`[queryRepo] Retry search completed, found ${retryResults.length} results`
);
results.push(...retryResults);
}
}
// Filter results by keywords if provided
let filteredResults = results;
if (keywordsSearch && keywordsSearch.length > 0) {
console.error(
`[queryRepo] Filtering results by keywords: ${keywordsSearch.join(", ")}`
);
const keywordFilterStart = Date.now();
// Convert keywords to lowercase for case-insensitive matching
const lowercaseKeywords = keywordsSearch.map((kw) => kw.trim().toLowerCase());
filteredResults = results.filter((result: { content: string }) => {
const content = result.content.toLowerCase();
// Check if the content contains at least one of the keywords
return lowercaseKeywords.some((keyword) => content.includes(keyword));
});
console.error(
`[queryRepo] Keyword filtering completed in ${
Date.now() - keywordFilterStart
}ms, filtered from ${results.length} to ${
filteredResults.length
} results`
);
}
// Update progress to completion
await heartbeatNotifier.sendProgress(1, 1);
const totalTime = Date.now() - startTime;
console.error(`[queryRepo] Tool completed in ${totalTime}ms`);
return {
output: {
success: true,
repoUrl,
branch: branchData.actualBranch,
processingTimeMs: totalTime,
results: filteredResults.map((result: any) => ({
filePath: result.path,
chunkNumber: result.chunk_number,
content: result.content,
similarity: result.similarity,
})),
},
};
} catch (error) {
console.error(`[queryRepo] Error executing tool:`, error);
return {
error: {
message: `Error executing queryRepo tool: ${
error instanceof Error ? error.message : String(error)
}`,
},
};
} finally {
// Always stop the heartbeat when done
stopHeartbeat();
}
}
```
--------------------------------------------------------------------------------
/tools/processFiles.ts:
--------------------------------------------------------------------------------
```typescript
import { z } from "zod";
import dbInterface from "../utils/db.js";
import { ProgressNotifier } from "../utils/types.js";
import { simpleGit } from "simple-git";
import path from "path";
import { extensionToSplitter, splitDocument } from "../utils/codeSplitter.js";
import fs from "fs";
interface RepositoryFile {
path: string;
name: string;
sha: string;
}
interface RepositoryFilesResult {
files: RepositoryFile[];
commitSha: string;
}
interface PendingFile {
id: number;
path: string;
sha: string;
}
// Define input schema for processFiles
export const ProcessFilesSchema = z.object({
repoLocalPath: z.string().describe("Local path to the cloned repository"),
repoId: z.number().describe("Repository ID in the database"),
branchId: z.number().describe("Branch ID in the database"),
actualBranch: z.string().describe("Actual branch name"),
needsUpdate: z.boolean().describe("Whether the branch needs updating"),
_meta: z
.object({
progressToken: z.union([z.string(), z.number()]).optional(),
})
.optional(),
});
/**
* Get the files in a repository branch
* @param repoPath Path to the repository
* @param branchName Name of the branch
* @returns List of files with their metadata
*/
export const getRepositoryFiles = async (
repoPath: string,
branchName: string,
): Promise<RepositoryFilesResult> => {
const git = simpleGit(repoPath);
// Checkout the branch
await git.checkout(branchName);
// Get the latest commit SHA
const latestCommit = await git.revparse([branchName]);
// Get the file tree
const files: RepositoryFile[] = [];
// Use git ls-tree to get all files recursively
const result = await git.raw(["ls-tree", "-r", branchName]);
const stdout = result.toString();
// Parse the output
const lines = stdout.split("\n").filter((line) => line.trim() !== "");
for (const line of lines) {
// Format: <mode> <type> <object> <file>
const [info, filePath] = line.split("\t");
const [, , sha] = info.split(" ");
if (filePath) {
files.push({
path: filePath,
name: path.basename(filePath),
sha,
});
}
}
return { files, commitSha: latestCommit };
};
/**
* Process file content and split into chunks
* @param branchName Branch name
* @param repoPath Repository path
*/
export const processFileContents = async (
branchName: string,
repoPath: string
): Promise<void> => {
const git = simpleGit(repoPath);
// Checkout the branch
await git.checkout(branchName);
// Get repository and branch IDs
const repo = dbInterface.get("SELECT id FROM repository WHERE path = ?", repoPath) as { id: number };
const branch = dbInterface.get(
"SELECT id FROM branch WHERE name = ? AND repository_id = ?",
[branchName, repo.id]
) as { id: number };
// Get all pending files for the branch
const pendingFiles = dbInterface.all(
`SELECT f.id, f.path, f.sha
FROM file f
JOIN branch_file_association bfa ON f.id = bfa.file_id
WHERE f.status = 'pending' AND bfa.branch_id = ?`,
branch.id
) as PendingFile[];
for (const file of pendingFiles) {
console.error(`Processing file: ${file.path}`);
const extension = file.path.split(".").pop()?.toLowerCase();
const splitType = extension ? extensionToSplitter(extension) : "ignore";
if (splitType !== "ignore") {
try {
// Get file content
const filePath = path.join(repoPath, file.path);
// Skip if file doesn't exist (might have been deleted)
if (!fs.existsSync(filePath)) {
console.error(`File ${file.path} doesn't exist, skipping`);
continue;
}
let content = fs.readFileSync(filePath, "utf-8");
// Check for null bytes in the content
if (content.includes("\0")) {
console.error(
`File ${file.path} contains null bytes. Removing them.`
);
content = content.replace(/\0/g, "");
}
// Check if the content is valid UTF-8
try {
new TextDecoder("utf-8", { fatal: true }).decode(
new TextEncoder().encode(content)
);
} catch (e) {
console.error(
`File ${file.path} contains invalid UTF-8 characters. Replacing them.`
);
content = content.replace(/[^\x00-\x7F]/g, ""); // Remove non-ASCII characters
}
// Truncate content if it's too long
const maxLength = 1000000; // Adjust this value based on your database column size
if (content.length > maxLength) {
console.error(
`File ${file.path} content is too long. Truncating to ${maxLength} characters.`
);
content = content.substring(0, maxLength);
}
// Split the document
const chunks = await splitDocument(file.path, content);
// Store chunks in the database using dbInterface.transaction
dbInterface.transaction((db) => {
for (let i = 0; i < chunks.length; i++) {
db.prepare(
`INSERT INTO file_chunk (file_id, content, chunk_number)
VALUES (?, ?, ?)
ON CONFLICT(file_id, chunk_number) DO NOTHING`
).run(file.id, chunks[i].pageContent, i + 1);
}
// Update file status to 'fetched'
db.prepare("UPDATE file SET status = ? WHERE id = ?").run(
"fetched",
file.id
);
});
} catch (error) {
console.error(`Error processing file ${file.path}:`, error);
}
} else {
// Update file status to 'done' for ignored files
dbInterface.run("UPDATE file SET status = ? WHERE id = ?", ["done", file.id]);
}
}
};
export async function processFiles(
input: z.infer<typeof ProcessFilesSchema>,
progressNotifier?: ProgressNotifier
) {
try {
console.error(
`[processFiles] Starting with parameters: ${JSON.stringify(input)}`
);
// Check if input is defined
if (!input) {
console.error(`[processFiles] Error: Input parameters are undefined`);
return {
error: {
message: "Input parameters are required for processFiles tool",
},
};
}
const startTime = Date.now();
const { repoLocalPath, repoId, branchId, actualBranch, needsUpdate } = input;
// Skip if no update is needed
if (!needsUpdate) {
console.error(`[processFiles] No update needed, skipping`);
return {
needsUpdate: false,
filesToProcess: []
};
}
// Process the repository files
console.error(
`[processFiles] Processing repository files (${Date.now() - startTime}ms)`
);
// Get all files in the repository
const { files } = await getRepositoryFiles(repoLocalPath, actualBranch);
console.error(`[processFiles] Found ${files.length} files in repository`);
// Define transaction function
console.error(`[processFiles] Starting file database transaction`);
const syncFiles = (db: any) => {
// Get existing files to compare
const existingFiles = db
.prepare(
`SELECT f.id, f.path, f.sha FROM file f
JOIN branch_file_association bfa ON f.id = bfa.file_id
WHERE bfa.branch_id = ?`
)
.all(branchId);
console.error(
`[processFiles] Found ${existingFiles.length} existing files in database`
);
const existingFileMap = new Map();
for (const file of existingFiles) {
existingFileMap.set(file.path, file);
}
// Track files that need processing
const filesToProcess: any[] = [];
// File counters for logging
let newFiles = 0;
let updatedFiles = 0;
let unchangedFiles = 0;
let removedFiles = 0;
// Process each file
for (const file of files) {
const existingFile = existingFileMap.get(file.path);
existingFileMap.delete(file.path); // Remove from map to track what's left later
if (!existingFile) {
// New file - but first check if it already exists in the database for another branch
const existingFileInDB = db.prepare(
"SELECT id FROM file WHERE repository_id = ? AND path = ? AND sha = ?"
).get(repoId, file.path, file.sha);
let fileId;
if (existingFileInDB) {
// File exists but not associated with this branch
console.error(`[processFiles] File exists in DB but not associated with branch: ${file.path}`);
fileId = existingFileInDB.id;
// Check if the file is already associated with this branch
const associationExists = db.prepare(
"SELECT 1 FROM branch_file_association WHERE branch_id = ? AND file_id = ?"
).get(branchId, fileId);
if (!associationExists) {
// Associate existing file with current branch
db.prepare(
"INSERT INTO branch_file_association (branch_id, file_id) VALUES (?, ?)"
).run(branchId, fileId);
}
} else {
// Truly new file
newFiles++;
const result = db
.prepare(
"INSERT INTO file (repository_id, path, sha, name, status) VALUES (?, ?, ?, ?, 'pending')"
)
.run(repoId, file.path, file.sha, file.name);
fileId = result.lastInsertRowid;
// Associate with branch
db.prepare(
"INSERT INTO branch_file_association (branch_id, file_id) VALUES (?, ?)"
).run(branchId, fileId);
}
filesToProcess.push({
id: fileId,
path: file.path,
name: file.name,
});
} else if (existingFile.sha !== file.sha) {
// Updated file - SHA changed
updatedFiles++;
db.prepare(
"UPDATE file SET sha = ?, status = 'pending' WHERE id = ?"
).run(file.sha, existingFile.id);
filesToProcess.push({
id: existingFile.id,
path: file.path,
name: file.name,
});
} else {
// Unchanged file
unchangedFiles++;
}
}
// Remove files that no longer exist in the branch
for (const [path, file] of existingFileMap.entries()) {
removedFiles++;
db.prepare(
"DELETE FROM branch_file_association WHERE branch_id = ? AND file_id = ?"
).run(branchId, file.id);
// If no other branches reference this file, delete it and its chunks
const fileStillInUse = db
.prepare(
"SELECT 1 FROM branch_file_association WHERE file_id = ? LIMIT 1"
)
.get(file.id);
if (!fileStillInUse) {
// Delete chunks first
db.prepare("DELETE FROM file_chunk WHERE file_id = ?").run(file.id);
// Then delete the file
db.prepare("DELETE FROM file WHERE id = ?").run(file.id);
}
}
console.error(
`[processFiles] Files summary: ${newFiles} new, ${updatedFiles} updated, ${unchangedFiles} unchanged, ${removedFiles} removed`
);
return filesToProcess;
};
// Execute the transaction
console.error(`[processFiles] Executing file processing transaction`);
const filesToProcess = dbInterface.transaction((db) => syncFiles(db));
console.error(
`[processFiles] Transaction completed, processing ${
filesToProcess.length
} files (${Date.now() - startTime}ms)`
);
// Cap the number of files processed in one run to avoid timeouts;
// the current cap is set high enough to be effectively unlimited.
const MAX_FILES_TO_PROCESS = 1000000;
const limitedFiles = filesToProcess.slice(0, MAX_FILES_TO_PROCESS);
if (limitedFiles.length < filesToProcess.length) {
console.error(
`[processFiles] WARNING: Processing only ${limitedFiles.length} of ${filesToProcess.length} files to avoid timeout`
);
}
// Update progress for file processing phase (33% to 66%)
if (progressNotifier) {
await progressNotifier.sendProgress(0.33, 1);
}
// Process file contents to generate chunks for the pending files
console.error(`[processFiles] Processing file contents for branch: ${actualBranch}`);
try {
await processFileContents(actualBranch, repoLocalPath);
console.error(`[processFiles] File contents processed successfully`);
// Update branch status to files_processed
dbInterface.run(
"UPDATE branch SET status = 'files_processed' WHERE id = ?",
branchId
);
// Update progress after file content processing
if (progressNotifier) {
await progressNotifier.sendProgress(0.66, 1);
}
} catch (error) {
console.error(`[processFiles] Error processing file contents:`, error);
}
return {
needsUpdate: true,
filesToProcess: limitedFiles,
repoLocalPath
};
} catch (error) {
console.error(`[processFiles] Error executing tool:`, error);
return {
error: {
message: `Error executing processFiles tool: ${
error instanceof Error ? error.message : String(error)
}`,
},
};
}
}
```
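
A minimal usage sketch for this tool (hypothetical values; in the server the input comes from the ingest step, so this only illustrates the expected input shape):

```typescript
import { processFiles } from "./tools/processFiles";

// Hypothetical IDs and paths: repoId/branchId would come from the
// repository and branch rows created during ingestion.
const result = await processFiles({
  repoLocalPath: "/home/user/.codeContextMcp/repos/example-repo",
  repoId: 1,
  branchId: 1,
  actualBranch: "main",
  needsUpdate: true,
});

if ("error" in result) {
  console.error(result.error?.message);
} else {
  console.log(`${result.filesToProcess?.length ?? 0} files queued for embedding`);
}
```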
--------------------------------------------------------------------------------
/utils/codeSplitter.ts:
--------------------------------------------------------------------------------
```typescript
import {
RecursiveCharacterTextSplitter,
TextSplitter,
} from "@langchain/textsplitters";
import fs from "fs";
class SQLSchemaSplitter extends TextSplitter {
private maxCharacters: number;
constructor(maxCharacters: number) {
super();
this.maxCharacters = maxCharacters;
}
// Helper function to parse INSERT statements
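// e.g. "(1,'a'),(2,'b')" yields ["(1,'a')", "(2,'b')"]; quotes,
// escapes, and nested parentheses are tracked so commas inside
// strings or sub-expressions do not terminate a tuple.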
parseValues(valuesPart: string): string[] {
let valuesArray: string[] = [];
let currentTuple = "";
let nestingLevel = 0;
let inString: boolean = false;
let stringChar = "";
let escapeNext = false;
for (let i = 0; i < valuesPart.length; i++) {
const char = valuesPart[i];
currentTuple += char;
if (escapeNext) {
escapeNext = false;
} else if (char === "\\") {
escapeNext = true;
} else if (char === "'" || char === '"') {
if (inString && char === stringChar) {
inString = false;
} else if (!inString) {
inString = true;
stringChar = char;
}
} else if (!inString) {
if (char === "(") {
nestingLevel += 1;
} else if (char === ")") {
nestingLevel -= 1;
if (nestingLevel === 0) {
valuesArray.push(currentTuple.trim());
currentTuple = "";
// Skip any commas and spaces
while (
i + 1 < valuesPart.length &&
(valuesPart[i + 1] === "," ||
valuesPart[i + 1] === " " ||
valuesPart[i + 1] === "\n")
) {
i++;
}
}
}
}
}
return valuesArray;
}
// Split long INSERT statements
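// e.g. one oversized "INSERT INTO t VALUES (1),(2),...;" becomes
// several complete INSERT statements, each under maxCharacters.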
splitInsertStatement(statement: string): string[] {
const insertIndex = statement.toUpperCase().indexOf("VALUES");
if (insertIndex === -1) {
// Cannot split, return the statement as is
return [statement];
}
const insertIntoPart =
statement.slice(0, insertIndex + "VALUES".length) + " ";
const valuesPart = statement.slice(insertIndex + "VALUES".length);
const valuesArray = this.parseValues(valuesPart);
const insertStatements: string[] = [];
let currentValues = "";
for (const valueTuple of valuesArray) {
const newStatementLength =
insertIntoPart.length + currentValues.length + valueTuple.length + 1; // +1 for ',' or ';'
if (newStatementLength <= this.maxCharacters) {
if (currentValues !== "") {
currentValues += "," + valueTuple;
} else {
currentValues = valueTuple;
}
} else {
// Flush the accumulated values as a complete INSERT statement
if (currentValues !== "") {
insertStatements.push(insertIntoPart + currentValues + ";");
}
currentValues = valueTuple;
}
}
if (currentValues !== "") {
const newStatement = insertIntoPart + currentValues + ";";
insertStatements.push(newStatement);
}
return insertStatements;
}
/**
* Enhanced function to split SQL script into statements while handling various SQL constructs,
* including custom keywords like BBEGI/EEN and EEXCEPTIO/EEN.
*/
splitSQLStatements(text: string): string[] {
const statements: string[] = [];
let currentStatement = "";
let index = 0;
let insideString: boolean = false;
let stringChar = "";
let insideComment = false;
let commentType = "";
let insideFunction = false;
let insideProcedure = false;
let insideView = false;
let insideBlock = false;
let blockLevel = 0;
const upperText = text.toUpperCase();
// Define mappings for custom keywords to standard ones
const beginKeywords = ["BEGIN", "BBEGI", "BEGINN"];
const endKeywords = ["END", "EEN"];
const exceptionKeywords = ["EXCEPTION", "EEXCEPTIO"];
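// blockLevel counts nested BEGIN...END pairs; a function, procedure,
// or anonymous block is emitted as a single statement only when its
// outermost block closes (blockLevel returns to 0).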
while (index < text.length) {
const char = text[index];
const remainingText = upperText.substring(index);
currentStatement += char;
if (insideString) {
if (char === stringChar) {
insideString = false;
} else if (char === "\\") {
// Skip escaped characters
index++;
if (index < text.length) {
currentStatement += text[index];
}
}
} else if (insideComment) {
if (commentType === "--" && (char === "\n" || char === "\r")) {
insideComment = false;
} else if (commentType === "/*" && remainingText.startsWith("*/")) {
insideComment = false;
currentStatement += "/"; // the '*' was already appended above
index += 1; // Skip '/'
}
} else if (char === "'" || char === '"') {
insideString = true;
stringChar = char;
} else if (remainingText.startsWith("/*")) {
insideComment = true;
commentType = "/*";
currentStatement += "/*";
index += 1; // Skip '*'
} else if (remainingText.startsWith("--")) {
insideComment = true;
commentType = "--";
currentStatement += "--";
index += 1; // Skip second '-'
} else if (
!insideFunction &&
!insideProcedure &&
!insideView &&
!insideBlock
) {
if (
remainingText.startsWith("CREATE FUNCTION") ||
remainingText.startsWith("CREATE OR REPLACE FUNCTION")
) {
insideFunction = true;
blockLevel = 0;
} else if (
remainingText.startsWith("CREATE PROCEDURE") ||
remainingText.startsWith("CREATE OR REPLACE PROCEDURE")
) {
insideProcedure = true;
blockLevel = 0;
} else if (
remainingText.startsWith("CREATE VIEW") ||
remainingText.startsWith("CREATE OR REPLACE VIEW")
) {
insideView = true;
} else if (beginKeywords.some((kw) => remainingText.startsWith(kw))) {
// Entering an anonymous block; the keyword itself is consumed by
// the block-level tracking below, which raises blockLevel to 1.
insideBlock = true;
blockLevel = 0;
}
}
if (insideFunction || insideProcedure || insideBlock) {
// Check for BEGIN keywords to increase block level
const matchedBegin = beginKeywords.find((kw) =>
remainingText.startsWith(kw)
);
if (matchedBegin) {
blockLevel++;
index += matchedBegin.length; // 'continue' below skips the loop's index++
currentStatement += matchedBegin.substring(1);
continue;
}
// Check for END keywords to decrease block level
const matchedEnd = endKeywords.find((kw) =>
remainingText.startsWith(kw)
);
if (
matchedEnd &&
(matchedEnd.length === "END".length ||
matchedEnd.length === "END;".length)
) {
blockLevel--;
index += matchedEnd.length; // 'continue' below skips the loop's index++
currentStatement += matchedEnd.substring(1);
if (blockLevel === 0) {
if (insideFunction) {
insideFunction = false;
statements.push(currentStatement.trim());
currentStatement = "";
} else if (insideProcedure) {
insideProcedure = false;
statements.push(currentStatement.trim());
currentStatement = "";
} else if (insideBlock) {
insideBlock = false;
statements.push(currentStatement.trim());
currentStatement = "";
}
}
continue;
}
} else if (insideView) {
if (char === ";") {
insideView = false;
statements.push(currentStatement.trim());
currentStatement = "";
}
} else if (
char === ";" &&
!insideFunction &&
!insideProcedure &&
!insideView &&
!insideBlock
) {
statements.push(currentStatement.trim());
currentStatement = "";
}
index++;
}
if (currentStatement.trim() !== "") {
statements.push(currentStatement.trim());
}
return statements;
}
// Helper method to match keywords from a list at the start of the given text.
// Returns the matched keyword or null.
matchKeyword(text: string, keywords: string[]): string | null {
for (const keyword of keywords) {
if (text.startsWith(keyword)) {
return keyword;
}
}
return null;
}
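// Strategy: INSERT statements are split tuple-by-tuple; any other
// statement longer than maxCharacters falls back to line-based packing.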
async splitText(text: string): Promise<string[]> {
const statements = this.splitSQLStatements(text);
const splits: string[] = [];
for (const statement of statements) {
// Check if the statement is an INSERT statement
if (
statement.toUpperCase().includes("INSERT INTO") &&
statement.toUpperCase().includes("VALUES")
) {
// Split long INSERT statements
const splitInserts = this.splitInsertStatement(statement);
splits.push(...splitInserts);
} else {
// For other statements, check if they are too long
if (statement.length <= this.maxCharacters) {
splits.push(statement);
} else {
// For long statements, split them into chunks
let currentSplit = "";
const lines = statement.split("\n");
for (const line of lines) {
if (currentSplit.length + line.length + 1 <= this.maxCharacters) {
currentSplit += (currentSplit ? "\n" : "") + line;
} else {
if (currentSplit) {
splits.push(currentSplit);
}
currentSplit = line;
}
}
if (currentSplit) {
splits.push(currentSplit);
}
}
}
}
return splits;
}
}
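// Maps a file extension to a splitter family: a langchain language
// preset ("cpp", "js", ...), "sql", plain "text", or "ignore" for
// binary and asset files.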
export function extensionToSplitter(extension: string): string {
if (!extension) {
return "text";
}
const extensionLower = extension.toLowerCase();
switch (extensionLower) {
// C/C++ extensions
case "c++":
case "cpp":
case "c":
case "h":
case "hpp":
case "m":
case "mm":
return "cpp";
// Go
case "go":
return "go";
// Java
case "java":
return "java";
// JavaScript and related
case "js":
case "ts":
case "typescript":
case "tsx":
case "jsx":
case "javascript":
case "json":
case "pbxproj":
return "js";
// YAML and related
case "yaml":
case "yml":
case "toml":
case "ini":
case "cfg":
case "conf":
case "props":
case "env":
case "plist":
case "gemfile":
case "dockerfile":
case "podfile":
case "patch":
return "text";
// Shell scripts and related
case "sh":
case "bash":
case "zsh":
case "fish":
case "bat":
case "cmd":
return "text";
// Properties and XSD
case "properties":
case "xsd":
return "text";
// SQL
case "sql":
return "sql";
// PHP
case "php":
return "php";
// Protocol buffers
case "proto":
return "proto";
// Python
case "py":
case "python":
return "python";
// reStructuredText
case "rst":
return "rst";
// Ruby
case "rb":
case "ruby":
return "ruby";
// Rust
case "rs":
case "rust":
return "rust";
// Scala
case "scala":
return "scala";
// Swift
case "swift":
return "swift";
// Markdown
case "md":
case "markdown":
return "markdown";
// LaTeX
case "tex":
case "latex":
return "latex";
// HTML and related
case "html":
case "htm":
case "xml":
case "xsl":
case "xdt":
case "xcworkspacedata":
case "xcprivacy":
case "xcsettings":
case "xcscheme":
return "html";
// Solidity
case "sol":
case "solidity":
return "sol";
// Text
case "text":
case "txt":
case "lst":
case "reg":
return "text";
// Additional file extensions
case "jpr":
case "jws":
case "iml":
return "html";
case "lock":
case "jpg":
case "jpeg":
case "png":
case "gif":
case "bmp":
case "svg":
case "ico":
case "webp":
case "tiff":
case "bin":
case "exe":
case "dll":
case "so":
case "dylib":
case "obj":
case "o":
case "zip":
case "tar":
case "gz":
case "rar":
case "7z":
case "jar":
case "war":
case "ear":
case "class":
return "ignore";
default:
return "text";
}
}
export const splitDocument = (filename: string, code: string) => {
const extension = filename.split(".").pop();
const splitType = extensionToSplitter(extension || "");
if (splitType === "ignore") {
return [];
}
const CHUNK_SIZE_TOKENS = 7000;
const CHUNK_OVERLAP_TOKENS = 200;
const CHUNK_SIZE_CHARACTERS = CHUNK_SIZE_TOKENS * 3.25;
const CHUNK_OVERLAP_CHARACTERS = CHUNK_OVERLAP_TOKENS * 3.25;
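// Rough heuristic: ~3.25 characters per token for code, so these
// character budgets approximate the token limits above.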
let splitter;
if (splitType !== "text" && splitType !== "sql") {
splitter = RecursiveCharacterTextSplitter.fromLanguage(
splitType as
| "cpp"
| "go"
| "java"
| "js"
| "php"
| "proto"
| "python"
| "rst"
| "ruby"
| "rust"
| "scala"
| "swift"
| "markdown"
| "latex"
| "html"
| "sol",
{
chunkSize: CHUNK_SIZE_CHARACTERS,
chunkOverlap: CHUNK_OVERLAP_CHARACTERS,
}
);
} else if (splitType === "sql") {
splitter = new SQLSchemaSplitter(CHUNK_SIZE_CHARACTERS);
} else {
splitter = new RecursiveCharacterTextSplitter({
chunkSize: CHUNK_SIZE_CHARACTERS,
chunkOverlap: CHUNK_OVERLAP_CHARACTERS,
});
}
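// Prefix every chunk with its file name so each embedded chunk
// carries file-level context on its own.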
return splitter.createDocuments([code], [], {
chunkHeader: `FILE NAME: ${filename}\n\n---\n\n`,
appendChunkOverlapHeader: true,
});
};
```
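
A short sketch of `splitDocument` in use (the file name and snippet are illustrative):

```typescript
import { splitDocument } from "./utils/codeSplitter";

const code = `export function add(a: number, b: number): number {
  return a + b;
}`;

// Returns langchain Document objects; each chunk's pageContent starts
// with the "FILE NAME: ..." header configured above.
const docs = await splitDocument("src/math.ts", code);
for (const doc of docs) {
  console.log(doc.pageContent.slice(0, 80));
}
```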