# Directory Structure
```
├── .cargo
│   └── config.toml
├── .env
├── .gitignore
├── .vscode
│   ├── extensions.json
│   └── settings.json
├── Cargo.lock
├── Cargo.toml
├── data
│   └── .keep
├── documents
│   ├── .DS_Store
│   └── .keep
├── how-to-use.md
├── Makefile
├── README.md
├── setup.md
└── src
    ├── embeddings.rs
    ├── main.rs
    ├── mcp_server.rs
    └── rag_engine.rs
```
# Files
--------------------------------------------------------------------------------
/data/.keep:
--------------------------------------------------------------------------------
```
```
--------------------------------------------------------------------------------
/documents/.keep:
--------------------------------------------------------------------------------
```
```
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
```
/target
/logs
documents/*.pdf
data/chunks.json
!documents/.keep
!data/.keep
```
--------------------------------------------------------------------------------
/.env:
--------------------------------------------------------------------------------
```
DATA_DIR=./data
DOCUMENTS_DIR=./documents
LOG_DIR=./logs
LOG_LEVEL=info
LOG_MAX_MB=5
# Development and debugging configuration (uncomment to enable)
# DEVELOPMENT=true
# DEV=true
# CONSOLE_LOGS=true
```
--------------------------------------------------------------------------------
/.vscode/extensions.json:
--------------------------------------------------------------------------------
```json
{
"recommendations": [
"rust-lang.rust-analyzer",
"vadimcn.vscode-lldb",
"serayuzgur.crates"
]
}
```
--------------------------------------------------------------------------------
/.cargo/config.toml:
--------------------------------------------------------------------------------
```toml
[alias]
c = "check"
cc = "clippy"
ccf = "clippy --fix --allow-dirty --allow-staged"
ccd = ["clippy", "--", "-D", "warnings"]
f = "fmt"
t = "test"
r = "run"
b = "build"
br = "build --release"
[build]
rustflags = ["-D", "warnings"]
```
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
```toml
[package]
name = "rust-local-rag"
version = "0.1.0"
edition = "2024"
authors = ["Mehmet Koray Sariteke"]
description = "A local RAG (Retrieval-Augmented Generation) MCP server built with Rust"
license = "MIT"
readme = "README.md"
repository = "https://github.com/yourusername/rust-local-rag"
keywords = ["rag", "ai", "llm", "embeddings", "pdf", "search", "mcp"]
categories = ["command-line-utilities"]
[[bin]]
name = "rust-local-rag"
path = "src/main.rs"
[dependencies]
rmcp = { version = "0.1", features = ["transport-io"] }
rmcp-macros = "0.1"
schemars = "0.9"
tokio = { version = "1", features = ["full"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
reqwest = { version = "0.12", features = ["json"] }
anyhow = "1.0"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = [
"fmt",
"json",
"time",
"env-filter",
] }
uuid = { version = "1.17", features = ["v4"] }
walkdir = "2.5"
dotenv = "0.15"
```
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
```json
{
"rust-analyzer.server.path": "rust-analyzer",
"rust-analyzer.cargo.buildScripts.enable": true,
"rust-analyzer.checkOnSave.command": "cargo check",
"rust-analyzer.completion.addCallParentheses": true,
"rust-analyzer.completion.addCallArgumentSnippets": true,
"rust-analyzer.inlayHints.enable": true,
"rust-analyzer.inlayHints.parameterHints.enable": true,
"rust-analyzer.inlayHints.typeHints.enable": true,
"rust-analyzer.lens.enable": true,
"rust-analyzer.lens.methodReferences": true,
"rust-analyzer.hover.actions.enable": true,
"editor.formatOnSave": true,
"[rust]": {
"editor.defaultFormatter": "rust-lang.rust-analyzer",
"editor.tabSize": 4,
"editor.insertSpaces": true
},
"rust-analyzer.diagnostics.enable": true,
"rust-analyzer.procMacro.enable": true,
"rust-analyzer.cargo.allFeatures": true,
"rust-analyzer.cargo.loadOutDirsFromCheck": true,
"rust-analyzer.cargo.runBuildScripts": true,
"rust-analyzer.workspace.symbol.search.scope": "workspace_and_dependencies",
"rust-analyzer.workspace.symbol.search.kind": "all_symbols",
"rust-analyzer.check.command": "clippy",
"rust-analyzer.check.allTargets": false,
"rust-analyzer.checkOnSave": true,
"rust-analyzer.cargo.autoreload": true
}
```
--------------------------------------------------------------------------------
/src/embeddings.rs:
--------------------------------------------------------------------------------
```rust
use anyhow::Result;
use serde::{Deserialize, Serialize};
#[derive(Serialize)]
struct OllamaEmbeddingRequest {
model: String,
prompt: String,
}
#[derive(Deserialize)]
struct OllamaEmbeddingResponse {
embedding: Vec<f32>,
}
pub struct EmbeddingService {
client: reqwest::Client,
ollama_url: String,
model: String,
}
impl EmbeddingService {
pub async fn new() -> Result<Self> {
let service = Self {
client: reqwest::Client::new(),
ollama_url: "http://localhost:11434".to_string(),
model: "nomic-embed-text".to_string(),
};
service.test_connection().await?;
Ok(service)
}
pub async fn get_embedding(&self, text: &str) -> Result<Vec<f32>> {
let request = OllamaEmbeddingRequest {
model: self.model.clone(),
prompt: text.to_string(),
};
let response = self
.client
.post(format!("{}/api/embeddings", self.ollama_url))
.json(&request)
.send()
.await?;
if !response.status().is_success() {
return Err(anyhow::anyhow!(
"Ollama API error: {} - {}",
response.status(),
response.text().await.unwrap_or_default()
));
}
let embedding_response: OllamaEmbeddingResponse = response.json().await?;
Ok(embedding_response.embedding)
}
async fn test_connection(&self) -> Result<()> {
let response = self
.client
.get(format!("{}/api/tags", self.ollama_url))
.send()
.await?;
if !response.status().is_success() {
return Err(anyhow::anyhow!(
"Cannot connect to Ollama at {}. Make sure Ollama is running.",
self.ollama_url
));
}
tracing::info!("Successfully connected to Ollama at {}", self.ollama_url);
Ok(())
}
}
```
--------------------------------------------------------------------------------
/setup.md:
--------------------------------------------------------------------------------
```markdown
# Rust Local RAG - Setup for Claude Desktop
## Prerequisites
### 1. Install Rust
```bash
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
source ~/.cargo/env
```
### 2. Install Ollama
```bash
# macOS
brew install ollama
# Linux
curl -fsSL https://ollama.com/install.sh | sh
# Start Ollama
ollama serve
# Install embedding model
ollama pull nomic-embed-text
```
### 3. Install Poppler (for PDF parsing)
```bash
# macOS
brew install poppler
# Linux (Ubuntu/Debian)
sudo apt-get install poppler-utils
# Linux (CentOS/RHEL)
sudo yum install poppler-utils
```
## Setup
### 1. Build and Install
```bash
# Clone and build
git clone <repository-url>
cd rust-local-rag
cargo build --release
# Install globally
cargo install --path .
```
### 2. Create Directories and Add Documents
```bash
# Create required directories
mkdir -p ~/Documents/data
mkdir -p ~/Documents/rag
mkdir -p /tmp/rust-local-rag
# Add your PDF documents
cp your-pdfs/*.pdf ~/Documents/rag/
```
## Claude Desktop Integration
### 1. Find Claude Desktop Config
- **macOS**: `~/Library/Application Support/Claude/claude_desktop_config.json`
- **Windows**: `%APPDATA%\Claude\claude_desktop_config.json`
- **Linux**: `~/.config/Claude/claude_desktop_config.json`
### 2. Add This Configuration
```json
{
"mcpServers": {
"rust-local-rag": {
"command": "/Users/yourusername/.cargo/bin/rust-local-rag",
"env": {
"DATA_DIR": "/Users/yourusername/Documents/data",
"DOCUMENTS_DIR": "/Users/yourusername/Documents/rag",
"LOG_DIR": "/tmp/rust-local-rag",
"LOG_LEVEL": "info",
"LOG_MAX_MB": "10"
}
}
}
}
```
**Important**: Replace `yourusername` with your actual username, or use absolute paths specific to your system.
### 3. Find Your Actual Paths
```bash
# Find your cargo bin directory
echo "$HOME/.cargo/bin/rust-local-rag"
# Verify the binary exists
which rust-local-rag
```
### 4. Restart Claude Desktop
## How PDF Processing Works
The application automatically:
1. **Scans the documents directory** on startup for PDF files
2. **Extracts text** using poppler's `pdftotext` utility (see the sketch at the end of this section)
3. **Chunks the text** into manageable segments (about 500 words each)
4. **Generates embeddings** using Ollama's `nomic-embed-text` model
5. **Stores embeddings** in the data directory for fast retrieval
6. **Indexes documents** for semantic search
Supported formats:
- PDF files (via poppler)
- Text extraction preserves basic formatting
- Each document is split into searchable chunks
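To sanity-check the extraction and embedding steps outside the server, you can run the same tools by hand. The snippet below mirrors the calls the code makes (the PDF path is a placeholder):
```bash
# Step 2 by hand: extract text the same way the server does
pdftotext -layout -enc UTF-8 ~/Documents/rag/sample.pdf -

# Step 4 by hand: request an embedding from Ollama (same endpoint the server calls)
curl -s http://localhost:11434/api/embeddings \
  -d '{"model": "nomic-embed-text", "prompt": "a short test sentence"}'
```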
## Troubleshooting
### Installation Issues
1. **Rust not found**: Restart terminal after installing Rust
2. **Ollama connection failed**: Ensure `ollama serve` is running
3. **Poppler not found**: Verify installation with `pdftotext -v`
### Claude Desktop Issues
1. **Binary not found**: Check path with `which rust-local-rag`
2. **Permission denied**: Ensure directories are writable
3. **No documents indexed**: Check PDF files exist in `DOCUMENTS_DIR`
4. **Connection failed**: Check logs in `LOG_DIR` directory
### PDF Processing Issues
1. **Text extraction failed**: Ensure PDFs are not password-protected or corrupted
2. **Empty results**: Some PDFs may be image-only (scanned documents)
3. **Slow indexing**: Large documents take time to process on first run
### Log Files
Check application logs for detailed error information:
```bash
# View latest logs
tail -f /tmp/rust-local-rag/rust-local-rag.log
# Check for errors
grep -i error /tmp/rust-local-rag/rust-local-rag.log
```
That's it! Your documents will be automatically indexed and searchable in Claude Desktop.
```
--------------------------------------------------------------------------------
/src/mcp_server.rs:
--------------------------------------------------------------------------------
```rust
use anyhow::Result;
use rmcp::{
Error as McpError, ServerHandler, ServiceExt, model::*, schemars, tool, transport::stdio,
};
use std::sync::Arc;
use tokio::sync::RwLock;
use crate::rag_engine::RagEngine;
#[derive(Debug, serde::Deserialize, schemars::JsonSchema)]
pub struct SearchRequest {
#[schemars(description = "The search query")]
pub query: String,
#[schemars(description = "Number of results to return (default: 5)")]
pub top_k: Option<usize>,
}
#[derive(Clone)]
pub struct RagMcpServer {
rag_state: Arc<RwLock<RagEngine>>,
}
#[tool(tool_box)]
impl RagMcpServer {
pub fn new(rag_state: Arc<RwLock<RagEngine>>) -> Self {
Self { rag_state }
}
#[tool(description = "Search through uploaded documents using semantic similarity")]
async fn search_documents(
&self,
#[tool(aggr)] SearchRequest { query, top_k }: SearchRequest,
) -> Result<CallToolResult, McpError> {
let top_k = top_k.unwrap_or(5);
let engine = self.rag_state.read().await;
match engine.search(&query, top_k).await {
Ok(results) => {
let formatted_results = if results.is_empty() {
"No results found.".to_string()
} else {
results
.iter()
.enumerate()
.map(|(i, result)| {
format!(
"**Result {}** (Score: {:.3}) [{}] (Chunk: {})\n{}\n",
i + 1,
result.score,
result.document,
result.chunk_id,
result.text
)
})
.collect::<Vec<_>>()
.join("\n---\n\n")
};
Ok(CallToolResult::success(vec![Content::text(format!(
"Found {} results for '{}':\n\n{}",
results.len(),
query,
formatted_results
))]))
}
Err(e) => Ok(CallToolResult {
content: vec![Content::text(format!("Search error: {}", e))],
is_error: Some(true),
}),
}
}
#[tool(description = "List all uploaded documents")]
async fn list_documents(&self) -> Result<CallToolResult, McpError> {
let engine = self.rag_state.read().await;
let documents = engine.list_documents();
let response = if documents.is_empty() {
"No documents uploaded yet.".to_string()
} else {
format!(
"Uploaded documents ({}):\n{}",
documents.len(),
documents
.iter()
.enumerate()
.map(|(i, doc)| format!("{}. {}", i + 1, doc))
.collect::<Vec<_>>()
.join("\n")
)
};
Ok(CallToolResult::success(vec![Content::text(response)]))
}
#[tool(description = "Get RAG system statistics")]
async fn get_stats(&self) -> Result<CallToolResult, McpError> {
let engine = self.rag_state.read().await;
let stats = engine.get_stats();
Ok(CallToolResult::success(vec![Content::text(format!(
"RAG System Stats:\n{}",
serde_json::to_string_pretty(&stats).unwrap()
))]))
}
}
#[tool(tool_box)]
impl ServerHandler for RagMcpServer {
fn get_info(&self) -> ServerInfo {
ServerInfo {
protocol_version: ProtocolVersion::V_2024_11_05,
capabilities: ServerCapabilities::builder().enable_tools().build(),
server_info: Implementation {
name: "rust-rag-server".to_string(),
version: "0.1.0".to_string(),
},
instructions: Some(
"A Rust-based RAG server for document search and analysis.".to_string(),
),
}
}
}
pub async fn start_mcp_server(rag_state: Arc<RwLock<RagEngine>>) -> Result<()> {
tracing::info!("Starting MCP server");
let server = RagMcpServer::new(rag_state);
let service = server.serve(stdio()).await?;
service.waiting().await?;
Ok(())
}
```
--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
```rust
use anyhow::Result;
use std::sync::Arc;
use tokio::sync::RwLock;
use tracing_subscriber::EnvFilter;
mod embeddings;
mod mcp_server;
mod rag_engine;
use rag_engine::RagEngine;
fn get_data_dir() -> String {
std::env::var("DATA_DIR").unwrap_or_else(|_| "./data".to_string())
}
fn get_documents_dir() -> String {
std::env::var("DOCUMENTS_DIR").unwrap_or_else(|_| "./documents".to_string())
}
fn get_log_dir() -> String {
std::env::var("LOG_DIR").unwrap_or_else(|_| {
if std::path::Path::new("/var/log").exists() && is_writable("/var/log") {
"/var/log/rust-local-rag".to_string()
} else {
"./logs".to_string()
}
})
}
fn get_log_level() -> String {
std::env::var("LOG_LEVEL").unwrap_or_else(|_| "info".to_string())
}
fn get_log_max_mb() -> u64 {
std::env::var("LOG_MAX_MB")
.ok()
.and_then(|s| s.parse().ok())
.unwrap_or(5)
}
fn is_writable(path: &str) -> bool {
std::fs::OpenOptions::new()
.create(true)
.write(true)
.open(format!("{}/test_write", path))
.map(|_| {
let _ = std::fs::remove_file(format!("{}/test_write", path));
true
})
.unwrap_or(false)
}
fn setup_logging() -> Result<()> {
let log_dir = get_log_dir();
let log_level = get_log_level();
let log_max_mb = get_log_max_mb();
std::fs::create_dir_all(&log_dir)?;
let env_filter =
EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(&log_level));
let is_development = std::env::var("DEVELOPMENT").is_ok() || std::env::var("DEV").is_ok();
let force_console = std::env::var("CONSOLE_LOGS").is_ok();
if is_development || force_console {
tracing_subscriber::fmt()
.with_env_filter(env_filter)
.compact()
.init();
tracing::info!("Development mode: logging to console");
} else {
let log_file = format!("{}/rust-local-rag.log", log_dir);
let file_appender = std::fs::OpenOptions::new()
.create(true)
.append(true)
.open(&log_file)?;
tracing_subscriber::fmt()
.with_env_filter(env_filter)
.with_writer(file_appender)
.json()
.init();
}
tracing::info!("Logging initialized");
tracing::info!("Log directory: {}", log_dir);
tracing::info!("Log level: {}", log_level);
tracing::info!("Log max size: {}MB (auto-truncate)", log_max_mb);
tracing::info!("Development mode: {}", is_development || force_console);
Ok(())
}
async fn start_log_cleanup_task(log_dir: String, max_mb: u64) {
let max_bytes = max_mb * 1024 * 1024;
let log_file = format!("{}/rust-local-rag.log", log_dir);
tokio::spawn(async move {
let mut interval = tokio::time::interval(tokio::time::Duration::from_secs(300));
loop {
interval.tick().await;
if let Ok(metadata) = std::fs::metadata(&log_file) {
if metadata.len() > max_bytes {
if let Err(e) = std::fs::write(
&log_file,
format!("[LOG TRUNCATED - Size exceeded {}MB]\n", max_mb),
) {
eprintln!("Failed to truncate log file: {}", e);
}
}
}
}
});
}
#[tokio::main]
async fn main() -> Result<()> {
if let Err(e) = dotenv::dotenv() {
eprintln!("Warning: Could not load .env file: {}", e);
}
setup_logging()?;
let data_dir = get_data_dir();
let documents_dir = get_documents_dir();
let log_dir = get_log_dir();
let log_max_mb = get_log_max_mb();
tokio::fs::create_dir_all(&data_dir).await?;
tokio::fs::create_dir_all(&documents_dir).await?;
start_log_cleanup_task(log_dir, log_max_mb).await;
tracing::info!("Started automatic log cleanup task (max: {}MB)", log_max_mb);
let rag_engine = RagEngine::new(&data_dir).await?;
let rag_state = Arc::new(RwLock::new(rag_engine));
let document_loading_state = rag_state.clone();
let docs_dir = documents_dir.clone();
tokio::spawn(async move {
tracing::info!("Starting document loading in background...");
let mut engine = document_loading_state.write().await;
if let Err(e) = engine.load_documents_from_dir(&docs_dir).await {
tracing::error!("Failed to load documents: {}", e);
} else {
tracing::info!("Document loading completed successfully");
}
});
tracing::info!("Starting MCP server (stdin/stdout mode)");
tracing::info!("Data directory: {}", data_dir);
tracing::info!("Documents directory: {}", documents_dir);
mcp_server::start_mcp_server(rag_state).await?;
Ok(())
}
```
--------------------------------------------------------------------------------
/how-to-use.md:
--------------------------------------------------------------------------------
```markdown
# Rust Local RAG - Claude Desktop Usage
## Claude Desktop Configuration Template
```json
{
"mcpServers": {
"rust-local-rag": {
"command": "/Users/yourusername/.cargo/bin/rust-local-rag",
"env": {
"DATA_DIR": "/Users/yourusername/Documents/data",
"DOCUMENTS_DIR": "/Users/yourusername/Documents/rag",
"LOG_DIR": "/tmp/rust-local-rag",
"LOG_LEVEL": "info",
"LOG_MAX_MB": "10"
}
}
}
}
```
**Important**: Replace `yourusername` with your actual username. Use absolute paths for reliable operation.
## Environment Variables
| Variable | Description | Default |
|----------|-------------|---------|
| `DATA_DIR` | Embeddings storage directory | `./data` |
| `DOCUMENTS_DIR` | PDF documents directory | `./documents` |
| `LOG_DIR` | Log files directory | `./logs` |
| `LOG_LEVEL` | Logging level (error/warn/info/debug) | `info` |
| `LOG_MAX_MB` | Log file size limit in MB | `5` |
## Adding Documents
### 1. Add PDFs to Documents Directory
```bash
# Copy PDFs to your documents directory
cp your-file.pdf ~/Documents/rag/
# Or move multiple files
mv /path/to/pdfs/*.pdf ~/Documents/rag/
```
### 2. Restart Claude Desktop
The application will automatically:
- Detect new PDF files
- Extract text using poppler
- Generate embeddings
- Index documents for search
## Available MCP Tools
When configured, Claude Desktop can use these tools:
### 1. Search Documents
Search through your documents using semantic similarity.
- **Tool**: `search_documents`
- **Parameters**: `query` (string), `top_k` (optional number, default: 5); example arguments are shown below
### 2. List Documents
Get a list of all indexed documents.
- **Tool**: `list_documents`
- **Parameters**: None
### 3. Get Statistics
View RAG system statistics and status.
- **Tool**: `get_stats`
- **Parameters**: None
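For reference, the arguments Claude sends when invoking `search_documents` follow the shape of the server's `SearchRequest` struct; the values below are purely illustrative:
```json
{
  "query": "database optimization strategies",
  "top_k": 3
}
```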
## Usage in Claude
Once configured, you can ask Claude to:
### Document Search Examples
- "Search my documents for information about machine learning"
- "What does my documentation say about API authentication?"
- "Find references to database optimization in my PDFs"
### Document Management Examples
- "List all the documents you can access"
- "Show me statistics about the document index"
- "How many documents do you have indexed?"
### Analysis Examples
- "Summarize the key points from documents about project requirements"
- "Compare what different documents say about security best practices"
- "Find common themes across all my documentation"
## PDF Processing Details
### Supported PDF Types
- ✅ **Text-based PDFs**: Searchable text content
- ✅ **Mixed content**: PDFs with both text and images
- ⚠️ **Scanned PDFs**: Image-only documents (limited text extraction)
- ❌ **Password-protected**: Encrypted PDFs cannot be processed
### Text Extraction Process
1. **PDF to Text**: Uses poppler's `pdftotext` for reliable extraction
2. **Text Chunking**: Splits documents into chunks of roughly 500 words each
3. **Embedding Generation**: Creates vector embeddings using Ollama
4. **Indexing**: Stores embeddings for fast semantic search (the cached index can be inspected as shown below)
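The cached index is a single JSON file, `chunks.json`, in `DATA_DIR` (a map keyed by chunk id). If you have `jq` installed, you can peek at it; the path below assumes `DATA_DIR` is `~/Documents/data`:
```bash
# Count how many chunks have been indexed
jq 'length' ~/Documents/data/chunks.json

# List the document names present in the index
jq '[.[].document_name] | unique' ~/Documents/data/chunks.json
```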
### Performance Notes
- **First-time indexing**: May take several minutes for large document collections
- **Subsequent startups**: Uses cached embeddings for fast loading
- **Memory usage**: Scales with document collection size
- **Search speed**: Sub-second search responses after indexing
## Troubleshooting
### MCP Server Issues
1. **Server not connecting**:
```bash
# Check binary exists and is executable
which rust-local-rag
ls -la ~/.cargo/bin/rust-local-rag
```
2. **Check Claude Desktop logs**:
- **macOS**: `~/Library/Logs/Claude/mcp*.log`
- **Windows**: `%APPDATA%\Claude\Logs\mcp*.log`
- **Linux**: `~/.local/share/Claude/logs/mcp*.log`
### Document Processing Issues
1. **Documents not found**:
```bash
# Verify directory exists and contains PDFs
ls -la ~/Documents/rag/
file ~/Documents/rag/*.pdf
```
2. **PDF processing failures**:
```bash
# Test poppler installation
pdftotext -v
# Test PDF text extraction manually
pdftotext ~/Documents/rag/sample.pdf -
```
3. **Empty search results** (see the quick checks below):
- Check if documents were successfully indexed
- Verify Ollama is running (`ollama serve`)
- Check embedding model is installed (`ollama list`)
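If results are still empty, two quick checks mirror what the server does at startup; the log path assumes the `LOG_DIR` from the template above:
```bash
# The server's startup check calls this endpoint; it should return a JSON list of models
curl -s http://localhost:11434/api/tags

# The corresponding startup log line confirms the check succeeded inside the server
grep "Successfully connected to Ollama" /tmp/rust-local-rag/rust-local-rag.log
```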
### Log Analysis
```bash
# View real-time logs
tail -f /tmp/rust-local-rag/rust-local-rag.log
# Search for errors
grep -i "error\|failed\|panic" /tmp/rust-local-rag/rust-local-rag.log
# Check document loading
grep -i "document" /tmp/rust-local-rag/rust-local-rag.log
```
### Configuration Validation
```bash
# Test configuration with manual run
DATA_DIR="~/Documents/data" \
DOCUMENTS_DIR="~/Documents/rag" \
LOG_DIR="/tmp/rust-local-rag" \
LOG_LEVEL="debug" \
rust-local-rag
```
## Performance Optimization
### For Large Document Collections
- Use SSD storage for `DATA_DIR`
- Increase `LOG_MAX_MB` for detailed logging
- Consider splitting large PDFs into smaller files
- Monitor memory usage during initial indexing
### For Faster Searches
- Keep Ollama running continuously
- Use specific search terms rather than broad queries
- Adjust `top_k` parameter based on needs (lower = faster)
## Security Considerations
- Documents are processed locally (no external API calls)
- Embeddings stored locally in `DATA_DIR`
- Ollama runs locally for embedding generation
- No document content sent to external services
```
--------------------------------------------------------------------------------
/src/rag_engine.rs:
--------------------------------------------------------------------------------
```rust
use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use uuid::Uuid;
use walkdir::WalkDir;
use crate::embeddings::EmbeddingService;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentChunk {
pub id: String,
pub document_name: String,
pub text: String,
pub embedding: Vec<f32>,
pub chunk_index: usize,
}
#[derive(Debug, Clone)]
pub struct SearchResult {
pub text: String,
pub score: f32,
pub document: String,
pub chunk_id: String,
}
pub struct RagEngine {
chunks: HashMap<String, DocumentChunk>,
embedding_service: EmbeddingService,
data_dir: String,
}
impl RagEngine {
pub async fn new(data_dir: &str) -> Result<Self> {
let embedding_service = EmbeddingService::new().await?;
let mut engine = Self {
chunks: HashMap::new(),
embedding_service,
data_dir: data_dir.to_string(),
};
if let Err(e) = engine.load_from_disk().await {
tracing::warn!("Could not load existing data: {}", e);
}
Ok(engine)
}
pub async fn add_document(&mut self, filename: &str, data: &[u8]) -> Result<usize> {
tracing::info!("Processing document: {}", filename);
let text = self.extract_pdf_text(data)?;
if text.trim().is_empty() {
return Err(anyhow::anyhow!("No text extracted from PDF"));
}
let chunks = self.chunk_text(&text, 500);
tracing::info!("Created {} chunks for {}", chunks.len(), filename);
self.chunks
.retain(|_, chunk| chunk.document_name != filename);
let mut chunk_count = 0;
for (i, chunk_text) in chunks.into_iter().enumerate() {
if chunk_text.trim().len() < 10 {
continue;
}
tracing::debug!("Generating embedding for chunk {} of {}", i + 1, filename);
let embedding = self.embedding_service.get_embedding(&chunk_text).await?;
let chunk = DocumentChunk {
id: Uuid::new_v4().to_string(),
document_name: filename.to_string(),
text: chunk_text,
embedding,
chunk_index: i,
};
self.chunks.insert(chunk.id.clone(), chunk);
chunk_count += 1;
}
self.save_to_disk().await?;
tracing::info!(
"Successfully processed {} chunks for {}",
chunk_count,
filename
);
Ok(chunk_count)
}
pub async fn search(&self, query: &str, top_k: usize) -> Result<Vec<SearchResult>> {
if self.chunks.is_empty() {
return Ok(vec![]);
}
tracing::debug!("Searching for: '{}'", query);
let query_embedding = self.embedding_service.get_embedding(query).await?;
let mut scores: Vec<(f32, &DocumentChunk)> = self
.chunks
.values()
.map(|chunk| {
let similarity = cosine_similarity(&query_embedding, &chunk.embedding);
(similarity, chunk)
})
.collect();
scores.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap());
Ok(scores
.into_iter()
.take(top_k)
.map(|(score, chunk)| SearchResult {
text: chunk.text.clone(),
score,
document: chunk.document_name.clone(),
chunk_id: chunk.id.clone(),
})
.collect())
}
pub fn list_documents(&self) -> Vec<String> {
let mut docs: Vec<String> = self
.chunks
.values()
.map(|chunk| chunk.document_name.clone())
.collect::<std::collections::HashSet<_>>()
.into_iter()
.collect();
docs.sort();
docs
}
pub fn get_stats(&self) -> serde_json::Value {
let doc_count = self.list_documents().len();
let chunk_count = self.chunks.len();
serde_json::json!({
"documents": doc_count,
"chunks": chunk_count,
"status": "ready"
})
}
pub async fn load_documents_from_dir(&mut self, dir: &str) -> Result<()> {
for entry in WalkDir::new(dir).into_iter().filter_map(|e| e.ok()) {
let path = entry.path();
if path.extension().and_then(|s| s.to_str()) == Some("pdf") {
let filename = path.file_name().unwrap().to_str().unwrap();
if self.chunks.values().any(|c| c.document_name == filename) {
tracing::info!("Document {} already processed, skipping", filename);
continue;
}
match tokio::fs::read(&path).await {
Ok(data) => {
tracing::info!("Loading document: {}", filename);
match self.add_document(filename, &data).await {
Ok(chunk_count) => {
tracing::info!(
"Successfully processed {} with {} chunks",
filename,
chunk_count
);
}
Err(e) => {
tracing::warn!("Skipping {}: {}", filename, e);
}
}
}
Err(e) => {
tracing::error!("Failed to read {}: {}", filename, e);
}
}
}
}
Ok(())
}
fn extract_pdf_text(&self, data: &[u8]) -> Result<String> {
tracing::info!("Extracting PDF text using pdftotext system binary");
self.extract_pdf_with_pdftotext(data)
}
fn extract_pdf_with_pdftotext(&self, data: &[u8]) -> Result<String> {
use std::process::Command;
let temp_dir = std::env::temp_dir();
let temp_file = temp_dir.join(format!("temp_pdf_{}.pdf", std::process::id()));
std::fs::write(&temp_file, data)
.map_err(|e| anyhow::anyhow!("Failed to write temp PDF: {}", e))?;
let output = Command::new("pdftotext")
.arg("-layout")
.arg("-enc")
.arg("UTF-8")
.arg(&temp_file)
.arg("-")
.output();
let _ = std::fs::remove_file(&temp_file);
match output {
Ok(output) if output.status.success() => {
let text = String::from_utf8_lossy(&output.stdout).to_string();
let text_chars = text.chars().count();
if text.trim().is_empty() {
tracing::warn!("pdftotext extracted 0 characters");
Err(anyhow::anyhow!("pdftotext produced no text output"))
} else {
tracing::info!("✅ pdftotext extracted {} characters", text_chars);
Ok(text)
}
}
Ok(output) => {
let error_msg = String::from_utf8_lossy(&output.stderr);
tracing::warn!("pdftotext failed with error: {}", error_msg);
Err(anyhow::anyhow!("pdftotext failed: {}", error_msg))
}
Err(e) => {
tracing::warn!("Failed to run pdftotext command: {}", e);
Err(anyhow::anyhow!(
"pdftotext command failed: {} (is poppler installed?)",
e
))
}
}
}
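/// Splits the text into chunks of `chunk_size` whitespace-separated words (not characters).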
fn chunk_text(&self, text: &str, chunk_size: usize) -> Vec<String> {
let words: Vec<&str> = text.split_whitespace().collect();
let mut chunks = Vec::new();
for chunk in words.chunks(chunk_size) {
let chunk_text = chunk.join(" ");
if !chunk_text.trim().is_empty() {
chunks.push(chunk_text);
}
}
chunks
}
async fn save_to_disk(&self) -> Result<()> {
let path = format!("{}/chunks.json", self.data_dir);
let data = serde_json::to_string_pretty(&self.chunks)?;
tokio::fs::write(path, data).await?;
tracing::debug!("Saved {} chunks to disk", self.chunks.len());
Ok(())
}
async fn load_from_disk(&mut self) -> Result<()> {
let path = format!("{}/chunks.json", self.data_dir);
if tokio::fs::try_exists(&path).await? {
let data = tokio::fs::read_to_string(path).await?;
self.chunks = serde_json::from_str(&data)?;
tracing::info!("Loaded {} chunks from disk", self.chunks.len());
}
Ok(())
}
}
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
if a.len() != b.len() {
return 0.0;
}
let dot_product: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
if norm_a == 0.0 || norm_b == 0.0 {
0.0
} else {
dot_product / (norm_a * norm_b)
}
}
```