This is page 1 of 3. Use http://codebase.md/tokidoo/crawl4ai-rag-mcp?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .dockerignore
├── .env.example
├── .gitattributes
├── .gitignore
├── Caddyfile
├── crawled_pages.sql
├── docker-compose.yml
├── Dockerfile
├── knowledge_graphs
│   ├── ai_hallucination_detector.py
│   ├── ai_script_analyzer.py
│   ├── hallucination_reporter.py
│   ├── knowledge_graph_validator.py
│   ├── parse_repo_into_neo4j.py
│   ├── query_knowledge_graph.py
│   └── test_script.py
├── LICENSE
├── pyproject.toml
├── README.md
├── searxng
│   ├── limiter.toml
│   └── settings.yml
├── src
│   ├── crawl4ai_mcp.py
│   └── utils.py
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
```
1 | crawl4ai_mcp.egg-info
2 | __pycache__
3 | .venv
4 | .env
```
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
```
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 
```
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
```
1 | .env
2 | .venv
3 | __pycache__
4 | crawl4ai_mcp.egg-info
5 | repos
6 | .claude
7 | test_script_hallucination*
```
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
```
 1 | # ========================================
 2 | # MCP CRAWL4AI SERVER CONFIGURATION
 3 | # ========================================
 4 | 
 5 | # The transport for the MCP server - either 'sse' or 'stdio' (defaults to sse if left empty)
 6 | TRANSPORT=sse
 7 | 
 8 | # Host to bind to if using sse as the transport (leave empty if using stdio)
 9 | # Set this to 0.0.0.0 if using Docker, otherwise set to localhost (if using uv)
10 | HOST=0.0.0.0
11 | 
12 | # Port to listen on if using sse as the transport (leave empty if using stdio).
13 | # Default port for Docker deployment
14 | PORT=8051
15 | 
16 | # Get your Open AI API Key by following these instructions -
17 | # https://help.openai.com/en/articles/4936850-where-do-i-find-my-openai-api-key
18 | # This is for the embedding model - text-embedding-3-small will be used
19 | OPENAI_API_KEY=
20 | 
21 | # The LLM you want to use for summaries and contextual embeddings
22 | # Generally this is a very cheap and fast LLM like gpt-4.1-nano-2025-04-14
23 | MODEL_CHOICE=gpt-4.1-nano-2025-04-14
24 | 
25 | # ========================================
26 | # SEARXNG INTEGRATION CONFIGURATION
27 | # ========================================
28 | 
29 | # SearXNG URL for your search instance - configured for Docker Compose integration
30 | # When using Docker Compose, this points to the internal SearXNG service
31 | # For external SearXNG instances, change to: https://searx.example.com or http://host.docker.internal:8080 (for docker hosted searxng instances)
32 | SEARXNG_URL=http://searxng:8080
33 | 
34 | # Optional: Custom user agent for SearXNG requests
35 | SEARXNG_USER_AGENT=MCP-Crawl4AI-RAG-Server/1.0
36 | 
37 | # Optional: Default search engines to use (comma-separated list)
38 | # Available engines depend on your SearXNG instance configuration
39 | SEARXNG_DEFAULT_ENGINES=google,bing,duckduckgo
40 | 
41 | # Optional: Request timeout in seconds for SearXNG API calls
42 | SEARXNG_TIMEOUT=30
43 | 
44 | # ========================================
45 | # OPTIONAL CADDY CONFIGURATION
46 | # ========================================
47 | 
48 | # Optional: Hostname for SearXNG Caddy reverse proxy
49 | # Change this to your domain name for production deployment
50 | # Default: http://localhost (matches docker-compose.yml default)
51 | SEARXNG_HOSTNAME=http://localhost
52 | 
53 | # TLS configuration for Caddy
54 | # Use 'internal' for self-signed certificates or provide your email for Let's Encrypt
55 | # For production with real domain: [email protected]
56 | # SEARXNG_TLS=internal
57 | 
58 | # RAG strategies - set these to "true" or "false" (default to "false")
59 | # USE_CONTEXTUAL_EMBEDDINGS: Enhances embeddings with contextual information for better retrieval
60 | USE_CONTEXTUAL_EMBEDDINGS=false
61 | 
62 | # USE_HYBRID_SEARCH: Combines vector similarity search with keyword search for better results
63 | USE_HYBRID_SEARCH=false
64 | 
65 | # USE_AGENTIC_RAG: Enables code example extraction, storage, and specialized code search functionality
66 | USE_AGENTIC_RAG=false
67 | 
68 | # USE_RERANKING: Applies cross-encoder reranking to improve search result relevance
69 | USE_RERANKING=false
70 | 
71 | # USE_KNOWLEDGE_GRAPH: Enables AI hallucination detection and repository parsing tools using Neo4j
72 | # If you set this to true, you must also set the Neo4j environment variables below.
73 | USE_KNOWLEDGE_GRAPH=false
74 | 
75 | # Supabase configuration for the vector database - set your Supabase URL and Service Key.
76 | # Get your SUPABASE_URL from the API section of your Supabase project settings -
77 | # https://supabase.com/dashboard/project/<your project ID>/settings/api
78 | SUPABASE_URL=
79 | 
80 | # Get your SUPABASE_SERVICE_KEY from the API section of your Supabase project settings -
81 | # https://supabase.com/dashboard/project/<your project ID>/settings/api
82 | # On this page it is called the service_role secret.
83 | SUPABASE_SERVICE_KEY=
84 | 
85 | # Neo4j Configuration for Knowledge Graph Tools
86 | # These are required for the AI hallucination detection and repository parsing tools
87 | # Leave empty to disable knowledge graph functionality
88 | 
89 | # Neo4j connection URI - use bolt://localhost:7687 for local, neo4j:// for cloud instances
90 | # IMPORTANT: If running the MCP server through Docker, change localhost to host.docker.internal
91 | NEO4J_URI=bolt://localhost:7687
92 | 
93 | # Neo4j username (usually 'neo4j' for default installations)
94 | NEO4J_USER=neo4j
95 | 
96 | # Neo4j password for your database instance
97 | NEO4J_PASSWORD=
```
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
```markdown
  1 | <h1 align="left">🐳 Crawl4AI+SearXNG MCP Server</h1>
  2 | 
  3 | <em>Web Crawling, Search and RAG Capabilities for AI Agents and AI Coding Assistants</em>
  4 | 
  5 | > **(FORKED FROM https://github.com/coleam00/mcp-crawl4ai-rag). Added SearXNG integration and batch scraping and processing capabilities.**
  6 | 
  7 | A **self-contained Docker solution** that combines the [Model Context Protocol (MCP)](https://modelcontextprotocol.io), [Crawl4AI](https://crawl4ai.com), [SearXNG](https://github.com/searxng/searxng), and [Supabase](https://supabase.com/) to provide AI agents and coding assistants with complete web **search, crawling, and RAG capabilities**.
  8 | 
  9 | **🚀 Complete Stack in One Command**: Deploy everything with `docker compose up -d` - no Python setup, no dependencies, no external services required.
 10 | 
 11 | ### 🎯 Smart RAG vs Traditional Scraping
 12 | 
 13 | Unlike traditional scraping (such as [Firecrawl](https://github.com/mendableai/firecrawl-mcp-server)) that dumps raw content and overwhelms LLM context windows, this solution uses **intelligent RAG (Retrieval Augmented Generation)** to:
 14 | 
 15 | - **🔍 Extract only relevant content** using semantic similarity search
 16 | - **⚡ Prevent context overflow** by returning focused, pertinent information
 17 | - **🧠 Enhance AI responses** with precisely targeted knowledge
 18 | - **📊 Maintain context efficiency** for better LLM performance
 19 | 
 20 | **Flexible Output Options:**
 21 | - **RAG Mode** (default): Returns semantically relevant chunks with similarity scores
 22 | - **Raw Markdown Mode**: Full content extraction when complete context is needed
 23 | - **Hybrid Search**: Combines semantic and keyword search for comprehensive results
 24 | 
 25 | ## 💡 Key Benefits
 26 | 
 27 | - **🔧 Zero Configuration**: Pre-configured SearXNG instance included
 28 | - **🐳 Docker-Only**: No Python environment setup required
 29 | - **🔍 Integrated Search**: Built-in SearXNG for private, fast search
 30 | - **⚡ Production Ready**: HTTPS, security, and monitoring included
 31 | - **🎯 AI-Optimized**: RAG strategies built for coding assistants
 32 | 
 33 | ## Overview
 34 | 
 35 | This Docker-based MCP server provides a complete web intelligence stack that enables AI agents to:
 36 | - **Search the web** using the integrated SearXNG instance
 37 | - **Crawl and scrape** websites with advanced content extraction
 38 | - **Store content** in vector databases with intelligent chunking
 39 | - **Perform RAG queries** with multiple enhancement strategies
 40 | 
 41 | **Advanced RAG Strategies Available:**
 42 | - **Contextual Embeddings** for enriched semantic understanding
 43 | - **Hybrid Search** combining vector and keyword search
 44 | - **Agentic RAG** for specialized code example extraction
 45 | - **Reranking** for improved result relevance using cross-encoder models
 46 | - **Knowledge Graph** for AI hallucination detection and repository code analysis
 47 | 
 48 | See the [Configuration section](#configuration) below for details on how to enable and configure these strategies.
 49 | 
 50 | ## Features
 51 | 
 52 | - **Smart URL Detection**: Automatically detects and handles different URL types (regular webpages, sitemaps, text files)
 53 | - **Recursive Crawling**: Follows internal links to discover content
 54 | - **Parallel Processing**: Efficiently crawls multiple pages simultaneously
 55 | - **Content Chunking**: Intelligently splits content by headers and size for better processing (see the sketch after this list)
 56 | - **Vector Search**: Performs RAG over crawled content, optionally filtering by data source for precision
 57 | - **Source Retrieval**: Retrieve sources available for filtering to guide the RAG process
 58 | 
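The header-and-size chunking can be pictured roughly like this - an illustrative sketch, not the server's exact chunker:

```python
def chunk_markdown(text: str, max_chars: int = 4000) -> list[str]:
    """Split markdown on headers first, then enforce a maximum chunk size."""
    sections: list[str] = []
    current: list[str] = []

    for line in text.splitlines(keepends=True):
        # Start a new section at every markdown header
        if line.lstrip().startswith("#") and current:
            sections.append("".join(current))
            current = []
        current.append(line)
    if current:
        sections.append("".join(current))

    # Enforce the size limit by splitting oversized sections
    chunks: list[str] = []
    for section in sections:
        while len(section) > max_chars:
            chunks.append(section[:max_chars])
            section = section[max_chars:]
        if section.strip():
            chunks.append(section)
    return chunks
```
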
 59 | ## Tools
 60 | 
 61 | The server provides essential web crawling and search tools:
 62 | 
 63 | ### Core Tools (Always Available)
 64 | 
 65 | 1. **`scrape_urls`**: Scrape one or more URLs and store their content in the vector database. Supports both single URLs and lists of URLs for batch processing.
 66 | 2. **`smart_crawl_url`**: Intelligently crawl a full website based on the type of URL provided (sitemap, llms-full.txt, or a regular webpage that needs to be crawled recursively)
 67 | 3. **`get_available_sources`**: Get a list of all available sources (domains) in the database
 68 | 4. **`perform_rag_query`**: Search for relevant content using semantic search with optional source filtering
 69 | 5. **NEW!** **`search`**: Comprehensive web search tool that integrates SearXNG search with automated scraping and RAG processing. It performs a complete workflow: (1) searches SearXNG with the provided query, (2) extracts URLs from the search results, (3) automatically scrapes all found URLs using the existing scraping infrastructure, (4) stores the content in the vector database, and (5) returns either RAG-processed results organized by URL or raw markdown content. Key parameters: `query` (search terms), `return_raw_markdown` (bypass RAG and return raw content), `num_results` (search result limit), `batch_size` (database operation batching), and `max_concurrent` (parallel scraping sessions). Ideal for research workflows, competitive analysis, and content discovery. See the example call after this list.
 70 | 
 71 | ### Conditional Tools
 72 | 
 73 | 6. **`search_code_examples`** (requires `USE_AGENTIC_RAG=true`): Search specifically for code examples and their summaries from crawled documentation. This tool provides targeted code snippet retrieval for AI coding assistants.
 74 | 
 75 | ### Knowledge Graph Tools (requires `USE_KNOWLEDGE_GRAPH=true`, see below)
 76 | 
 77 | 7. **`parse_github_repository`**: Parse a GitHub repository into a Neo4j knowledge graph, extracting classes, methods, functions, and their relationships for hallucination detection
 78 | 8. **`check_ai_script_hallucinations`**: Analyze Python scripts for AI hallucinations by validating imports, method calls, and class usage against the knowledge graph
 79 | 9. **`query_knowledge_graph`**: Explore and query the Neo4j knowledge graph with commands like `repos`, `classes`, `methods`, and custom Cypher queries
 80 | 
 81 | ## Prerequisites
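As a rough illustration of how a client could call the `search` tool over SSE, here is a minimal sketch using the official `mcp` Python SDK (the client script below is not part of this repository; the tool name and parameters follow the descriptions above):

```python
import asyncio

from mcp import ClientSession
from mcp.client.sse import sse_client


async def run_search() -> None:
    # Connect to the MCP server's SSE endpoint (default Docker deployment)
    async with sse_client("http://localhost:8051/sse") as (read_stream, write_stream):
        async with ClientSession(read_stream, write_stream) as session:
            await session.initialize()
            # Search the web, scrape the hits, and get RAG-processed results back
            result = await session.call_tool(
                "search",
                {"query": "crawl4ai documentation", "num_results": 3},
            )
            print(result.content)


asyncio.run(run_search())
```
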
 82 | 
 83 | **Required:**
 84 | - [Docker and Docker Compose](https://www.docker.com/products/docker-desktop/) - This is a Docker-only solution
 85 | - [Supabase account](https://supabase.com/) - For vector database and RAG functionality
 86 | - [OpenAI API key](https://platform.openai.com/api-keys) - For generating embeddings
 87 | 
 88 | **Optional:**
 89 | - [Neo4j instance](https://neo4j.com/) - For knowledge graph functionality (see [Knowledge Graph Setup](#knowledge-graph-setup))
 90 | - Custom domain - For production HTTPS deployment
 91 | 
 92 | ## Installation
 93 | 
 94 | This is a **Docker-only solution** - no Python environment setup required!
 95 | 
 96 | ### Quick Start
 97 | 
 98 | 1. **Clone this repository:**
 99 |    ```bash
100 |    git clone https://github.com/tokidoo/crawl4ai-rag-mcp.git
101 |    cd crawl4ai-rag-mcp
102 |    ```
103 | 
104 | 2. **Configure environment:**
105 |    ```bash
106 |    cp .env.example .env
107 |    # Edit .env with your API keys (see Configuration section below)
108 |    ```
109 | 
110 | 3. **Deploy the complete stack:**
111 |    ```bash
112 |    docker compose up -d
113 |    ```
114 | 
115 | That's it! Your complete search, crawl, and RAG stack is now running:
116 | - **MCP Server**: http://localhost:8051
117 | - **SearXNG Search**: http://localhost:8080 (internal)
118 | - **Caddy Proxy**: Handles HTTPS and routing
119 | 
120 | ### What Gets Deployed
121 | 
122 | The Docker Compose stack includes:
123 | - **MCP Crawl4AI Server** - Main application server
124 | - **SearXNG** - Private search engine instance
125 | - **Valkey** - Redis-compatible cache for SearXNG
126 | - **Caddy** - Reverse proxy with automatic HTTPS
127 | 
128 | ## Database Setup *IMPORTANT!*
129 | 
130 | Before running the server, you need to set up the database with the pgvector extension:
131 | 
132 | 1. Go to the SQL Editor in your Supabase dashboard (create a new project first if necessary)
133 | 
134 | 2. Create a new query and paste the contents of `crawled_pages.sql`
135 | 
136 | 3. Run the query to create the necessary tables and functions
137 | 
138 | ## Knowledge Graph Setup (Optional)
139 | 
140 | To enable AI hallucination detection and repository analysis features, you need to set up Neo4j.
141 | 
142 | **Note:** The knowledge graph functionality works fully with Docker and supports all features.
143 | 
144 | ### Neo4j Setup Options
145 | 
146 | **Option 1: Local AI Package (Recommended)**
147 | 
148 | The easiest way to get Neo4j running is with the [Local AI Package](https://github.com/coleam00/local-ai-packaged):
149 | 
150 | 1. **Clone and start Neo4j**:
151 |    ```bash
152 |    git clone https://github.com/coleam00/local-ai-packaged.git
153 |    cd local-ai-packaged
154 |    # Follow repository instructions to start Neo4j with Docker Compose
155 |    ```
156 | 
157 | 2. **Connection details for Docker**:
158 |    - URI: `bolt://host.docker.internal:7687` (for Docker containers)
159 |    - URI: `bolt://localhost:7687` (for local access)
160 |    - Username: `neo4j`
161 |    - Password: Check Local AI Package documentation
162 | 
163 | **Option 2: Neo4j Docker**
164 | 
165 | Run Neo4j directly with Docker:
166 | 
167 | ```bash
168 | docker run -d \
169 |   --name neo4j \
170 |   -p 7474:7474 -p 7687:7687 \
171 |   -e NEO4J_AUTH=neo4j/your-password \
172 |   neo4j:latest
173 | ```
174 | 
175 | **Option 3: Neo4j Desktop**
176 | 
177 | Use Neo4j Desktop for a local GUI-based installation:
178 | 
179 | 1. **Download and install**: [Neo4j Desktop](https://neo4j.com/download/)
180 | 2. **Create a new database** with your preferred settings
181 | 3. **Connection details**:
182 |    - URI: `bolt://host.docker.internal:7687` (for Docker containers)
183 |    - URI: `bolt://localhost:7687` (for local access)
184 |    - Username: `neo4j`
185 |    - Password: Whatever you set during database creation
186 | 
187 | ## Configuration
188 | 
189 | Configure the Docker stack by editing your `.env` file (copy from `.env.example`):
190 | 
191 | ```bash
192 | # ========================================
193 | # MCP SERVER CONFIGURATION
194 | # ========================================
195 | TRANSPORT=sse
196 | HOST=0.0.0.0
197 | PORT=8051
198 | 
199 | # ========================================
200 | # INTEGRATED SEARXNG CONFIGURATION
201 | # ========================================
202 | # Pre-configured for Docker Compose - SearXNG runs internally
203 | SEARXNG_URL=http://searxng:8080
204 | SEARXNG_USER_AGENT=MCP-Crawl4AI-RAG-Server/1.0
205 | SEARXNG_DEFAULT_ENGINES=google,bing,duckduckgo
206 | SEARXNG_TIMEOUT=30
207 | 
208 | # Optional: Custom domain for production HTTPS
209 | SEARXNG_HOSTNAME=http://localhost
210 | # [email protected]  # For Let's Encrypt
211 | 
212 | # ========================================
213 | # AI SERVICES CONFIGURATION
214 | # ========================================
215 | # Required: OpenAI API for embeddings
216 | OPENAI_API_KEY=your_openai_api_key
217 | 
218 | # LLM for summaries and contextual embeddings
219 | MODEL_CHOICE=gpt-4.1-nano-2025-04-14
220 | 
221 | # Required: Supabase for vector database
222 | SUPABASE_URL=your_supabase_project_url
223 | SUPABASE_SERVICE_KEY=your_supabase_service_key
224 | 
225 | # ========================================
226 | # RAG ENHANCEMENT STRATEGIES
227 | # ========================================
228 | USE_CONTEXTUAL_EMBEDDINGS=false
229 | USE_HYBRID_SEARCH=false
230 | USE_AGENTIC_RAG=false
231 | USE_RERANKING=false
232 | USE_KNOWLEDGE_GRAPH=false
233 | 
234 | # Optional: Neo4j for knowledge graph (if USE_KNOWLEDGE_GRAPH=true)
235 | # Use host.docker.internal:7687 for Docker Desktop on Windows/Mac
236 | NEO4J_URI=bolt://localhost:7687
237 | NEO4J_USER=neo4j
238 | NEO4J_PASSWORD=your_neo4j_password
239 | ```
240 | 
241 | ### Key Configuration Notes
242 | 
243 | **🔍 SearXNG Integration**: The stack includes a pre-configured SearXNG instance that runs automatically. No external setup required!
244 | 
245 | **🐳 Docker Networking**: The default configuration uses Docker internal networking (`http://searxng:8080`) which works out of the box.
246 | 
247 | **🔐 Production Setup**: For production, set `SEARXNG_HOSTNAME` to your domain and `SEARXNG_TLS` to your email for automatic HTTPS.
248 | 
249 | ### RAG Strategy Options
250 | 
 251 | The Crawl4AI RAG MCP server supports five powerful RAG strategies that can be enabled independently:
252 | 
253 | #### 1. **USE_CONTEXTUAL_EMBEDDINGS**
254 | When enabled, this strategy enhances each chunk's embedding with additional context from the entire document. The system passes both the full document and the specific chunk to an LLM (configured via `MODEL_CHOICE`) to generate enriched context that gets embedded alongside the chunk content.
255 | 
256 | - **When to use**: Enable this when you need high-precision retrieval where context matters, such as technical documentation where terms might have different meanings in different sections.
257 | - **Trade-offs**: Slower indexing due to LLM calls for each chunk, but significantly better retrieval accuracy.
258 | - **Cost**: Additional LLM API calls during indexing.
259 | 
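A minimal sketch of this idea using the OpenAI Python client (the prompt wording and helper name are illustrative, not the server's exact implementation):

```python
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def contextualize_chunk(full_document: str, chunk: str,
                        model: str = "gpt-4.1-nano-2025-04-14") -> str:
    """Ask a small LLM to situate the chunk within the whole document."""
    response = client.chat.completions.create(
        model=model,
        messages=[{
            "role": "user",
            "content": (
                f"<document>\n{full_document[:20000]}\n</document>\n\n"
                f"Here is a chunk from that document:\n{chunk}\n\n"
                "Write a short context that situates this chunk within the document."
            ),
        }],
    )
    context = response.choices[0].message.content
    # The enriched text (context + chunk) is what gets embedded and stored
    return f"{context}\n---\n{chunk}"
```
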
260 | #### 2. **USE_HYBRID_SEARCH**
261 | Combines traditional keyword search with semantic vector search to provide more comprehensive results. The system performs both searches in parallel and intelligently merges results, prioritizing documents that appear in both result sets.
262 | 
263 | - **When to use**: Enable this when users might search using specific technical terms, function names, or when exact keyword matches are important alongside semantic understanding.
264 | - **Trade-offs**: Slightly slower search queries but more robust results, especially for technical content.
265 | - **Cost**: No additional API costs, just computational overhead.
266 | 
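One simplified way such merging could work, assuming each search already returns rows keyed by `id` (illustrative ordering only, not the server's exact logic):

```python
def merge_hybrid_results(vector_hits: list[dict], keyword_hits: list[dict],
                         top_k: int = 10) -> list[dict]:
    """Prioritize documents that appear in both the vector and keyword result sets."""
    keyword_ids = {hit["id"] for hit in keyword_hits}
    seen: set = set()
    merged: list[dict] = []

    # Documents found by both searches come first
    for hit in vector_hits:
        if hit["id"] in keyword_ids:
            merged.append(hit)
            seen.add(hit["id"])

    # Then fill the remaining slots from either result set, preserving order
    for hit in vector_hits + keyword_hits:
        if hit["id"] not in seen:
            merged.append(hit)
            seen.add(hit["id"])

    return merged[:top_k]
```
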
267 | #### 3. **USE_AGENTIC_RAG**
268 | Enables specialized code example extraction and storage. When crawling documentation, the system identifies code blocks (≥300 characters), extracts them with surrounding context, generates summaries, and stores them in a separate vector database table specifically designed for code search.
269 | 
270 | - **When to use**: Essential for AI coding assistants that need to find specific code examples, implementation patterns, or usage examples from documentation.
271 | - **Trade-offs**: Significantly slower crawling due to code extraction and summarization, requires more storage space.
272 | - **Cost**: Additional LLM API calls for summarizing each code example.
273 | - **Benefits**: Provides a dedicated `search_code_examples` tool that AI agents can use to find specific code implementations.
274 | 
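The extraction step can be pictured roughly like this (a sketch of the technique; the server's own extraction and summarization logic may differ):

```python
import re

def extract_code_blocks(markdown: str, min_length: int = 300,
                        context_chars: int = 200) -> list[dict]:
    """Find fenced code blocks and keep the ones large enough to be useful examples."""
    examples = []
    for match in re.finditer(r"```[\w-]*\n(.*?)```", markdown, re.DOTALL):
        code = match.group(1)
        if len(code) >= min_length:
            start, end = match.span()
            examples.append({
                "code": code,
                # Surrounding text helps an LLM write a useful summary later
                "context_before": markdown[max(0, start - context_chars):start],
                "context_after": markdown[end:end + context_chars],
            })
    return examples
```
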
275 | #### 4. **USE_RERANKING**
276 | Applies cross-encoder reranking to search results after initial retrieval. Uses a lightweight cross-encoder model (`cross-encoder/ms-marco-MiniLM-L-6-v2`) to score each result against the original query, then reorders results by relevance.
277 | 
278 | - **When to use**: Enable this when search precision is critical and you need the most relevant results at the top. Particularly useful for complex queries where semantic similarity alone might not capture query intent.
279 | - **Trade-offs**: Adds ~100-200ms to search queries depending on result count, but significantly improves result ordering.
280 | - **Cost**: No additional API costs - uses a local model that runs on CPU.
281 | - **Benefits**: Better result relevance, especially for complex queries. Works with both regular RAG search and code example search.
282 | 
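A few lines with `sentence-transformers` show roughly how this step works (a sketch of the technique, not the server's internal wiring):

```python
from sentence_transformers import CrossEncoder

# Local model, runs on CPU; downloaded on first use
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank(query: str, chunks: list[str], top_k: int = 5) -> list[str]:
    """Score each chunk against the query and return the most relevant ones first."""
    scores = reranker.predict([(query, chunk) for chunk in chunks])
    ranked = sorted(zip(scores, chunks), key=lambda pair: pair[0], reverse=True)
    return [chunk for _, chunk in ranked[:top_k]]
```
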
283 | #### 5. **USE_KNOWLEDGE_GRAPH**
284 | Enables AI hallucination detection and repository analysis using Neo4j knowledge graphs. When enabled, the system can parse GitHub repositories into a graph database and validate AI-generated code against real repository structures. **Fully compatible with Docker** - all functionality works within the containerized environment.
285 | 
286 | - **When to use**: Enable this for AI coding assistants that need to validate generated code against real implementations, or when you want to detect when AI models hallucinate non-existent methods, classes, or incorrect usage patterns.
287 | - **Trade-offs**: Requires Neo4j setup and additional dependencies. Repository parsing can be slow for large codebases, and validation requires repositories to be pre-indexed.
288 | - **Cost**: No additional API costs for validation, but requires Neo4j infrastructure (can use free local installation or cloud AuraDB).
289 | - **Benefits**: Provides three powerful tools: `parse_github_repository` for indexing codebases, `check_ai_script_hallucinations` for validating AI-generated code, and `query_knowledge_graph` for exploring indexed repositories.
290 | 
291 | **Usage with MCP Tools:**
292 | 
293 | You can tell the AI coding assistant to add a Python GitHub repository to the knowledge graph:
294 | 
295 | "Add https://github.com/pydantic/pydantic-ai.git to the knowledge graph"
296 | 
297 | Make sure the repo URL ends with .git.
298 | 
299 | You can also have the AI coding assistant check for hallucinations with scripts it creates using the MCP `check_ai_script_hallucinations` tool.
300 | 
301 | ### Recommended Configurations
302 | 
303 | **For general documentation RAG:**
304 | ```
305 | USE_CONTEXTUAL_EMBEDDINGS=false
306 | USE_HYBRID_SEARCH=true
307 | USE_AGENTIC_RAG=false
308 | USE_RERANKING=true
309 | ```
310 | 
311 | **For AI coding assistant with code examples:**
312 | ```
313 | USE_CONTEXTUAL_EMBEDDINGS=true
314 | USE_HYBRID_SEARCH=true
315 | USE_AGENTIC_RAG=true
316 | USE_RERANKING=true
317 | USE_KNOWLEDGE_GRAPH=false
318 | ```
319 | 
320 | **For AI coding assistant with hallucination detection:**
321 | ```
322 | USE_CONTEXTUAL_EMBEDDINGS=true
323 | USE_HYBRID_SEARCH=true
324 | USE_AGENTIC_RAG=true
325 | USE_RERANKING=true
326 | USE_KNOWLEDGE_GRAPH=true
327 | ```
328 | 
329 | **For fast, basic RAG:**
330 | ```
331 | USE_CONTEXTUAL_EMBEDDINGS=false
332 | USE_HYBRID_SEARCH=true
333 | USE_AGENTIC_RAG=false
334 | USE_RERANKING=false
335 | USE_KNOWLEDGE_GRAPH=false
336 | ```
337 | 
338 | ## Running the Server
339 | 
340 | The complete stack is managed through Docker Compose:
341 | 
342 | ### Start the Stack
343 | ```bash
344 | docker compose up -d
345 | ```
346 | 
347 | ### View Logs
348 | ```bash
349 | # All services
350 | docker compose logs -f
351 | 
352 | # Specific service
353 | docker compose logs -f mcp-crawl4ai
354 | docker compose logs -f searxng
355 | ```
356 | 
357 | ### Stop the Stack
358 | ```bash
359 | docker compose down
360 | ```
361 | 
362 | ### Restart Services
363 | ```bash
364 | # Restart all
365 | docker compose restart
366 | 
367 | # Restart specific service
368 | docker compose restart mcp-crawl4ai
369 | ```
370 | 
371 | The MCP server will be available at `http://localhost:8051` for SSE connections.
372 | 
373 | ## Integration with MCP Clients
374 | 
375 | After starting the Docker stack with `docker compose up -d`, your MCP server will be available for integration.
376 | 
377 | ### SSE Configuration (Recommended)
378 | 
379 | The Docker stack runs with SSE transport by default. Connect using:
380 | 
381 | **Claude Desktop/Windsurf:**
382 | ```json
383 | {
384 |   "mcpServers": {
385 |     "crawl4ai-rag": {
386 |       "transport": "sse",
387 |       "url": "http://localhost:8051/sse"
388 |     }
389 |   }
390 | }
391 | ```
392 | 
393 | **Windsurf (alternative syntax):**
394 | ```json
395 | {
396 |   "mcpServers": {
397 |     "crawl4ai-rag": {
398 |       "transport": "sse",
399 |       "serverUrl": "http://localhost:8051/sse"
400 |     }
401 |   }
402 | }
403 | ```
404 | 
405 | **Claude Code CLI:**
406 | ```bash
406 | claude mcp add-json crawl4ai-rag '{"type":"sse","url":"http://localhost:8051/sse"}' --scope user
408 | ```
409 | 
410 | ### Docker Networking Notes
411 | 
412 | - **Same machine**: Use `http://localhost:8051/sse`
413 | - **Different container**: Use `http://host.docker.internal:8051/sse`
414 | - **Remote access**: Replace `localhost` with your server's IP address
415 | 
416 | ### Production Deployment
417 | 
418 | For production use with custom domains:
419 | 
420 | 1. **Update your `.env`**:
421 |    ```bash
422 |    SEARXNG_HOSTNAME=https://yourdomain.com
423 |    [email protected]
424 |    ```
425 | 
426 | 2. **Access via HTTPS**:
427 |    ```
428 |    https://yourdomain.com:8051/sse
429 |    ```
430 | 
431 | ### Health Check
432 | 
433 | Verify the server is running:
434 | ```bash
435 | curl http://localhost:8051/health
436 | ```
437 | 
438 | ## Knowledge Graph Architecture
439 | 
440 | The knowledge graph system stores repository code structure in Neo4j with the following components:
441 | 
442 | ### Core Components (`knowledge_graphs/` folder):
443 | 
444 | - **`parse_repo_into_neo4j.py`**: Clones and analyzes GitHub repositories, extracting Python classes, methods, functions, and imports into Neo4j nodes and relationships
445 | - **`ai_script_analyzer.py`**: Parses Python scripts using AST to extract imports, class instantiations, method calls, and function usage
446 | - **`knowledge_graph_validator.py`**: Validates AI-generated code against the knowledge graph to detect hallucinations (non-existent methods, incorrect parameters, etc.)
447 | - **`hallucination_reporter.py`**: Generates comprehensive reports about detected hallucinations with confidence scores and recommendations
448 | - **`query_knowledge_graph.py`**: Interactive CLI tool for exploring the knowledge graph (functionality now integrated into MCP tools)
449 | 
450 | ### Knowledge Graph Schema:
451 | 
452 | The Neo4j database stores code structure as:
453 | 
454 | **Nodes:**
455 | - `Repository`: GitHub repositories
456 | - `File`: Python files within repositories  
457 | - `Class`: Python classes with methods and attributes
458 | - `Method`: Class methods with parameter information
459 | - `Function`: Standalone functions
460 | - `Attribute`: Class attributes
461 | 
462 | **Relationships:**
463 | - `Repository` -[:CONTAINS]-> `File`
464 | - `File` -[:DEFINES]-> `Class`
465 | - `File` -[:DEFINES]-> `Function`
466 | - `Class` -[:HAS_METHOD]-> `Method`
467 | - `Class` -[:HAS_ATTRIBUTE]-> `Attribute`
468 | 
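Given that schema, you can inspect an indexed repository with the official `neo4j` Python driver. This is a sketch; the property names (e.g. `name`) and the repository identifier are assumptions about how the parser stores nodes, so adapt them to what `query_knowledge_graph` actually reports:

```python
from neo4j import GraphDatabase

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "your-password"))

with driver.session() as session:
    # List every class defined in an indexed repository, together with its methods
    records = session.run(
        """
        MATCH (r:Repository {name: $repo})-[:CONTAINS]->(:File)-[:DEFINES]->(c:Class)
        OPTIONAL MATCH (c)-[:HAS_METHOD]->(m:Method)
        RETURN c.name AS class, collect(m.name) AS methods
        """,
        repo="pydantic-ai",
    )
    for record in records:
        print(record["class"], record["methods"])

driver.close()
```
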
469 | ### Workflow:
470 | 
471 | 1. **Repository Parsing**: Use `parse_github_repository` tool to clone and analyze open-source repositories
472 | 2. **Code Validation**: Use `check_ai_script_hallucinations` tool to validate AI-generated Python scripts
473 | 3. **Knowledge Exploration**: Use `query_knowledge_graph` tool to explore available repositories, classes, and methods
474 | 
475 | ## Troubleshooting
476 | 
477 | ### Docker Issues
478 | 
479 | **Container won't start:**
480 | ```bash
481 | # Check logs for specific errors
482 | docker compose logs mcp-crawl4ai
483 | 
484 | # Verify configuration is valid
485 | docker compose config
486 | 
487 | # Restart problematic services
488 | docker compose restart mcp-crawl4ai
489 | ```
490 | 
491 | **SearXNG not accessible:**
492 | ```bash
493 | # Check if SearXNG is running
494 | docker compose logs searxng
495 | 
496 | # Verify internal networking
497 | docker compose exec mcp-crawl4ai curl http://searxng:8080
498 | ```
499 | 
500 | **Port conflicts:**
501 | ```bash
502 | # Check what's using ports
503 | netstat -tulpn | grep -E ":(8051|8080)"
504 | 
505 | # Change ports in docker-compose.yml if needed
506 | ports:
507 |   - "8052:8051"  # Changed from 8051:8051
508 | ```
509 | 
510 | ### Common Configuration Issues
511 | 
512 | **Environment variables not loading:**
513 | - Ensure `.env` file is in the same directory as `docker-compose.yml`
514 | - Verify no spaces around `=` in `.env` file
515 | - Check for special characters that need quoting
516 | 
517 | **API connection failures:**
518 | - Verify `OPENAI_API_KEY` is valid and has credits
519 | - Check `SUPABASE_URL` and `SUPABASE_SERVICE_KEY` are correct
520 | - Test API connectivity from within container:
521 |   ```bash
522 |   docker compose exec mcp-crawl4ai curl -H "Authorization: Bearer $OPENAI_API_KEY" https://api.openai.com/v1/models
523 |   ```
524 | 
525 | **Neo4j connection issues:**
526 | - Use `host.docker.internal:7687` instead of `localhost:7687` for Neo4j running on host
527 | - Verify Neo4j is running and accessible
528 | - Check firewall settings for port 7687
529 | 
530 | ### Performance Optimization
531 | 
532 | **Memory usage:**
533 | ```bash
534 | # Monitor resource usage
535 | docker stats
536 | 
537 | # Adjust memory limits in docker-compose.yml
538 | deploy:
539 |   resources:
540 |     limits:
541 |       memory: 2G
542 | ```
543 | 
544 | **Disk space:**
545 | ```bash
546 | # Clean up Docker
547 | docker system prune -a
548 | 
549 | # Check volume usage
550 | docker volume ls
551 | ```
552 | 
553 | ### Getting Help
554 | 
555 | 1. **Check logs first**: `docker compose logs -f`
556 | 2. **Verify configuration**: `docker compose config`
557 | 3. **Test connectivity**: Use `curl` commands shown above
558 | 4. **Reset everything**: `docker compose down -v && docker compose up -d`
559 | 
560 | ## Development & Customization
561 | 
562 | This Docker stack provides a foundation for building more complex MCP servers:
563 | 
564 | 1. **Modify the MCP server**: Edit files in `src/` and rebuild: `docker compose build mcp-crawl4ai`
565 | 2. **Add custom tools**: Extend `src/crawl4ai_mcp.py` with `@mcp.tool()` decorators
566 | 3. **Customize SearXNG**: Edit `searxng/settings.yml` and restart
567 | 4. **Add services**: Extend `docker-compose.yml` with additional containers
568 | 
569 | 
```
--------------------------------------------------------------------------------
/searxng/limiter.toml:
--------------------------------------------------------------------------------
```toml
1 | # This configuration file updates the default configuration file
2 | # See https://github.com/searxng/searxng/blob/master/searx/limiter.toml
3 | 
4 | [botdetection.ip_limit]
5 | # activate advanced bot protection
6 | # enable this when running the instance for a public usage on the internet
7 | link_token = false
```
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
```dockerfile
 1 | FROM python:3.12-slim
 2 | 
 3 | ARG PORT=8051
 4 | 
 5 | WORKDIR /app
 6 | 
 7 | # Install uv
 8 | RUN pip install uv
 9 | 
10 | # Copy the MCP server files
11 | COPY . .
12 | 
13 | # Install packages directly to the system (no virtual environment)
14 | # Combining commands to reduce Docker layers
15 | RUN uv pip install --system -e . && \
16 |     crawl4ai-setup
17 | 
18 | EXPOSE ${PORT}
19 | 
20 | # Command to run the MCP server
21 | CMD ["python", "src/crawl4ai_mcp.py"]
22 | 
```
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
```toml
 1 | [project]
 2 | name = "crawl4ai-mcp"
 3 | version = "0.1.0"
 4 | description = "MCP server for integrating web crawling and RAG into AI agents and AI coding assistants"
 5 | readme = "README.md"
 6 | requires-python = ">=3.12"
 7 | dependencies = [
 8 |     "crawl4ai==0.6.2",
 9 |     "mcp==1.7.1",
10 |     "supabase==2.15.1",
11 |     "openai==1.71.0",
12 |     "dotenv==0.9.9",
13 |     "sentence-transformers>=4.1.0",
14 |     "neo4j>=5.28.1",
15 |     "requests>=2.25.0",
16 | ]
17 | 
```
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
```yaml
  1 | version: "3.7"
  2 | 
  3 | services:
  4 |   # MCP Crawl4AI Server
  5 |   mcp-crawl4ai:
  6 |     build:
  7 |       context: .
  8 |       dockerfile: Dockerfile
  9 |     container_name: mcp-crawl4ai
 10 |     restart: unless-stopped
 11 |     environment:
 12 |       - TRANSPORT=${TRANSPORT:-sse}
 13 |       - HOST=${HOST:-0.0.0.0}
 14 |       - PORT=${PORT:-8051}
 15 |       - SEARXNG_URL=${SEARXNG_URL:-http://searxng:8080}
 16 |     env_file:
 17 |       - .env
 18 |     ports:
 19 |       - "8051:8051"
 20 |     depends_on:
 21 |       - searxng
 22 |     networks:
 23 |       - searxng
 24 |     volumes:
 25 |       - ./data:/app/data
 26 |       - ./logs:/app/logs
 27 |     logging:
 28 |       driver: "json-file"
 29 |       options:
 30 |         max-size: "1m"
 31 |         max-file: "1"
 32 | 
 33 |   # Reverse Proxy (Caddy)
 34 |   caddy:
 35 |     container_name: caddy
 36 |     image: docker.io/library/caddy:2-alpine
 37 |     network_mode: host
 38 |     restart: unless-stopped
 39 |     environment:
 40 |       - SEARXNG_HOSTNAME=${SEARXNG_HOSTNAME:-http://localhost}
 41 |       - SEARXNG_TLS=${SEARXNG_TLS:-internal}
 42 |     volumes:
 43 |       - ./Caddyfile:/etc/caddy/Caddyfile:ro
 44 |       - caddy-data:/data:rw
 45 |       - caddy-config:/config:rw
 46 |     cap_drop:
 47 |       - ALL
 48 |     cap_add:
 49 |       - NET_BIND_SERVICE
 50 |     logging:
 51 |       driver: "json-file"
 52 |       options:
 53 |         max-size: "1m"
 54 |         max-file: "1"
 55 | 
 56 |   # Redis Alternative - Valkey
 57 |   valkey:
 58 |     container_name: valkey
 59 |     image: docker.io/valkey/valkey:8-alpine
 60 |     command: valkey-server --save 30 1 --loglevel warning
 61 |     restart: unless-stopped
 62 |     networks:
 63 |       - searxng
 64 |     volumes:
 65 |       - valkey-data:/data
 66 |     cap_drop:
 67 |       - ALL
 68 |     cap_add:
 69 |       - SETGID
 70 |       - SETUID
 71 |       - DAC_OVERRIDE
 72 |     logging:
 73 |       driver: "json-file"
 74 |       options:
 75 |         max-size: "1m"
 76 |         max-file: "1"
 77 | 
 78 |   # SearXNG Search Engine
 79 |   searxng:
 80 |     container_name: searxng
 81 |     image: docker.io/searxng/searxng:latest
 82 |     restart: unless-stopped
 83 |     networks:
 84 |       - searxng
 85 |     ports:
 86 |       - "127.0.0.1:8080:8080"
 87 |     volumes:
 88 |       - ./searxng:/etc/searxng:rw
 89 |       - searxng-data:/var/cache/searxng:rw
 90 |     environment:
 91 |       - SEARXNG_BASE_URL=https://${SEARXNG_HOSTNAME:-localhost}/
 92 |     cap_drop:
 93 |       - ALL
 94 |     cap_add:
 95 |       - CHOWN
 96 |       - SETGID
 97 |       - SETUID
 98 |     logging:
 99 |       driver: "json-file"
100 |       options:
101 |         max-size: "1m"
102 |         max-file: "1"
103 |     depends_on:
104 |       - valkey
105 | 
106 | networks:
107 |   searxng:
108 |     driver: bridge
109 | 
110 | volumes:
111 |   caddy-data:
112 |   caddy-config:
113 |   valkey-data:
114 |   searxng-data:
```
--------------------------------------------------------------------------------
/searxng/settings.yml:
--------------------------------------------------------------------------------
```yaml
  1 | # SearXNG Configuration
  2 | # see https://docs.searxng.org/admin/settings/index.html
  3 | 
  4 | use_default_settings: true
  5 | 
  6 | general:
  7 |   debug: false
  8 |   instance_name: "SearXNG for MCP Crawl4AI"
  9 |   contact_url: false
 10 |   enable_metrics: true
 11 | 
 12 | brand:
 13 |   new_issue_url: https://github.com/searxng/searxng/issues/new
 14 |   docs_url: https://docs.searxng.org/
 15 |   public_instances: https://searx.space
 16 |   wiki_url: https://github.com/searxng/searxng/wiki
 17 |   issue_url: https://github.com/searxng/searxng/issues
 18 | 
 19 | search:
 20 |   safe_search: 0
 21 |   autocomplete: ""
 22 |   autocomplete_min: 4
 23 |   default_lang: ""
 24 |   ban_time_on_fail: 5
 25 |   max_ban_time_on_fail: 120
 26 |   formats:
 27 |     - html
 28 |     - json  # Enable JSON format for API access
 29 | 
 30 | server:
 31 |   port: 8080
 32 |   bind_address: "0.0.0.0"
 33 |   secret_key: "randomkey"  # Change this in production
 34 |   base_url: false
 35 |   image_proxy: false
 36 |   method: "POST"
 37 |   default_http_headers:
 38 |     X-Content-Type-Options: nosniff
 39 |     X-XSS-Protection: 1; mode=block
 40 |     X-Download-Options: noopen
 41 |     X-Robots-Tag: noindex, nofollow
 42 |     Referrer-Policy: no-referrer
 43 | 
 44 | ui:
 45 |   static_use_hash: false
 46 |   default_locale: ""
 47 |   query_in_title: false
 48 |   infinite_scroll: false
 49 |   center_alignment: false
 50 |   cache_url: https://web.archive.org/web/
 51 |   search_on_category_select: true
 52 |   hotkeys: default
 53 | 
 54 | # Valkey configuration for caching (Redis alternative)
 55 | valkey:
 56 |   url: redis://valkey:6379/0
 57 | 
 58 | # Outgoing request configuration  
 59 | outgoing:
 60 |   request_timeout: 3.0
 61 |   useragent_suffix: ""
 62 |   pool_connections: 100
 63 |   pool_maxsize: 20
 64 |   enable_http2: true
 65 | 
 66 | # Enable engines suitable for API usage
 67 | engines:
 68 |   - name: google
 69 |     engine: google
 70 |     shortcut: g
 71 |     use_mobile_ui: false
 72 | 
 73 |   - name: bing
 74 |     engine: bing
 75 |     shortcut: b
 76 |     
 77 |   - name: duckduckgo
 78 |     engine: duckduckgo
 79 |     shortcut: ddg
 80 |     
 81 |   - name: wikipedia
 82 |     engine: wikipedia
 83 |     shortcut: wp
 84 |     base_url: 'https://{language}.wikipedia.org/'
 85 |     
 86 |   - name: github
 87 |     engine: github
 88 |     shortcut: gh
 89 |     
 90 |   - name: reddit
 91 |     engine: reddit
 92 |     shortcut: re
 93 |     
 94 |   - name: arxiv
 95 |     engine: arxiv
 96 |     shortcut: arx
 97 |     
 98 |   - name: scholar
 99 |     engine: google_scholar
100 |     shortcut: gs
101 | 
102 | # Disable engines that might cause issues in API usage
103 | disabled_engines:
104 |   - 'Ahmia'
105 |   - 'Annas Archive'
106 |   - 'Dictzone'
107 |   - 'Lingva'
108 |   - 'Wikidata'
109 |   - 'Wikimini'
110 |   - 'Wikiquote'
111 |   - 'Wiktionary'
112 |   - 'Wikinews'
113 |   - 'Wikisource'
114 |   - 'Wikibooks'
115 |   - 'Wikiversity'
116 |   - 'Wikivoyage'
117 |   - 'Wolfram|Alpha'
```
--------------------------------------------------------------------------------
/crawled_pages.sql:
--------------------------------------------------------------------------------
```sql
  1 | -- Enable the pgvector extension
  2 | create extension if not exists vector;
  3 | 
  4 | -- Drop tables if they exist (to allow rerunning the script)
  5 | drop table if exists crawled_pages;
  6 | drop table if exists code_examples;
  7 | drop table if exists sources;
  8 | 
  9 | -- Create the sources table
 10 | create table sources (
 11 |     source_id text primary key,
 12 |     summary text,
 13 |     total_word_count integer default 0,
 14 |     created_at timestamp with time zone default timezone('utc'::text, now()) not null,
 15 |     updated_at timestamp with time zone default timezone('utc'::text, now()) not null
 16 | );
 17 | 
 18 | -- Create the documentation chunks table
 19 | create table crawled_pages (
 20 |     id bigserial primary key,
 21 |     url varchar not null,
 22 |     chunk_number integer not null,
 23 |     content text not null,
 24 |     metadata jsonb not null default '{}'::jsonb,
 25 |     source_id text not null,
 26 |     embedding vector(1536),  -- OpenAI embeddings are 1536 dimensions
 27 |     created_at timestamp with time zone default timezone('utc'::text, now()) not null,
 28 |     
 29 |     -- Add a unique constraint to prevent duplicate chunks for the same URL
 30 |     unique(url, chunk_number),
 31 |     
 32 |     -- Add foreign key constraint to sources table
 33 |     foreign key (source_id) references sources(source_id)
 34 | );
 35 | 
 36 | -- Create an index for better vector similarity search performance
 37 | create index on crawled_pages using ivfflat (embedding vector_cosine_ops);
 38 | 
 39 | -- Create an index on metadata for faster filtering
 40 | create index idx_crawled_pages_metadata on crawled_pages using gin (metadata);
 41 | 
 42 | -- Create an index on source_id for faster filtering
 43 | CREATE INDEX idx_crawled_pages_source_id ON crawled_pages (source_id);
 44 | 
 45 | -- Create a function to search for documentation chunks
 46 | create or replace function match_crawled_pages (
 47 |   query_embedding vector(1536),
 48 |   match_count int default 10,
 49 |   filter jsonb DEFAULT '{}'::jsonb,
 50 |   source_filter text DEFAULT NULL
 51 | ) returns table (
 52 |   id bigint,
 53 |   url varchar,
 54 |   chunk_number integer,
 55 |   content text,
 56 |   metadata jsonb,
 57 |   source_id text,
 58 |   similarity float
 59 | )
 60 | language plpgsql
 61 | as $$
 62 | #variable_conflict use_column
 63 | begin
 64 |   return query
 65 |   select
 66 |     id,
 67 |     url,
 68 |     chunk_number,
 69 |     content,
 70 |     metadata,
 71 |     source_id,
 72 |     1 - (crawled_pages.embedding <=> query_embedding) as similarity
 73 |   from crawled_pages
 74 |   where metadata @> filter
 75 |     AND (source_filter IS NULL OR source_id = source_filter)
 76 |   order by crawled_pages.embedding <=> query_embedding
 77 |   limit match_count;
 78 | end;
 79 | $$;
 80 | 
 81 | -- Enable RLS on the crawled_pages table
 82 | alter table crawled_pages enable row level security;
 83 | 
 84 | -- Create a policy that allows anyone to read crawled_pages
 85 | create policy "Allow public read access to crawled_pages"
 86 |   on crawled_pages
 87 |   for select
 88 |   to public
 89 |   using (true);
 90 | 
 91 | -- Enable RLS on the sources table
 92 | alter table sources enable row level security;
 93 | 
 94 | -- Create a policy that allows anyone to read sources
 95 | create policy "Allow public read access to sources"
 96 |   on sources
 97 |   for select
 98 |   to public
 99 |   using (true);
100 | 
101 | -- Create the code_examples table
102 | create table code_examples (
103 |     id bigserial primary key,
104 |     url varchar not null,
105 |     chunk_number integer not null,
106 |     content text not null,  -- The code example content
107 |     summary text not null,  -- Summary of the code example
108 |     metadata jsonb not null default '{}'::jsonb,
109 |     source_id text not null,
110 |     embedding vector(1536),  -- OpenAI embeddings are 1536 dimensions
111 |     created_at timestamp with time zone default timezone('utc'::text, now()) not null,
112 |     
113 |     -- Add a unique constraint to prevent duplicate chunks for the same URL
114 |     unique(url, chunk_number),
115 |     
116 |     -- Add foreign key constraint to sources table
117 |     foreign key (source_id) references sources(source_id)
118 | );
119 | 
120 | -- Create an index for better vector similarity search performance
121 | create index on code_examples using ivfflat (embedding vector_cosine_ops);
122 | 
123 | -- Create an index on metadata for faster filtering
124 | create index idx_code_examples_metadata on code_examples using gin (metadata);
125 | 
126 | -- Create an index on source_id for faster filtering
127 | CREATE INDEX idx_code_examples_source_id ON code_examples (source_id);
128 | 
129 | -- Create a function to search for code examples
130 | create or replace function match_code_examples (
131 |   query_embedding vector(1536),
132 |   match_count int default 10,
133 |   filter jsonb DEFAULT '{}'::jsonb,
134 |   source_filter text DEFAULT NULL
135 | ) returns table (
136 |   id bigint,
137 |   url varchar,
138 |   chunk_number integer,
139 |   content text,
140 |   summary text,
141 |   metadata jsonb,
142 |   source_id text,
143 |   similarity float
144 | )
145 | language plpgsql
146 | as $$
147 | #variable_conflict use_column
148 | begin
149 |   return query
150 |   select
151 |     id,
152 |     url,
153 |     chunk_number,
154 |     content,
155 |     summary,
156 |     metadata,
157 |     source_id,
158 |     1 - (code_examples.embedding <=> query_embedding) as similarity
159 |   from code_examples
160 |   where metadata @> filter
161 |     AND (source_filter IS NULL OR source_id = source_filter)
162 |   order by code_examples.embedding <=> query_embedding
163 |   limit match_count;
164 | end;
165 | $$;
166 | 
167 | -- Enable RLS on the code_examples table
168 | alter table code_examples enable row level security;
169 | 
170 | -- Create a policy that allows anyone to read code_examples
171 | create policy "Allow public read access to code_examples"
172 |   on code_examples
173 |   for select
174 |   to public
175 |   using (true);
```
--------------------------------------------------------------------------------
/knowledge_graphs/test_script.py:
--------------------------------------------------------------------------------
```python
  1 | from __future__ import annotations
  2 | from typing import Dict, List, Optional
  3 | from dataclasses import dataclass
  4 | from pydantic import BaseModel, Field
  5 | from dotenv import load_dotenv
  6 | from rich.markdown import Markdown
  7 | from rich.console import Console
  8 | from rich.live import Live
  9 | import asyncio
 10 | import os
 11 | 
 12 | from pydantic_ai.providers.openai import OpenAIProvider
 13 | from pydantic_ai.models.openai import OpenAIModel
 14 | from pydantic_ai import Agent, RunContext
 15 | from graphiti_core import Graphiti
 16 | 
 17 | load_dotenv()
 18 | 
 19 | # ========== Define dependencies ==========
 20 | @dataclass
 21 | class GraphitiDependencies:
 22 |     """Dependencies for the Graphiti agent."""
 23 |     graphiti_client: Graphiti
 24 | 
 25 | # ========== Helper function to get model configuration ==========
 26 | def get_model():
 27 |     """Configure and return the LLM model to use."""
 28 |     model_choice = os.getenv('MODEL_CHOICE', 'gpt-4.1-mini')
 29 |     api_key = os.getenv('OPENAI_API_KEY', 'no-api-key-provided')
 30 | 
 31 |     return OpenAIModel(model_choice, provider=OpenAIProvider(api_key=api_key))
 32 | 
 33 | # ========== Create the Graphiti agent ==========
 34 | graphiti_agent = Agent(
 35 |     get_model(),
 36 |     system_prompt="""You are a helpful assistant with access to a knowledge graph filled with temporal data about LLMs.
 37 |     When the user asks you a question, use your search tool to query the knowledge graph and then answer honestly.
 38 |     Be willing to admit when you didn't find the information necessary to answer the question.""",
 39 |     deps_type=GraphitiDependencies
 40 | )
 41 | 
 42 | # ========== Define a result model for Graphiti search ==========
 43 | class GraphitiSearchResult(BaseModel):
 44 |     """Model representing a search result from Graphiti."""
 45 |     uuid: str = Field(description="The unique identifier for this fact")
 46 |     fact: str = Field(description="The factual statement retrieved from the knowledge graph")
 47 |     valid_at: Optional[str] = Field(None, description="When this fact became valid (if known)")
 48 |     invalid_at: Optional[str] = Field(None, description="When this fact became invalid (if known)")
 49 |     source_node_uuid: Optional[str] = Field(None, description="UUID of the source node")
 50 | 
 51 | # ========== Graphiti search tool ==========
 52 | @graphiti_agent.tool
 53 | async def search_graphiti(ctx: RunContext[GraphitiDependencies], query: str) -> List[GraphitiSearchResult]:
 54 |     """Search the Graphiti knowledge graph with the given query.
 55 |     
 56 |     Args:
 57 |         ctx: The run context containing dependencies
 58 |         query: The search query to find information in the knowledge graph
 59 |         
 60 |     Returns:
 61 |         A list of search results containing facts that match the query
 62 |     """
 63 |     # Access the Graphiti client from dependencies
 64 |     graphiti = ctx.deps.graphiti_client
 65 |     
 66 |     try:
 67 |         # Perform the search
 68 |         results = await graphiti.search(query)
 69 |         
 70 |         # Format the results
 71 |         formatted_results = []
 72 |         for result in results:
 73 |             formatted_result = GraphitiSearchResult(
 74 |                 uuid=result.uuid,
 75 |                 fact=result.fact,
 76 |                 source_node_uuid=result.source_node_uuid if hasattr(result, 'source_node_uuid') else None
 77 |             )
 78 |             
 79 |             # Add temporal information if available
 80 |             if hasattr(result, 'valid_at') and result.valid_at:
 81 |                 formatted_result.valid_at = str(result.valid_at)
 82 |             if hasattr(result, 'invalid_at') and result.invalid_at:
 83 |                 formatted_result.invalid_at = str(result.invalid_at)
 84 |             
 85 |             formatted_results.append(formatted_result)
 86 |         
 87 |         return formatted_results
 88 |     except Exception as e:
 89 |         # Log the error but don't close the connection since it's managed by the dependency
 90 |         print(f"Error searching Graphiti: {str(e)}")
 91 |         raise
 92 | 
 93 | # ========== Main execution function ==========
 94 | async def main():
 95 |     """Run the Graphiti agent with user queries."""
 96 |     print("Graphiti Agent - Powered by Pydantic AI, Graphiti, and Neo4j")
 97 |     print("Enter 'exit' to quit the program.")
 98 | 
 99 |     # Neo4j connection parameters
100 |     neo4j_uri = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
101 |     neo4j_user = os.environ.get('NEO4J_USER', 'neo4j')
102 |     neo4j_password = os.environ.get('NEO4J_PASSWORD', 'password')
103 |     
104 |     # Initialize Graphiti with Neo4j connection
105 |     graphiti_client = Graphiti(neo4j_uri, neo4j_user, neo4j_password)
106 |     
107 |     # Initialize the graph database with graphiti's indices if needed
108 |     try:
109 |         await graphiti_client.build_indices_and_constraints()
110 |         print("Graphiti indices built successfully.")
111 |     except Exception as e:
112 |         print(f"Note: {str(e)}")
113 |         print("Continuing with existing indices...")
114 | 
115 |     console = Console()
116 |     messages = []
117 |     
118 |     try:
119 |         while True:
120 |             # Get user input
121 |             user_input = input("\n[You] ")
122 |             
123 |             # Check if user wants to exit
124 |             if user_input.lower() in ['exit', 'quit', 'bye', 'goodbye']:
125 |                 print("Goodbye!")
126 |                 break
127 |             
128 |             try:
129 |                 # Process the user input and output the response
130 |                 print("\n[Assistant]")
131 |                 with Live('', console=console, vertical_overflow='visible') as live:
132 |                     # Pass the Graphiti client as a dependency
133 |                     deps = GraphitiDependencies(graphiti_client=graphiti_client)
134 |                     
135 |                     async with graphiti_agent.run_a_stream(
136 |                         user_input, message_history=messages, deps=deps
137 |                     ) as result:
138 |                         curr_message = ""
139 |                         async for message in result.stream_text(delta=True):
140 |                             curr_message += message
141 |                             live.update(Markdown(curr_message))
142 |                     
143 |                     # Add the new messages to the chat history
144 |                     messages.extend(result.all_messages())
145 |                 
146 |             except Exception as e:
147 |                 print(f"\n[Error] An error occurred: {str(e)}")
148 |     finally:
149 |         # Close the Graphiti connection when done
150 |         await graphiti_client.close()
151 |         print("\nGraphiti connection closed.")
152 | 
153 | if __name__ == "__main__":
154 |     try:
155 |         asyncio.run(main())
156 |     except KeyboardInterrupt:
157 |         print("\nProgram terminated by user.")
158 |     except Exception as e:
159 |         print(f"\nUnexpected error: {str(e)}")
160 |         raise
161 | 
```
--------------------------------------------------------------------------------
/knowledge_graphs/ai_hallucination_detector.py:
--------------------------------------------------------------------------------
```python
  1 | """
  2 | AI Hallucination Detector
  3 | 
  4 | Main orchestrator for detecting AI coding assistant hallucinations in Python scripts.
  5 | Combines AST analysis, knowledge graph validation, and comprehensive reporting.
  6 | """
  7 | 
  8 | import asyncio
  9 | import argparse
 10 | import logging
 11 | import os
 12 | import sys
 13 | from pathlib import Path
 14 | from typing import Optional, List
 15 | 
 16 | from dotenv import load_dotenv
 17 | 
 18 | from ai_script_analyzer import AIScriptAnalyzer, analyze_ai_script
 19 | from knowledge_graph_validator import KnowledgeGraphValidator
 20 | from hallucination_reporter import HallucinationReporter
 21 | 
 22 | # Configure logging
 23 | logging.basicConfig(
 24 |     level=logging.INFO,
 25 |     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
 26 |     datefmt='%Y-%m-%d %H:%M:%S'
 27 | )
 28 | logger = logging.getLogger(__name__)
 29 | 
 30 | 
 31 | class AIHallucinationDetector:
 32 |     """Main detector class that orchestrates the entire process"""
 33 |     
 34 |     def __init__(self, neo4j_uri: str, neo4j_user: str, neo4j_password: str):
 35 |         self.validator = KnowledgeGraphValidator(neo4j_uri, neo4j_user, neo4j_password)
 36 |         self.reporter = HallucinationReporter()
 37 |         self.analyzer = AIScriptAnalyzer()
 38 |     
 39 |     async def initialize(self):
 40 |         """Initialize connections and components"""
 41 |         await self.validator.initialize()
 42 |         logger.info("AI Hallucination Detector initialized successfully")
 43 |     
 44 |     async def close(self):
 45 |         """Close connections"""
 46 |         await self.validator.close()
 47 |     
 48 |     async def detect_hallucinations(self, script_path: str, 
 49 |                                   output_dir: Optional[str] = None,
 50 |                                   save_json: bool = True,
 51 |                                   save_markdown: bool = True,
 52 |                                   print_summary: bool = True) -> dict:
 53 |         """
 54 |         Main detection function that analyzes a script and generates reports
 55 |         
 56 |         Args:
 57 |             script_path: Path to the AI-generated Python script
 58 |             output_dir: Directory to save reports (defaults to script directory)
 59 |             save_json: Whether to save JSON report
 60 |             save_markdown: Whether to save Markdown report
 61 |             print_summary: Whether to print summary to console
 62 |         
 63 |         Returns:
 64 |             Complete validation report as dictionary
 65 |         """
 66 |         logger.info(f"Starting hallucination detection for: {script_path}")
 67 |         
 68 |         # Validate input
 69 |         if not os.path.exists(script_path):
 70 |             raise FileNotFoundError(f"Script not found: {script_path}")
 71 |         
 72 |         if not script_path.endswith('.py'):
 73 |             raise ValueError("Only Python (.py) files are supported")
 74 |         
 75 |         # Set output directory
 76 |         if output_dir is None:
 77 |             output_dir = str(Path(script_path).parent)
 78 |         
 79 |         os.makedirs(output_dir, exist_ok=True)
 80 |         
 81 |         try:
 82 |             # Step 1: Analyze the script using AST
 83 |             logger.info("Step 1: Analyzing script structure...")
 84 |             analysis_result = self.analyzer.analyze_script(script_path)
 85 |             
 86 |             if analysis_result.errors:
 87 |                 logger.warning(f"Analysis warnings: {analysis_result.errors}")
 88 |             
 89 |             logger.info(f"Found: {len(analysis_result.imports)} imports, "
 90 |                        f"{len(analysis_result.class_instantiations)} class instantiations, "
 91 |                        f"{len(analysis_result.method_calls)} method calls, "
 92 |                        f"{len(analysis_result.function_calls)} function calls, "
 93 |                        f"{len(analysis_result.attribute_accesses)} attribute accesses")
 94 |             
 95 |             # Step 2: Validate against knowledge graph
 96 |             logger.info("Step 2: Validating against knowledge graph...")
 97 |             validation_result = await self.validator.validate_script(analysis_result)
 98 |             
 99 |             logger.info(f"Validation complete. Overall confidence: {validation_result.overall_confidence:.1%}")
100 |             
101 |             # Step 3: Generate comprehensive report
102 |             logger.info("Step 3: Generating reports...")
103 |             report = self.reporter.generate_comprehensive_report(validation_result)
104 |             
105 |             # Step 4: Save reports
106 |             script_name = Path(script_path).stem
107 |             
108 |             if save_json:
109 |                 json_path = os.path.join(output_dir, f"{script_name}_hallucination_report.json")
110 |                 self.reporter.save_json_report(report, json_path)
111 |             
112 |             if save_markdown:
113 |                 md_path = os.path.join(output_dir, f"{script_name}_hallucination_report.md")
114 |                 self.reporter.save_markdown_report(report, md_path)
115 |             
116 |             # Step 5: Print summary
117 |             if print_summary:
118 |                 self.reporter.print_summary(report)
119 |             
120 |             logger.info("Hallucination detection completed successfully")
121 |             return report
122 |             
123 |         except Exception as e:
124 |             logger.error(f"Error during hallucination detection: {str(e)}")
125 |             raise
126 |     
127 |     async def batch_detect(self, script_paths: List[str], 
128 |                           output_dir: Optional[str] = None) -> List[dict]:
129 |         """
130 |         Detect hallucinations in multiple scripts
131 |         
132 |         Args:
133 |             script_paths: List of paths to Python scripts
134 |             output_dir: Directory to save all reports
135 |         
136 |         Returns:
137 |             List of validation reports
138 |         """
139 |         logger.info(f"Starting batch detection for {len(script_paths)} scripts")
140 |         
141 |         results = []
142 |         for i, script_path in enumerate(script_paths, 1):
143 |             logger.info(f"Processing script {i}/{len(script_paths)}: {script_path}")
144 |             
145 |             try:
146 |                 result = await self.detect_hallucinations(
147 |                     script_path=script_path,
148 |                     output_dir=output_dir,
149 |                     print_summary=False  # Don't print individual summaries in batch mode
150 |                 )
151 |                 results.append(result)
152 |                 
153 |             except Exception as e:
154 |                 logger.error(f"Failed to process {script_path}: {str(e)}")
155 |                 # Continue with other scripts
156 |                 continue
157 |         
158 |         # Print batch summary
159 |         self._print_batch_summary(results)
160 |         
161 |         return results
162 |     
163 |     def _print_batch_summary(self, results: List[dict]):
164 |         """Print summary of batch processing results"""
165 |         if not results:
166 |             print("No scripts were successfully processed.")
167 |             return
168 |         
169 |         print("\n" + "="*80)
170 |         print("🚀 BATCH HALLUCINATION DETECTION SUMMARY")
171 |         print("="*80)
172 |         
173 |         total_scripts = len(results)
174 |         total_validations = sum(r['validation_summary']['total_validations'] for r in results)
175 |         total_valid = sum(r['validation_summary']['valid_count'] for r in results)
176 |         total_invalid = sum(r['validation_summary']['invalid_count'] for r in results)
177 |         total_not_found = sum(r['validation_summary']['not_found_count'] for r in results)
178 |         total_hallucinations = sum(len(r['hallucinations_detected']) for r in results)
179 |         
180 |         avg_confidence = sum(r['validation_summary']['overall_confidence'] for r in results) / total_scripts
181 |         
182 |         print(f"Scripts Processed: {total_scripts}")
183 |         print(f"Total Validations: {total_validations}")
184 |         print(f"Average Confidence: {avg_confidence:.1%}")
185 |         print(f"Total Hallucinations: {total_hallucinations}")
186 |         
187 |         print(f"\nAggregated Results:")
188 |         print(f"  ✅ Valid: {total_valid} ({total_valid/total_validations:.1%})")
189 |         print(f"  ❌ Invalid: {total_invalid} ({total_invalid/total_validations:.1%})")
190 |         print(f"  🔍 Not Found: {total_not_found} ({total_not_found/total_validations:.1%})")
191 |         
192 |         # Show worst performing scripts
193 |         print(f"\n🚨 Scripts with Most Hallucinations:")
194 |         sorted_results = sorted(results, key=lambda x: len(x['hallucinations_detected']), reverse=True)
195 |         for result in sorted_results[:5]:
196 |             script_name = Path(result['analysis_metadata']['script_path']).name
197 |             hall_count = len(result['hallucinations_detected'])
198 |             confidence = result['validation_summary']['overall_confidence']
199 |             print(f"  - {script_name}: {hall_count} hallucinations ({confidence:.1%} confidence)")
200 |         
201 |         print("="*80)
202 | 
203 | 
204 | async def main():
205 |     """Command-line interface for the AI Hallucination Detector"""
206 |     parser = argparse.ArgumentParser(
207 |         description="Detect AI coding assistant hallucinations in Python scripts",
208 |         formatter_class=argparse.RawDescriptionHelpFormatter,
209 |         epilog="""
210 | Examples:
211 |   # Analyze single script
212 |   python ai_hallucination_detector.py script.py
213 |   
214 |   # Analyze multiple scripts
215 |   python ai_hallucination_detector.py script1.py script2.py script3.py
216 |   
217 |   # Specify output directory
218 |   python ai_hallucination_detector.py script.py --output-dir reports/
219 |   
220 |   # Skip markdown report
221 |   python ai_hallucination_detector.py script.py --no-markdown
222 |         """
223 |     )
224 |     
225 |     parser.add_argument(
226 |         'scripts',
227 |         nargs='+',
228 |         help='Python script(s) to analyze for hallucinations'
229 |     )
230 |     
231 |     parser.add_argument(
232 |         '--output-dir',
233 |         help='Directory to save reports (defaults to script directory)'
234 |     )
235 |     
236 |     parser.add_argument(
237 |         '--no-json',
238 |         action='store_true',
239 |         help='Skip JSON report generation'
240 |     )
241 |     
242 |     parser.add_argument(
243 |         '--no-markdown',
244 |         action='store_true',
245 |         help='Skip Markdown report generation'
246 |     )
247 |     
248 |     parser.add_argument(
249 |         '--no-summary',
250 |         action='store_true',
251 |         help='Skip printing summary to console'
252 |     )
253 |     
254 |     parser.add_argument(
255 |         '--neo4j-uri',
256 |         default=None,
257 |         help='Neo4j URI (default: from environment NEO4J_URI)'
258 |     )
259 |     
260 |     parser.add_argument(
261 |         '--neo4j-user',
262 |         default=None,
263 |         help='Neo4j username (default: from environment NEO4J_USER)'
264 |     )
265 |     
266 |     parser.add_argument(
267 |         '--neo4j-password',
268 |         default=None,
269 |         help='Neo4j password (default: from environment NEO4J_PASSWORD)'
270 |     )
271 |     
272 |     parser.add_argument(
273 |         '--verbose',
274 |         action='store_true',
275 |         help='Enable verbose logging'
276 |     )
277 |     
278 |     args = parser.parse_args()
279 |     
280 |     if args.verbose:
281 |         logging.getLogger().setLevel(logging.DEBUG)
282 |         # Only enable debug for our modules, not neo4j
283 |         logging.getLogger('neo4j').setLevel(logging.WARNING)
284 |         logging.getLogger('neo4j.pool').setLevel(logging.WARNING)
285 |         logging.getLogger('neo4j.io').setLevel(logging.WARNING)
286 |     
287 |     # Load environment variables
288 |     load_dotenv()
289 |     
290 |     # Get Neo4j credentials
291 |     neo4j_uri = args.neo4j_uri or os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
292 |     neo4j_user = args.neo4j_user or os.environ.get('NEO4J_USER', 'neo4j')
293 |     neo4j_password = args.neo4j_password or os.environ.get('NEO4J_PASSWORD', 'password')
294 |     
295 |     if not neo4j_password or neo4j_password == 'password':
296 |         logger.error("Please set NEO4J_PASSWORD environment variable or use --neo4j-password")
297 |         sys.exit(1)
298 |     
299 |     # Initialize detector
300 |     detector = AIHallucinationDetector(neo4j_uri, neo4j_user, neo4j_password)
301 |     
302 |     try:
303 |         await detector.initialize()
304 |         
305 |         # Process scripts
306 |         if len(args.scripts) == 1:
307 |             # Single script mode
308 |             await detector.detect_hallucinations(
309 |                 script_path=args.scripts[0],
310 |                 output_dir=args.output_dir,
311 |                 save_json=not args.no_json,
312 |                 save_markdown=not args.no_markdown,
313 |                 print_summary=not args.no_summary
314 |             )
315 |         else:
316 |             # Batch mode
317 |             await detector.batch_detect(
318 |                 script_paths=args.scripts,
319 |                 output_dir=args.output_dir
320 |             )
321 |     
322 |     except KeyboardInterrupt:
323 |         logger.info("Detection interrupted by user")
324 |         sys.exit(1)
325 |     
326 |     except Exception as e:
327 |         logger.error(f"Detection failed: {str(e)}")
328 |         sys.exit(1)
329 |     
330 |     finally:
331 |         await detector.close()
332 | 
333 | 
334 | if __name__ == "__main__":
335 |     asyncio.run(main())
```
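
The detector above is primarily driven through its CLI, but the same entry points can also be called directly. Below is a minimal sketch of programmatic use, assuming `ai_hallucination_detector.py` is importable and a Neo4j instance is reachable; the credentials and the script path are placeholders.

```python
# Minimal programmatic sketch (assumes ai_hallucination_detector.py is importable
# and a Neo4j instance is reachable; credentials and paths below are placeholders).
import asyncio

from ai_hallucination_detector import AIHallucinationDetector


async def check_one_script() -> dict:
    detector = AIHallucinationDetector(
        neo4j_uri="bolt://localhost:7687",   # placeholder
        neo4j_user="neo4j",                  # placeholder
        neo4j_password="your-password",      # placeholder
    )
    await detector.initialize()
    try:
        # Returns the full report dict and, by default, writes JSON/Markdown
        # reports next to the analyzed script.
        return await detector.detect_hallucinations("example_agent.py")  # placeholder path
    finally:
        await detector.close()


if __name__ == "__main__":
    report = asyncio.run(check_one_script())
    print(report["validation_summary"])
```

The same `detector` instance can be reused across calls, which is what `batch_detect` does internally.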
--------------------------------------------------------------------------------
/knowledge_graphs/query_knowledge_graph.py:
--------------------------------------------------------------------------------
```python
  1 | #!/usr/bin/env python3
  2 | """
  3 | Knowledge Graph Query Tool
  4 | 
  5 | Interactive script to explore what's actually stored in your Neo4j knowledge graph.
  6 | Useful for debugging hallucination detection and understanding graph contents.
  7 | """
  8 | 
  9 | import asyncio
 10 | import os
 11 | from dotenv import load_dotenv
 12 | from neo4j import AsyncGraphDatabase
 13 | from typing import List, Dict, Any
 14 | import argparse
 15 | import sys
 16 | 
 17 | class KnowledgeGraphQuerier:
 18 |     """Interactive tool to query the knowledge graph"""
 19 |     
 20 |     def __init__(self, neo4j_uri: str, neo4j_user: str, neo4j_password: str):
 21 |         self.neo4j_uri = neo4j_uri
 22 |         self.neo4j_user = neo4j_user
 23 |         self.neo4j_password = neo4j_password
 24 |         self.driver = None
 25 |     
 26 |     async def initialize(self):
 27 |         """Initialize Neo4j connection"""
 28 |         self.driver = AsyncGraphDatabase.driver(
 29 |             self.neo4j_uri, 
 30 |             auth=(self.neo4j_user, self.neo4j_password)
 31 |         )
 32 |         print("🔗 Connected to Neo4j knowledge graph")
 33 |     
 34 |     async def close(self):
 35 |         """Close Neo4j connection"""
 36 |         if self.driver:
 37 |             await self.driver.close()
 38 |     
 39 |     async def list_repositories(self):
 40 |         """List all repositories in the knowledge graph"""
 41 |         print("\n📚 Repositories in Knowledge Graph:")
 42 |         print("=" * 50)
 43 |         
 44 |         async with self.driver.session() as session:
 45 |             query = "MATCH (r:Repository) RETURN r.name as name ORDER BY r.name"
 46 |             result = await session.run(query)
 47 |             
 48 |             repos = []
 49 |             async for record in result:
 50 |                 repos.append(record['name'])
 51 |             
 52 |             if repos:
 53 |                 for i, repo in enumerate(repos, 1):
 54 |                     print(f"{i}. {repo}")
 55 |             else:
 56 |                 print("No repositories found in knowledge graph.")
 57 |         
 58 |         return repos
 59 |     
 60 |     async def explore_repository(self, repo_name: str):
 61 |         """Get overview of a specific repository"""
 62 |         print(f"\n🔍 Exploring Repository: {repo_name}")
 63 |         print("=" * 60)
 64 |         
 65 |         async with self.driver.session() as session:
 66 |             # Get file count
 67 |             files_query = """
 68 |             MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)
 69 |             RETURN count(f) as file_count
 70 |             """
 71 |             result = await session.run(files_query, repo_name=repo_name)
 72 |             file_count = (await result.single())['file_count']
 73 |             
 74 |             # Get class count
 75 |             classes_query = """
 76 |             MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
 77 |             RETURN count(DISTINCT c) as class_count
 78 |             """
 79 |             result = await session.run(classes_query, repo_name=repo_name)
 80 |             class_count = (await result.single())['class_count']
 81 |             
 82 |             # Get function count
 83 |             functions_query = """
 84 |             MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(func:Function)
 85 |             RETURN count(DISTINCT func) as function_count
 86 |             """
 87 |             result = await session.run(functions_query, repo_name=repo_name)
 88 |             function_count = (await result.single())['function_count']
 89 |             
 90 |             print(f"📄 Files: {file_count}")
 91 |             print(f"🏗️  Classes: {class_count}")
 92 |             print(f"⚙️  Functions: {function_count}")
 93 |     
 94 |     async def list_classes(self, repo_name: str = None, limit: int = 20):
 95 |         """List classes in the knowledge graph"""
 96 |         title = f"Classes in {repo_name}" if repo_name else "All Classes"
 97 |         print(f"\n🏗️  {title} (limit {limit}):")
 98 |         print("=" * 50)
 99 |         
100 |         async with self.driver.session() as session:
101 |             if repo_name:
102 |                 query = """
103 |                 MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
104 |                 RETURN c.name as name, c.full_name as full_name
105 |                 ORDER BY c.name
106 |                 LIMIT $limit
107 |                 """
108 |                 result = await session.run(query, repo_name=repo_name, limit=limit)
109 |             else:
110 |                 query = """
111 |                 MATCH (c:Class)
112 |                 RETURN c.name as name, c.full_name as full_name
113 |                 ORDER BY c.name
114 |                 LIMIT $limit
115 |                 """
116 |                 result = await session.run(query, limit=limit)
117 |             
118 |             classes = []
119 |             async for record in result:
120 |                 classes.append({
121 |                     'name': record['name'],
122 |                     'full_name': record['full_name']
123 |                 })
124 |             
125 |             if classes:
126 |                 for i, cls in enumerate(classes, 1):
127 |                     print(f"{i:2d}. {cls['name']} ({cls['full_name']})")
128 |             else:
129 |                 print("No classes found.")
130 |         
131 |         return classes
132 |     
133 |     async def explore_class(self, class_name: str):
134 |         """Get detailed information about a specific class"""
135 |         print(f"\n🔍 Exploring Class: {class_name}")
136 |         print("=" * 60)
137 |         
138 |         async with self.driver.session() as session:
139 |             # Find the class
140 |             class_query = """
141 |             MATCH (c:Class)
142 |             WHERE c.name = $class_name OR c.full_name = $class_name
143 |             RETURN c.name as name, c.full_name as full_name
144 |             LIMIT 1
145 |             """
146 |             result = await session.run(class_query, class_name=class_name)
147 |             class_record = await result.single()
148 |             
149 |             if not class_record:
150 |                 print(f"❌ Class '{class_name}' not found in knowledge graph.")
151 |                 return None
152 |             
153 |             actual_name = class_record['name']
154 |             full_name = class_record['full_name']
155 |             
156 |             print(f"📋 Name: {actual_name}")
157 |             print(f"📋 Full Name: {full_name}")
158 |             
159 |             # Get methods
160 |             methods_query = """
161 |             MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
162 |             WHERE c.name = $class_name OR c.full_name = $class_name
163 |             RETURN m.name as name, m.params_list as params_list, m.params_detailed as params_detailed, m.return_type as return_type
164 |             ORDER BY m.name
165 |             """
166 |             result = await session.run(methods_query, class_name=class_name)
167 |             
168 |             methods = []
169 |             async for record in result:
170 |                 methods.append({
171 |                     'name': record['name'],
172 |                     'params_list': record['params_list'] or [],
173 |                     'params_detailed': record['params_detailed'] or [],
174 |                     'return_type': record['return_type'] or 'Any'
175 |                 })
176 |             
177 |             if methods:
178 |                 print(f"\n⚙️  Methods ({len(methods)}):")
179 |                 for i, method in enumerate(methods, 1):
180 |                     # Use detailed params if available, fall back to simple params
181 |                     params_to_show = method['params_detailed'] or method['params_list']
182 |                     params = ', '.join(params_to_show) if params_to_show else ''
183 |                     print(f"{i:2d}. {method['name']}({params}) -> {method['return_type']}")
184 |             else:
185 |                 print("\n⚙️  No methods found.")
186 |             
187 |             # Get attributes
188 |             attributes_query = """
189 |             MATCH (c:Class)-[:HAS_ATTRIBUTE]->(a:Attribute)
190 |             WHERE c.name = $class_name OR c.full_name = $class_name
191 |             RETURN a.name as name, a.type as type
192 |             ORDER BY a.name
193 |             """
194 |             result = await session.run(attributes_query, class_name=class_name)
195 |             
196 |             attributes = []
197 |             async for record in result:
198 |                 attributes.append({
199 |                     'name': record['name'],
200 |                     'type': record['type'] or 'Any'
201 |                 })
202 |             
203 |             if attributes:
204 |                 print(f"\n📋 Attributes ({len(attributes)}):")
205 |                 for i, attr in enumerate(attributes, 1):
206 |                     print(f"{i:2d}. {attr['name']}: {attr['type']}")
207 |             else:
208 |                 print("\n📋 No attributes found.")
209 |         
210 |         return {'methods': methods, 'attributes': attributes}
211 |     
212 |     async def search_method(self, method_name: str, class_name: str = None):
213 |         """Search for methods by name"""
214 |         title = f"Method '{method_name}'"
215 |         if class_name:
216 |             title += f" in class '{class_name}'"
217 |         
218 |         print(f"\n🔍 Searching for {title}:")
219 |         print("=" * 60)
220 |         
221 |         async with self.driver.session() as session:
222 |             if class_name:
223 |                 query = """
224 |                 MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
225 |                 WHERE (c.name = $class_name OR c.full_name = $class_name)
226 |                   AND m.name = $method_name
227 |                 RETURN c.name as class_name, c.full_name as class_full_name,
228 |                        m.name as method_name, m.params_list as params_list, 
229 |                        m.return_type as return_type, m.args as args
230 |                 """
231 |                 result = await session.run(query, class_name=class_name, method_name=method_name)
232 |             else:
233 |                 query = """
234 |                 MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
235 |                 WHERE m.name = $method_name
236 |                 RETURN c.name as class_name, c.full_name as class_full_name,
237 |                        m.name as method_name, m.params_list as params_list, 
238 |                        m.return_type as return_type, m.args as args
239 |                 ORDER BY c.name
240 |                 """
241 |                 result = await session.run(query, method_name=method_name)
242 |             
243 |             methods = []
244 |             async for record in result:
245 |                 methods.append({
246 |                     'class_name': record['class_name'],
247 |                     'class_full_name': record['class_full_name'],
248 |                     'method_name': record['method_name'],
249 |                     'params_list': record['params_list'] or [],
250 |                     'return_type': record['return_type'] or 'Any',
251 |                     'args': record['args'] or []
252 |                 })
253 |             
254 |             if methods:
255 |                 for i, method in enumerate(methods, 1):
256 |                     params = ', '.join(method['params_list']) if method['params_list'] else ''
257 |                     print(f"{i}. {method['class_full_name']}.{method['method_name']}({params}) -> {method['return_type']}")
258 |                     if method['args']:
259 |                         print(f"   Legacy args: {method['args']}")
260 |             else:
261 |                 print(f"❌ Method '{method_name}' not found.")
262 |         
263 |         return methods
264 |     
265 |     async def run_custom_query(self, query: str):
266 |         """Run a custom Cypher query"""
267 |         print(f"\n🔍 Running Custom Query:")
268 |         print("=" * 60)
269 |         print(f"Query: {query}")
270 |         print("-" * 60)
271 |         
272 |         async with self.driver.session() as session:
273 |             try:
274 |                 result = await session.run(query)
275 |                 
276 |                 records = []
277 |                 async for record in result:
278 |                     records.append(dict(record))
279 |                 
280 |                 if records:
281 |                     for i, record in enumerate(records, 1):
282 |                         print(f"{i:2d}. {record}")
283 |                         if i >= 20:  # Limit output
284 |                             print(f"... and {len(records) - 20} more records")
285 |                             break
286 |                 else:
287 |                     print("No results found.")
288 |                 
289 |                 return records
290 |                 
291 |             except Exception as e:
292 |                 print(f"❌ Query error: {str(e)}")
293 |                 return None
294 | 
295 | 
296 | async def interactive_mode(querier: KnowledgeGraphQuerier):
297 |     """Interactive exploration mode"""
298 |     print("\n🚀 Welcome to Knowledge Graph Explorer!")
299 |     print("Available commands:")
300 |     print("  repos          - List all repositories")
301 |     print("  explore <repo> - Explore a specific repository") 
302 |     print("  classes [repo] - List classes (optionally in specific repo)")
303 |     print("  class <name>   - Explore a specific class")
304 |     print("  method <name> [class] - Search for method")
305 |     print("  query <cypher> - Run custom Cypher query")
306 |     print("  quit           - Exit")
307 |     print()
308 |     
309 |     while True:
310 |         try:
311 |             command = input("🔍 > ").strip()
312 |             
313 |             if not command:
314 |                 continue
315 |             elif command == "quit":
316 |                 break
317 |             elif command == "repos":
318 |                 await querier.list_repositories()
319 |             elif command.startswith("explore "):
320 |                 repo_name = command[8:].strip()
321 |                 await querier.explore_repository(repo_name)
322 |             elif command == "classes":
323 |                 await querier.list_classes()
324 |             elif command.startswith("classes "):
325 |                 repo_name = command[8:].strip()
326 |                 await querier.list_classes(repo_name)
327 |             elif command.startswith("class "):
328 |                 class_name = command[6:].strip()
329 |                 await querier.explore_class(class_name)
330 |             elif command.startswith("method "):
331 |                 parts = command[7:].strip().split()
332 |                 if len(parts) >= 2:
333 |                     await querier.search_method(parts[0], parts[1])
334 |                 else:
335 |                     await querier.search_method(parts[0])
336 |             elif command.startswith("query "):
337 |                 query = command[6:].strip()
338 |                 await querier.run_custom_query(query)
339 |             else:
340 |                 print("❌ Unknown command. Type 'quit' to exit.")
341 |                 
342 |         except KeyboardInterrupt:
343 |             print("\n👋 Goodbye!")
344 |             break
345 |         except Exception as e:
346 |             print(f"❌ Error: {str(e)}")
347 | 
348 | 
349 | async def main():
350 |     """Main function with CLI argument support"""
351 |     parser = argparse.ArgumentParser(description="Query the knowledge graph")
352 |     parser.add_argument('--repos', action='store_true', help='List repositories')
353 |     parser.add_argument('--classes', metavar='REPO', nargs='?', const='', help='List classes')
354 |     parser.add_argument('--explore', metavar='REPO', help='Explore repository')
355 |     parser.add_argument('--class', dest='class_name', metavar='NAME', help='Explore class')
356 |     parser.add_argument('--method', nargs='+', metavar=('NAME', 'CLASS'), help='Search method')
357 |     parser.add_argument('--query', metavar='CYPHER', help='Run custom query')
358 |     parser.add_argument('--interactive', action='store_true', help='Interactive mode')
359 |     
360 |     args = parser.parse_args()
361 |     
362 |     # Load environment
363 |     load_dotenv()
364 |     neo4j_uri = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
365 |     neo4j_user = os.environ.get('NEO4J_USER', 'neo4j')
366 |     neo4j_password = os.environ.get('NEO4J_PASSWORD', 'password')
367 |     
368 |     querier = KnowledgeGraphQuerier(neo4j_uri, neo4j_user, neo4j_password)
369 |     
370 |     try:
371 |         await querier.initialize()
372 |         
373 |         # Execute commands based on arguments
374 |         if args.repos:
375 |             await querier.list_repositories()
376 |         elif args.classes is not None:
377 |             await querier.list_classes(args.classes if args.classes else None)
378 |         elif args.explore:
379 |             await querier.explore_repository(args.explore)
380 |         elif args.class_name:
381 |             await querier.explore_class(args.class_name)
382 |         elif args.method:
383 |             if len(args.method) >= 2:
384 |                 await querier.search_method(args.method[0], args.method[1])
385 |             else:
386 |                 await querier.search_method(args.method[0])
387 |         elif args.query:
388 |             await querier.run_custom_query(args.query)
389 |         elif args.interactive or len(sys.argv) == 1:
390 |             await interactive_mode(querier)
391 |         else:
392 |             parser.print_help()
393 |     
394 |     finally:
395 |         await querier.close()
396 | 
397 | 
398 | if __name__ == "__main__":
399 |     asyncio.run(main())
```
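
Besides the CLI flags and interactive mode shown above, `KnowledgeGraphQuerier` can be used as a library. A minimal sketch, assuming the module is importable and the usual `NEO4J_*` environment variables are set (the fallback values below are placeholders):

```python
# Minimal programmatic sketch of the querier (connection values are placeholders).
import asyncio
import os

from dotenv import load_dotenv

from query_knowledge_graph import KnowledgeGraphQuerier


async def explore() -> None:
    load_dotenv()
    querier = KnowledgeGraphQuerier(
        os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "password"),
    )
    await querier.initialize()
    try:
        repos = await querier.list_repositories()
        if repos:
            # Drill into the first repository and one of its classes.
            await querier.explore_repository(repos[0])
            classes = await querier.list_classes(repos[0], limit=5)
            if classes:
                await querier.explore_class(classes[0]["full_name"])
    finally:
        await querier.close()


if __name__ == "__main__":
    asyncio.run(explore())
```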
--------------------------------------------------------------------------------
/knowledge_graphs/ai_script_analyzer.py:
--------------------------------------------------------------------------------
```python
  1 | """
  2 | AI Script Analyzer
  3 | 
  4 | Parses Python scripts generated by AI coding assistants using AST to extract:
  5 | - Import statements and their usage
  6 | - Class instantiations and method calls  
  7 | - Function calls with parameters
  8 | - Attribute access patterns
  9 | - Variable type tracking
 10 | """
 11 | 
 12 | import ast
 13 | import logging
 14 | from pathlib import Path
 15 | from typing import Dict, List, Set, Any, Optional, Tuple
 16 | from dataclasses import dataclass, field
 17 | 
 18 | logger = logging.getLogger(__name__)
 19 | 
 20 | 
 21 | @dataclass
 22 | class ImportInfo:
 23 |     """Information about an import statement"""
 24 |     module: str
 25 |     name: str
 26 |     alias: Optional[str] = None
 27 |     is_from_import: bool = False
 28 |     line_number: int = 0
 29 | 
 30 | 
 31 | @dataclass
 32 | class MethodCall:
 33 |     """Information about a method call"""
 34 |     object_name: str
 35 |     method_name: str
 36 |     args: List[str]
 37 |     kwargs: Dict[str, str]
 38 |     line_number: int
 39 |     object_type: Optional[str] = None  # Inferred class type
 40 | 
 41 | 
 42 | @dataclass
 43 | class AttributeAccess:
 44 |     """Information about attribute access"""
 45 |     object_name: str
 46 |     attribute_name: str
 47 |     line_number: int
 48 |     object_type: Optional[str] = None  # Inferred class type
 49 | 
 50 | 
 51 | @dataclass
 52 | class FunctionCall:
 53 |     """Information about a function call"""
 54 |     function_name: str
 55 |     args: List[str]
 56 |     kwargs: Dict[str, str]
 57 |     line_number: int
 58 |     full_name: Optional[str] = None  # Module.function_name
 59 | 
 60 | 
 61 | @dataclass
 62 | class ClassInstantiation:
 63 |     """Information about class instantiation"""
 64 |     variable_name: str
 65 |     class_name: str
 66 |     args: List[str]
 67 |     kwargs: Dict[str, str]
 68 |     line_number: int
 69 |     full_class_name: Optional[str] = None  # Module.ClassName
 70 | 
 71 | 
 72 | @dataclass
 73 | class AnalysisResult:
 74 |     """Complete analysis results for a Python script"""
 75 |     file_path: str
 76 |     imports: List[ImportInfo] = field(default_factory=list)
 77 |     class_instantiations: List[ClassInstantiation] = field(default_factory=list)
 78 |     method_calls: List[MethodCall] = field(default_factory=list)
 79 |     attribute_accesses: List[AttributeAccess] = field(default_factory=list)
 80 |     function_calls: List[FunctionCall] = field(default_factory=list)
 81 |     variable_types: Dict[str, str] = field(default_factory=dict)  # variable_name -> class_type
 82 |     errors: List[str] = field(default_factory=list)
 83 | 
 84 | 
 85 | class AIScriptAnalyzer:
 86 |     """Analyzes AI-generated Python scripts for validation against knowledge graph"""
 87 |     
 88 |     def __init__(self):
 89 |         self.import_map: Dict[str, str] = {}  # alias -> actual_module_name
 90 |         self.variable_types: Dict[str, str] = {}  # variable_name -> class_type
 91 |         self.context_manager_vars: Dict[str, Tuple[int, int, str]] = {}  # var_name -> (start_line, end_line, type)
 92 |         
 93 |     def analyze_script(self, script_path: str) -> AnalysisResult:
 94 |         """Analyze a Python script and extract all relevant information"""
 95 |         try:
 96 |             with open(script_path, 'r', encoding='utf-8') as f:
 97 |                 content = f.read()
 98 |             
 99 |             tree = ast.parse(content)
100 |             result = AnalysisResult(file_path=script_path)
101 |             
102 |             # Reset state for new analysis
103 |             self.import_map.clear()
104 |             self.variable_types.clear()
105 |             self.context_manager_vars.clear()
106 |             
107 |             # Track processed nodes to avoid duplicates
108 |             self.processed_calls = set()
109 |             self.method_call_attributes = set()
110 |             
111 |             # First pass: collect imports and build import map
112 |             for node in ast.walk(tree):
113 |                 if isinstance(node, (ast.Import, ast.ImportFrom)):
114 |                     self._extract_imports(node, result)
115 |             
116 |             # Second pass: analyze usage patterns
117 |             for node in ast.walk(tree):
118 |                 self._analyze_node(node, result)
119 |             
120 |             # Set inferred types on method calls and attribute accesses
121 |             self._infer_object_types(result)
122 |             
123 |             result.variable_types = self.variable_types.copy()
124 |             
125 |             return result
126 |             
127 |         except Exception as e:
128 |             error_msg = f"Failed to analyze script {script_path}: {str(e)}"
129 |             logger.error(error_msg)
130 |             result = AnalysisResult(file_path=script_path)
131 |             result.errors.append(error_msg)
132 |             return result
133 |     
134 |     def _extract_imports(self, node: ast.AST, result: AnalysisResult):
135 |         """Extract import information and build import mapping"""
136 |         line_num = getattr(node, 'lineno', 0)
137 |         
138 |         if isinstance(node, ast.Import):
139 |             for alias in node.names:
140 |                 import_name = alias.name
141 |                 alias_name = alias.asname or import_name
142 |                 
143 |                 result.imports.append(ImportInfo(
144 |                     module=import_name,
145 |                     name=import_name,
146 |                     alias=alias.asname,
147 |                     is_from_import=False,
148 |                     line_number=line_num
149 |                 ))
150 |                 
151 |                 self.import_map[alias_name] = import_name
152 |                 
153 |         elif isinstance(node, ast.ImportFrom):
154 |             module = node.module or ""
155 |             for alias in node.names:
156 |                 import_name = alias.name
157 |                 alias_name = alias.asname or import_name
158 |                 
159 |                 result.imports.append(ImportInfo(
160 |                     module=module,
161 |                     name=import_name,
162 |                     alias=alias.asname,
163 |                     is_from_import=True,
164 |                     line_number=line_num
165 |                 ))
166 |                 
167 |                 # Map alias to full module.name
168 |                 if module:
169 |                     full_name = f"{module}.{import_name}"
170 |                     self.import_map[alias_name] = full_name
171 |                 else:
172 |                     self.import_map[alias_name] = import_name
173 |     
174 |     def _analyze_node(self, node: ast.AST, result: AnalysisResult):
175 |         """Analyze individual AST nodes for usage patterns"""
176 |         line_num = getattr(node, 'lineno', 0)
177 |         
178 |         # Assignments (class instantiations and method call results)
179 |         if isinstance(node, ast.Assign):
180 |             if len(node.targets) == 1 and isinstance(node.targets[0], ast.Name):
181 |                 if isinstance(node.value, ast.Call):
182 |                     # Check if it's a class instantiation or method call
183 |                     if isinstance(node.value.func, ast.Name):
184 |                         # Direct function/class call
185 |                         self._extract_class_instantiation(node, result)
186 |                         # Mark this call as processed to avoid duplicate processing
187 |                         self.processed_calls.add(id(node.value))
188 |                     elif isinstance(node.value.func, ast.Attribute):
189 |                         # Method call - track the variable assignment for type inference
190 |                         var_name = node.targets[0].id
191 |                         self._track_method_result_assignment(node.value, var_name)
192 |                         # Still process the method call
193 |                         self._extract_method_call(node.value, result)
194 |                         self.processed_calls.add(id(node.value))
195 |         
196 |         # AsyncWith statements (context managers)
197 |         elif isinstance(node, ast.AsyncWith):
198 |             self._handle_async_with(node, result)
199 |         elif isinstance(node, ast.With):
200 |             self._handle_with(node, result)
201 |         
202 |         # Method calls and function calls
203 |         elif isinstance(node, ast.Call):
204 |             # Skip if this call was already processed as part of an assignment
205 |             if id(node) in self.processed_calls:
206 |                 return
207 |                 
208 |             if isinstance(node.func, ast.Attribute):
209 |                 self._extract_method_call(node, result)
210 |                 # Mark this attribute as used in method call to avoid duplicate processing
211 |                 self.method_call_attributes.add(id(node.func))
212 |             elif isinstance(node.func, ast.Name):
213 |                 # Check if this is likely a class instantiation (based on imported classes)
214 |                 func_name = node.func.id
215 |                 full_name = self._resolve_full_name(func_name)
216 |                 
217 |                 # If this is a known imported class, treat as class instantiation
218 |                 if self._is_likely_class_instantiation(func_name, full_name):
219 |                     self._extract_nested_class_instantiation(node, result)
220 |                 else:
221 |                     self._extract_function_call(node, result)
222 |         
223 |         # Attribute access (not in call context)
224 |         elif isinstance(node, ast.Attribute):
225 |             # Skip if this attribute was already processed as part of a method call
226 |             if id(node) in self.method_call_attributes:
227 |                 return
228 |             self._extract_attribute_access(node, result)
229 |     
230 |     def _extract_class_instantiation(self, node: ast.Assign, result: AnalysisResult):
231 |         """Extract class instantiation from assignment"""
232 |         target = node.targets[0]
233 |         call = node.value
234 |         line_num = getattr(node, 'lineno', 0)
235 |         
236 |         if isinstance(target, ast.Name) and isinstance(call, ast.Call):
237 |             var_name = target.id
238 |             class_name = self._get_name_from_call(call.func)
239 |             
240 |             if class_name:
241 |                 args = [self._get_arg_representation(arg) for arg in call.args]
242 |                 kwargs = {
243 |                     kw.arg: self._get_arg_representation(kw.value) 
244 |                     for kw in call.keywords if kw.arg
245 |                 }
246 |                 
247 |                 # Resolve full class name using import map
248 |                 full_class_name = self._resolve_full_name(class_name)
249 |                 
250 |                 instantiation = ClassInstantiation(
251 |                     variable_name=var_name,
252 |                     class_name=class_name,
253 |                     args=args,
254 |                     kwargs=kwargs,
255 |                     line_number=line_num,
256 |                     full_class_name=full_class_name
257 |                 )
258 |                 
259 |                 result.class_instantiations.append(instantiation)
260 |                 
261 |                 # Track variable type for later method call analysis
262 |                 self.variable_types[var_name] = full_class_name or class_name
263 |     
264 |     def _extract_method_call(self, node: ast.Call, result: AnalysisResult):
265 |         """Extract method call information"""
266 |         if isinstance(node.func, ast.Attribute):
267 |             line_num = getattr(node, 'lineno', 0)
268 |             
269 |             # Get object and method names
270 |             obj_name = self._get_name_from_node(node.func.value)
271 |             method_name = node.func.attr
272 |             
273 |             if obj_name and method_name:
274 |                 args = [self._get_arg_representation(arg) for arg in node.args]
275 |                 kwargs = {
276 |                     kw.arg: self._get_arg_representation(kw.value) 
277 |                     for kw in node.keywords if kw.arg
278 |                 }
279 |                 
280 |                 method_call = MethodCall(
281 |                     object_name=obj_name,
282 |                     method_name=method_name,
283 |                     args=args,
284 |                     kwargs=kwargs,
285 |                     line_number=line_num,
286 |                     object_type=self.variable_types.get(obj_name)
287 |                 )
288 |                 
289 |                 result.method_calls.append(method_call)
290 |     
291 |     def _extract_function_call(self, node: ast.Call, result: AnalysisResult):
292 |         """Extract function call information"""
293 |         if isinstance(node.func, ast.Name):
294 |             line_num = getattr(node, 'lineno', 0)
295 |             func_name = node.func.id
296 |             
297 |             args = [self._get_arg_representation(arg) for arg in node.args]
298 |             kwargs = {
299 |                 kw.arg: self._get_arg_representation(kw.value) 
300 |                 for kw in node.keywords if kw.arg
301 |             }
302 |             
303 |             # Resolve full function name using import map
304 |             full_func_name = self._resolve_full_name(func_name)
305 |             
306 |             function_call = FunctionCall(
307 |                 function_name=func_name,
308 |                 args=args,
309 |                 kwargs=kwargs,
310 |                 line_number=line_num,
311 |                 full_name=full_func_name
312 |             )
313 |             
314 |             result.function_calls.append(function_call)
315 |     
316 |     def _extract_attribute_access(self, node: ast.Attribute, result: AnalysisResult):
317 |         """Extract attribute access information"""
318 |         line_num = getattr(node, 'lineno', 0)
319 |         
320 |         obj_name = self._get_name_from_node(node.value)
321 |         attr_name = node.attr
322 |         
323 |         if obj_name and attr_name:
324 |             attribute_access = AttributeAccess(
325 |                 object_name=obj_name,
326 |                 attribute_name=attr_name,
327 |                 line_number=line_num,
328 |                 object_type=self.variable_types.get(obj_name)
329 |             )
330 |             
331 |             result.attribute_accesses.append(attribute_access)
332 |     
333 |     def _infer_object_types(self, result: AnalysisResult):
334 |         """Update object types for method calls and attribute accesses"""
335 |         for method_call in result.method_calls:
336 |             if not method_call.object_type:
337 |                 # First check context manager variables
338 |                 obj_type = self._get_context_aware_type(method_call.object_name, method_call.line_number)
339 |                 if obj_type:
340 |                     method_call.object_type = obj_type
341 |                 else:
342 |                     method_call.object_type = self.variable_types.get(method_call.object_name)
343 |         
344 |         for attr_access in result.attribute_accesses:
345 |             if not attr_access.object_type:
346 |                 # First check context manager variables
347 |                 obj_type = self._get_context_aware_type(attr_access.object_name, attr_access.line_number)
348 |                 if obj_type:
349 |                     attr_access.object_type = obj_type
350 |                 else:
351 |                     attr_access.object_type = self.variable_types.get(attr_access.object_name)
352 |     
353 |     def _get_context_aware_type(self, var_name: str, line_number: int) -> Optional[str]:
354 |         """Get the type of a variable considering its context (e.g., async with scope)"""
355 |         if var_name in self.context_manager_vars:
356 |             start_line, end_line, var_type = self.context_manager_vars[var_name]
357 |             if start_line <= line_number <= end_line:
358 |                 return var_type
359 |         return None
360 |     
361 |     def _get_name_from_call(self, node: ast.AST) -> Optional[str]:
362 |         """Get the name from a call node (for class instantiation)"""
363 |         if isinstance(node, ast.Name):
364 |             return node.id
365 |         elif isinstance(node, ast.Attribute):
366 |             value_name = self._get_name_from_node(node.value)
367 |             if value_name:
368 |                 return f"{value_name}.{node.attr}"
369 |         return None
370 |     
371 |     def _get_name_from_node(self, node: ast.AST) -> Optional[str]:
372 |         """Get string representation of a node (for object names)"""
373 |         if isinstance(node, ast.Name):
374 |             return node.id
375 |         elif isinstance(node, ast.Attribute):
376 |             value_name = self._get_name_from_node(node.value)
377 |             if value_name:
378 |                 return f"{value_name}.{node.attr}"
379 |         return None
380 |     
381 |     def _get_arg_representation(self, node: ast.AST) -> str:
382 |         """Get string representation of an argument"""
383 |         if isinstance(node, ast.Constant):
384 |             return repr(node.value)
385 |         elif isinstance(node, ast.Name):
386 |             return node.id
387 |         elif isinstance(node, ast.Attribute):
388 |             return self._get_name_from_node(node) or "<?>"
389 |         elif isinstance(node, ast.Call):
390 |             func_name = self._get_name_from_call(node.func)
391 |             return f"{func_name}(...)" if func_name else "call(...)"
392 |         else:
393 |             return f"<{type(node).__name__}>"
394 |     
395 |     def _is_likely_class_instantiation(self, func_name: str, full_name: Optional[str]) -> bool:
396 |         """Determine if a function call is likely a class instantiation"""
397 |         # Check if it's a known imported class (classes typically start with uppercase)
398 |         if func_name and func_name[0].isupper():
399 |             return True
400 |         
401 |         # Check if the full name suggests a class (contains known class patterns)
402 |         if full_name:
403 |             # Common class patterns in module names
404 |             class_patterns = [
405 |                 'Model', 'Provider', 'Client', 'Agent', 'Manager', 'Handler',
406 |                 'Builder', 'Factory', 'Service', 'Controller', 'Processor'
407 |             ]
408 |             return any(pattern in full_name for pattern in class_patterns)
409 |         
410 |         return False
411 |     
412 |     def _extract_nested_class_instantiation(self, node: ast.Call, result: AnalysisResult):
413 |         """Extract class instantiation that's not in direct assignment (e.g., as parameter)"""
414 |         line_num = getattr(node, 'lineno', 0)
415 |         
416 |         if isinstance(node.func, ast.Name):
417 |             class_name = node.func.id
418 |             
419 |             args = [self._get_arg_representation(arg) for arg in node.args]
420 |             kwargs = {
421 |                 kw.arg: self._get_arg_representation(kw.value) 
422 |                 for kw in node.keywords if kw.arg
423 |             }
424 |             
425 |             # Resolve full class name using import map
426 |             full_class_name = self._resolve_full_name(class_name)
427 |             
428 |             # Use a synthetic variable name since this isn't assigned to a variable
429 |             var_name = f"<{class_name.lower()}_instance>"
430 |             
431 |             instantiation = ClassInstantiation(
432 |                 variable_name=var_name,
433 |                 class_name=class_name,
434 |                 args=args,
435 |                 kwargs=kwargs,
436 |                 line_number=line_num,
437 |                 full_class_name=full_class_name
438 |             )
439 |             
440 |             result.class_instantiations.append(instantiation)
441 |     
442 |     def _track_method_result_assignment(self, call_node: ast.Call, var_name: str):
443 |         """Track when a variable is assigned the result of a method call"""
444 |         if isinstance(call_node.func, ast.Attribute):
445 |             # For now, we'll use a generic type hint for method results
446 |             # In a more sophisticated system, we could look up the return type
447 |             self.variable_types[var_name] = "method_result"
448 |     
449 |     def _handle_async_with(self, node: ast.AsyncWith, result: AnalysisResult):
450 |         """Handle async with statements and track context manager variables"""
451 |         for item in node.items:
452 |             if item.optional_vars and isinstance(item.optional_vars, ast.Name):
453 |                 var_name = item.optional_vars.id
454 |                 
455 |                 # If the context manager is a method call, track the result type
456 |                 if isinstance(item.context_expr, ast.Call) and isinstance(item.context_expr.func, ast.Attribute):
457 |                     # Extract and process the method call
458 |                     self._extract_method_call(item.context_expr, result)
459 |                     self.processed_calls.add(id(item.context_expr))
460 |                     
461 |                     # Track context manager scope for pydantic_ai run_stream calls
462 |                     obj_name = self._get_name_from_node(item.context_expr.func.value)
463 |                     method_name = item.context_expr.func.attr
464 |                     
465 |                     if (obj_name and obj_name in self.variable_types and 
466 |                         'pydantic_ai' in str(self.variable_types[obj_name]) and 
467 |                         method_name == 'run_stream'):
468 |                         
469 |                         # Calculate the scope of this async with block
470 |                         start_line = getattr(node, 'lineno', 0)
471 |                         end_line = getattr(node, 'end_lineno', start_line + 50)  # fallback estimate
472 |                         
473 |                         # For run_stream, the return type is specifically StreamedRunResult
474 |                         # This is the actual return type, not a generic placeholder
475 |                         self.context_manager_vars[var_name] = (start_line, end_line, "pydantic_ai.StreamedRunResult")
476 |     
477 |     def _handle_with(self, node: ast.With, result: AnalysisResult):
478 |         """Handle regular with statements and track context manager variables"""
479 |         for item in node.items:
480 |             if item.optional_vars and isinstance(item.optional_vars, ast.Name):
481 |                 var_name = item.optional_vars.id
482 |                 
483 |                 # If the context manager is a method call, track the result type
484 |                 if isinstance(item.context_expr, ast.Call) and isinstance(item.context_expr.func, ast.Attribute):
485 |                     # Extract and process the method call
486 |                     self._extract_method_call(item.context_expr, result)
487 |                     self.processed_calls.add(id(item.context_expr))
488 |                     
489 |                     # Track basic type information
490 |                     self.variable_types[var_name] = "context_manager_result"
491 |     
492 |     def _resolve_full_name(self, name: str) -> Optional[str]:
493 |         """Resolve a name to its full module.name using import map"""
494 |         # Check if it's a direct import mapping
495 |         if name in self.import_map:
496 |             return self.import_map[name]
497 |         
498 |         # Check if it's a dotted name with first part in import map
499 |         parts = name.split('.')
500 |         if len(parts) > 1 and parts[0] in self.import_map:
501 |             base_module = self.import_map[parts[0]]
502 |             return f"{base_module}.{'.'.join(parts[1:])}"
503 |         
504 |         return None
505 | 
506 | 
507 | def analyze_ai_script(script_path: str) -> AnalysisResult:
508 |     """Convenience function to analyze a single AI-generated script"""
509 |     analyzer = AIScriptAnalyzer()
510 |     return analyzer.analyze_script(script_path)
511 | 
512 | 
513 | if __name__ == "__main__":
514 |     # Example usage
515 |     import sys
516 |     
517 |     if len(sys.argv) != 2:
518 |         print("Usage: python ai_script_analyzer.py <script_path>")
519 |         sys.exit(1)
520 |     
521 |     script_path = sys.argv[1]
522 |     result = analyze_ai_script(script_path)
523 |     
524 |     print(f"Analysis Results for: {result.file_path}")
525 |     print(f"Imports: {len(result.imports)}")
526 |     print(f"Class Instantiations: {len(result.class_instantiations)}")
527 |     print(f"Method Calls: {len(result.method_calls)}")
528 |     print(f"Function Calls: {len(result.function_calls)}")
529 |     print(f"Attribute Accesses: {len(result.attribute_accesses)}")
530 |     
531 |     if result.errors:
532 |         print(f"Errors: {result.errors}")
```
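
The `__main__` block above only prints counts; when used as a library, the returned `AnalysisResult` exposes the full extracted detail. A minimal sketch of reading it back out (the analyzed path is a placeholder):

```python
# Minimal sketch of using the analyzer as a library (the target path is a placeholder).
from ai_script_analyzer import analyze_ai_script

result = analyze_ai_script("example_agent.py")  # placeholder path

# Walk the extracted method calls together with their inferred receiver types.
for call in result.method_calls:
    receiver = call.object_type or call.object_name
    print(f"line {call.line_number}: {receiver}.{call.method_name}(...)")

# Names that could not be resolved simply have no entry in the type table.
print("known variable types:", result.variable_types)
if result.errors:
    print("analysis errors:", result.errors)
```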
--------------------------------------------------------------------------------
/knowledge_graphs/hallucination_reporter.py:
--------------------------------------------------------------------------------
```python
  1 | """
  2 | Hallucination Reporter
  3 | 
  4 | Generates comprehensive reports about AI coding assistant hallucinations
  5 | detected in Python scripts. Supports multiple output formats.
  6 | """
  7 | 
  8 | import json
  9 | import logging
 10 | from datetime import datetime, timezone
 11 | from pathlib import Path
 12 | from typing import Dict, List, Any, Optional
 13 | 
 14 | from knowledge_graph_validator import (
 15 |     ScriptValidationResult, ValidationStatus, ValidationResult
 16 | )
 17 | 
 18 | logger = logging.getLogger(__name__)
 19 | 
 20 | 
 21 | class HallucinationReporter:
 22 |     """Generates reports about detected hallucinations"""
 23 |     
 24 |     def __init__(self):
 25 |         self.report_timestamp = datetime.now(timezone.utc)
 26 |     
 27 |     def generate_comprehensive_report(self, validation_result: ScriptValidationResult) -> Dict[str, Any]:
 28 |         """Generate a comprehensive report in JSON format"""
 29 |         
 30 |         # Categorize validations by status (knowledge graph items only)
 31 |         valid_items = []
 32 |         invalid_items = []
 33 |         uncertain_items = []
 34 |         not_found_items = []
 35 |         
 36 |         # Process imports (only knowledge graph ones)
 37 |         for val in validation_result.import_validations:
 38 |             if not val.validation.details.get('in_knowledge_graph', False):
 39 |                 continue  # Skip external libraries
 40 |             item = {
 41 |                 'type': 'IMPORT',
 42 |                 'name': val.import_info.module,
 43 |                 'line': val.import_info.line_number,
 44 |                 'status': val.validation.status.value,
 45 |                 'confidence': val.validation.confidence,
 46 |                 'message': val.validation.message,
 47 |                 'details': {
 48 |                     'is_from_import': val.import_info.is_from_import,
 49 |                     'alias': val.import_info.alias,
 50 |                     'available_classes': val.available_classes,
 51 |                     'available_functions': val.available_functions
 52 |                 }
 53 |             }
 54 |             self._categorize_item(item, val.validation.status, valid_items, invalid_items, uncertain_items, not_found_items)
 55 |         
 56 |         # Process classes (only knowledge graph ones)
 57 |         for val in validation_result.class_validations:
 58 |             class_name = val.class_instantiation.full_class_name or val.class_instantiation.class_name
 59 |             if not self._is_from_knowledge_graph(class_name, validation_result):
 60 |                 continue  # Skip external classes
 61 |             item = {
 62 |                 'type': 'CLASS_INSTANTIATION',
 63 |                 'name': val.class_instantiation.class_name,
 64 |                 'full_name': val.class_instantiation.full_class_name,
 65 |                 'variable': val.class_instantiation.variable_name,
 66 |                 'line': val.class_instantiation.line_number,
 67 |                 'status': val.validation.status.value,
 68 |                 'confidence': val.validation.confidence,
 69 |                 'message': val.validation.message,
 70 |                 'details': {
 71 |                     'args_provided': val.class_instantiation.args,
 72 |                     'kwargs_provided': list(val.class_instantiation.kwargs.keys()),
 73 |                     'constructor_params': val.constructor_params,
 74 |                     'parameter_validation': self._serialize_validation_result(val.parameter_validation) if val.parameter_validation else None
 75 |                 }
 76 |             }
 77 |             self._categorize_item(item, val.validation.status, valid_items, invalid_items, uncertain_items, not_found_items)
 78 |         
 79 |         # Track reported items to avoid duplicates
 80 |         reported_items = set()
 81 |         
 82 |         # Process methods (only knowledge graph ones)
 83 |         for val in validation_result.method_validations:
 84 |             if not (val.method_call.object_type and self._is_from_knowledge_graph(val.method_call.object_type, validation_result)):
 85 |                 continue  # Skip external methods
 86 |             
 87 |             # Create unique key to avoid duplicates
 88 |             key = (val.method_call.line_number, val.method_call.method_name, val.method_call.object_type)
 89 |             if key not in reported_items:
 90 |                 reported_items.add(key)
 91 |                 item = {
 92 |                     'type': 'METHOD_CALL',
 93 |                     'name': val.method_call.method_name,
 94 |                     'object': val.method_call.object_name,
 95 |                     'object_type': val.method_call.object_type,
 96 |                     'line': val.method_call.line_number,
 97 |                     'status': val.validation.status.value,
 98 |                     'confidence': val.validation.confidence,
 99 |                     'message': val.validation.message,
100 |                     'details': {
101 |                         'args_provided': val.method_call.args,
102 |                         'kwargs_provided': list(val.method_call.kwargs.keys()),
103 |                         'expected_params': val.expected_params,
104 |                         'parameter_validation': self._serialize_validation_result(val.parameter_validation) if val.parameter_validation else None,
105 |                         'suggestions': val.validation.suggestions
106 |                     }
107 |                 }
108 |                 self._categorize_item(item, val.validation.status, valid_items, invalid_items, uncertain_items, not_found_items)
109 |         
110 |         # Process attributes (only knowledge graph ones) - but skip if already reported as method
111 |         for val in validation_result.attribute_validations:
112 |             if not (val.attribute_access.object_type and self._is_from_knowledge_graph(val.attribute_access.object_type, validation_result)):
113 |                 continue  # Skip external attributes
114 |             
115 |             # Create unique key - if this was already reported as a method, skip it
116 |             key = (val.attribute_access.line_number, val.attribute_access.attribute_name, val.attribute_access.object_type)
117 |             if key not in reported_items:
118 |                 reported_items.add(key)
119 |                 item = {
120 |                     'type': 'ATTRIBUTE_ACCESS',
121 |                     'name': val.attribute_access.attribute_name,
122 |                     'object': val.attribute_access.object_name,
123 |                     'object_type': val.attribute_access.object_type,
124 |                     'line': val.attribute_access.line_number,
125 |                     'status': val.validation.status.value,
126 |                     'confidence': val.validation.confidence,
127 |                     'message': val.validation.message,
128 |                     'details': {
129 |                         'expected_type': val.expected_type
130 |                     }
131 |                 }
132 |                 self._categorize_item(item, val.validation.status, valid_items, invalid_items, uncertain_items, not_found_items)
133 |         
134 |         # Process functions (only knowledge graph ones)
135 |         for val in validation_result.function_validations:
136 |             if not (val.function_call.full_name and self._is_from_knowledge_graph(val.function_call.full_name, validation_result)):
137 |                 continue  # Skip external functions
138 |             item = {
139 |                 'type': 'FUNCTION_CALL',
140 |                 'name': val.function_call.function_name,
141 |                 'full_name': val.function_call.full_name,
142 |                 'line': val.function_call.line_number,
143 |                 'status': val.validation.status.value,
144 |                 'confidence': val.validation.confidence,
145 |                 'message': val.validation.message,
146 |                 'details': {
147 |                     'args_provided': val.function_call.args,
148 |                     'kwargs_provided': list(val.function_call.kwargs.keys()),
149 |                     'expected_params': val.expected_params,
150 |                     'parameter_validation': self._serialize_validation_result(val.parameter_validation) if val.parameter_validation else None
151 |                 }
152 |             }
153 |             self._categorize_item(item, val.validation.status, valid_items, invalid_items, uncertain_items, not_found_items)
154 |         
155 |         # Create library summary
156 |         library_summary = self._create_library_summary(validation_result)
157 |         
158 |         # Generate report
159 |         report = {
160 |             'analysis_metadata': {
161 |                 'script_path': validation_result.script_path,
162 |                 'analysis_timestamp': self.report_timestamp.isoformat(),
163 |                 'total_imports': len(validation_result.import_validations),
164 |                 'total_classes': len(validation_result.class_validations),
165 |                 'total_methods': len(validation_result.method_validations),
166 |                 'total_attributes': len(validation_result.attribute_validations),
167 |                 'total_functions': len(validation_result.function_validations)
168 |             },
169 |             'validation_summary': {
170 |                 'overall_confidence': validation_result.overall_confidence,
171 |                 'total_validations': len(valid_items) + len(invalid_items) + len(uncertain_items) + len(not_found_items),
172 |                 'valid_count': len(valid_items),
173 |                 'invalid_count': len(invalid_items),
174 |                 'uncertain_count': len(uncertain_items),
175 |                 'not_found_count': len(not_found_items),
176 |                 'hallucination_rate': len(invalid_items + not_found_items) / max(1, len(valid_items) + len(invalid_items) + len(not_found_items))
177 |             },
178 |             'libraries_analyzed': library_summary,
179 |             'validation_details': {
180 |                 'valid_items': valid_items,
181 |                 'invalid_items': invalid_items,
182 |                 'uncertain_items': uncertain_items,
183 |                 'not_found_items': not_found_items
184 |             },
185 |             'hallucinations_detected': validation_result.hallucinations_detected,
186 |             'recommendations': self._generate_recommendations(validation_result)
187 |         }
188 |         
189 |         return report
190 |     
191 |     def _is_from_knowledge_graph(self, item_name: str, validation_result) -> bool:
192 |         """Check if an item is from a knowledge graph module"""
193 |         if not item_name:
194 |             return False
195 |         
196 |         # Get knowledge graph modules from import validations
197 |         kg_modules = set()
198 |         for val in validation_result.import_validations:
199 |             if val.validation.details.get('in_knowledge_graph', False):
200 |                 kg_modules.add(val.import_info.module)
201 |                 if '.' in val.import_info.module:
202 |                     kg_modules.add(val.import_info.module.split('.')[0])
203 |         
204 |         # Check if the item belongs to any knowledge graph module
205 |         if '.' in item_name:
206 |             base_module = item_name.split('.')[0]
207 |             return base_module in kg_modules
208 |         
209 |         return any(item_name in module or module.endswith(item_name) for module in kg_modules)
210 |     
211 |     def _serialize_validation_result(self, validation_result) -> Optional[Dict[str, Any]]:
212 |         """Convert ValidationResult to JSON-serializable dictionary"""
213 |         if validation_result is None:
214 |             return None
215 |         
216 |         return {
217 |             'status': validation_result.status.value,
218 |             'confidence': validation_result.confidence,
219 |             'message': validation_result.message,
220 |             'details': validation_result.details,
221 |             'suggestions': validation_result.suggestions
222 |         }
223 |     
224 |     def _categorize_item(self, item: Dict[str, Any], status: ValidationStatus, 
225 |                         valid_items: List, invalid_items: List, uncertain_items: List, not_found_items: List):
226 |         """Categorize validation item by status"""
227 |         if status == ValidationStatus.VALID:
228 |             valid_items.append(item)
229 |         elif status == ValidationStatus.INVALID:
230 |             invalid_items.append(item)
231 |         elif status == ValidationStatus.UNCERTAIN:
232 |             uncertain_items.append(item)
233 |         elif status == ValidationStatus.NOT_FOUND:
234 |             not_found_items.append(item)
235 |     
236 |     def _create_library_summary(self, validation_result: ScriptValidationResult) -> List[Dict[str, Any]]:
237 |         """Create summary of libraries analyzed"""
238 |         library_stats = {}
239 |         
240 |         # Aggregate stats by library/module
241 |         for val in validation_result.import_validations:
242 |             module = val.import_info.module
243 |             if module not in library_stats:
244 |                 library_stats[module] = {
245 |                     'module_name': module,
246 |                     'import_status': val.validation.status.value,
247 |                     'import_confidence': val.validation.confidence,
248 |                     'classes_used': [],
249 |                     'methods_called': [],
250 |                     'attributes_accessed': [],
251 |                     'functions_called': []
252 |                 }
253 |         
254 |         # Add class usage
255 |         for val in validation_result.class_validations:
256 |             class_name = val.class_instantiation.class_name
257 |             full_name = val.class_instantiation.full_class_name
258 |             
259 |             # Try to match to library
260 |             if full_name:
261 |                 parts = full_name.split('.')
262 |                 if len(parts) > 1:
263 |                     module = '.'.join(parts[:-1])
264 |                     if module in library_stats:
265 |                         library_stats[module]['classes_used'].append({
266 |                             'class_name': class_name,
267 |                             'status': val.validation.status.value,
268 |                             'confidence': val.validation.confidence
269 |                         })
270 |         
271 |         # Add method usage
272 |         for val in validation_result.method_validations:
273 |             method_name = val.method_call.method_name
274 |             object_type = val.method_call.object_type
275 |             
276 |             if object_type:
277 |                 parts = object_type.split('.')
278 |                 if len(parts) > 1:
279 |                     module = '.'.join(parts[:-1])
280 |                     if module in library_stats:
281 |                         library_stats[module]['methods_called'].append({
282 |                             'method_name': method_name,
283 |                             'class_name': parts[-1],
284 |                             'status': val.validation.status.value,
285 |                             'confidence': val.validation.confidence
286 |                         })
287 |         
288 |         # Add attribute usage
289 |         for val in validation_result.attribute_validations:
290 |             attr_name = val.attribute_access.attribute_name
291 |             object_type = val.attribute_access.object_type
292 |             
293 |             if object_type:
294 |                 parts = object_type.split('.')
295 |                 if len(parts) > 1:
296 |                     module = '.'.join(parts[:-1])
297 |                     if module in library_stats:
298 |                         library_stats[module]['attributes_accessed'].append({
299 |                             'attribute_name': attr_name,
300 |                             'class_name': parts[-1],
301 |                             'status': val.validation.status.value,
302 |                             'confidence': val.validation.confidence
303 |                         })
304 |         
305 |         # Add function usage
306 |         for val in validation_result.function_validations:
307 |             func_name = val.function_call.function_name
308 |             full_name = val.function_call.full_name
309 |             
310 |             if full_name:
311 |                 parts = full_name.split('.')
312 |                 if len(parts) > 1:
313 |                     module = '.'.join(parts[:-1])
314 |                     if module in library_stats:
315 |                         library_stats[module]['functions_called'].append({
316 |                             'function_name': func_name,
317 |                             'status': val.validation.status.value,
318 |                             'confidence': val.validation.confidence
319 |                         })
320 |         
321 |         return list(library_stats.values())
322 |     
323 |     def _generate_recommendations(self, validation_result: ScriptValidationResult) -> List[str]:
324 |         """Generate recommendations based on validation results"""
325 |         recommendations = []
326 |         
327 |         # Only count actual hallucinations (from knowledge graph libraries)
328 |         kg_hallucinations = list(validation_result.hallucinations_detected)
329 |         
330 |         if kg_hallucinations:
331 |             method_issues = [h for h in kg_hallucinations if h['type'] == 'METHOD_NOT_FOUND']
332 |             attr_issues = [h for h in kg_hallucinations if h['type'] == 'ATTRIBUTE_NOT_FOUND']
333 |             param_issues = [h for h in kg_hallucinations if h['type'] == 'INVALID_PARAMETERS']
334 |             
335 |             if method_issues:
336 |                 recommendations.append(
337 |                     f"Found {len(method_issues)} non-existent methods in knowledge graph libraries. "
338 |                     "Consider checking the official documentation for correct method names."
339 |                 )
340 |             
341 |             if attr_issues:
342 |                 recommendations.append(
343 |                     f"Found {len(attr_issues)} non-existent attributes in knowledge graph libraries. "
344 |                     "Verify attribute names against the class documentation."
345 |                 )
346 |             
347 |             if param_issues:
348 |                 recommendations.append(
349 |                     f"Found {len(param_issues)} parameter mismatches in knowledge graph libraries. "
350 |                     "Check function signatures for correct parameter names and types."
351 |                 )
352 |         else:
353 |             recommendations.append(
354 |                 "No hallucinations detected in knowledge graph libraries. "
355 |                 "External library usage appears to be working as expected."
356 |             )
357 |         
358 |         if validation_result.overall_confidence < 0.7:
359 |             recommendations.append(
360 |                 "Overall confidence is moderate. Most validations were for external libraries not in the knowledge graph."
361 |             )
362 |         
363 |         return recommendations
364 |     
365 |     def save_json_report(self, report: Dict[str, Any], output_path: str):
366 |         """Save report as JSON file"""
367 |         with open(output_path, 'w', encoding='utf-8') as f:
368 |             json.dump(report, f, indent=2, ensure_ascii=False)
369 |         
370 |         logger.info(f"JSON report saved to: {output_path}")
371 |     
372 |     def save_markdown_report(self, report: Dict[str, Any], output_path: str):
373 |         """Save report as Markdown file"""
374 |         md_content = self._generate_markdown_content(report)
375 |         
376 |         with open(output_path, 'w', encoding='utf-8') as f:
377 |             f.write(md_content)
378 |         
379 |         logger.info(f"Markdown report saved to: {output_path}")
380 |     
381 |     def _generate_markdown_content(self, report: Dict[str, Any]) -> str:
382 |         """Generate Markdown content from report"""
383 |         md = []
384 |         
385 |         # Header
386 |         md.append("# AI Hallucination Detection Report")
387 |         md.append("")
388 |         md.append(f"**Script:** `{report['analysis_metadata']['script_path']}`")
389 |         md.append(f"**Analysis Date:** {report['analysis_metadata']['analysis_timestamp']}")
390 |         md.append(f"**Overall Confidence:** {report['validation_summary']['overall_confidence']:.2%}")
391 |         md.append("")
392 |         
393 |         # Summary
394 |         summary = report['validation_summary']
395 |         md.append("## Summary")
396 |         md.append("")
397 |         md.append(f"- **Total Validations:** {summary['total_validations']}")
398 |         md.append(f"- **Valid:** {summary['valid_count']} ({summary['valid_count']/summary['total_validations']:.1%})")
399 |         md.append(f"- **Invalid:** {summary['invalid_count']} ({summary['invalid_count']/summary['total_validations']:.1%})")
400 |         md.append(f"- **Not Found:** {summary['not_found_count']} ({summary['not_found_count']/summary['total_validations']:.1%})")
401 |         md.append(f"- **Uncertain:** {summary['uncertain_count']} ({summary['uncertain_count']/summary['total_validations']:.1%})")
402 |         md.append(f"- **Hallucination Rate:** {summary['hallucination_rate']:.1%}")
403 |         md.append("")
404 |         
405 |         # Hallucinations
406 |         if report['hallucinations_detected']:
407 |             md.append("## 🚨 Hallucinations Detected")
408 |             md.append("")
409 |             for i, hallucination in enumerate(report['hallucinations_detected'], 1):
410 |                 md.append(f"### {i}. {hallucination['type'].replace('_', ' ').title()}")
411 |                 md.append(f"**Location:** {hallucination['location']}")
412 |                 md.append(f"**Description:** {hallucination['description']}")
413 |                 if hallucination.get('suggestion'):
414 |                     md.append(f"**Suggestion:** {hallucination['suggestion']}")
415 |                 md.append("")
416 |         
417 |         # Libraries
418 |         if report['libraries_analyzed']:
419 |             md.append("## 📚 Libraries Analyzed")
420 |             md.append("")
421 |             for lib in report['libraries_analyzed']:
422 |                 md.append(f"### {lib['module_name']}")
423 |                 md.append(f"**Import Status:** {lib['import_status']}")
424 |                 md.append(f"**Import Confidence:** {lib['import_confidence']:.2%}")
425 |                 
426 |                 if lib['classes_used']:
427 |                     md.append("**Classes Used:**")
428 |                     for cls in lib['classes_used']:
429 |                         status_emoji = "✅" if cls['status'] == 'VALID' else "❌"
430 |                         md.append(f"  - {status_emoji} `{cls['class_name']}` ({cls['confidence']:.1%})")
431 |                 
432 |                 if lib['methods_called']:
433 |                     md.append("**Methods Called:**")
434 |                     for method in lib['methods_called']:
435 |                         status_emoji = "✅" if method['status'] == 'VALID' else "❌"
436 |                         md.append(f"  - {status_emoji} `{method['class_name']}.{method['method_name']}()` ({method['confidence']:.1%})")
437 |                 
438 |                 if lib['attributes_accessed']:
439 |                     md.append("**Attributes Accessed:**")
440 |                     for attr in lib['attributes_accessed']:
441 |                         status_emoji = "✅" if attr['status'] == 'VALID' else "❌"
442 |                         md.append(f"  - {status_emoji} `{attr['class_name']}.{attr['attribute_name']}` ({attr['confidence']:.1%})")
443 |                 
444 |                 if lib['functions_called']:
445 |                     md.append("**Functions Called:**")
446 |                     for func in lib['functions_called']:
447 |                         status_emoji = "✅" if func['status'] == 'VALID' else "❌"
448 |                         md.append(f"  - {status_emoji} `{func['function_name']}()` ({func['confidence']:.1%})")
449 |                 
450 |                 md.append("")
451 |         
452 |         # Recommendations
453 |         if report['recommendations']:
454 |             md.append("## 💡 Recommendations")
455 |             md.append("")
456 |             for rec in report['recommendations']:
457 |                 md.append(f"- {rec}")
458 |             md.append("")
459 |         
460 |         # Detailed Results
461 |         md.append("## 📋 Detailed Validation Results")
462 |         md.append("")
463 |         
464 |         # Invalid items
465 |         invalid_items = report['validation_details']['invalid_items']
466 |         if invalid_items:
467 |             md.append("### ❌ Invalid Items")
468 |             md.append("")
469 |             for item in invalid_items:
470 |                 md.append(f"- **{item['type']}** `{item['name']}` (Line {item['line']}) - {item['message']}")
471 |             md.append("")
472 |         
473 |         # Not found items
474 |         not_found_items = report['validation_details']['not_found_items']
475 |         if not_found_items:
476 |             md.append("### 🔍 Not Found Items")
477 |             md.append("")
478 |             for item in not_found_items:
479 |                 md.append(f"- **{item['type']}** `{item['name']}` (Line {item['line']}) - {item['message']}")
480 |             md.append("")
481 |         
482 |         # Valid items (sample)
483 |         valid_items = report['validation_details']['valid_items']
484 |         if valid_items:
485 |             md.append("### ✅ Valid Items (Sample)")
486 |             md.append("")
487 |             for item in valid_items[:10]:  # Show first 10
488 |                 md.append(f"- **{item['type']}** `{item['name']}` (Line {item['line']}) - {item['message']}")
489 |             if len(valid_items) > 10:
490 |                 md.append(f"- ... and {len(valid_items) - 10} more valid items")
491 |             md.append("")
492 |         
493 |         return "\n".join(md)
494 |     
495 |     def print_summary(self, report: Dict[str, Any]):
496 |         """Print a concise summary to console"""
497 |         print("\n" + "="*80)
498 |         print("🤖 AI HALLUCINATION DETECTION REPORT")
499 |         print("="*80)
500 |         
501 |         print(f"Script: {report['analysis_metadata']['script_path']}")
502 |         print(f"Overall Confidence: {report['validation_summary']['overall_confidence']:.1%}")
503 |         
504 |         summary = report['validation_summary']
505 |         print(f"\nValidation Results:")
506 |         print(f"  ✅ Valid: {summary['valid_count']}")
507 |         print(f"  ❌ Invalid: {summary['invalid_count']}")
508 |         print(f"  🔍 Not Found: {summary['not_found_count']}")
509 |         print(f"  ❓ Uncertain: {summary['uncertain_count']}")
510 |         print(f"  📊 Hallucination Rate: {summary['hallucination_rate']:.1%}")
511 |         
512 |         if report['hallucinations_detected']:
513 |             print(f"\n🚨 {len(report['hallucinations_detected'])} Hallucinations Detected:")
514 |             for hall in report['hallucinations_detected'][:5]:  # Show first 5
515 |                 print(f"  - {hall['type'].replace('_', ' ').title()} at {hall['location']}")
516 |                 print(f"    {hall['description']}")
517 |         
518 |         if report['recommendations']:
519 |             print(f"\n💡 Recommendations:")
520 |             for rec in report['recommendations'][:3]:  # Show first 3
521 |                 print(f"  - {rec}")
522 |         
523 |         print("="*80)
```
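
A minimal usage sketch for the reporter above, assuming a `ScriptValidationResult` named `validation_result` has already been produced elsewhere (e.g., by the knowledge graph validator); the output file names are illustrative.

```python
# Hypothetical wiring; `validation_result` is assumed to be a ScriptValidationResult
# produced by knowledge_graph_validator for the analyzed script.
from hallucination_reporter import HallucinationReporter

reporter = HallucinationReporter()
report = reporter.generate_comprehensive_report(validation_result)
reporter.print_summary(report)
reporter.save_json_report(report, "hallucination_report.json")    # illustrative paths
reporter.save_markdown_report(report, "hallucination_report.md")
```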
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
```python
  1 | """
  2 | Utility functions for the Crawl4AI MCP server.
  3 | """
  4 | import os
  5 | import concurrent.futures
  6 | from typing import List, Dict, Any, Optional, Tuple
  7 | import json
  8 | from supabase import create_client, Client
  9 | from urllib.parse import urlparse
 10 | import openai
 11 | import re
 12 | import time
 13 | 
 14 | # Load OpenAI API key for embeddings
 15 | openai.api_key = os.getenv("OPENAI_API_KEY")
 16 | 
 17 | def get_supabase_client() -> Client:
 18 |     """
 19 |     Get a Supabase client with the URL and key from environment variables.
 20 |     
 21 |     Returns:
 22 |         Supabase client instance
 23 |     """
 24 |     url = os.getenv("SUPABASE_URL")
 25 |     key = os.getenv("SUPABASE_SERVICE_KEY")
 26 |     
 27 |     if not url or not key:
 28 |         raise ValueError("SUPABASE_URL and SUPABASE_SERVICE_KEY must be set in environment variables")
 29 |     
 30 |     return create_client(url, key)
 31 | 
 32 | def create_embeddings_batch(texts: List[str]) -> List[List[float]]:
 33 |     """
 34 |     Create embeddings for multiple texts in a single API call.
 35 |     
 36 |     Args:
 37 |         texts: List of texts to create embeddings for
 38 |         
 39 |     Returns:
 40 |         List of embeddings (each embedding is a list of floats)
 41 |     """
 42 |     if not texts:
 43 |         return []
 44 |     
 45 |     max_retries = 3
 46 |     retry_delay = 1.0  # Start with 1 second delay
 47 |     
 48 |     for retry in range(max_retries):
 49 |         try:
 50 |             response = openai.embeddings.create(
 51 |                 model="text-embedding-3-small", # Hardcoding embedding model for now, will change this later to be more dynamic
 52 |                 input=texts
 53 |             )
 54 |             return [item.embedding for item in response.data]
 55 |         except Exception as e:
 56 |             if retry < max_retries - 1:
 57 |                 print(f"Error creating batch embeddings (attempt {retry + 1}/{max_retries}): {e}")
 58 |                 print(f"Retrying in {retry_delay} seconds...")
 59 |                 time.sleep(retry_delay)
 60 |                 retry_delay *= 2  # Exponential backoff
 61 |             else:
 62 |                 print(f"Failed to create batch embeddings after {max_retries} attempts: {e}")
 63 |                 # Try creating embeddings one by one as fallback
 64 |                 print("Attempting to create embeddings individually...")
 65 |                 embeddings = []
 66 |                 successful_count = 0
 67 |                 
 68 |                 for i, text in enumerate(texts):
 69 |                     try:
 70 |                         individual_response = openai.embeddings.create(
 71 |                             model="text-embedding-3-small",
 72 |                             input=[text]
 73 |                         )
 74 |                         embeddings.append(individual_response.data[0].embedding)
 75 |                         successful_count += 1
 76 |                     except Exception as individual_error:
 77 |                         print(f"Failed to create embedding for text {i}: {individual_error}")
 78 |                         # Add zero embedding as fallback
 79 |                         embeddings.append([0.0] * 1536)
 80 |                 
 81 |                 print(f"Successfully created {successful_count}/{len(texts)} embeddings individually")
 82 |                 return embeddings
 83 | 
 84 | def create_embedding(text: str) -> List[float]:
 85 |     """
 86 |     Create an embedding for a single text using OpenAI's API.
 87 |     
 88 |     Args:
 89 |         text: Text to create an embedding for
 90 |         
 91 |     Returns:
 92 |         List of floats representing the embedding
 93 |     """
 94 |     try:
 95 |         embeddings = create_embeddings_batch([text])
 96 |         return embeddings[0] if embeddings else [0.0] * 1536
 97 |     except Exception as e:
 98 |         print(f"Error creating embedding: {e}")
 99 |         # Return empty embedding if there's an error
100 |         return [0.0] * 1536
101 | 
102 | def generate_contextual_embedding(full_document: str, chunk: str) -> Tuple[str, bool]:
103 |     """
104 |     Generate contextual information for a chunk within a document to improve retrieval.
105 |     
106 |     Args:
107 |         full_document: The complete document text
108 |         chunk: The specific chunk of text to generate context for
109 |         
110 |     Returns:
111 |         Tuple containing:
112 |         - The contextual text that situates the chunk within the document
113 |         - Boolean indicating if contextual embedding was performed
114 |     """
115 |     model_choice = os.getenv("MODEL_CHOICE")
116 |     
117 |     try:
118 |         # Create the prompt for generating contextual information
119 |         prompt = f"""<document> 
120 | {full_document[:25000]} 
121 | </document>
122 | Here is the chunk we want to situate within the whole document 
123 | <chunk> 
124 | {chunk}
125 | </chunk> 
126 | Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk. Answer only with the succinct context and nothing else."""
127 | 
128 |         # Call the OpenAI API to generate contextual information
129 |         response = openai.chat.completions.create(
130 |             model=model_choice,
131 |             messages=[
132 |                 {"role": "system", "content": "You are a helpful assistant that provides concise contextual information."},
133 |                 {"role": "user", "content": prompt}
134 |             ],
135 |             temperature=0.3,
136 |             max_tokens=200
137 |         )
138 |         
139 |         # Extract the generated context
140 |         context = response.choices[0].message.content.strip()
141 |         
142 |         # Combine the context with the original chunk
143 |         contextual_text = f"{context}\n---\n{chunk}"
144 |         
145 |         return contextual_text, True
146 |     
147 |     except Exception as e:
148 |         print(f"Error generating contextual embedding: {e}. Using original chunk instead.")
149 |         return chunk, False
150 | 
151 | def process_chunk_with_context(args):
152 |     """
153 |     Process a single chunk with contextual embedding.
154 |     This function is designed to be used with concurrent.futures.
155 |     
156 |     Args:
157 |         args: Tuple containing (url, content, full_document)
158 |         
159 |     Returns:
160 |         Tuple containing:
161 |         - The contextual text that situates the chunk within the document
162 |         - Boolean indicating if contextual embedding was performed
163 |     """
164 |     url, content, full_document = args
165 |     return generate_contextual_embedding(full_document, content)
166 | 
167 | def add_documents_to_supabase(
168 |     client: Client, 
169 |     urls: List[str], 
170 |     chunk_numbers: List[int],
171 |     contents: List[str], 
172 |     metadatas: List[Dict[str, Any]],
173 |     url_to_full_document: Dict[str, str],
174 |     batch_size: int = 20
175 | ) -> None:
176 |     """
177 |     Add documents to the Supabase crawled_pages table in batches.
178 |     Deletes existing records with the same URLs before inserting to prevent duplicates.
179 |     
180 |     Args:
181 |         client: Supabase client
182 |         urls: List of URLs
183 |         chunk_numbers: List of chunk numbers
184 |         contents: List of document contents
185 |         metadatas: List of document metadata
186 |         url_to_full_document: Dictionary mapping URLs to their full document content
187 |         batch_size: Size of each batch for insertion
188 |     """
189 |     # Get unique URLs to delete existing records
190 |     unique_urls = list(set(urls))
191 |     
192 |     # Delete existing records for these URLs in a single operation
193 |     try:
194 |         if unique_urls:
195 |             # Use the .in_() filter to delete all records with matching URLs
196 |             client.table("crawled_pages").delete().in_("url", unique_urls).execute()
197 |     except Exception as e:
198 |         print(f"Batch delete failed: {e}. Trying one-by-one deletion as fallback.")
199 |         # Fallback: delete records one by one
200 |         for url in unique_urls:
201 |             try:
202 |                 client.table("crawled_pages").delete().eq("url", url).execute()
203 |             except Exception as inner_e:
204 |                 print(f"Error deleting record for URL {url}: {inner_e}")
205 |                 # Continue with the next URL even if one fails
206 |     
207 |     # Check whether contextual embeddings are enabled via USE_CONTEXTUAL_EMBEDDINGS
208 |     use_contextual_embeddings = os.getenv("USE_CONTEXTUAL_EMBEDDINGS", "false") == "true"
209 |     print(f"\n\nUse contextual embeddings: {use_contextual_embeddings}\n\n")
210 |     
211 |     # Process in batches to avoid memory issues
212 |     for i in range(0, len(contents), batch_size):
213 |         batch_end = min(i + batch_size, len(contents))
214 |         
215 |         # Get batch slices
216 |         batch_urls = urls[i:batch_end]
217 |         batch_chunk_numbers = chunk_numbers[i:batch_end]
218 |         batch_contents = contents[i:batch_end]
219 |         batch_metadatas = metadatas[i:batch_end]
220 |         
221 |         # Apply contextual embedding to each chunk if enabled via USE_CONTEXTUAL_EMBEDDINGS
222 |         if use_contextual_embeddings:
223 |             # Prepare arguments for parallel processing
224 |             process_args = []
225 |             for j, content in enumerate(batch_contents):
226 |                 url = batch_urls[j]
227 |                 full_document = url_to_full_document.get(url, "")
228 |                 process_args.append((url, content, full_document))
229 |             
230 |             # Process in parallel using ThreadPoolExecutor
231 |             contextual_contents = [None] * len(process_args)
232 |             with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
233 |                 # Submit all tasks and collect results
234 |                 future_to_idx = {executor.submit(process_chunk_with_context, arg): idx 
235 |                                 for idx, arg in enumerate(process_args)}
236 |                 
237 |                 # Process results as they complete, storing each by its original index
238 |                 for future in concurrent.futures.as_completed(future_to_idx):
239 |                     idx = future_to_idx[future]
240 |                     try:
241 |                         result, success = future.result()
242 |                         contextual_contents[idx] = result
243 |                         if success:
244 |                             batch_metadatas[idx]["contextual_embedding"] = True
245 |                     except Exception as e:
246 |                         print(f"Error processing chunk {idx}: {e}")
247 |                         # Use original content as fallback
248 |                         contextual_contents[idx] = batch_contents[idx]
249 |             
250 |             # Fill any slots that never received a result with the original content
251 |             contextual_contents = [
252 |                 c if c is not None else batch_contents[k]
253 |                 for k, c in enumerate(contextual_contents)
254 |             ]
255 |         else:
256 |             # If not using contextual embeddings, use original contents
257 |             contextual_contents = batch_contents
258 |         
259 |         # Create embeddings for the entire batch at once
260 |         batch_embeddings = create_embeddings_batch(contextual_contents)
261 |         
262 |         batch_data = []
263 |         for j in range(len(contextual_contents)):
264 |             # Extract metadata fields
265 |             chunk_size = len(contextual_contents[j])
266 |             
267 |             # Extract source_id from URL
268 |             parsed_url = urlparse(batch_urls[j])
269 |             source_id = parsed_url.netloc or parsed_url.path
270 |             
271 |             # Prepare data for insertion
272 |             data = {
273 |                 "url": batch_urls[j],
274 |                 "chunk_number": batch_chunk_numbers[j],
275 |                 "content": contextual_contents[j],  # Store original content
276 |                 "metadata": {
277 |                     "chunk_size": chunk_size,
278 |                     **batch_metadatas[j]
279 |                 },
280 |                 "source_id": source_id,  # Add source_id field
281 |                 "embedding": batch_embeddings[j]  # Use embedding from contextual content
282 |             }
283 |             
284 |             batch_data.append(data)
285 |         
286 |         # Insert batch into Supabase with retry logic
287 |         max_retries = 3
288 |         retry_delay = 1.0  # Start with 1 second delay
289 |         
290 |         for retry in range(max_retries):
291 |             try:
292 |                 client.table("crawled_pages").insert(batch_data).execute()
293 |                 # Success - break out of retry loop
294 |                 break
295 |             except Exception as e:
296 |                 if retry < max_retries - 1:
297 |                     print(f"Error inserting batch into Supabase (attempt {retry + 1}/{max_retries}): {e}")
298 |                     print(f"Retrying in {retry_delay} seconds...")
299 |                     time.sleep(retry_delay)
300 |                     retry_delay *= 2  # Exponential backoff
301 |                 else:
302 |                     # Final attempt failed
303 |                     print(f"Failed to insert batch after {max_retries} attempts: {e}")
304 |                     # Optionally, try inserting records one by one as a last resort
305 |                     print("Attempting to insert records individually...")
306 |                     successful_inserts = 0
307 |                     for record in batch_data:
308 |                         try:
309 |                             client.table("crawled_pages").insert(record).execute()
310 |                             successful_inserts += 1
311 |                         except Exception as individual_error:
312 |                             print(f"Failed to insert individual record for URL {record['url']}: {individual_error}")
313 |                     
314 |                     if successful_inserts > 0:
315 |                         print(f"Successfully inserted {successful_inserts}/{len(batch_data)} records individually")
316 | 
317 | def search_documents(
318 |     client: Client,
319 |     query: str,
320 |     match_count: int = 10,
321 |     filter_metadata: Optional[Dict[str, Any]] = None,
322 |     source_id_filter: Optional[str] = None
323 | ) -> List[Dict[str, Any]]:
324 |     """
325 |     Search for documents in Supabase using vector similarity.
326 |     
327 |     Args:
328 |         client: Supabase client
329 |         query: Query text
330 |         match_count: Maximum number of results to return
331 |         filter_metadata: Optional metadata filter (for filtering on metadata fields)
332 |         source_id_filter: Optional source_id filter (for filtering on top-level source_id field)
333 |         
334 |     Returns:
335 |         List of matching documents
336 |     """
337 |     import threading
338 |     import time
339 |     
340 |     # Use threading.Timer for timeout instead of signal (works in threads)
341 |     timeout_event = threading.Event()
342 |     
343 |     def set_timeout():
344 |         timeout_event.set()
345 |     
346 |     timer = threading.Timer(30.0, set_timeout)
347 |     timer.start()
348 |     
349 |     try:
350 |         print(f"[DEBUG] Creating embedding for query: '{query[:50]}...'")
351 |         # Create embedding for the query
352 |         query_embedding = create_embedding(query)
353 |         
354 |         if not query_embedding or all(v == 0.0 for v in query_embedding):
355 |             print("[ERROR] Failed to create valid embedding")
356 |             return []
357 |         
358 |         if timeout_event.is_set():
359 |             raise TimeoutError("Embedding creation timed out")
360 |         
361 |         print("[DEBUG] Executing vector search in database...")
362 |         
363 |         # Build parameters for RPC call
364 |         params = {
365 |             'query_embedding': query_embedding,
366 |             'match_count': match_count * 3 if source_id_filter else match_count  # Get more results if we need to filter
367 |         }
368 |         
369 |         # Add source filter to RPC (supported by the stored procedure as 'source_filter')
370 |         if source_id_filter:
371 |             params['source_filter'] = source_id_filter  # Correct parameter name from SQL function
372 |             print(f"[DEBUG] Using source_filter parameter: '{source_id_filter}'")
373 |         
374 |         # Add metadata filter if provided (for backward compatibility)
375 |         if filter_metadata:
376 |             params['filter'] = filter_metadata
377 |             print(f"[DEBUG] Using metadata filter: {filter_metadata}")
378 |         
379 |         # Debug log the RPC parameters
380 |         print(f"[DEBUG] RPC params keys: {params.keys()}")
381 |         
382 |         result = client.rpc('match_crawled_pages', params).execute()
383 |         
384 |         if timeout_event.is_set():
385 |             raise TimeoutError("Vector search timed out")
386 |         
387 |         if result and result.data:
388 |             print(f"[DEBUG] Vector search returned {len(result.data)} results before filtering")
389 |             
390 |             # If source_id_filter is specified and we got results, filter them
391 |             if source_id_filter and result.data:
392 |                 # Filter results by source_id
393 |                 filtered_results = []
394 |                 for item in result.data:
395 |                     # Check if source_id matches (handle both top-level and metadata locations)
396 |                     item_source_id = item.get('source_id', '')
397 |                     if not item_source_id and 'metadata' in item and isinstance(item['metadata'], dict):
398 |                         item_source_id = item['metadata'].get('source', '')
399 |                     
400 |                     if item_source_id == source_id_filter:
401 |                         filtered_results.append(item)
402 |                         if len(filtered_results) >= match_count:
403 |                             break
404 |                 
405 |                 print(f"[SUCCESS] Vector search completed: {len(filtered_results)} results after source filtering")
406 |                 return filtered_results[:match_count]
407 |             else:
408 |                 print(f"[SUCCESS] Vector search completed: {len(result.data)} results")
409 |                 return result.data[:match_count]
410 |         else:
411 |             print("[WARNING] Vector search returned no results")
412 |             print(f"[DEBUG] RPC response data: {result.data if result else 'No result object'}")
413 |             return []
414 |             
415 |     except TimeoutError as e:
416 |         print(f"[ERROR] Vector search timed out: {e}")
417 |         return []
418 |     except Exception as e:
419 |         print(f"[ERROR] Error searching documents: {e}")
420 |         print(f"[DEBUG] Exception type: {type(e).__name__}")
421 |         import traceback
422 |         print(f"[DEBUG] Traceback: {traceback.format_exc()}")
423 |         return []
424 |     finally:
425 |         # Cancel the timer
426 |         timer.cancel()
427 | 
428 | 
429 | def extract_code_blocks(markdown_content: str, min_length: int = 1000) -> List[Dict[str, Any]]:
430 |     """
431 |     Extract code blocks from markdown content along with context.
432 |     
433 |     Args:
434 |         markdown_content: The markdown content to extract code blocks from
435 |         min_length: Minimum length of code blocks to extract (default: 1000 characters)
436 |         
437 |     Returns:
438 |         List of dictionaries containing code blocks and their context
439 |     """
440 |     code_blocks = []
441 |     
442 |     # Skip if content starts with triple backticks (edge case for files wrapped in backticks)
443 |     content = markdown_content.strip()
444 |     start_offset = 0
445 |     if content.startswith('```'):
446 |         # Skip past the opening triple backticks in the raw content
447 |         start_offset = markdown_content.find('```') + 3
448 |         print("Skipping initial triple backticks")
449 |     
450 |     # Find all occurrences of triple backticks
451 |     backtick_positions = []
452 |     pos = start_offset
453 |     while True:
454 |         pos = markdown_content.find('```', pos)
455 |         if pos == -1:
456 |             break
457 |         backtick_positions.append(pos)
458 |         pos += 3
459 |     
460 |     # Process pairs of backticks
461 |     i = 0
462 |     while i < len(backtick_positions) - 1:
463 |         start_pos = backtick_positions[i]
464 |         end_pos = backtick_positions[i + 1]
465 |         
466 |         # Extract the content between backticks
467 |         code_section = markdown_content[start_pos+3:end_pos]
468 |         
469 |         # Check if there's a language specifier on the first line
470 |         lines = code_section.split('\n', 1)
471 |         if len(lines) > 1:
472 |             # Check if first line is a language specifier (no spaces, common language names)
473 |             first_line = lines[0].strip()
474 |             if first_line and ' ' not in first_line and len(first_line) < 20:
475 |                 language = first_line
476 |                 code_content = lines[1].strip() if len(lines) > 1 else ""
477 |             else:
478 |                 language = ""
479 |                 code_content = code_section.strip()
480 |         else:
481 |             language = ""
482 |             code_content = code_section.strip()
483 |         
484 |         # Skip if code block is too short
485 |         if len(code_content) < min_length:
486 |             i += 2  # Move to next pair
487 |             continue
488 |         
489 |         # Extract context before (1000 chars)
490 |         context_start = max(0, start_pos - 1000)
491 |         context_before = markdown_content[context_start:start_pos].strip()
492 |         
493 |         # Extract context after (1000 chars)
494 |         context_end = min(len(markdown_content), end_pos + 3 + 1000)
495 |         context_after = markdown_content[end_pos + 3:context_end].strip()
496 |         
497 |         code_blocks.append({
498 |             'code': code_content,
499 |             'language': language,
500 |             'context_before': context_before,
501 |             'context_after': context_after,
502 |             'full_context': f"{context_before}\n\n{code_content}\n\n{context_after}"
503 |         })
504 |         
505 |         # Move to next pair (skip the closing backtick we just processed)
506 |         i += 2
507 |     
508 |     return code_blocks
509 | 
510 | 
511 | def generate_code_example_summary(code: str, context_before: str, context_after: str) -> str:
512 |     """
513 |     Generate a summary for a code example using its surrounding context.
514 |     
515 |     Args:
516 |         code: The code example
517 |         context_before: Context before the code
518 |         context_after: Context after the code
519 |         
520 |     Returns:
521 |         A summary of what the code example demonstrates
522 |     """
523 |     model_choice = os.getenv("MODEL_CHOICE")
524 |     
525 |     # Create the prompt
526 |     prompt = f"""<context_before>
527 | {context_before[-500:] if len(context_before) > 500 else context_before}
528 | </context_before>
529 | 
530 | <code_example>
531 | {code[:1500] if len(code) > 1500 else code}
532 | </code_example>
533 | 
534 | <context_after>
535 | {context_after[:500] if len(context_after) > 500 else context_after}
536 | </context_after>
537 | 
538 | Based on the code example and its surrounding context, provide a concise summary (2-3 sentences) that describes what this code example demonstrates and its purpose. Focus on the practical application and key concepts illustrated.
539 | """
540 |     
541 |     try:
542 |         response = openai.chat.completions.create(
543 |             model=model_choice,
544 |             messages=[
545 |                 {"role": "system", "content": "You are a helpful assistant that provides concise code example summaries."},
546 |                 {"role": "user", "content": prompt}
547 |             ],
548 |             temperature=0.3,
549 |             max_tokens=100
550 |         )
551 |         
552 |         return response.choices[0].message.content.strip()
553 |     
554 |     except Exception as e:
555 |         print(f"Error generating code example summary: {e}")
556 |         return "Code example for demonstration purposes."
557 | 
558 | 
559 | def add_code_examples_to_supabase(
560 |     client: Client,
561 |     urls: List[str],
562 |     chunk_numbers: List[int],
563 |     code_examples: List[str],
564 |     summaries: List[str],
565 |     metadatas: List[Dict[str, Any]],
566 |     batch_size: int = 20
567 | ):
568 |     """
569 |     Add code examples to the Supabase code_examples table in batches.
570 |     
571 |     Args:
572 |         client: Supabase client
573 |         urls: List of URLs
574 |         chunk_numbers: List of chunk numbers
575 |         code_examples: List of code example contents
576 |         summaries: List of code example summaries
577 |         metadatas: List of metadata dictionaries
578 |         batch_size: Size of each batch for insertion
579 |     """
580 |     if not urls:
581 |         return
582 |         
583 |     # Delete existing records for these URLs
584 |     unique_urls = list(set(urls))
585 |     for url in unique_urls:
586 |         try:
587 |             client.table('code_examples').delete().eq('url', url).execute()
588 |         except Exception as e:
589 |             print(f"Error deleting existing code examples for {url}: {e}")
590 |     
591 |     # Process in batches
592 |     total_items = len(urls)
593 |     for i in range(0, total_items, batch_size):
594 |         batch_end = min(i + batch_size, total_items)
595 |         batch_texts = []
596 |         
597 |         # Create combined texts for embedding (code + summary)
598 |         for j in range(i, batch_end):
599 |             combined_text = f"{code_examples[j]}\n\nSummary: {summaries[j]}"
600 |             batch_texts.append(combined_text)
601 |         
602 |         # Create embeddings for the batch
603 |         embeddings = create_embeddings_batch(batch_texts)
604 |         
605 |         # Check if embeddings are valid (not all zeros)
606 |         valid_embeddings = []
607 |         for idx, embedding in enumerate(embeddings):
608 |             if embedding and not all(v == 0.0 for v in embedding):
609 |                 valid_embeddings.append(embedding)
610 |             else:
611 |                 print("Warning: Zero or invalid embedding detected, creating new one...")
612 |                 # Retry this text individually as a fallback
613 |                 single_embedding = create_embedding(batch_texts[idx])
614 |                 valid_embeddings.append(single_embedding)
615 |         
616 |         # Prepare batch data
617 |         batch_data = []
618 |         for j, embedding in enumerate(valid_embeddings):
619 |             idx = i + j
620 |             
621 |             # Extract source_id from URL
622 |             parsed_url = urlparse(urls[idx])
623 |             source_id = parsed_url.netloc or parsed_url.path
624 |             
625 |             batch_data.append({
626 |                 'url': urls[idx],
627 |                 'chunk_number': chunk_numbers[idx],
628 |                 'content': code_examples[idx],
629 |                 'summary': summaries[idx],
630 |                 'metadata': metadatas[idx],  # Store as JSON object, not string
631 |                 'source_id': source_id,
632 |                 'embedding': embedding
633 |             })
634 |         
635 |         # Insert batch into Supabase with retry logic
636 |         max_retries = 3
637 |         retry_delay = 1.0  # Start with 1 second delay
638 |         
639 |         for retry in range(max_retries):
640 |             try:
641 |                 client.table('code_examples').insert(batch_data).execute()
642 |                 # Success - break out of retry loop
643 |                 break
644 |             except Exception as e:
645 |                 if retry < max_retries - 1:
646 |                     print(f"Error inserting batch into Supabase (attempt {retry + 1}/{max_retries}): {e}")
647 |                     print(f"Retrying in {retry_delay} seconds...")
648 |                     time.sleep(retry_delay)
649 |                     retry_delay *= 2  # Exponential backoff
650 |                 else:
651 |                     # Final attempt failed
652 |                     print(f"Failed to insert batch after {max_retries} attempts: {e}")
653 |                     # Fall back to inserting records one by one as a last resort
654 |                     print("Attempting to insert records individually...")
655 |                     successful_inserts = 0
656 |                     for record in batch_data:
657 |                         try:
658 |                             client.table('code_examples').insert(record).execute()
659 |                             successful_inserts += 1
660 |                         except Exception as individual_error:
661 |                             print(f"Failed to insert individual record for URL {record['url']}: {individual_error}")
662 |                     
663 |                     if successful_inserts > 0:
664 |                         print(f"Successfully inserted {successful_inserts}/{len(batch_data)} records individually")
665 |         print(f"Inserted batch {i//batch_size + 1} of {(total_items + batch_size - 1)//batch_size} code examples")
666 | 
667 | 
668 | def update_source_info(client: Client, source_id: str, summary: str, word_count: int):
669 |     """
670 |     Update or insert source information in the sources table.
671 |     
672 |     Args:
673 |         client: Supabase client
674 |         source_id: The source ID (domain)
675 |         summary: Summary of the source
676 |         word_count: Total word count for the source
677 |     """
678 |     try:
679 |         # Try to update existing source
680 |         result = client.table('sources').update({
681 |             'summary': summary,
682 |             'total_word_count': word_count,
683 |             'updated_at': 'now()'
684 |         }).eq('source_id', source_id).execute()
685 |         
686 |         # If no rows were updated, insert new source
687 |         if not result.data:
688 |             client.table('sources').insert({
689 |                 'source_id': source_id,
690 |                 'summary': summary,
691 |                 'total_word_count': word_count
692 |             }).execute()
693 |             print(f"Created new source: {source_id}")
694 |         else:
695 |             print(f"Updated source: {source_id}")
696 |             
697 |     except Exception as e:
698 |         print(f"Error updating source {source_id}: {e}")
699 | 
700 | 
701 | def extract_source_summary(source_id: str, content: str, max_length: int = 500) -> str:
702 |     """
703 |     Extract a summary for a source from its content using an LLM.
704 |     
705 |     This function uses the OpenAI API to generate a concise summary of the source content.
706 |     
707 |     Args:
708 |         source_id: The source ID (domain)
709 |         content: The content to extract a summary from
710 |         max_length: Maximum length of the summary
711 |         
712 |     Returns:
713 |         A summary string
714 |     """
715 |     # Default summary if we can't extract anything meaningful
716 |     default_summary = f"Content from {source_id}"
717 |     
718 |     if not content or len(content.strip()) == 0:
719 |         return default_summary
720 |     
721 |     # Get the model choice from environment variables
722 |     model_choice = os.getenv("MODEL_CHOICE")
723 |     
724 |     # Limit content length to avoid token limits
725 |     truncated_content = content[:25000] if len(content) > 25000 else content
726 |     
727 |     # Create the prompt for generating the summary
728 |     prompt = f"""<source_content>
729 | {truncated_content}
730 | </source_content>
731 | 
732 | The above content is from the documentation for '{source_id}'. Please provide a concise summary (3-5 sentences) that describes what this library/tool/framework is about. The summary should help a reader understand what the library/tool/framework accomplishes and its purpose.
733 | """
734 |     
735 |     try:
736 |         # Call the OpenAI API to generate the summary
737 |         response = openai.chat.completions.create(
738 |             model=model_choice,
739 |             messages=[
740 |                 {"role": "system", "content": "You are a helpful assistant that provides concise library/tool/framework summaries."},
741 |                 {"role": "user", "content": prompt}
742 |             ],
743 |             temperature=0.3,
744 |             max_tokens=150
745 |         )
746 |         
747 |         # Extract the generated summary
748 |         summary = response.choices[0].message.content.strip()
749 |         
750 |         # Ensure the summary is not too long
751 |         if len(summary) > max_length:
752 |             summary = summary[:max_length] + "..."
753 |             
754 |         return summary
755 |     
756 |     except Exception as e:
757 |         print(f"Error generating summary with LLM for {source_id}: {e}. Using default summary.")
758 |         return default_summary
759 | 
760 | 
761 | def search_code_examples(
762 |     client: Client,
763 |     query: str,
764 |     match_count: int = 10,
765 |     filter_metadata: Optional[Dict[str, Any]] = None,
766 |     source_id: Optional[str] = None
767 | ) -> List[Dict[str, Any]]:
768 |     """
769 |     Search for code examples in Supabase using vector similarity.
770 |     
771 |     Args:
772 |         client: Supabase client
773 |         query: Query text
774 |         match_count: Maximum number of results to return
775 |         filter_metadata: Optional metadata filter
776 |         source_id: Optional source ID to filter results
777 |         
778 |     Returns:
779 |         List of matching code examples
780 |     """
781 |     import threading
782 |     
783 |     # Use threading.Timer for timeout instead of signal (works in threads)
784 |     timeout_event = threading.Event()
785 |     
786 |     def set_timeout():
787 |         timeout_event.set()
788 |     
789 |     timer = threading.Timer(25.0, set_timeout)
790 |     timer.start()
791 |     
792 |     try:
793 |         print(f"[DEBUG] Creating enhanced embedding for code query: '{query[:50]}...'")
794 |         # Create a more descriptive query for better embedding match
795 |         # Since code examples are embedded with their summaries, we should make the query more descriptive
796 |         enhanced_query = f"Code example for {query}\n\nSummary: Example code showing {query}"
797 |         
798 |         # Create embedding for the enhanced query
799 |         query_embedding = create_embedding(enhanced_query)
800 |         
801 |         if not query_embedding or all(v == 0.0 for v in query_embedding):
802 |             print("[ERROR] Failed to create valid embedding for code search")
803 |             return []
804 |         
805 |         if timeout_event.is_set():
806 |             raise TimeoutError("Embedding creation timed out")
807 |         
808 |         print("[DEBUG] Executing code example search in database...")
809 |         # Execute the search using the match_code_examples function
810 |         params = {
811 |             'query_embedding': query_embedding,
812 |             'match_count': match_count * 3 if source_id else match_count  # Get more results if we need to filter
813 |         }
814 |         
815 |         # Only add the filter if it's actually provided and not empty
816 |         if filter_metadata:
817 |             params['filter'] = filter_metadata
818 |             print(f"[DEBUG] Using metadata filter: {filter_metadata}")
819 |             
820 |         # Add source filter if provided (using correct parameter name from SQL function)
821 |         if source_id:
822 |             params['source_filter'] = source_id  # Correct parameter name from SQL function
823 |             print(f"[DEBUG] Using source_filter parameter: '{source_id}'")
824 |         
825 |         result = client.rpc('match_code_examples', params).execute()
826 |         
827 |         if timeout_event.is_set():
828 |             raise TimeoutError("Code search timed out")
829 |         
830 |         if result and result.data:
831 |             print(f"[DEBUG] Code example search returned {len(result.data)} results before filtering")
832 |             
833 |             # If source_id is specified and we got results, filter them
834 |             if source_id and result.data:
835 |                 # Filter results by source_id
836 |                 filtered_results = []
837 |                 for item in result.data:
838 |                     # Check if source_id matches
839 |                     item_source_id = item.get('source_id', '')
840 |                     if item_source_id == source_id:
841 |                         filtered_results.append(item)
842 |                         if len(filtered_results) >= match_count:
843 |                             break
844 |                 
845 |                 print(f"[SUCCESS] Code example search completed: {len(filtered_results)} results after source filtering")
846 |                 return filtered_results[:match_count]
847 |             else:
848 |                 print(f"[SUCCESS] Code example search completed: {len(result.data)} results")
849 |                 return result.data[:match_count]
850 |         else:
851 |             print("[WARNING] Code example search returned no results")
852 |             return []
853 |             
854 |     except TimeoutError as e:
855 |         print(f"[ERROR] Code example search timed out: {e}")
856 |         return []
857 |     except Exception as e:
858 |         print(f"[ERROR] Error searching code examples: {e}")
859 |         print(f"[DEBUG] Exception type: {type(e).__name__}")
860 |         import traceback
861 |         print(f"[DEBUG] Traceback: {traceback.format_exc()}")
862 |         return []
863 |     finally:
864 |         # Cancel the timer
865 |         timer.cancel()
```
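
Usage sketch for `add_code_examples_to_supabase` (illustrative only, not part of the repository): it shows how the five parallel lists are expected to stay index-aligned before batching. The sample URLs, snippets, and metadata are made up, and the sketch assumes the module is importable as `utils` and that a Supabase client helper (called `get_supabase_client` here) is available elsewhere in it.

```python
# Illustrative sketch only -- not part of the repository.
# Assumes `utils` is importable and that a client helper named
# `get_supabase_client` exists (hypothetical name).
from utils import add_code_examples_to_supabase, get_supabase_client

client = get_supabase_client()

# All five lists must stay index-aligned: element i of each list
# describes the same code example.
urls = ["https://docs.example.com/quickstart"] * 2
chunk_numbers = [0, 1]
code_examples = [
    "print('hello world')",
    "import requests\nrequests.get('https://example.com')",
]
summaries = [
    "Minimal hello-world snippet from the quickstart page.",
    "Basic HTTP GET request using the requests library.",
]
metadatas = [{"language": "python", "char_count": len(c)} for c in code_examples]

# Existing rows for these URLs are deleted first, then each example is
# embedded together with its summary and inserted in batches.
add_code_examples_to_supabase(
    client, urls, chunk_numbers, code_examples, summaries, metadatas, batch_size=20
)
```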
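
Usage sketch for the source bookkeeping pair (illustrative only): `extract_source_summary` asks the LLM configured via `MODEL_CHOICE` for a 3-5 sentence summary and falls back to a default string on failure, and `update_source_info` upserts that summary into the `sources` table. The page text and word count below are placeholders.

```python
# Illustrative sketch only -- assumes OPENAI_API_KEY and MODEL_CHOICE are set
# in the environment (see .env.example), and a hypothetical
# `get_supabase_client` helper.
from utils import extract_source_summary, update_source_info, get_supabase_client

client = get_supabase_client()

source_id = "docs.example.com"          # e.g. urlparse(url).netloc from the crawl path
page_text = "Example documentation content for the source..."
word_count = len(page_text.split())

# Returns an LLM-generated summary, or "Content from <source_id>" if the call fails.
summary = extract_source_summary(source_id, page_text)

# Updates the existing sources row for source_id, or inserts a new one if none exists.
update_source_info(client, source_id, summary, word_count)
```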
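
Usage sketch for `search_code_examples` (illustrative only): the function rewrites the query into a more descriptive form before embedding it, over-fetches (3x `match_count`) when a source filter is supplied, then filters client-side on `source_id`. The source value below is made up, and the result field names assume the `match_code_examples` RPC returns the columns inserted above (`url`, `summary`, ...).

```python
# Illustrative sketch only -- assumes the match_code_examples RPC from
# crawled_pages.sql exists in the Supabase project, plus a hypothetical
# `get_supabase_client` helper.
from utils import search_code_examples, get_supabase_client

client = get_supabase_client()

results = search_code_examples(
    client,
    query="async web crawling with crawl4ai",
    match_count=5,
    source_id="docs.example.com",   # optional: restrict results to one source
)

for r in results:
    # Field names assume the RPC echoes the stored columns.
    print(r["url"], "-", r["summary"][:80])
```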