This is page 2 of 2. Use http://codebase.md/coleam00/mcp-crawl4ai-rag?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .dockerignore
├── .env.example
├── .gitattributes
├── .gitignore
├── crawled_pages.sql
├── Dockerfile
├── knowledge_graphs
│ ├── ai_hallucination_detector.py
│ ├── ai_script_analyzer.py
│ ├── hallucination_reporter.py
│ ├── knowledge_graph_validator.py
│ ├── parse_repo_into_neo4j.py
│ ├── query_knowledge_graph.py
│ └── test_script.py
├── LICENSE
├── pyproject.toml
├── README.md
├── src
│ ├── crawl4ai_mcp.py
│ └── utils.py
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/knowledge_graphs/knowledge_graph_validator.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Knowledge Graph Validator
3 |
4 | Validates AI-generated code against Neo4j knowledge graph containing
5 | repository information. Checks imports, methods, attributes, and parameters.
6 | """
7 |
8 | import asyncio
9 | import logging
10 | from typing import Dict, List, Optional, Set, Tuple, Any
11 | from dataclasses import dataclass, field
12 | from enum import Enum
13 | from neo4j import AsyncGraphDatabase
14 |
15 | from ai_script_analyzer import (
16 | AnalysisResult, ImportInfo, MethodCall, AttributeAccess,
17 | FunctionCall, ClassInstantiation
18 | )
19 |
20 | logger = logging.getLogger(__name__)
21 |
22 |
class ValidationStatus(Enum):
    """Outcome categories for a single validation check."""
    VALID = "VALID"          # confirmed against the knowledge graph
    INVALID = "INVALID"      # contradicts the knowledge graph (e.g. bad parameters)
    UNCERTAIN = "UNCERTAIN"  # could not be verified (external module, unknown type)
    NOT_FOUND = "NOT_FOUND"  # looked up in the graph but absent
28 |
29 |
@dataclass
class ValidationResult:
    """Result of validating a single element"""
    status: ValidationStatus
    confidence: float  # 0.0 to 1.0
    message: str  # human-readable explanation of the status
    details: Dict[str, Any] = field(default_factory=dict)  # extra structured context (e.g. matched files)
    suggestions: List[str] = field(default_factory=list)  # e.g. similar names / valid parameters
38 |
39 |
@dataclass
class ImportValidation:
    """Validation result for an import"""
    import_info: ImportInfo
    validation: ValidationResult
    # Populated only when the module was found in the knowledge graph.
    available_classes: List[str] = field(default_factory=list)
    available_functions: List[str] = field(default_factory=list)
47 |
48 |
@dataclass
class MethodValidation:
    """Validation result for a method call"""
    method_call: MethodCall
    validation: ValidationResult
    expected_params: List[str] = field(default_factory=list)  # signature recorded in the graph
    actual_params: List[str] = field(default_factory=list)  # positional args + keyword names at call site
    # None when no parameter check was performed (e.g. method not found).
    parameter_validation: Optional[ValidationResult] = None
57 |
58 |
@dataclass
class AttributeValidation:
    """Validation result for attribute access"""
    attribute_access: AttributeAccess
    validation: ValidationResult
    # Declared type of the attribute when known; "method" when the name
    # resolved to a method (decorator-style access).
    expected_type: Optional[str] = None
65 |
66 |
@dataclass
class FunctionValidation:
    """Validation result for function call"""
    function_call: FunctionCall
    validation: ValidationResult
    expected_params: List[str] = field(default_factory=list)  # signature recorded in the graph
    actual_params: List[str] = field(default_factory=list)  # positional args + keyword names at call site
    # None when no parameter check was performed (e.g. function not found).
    parameter_validation: Optional[ValidationResult] = None
75 |
76 |
@dataclass
class ClassValidation:
    """Validation result for class instantiation"""
    class_instantiation: ClassInstantiation
    validation: ValidationResult
    constructor_params: List[str] = field(default_factory=list)  # recorded __init__ parameters
    # None when no parameter check was performed (e.g. class not found).
    parameter_validation: Optional[ValidationResult] = None
84 |
85 |
@dataclass
class ScriptValidationResult:
    """Complete validation results for a script"""
    script_path: str
    analysis_result: AnalysisResult  # the AST analysis this validation was derived from
    import_validations: List[ImportValidation] = field(default_factory=list)
    class_validations: List[ClassValidation] = field(default_factory=list)
    method_validations: List[MethodValidation] = field(default_factory=list)
    attribute_validations: List[AttributeValidation] = field(default_factory=list)
    function_validations: List[FunctionValidation] = field(default_factory=list)
    overall_confidence: float = 0.0  # aggregate over all checks; computed after validation
    hallucinations_detected: List[Dict[str, Any]] = field(default_factory=list)
98 |
99 |
100 | class KnowledgeGraphValidator:
101 | """Validates code against Neo4j knowledge graph"""
102 |
    def __init__(self, neo4j_uri: str, neo4j_user: str, neo4j_password: str):
        """Store connection settings; no connection is opened until initialize()."""
        self.neo4j_uri = neo4j_uri
        self.neo4j_user = neo4j_user
        self.neo4j_password = neo4j_password
        self.driver = None  # async Neo4j driver, created in initialize()

        # Cache for performance (avoid repeated Neo4j round-trips)
        self.module_cache: Dict[str, List[str]] = {}  # module_name -> matching file paths
        # NOTE(review): class_cache is never written in this excerpt — confirm it is used.
        self.class_cache: Dict[str, Dict[str, Any]] = {}
        self.method_cache: Dict[str, List[Dict[str, Any]]] = {}  # "Class.method" -> method info
        self.repo_cache: Dict[str, str] = {}  # module_name -> repo_name
        self.knowledge_graph_modules: Set[str] = set()  # Track modules in knowledge graph
115 |
    async def initialize(self):
        """Initialize Neo4j connection"""
        # NOTE(review): driver construction does not appear to verify
        # connectivity; auth/connection errors would surface on first query —
        # confirm that is the intended behavior.
        self.driver = AsyncGraphDatabase.driver(
            self.neo4j_uri,
            auth=(self.neo4j_user, self.neo4j_password)
        )
        logger.info("Knowledge graph validator initialized")
123 |
    async def close(self):
        """Close Neo4j connection (no-op if initialize() was never called)."""
        if self.driver:
            await self.driver.close()
128 |
    async def validate_script(self, analysis_result: AnalysisResult) -> ScriptValidationResult:
        """Validate entire script analysis against knowledge graph.

        Order matters: imports are validated first because they populate
        module caches and `knowledge_graph_modules`, which the subsequent
        class/method/attribute/function checks consult to decide whether an
        element belongs to the knowledge graph.

        Note: `_calculate_overall_confidence` and `_detect_hallucinations`
        are defined elsewhere in this class (outside this excerpt).
        """
        result = ScriptValidationResult(
            script_path=analysis_result.file_path,
            analysis_result=analysis_result
        )

        # Validate imports first (builds context for other validations)
        result.import_validations = await self._validate_imports(analysis_result.imports)

        # Validate class instantiations
        result.class_validations = await self._validate_class_instantiations(
            analysis_result.class_instantiations
        )

        # Validate method calls
        result.method_validations = await self._validate_method_calls(
            analysis_result.method_calls
        )

        # Validate attribute accesses
        result.attribute_validations = await self._validate_attribute_accesses(
            analysis_result.attribute_accesses
        )

        # Validate function calls
        result.function_validations = await self._validate_function_calls(
            analysis_result.function_calls
        )

        # Calculate overall confidence and detect hallucinations
        result.overall_confidence = self._calculate_overall_confidence(result)
        result.hallucinations_detected = self._detect_hallucinations(result)

        return result
164 |
165 | async def _validate_imports(self, imports: List[ImportInfo]) -> List[ImportValidation]:
166 | """Validate all imports against knowledge graph"""
167 | validations = []
168 |
169 | for import_info in imports:
170 | validation = await self._validate_single_import(import_info)
171 | validations.append(validation)
172 |
173 | return validations
174 |
    async def _validate_single_import(self, import_info: ImportInfo) -> ImportValidation:
        """Validate a single import.

        A module that matches files in the knowledge graph is marked VALID and
        remembered in `knowledge_graph_modules` (presumably consulted by
        `_is_from_knowledge_graph`, defined outside this excerpt — confirm).
        Anything else is treated as an external library (UNCERTAIN), not as an
        error.
        """
        # Determine module to search for: for "from X import Y" validate X;
        # for "import X" validate X itself.
        search_module = import_info.module if import_info.is_from_import else import_info.name

        # Check cache first
        if search_module in self.module_cache:
            available_files = self.module_cache[search_module]
        else:
            # Query Neo4j for matching modules (empty list is cached too)
            available_files = await self._find_modules(search_module)
            self.module_cache[search_module] = available_files

        if available_files:
            # Get available classes and functions from the module
            classes, functions = await self._get_module_contents(search_module)

            # Track this module as being in the knowledge graph
            self.knowledge_graph_modules.add(search_module)

            # Also track the base module for "from X.Y.Z import ..." patterns
            if '.' in search_module:
                base_module = search_module.split('.')[0]
                self.knowledge_graph_modules.add(base_module)

            validation = ValidationResult(
                status=ValidationStatus.VALID,
                confidence=0.9,
                message=f"Module '{search_module}' found in knowledge graph",
                details={"matched_files": available_files, "in_knowledge_graph": True}
            )

            return ImportValidation(
                import_info=import_info,
                validation=validation,
                available_classes=classes,
                available_functions=functions
            )
        else:
            # External library - mark as such but don't treat as error
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.8,  # High confidence it's external, not an error
                message=f"Module '{search_module}' is external (not in knowledge graph)",
                details={"could_be_external": True, "in_knowledge_graph": False}
            )

            return ImportValidation(
                import_info=import_info,
                validation=validation
            )
226 |
227 | async def _validate_class_instantiations(self, instantiations: List[ClassInstantiation]) -> List[ClassValidation]:
228 | """Validate class instantiations"""
229 | validations = []
230 |
231 | for instantiation in instantiations:
232 | validation = await self._validate_single_class_instantiation(instantiation)
233 | validations.append(validation)
234 |
235 | return validations
236 |
    async def _validate_single_class_instantiation(self, instantiation: ClassInstantiation) -> ClassValidation:
        """Validate a single class instantiation.

        Resolves the class in the knowledge graph, then checks the call's
        arguments against the recorded __init__ signature when one exists.
        Classes not attributed to the knowledge graph are skipped (UNCERTAIN)
        rather than flagged.
        """
        # Prefer the fully qualified name when the analyzer resolved one.
        class_name = instantiation.full_class_name or instantiation.class_name

        # Skip validation for classes not from knowledge graph
        # (_is_from_knowledge_graph is defined outside this excerpt)
        if not self._is_from_knowledge_graph(class_name):
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.8,
                message=f"Skipping validation: '{class_name}' is not from knowledge graph"
            )
            return ClassValidation(
                class_instantiation=instantiation,
                validation=validation
            )

        # Find class in knowledge graph
        class_info = await self._find_class(class_name)

        if not class_info:
            validation = ValidationResult(
                status=ValidationStatus.NOT_FOUND,
                confidence=0.2,
                message=f"Class '{class_name}' not found in knowledge graph"
            )
            return ClassValidation(
                class_instantiation=instantiation,
                validation=validation
            )

        # Check constructor parameters (look for __init__ method)
        init_method = await self._find_method(class_name, "__init__")

        if init_method:
            param_validation = self._validate_parameters(
                expected_params=init_method.get('params_list', []),
                provided_args=instantiation.args,
                provided_kwargs=instantiation.kwargs
            )
        else:
            # No recorded constructor: neither confirm nor reject the call.
            param_validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.5,
                message="Constructor parameters not found"
            )

        # Use parameter validation result if it failed
        if param_validation.status == ValidationStatus.INVALID:
            validation = ValidationResult(
                status=ValidationStatus.INVALID,
                confidence=param_validation.confidence,
                message=f"Class '{class_name}' found but has invalid constructor parameters: {param_validation.message}",
                suggestions=param_validation.suggestions
            )
        else:
            validation = ValidationResult(
                status=ValidationStatus.VALID,
                confidence=0.8,
                message=f"Class '{class_name}' found in knowledge graph"
            )

        return ClassValidation(
            class_instantiation=instantiation,
            validation=validation,
            parameter_validation=param_validation
        )
303 |
304 | async def _validate_method_calls(self, method_calls: List[MethodCall]) -> List[MethodValidation]:
305 | """Validate method calls"""
306 | validations = []
307 |
308 | for method_call in method_calls:
309 | validation = await self._validate_single_method_call(method_call)
310 | validations.append(validation)
311 |
312 | return validations
313 |
    async def _validate_single_method_call(self, method_call: MethodCall) -> MethodValidation:
        """Validate a single method call.

        Requires the analyzer to have inferred the receiver's type
        (method_call.object_type); without it the call is UNCERTAIN. Types not
        attributed to the knowledge graph are skipped rather than flagged.
        """
        class_type = method_call.object_type

        if not class_type:
            # Receiver type unknown — cannot look anything up.
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.3,
                message=f"Cannot determine object type for '{method_call.object_name}'"
            )
            return MethodValidation(
                method_call=method_call,
                validation=validation
            )

        # Skip validation for classes not from knowledge graph
        if not self._is_from_knowledge_graph(class_type):
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.8,
                message=f"Skipping validation: '{class_type}' is not from knowledge graph"
            )
            return MethodValidation(
                method_call=method_call,
                validation=validation
            )

        # Find method in knowledge graph
        method_info = await self._find_method(class_type, method_call.method_name)

        if not method_info:
            # Check for similar method names to offer as suggestions
            # (_find_similar_methods is defined outside this excerpt)
            similar_methods = await self._find_similar_methods(class_type, method_call.method_name)

            validation = ValidationResult(
                status=ValidationStatus.NOT_FOUND,
                confidence=0.1,
                message=f"Method '{method_call.method_name}' not found on class '{class_type}'",
                suggestions=similar_methods
            )
            return MethodValidation(
                method_call=method_call,
                validation=validation
            )

        # Validate parameters against the recorded signature
        expected_params = method_info.get('params_list', [])
        param_validation = self._validate_parameters(
            expected_params=expected_params,
            provided_args=method_call.args,
            provided_kwargs=method_call.kwargs
        )

        # Use parameter validation result if it failed
        if param_validation.status == ValidationStatus.INVALID:
            validation = ValidationResult(
                status=ValidationStatus.INVALID,
                confidence=param_validation.confidence,
                message=f"Method '{method_call.method_name}' found but has invalid parameters: {param_validation.message}",
                suggestions=param_validation.suggestions
            )
        else:
            validation = ValidationResult(
                status=ValidationStatus.VALID,
                confidence=0.9,
                message=f"Method '{method_call.method_name}' found on class '{class_type}'"
            )

        return MethodValidation(
            method_call=method_call,
            validation=validation,
            expected_params=expected_params,
            actual_params=method_call.args + list(method_call.kwargs.keys()),
            parameter_validation=param_validation
        )
389 |
390 | async def _validate_attribute_accesses(self, attribute_accesses: List[AttributeAccess]) -> List[AttributeValidation]:
391 | """Validate attribute accesses"""
392 | validations = []
393 |
394 | for attr_access in attribute_accesses:
395 | validation = await self._validate_single_attribute_access(attr_access)
396 | validations.append(validation)
397 |
398 | return validations
399 |
    async def _validate_single_attribute_access(self, attr_access: AttributeAccess) -> AttributeValidation:
        """Validate a single attribute access.

        Falls back to a method lookup when the name is not a recorded
        attribute, so decorator-style usages like @agent.tool still validate.
        """
        class_type = attr_access.object_type

        if not class_type:
            # Receiver type unknown — cannot look anything up.
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.3,
                message=f"Cannot determine object type for '{attr_access.object_name}'"
            )
            return AttributeValidation(
                attribute_access=attr_access,
                validation=validation
            )

        # Skip validation for classes not from knowledge graph
        if not self._is_from_knowledge_graph(class_type):
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.8,
                message=f"Skipping validation: '{class_type}' is not from knowledge graph"
            )
            return AttributeValidation(
                attribute_access=attr_access,
                validation=validation
            )

        # Find attribute in knowledge graph
        # (_find_attribute is defined outside this excerpt)
        attr_info = await self._find_attribute(class_type, attr_access.attribute_name)

        if not attr_info:
            # If not found as attribute, check if it's a method (for decorators like @agent.tool)
            method_info = await self._find_method(class_type, attr_access.attribute_name)

            if method_info:
                validation = ValidationResult(
                    status=ValidationStatus.VALID,
                    confidence=0.8,
                    message=f"'{attr_access.attribute_name}' found as method on class '{class_type}' (likely used as decorator)"
                )
                return AttributeValidation(
                    attribute_access=attr_access,
                    validation=validation,
                    expected_type="method"
                )

            validation = ValidationResult(
                status=ValidationStatus.NOT_FOUND,
                confidence=0.2,
                message=f"'{attr_access.attribute_name}' not found on class '{class_type}'"
            )
            return AttributeValidation(
                attribute_access=attr_access,
                validation=validation
            )

        validation = ValidationResult(
            status=ValidationStatus.VALID,
            confidence=0.8,
            message=f"Attribute '{attr_access.attribute_name}' found on class '{class_type}'"
        )

        return AttributeValidation(
            attribute_access=attr_access,
            validation=validation,
            expected_type=attr_info.get('type')
        )
467 |
468 | async def _validate_function_calls(self, function_calls: List[FunctionCall]) -> List[FunctionValidation]:
469 | """Validate function calls"""
470 | validations = []
471 |
472 | for func_call in function_calls:
473 | validation = await self._validate_single_function_call(func_call)
474 | validations.append(validation)
475 |
476 | return validations
477 |
    async def _validate_single_function_call(self, func_call: FunctionCall) -> FunctionValidation:
        """Validate a single function call.

        NOTE(review): the knowledge-graph skip below only applies when the
        analyzer produced a qualified full_name; bare function names are
        always looked up, so user-defined local functions may come back
        NOT_FOUND — confirm that is intended.
        """
        func_name = func_call.full_name or func_call.function_name

        # Skip validation for functions not from knowledge graph
        if func_call.full_name and not self._is_from_knowledge_graph(func_call.full_name):
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.8,
                message=f"Skipping validation: '{func_name}' is not from knowledge graph"
            )
            return FunctionValidation(
                function_call=func_call,
                validation=validation
            )

        # Find function in knowledge graph
        # (_find_function is defined outside this excerpt)
        func_info = await self._find_function(func_name)

        if not func_info:
            validation = ValidationResult(
                status=ValidationStatus.NOT_FOUND,
                confidence=0.2,
                message=f"Function '{func_name}' not found in knowledge graph"
            )
            return FunctionValidation(
                function_call=func_call,
                validation=validation
            )

        # Validate parameters against the recorded signature
        expected_params = func_info.get('params_list', [])
        param_validation = self._validate_parameters(
            expected_params=expected_params,
            provided_args=func_call.args,
            provided_kwargs=func_call.kwargs
        )

        # Use parameter validation result if it failed
        if param_validation.status == ValidationStatus.INVALID:
            validation = ValidationResult(
                status=ValidationStatus.INVALID,
                confidence=param_validation.confidence,
                message=f"Function '{func_name}' found but has invalid parameters: {param_validation.message}",
                suggestions=param_validation.suggestions
            )
        else:
            validation = ValidationResult(
                status=ValidationStatus.VALID,
                confidence=0.8,
                message=f"Function '{func_name}' found in knowledge graph"
            )

        return FunctionValidation(
            function_call=func_call,
            validation=validation,
            expected_params=expected_params,
            actual_params=func_call.args + list(func_call.kwargs.keys()),
            parameter_validation=param_validation
        )
538 |
539 | def _validate_parameters(self, expected_params: List[str], provided_args: List[str],
540 | provided_kwargs: Dict[str, str]) -> ValidationResult:
541 | """Validate function/method parameters with comprehensive support"""
542 | if not expected_params:
543 | return ValidationResult(
544 | status=ValidationStatus.UNCERTAIN,
545 | confidence=0.5,
546 | message="Parameter information not available"
547 | )
548 |
549 | # Parse expected parameters - handle detailed format
550 | required_positional = []
551 | optional_positional = []
552 | keyword_only_required = []
553 | keyword_only_optional = []
554 | has_varargs = False
555 | has_varkwargs = False
556 |
557 | for param in expected_params:
558 | # Handle detailed format: "[keyword_only] name:type=default" or "name:type"
559 | param_clean = param.strip()
560 |
561 | # Check for parameter kind prefix
562 | kind = 'positional'
563 | if param_clean.startswith('['):
564 | end_bracket = param_clean.find(']')
565 | if end_bracket > 0:
566 | kind = param_clean[1:end_bracket]
567 | param_clean = param_clean[end_bracket+1:].strip()
568 |
569 | # Check for varargs/varkwargs
570 | if param_clean.startswith('*') and not param_clean.startswith('**'):
571 | has_varargs = True
572 | continue
573 | elif param_clean.startswith('**'):
574 | has_varkwargs = True
575 | continue
576 |
577 | # Parse name and check if optional
578 | if ':' in param_clean:
579 | param_name = param_clean.split(':')[0]
580 | is_optional = '=' in param_clean
581 |
582 | if kind == 'keyword_only':
583 | if is_optional:
584 | keyword_only_optional.append(param_name)
585 | else:
586 | keyword_only_required.append(param_name)
587 | else: # positional
588 | if is_optional:
589 | optional_positional.append(param_name)
590 | else:
591 | required_positional.append(param_name)
592 |
593 | # Count provided parameters
594 | provided_positional_count = len(provided_args)
595 | provided_keyword_names = set(provided_kwargs.keys())
596 |
597 | # Validate positional arguments
598 | min_required_positional = len(required_positional)
599 | max_allowed_positional = len(required_positional) + len(optional_positional)
600 |
601 | if not has_varargs and provided_positional_count > max_allowed_positional:
602 | return ValidationResult(
603 | status=ValidationStatus.INVALID,
604 | confidence=0.8,
605 | message=f"Too many positional arguments: provided {provided_positional_count}, max allowed {max_allowed_positional}"
606 | )
607 |
608 | if provided_positional_count < min_required_positional:
609 | return ValidationResult(
610 | status=ValidationStatus.INVALID,
611 | confidence=0.8,
612 | message=f"Too few positional arguments: provided {provided_positional_count}, required {min_required_positional}"
613 | )
614 |
615 | # Validate keyword arguments
616 | all_valid_kwarg_names = set(required_positional + optional_positional + keyword_only_required + keyword_only_optional)
617 | invalid_kwargs = provided_keyword_names - all_valid_kwarg_names
618 |
619 | if invalid_kwargs and not has_varkwargs:
620 | return ValidationResult(
621 | status=ValidationStatus.INVALID,
622 | confidence=0.7,
623 | message=f"Invalid keyword arguments: {list(invalid_kwargs)}",
624 | suggestions=[f"Valid parameters: {list(all_valid_kwarg_names)}"]
625 | )
626 |
627 | # Check required keyword-only arguments
628 | missing_required_kwargs = set(keyword_only_required) - provided_keyword_names
629 | if missing_required_kwargs:
630 | return ValidationResult(
631 | status=ValidationStatus.INVALID,
632 | confidence=0.8,
633 | message=f"Missing required keyword arguments: {list(missing_required_kwargs)}"
634 | )
635 |
636 | return ValidationResult(
637 | status=ValidationStatus.VALID,
638 | confidence=0.9,
639 | message="Parameters are valid"
640 | )
641 |
642 | # Neo4j Query Methods
643 |
    async def _find_modules(self, module_name: str) -> List[str]:
        """Find repository matching the module name, then return its files.

        Strategy: (1) rank repositories whose File.module_name matches the
        search term (exact, dotted prefix, or first dotted segment); (2) fall
        back to fuzzy Repository.name matching (case-insensitive, with '-'
        and '_' interchanged); then return up to 50 file paths from the single
        best-ranked repository. An empty list means "not in the knowledge
        graph" — callers treat the module as external.
        """
        async with self.driver.session() as session:
            # First, try to find files with module names that match or start with the search term
            module_query = """
            MATCH (r:Repository)-[:CONTAINS]->(f:File)
            WHERE f.module_name = $module_name
               OR f.module_name STARTS WITH $module_name + '.'
               OR split(f.module_name, '.')[0] = $module_name
            RETURN DISTINCT r.name as repo_name, count(f) as file_count
            ORDER BY file_count DESC
            LIMIT 5
            """

            result = await session.run(module_query, module_name=module_name)
            repos_from_modules = []
            async for record in result:
                repos_from_modules.append(record['repo_name'])

            # Also try repository name matching as fallback
            repo_query = """
            MATCH (r:Repository)
            WHERE toLower(r.name) = toLower($module_name)
               OR toLower(replace(r.name, '-', '_')) = toLower($module_name)
               OR toLower(replace(r.name, '_', '-')) = toLower($module_name)
            RETURN r.name as repo_name
            ORDER BY
                CASE
                    WHEN toLower(r.name) = toLower($module_name) THEN 1
                    WHEN toLower(replace(r.name, '-', '_')) = toLower($module_name) THEN 2
                    WHEN toLower(replace(r.name, '_', '-')) = toLower($module_name) THEN 3
                END
            LIMIT 5
            """

            result = await session.run(repo_query, module_name=module_name)
            repos_from_names = []
            async for record in result:
                repos_from_names.append(record['repo_name'])

            # Combine results, prioritizing module-based matches
            all_repos = repos_from_modules + [r for r in repos_from_names if r not in repos_from_modules]

            if not all_repos:
                return []

            # Get files from the best matching repository
            best_repo = all_repos[0]
            files_query = """
            MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)
            RETURN f.path, f.module_name
            LIMIT 50
            """

            result = await session.run(files_query, repo_name=best_repo)
            files = []
            async for record in result:
                # Un-aliased RETURN columns keep the literal expression as the
                # record key, hence 'f.path' here.
                files.append(record['f.path'])

            return files
704 |
    async def _get_module_contents(self, module_name: str) -> Tuple[List[str], List[str]]:
        """Get classes and functions available in a repository matching the module name.

        Resolves the repository with the same two-stage strategy as
        _find_modules (module-name match first, fuzzy repo-name fallback),
        then returns (class_names, function_names) defined anywhere in that
        repository. Returns ([], []) when no repository matches.
        """
        async with self.driver.session() as session:
            # First, try to find repository by module names in files
            module_query = """
            MATCH (r:Repository)-[:CONTAINS]->(f:File)
            WHERE f.module_name = $module_name
               OR f.module_name STARTS WITH $module_name + '.'
               OR split(f.module_name, '.')[0] = $module_name
            RETURN DISTINCT r.name as repo_name, count(f) as file_count
            ORDER BY file_count DESC
            LIMIT 1
            """

            result = await session.run(module_query, module_name=module_name)
            record = await result.single()

            if record:
                repo_name = record['repo_name']
            else:
                # Fallback to repository name matching
                repo_query = """
                MATCH (r:Repository)
                WHERE toLower(r.name) = toLower($module_name)
                   OR toLower(replace(r.name, '-', '_')) = toLower($module_name)
                   OR toLower(replace(r.name, '_', '-')) = toLower($module_name)
                RETURN r.name as repo_name
                ORDER BY
                    CASE
                        WHEN toLower(r.name) = toLower($module_name) THEN 1
                        WHEN toLower(replace(r.name, '-', '_')) = toLower($module_name) THEN 2
                        WHEN toLower(replace(r.name, '_', '-')) = toLower($module_name) THEN 3
                    END
                LIMIT 1
                """

                result = await session.run(repo_query, module_name=module_name)
                record = await result.single()

                if not record:
                    return [], []

                repo_name = record['repo_name']

            # Get classes from this repository
            class_query = """
            MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
            RETURN DISTINCT c.name as class_name
            """

            result = await session.run(class_query, repo_name=repo_name)
            classes = []
            async for record in result:
                classes.append(record['class_name'])

            # Get functions from this repository
            func_query = """
            MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(func:Function)
            RETURN DISTINCT func.name as function_name
            """

            result = await session.run(func_query, repo_name=repo_name)
            functions = []
            async for record in result:
                functions.append(record['function_name'])

            return classes, functions
772 |
    async def _find_repository_for_module(self, module_name: str) -> Optional[str]:
        """Find the repository name that matches a module name.

        Uses the module-name match first, then a looser repo-name fallback
        (this one also allows substring CONTAINS matches, unlike the other
        lookups). Results — including None misses — are cached in repo_cache,
        so a repository added to the graph later will not be seen until the
        validator is recreated.
        """
        if module_name in self.repo_cache:
            return self.repo_cache[module_name]

        async with self.driver.session() as session:
            # First, try to find repository by module names in files
            module_query = """
            MATCH (r:Repository)-[:CONTAINS]->(f:File)
            WHERE f.module_name = $module_name
               OR f.module_name STARTS WITH $module_name + '.'
               OR split(f.module_name, '.')[0] = $module_name
            RETURN DISTINCT r.name as repo_name, count(f) as file_count
            ORDER BY file_count DESC
            LIMIT 1
            """

            result = await session.run(module_query, module_name=module_name)
            record = await result.single()

            if record:
                repo_name = record['repo_name']
            else:
                # Fallback to repository name matching
                query = """
                MATCH (r:Repository)
                WHERE toLower(r.name) = toLower($module_name)
                   OR toLower(replace(r.name, '-', '_')) = toLower($module_name)
                   OR toLower(replace(r.name, '_', '-')) = toLower($module_name)
                   OR toLower(r.name) CONTAINS toLower($module_name)
                   OR toLower($module_name) CONTAINS toLower(replace(r.name, '-', '_'))
                RETURN r.name as repo_name
                ORDER BY
                    CASE
                        WHEN toLower(r.name) = toLower($module_name) THEN 1
                        WHEN toLower(replace(r.name, '-', '_')) = toLower($module_name) THEN 2
                        ELSE 3
                    END
                LIMIT 1
                """

                result = await session.run(query, module_name=module_name)
                record = await result.single()

                repo_name = record['repo_name'] if record else None

            # Negative results are cached too (value None).
            self.repo_cache[module_name] = repo_name
            return repo_name
821 |
    async def _find_class(self, class_name: str) -> Optional[Dict[str, Any]]:
        """Find class information in knowledge graph.

        Tries an exact match on Class.name / Class.full_name first. For dotted
        names ("pkg.mod.Agent") it then resolves the module part to a
        repository and searches for the bare class name inside it. Returns
        {'name', 'full_name'} or None.
        """
        async with self.driver.session() as session:
            # First try exact match
            query = """
            MATCH (c:Class)
            WHERE c.name = $class_name OR c.full_name = $class_name
            RETURN c.name as name, c.full_name as full_name
            LIMIT 1
            """

            result = await session.run(query, class_name=class_name)
            record = await result.single()

            if record:
                return {
                    'name': record['name'],
                    'full_name': record['full_name']
                }

            # If no exact match and class_name has dots, try repository-based search
            if '.' in class_name:
                parts = class_name.split('.')
                module_part = '.'.join(parts[:-1])  # e.g., "pydantic_ai"
                class_part = parts[-1]  # e.g., "Agent"

                # Find repository for the module
                repo_name = await self._find_repository_for_module(module_part)

                if repo_name:
                    # Search for class within this repository
                    repo_query = """
                    MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
                    WHERE c.name = $class_name
                    RETURN c.name as name, c.full_name as full_name
                    LIMIT 1
                    """

                    result = await session.run(repo_query, repo_name=repo_name, class_name=class_part)
                    record = await result.single()

                    if record:
                        return {
                            'name': record['name'],
                            'full_name': record['full_name']
                        }

            return None
870 |
871 | async def _find_method(self, class_name: str, method_name: str) -> Optional[Dict[str, Any]]:
872 | """Find method information for a class"""
873 | cache_key = f"{class_name}.{method_name}"
874 | if cache_key in self.method_cache:
875 | methods = self.method_cache[cache_key]
876 | return methods[0] if methods else None
877 |
878 | async with self.driver.session() as session:
879 | # First try exact match
880 | query = """
881 | MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
882 | WHERE (c.name = $class_name OR c.full_name = $class_name)
883 | AND m.name = $method_name
884 | RETURN m.name as name, m.params_list as params_list, m.params_detailed as params_detailed,
885 | m.return_type as return_type, m.args as args
886 | LIMIT 1
887 | """
888 |
889 | result = await session.run(query, class_name=class_name, method_name=method_name)
890 | record = await result.single()
891 |
892 | if record:
893 | # Use detailed params if available, fall back to simple params
894 | params_to_use = record['params_detailed'] or record['params_list'] or []
895 |
896 | method_info = {
897 | 'name': record['name'],
898 | 'params_list': params_to_use,
899 | 'return_type': record['return_type'],
900 | 'args': record['args'] or []
901 | }
902 | self.method_cache[cache_key] = [method_info]
903 | return method_info
904 |
905 | # If no exact match and class_name has dots, try repository-based search
906 | if '.' in class_name:
907 | parts = class_name.split('.')
908 | module_part = '.'.join(parts[:-1]) # e.g., "pydantic_ai"
909 | class_part = parts[-1] # e.g., "Agent"
910 |
911 | # Find repository for the module
912 | repo_name = await self._find_repository_for_module(module_part)
913 |
914 | if repo_name:
915 | # Search for method within this repository's classes
916 | repo_query = """
917 | MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_METHOD]->(m:Method)
918 | WHERE c.name = $class_name AND m.name = $method_name
919 | RETURN m.name as name, m.params_list as params_list, m.params_detailed as params_detailed,
920 | m.return_type as return_type, m.args as args
921 | LIMIT 1
922 | """
923 |
924 | result = await session.run(repo_query, repo_name=repo_name, class_name=class_part, method_name=method_name)
925 | record = await result.single()
926 |
927 | if record:
928 | # Use detailed params if available, fall back to simple params
929 | params_to_use = record['params_detailed'] or record['params_list'] or []
930 |
931 | method_info = {
932 | 'name': record['name'],
933 | 'params_list': params_to_use,
934 | 'return_type': record['return_type'],
935 | 'args': record['args'] or []
936 | }
937 | self.method_cache[cache_key] = [method_info]
938 | return method_info
939 |
940 | self.method_cache[cache_key] = []
941 | return None
942 |
943 | async def _find_attribute(self, class_name: str, attr_name: str) -> Optional[Dict[str, Any]]:
944 | """Find attribute information for a class"""
945 | async with self.driver.session() as session:
946 | # First try exact match
947 | query = """
948 | MATCH (c:Class)-[:HAS_ATTRIBUTE]->(a:Attribute)
949 | WHERE (c.name = $class_name OR c.full_name = $class_name)
950 | AND a.name = $attr_name
951 | RETURN a.name as name, a.type as type
952 | LIMIT 1
953 | """
954 |
955 | result = await session.run(query, class_name=class_name, attr_name=attr_name)
956 | record = await result.single()
957 |
958 | if record:
959 | return {
960 | 'name': record['name'],
961 | 'type': record['type']
962 | }
963 |
964 | # If no exact match and class_name has dots, try repository-based search
965 | if '.' in class_name:
966 | parts = class_name.split('.')
967 | module_part = '.'.join(parts[:-1]) # e.g., "pydantic_ai"
968 | class_part = parts[-1] # e.g., "Agent"
969 |
970 | # Find repository for the module
971 | repo_name = await self._find_repository_for_module(module_part)
972 |
973 | if repo_name:
974 | # Search for attribute within this repository's classes
975 | repo_query = """
976 | MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_ATTRIBUTE]->(a:Attribute)
977 | WHERE c.name = $class_name AND a.name = $attr_name
978 | RETURN a.name as name, a.type as type
979 | LIMIT 1
980 | """
981 |
982 | result = await session.run(repo_query, repo_name=repo_name, class_name=class_part, attr_name=attr_name)
983 | record = await result.single()
984 |
985 | if record:
986 | return {
987 | 'name': record['name'],
988 | 'type': record['type']
989 | }
990 |
991 | return None
992 |
993 | async def _find_function(self, func_name: str) -> Optional[Dict[str, Any]]:
994 | """Find function information"""
995 | async with self.driver.session() as session:
996 | # First try exact match
997 | query = """
998 | MATCH (f:Function)
999 | WHERE f.name = $func_name OR f.full_name = $func_name
1000 | RETURN f.name as name, f.params_list as params_list, f.params_detailed as params_detailed,
1001 | f.return_type as return_type, f.args as args
1002 | LIMIT 1
1003 | """
1004 |
1005 | result = await session.run(query, func_name=func_name)
1006 | record = await result.single()
1007 |
1008 | if record:
1009 | # Use detailed params if available, fall back to simple params
1010 | params_to_use = record['params_detailed'] or record['params_list'] or []
1011 |
1012 | return {
1013 | 'name': record['name'],
1014 | 'params_list': params_to_use,
1015 | 'return_type': record['return_type'],
1016 | 'args': record['args'] or []
1017 | }
1018 |
1019 | # If no exact match and func_name has dots, try repository-based search
1020 | if '.' in func_name:
1021 | parts = func_name.split('.')
1022 | module_part = '.'.join(parts[:-1]) # e.g., "pydantic_ai"
1023 | func_part = parts[-1] # e.g., "some_function"
1024 |
1025 | # Find repository for the module
1026 | repo_name = await self._find_repository_for_module(module_part)
1027 |
1028 | if repo_name:
1029 | # Search for function within this repository
1030 | repo_query = """
1031 | MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(func:Function)
1032 | WHERE func.name = $func_name
1033 | RETURN func.name as name, func.params_list as params_list, func.params_detailed as params_detailed,
1034 | func.return_type as return_type, func.args as args
1035 | LIMIT 1
1036 | """
1037 |
1038 | result = await session.run(repo_query, repo_name=repo_name, func_name=func_part)
1039 | record = await result.single()
1040 |
1041 | if record:
1042 | # Use detailed params if available, fall back to simple params
1043 | params_to_use = record['params_detailed'] or record['params_list'] or []
1044 |
1045 | return {
1046 | 'name': record['name'],
1047 | 'params_list': params_to_use,
1048 | 'return_type': record['return_type'],
1049 | 'args': record['args'] or []
1050 | }
1051 |
1052 | return None
1053 |
1054 | async def _find_pydantic_ai_result_method(self, method_name: str) -> Optional[Dict[str, Any]]:
1055 | """Find method information for pydantic_ai result objects"""
1056 | # Look for methods on pydantic_ai classes that could be result objects
1057 | async with self.driver.session() as session:
1058 | # Search for common result methods in pydantic_ai repository
1059 | query = """
1060 | MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_METHOD]->(m:Method)
1061 | WHERE m.name = $method_name
1062 | AND (c.name CONTAINS 'Result' OR c.name CONTAINS 'Stream' OR c.name CONTAINS 'Run')
1063 | RETURN m.name as name, m.params_list as params_list, m.params_detailed as params_detailed,
1064 | m.return_type as return_type, m.args as args, c.name as class_name
1065 | LIMIT 1
1066 | """
1067 |
1068 | result = await session.run(query, repo_name="pydantic_ai", method_name=method_name)
1069 | record = await result.single()
1070 |
1071 | if record:
1072 | # Use detailed params if available, fall back to simple params
1073 | params_to_use = record['params_detailed'] or record['params_list'] or []
1074 |
1075 | return {
1076 | 'name': record['name'],
1077 | 'params_list': params_to_use,
1078 | 'return_type': record['return_type'],
1079 | 'args': record['args'] or [],
1080 | 'source_class': record['class_name']
1081 | }
1082 |
1083 | return None
1084 |
1085 | async def _find_similar_modules(self, module_name: str) -> List[str]:
1086 | """Find similar repository names for suggestions"""
1087 | async with self.driver.session() as session:
1088 | query = """
1089 | MATCH (r:Repository)
1090 | WHERE toLower(r.name) CONTAINS toLower($partial_name)
1091 | OR toLower(replace(r.name, '-', '_')) CONTAINS toLower($partial_name)
1092 | OR toLower(replace(r.name, '_', '-')) CONTAINS toLower($partial_name)
1093 | RETURN r.name
1094 | LIMIT 5
1095 | """
1096 |
1097 | result = await session.run(query, partial_name=module_name[:3])
1098 | suggestions = []
1099 | async for record in result:
1100 | suggestions.append(record['name'])
1101 |
1102 | return suggestions
1103 |
1104 | async def _find_similar_methods(self, class_name: str, method_name: str) -> List[str]:
1105 | """Find similar method names for suggestions"""
1106 | async with self.driver.session() as session:
1107 | # First try exact class match
1108 | query = """
1109 | MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
1110 | WHERE (c.name = $class_name OR c.full_name = $class_name)
1111 | AND m.name CONTAINS $partial_name
1112 | RETURN m.name as name
1113 | LIMIT 5
1114 | """
1115 |
1116 | result = await session.run(query, class_name=class_name, partial_name=method_name[:3])
1117 | suggestions = []
1118 | async for record in result:
1119 | suggestions.append(record['name'])
1120 |
1121 | # If no suggestions and class_name has dots, try repository-based search
1122 | if not suggestions and '.' in class_name:
1123 | parts = class_name.split('.')
1124 | module_part = '.'.join(parts[:-1]) # e.g., "pydantic_ai"
1125 | class_part = parts[-1] # e.g., "Agent"
1126 |
1127 | # Find repository for the module
1128 | repo_name = await self._find_repository_for_module(module_part)
1129 |
1130 | if repo_name:
1131 | repo_query = """
1132 | MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_METHOD]->(m:Method)
1133 | WHERE c.name = $class_name AND m.name CONTAINS $partial_name
1134 | RETURN m.name as name
1135 | LIMIT 5
1136 | """
1137 |
1138 | result = await session.run(repo_query, repo_name=repo_name, class_name=class_part, partial_name=method_name[:3])
1139 | async for record in result:
1140 | suggestions.append(record['name'])
1141 |
1142 | return suggestions
1143 |
1144 | def _calculate_overall_confidence(self, result: ScriptValidationResult) -> float:
1145 | """Calculate overall confidence score for the validation (knowledge graph items only)"""
1146 | kg_validations = []
1147 |
1148 | # Only count validations from knowledge graph imports
1149 | for val in result.import_validations:
1150 | if val.validation.details.get('in_knowledge_graph', False):
1151 | kg_validations.append(val.validation.confidence)
1152 |
1153 | # Only count validations from knowledge graph classes
1154 | for val in result.class_validations:
1155 | class_name = val.class_instantiation.full_class_name or val.class_instantiation.class_name
1156 | if self._is_from_knowledge_graph(class_name):
1157 | kg_validations.append(val.validation.confidence)
1158 |
1159 | # Only count validations from knowledge graph methods
1160 | for val in result.method_validations:
1161 | if val.method_call.object_type and self._is_from_knowledge_graph(val.method_call.object_type):
1162 | kg_validations.append(val.validation.confidence)
1163 |
1164 | # Only count validations from knowledge graph attributes
1165 | for val in result.attribute_validations:
1166 | if val.attribute_access.object_type and self._is_from_knowledge_graph(val.attribute_access.object_type):
1167 | kg_validations.append(val.validation.confidence)
1168 |
1169 | # Only count validations from knowledge graph functions
1170 | for val in result.function_validations:
1171 | if val.function_call.full_name and self._is_from_knowledge_graph(val.function_call.full_name):
1172 | kg_validations.append(val.validation.confidence)
1173 |
1174 | if not kg_validations:
1175 | return 1.0 # No knowledge graph items to validate = perfect confidence
1176 |
1177 | return sum(kg_validations) / len(kg_validations)
1178 |
1179 | def _is_from_knowledge_graph(self, class_type: str) -> bool:
1180 | """Check if a class type comes from a module in the knowledge graph"""
1181 | if not class_type:
1182 | return False
1183 |
1184 | # For dotted names like "pydantic_ai.Agent" or "pydantic_ai.StreamedRunResult", check the base module
1185 | if '.' in class_type:
1186 | base_module = class_type.split('.')[0]
1187 | # Exact match only - "pydantic" should not match "pydantic_ai"
1188 | return base_module in self.knowledge_graph_modules
1189 |
1190 | # For simple names, check if any knowledge graph module matches exactly
1191 | # Don't use substring matching to avoid "pydantic" matching "pydantic_ai"
1192 | return class_type in self.knowledge_graph_modules
1193 |
1194 | def _detect_hallucinations(self, result: ScriptValidationResult) -> List[Dict[str, Any]]:
1195 | """Detect and categorize hallucinations"""
1196 | hallucinations = []
1197 | reported_items = set() # Track reported items to avoid duplicates
1198 |
1199 | # Check method calls (only for knowledge graph classes)
1200 | for val in result.method_validations:
1201 | if (val.validation.status == ValidationStatus.NOT_FOUND and
1202 | val.method_call.object_type and
1203 | self._is_from_knowledge_graph(val.method_call.object_type)):
1204 |
1205 | # Create unique key to avoid duplicates
1206 | key = (val.method_call.line_number, val.method_call.method_name, val.method_call.object_type)
1207 | if key not in reported_items:
1208 | reported_items.add(key)
1209 | hallucinations.append({
1210 | 'type': 'METHOD_NOT_FOUND',
1211 | 'location': f"line {val.method_call.line_number}",
1212 | 'description': f"Method '{val.method_call.method_name}' not found on class '{val.method_call.object_type}'",
1213 | 'suggestion': val.validation.suggestions[0] if val.validation.suggestions else None
1214 | })
1215 |
1216 | # Check attributes (only for knowledge graph classes) - but skip if already reported as method
1217 | for val in result.attribute_validations:
1218 | if (val.validation.status == ValidationStatus.NOT_FOUND and
1219 | val.attribute_access.object_type and
1220 | self._is_from_knowledge_graph(val.attribute_access.object_type)):
1221 |
1222 | # Create unique key - if this was already reported as a method, skip it
1223 | key = (val.attribute_access.line_number, val.attribute_access.attribute_name, val.attribute_access.object_type)
1224 | if key not in reported_items:
1225 | reported_items.add(key)
1226 | hallucinations.append({
1227 | 'type': 'ATTRIBUTE_NOT_FOUND',
1228 | 'location': f"line {val.attribute_access.line_number}",
1229 | 'description': f"Attribute '{val.attribute_access.attribute_name}' not found on class '{val.attribute_access.object_type}'"
1230 | })
1231 |
1232 | # Check parameter issues (only for knowledge graph methods)
1233 | for val in result.method_validations:
1234 | if (val.parameter_validation and
1235 | val.parameter_validation.status == ValidationStatus.INVALID and
1236 | val.method_call.object_type and
1237 | self._is_from_knowledge_graph(val.method_call.object_type)):
1238 | hallucinations.append({
1239 | 'type': 'INVALID_PARAMETERS',
1240 | 'location': f"line {val.method_call.line_number}",
1241 | 'description': f"Invalid parameters for method '{val.method_call.method_name}': {val.parameter_validation.message}"
1242 | })
1243 |
1244 | return hallucinations
```
--------------------------------------------------------------------------------
/src/crawl4ai_mcp.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | MCP server for web crawling with Crawl4AI.
3 |
4 | This server provides tools to crawl websites using Crawl4AI, automatically detecting
5 | the appropriate crawl method based on URL type (sitemap, txt file, or regular webpage).
6 | Also includes AI hallucination detection and repository parsing tools using Neo4j knowledge graphs.
7 | """
8 | from mcp.server.fastmcp import FastMCP, Context
9 | from sentence_transformers import CrossEncoder
10 | from contextlib import asynccontextmanager
11 | from collections.abc import AsyncIterator
12 | from dataclasses import dataclass
13 | from typing import List, Dict, Any, Optional
14 | from urllib.parse import urlparse, urldefrag
15 | from xml.etree import ElementTree
16 | from dotenv import load_dotenv
17 | from supabase import Client
18 | from pathlib import Path
19 | import requests
20 | import asyncio
21 | import json
22 | import os
23 | import re
24 | import concurrent.futures
25 | import sys
26 |
27 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
28 |
29 | # Add knowledge_graphs folder to path for importing knowledge graph modules
30 | knowledge_graphs_path = Path(__file__).resolve().parent.parent / 'knowledge_graphs'
31 | sys.path.append(str(knowledge_graphs_path))
32 |
33 | from utils import (
34 | get_supabase_client,
35 | add_documents_to_supabase,
36 | search_documents,
37 | extract_code_blocks,
38 | generate_code_example_summary,
39 | add_code_examples_to_supabase,
40 | update_source_info,
41 | extract_source_summary,
42 | search_code_examples
43 | )
44 |
45 | # Import knowledge graph modules
46 | from knowledge_graph_validator import KnowledgeGraphValidator
47 | from parse_repo_into_neo4j import DirectNeo4jExtractor
48 | from ai_script_analyzer import AIScriptAnalyzer
49 | from hallucination_reporter import HallucinationReporter
50 |
51 | # Load environment variables from the project root .env file
52 | project_root = Path(__file__).resolve().parent.parent
53 | dotenv_path = project_root / '.env'
54 |
55 | # Force override of existing environment variables
56 | load_dotenv(dotenv_path, override=True)
57 |
58 | # Helper functions for Neo4j validation and error handling
def validate_neo4j_connection() -> bool:
    """Return True when all Neo4j connection settings are present (and non-empty) in the environment."""
    required = ("NEO4J_URI", "NEO4J_USER", "NEO4J_PASSWORD")
    return all(os.getenv(var) for var in required)
66 |
def format_neo4j_error(error: Exception) -> str:
    """Map a raw Neo4j exception to a short, user-friendly message.

    Checks are ordered: authentication problems first, then connectivity,
    then database-level errors; anything else falls through verbatim.
    """
    text = str(error).lower()
    if "authentication" in text or "unauthorized" in text:
        return "Neo4j authentication failed. Check NEO4J_USER and NEO4J_PASSWORD."
    if any(token in text for token in ("connection", "refused", "timeout")):
        return "Cannot connect to Neo4j. Check NEO4J_URI and ensure Neo4j is running."
    if "database" in text:
        return "Neo4j database error. Check if the database exists and is accessible."
    return f"Neo4j error: {str(error)}"
78 |
def validate_script_path(script_path: str) -> Dict[str, Any]:
    """Check that *script_path* points to an existing, readable Python file.

    Returns {"valid": True} on success, otherwise {"valid": False, "error": ...}.
    """
    if not script_path or not isinstance(script_path, str):
        return {"valid": False, "error": "Script path is required"}
    if not os.path.exists(script_path):
        return {"valid": False, "error": f"Script not found: {script_path}"}
    if not script_path.endswith('.py'):
        return {"valid": False, "error": "Only Python (.py) files are supported"}

    # Probe readability up front so callers get a clear error instead of a
    # failure deep inside the analysis pipeline.
    try:
        with open(script_path, 'r', encoding='utf-8') as handle:
            handle.read(1)  # one character is enough to prove access
    except Exception as exc:
        return {"valid": False, "error": f"Cannot read script file: {str(exc)}"}
    return {"valid": True}
97 |
def validate_github_url(repo_url: str) -> Dict[str, Any]:
    """Validate a GitHub repository URL and derive the repository name.

    Returns {"valid": True, "repo_name": ...} on success, otherwise
    {"valid": False, "error": ...}.
    """
    if not repo_url or not isinstance(repo_url, str):
        return {"valid": False, "error": "Repository URL is required"}

    repo_url = repo_url.strip()

    # Basic GitHub URL validation
    if not ("github.com" in repo_url.lower() or repo_url.endswith(".git")):
        return {"valid": False, "error": "Please provide a valid GitHub repository URL"}

    # Check URL format
    if not (repo_url.startswith("https://") or repo_url.startswith("git@")):
        return {"valid": False, "error": "Repository URL must start with https:// or git@"}

    # Robustness fix: strip trailing slashes so "https://github.com/u/repo/"
    # yields "repo" instead of an empty repository name.
    repo_name = repo_url.rstrip('/').split('/')[-1].replace('.git', '')
    return {"valid": True, "repo_name": repo_name}
114 |
# Create a dataclass for our application context
@dataclass
class Crawl4AIContext:
    """Per-server context assembled once in the lifespan hook.

    Bundles the long-lived clients (crawler, Supabase) plus optional
    components that stay None when their feature flag or credentials are
    absent: the reranking model (USE_RERANKING) and the two Neo4j-backed
    components (USE_KNOWLEDGE_GRAPH + NEO4J_* variables).
    """
    crawler: AsyncWebCrawler  # entered via __aenter__ in the lifespan hook
    supabase_client: Client  # document / code-example storage backend
    reranking_model: Optional[CrossEncoder] = None  # loaded only when USE_RERANKING=true
    knowledge_validator: Optional[Any] = None  # KnowledgeGraphValidator when available
    repo_extractor: Optional[Any] = None  # DirectNeo4jExtractor when available
124 |
@asynccontextmanager
async def crawl4ai_lifespan(server: FastMCP) -> AsyncIterator[Crawl4AIContext]:
    """
    Manages the Crawl4AI client lifecycle.

    Builds every long-lived component once at server startup and tears them
    down on shutdown. Optional components (reranker, Neo4j validator and
    extractor) degrade gracefully: failures leave them as None and only
    disable the corresponding tools, never the whole server.

    Args:
        server: The FastMCP server instance

    Yields:
        Crawl4AIContext: The context containing the Crawl4AI crawler and Supabase client
    """
    # Create browser configuration
    browser_config = BrowserConfig(
        headless=True,
        verbose=False
    )

    # Initialize the crawler (entered manually; __aexit__ runs in the finally below)
    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.__aenter__()

    # Initialize Supabase client
    supabase_client = get_supabase_client()

    # Initialize cross-encoder model for reranking if enabled.
    # A failed model download/load is non-fatal: reranking is simply skipped.
    reranking_model = None
    if os.getenv("USE_RERANKING", "false") == "true":
        try:
            reranking_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
        except Exception as e:
            print(f"Failed to load reranking model: {e}")
            reranking_model = None

    # Initialize Neo4j components if configured and enabled
    knowledge_validator = None
    repo_extractor = None

    # Check if knowledge graph functionality is enabled
    knowledge_graph_enabled = os.getenv("USE_KNOWLEDGE_GRAPH", "false") == "true"

    if knowledge_graph_enabled:
        neo4j_uri = os.getenv("NEO4J_URI")
        neo4j_user = os.getenv("NEO4J_USER")
        neo4j_password = os.getenv("NEO4J_PASSWORD")

        if neo4j_uri and neo4j_user and neo4j_password:
            try:
                print("Initializing knowledge graph components...")

                # Initialize knowledge graph validator
                knowledge_validator = KnowledgeGraphValidator(neo4j_uri, neo4j_user, neo4j_password)
                await knowledge_validator.initialize()
                print("✓ Knowledge graph validator initialized")

                # Initialize repository extractor
                repo_extractor = DirectNeo4jExtractor(neo4j_uri, neo4j_user, neo4j_password)
                await repo_extractor.initialize()
                print("✓ Repository extractor initialized")

            except Exception as e:
                # Either component failing disables both knowledge-graph tools.
                print(f"Failed to initialize Neo4j components: {format_neo4j_error(e)}")
                knowledge_validator = None
                repo_extractor = None
        else:
            print("Neo4j credentials not configured - knowledge graph tools will be unavailable")
    else:
        print("Knowledge graph functionality disabled - set USE_KNOWLEDGE_GRAPH=true to enable")

    try:
        # Tools access this object via ctx.request_context.lifespan_context.
        yield Crawl4AIContext(
            crawler=crawler,
            supabase_client=supabase_client,
            reranking_model=reranking_model,
            knowledge_validator=knowledge_validator,
            repo_extractor=repo_extractor
        )
    finally:
        # Clean up all components. Each close is isolated so one failure
        # cannot prevent the others from shutting down.
        await crawler.__aexit__(None, None, None)
        if knowledge_validator:
            try:
                await knowledge_validator.close()
                print("✓ Knowledge graph validator closed")
            except Exception as e:
                print(f"Error closing knowledge validator: {e}")
        if repo_extractor:
            try:
                await repo_extractor.close()
                print("✓ Repository extractor closed")
            except Exception as e:
                print(f"Error closing repository extractor: {e}")
216 |
# Initialize FastMCP server.
# Host/port come from the environment so the container can be reconfigured
# without code changes. NOTE(review): PORT is passed as a string here --
# presumably FastMCP's settings layer coerces it to int; confirm.
mcp = FastMCP(
    "mcp-crawl4ai-rag",
    description="MCP server for RAG and web crawling with Crawl4AI",
    lifespan=crawl4ai_lifespan,
    host=os.getenv("HOST", "0.0.0.0"),
    port=os.getenv("PORT", "8051")
)
225 |
def rerank_results(model: CrossEncoder, query: str, results: List[Dict[str, Any]], content_key: str = "content") -> List[Dict[str, Any]]:
    """Rerank *results* by cross-encoder relevance to *query*.

    Each result dict gains a "rerank_score" float (set in place); the returned
    list is a new list sorted by that score, highest first. When no model or
    no results are given -- or scoring fails -- the input list is returned
    unchanged.

    Args:
        model: The cross-encoder model to use for reranking
        query: The search query
        results: List of search results
        content_key: The key in each result dict that contains the text content

    Returns:
        Reranked list of results
    """
    if not model or not results:
        return results

    try:
        # Score every (query, document) pair in a single batch.
        pairs = [[query, item.get(content_key, "")] for item in results]
        scores = model.predict(pairs)

        # Annotate each result in place, then sort descending by the new score.
        for item, score in zip(results, scores):
            item["rerank_score"] = float(score)
        return sorted(results, key=lambda item: item.get("rerank_score", 0), reverse=True)
    except Exception as exc:
        print(f"Error during reranking: {exc}")
        return results
263 |
def is_sitemap(url: str) -> bool:
    """
    Check if a URL is a sitemap.

    A URL counts as a sitemap when it ends in "sitemap.xml" or its path
    contains the word "sitemap" anywhere.

    Args:
        url: URL to check

    Returns:
        True if the URL is a sitemap, False otherwise
    """
    if url.endswith('sitemap.xml'):
        return True
    return 'sitemap' in urlparse(url).path
275 |
def is_txt(url: str) -> bool:
    """
    Check if a URL is a text file.

    The check is a simple (case-sensitive) ".txt" suffix test.

    Args:
        url: URL to check

    Returns:
        True if the URL is a text file, False otherwise
    """
    return url.endswith('.txt')
287 |
def parse_sitemap(sitemap_url: str) -> List[str]:
    """
    Parse a sitemap and extract URLs.

    Args:
        sitemap_url: URL of the sitemap

    Returns:
        List of URLs found in the sitemap; empty on download or parse failure.
    """
    # Fix: bound the request so a stalled server cannot hang the crawl, and
    # treat network failures as "no URLs" instead of propagating an exception.
    try:
        resp = requests.get(sitemap_url, timeout=30)
    except requests.RequestException as e:
        print(f"Error fetching sitemap: {e}")
        return []

    urls = []
    if resp.status_code == 200:
        try:
            tree = ElementTree.fromstring(resp.content)
            # '{*}loc' matches <loc> in any XML namespace, so both plain and
            # namespaced sitemaps are handled.
            urls = [loc.text for loc in tree.findall('.//{*}loc')]
        except Exception as e:
            print(f"Error parsing sitemap XML: {e}")

    return urls
309 |
def smart_chunk_markdown(text: str, chunk_size: int = 5000) -> List[str]:
    """Split *text* into chunks of at most *chunk_size* characters.

    Break points are chosen in priority order -- code-block fence (```),
    paragraph break, sentence end -- but only when the break falls past 30%
    of the chunk, so chunks do not degenerate into tiny fragments.

    Args:
        text: The markdown text to split
        chunk_size: Maximum characters per chunk

    Returns:
        Non-empty, whitespace-stripped chunks, in order.
    """
    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        # Calculate end position
        end = start + chunk_size

        # Final chunk: take the remainder. Fix: skip it when it strips to
        # nothing -- the loop body below already filtered empty chunks, but
        # the tail used to be appended unconditionally, so a whitespace-only
        # remainder produced a spurious "" chunk.
        if end >= text_length:
            tail = text[start:].strip()
            if tail:
                chunks.append(tail)
            break

        # Try to find a code block boundary first (```)
        chunk = text[start:end]
        code_block = chunk.rfind('```')
        if code_block != -1 and code_block > chunk_size * 0.3:
            end = start + code_block

        # If no code block, try to break at a paragraph
        elif '\n\n' in chunk:
            last_break = chunk.rfind('\n\n')
            if last_break > chunk_size * 0.3:  # Only break if we're past 30% of chunk_size
                end = start + last_break

        # If no paragraph break, try to break at a sentence
        elif '. ' in chunk:
            last_period = chunk.rfind('. ')
            if last_period > chunk_size * 0.3:  # Only break if we're past 30% of chunk_size
                end = start + last_period + 1

        # Extract chunk and clean it up
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # Move start position for next chunk
        start = end

    return chunks
354 |
def extract_section_info(chunk: str) -> Dict[str, Any]:
    """
    Extracts headers and stats from a chunk.

    Args:
        chunk: Markdown chunk

    Returns:
        Dictionary with "headers" (all markdown headings joined by "; "),
        "char_count" and "word_count".
    """
    # Each match is a (hashes, title) pair, e.g. ('##', 'Section name').
    heading_pairs = re.findall(r'^(#+)\s+(.+)$', chunk, re.MULTILINE)
    joined = '; '.join(f'{hashes} {title}' for hashes, title in heading_pairs)

    return {
        "headers": joined,
        "char_count": len(chunk),
        "word_count": len(chunk.split()),
    }
373 |
def process_code_example(args):
    """
    Generate a summary for one extracted code example.

    A top-level function (rather than a lambda/closure) so it can be handed
    to executor.map in the parallel summary-generation step.

    Args:
        args: Tuple containing (code, context_before, context_after)

    Returns:
        The generated summary
    """
    # Unpack straight into the summarizer, which takes the same three fields.
    return generate_code_example_summary(*args)
387 |
@mcp.tool()
async def crawl_single_page(ctx: Context, url: str) -> str:
    """
    Crawl a single web page and store its content in Supabase.

    This tool is ideal for quickly retrieving content from a specific URL without following links.
    The content is stored in Supabase for later retrieval and querying.

    Args:
        ctx: The MCP server provided context
        url: URL of the web page to crawl

    Returns:
        JSON summary of the crawling operation and storage in Supabase
    """
    try:
        # Get the long-lived clients created by the lifespan hook
        crawler = ctx.request_context.lifespan_context.crawler
        supabase_client = ctx.request_context.lifespan_context.supabase_client

        # Bypass the cache so repeated calls always fetch fresh content
        run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=False)

        # Crawl the page
        result = await crawler.arun(url=url, config=run_config)

        if result.success and result.markdown:
            # Use the domain (or path for non-HTTP schemes) as the source identifier
            parsed_url = urlparse(url)
            source_id = parsed_url.netloc or parsed_url.path

            # Chunk the content
            chunks = smart_chunk_markdown(result.markdown)

            # Prepare data for Supabase
            urls = []
            chunk_numbers = []
            contents = []
            metadatas = []
            total_word_count = 0

            # FIX: record an actual UTC timestamp. The previous code stored
            # str(asyncio.current_task().get_coro().__name__) -- the running
            # coroutine's name -- which is not a crawl time.
            from datetime import datetime, timezone
            crawl_time = datetime.now(timezone.utc).isoformat()

            for i, chunk in enumerate(chunks):
                urls.append(url)
                chunk_numbers.append(i)
                contents.append(chunk)

                # Extract metadata
                meta = extract_section_info(chunk)
                meta["chunk_index"] = i
                meta["url"] = url
                meta["source"] = source_id
                meta["crawl_time"] = crawl_time
                metadatas.append(meta)

                # Accumulate word count
                total_word_count += meta.get("word_count", 0)

            # Create url_to_full_document mapping
            url_to_full_document = {url: result.markdown}

            # Update source information FIRST (before inserting documents)
            source_summary = extract_source_summary(source_id, result.markdown[:5000])  # Use first 5000 chars for summary
            update_source_info(supabase_client, source_id, source_summary, total_word_count)

            # Add documentation chunks to Supabase (AFTER source exists)
            add_documents_to_supabase(supabase_client, urls, chunk_numbers, contents, metadatas, url_to_full_document)

            # FIX: initialize code_blocks unconditionally. It is referenced in
            # the success payload below, but was previously only assigned when
            # USE_AGENTIC_RAG was enabled, causing a NameError (and a failure
            # response) on every crawl with the default configuration.
            code_blocks = []

            # Extract and process code examples only if enabled
            extract_code_examples = os.getenv("USE_AGENTIC_RAG", "false") == "true"
            if extract_code_examples:
                code_blocks = extract_code_blocks(result.markdown)
                if code_blocks:
                    code_urls = []
                    code_chunk_numbers = []
                    code_examples = []
                    code_summaries = []
                    code_metadatas = []

                    # Process code examples in parallel
                    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
                        # Prepare arguments for parallel processing
                        summary_args = [(block['code'], block['context_before'], block['context_after'])
                                        for block in code_blocks]

                        # Generate summaries in parallel
                        summaries = list(executor.map(process_code_example, summary_args))

                    # Prepare code example data (summaries is fully materialized,
                    # so this can safely run after the executor has shut down)
                    for i, (block, summary) in enumerate(zip(code_blocks, summaries)):
                        code_urls.append(url)
                        code_chunk_numbers.append(i)
                        code_examples.append(block['code'])
                        code_summaries.append(summary)

                        # Create metadata for code example
                        code_meta = {
                            "chunk_index": i,
                            "url": url,
                            "source": source_id,
                            "char_count": len(block['code']),
                            "word_count": len(block['code'].split())
                        }
                        code_metadatas.append(code_meta)

                    # Add code examples to Supabase
                    add_code_examples_to_supabase(
                        supabase_client,
                        code_urls,
                        code_chunk_numbers,
                        code_examples,
                        code_summaries,
                        code_metadatas
                    )

            return json.dumps({
                "success": True,
                "url": url,
                "chunks_stored": len(chunks),
                "code_examples_stored": len(code_blocks) if code_blocks else 0,
                "content_length": len(result.markdown),
                "total_word_count": total_word_count,
                "source_id": source_id,
                "links_count": {
                    "internal": len(result.links.get("internal", [])),
                    "external": len(result.links.get("external", []))
                }
            }, indent=2)
        else:
            return json.dumps({
                "success": False,
                "url": url,
                "error": result.error_message
            }, indent=2)
    except Exception as e:
        return json.dumps({
            "success": False,
            "url": url,
            "error": str(e)
        }, indent=2)
@mcp.tool()
async def smart_crawl_url(ctx: Context, url: str, max_depth: int = 3, max_concurrent: int = 10, chunk_size: int = 5000) -> str:
    """
    Intelligently crawl a URL based on its type and store content in Supabase.

    This tool automatically detects the URL type and applies the appropriate crawling method:
    - For sitemaps: Extracts and crawls all URLs in parallel
    - For text files (llms.txt): Directly retrieves the content
    - For regular webpages: Recursively crawls internal links up to the specified depth

    All crawled content is chunked and stored in Supabase for later retrieval and querying.

    Args:
        ctx: The MCP server provided context
        url: URL to crawl (can be a regular webpage, sitemap.xml, or .txt file)
        max_depth: Maximum recursion depth for regular URLs (default: 3)
        max_concurrent: Maximum number of concurrent browser sessions (default: 10)
        chunk_size: Maximum size of each content chunk in characters (default: 5000)

    Returns:
        JSON string with crawl summary and storage information
    """
    try:
        # Get the crawler and Supabase client from the lifespan context
        crawler = ctx.request_context.lifespan_context.crawler
        supabase_client = ctx.request_context.lifespan_context.supabase_client

        # Determine the crawl strategy based on the URL type
        crawl_results = []
        crawl_type = None

        if is_txt(url):
            # For text files, use simple crawl
            crawl_results = await crawl_markdown_file(crawler, url)
            crawl_type = "text_file"
        elif is_sitemap(url):
            # For sitemaps, extract URLs and crawl in parallel
            sitemap_urls = parse_sitemap(url)
            if not sitemap_urls:
                return json.dumps({
                    "success": False,
                    "url": url,
                    "error": "No URLs found in sitemap"
                }, indent=2)
            crawl_results = await crawl_batch(crawler, sitemap_urls, max_concurrent=max_concurrent)
            crawl_type = "sitemap"
        else:
            # For regular URLs, use recursive crawl of internal links
            crawl_results = await crawl_recursive_internal_links(crawler, [url], max_depth=max_depth, max_concurrent=max_concurrent)
            crawl_type = "webpage"

        if not crawl_results:
            return json.dumps({
                "success": False,
                "url": url,
                "error": "No content found"
            }, indent=2)

        # Process results and store in Supabase
        urls = []
        chunk_numbers = []
        contents = []
        metadatas = []
        chunk_count = 0

        # Track sources and their content
        source_content_map = {}
        source_word_counts = {}

        # Process documentation chunks
        for doc in crawl_results:
            source_url = doc['url']
            md = doc['markdown']
            chunks = smart_chunk_markdown(md, chunk_size=chunk_size)

            # Extract source_id (domain, or path for local/relative URLs)
            parsed_url = urlparse(source_url)
            source_id = parsed_url.netloc or parsed_url.path

            # Store content for source summary generation
            if source_id not in source_content_map:
                source_content_map[source_id] = md[:5000]  # Store first 5000 chars
                source_word_counts[source_id] = 0

            for i, chunk in enumerate(chunks):
                urls.append(source_url)
                chunk_numbers.append(i)
                contents.append(chunk)

                # Extract metadata
                meta = extract_section_info(chunk)
                meta["chunk_index"] = i
                meta["url"] = source_url
                meta["source"] = source_id
                meta["crawl_type"] = crawl_type
                # NOTE(review): this stores the current task's coroutine *name*,
                # not a timestamp — looks unintentional for a "crawl_time" field; confirm.
                meta["crawl_time"] = str(asyncio.current_task().get_coro().__name__)
                metadatas.append(meta)

                # Accumulate word count per source
                source_word_counts[source_id] += meta.get("word_count", 0)

                chunk_count += 1

        # Create url_to_full_document mapping (used for contextual embeddings)
        url_to_full_document = {}
        for doc in crawl_results:
            url_to_full_document[doc['url']] = doc['markdown']

        # Update source information for each unique source FIRST (before inserting documents)
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            source_summary_args = [(source_id, content) for source_id, content in source_content_map.items()]
            source_summaries = list(executor.map(lambda args: extract_source_summary(args[0], args[1]), source_summary_args))

        for (source_id, _), summary in zip(source_summary_args, source_summaries):
            word_count = source_word_counts.get(source_id, 0)
            update_source_info(supabase_client, source_id, summary, word_count)

        # Add documentation chunks to Supabase (AFTER sources exist)
        batch_size = 20
        add_documents_to_supabase(supabase_client, urls, chunk_numbers, contents, metadatas, url_to_full_document, batch_size=batch_size)

        # Extract and process code examples from all documents only if enabled.
        # code_examples is initialized unconditionally: the success summary below
        # reads len(code_examples), which previously raised a NameError (and
        # therefore reported a spurious failure) whenever USE_AGENTIC_RAG was off.
        code_examples = []
        extract_code_examples_enabled = os.getenv("USE_AGENTIC_RAG", "false") == "true"
        if extract_code_examples_enabled:
            code_urls = []
            code_chunk_numbers = []
            code_summaries = []
            code_metadatas = []

            # Extract code blocks from all documents
            for doc in crawl_results:
                source_url = doc['url']
                md = doc['markdown']
                code_blocks = extract_code_blocks(md)

                if code_blocks:
                    # Process code examples in parallel
                    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
                        # Prepare arguments for parallel processing
                        summary_args = [(block['code'], block['context_before'], block['context_after'])
                                        for block in code_blocks]

                        # Generate summaries in parallel
                        summaries = list(executor.map(process_code_example, summary_args))

                    # Prepare code example data
                    parsed_url = urlparse(source_url)
                    source_id = parsed_url.netloc or parsed_url.path

                    for i, (block, summary) in enumerate(zip(code_blocks, summaries)):
                        code_urls.append(source_url)
                        code_chunk_numbers.append(len(code_examples))  # Use global code example index
                        code_examples.append(block['code'])
                        code_summaries.append(summary)

                        # Create metadata for code example
                        code_meta = {
                            "chunk_index": len(code_examples) - 1,
                            "url": source_url,
                            "source": source_id,
                            "char_count": len(block['code']),
                            "word_count": len(block['code'].split())
                        }
                        code_metadatas.append(code_meta)

            # Add all code examples to Supabase
            if code_examples:
                add_code_examples_to_supabase(
                    supabase_client,
                    code_urls,
                    code_chunk_numbers,
                    code_examples,
                    code_summaries,
                    code_metadatas,
                    batch_size=batch_size
                )

        return json.dumps({
            "success": True,
            "url": url,
            "crawl_type": crawl_type,
            "pages_crawled": len(crawl_results),
            "chunks_stored": chunk_count,
            "code_examples_stored": len(code_examples),
            "sources_updated": len(source_content_map),
            "urls_crawled": [doc['url'] for doc in crawl_results][:5] + (["..."] if len(crawl_results) > 5 else [])
        }, indent=2)
    except Exception as e:
        return json.dumps({
            "success": False,
            "url": url,
            "error": str(e)
        }, indent=2)
723 |
@mcp.tool()
async def get_available_sources(ctx: Context) -> str:
    """
    Get all available sources from the sources table.

    This tool returns a list of all unique sources (domains) that have been crawled and stored
    in the database, along with their summaries and statistics. This is useful for discovering
    what content is available for querying.

    Always use this tool before calling the RAG query or code example query tool
    with a specific source filter!

    Args:
        ctx: The MCP server provided context

    Returns:
        JSON string with the list of available sources and their details
    """
    try:
        # Supabase client lives on the lifespan context
        client = ctx.request_context.lifespan_context.supabase_client

        # Fetch every row from the sources table, ordered for stable output
        response = (
            client.from_('sources')
            .select('*')
            .order('source_id')
            .execute()
        )

        # Project each row onto the public shape returned to the caller
        rows = response.data or []
        sources = [
            {
                "source_id": row.get("source_id"),
                "summary": row.get("summary"),
                "total_words": row.get("total_words"),
                "created_at": row.get("created_at"),
                "updated_at": row.get("updated_at")
            }
            for row in rows
        ]

        return json.dumps({
            "success": True,
            "sources": sources,
            "count": len(sources)
        }, indent=2)
    except Exception as e:
        return json.dumps({
            "success": False,
            "error": str(e)
        }, indent=2)
774 |
@mcp.tool()
async def perform_rag_query(ctx: Context, query: str, source: str = None, match_count: int = 5) -> str:
    """
    Perform a RAG (Retrieval Augmented Generation) query on the stored content.

    This tool searches the vector database for content relevant to the query and returns
    the matching documents. Optionally filter by source domain.
    Get the source by using the get_available_sources tool before calling this search!

    Args:
        ctx: The MCP server provided context
        query: The search query
        source: Optional source domain to filter results (e.g., 'example.com')
        match_count: Maximum number of results to return (default: 5)

    Returns:
        JSON string with the search results
    """
    try:
        # Get the Supabase client from the context
        supabase_client = ctx.request_context.lifespan_context.supabase_client

        # Check if hybrid search is enabled
        use_hybrid_search = os.getenv("USE_HYBRID_SEARCH", "false") == "true"

        # Prepare filter if source is provided and not empty
        filter_metadata = None
        if source and source.strip():
            filter_metadata = {"source": source}

        if use_hybrid_search:
            # Hybrid search: combine vector and keyword search

            # 1. Get vector search results (get more to account for filtering)
            vector_results = search_documents(
                client=supabase_client,
                query=query,
                match_count=match_count * 2,  # Get double to have room for filtering
                filter_metadata=filter_metadata
            )

            # 2. Get keyword search results using ILIKE
            keyword_query = supabase_client.from_('crawled_pages')\
                .select('id, url, chunk_number, content, metadata, source_id')\
                .ilike('content', f'%{query}%')

            # Apply source filter if provided
            if source and source.strip():
                keyword_query = keyword_query.eq('source_id', source)

            # Execute keyword search
            keyword_response = keyword_query.limit(match_count * 2).execute()
            keyword_results = keyword_response.data if keyword_response.data else []

            # 3. Combine results with preference for items appearing in both.
            # Index vector results by id (first occurrence wins, matching the
            # original first-match semantics) so each keyword row is matched in
            # O(1) instead of rescanning vector_results per row.
            vector_by_id = {}
            for vr in vector_results:
                vr_id = vr.get('id')
                if vr_id and vr_id not in vector_by_id:
                    vector_by_id[vr_id] = vr

            seen_ids = set()
            combined_results = []

            # First, add items that appear in both searches (these are the best matches)
            for kr in keyword_results:
                vr = vector_by_id.get(kr['id'])
                if vr is not None and kr['id'] not in seen_ids:
                    # Boost similarity score for items in both results
                    vr['similarity'] = min(1.0, vr.get('similarity', 0) * 1.2)
                    combined_results.append(vr)
                    seen_ids.add(kr['id'])

            # Then add remaining vector results (semantic matches without exact keyword)
            for vr in vector_results:
                if vr.get('id') and vr['id'] not in seen_ids and len(combined_results) < match_count:
                    combined_results.append(vr)
                    seen_ids.add(vr['id'])

            # Finally, add pure keyword matches if we still need more results
            for kr in keyword_results:
                if kr['id'] not in seen_ids and len(combined_results) < match_count:
                    # Convert keyword result to match vector result format
                    combined_results.append({
                        'id': kr['id'],
                        'url': kr['url'],
                        'chunk_number': kr['chunk_number'],
                        'content': kr['content'],
                        'metadata': kr['metadata'],
                        'source_id': kr['source_id'],
                        'similarity': 0.5  # Default similarity for keyword-only matches
                    })
                    seen_ids.add(kr['id'])

            # Use combined results
            results = combined_results[:match_count]

        else:
            # Standard vector search only
            results = search_documents(
                client=supabase_client,
                query=query,
                match_count=match_count,
                filter_metadata=filter_metadata
            )

        # Apply reranking if enabled
        use_reranking = os.getenv("USE_RERANKING", "false") == "true"
        if use_reranking and ctx.request_context.lifespan_context.reranking_model:
            results = rerank_results(ctx.request_context.lifespan_context.reranking_model, query, results, content_key="content")

        # Format the results
        formatted_results = []
        for result in results:
            formatted_result = {
                "url": result.get("url"),
                "content": result.get("content"),
                "metadata": result.get("metadata"),
                "similarity": result.get("similarity")
            }
            # Include rerank score if available
            if "rerank_score" in result:
                formatted_result["rerank_score"] = result["rerank_score"]
            formatted_results.append(formatted_result)

        return json.dumps({
            "success": True,
            "query": query,
            "source_filter": source,
            "search_mode": "hybrid" if use_hybrid_search else "vector",
            "reranking_applied": use_reranking and ctx.request_context.lifespan_context.reranking_model is not None,
            "results": formatted_results,
            "count": len(formatted_results)
        }, indent=2)
    except Exception as e:
        return json.dumps({
            "success": False,
            "query": query,
            "error": str(e)
        }, indent=2)
913 |
@mcp.tool()
async def search_code_examples(ctx: Context, query: str, source_id: str = None, match_count: int = 5) -> str:
    """
    Search for code examples relevant to the query.

    This tool searches the vector database for code examples relevant to the query and returns
    the matching examples with their summaries. Optionally filter by source_id.
    Get the source_id by using the get_available_sources tool before calling this search!

    Use the get_available_sources tool first to see what sources are available for filtering.

    Args:
        ctx: The MCP server provided context
        query: The search query
        source_id: Optional source ID to filter results (e.g., 'example.com')
        match_count: Maximum number of results to return (default: 5)

    Returns:
        JSON string with the search results
    """
    # Check if code example extraction is enabled
    extract_code_examples_enabled = os.getenv("USE_AGENTIC_RAG", "false") == "true"
    if not extract_code_examples_enabled:
        return json.dumps({
            "success": False,
            "error": "Code example extraction is disabled. Perform a normal RAG search."
        }, indent=2)

    try:
        # Import the vector search implementation once for both search modes
        # (previously duplicated in each branch). The alias avoids shadowing
        # this tool function's own name.
        from utils import search_code_examples as search_code_examples_impl

        # Get the Supabase client from the context
        supabase_client = ctx.request_context.lifespan_context.supabase_client

        # Check if hybrid search is enabled
        use_hybrid_search = os.getenv("USE_HYBRID_SEARCH", "false") == "true"

        # Prepare filter if source is provided and not empty
        filter_metadata = None
        if source_id and source_id.strip():
            filter_metadata = {"source": source_id}

        if use_hybrid_search:
            # Hybrid search: combine vector and keyword search

            # 1. Get vector search results (get more to account for filtering)
            vector_results = search_code_examples_impl(
                client=supabase_client,
                query=query,
                match_count=match_count * 2,  # Get double to have room for filtering
                filter_metadata=filter_metadata
            )

            # 2. Get keyword search results using ILIKE on both content and summary
            keyword_query = supabase_client.from_('code_examples')\
                .select('id, url, chunk_number, content, summary, metadata, source_id')\
                .or_(f'content.ilike.%{query}%,summary.ilike.%{query}%')

            # Apply source filter if provided
            if source_id and source_id.strip():
                keyword_query = keyword_query.eq('source_id', source_id)

            # Execute keyword search
            keyword_response = keyword_query.limit(match_count * 2).execute()
            keyword_results = keyword_response.data if keyword_response.data else []

            # 3. Combine results with preference for items appearing in both.
            # Index vector results by id (first occurrence wins, matching the
            # original first-match semantics) so each keyword row is matched in
            # O(1) instead of rescanning vector_results per row.
            vector_by_id = {}
            for vr in vector_results:
                vr_id = vr.get('id')
                if vr_id and vr_id not in vector_by_id:
                    vector_by_id[vr_id] = vr

            seen_ids = set()
            combined_results = []

            # First, add items that appear in both searches (these are the best matches)
            for kr in keyword_results:
                vr = vector_by_id.get(kr['id'])
                if vr is not None and kr['id'] not in seen_ids:
                    # Boost similarity score for items in both results
                    vr['similarity'] = min(1.0, vr.get('similarity', 0) * 1.2)
                    combined_results.append(vr)
                    seen_ids.add(kr['id'])

            # Then add remaining vector results (semantic matches without exact keyword)
            for vr in vector_results:
                if vr.get('id') and vr['id'] not in seen_ids and len(combined_results) < match_count:
                    combined_results.append(vr)
                    seen_ids.add(vr['id'])

            # Finally, add pure keyword matches if we still need more results
            for kr in keyword_results:
                if kr['id'] not in seen_ids and len(combined_results) < match_count:
                    # Convert keyword result to match vector result format
                    combined_results.append({
                        'id': kr['id'],
                        'url': kr['url'],
                        'chunk_number': kr['chunk_number'],
                        'content': kr['content'],
                        'summary': kr['summary'],
                        'metadata': kr['metadata'],
                        'source_id': kr['source_id'],
                        'similarity': 0.5  # Default similarity for keyword-only matches
                    })
                    seen_ids.add(kr['id'])

            # Use combined results
            results = combined_results[:match_count]

        else:
            # Standard vector search only
            results = search_code_examples_impl(
                client=supabase_client,
                query=query,
                match_count=match_count,
                filter_metadata=filter_metadata
            )

        # Apply reranking if enabled
        use_reranking = os.getenv("USE_RERANKING", "false") == "true"
        if use_reranking and ctx.request_context.lifespan_context.reranking_model:
            results = rerank_results(ctx.request_context.lifespan_context.reranking_model, query, results, content_key="content")

        # Format the results
        formatted_results = []
        for result in results:
            formatted_result = {
                "url": result.get("url"),
                "code": result.get("content"),
                "summary": result.get("summary"),
                "metadata": result.get("metadata"),
                "source_id": result.get("source_id"),
                "similarity": result.get("similarity")
            }
            # Include rerank score if available
            if "rerank_score" in result:
                formatted_result["rerank_score"] = result["rerank_score"]
            formatted_results.append(formatted_result)

        return json.dumps({
            "success": True,
            "query": query,
            "source_filter": source_id,
            "search_mode": "hybrid" if use_hybrid_search else "vector",
            "reranking_applied": use_reranking and ctx.request_context.lifespan_context.reranking_model is not None,
            "results": formatted_results,
            "count": len(formatted_results)
        }, indent=2)
    except Exception as e:
        return json.dumps({
            "success": False,
            "query": query,
            "error": str(e)
        }, indent=2)
1070 |
@mcp.tool()
async def check_ai_script_hallucinations(ctx: Context, script_path: str) -> str:
    """
    Check an AI-generated Python script for hallucinations using the knowledge graph.

    This tool analyzes a Python script for potential AI hallucinations by validating
    imports, method calls, class instantiations, and function calls against a Neo4j
    knowledge graph containing real repository data.

    The tool performs comprehensive analysis including:
    - Import validation against known repositories
    - Method call validation on classes from the knowledge graph
    - Class instantiation parameter validation
    - Function call parameter validation
    - Attribute access validation

    Args:
        ctx: The MCP server provided context
        script_path: Absolute path to the Python script to analyze

    Returns:
        JSON string with hallucination detection results, confidence scores, and recommendations
    """
    try:
        # Guard: the knowledge-graph feature must be switched on via environment.
        if os.getenv("USE_KNOWLEDGE_GRAPH", "false") != "true":
            return json.dumps({
                "success": False,
                "error": "Knowledge graph functionality is disabled. Set USE_KNOWLEDGE_GRAPH=true in environment."
            }, indent=2)

        # Guard: the validator is created at startup and may be absent if Neo4j
        # was not configured.
        validator = ctx.request_context.lifespan_context.knowledge_validator
        if not validator:
            return json.dumps({
                "success": False,
                "error": "Knowledge graph validator not available. Check Neo4j configuration in environment variables."
            }, indent=2)

        # Guard: reject bad script paths before doing any analysis work.
        path_check = validate_script_path(script_path)
        if not path_check["valid"]:
            return json.dumps({
                "success": False,
                "script_path": script_path,
                "error": path_check["error"]
            }, indent=2)

        # Step 1: parse the script's AST to collect imports, calls and accesses.
        analysis = AIScriptAnalyzer().analyze_script(script_path)
        if analysis.errors:
            print(f"Analysis warnings for {script_path}: {analysis.errors}")

        # Step 2: validate the collected usage against the knowledge graph.
        outcome = await validator.validate_script(analysis)

        # Step 3: turn the raw validation outcome into a structured report.
        report = HallucinationReporter().generate_comprehensive_report(outcome)

        # Copy only the documented keys out of the report sections, preserving order.
        summary_keys = (
            "total_validations", "valid_count", "invalid_count",
            "uncertain_count", "not_found_count", "hallucination_rate",
        )
        metadata_keys = (
            "total_imports", "total_classes", "total_methods",
            "total_attributes", "total_functions",
        )

        return json.dumps({
            "success": True,
            "script_path": script_path,
            "overall_confidence": outcome.overall_confidence,
            "validation_summary": {k: report["validation_summary"][k] for k in summary_keys},
            "hallucinations_detected": report["hallucinations_detected"],
            "recommendations": report["recommendations"],
            "analysis_metadata": {k: report["analysis_metadata"][k] for k in metadata_keys},
            "libraries_analyzed": report.get("libraries_analyzed", [])
        }, indent=2)

    except Exception as e:
        return json.dumps({
            "success": False,
            "script_path": script_path,
            "error": f"Analysis failed: {str(e)}"
        }, indent=2)
1166 |
@mcp.tool()
async def query_knowledge_graph(ctx: Context, command: str) -> str:
    """
    Query and explore the Neo4j knowledge graph containing repository data.

    This tool provides comprehensive access to the knowledge graph for exploring repositories,
    classes, methods, functions, and their relationships. Perfect for understanding what data
    is available for hallucination detection and debugging validation results.

    **⚠️ IMPORTANT: Always start with the `repos` command first!**
    Before using any other commands, run `repos` to see what repositories are available
    in your knowledge graph. This will help you understand what data you can explore.

    ## Available Commands:

    **Repository Commands:**
    - `repos` - **START HERE!** List all repositories in the knowledge graph
    - `explore <repo_name>` - Get detailed overview of a specific repository

    **Class Commands:**
    - `classes` - List all classes across all repositories (limited to 20)
    - `classes <repo_name>` - List classes in a specific repository
    - `class <class_name>` - Get detailed information about a specific class including methods and attributes

    **Method Commands:**
    - `method <method_name>` - Search for methods by name across all classes
    - `method <method_name> <class_name>` - Search for a method within a specific class

    **Custom Query:**
    - `query <cypher_query>` - Execute a custom Cypher query (results limited to 20 records)

    ## Knowledge Graph Schema:

    **Node Types:**
    - Repository: `(r:Repository {name: string})`
    - File: `(f:File {path: string, module_name: string})`
    - Class: `(c:Class {name: string, full_name: string})`
    - Method: `(m:Method {name: string, params_list: [string], params_detailed: [string], return_type: string, args: [string]})`
    - Function: `(func:Function {name: string, params_list: [string], params_detailed: [string], return_type: string, args: [string]})`
    - Attribute: `(a:Attribute {name: string, type: string})`

    **Relationships:**
    - `(r:Repository)-[:CONTAINS]->(f:File)`
    - `(f:File)-[:DEFINES]->(c:Class)`
    - `(c:Class)-[:HAS_METHOD]->(m:Method)`
    - `(c:Class)-[:HAS_ATTRIBUTE]->(a:Attribute)`
    - `(f:File)-[:DEFINES]->(func:Function)`

    ## Example Workflow:
    ```
    1. repos                                    # See what repositories are available
    2. explore pydantic-ai                      # Explore a specific repository
    3. classes pydantic-ai                      # List classes in that repository
    4. class Agent                              # Explore the Agent class
    5. method run_stream                        # Search for run_stream method
    6. method __init__ Agent                    # Find Agent constructor
    7. query "MATCH (c:Class)-[:HAS_METHOD]->(m:Method) WHERE m.name = 'run' RETURN c.name, m.name LIMIT 5"
    ```

    Args:
        ctx: The MCP server provided context
        command: Command string to execute (see available commands above)

    Returns:
        JSON string with query results, statistics, and metadata
    """
    try:
        # Check if knowledge graph functionality is enabled
        knowledge_graph_enabled = os.getenv("USE_KNOWLEDGE_GRAPH", "false") == "true"
        if not knowledge_graph_enabled:
            return json.dumps({
                "success": False,
                "error": "Knowledge graph functionality is disabled. Set USE_KNOWLEDGE_GRAPH=true in environment."
            }, indent=2)

        # Get Neo4j driver from context (set up at startup; absent when Neo4j
        # configuration is missing)
        repo_extractor = ctx.request_context.lifespan_context.repo_extractor
        if not repo_extractor or not repo_extractor.driver:
            return json.dumps({
                "success": False,
                "error": "Neo4j connection not available. Check Neo4j configuration in environment variables."
            }, indent=2)

        # Parse command
        command = command.strip()
        if not command:
            return json.dumps({
                "success": False,
                "command": "",
                "error": "Command cannot be empty. Available commands: repos, explore <repo>, classes [repo], class <name>, method <name> [class], query <cypher>"
            }, indent=2)

        # First whitespace-separated token is the command verb; the rest are its arguments
        parts = command.split()
        cmd = parts[0].lower()
        args = parts[1:] if len(parts) > 1 else []

        # One session is shared by whichever handler the command routes to
        async with repo_extractor.driver.session() as session:
            # Route to appropriate handler
            if cmd == "repos":
                return await _handle_repos_command(session, command)
            elif cmd == "explore":
                if not args:
                    return json.dumps({
                        "success": False,
                        "command": command,
                        "error": "Repository name required. Usage: explore <repo_name>"
                    }, indent=2)
                return await _handle_explore_command(session, command, args[0])
            elif cmd == "classes":
                # Repository argument is optional for 'classes'
                repo_name = args[0] if args else None
                return await _handle_classes_command(session, command, repo_name)
            elif cmd == "class":
                if not args:
                    return json.dumps({
                        "success": False,
                        "command": command,
                        "error": "Class name required. Usage: class <class_name>"
                    }, indent=2)
                return await _handle_class_command(session, command, args[0])
            elif cmd == "method":
                if not args:
                    return json.dumps({
                        "success": False,
                        "command": command,
                        "error": "Method name required. Usage: method <method_name> [class_name]"
                    }, indent=2)
                method_name = args[0]
                # Optional second argument narrows the search to one class
                class_name = args[1] if len(args) > 1 else None
                return await _handle_method_command(session, command, method_name, class_name)
            elif cmd == "query":
                if not args:
                    return json.dumps({
                        "success": False,
                        "command": command,
                        "error": "Cypher query required. Usage: query <cypher_query>"
                    }, indent=2)
                # NOTE: args came from str.split(), so runs of whitespace inside the
                # original Cypher text are collapsed to single spaces here.
                cypher_query = " ".join(args)
                return await _handle_query_command(session, command, cypher_query)
            else:
                return json.dumps({
                    "success": False,
                    "command": command,
                    "error": f"Unknown command '{cmd}'. Available commands: repos, explore <repo>, classes [repo], class <name>, method <name> [class], query <cypher>"
                }, indent=2)

    except Exception as e:
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Query execution failed: {str(e)}"
        }, indent=2)
1318 |
1319 |
async def _handle_repos_command(session, command: str) -> str:
    """Return a JSON listing of every Repository node, sorted by name.

    Args:
        session: Active Neo4j async session used to run the query.
        command: The raw user command, echoed back in the response.

    Returns:
        JSON string with the repository names and result metadata.
    """
    cypher = "MATCH (r:Repository) RETURN r.name as name ORDER BY r.name"
    cursor = await session.run(cypher)

    # Async comprehension drains the result stream into a name list.
    repo_names = [record['name'] async for record in cursor]

    response = {
        "success": True,
        "command": command,
        "data": {"repositories": repo_names},
        "metadata": {"total_results": len(repo_names), "limited": False},
    }
    return json.dumps(response, indent=2)
1340 |
1341 |
async def _handle_explore_command(session, command: str, repo_name: str) -> str:
    """Return a JSON overview of one repository (file/class/function/method counts).

    Args:
        session: Active Neo4j async session.
        command: Raw user command, echoed back in the response.
        repo_name: Name of the repository to summarize.

    Returns:
        JSON string with per-repository statistics, or an error payload when
        the repository is not present in the knowledge graph.
    """
    # Bail out early when the repository is unknown.
    exists_cypher = "MATCH (r:Repository {name: $repo_name}) RETURN r.name as name"
    cursor = await session.run(exists_cypher, repo_name=repo_name)
    if not await cursor.single():
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Repository '{repo_name}' not found in knowledge graph"
        }, indent=2)

    # Ordered (statistics key, count query) pairs; insertion order fixes the
    # key order in the emitted JSON (files, classes, functions, methods).
    count_queries = [
        ("files", """
        MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)
        RETURN count(f) as total
        """),
        ("classes", """
        MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
        RETURN count(DISTINCT c) as total
        """),
        ("functions", """
        MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(func:Function)
        RETURN count(DISTINCT func) as total
        """),
        ("methods", """
        MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_METHOD]->(m:Method)
        RETURN count(DISTINCT m) as total
        """),
    ]

    statistics = {}
    for key, cypher in count_queries:
        cursor = await session.run(cypher, repo_name=repo_name)
        statistics[key] = (await cursor.single())['total']

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "repository": repo_name,
            "statistics": statistics
        },
        "metadata": {
            "total_results": 1,
            "limited": False
        }
    }, indent=2)
1405 |
1406 |
async def _handle_classes_command(session, command: str, repo_name: str = None) -> str:
    """List classes (optionally filtered to one repository) as JSON.

    Args:
        session: Active Neo4j async session.
        command: Raw user command, echoed back in the response.
        repo_name: Optional repository name restricting the class search.

    Returns:
        JSON string with up to 20 classes, sorted by name.
    """
    max_results = 20
    params = {"limit": max_results}

    if repo_name:
        cypher = """
        MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
        RETURN c.name as name, c.full_name as full_name
        ORDER BY c.name
        LIMIT $limit
        """
        params["repo_name"] = repo_name
    else:
        cypher = """
        MATCH (c:Class)
        RETURN c.name as name, c.full_name as full_name
        ORDER BY c.name
        LIMIT $limit
        """

    cursor = await session.run(cypher, **params)
    found = [
        {'name': record['name'], 'full_name': record['full_name']}
        async for record in cursor
    ]

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "classes": found,
            "repository_filter": repo_name
        },
        "metadata": {
            "total_results": len(found),
            # A full page implies more classes may exist beyond the limit.
            "limited": len(found) >= max_results
        }
    }, indent=2)
1447 |
1448 |
async def _handle_class_command(session, command: str, class_name: str) -> str:
    """Handle 'class <name>' command - explore a specific class.

    Resolves the class by simple or fully-qualified name, then reports its
    methods and attributes.

    Args:
        session: Active Neo4j async session.
        command: Raw user command, echoed back in the response.
        class_name: Simple name or full name of the class to inspect.

    Returns:
        JSON string describing the class, or an error payload when no class
        matches.
    """
    # Resolve the class first; LIMIT 1 picks a single match when several
    # classes share a simple name.
    class_query = """
        MATCH (c:Class)
        WHERE c.name = $class_name OR c.full_name = $class_name
        RETURN c.name as name, c.full_name as full_name
        LIMIT 1
    """
    result = await session.run(class_query, class_name=class_name)
    class_record = await result.single()

    if not class_record:
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Class '{class_name}' not found in knowledge graph"
        }, indent=2)

    actual_name = class_record['name']
    full_name = class_record['full_name']

    # Fetch members by the resolved full_name so the results belong to exactly
    # the class reported above. (Matching on the raw user-supplied name would
    # mix members of every same-named class into one response.)
    methods_query = """
        MATCH (c:Class {full_name: $full_name})-[:HAS_METHOD]->(m:Method)
        RETURN m.name as name, m.params_list as params_list, m.params_detailed as params_detailed, m.return_type as return_type
        ORDER BY m.name
    """
    result = await session.run(methods_query, full_name=full_name)

    methods = []
    async for record in result:
        # Prefer detailed parameter info when stored; fall back to the simple list.
        params_to_use = record['params_detailed'] or record['params_list'] or []
        methods.append({
            'name': record['name'],
            'parameters': params_to_use,
            'return_type': record['return_type'] or 'Any'
        })

    attributes_query = """
        MATCH (c:Class {full_name: $full_name})-[:HAS_ATTRIBUTE]->(a:Attribute)
        RETURN a.name as name, a.type as type
        ORDER BY a.name
    """
    result = await session.run(attributes_query, full_name=full_name)

    attributes = []
    async for record in result:
        attributes.append({
            'name': record['name'],
            'type': record['type'] or 'Any'
        })

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "class": {
                "name": actual_name,
                "full_name": full_name,
                "methods": methods,
                "attributes": attributes
            }
        },
        "metadata": {
            "total_results": 1,
            "methods_count": len(methods),
            "attributes_count": len(attributes),
            "limited": False
        }
    }, indent=2)
1524 |
1525 |
async def _handle_method_command(session, command: str, method_name: str, class_name: str = None) -> str:
    """Search for methods by name, optionally scoped to a single class.

    Args:
        session: Active Neo4j async session.
        command: Raw user command, echoed back in the response.
        method_name: Method name to look up.
        class_name: Optional class (simple or full name) restricting the search.

    Returns:
        JSON string with matching methods, or an error payload when none match.
    """
    if class_name:
        cypher = """
        MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
        WHERE (c.name = $class_name OR c.full_name = $class_name)
        AND m.name = $method_name
        RETURN c.name as class_name, c.full_name as class_full_name,
               m.name as method_name, m.params_list as params_list,
               m.params_detailed as params_detailed, m.return_type as return_type, m.args as args
        """
        cursor = await session.run(cypher, class_name=class_name, method_name=method_name)
    else:
        cypher = """
        MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
        WHERE m.name = $method_name
        RETURN c.name as class_name, c.full_name as class_full_name,
               m.name as method_name, m.params_list as params_list,
               m.params_detailed as params_detailed, m.return_type as return_type, m.args as args
        ORDER BY c.name
        LIMIT 20
        """
        cursor = await session.run(cypher, method_name=method_name)

    def _record_to_entry(record):
        # Prefer detailed parameter metadata; fall back to the simple list.
        parameters = record['params_detailed'] or record['params_list'] or []
        return {
            'class_name': record['class_name'],
            'class_full_name': record['class_full_name'],
            'method_name': record['method_name'],
            'parameters': parameters,
            'return_type': record['return_type'] or 'Any',
            'legacy_args': record['args'] or []
        }

    matches = [_record_to_entry(record) async for record in cursor]

    if not matches:
        scope = f" in class '{class_name}'" if class_name else ""
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Method '{method_name}'{scope} not found"
        }, indent=2)

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "methods": matches,
            "class_filter": class_name
        },
        "metadata": {
            "total_results": len(matches),
            # Only the unscoped search is capped at 20 rows.
            "limited": len(matches) >= 20 and not class_name
        }
    }, indent=2)
1582 |
1583 |
async def _handle_query_command(session, command: str, cypher_query: str) -> str:
    """Handle 'query <cypher>' command - execute a custom Cypher query.

    Results are capped at 20 records to keep responses manageable.

    Args:
        session: Active Neo4j async session.
        command: Raw user command, echoed back in the response.
        cypher_query: Arbitrary user-supplied Cypher text to execute.

    Returns:
        JSON string with up to 20 result records, or an error payload when
        the query fails.
    """
    try:
        result = await session.run(cypher_query)

        records = []
        async for record in result:
            records.append(dict(record))
            if len(records) >= 20:  # Limit results to prevent overwhelming responses
                break

        # default=str stringifies values json cannot encode natively (e.g.
        # Neo4j nodes, relationships, temporal types) instead of raising a
        # TypeError that would be misreported below as a Cypher error.
        return json.dumps({
            "success": True,
            "command": command,
            "data": {
                "query": cypher_query,
                "results": records
            },
            "metadata": {
                "total_results": len(records),
                "limited": len(records) >= 20
            }
        }, indent=2, default=str)

    except Exception as e:
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Cypher query error: {str(e)}",
            "data": {
                "query": cypher_query
            }
        }, indent=2)
1620 |
1621 |
@mcp.tool()
async def parse_github_repository(ctx: Context, repo_url: str) -> str:
    """
    Parse a GitHub repository into the Neo4j knowledge graph.

    This tool clones a GitHub repository, analyzes its Python files, and stores
    the code structure (classes, methods, functions, imports) in Neo4j for use
    in hallucination detection. The tool:

    - Clones the repository to a temporary location
    - Analyzes Python files to extract code structure
    - Stores classes, methods, functions, and imports in Neo4j
    - Provides detailed statistics about the parsing results
    - Automatically handles module name detection for imports

    Args:
        ctx: The MCP server provided context
        repo_url: GitHub repository URL (e.g., 'https://github.com/user/repo.git')

    Returns:
        JSON string with parsing results, statistics, and repository information
    """
    try:
        # Check if knowledge graph functionality is enabled; .lower() accepts
        # "true"/"True"/"TRUE" so casing in the env file cannot silently
        # disable the feature.
        knowledge_graph_enabled = os.getenv("USE_KNOWLEDGE_GRAPH", "false").lower() == "true"
        if not knowledge_graph_enabled:
            return json.dumps({
                "success": False,
                "error": "Knowledge graph functionality is disabled. Set USE_KNOWLEDGE_GRAPH=true in environment."
            }, indent=2)

        # Get the repository extractor from context
        repo_extractor = ctx.request_context.lifespan_context.repo_extractor

        if not repo_extractor:
            return json.dumps({
                "success": False,
                "error": "Repository extractor not available. Check Neo4j configuration in environment variables."
            }, indent=2)

        # Validate the repository URL before doing any expensive work
        validation = validate_github_url(repo_url)
        if not validation["valid"]:
            return json.dumps({
                "success": False,
                "repo_url": repo_url,
                "error": validation["error"]
            }, indent=2)

        repo_name = validation["repo_name"]

        # Parse the repository (this includes cloning, analysis, and Neo4j storage)
        print(f"Starting repository analysis for: {repo_name}")
        await repo_extractor.analyze_repository(repo_url)
        print(f"Repository analysis completed for: {repo_name}")

        # Query Neo4j for statistics about the parsed repository
        async with repo_extractor.driver.session() as session:
            # Single query: aggregate counts plus a small sample of module names
            stats_query = """
            MATCH (r:Repository {name: $repo_name})
            OPTIONAL MATCH (r)-[:CONTAINS]->(f:File)
            OPTIONAL MATCH (f)-[:DEFINES]->(c:Class)
            OPTIONAL MATCH (c)-[:HAS_METHOD]->(m:Method)
            OPTIONAL MATCH (f)-[:DEFINES]->(func:Function)
            OPTIONAL MATCH (c)-[:HAS_ATTRIBUTE]->(a:Attribute)
            WITH r,
                 count(DISTINCT f) as files_count,
                 count(DISTINCT c) as classes_count,
                 count(DISTINCT m) as methods_count,
                 count(DISTINCT func) as functions_count,
                 count(DISTINCT a) as attributes_count

            // Get some sample module names
            OPTIONAL MATCH (r)-[:CONTAINS]->(sample_f:File)
            WITH r, files_count, classes_count, methods_count, functions_count, attributes_count,
                 collect(DISTINCT sample_f.module_name)[0..5] as sample_modules

            RETURN
                r.name as repo_name,
                files_count,
                classes_count,
                methods_count,
                functions_count,
                attributes_count,
                sample_modules
            """

            result = await session.run(stats_query, repo_name=repo_name)
            record = await result.single()

            # Guard clause: no record means the repository node never landed.
            if not record:
                return json.dumps({
                    "success": False,
                    "repo_url": repo_url,
                    "error": f"Repository '{repo_name}' not found in database after parsing"
                }, indent=2)

            stats = {
                "repository": record['repo_name'],
                "files_processed": record['files_count'],
                "classes_created": record['classes_count'],
                "methods_created": record['methods_count'],
                "functions_created": record['functions_count'],
                "attributes_created": record['attributes_count'],
                "sample_modules": record['sample_modules'] or []
            }

        return json.dumps({
            "success": True,
            "repo_url": repo_url,
            "repo_name": repo_name,
            "message": f"Successfully parsed repository '{repo_name}' into knowledge graph",
            "statistics": stats,
            "ready_for_validation": True,
            "next_steps": [
                "Repository is now available for hallucination detection",
                f"Use check_ai_script_hallucinations to validate scripts against {repo_name}",
                "The knowledge graph contains classes, methods, and functions from this repository"
            ]
        }, indent=2)

    except Exception as e:
        return json.dumps({
            "success": False,
            "repo_url": repo_url,
            "error": f"Repository parsing failed: {str(e)}"
        }, indent=2)
1750 |
async def crawl_markdown_file(crawler: AsyncWebCrawler, url: str) -> List[Dict[str, Any]]:
    """
    Crawl a .txt or markdown file.

    Args:
        crawler: AsyncWebCrawler instance
        url: URL of the file

    Returns:
        List of dictionaries with URL and markdown content
    """
    result = await crawler.arun(url=url, config=CrawlerRunConfig())

    # Guard clause: report and return empty on any failed/empty crawl.
    if not (result.success and result.markdown):
        print(f"Failed to crawl {url}: {result.error_message}")
        return []

    return [{'url': url, 'markdown': result.markdown}]
1770 |
async def crawl_batch(crawler: AsyncWebCrawler, urls: List[str], max_concurrent: int = 10) -> List[Dict[str, Any]]:
    """
    Batch crawl multiple URLs in parallel.

    Args:
        crawler: AsyncWebCrawler instance
        urls: List of URLs to crawl
        max_concurrent: Maximum number of concurrent browser sessions

    Returns:
        List of dictionaries with URL and markdown content
    """
    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=False)
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,
        check_interval=1.0,
        max_session_permit=max_concurrent
    )

    crawl_results = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher)

    # Keep only successful crawls that actually produced markdown.
    successful = []
    for page in crawl_results:
        if page.success and page.markdown:
            successful.append({'url': page.url, 'markdown': page.markdown})
    return successful
1792 |
async def crawl_recursive_internal_links(crawler: AsyncWebCrawler, start_urls: List[str], max_depth: int = 3, max_concurrent: int = 10) -> List[Dict[str, Any]]:
    """
    Recursively crawl internal links from start URLs up to a maximum depth.

    Args:
        crawler: AsyncWebCrawler instance
        start_urls: List of starting URLs
        max_depth: Maximum recursion depth
        max_concurrent: Maximum number of concurrent browser sessions

    Returns:
        List of dictionaries with URL and markdown content
    """
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=False)
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,
        check_interval=1.0,
        max_session_permit=max_concurrent
    )

    def strip_fragment(url):
        # Drop the #fragment so the same page isn't queued twice.
        return urldefrag(url)[0]

    visited = set()
    frontier = {strip_fragment(u) for u in start_urls}
    collected = []

    for _ in range(max_depth):
        pending = [u for u in frontier if u not in visited]
        if not pending:
            break

        batch = await crawler.arun_many(urls=pending, config=run_config, dispatcher=dispatcher)
        next_frontier = set()

        for page in batch:
            # Mark as visited even on failure so we don't retry it each level.
            visited.add(strip_fragment(page.url))

            if not (page.success and page.markdown):
                continue

            collected.append({'url': page.url, 'markdown': page.markdown})
            for link in page.links.get("internal", []):
                candidate = strip_fragment(link["href"])
                if candidate not in visited:
                    next_frontier.add(candidate)

        frontier = next_frontier

    return collected
1843 |
async def main():
    """Launch the MCP server; the TRANSPORT env var selects SSE (default) or stdio."""
    use_sse = os.getenv("TRANSPORT", "sse") == 'sse'
    runner = mcp.run_sse_async if use_sse else mcp.run_stdio_async
    await runner()
1852 |
# Script entry point: run the async server startup under asyncio.
if __name__ == "__main__":
    asyncio.run(main())
```