#
tokens: 35302/50000 2/17 files (page 2/2)
lines: on (toggle) GitHub
raw markdown copy reset
This is page 2 of 2. Use http://codebase.md/coleam00/mcp-crawl4ai-rag?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .dockerignore
├── .env.example
├── .gitattributes
├── .gitignore
├── crawled_pages.sql
├── Dockerfile
├── knowledge_graphs
│   ├── ai_hallucination_detector.py
│   ├── ai_script_analyzer.py
│   ├── hallucination_reporter.py
│   ├── knowledge_graph_validator.py
│   ├── parse_repo_into_neo4j.py
│   ├── query_knowledge_graph.py
│   └── test_script.py
├── LICENSE
├── pyproject.toml
├── README.md
├── src
│   ├── crawl4ai_mcp.py
│   └── utils.py
└── uv.lock
```

# Files

--------------------------------------------------------------------------------
/knowledge_graphs/knowledge_graph_validator.py:
--------------------------------------------------------------------------------

```python
   1 | """
   2 | Knowledge Graph Validator
   3 | 
   4 | Validates AI-generated code against Neo4j knowledge graph containing
   5 | repository information. Checks imports, methods, attributes, and parameters.
   6 | """
   7 | 
   8 | import asyncio
   9 | import logging
  10 | from typing import Dict, List, Optional, Set, Tuple, Any
  11 | from dataclasses import dataclass, field
  12 | from enum import Enum
  13 | from neo4j import AsyncGraphDatabase
  14 | 
  15 | from ai_script_analyzer import (
  16 |     AnalysisResult, ImportInfo, MethodCall, AttributeAccess, 
  17 |     FunctionCall, ClassInstantiation
  18 | )
  19 | 
  20 | logger = logging.getLogger(__name__)
  21 | 
  22 | 
class ValidationStatus(Enum):
    """Outcome category for a single validation check."""
    VALID = "VALID"
    INVALID = "INVALID" 
    UNCERTAIN = "UNCERTAIN"
    NOT_FOUND = "NOT_FOUND"
  28 | 
  29 | 
@dataclass
class ValidationResult:
    """Result of validating a single element"""
    status: ValidationStatus
    confidence: float  # 0.0 to 1.0
    message: str  # human-readable explanation of the status
    # Structured extras (e.g. matched file paths); empty dict by default.
    details: Dict[str, Any] = field(default_factory=dict)
    # Actionable hints (e.g. similar method names, valid parameter lists).
    suggestions: List[str] = field(default_factory=list)
  38 | 
  39 | 
@dataclass
class ImportValidation:
    """Validation result for an import"""
    import_info: ImportInfo
    validation: ValidationResult
    # Names exposed by the matched module, filled only for graph-backed modules.
    available_classes: List[str] = field(default_factory=list)
    available_functions: List[str] = field(default_factory=list)
  47 | 
  48 | 
@dataclass
class MethodValidation:
    """Validation result for a method call"""
    method_call: MethodCall
    validation: ValidationResult
    expected_params: List[str] = field(default_factory=list)
    actual_params: List[str] = field(default_factory=list)
    # None until a separate parameter check has been performed.
    parameter_validation: Optional[ValidationResult] = None
  57 | 
  58 | 
@dataclass
class AttributeValidation:
    """Validation result for attribute access"""
    attribute_access: AttributeAccess
    validation: ValidationResult
    # Declared type of the attribute when the graph knows it (or "method").
    expected_type: Optional[str] = None
  65 | 
  66 | 
@dataclass
class FunctionValidation:
    """Validation result for function call"""
    function_call: FunctionCall
    validation: ValidationResult
    expected_params: List[str] = field(default_factory=list)
    actual_params: List[str] = field(default_factory=list)
    # None until a separate parameter check has been performed.
    parameter_validation: Optional[ValidationResult] = None
  75 | 
  76 | 
@dataclass
class ClassValidation:
    """Validation result for class instantiation"""
    class_instantiation: ClassInstantiation
    validation: ValidationResult
    constructor_params: List[str] = field(default_factory=list)
    # None until a separate constructor-parameter check has been performed.
    parameter_validation: Optional[ValidationResult] = None
  84 | 
  85 | 
@dataclass
class ScriptValidationResult:
    """Complete validation results for a script"""
    script_path: str
    analysis_result: AnalysisResult
    # Per-category validation outcomes, in the order they were checked.
    import_validations: List[ImportValidation] = field(default_factory=list)
    class_validations: List[ClassValidation] = field(default_factory=list)
    method_validations: List[MethodValidation] = field(default_factory=list)
    attribute_validations: List[AttributeValidation] = field(default_factory=list)
    function_validations: List[FunctionValidation] = field(default_factory=list)
    overall_confidence: float = 0.0  # aggregate score computed after validation
    hallucinations_detected: List[Dict[str, Any]] = field(default_factory=list)
  98 | 
  99 | 
 100 | class KnowledgeGraphValidator:
 101 |     """Validates code against Neo4j knowledge graph"""
 102 |     
    def __init__(self, neo4j_uri: str, neo4j_user: str, neo4j_password: str):
        """Store connection settings; the driver itself is opened in initialize()."""
        self.neo4j_uri = neo4j_uri
        self.neo4j_user = neo4j_user
        self.neo4j_password = neo4j_password
        self.driver = None  # set by initialize(), closed by close()
        
        # Cache for performance
        self.module_cache: Dict[str, List[str]] = {}  # module name -> matching file paths
        self.class_cache: Dict[str, Dict[str, Any]] = {}
        self.method_cache: Dict[str, List[Dict[str, Any]]] = {}
        self.repo_cache: Dict[str, str] = {}  # module_name -> repo_name
        self.knowledge_graph_modules: Set[str] = set()  # Track modules in knowledge graph
 115 |     
 116 |     async def initialize(self):
 117 |         """Initialize Neo4j connection"""
 118 |         self.driver = AsyncGraphDatabase.driver(
 119 |             self.neo4j_uri, 
 120 |             auth=(self.neo4j_user, self.neo4j_password)
 121 |         )
 122 |         logger.info("Knowledge graph validator initialized")
 123 |     
 124 |     async def close(self):
 125 |         """Close Neo4j connection"""
 126 |         if self.driver:
 127 |             await self.driver.close()
 128 |     
    async def validate_script(self, analysis_result: AnalysisResult) -> ScriptValidationResult:
        """Validate entire script analysis against knowledge graph

        Runs the per-category validators sequentially.  Imports must run
        first: _validate_single_import records graph-backed modules in
        self.knowledge_graph_modules, which the later validators use to
        decide which names to skip.
        """
        result = ScriptValidationResult(
            script_path=analysis_result.file_path,
            analysis_result=analysis_result
        )
        
        # Validate imports first (builds context for other validations)
        result.import_validations = await self._validate_imports(analysis_result.imports)
        
        # Validate class instantiations
        result.class_validations = await self._validate_class_instantiations(
            analysis_result.class_instantiations
        )
        
        # Validate method calls
        result.method_validations = await self._validate_method_calls(
            analysis_result.method_calls
        )
        
        # Validate attribute accesses
        result.attribute_validations = await self._validate_attribute_accesses(
            analysis_result.attribute_accesses
        )
        
        # Validate function calls
        result.function_validations = await self._validate_function_calls(
            analysis_result.function_calls
        )
        
        # Calculate overall confidence and detect hallucinations
        result.overall_confidence = self._calculate_overall_confidence(result)
        result.hallucinations_detected = self._detect_hallucinations(result)
        
        return result
 164 |     
 165 |     async def _validate_imports(self, imports: List[ImportInfo]) -> List[ImportValidation]:
 166 |         """Validate all imports against knowledge graph"""
 167 |         validations = []
 168 |         
 169 |         for import_info in imports:
 170 |             validation = await self._validate_single_import(import_info)
 171 |             validations.append(validation)
 172 |         
 173 |         return validations
 174 |     
 175 |     async def _validate_single_import(self, import_info: ImportInfo) -> ImportValidation:
 176 |         """Validate a single import"""
 177 |         # Determine module to search for
 178 |         search_module = import_info.module if import_info.is_from_import else import_info.name
 179 |         
 180 |         # Check cache first
 181 |         if search_module in self.module_cache:
 182 |             available_files = self.module_cache[search_module]
 183 |         else:
 184 |             # Query Neo4j for matching modules
 185 |             available_files = await self._find_modules(search_module)
 186 |             self.module_cache[search_module] = available_files
 187 |         
 188 |         if available_files:
 189 |             # Get available classes and functions from the module
 190 |             classes, functions = await self._get_module_contents(search_module)
 191 |             
 192 |             # Track this module as being in the knowledge graph
 193 |             self.knowledge_graph_modules.add(search_module)
 194 |             
 195 |             # Also track the base module for "from X.Y.Z import ..." patterns
 196 |             if '.' in search_module:
 197 |                 base_module = search_module.split('.')[0]
 198 |                 self.knowledge_graph_modules.add(base_module)
 199 |             
 200 |             validation = ValidationResult(
 201 |                 status=ValidationStatus.VALID,
 202 |                 confidence=0.9,
 203 |                 message=f"Module '{search_module}' found in knowledge graph",
 204 |                 details={"matched_files": available_files, "in_knowledge_graph": True}
 205 |             )
 206 |             
 207 |             return ImportValidation(
 208 |                 import_info=import_info,
 209 |                 validation=validation,
 210 |                 available_classes=classes,
 211 |                 available_functions=functions
 212 |             )
 213 |         else:
 214 |             # External library - mark as such but don't treat as error
 215 |             validation = ValidationResult(
 216 |                 status=ValidationStatus.UNCERTAIN,
 217 |                 confidence=0.8,  # High confidence it's external, not an error
 218 |                 message=f"Module '{search_module}' is external (not in knowledge graph)",
 219 |                 details={"could_be_external": True, "in_knowledge_graph": False}
 220 |             )
 221 |             
 222 |             return ImportValidation(
 223 |                 import_info=import_info,
 224 |                 validation=validation
 225 |             )
 226 |     
 227 |     async def _validate_class_instantiations(self, instantiations: List[ClassInstantiation]) -> List[ClassValidation]:
 228 |         """Validate class instantiations"""
 229 |         validations = []
 230 |         
 231 |         for instantiation in instantiations:
 232 |             validation = await self._validate_single_class_instantiation(instantiation)
 233 |             validations.append(validation)
 234 |         
 235 |         return validations
 236 |     
    async def _validate_single_class_instantiation(self, instantiation: ClassInstantiation) -> ClassValidation:
        """Validate a single class instantiation

        Checks that the class exists in the knowledge graph and, if an
        __init__ signature is available, that the provided constructor
        arguments match it.  Non-graph classes are skipped as UNCERTAIN.
        """
        # Prefer the fully-qualified name when the analyzer resolved one.
        class_name = instantiation.full_class_name or instantiation.class_name
        
        # Skip validation for classes not from knowledge graph
        if not self._is_from_knowledge_graph(class_name):
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.8,
                message=f"Skipping validation: '{class_name}' is not from knowledge graph"
            )
            return ClassValidation(
                class_instantiation=instantiation,
                validation=validation
            )
        
        # Find class in knowledge graph
        class_info = await self._find_class(class_name)
        
        if not class_info:
            validation = ValidationResult(
                status=ValidationStatus.NOT_FOUND,
                confidence=0.2,
                message=f"Class '{class_name}' not found in knowledge graph"
            )
            return ClassValidation(
                class_instantiation=instantiation,
                validation=validation
            )
        
        # Check constructor parameters (look for __init__ method)
        init_method = await self._find_method(class_name, "__init__")
        
        if init_method:
            param_validation = self._validate_parameters(
                expected_params=init_method.get('params_list', []),
                provided_args=instantiation.args,
                provided_kwargs=instantiation.kwargs
            )
        else:
            # No __init__ signature in the graph: cannot judge the arguments.
            param_validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.5,
                message="Constructor parameters not found"
            )
        
        # Use parameter validation result if it failed
        if param_validation.status == ValidationStatus.INVALID:
            validation = ValidationResult(
                status=ValidationStatus.INVALID,
                confidence=param_validation.confidence,
                message=f"Class '{class_name}' found but has invalid constructor parameters: {param_validation.message}",
                suggestions=param_validation.suggestions
            )
        else:
            validation = ValidationResult(
                status=ValidationStatus.VALID,
                confidence=0.8,
                message=f"Class '{class_name}' found in knowledge graph"
            )
        
        return ClassValidation(
            class_instantiation=instantiation,
            validation=validation,
            parameter_validation=param_validation
        )
 303 |     
 304 |     async def _validate_method_calls(self, method_calls: List[MethodCall]) -> List[MethodValidation]:
 305 |         """Validate method calls"""
 306 |         validations = []
 307 |         
 308 |         for method_call in method_calls:
 309 |             validation = await self._validate_single_method_call(method_call)
 310 |             validations.append(validation)
 311 |         
 312 |         return validations
 313 |     
    async def _validate_single_method_call(self, method_call: MethodCall) -> MethodValidation:
        """Validate a single method call

        Requires the analyzer to have inferred the receiver's type; without
        it the call is UNCERTAIN.  Graph-backed types are checked for method
        existence and argument compatibility; others are skipped.
        """
        class_type = method_call.object_type
        
        # Receiver type unknown: nothing to look up in the graph.
        if not class_type:
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.3,
                message=f"Cannot determine object type for '{method_call.object_name}'"
            )
            return MethodValidation(
                method_call=method_call,
                validation=validation
            )
        
        # Skip validation for classes not from knowledge graph
        if not self._is_from_knowledge_graph(class_type):
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.8,
                message=f"Skipping validation: '{class_type}' is not from knowledge graph"
            )
            return MethodValidation(
                method_call=method_call,
                validation=validation
            )
        
        # Find method in knowledge graph
        method_info = await self._find_method(class_type, method_call.method_name)
        
        if not method_info:
            # Check for similar method names (offered as fix suggestions).
            similar_methods = await self._find_similar_methods(class_type, method_call.method_name)
            
            validation = ValidationResult(
                status=ValidationStatus.NOT_FOUND,
                confidence=0.1,
                message=f"Method '{method_call.method_name}' not found on class '{class_type}'",
                suggestions=similar_methods
            )
            return MethodValidation(
                method_call=method_call,
                validation=validation
            )
        
        # Validate parameters
        expected_params = method_info.get('params_list', [])
        param_validation = self._validate_parameters(
            expected_params=expected_params,
            provided_args=method_call.args,
            provided_kwargs=method_call.kwargs
        )
        
        # Use parameter validation result if it failed
        if param_validation.status == ValidationStatus.INVALID:
            validation = ValidationResult(
                status=ValidationStatus.INVALID,
                confidence=param_validation.confidence,
                message=f"Method '{method_call.method_name}' found but has invalid parameters: {param_validation.message}",
                suggestions=param_validation.suggestions
            )
        else:
            validation = ValidationResult(
                status=ValidationStatus.VALID,
                confidence=0.9,
                message=f"Method '{method_call.method_name}' found on class '{class_type}'"
            )
        
        return MethodValidation(
            method_call=method_call,
            validation=validation,
            expected_params=expected_params,
            actual_params=method_call.args + list(method_call.kwargs.keys()),
            parameter_validation=param_validation
        )
 389 |     
 390 |     async def _validate_attribute_accesses(self, attribute_accesses: List[AttributeAccess]) -> List[AttributeValidation]:
 391 |         """Validate attribute accesses"""
 392 |         validations = []
 393 |         
 394 |         for attr_access in attribute_accesses:
 395 |             validation = await self._validate_single_attribute_access(attr_access)
 396 |             validations.append(validation)
 397 |         
 398 |         return validations
 399 |     
    async def _validate_single_attribute_access(self, attr_access: AttributeAccess) -> AttributeValidation:
        """Validate a single attribute access

        Looks the name up first as an attribute, then falls back to a
        method lookup so decorator-style uses (e.g. @agent.tool) still
        validate.  Unknown or non-graph receiver types are UNCERTAIN.
        """
        class_type = attr_access.object_type
        
        # Receiver type unknown: nothing to look up in the graph.
        if not class_type:
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.3,
                message=f"Cannot determine object type for '{attr_access.object_name}'"
            )
            return AttributeValidation(
                attribute_access=attr_access,
                validation=validation
            )
        
        # Skip validation for classes not from knowledge graph
        if not self._is_from_knowledge_graph(class_type):
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.8,
                message=f"Skipping validation: '{class_type}' is not from knowledge graph"
            )
            return AttributeValidation(
                attribute_access=attr_access,
                validation=validation
            )
        
        # Find attribute in knowledge graph
        attr_info = await self._find_attribute(class_type, attr_access.attribute_name)
        
        if not attr_info:
            # If not found as attribute, check if it's a method (for decorators like @agent.tool)
            method_info = await self._find_method(class_type, attr_access.attribute_name)
            
            if method_info:
                validation = ValidationResult(
                    status=ValidationStatus.VALID,
                    confidence=0.8,
                    message=f"'{attr_access.attribute_name}' found as method on class '{class_type}' (likely used as decorator)"
                )
                return AttributeValidation(
                    attribute_access=attr_access,
                    validation=validation,
                    expected_type="method"
                )
            
            validation = ValidationResult(
                status=ValidationStatus.NOT_FOUND,
                confidence=0.2,
                message=f"'{attr_access.attribute_name}' not found on class '{class_type}'"
            )
            return AttributeValidation(
                attribute_access=attr_access,
                validation=validation
            )
        
        validation = ValidationResult(
            status=ValidationStatus.VALID,
            confidence=0.8,
            message=f"Attribute '{attr_access.attribute_name}' found on class '{class_type}'"
        )
        
        return AttributeValidation(
            attribute_access=attr_access,
            validation=validation,
            expected_type=attr_info.get('type')
        )
 467 |     
 468 |     async def _validate_function_calls(self, function_calls: List[FunctionCall]) -> List[FunctionValidation]:
 469 |         """Validate function calls"""
 470 |         validations = []
 471 |         
 472 |         for func_call in function_calls:
 473 |             validation = await self._validate_single_function_call(func_call)
 474 |             validations.append(validation)
 475 |         
 476 |         return validations
 477 |     
    async def _validate_single_function_call(self, func_call: FunctionCall) -> FunctionValidation:
        """Validate a single function call

        Checks that the function exists in the knowledge graph and that the
        provided arguments match its recorded signature.
        """
        func_name = func_call.full_name or func_call.function_name
        
        # Skip validation for functions not from knowledge graph.
        # NOTE(review): the skip only applies when full_name is set; bare
        # calls (no full_name) are always looked up and may be reported
        # NOT_FOUND even if external — confirm this is intended.
        if func_call.full_name and not self._is_from_knowledge_graph(func_call.full_name):
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.8,
                message=f"Skipping validation: '{func_name}' is not from knowledge graph"
            )
            return FunctionValidation(
                function_call=func_call,
                validation=validation
            )
        
        # Find function in knowledge graph
        func_info = await self._find_function(func_name)
        
        if not func_info:
            validation = ValidationResult(
                status=ValidationStatus.NOT_FOUND,
                confidence=0.2,
                message=f"Function '{func_name}' not found in knowledge graph"
            )
            return FunctionValidation(
                function_call=func_call,
                validation=validation
            )
        
        # Validate parameters
        expected_params = func_info.get('params_list', [])
        param_validation = self._validate_parameters(
            expected_params=expected_params,
            provided_args=func_call.args,
            provided_kwargs=func_call.kwargs
        )
        
        # Use parameter validation result if it failed
        if param_validation.status == ValidationStatus.INVALID:
            validation = ValidationResult(
                status=ValidationStatus.INVALID,
                confidence=param_validation.confidence,
                message=f"Function '{func_name}' found but has invalid parameters: {param_validation.message}",
                suggestions=param_validation.suggestions
            )
        else:
            validation = ValidationResult(
                status=ValidationStatus.VALID,
                confidence=0.8,
                message=f"Function '{func_name}' found in knowledge graph"
            )
        
        return FunctionValidation(
            function_call=func_call,
            validation=validation,
            expected_params=expected_params,
            actual_params=func_call.args + list(func_call.kwargs.keys()),
            parameter_validation=param_validation
        )
 538 |     
 539 |     def _validate_parameters(self, expected_params: List[str], provided_args: List[str], 
 540 |                            provided_kwargs: Dict[str, str]) -> ValidationResult:
 541 |         """Validate function/method parameters with comprehensive support"""
 542 |         if not expected_params:
 543 |             return ValidationResult(
 544 |                 status=ValidationStatus.UNCERTAIN,
 545 |                 confidence=0.5,
 546 |                 message="Parameter information not available"
 547 |             )
 548 |         
 549 |         # Parse expected parameters - handle detailed format
 550 |         required_positional = []
 551 |         optional_positional = []
 552 |         keyword_only_required = []
 553 |         keyword_only_optional = []
 554 |         has_varargs = False
 555 |         has_varkwargs = False
 556 |         
 557 |         for param in expected_params:
 558 |             # Handle detailed format: "[keyword_only] name:type=default" or "name:type"
 559 |             param_clean = param.strip()
 560 |             
 561 |             # Check for parameter kind prefix
 562 |             kind = 'positional'
 563 |             if param_clean.startswith('['):
 564 |                 end_bracket = param_clean.find(']')
 565 |                 if end_bracket > 0:
 566 |                     kind = param_clean[1:end_bracket]
 567 |                     param_clean = param_clean[end_bracket+1:].strip()
 568 |             
 569 |             # Check for varargs/varkwargs
 570 |             if param_clean.startswith('*') and not param_clean.startswith('**'):
 571 |                 has_varargs = True
 572 |                 continue
 573 |             elif param_clean.startswith('**'):
 574 |                 has_varkwargs = True
 575 |                 continue
 576 |             
 577 |             # Parse name and check if optional
 578 |             if ':' in param_clean:
 579 |                 param_name = param_clean.split(':')[0]
 580 |                 is_optional = '=' in param_clean
 581 |                 
 582 |                 if kind == 'keyword_only':
 583 |                     if is_optional:
 584 |                         keyword_only_optional.append(param_name)
 585 |                     else:
 586 |                         keyword_only_required.append(param_name)
 587 |                 else:  # positional
 588 |                     if is_optional:
 589 |                         optional_positional.append(param_name)
 590 |                     else:
 591 |                         required_positional.append(param_name)
 592 |         
 593 |         # Count provided parameters
 594 |         provided_positional_count = len(provided_args)
 595 |         provided_keyword_names = set(provided_kwargs.keys())
 596 |         
 597 |         # Validate positional arguments
 598 |         min_required_positional = len(required_positional)
 599 |         max_allowed_positional = len(required_positional) + len(optional_positional)
 600 |         
 601 |         if not has_varargs and provided_positional_count > max_allowed_positional:
 602 |             return ValidationResult(
 603 |                 status=ValidationStatus.INVALID,
 604 |                 confidence=0.8,
 605 |                 message=f"Too many positional arguments: provided {provided_positional_count}, max allowed {max_allowed_positional}"
 606 |             )
 607 |         
 608 |         if provided_positional_count < min_required_positional:
 609 |             return ValidationResult(
 610 |                 status=ValidationStatus.INVALID,
 611 |                 confidence=0.8,
 612 |                 message=f"Too few positional arguments: provided {provided_positional_count}, required {min_required_positional}"
 613 |             )
 614 |         
 615 |         # Validate keyword arguments
 616 |         all_valid_kwarg_names = set(required_positional + optional_positional + keyword_only_required + keyword_only_optional)
 617 |         invalid_kwargs = provided_keyword_names - all_valid_kwarg_names
 618 |         
 619 |         if invalid_kwargs and not has_varkwargs:
 620 |             return ValidationResult(
 621 |                 status=ValidationStatus.INVALID,
 622 |                 confidence=0.7,
 623 |                 message=f"Invalid keyword arguments: {list(invalid_kwargs)}",
 624 |                 suggestions=[f"Valid parameters: {list(all_valid_kwarg_names)}"]
 625 |             )
 626 |         
 627 |         # Check required keyword-only arguments
 628 |         missing_required_kwargs = set(keyword_only_required) - provided_keyword_names
 629 |         if missing_required_kwargs:
 630 |             return ValidationResult(
 631 |                 status=ValidationStatus.INVALID,
 632 |                 confidence=0.8,
 633 |                 message=f"Missing required keyword arguments: {list(missing_required_kwargs)}"
 634 |             )
 635 |         
 636 |         return ValidationResult(
 637 |             status=ValidationStatus.VALID,
 638 |             confidence=0.9,
 639 |             message="Parameters are valid"
 640 |         )
 641 |     
 642 |     # Neo4j Query Methods
 643 |     
    async def _find_modules(self, module_name: str) -> List[str]:
        """Find repository matching the module name, then return its files

        Two-stage lookup: first match File.module_name (exact, dotted
        prefix, or top-level segment), then fall back to fuzzy Repository
        name matching with '-'/'_' normalization.  Returns up to 50 file
        paths from the single best-matching repository, or [] if none.
        """
        async with self.driver.session() as session:
            # First, try to find files with module names that match or start with the search term
            module_query = """
            MATCH (r:Repository)-[:CONTAINS]->(f:File)
            WHERE f.module_name = $module_name 
               OR f.module_name STARTS WITH $module_name + '.'
               OR split(f.module_name, '.')[0] = $module_name
            RETURN DISTINCT r.name as repo_name, count(f) as file_count
            ORDER BY file_count DESC
            LIMIT 5
            """
            
            result = await session.run(module_query, module_name=module_name)
            repos_from_modules = []
            async for record in result:
                repos_from_modules.append(record['repo_name'])
            
            # Also try repository name matching as fallback
            repo_query = """
            MATCH (r:Repository)
            WHERE toLower(r.name) = toLower($module_name)
               OR toLower(replace(r.name, '-', '_')) = toLower($module_name)
               OR toLower(replace(r.name, '_', '-')) = toLower($module_name)
            RETURN r.name as repo_name
            ORDER BY 
                CASE 
                    WHEN toLower(r.name) = toLower($module_name) THEN 1
                    WHEN toLower(replace(r.name, '-', '_')) = toLower($module_name) THEN 2
                    WHEN toLower(replace(r.name, '_', '-')) = toLower($module_name) THEN 3
                END
            LIMIT 5
            """
            
            result = await session.run(repo_query, module_name=module_name)
            repos_from_names = []
            async for record in result:
                repos_from_names.append(record['repo_name'])
            
            # Combine results, prioritizing module-based matches
            all_repos = repos_from_modules + [r for r in repos_from_names if r not in repos_from_modules]
            
            if not all_repos:
                return []
            
            # Get files from the best matching repository
            best_repo = all_repos[0]
            files_query = """
            MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)
            RETURN f.path, f.module_name
            LIMIT 50
            """
            
            result = await session.run(files_query, repo_name=best_repo)
            files = []
            async for record in result:
                # Un-aliased Cypher return expressions are keyed by their
                # literal text, hence the 'f.path' record key.
                files.append(record['f.path'])
            
            return files
 704 |     
 705 |     async def _get_module_contents(self, module_name: str) -> Tuple[List[str], List[str]]:
 706 |         """Get classes and functions available in a repository matching the module name"""
 707 |         async with self.driver.session() as session:
 708 |             # First, try to find repository by module names in files
 709 |             module_query = """
 710 |             MATCH (r:Repository)-[:CONTAINS]->(f:File)
 711 |             WHERE f.module_name = $module_name 
 712 |                OR f.module_name STARTS WITH $module_name + '.'
 713 |                OR split(f.module_name, '.')[0] = $module_name
 714 |             RETURN DISTINCT r.name as repo_name, count(f) as file_count
 715 |             ORDER BY file_count DESC
 716 |             LIMIT 1
 717 |             """
 718 |             
 719 |             result = await session.run(module_query, module_name=module_name)
 720 |             record = await result.single()
 721 |             
 722 |             if record:
 723 |                 repo_name = record['repo_name']
 724 |             else:
 725 |                 # Fallback to repository name matching
 726 |                 repo_query = """
 727 |                 MATCH (r:Repository)
 728 |                 WHERE toLower(r.name) = toLower($module_name)
 729 |                    OR toLower(replace(r.name, '-', '_')) = toLower($module_name)
 730 |                    OR toLower(replace(r.name, '_', '-')) = toLower($module_name)
 731 |                 RETURN r.name as repo_name
 732 |                 ORDER BY 
 733 |                     CASE 
 734 |                         WHEN toLower(r.name) = toLower($module_name) THEN 1
 735 |                         WHEN toLower(replace(r.name, '-', '_')) = toLower($module_name) THEN 2
 736 |                         WHEN toLower(replace(r.name, '_', '-')) = toLower($module_name) THEN 3
 737 |                     END
 738 |                 LIMIT 1
 739 |                 """
 740 |                 
 741 |                 result = await session.run(repo_query, module_name=module_name)
 742 |                 record = await result.single()
 743 |                 
 744 |                 if not record:
 745 |                     return [], []
 746 |                 
 747 |                 repo_name = record['repo_name']
 748 |             
 749 |             # Get classes from this repository
 750 |             class_query = """
 751 |             MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
 752 |             RETURN DISTINCT c.name as class_name
 753 |             """
 754 |             
 755 |             result = await session.run(class_query, repo_name=repo_name)
 756 |             classes = []
 757 |             async for record in result:
 758 |                 classes.append(record['class_name'])
 759 |             
 760 |             # Get functions from this repository
 761 |             func_query = """
 762 |             MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(func:Function)
 763 |             RETURN DISTINCT func.name as function_name
 764 |             """
 765 |             
 766 |             result = await session.run(func_query, repo_name=repo_name)
 767 |             functions = []
 768 |             async for record in result:
 769 |                 functions.append(record['function_name'])
 770 |             
 771 |             return classes, functions
 772 |     
 773 |     async def _find_repository_for_module(self, module_name: str) -> Optional[str]:
 774 |         """Find the repository name that matches a module name"""
 775 |         if module_name in self.repo_cache:
 776 |             return self.repo_cache[module_name]
 777 |         
 778 |         async with self.driver.session() as session:
 779 |             # First, try to find repository by module names in files
 780 |             module_query = """
 781 |             MATCH (r:Repository)-[:CONTAINS]->(f:File)
 782 |             WHERE f.module_name = $module_name 
 783 |                OR f.module_name STARTS WITH $module_name + '.'
 784 |                OR split(f.module_name, '.')[0] = $module_name
 785 |             RETURN DISTINCT r.name as repo_name, count(f) as file_count
 786 |             ORDER BY file_count DESC
 787 |             LIMIT 1
 788 |             """
 789 |             
 790 |             result = await session.run(module_query, module_name=module_name)
 791 |             record = await result.single()
 792 |             
 793 |             if record:
 794 |                 repo_name = record['repo_name']
 795 |             else:
 796 |                 # Fallback to repository name matching
 797 |                 query = """
 798 |                 MATCH (r:Repository)
 799 |                 WHERE toLower(r.name) = toLower($module_name)
 800 |                    OR toLower(replace(r.name, '-', '_')) = toLower($module_name)
 801 |                    OR toLower(replace(r.name, '_', '-')) = toLower($module_name)
 802 |                    OR toLower(r.name) CONTAINS toLower($module_name)
 803 |                    OR toLower($module_name) CONTAINS toLower(replace(r.name, '-', '_'))
 804 |                 RETURN r.name as repo_name
 805 |                 ORDER BY 
 806 |                     CASE 
 807 |                         WHEN toLower(r.name) = toLower($module_name) THEN 1
 808 |                         WHEN toLower(replace(r.name, '-', '_')) = toLower($module_name) THEN 2
 809 |                         ELSE 3
 810 |                     END
 811 |                 LIMIT 1
 812 |                 """
 813 |                 
 814 |                 result = await session.run(query, module_name=module_name)
 815 |                 record = await result.single()
 816 |                 
 817 |                 repo_name = record['repo_name'] if record else None
 818 |             
 819 |             self.repo_cache[module_name] = repo_name
 820 |             return repo_name
 821 |     
 822 |     async def _find_class(self, class_name: str) -> Optional[Dict[str, Any]]:
 823 |         """Find class information in knowledge graph"""
 824 |         async with self.driver.session() as session:
 825 |             # First try exact match
 826 |             query = """
 827 |             MATCH (c:Class)
 828 |             WHERE c.name = $class_name OR c.full_name = $class_name
 829 |             RETURN c.name as name, c.full_name as full_name
 830 |             LIMIT 1
 831 |             """
 832 |             
 833 |             result = await session.run(query, class_name=class_name)
 834 |             record = await result.single()
 835 |             
 836 |             if record:
 837 |                 return {
 838 |                     'name': record['name'],
 839 |                     'full_name': record['full_name']
 840 |                 }
 841 |             
 842 |             # If no exact match and class_name has dots, try repository-based search
 843 |             if '.' in class_name:
 844 |                 parts = class_name.split('.')
 845 |                 module_part = '.'.join(parts[:-1])  # e.g., "pydantic_ai"
 846 |                 class_part = parts[-1]  # e.g., "Agent"
 847 |                 
 848 |                 # Find repository for the module
 849 |                 repo_name = await self._find_repository_for_module(module_part)
 850 |                 
 851 |                 if repo_name:
 852 |                     # Search for class within this repository
 853 |                     repo_query = """
 854 |                     MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
 855 |                     WHERE c.name = $class_name
 856 |                     RETURN c.name as name, c.full_name as full_name
 857 |                     LIMIT 1
 858 |                     """
 859 |                     
 860 |                     result = await session.run(repo_query, repo_name=repo_name, class_name=class_part)
 861 |                     record = await result.single()
 862 |                     
 863 |                     if record:
 864 |                         return {
 865 |                             'name': record['name'],
 866 |                             'full_name': record['full_name']
 867 |                         }
 868 |             
 869 |             return None
 870 |     
 871 |     async def _find_method(self, class_name: str, method_name: str) -> Optional[Dict[str, Any]]:
 872 |         """Find method information for a class"""
 873 |         cache_key = f"{class_name}.{method_name}"
 874 |         if cache_key in self.method_cache:
 875 |             methods = self.method_cache[cache_key]
 876 |             return methods[0] if methods else None
 877 |         
 878 |         async with self.driver.session() as session:
 879 |             # First try exact match
 880 |             query = """
 881 |             MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
 882 |             WHERE (c.name = $class_name OR c.full_name = $class_name)
 883 |               AND m.name = $method_name
 884 |             RETURN m.name as name, m.params_list as params_list, m.params_detailed as params_detailed, 
 885 |                    m.return_type as return_type, m.args as args
 886 |             LIMIT 1
 887 |             """
 888 |             
 889 |             result = await session.run(query, class_name=class_name, method_name=method_name)
 890 |             record = await result.single()
 891 |             
 892 |             if record:
 893 |                 # Use detailed params if available, fall back to simple params
 894 |                 params_to_use = record['params_detailed'] or record['params_list'] or []
 895 |                 
 896 |                 method_info = {
 897 |                     'name': record['name'],
 898 |                     'params_list': params_to_use,
 899 |                     'return_type': record['return_type'],
 900 |                     'args': record['args'] or []
 901 |                 }
 902 |                 self.method_cache[cache_key] = [method_info]
 903 |                 return method_info
 904 |             
 905 |             # If no exact match and class_name has dots, try repository-based search
 906 |             if '.' in class_name:
 907 |                 parts = class_name.split('.')
 908 |                 module_part = '.'.join(parts[:-1])  # e.g., "pydantic_ai"
 909 |                 class_part = parts[-1]  # e.g., "Agent"
 910 |                 
 911 |                 # Find repository for the module
 912 |                 repo_name = await self._find_repository_for_module(module_part)
 913 |                 
 914 |                 if repo_name:
 915 |                     # Search for method within this repository's classes
 916 |                     repo_query = """
 917 |                     MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_METHOD]->(m:Method)
 918 |                     WHERE c.name = $class_name AND m.name = $method_name
 919 |                     RETURN m.name as name, m.params_list as params_list, m.params_detailed as params_detailed,
 920 |                            m.return_type as return_type, m.args as args
 921 |                     LIMIT 1
 922 |                     """
 923 |                     
 924 |                     result = await session.run(repo_query, repo_name=repo_name, class_name=class_part, method_name=method_name)
 925 |                     record = await result.single()
 926 |                     
 927 |                     if record:
 928 |                         # Use detailed params if available, fall back to simple params
 929 |                         params_to_use = record['params_detailed'] or record['params_list'] or []
 930 |                         
 931 |                         method_info = {
 932 |                             'name': record['name'],
 933 |                             'params_list': params_to_use,
 934 |                             'return_type': record['return_type'],
 935 |                             'args': record['args'] or []
 936 |                         }
 937 |                         self.method_cache[cache_key] = [method_info]
 938 |                         return method_info
 939 |             
 940 |             self.method_cache[cache_key] = []
 941 |             return None
 942 |     
 943 |     async def _find_attribute(self, class_name: str, attr_name: str) -> Optional[Dict[str, Any]]:
 944 |         """Find attribute information for a class"""
 945 |         async with self.driver.session() as session:
 946 |             # First try exact match
 947 |             query = """
 948 |             MATCH (c:Class)-[:HAS_ATTRIBUTE]->(a:Attribute)
 949 |             WHERE (c.name = $class_name OR c.full_name = $class_name)
 950 |               AND a.name = $attr_name
 951 |             RETURN a.name as name, a.type as type
 952 |             LIMIT 1
 953 |             """
 954 |             
 955 |             result = await session.run(query, class_name=class_name, attr_name=attr_name)
 956 |             record = await result.single()
 957 |             
 958 |             if record:
 959 |                 return {
 960 |                     'name': record['name'],
 961 |                     'type': record['type']
 962 |                 }
 963 |             
 964 |             # If no exact match and class_name has dots, try repository-based search
 965 |             if '.' in class_name:
 966 |                 parts = class_name.split('.')
 967 |                 module_part = '.'.join(parts[:-1])  # e.g., "pydantic_ai"
 968 |                 class_part = parts[-1]  # e.g., "Agent"
 969 |                 
 970 |                 # Find repository for the module
 971 |                 repo_name = await self._find_repository_for_module(module_part)
 972 |                 
 973 |                 if repo_name:
 974 |                     # Search for attribute within this repository's classes
 975 |                     repo_query = """
 976 |                     MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_ATTRIBUTE]->(a:Attribute)
 977 |                     WHERE c.name = $class_name AND a.name = $attr_name
 978 |                     RETURN a.name as name, a.type as type
 979 |                     LIMIT 1
 980 |                     """
 981 |                     
 982 |                     result = await session.run(repo_query, repo_name=repo_name, class_name=class_part, attr_name=attr_name)
 983 |                     record = await result.single()
 984 |                     
 985 |                     if record:
 986 |                         return {
 987 |                             'name': record['name'],
 988 |                             'type': record['type']
 989 |                         }
 990 |             
 991 |             return None
 992 |     
 993 |     async def _find_function(self, func_name: str) -> Optional[Dict[str, Any]]:
 994 |         """Find function information"""
 995 |         async with self.driver.session() as session:
 996 |             # First try exact match
 997 |             query = """
 998 |             MATCH (f:Function)
 999 |             WHERE f.name = $func_name OR f.full_name = $func_name
1000 |             RETURN f.name as name, f.params_list as params_list, f.params_detailed as params_detailed,
1001 |                    f.return_type as return_type, f.args as args
1002 |             LIMIT 1
1003 |             """
1004 |             
1005 |             result = await session.run(query, func_name=func_name)
1006 |             record = await result.single()
1007 |             
1008 |             if record:
1009 |                 # Use detailed params if available, fall back to simple params
1010 |                 params_to_use = record['params_detailed'] or record['params_list'] or []
1011 |                 
1012 |                 return {
1013 |                     'name': record['name'],
1014 |                     'params_list': params_to_use,
1015 |                     'return_type': record['return_type'],
1016 |                     'args': record['args'] or []
1017 |                 }
1018 |             
1019 |             # If no exact match and func_name has dots, try repository-based search
1020 |             if '.' in func_name:
1021 |                 parts = func_name.split('.')
1022 |                 module_part = '.'.join(parts[:-1])  # e.g., "pydantic_ai"
1023 |                 func_part = parts[-1]  # e.g., "some_function"
1024 |                 
1025 |                 # Find repository for the module
1026 |                 repo_name = await self._find_repository_for_module(module_part)
1027 |                 
1028 |                 if repo_name:
1029 |                     # Search for function within this repository
1030 |                     repo_query = """
1031 |                     MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(func:Function)
1032 |                     WHERE func.name = $func_name
1033 |                     RETURN func.name as name, func.params_list as params_list, func.params_detailed as params_detailed,
1034 |                            func.return_type as return_type, func.args as args
1035 |                     LIMIT 1
1036 |                     """
1037 |                     
1038 |                     result = await session.run(repo_query, repo_name=repo_name, func_name=func_part)
1039 |                     record = await result.single()
1040 |                     
1041 |                     if record:
1042 |                         # Use detailed params if available, fall back to simple params
1043 |                         params_to_use = record['params_detailed'] or record['params_list'] or []
1044 |                         
1045 |                         return {
1046 |                             'name': record['name'],
1047 |                             'params_list': params_to_use,
1048 |                             'return_type': record['return_type'],
1049 |                             'args': record['args'] or []
1050 |                         }
1051 |             
1052 |             return None
1053 |     
1054 |     async def _find_pydantic_ai_result_method(self, method_name: str) -> Optional[Dict[str, Any]]:
1055 |         """Find method information for pydantic_ai result objects"""
1056 |         # Look for methods on pydantic_ai classes that could be result objects
1057 |         async with self.driver.session() as session:
1058 |             # Search for common result methods in pydantic_ai repository
1059 |             query = """
1060 |             MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_METHOD]->(m:Method)
1061 |             WHERE m.name = $method_name 
1062 |               AND (c.name CONTAINS 'Result' OR c.name CONTAINS 'Stream' OR c.name CONTAINS 'Run')
1063 |             RETURN m.name as name, m.params_list as params_list, m.params_detailed as params_detailed,
1064 |                    m.return_type as return_type, m.args as args, c.name as class_name
1065 |             LIMIT 1
1066 |             """
1067 |             
1068 |             result = await session.run(query, repo_name="pydantic_ai", method_name=method_name)
1069 |             record = await result.single()
1070 |             
1071 |             if record:
1072 |                 # Use detailed params if available, fall back to simple params
1073 |                 params_to_use = record['params_detailed'] or record['params_list'] or []
1074 |                 
1075 |                 return {
1076 |                     'name': record['name'],
1077 |                     'params_list': params_to_use,
1078 |                     'return_type': record['return_type'],
1079 |                     'args': record['args'] or [],
1080 |                     'source_class': record['class_name']
1081 |                 }
1082 |             
1083 |             return None
1084 |     
1085 |     async def _find_similar_modules(self, module_name: str) -> List[str]:
1086 |         """Find similar repository names for suggestions"""
1087 |         async with self.driver.session() as session:
1088 |             query = """
1089 |             MATCH (r:Repository)
1090 |             WHERE toLower(r.name) CONTAINS toLower($partial_name)
1091 |                OR toLower(replace(r.name, '-', '_')) CONTAINS toLower($partial_name)
1092 |                OR toLower(replace(r.name, '_', '-')) CONTAINS toLower($partial_name)
1093 |             RETURN r.name
1094 |             LIMIT 5
1095 |             """
1096 |             
1097 |             result = await session.run(query, partial_name=module_name[:3])
1098 |             suggestions = []
1099 |             async for record in result:
1100 |                 suggestions.append(record['name'])
1101 |             
1102 |             return suggestions
1103 |     
1104 |     async def _find_similar_methods(self, class_name: str, method_name: str) -> List[str]:
1105 |         """Find similar method names for suggestions"""
1106 |         async with self.driver.session() as session:
1107 |             # First try exact class match
1108 |             query = """
1109 |             MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
1110 |             WHERE (c.name = $class_name OR c.full_name = $class_name)
1111 |               AND m.name CONTAINS $partial_name
1112 |             RETURN m.name as name
1113 |             LIMIT 5
1114 |             """
1115 |             
1116 |             result = await session.run(query, class_name=class_name, partial_name=method_name[:3])
1117 |             suggestions = []
1118 |             async for record in result:
1119 |                 suggestions.append(record['name'])
1120 |             
1121 |             # If no suggestions and class_name has dots, try repository-based search
1122 |             if not suggestions and '.' in class_name:
1123 |                 parts = class_name.split('.')
1124 |                 module_part = '.'.join(parts[:-1])  # e.g., "pydantic_ai"
1125 |                 class_part = parts[-1]  # e.g., "Agent"
1126 |                 
1127 |                 # Find repository for the module
1128 |                 repo_name = await self._find_repository_for_module(module_part)
1129 |                 
1130 |                 if repo_name:
1131 |                     repo_query = """
1132 |                     MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_METHOD]->(m:Method)
1133 |                     WHERE c.name = $class_name AND m.name CONTAINS $partial_name
1134 |                     RETURN m.name as name
1135 |                     LIMIT 5
1136 |                     """
1137 |                     
1138 |                     result = await session.run(repo_query, repo_name=repo_name, class_name=class_part, partial_name=method_name[:3])
1139 |                     async for record in result:
1140 |                         suggestions.append(record['name'])
1141 |             
1142 |             return suggestions
1143 |     
1144 |     def _calculate_overall_confidence(self, result: ScriptValidationResult) -> float:
1145 |         """Calculate overall confidence score for the validation (knowledge graph items only)"""
1146 |         kg_validations = []
1147 |         
1148 |         # Only count validations from knowledge graph imports
1149 |         for val in result.import_validations:
1150 |             if val.validation.details.get('in_knowledge_graph', False):
1151 |                 kg_validations.append(val.validation.confidence)
1152 |         
1153 |         # Only count validations from knowledge graph classes
1154 |         for val in result.class_validations:
1155 |             class_name = val.class_instantiation.full_class_name or val.class_instantiation.class_name
1156 |             if self._is_from_knowledge_graph(class_name):
1157 |                 kg_validations.append(val.validation.confidence)
1158 |         
1159 |         # Only count validations from knowledge graph methods
1160 |         for val in result.method_validations:
1161 |             if val.method_call.object_type and self._is_from_knowledge_graph(val.method_call.object_type):
1162 |                 kg_validations.append(val.validation.confidence)
1163 |         
1164 |         # Only count validations from knowledge graph attributes
1165 |         for val in result.attribute_validations:
1166 |             if val.attribute_access.object_type and self._is_from_knowledge_graph(val.attribute_access.object_type):
1167 |                 kg_validations.append(val.validation.confidence)
1168 |         
1169 |         # Only count validations from knowledge graph functions
1170 |         for val in result.function_validations:
1171 |             if val.function_call.full_name and self._is_from_knowledge_graph(val.function_call.full_name):
1172 |                 kg_validations.append(val.validation.confidence)
1173 |         
1174 |         if not kg_validations:
1175 |             return 1.0  # No knowledge graph items to validate = perfect confidence
1176 |         
1177 |         return sum(kg_validations) / len(kg_validations)
1178 |     
1179 |     def _is_from_knowledge_graph(self, class_type: str) -> bool:
1180 |         """Check if a class type comes from a module in the knowledge graph"""
1181 |         if not class_type:
1182 |             return False
1183 |         
1184 |         # For dotted names like "pydantic_ai.Agent" or "pydantic_ai.StreamedRunResult", check the base module
1185 |         if '.' in class_type:
1186 |             base_module = class_type.split('.')[0]
1187 |             # Exact match only - "pydantic" should not match "pydantic_ai"
1188 |             return base_module in self.knowledge_graph_modules
1189 |         
1190 |         # For simple names, check if any knowledge graph module matches exactly
1191 |         # Don't use substring matching to avoid "pydantic" matching "pydantic_ai"
1192 |         return class_type in self.knowledge_graph_modules
1193 |     
1194 |     def _detect_hallucinations(self, result: ScriptValidationResult) -> List[Dict[str, Any]]:
1195 |         """Detect and categorize hallucinations"""
1196 |         hallucinations = []
1197 |         reported_items = set()  # Track reported items to avoid duplicates
1198 |         
1199 |         # Check method calls (only for knowledge graph classes)
1200 |         for val in result.method_validations:
1201 |             if (val.validation.status == ValidationStatus.NOT_FOUND and 
1202 |                 val.method_call.object_type and 
1203 |                 self._is_from_knowledge_graph(val.method_call.object_type)):
1204 |                 
1205 |                 # Create unique key to avoid duplicates
1206 |                 key = (val.method_call.line_number, val.method_call.method_name, val.method_call.object_type)
1207 |                 if key not in reported_items:
1208 |                     reported_items.add(key)
1209 |                     hallucinations.append({
1210 |                         'type': 'METHOD_NOT_FOUND',
1211 |                         'location': f"line {val.method_call.line_number}",
1212 |                         'description': f"Method '{val.method_call.method_name}' not found on class '{val.method_call.object_type}'",
1213 |                         'suggestion': val.validation.suggestions[0] if val.validation.suggestions else None
1214 |                     })
1215 |         
1216 |         # Check attributes (only for knowledge graph classes) - but skip if already reported as method
1217 |         for val in result.attribute_validations:
1218 |             if (val.validation.status == ValidationStatus.NOT_FOUND and 
1219 |                 val.attribute_access.object_type and 
1220 |                 self._is_from_knowledge_graph(val.attribute_access.object_type)):
1221 |                 
1222 |                 # Create unique key - if this was already reported as a method, skip it
1223 |                 key = (val.attribute_access.line_number, val.attribute_access.attribute_name, val.attribute_access.object_type)
1224 |                 if key not in reported_items:
1225 |                     reported_items.add(key)
1226 |                     hallucinations.append({
1227 |                         'type': 'ATTRIBUTE_NOT_FOUND',
1228 |                         'location': f"line {val.attribute_access.line_number}",
1229 |                         'description': f"Attribute '{val.attribute_access.attribute_name}' not found on class '{val.attribute_access.object_type}'"
1230 |                     })
1231 |         
1232 |         # Check parameter issues (only for knowledge graph methods)
1233 |         for val in result.method_validations:
1234 |             if (val.parameter_validation and 
1235 |                 val.parameter_validation.status == ValidationStatus.INVALID and
1236 |                 val.method_call.object_type and 
1237 |                 self._is_from_knowledge_graph(val.method_call.object_type)):
1238 |                 hallucinations.append({
1239 |                     'type': 'INVALID_PARAMETERS',
1240 |                     'location': f"line {val.method_call.line_number}",
1241 |                     'description': f"Invalid parameters for method '{val.method_call.method_name}': {val.parameter_validation.message}"
1242 |                 })
1243 |         
1244 |         return hallucinations
```

--------------------------------------------------------------------------------
/src/crawl4ai_mcp.py:
--------------------------------------------------------------------------------

```python
   1 | """
   2 | MCP server for web crawling with Crawl4AI.
   3 | 
   4 | This server provides tools to crawl websites using Crawl4AI, automatically detecting
   5 | the appropriate crawl method based on URL type (sitemap, txt file, or regular webpage).
   6 | Also includes AI hallucination detection and repository parsing tools using Neo4j knowledge graphs.
   7 | """
   8 | from mcp.server.fastmcp import FastMCP, Context
   9 | from sentence_transformers import CrossEncoder
  10 | from contextlib import asynccontextmanager
  11 | from collections.abc import AsyncIterator
  12 | from dataclasses import dataclass
  13 | from typing import List, Dict, Any, Optional
  14 | from urllib.parse import urlparse, urldefrag
  15 | from xml.etree import ElementTree
  16 | from dotenv import load_dotenv
  17 | from supabase import Client
  18 | from pathlib import Path
  19 | import requests
  20 | import asyncio
  21 | import json
  22 | import os
  23 | import re
  24 | import concurrent.futures
  25 | import sys
  26 | 
  27 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
  28 | 
  29 | # Add knowledge_graphs folder to path for importing knowledge graph modules
  30 | knowledge_graphs_path = Path(__file__).resolve().parent.parent / 'knowledge_graphs'
  31 | sys.path.append(str(knowledge_graphs_path))
  32 | 
  33 | from utils import (
  34 |     get_supabase_client, 
  35 |     add_documents_to_supabase, 
  36 |     search_documents,
  37 |     extract_code_blocks,
  38 |     generate_code_example_summary,
  39 |     add_code_examples_to_supabase,
  40 |     update_source_info,
  41 |     extract_source_summary,
  42 |     search_code_examples
  43 | )
  44 | 
  45 | # Import knowledge graph modules
  46 | from knowledge_graph_validator import KnowledgeGraphValidator
  47 | from parse_repo_into_neo4j import DirectNeo4jExtractor
  48 | from ai_script_analyzer import AIScriptAnalyzer
  49 | from hallucination_reporter import HallucinationReporter
  50 | 
  51 | # Load environment variables from the project root .env file
  52 | project_root = Path(__file__).resolve().parent.parent
  53 | dotenv_path = project_root / '.env'
  54 | 
  55 | # Force override of existing environment variables
  56 | load_dotenv(dotenv_path, override=True)
  57 | 
  58 | # Helper functions for Neo4j validation and error handling
  59 | def validate_neo4j_connection() -> bool:
  60 |     """Check if Neo4j environment variables are configured."""
  61 |     return all([
  62 |         os.getenv("NEO4J_URI"),
  63 |         os.getenv("NEO4J_USER"),
  64 |         os.getenv("NEO4J_PASSWORD")
  65 |     ])
  66 | 
  67 | def format_neo4j_error(error: Exception) -> str:
  68 |     """Format Neo4j connection errors for user-friendly messages."""
  69 |     error_str = str(error).lower()
  70 |     if "authentication" in error_str or "unauthorized" in error_str:
  71 |         return "Neo4j authentication failed. Check NEO4J_USER and NEO4J_PASSWORD."
  72 |     elif "connection" in error_str or "refused" in error_str or "timeout" in error_str:
  73 |         return "Cannot connect to Neo4j. Check NEO4J_URI and ensure Neo4j is running."
  74 |     elif "database" in error_str:
  75 |         return "Neo4j database error. Check if the database exists and is accessible."
  76 |     else:
  77 |         return f"Neo4j error: {str(error)}"
  78 | 
  79 | def validate_script_path(script_path: str) -> Dict[str, Any]:
  80 |     """Validate script path and return error info if invalid."""
  81 |     if not script_path or not isinstance(script_path, str):
  82 |         return {"valid": False, "error": "Script path is required"}
  83 |     
  84 |     if not os.path.exists(script_path):
  85 |         return {"valid": False, "error": f"Script not found: {script_path}"}
  86 |     
  87 |     if not script_path.endswith('.py'):
  88 |         return {"valid": False, "error": "Only Python (.py) files are supported"}
  89 |     
  90 |     try:
  91 |         # Check if file is readable
  92 |         with open(script_path, 'r', encoding='utf-8') as f:
  93 |             f.read(1)  # Read first character to test
  94 |         return {"valid": True}
  95 |     except Exception as e:
  96 |         return {"valid": False, "error": f"Cannot read script file: {str(e)}"}
  97 | 
  98 | def validate_github_url(repo_url: str) -> Dict[str, Any]:
  99 |     """Validate GitHub repository URL."""
 100 |     if not repo_url or not isinstance(repo_url, str):
 101 |         return {"valid": False, "error": "Repository URL is required"}
 102 |     
 103 |     repo_url = repo_url.strip()
 104 |     
 105 |     # Basic GitHub URL validation
 106 |     if not ("github.com" in repo_url.lower() or repo_url.endswith(".git")):
 107 |         return {"valid": False, "error": "Please provide a valid GitHub repository URL"}
 108 |     
 109 |     # Check URL format
 110 |     if not (repo_url.startswith("https://") or repo_url.startswith("git@")):
 111 |         return {"valid": False, "error": "Repository URL must start with https:// or git@"}
 112 |     
 113 |     return {"valid": True, "repo_name": repo_url.split('/')[-1].replace('.git', '')}
 114 | 
# Create a dataclass for our application context
@dataclass
class Crawl4AIContext:
    """Context for the Crawl4AI MCP server.

    Bundles the long-lived resources created in the lifespan handler so
    individual tool invocations can reach them via the request context.
    """
    crawler: AsyncWebCrawler  # shared browser-backed crawler instance
    supabase_client: Client  # Supabase client used for storage and search
    reranking_model: Optional[CrossEncoder] = None  # loaded only when USE_RERANKING=true
    knowledge_validator: Optional[Any] = None  # KnowledgeGraphValidator when available
    repo_extractor: Optional[Any] = None       # DirectNeo4jExtractor when available
 124 | 
@asynccontextmanager
async def crawl4ai_lifespan(server: FastMCP) -> AsyncIterator[Crawl4AIContext]:
    """
    Manages the Crawl4AI client lifecycle.
    
    Creates the shared crawler, Supabase client, optional reranking model,
    and optional Neo4j knowledge-graph components, yields them to the server,
    and tears everything down again on shutdown.
    
    Args:
        server: The FastMCP server instance
        
    Yields:
        Crawl4AIContext: The context containing the Crawl4AI crawler and Supabase client
    """
    # Create browser configuration
    browser_config = BrowserConfig(
        headless=True,
        verbose=False
    )
    
    # Initialize the crawler (entered manually here; exited in the finally block)
    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.__aenter__()
    
    # Initialize Supabase client
    supabase_client = get_supabase_client()
    
    # Initialize cross-encoder model for reranking if enabled.
    # A load failure is non-fatal: the server runs without reranking.
    reranking_model = None
    if os.getenv("USE_RERANKING", "false") == "true":
        try:
            reranking_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
        except Exception as e:
            print(f"Failed to load reranking model: {e}")
            reranking_model = None
    
    # Initialize Neo4j components if configured and enabled
    knowledge_validator = None
    repo_extractor = None
    
    # Check if knowledge graph functionality is enabled
    knowledge_graph_enabled = os.getenv("USE_KNOWLEDGE_GRAPH", "false") == "true"
    
    if knowledge_graph_enabled:
        neo4j_uri = os.getenv("NEO4J_URI")
        neo4j_user = os.getenv("NEO4J_USER")
        neo4j_password = os.getenv("NEO4J_PASSWORD")
        
        if neo4j_uri and neo4j_user and neo4j_password:
            try:
                print("Initializing knowledge graph components...")
                
                # Initialize knowledge graph validator
                knowledge_validator = KnowledgeGraphValidator(neo4j_uri, neo4j_user, neo4j_password)
                await knowledge_validator.initialize()
                print("✓ Knowledge graph validator initialized")
                
                # Initialize repository extractor
                repo_extractor = DirectNeo4jExtractor(neo4j_uri, neo4j_user, neo4j_password)
                await repo_extractor.initialize()
                print("✓ Repository extractor initialized")
                
            except Exception as e:
                # Neo4j failures are non-fatal: knowledge-graph tools are simply unavailable.
                print(f"Failed to initialize Neo4j components: {format_neo4j_error(e)}")
                knowledge_validator = None
                repo_extractor = None
        else:
            print("Neo4j credentials not configured - knowledge graph tools will be unavailable")
    else:
        print("Knowledge graph functionality disabled - set USE_KNOWLEDGE_GRAPH=true to enable")
    
    try:
        yield Crawl4AIContext(
            crawler=crawler,
            supabase_client=supabase_client,
            reranking_model=reranking_model,
            knowledge_validator=knowledge_validator,
            repo_extractor=repo_extractor
        )
    finally:
        # Clean up all components; each close is isolated so one failure
        # does not prevent the others from shutting down.
        await crawler.__aexit__(None, None, None)
        if knowledge_validator:
            try:
                await knowledge_validator.close()
                print("✓ Knowledge graph validator closed")
            except Exception as e:
                print(f"Error closing knowledge validator: {e}")
        if repo_extractor:
            try:
                await repo_extractor.close()
                print("✓ Repository extractor closed")
            except Exception as e:
                print(f"Error closing repository extractor: {e}")
 216 | 
 217 | # Initialize FastMCP server
 218 | mcp = FastMCP(
 219 |     "mcp-crawl4ai-rag",
 220 |     description="MCP server for RAG and web crawling with Crawl4AI",
 221 |     lifespan=crawl4ai_lifespan,
 222 |     host=os.getenv("HOST", "0.0.0.0"),
 223 |     port=os.getenv("PORT", "8051")
 224 | )
 225 | 
 226 | def rerank_results(model: CrossEncoder, query: str, results: List[Dict[str, Any]], content_key: str = "content") -> List[Dict[str, Any]]:
 227 |     """
 228 |     Rerank search results using a cross-encoder model.
 229 |     
 230 |     Args:
 231 |         model: The cross-encoder model to use for reranking
 232 |         query: The search query
 233 |         results: List of search results
 234 |         content_key: The key in each result dict that contains the text content
 235 |         
 236 |     Returns:
 237 |         Reranked list of results
 238 |     """
 239 |     if not model or not results:
 240 |         return results
 241 |     
 242 |     try:
 243 |         # Extract content from results
 244 |         texts = [result.get(content_key, "") for result in results]
 245 |         
 246 |         # Create pairs of [query, document] for the cross-encoder
 247 |         pairs = [[query, text] for text in texts]
 248 |         
 249 |         # Get relevance scores from the cross-encoder
 250 |         scores = model.predict(pairs)
 251 |         
 252 |         # Add scores to results and sort by score (descending)
 253 |         for i, result in enumerate(results):
 254 |             result["rerank_score"] = float(scores[i])
 255 |         
 256 |         # Sort by rerank score
 257 |         reranked = sorted(results, key=lambda x: x.get("rerank_score", 0), reverse=True)
 258 |         
 259 |         return reranked
 260 |     except Exception as e:
 261 |         print(f"Error during reranking: {e}")
 262 |         return results
 263 | 
 264 | def is_sitemap(url: str) -> bool:
 265 |     """
 266 |     Check if a URL is a sitemap.
 267 |     
 268 |     Args:
 269 |         url: URL to check
 270 |         
 271 |     Returns:
 272 |         True if the URL is a sitemap, False otherwise
 273 |     """
 274 |     return url.endswith('sitemap.xml') or 'sitemap' in urlparse(url).path
 275 | 
 276 | def is_txt(url: str) -> bool:
 277 |     """
 278 |     Check if a URL is a text file.
 279 |     
 280 |     Args:
 281 |         url: URL to check
 282 |         
 283 |     Returns:
 284 |         True if the URL is a text file, False otherwise
 285 |     """
 286 |     return url.endswith('.txt')
 287 | 
 288 | def parse_sitemap(sitemap_url: str) -> List[str]:
 289 |     """
 290 |     Parse a sitemap and extract URLs.
 291 |     
 292 |     Args:
 293 |         sitemap_url: URL of the sitemap
 294 |         
 295 |     Returns:
 296 |         List of URLs found in the sitemap
 297 |     """
 298 |     resp = requests.get(sitemap_url)
 299 |     urls = []
 300 | 
 301 |     if resp.status_code == 200:
 302 |         try:
 303 |             tree = ElementTree.fromstring(resp.content)
 304 |             urls = [loc.text for loc in tree.findall('.//{*}loc')]
 305 |         except Exception as e:
 306 |             print(f"Error parsing sitemap XML: {e}")
 307 | 
 308 |     return urls
 309 | 
 310 | def smart_chunk_markdown(text: str, chunk_size: int = 5000) -> List[str]:
 311 |     """Split text into chunks, respecting code blocks and paragraphs."""
 312 |     chunks = []
 313 |     start = 0
 314 |     text_length = len(text)
 315 | 
 316 |     while start < text_length:
 317 |         # Calculate end position
 318 |         end = start + chunk_size
 319 | 
 320 |         # If we're at the end of the text, just take what's left
 321 |         if end >= text_length:
 322 |             chunks.append(text[start:].strip())
 323 |             break
 324 | 
 325 |         # Try to find a code block boundary first (```)
 326 |         chunk = text[start:end]
 327 |         code_block = chunk.rfind('```')
 328 |         if code_block != -1 and code_block > chunk_size * 0.3:
 329 |             end = start + code_block
 330 | 
 331 |         # If no code block, try to break at a paragraph
 332 |         elif '\n\n' in chunk:
 333 |             # Find the last paragraph break
 334 |             last_break = chunk.rfind('\n\n')
 335 |             if last_break > chunk_size * 0.3:  # Only break if we're past 30% of chunk_size
 336 |                 end = start + last_break
 337 | 
 338 |         # If no paragraph break, try to break at a sentence
 339 |         elif '. ' in chunk:
 340 |             # Find the last sentence break
 341 |             last_period = chunk.rfind('. ')
 342 |             if last_period > chunk_size * 0.3:  # Only break if we're past 30% of chunk_size
 343 |                 end = start + last_period + 1
 344 | 
 345 |         # Extract chunk and clean it up
 346 |         chunk = text[start:end].strip()
 347 |         if chunk:
 348 |             chunks.append(chunk)
 349 | 
 350 |         # Move start position for next chunk
 351 |         start = end
 352 | 
 353 |     return chunks
 354 | 
 355 | def extract_section_info(chunk: str) -> Dict[str, Any]:
 356 |     """
 357 |     Extracts headers and stats from a chunk.
 358 |     
 359 |     Args:
 360 |         chunk: Markdown chunk
 361 |         
 362 |     Returns:
 363 |         Dictionary with headers and stats
 364 |     """
 365 |     headers = re.findall(r'^(#+)\s+(.+)$', chunk, re.MULTILINE)
 366 |     header_str = '; '.join([f'{h[0]} {h[1]}' for h in headers]) if headers else ''
 367 | 
 368 |     return {
 369 |         "headers": header_str,
 370 |         "char_count": len(chunk),
 371 |         "word_count": len(chunk.split())
 372 |     }
 373 | 
 374 | def process_code_example(args):
 375 |     """
 376 |     Process a single code example to generate its summary.
 377 |     This function is designed to be used with concurrent.futures.
 378 |     
 379 |     Args:
 380 |         args: Tuple containing (code, context_before, context_after)
 381 |         
 382 |     Returns:
 383 |         The generated summary
 384 |     """
 385 |     code, context_before, context_after = args
 386 |     return generate_code_example_summary(code, context_before, context_after)
 387 | 
 388 | @mcp.tool()
 389 | async def crawl_single_page(ctx: Context, url: str) -> str:
 390 |     """
 391 |     Crawl a single web page and store its content in Supabase.
 392 |     
 393 |     This tool is ideal for quickly retrieving content from a specific URL without following links.
 394 |     The content is stored in Supabase for later retrieval and querying.
 395 |     
 396 |     Args:
 397 |         ctx: The MCP server provided context
 398 |         url: URL of the web page to crawl
 399 |     
 400 |     Returns:
 401 |         Summary of the crawling operation and storage in Supabase
 402 |     """
 403 |     try:
 404 |         # Get the crawler from the context
 405 |         crawler = ctx.request_context.lifespan_context.crawler
 406 |         supabase_client = ctx.request_context.lifespan_context.supabase_client
 407 |         
 408 |         # Configure the crawl
 409 |         run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=False)
 410 |         
 411 |         # Crawl the page
 412 |         result = await crawler.arun(url=url, config=run_config)
 413 |         
 414 |         if result.success and result.markdown:
 415 |             # Extract source_id
 416 |             parsed_url = urlparse(url)
 417 |             source_id = parsed_url.netloc or parsed_url.path
 418 |             
 419 |             # Chunk the content
 420 |             chunks = smart_chunk_markdown(result.markdown)
 421 |             
 422 |             # Prepare data for Supabase
 423 |             urls = []
 424 |             chunk_numbers = []
 425 |             contents = []
 426 |             metadatas = []
 427 |             total_word_count = 0
 428 |             
 429 |             for i, chunk in enumerate(chunks):
 430 |                 urls.append(url)
 431 |                 chunk_numbers.append(i)
 432 |                 contents.append(chunk)
 433 |                 
 434 |                 # Extract metadata
 435 |                 meta = extract_section_info(chunk)
 436 |                 meta["chunk_index"] = i
 437 |                 meta["url"] = url
 438 |                 meta["source"] = source_id
 439 |                 meta["crawl_time"] = str(asyncio.current_task().get_coro().__name__)
 440 |                 metadatas.append(meta)
 441 |                 
 442 |                 # Accumulate word count
 443 |                 total_word_count += meta.get("word_count", 0)
 444 |             
 445 |             # Create url_to_full_document mapping
 446 |             url_to_full_document = {url: result.markdown}
 447 |             
 448 |             # Update source information FIRST (before inserting documents)
 449 |             source_summary = extract_source_summary(source_id, result.markdown[:5000])  # Use first 5000 chars for summary
 450 |             update_source_info(supabase_client, source_id, source_summary, total_word_count)
 451 |             
 452 |             # Add documentation chunks to Supabase (AFTER source exists)
 453 |             add_documents_to_supabase(supabase_client, urls, chunk_numbers, contents, metadatas, url_to_full_document)
 454 |             
 455 |             # Extract and process code examples only if enabled
 456 |             extract_code_examples = os.getenv("USE_AGENTIC_RAG", "false") == "true"
 457 |             if extract_code_examples:
 458 |                 code_blocks = extract_code_blocks(result.markdown)
 459 |                 if code_blocks:
 460 |                     code_urls = []
 461 |                     code_chunk_numbers = []
 462 |                     code_examples = []
 463 |                     code_summaries = []
 464 |                     code_metadatas = []
 465 |                     
 466 |                     # Process code examples in parallel
 467 |                     with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
 468 |                         # Prepare arguments for parallel processing
 469 |                         summary_args = [(block['code'], block['context_before'], block['context_after']) 
 470 |                                         for block in code_blocks]
 471 |                         
 472 |                         # Generate summaries in parallel
 473 |                         summaries = list(executor.map(process_code_example, summary_args))
 474 |                     
 475 |                     # Prepare code example data
 476 |                     for i, (block, summary) in enumerate(zip(code_blocks, summaries)):
 477 |                         code_urls.append(url)
 478 |                         code_chunk_numbers.append(i)
 479 |                         code_examples.append(block['code'])
 480 |                         code_summaries.append(summary)
 481 |                         
 482 |                         # Create metadata for code example
 483 |                         code_meta = {
 484 |                             "chunk_index": i,
 485 |                             "url": url,
 486 |                             "source": source_id,
 487 |                             "char_count": len(block['code']),
 488 |                             "word_count": len(block['code'].split())
 489 |                         }
 490 |                         code_metadatas.append(code_meta)
 491 |                     
 492 |                     # Add code examples to Supabase
 493 |                     add_code_examples_to_supabase(
 494 |                         supabase_client, 
 495 |                         code_urls, 
 496 |                         code_chunk_numbers, 
 497 |                         code_examples, 
 498 |                         code_summaries, 
 499 |                         code_metadatas
 500 |                     )
 501 |             
 502 |             return json.dumps({
 503 |                 "success": True,
 504 |                 "url": url,
 505 |                 "chunks_stored": len(chunks),
 506 |                 "code_examples_stored": len(code_blocks) if code_blocks else 0,
 507 |                 "content_length": len(result.markdown),
 508 |                 "total_word_count": total_word_count,
 509 |                 "source_id": source_id,
 510 |                 "links_count": {
 511 |                     "internal": len(result.links.get("internal", [])),
 512 |                     "external": len(result.links.get("external", []))
 513 |                 }
 514 |             }, indent=2)
 515 |         else:
 516 |             return json.dumps({
 517 |                 "success": False,
 518 |                 "url": url,
 519 |                 "error": result.error_message
 520 |             }, indent=2)
 521 |     except Exception as e:
 522 |         return json.dumps({
 523 |             "success": False,
 524 |             "url": url,
 525 |             "error": str(e)
 526 |         }, indent=2)
 527 | 
 528 | @mcp.tool()
 529 | async def smart_crawl_url(ctx: Context, url: str, max_depth: int = 3, max_concurrent: int = 10, chunk_size: int = 5000) -> str:
 530 |     """
 531 |     Intelligently crawl a URL based on its type and store content in Supabase.
 532 |     
 533 |     This tool automatically detects the URL type and applies the appropriate crawling method:
 534 |     - For sitemaps: Extracts and crawls all URLs in parallel
 535 |     - For text files (llms.txt): Directly retrieves the content
 536 |     - For regular webpages: Recursively crawls internal links up to the specified depth
 537 |     
 538 |     All crawled content is chunked and stored in Supabase for later retrieval and querying.
 539 |     
 540 |     Args:
 541 |         ctx: The MCP server provided context
 542 |         url: URL to crawl (can be a regular webpage, sitemap.xml, or .txt file)
 543 |         max_depth: Maximum recursion depth for regular URLs (default: 3)
 544 |         max_concurrent: Maximum number of concurrent browser sessions (default: 10)
 545 |         chunk_size: Maximum size of each content chunk in characters (default: 1000)
 546 |     
 547 |     Returns:
 548 |         JSON string with crawl summary and storage information
 549 |     """
 550 |     try:
 551 |         # Get the crawler from the context
 552 |         crawler = ctx.request_context.lifespan_context.crawler
 553 |         supabase_client = ctx.request_context.lifespan_context.supabase_client
 554 |         
 555 |         # Determine the crawl strategy
 556 |         crawl_results = []
 557 |         crawl_type = None
 558 |         
 559 |         if is_txt(url):
 560 |             # For text files, use simple crawl
 561 |             crawl_results = await crawl_markdown_file(crawler, url)
 562 |             crawl_type = "text_file"
 563 |         elif is_sitemap(url):
 564 |             # For sitemaps, extract URLs and crawl in parallel
 565 |             sitemap_urls = parse_sitemap(url)
 566 |             if not sitemap_urls:
 567 |                 return json.dumps({
 568 |                     "success": False,
 569 |                     "url": url,
 570 |                     "error": "No URLs found in sitemap"
 571 |                 }, indent=2)
 572 |             crawl_results = await crawl_batch(crawler, sitemap_urls, max_concurrent=max_concurrent)
 573 |             crawl_type = "sitemap"
 574 |         else:
 575 |             # For regular URLs, use recursive crawl
 576 |             crawl_results = await crawl_recursive_internal_links(crawler, [url], max_depth=max_depth, max_concurrent=max_concurrent)
 577 |             crawl_type = "webpage"
 578 |         
 579 |         if not crawl_results:
 580 |             return json.dumps({
 581 |                 "success": False,
 582 |                 "url": url,
 583 |                 "error": "No content found"
 584 |             }, indent=2)
 585 |         
 586 |         # Process results and store in Supabase
 587 |         urls = []
 588 |         chunk_numbers = []
 589 |         contents = []
 590 |         metadatas = []
 591 |         chunk_count = 0
 592 |         
 593 |         # Track sources and their content
 594 |         source_content_map = {}
 595 |         source_word_counts = {}
 596 |         
 597 |         # Process documentation chunks
 598 |         for doc in crawl_results:
 599 |             source_url = doc['url']
 600 |             md = doc['markdown']
 601 |             chunks = smart_chunk_markdown(md, chunk_size=chunk_size)
 602 |             
 603 |             # Extract source_id
 604 |             parsed_url = urlparse(source_url)
 605 |             source_id = parsed_url.netloc or parsed_url.path
 606 |             
 607 |             # Store content for source summary generation
 608 |             if source_id not in source_content_map:
 609 |                 source_content_map[source_id] = md[:5000]  # Store first 5000 chars
 610 |                 source_word_counts[source_id] = 0
 611 |             
 612 |             for i, chunk in enumerate(chunks):
 613 |                 urls.append(source_url)
 614 |                 chunk_numbers.append(i)
 615 |                 contents.append(chunk)
 616 |                 
 617 |                 # Extract metadata
 618 |                 meta = extract_section_info(chunk)
 619 |                 meta["chunk_index"] = i
 620 |                 meta["url"] = source_url
 621 |                 meta["source"] = source_id
 622 |                 meta["crawl_type"] = crawl_type
 623 |                 meta["crawl_time"] = str(asyncio.current_task().get_coro().__name__)
 624 |                 metadatas.append(meta)
 625 |                 
 626 |                 # Accumulate word count
 627 |                 source_word_counts[source_id] += meta.get("word_count", 0)
 628 |                 
 629 |                 chunk_count += 1
 630 |         
 631 |         # Create url_to_full_document mapping
 632 |         url_to_full_document = {}
 633 |         for doc in crawl_results:
 634 |             url_to_full_document[doc['url']] = doc['markdown']
 635 |         
 636 |         # Update source information for each unique source FIRST (before inserting documents)
 637 |         with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
 638 |             source_summary_args = [(source_id, content) for source_id, content in source_content_map.items()]
 639 |             source_summaries = list(executor.map(lambda args: extract_source_summary(args[0], args[1]), source_summary_args))
 640 |         
 641 |         for (source_id, _), summary in zip(source_summary_args, source_summaries):
 642 |             word_count = source_word_counts.get(source_id, 0)
 643 |             update_source_info(supabase_client, source_id, summary, word_count)
 644 |         
 645 |         # Add documentation chunks to Supabase (AFTER sources exist)
 646 |         batch_size = 20
 647 |         add_documents_to_supabase(supabase_client, urls, chunk_numbers, contents, metadatas, url_to_full_document, batch_size=batch_size)
 648 |         
 649 |         # Extract and process code examples from all documents only if enabled
 650 |         extract_code_examples_enabled = os.getenv("USE_AGENTIC_RAG", "false") == "true"
 651 |         if extract_code_examples_enabled:
 652 |             all_code_blocks = []
 653 |             code_urls = []
 654 |             code_chunk_numbers = []
 655 |             code_examples = []
 656 |             code_summaries = []
 657 |             code_metadatas = []
 658 |             
 659 |             # Extract code blocks from all documents
 660 |             for doc in crawl_results:
 661 |                 source_url = doc['url']
 662 |                 md = doc['markdown']
 663 |                 code_blocks = extract_code_blocks(md)
 664 |                 
 665 |                 if code_blocks:
 666 |                     # Process code examples in parallel
 667 |                     with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
 668 |                         # Prepare arguments for parallel processing
 669 |                         summary_args = [(block['code'], block['context_before'], block['context_after']) 
 670 |                                         for block in code_blocks]
 671 |                         
 672 |                         # Generate summaries in parallel
 673 |                         summaries = list(executor.map(process_code_example, summary_args))
 674 |                     
 675 |                     # Prepare code example data
 676 |                     parsed_url = urlparse(source_url)
 677 |                     source_id = parsed_url.netloc or parsed_url.path
 678 |                     
 679 |                     for i, (block, summary) in enumerate(zip(code_blocks, summaries)):
 680 |                         code_urls.append(source_url)
 681 |                         code_chunk_numbers.append(len(code_examples))  # Use global code example index
 682 |                         code_examples.append(block['code'])
 683 |                         code_summaries.append(summary)
 684 |                         
 685 |                         # Create metadata for code example
 686 |                         code_meta = {
 687 |                             "chunk_index": len(code_examples) - 1,
 688 |                             "url": source_url,
 689 |                             "source": source_id,
 690 |                             "char_count": len(block['code']),
 691 |                             "word_count": len(block['code'].split())
 692 |                         }
 693 |                         code_metadatas.append(code_meta)
 694 |             
 695 |             # Add all code examples to Supabase
 696 |             if code_examples:
 697 |                 add_code_examples_to_supabase(
 698 |                     supabase_client, 
 699 |                     code_urls, 
 700 |                     code_chunk_numbers, 
 701 |                     code_examples, 
 702 |                     code_summaries, 
 703 |                     code_metadatas,
 704 |                     batch_size=batch_size
 705 |                 )
 706 |         
 707 |         return json.dumps({
 708 |             "success": True,
 709 |             "url": url,
 710 |             "crawl_type": crawl_type,
 711 |             "pages_crawled": len(crawl_results),
 712 |             "chunks_stored": chunk_count,
 713 |             "code_examples_stored": len(code_examples),
 714 |             "sources_updated": len(source_content_map),
 715 |             "urls_crawled": [doc['url'] for doc in crawl_results][:5] + (["..."] if len(crawl_results) > 5 else [])
 716 |         }, indent=2)
 717 |     except Exception as e:
 718 |         return json.dumps({
 719 |             "success": False,
 720 |             "url": url,
 721 |             "error": str(e)
 722 |         }, indent=2)
 723 | 
 724 | @mcp.tool()
 725 | async def get_available_sources(ctx: Context) -> str:
 726 |     """
 727 |     Get all available sources from the sources table.
 728 |     
 729 |     This tool returns a list of all unique sources (domains) that have been crawled and stored
 730 |     in the database, along with their summaries and statistics. This is useful for discovering 
 731 |     what content is available for querying.
 732 | 
 733 |     Always use this tool before calling the RAG query or code example query tool
 734 |     with a specific source filter!
 735 |     
 736 |     Args:
 737 |         ctx: The MCP server provided context
 738 |     
 739 |     Returns:
 740 |         JSON string with the list of available sources and their details
 741 |     """
 742 |     try:
 743 |         # Get the Supabase client from the context
 744 |         supabase_client = ctx.request_context.lifespan_context.supabase_client
 745 |         
 746 |         # Query the sources table directly
 747 |         result = supabase_client.from_('sources')\
 748 |             .select('*')\
 749 |             .order('source_id')\
 750 |             .execute()
 751 |         
 752 |         # Format the sources with their details
 753 |         sources = []
 754 |         if result.data:
 755 |             for source in result.data:
 756 |                 sources.append({
 757 |                     "source_id": source.get("source_id"),
 758 |                     "summary": source.get("summary"),
 759 |                     "total_words": source.get("total_words"),
 760 |                     "created_at": source.get("created_at"),
 761 |                     "updated_at": source.get("updated_at")
 762 |                 })
 763 |         
 764 |         return json.dumps({
 765 |             "success": True,
 766 |             "sources": sources,
 767 |             "count": len(sources)
 768 |         }, indent=2)
 769 |     except Exception as e:
 770 |         return json.dumps({
 771 |             "success": False,
 772 |             "error": str(e)
 773 |         }, indent=2)
 774 | 
 775 | @mcp.tool()
 776 | async def perform_rag_query(ctx: Context, query: str, source: str = None, match_count: int = 5) -> str:
 777 |     """
 778 |     Perform a RAG (Retrieval Augmented Generation) query on the stored content.
 779 |     
 780 |     This tool searches the vector database for content relevant to the query and returns
 781 |     the matching documents. Optionally filter by source domain.
 782 |     Get the source by using the get_available_sources tool before calling this search!
 783 |     
 784 |     Args:
 785 |         ctx: The MCP server provided context
 786 |         query: The search query
 787 |         source: Optional source domain to filter results (e.g., 'example.com')
 788 |         match_count: Maximum number of results to return (default: 5)
 789 |     
 790 |     Returns:
 791 |         JSON string with the search results
 792 |     """
 793 |     try:
 794 |         # Get the Supabase client from the context
 795 |         supabase_client = ctx.request_context.lifespan_context.supabase_client
 796 |         
 797 |         # Check if hybrid search is enabled
 798 |         use_hybrid_search = os.getenv("USE_HYBRID_SEARCH", "false") == "true"
 799 |         
 800 |         # Prepare filter if source is provided and not empty
 801 |         filter_metadata = None
 802 |         if source and source.strip():
 803 |             filter_metadata = {"source": source}
 804 |         
 805 |         if use_hybrid_search:
 806 |             # Hybrid search: combine vector and keyword search
 807 |             
 808 |             # 1. Get vector search results (get more to account for filtering)
 809 |             vector_results = search_documents(
 810 |                 client=supabase_client,
 811 |                 query=query,
 812 |                 match_count=match_count * 2,  # Get double to have room for filtering
 813 |                 filter_metadata=filter_metadata
 814 |             )
 815 |             
 816 |             # 2. Get keyword search results using ILIKE
 817 |             keyword_query = supabase_client.from_('crawled_pages')\
 818 |                 .select('id, url, chunk_number, content, metadata, source_id')\
 819 |                 .ilike('content', f'%{query}%')
 820 |             
 821 |             # Apply source filter if provided
 822 |             if source and source.strip():
 823 |                 keyword_query = keyword_query.eq('source_id', source)
 824 |             
 825 |             # Execute keyword search
 826 |             keyword_response = keyword_query.limit(match_count * 2).execute()
 827 |             keyword_results = keyword_response.data if keyword_response.data else []
 828 |             
 829 |             # 3. Combine results with preference for items appearing in both
 830 |             seen_ids = set()
 831 |             combined_results = []
 832 |             
 833 |             # First, add items that appear in both searches (these are the best matches)
 834 |             vector_ids = {r.get('id') for r in vector_results if r.get('id')}
 835 |             for kr in keyword_results:
 836 |                 if kr['id'] in vector_ids and kr['id'] not in seen_ids:
 837 |                     # Find the vector result to get similarity score
 838 |                     for vr in vector_results:
 839 |                         if vr.get('id') == kr['id']:
 840 |                             # Boost similarity score for items in both results
 841 |                             vr['similarity'] = min(1.0, vr.get('similarity', 0) * 1.2)
 842 |                             combined_results.append(vr)
 843 |                             seen_ids.add(kr['id'])
 844 |                             break
 845 |             
 846 |             # Then add remaining vector results (semantic matches without exact keyword)
 847 |             for vr in vector_results:
 848 |                 if vr.get('id') and vr['id'] not in seen_ids and len(combined_results) < match_count:
 849 |                     combined_results.append(vr)
 850 |                     seen_ids.add(vr['id'])
 851 |             
 852 |             # Finally, add pure keyword matches if we still need more results
 853 |             for kr in keyword_results:
 854 |                 if kr['id'] not in seen_ids and len(combined_results) < match_count:
 855 |                     # Convert keyword result to match vector result format
 856 |                     combined_results.append({
 857 |                         'id': kr['id'],
 858 |                         'url': kr['url'],
 859 |                         'chunk_number': kr['chunk_number'],
 860 |                         'content': kr['content'],
 861 |                         'metadata': kr['metadata'],
 862 |                         'source_id': kr['source_id'],
 863 |                         'similarity': 0.5  # Default similarity for keyword-only matches
 864 |                     })
 865 |                     seen_ids.add(kr['id'])
 866 |             
 867 |             # Use combined results
 868 |             results = combined_results[:match_count]
 869 |             
 870 |         else:
 871 |             # Standard vector search only
 872 |             results = search_documents(
 873 |                 client=supabase_client,
 874 |                 query=query,
 875 |                 match_count=match_count,
 876 |                 filter_metadata=filter_metadata
 877 |             )
 878 |         
 879 |         # Apply reranking if enabled
 880 |         use_reranking = os.getenv("USE_RERANKING", "false") == "true"
 881 |         if use_reranking and ctx.request_context.lifespan_context.reranking_model:
 882 |             results = rerank_results(ctx.request_context.lifespan_context.reranking_model, query, results, content_key="content")
 883 |         
 884 |         # Format the results
 885 |         formatted_results = []
 886 |         for result in results:
 887 |             formatted_result = {
 888 |                 "url": result.get("url"),
 889 |                 "content": result.get("content"),
 890 |                 "metadata": result.get("metadata"),
 891 |                 "similarity": result.get("similarity")
 892 |             }
 893 |             # Include rerank score if available
 894 |             if "rerank_score" in result:
 895 |                 formatted_result["rerank_score"] = result["rerank_score"]
 896 |             formatted_results.append(formatted_result)
 897 |         
 898 |         return json.dumps({
 899 |             "success": True,
 900 |             "query": query,
 901 |             "source_filter": source,
 902 |             "search_mode": "hybrid" if use_hybrid_search else "vector",
 903 |             "reranking_applied": use_reranking and ctx.request_context.lifespan_context.reranking_model is not None,
 904 |             "results": formatted_results,
 905 |             "count": len(formatted_results)
 906 |         }, indent=2)
 907 |     except Exception as e:
 908 |         return json.dumps({
 909 |             "success": False,
 910 |             "query": query,
 911 |             "error": str(e)
 912 |         }, indent=2)
 913 | 
 914 | @mcp.tool()
 915 | async def search_code_examples(ctx: Context, query: str, source_id: str = None, match_count: int = 5) -> str:
 916 |     """
 917 |     Search for code examples relevant to the query.
 918 |     
 919 |     This tool searches the vector database for code examples relevant to the query and returns
 920 |     the matching examples with their summaries. Optionally filter by source_id.
 921 |     Get the source_id by using the get_available_sources tool before calling this search!
 922 | 
 923 |     Use the get_available_sources tool first to see what sources are available for filtering.
 924 |     
 925 |     Args:
 926 |         ctx: The MCP server provided context
 927 |         query: The search query
 928 |         source_id: Optional source ID to filter results (e.g., 'example.com')
 929 |         match_count: Maximum number of results to return (default: 5)
 930 |     
 931 |     Returns:
 932 |         JSON string with the search results
 933 |     """
 934 |     # Check if code example extraction is enabled
 935 |     extract_code_examples_enabled = os.getenv("USE_AGENTIC_RAG", "false") == "true"
 936 |     if not extract_code_examples_enabled:
 937 |         return json.dumps({
 938 |             "success": False,
 939 |             "error": "Code example extraction is disabled. Perform a normal RAG search."
 940 |         }, indent=2)
 941 |     
 942 |     try:
 943 |         # Get the Supabase client from the context
 944 |         supabase_client = ctx.request_context.lifespan_context.supabase_client
 945 |         
 946 |         # Check if hybrid search is enabled
 947 |         use_hybrid_search = os.getenv("USE_HYBRID_SEARCH", "false") == "true"
 948 |         
 949 |         # Prepare filter if source is provided and not empty
 950 |         filter_metadata = None
 951 |         if source_id and source_id.strip():
 952 |             filter_metadata = {"source": source_id}
 953 |         
 954 |         if use_hybrid_search:
 955 |             # Hybrid search: combine vector and keyword search
 956 |             
 957 |             # Import the search function from utils
 958 |             from utils import search_code_examples as search_code_examples_impl
 959 |             
 960 |             # 1. Get vector search results (get more to account for filtering)
 961 |             vector_results = search_code_examples_impl(
 962 |                 client=supabase_client,
 963 |                 query=query,
 964 |                 match_count=match_count * 2,  # Get double to have room for filtering
 965 |                 filter_metadata=filter_metadata
 966 |             )
 967 |             
 968 |             # 2. Get keyword search results using ILIKE on both content and summary
 969 |             keyword_query = supabase_client.from_('code_examples')\
 970 |                 .select('id, url, chunk_number, content, summary, metadata, source_id')\
 971 |                 .or_(f'content.ilike.%{query}%,summary.ilike.%{query}%')
 972 |             
 973 |             # Apply source filter if provided
 974 |             if source_id and source_id.strip():
 975 |                 keyword_query = keyword_query.eq('source_id', source_id)
 976 |             
 977 |             # Execute keyword search
 978 |             keyword_response = keyword_query.limit(match_count * 2).execute()
 979 |             keyword_results = keyword_response.data if keyword_response.data else []
 980 |             
 981 |             # 3. Combine results with preference for items appearing in both
 982 |             seen_ids = set()
 983 |             combined_results = []
 984 |             
 985 |             # First, add items that appear in both searches (these are the best matches)
 986 |             vector_ids = {r.get('id') for r in vector_results if r.get('id')}
 987 |             for kr in keyword_results:
 988 |                 if kr['id'] in vector_ids and kr['id'] not in seen_ids:
 989 |                     # Find the vector result to get similarity score
 990 |                     for vr in vector_results:
 991 |                         if vr.get('id') == kr['id']:
 992 |                             # Boost similarity score for items in both results
 993 |                             vr['similarity'] = min(1.0, vr.get('similarity', 0) * 1.2)
 994 |                             combined_results.append(vr)
 995 |                             seen_ids.add(kr['id'])
 996 |                             break
 997 |             
 998 |             # Then add remaining vector results (semantic matches without exact keyword)
 999 |             for vr in vector_results:
1000 |                 if vr.get('id') and vr['id'] not in seen_ids and len(combined_results) < match_count:
1001 |                     combined_results.append(vr)
1002 |                     seen_ids.add(vr['id'])
1003 |             
1004 |             # Finally, add pure keyword matches if we still need more results
1005 |             for kr in keyword_results:
1006 |                 if kr['id'] not in seen_ids and len(combined_results) < match_count:
1007 |                     # Convert keyword result to match vector result format
1008 |                     combined_results.append({
1009 |                         'id': kr['id'],
1010 |                         'url': kr['url'],
1011 |                         'chunk_number': kr['chunk_number'],
1012 |                         'content': kr['content'],
1013 |                         'summary': kr['summary'],
1014 |                         'metadata': kr['metadata'],
1015 |                         'source_id': kr['source_id'],
1016 |                         'similarity': 0.5  # Default similarity for keyword-only matches
1017 |                     })
1018 |                     seen_ids.add(kr['id'])
1019 |             
1020 |             # Use combined results
1021 |             results = combined_results[:match_count]
1022 |             
1023 |         else:
1024 |             # Standard vector search only
1025 |             from utils import search_code_examples as search_code_examples_impl
1026 |             
1027 |             results = search_code_examples_impl(
1028 |                 client=supabase_client,
1029 |                 query=query,
1030 |                 match_count=match_count,
1031 |                 filter_metadata=filter_metadata
1032 |             )
1033 |         
1034 |         # Apply reranking if enabled
1035 |         use_reranking = os.getenv("USE_RERANKING", "false") == "true"
1036 |         if use_reranking and ctx.request_context.lifespan_context.reranking_model:
1037 |             results = rerank_results(ctx.request_context.lifespan_context.reranking_model, query, results, content_key="content")
1038 |         
1039 |         # Format the results
1040 |         formatted_results = []
1041 |         for result in results:
1042 |             formatted_result = {
1043 |                 "url": result.get("url"),
1044 |                 "code": result.get("content"),
1045 |                 "summary": result.get("summary"),
1046 |                 "metadata": result.get("metadata"),
1047 |                 "source_id": result.get("source_id"),
1048 |                 "similarity": result.get("similarity")
1049 |             }
1050 |             # Include rerank score if available
1051 |             if "rerank_score" in result:
1052 |                 formatted_result["rerank_score"] = result["rerank_score"]
1053 |             formatted_results.append(formatted_result)
1054 |         
1055 |         return json.dumps({
1056 |             "success": True,
1057 |             "query": query,
1058 |             "source_filter": source_id,
1059 |             "search_mode": "hybrid" if use_hybrid_search else "vector",
1060 |             "reranking_applied": use_reranking and ctx.request_context.lifespan_context.reranking_model is not None,
1061 |             "results": formatted_results,
1062 |             "count": len(formatted_results)
1063 |         }, indent=2)
1064 |     except Exception as e:
1065 |         return json.dumps({
1066 |             "success": False,
1067 |             "query": query,
1068 |             "error": str(e)
1069 |         }, indent=2)
1070 | 
@mcp.tool()
async def check_ai_script_hallucinations(ctx: Context, script_path: str) -> str:
    """
    Check an AI-generated Python script for hallucinations using the knowledge graph.

    Analyzes a Python script for potential AI hallucinations by validating
    imports, method calls, class instantiations, and function calls against a
    Neo4j knowledge graph containing real repository data.

    The analysis covers:
    - Import validation against known repositories
    - Method call validation on classes from the knowledge graph
    - Class instantiation parameter validation
    - Function call parameter validation
    - Attribute access validation

    Args:
        ctx: The MCP server provided context
        script_path: Absolute path to the Python script to analyze

    Returns:
        JSON string with hallucination detection results, confidence scores, and recommendations
    """
    try:
        # Feature flag: knowledge-graph tooling must be explicitly enabled.
        if os.getenv("USE_KNOWLEDGE_GRAPH", "false") != "true":
            return json.dumps({
                "success": False,
                "error": "Knowledge graph functionality is disabled. Set USE_KNOWLEDGE_GRAPH=true in environment."
            }, indent=2)

        # The validator is created during server startup; it is absent when
        # Neo4j was not configured.
        knowledge_validator = ctx.request_context.lifespan_context.knowledge_validator
        if not knowledge_validator:
            return json.dumps({
                "success": False,
                "error": "Knowledge graph validator not available. Check Neo4j configuration in environment variables."
            }, indent=2)

        # Reject bad paths before doing any expensive work
        validation = validate_script_path(script_path)
        if not validation["valid"]:
            return json.dumps({
                "success": False,
                "script_path": script_path,
                "error": validation["error"]
            }, indent=2)

        # Step 1: Analyze script structure using AST
        analysis_result = AIScriptAnalyzer().analyze_script(script_path)
        if analysis_result.errors:
            print(f"Analysis warnings for {script_path}: {analysis_result.errors}")

        # Step 2: Validate against knowledge graph
        validation_result = await knowledge_validator.validate_script(analysis_result)

        # Step 3: Generate comprehensive report
        report = HallucinationReporter().generate_comprehensive_report(validation_result)

        # Assemble the response from the report's sections, exposing only the
        # documented fields.
        summary = report["validation_summary"]
        meta = report["analysis_metadata"]
        response = {
            "success": True,
            "script_path": script_path,
            "overall_confidence": validation_result.overall_confidence,
            "validation_summary": {
                "total_validations": summary["total_validations"],
                "valid_count": summary["valid_count"],
                "invalid_count": summary["invalid_count"],
                "uncertain_count": summary["uncertain_count"],
                "not_found_count": summary["not_found_count"],
                "hallucination_rate": summary["hallucination_rate"]
            },
            "hallucinations_detected": report["hallucinations_detected"],
            "recommendations": report["recommendations"],
            "analysis_metadata": {
                "total_imports": meta["total_imports"],
                "total_classes": meta["total_classes"],
                "total_methods": meta["total_methods"],
                "total_attributes": meta["total_attributes"],
                "total_functions": meta["total_functions"]
            },
            "libraries_analyzed": report.get("libraries_analyzed", [])
        }
        return json.dumps(response, indent=2)

    except Exception as e:
        return json.dumps({
            "success": False,
            "script_path": script_path,
            "error": f"Analysis failed: {str(e)}"
        }, indent=2)
1166 | 
@mcp.tool()
async def query_knowledge_graph(ctx: Context, command: str) -> str:
    """
    Query and explore the Neo4j knowledge graph containing repository data.
    
    This tool provides comprehensive access to the knowledge graph for exploring repositories,
    classes, methods, functions, and their relationships. Perfect for understanding what data
    is available for hallucination detection and debugging validation results.
    
    **⚠️ IMPORTANT: Always start with the `repos` command first!**
    Before using any other commands, run `repos` to see what repositories are available
    in your knowledge graph. This will help you understand what data you can explore.
    
    ## Available Commands:
    
    **Repository Commands:**
    - `repos` - **START HERE!** List all repositories in the knowledge graph
    - `explore <repo_name>` - Get detailed overview of a specific repository
    
    **Class Commands:**  
    - `classes` - List all classes across all repositories (limited to 20)
    - `classes <repo_name>` - List classes in a specific repository
    - `class <class_name>` - Get detailed information about a specific class including methods and attributes
    
    **Method Commands:**
    - `method <method_name>` - Search for methods by name across all classes
    - `method <method_name> <class_name>` - Search for a method within a specific class
    
    **Custom Query:**
    - `query <cypher_query>` - Execute a custom Cypher query (results limited to 20 records)
    
    ## Knowledge Graph Schema:
    
    **Node Types:**
    - Repository: `(r:Repository {name: string})`
    - File: `(f:File {path: string, module_name: string})`
    - Class: `(c:Class {name: string, full_name: string})`
    - Method: `(m:Method {name: string, params_list: [string], params_detailed: [string], return_type: string, args: [string]})`
    - Function: `(func:Function {name: string, params_list: [string], params_detailed: [string], return_type: string, args: [string]})`
    - Attribute: `(a:Attribute {name: string, type: string})`
    
    **Relationships:**
    - `(r:Repository)-[:CONTAINS]->(f:File)`
    - `(f:File)-[:DEFINES]->(c:Class)`
    - `(c:Class)-[:HAS_METHOD]->(m:Method)`
    - `(c:Class)-[:HAS_ATTRIBUTE]->(a:Attribute)`
    - `(f:File)-[:DEFINES]->(func:Function)`
    
    ## Example Workflow:
    ```
    1. repos                                    # See what repositories are available
    2. explore pydantic-ai                      # Explore a specific repository
    3. classes pydantic-ai                      # List classes in that repository
    4. class Agent                              # Explore the Agent class
    5. method run_stream                        # Search for run_stream method
    6. method __init__ Agent                    # Find Agent constructor
    7. query "MATCH (c:Class)-[:HAS_METHOD]->(m:Method) WHERE m.name = 'run' RETURN c.name, m.name LIMIT 5"
    ```
    
    Args:
        ctx: The MCP server provided context
        command: Command string to execute (see available commands above)
    
    Returns:
        JSON string with query results, statistics, and metadata
    """
    try:
        # Check if knowledge graph functionality is enabled
        knowledge_graph_enabled = os.getenv("USE_KNOWLEDGE_GRAPH", "false") == "true"
        if not knowledge_graph_enabled:
            return json.dumps({
                "success": False,
                "error": "Knowledge graph functionality is disabled. Set USE_KNOWLEDGE_GRAPH=true in environment."
            }, indent=2)
        
        # Get Neo4j driver from context
        repo_extractor = ctx.request_context.lifespan_context.repo_extractor
        if not repo_extractor or not repo_extractor.driver:
            return json.dumps({
                "success": False,
                "error": "Neo4j connection not available. Check Neo4j configuration in environment variables."
            }, indent=2)
        
        # Parse command
        command = command.strip()
        if not command:
            return json.dumps({
                "success": False,
                "command": "",
                "error": "Command cannot be empty. Available commands: repos, explore <repo>, classes [repo], class <name>, method <name> [class], query <cypher>"
            }, indent=2)
        
        parts = command.split()
        cmd = parts[0].lower()
        args = parts[1:]
        
        async with repo_extractor.driver.session() as session:
            # Route to appropriate handler
            if cmd == "repos":
                return await _handle_repos_command(session, command)
            elif cmd == "explore":
                if not args:
                    return json.dumps({
                        "success": False,
                        "command": command,
                        "error": "Repository name required. Usage: explore <repo_name>"
                    }, indent=2)
                return await _handle_explore_command(session, command, args[0])
            elif cmd == "classes":
                repo_name = args[0] if args else None
                return await _handle_classes_command(session, command, repo_name)
            elif cmd == "class":
                if not args:
                    return json.dumps({
                        "success": False,
                        "command": command,
                        "error": "Class name required. Usage: class <class_name>"
                    }, indent=2)
                return await _handle_class_command(session, command, args[0])
            elif cmd == "method":
                if not args:
                    return json.dumps({
                        "success": False,
                        "command": command,
                        "error": "Method name required. Usage: method <method_name> [class_name]"
                    }, indent=2)
                method_name = args[0]
                class_name = args[1] if len(args) > 1 else None
                return await _handle_method_command(session, command, method_name, class_name)
            elif cmd == "query":
                if not args:
                    return json.dumps({
                        "success": False,
                        "command": command,
                        "error": "Cypher query required. Usage: query <cypher_query>"
                    }, indent=2)
                # Take the raw remainder of the command instead of re-joining
                # split tokens: " ".join(args) collapsed internal whitespace,
                # which mangled Cypher string literals containing runs of
                # spaces (e.g. WHERE c.name = 'foo  bar').
                cypher_query = command.split(maxsplit=1)[1].strip()
                return await _handle_query_command(session, command, cypher_query)
            else:
                return json.dumps({
                    "success": False,
                    "command": command,
                    "error": f"Unknown command '{cmd}'. Available commands: repos, explore <repo>, classes [repo], class <name>, method <name> [class], query <cypher>"
                }, indent=2)
                
    except Exception as e:
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Query execution failed: {str(e)}"
        }, indent=2)
1318 | 
1319 | 
async def _handle_repos_command(session, command: str) -> str:
    """Handle 'repos' command - list all repositories"""
    cypher = "MATCH (r:Repository) RETURN r.name as name ORDER BY r.name"
    cursor = await session.run(cypher)

    # Collect repository names as the async result streams in
    repo_names = [record['name'] async for record in cursor]

    payload = {
        "success": True,
        "command": command,
        "data": {
            "repositories": repo_names
        },
        "metadata": {
            "total_results": len(repo_names),
            "limited": False
        }
    }
    return json.dumps(payload, indent=2)
1340 | 
1341 | 
async def _handle_explore_command(session, command: str, repo_name: str) -> str:
    """Handle the 'explore <repo>' command: summarize one repository.

    Verifies the repository exists, then gathers counts of its files,
    classes, functions, and methods from the knowledge graph.

    Args:
        session: Active Neo4j async session.
        command: Original command string, echoed back in the response.
        repo_name: Name of the repository to summarize.

    Returns:
        JSON string with the per-repository statistics, or an error payload
        when the repository is not present in the graph.
    """
    # Bail out early when the repository is unknown.
    repo_check_query = "MATCH (r:Repository {name: $repo_name}) RETURN r.name as name"
    result = await session.run(repo_check_query, repo_name=repo_name)
    probe = await result.single()

    if not probe:
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Repository '{repo_name}' not found in knowledge graph"
        }, indent=2)

    files_query = """
    MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)
    RETURN count(f) as file_count
    """
    classes_query = """
    MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
    RETURN count(DISTINCT c) as class_count
    """
    functions_query = """
    MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(func:Function)
    RETURN count(DISTINCT func) as function_count
    """
    methods_query = """
    MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_METHOD]->(m:Method)
    RETURN count(DISTINCT m) as method_count
    """

    async def _count(query: str, key: str) -> int:
        # Each count query yields exactly one row with a single aggregate column.
        res = await session.run(query, repo_name=repo_name)
        return (await res.single())[key]

    statistics = {
        "files": await _count(files_query, 'file_count'),
        "classes": await _count(classes_query, 'class_count'),
        "functions": await _count(functions_query, 'function_count'),
        "methods": await _count(methods_query, 'method_count'),
    }

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "repository": repo_name,
            "statistics": statistics
        },
        "metadata": {
            "total_results": 1,
            "limited": False
        }
    }, indent=2)
1405 | 
1406 | 
async def _handle_classes_command(session, command: str, repo_name: "str | None" = None) -> str:
    """Handle 'classes [repo]' command - list classes.

    Lists up to 20 classes, optionally restricted to a single repository.

    Args:
        session: Active Neo4j async session.
        command: Original command string, echoed back in the response.
        repo_name: Optional repository name used to filter the classes.
            (Annotation fixed: the parameter legitimately defaults to None.)

    Returns:
        JSON string with class names/full names and result metadata.
    """
    limit = 20  # cap results so responses stay manageable

    if repo_name:
        query = """
        MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
        RETURN c.name as name, c.full_name as full_name
        ORDER BY c.name
        LIMIT $limit
        """
        result = await session.run(query, repo_name=repo_name, limit=limit)
    else:
        query = """
        MATCH (c:Class)
        RETURN c.name as name, c.full_name as full_name
        ORDER BY c.name
        LIMIT $limit
        """
        result = await session.run(query, limit=limit)

    classes = [
        {'name': record['name'], 'full_name': record['full_name']}
        async for record in result
    ]

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "classes": classes,
            "repository_filter": repo_name
        },
        "metadata": {
            "total_results": len(classes),
            # A full page (>= limit) means the list may have been truncated.
            "limited": len(classes) >= limit
        }
    }, indent=2)
1447 | 
1448 | 
async def _handle_class_command(session, command: str, class_name: str) -> str:
    """Handle the 'class <name>' command: show one class's methods and attributes.

    Args:
        session: Active Neo4j async session.
        command: Original command string, echoed back in the response.
        class_name: Short name or fully-qualified name of the class.

    Returns:
        JSON string describing the class, or an error payload when it is
        not present in the knowledge graph.
    """
    # Resolve the class by short or fully-qualified name.
    class_query = """
    MATCH (c:Class)
    WHERE c.name = $class_name OR c.full_name = $class_name
    RETURN c.name as name, c.full_name as full_name
    LIMIT 1
    """
    lookup = await session.run(class_query, class_name=class_name)
    found = await lookup.single()

    if not found:
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Class '{class_name}' not found in knowledge graph"
        }, indent=2)

    methods_query = """
    MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
    WHERE c.name = $class_name OR c.full_name = $class_name
    RETURN m.name as name, m.params_list as params_list, m.params_detailed as params_detailed, m.return_type as return_type
    ORDER BY m.name
    """
    cursor = await session.run(methods_query, class_name=class_name)

    methods = []
    async for row in cursor:
        methods.append({
            'name': row['name'],
            # Prefer the detailed parameter info; fall back to the simple list.
            'parameters': row['params_detailed'] or row['params_list'] or [],
            'return_type': row['return_type'] or 'Any'
        })

    attributes_query = """
    MATCH (c:Class)-[:HAS_ATTRIBUTE]->(a:Attribute)
    WHERE c.name = $class_name OR c.full_name = $class_name
    RETURN a.name as name, a.type as type
    ORDER BY a.name
    """
    cursor = await session.run(attributes_query, class_name=class_name)

    attributes = [
        {'name': row['name'], 'type': row['type'] or 'Any'}
        async for row in cursor
    ]

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "class": {
                "name": found['name'],
                "full_name": found['full_name'],
                "methods": methods,
                "attributes": attributes
            }
        },
        "metadata": {
            "total_results": 1,
            "methods_count": len(methods),
            "attributes_count": len(attributes),
            "limited": False
        }
    }, indent=2)
1524 | 
1525 | 
async def _handle_method_command(session, command: str, method_name: str, class_name: "str | None" = None) -> str:
    """Handle 'method <name> [class]' command - search for methods.

    Searches for a method by name, either within one specific class or across
    all classes (the unscoped search is capped at 20 matches).

    Args:
        session: Active Neo4j async session.
        command: Original command string, echoed back in the response.
        method_name: Name of the method to look up.
        class_name: Optional class name (short or fully qualified) to scope
            the search. (Annotation fixed: legitimately defaults to None.)

    Returns:
        JSON string with matching methods, or an error when none are found.
    """
    if class_name:
        query = """
        MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
        WHERE (c.name = $class_name OR c.full_name = $class_name)
          AND m.name = $method_name
        RETURN c.name as class_name, c.full_name as class_full_name,
               m.name as method_name, m.params_list as params_list, 
               m.params_detailed as params_detailed, m.return_type as return_type, m.args as args
        """
        result = await session.run(query, class_name=class_name, method_name=method_name)
    else:
        query = """
        MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
        WHERE m.name = $method_name
        RETURN c.name as class_name, c.full_name as class_full_name,
               m.name as method_name, m.params_list as params_list, 
               m.params_detailed as params_detailed, m.return_type as return_type, m.args as args
        ORDER BY c.name
        LIMIT 20
        """
        result = await session.run(query, method_name=method_name)

    methods = []
    async for record in result:
        # Use detailed params if available, fall back to simple params
        params_to_use = record['params_detailed'] or record['params_list'] or []
        methods.append({
            'class_name': record['class_name'],
            'class_full_name': record['class_full_name'],
            'method_name': record['method_name'],
            'parameters': params_to_use,
            'return_type': record['return_type'] or 'Any',
            'legacy_args': record['args'] or []
        })

    if not methods:
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Method '{method_name}'" + (f" in class '{class_name}'" if class_name else "") + " not found"
        }, indent=2)

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "methods": methods,
            "class_filter": class_name
        },
        "metadata": {
            "total_results": len(methods),
            # Only the unscoped search is LIMITed, so only it can be truncated.
            "limited": len(methods) >= 20 and not class_name
        }
    }, indent=2)
1582 | 
1583 | 
async def _handle_query_command(session, command: str, cypher_query: str) -> str:
    """Handle the 'query <cypher>' command: run a user-supplied Cypher query.

    Results are truncated client-side to 20 records so responses stay small;
    query errors are reported inside the JSON payload rather than raised.

    Args:
        session: Active Neo4j async session.
        command: Original command string, echoed back in the response.
        cypher_query: Raw Cypher text to execute.

    Returns:
        JSON string with the (possibly truncated) result records, or an
        error payload describing the Cypher failure.
    """
    try:
        result = await session.run(cypher_query)

        max_records = 20  # truncate to keep responses from getting overwhelming
        records = []
        async for record in result:
            records.append(dict(record))
            if len(records) >= max_records:
                break

        return json.dumps({
            "success": True,
            "command": command,
            "data": {
                "query": cypher_query,
                "results": records
            },
            "metadata": {
                "total_results": len(records),
                "limited": len(records) >= max_records
            }
        }, indent=2)

    except Exception as e:
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Cypher query error: {str(e)}",
            "data": {
                "query": cypher_query
            }
        }, indent=2)
1620 | 
1621 | 
@mcp.tool()
async def parse_github_repository(ctx: Context, repo_url: str) -> str:
    """
    Parse a GitHub repository into the Neo4j knowledge graph.
    
    This tool clones a GitHub repository, analyzes its Python files, and stores
    the code structure (classes, methods, functions, imports) in Neo4j for use
    in hallucination detection. The tool:
    
    - Clones the repository to a temporary location
    - Analyzes Python files to extract code structure
    - Stores classes, methods, functions, and imports in Neo4j
    - Provides detailed statistics about the parsing results
    - Automatically handles module name detection for imports
    
    Args:
        ctx: The MCP server provided context
        repo_url: GitHub repository URL (e.g., 'https://github.com/user/repo.git')
    
    Returns:
        JSON string with parsing results, statistics, and repository information
    """
    try:
        # Check if knowledge graph functionality is enabled
        # NOTE(review): comparison is case-sensitive — only the exact string
        # "true" enables the feature; "True"/"TRUE" would be treated as off.
        knowledge_graph_enabled = os.getenv("USE_KNOWLEDGE_GRAPH", "false") == "true"
        if not knowledge_graph_enabled:
            return json.dumps({
                "success": False,
                "error": "Knowledge graph functionality is disabled. Set USE_KNOWLEDGE_GRAPH=true in environment."
            }, indent=2)
        
        # Get the repository extractor from context
        # (populated by the server's lifespan setup; None when Neo4j is not configured)
        repo_extractor = ctx.request_context.lifespan_context.repo_extractor
        
        if not repo_extractor:
            return json.dumps({
                "success": False,
                "error": "Repository extractor not available. Check Neo4j configuration in environment variables."
            }, indent=2)
        
        # Validate repository URL
        # validate_github_url returns {"valid": bool, "error": ..., "repo_name": ...}
        validation = validate_github_url(repo_url)
        if not validation["valid"]:
            return json.dumps({
                "success": False,
                "repo_url": repo_url,
                "error": validation["error"]
            }, indent=2)
        
        repo_name = validation["repo_name"]
        
        # Parse the repository (this includes cloning, analysis, and Neo4j storage)
        print(f"Starting repository analysis for: {repo_name}")
        await repo_extractor.analyze_repository(repo_url)
        print(f"Repository analysis completed for: {repo_name}")
        
        # Query Neo4j for statistics about the parsed repository
        async with repo_extractor.driver.session() as session:
            # Get comprehensive repository statistics
            # Single query that aggregates distinct file/class/method/function/
            # attribute counts, then tacks on up to five sample module names.
            stats_query = """
            MATCH (r:Repository {name: $repo_name})
            OPTIONAL MATCH (r)-[:CONTAINS]->(f:File)
            OPTIONAL MATCH (f)-[:DEFINES]->(c:Class)
            OPTIONAL MATCH (c)-[:HAS_METHOD]->(m:Method)
            OPTIONAL MATCH (f)-[:DEFINES]->(func:Function)
            OPTIONAL MATCH (c)-[:HAS_ATTRIBUTE]->(a:Attribute)
            WITH r, 
                 count(DISTINCT f) as files_count,
                 count(DISTINCT c) as classes_count,
                 count(DISTINCT m) as methods_count,
                 count(DISTINCT func) as functions_count,
                 count(DISTINCT a) as attributes_count
            
            // Get some sample module names
            OPTIONAL MATCH (r)-[:CONTAINS]->(sample_f:File)
            WITH r, files_count, classes_count, methods_count, functions_count, attributes_count,
                 collect(DISTINCT sample_f.module_name)[0..5] as sample_modules
            
            RETURN 
                r.name as repo_name,
                files_count,
                classes_count, 
                methods_count,
                functions_count,
                attributes_count,
                sample_modules
            """
            
            result = await session.run(stats_query, repo_name=repo_name)
            record = await result.single()
            
            if record:
                # Flatten the single stats row into the response payload.
                stats = {
                    "repository": record['repo_name'],
                    "files_processed": record['files_count'],
                    "classes_created": record['classes_count'],
                    "methods_created": record['methods_count'], 
                    "functions_created": record['functions_count'],
                    "attributes_created": record['attributes_count'],
                    "sample_modules": record['sample_modules'] or []
                }
            else:
                # Repository node missing after analysis — surface as an error.
                return json.dumps({
                    "success": False,
                    "repo_url": repo_url,
                    "error": f"Repository '{repo_name}' not found in database after parsing"
                }, indent=2)
        
        # `stats` is always bound here: the no-record branch returned above.
        return json.dumps({
            "success": True,
            "repo_url": repo_url,
            "repo_name": repo_name,
            "message": f"Successfully parsed repository '{repo_name}' into knowledge graph",
            "statistics": stats,
            "ready_for_validation": True,
            "next_steps": [
                "Repository is now available for hallucination detection",
                f"Use check_ai_script_hallucinations to validate scripts against {repo_name}",
                "The knowledge graph contains classes, methods, and functions from this repository"
            ]
        }, indent=2)
        
    except Exception as e:
        # Catch-all so the MCP tool always returns structured JSON to the client.
        return json.dumps({
            "success": False,
            "repo_url": repo_url,
            "error": f"Repository parsing failed: {str(e)}"
        }, indent=2)
1750 | 
async def crawl_markdown_file(crawler: AsyncWebCrawler, url: str) -> List[Dict[str, Any]]:
    """
    Crawl a single .txt or markdown file and return its content.

    Args:
        crawler: AsyncWebCrawler instance
        url: URL of the file

    Returns:
        A one-element list of {'url', 'markdown'} dicts on success,
        otherwise an empty list (failure is logged to stdout).
    """
    result = await crawler.arun(url=url, config=CrawlerRunConfig())

    if not (result.success and result.markdown):
        print(f"Failed to crawl {url}: {result.error_message}")
        return []

    return [{'url': url, 'markdown': result.markdown}]
1770 | 
async def crawl_batch(crawler: AsyncWebCrawler, urls: List[str], max_concurrent: int = 10) -> List[Dict[str, Any]]:
    """
    Batch crawl multiple URLs in parallel.

    Args:
        crawler: AsyncWebCrawler instance
        urls: List of URLs to crawl
        max_concurrent: Maximum number of concurrent browser sessions

    Returns:
        List of {'url', 'markdown'} dicts for every successful crawl.
    """
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,  # throttle when memory pressure climbs
        check_interval=1.0,
        max_session_permit=max_concurrent
    )
    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=False)

    crawl_results = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher)

    # Keep only pages that crawled successfully and produced markdown.
    return [
        {'url': item.url, 'markdown': item.markdown}
        for item in crawl_results
        if item.success and item.markdown
    ]
1792 | 
async def crawl_recursive_internal_links(crawler: AsyncWebCrawler, start_urls: List[str], max_depth: int = 3, max_concurrent: int = 10) -> List[Dict[str, Any]]:
    """
    Recursively crawl internal links from start URLs up to a maximum depth.

    Performs a breadth-first crawl: each depth level crawls the frontier in
    one parallel batch, then the internal links of successful pages form the
    next frontier.

    Args:
        crawler: AsyncWebCrawler instance
        start_urls: List of starting URLs
        max_depth: Maximum recursion depth
        max_concurrent: Maximum number of concurrent browser sessions

    Returns:
        List of {'url', 'markdown'} dicts for every successfully crawled page.
    """
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=False)
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,
        check_interval=1.0,
        max_session_permit=max_concurrent
    )

    def strip_fragment(url):
        # Drop the #fragment so the same page isn't crawled twice.
        return urldefrag(url)[0]

    seen = set()
    frontier = {strip_fragment(u) for u in start_urls}
    collected = []

    for _depth in range(max_depth):
        pending = [strip_fragment(url) for url in frontier if strip_fragment(url) not in seen]
        if not pending:
            break

        batch = await crawler.arun_many(urls=pending, config=run_config, dispatcher=dispatcher)
        discovered = set()

        for page in batch:
            page_url = strip_fragment(page.url)
            seen.add(page_url)  # mark even failed pages so they aren't retried

            if page.success and page.markdown:
                collected.append({'url': page.url, 'markdown': page.markdown})
                # Queue unvisited internal links for the next depth level.
                for link in page.links.get("internal", []):
                    target = strip_fragment(link["href"])
                    if target not in seen:
                        discovered.add(target)

        frontier = discovered

    return collected
1843 | 
async def main():
    """Start the MCP server over the transport selected by the TRANSPORT env var."""
    # Default to SSE; any other value falls through to stdio.
    if os.getenv("TRANSPORT", "sse") == 'sse':
        await mcp.run_sse_async()
    else:
        await mcp.run_stdio_async()
1852 | 
if __name__ == "__main__":
    # Run the async entry point when executed as a script.
    asyncio.run(main())
```
Page 2/2FirstPrevNextLast