This is page 2 of 2. Use http://codebase.md/coleam00/mcp-crawl4ai-rag?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .dockerignore ├── .env.example ├── .gitattributes ├── .gitignore ├── crawled_pages.sql ├── Dockerfile ├── knowledge_graphs │ ├── ai_hallucination_detector.py │ ├── ai_script_analyzer.py │ ├── hallucination_reporter.py │ ├── knowledge_graph_validator.py │ ├── parse_repo_into_neo4j.py │ ├── query_knowledge_graph.py │ └── test_script.py ├── LICENSE ├── pyproject.toml ├── README.md ├── src │ ├── crawl4ai_mcp.py │ └── utils.py └── uv.lock ``` # Files -------------------------------------------------------------------------------- /knowledge_graphs/knowledge_graph_validator.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Knowledge Graph Validator 3 | 4 | Validates AI-generated code against Neo4j knowledge graph containing 5 | repository information. Checks imports, methods, attributes, and parameters. 
6 | """ 7 | 8 | import asyncio 9 | import logging 10 | from typing import Dict, List, Optional, Set, Tuple, Any 11 | from dataclasses import dataclass, field 12 | from enum import Enum 13 | from neo4j import AsyncGraphDatabase 14 | 15 | from ai_script_analyzer import ( 16 | AnalysisResult, ImportInfo, MethodCall, AttributeAccess, 17 | FunctionCall, ClassInstantiation 18 | ) 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | class ValidationStatus(Enum): 24 | VALID = "VALID" 25 | INVALID = "INVALID" 26 | UNCERTAIN = "UNCERTAIN" 27 | NOT_FOUND = "NOT_FOUND" 28 | 29 | 30 | @dataclass 31 | class ValidationResult: 32 | """Result of validating a single element""" 33 | status: ValidationStatus 34 | confidence: float # 0.0 to 1.0 35 | message: str 36 | details: Dict[str, Any] = field(default_factory=dict) 37 | suggestions: List[str] = field(default_factory=list) 38 | 39 | 40 | @dataclass 41 | class ImportValidation: 42 | """Validation result for an import""" 43 | import_info: ImportInfo 44 | validation: ValidationResult 45 | available_classes: List[str] = field(default_factory=list) 46 | available_functions: List[str] = field(default_factory=list) 47 | 48 | 49 | @dataclass 50 | class MethodValidation: 51 | """Validation result for a method call""" 52 | method_call: MethodCall 53 | validation: ValidationResult 54 | expected_params: List[str] = field(default_factory=list) 55 | actual_params: List[str] = field(default_factory=list) 56 | parameter_validation: ValidationResult = None 57 | 58 | 59 | @dataclass 60 | class AttributeValidation: 61 | """Validation result for attribute access""" 62 | attribute_access: AttributeAccess 63 | validation: ValidationResult 64 | expected_type: Optional[str] = None 65 | 66 | 67 | @dataclass 68 | class FunctionValidation: 69 | """Validation result for function call""" 70 | function_call: FunctionCall 71 | validation: ValidationResult 72 | expected_params: List[str] = field(default_factory=list) 73 | actual_params: List[str] = 
field(default_factory=list) 74 | parameter_validation: ValidationResult = None 75 | 76 | 77 | @dataclass 78 | class ClassValidation: 79 | """Validation result for class instantiation""" 80 | class_instantiation: ClassInstantiation 81 | validation: ValidationResult 82 | constructor_params: List[str] = field(default_factory=list) 83 | parameter_validation: ValidationResult = None 84 | 85 | 86 | @dataclass 87 | class ScriptValidationResult: 88 | """Complete validation results for a script""" 89 | script_path: str 90 | analysis_result: AnalysisResult 91 | import_validations: List[ImportValidation] = field(default_factory=list) 92 | class_validations: List[ClassValidation] = field(default_factory=list) 93 | method_validations: List[MethodValidation] = field(default_factory=list) 94 | attribute_validations: List[AttributeValidation] = field(default_factory=list) 95 | function_validations: List[FunctionValidation] = field(default_factory=list) 96 | overall_confidence: float = 0.0 97 | hallucinations_detected: List[Dict[str, Any]] = field(default_factory=list) 98 | 99 | 100 | class KnowledgeGraphValidator: 101 | """Validates code against Neo4j knowledge graph""" 102 | 103 | def __init__(self, neo4j_uri: str, neo4j_user: str, neo4j_password: str): 104 | self.neo4j_uri = neo4j_uri 105 | self.neo4j_user = neo4j_user 106 | self.neo4j_password = neo4j_password 107 | self.driver = None 108 | 109 | # Cache for performance 110 | self.module_cache: Dict[str, List[str]] = {} 111 | self.class_cache: Dict[str, Dict[str, Any]] = {} 112 | self.method_cache: Dict[str, List[Dict[str, Any]]] = {} 113 | self.repo_cache: Dict[str, str] = {} # module_name -> repo_name 114 | self.knowledge_graph_modules: Set[str] = set() # Track modules in knowledge graph 115 | 116 | async def initialize(self): 117 | """Initialize Neo4j connection""" 118 | self.driver = AsyncGraphDatabase.driver( 119 | self.neo4j_uri, 120 | auth=(self.neo4j_user, self.neo4j_password) 121 | ) 122 | logger.info("Knowledge graph 
validator initialized") 123 | 124 | async def close(self): 125 | """Close Neo4j connection""" 126 | if self.driver: 127 | await self.driver.close() 128 | 129 | async def validate_script(self, analysis_result: AnalysisResult) -> ScriptValidationResult: 130 | """Validate entire script analysis against knowledge graph""" 131 | result = ScriptValidationResult( 132 | script_path=analysis_result.file_path, 133 | analysis_result=analysis_result 134 | ) 135 | 136 | # Validate imports first (builds context for other validations) 137 | result.import_validations = await self._validate_imports(analysis_result.imports) 138 | 139 | # Validate class instantiations 140 | result.class_validations = await self._validate_class_instantiations( 141 | analysis_result.class_instantiations 142 | ) 143 | 144 | # Validate method calls 145 | result.method_validations = await self._validate_method_calls( 146 | analysis_result.method_calls 147 | ) 148 | 149 | # Validate attribute accesses 150 | result.attribute_validations = await self._validate_attribute_accesses( 151 | analysis_result.attribute_accesses 152 | ) 153 | 154 | # Validate function calls 155 | result.function_validations = await self._validate_function_calls( 156 | analysis_result.function_calls 157 | ) 158 | 159 | # Calculate overall confidence and detect hallucinations 160 | result.overall_confidence = self._calculate_overall_confidence(result) 161 | result.hallucinations_detected = self._detect_hallucinations(result) 162 | 163 | return result 164 | 165 | async def _validate_imports(self, imports: List[ImportInfo]) -> List[ImportValidation]: 166 | """Validate all imports against knowledge graph""" 167 | validations = [] 168 | 169 | for import_info in imports: 170 | validation = await self._validate_single_import(import_info) 171 | validations.append(validation) 172 | 173 | return validations 174 | 175 | async def _validate_single_import(self, import_info: ImportInfo) -> ImportValidation: 176 | """Validate a single import""" 177 
| # Determine module to search for 178 | search_module = import_info.module if import_info.is_from_import else import_info.name 179 | 180 | # Check cache first 181 | if search_module in self.module_cache: 182 | available_files = self.module_cache[search_module] 183 | else: 184 | # Query Neo4j for matching modules 185 | available_files = await self._find_modules(search_module) 186 | self.module_cache[search_module] = available_files 187 | 188 | if available_files: 189 | # Get available classes and functions from the module 190 | classes, functions = await self._get_module_contents(search_module) 191 | 192 | # Track this module as being in the knowledge graph 193 | self.knowledge_graph_modules.add(search_module) 194 | 195 | # Also track the base module for "from X.Y.Z import ..." patterns 196 | if '.' in search_module: 197 | base_module = search_module.split('.')[0] 198 | self.knowledge_graph_modules.add(base_module) 199 | 200 | validation = ValidationResult( 201 | status=ValidationStatus.VALID, 202 | confidence=0.9, 203 | message=f"Module '{search_module}' found in knowledge graph", 204 | details={"matched_files": available_files, "in_knowledge_graph": True} 205 | ) 206 | 207 | return ImportValidation( 208 | import_info=import_info, 209 | validation=validation, 210 | available_classes=classes, 211 | available_functions=functions 212 | ) 213 | else: 214 | # External library - mark as such but don't treat as error 215 | validation = ValidationResult( 216 | status=ValidationStatus.UNCERTAIN, 217 | confidence=0.8, # High confidence it's external, not an error 218 | message=f"Module '{search_module}' is external (not in knowledge graph)", 219 | details={"could_be_external": True, "in_knowledge_graph": False} 220 | ) 221 | 222 | return ImportValidation( 223 | import_info=import_info, 224 | validation=validation 225 | ) 226 | 227 | async def _validate_class_instantiations(self, instantiations: List[ClassInstantiation]) -> List[ClassValidation]: 228 | """Validate class 
instantiations""" 229 | validations = [] 230 | 231 | for instantiation in instantiations: 232 | validation = await self._validate_single_class_instantiation(instantiation) 233 | validations.append(validation) 234 | 235 | return validations 236 | 237 | async def _validate_single_class_instantiation(self, instantiation: ClassInstantiation) -> ClassValidation: 238 | """Validate a single class instantiation""" 239 | class_name = instantiation.full_class_name or instantiation.class_name 240 | 241 | # Skip validation for classes not from knowledge graph 242 | if not self._is_from_knowledge_graph(class_name): 243 | validation = ValidationResult( 244 | status=ValidationStatus.UNCERTAIN, 245 | confidence=0.8, 246 | message=f"Skipping validation: '{class_name}' is not from knowledge graph" 247 | ) 248 | return ClassValidation( 249 | class_instantiation=instantiation, 250 | validation=validation 251 | ) 252 | 253 | # Find class in knowledge graph 254 | class_info = await self._find_class(class_name) 255 | 256 | if not class_info: 257 | validation = ValidationResult( 258 | status=ValidationStatus.NOT_FOUND, 259 | confidence=0.2, 260 | message=f"Class '{class_name}' not found in knowledge graph" 261 | ) 262 | return ClassValidation( 263 | class_instantiation=instantiation, 264 | validation=validation 265 | ) 266 | 267 | # Check constructor parameters (look for __init__ method) 268 | init_method = await self._find_method(class_name, "__init__") 269 | 270 | if init_method: 271 | param_validation = self._validate_parameters( 272 | expected_params=init_method.get('params_list', []), 273 | provided_args=instantiation.args, 274 | provided_kwargs=instantiation.kwargs 275 | ) 276 | else: 277 | param_validation = ValidationResult( 278 | status=ValidationStatus.UNCERTAIN, 279 | confidence=0.5, 280 | message="Constructor parameters not found" 281 | ) 282 | 283 | # Use parameter validation result if it failed 284 | if param_validation.status == ValidationStatus.INVALID: 285 | validation = 
ValidationResult( 286 | status=ValidationStatus.INVALID, 287 | confidence=param_validation.confidence, 288 | message=f"Class '{class_name}' found but has invalid constructor parameters: {param_validation.message}", 289 | suggestions=param_validation.suggestions 290 | ) 291 | else: 292 | validation = ValidationResult( 293 | status=ValidationStatus.VALID, 294 | confidence=0.8, 295 | message=f"Class '{class_name}' found in knowledge graph" 296 | ) 297 | 298 | return ClassValidation( 299 | class_instantiation=instantiation, 300 | validation=validation, 301 | parameter_validation=param_validation 302 | ) 303 | 304 | async def _validate_method_calls(self, method_calls: List[MethodCall]) -> List[MethodValidation]: 305 | """Validate method calls""" 306 | validations = [] 307 | 308 | for method_call in method_calls: 309 | validation = await self._validate_single_method_call(method_call) 310 | validations.append(validation) 311 | 312 | return validations 313 | 314 | async def _validate_single_method_call(self, method_call: MethodCall) -> MethodValidation: 315 | """Validate a single method call""" 316 | class_type = method_call.object_type 317 | 318 | if not class_type: 319 | validation = ValidationResult( 320 | status=ValidationStatus.UNCERTAIN, 321 | confidence=0.3, 322 | message=f"Cannot determine object type for '{method_call.object_name}'" 323 | ) 324 | return MethodValidation( 325 | method_call=method_call, 326 | validation=validation 327 | ) 328 | 329 | # Skip validation for classes not from knowledge graph 330 | if not self._is_from_knowledge_graph(class_type): 331 | validation = ValidationResult( 332 | status=ValidationStatus.UNCERTAIN, 333 | confidence=0.8, 334 | message=f"Skipping validation: '{class_type}' is not from knowledge graph" 335 | ) 336 | return MethodValidation( 337 | method_call=method_call, 338 | validation=validation 339 | ) 340 | 341 | # Find method in knowledge graph 342 | method_info = await self._find_method(class_type, method_call.method_name) 
343 | 344 | if not method_info: 345 | # Check for similar method names 346 | similar_methods = await self._find_similar_methods(class_type, method_call.method_name) 347 | 348 | validation = ValidationResult( 349 | status=ValidationStatus.NOT_FOUND, 350 | confidence=0.1, 351 | message=f"Method '{method_call.method_name}' not found on class '{class_type}'", 352 | suggestions=similar_methods 353 | ) 354 | return MethodValidation( 355 | method_call=method_call, 356 | validation=validation 357 | ) 358 | 359 | # Validate parameters 360 | expected_params = method_info.get('params_list', []) 361 | param_validation = self._validate_parameters( 362 | expected_params=expected_params, 363 | provided_args=method_call.args, 364 | provided_kwargs=method_call.kwargs 365 | ) 366 | 367 | # Use parameter validation result if it failed 368 | if param_validation.status == ValidationStatus.INVALID: 369 | validation = ValidationResult( 370 | status=ValidationStatus.INVALID, 371 | confidence=param_validation.confidence, 372 | message=f"Method '{method_call.method_name}' found but has invalid parameters: {param_validation.message}", 373 | suggestions=param_validation.suggestions 374 | ) 375 | else: 376 | validation = ValidationResult( 377 | status=ValidationStatus.VALID, 378 | confidence=0.9, 379 | message=f"Method '{method_call.method_name}' found on class '{class_type}'" 380 | ) 381 | 382 | return MethodValidation( 383 | method_call=method_call, 384 | validation=validation, 385 | expected_params=expected_params, 386 | actual_params=method_call.args + list(method_call.kwargs.keys()), 387 | parameter_validation=param_validation 388 | ) 389 | 390 | async def _validate_attribute_accesses(self, attribute_accesses: List[AttributeAccess]) -> List[AttributeValidation]: 391 | """Validate attribute accesses""" 392 | validations = [] 393 | 394 | for attr_access in attribute_accesses: 395 | validation = await self._validate_single_attribute_access(attr_access) 396 | validations.append(validation) 
397 | 398 | return validations 399 | 400 | async def _validate_single_attribute_access(self, attr_access: AttributeAccess) -> AttributeValidation: 401 | """Validate a single attribute access""" 402 | class_type = attr_access.object_type 403 | 404 | if not class_type: 405 | validation = ValidationResult( 406 | status=ValidationStatus.UNCERTAIN, 407 | confidence=0.3, 408 | message=f"Cannot determine object type for '{attr_access.object_name}'" 409 | ) 410 | return AttributeValidation( 411 | attribute_access=attr_access, 412 | validation=validation 413 | ) 414 | 415 | # Skip validation for classes not from knowledge graph 416 | if not self._is_from_knowledge_graph(class_type): 417 | validation = ValidationResult( 418 | status=ValidationStatus.UNCERTAIN, 419 | confidence=0.8, 420 | message=f"Skipping validation: '{class_type}' is not from knowledge graph" 421 | ) 422 | return AttributeValidation( 423 | attribute_access=attr_access, 424 | validation=validation 425 | ) 426 | 427 | # Find attribute in knowledge graph 428 | attr_info = await self._find_attribute(class_type, attr_access.attribute_name) 429 | 430 | if not attr_info: 431 | # If not found as attribute, check if it's a method (for decorators like @agent.tool) 432 | method_info = await self._find_method(class_type, attr_access.attribute_name) 433 | 434 | if method_info: 435 | validation = ValidationResult( 436 | status=ValidationStatus.VALID, 437 | confidence=0.8, 438 | message=f"'{attr_access.attribute_name}' found as method on class '{class_type}' (likely used as decorator)" 439 | ) 440 | return AttributeValidation( 441 | attribute_access=attr_access, 442 | validation=validation, 443 | expected_type="method" 444 | ) 445 | 446 | validation = ValidationResult( 447 | status=ValidationStatus.NOT_FOUND, 448 | confidence=0.2, 449 | message=f"'{attr_access.attribute_name}' not found on class '{class_type}'" 450 | ) 451 | return AttributeValidation( 452 | attribute_access=attr_access, 453 | validation=validation 454 | 
) 455 | 456 | validation = ValidationResult( 457 | status=ValidationStatus.VALID, 458 | confidence=0.8, 459 | message=f"Attribute '{attr_access.attribute_name}' found on class '{class_type}'" 460 | ) 461 | 462 | return AttributeValidation( 463 | attribute_access=attr_access, 464 | validation=validation, 465 | expected_type=attr_info.get('type') 466 | ) 467 | 468 | async def _validate_function_calls(self, function_calls: List[FunctionCall]) -> List[FunctionValidation]: 469 | """Validate function calls""" 470 | validations = [] 471 | 472 | for func_call in function_calls: 473 | validation = await self._validate_single_function_call(func_call) 474 | validations.append(validation) 475 | 476 | return validations 477 | 478 | async def _validate_single_function_call(self, func_call: FunctionCall) -> FunctionValidation: 479 | """Validate a single function call""" 480 | func_name = func_call.full_name or func_call.function_name 481 | 482 | # Skip validation for functions not from knowledge graph 483 | if func_call.full_name and not self._is_from_knowledge_graph(func_call.full_name): 484 | validation = ValidationResult( 485 | status=ValidationStatus.UNCERTAIN, 486 | confidence=0.8, 487 | message=f"Skipping validation: '{func_name}' is not from knowledge graph" 488 | ) 489 | return FunctionValidation( 490 | function_call=func_call, 491 | validation=validation 492 | ) 493 | 494 | # Find function in knowledge graph 495 | func_info = await self._find_function(func_name) 496 | 497 | if not func_info: 498 | validation = ValidationResult( 499 | status=ValidationStatus.NOT_FOUND, 500 | confidence=0.2, 501 | message=f"Function '{func_name}' not found in knowledge graph" 502 | ) 503 | return FunctionValidation( 504 | function_call=func_call, 505 | validation=validation 506 | ) 507 | 508 | # Validate parameters 509 | expected_params = func_info.get('params_list', []) 510 | param_validation = self._validate_parameters( 511 | expected_params=expected_params, 512 | 
provided_args=func_call.args, 513 | provided_kwargs=func_call.kwargs 514 | ) 515 | 516 | # Use parameter validation result if it failed 517 | if param_validation.status == ValidationStatus.INVALID: 518 | validation = ValidationResult( 519 | status=ValidationStatus.INVALID, 520 | confidence=param_validation.confidence, 521 | message=f"Function '{func_name}' found but has invalid parameters: {param_validation.message}", 522 | suggestions=param_validation.suggestions 523 | ) 524 | else: 525 | validation = ValidationResult( 526 | status=ValidationStatus.VALID, 527 | confidence=0.8, 528 | message=f"Function '{func_name}' found in knowledge graph" 529 | ) 530 | 531 | return FunctionValidation( 532 | function_call=func_call, 533 | validation=validation, 534 | expected_params=expected_params, 535 | actual_params=func_call.args + list(func_call.kwargs.keys()), 536 | parameter_validation=param_validation 537 | ) 538 | 539 | def _validate_parameters(self, expected_params: List[str], provided_args: List[str], 540 | provided_kwargs: Dict[str, str]) -> ValidationResult: 541 | """Validate function/method parameters with comprehensive support""" 542 | if not expected_params: 543 | return ValidationResult( 544 | status=ValidationStatus.UNCERTAIN, 545 | confidence=0.5, 546 | message="Parameter information not available" 547 | ) 548 | 549 | # Parse expected parameters - handle detailed format 550 | required_positional = [] 551 | optional_positional = [] 552 | keyword_only_required = [] 553 | keyword_only_optional = [] 554 | has_varargs = False 555 | has_varkwargs = False 556 | 557 | for param in expected_params: 558 | # Handle detailed format: "[keyword_only] name:type=default" or "name:type" 559 | param_clean = param.strip() 560 | 561 | # Check for parameter kind prefix 562 | kind = 'positional' 563 | if param_clean.startswith('['): 564 | end_bracket = param_clean.find(']') 565 | if end_bracket > 0: 566 | kind = param_clean[1:end_bracket] 567 | param_clean = 
param_clean[end_bracket+1:].strip() 568 | 569 | # Check for varargs/varkwargs 570 | if param_clean.startswith('*') and not param_clean.startswith('**'): 571 | has_varargs = True 572 | continue 573 | elif param_clean.startswith('**'): 574 | has_varkwargs = True 575 | continue 576 | 577 | # Parse name and check if optional 578 | if ':' in param_clean: 579 | param_name = param_clean.split(':')[0] 580 | is_optional = '=' in param_clean 581 | 582 | if kind == 'keyword_only': 583 | if is_optional: 584 | keyword_only_optional.append(param_name) 585 | else: 586 | keyword_only_required.append(param_name) 587 | else: # positional 588 | if is_optional: 589 | optional_positional.append(param_name) 590 | else: 591 | required_positional.append(param_name) 592 | 593 | # Count provided parameters 594 | provided_positional_count = len(provided_args) 595 | provided_keyword_names = set(provided_kwargs.keys()) 596 | 597 | # Validate positional arguments 598 | min_required_positional = len(required_positional) 599 | max_allowed_positional = len(required_positional) + len(optional_positional) 600 | 601 | if not has_varargs and provided_positional_count > max_allowed_positional: 602 | return ValidationResult( 603 | status=ValidationStatus.INVALID, 604 | confidence=0.8, 605 | message=f"Too many positional arguments: provided {provided_positional_count}, max allowed {max_allowed_positional}" 606 | ) 607 | 608 | if provided_positional_count < min_required_positional: 609 | return ValidationResult( 610 | status=ValidationStatus.INVALID, 611 | confidence=0.8, 612 | message=f"Too few positional arguments: provided {provided_positional_count}, required {min_required_positional}" 613 | ) 614 | 615 | # Validate keyword arguments 616 | all_valid_kwarg_names = set(required_positional + optional_positional + keyword_only_required + keyword_only_optional) 617 | invalid_kwargs = provided_keyword_names - all_valid_kwarg_names 618 | 619 | if invalid_kwargs and not has_varkwargs: 620 | return 
ValidationResult( 621 | status=ValidationStatus.INVALID, 622 | confidence=0.7, 623 | message=f"Invalid keyword arguments: {list(invalid_kwargs)}", 624 | suggestions=[f"Valid parameters: {list(all_valid_kwarg_names)}"] 625 | ) 626 | 627 | # Check required keyword-only arguments 628 | missing_required_kwargs = set(keyword_only_required) - provided_keyword_names 629 | if missing_required_kwargs: 630 | return ValidationResult( 631 | status=ValidationStatus.INVALID, 632 | confidence=0.8, 633 | message=f"Missing required keyword arguments: {list(missing_required_kwargs)}" 634 | ) 635 | 636 | return ValidationResult( 637 | status=ValidationStatus.VALID, 638 | confidence=0.9, 639 | message="Parameters are valid" 640 | ) 641 | 642 | # Neo4j Query Methods 643 | 644 | async def _find_modules(self, module_name: str) -> List[str]: 645 | """Find repository matching the module name, then return its files""" 646 | async with self.driver.session() as session: 647 | # First, try to find files with module names that match or start with the search term 648 | module_query = """ 649 | MATCH (r:Repository)-[:CONTAINS]->(f:File) 650 | WHERE f.module_name = $module_name 651 | OR f.module_name STARTS WITH $module_name + '.' 
652 | OR split(f.module_name, '.')[0] = $module_name 653 | RETURN DISTINCT r.name as repo_name, count(f) as file_count 654 | ORDER BY file_count DESC 655 | LIMIT 5 656 | """ 657 | 658 | result = await session.run(module_query, module_name=module_name) 659 | repos_from_modules = [] 660 | async for record in result: 661 | repos_from_modules.append(record['repo_name']) 662 | 663 | # Also try repository name matching as fallback 664 | repo_query = """ 665 | MATCH (r:Repository) 666 | WHERE toLower(r.name) = toLower($module_name) 667 | OR toLower(replace(r.name, '-', '_')) = toLower($module_name) 668 | OR toLower(replace(r.name, '_', '-')) = toLower($module_name) 669 | RETURN r.name as repo_name 670 | ORDER BY 671 | CASE 672 | WHEN toLower(r.name) = toLower($module_name) THEN 1 673 | WHEN toLower(replace(r.name, '-', '_')) = toLower($module_name) THEN 2 674 | WHEN toLower(replace(r.name, '_', '-')) = toLower($module_name) THEN 3 675 | END 676 | LIMIT 5 677 | """ 678 | 679 | result = await session.run(repo_query, module_name=module_name) 680 | repos_from_names = [] 681 | async for record in result: 682 | repos_from_names.append(record['repo_name']) 683 | 684 | # Combine results, prioritizing module-based matches 685 | all_repos = repos_from_modules + [r for r in repos_from_names if r not in repos_from_modules] 686 | 687 | if not all_repos: 688 | return [] 689 | 690 | # Get files from the best matching repository 691 | best_repo = all_repos[0] 692 | files_query = """ 693 | MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File) 694 | RETURN f.path, f.module_name 695 | LIMIT 50 696 | """ 697 | 698 | result = await session.run(files_query, repo_name=best_repo) 699 | files = [] 700 | async for record in result: 701 | files.append(record['f.path']) 702 | 703 | return files 704 | 705 | async def _get_module_contents(self, module_name: str) -> Tuple[List[str], List[str]]: 706 | """Get classes and functions available in a repository matching the module name""" 707 | async 
with self.driver.session() as session: 708 | # First, try to find repository by module names in files 709 | module_query = """ 710 | MATCH (r:Repository)-[:CONTAINS]->(f:File) 711 | WHERE f.module_name = $module_name 712 | OR f.module_name STARTS WITH $module_name + '.' 713 | OR split(f.module_name, '.')[0] = $module_name 714 | RETURN DISTINCT r.name as repo_name, count(f) as file_count 715 | ORDER BY file_count DESC 716 | LIMIT 1 717 | """ 718 | 719 | result = await session.run(module_query, module_name=module_name) 720 | record = await result.single() 721 | 722 | if record: 723 | repo_name = record['repo_name'] 724 | else: 725 | # Fallback to repository name matching 726 | repo_query = """ 727 | MATCH (r:Repository) 728 | WHERE toLower(r.name) = toLower($module_name) 729 | OR toLower(replace(r.name, '-', '_')) = toLower($module_name) 730 | OR toLower(replace(r.name, '_', '-')) = toLower($module_name) 731 | RETURN r.name as repo_name 732 | ORDER BY 733 | CASE 734 | WHEN toLower(r.name) = toLower($module_name) THEN 1 735 | WHEN toLower(replace(r.name, '-', '_')) = toLower($module_name) THEN 2 736 | WHEN toLower(replace(r.name, '_', '-')) = toLower($module_name) THEN 3 737 | END 738 | LIMIT 1 739 | """ 740 | 741 | result = await session.run(repo_query, module_name=module_name) 742 | record = await result.single() 743 | 744 | if not record: 745 | return [], [] 746 | 747 | repo_name = record['repo_name'] 748 | 749 | # Get classes from this repository 750 | class_query = """ 751 | MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class) 752 | RETURN DISTINCT c.name as class_name 753 | """ 754 | 755 | result = await session.run(class_query, repo_name=repo_name) 756 | classes = [] 757 | async for record in result: 758 | classes.append(record['class_name']) 759 | 760 | # Get functions from this repository 761 | func_query = """ 762 | MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(func:Function) 763 | RETURN DISTINCT 
func.name as function_name 764 | """ 765 | 766 | result = await session.run(func_query, repo_name=repo_name) 767 | functions = [] 768 | async for record in result: 769 | functions.append(record['function_name']) 770 | 771 | return classes, functions 772 | 773 | async def _find_repository_for_module(self, module_name: str) -> Optional[str]: 774 | """Find the repository name that matches a module name""" 775 | if module_name in self.repo_cache: 776 | return self.repo_cache[module_name] 777 | 778 | async with self.driver.session() as session: 779 | # First, try to find repository by module names in files 780 | module_query = """ 781 | MATCH (r:Repository)-[:CONTAINS]->(f:File) 782 | WHERE f.module_name = $module_name 783 | OR f.module_name STARTS WITH $module_name + '.' 784 | OR split(f.module_name, '.')[0] = $module_name 785 | RETURN DISTINCT r.name as repo_name, count(f) as file_count 786 | ORDER BY file_count DESC 787 | LIMIT 1 788 | """ 789 | 790 | result = await session.run(module_query, module_name=module_name) 791 | record = await result.single() 792 | 793 | if record: 794 | repo_name = record['repo_name'] 795 | else: 796 | # Fallback to repository name matching 797 | query = """ 798 | MATCH (r:Repository) 799 | WHERE toLower(r.name) = toLower($module_name) 800 | OR toLower(replace(r.name, '-', '_')) = toLower($module_name) 801 | OR toLower(replace(r.name, '_', '-')) = toLower($module_name) 802 | OR toLower(r.name) CONTAINS toLower($module_name) 803 | OR toLower($module_name) CONTAINS toLower(replace(r.name, '-', '_')) 804 | RETURN r.name as repo_name 805 | ORDER BY 806 | CASE 807 | WHEN toLower(r.name) = toLower($module_name) THEN 1 808 | WHEN toLower(replace(r.name, '-', '_')) = toLower($module_name) THEN 2 809 | ELSE 3 810 | END 811 | LIMIT 1 812 | """ 813 | 814 | result = await session.run(query, module_name=module_name) 815 | record = await result.single() 816 | 817 | repo_name = record['repo_name'] if record else None 818 | 819 | 
async def _find_class(self, class_name: str) -> Optional[Dict[str, Any]]:
    """Look up a class node in the knowledge graph.

    Tries an exact match on the short or fully-qualified name first; for a
    dotted name (e.g. "pydantic_ai.Agent") it falls back to resolving the
    module prefix to a repository and searching classes defined there.

    Returns a dict with 'name' and 'full_name', or None when not found.
    """
    async with self.driver.session() as session:
        exact_query = """
            MATCH (c:Class)
            WHERE c.name = $class_name OR c.full_name = $class_name
            RETURN c.name as name, c.full_name as full_name
            LIMIT 1
        """

        result = await session.run(exact_query, class_name=class_name)
        row = await result.single()
        if row:
            return {'name': row['name'], 'full_name': row['full_name']}

        # No direct hit: a dotted name may be "<module>.<Class>" — resolve
        # the module prefix to a repository and search inside it.
        if '.' not in class_name:
            return None

        module_part, class_part = class_name.rsplit('.', 1)
        repo_name = await self._find_repository_for_module(module_part)
        if not repo_name:
            return None

        repo_query = """
            MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
            WHERE c.name = $class_name
            RETURN c.name as name, c.full_name as full_name
            LIMIT 1
        """

        result = await session.run(repo_query, repo_name=repo_name, class_name=class_part)
        row = await result.single()
        if row:
            return {'name': row['name'], 'full_name': row['full_name']}

        return None

async def _find_method(self, class_name: str, method_name: str) -> Optional[Dict[str, Any]]:
    """Look up a method on a class, using self.method_cache to avoid repeat queries.

    Resolution mirrors _find_class: exact class match first, then a
    repository-scoped search for dotted class names. Both hits and misses
    (an empty list) are cached under "<class>.<method>".
    """
    cache_key = f"{class_name}.{method_name}"
    if cache_key in self.method_cache:
        cached = self.method_cache[cache_key]
        return cached[0] if cached else None

    def _as_method_info(row) -> Dict[str, Any]:
        # Prefer the detailed parameter records when present.
        return {
            'name': row['name'],
            'params_list': row['params_detailed'] or row['params_list'] or [],
            'return_type': row['return_type'],
            'args': row['args'] or []
        }

    async with self.driver.session() as session:
        exact_query = """
            MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
            WHERE (c.name = $class_name OR c.full_name = $class_name)
            AND m.name = $method_name
            RETURN m.name as name, m.params_list as params_list, m.params_detailed as params_detailed,
                   m.return_type as return_type, m.args as args
            LIMIT 1
        """

        result = await session.run(exact_query, class_name=class_name, method_name=method_name)
        row = await result.single()
        if row:
            info = _as_method_info(row)
            self.method_cache[cache_key] = [info]
            return info

        if '.' in class_name:
            module_part, class_part = class_name.rsplit('.', 1)
            repo_name = await self._find_repository_for_module(module_part)
            if repo_name:
                repo_query = """
                    MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_METHOD]->(m:Method)
                    WHERE c.name = $class_name AND m.name = $method_name
                    RETURN m.name as name, m.params_list as params_list, m.params_detailed as params_detailed,
                           m.return_type as return_type, m.args as args
                    LIMIT 1
                """

                result = await session.run(repo_query, repo_name=repo_name, class_name=class_part, method_name=method_name)
                row = await result.single()
                if row:
                    info = _as_method_info(row)
                    self.method_cache[cache_key] = [info]
                    return info

        # Negative results are cached too, so repeated misses stay cheap.
        self.method_cache[cache_key] = []
        return None
async def _find_attribute(self, class_name: str, attr_name: str) -> Optional[Dict[str, Any]]:
    """Look up an attribute on a class; returns {'name', 'type'} or None.

    Exact class match first, then (for dotted class names) a search scoped
    to the repository that owns the module prefix.
    """
    async with self.driver.session() as session:
        exact_query = """
            MATCH (c:Class)-[:HAS_ATTRIBUTE]->(a:Attribute)
            WHERE (c.name = $class_name OR c.full_name = $class_name)
            AND a.name = $attr_name
            RETURN a.name as name, a.type as type
            LIMIT 1
        """

        result = await session.run(exact_query, class_name=class_name, attr_name=attr_name)
        row = await result.single()
        if row:
            return {'name': row['name'], 'type': row['type']}

        if '.' not in class_name:
            return None

        module_part, class_part = class_name.rsplit('.', 1)
        repo_name = await self._find_repository_for_module(module_part)
        if not repo_name:
            return None

        repo_query = """
            MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_ATTRIBUTE]->(a:Attribute)
            WHERE c.name = $class_name AND a.name = $attr_name
            RETURN a.name as name, a.type as type
            LIMIT 1
        """

        result = await session.run(repo_query, repo_name=repo_name, class_name=class_part, attr_name=attr_name)
        row = await result.single()
        if row:
            return {'name': row['name'], 'type': row['type']}

        return None

async def _find_function(self, func_name: str) -> Optional[Dict[str, Any]]:
    """Look up a module-level function by simple or dotted name.

    Returns {'name', 'params_list', 'return_type', 'args'} or None. The
    params come from params_detailed when available, else params_list.
    """
    def _as_function_info(row) -> Dict[str, Any]:
        # Prefer the detailed parameter records when present.
        return {
            'name': row['name'],
            'params_list': row['params_detailed'] or row['params_list'] or [],
            'return_type': row['return_type'],
            'args': row['args'] or []
        }

    async with self.driver.session() as session:
        exact_query = """
            MATCH (f:Function)
            WHERE f.name = $func_name OR f.full_name = $func_name
            RETURN f.name as name, f.params_list as params_list, f.params_detailed as params_detailed,
                   f.return_type as return_type, f.args as args
            LIMIT 1
        """

        result = await session.run(exact_query, func_name=func_name)
        row = await result.single()
        if row:
            return _as_function_info(row)

        if '.' not in func_name:
            return None

        module_part, func_part = func_name.rsplit('.', 1)
        repo_name = await self._find_repository_for_module(module_part)
        if not repo_name:
            return None

        repo_query = """
            MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(func:Function)
            WHERE func.name = $func_name
            RETURN func.name as name, func.params_list as params_list, func.params_detailed as params_detailed,
                   func.return_type as return_type, func.args as args
            LIMIT 1
        """

        result = await session.run(repo_query, repo_name=repo_name, func_name=func_part)
        row = await result.single()
        if row:
            return _as_function_info(row)

        return None
async def _find_pydantic_ai_result_method(self, method_name: str) -> Optional[Dict[str, Any]]:
    """Find a method of that name on pydantic_ai result-like classes.

    Result objects are heuristically identified as classes whose name
    contains 'Result', 'Stream' or 'Run' within the pydantic_ai repository.
    Returns the method info dict (plus 'source_class') or None.
    """
    async with self.driver.session() as session:
        query = """
        MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_METHOD]->(m:Method)
        WHERE m.name = $method_name
        AND (c.name CONTAINS 'Result' OR c.name CONTAINS 'Stream' OR c.name CONTAINS 'Run')
        RETURN m.name as name, m.params_list as params_list, m.params_detailed as params_detailed,
               m.return_type as return_type, m.args as args, c.name as class_name
        LIMIT 1
        """

        result = await session.run(query, repo_name="pydantic_ai", method_name=method_name)
        record = await result.single()

        if record:
            # Use detailed params if available, fall back to simple params
            params_to_use = record['params_detailed'] or record['params_list'] or []

            return {
                'name': record['name'],
                'params_list': params_to_use,
                'return_type': record['return_type'],
                'args': record['args'] or [],
                'source_class': record['class_name']
            }

        return None

async def _find_similar_modules(self, module_name: str) -> List[str]:
    """Return up to 5 repository names loosely matching module_name (for suggestions)."""
    async with self.driver.session() as session:
        # BUG FIX: the RETURN clause previously had no alias ("RETURN r.name"),
        # so the driver exposed the record key as 'r.name' and the
        # record['name'] lookup below failed. Alias it as 'name'.
        query = """
        MATCH (r:Repository)
        WHERE toLower(r.name) CONTAINS toLower($partial_name)
        OR toLower(replace(r.name, '-', '_')) CONTAINS toLower($partial_name)
        OR toLower(replace(r.name, '_', '-')) CONTAINS toLower($partial_name)
        RETURN r.name as name
        LIMIT 5
        """

        # Match on the first three characters only, to catch near-misses.
        result = await session.run(query, partial_name=module_name[:3])
        suggestions = []
        async for record in result:
            suggestions.append(record['name'])

        return suggestions

async def _find_similar_methods(self, class_name: str, method_name: str) -> List[str]:
    """Return up to 5 method names on class_name sharing method_name's first 3 chars."""
    async with self.driver.session() as session:
        # First try exact class match
        query = """
        MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
        WHERE (c.name = $class_name OR c.full_name = $class_name)
        AND m.name CONTAINS $partial_name
        RETURN m.name as name
        LIMIT 5
        """

        result = await session.run(query, class_name=class_name, partial_name=method_name[:3])
        suggestions = []
        async for record in result:
            suggestions.append(record['name'])

        # Fall back to a repository-scoped search for dotted class names.
        if not suggestions and '.' in class_name:
            module_part, class_part = class_name.rsplit('.', 1)
            repo_name = await self._find_repository_for_module(module_part)
            if repo_name:
                repo_query = """
                MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_METHOD]->(m:Method)
                WHERE c.name = $class_name AND m.name CONTAINS $partial_name
                RETURN m.name as name
                LIMIT 5
                """

                result = await session.run(repo_query, repo_name=repo_name, class_name=class_part, partial_name=method_name[:3])
                async for record in result:
                    suggestions.append(record['name'])

        return suggestions
val.method_call.object_type and self._is_from_knowledge_graph(val.method_call.object_type): 1162 | kg_validations.append(val.validation.confidence) 1163 | 1164 | # Only count validations from knowledge graph attributes 1165 | for val in result.attribute_validations: 1166 | if val.attribute_access.object_type and self._is_from_knowledge_graph(val.attribute_access.object_type): 1167 | kg_validations.append(val.validation.confidence) 1168 | 1169 | # Only count validations from knowledge graph functions 1170 | for val in result.function_validations: 1171 | if val.function_call.full_name and self._is_from_knowledge_graph(val.function_call.full_name): 1172 | kg_validations.append(val.validation.confidence) 1173 | 1174 | if not kg_validations: 1175 | return 1.0 # No knowledge graph items to validate = perfect confidence 1176 | 1177 | return sum(kg_validations) / len(kg_validations) 1178 | 1179 | def _is_from_knowledge_graph(self, class_type: str) -> bool: 1180 | """Check if a class type comes from a module in the knowledge graph""" 1181 | if not class_type: 1182 | return False 1183 | 1184 | # For dotted names like "pydantic_ai.Agent" or "pydantic_ai.StreamedRunResult", check the base module 1185 | if '.' 
def _detect_hallucinations(self, result: "ScriptValidationResult") -> List[Dict[str, Any]]:
    """Collect NOT_FOUND / invalid-parameter findings for knowledge-graph items.

    Only objects whose types resolve to knowledge-graph modules are reported,
    and a (line, name, type) key prevents the same finding from being listed
    both as a missing method and as a missing attribute.
    """
    findings: List[Dict[str, Any]] = []
    seen = set()  # (line_number, member_name, object_type) already reported

    # Missing methods on knowledge-graph classes.
    for m_val in result.method_validations:
        call = m_val.method_call
        if (m_val.validation.status == ValidationStatus.NOT_FOUND and
                call.object_type and
                self._is_from_knowledge_graph(call.object_type)):
            key = (call.line_number, call.method_name, call.object_type)
            if key not in seen:
                seen.add(key)
                findings.append({
                    'type': 'METHOD_NOT_FOUND',
                    'location': f"line {call.line_number}",
                    'description': f"Method '{call.method_name}' not found on class '{call.object_type}'",
                    'suggestion': m_val.validation.suggestions[0] if m_val.validation.suggestions else None
                })

    # Missing attributes — skipped when already reported as a missing method.
    for a_val in result.attribute_validations:
        access = a_val.attribute_access
        if (a_val.validation.status == ValidationStatus.NOT_FOUND and
                access.object_type and
                self._is_from_knowledge_graph(access.object_type)):
            key = (access.line_number, access.attribute_name, access.object_type)
            if key not in seen:
                seen.add(key)
                findings.append({
                    'type': 'ATTRIBUTE_NOT_FOUND',
                    'location': f"line {access.line_number}",
                    'description': f"Attribute '{access.attribute_name}' not found on class '{access.object_type}'"
                })

    # Parameter mismatches on knowledge-graph methods.
    for m_val in result.method_validations:
        call = m_val.method_call
        if (m_val.parameter_validation and
                m_val.parameter_validation.status == ValidationStatus.INVALID and
                call.object_type and
                self._is_from_knowledge_graph(call.object_type)):
            findings.append({
                'type': 'INVALID_PARAMETERS',
                'location': f"line {call.line_number}",
                'description': f"Invalid parameters for method '{call.method_name}': {m_val.parameter_validation.message}"
            })

    return findings
val.attribute_access.attribute_name, val.attribute_access.object_type) 1224 | if key not in reported_items: 1225 | reported_items.add(key) 1226 | hallucinations.append({ 1227 | 'type': 'ATTRIBUTE_NOT_FOUND', 1228 | 'location': f"line {val.attribute_access.line_number}", 1229 | 'description': f"Attribute '{val.attribute_access.attribute_name}' not found on class '{val.attribute_access.object_type}'" 1230 | }) 1231 | 1232 | # Check parameter issues (only for knowledge graph methods) 1233 | for val in result.method_validations: 1234 | if (val.parameter_validation and 1235 | val.parameter_validation.status == ValidationStatus.INVALID and 1236 | val.method_call.object_type and 1237 | self._is_from_knowledge_graph(val.method_call.object_type)): 1238 | hallucinations.append({ 1239 | 'type': 'INVALID_PARAMETERS', 1240 | 'location': f"line {val.method_call.line_number}", 1241 | 'description': f"Invalid parameters for method '{val.method_call.method_name}': {val.parameter_validation.message}" 1242 | }) 1243 | 1244 | return hallucinations ``` -------------------------------------------------------------------------------- /src/crawl4ai_mcp.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | MCP server for web crawling with Crawl4AI. 3 | 4 | This server provides tools to crawl websites using Crawl4AI, automatically detecting 5 | the appropriate crawl method based on URL type (sitemap, txt file, or regular webpage). 6 | Also includes AI hallucination detection and repository parsing tools using Neo4j knowledge graphs. 
7 | """ 8 | from mcp.server.fastmcp import FastMCP, Context 9 | from sentence_transformers import CrossEncoder 10 | from contextlib import asynccontextmanager 11 | from collections.abc import AsyncIterator 12 | from dataclasses import dataclass 13 | from typing import List, Dict, Any, Optional 14 | from urllib.parse import urlparse, urldefrag 15 | from xml.etree import ElementTree 16 | from dotenv import load_dotenv 17 | from supabase import Client 18 | from pathlib import Path 19 | import requests 20 | import asyncio 21 | import json 22 | import os 23 | import re 24 | import concurrent.futures 25 | import sys 26 | 27 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher 28 | 29 | # Add knowledge_graphs folder to path for importing knowledge graph modules 30 | knowledge_graphs_path = Path(__file__).resolve().parent.parent / 'knowledge_graphs' 31 | sys.path.append(str(knowledge_graphs_path)) 32 | 33 | from utils import ( 34 | get_supabase_client, 35 | add_documents_to_supabase, 36 | search_documents, 37 | extract_code_blocks, 38 | generate_code_example_summary, 39 | add_code_examples_to_supabase, 40 | update_source_info, 41 | extract_source_summary, 42 | search_code_examples 43 | ) 44 | 45 | # Import knowledge graph modules 46 | from knowledge_graph_validator import KnowledgeGraphValidator 47 | from parse_repo_into_neo4j import DirectNeo4jExtractor 48 | from ai_script_analyzer import AIScriptAnalyzer 49 | from hallucination_reporter import HallucinationReporter 50 | 51 | # Load environment variables from the project root .env file 52 | project_root = Path(__file__).resolve().parent.parent 53 | dotenv_path = project_root / '.env' 54 | 55 | # Force override of existing environment variables 56 | load_dotenv(dotenv_path, override=True) 57 | 58 | # Helper functions for Neo4j validation and error handling 59 | def validate_neo4j_connection() -> bool: 60 | """Check if Neo4j environment variables are configured.""" 61 
def validate_neo4j_connection() -> bool:
    """True when all three Neo4j connection settings are present in the environment."""
    required = ("NEO4J_URI", "NEO4J_USER", "NEO4J_PASSWORD")
    return all(os.getenv(name) for name in required)

def format_neo4j_error(error: Exception) -> str:
    """Translate a raw Neo4j exception into a short, actionable message."""
    text = str(error).lower()
    if "authentication" in text or "unauthorized" in text:
        return "Neo4j authentication failed. Check NEO4J_USER and NEO4J_PASSWORD."
    if "connection" in text or "refused" in text or "timeout" in text:
        return "Cannot connect to Neo4j. Check NEO4J_URI and ensure Neo4j is running."
    if "database" in text:
        return "Neo4j database error. Check if the database exists and is accessible."
    return f"Neo4j error: {str(error)}"

def validate_script_path(script_path: str) -> Dict[str, Any]:
    """Check that script_path points to a readable Python file.

    Returns {"valid": True} on success, otherwise {"valid": False, "error": ...}.
    """
    if not script_path or not isinstance(script_path, str):
        return {"valid": False, "error": "Script path is required"}

    if not os.path.exists(script_path):
        return {"valid": False, "error": f"Script not found: {script_path}"}

    if not script_path.endswith('.py'):
        return {"valid": False, "error": "Only Python (.py) files are supported"}

    try:
        # Probe readability without loading the whole file.
        with open(script_path, 'r', encoding='utf-8') as handle:
            handle.read(1)
        return {"valid": True}
    except Exception as exc:
        return {"valid": False, "error": f"Cannot read script file: {str(exc)}"}
"error": "Please provide a valid GitHub repository URL"} 108 | 109 | # Check URL format 110 | if not (repo_url.startswith("https://") or repo_url.startswith("git@")): 111 | return {"valid": False, "error": "Repository URL must start with https:// or git@"} 112 | 113 | return {"valid": True, "repo_name": repo_url.split('/')[-1].replace('.git', '')} 114 | 115 | # Create a dataclass for our application context 116 | @dataclass 117 | class Crawl4AIContext: 118 | """Context for the Crawl4AI MCP server.""" 119 | crawler: AsyncWebCrawler 120 | supabase_client: Client 121 | reranking_model: Optional[CrossEncoder] = None 122 | knowledge_validator: Optional[Any] = None # KnowledgeGraphValidator when available 123 | repo_extractor: Optional[Any] = None # DirectNeo4jExtractor when available 124 | 125 | @asynccontextmanager 126 | async def crawl4ai_lifespan(server: FastMCP) -> AsyncIterator[Crawl4AIContext]: 127 | """ 128 | Manages the Crawl4AI client lifecycle. 129 | 130 | Args: 131 | server: The FastMCP server instance 132 | 133 | Yields: 134 | Crawl4AIContext: The context containing the Crawl4AI crawler and Supabase client 135 | """ 136 | # Create browser configuration 137 | browser_config = BrowserConfig( 138 | headless=True, 139 | verbose=False 140 | ) 141 | 142 | # Initialize the crawler 143 | crawler = AsyncWebCrawler(config=browser_config) 144 | await crawler.__aenter__() 145 | 146 | # Initialize Supabase client 147 | supabase_client = get_supabase_client() 148 | 149 | # Initialize cross-encoder model for reranking if enabled 150 | reranking_model = None 151 | if os.getenv("USE_RERANKING", "false") == "true": 152 | try: 153 | reranking_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2") 154 | except Exception as e: 155 | print(f"Failed to load reranking model: {e}") 156 | reranking_model = None 157 | 158 | # Initialize Neo4j components if configured and enabled 159 | knowledge_validator = None 160 | repo_extractor = None 161 | 162 | # Check if knowledge graph 
@asynccontextmanager
async def crawl4ai_lifespan(server: FastMCP) -> AsyncIterator[Crawl4AIContext]:
    """
    Manages the Crawl4AI client lifecycle.

    Sets up the headless crawler, the Supabase client, an optional
    cross-encoder reranking model and optional Neo4j knowledge-graph
    components, yields them as a Crawl4AIContext, and tears everything
    down afterwards.

    Args:
        server: The FastMCP server instance

    Yields:
        Crawl4AIContext: The context containing the Crawl4AI crawler and Supabase client
    """
    # Headless browser for crawling.
    crawler = AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False))
    await crawler.__aenter__()

    supabase_client = get_supabase_client()

    # Optional cross-encoder for reranking search results.
    reranking_model = None
    if os.getenv("USE_RERANKING", "false") == "true":
        try:
            reranking_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
        except Exception as e:
            print(f"Failed to load reranking model: {e}")
            reranking_model = None

    # Optional Neo4j-backed knowledge graph components.
    knowledge_validator = None
    repo_extractor = None

    if os.getenv("USE_KNOWLEDGE_GRAPH", "false") == "true":
        neo4j_uri = os.getenv("NEO4J_URI")
        neo4j_user = os.getenv("NEO4J_USER")
        neo4j_password = os.getenv("NEO4J_PASSWORD")

        if neo4j_uri and neo4j_user and neo4j_password:
            try:
                print("Initializing knowledge graph components...")

                knowledge_validator = KnowledgeGraphValidator(neo4j_uri, neo4j_user, neo4j_password)
                await knowledge_validator.initialize()
                print("✓ Knowledge graph validator initialized")

                repo_extractor = DirectNeo4jExtractor(neo4j_uri, neo4j_user, neo4j_password)
                await repo_extractor.initialize()
                print("✓ Repository extractor initialized")
            except Exception as e:
                print(f"Failed to initialize Neo4j components: {format_neo4j_error(e)}")
                knowledge_validator = None
                repo_extractor = None
        else:
            print("Neo4j credentials not configured - knowledge graph tools will be unavailable")
    else:
        print("Knowledge graph functionality disabled - set USE_KNOWLEDGE_GRAPH=true to enable")

    try:
        yield Crawl4AIContext(
            crawler=crawler,
            supabase_client=supabase_client,
            reranking_model=reranking_model,
            knowledge_validator=knowledge_validator,
            repo_extractor=repo_extractor
        )
    finally:
        # Tear everything down; close failures are reported but not raised.
        await crawler.__aexit__(None, None, None)
        if knowledge_validator:
            try:
                await knowledge_validator.close()
                print("✓ Knowledge graph validator closed")
            except Exception as e:
                print(f"Error closing knowledge validator: {e}")
        if repo_extractor:
            try:
                await repo_extractor.close()
                print("✓ Repository extractor closed")
            except Exception as e:
                print(f"Error closing repository extractor: {e}")
# Initialize FastMCP server
mcp = FastMCP(
    "mcp-crawl4ai-rag",
    description="MCP server for RAG and web crawling with Crawl4AI",
    lifespan=crawl4ai_lifespan,
    host=os.getenv("HOST", "0.0.0.0"),
    port=os.getenv("PORT", "8051")
)

def rerank_results(model: CrossEncoder, query: str, results: List[Dict[str, Any]], content_key: str = "content") -> List[Dict[str, Any]]:
    """
    Rerank search results using a cross-encoder model.

    Each result gains a "rerank_score" key and the list is returned sorted
    by that score, highest first. On any failure (or when no model/results
    are given) the input list is returned unchanged.

    Args:
        model: The cross-encoder model to use for reranking
        query: The search query
        results: List of search results
        content_key: The key in each result dict that contains the text content

    Returns:
        Reranked list of results
    """
    if not model or not results:
        return results

    try:
        # Score every (query, document) pair in a single batch.
        pairs = [[query, item.get(content_key, "")] for item in results]
        scores = model.predict(pairs)

        # Attach the score to each result, then sort descending.
        for item, score in zip(results, scores):
            item["rerank_score"] = float(score)

        return sorted(results, key=lambda item: item.get("rerank_score", 0), reverse=True)
    except Exception as e:
        print(f"Error during reranking: {e}")
        return results

def is_sitemap(url: str) -> bool:
    """
    Check if a URL is a sitemap.

    Args:
        url: URL to check

    Returns:
        True if the URL is a sitemap, False otherwise
    """
    return url.endswith('sitemap.xml') or 'sitemap' in urlparse(url).path
def is_txt(url: str) -> bool:
    """
    Check if a URL is a text file.

    Args:
        url: URL to check

    Returns:
        True if the URL is a text file, False otherwise
    """
    return url.endswith('.txt')

def parse_sitemap(sitemap_url: str) -> List[str]:
    """
    Parse a sitemap and extract URLs.

    Args:
        sitemap_url: URL of the sitemap

    Returns:
        List of URLs found in the sitemap (empty on non-200 responses or
        XML parse errors)
    """
    # FIX: a timeout so a stalled server cannot hang the crawl indefinitely
    # (requests has no default timeout).
    resp = requests.get(sitemap_url, timeout=30)
    urls = []

    if resp.status_code == 200:
        try:
            tree = ElementTree.fromstring(resp.content)
            # '{*}loc' matches <loc> in any XML namespace the sitemap uses.
            urls = [loc.text for loc in tree.findall('.//{*}loc')]
        except Exception as e:
            print(f"Error parsing sitemap XML: {e}")

    return urls
def smart_chunk_markdown(text: str, chunk_size: int = 5000) -> List[str]:
    """Split text into chunks of at most chunk_size characters.

    Prefers to break at a code fence (```), then a paragraph boundary, then
    a sentence boundary — but only when the break point is past 30% of
    chunk_size, so chunks never get pathologically small. Empty chunks are
    never returned.
    """
    chunks: List[str] = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = start + chunk_size

        # Last chunk: take whatever remains.
        if end >= text_length:
            tail = text[start:].strip()
            # FIX: previously an all-whitespace remainder was appended as an
            # empty string chunk; skip it instead.
            if tail:
                chunks.append(tail)
            break

        window = text[start:end]

        # Try to find a code block boundary first (```)
        fence = window.rfind('```')
        if fence != -1 and fence > chunk_size * 0.3:
            end = start + fence
        # If no code block, try to break at a paragraph
        elif '\n\n' in window:
            last_break = window.rfind('\n\n')
            if last_break > chunk_size * 0.3:
                end = start + last_break
        # If no paragraph break, try to break at a sentence
        elif '. ' in window:
            last_period = window.rfind('. ')
            if last_period > chunk_size * 0.3:
                end = start + last_period + 1

        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)

        start = end

    return chunks

def extract_section_info(chunk: str) -> Dict[str, Any]:
    """
    Extracts headers and stats from a chunk.

    Args:
        chunk: Markdown chunk

    Returns:
        Dictionary with headers and stats
    """
    headers = re.findall(r'^(#+)\s+(.+)$', chunk, re.MULTILINE)
    header_str = '; '.join([f'{h[0]} {h[1]}' for h in headers]) if headers else ''

    return {
        "headers": header_str,
        "char_count": len(chunk),
        "word_count": len(chunk.split())
    }

def process_code_example(args):
    """
    Process a single code example to generate its summary.
    This function is designed to be used with concurrent.futures.

    Args:
        args: Tuple containing (code, context_before, context_after)

    Returns:
        The generated summary
    """
    code, context_before, context_after = args
    return generate_code_example_summary(code, context_before, context_after)
395 | 396 | Args: 397 | ctx: The MCP server provided context 398 | url: URL of the web page to crawl 399 | 400 | Returns: 401 | Summary of the crawling operation and storage in Supabase 402 | """ 403 | try: 404 | # Get the crawler from the context 405 | crawler = ctx.request_context.lifespan_context.crawler 406 | supabase_client = ctx.request_context.lifespan_context.supabase_client 407 | 408 | # Configure the crawl 409 | run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=False) 410 | 411 | # Crawl the page 412 | result = await crawler.arun(url=url, config=run_config) 413 | 414 | if result.success and result.markdown: 415 | # Extract source_id 416 | parsed_url = urlparse(url) 417 | source_id = parsed_url.netloc or parsed_url.path 418 | 419 | # Chunk the content 420 | chunks = smart_chunk_markdown(result.markdown) 421 | 422 | # Prepare data for Supabase 423 | urls = [] 424 | chunk_numbers = [] 425 | contents = [] 426 | metadatas = [] 427 | total_word_count = 0 428 | 429 | for i, chunk in enumerate(chunks): 430 | urls.append(url) 431 | chunk_numbers.append(i) 432 | contents.append(chunk) 433 | 434 | # Extract metadata 435 | meta = extract_section_info(chunk) 436 | meta["chunk_index"] = i 437 | meta["url"] = url 438 | meta["source"] = source_id 439 | meta["crawl_time"] = str(asyncio.current_task().get_coro().__name__) 440 | metadatas.append(meta) 441 | 442 | # Accumulate word count 443 | total_word_count += meta.get("word_count", 0) 444 | 445 | # Create url_to_full_document mapping 446 | url_to_full_document = {url: result.markdown} 447 | 448 | # Update source information FIRST (before inserting documents) 449 | source_summary = extract_source_summary(source_id, result.markdown[:5000]) # Use first 5000 chars for summary 450 | update_source_info(supabase_client, source_id, source_summary, total_word_count) 451 | 452 | # Add documentation chunks to Supabase (AFTER source exists) 453 | add_documents_to_supabase(supabase_client, urls, chunk_numbers, 
contents, metadatas, url_to_full_document) 454 | 455 | # Extract and process code examples only if enabled 456 | extract_code_examples = os.getenv("USE_AGENTIC_RAG", "false") == "true" 457 | if extract_code_examples: 458 | code_blocks = extract_code_blocks(result.markdown) 459 | if code_blocks: 460 | code_urls = [] 461 | code_chunk_numbers = [] 462 | code_examples = [] 463 | code_summaries = [] 464 | code_metadatas = [] 465 | 466 | # Process code examples in parallel 467 | with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: 468 | # Prepare arguments for parallel processing 469 | summary_args = [(block['code'], block['context_before'], block['context_after']) 470 | for block in code_blocks] 471 | 472 | # Generate summaries in parallel 473 | summaries = list(executor.map(process_code_example, summary_args)) 474 | 475 | # Prepare code example data 476 | for i, (block, summary) in enumerate(zip(code_blocks, summaries)): 477 | code_urls.append(url) 478 | code_chunk_numbers.append(i) 479 | code_examples.append(block['code']) 480 | code_summaries.append(summary) 481 | 482 | # Create metadata for code example 483 | code_meta = { 484 | "chunk_index": i, 485 | "url": url, 486 | "source": source_id, 487 | "char_count": len(block['code']), 488 | "word_count": len(block['code'].split()) 489 | } 490 | code_metadatas.append(code_meta) 491 | 492 | # Add code examples to Supabase 493 | add_code_examples_to_supabase( 494 | supabase_client, 495 | code_urls, 496 | code_chunk_numbers, 497 | code_examples, 498 | code_summaries, 499 | code_metadatas 500 | ) 501 | 502 | return json.dumps({ 503 | "success": True, 504 | "url": url, 505 | "chunks_stored": len(chunks), 506 | "code_examples_stored": len(code_blocks) if code_blocks else 0, 507 | "content_length": len(result.markdown), 508 | "total_word_count": total_word_count, 509 | "source_id": source_id, 510 | "links_count": { 511 | "internal": len(result.links.get("internal", [])), 512 | "external": 
@mcp.tool()
async def smart_crawl_url(ctx: Context, url: str, max_depth: int = 3, max_concurrent: int = 10, chunk_size: int = 5000) -> str:
    """
    Intelligently crawl a URL based on its type and store content in Supabase.

    This tool automatically detects the URL type and applies the appropriate crawling method:
    - For sitemaps: Extracts and crawls all URLs in parallel
    - For text files (llms.txt): Directly retrieves the content
    - For regular webpages: Recursively crawls internal links up to the specified depth

    All crawled content is chunked and stored in Supabase for later retrieval and querying.

    Args:
        ctx: The MCP server provided context
        url: URL to crawl (can be a regular webpage, sitemap.xml, or .txt file)
        max_depth: Maximum recursion depth for regular URLs (default: 3)
        max_concurrent: Maximum number of concurrent browser sessions (default: 10)
        chunk_size: Maximum size of each content chunk in characters (default: 5000)

    Returns:
        JSON string with crawl summary and storage information
    """
    try:
        # Get the crawler and Supabase client from the lifespan context
        crawler = ctx.request_context.lifespan_context.crawler
        supabase_client = ctx.request_context.lifespan_context.supabase_client

        # Pick a crawl strategy based on the URL type
        crawl_results = []
        crawl_type = None

        if is_txt(url):
            # Text files (e.g. llms.txt): retrieve the content directly
            crawl_results = await crawl_markdown_file(crawler, url)
            crawl_type = "text_file"
        elif is_sitemap(url):
            # Sitemaps: extract all URLs and crawl them in parallel
            sitemap_urls = parse_sitemap(url)
            if not sitemap_urls:
                return json.dumps({
                    "success": False,
                    "url": url,
                    "error": "No URLs found in sitemap"
                }, indent=2)
            crawl_results = await crawl_batch(crawler, sitemap_urls, max_concurrent=max_concurrent)
            crawl_type = "sitemap"
        else:
            # Regular webpages: recursively follow internal links up to max_depth
            crawl_results = await crawl_recursive_internal_links(crawler, [url], max_depth=max_depth, max_concurrent=max_concurrent)
            crawl_type = "webpage"

        if not crawl_results:
            return json.dumps({
                "success": False,
                "url": url,
                "error": "No content found"
            }, indent=2)

        # Chunk every crawled document and collect per-source statistics
        urls = []
        chunk_numbers = []
        contents = []
        metadatas = []
        chunk_count = 0

        # source_id -> first 5000 chars (for summary generation) / accumulated word count
        source_content_map = {}
        source_word_counts = {}

        for doc in crawl_results:
            source_url = doc['url']
            md = doc['markdown']
            chunks = smart_chunk_markdown(md, chunk_size=chunk_size)

            # Extract source_id (domain; path fallback for schemeless inputs)
            parsed_url = urlparse(source_url)
            source_id = parsed_url.netloc or parsed_url.path

            if source_id not in source_content_map:
                source_content_map[source_id] = md[:5000]  # first 5000 chars feed the summary
                source_word_counts[source_id] = 0

            for i, chunk in enumerate(chunks):
                urls.append(source_url)
                chunk_numbers.append(i)
                contents.append(chunk)

                # Per-chunk metadata for filtering/search
                meta = extract_section_info(chunk)
                meta["chunk_index"] = i
                meta["url"] = source_url
                meta["source"] = source_id
                meta["crawl_type"] = crawl_type
                meta["crawl_time"] = str(asyncio.current_task().get_coro().__name__)
                metadatas.append(meta)

                # Accumulate word count per source
                source_word_counts[source_id] += meta.get("word_count", 0)
                chunk_count += 1

        # Map each URL to its full document (used for contextual embedding)
        url_to_full_document = {doc['url']: doc['markdown'] for doc in crawl_results}

        # Update source information for each unique source FIRST (before inserting documents)
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            source_summary_args = [(source_id, content) for source_id, content in source_content_map.items()]
            source_summaries = list(executor.map(lambda args: extract_source_summary(args[0], args[1]), source_summary_args))

        for (source_id, _), summary in zip(source_summary_args, source_summaries):
            word_count = source_word_counts.get(source_id, 0)
            update_source_info(supabase_client, source_id, summary, word_count)

        # Add documentation chunks to Supabase (AFTER sources exist)
        batch_size = 20
        add_documents_to_supabase(supabase_client, urls, chunk_numbers, contents, metadatas, url_to_full_document, batch_size=batch_size)

        # Code examples are only extracted when agentic RAG is enabled.
        # BUGFIX: initialize before the feature gate so the success response
        # below never raises NameError on len(code_examples) when
        # USE_AGENTIC_RAG is disabled (previously the NameError was swallowed
        # by the outer except and the tool reported failure).
        code_examples = []
        extract_code_examples_enabled = os.getenv("USE_AGENTIC_RAG", "false") == "true"
        if extract_code_examples_enabled:
            code_urls = []
            code_chunk_numbers = []
            code_summaries = []
            code_metadatas = []

            # Extract code blocks from all documents
            for doc in crawl_results:
                source_url = doc['url']
                md = doc['markdown']
                code_blocks = extract_code_blocks(md)

                if code_blocks:
                    # Generate summaries in parallel
                    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
                        summary_args = [(block['code'], block['context_before'], block['context_after'])
                                        for block in code_blocks]
                        summaries = list(executor.map(process_code_example, summary_args))

                    parsed_url = urlparse(source_url)
                    source_id = parsed_url.netloc or parsed_url.path

                    for block, summary in zip(code_blocks, summaries):
                        code_urls.append(source_url)
                        code_chunk_numbers.append(len(code_examples))  # global code example index
                        code_examples.append(block['code'])
                        code_summaries.append(summary)

                        # Metadata mirrors the documentation-chunk schema
                        code_meta = {
                            "chunk_index": len(code_examples) - 1,
                            "url": source_url,
                            "source": source_id,
                            "char_count": len(block['code']),
                            "word_count": len(block['code'].split())
                        }
                        code_metadatas.append(code_meta)

            # Add all code examples to Supabase
            if code_examples:
                add_code_examples_to_supabase(
                    supabase_client,
                    code_urls,
                    code_chunk_numbers,
                    code_examples,
                    code_summaries,
                    code_metadatas,
                    batch_size=batch_size
                )

        return json.dumps({
            "success": True,
            "url": url,
            "crawl_type": crawl_type,
            "pages_crawled": len(crawl_results),
            "chunks_stored": chunk_count,
            "code_examples_stored": len(code_examples),
            "sources_updated": len(source_content_map),
            "urls_crawled": [doc['url'] for doc in crawl_results][:5] + (["..."] if len(crawl_results) > 5 else [])
        }, indent=2)
    except Exception as e:
        return json.dumps({
            "success": False,
            "url": url,
            "error": str(e)
        }, indent=2)
@mcp.tool()
async def get_available_sources(ctx: Context) -> str:
    """
    Get all available sources from the sources table.

    This tool returns a list of all unique sources (domains) that have been crawled and stored
    in the database, along with their summaries and statistics. This is useful for discovering
    what content is available for querying.

    Always use this tool before calling the RAG query or code example query tool
    with a specific source filter!

    Args:
        ctx: The MCP server provided context

    Returns:
        JSON string with the list of available sources and their details
    """
    try:
        # Supabase client lives on the lifespan context
        supabase_client = ctx.request_context.lifespan_context.supabase_client

        # One direct query against the sources table, ordered for stable output
        result = supabase_client.from_('sources').select('*').order('source_id').execute()

        # Project each row onto the fields the caller cares about;
        # .get() tolerates missing columns by mapping them to None.
        rows = result.data or []
        sources = [
            {
                "source_id": row.get("source_id"),
                "summary": row.get("summary"),
                "total_words": row.get("total_words"),
                "created_at": row.get("created_at"),
                "updated_at": row.get("updated_at")
            }
            for row in rows
        ]

        return json.dumps({
            "success": True,
            "sources": sources,
            "count": len(sources)
        }, indent=2)
    except Exception as e:
        return json.dumps({
            "success": False,
            "error": str(e)
        }, indent=2)
@mcp.tool()
async def perform_rag_query(ctx: Context, query: str, source: str = None, match_count: int = 5) -> str:
    """
    Perform a RAG (Retrieval Augmented Generation) query on the stored content.

    This tool searches the vector database for content relevant to the query and returns
    the matching documents. Optionally filter by source domain.
    Get the source by using the get_available_sources tool before calling this search!

    Args:
        ctx: The MCP server provided context
        query: The search query
        source: Optional source domain to filter results (e.g., 'example.com')
        match_count: Maximum number of results to return (default: 5)

    Returns:
        JSON string with the search results
    """
    try:
        # Get the Supabase client from the context
        supabase_client = ctx.request_context.lifespan_context.supabase_client

        # Hybrid search combines vector similarity with ILIKE keyword matching
        use_hybrid_search = os.getenv("USE_HYBRID_SEARCH", "false") == "true"

        # Only filter when a non-empty source was supplied
        filter_metadata = None
        if source and source.strip():
            filter_metadata = {"source": source}

        if use_hybrid_search:
            # 1. Vector search (fetch double to leave room for merging/filtering)
            vector_results = search_documents(
                client=supabase_client,
                query=query,
                match_count=match_count * 2,
                filter_metadata=filter_metadata
            )

            # 2. Keyword search using ILIKE on the chunk content
            keyword_query = supabase_client.from_('crawled_pages')\
                .select('id, url, chunk_number, content, metadata, source_id')\
                .ilike('content', f'%{query}%')
            if source and source.strip():
                keyword_query = keyword_query.eq('source_id', source)
            keyword_response = keyword_query.limit(match_count * 2).execute()
            keyword_results = keyword_response.data if keyword_response.data else []

            # 3. Combine results with preference for items appearing in both
            seen_ids = set()
            combined_results = []

            # Items found by both searches are the strongest matches: boost them
            vector_ids = {r.get('id') for r in vector_results if r.get('id')}
            for kr in keyword_results:
                if kr['id'] in vector_ids and kr['id'] not in seen_ids:
                    # Find the vector result to reuse its similarity score
                    for vr in vector_results:
                        if vr.get('id') == kr['id']:
                            vr['similarity'] = min(1.0, vr.get('similarity', 0) * 1.2)
                            combined_results.append(vr)
                            seen_ids.add(kr['id'])
                            break

            # Then remaining vector results (semantic matches without exact keyword)
            for vr in vector_results:
                if vr.get('id') and vr['id'] not in seen_ids and len(combined_results) < match_count:
                    combined_results.append(vr)
                    seen_ids.add(vr['id'])

            # Finally, pure keyword matches pad out the result set if still short
            for kr in keyword_results:
                if kr['id'] not in seen_ids and len(combined_results) < match_count:
                    # Convert keyword result to match vector result format
                    combined_results.append({
                        'id': kr['id'],
                        'url': kr['url'],
                        'chunk_number': kr['chunk_number'],
                        'content': kr['content'],
                        'metadata': kr['metadata'],
                        'source_id': kr['source_id'],
                        'similarity': 0.5  # Default similarity for keyword-only matches
                    })
                    seen_ids.add(kr['id'])

            results = combined_results[:match_count]
        else:
            # Standard vector search only
            results = search_documents(
                client=supabase_client,
                query=query,
                match_count=match_count,
                filter_metadata=filter_metadata
            )

        # Apply cross-encoder reranking if enabled and a model is loaded
        use_reranking = os.getenv("USE_RERANKING", "false") == "true"
        if use_reranking and ctx.request_context.lifespan_context.reranking_model:
            results = rerank_results(ctx.request_context.lifespan_context.reranking_model, query, results, content_key="content")

        # Format the results; source_id is included for parity with the
        # search_code_examples tool (previously omitted here only).
        formatted_results = []
        for result in results:
            formatted_result = {
                "url": result.get("url"),
                "content": result.get("content"),
                "metadata": result.get("metadata"),
                "source_id": result.get("source_id"),
                "similarity": result.get("similarity")
            }
            # Include rerank score if available
            if "rerank_score" in result:
                formatted_result["rerank_score"] = result["rerank_score"]
            formatted_results.append(formatted_result)

        return json.dumps({
            "success": True,
            "query": query,
            "source_filter": source,
            "search_mode": "hybrid" if use_hybrid_search else "vector",
            "reranking_applied": use_reranking and ctx.request_context.lifespan_context.reranking_model is not None,
            "results": formatted_results,
            "count": len(formatted_results)
        }, indent=2)
    except Exception as e:
        return json.dumps({
            "success": False,
            "query": query,
            "error": str(e)
        }, indent=2)
@mcp.tool()
async def search_code_examples(ctx: Context, query: str, source_id: str = None, match_count: int = 5) -> str:
    """
    Search for code examples relevant to the query.

    This tool searches the vector database for code examples relevant to the query and returns
    the matching examples with their summaries. Optionally filter by source_id.
    Get the source_id by using the get_available_sources tool before calling this search!

    Use the get_available_sources tool first to see what sources are available for filtering.

    Args:
        ctx: The MCP server provided context
        query: The search query
        source_id: Optional source ID to filter results (e.g., 'example.com')
        match_count: Maximum number of results to return (default: 5)

    Returns:
        JSON string with the search results
    """
    # Feature gate: code examples only exist when agentic RAG storage is on
    extract_code_examples_enabled = os.getenv("USE_AGENTIC_RAG", "false") == "true"
    if not extract_code_examples_enabled:
        return json.dumps({
            "success": False,
            "error": "Code example extraction is disabled. Perform a normal RAG search."
        }, indent=2)

    try:
        # Get the Supabase client from the context
        supabase_client = ctx.request_context.lifespan_context.supabase_client

        # Hybrid search combines vector similarity with ILIKE keyword matching
        use_hybrid_search = os.getenv("USE_HYBRID_SEARCH", "false") == "true"

        # Only filter when a non-empty source_id was supplied
        filter_metadata = None
        if source_id and source_id.strip():
            filter_metadata = {"source": source_id}

        # Import once for both search modes; aliased because this tool function
        # shadows the utils helper's name (was duplicated in each branch).
        from utils import search_code_examples as search_code_examples_impl

        if use_hybrid_search:
            # 1. Vector search (fetch double to leave room for merging/filtering)
            vector_results = search_code_examples_impl(
                client=supabase_client,
                query=query,
                match_count=match_count * 2,
                filter_metadata=filter_metadata
            )

            # 2. Keyword search using ILIKE on both content and summary
            keyword_query = supabase_client.from_('code_examples')\
                .select('id, url, chunk_number, content, summary, metadata, source_id')\
                .or_(f'content.ilike.%{query}%,summary.ilike.%{query}%')
            if source_id and source_id.strip():
                keyword_query = keyword_query.eq('source_id', source_id)
            keyword_response = keyword_query.limit(match_count * 2).execute()
            keyword_results = keyword_response.data if keyword_response.data else []

            # 3. Combine results with preference for items appearing in both
            seen_ids = set()
            combined_results = []

            # Items found by both searches are the strongest matches: boost them
            vector_ids = {r.get('id') for r in vector_results if r.get('id')}
            for kr in keyword_results:
                if kr['id'] in vector_ids and kr['id'] not in seen_ids:
                    # Find the vector result to reuse its similarity score
                    for vr in vector_results:
                        if vr.get('id') == kr['id']:
                            vr['similarity'] = min(1.0, vr.get('similarity', 0) * 1.2)
                            combined_results.append(vr)
                            seen_ids.add(kr['id'])
                            break

            # Then remaining vector results (semantic matches without exact keyword)
            for vr in vector_results:
                if vr.get('id') and vr['id'] not in seen_ids and len(combined_results) < match_count:
                    combined_results.append(vr)
                    seen_ids.add(vr['id'])

            # Finally, pure keyword matches pad out the result set if still short
            for kr in keyword_results:
                if kr['id'] not in seen_ids and len(combined_results) < match_count:
                    # Convert keyword result to match vector result format
                    combined_results.append({
                        'id': kr['id'],
                        'url': kr['url'],
                        'chunk_number': kr['chunk_number'],
                        'content': kr['content'],
                        'summary': kr['summary'],
                        'metadata': kr['metadata'],
                        'source_id': kr['source_id'],
                        'similarity': 0.5  # Default similarity for keyword-only matches
                    })
                    seen_ids.add(kr['id'])

            results = combined_results[:match_count]
        else:
            # Standard vector search only
            results = search_code_examples_impl(
                client=supabase_client,
                query=query,
                match_count=match_count,
                filter_metadata=filter_metadata
            )

        # Apply cross-encoder reranking if enabled and a model is loaded
        use_reranking = os.getenv("USE_RERANKING", "false") == "true"
        if use_reranking and ctx.request_context.lifespan_context.reranking_model:
            results = rerank_results(ctx.request_context.lifespan_context.reranking_model, query, results, content_key="content")

        # Format the results
        formatted_results = []
        for result in results:
            formatted_result = {
                "url": result.get("url"),
                "code": result.get("content"),
                "summary": result.get("summary"),
                "metadata": result.get("metadata"),
                "source_id": result.get("source_id"),
                "similarity": result.get("similarity")
            }
            # Include rerank score if available
            if "rerank_score" in result:
                formatted_result["rerank_score"] = result["rerank_score"]
            formatted_results.append(formatted_result)

        return json.dumps({
            "success": True,
            "query": query,
            "source_filter": source_id,
            "search_mode": "hybrid" if use_hybrid_search else "vector",
            "reranking_applied": use_reranking and ctx.request_context.lifespan_context.reranking_model is not None,
            "results": formatted_results,
            "count": len(formatted_results)
        }, indent=2)
    except Exception as e:
        return json.dumps({
            "success": False,
            "query": query,
            "error": str(e)
        }, indent=2)
@mcp.tool()
async def check_ai_script_hallucinations(ctx: Context, script_path: str) -> str:
    """
    Check an AI-generated Python script for hallucinations using the knowledge graph.

    This tool analyzes a Python script for potential AI hallucinations by validating
    imports, method calls, class instantiations, and function calls against a Neo4j
    knowledge graph containing real repository data.

    The tool performs comprehensive analysis including:
    - Import validation against known repositories
    - Method call validation on classes from the knowledge graph
    - Class instantiation parameter validation
    - Function call parameter validation
    - Attribute access validation

    Args:
        ctx: The MCP server provided context
        script_path: Absolute path to the Python script to analyze

    Returns:
        JSON string with hallucination detection results, confidence scores, and recommendations
    """
    try:
        # Guard: knowledge-graph tooling is an optional feature
        if os.getenv("USE_KNOWLEDGE_GRAPH", "false") != "true":
            return json.dumps({
                "success": False,
                "error": "Knowledge graph functionality is disabled. Set USE_KNOWLEDGE_GRAPH=true in environment."
            }, indent=2)

        # Guard: the validator is only present when Neo4j connected at startup
        knowledge_validator = ctx.request_context.lifespan_context.knowledge_validator
        if not knowledge_validator:
            return json.dumps({
                "success": False,
                "error": "Knowledge graph validator not available. Check Neo4j configuration in environment variables."
            }, indent=2)

        # Guard: reject missing or invalid script paths up front
        path_check = validate_script_path(script_path)
        if not path_check["valid"]:
            return json.dumps({
                "success": False,
                "script_path": script_path,
                "error": path_check["error"]
            }, indent=2)

        # Step 1: static AST analysis of the script's imports/calls/attributes
        analysis_result = AIScriptAnalyzer().analyze_script(script_path)
        if analysis_result.errors:
            print(f"Analysis warnings for {script_path}: {analysis_result.errors}")

        # Step 2: validate the findings against the Neo4j knowledge graph
        validation_result = await knowledge_validator.validate_script(analysis_result)

        # Step 3: build the comprehensive report
        report = HallucinationReporter().generate_comprehensive_report(validation_result)

        # Project the report's summary/metadata sections onto the response;
        # tuple order fixes the key order in the emitted JSON.
        summary_keys = (
            "total_validations", "valid_count", "invalid_count",
            "uncertain_count", "not_found_count", "hallucination_rate",
        )
        metadata_keys = (
            "total_imports", "total_classes", "total_methods",
            "total_attributes", "total_functions",
        )
        return json.dumps({
            "success": True,
            "script_path": script_path,
            "overall_confidence": validation_result.overall_confidence,
            "validation_summary": {k: report["validation_summary"][k] for k in summary_keys},
            "hallucinations_detected": report["hallucinations_detected"],
            "recommendations": report["recommendations"],
            "analysis_metadata": {k: report["analysis_metadata"][k] for k in metadata_keys},
            "libraries_analyzed": report.get("libraries_analyzed", [])
        }, indent=2)

    except Exception as e:
        return json.dumps({
            "success": False,
            "script_path": script_path,
            "error": f"Analysis failed: {str(e)}"
        }, indent=2)
@mcp.tool()
async def query_knowledge_graph(ctx: Context, command: str) -> str:
    """
    Query and explore the Neo4j knowledge graph containing repository data.

    This tool provides comprehensive access to the knowledge graph for exploring repositories,
    classes, methods, functions, and their relationships. Perfect for understanding what data
    is available for hallucination detection and debugging validation results.

    **⚠️ IMPORTANT: Always start with the `repos` command first!**
    Before using any other commands, run `repos` to see what repositories are available
    in your knowledge graph. This will help you understand what data you can explore.

    ## Available Commands:

    **Repository Commands:**
    - `repos` - **START HERE!** List all repositories in the knowledge graph
    - `explore <repo_name>` - Get detailed overview of a specific repository

    **Class Commands:**
    - `classes` - List all classes across all repositories (limited to 20)
    - `classes <repo_name>` - List classes in a specific repository
    - `class <class_name>` - Get detailed information about a specific class including methods and attributes

    **Method Commands:**
    - `method <method_name>` - Search for methods by name across all classes
    - `method <method_name> <class_name>` - Search for a method within a specific class

    **Custom Query:**
    - `query <cypher_query>` - Execute a custom Cypher query (results limited to 20 records)

    ## Knowledge Graph Schema:

    **Node Types:**
    - Repository: `(r:Repository {name: string})`
    - File: `(f:File {path: string, module_name: string})`
    - Class: `(c:Class {name: string, full_name: string})`
    - Method: `(m:Method {name: string, params_list: [string], params_detailed: [string], return_type: string, args: [string]})`
    - Function: `(func:Function {name: string, params_list: [string], params_detailed: [string], return_type: string, args: [string]})`
    - Attribute: `(a:Attribute {name: string, type: string})`

    **Relationships:**
    - `(r:Repository)-[:CONTAINS]->(f:File)`
    - `(f:File)-[:DEFINES]->(c:Class)`
    - `(c:Class)-[:HAS_METHOD]->(m:Method)`
    - `(c:Class)-[:HAS_ATTRIBUTE]->(a:Attribute)`
    - `(f:File)-[:DEFINES]->(func:Function)`

    ## Example Workflow:
    ```
    1. repos                                    # See what repositories are available
    2. explore pydantic-ai                      # Explore a specific repository
    3. classes pydantic-ai                      # List classes in that repository
    4. class Agent                              # Explore the Agent class
    5. method run_stream                        # Search for run_stream method
    6. method __init__ Agent                    # Find Agent constructor
    7. query "MATCH (c:Class)-[:HAS_METHOD]->(m:Method) WHERE m.name = 'run' RETURN c.name, m.name LIMIT 5"
    ```

    Args:
        ctx: The MCP server provided context
        command: Command string to execute (see available commands above)

    Returns:
        JSON string with query results, statistics, and metadata
    """
    try:
        # Knowledge graph support is opt-in via an environment flag.
        knowledge_graph_enabled = os.getenv("USE_KNOWLEDGE_GRAPH", "false") == "true"
        if not knowledge_graph_enabled:
            return json.dumps({
                "success": False,
                "error": "Knowledge graph functionality is disabled. Set USE_KNOWLEDGE_GRAPH=true in environment."
            }, indent=2)

        # Get Neo4j driver from the lifespan context set up at server start.
        repo_extractor = ctx.request_context.lifespan_context.repo_extractor
        if not repo_extractor or not repo_extractor.driver:
            return json.dumps({
                "success": False,
                "error": "Neo4j connection not available. Check Neo4j configuration in environment variables."
            }, indent=2)

        # Parse command
        command = command.strip()
        if not command:
            return json.dumps({
                "success": False,
                "command": "",
                "error": "Command cannot be empty. Available commands: repos, explore <repo>, classes [repo], class <name>, method <name> [class], query <cypher>"
            }, indent=2)

        parts = command.split()
        cmd = parts[0].lower()
        args = parts[1:] if len(parts) > 1 else []

        async with repo_extractor.driver.session() as session:
            # Route to appropriate handler
            if cmd == "repos":
                return await _handle_repos_command(session, command)
            elif cmd == "explore":
                if not args:
                    return json.dumps({
                        "success": False,
                        "command": command,
                        "error": "Repository name required. Usage: explore <repo_name>"
                    }, indent=2)
                return await _handle_explore_command(session, command, args[0])
            elif cmd == "classes":
                repo_name = args[0] if args else None
                return await _handle_classes_command(session, command, repo_name)
            elif cmd == "class":
                if not args:
                    return json.dumps({
                        "success": False,
                        "command": command,
                        "error": "Class name required. Usage: class <class_name>"
                    }, indent=2)
                return await _handle_class_command(session, command, args[0])
            elif cmd == "method":
                if not args:
                    return json.dumps({
                        "success": False,
                        "command": command,
                        "error": "Method name required. Usage: method <method_name> [class_name]"
                    }, indent=2)
                method_name = args[0]
                class_name = args[1] if len(args) > 1 else None
                return await _handle_method_command(session, command, method_name, class_name)
            elif cmd == "query":
                if not args:
                    return json.dumps({
                        "success": False,
                        "command": command,
                        "error": "Cypher query required. Usage: query <cypher_query>"
                    }, indent=2)
                # BUGFIX: the previous `" ".join(args)` collapsed every run of
                # whitespace to a single space, corrupting Cypher queries whose
                # quoted string literals contain multiple spaces or newlines.
                # Slice the raw command text after the keyword instead so the
                # query reaches Neo4j verbatim.
                cypher_query = command[len(parts[0]):].strip()
                return await _handle_query_command(session, command, cypher_query)
            else:
                return json.dumps({
                    "success": False,
                    "command": command,
                    "error": f"Unknown command '{cmd}'. Available commands: repos, explore <repo>, classes [repo], class <name>, method <name> [class], query <cypher>"
                }, indent=2)

    except Exception as e:
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Query execution failed: {str(e)}"
        }, indent=2)


async def _handle_repos_command(session, command: str) -> str:
    """Handle 'repos' command - list all repositories"""
    query = "MATCH (r:Repository) RETURN r.name as name ORDER BY r.name"
    result = await session.run(query)

    repos = []
    async for record in result:
        repos.append(record['name'])

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "repositories": repos
        },
        "metadata": {
            "total_results": len(repos),
            "limited": False
        }
    }, indent=2)
async def _handle_explore_command(session, command: str, repo_name: str) -> str:
    """Handle 'explore <repo>' command - return a statistics overview of one repository."""
    # Bail out early when the repository is not in the graph at all.
    repo_check_query = "MATCH (r:Repository {name: $repo_name}) RETURN r.name as name"
    result = await session.run(repo_check_query, repo_name=repo_name)
    if not await result.single():
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Repository '{repo_name}' not found in knowledge graph"
        }, indent=2)

    # (statistic key, result alias, count query) triples; executed in order so
    # the emitted JSON keeps the files/classes/functions/methods ordering.
    count_specs = [
        ("files", "file_count", """
    MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)
    RETURN count(f) as file_count
    """),
        ("classes", "class_count", """
    MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
    RETURN count(DISTINCT c) as class_count
    """),
        ("functions", "function_count", """
    MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(func:Function)
    RETURN count(DISTINCT func) as function_count
    """),
        ("methods", "method_count", """
    MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_METHOD]->(m:Method)
    RETURN count(DISTINCT m) as method_count
    """),
    ]

    statistics = {}
    for stat_key, alias, count_query in count_specs:
        result = await session.run(count_query, repo_name=repo_name)
        record = await result.single()
        statistics[stat_key] = record[alias]

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "repository": repo_name,
            "statistics": statistics
        },
        "metadata": {
            "total_results": 1,
            "limited": False
        }
    }, indent=2)
async def _handle_classes_command(session, command: str, repo_name: str = None) -> str:
    """Handle 'classes [repo]' command - list classes, optionally scoped to one repository."""
    # Hard cap on returned rows to keep responses small.
    limit = 20

    if repo_name:
        query = """
        MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
        RETURN c.name as name, c.full_name as full_name
        ORDER BY c.name
        LIMIT $limit
        """
        result = await session.run(query, repo_name=repo_name, limit=limit)
    else:
        query = """
        MATCH (c:Class)
        RETURN c.name as name, c.full_name as full_name
        ORDER BY c.name
        LIMIT $limit
        """
        result = await session.run(query, limit=limit)

    classes = [
        {'name': record['name'], 'full_name': record['full_name']}
        async for record in result
    ]

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "classes": classes,
            "repository_filter": repo_name
        },
        "metadata": {
            "total_results": len(classes),
            # Hitting the cap means there may be more classes than shown.
            "limited": len(classes) >= limit
        }
    }, indent=2)
async def _handle_class_command(session, command: str, class_name: str) -> str:
    """Handle 'class <name>' command - show the methods and attributes of one class."""
    # Resolve the class by either its simple or fully-qualified name.
    class_query = """
    MATCH (c:Class)
    WHERE c.name = $class_name OR c.full_name = $class_name
    RETURN c.name as name, c.full_name as full_name
    LIMIT 1
    """
    lookup = await session.run(class_query, class_name=class_name)
    class_record = await lookup.single()

    if class_record is None:
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Class '{class_name}' not found in knowledge graph"
        }, indent=2)

    methods_query = """
    MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
    WHERE c.name = $class_name OR c.full_name = $class_name
    RETURN m.name as name, m.params_list as params_list, m.params_detailed as params_detailed, m.return_type as return_type
    ORDER BY m.name
    """
    method_rows = await session.run(methods_query, class_name=class_name)
    # Prefer the detailed parameter descriptions, falling back to the simple list.
    methods = [
        {
            'name': row['name'],
            'parameters': row['params_detailed'] or row['params_list'] or [],
            'return_type': row['return_type'] or 'Any'
        }
        async for row in method_rows
    ]

    attributes_query = """
    MATCH (c:Class)-[:HAS_ATTRIBUTE]->(a:Attribute)
    WHERE c.name = $class_name OR c.full_name = $class_name
    RETURN a.name as name, a.type as type
    ORDER BY a.name
    """
    attribute_rows = await session.run(attributes_query, class_name=class_name)
    attributes = [
        {'name': row['name'], 'type': row['type'] or 'Any'}
        async for row in attribute_rows
    ]

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "class": {
                "name": class_record['name'],
                "full_name": class_record['full_name'],
                "methods": methods,
                "attributes": attributes
            }
        },
        "metadata": {
            "total_results": 1,
            "methods_count": len(methods),
            "attributes_count": len(attributes),
            "limited": False
        }
    }, indent=2)
async def _handle_method_command(session, command: str, method_name: str, class_name: str = None) -> str:
    """Handle 'method <name> [class]' command - find methods by name, optionally within a class."""
    if class_name:
        # Scoped search: exact method on a specific class (no LIMIT needed).
        query = """
        MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
        WHERE (c.name = $class_name OR c.full_name = $class_name)
        AND m.name = $method_name
        RETURN c.name as class_name, c.full_name as class_full_name,
               m.name as method_name, m.params_list as params_list,
               m.params_detailed as params_detailed, m.return_type as return_type, m.args as args
        """
        result = await session.run(query, class_name=class_name, method_name=method_name)
    else:
        # Global search across all classes, capped at 20 matches.
        query = """
        MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
        WHERE m.name = $method_name
        RETURN c.name as class_name, c.full_name as class_full_name,
               m.name as method_name, m.params_list as params_list,
               m.params_detailed as params_detailed, m.return_type as return_type, m.args as args
        ORDER BY c.name
        LIMIT 20
        """
        result = await session.run(query, method_name=method_name)

    methods = [
        {
            'class_name': record['class_name'],
            'class_full_name': record['class_full_name'],
            'method_name': record['method_name'],
            # Detailed parameter info wins over the simple list when present.
            'parameters': record['params_detailed'] or record['params_list'] or [],
            'return_type': record['return_type'] or 'Any',
            'legacy_args': record['args'] or []
        }
        async for record in result
    ]

    if not methods:
        scope = f" in class '{class_name}'" if class_name else ""
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Method '{method_name}'" + scope + " not found"
        }, indent=2)

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "methods": methods,
            "class_filter": class_name
        },
        "metadata": {
            "total_results": len(methods),
            # Only the global search is capped, so only it can be truncated.
            "limited": len(methods) >= 20 and not class_name
        }
    }, indent=2)
async def _handle_query_command(session, command: str, cypher_query: str) -> str:
    """Handle 'query <cypher>' command - run a user-supplied Cypher query, capped at 20 rows."""
    try:
        result = await session.run(cypher_query)

        # Collect at most 20 records to keep the response manageable.
        records = []
        async for record in result:
            records.append(dict(record))
            if len(records) >= 20:
                break

        return json.dumps({
            "success": True,
            "command": command,
            "data": {
                "query": cypher_query,
                "results": records
            },
            "metadata": {
                "total_results": len(records),
                "limited": len(records) >= 20
            }
        }, indent=2)

    except Exception as e:
        # Surface Cypher syntax/runtime errors back to the caller with the query.
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Cypher query error: {str(e)}",
            "data": {
                "query": cypher_query
            }
        }, indent=2)
@mcp.tool()
async def parse_github_repository(ctx: Context, repo_url: str) -> str:
    """
    Parse a GitHub repository into the Neo4j knowledge graph.

    This tool clones a GitHub repository, analyzes its Python files, and stores
    the code structure (classes, methods, functions, imports) in Neo4j for use
    in hallucination detection. The tool:

    - Clones the repository to a temporary location
    - Analyzes Python files to extract code structure
    - Stores classes, methods, functions, and imports in Neo4j
    - Provides detailed statistics about the parsing results
    - Automatically handles module name detection for imports

    Args:
        ctx: The MCP server provided context
        repo_url: GitHub repository URL (e.g., 'https://github.com/user/repo.git')

    Returns:
        JSON string with parsing results, statistics, and repository information
    """
    try:
        # Knowledge graph support is opt-in; refuse to run when disabled.
        if os.getenv("USE_KNOWLEDGE_GRAPH", "false") != "true":
            return json.dumps({
                "success": False,
                "error": "Knowledge graph functionality is disabled. Set USE_KNOWLEDGE_GRAPH=true in environment."
            }, indent=2)

        # The extractor is created during server startup and carried in the lifespan context.
        repo_extractor = ctx.request_context.lifespan_context.repo_extractor
        if not repo_extractor:
            return json.dumps({
                "success": False,
                "error": "Repository extractor not available. Check Neo4j configuration in environment variables."
            }, indent=2)

        # Reject malformed URLs before doing any expensive clone/analysis work.
        validation = validate_github_url(repo_url)
        if not validation["valid"]:
            return json.dumps({
                "success": False,
                "repo_url": repo_url,
                "error": validation["error"]
            }, indent=2)

        repo_name = validation["repo_name"]

        # Clone, analyze, and store the repository (all handled by the extractor).
        print(f"Starting repository analysis for: {repo_name}")
        await repo_extractor.analyze_repository(repo_url)
        print(f"Repository analysis completed for: {repo_name}")

        # Pull summary statistics back out of Neo4j for the response payload.
        async with repo_extractor.driver.session() as session:
            stats_query = """
            MATCH (r:Repository {name: $repo_name})
            OPTIONAL MATCH (r)-[:CONTAINS]->(f:File)
            OPTIONAL MATCH (f)-[:DEFINES]->(c:Class)
            OPTIONAL MATCH (c)-[:HAS_METHOD]->(m:Method)
            OPTIONAL MATCH (f)-[:DEFINES]->(func:Function)
            OPTIONAL MATCH (c)-[:HAS_ATTRIBUTE]->(a:Attribute)
            WITH r,
                 count(DISTINCT f) as files_count,
                 count(DISTINCT c) as classes_count,
                 count(DISTINCT m) as methods_count,
                 count(DISTINCT func) as functions_count,
                 count(DISTINCT a) as attributes_count

            // Get some sample module names
            OPTIONAL MATCH (r)-[:CONTAINS]->(sample_f:File)
            WITH r, files_count, classes_count, methods_count, functions_count, attributes_count,
                 collect(DISTINCT sample_f.module_name)[0..5] as sample_modules

            RETURN
                r.name as repo_name,
                files_count,
                classes_count,
                methods_count,
                functions_count,
                attributes_count,
                sample_modules
            """

            result = await session.run(stats_query, repo_name=repo_name)
            record = await result.single()

            if record is None:
                return json.dumps({
                    "success": False,
                    "repo_url": repo_url,
                    "error": f"Repository '{repo_name}' not found in database after parsing"
                }, indent=2)

            stats = {
                "repository": record['repo_name'],
                "files_processed": record['files_count'],
                "classes_created": record['classes_count'],
                "methods_created": record['methods_count'],
                "functions_created": record['functions_count'],
                "attributes_created": record['attributes_count'],
                "sample_modules": record['sample_modules'] or []
            }

        return json.dumps({
            "success": True,
            "repo_url": repo_url,
            "repo_name": repo_name,
            "message": f"Successfully parsed repository '{repo_name}' into knowledge graph",
            "statistics": stats,
            "ready_for_validation": True,
            "next_steps": [
                "Repository is now available for hallucination detection",
                f"Use check_ai_script_hallucinations to validate scripts against {repo_name}",
                "The knowledge graph contains classes, methods, and functions from this repository"
            ]
        }, indent=2)

    except Exception as e:
        return json.dumps({
            "success": False,
            "repo_url": repo_url,
            "error": f"Repository parsing failed: {str(e)}"
        }, indent=2)
async def crawl_markdown_file(crawler: AsyncWebCrawler, url: str) -> List[Dict[str, Any]]:
    """
    Crawl a .txt or markdown file.

    Args:
        crawler: AsyncWebCrawler instance
        url: URL of the file

    Returns:
        List of dictionaries with URL and markdown content
    """
    result = await crawler.arun(url=url, config=CrawlerRunConfig())

    # A crawl only counts when it succeeded AND produced markdown content.
    if not (result.success and result.markdown):
        print(f"Failed to crawl {url}: {result.error_message}")
        return []
    return [{'url': url, 'markdown': result.markdown}]

async def crawl_batch(crawler: AsyncWebCrawler, urls: List[str], max_concurrent: int = 10) -> List[Dict[str, Any]]:
    """
    Batch crawl multiple URLs in parallel.

    Args:
        crawler: AsyncWebCrawler instance
        urls: List of URLs to crawl
        max_concurrent: Maximum number of concurrent browser sessions

    Returns:
        List of dictionaries with URL and markdown content
    """
    # Always bypass the cache; stream=False collects results into one list.
    batch_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=False)
    # Dispatcher throttles concurrency and backs off when memory gets tight.
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,
        check_interval=1.0,
        max_session_permit=max_concurrent
    )

    crawl_results = await crawler.arun_many(urls=urls, config=batch_config, dispatcher=dispatcher)
    # Keep only successful crawls that actually produced markdown.
    return [
        {'url': page.url, 'markdown': page.markdown}
        for page in crawl_results
        if page.success and page.markdown
    ]
async def crawl_recursive_internal_links(crawler: AsyncWebCrawler, start_urls: List[str], max_depth: int = 3, max_concurrent: int = 10) -> List[Dict[str, Any]]:
    """
    Recursively crawl internal links from start URLs up to a maximum depth.

    Performs a breadth-first crawl: each depth level crawls every not-yet-visited
    URL discovered at the previous level, collecting the internal links of
    successful pages as the frontier for the next level.

    Args:
        crawler: AsyncWebCrawler instance
        start_urls: List of starting URLs
        max_depth: Maximum recursion depth
        max_concurrent: Maximum number of concurrent browser sessions

    Returns:
        List of dictionaries with URL and markdown content
    """
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=False)
    # Dispatcher throttles concurrency and backs off when memory gets tight.
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,
        check_interval=1.0,
        max_session_permit=max_concurrent
    )

    visited = set()

    def normalize_url(url):
        # Strip the #fragment so the same page isn't crawled once per anchor.
        return urldefrag(url)[0]

    current_urls = {normalize_url(u) for u in start_urls}
    results_all = []

    for depth in range(max_depth):
        # Every element of current_urls is already normalized (both the seed set
        # and next_level_urls below are built via normalize_url), so a plain
        # membership test suffices — the old code needlessly re-normalized each
        # URL twice per pass.
        urls_to_crawl = [url for url in current_urls if url not in visited]
        if not urls_to_crawl:
            break

        results = await crawler.arun_many(urls=urls_to_crawl, config=run_config, dispatcher=dispatcher)
        next_level_urls = set()

        for result in results:
            norm_url = normalize_url(result.url)
            # Mark even failed URLs as visited so they aren't retried at deeper levels.
            visited.add(norm_url)

            if result.success and result.markdown:
                results_all.append({'url': result.url, 'markdown': result.markdown})
                for link in result.links.get("internal", []):
                    next_url = normalize_url(link["href"])
                    if next_url not in visited:
                        next_level_urls.add(next_url)

        current_urls = next_level_urls

    return results_all

async def main():
    """Entry point: run the MCP server over the transport chosen via TRANSPORT (default: sse)."""
    transport = os.getenv("TRANSPORT", "sse")
    if transport == 'sse':
        # Run the MCP server with sse transport
        await mcp.run_sse_async()
    else:
        # Run the MCP server with stdio transport
        await mcp.run_stdio_async()

if __name__ == "__main__":
    asyncio.run(main())