This is page 7 of 7. Use http://codebase.md/chillbruhhh/crawl4ai-mcp?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .dockerignore
├── .env.example
├── .gitattributes
├── .gitignore
├── crawled_pages.sql
├── Dockerfile
├── knowledge_graphs
│ ├── ai_hallucination_detector.py
│ ├── ai_script_analyzer.py
│ ├── hallucination_reporter.py
│ ├── knowledge_graph_validator.py
│ ├── parse_repo_into_neo4j.py
│ ├── query_knowledge_graph.py
│ └── test_script.py
├── LICENSE
├── neo4j
│ └── docker-neo4j
│ ├── .github
│ │ └── ISSUE_TEMPLATE
│ │ └── bug_report.md
│ ├── .gitignore
│ ├── build-docker-image.sh
│ ├── build-utils-common-functions.sh
│ ├── COPYRIGHT
│ ├── DEVELOPMENT.md
│ ├── devenv
│ ├── devenv.local.template
│ ├── docker-image-src
│ │ ├── 2.3
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 3.0
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 3.1
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 3.2
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 3.3
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 3.4
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 3.5
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ ├── Dockerfile
│ │ │ │ └── neo4j-plugins.json
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 4.0
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ └── Dockerfile
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 4.1
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ └── Dockerfile
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 4.2
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ ├── Dockerfile
│ │ │ │ └── neo4j-plugins.json
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 4.3
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ ├── Dockerfile
│ │ │ │ └── neo4j-plugins.json
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 4.4
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ ├── Dockerfile-debian
│ │ │ │ ├── Dockerfile-ubi9
│ │ │ │ ├── neo4j-admin-report.sh
│ │ │ │ └── neo4j-plugins.json
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ ├── Dockerfile-debian
│ │ │ └── Dockerfile-ubi9
│ │ ├── 5
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ ├── Dockerfile-debian
│ │ │ │ ├── Dockerfile-ubi8
│ │ │ │ ├── Dockerfile-ubi9
│ │ │ │ ├── neo4j-admin-report.sh
│ │ │ │ └── neo4j-plugins.json
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ ├── Dockerfile-debian
│ │ │ ├── Dockerfile-ubi8
│ │ │ └── Dockerfile-ubi9
│ │ ├── calver
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ ├── Dockerfile-debian
│ │ │ │ ├── Dockerfile-ubi9
│ │ │ │ ├── neo4j-admin-report.sh
│ │ │ │ └── neo4j-plugins.json
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ ├── Dockerfile-debian
│ │ │ └── Dockerfile-ubi9
│ │ └── common
│ │ ├── semver.jq
│ │ └── utilities.sh
│ ├── generate-stub-plugin
│ │ ├── build.gradle.kts
│ │ ├── Dockerfile
│ │ ├── ExampleNeo4jPlugin.java
│ │ ├── Makefile
│ │ ├── README.md
│ │ └── settings.gradle.kts
│ ├── LICENSE
│ ├── Makefile
│ ├── pom.xml
│ ├── publish-neo4j-admin-image.sh
│ ├── publish-neo4j-admin-images.sh
│ ├── README.md
│ └── src
│ ├── main
│ │ └── resources
│ │ └── log4j.properties
│ └── test
│ ├── java
│ │ └── com
│ │ └── neo4j
│ │ └── docker
│ │ ├── coredb
│ │ │ ├── configurations
│ │ │ │ ├── Configuration.java
│ │ │ │ ├── Setting.java
│ │ │ │ ├── TestConfSettings.java
│ │ │ │ ├── TestExtendedConf.java
│ │ │ │ └── TestJVMAdditionalConfig.java
│ │ │ ├── plugins
│ │ │ │ ├── Neo4jPluginEnv.java
│ │ │ │ ├── StubPluginHelper.java
│ │ │ │ ├── TestBundledPluginInstallation.java
│ │ │ │ ├── TestPluginInstallation.java
│ │ │ │ └── TestSemVerPluginMatching.java
│ │ │ ├── TestAdminReport.java
│ │ │ ├── TestAuthentication.java
│ │ │ ├── TestBasic.java
│ │ │ ├── TestCausalCluster.java
│ │ │ ├── TestMounting.java
│ │ │ └── TestUpgrade.java
│ │ ├── neo4jadmin
│ │ │ ├── TestAdminBasic.java
│ │ │ ├── TestBackupRestore.java
│ │ │ ├── TestBackupRestore44.java
│ │ │ ├── TestDumpLoad.java
│ │ │ ├── TestDumpLoad44.java
│ │ │ └── TestReport.java
│ │ ├── TestDeprecationWarning.java
│ │ ├── TestDockerComposeSecrets.java
│ │ └── utils
│ │ ├── DatabaseIO.java
│ │ ├── HostFileHttpHandler.java
│ │ ├── HttpServerTestExtension.java
│ │ ├── Neo4jVersion.java
│ │ ├── Neo4jVersionTest.java
│ │ ├── Network.java
│ │ ├── SetContainerUser.java
│ │ ├── TemporaryFolderManager.java
│ │ ├── TemporaryFolderManagerTest.java
│ │ ├── TestSettings.java
│ │ └── WaitStrategies.java
│ └── resources
│ ├── causal-cluster-compose.yml
│ ├── confs
│ │ ├── before50
│ │ │ ├── ConfsNotOverridden.conf
│ │ │ ├── ConfsReplaced.conf
│ │ │ ├── EnterpriseOnlyNotOverwritten.conf
│ │ │ ├── EnvVarsOverride.conf
│ │ │ ├── ExtendedConf.conf
│ │ │ ├── InvalidExtendedConf.conf
│ │ │ ├── JvmAdditionalNotOverridden.conf
│ │ │ ├── NoNewline.conf
│ │ │ └── ReadConf.conf
│ │ ├── ConfsNotOverridden.conf
│ │ ├── ConfsReplaced.conf
│ │ ├── EnterpriseOnlyNotOverwritten.conf
│ │ ├── EnvVarsOverride.conf
│ │ ├── ExtendedConf.conf
│ │ ├── InvalidExtendedConf.conf
│ │ ├── JvmAdditionalNotOverridden.conf
│ │ ├── NoNewline.conf
│ │ └── ReadConf.conf
│ ├── dockersecrets
│ │ ├── container-compose-with-incorrect-secrets.yml
│ │ ├── container-compose-with-secrets-override.yml
│ │ ├── container-compose-with-secrets.yml
│ │ ├── simple-container-compose-with-external-file-var.yml
│ │ └── simple-container-compose.yml
│ ├── ha-cluster-compose.yml
│ └── stubplugin
│ └── myPlugin.jar
├── pyproject.toml
├── README.md
├── src
│ ├── crawl4ai_mcp.py
│ └── utils.py
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/knowledge_graphs/knowledge_graph_validator.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Knowledge Graph Validator
3 |
4 | Validates AI-generated code against Neo4j knowledge graph containing
5 | repository information. Checks imports, methods, attributes, and parameters.
6 | """
7 |
8 | import asyncio
9 | import logging
10 | from typing import Dict, List, Optional, Set, Tuple, Any
11 | from dataclasses import dataclass, field
12 | from enum import Enum
13 | from neo4j import AsyncGraphDatabase
14 |
15 | from ai_script_analyzer import (
16 | AnalysisResult, ImportInfo, MethodCall, AttributeAccess,
17 | FunctionCall, ClassInstantiation
18 | )
19 |
20 | logger = logging.getLogger(__name__)
21 |
22 |
class ValidationStatus(Enum):
    """Outcome categories for a single validation check."""
    VALID = "VALID"          # confirmed against the knowledge graph
    INVALID = "INVALID"      # contradicts the knowledge graph (likely hallucination)
    UNCERTAIN = "UNCERTAIN"  # cannot be confirmed either way (e.g. external library)
    NOT_FOUND = "NOT_FOUND"  # element was searched for but is absent from the graph
28 |
29 |
@dataclass
class ValidationResult:
    """Result of validating a single element"""
    status: ValidationStatus
    confidence: float  # 0.0 to 1.0; higher means more certain of the status
    message: str  # human-readable explanation of the outcome
    details: Dict[str, Any] = field(default_factory=dict)  # extra context (e.g. matched files)
    suggestions: List[str] = field(default_factory=list)  # e.g. similar method names
38 |
39 |
@dataclass
class ImportValidation:
    """Validation result for an import"""
    import_info: ImportInfo  # the import statement as parsed by the analyzer
    validation: ValidationResult
    available_classes: List[str] = field(default_factory=list)  # classes exported by the matched module
    available_functions: List[str] = field(default_factory=list)  # functions exported by the matched module
47 |
48 |
@dataclass
class MethodValidation:
    """Validation result for a method call"""
    method_call: MethodCall
    validation: ValidationResult
    expected_params: List[str] = field(default_factory=list)  # signature recorded in the graph
    actual_params: List[str] = field(default_factory=list)  # positional args + keyword names used
    parameter_validation: Optional[ValidationResult] = None  # None when parameters were not checked
57 |
58 |
@dataclass
class AttributeValidation:
    """Validation result for attribute access"""
    attribute_access: AttributeAccess
    validation: ValidationResult
    expected_type: Optional[str] = None  # declared type of the attribute, or "method" for decorator-style access
65 |
66 |
@dataclass
class FunctionValidation:
    """Validation result for function call"""
    function_call: FunctionCall
    validation: ValidationResult
    expected_params: List[str] = field(default_factory=list)  # signature recorded in the graph
    actual_params: List[str] = field(default_factory=list)  # positional args + keyword names used
    parameter_validation: Optional[ValidationResult] = None  # None when parameters were not checked
75 |
76 |
@dataclass
class ClassValidation:
    """Validation result for class instantiation"""
    class_instantiation: ClassInstantiation
    validation: ValidationResult
    constructor_params: List[str] = field(default_factory=list)  # __init__ signature recorded in the graph
    parameter_validation: Optional[ValidationResult] = None  # None when constructor params were not checked
84 |
85 |
@dataclass
class ScriptValidationResult:
    """Complete validation results for a script"""
    script_path: str  # path of the analyzed script
    analysis_result: AnalysisResult  # raw AST analysis the validations are based on
    import_validations: List[ImportValidation] = field(default_factory=list)
    class_validations: List[ClassValidation] = field(default_factory=list)
    method_validations: List[MethodValidation] = field(default_factory=list)
    attribute_validations: List[AttributeValidation] = field(default_factory=list)
    function_validations: List[FunctionValidation] = field(default_factory=list)
    overall_confidence: float = 0.0  # aggregate confidence across all validations
    hallucinations_detected: List[Dict[str, Any]] = field(default_factory=list)  # summarized likely hallucinations
98 |
99 |
100 | class KnowledgeGraphValidator:
101 | """Validates code against Neo4j knowledge graph"""
102 |
    def __init__(self, neo4j_uri: str, neo4j_user: str, neo4j_password: str):
        """Store connection settings; the driver is created lazily in initialize().

        Args:
            neo4j_uri: Bolt/Neo4j URI of the knowledge-graph database.
            neo4j_user: Username for authentication.
            neo4j_password: Password for authentication.
        """
        self.neo4j_uri = neo4j_uri
        self.neo4j_user = neo4j_user
        self.neo4j_password = neo4j_password
        self.driver = None  # set by initialize(); None until then

        # Cache for performance
        self.module_cache: Dict[str, List[str]] = {}  # module name -> matched file paths
        self.class_cache: Dict[str, Dict[str, Any]] = {}
        self.method_cache: Dict[str, List[Dict[str, Any]]] = {}
        self.repo_cache: Dict[str, str] = {}  # module_name -> repo_name
        self.knowledge_graph_modules: Set[str] = set()  # Track modules in knowledge graph
115 |
116 | async def initialize(self):
117 | """Initialize Neo4j connection"""
118 | self.driver = AsyncGraphDatabase.driver(
119 | self.neo4j_uri,
120 | auth=(self.neo4j_user, self.neo4j_password)
121 | )
122 | logger.info("Knowledge graph validator initialized")
123 |
124 | async def close(self):
125 | """Close Neo4j connection"""
126 | if self.driver:
127 | await self.driver.close()
128 |
    async def validate_script(self, analysis_result: AnalysisResult) -> ScriptValidationResult:
        """Validate entire script analysis against knowledge graph.

        Runs each validation pass in a fixed order: imports must come first
        because they populate `knowledge_graph_modules`, which the later
        passes consult to decide whether an element should be validated at
        all. Finishes by computing the aggregate confidence and the list of
        detected hallucinations.

        Args:
            analysis_result: Parsed AST information for one script.

        Returns:
            A ScriptValidationResult with per-element validations filled in.
        """
        result = ScriptValidationResult(
            script_path=analysis_result.file_path,
            analysis_result=analysis_result
        )

        # Validate imports first (builds context for other validations)
        result.import_validations = await self._validate_imports(analysis_result.imports)

        # Validate class instantiations
        result.class_validations = await self._validate_class_instantiations(
            analysis_result.class_instantiations
        )

        # Validate method calls
        result.method_validations = await self._validate_method_calls(
            analysis_result.method_calls
        )

        # Validate attribute accesses
        result.attribute_validations = await self._validate_attribute_accesses(
            analysis_result.attribute_accesses
        )

        # Validate function calls
        result.function_validations = await self._validate_function_calls(
            analysis_result.function_calls
        )

        # Calculate overall confidence and detect hallucinations
        result.overall_confidence = self._calculate_overall_confidence(result)
        result.hallucinations_detected = self._detect_hallucinations(result)

        return result
164 |
165 | async def _validate_imports(self, imports: List[ImportInfo]) -> List[ImportValidation]:
166 | """Validate all imports against knowledge graph"""
167 | validations = []
168 |
169 | for import_info in imports:
170 | validation = await self._validate_single_import(import_info)
171 | validations.append(validation)
172 |
173 | return validations
174 |
175 | async def _validate_single_import(self, import_info: ImportInfo) -> ImportValidation:
176 | """Validate a single import"""
177 | # Determine module to search for
178 | search_module = import_info.module if import_info.is_from_import else import_info.name
179 |
180 | # Check cache first
181 | if search_module in self.module_cache:
182 | available_files = self.module_cache[search_module]
183 | else:
184 | # Query Neo4j for matching modules
185 | available_files = await self._find_modules(search_module)
186 | self.module_cache[search_module] = available_files
187 |
188 | if available_files:
189 | # Get available classes and functions from the module
190 | classes, functions = await self._get_module_contents(search_module)
191 |
192 | # Track this module as being in the knowledge graph
193 | self.knowledge_graph_modules.add(search_module)
194 |
195 | # Also track the base module for "from X.Y.Z import ..." patterns
196 | if '.' in search_module:
197 | base_module = search_module.split('.')[0]
198 | self.knowledge_graph_modules.add(base_module)
199 |
200 | validation = ValidationResult(
201 | status=ValidationStatus.VALID,
202 | confidence=0.9,
203 | message=f"Module '{search_module}' found in knowledge graph",
204 | details={"matched_files": available_files, "in_knowledge_graph": True}
205 | )
206 |
207 | return ImportValidation(
208 | import_info=import_info,
209 | validation=validation,
210 | available_classes=classes,
211 | available_functions=functions
212 | )
213 | else:
214 | # External library - mark as such but don't treat as error
215 | validation = ValidationResult(
216 | status=ValidationStatus.UNCERTAIN,
217 | confidence=0.8, # High confidence it's external, not an error
218 | message=f"Module '{search_module}' is external (not in knowledge graph)",
219 | details={"could_be_external": True, "in_knowledge_graph": False}
220 | )
221 |
222 | return ImportValidation(
223 | import_info=import_info,
224 | validation=validation
225 | )
226 |
227 | async def _validate_class_instantiations(self, instantiations: List[ClassInstantiation]) -> List[ClassValidation]:
228 | """Validate class instantiations"""
229 | validations = []
230 |
231 | for instantiation in instantiations:
232 | validation = await self._validate_single_class_instantiation(instantiation)
233 | validations.append(validation)
234 |
235 | return validations
236 |
    async def _validate_single_class_instantiation(self, instantiation: ClassInstantiation) -> ClassValidation:
        """Validate a single class instantiation.

        Steps, in order: (1) skip classes that were not imported from the
        knowledge graph, (2) look the class up, (3) if found, check the
        constructor arguments against the recorded __init__ signature.
        Parameter failures downgrade an otherwise-found class to INVALID.
        """
        # Prefer the fully-qualified name resolved by the analyzer, if any.
        class_name = instantiation.full_class_name or instantiation.class_name

        # Skip validation for classes not from knowledge graph
        if not self._is_from_knowledge_graph(class_name):
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.8,
                message=f"Skipping validation: '{class_name}' is not from knowledge graph"
            )
            return ClassValidation(
                class_instantiation=instantiation,
                validation=validation
            )

        # Find class in knowledge graph
        class_info = await self._find_class(class_name)

        if not class_info:
            validation = ValidationResult(
                status=ValidationStatus.NOT_FOUND,
                confidence=0.2,
                message=f"Class '{class_name}' not found in knowledge graph"
            )
            return ClassValidation(
                class_instantiation=instantiation,
                validation=validation
            )

        # Check constructor parameters (look for __init__ method)
        init_method = await self._find_method(class_name, "__init__")

        if init_method:
            param_validation = self._validate_parameters(
                expected_params=init_method.get('params_list', []),
                provided_args=instantiation.args,
                provided_kwargs=instantiation.kwargs
            )
        else:
            # No recorded __init__: cannot check args, so stay UNCERTAIN.
            param_validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.5,
                message="Constructor parameters not found"
            )

        # Use parameter validation result if it failed
        if param_validation.status == ValidationStatus.INVALID:
            validation = ValidationResult(
                status=ValidationStatus.INVALID,
                confidence=param_validation.confidence,
                message=f"Class '{class_name}' found but has invalid constructor parameters: {param_validation.message}",
                suggestions=param_validation.suggestions
            )
        else:
            validation = ValidationResult(
                status=ValidationStatus.VALID,
                confidence=0.8,
                message=f"Class '{class_name}' found in knowledge graph"
            )

        return ClassValidation(
            class_instantiation=instantiation,
            validation=validation,
            parameter_validation=param_validation
        )
303 |
304 | async def _validate_method_calls(self, method_calls: List[MethodCall]) -> List[MethodValidation]:
305 | """Validate method calls"""
306 | validations = []
307 |
308 | for method_call in method_calls:
309 | validation = await self._validate_single_method_call(method_call)
310 | validations.append(validation)
311 |
312 | return validations
313 |
    async def _validate_single_method_call(self, method_call: MethodCall) -> MethodValidation:
        """Validate a single method call.

        Steps, in order: (1) bail out UNCERTAIN when the receiver's type is
        unknown, (2) skip types not imported from the knowledge graph,
        (3) look the method up on the class — suggesting similar names when
        missing — and (4) check call arguments against the recorded
        signature. Parameter failures downgrade a found method to INVALID.
        """
        class_type = method_call.object_type

        if not class_type:
            # Analyzer could not infer the receiver type; nothing to check against.
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.3,
                message=f"Cannot determine object type for '{method_call.object_name}'"
            )
            return MethodValidation(
                method_call=method_call,
                validation=validation
            )

        # Skip validation for classes not from knowledge graph
        if not self._is_from_knowledge_graph(class_type):
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.8,
                message=f"Skipping validation: '{class_type}' is not from knowledge graph"
            )
            return MethodValidation(
                method_call=method_call,
                validation=validation
            )

        # Find method in knowledge graph
        method_info = await self._find_method(class_type, method_call.method_name)

        if not method_info:
            # Check for similar method names
            similar_methods = await self._find_similar_methods(class_type, method_call.method_name)

            validation = ValidationResult(
                status=ValidationStatus.NOT_FOUND,
                confidence=0.1,
                message=f"Method '{method_call.method_name}' not found on class '{class_type}'",
                suggestions=similar_methods
            )
            return MethodValidation(
                method_call=method_call,
                validation=validation
            )

        # Validate parameters
        expected_params = method_info.get('params_list', [])
        param_validation = self._validate_parameters(
            expected_params=expected_params,
            provided_args=method_call.args,
            provided_kwargs=method_call.kwargs
        )

        # Use parameter validation result if it failed
        if param_validation.status == ValidationStatus.INVALID:
            validation = ValidationResult(
                status=ValidationStatus.INVALID,
                confidence=param_validation.confidence,
                message=f"Method '{method_call.method_name}' found but has invalid parameters: {param_validation.message}",
                suggestions=param_validation.suggestions
            )
        else:
            validation = ValidationResult(
                status=ValidationStatus.VALID,
                confidence=0.9,
                message=f"Method '{method_call.method_name}' found on class '{class_type}'"
            )

        return MethodValidation(
            method_call=method_call,
            validation=validation,
            expected_params=expected_params,
            actual_params=method_call.args + list(method_call.kwargs.keys()),
            parameter_validation=param_validation
        )
389 |
390 | async def _validate_attribute_accesses(self, attribute_accesses: List[AttributeAccess]) -> List[AttributeValidation]:
391 | """Validate attribute accesses"""
392 | validations = []
393 |
394 | for attr_access in attribute_accesses:
395 | validation = await self._validate_single_attribute_access(attr_access)
396 | validations.append(validation)
397 |
398 | return validations
399 |
    async def _validate_single_attribute_access(self, attr_access: AttributeAccess) -> AttributeValidation:
        """Validate a single attribute access.

        Looks the name up first as an attribute, then falls back to a method
        lookup — that order matters because decorator usages like
        `@agent.tool` appear as attribute accesses but resolve to methods.
        """
        class_type = attr_access.object_type

        if not class_type:
            # Analyzer could not infer the receiver type; nothing to check against.
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.3,
                message=f"Cannot determine object type for '{attr_access.object_name}'"
            )
            return AttributeValidation(
                attribute_access=attr_access,
                validation=validation
            )

        # Skip validation for classes not from knowledge graph
        if not self._is_from_knowledge_graph(class_type):
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.8,
                message=f"Skipping validation: '{class_type}' is not from knowledge graph"
            )
            return AttributeValidation(
                attribute_access=attr_access,
                validation=validation
            )

        # Find attribute in knowledge graph
        attr_info = await self._find_attribute(class_type, attr_access.attribute_name)

        if not attr_info:
            # If not found as attribute, check if it's a method (for decorators like @agent.tool)
            method_info = await self._find_method(class_type, attr_access.attribute_name)

            if method_info:
                validation = ValidationResult(
                    status=ValidationStatus.VALID,
                    confidence=0.8,
                    message=f"'{attr_access.attribute_name}' found as method on class '{class_type}' (likely used as decorator)"
                )
                return AttributeValidation(
                    attribute_access=attr_access,
                    validation=validation,
                    expected_type="method"
                )

            validation = ValidationResult(
                status=ValidationStatus.NOT_FOUND,
                confidence=0.2,
                message=f"'{attr_access.attribute_name}' not found on class '{class_type}'"
            )
            return AttributeValidation(
                attribute_access=attr_access,
                validation=validation
            )

        validation = ValidationResult(
            status=ValidationStatus.VALID,
            confidence=0.8,
            message=f"Attribute '{attr_access.attribute_name}' found on class '{class_type}'"
        )

        return AttributeValidation(
            attribute_access=attr_access,
            validation=validation,
            expected_type=attr_info.get('type')
        )
467 |
468 | async def _validate_function_calls(self, function_calls: List[FunctionCall]) -> List[FunctionValidation]:
469 | """Validate function calls"""
470 | validations = []
471 |
472 | for func_call in function_calls:
473 | validation = await self._validate_single_function_call(func_call)
474 | validations.append(validation)
475 |
476 | return validations
477 |
    async def _validate_single_function_call(self, func_call: FunctionCall) -> FunctionValidation:
        """Validate a single function call.

        Only fully-qualified names are screened against the knowledge-graph
        module set; bare names are always looked up. Parameter failures
        downgrade a found function to INVALID.
        """
        # Prefer the fully-qualified name resolved by the analyzer, if any.
        func_name = func_call.full_name or func_call.function_name

        # Skip validation for functions not from knowledge graph
        if func_call.full_name and not self._is_from_knowledge_graph(func_call.full_name):
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.8,
                message=f"Skipping validation: '{func_name}' is not from knowledge graph"
            )
            return FunctionValidation(
                function_call=func_call,
                validation=validation
            )

        # Find function in knowledge graph
        func_info = await self._find_function(func_name)

        if not func_info:
            validation = ValidationResult(
                status=ValidationStatus.NOT_FOUND,
                confidence=0.2,
                message=f"Function '{func_name}' not found in knowledge graph"
            )
            return FunctionValidation(
                function_call=func_call,
                validation=validation
            )

        # Validate parameters
        expected_params = func_info.get('params_list', [])
        param_validation = self._validate_parameters(
            expected_params=expected_params,
            provided_args=func_call.args,
            provided_kwargs=func_call.kwargs
        )

        # Use parameter validation result if it failed
        if param_validation.status == ValidationStatus.INVALID:
            validation = ValidationResult(
                status=ValidationStatus.INVALID,
                confidence=param_validation.confidence,
                message=f"Function '{func_name}' found but has invalid parameters: {param_validation.message}",
                suggestions=param_validation.suggestions
            )
        else:
            validation = ValidationResult(
                status=ValidationStatus.VALID,
                confidence=0.8,
                message=f"Function '{func_name}' found in knowledge graph"
            )

        return FunctionValidation(
            function_call=func_call,
            validation=validation,
            expected_params=expected_params,
            actual_params=func_call.args + list(func_call.kwargs.keys()),
            parameter_validation=param_validation
        )
538 |
539 | def _validate_parameters(self, expected_params: List[str], provided_args: List[str],
540 | provided_kwargs: Dict[str, str]) -> ValidationResult:
541 | """Validate function/method parameters with comprehensive support"""
542 | if not expected_params:
543 | return ValidationResult(
544 | status=ValidationStatus.UNCERTAIN,
545 | confidence=0.5,
546 | message="Parameter information not available"
547 | )
548 |
549 | # Parse expected parameters - handle detailed format
550 | required_positional = []
551 | optional_positional = []
552 | keyword_only_required = []
553 | keyword_only_optional = []
554 | has_varargs = False
555 | has_varkwargs = False
556 |
557 | for param in expected_params:
558 | # Handle detailed format: "[keyword_only] name:type=default" or "name:type"
559 | param_clean = param.strip()
560 |
561 | # Check for parameter kind prefix
562 | kind = 'positional'
563 | if param_clean.startswith('['):
564 | end_bracket = param_clean.find(']')
565 | if end_bracket > 0:
566 | kind = param_clean[1:end_bracket]
567 | param_clean = param_clean[end_bracket+1:].strip()
568 |
569 | # Check for varargs/varkwargs
570 | if param_clean.startswith('*') and not param_clean.startswith('**'):
571 | has_varargs = True
572 | continue
573 | elif param_clean.startswith('**'):
574 | has_varkwargs = True
575 | continue
576 |
577 | # Parse name and check if optional
578 | if ':' in param_clean:
579 | param_name = param_clean.split(':')[0]
580 | is_optional = '=' in param_clean
581 |
582 | if kind == 'keyword_only':
583 | if is_optional:
584 | keyword_only_optional.append(param_name)
585 | else:
586 | keyword_only_required.append(param_name)
587 | else: # positional
588 | if is_optional:
589 | optional_positional.append(param_name)
590 | else:
591 | required_positional.append(param_name)
592 |
593 | # Count provided parameters
594 | provided_positional_count = len(provided_args)
595 | provided_keyword_names = set(provided_kwargs.keys())
596 |
597 | # Validate positional arguments
598 | min_required_positional = len(required_positional)
599 | max_allowed_positional = len(required_positional) + len(optional_positional)
600 |
601 | if not has_varargs and provided_positional_count > max_allowed_positional:
602 | return ValidationResult(
603 | status=ValidationStatus.INVALID,
604 | confidence=0.8,
605 | message=f"Too many positional arguments: provided {provided_positional_count}, max allowed {max_allowed_positional}"
606 | )
607 |
608 | if provided_positional_count < min_required_positional:
609 | return ValidationResult(
610 | status=ValidationStatus.INVALID,
611 | confidence=0.8,
612 | message=f"Too few positional arguments: provided {provided_positional_count}, required {min_required_positional}"
613 | )
614 |
615 | # Validate keyword arguments
616 | all_valid_kwarg_names = set(required_positional + optional_positional + keyword_only_required + keyword_only_optional)
617 | invalid_kwargs = provided_keyword_names - all_valid_kwarg_names
618 |
619 | if invalid_kwargs and not has_varkwargs:
620 | return ValidationResult(
621 | status=ValidationStatus.INVALID,
622 | confidence=0.7,
623 | message=f"Invalid keyword arguments: {list(invalid_kwargs)}",
624 | suggestions=[f"Valid parameters: {list(all_valid_kwarg_names)}"]
625 | )
626 |
627 | # Check required keyword-only arguments
628 | missing_required_kwargs = set(keyword_only_required) - provided_keyword_names
629 | if missing_required_kwargs:
630 | return ValidationResult(
631 | status=ValidationStatus.INVALID,
632 | confidence=0.8,
633 | message=f"Missing required keyword arguments: {list(missing_required_kwargs)}"
634 | )
635 |
636 | return ValidationResult(
637 | status=ValidationStatus.VALID,
638 | confidence=0.9,
639 | message="Parameters are valid"
640 | )
641 |
642 | # Neo4j Query Methods
643 |
    async def _find_modules(self, module_name: str) -> List[str]:
        """Find repository matching the module name, then return its files.

        Two-stage lookup: (1) match File.module_name (exact, dotted-prefix,
        or top-level segment), (2) fall back to fuzzy Repository.name
        matching with '-'/'_' swapped. Module-based matches are preferred;
        up to 50 file paths from the single best repository are returned,
        or [] when nothing matches.
        """
        async with self.driver.session() as session:
            # First, try to find files with module names that match or start with the search term
            module_query = """
            MATCH (r:Repository)-[:CONTAINS]->(f:File)
            WHERE f.module_name = $module_name
               OR f.module_name STARTS WITH $module_name + '.'
               OR split(f.module_name, '.')[0] = $module_name
            RETURN DISTINCT r.name as repo_name, count(f) as file_count
            ORDER BY file_count DESC
            LIMIT 5
            """

            result = await session.run(module_query, module_name=module_name)
            repos_from_modules = []
            async for record in result:
                repos_from_modules.append(record['repo_name'])

            # Also try repository name matching as fallback
            repo_query = """
            MATCH (r:Repository)
            WHERE toLower(r.name) = toLower($module_name)
               OR toLower(replace(r.name, '-', '_')) = toLower($module_name)
               OR toLower(replace(r.name, '_', '-')) = toLower($module_name)
            RETURN r.name as repo_name
            ORDER BY
                CASE
                    WHEN toLower(r.name) = toLower($module_name) THEN 1
                    WHEN toLower(replace(r.name, '-', '_')) = toLower($module_name) THEN 2
                    WHEN toLower(replace(r.name, '_', '-')) = toLower($module_name) THEN 3
                END
            LIMIT 5
            """

            result = await session.run(repo_query, module_name=module_name)
            repos_from_names = []
            async for record in result:
                repos_from_names.append(record['repo_name'])

            # Combine results, prioritizing module-based matches
            all_repos = repos_from_modules + [r for r in repos_from_names if r not in repos_from_modules]

            if not all_repos:
                return []

            # Get files from the best matching repository
            best_repo = all_repos[0]
            files_query = """
            MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)
            RETURN f.path, f.module_name
            LIMIT 50
            """

            result = await session.run(files_query, repo_name=best_repo)
            files = []
            async for record in result:
                # Un-aliased RETURN items are keyed by their expression text ("f.path").
                files.append(record['f.path'])

            return files
704 |
    async def _get_module_contents(self, module_name: str) -> Tuple[List[str], List[str]]:
        """Get classes and functions available in a repository matching the module name.

        Resolves the repository the same way as _find_modules (module-name
        match first, fuzzy repository-name match as fallback), then returns
        the distinct class names and function names defined across that
        repository's files. Returns ([], []) when no repository matches.
        """
        async with self.driver.session() as session:
            # First, try to find repository by module names in files
            module_query = """
            MATCH (r:Repository)-[:CONTAINS]->(f:File)
            WHERE f.module_name = $module_name
               OR f.module_name STARTS WITH $module_name + '.'
               OR split(f.module_name, '.')[0] = $module_name
            RETURN DISTINCT r.name as repo_name, count(f) as file_count
            ORDER BY file_count DESC
            LIMIT 1
            """

            result = await session.run(module_query, module_name=module_name)
            record = await result.single()

            if record:
                repo_name = record['repo_name']
            else:
                # Fallback to repository name matching
                repo_query = """
                MATCH (r:Repository)
                WHERE toLower(r.name) = toLower($module_name)
                   OR toLower(replace(r.name, '-', '_')) = toLower($module_name)
                   OR toLower(replace(r.name, '_', '-')) = toLower($module_name)
                RETURN r.name as repo_name
                ORDER BY
                    CASE
                        WHEN toLower(r.name) = toLower($module_name) THEN 1
                        WHEN toLower(replace(r.name, '-', '_')) = toLower($module_name) THEN 2
                        WHEN toLower(replace(r.name, '_', '-')) = toLower($module_name) THEN 3
                    END
                LIMIT 1
                """

                result = await session.run(repo_query, module_name=module_name)
                record = await result.single()

                if not record:
                    return [], []

                repo_name = record['repo_name']

            # Get classes from this repository
            class_query = """
            MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
            RETURN DISTINCT c.name as class_name
            """

            result = await session.run(class_query, repo_name=repo_name)
            classes = []
            async for record in result:
                classes.append(record['class_name'])

            # Get functions from this repository
            func_query = """
            MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(func:Function)
            RETURN DISTINCT func.name as function_name
            """

            result = await session.run(func_query, repo_name=repo_name)
            functions = []
            async for record in result:
                functions.append(record['function_name'])

            return classes, functions
772 |
    async def _find_repository_for_module(self, module_name: str) -> Optional[str]:
        """Find the repository name that matches a module name.

        Results are memoized in ``self.repo_cache`` — including misses
        (stored as None) — so repeated lookups during one validation run hit
        the database at most once per module.

        Args:
            module_name: Module name to resolve (e.g. "pydantic_ai").

        Returns:
            The matching repository name, or None when no repository matches.
        """
        if module_name in self.repo_cache:
            return self.repo_cache[module_name]

        async with self.driver.session() as session:
            # First, try to find repository by module names in files
            module_query = """
            MATCH (r:Repository)-[:CONTAINS]->(f:File)
            WHERE f.module_name = $module_name
            OR f.module_name STARTS WITH $module_name + '.'
            OR split(f.module_name, '.')[0] = $module_name
            RETURN DISTINCT r.name as repo_name, count(f) as file_count
            ORDER BY file_count DESC
            LIMIT 1
            """

            result = await session.run(module_query, module_name=module_name)
            record = await result.single()

            if record:
                repo_name = record['repo_name']
            else:
                # Fallback to repository name matching. Looser than the file
                # match above: also allows substring containment either way.
                query = """
                MATCH (r:Repository)
                WHERE toLower(r.name) = toLower($module_name)
                OR toLower(replace(r.name, '-', '_')) = toLower($module_name)
                OR toLower(replace(r.name, '_', '-')) = toLower($module_name)
                OR toLower(r.name) CONTAINS toLower($module_name)
                OR toLower($module_name) CONTAINS toLower(replace(r.name, '-', '_'))
                RETURN r.name as repo_name
                ORDER BY
                CASE
                WHEN toLower(r.name) = toLower($module_name) THEN 1
                WHEN toLower(replace(r.name, '-', '_')) = toLower($module_name) THEN 2
                ELSE 3
                END
                LIMIT 1
                """

                result = await session.run(query, module_name=module_name)
                record = await result.single()

                repo_name = record['repo_name'] if record else None

            # Cache even a miss so unknown modules are not re-queried.
            self.repo_cache[module_name] = repo_name
            return repo_name
821 |
    async def _find_class(self, class_name: str) -> Optional[Dict[str, Any]]:
        """Find class information in knowledge graph.

        Tries an exact match on ``name``/``full_name`` first; for dotted
        names (e.g. "pydantic_ai.Agent") it then resolves the module part to
        a repository and searches for the bare class name inside it.

        Args:
            class_name: Plain or dotted class name from the analyzed script.

        Returns:
            Dict with 'name' and 'full_name', or None when unknown.
        """
        async with self.driver.session() as session:
            # First try exact match
            query = """
            MATCH (c:Class)
            WHERE c.name = $class_name OR c.full_name = $class_name
            RETURN c.name as name, c.full_name as full_name
            LIMIT 1
            """

            result = await session.run(query, class_name=class_name)
            record = await result.single()

            if record:
                return {
                    'name': record['name'],
                    'full_name': record['full_name']
                }

            # If no exact match and class_name has dots, try repository-based search
            if '.' in class_name:
                parts = class_name.split('.')
                module_part = '.'.join(parts[:-1])  # e.g., "pydantic_ai"
                class_part = parts[-1]  # e.g., "Agent"

                # Find repository for the module
                repo_name = await self._find_repository_for_module(module_part)

                if repo_name:
                    # Search for class within this repository
                    repo_query = """
                    MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
                    WHERE c.name = $class_name
                    RETURN c.name as name, c.full_name as full_name
                    LIMIT 1
                    """

                    result = await session.run(repo_query, repo_name=repo_name, class_name=class_part)
                    record = await result.single()

                    if record:
                        return {
                            'name': record['name'],
                            'full_name': record['full_name']
                        }

            return None
870 |
    async def _find_method(self, class_name: str, method_name: str) -> Optional[Dict[str, Any]]:
        """Find method information for a class.

        Lookup order: exact class name/full_name match, then (for dotted
        class names) a repository-scoped search. Hits and misses are both
        memoized in ``self.method_cache`` keyed by "Class.method"; a miss is
        stored as an empty list.

        Returns:
            Dict with 'name', 'params_list', 'return_type' and 'args', or
            None when the method is unknown.
        """
        cache_key = f"{class_name}.{method_name}"
        if cache_key in self.method_cache:
            methods = self.method_cache[cache_key]
            return methods[0] if methods else None

        async with self.driver.session() as session:
            # First try exact match
            query = """
            MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
            WHERE (c.name = $class_name OR c.full_name = $class_name)
            AND m.name = $method_name
            RETURN m.name as name, m.params_list as params_list, m.params_detailed as params_detailed,
            m.return_type as return_type, m.args as args
            LIMIT 1
            """

            result = await session.run(query, class_name=class_name, method_name=method_name)
            record = await result.single()

            if record:
                # Use detailed params if available, fall back to simple params
                params_to_use = record['params_detailed'] or record['params_list'] or []

                method_info = {
                    'name': record['name'],
                    'params_list': params_to_use,
                    'return_type': record['return_type'],
                    'args': record['args'] or []
                }
                self.method_cache[cache_key] = [method_info]
                return method_info

            # If no exact match and class_name has dots, try repository-based search
            if '.' in class_name:
                parts = class_name.split('.')
                module_part = '.'.join(parts[:-1])  # e.g., "pydantic_ai"
                class_part = parts[-1]  # e.g., "Agent"

                # Find repository for the module
                repo_name = await self._find_repository_for_module(module_part)

                if repo_name:
                    # Search for method within this repository's classes
                    repo_query = """
                    MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_METHOD]->(m:Method)
                    WHERE c.name = $class_name AND m.name = $method_name
                    RETURN m.name as name, m.params_list as params_list, m.params_detailed as params_detailed,
                    m.return_type as return_type, m.args as args
                    LIMIT 1
                    """

                    result = await session.run(repo_query, repo_name=repo_name, class_name=class_part, method_name=method_name)
                    record = await result.single()

                    if record:
                        # Use detailed params if available, fall back to simple params
                        params_to_use = record['params_detailed'] or record['params_list'] or []

                        method_info = {
                            'name': record['name'],
                            'params_list': params_to_use,
                            'return_type': record['return_type'],
                            'args': record['args'] or []
                        }
                        self.method_cache[cache_key] = [method_info]
                        return method_info

            # Negative-cache the miss so repeated calls skip the database.
            self.method_cache[cache_key] = []
            return None
942 |
    async def _find_attribute(self, class_name: str, attr_name: str) -> Optional[Dict[str, Any]]:
        """Find attribute information for a class.

        Same two-step strategy as ``_find_method``: exact class match first,
        then a repository-scoped search for dotted class names. No caching.

        Returns:
            Dict with 'name' and 'type', or None when the attribute is unknown.
        """
        async with self.driver.session() as session:
            # First try exact match
            query = """
            MATCH (c:Class)-[:HAS_ATTRIBUTE]->(a:Attribute)
            WHERE (c.name = $class_name OR c.full_name = $class_name)
            AND a.name = $attr_name
            RETURN a.name as name, a.type as type
            LIMIT 1
            """

            result = await session.run(query, class_name=class_name, attr_name=attr_name)
            record = await result.single()

            if record:
                return {
                    'name': record['name'],
                    'type': record['type']
                }

            # If no exact match and class_name has dots, try repository-based search
            if '.' in class_name:
                parts = class_name.split('.')
                module_part = '.'.join(parts[:-1])  # e.g., "pydantic_ai"
                class_part = parts[-1]  # e.g., "Agent"

                # Find repository for the module
                repo_name = await self._find_repository_for_module(module_part)

                if repo_name:
                    # Search for attribute within this repository's classes
                    repo_query = """
                    MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_ATTRIBUTE]->(a:Attribute)
                    WHERE c.name = $class_name AND a.name = $attr_name
                    RETURN a.name as name, a.type as type
                    LIMIT 1
                    """

                    result = await session.run(repo_query, repo_name=repo_name, class_name=class_part, attr_name=attr_name)
                    record = await result.single()

                    if record:
                        return {
                            'name': record['name'],
                            'type': record['type']
                        }

            return None
992 |
    async def _find_function(self, func_name: str) -> Optional[Dict[str, Any]]:
        """Find function information.

        Exact match on ``name``/``full_name`` first; for dotted names the
        module part is resolved to a repository and the bare function name is
        searched within it.

        Returns:
            Dict with 'name', 'params_list', 'return_type' and 'args', or
            None when the function is unknown.
        """
        async with self.driver.session() as session:
            # First try exact match
            query = """
            MATCH (f:Function)
            WHERE f.name = $func_name OR f.full_name = $func_name
            RETURN f.name as name, f.params_list as params_list, f.params_detailed as params_detailed,
            f.return_type as return_type, f.args as args
            LIMIT 1
            """

            result = await session.run(query, func_name=func_name)
            record = await result.single()

            if record:
                # Use detailed params if available, fall back to simple params
                params_to_use = record['params_detailed'] or record['params_list'] or []

                return {
                    'name': record['name'],
                    'params_list': params_to_use,
                    'return_type': record['return_type'],
                    'args': record['args'] or []
                }

            # If no exact match and func_name has dots, try repository-based search
            if '.' in func_name:
                parts = func_name.split('.')
                module_part = '.'.join(parts[:-1])  # e.g., "pydantic_ai"
                func_part = parts[-1]  # e.g., "some_function"

                # Find repository for the module
                repo_name = await self._find_repository_for_module(module_part)

                if repo_name:
                    # Search for function within this repository
                    repo_query = """
                    MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(func:Function)
                    WHERE func.name = $func_name
                    RETURN func.name as name, func.params_list as params_list, func.params_detailed as params_detailed,
                    func.return_type as return_type, func.args as args
                    LIMIT 1
                    """

                    result = await session.run(repo_query, repo_name=repo_name, func_name=func_part)
                    record = await result.single()

                    if record:
                        # Use detailed params if available, fall back to simple params
                        params_to_use = record['params_detailed'] or record['params_list'] or []

                        return {
                            'name': record['name'],
                            'params_list': params_to_use,
                            'return_type': record['return_type'],
                            'args': record['args'] or []
                        }

            return None
1053 |
    async def _find_pydantic_ai_result_method(self, method_name: str) -> Optional[Dict[str, Any]]:
        """Find method information for pydantic_ai result objects.

        Special-case lookup for methods called on values returned by agent
        runs: searches classes in the hard-coded "pydantic_ai" repository
        whose names contain Result/Stream/Run.

        Returns:
            Method info dict (including the 'source_class' it was found on),
            or None.
        """
        # Look for methods on pydantic_ai classes that could be result objects
        async with self.driver.session() as session:
            # Search for common result methods in pydantic_ai repository
            query = """
            MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_METHOD]->(m:Method)
            WHERE m.name = $method_name
            AND (c.name CONTAINS 'Result' OR c.name CONTAINS 'Stream' OR c.name CONTAINS 'Run')
            RETURN m.name as name, m.params_list as params_list, m.params_detailed as params_detailed,
            m.return_type as return_type, m.args as args, c.name as class_name
            LIMIT 1
            """

            result = await session.run(query, repo_name="pydantic_ai", method_name=method_name)
            record = await result.single()

            if record:
                # Use detailed params if available, fall back to simple params
                params_to_use = record['params_detailed'] or record['params_list'] or []

                return {
                    'name': record['name'],
                    'params_list': params_to_use,
                    'return_type': record['return_type'],
                    'args': record['args'] or [],
                    'source_class': record['class_name']
                }

            return None
1084 |
1085 | async def _find_similar_modules(self, module_name: str) -> List[str]:
1086 | """Find similar repository names for suggestions"""
1087 | async with self.driver.session() as session:
1088 | query = """
1089 | MATCH (r:Repository)
1090 | WHERE toLower(r.name) CONTAINS toLower($partial_name)
1091 | OR toLower(replace(r.name, '-', '_')) CONTAINS toLower($partial_name)
1092 | OR toLower(replace(r.name, '_', '-')) CONTAINS toLower($partial_name)
1093 | RETURN r.name
1094 | LIMIT 5
1095 | """
1096 |
1097 | result = await session.run(query, partial_name=module_name[:3])
1098 | suggestions = []
1099 | async for record in result:
1100 | suggestions.append(record['name'])
1101 |
1102 | return suggestions
1103 |
    async def _find_similar_methods(self, class_name: str, method_name: str) -> List[str]:
        """Find similar method names for suggestions.

        Uses the first three characters of *method_name* as a substring probe
        against the class's methods — exact class match first, then the
        repository-scoped fallback for dotted class names.

        Returns:
            Up to five candidate method names (possibly empty).
        """
        async with self.driver.session() as session:
            # First try exact class match
            query = """
            MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
            WHERE (c.name = $class_name OR c.full_name = $class_name)
            AND m.name CONTAINS $partial_name
            RETURN m.name as name
            LIMIT 5
            """

            result = await session.run(query, class_name=class_name, partial_name=method_name[:3])
            suggestions = []
            async for record in result:
                suggestions.append(record['name'])

            # If no suggestions and class_name has dots, try repository-based search
            if not suggestions and '.' in class_name:
                parts = class_name.split('.')
                module_part = '.'.join(parts[:-1])  # e.g., "pydantic_ai"
                class_part = parts[-1]  # e.g., "Agent"

                # Find repository for the module
                repo_name = await self._find_repository_for_module(module_part)

                if repo_name:
                    repo_query = """
                    MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_METHOD]->(m:Method)
                    WHERE c.name = $class_name AND m.name CONTAINS $partial_name
                    RETURN m.name as name
                    LIMIT 5
                    """

                    result = await session.run(repo_query, repo_name=repo_name, class_name=class_part, partial_name=method_name[:3])
                    async for record in result:
                        suggestions.append(record['name'])

            return suggestions
1143 |
1144 | def _calculate_overall_confidence(self, result: ScriptValidationResult) -> float:
1145 | """Calculate overall confidence score for the validation (knowledge graph items only)"""
1146 | kg_validations = []
1147 |
1148 | # Only count validations from knowledge graph imports
1149 | for val in result.import_validations:
1150 | if val.validation.details.get('in_knowledge_graph', False):
1151 | kg_validations.append(val.validation.confidence)
1152 |
1153 | # Only count validations from knowledge graph classes
1154 | for val in result.class_validations:
1155 | class_name = val.class_instantiation.full_class_name or val.class_instantiation.class_name
1156 | if self._is_from_knowledge_graph(class_name):
1157 | kg_validations.append(val.validation.confidence)
1158 |
1159 | # Only count validations from knowledge graph methods
1160 | for val in result.method_validations:
1161 | if val.method_call.object_type and self._is_from_knowledge_graph(val.method_call.object_type):
1162 | kg_validations.append(val.validation.confidence)
1163 |
1164 | # Only count validations from knowledge graph attributes
1165 | for val in result.attribute_validations:
1166 | if val.attribute_access.object_type and self._is_from_knowledge_graph(val.attribute_access.object_type):
1167 | kg_validations.append(val.validation.confidence)
1168 |
1169 | # Only count validations from knowledge graph functions
1170 | for val in result.function_validations:
1171 | if val.function_call.full_name and self._is_from_knowledge_graph(val.function_call.full_name):
1172 | kg_validations.append(val.validation.confidence)
1173 |
1174 | if not kg_validations:
1175 | return 1.0 # No knowledge graph items to validate = perfect confidence
1176 |
1177 | return sum(kg_validations) / len(kg_validations)
1178 |
1179 | def _is_from_knowledge_graph(self, class_type: str) -> bool:
1180 | """Check if a class type comes from a module in the knowledge graph"""
1181 | if not class_type:
1182 | return False
1183 |
1184 | # For dotted names like "pydantic_ai.Agent" or "pydantic_ai.StreamedRunResult", check the base module
1185 | if '.' in class_type:
1186 | base_module = class_type.split('.')[0]
1187 | # Exact match only - "pydantic" should not match "pydantic_ai"
1188 | return base_module in self.knowledge_graph_modules
1189 |
1190 | # For simple names, check if any knowledge graph module matches exactly
1191 | # Don't use substring matching to avoid "pydantic" matching "pydantic_ai"
1192 | return class_type in self.knowledge_graph_modules
1193 |
1194 | def _detect_hallucinations(self, result: ScriptValidationResult) -> List[Dict[str, Any]]:
1195 | """Detect and categorize hallucinations"""
1196 | hallucinations = []
1197 | reported_items = set() # Track reported items to avoid duplicates
1198 |
1199 | # Check method calls (only for knowledge graph classes)
1200 | for val in result.method_validations:
1201 | if (val.validation.status == ValidationStatus.NOT_FOUND and
1202 | val.method_call.object_type and
1203 | self._is_from_knowledge_graph(val.method_call.object_type)):
1204 |
1205 | # Create unique key to avoid duplicates
1206 | key = (val.method_call.line_number, val.method_call.method_name, val.method_call.object_type)
1207 | if key not in reported_items:
1208 | reported_items.add(key)
1209 | hallucinations.append({
1210 | 'type': 'METHOD_NOT_FOUND',
1211 | 'location': f"line {val.method_call.line_number}",
1212 | 'description': f"Method '{val.method_call.method_name}' not found on class '{val.method_call.object_type}'",
1213 | 'suggestion': val.validation.suggestions[0] if val.validation.suggestions else None
1214 | })
1215 |
1216 | # Check attributes (only for knowledge graph classes) - but skip if already reported as method
1217 | for val in result.attribute_validations:
1218 | if (val.validation.status == ValidationStatus.NOT_FOUND and
1219 | val.attribute_access.object_type and
1220 | self._is_from_knowledge_graph(val.attribute_access.object_type)):
1221 |
1222 | # Create unique key - if this was already reported as a method, skip it
1223 | key = (val.attribute_access.line_number, val.attribute_access.attribute_name, val.attribute_access.object_type)
1224 | if key not in reported_items:
1225 | reported_items.add(key)
1226 | hallucinations.append({
1227 | 'type': 'ATTRIBUTE_NOT_FOUND',
1228 | 'location': f"line {val.attribute_access.line_number}",
1229 | 'description': f"Attribute '{val.attribute_access.attribute_name}' not found on class '{val.attribute_access.object_type}'"
1230 | })
1231 |
1232 | # Check parameter issues (only for knowledge graph methods)
1233 | for val in result.method_validations:
1234 | if (val.parameter_validation and
1235 | val.parameter_validation.status == ValidationStatus.INVALID and
1236 | val.method_call.object_type and
1237 | self._is_from_knowledge_graph(val.method_call.object_type)):
1238 | hallucinations.append({
1239 | 'type': 'INVALID_PARAMETERS',
1240 | 'location': f"line {val.method_call.line_number}",
1241 | 'description': f"Invalid parameters for method '{val.method_call.method_name}': {val.parameter_validation.message}"
1242 | })
1243 |
1244 | return hallucinations
```
--------------------------------------------------------------------------------
/src/crawl4ai_mcp.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | MCP server for web crawling with Crawl4AI.
3 |
4 | This server provides tools to crawl websites using Crawl4AI, automatically detecting
5 | the appropriate crawl method based on URL type (sitemap, txt file, or regular webpage).
6 | Also includes AI hallucination detection and repository parsing tools using Neo4j knowledge graphs.
7 | """
8 | from mcp.server.fastmcp import FastMCP, Context
9 | from sentence_transformers import CrossEncoder
10 | from contextlib import asynccontextmanager
11 | from collections.abc import AsyncIterator
12 | from dataclasses import dataclass
13 | from typing import List, Dict, Any, Optional
14 | from urllib.parse import urlparse, urldefrag
15 | from xml.etree import ElementTree
16 | from dotenv import load_dotenv
17 | from supabase import Client
18 | from pathlib import Path
19 | import requests
20 | import asyncio
21 | import json
22 | import os
23 | import re
24 | import concurrent.futures
25 | import sys
26 |
27 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
28 |
29 | # Add knowledge_graphs folder to path for importing knowledge graph modules
30 | knowledge_graphs_path = Path(__file__).resolve().parent.parent / 'knowledge_graphs'
31 | sys.path.append(str(knowledge_graphs_path))
32 |
33 | from utils import (
34 | get_supabase_client,
35 | add_documents_to_supabase,
36 | search_documents,
37 | extract_code_blocks,
38 | generate_code_example_summary,
39 | add_code_examples_to_supabase,
40 | update_source_info,
41 | extract_source_summary,
42 | search_code_examples
43 | )
44 |
45 | # Import knowledge graph modules
46 | from knowledge_graph_validator import KnowledgeGraphValidator
47 | from parse_repo_into_neo4j import DirectNeo4jExtractor
48 | from ai_script_analyzer import AIScriptAnalyzer
49 | from hallucination_reporter import HallucinationReporter
50 |
51 | # Load environment variables from the project root .env file
52 | project_root = Path(__file__).resolve().parent.parent
53 | dotenv_path = project_root / '.env'
54 |
55 | # Force override of existing environment variables
56 | load_dotenv(dotenv_path, override=True)
57 |
58 | # Helper functions for Neo4j validation and error handling
def validate_neo4j_connection() -> bool:
    """Report whether all Neo4j connection settings are present in the environment.

    Checks NEO4J_URI, NEO4J_USER and NEO4J_PASSWORD; an unset (or empty)
    variable counts as missing.
    """
    required = ("NEO4J_URI", "NEO4J_USER", "NEO4J_PASSWORD")
    return all(os.getenv(name) for name in required)
66 |
def format_neo4j_error(error: Exception) -> str:
    """Translate a raw Neo4j exception into an actionable, user-facing message."""
    text = str(error).lower()
    # Checks are ordered from most to least specific symptom.
    if "authentication" in text or "unauthorized" in text:
        return "Neo4j authentication failed. Check NEO4J_USER and NEO4J_PASSWORD."
    if "connection" in text or "refused" in text or "timeout" in text:
        return "Cannot connect to Neo4j. Check NEO4J_URI and ensure Neo4j is running."
    if "database" in text:
        return "Neo4j database error. Check if the database exists and is accessible."
    return f"Neo4j error: {str(error)}"
78 |
def validate_script_path(script_path: str) -> Dict[str, Any]:
    """Check that *script_path* points to a readable Python source file.

    Returns:
        {"valid": True} on success, otherwise {"valid": False, "error": reason}.
    """
    if not script_path or not isinstance(script_path, str):
        return {"valid": False, "error": "Script path is required"}

    if not os.path.exists(script_path):
        return {"valid": False, "error": f"Script not found: {script_path}"}

    if not script_path.endswith('.py'):
        return {"valid": False, "error": "Only Python (.py) files are supported"}

    try:
        # Probe readability (permissions, decoding) without loading the file.
        with open(script_path, 'r', encoding='utf-8') as handle:
            handle.read(1)
    except Exception as exc:
        return {"valid": False, "error": f"Cannot read script file: {str(exc)}"}
    return {"valid": True}
97 |
def validate_github_url(repo_url: str) -> Dict[str, Any]:
    """Validate a GitHub repository URL and derive the repository name.

    Accepts https:// and git@ URLs.

    Returns:
        {"valid": True, "repo_name": name} or {"valid": False, "error": reason}.
    """
    if not repo_url or not isinstance(repo_url, str):
        return {"valid": False, "error": "Repository URL is required"}

    repo_url = repo_url.strip()

    # Basic GitHub URL validation
    if not ("github.com" in repo_url.lower() or repo_url.endswith(".git")):
        return {"valid": False, "error": "Please provide a valid GitHub repository URL"}

    # Check URL format
    if not (repo_url.startswith("https://") or repo_url.startswith("git@")):
        return {"valid": False, "error": "Repository URL must start with https:// or git@"}

    # Derive the repo name robustly: ignore a trailing slash (previously it
    # produced an empty name) and strip only a trailing ".git" suffix —
    # str.replace('.git', '') would also mangle names containing ".git"
    # elsewhere in the segment.
    last_segment = repo_url.rstrip('/').split('/')[-1]
    if last_segment.endswith('.git'):
        last_segment = last_segment[:-4]
    return {"valid": True, "repo_name": last_segment}
114 |
# Create a dataclass for our application context
@dataclass
class Crawl4AIContext:
    """Lifespan-scoped shared resources for the Crawl4AI MCP server.

    Built once in ``crawl4ai_lifespan`` and handed to every tool invocation
    through the request context. The optional fields stay None when their
    feature flag is off or initialization failed.
    """
    crawler: AsyncWebCrawler  # browser-backed crawler, entered for the server's lifetime
    supabase_client: Client  # Supabase client used for document / code-example storage
    reranking_model: Optional[CrossEncoder] = None  # loaded only when USE_RERANKING=true
    knowledge_validator: Optional[Any] = None  # KnowledgeGraphValidator when available
    repo_extractor: Optional[Any] = None  # DirectNeo4jExtractor when available
124 |
@asynccontextmanager
async def crawl4ai_lifespan(server: FastMCP) -> AsyncIterator[Crawl4AIContext]:
    """
    Manages the Crawl4AI client lifecycle.

    Startup: opens the browser-backed crawler and the Supabase client, then
    conditionally loads the reranking model (USE_RERANKING=true) and the
    Neo4j knowledge-graph components (USE_KNOWLEDGE_GRAPH=true plus NEO4J_*
    credentials). Optional components that fail to initialize are logged and
    left as None instead of aborting startup.

    Shutdown: closes the crawler first, then each Neo4j component, logging
    (not propagating) individual close errors so one failure cannot block
    the remaining cleanup.

    Args:
        server: The FastMCP server instance

    Yields:
        Crawl4AIContext: The context containing the Crawl4AI crawler and Supabase client
    """
    # Create browser configuration
    browser_config = BrowserConfig(
        headless=True,
        verbose=False
    )

    # Initialize the crawler; entered manually so it spans the whole lifespan.
    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.__aenter__()

    # Initialize Supabase client
    supabase_client = get_supabase_client()

    # Initialize cross-encoder model for reranking if enabled
    reranking_model = None
    if os.getenv("USE_RERANKING", "false") == "true":
        try:
            reranking_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
        except Exception as e:
            # Reranking is optional; results fall back to vector order.
            print(f"Failed to load reranking model: {e}")
            reranking_model = None

    # Initialize Neo4j components if configured and enabled
    knowledge_validator = None
    repo_extractor = None

    # Check if knowledge graph functionality is enabled
    knowledge_graph_enabled = os.getenv("USE_KNOWLEDGE_GRAPH", "false") == "true"

    if knowledge_graph_enabled:
        neo4j_uri = os.getenv("NEO4J_URI")
        neo4j_user = os.getenv("NEO4J_USER")
        neo4j_password = os.getenv("NEO4J_PASSWORD")

        if neo4j_uri and neo4j_user and neo4j_password:
            try:
                print("Initializing knowledge graph components...")

                # Initialize knowledge graph validator
                knowledge_validator = KnowledgeGraphValidator(neo4j_uri, neo4j_user, neo4j_password)
                await knowledge_validator.initialize()
                print("✓ Knowledge graph validator initialized")

                # Initialize repository extractor
                repo_extractor = DirectNeo4jExtractor(neo4j_uri, neo4j_user, neo4j_password)
                await repo_extractor.initialize()
                print("✓ Repository extractor initialized")

            except Exception as e:
                # Knowledge-graph tools stay disabled; the core crawl/RAG
                # tools keep working without them.
                print(f"Failed to initialize Neo4j components: {format_neo4j_error(e)}")
                knowledge_validator = None
                repo_extractor = None
        else:
            print("Neo4j credentials not configured - knowledge graph tools will be unavailable")
    else:
        print("Knowledge graph functionality disabled - set USE_KNOWLEDGE_GRAPH=true to enable")

    try:
        # Small delay to ensure all components are fully initialized
        await asyncio.sleep(0.5)
        print("✓ All components initialized - server ready for requests")

        yield Crawl4AIContext(
            crawler=crawler,
            supabase_client=supabase_client,
            reranking_model=reranking_model,
            knowledge_validator=knowledge_validator,
            repo_extractor=repo_extractor
        )
    finally:
        # Clean up all components
        await crawler.__aexit__(None, None, None)
        if knowledge_validator:
            try:
                await knowledge_validator.close()
                print("✓ Knowledge graph validator closed")
            except Exception as e:
                print(f"Error closing knowledge validator: {e}")
        if repo_extractor:
            try:
                await repo_extractor.close()
                print("✓ Repository extractor closed")
            except Exception as e:
                print(f"Error closing repository extractor: {e}")
220 |
# Initialize FastMCP server
# Host/port come from the environment so the container can be remapped.
# NOTE(review): PORT is passed as a string here — FastMCP appears to accept
# it, but confirm whether int(os.getenv("PORT", "8051")) is required.
mcp = FastMCP(
    "mcp-crawl4ai-rag",
    description="MCP server for RAG and web crawling with Crawl4AI",
    lifespan=crawl4ai_lifespan,
    host=os.getenv("HOST", "0.0.0.0"),
    port=os.getenv("PORT", "8051")
)
229 |
def rerank_results(model: CrossEncoder, query: str, results: List[Dict[str, Any]], content_key: str = "content") -> List[Dict[str, Any]]:
    """
    Rerank search results using a cross-encoder model, best match first.

    Each result gains a ``rerank_score`` key. On any scoring failure — or
    when the model or results are absent — the input list is returned
    unchanged.

    Args:
        model: The cross-encoder model to use for reranking
        query: The search query
        results: List of search results
        content_key: The key in each result dict that contains the text content

    Returns:
        Reranked list of results
    """
    if not model or not results:
        return results

    try:
        # Score every (query, document) pair in one batch.
        pairs = [[query, item.get(content_key, "")] for item in results]
        scores = model.predict(pairs)

        # Attach scores in place, then order best-first.
        for item, score in zip(results, scores):
            item["rerank_score"] = float(score)

        return sorted(results, key=lambda item: item.get("rerank_score", 0), reverse=True)
    except Exception as e:
        print(f"Error during reranking: {e}")
        return results
267 |
def is_sitemap(url: str) -> bool:
    """
    Heuristically decide whether a URL points at an XML sitemap.

    Args:
        url: URL to check

    Returns:
        True when the URL ends in ``sitemap.xml`` or its path mentions
        "sitemap" anywhere, False otherwise
    """
    if url.endswith('sitemap.xml'):
        return True
    return 'sitemap' in urlparse(url).path
279 |
def is_txt(url: str) -> bool:
    """
    Report whether a URL names a plain-text resource.

    Args:
        url: URL to check

    Returns:
        True if the URL ends with the ``.txt`` suffix, False otherwise
    """
    # Pure suffix test; query strings are not stripped first.
    suffix = '.txt'
    return url.endswith(suffix)
291 |
def parse_sitemap(sitemap_url: str) -> List[str]:
    """
    Parse a sitemap and extract URLs.

    Args:
        sitemap_url: URL of the sitemap

    Returns:
        List of URLs found in the sitemap; empty when the fetch fails,
        returns a non-200 status, or the XML cannot be parsed.
    """
    urls: List[str] = []

    try:
        # BUG FIX: a timeout keeps a dead host from hanging the whole crawl
        # request indefinitely (requests.get has no default timeout).
        resp = requests.get(sitemap_url, timeout=30)
    except requests.RequestException as e:
        # Best-effort, matching the XML error handling below.
        print(f"Error fetching sitemap: {e}")
        return urls

    if resp.status_code == 200:
        try:
            tree = ElementTree.fromstring(resp.content)
            # '{*}loc' matches <loc> elements in any (or no) XML namespace.
            urls = [loc.text for loc in tree.findall('.//{*}loc')]
        except Exception as e:
            print(f"Error parsing sitemap XML: {e}")

    return urls
313 |
def smart_chunk_markdown(text: str, chunk_size: int = 5000) -> List[str]:
    """Split text into chunks, respecting code blocks and paragraphs.

    Break points are chosen, in order of preference, at a code fence (```),
    a paragraph break, or a sentence end — but only when the break falls
    past 30% of the chunk so chunks don't become degenerately small.

    Args:
        text: Markdown text to split.
        chunk_size: Target maximum characters per chunk.

    Returns:
        List of non-empty, whitespace-trimmed chunks.
    """
    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        # Calculate end position
        end = start + chunk_size

        # If we're at the end of the text, just take what's left.
        # BUG FIX: a whitespace-only tail previously appended an empty
        # string chunk; skip it instead.
        if end >= text_length:
            tail = text[start:].strip()
            if tail:
                chunks.append(tail)
            break

        # Try to find a code block boundary first (```)
        chunk = text[start:end]
        code_block = chunk.rfind('```')
        if code_block != -1 and code_block > chunk_size * 0.3:
            end = start + code_block

        # If no code block, try to break at a paragraph
        elif '\n\n' in chunk:
            # Find the last paragraph break
            last_break = chunk.rfind('\n\n')
            if last_break > chunk_size * 0.3:  # Only break if we're past 30% of chunk_size
                end = start + last_break

        # If no paragraph break, try to break at a sentence
        elif '. ' in chunk:
            # Find the last sentence break
            last_period = chunk.rfind('. ')
            if last_period > chunk_size * 0.3:  # Only break if we're past 30% of chunk_size
                end = start + last_period + 1

        # Extract chunk and clean it up
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # Move start position for next chunk
        start = end

    return chunks
358 |
def extract_section_info(chunk: str) -> Dict[str, Any]:
    """
    Extracts headers and stats from a chunk.

    Scans the chunk for markdown ATX headers ('#', '##', ...) and joins
    them into a single '; '-separated string, alongside basic size stats.

    Args:
        chunk: Markdown chunk

    Returns:
        Dictionary with headers and stats (char_count, word_count)
    """
    found = re.findall(r'^(#+)\s+(.+)$', chunk, re.MULTILINE)
    header_str = '; '.join(f'{hashes} {title}' for hashes, title in found) if found else ''

    return {
        "headers": header_str,
        "char_count": len(chunk),
        "word_count": len(chunk.split()),
    }
377 |
def process_code_example(args):
    """
    Process a single code example to generate its summary.
    This function is designed to be used with concurrent.futures.

    Args:
        args: Tuple containing (code, context_before, context_after)

    Returns:
        The generated summary
    """
    # Unpack the tuple straight into the summarizer's positional parameters.
    return generate_code_example_summary(*args)
391 |
@mcp.tool()
async def crawl_single_page(ctx: Context, url: str) -> str:
    """
    Crawl a single web page and store its content in Supabase.

    This tool is ideal for quickly retrieving content from a specific URL without following links.
    The content is stored in Supabase for later retrieval and querying.

    Args:
        ctx: The MCP server provided context
        url: URL of the web page to crawl

    Returns:
        JSON summary of the crawling operation and storage in Supabase
    """
    # Local import keeps the fix self-contained; used for the crawl timestamp below.
    from datetime import datetime, timezone

    try:
        # Get the crawler and Supabase client from the lifespan context
        crawler = ctx.request_context.lifespan_context.crawler
        supabase_client = ctx.request_context.lifespan_context.supabase_client

        # Configure the crawl: bypass the cache so the content is always fresh
        run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=False)

        # Crawl the page
        result = await crawler.arun(url=url, config=run_config)

        if result.success and result.markdown:
            # Extract source_id (domain, falling back to path for scheme-less URLs)
            parsed_url = urlparse(url)
            source_id = parsed_url.netloc or parsed_url.path

            # Chunk the content
            chunks = smart_chunk_markdown(result.markdown)

            # One timestamp shared by every chunk of this crawl.
            # (Previously this stored the coroutine name via
            # asyncio.current_task().get_coro().__name__, which is not a time.)
            crawl_time = datetime.now(timezone.utc).isoformat()

            # Prepare data for Supabase
            urls = []
            chunk_numbers = []
            contents = []
            metadatas = []
            total_word_count = 0

            for i, chunk in enumerate(chunks):
                urls.append(url)
                chunk_numbers.append(i)
                contents.append(chunk)

                # Extract metadata
                meta = extract_section_info(chunk)
                meta["chunk_index"] = i
                meta["url"] = url
                meta["source"] = source_id
                meta["crawl_time"] = crawl_time
                metadatas.append(meta)

                # Accumulate word count
                total_word_count += meta.get("word_count", 0)

            # Create url_to_full_document mapping
            url_to_full_document = {url: result.markdown}

            # Update source information FIRST (before inserting documents)
            source_summary = extract_source_summary(source_id, result.markdown[:5000])  # Use first 5000 chars for summary
            update_source_info(supabase_client, source_id, source_summary, total_word_count)

            # Add documentation chunks to Supabase (AFTER source exists)
            add_documents_to_supabase(supabase_client, urls, chunk_numbers, contents, metadatas, url_to_full_document)

            # Extract and process code examples only if enabled.
            # code_blocks is referenced in the summary below, so it must exist
            # even when USE_AGENTIC_RAG is off (previously a NameError that made
            # a successful crawl report failure).
            code_blocks = []
            extract_code_examples = os.getenv("USE_AGENTIC_RAG", "false") == "true"
            if extract_code_examples:
                code_blocks = extract_code_blocks(result.markdown)
                if code_blocks:
                    code_urls = []
                    code_chunk_numbers = []
                    code_examples = []
                    code_summaries = []
                    code_metadatas = []

                    # Process code examples in parallel
                    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
                        # Prepare arguments for parallel processing
                        summary_args = [(block['code'], block['context_before'], block['context_after'])
                                        for block in code_blocks]

                        # Generate summaries in parallel
                        summaries = list(executor.map(process_code_example, summary_args))

                    # Prepare code example data
                    for i, (block, summary) in enumerate(zip(code_blocks, summaries)):
                        code_urls.append(url)
                        code_chunk_numbers.append(i)
                        code_examples.append(block['code'])
                        code_summaries.append(summary)

                        # Create metadata for code example
                        code_meta = {
                            "chunk_index": i,
                            "url": url,
                            "source": source_id,
                            "char_count": len(block['code']),
                            "word_count": len(block['code'].split())
                        }
                        code_metadatas.append(code_meta)

                    # Add code examples to Supabase
                    add_code_examples_to_supabase(
                        supabase_client,
                        code_urls,
                        code_chunk_numbers,
                        code_examples,
                        code_summaries,
                        code_metadatas
                    )

            return json.dumps({
                "success": True,
                "url": url,
                "chunks_stored": len(chunks),
                "code_examples_stored": len(code_blocks),
                "content_length": len(result.markdown),
                "total_word_count": total_word_count,
                "source_id": source_id,
                "links_count": {
                    "internal": len(result.links.get("internal", [])),
                    "external": len(result.links.get("external", []))
                }
            }, indent=2)
        else:
            return json.dumps({
                "success": False,
                "url": url,
                "error": result.error_message
            }, indent=2)
    except Exception as e:
        return json.dumps({
            "success": False,
            "url": url,
            "error": str(e)
        }, indent=2)
531 |
@mcp.tool()
async def smart_crawl_url(ctx: Context, url: str, max_depth: int = 3, max_concurrent: int = 10, chunk_size: int = 5000) -> str:
    """
    Intelligently crawl a URL based on its type and store content in Supabase.

    This tool automatically detects the URL type and applies the appropriate crawling method:
    - For sitemaps: Extracts and crawls all URLs in parallel
    - For text files (llms.txt): Directly retrieves the content
    - For regular webpages: Recursively crawls internal links up to the specified depth

    All crawled content is chunked and stored in Supabase for later retrieval and querying.

    Args:
        ctx: The MCP server provided context
        url: URL to crawl (can be a regular webpage, sitemap.xml, or .txt file)
        max_depth: Maximum recursion depth for regular URLs (default: 3)
        max_concurrent: Maximum number of concurrent browser sessions (default: 10)
        chunk_size: Maximum size of each content chunk in characters (default: 5000)

    Returns:
        JSON string with crawl summary and storage information
    """
    # Local import keeps the fix self-contained; used for the crawl timestamp below.
    from datetime import datetime, timezone

    try:
        # Get the crawler and Supabase client from the lifespan context
        crawler = ctx.request_context.lifespan_context.crawler
        supabase_client = ctx.request_context.lifespan_context.supabase_client

        # Determine the crawl strategy
        crawl_results = []
        crawl_type = None

        if is_txt(url):
            # For text files, use simple crawl
            crawl_results = await crawl_markdown_file(crawler, url)
            crawl_type = "text_file"
        elif is_sitemap(url):
            # For sitemaps, extract URLs and crawl in parallel
            sitemap_urls = parse_sitemap(url)
            if not sitemap_urls:
                return json.dumps({
                    "success": False,
                    "url": url,
                    "error": "No URLs found in sitemap"
                }, indent=2)
            crawl_results = await crawl_batch(crawler, sitemap_urls, max_concurrent=max_concurrent)
            crawl_type = "sitemap"
        else:
            # For regular URLs, use recursive crawl
            crawl_results = await crawl_recursive_internal_links(crawler, [url], max_depth=max_depth, max_concurrent=max_concurrent)
            crawl_type = "webpage"

        if not crawl_results:
            return json.dumps({
                "success": False,
                "url": url,
                "error": "No content found"
            }, indent=2)

        # Process results and store in Supabase
        urls = []
        chunk_numbers = []
        contents = []
        metadatas = []
        chunk_count = 0

        # Track sources and their content
        source_content_map = {}
        source_word_counts = {}

        # One timestamp shared by every chunk of this crawl.
        # (Previously this stored the coroutine name via
        # asyncio.current_task().get_coro().__name__, which is not a time.)
        crawl_time = datetime.now(timezone.utc).isoformat()

        # Process documentation chunks
        for doc in crawl_results:
            source_url = doc['url']
            md = doc['markdown']
            chunks = smart_chunk_markdown(md, chunk_size=chunk_size)

            # Extract source_id (domain, falling back to path for scheme-less URLs)
            parsed_url = urlparse(source_url)
            source_id = parsed_url.netloc or parsed_url.path

            # Store content for source summary generation
            if source_id not in source_content_map:
                source_content_map[source_id] = md[:5000]  # Store first 5000 chars
                source_word_counts[source_id] = 0

            for i, chunk in enumerate(chunks):
                urls.append(source_url)
                chunk_numbers.append(i)
                contents.append(chunk)

                # Extract metadata
                meta = extract_section_info(chunk)
                meta["chunk_index"] = i
                meta["url"] = source_url
                meta["source"] = source_id
                meta["crawl_type"] = crawl_type
                meta["crawl_time"] = crawl_time
                metadatas.append(meta)

                # Accumulate word count
                source_word_counts[source_id] += meta.get("word_count", 0)

                chunk_count += 1

        # Create url_to_full_document mapping
        url_to_full_document = {}
        for doc in crawl_results:
            url_to_full_document[doc['url']] = doc['markdown']

        # Update source information for each unique source FIRST (before inserting documents)
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            source_summary_args = [(source_id, content) for source_id, content in source_content_map.items()]
            source_summaries = list(executor.map(lambda args: extract_source_summary(args[0], args[1]), source_summary_args))

        for (source_id, _), summary in zip(source_summary_args, source_summaries):
            word_count = source_word_counts.get(source_id, 0)
            update_source_info(supabase_client, source_id, summary, word_count)

        # Add documentation chunks to Supabase (AFTER sources exist)
        batch_size = 20
        add_documents_to_supabase(supabase_client, urls, chunk_numbers, contents, metadatas, url_to_full_document, batch_size=batch_size)

        # Extract and process code examples from all documents only if enabled.
        # code_examples is referenced in the summary below, so it must exist
        # even when USE_AGENTIC_RAG is off (previously a NameError that made
        # a successful crawl report failure).
        code_examples = []
        extract_code_examples_enabled = os.getenv("USE_AGENTIC_RAG", "false") == "true"
        if extract_code_examples_enabled:
            code_urls = []
            code_chunk_numbers = []
            code_summaries = []
            code_metadatas = []

            # Extract code blocks from all documents
            for doc in crawl_results:
                source_url = doc['url']
                md = doc['markdown']
                code_blocks = extract_code_blocks(md)

                if code_blocks:
                    # Process code examples in parallel
                    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
                        # Prepare arguments for parallel processing
                        summary_args = [(block['code'], block['context_before'], block['context_after'])
                                        for block in code_blocks]

                        # Generate summaries in parallel
                        summaries = list(executor.map(process_code_example, summary_args))

                    # Prepare code example data
                    parsed_url = urlparse(source_url)
                    source_id = parsed_url.netloc or parsed_url.path

                    for block, summary in zip(code_blocks, summaries):
                        code_urls.append(source_url)
                        code_chunk_numbers.append(len(code_examples))  # Use global code example index
                        code_examples.append(block['code'])
                        code_summaries.append(summary)

                        # Create metadata for code example
                        code_meta = {
                            "chunk_index": len(code_examples) - 1,
                            "url": source_url,
                            "source": source_id,
                            "char_count": len(block['code']),
                            "word_count": len(block['code'].split())
                        }
                        code_metadatas.append(code_meta)

            # Add all code examples to Supabase
            if code_examples:
                add_code_examples_to_supabase(
                    supabase_client,
                    code_urls,
                    code_chunk_numbers,
                    code_examples,
                    code_summaries,
                    code_metadatas,
                    batch_size=batch_size
                )

        return json.dumps({
            "success": True,
            "url": url,
            "crawl_type": crawl_type,
            "pages_crawled": len(crawl_results),
            "chunks_stored": chunk_count,
            "code_examples_stored": len(code_examples),
            "sources_updated": len(source_content_map),
            "urls_crawled": [doc['url'] for doc in crawl_results][:5] + (["..."] if len(crawl_results) > 5 else [])
        }, indent=2)
    except Exception as e:
        return json.dumps({
            "success": False,
            "url": url,
            "error": str(e)
        }, indent=2)
727 |
@mcp.tool()
async def get_available_sources(ctx: Context) -> str:
    """
    Get all available sources from the sources table.

    This tool returns a list of all unique sources (domains) that have been crawled and stored
    in the database, along with their summaries and statistics. This is useful for discovering
    what content is available for querying.

    Always use this tool before calling the RAG query or code example query tool
    with a specific source filter!

    Args:
        ctx: The MCP server provided context

    Returns:
        JSON string with the list of available sources and their details
    """
    try:
        # Supabase client comes from the server's lifespan context
        supabase_client = ctx.request_context.lifespan_context.supabase_client

        # Fetch every row of the sources table, ordered for stable output
        result = supabase_client.from_('sources')\
            .select('*')\
            .order('source_id')\
            .execute()

        # Project each row onto the fields we expose to the caller
        rows = result.data or []
        sources = [
            {
                "source_id": row.get("source_id"),
                "summary": row.get("summary"),
                "total_words": row.get("total_words"),
                "created_at": row.get("created_at"),
                "updated_at": row.get("updated_at"),
            }
            for row in rows
        ]

        return json.dumps({
            "success": True,
            "sources": sources,
            "count": len(sources)
        }, indent=2)
    except Exception as e:
        return json.dumps({
            "success": False,
            "error": str(e)
        }, indent=2)
778 |
@mcp.tool()
async def perform_rag_query(ctx: Context, query: str, source: str = None, match_count: int = 5) -> str:
    """
    Perform a RAG (Retrieval Augmented Generation) query on the stored content.

    This tool searches the vector database for content relevant to the query and returns
    the matching documents. Optionally filter by source domain.
    Get the source by using the get_available_sources tool before calling this search!

    When the USE_HYBRID_SEARCH environment flag is "true", results combine vector
    similarity with an ILIKE keyword search: items found by both are boosted and
    preferred, then remaining vector matches, then keyword-only matches. When the
    USE_RERANKING flag is "true" and a reranking model is loaded, the final list is
    re-ordered by a cross-encoder.

    Args:
        ctx: The MCP server provided context
        query: The search query
        source: Optional source domain to filter results (e.g., 'example.com')
        match_count: Maximum number of results to return (default: 5)

    Returns:
        JSON string with the search results
    """
    try:
        # Get the Supabase client from the context
        supabase_client = ctx.request_context.lifespan_context.supabase_client

        # Check if hybrid search is enabled
        use_hybrid_search = os.getenv("USE_HYBRID_SEARCH", "false") == "true"

        # Prepare filter if source is provided and not empty
        filter_metadata = None
        if source and source.strip():
            filter_metadata = {"source": source}

        if use_hybrid_search:
            # Hybrid search: combine vector and keyword search

            # 1. Get vector search results (get more to account for filtering)
            vector_results = search_documents(
                client=supabase_client,
                query=query,
                match_count=match_count * 2,  # Get double to have room for filtering
                filter_metadata=filter_metadata
            )

            # 2. Get keyword search results using ILIKE (case-insensitive substring)
            keyword_query = supabase_client.from_('crawled_pages')\
                .select('id, url, chunk_number, content, metadata, source_id')\
                .ilike('content', f'%{query}%')

            # Apply source filter if provided
            if source and source.strip():
                keyword_query = keyword_query.eq('source_id', source)

            # Execute keyword search
            keyword_response = keyword_query.limit(match_count * 2).execute()
            keyword_results = keyword_response.data if keyword_response.data else []

            # 3. Combine results with preference for items appearing in both.
            # Three ordered passes: both-match (boosted), vector-only, keyword-only.
            seen_ids = set()
            combined_results = []

            # First, add items that appear in both searches (these are the best matches)
            vector_ids = {r.get('id') for r in vector_results if r.get('id')}
            for kr in keyword_results:
                if kr['id'] in vector_ids and kr['id'] not in seen_ids:
                    # Find the vector result to get similarity score
                    # (linear scan; result lists are small — at most 2 * match_count)
                    for vr in vector_results:
                        if vr.get('id') == kr['id']:
                            # Boost similarity score for items in both results,
                            # capped at 1.0 so it stays a valid similarity
                            vr['similarity'] = min(1.0, vr.get('similarity', 0) * 1.2)
                            combined_results.append(vr)
                            seen_ids.add(kr['id'])
                            break

            # Then add remaining vector results (semantic matches without exact keyword)
            for vr in vector_results:
                if vr.get('id') and vr['id'] not in seen_ids and len(combined_results) < match_count:
                    combined_results.append(vr)
                    seen_ids.add(vr['id'])

            # Finally, add pure keyword matches if we still need more results
            for kr in keyword_results:
                if kr['id'] not in seen_ids and len(combined_results) < match_count:
                    # Convert keyword result to match vector result format
                    combined_results.append({
                        'id': kr['id'],
                        'url': kr['url'],
                        'chunk_number': kr['chunk_number'],
                        'content': kr['content'],
                        'metadata': kr['metadata'],
                        'source_id': kr['source_id'],
                        'similarity': 0.5  # Default similarity for keyword-only matches
                    })
                    seen_ids.add(kr['id'])

            # Use combined results, truncated to the requested count
            results = combined_results[:match_count]

        else:
            # Standard vector search only
            results = search_documents(
                client=supabase_client,
                query=query,
                match_count=match_count,
                filter_metadata=filter_metadata
            )

        # Apply reranking if enabled (requires the lifespan context's cross-encoder)
        use_reranking = os.getenv("USE_RERANKING", "false") == "true"
        if use_reranking and ctx.request_context.lifespan_context.reranking_model:
            results = rerank_results(ctx.request_context.lifespan_context.reranking_model, query, results, content_key="content")

        # Format the results for the JSON response
        formatted_results = []
        for result in results:
            formatted_result = {
                "url": result.get("url"),
                "content": result.get("content"),
                "metadata": result.get("metadata"),
                "similarity": result.get("similarity")
            }
            # Include rerank score if available
            if "rerank_score" in result:
                formatted_result["rerank_score"] = result["rerank_score"]
            formatted_results.append(formatted_result)

        return json.dumps({
            "success": True,
            "query": query,
            "source_filter": source,
            "search_mode": "hybrid" if use_hybrid_search else "vector",
            "reranking_applied": use_reranking and ctx.request_context.lifespan_context.reranking_model is not None,
            "results": formatted_results,
            "count": len(formatted_results)
        }, indent=2)
    except Exception as e:
        return json.dumps({
            "success": False,
            "query": query,
            "error": str(e)
        }, indent=2)
917 |
@mcp.tool()
async def search_code_examples(ctx: Context, query: str, source_id: str = None, match_count: int = 5) -> str:
    """
    Search for code examples relevant to the query.

    This tool searches the vector database for code examples relevant to the query and returns
    the matching examples with their summaries. Optionally filter by source_id.
    Get the source_id by using the get_available_sources tool before calling this search!

    Use the get_available_sources tool first to see what sources are available for filtering.

    Requires the USE_AGENTIC_RAG environment flag; when USE_HYBRID_SEARCH is "true",
    vector results are merged with an ILIKE keyword search over both code content and
    summaries (both-match items boosted, then vector-only, then keyword-only).

    Args:
        ctx: The MCP server provided context
        query: The search query
        source_id: Optional source ID to filter results (e.g., 'example.com')
        match_count: Maximum number of results to return (default: 5)

    Returns:
        JSON string with the search results
    """
    # Check if code example extraction is enabled; without it the table is empty
    extract_code_examples_enabled = os.getenv("USE_AGENTIC_RAG", "false") == "true"
    if not extract_code_examples_enabled:
        return json.dumps({
            "success": False,
            "error": "Code example extraction is disabled. Perform a normal RAG search."
        }, indent=2)

    try:
        # Get the Supabase client from the context
        supabase_client = ctx.request_context.lifespan_context.supabase_client

        # Check if hybrid search is enabled
        use_hybrid_search = os.getenv("USE_HYBRID_SEARCH", "false") == "true"

        # Prepare filter if source is provided and not empty
        filter_metadata = None
        if source_id and source_id.strip():
            filter_metadata = {"source": source_id}

        if use_hybrid_search:
            # Hybrid search: combine vector and keyword search

            # Import the search function from utils
            # (aliased to avoid shadowing this tool's own name)
            from utils import search_code_examples as search_code_examples_impl

            # 1. Get vector search results (get more to account for filtering)
            vector_results = search_code_examples_impl(
                client=supabase_client,
                query=query,
                match_count=match_count * 2,  # Get double to have room for filtering
                filter_metadata=filter_metadata
            )

            # 2. Get keyword search results using ILIKE on both content and summary
            keyword_query = supabase_client.from_('code_examples')\
                .select('id, url, chunk_number, content, summary, metadata, source_id')\
                .or_(f'content.ilike.%{query}%,summary.ilike.%{query}%')

            # Apply source filter if provided
            if source_id and source_id.strip():
                keyword_query = keyword_query.eq('source_id', source_id)

            # Execute keyword search
            keyword_response = keyword_query.limit(match_count * 2).execute()
            keyword_results = keyword_response.data if keyword_response.data else []

            # 3. Combine results with preference for items appearing in both.
            # Three ordered passes: both-match (boosted), vector-only, keyword-only.
            seen_ids = set()
            combined_results = []

            # First, add items that appear in both searches (these are the best matches)
            vector_ids = {r.get('id') for r in vector_results if r.get('id')}
            for kr in keyword_results:
                if kr['id'] in vector_ids and kr['id'] not in seen_ids:
                    # Find the vector result to get similarity score
                    # (linear scan; result lists are small — at most 2 * match_count)
                    for vr in vector_results:
                        if vr.get('id') == kr['id']:
                            # Boost similarity score for items in both results,
                            # capped at 1.0 so it stays a valid similarity
                            vr['similarity'] = min(1.0, vr.get('similarity', 0) * 1.2)
                            combined_results.append(vr)
                            seen_ids.add(kr['id'])
                            break

            # Then add remaining vector results (semantic matches without exact keyword)
            for vr in vector_results:
                if vr.get('id') and vr['id'] not in seen_ids and len(combined_results) < match_count:
                    combined_results.append(vr)
                    seen_ids.add(vr['id'])

            # Finally, add pure keyword matches if we still need more results
            for kr in keyword_results:
                if kr['id'] not in seen_ids and len(combined_results) < match_count:
                    # Convert keyword result to match vector result format
                    combined_results.append({
                        'id': kr['id'],
                        'url': kr['url'],
                        'chunk_number': kr['chunk_number'],
                        'content': kr['content'],
                        'summary': kr['summary'],
                        'metadata': kr['metadata'],
                        'source_id': kr['source_id'],
                        'similarity': 0.5  # Default similarity for keyword-only matches
                    })
                    seen_ids.add(kr['id'])

            # Use combined results, truncated to the requested count
            results = combined_results[:match_count]

        else:
            # Standard vector search only
            from utils import search_code_examples as search_code_examples_impl

            results = search_code_examples_impl(
                client=supabase_client,
                query=query,
                match_count=match_count,
                filter_metadata=filter_metadata
            )

        # Apply reranking if enabled (requires the lifespan context's cross-encoder)
        use_reranking = os.getenv("USE_RERANKING", "false") == "true"
        if use_reranking and ctx.request_context.lifespan_context.reranking_model:
            results = rerank_results(ctx.request_context.lifespan_context.reranking_model, query, results, content_key="content")

        # Format the results for the JSON response
        formatted_results = []
        for result in results:
            formatted_result = {
                "url": result.get("url"),
                "code": result.get("content"),
                "summary": result.get("summary"),
                "metadata": result.get("metadata"),
                "source_id": result.get("source_id"),
                "similarity": result.get("similarity")
            }
            # Include rerank score if available
            if "rerank_score" in result:
                formatted_result["rerank_score"] = result["rerank_score"]
            formatted_results.append(formatted_result)

        return json.dumps({
            "success": True,
            "query": query,
            "source_filter": source_id,
            "search_mode": "hybrid" if use_hybrid_search else "vector",
            "reranking_applied": use_reranking and ctx.request_context.lifespan_context.reranking_model is not None,
            "results": formatted_results,
            "count": len(formatted_results)
        }, indent=2)
    except Exception as e:
        return json.dumps({
            "success": False,
            "query": query,
            "error": str(e)
        }, indent=2)
1074 |
@mcp.tool()
async def check_ai_script_hallucinations(ctx: Context, script_path: str) -> str:
    """
    Check an AI-generated Python script for hallucinations using the knowledge graph.

    This tool analyzes a Python script for potential AI hallucinations by validating
    imports, method calls, class instantiations, and function calls against a Neo4j
    knowledge graph containing real repository data.

    The tool performs comprehensive analysis including:
    - Import validation against known repositories
    - Method call validation on classes from the knowledge graph
    - Class instantiation parameter validation
    - Function call parameter validation
    - Attribute access validation

    Requires USE_KNOWLEDGE_GRAPH=true and a configured Neo4j-backed knowledge
    validator in the server's lifespan context.

    Args:
        ctx: The MCP server provided context
        script_path: Absolute path to the Python script to analyze

    Returns:
        JSON string with hallucination detection results, confidence scores, and recommendations
    """
    try:
        # Check if knowledge graph functionality is enabled
        knowledge_graph_enabled = os.getenv("USE_KNOWLEDGE_GRAPH", "false") == "true"
        if not knowledge_graph_enabled:
            return json.dumps({
                "success": False,
                "error": "Knowledge graph functionality is disabled. Set USE_KNOWLEDGE_GRAPH=true in environment."
            }, indent=2)

        # Get the knowledge validator from context
        # (may be None if Neo4j was not reachable at startup)
        knowledge_validator = ctx.request_context.lifespan_context.knowledge_validator

        if not knowledge_validator:
            return json.dumps({
                "success": False,
                "error": "Knowledge graph validator not available. Check Neo4j configuration in environment variables."
            }, indent=2)

        # Validate script path before analysis
        # (validate_script_path returns a dict with "valid" and optional "error")
        validation = validate_script_path(script_path)
        if not validation["valid"]:
            return json.dumps({
                "success": False,
                "script_path": script_path,
                "error": validation["error"]
            }, indent=2)

        # Step 1: Analyze script structure using AST
        analyzer = AIScriptAnalyzer()
        analysis_result = analyzer.analyze_script(script_path)

        # Analysis errors are non-fatal: surface them and continue validating
        if analysis_result.errors:
            print(f"Analysis warnings for {script_path}: {analysis_result.errors}")

        # Step 2: Validate against knowledge graph
        validation_result = await knowledge_validator.validate_script(analysis_result)

        # Step 3: Generate comprehensive report
        reporter = HallucinationReporter()
        report = reporter.generate_comprehensive_report(validation_result)

        # Format response with comprehensive information.
        # The nested keys below mirror the report dict produced by
        # HallucinationReporter.generate_comprehensive_report.
        return json.dumps({
            "success": True,
            "script_path": script_path,
            "overall_confidence": validation_result.overall_confidence,
            "validation_summary": {
                "total_validations": report["validation_summary"]["total_validations"],
                "valid_count": report["validation_summary"]["valid_count"],
                "invalid_count": report["validation_summary"]["invalid_count"],
                "uncertain_count": report["validation_summary"]["uncertain_count"],
                "not_found_count": report["validation_summary"]["not_found_count"],
                "hallucination_rate": report["validation_summary"]["hallucination_rate"]
            },
            "hallucinations_detected": report["hallucinations_detected"],
            "recommendations": report["recommendations"],
            "analysis_metadata": {
                "total_imports": report["analysis_metadata"]["total_imports"],
                "total_classes": report["analysis_metadata"]["total_classes"],
                "total_methods": report["analysis_metadata"]["total_methods"],
                "total_attributes": report["analysis_metadata"]["total_attributes"],
                "total_functions": report["analysis_metadata"]["total_functions"]
            },
            "libraries_analyzed": report.get("libraries_analyzed", [])
        }, indent=2)

    except Exception as e:
        return json.dumps({
            "success": False,
            "script_path": script_path,
            "error": f"Analysis failed: {str(e)}"
        }, indent=2)
1170 |
@mcp.tool()
async def query_knowledge_graph(ctx: Context, command: str) -> str:
    """
    Query and explore the Neo4j knowledge graph containing repository data.

    This tool provides comprehensive access to the knowledge graph for exploring repositories,
    classes, methods, functions, and their relationships. Perfect for understanding what data
    is available for hallucination detection and debugging validation results.

    **⚠️ IMPORTANT: Always start with the `repos` command first!**
    Before using any other commands, run `repos` to see what repositories are available
    in your knowledge graph. This will help you understand what data you can explore.

    ## Available Commands:

    **Repository Commands:**
    - `repos` - **START HERE!** List all repositories in the knowledge graph
    - `explore <repo_name>` - Get detailed overview of a specific repository

    **Class Commands:**
    - `classes` - List all classes across all repositories (limited to 20)
    - `classes <repo_name>` - List classes in a specific repository
    - `class <class_name>` - Get detailed information about a specific class including methods and attributes

    **Method Commands:**
    - `method <method_name>` - Search for methods by name across all classes
    - `method <method_name> <class_name>` - Search for a method within a specific class

    **Custom Query:**
    - `query <cypher_query>` - Execute a custom Cypher query (results limited to 20 records)

    ## Knowledge Graph Schema:

    **Node Types:**
    - Repository: `(r:Repository {name: string})`
    - File: `(f:File {path: string, module_name: string})`
    - Class: `(c:Class {name: string, full_name: string})`
    - Method: `(m:Method {name: string, params_list: [string], params_detailed: [string], return_type: string, args: [string]})`
    - Function: `(func:Function {name: string, params_list: [string], params_detailed: [string], return_type: string, args: [string]})`
    - Attribute: `(a:Attribute {name: string, type: string})`

    **Relationships:**
    - `(r:Repository)-[:CONTAINS]->(f:File)`
    - `(f:File)-[:DEFINES]->(c:Class)`
    - `(c:Class)-[:HAS_METHOD]->(m:Method)`
    - `(c:Class)-[:HAS_ATTRIBUTE]->(a:Attribute)`
    - `(f:File)-[:DEFINES]->(func:Function)`

    ## Example Workflow:
    ```
    1. repos                                    # See what repositories are available
    2. explore pydantic-ai                      # Explore a specific repository
    3. classes pydantic-ai                      # List classes in that repository
    4. class Agent                              # Explore the Agent class
    5. method run_stream                        # Search for run_stream method
    6. method __init__ Agent                    # Find Agent constructor
    7. query "MATCH (c:Class)-[:HAS_METHOD]->(m:Method) WHERE m.name = 'run' RETURN c.name, m.name LIMIT 5"
    ```

    Args:
        ctx: The MCP server provided context
        command: Command string to execute (see available commands above)

    Returns:
        JSON string with query results, statistics, and metadata
    """
    try:
        # Feature flag: knowledge-graph tooling is opt-in via the environment.
        knowledge_graph_enabled = os.getenv("USE_KNOWLEDGE_GRAPH", "false") == "true"
        if not knowledge_graph_enabled:
            return json.dumps({
                "success": False,
                "error": "Knowledge graph functionality is disabled. Set USE_KNOWLEDGE_GRAPH=true in environment."
            }, indent=2)

        # Get Neo4j driver from the lifespan context (set up at server start).
        repo_extractor = ctx.request_context.lifespan_context.repo_extractor
        if not repo_extractor or not repo_extractor.driver:
            return json.dumps({
                "success": False,
                "error": "Neo4j connection not available. Check Neo4j configuration in environment variables."
            }, indent=2)

        # Parse the command string: first token selects the handler.
        command = command.strip()
        if not command:
            return json.dumps({
                "success": False,
                "command": "",
                "error": "Command cannot be empty. Available commands: repos, explore <repo>, classes [repo], class <name>, method <name> [class], query <cypher>"
            }, indent=2)

        parts = command.split()
        cmd = parts[0].lower()
        args = parts[1:] if len(parts) > 1 else []

        async with repo_extractor.driver.session() as session:
            # Route to the appropriate handler based on the command keyword.
            if cmd == "repos":
                return await _handle_repos_command(session, command)
            elif cmd == "explore":
                if not args:
                    return json.dumps({
                        "success": False,
                        "command": command,
                        "error": "Repository name required. Usage: explore <repo_name>"
                    }, indent=2)
                return await _handle_explore_command(session, command, args[0])
            elif cmd == "classes":
                # Repository filter is optional for the classes listing.
                repo_name = args[0] if args else None
                return await _handle_classes_command(session, command, repo_name)
            elif cmd == "class":
                if not args:
                    return json.dumps({
                        "success": False,
                        "command": command,
                        "error": "Class name required. Usage: class <class_name>"
                    }, indent=2)
                return await _handle_class_command(session, command, args[0])
            elif cmd == "method":
                if not args:
                    return json.dumps({
                        "success": False,
                        "command": command,
                        "error": "Method name required. Usage: method <method_name> [class_name]"
                    }, indent=2)
                method_name = args[0]
                class_name = args[1] if len(args) > 1 else None
                return await _handle_method_command(session, command, method_name, class_name)
            elif cmd == "query":
                if not args:
                    return json.dumps({
                        "success": False,
                        "command": command,
                        "error": "Cypher query required. Usage: query <cypher_query>"
                    }, indent=2)
                # BUGFIX: pass the raw remainder of the command line instead of
                # " ".join(args) -- re-joining the whitespace-split tokens
                # collapsed any run of whitespace inside quoted Cypher string
                # literals (e.g. WHERE c.name = 'foo  bar'), silently changing
                # the query the user wrote. split(None, 1) keeps it verbatim.
                cypher_query = command.split(None, 1)[1]
                return await _handle_query_command(session, command, cypher_query)
            else:
                return json.dumps({
                    "success": False,
                    "command": command,
                    "error": f"Unknown command '{cmd}'. Available commands: repos, explore <repo>, classes [repo], class <name>, method <name> [class], query <cypher>"
                }, indent=2)

    except Exception as e:
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Query execution failed: {str(e)}"
        }, indent=2)
1322 |
1323 |
async def _handle_repos_command(session, command: str) -> str:
    """Handle the 'repos' command: list every repository node by name."""
    cursor = await session.run(
        "MATCH (r:Repository) RETURN r.name as name ORDER BY r.name"
    )
    # Collect repository names in sorted order (ordering done by the query).
    names = [record['name'] async for record in cursor]

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "repositories": names
        },
        "metadata": {
            "total_results": len(names),
            "limited": False
        }
    }, indent=2)
1344 |
1345 |
async def _handle_explore_command(session, command: str, repo_name: str) -> str:
    """Handle 'explore <repo>': summarize a repository's file/class/function/method counts."""
    # Bail out early if the repository node does not exist.
    check_cursor = await session.run(
        "MATCH (r:Repository {name: $repo_name}) RETURN r.name as name",
        repo_name=repo_name,
    )
    if await check_cursor.single() is None:
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Repository '{repo_name}' not found in knowledge graph"
        }, indent=2)

    # One count query per statistic, run in a fixed order.
    count_specs = [
        ("files", 'file_count', """
        MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)
        RETURN count(f) as file_count
        """),
        ("classes", 'class_count', """
        MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
        RETURN count(DISTINCT c) as class_count
        """),
        ("functions", 'function_count', """
        MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(func:Function)
        RETURN count(DISTINCT func) as function_count
        """),
        ("methods", 'method_count', """
        MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_METHOD]->(m:Method)
        RETURN count(DISTINCT m) as method_count
        """),
    ]

    statistics = {}
    for stat_key, column, cypher in count_specs:
        cursor = await session.run(cypher, repo_name=repo_name)
        statistics[stat_key] = (await cursor.single())[column]

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "repository": repo_name,
            "statistics": statistics
        },
        "metadata": {
            "total_results": 1,
            "limited": False
        }
    }, indent=2)
1409 |
1410 |
async def _handle_classes_command(session, command: str, repo_name: str = None) -> str:
    """Handle 'classes [repo]': list classes, optionally scoped to one repository."""
    limit = 20  # cap the listing so responses stay manageable

    if repo_name:
        cypher = """
        MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
        RETURN c.name as name, c.full_name as full_name
        ORDER BY c.name
        LIMIT $limit
        """
        cursor = await session.run(cypher, repo_name=repo_name, limit=limit)
    else:
        cypher = """
        MATCH (c:Class)
        RETURN c.name as name, c.full_name as full_name
        ORDER BY c.name
        LIMIT $limit
        """
        cursor = await session.run(cypher, limit=limit)

    found_classes = [
        {'name': rec['name'], 'full_name': rec['full_name']}
        async for rec in cursor
    ]

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "classes": found_classes,
            "repository_filter": repo_name
        },
        "metadata": {
            "total_results": len(found_classes),
            # Hitting the cap implies there may be more classes than shown.
            "limited": len(found_classes) >= limit
        }
    }, indent=2)
1451 |
1452 |
async def _handle_class_command(session, command: str, class_name: str) -> str:
    """Handle 'class <name>': show one class with its methods and attributes."""
    # Resolve the class by short or fully-qualified name.
    lookup_cypher = """
    MATCH (c:Class)
    WHERE c.name = $class_name OR c.full_name = $class_name
    RETURN c.name as name, c.full_name as full_name
    LIMIT 1
    """
    cursor = await session.run(lookup_cypher, class_name=class_name)
    found = await cursor.single()

    if found is None:
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Class '{class_name}' not found in knowledge graph"
        }, indent=2)

    # Fetch the class's methods.
    methods_cypher = """
    MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
    WHERE c.name = $class_name OR c.full_name = $class_name
    RETURN m.name as name, m.params_list as params_list, m.params_detailed as params_detailed, m.return_type as return_type
    ORDER BY m.name
    """
    cursor = await session.run(methods_cypher, class_name=class_name)
    method_list = []
    async for rec in cursor:
        # Prefer rich parameter metadata; fall back to the simple list.
        method_list.append({
            'name': rec['name'],
            'parameters': rec['params_detailed'] or rec['params_list'] or [],
            'return_type': rec['return_type'] or 'Any'
        })

    # Fetch the class's attributes.
    attributes_cypher = """
    MATCH (c:Class)-[:HAS_ATTRIBUTE]->(a:Attribute)
    WHERE c.name = $class_name OR c.full_name = $class_name
    RETURN a.name as name, a.type as type
    ORDER BY a.name
    """
    cursor = await session.run(attributes_cypher, class_name=class_name)
    attribute_list = [
        {'name': rec['name'], 'type': rec['type'] or 'Any'}
        async for rec in cursor
    ]

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "class": {
                "name": found['name'],
                "full_name": found['full_name'],
                "methods": method_list,
                "attributes": attribute_list
            }
        },
        "metadata": {
            "total_results": 1,
            "methods_count": len(method_list),
            "attributes_count": len(attribute_list),
            "limited": False
        }
    }, indent=2)
1528 |
1529 |
async def _handle_method_command(session, command: str, method_name: str, class_name: str = None) -> str:
    """Handle 'method <name> [class]': find methods by name, optionally scoped to a class."""
    if class_name:
        # Exact-class search: no limit, a single class has few same-named methods.
        cypher = """
        MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
        WHERE (c.name = $class_name OR c.full_name = $class_name)
        AND m.name = $method_name
        RETURN c.name as class_name, c.full_name as class_full_name,
               m.name as method_name, m.params_list as params_list,
               m.params_detailed as params_detailed, m.return_type as return_type, m.args as args
        """
        cursor = await session.run(cypher, class_name=class_name, method_name=method_name)
    else:
        # Global search across all classes, capped at 20 matches.
        cypher = """
        MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
        WHERE m.name = $method_name
        RETURN c.name as class_name, c.full_name as class_full_name,
               m.name as method_name, m.params_list as params_list,
               m.params_detailed as params_detailed, m.return_type as return_type, m.args as args
        ORDER BY c.name
        LIMIT 20
        """
        cursor = await session.run(cypher, method_name=method_name)

    matches = []
    async for rec in cursor:
        # Prefer rich parameter metadata; fall back to the simple list.
        matches.append({
            'class_name': rec['class_name'],
            'class_full_name': rec['class_full_name'],
            'method_name': rec['method_name'],
            'parameters': rec['params_detailed'] or rec['params_list'] or [],
            'return_type': rec['return_type'] or 'Any',
            'legacy_args': rec['args'] or []
        })

    if not matches:
        scope = f" in class '{class_name}'" if class_name else ""
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Method '{method_name}'{scope} not found"
        }, indent=2)

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "methods": matches,
            "class_filter": class_name
        },
        "metadata": {
            "total_results": len(matches),
            # Only the global (unscoped) search is capped at 20.
            "limited": len(matches) >= 20 and not class_name
        }
    }, indent=2)
1586 |
1587 |
async def _handle_query_command(session, command: str, cypher_query: str) -> str:
    """Handle 'query <cypher>': run a user-supplied Cypher query, capped at 20 rows."""
    try:
        cursor = await session.run(cypher_query)

        rows = []
        async for record in cursor:
            rows.append(dict(record))
            if len(rows) >= 20:
                # Hard cap so arbitrary queries cannot overwhelm the response.
                break

        return json.dumps({
            "success": True,
            "command": command,
            "data": {
                "query": cypher_query,
                "results": rows
            },
            "metadata": {
                "total_results": len(rows),
                "limited": len(rows) >= 20
            }
        }, indent=2)

    except Exception as e:
        # Surface Cypher syntax/runtime errors to the caller instead of raising.
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Cypher query error: {str(e)}",
            "data": {
                "query": cypher_query
            }
        }, indent=2)
1624 |
1625 |
@mcp.tool()
async def parse_github_repository(ctx: Context, repo_url: str) -> str:
    """
    Parse a GitHub repository into the Neo4j knowledge graph.

    This tool clones a GitHub repository, analyzes its Python files, and stores
    the code structure (classes, methods, functions, imports) in Neo4j for use
    in hallucination detection. The tool:

    - Clones the repository to a temporary location
    - Analyzes Python files to extract code structure
    - Stores classes, methods, functions, and imports in Neo4j
    - Provides detailed statistics about the parsing results
    - Automatically handles module name detection for imports

    Args:
        ctx: The MCP server provided context
        repo_url: GitHub repository URL (e.g., 'https://github.com/user/repo.git')

    Returns:
        JSON string with parsing results, statistics, and repository information
    """
    try:
        # Feature flag: knowledge-graph tooling is opt-in via the environment.
        knowledge_graph_enabled = os.getenv("USE_KNOWLEDGE_GRAPH", "false") == "true"
        if not knowledge_graph_enabled:
            return json.dumps({
                "success": False,
                "error": "Knowledge graph functionality is disabled. Set USE_KNOWLEDGE_GRAPH=true in environment."
            }, indent=2)

        # Get the repository extractor from the lifespan context (set up at server start).
        repo_extractor = ctx.request_context.lifespan_context.repo_extractor

        if not repo_extractor:
            return json.dumps({
                "success": False,
                "error": "Repository extractor not available. Check Neo4j configuration in environment variables."
            }, indent=2)

        # Validate the URL before doing any expensive work; validation also
        # yields the canonical repository name used as the Neo4j key.
        validation = validate_github_url(repo_url)
        if not validation["valid"]:
            return json.dumps({
                "success": False,
                "repo_url": repo_url,
                "error": validation["error"]
            }, indent=2)

        repo_name = validation["repo_name"]

        # Parse the repository (this includes cloning, analysis, and Neo4j storage).
        print(f"Starting repository analysis for: {repo_name}")
        await repo_extractor.analyze_repository(repo_url)
        print(f"Repository analysis completed for: {repo_name}")

        # Query Neo4j for statistics about the just-parsed repository so the
        # response reflects what actually landed in the graph.
        async with repo_extractor.driver.session() as session:
            # Single aggregate query: counts of files, classes, methods,
            # functions, attributes, plus up to 5 sample module names.
            stats_query = """
            MATCH (r:Repository {name: $repo_name})
            OPTIONAL MATCH (r)-[:CONTAINS]->(f:File)
            OPTIONAL MATCH (f)-[:DEFINES]->(c:Class)
            OPTIONAL MATCH (c)-[:HAS_METHOD]->(m:Method)
            OPTIONAL MATCH (f)-[:DEFINES]->(func:Function)
            OPTIONAL MATCH (c)-[:HAS_ATTRIBUTE]->(a:Attribute)
            WITH r,
                 count(DISTINCT f) as files_count,
                 count(DISTINCT c) as classes_count,
                 count(DISTINCT m) as methods_count,
                 count(DISTINCT func) as functions_count,
                 count(DISTINCT a) as attributes_count

            // Get some sample module names
            OPTIONAL MATCH (r)-[:CONTAINS]->(sample_f:File)
            WITH r, files_count, classes_count, methods_count, functions_count, attributes_count,
                 collect(DISTINCT sample_f.module_name)[0..5] as sample_modules

            RETURN
                r.name as repo_name,
                files_count,
                classes_count,
                methods_count,
                functions_count,
                attributes_count,
                sample_modules
            """

            result = await session.run(stats_query, repo_name=repo_name)
            record = await result.single()

            if record:
                stats = {
                    "repository": record['repo_name'],
                    "files_processed": record['files_count'],
                    "classes_created": record['classes_count'],
                    "methods_created": record['methods_count'],
                    "functions_created": record['functions_count'],
                    "attributes_created": record['attributes_count'],
                    "sample_modules": record['sample_modules'] or []
                }
            else:
                # Analysis reported success but no Repository node exists --
                # treat as a failure so callers don't assume data is present.
                return json.dumps({
                    "success": False,
                    "repo_url": repo_url,
                    "error": f"Repository '{repo_name}' not found in database after parsing"
                }, indent=2)

            return json.dumps({
                "success": True,
                "repo_url": repo_url,
                "repo_name": repo_name,
                "message": f"Successfully parsed repository '{repo_name}' into knowledge graph",
                "statistics": stats,
                "ready_for_validation": True,
                "next_steps": [
                    "Repository is now available for hallucination detection",
                    f"Use check_ai_script_hallucinations to validate scripts against {repo_name}",
                    "The knowledge graph contains classes, methods, and functions from this repository"
                ]
            }, indent=2)

    except Exception as e:
        return json.dumps({
            "success": False,
            "repo_url": repo_url,
            "error": f"Repository parsing failed: {str(e)}"
        }, indent=2)
1754 |
async def crawl_markdown_file(crawler: AsyncWebCrawler, url: str) -> List[Dict[str, Any]]:
    """
    Crawl a single .txt or markdown file.

    Args:
        crawler: AsyncWebCrawler instance
        url: URL of the file

    Returns:
        A one-element list of {'url', 'markdown'} on success, else an empty list.
    """
    result = await crawler.arun(url=url, config=CrawlerRunConfig())

    if not (result.success and result.markdown):
        print(f"Failed to crawl {url}: {result.error_message}")
        return []
    return [{'url': url, 'markdown': result.markdown}]
1774 |
async def crawl_batch(crawler: AsyncWebCrawler, urls: List[str], max_concurrent: int = 10) -> List[Dict[str, Any]]:
    """
    Batch crawl multiple URLs in parallel.

    Args:
        crawler: AsyncWebCrawler instance
        urls: List of URLs to crawl
        max_concurrent: Maximum number of concurrent browser sessions

    Returns:
        List of {'url', 'markdown'} dictionaries for the successful crawls only.
    """
    # Dispatcher throttles parallelism based on available memory.
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,
        check_interval=1.0,
        max_session_permit=max_concurrent
    )
    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=False)

    crawled = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher)

    successful = []
    for item in crawled:
        if item.success and item.markdown:
            successful.append({'url': item.url, 'markdown': item.markdown})
    return successful
1796 |
async def crawl_recursive_internal_links(crawler: AsyncWebCrawler, start_urls: List[str], max_depth: int = 3, max_concurrent: int = 10) -> List[Dict[str, Any]]:
    """
    Recursively crawl internal links from start URLs up to a maximum depth.

    Args:
        crawler: AsyncWebCrawler instance
        start_urls: List of starting URLs
        max_depth: Maximum recursion depth
        max_concurrent: Maximum number of concurrent browser sessions

    Returns:
        List of {'url', 'markdown'} dictionaries for every successfully crawled page.
    """
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=False)
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,
        check_interval=1.0,
        max_session_permit=max_concurrent
    )

    def strip_fragment(url):
        # URLs that differ only by #fragment refer to the same page.
        return urldefrag(url)[0]

    visited = set()
    frontier = {strip_fragment(u) for u in start_urls}
    collected = []

    # Breadth-first crawl, one batch of URLs per depth level.
    for _ in range(max_depth):
        pending = [strip_fragment(u) for u in frontier if strip_fragment(u) not in visited]
        if not pending:
            break

        batch = await crawler.arun_many(urls=pending, config=run_config, dispatcher=dispatcher)
        discovered = set()

        for res in batch:
            page = strip_fragment(res.url)
            # Mark even failed pages as visited so they are not retried.
            visited.add(page)

            if res.success and res.markdown:
                collected.append({'url': res.url, 'markdown': res.markdown})
                for link in res.links.get("internal", []):
                    target = strip_fragment(link["href"])
                    if target not in visited:
                        discovered.add(target)

        frontier = discovered

    return collected
1847 |
async def main():
    """Entry point: start the MCP server over the transport configured via TRANSPORT."""
    use_sse = os.getenv("TRANSPORT", "sse") == 'sse'
    if use_sse:
        # Server-Sent Events transport (HTTP-based, the default)
        await mcp.run_sse_async()
    else:
        # stdio transport for subprocess-style embedding
        await mcp.run_stdio_async()

if __name__ == "__main__":
    asyncio.run(main())
```