This is page 7 of 7. Use http://codebase.md/chillbruhhh/crawl4ai-mcp?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .dockerignore
├── .env.example
├── .gitattributes
├── .gitignore
├── crawled_pages.sql
├── Dockerfile
├── knowledge_graphs
│ ├── ai_hallucination_detector.py
│ ├── ai_script_analyzer.py
│ ├── hallucination_reporter.py
│ ├── knowledge_graph_validator.py
│ ├── parse_repo_into_neo4j.py
│ ├── query_knowledge_graph.py
│ └── test_script.py
├── LICENSE
├── neo4j
│ └── docker-neo4j
│ ├── .github
│ │ └── ISSUE_TEMPLATE
│ │ └── bug_report.md
│ ├── .gitignore
│ ├── build-docker-image.sh
│ ├── build-utils-common-functions.sh
│ ├── COPYRIGHT
│ ├── DEVELOPMENT.md
│ ├── devenv
│ ├── devenv.local.template
│ ├── docker-image-src
│ │ ├── 2.3
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 3.0
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 3.1
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 3.2
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 3.3
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 3.4
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 3.5
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ ├── Dockerfile
│ │ │ │ └── neo4j-plugins.json
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 4.0
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ └── Dockerfile
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 4.1
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ └── Dockerfile
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 4.2
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ ├── Dockerfile
│ │ │ │ └── neo4j-plugins.json
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 4.3
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ ├── Dockerfile
│ │ │ │ └── neo4j-plugins.json
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 4.4
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ ├── Dockerfile-debian
│ │ │ │ ├── Dockerfile-ubi9
│ │ │ │ ├── neo4j-admin-report.sh
│ │ │ │ └── neo4j-plugins.json
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ ├── Dockerfile-debian
│ │ │ └── Dockerfile-ubi9
│ │ ├── 5
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ ├── Dockerfile-debian
│ │ │ │ ├── Dockerfile-ubi8
│ │ │ │ ├── Dockerfile-ubi9
│ │ │ │ ├── neo4j-admin-report.sh
│ │ │ │ └── neo4j-plugins.json
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ ├── Dockerfile-debian
│ │ │ ├── Dockerfile-ubi8
│ │ │ └── Dockerfile-ubi9
│ │ ├── calver
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ ├── Dockerfile-debian
│ │ │ │ ├── Dockerfile-ubi9
│ │ │ │ ├── neo4j-admin-report.sh
│ │ │ │ └── neo4j-plugins.json
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ ├── Dockerfile-debian
│ │ │ └── Dockerfile-ubi9
│ │ └── common
│ │ ├── semver.jq
│ │ └── utilities.sh
│ ├── generate-stub-plugin
│ │ ├── build.gradle.kts
│ │ ├── Dockerfile
│ │ ├── ExampleNeo4jPlugin.java
│ │ ├── Makefile
│ │ ├── README.md
│ │ └── settings.gradle.kts
│ ├── LICENSE
│ ├── Makefile
│ ├── pom.xml
│ ├── publish-neo4j-admin-image.sh
│ ├── publish-neo4j-admin-images.sh
│ ├── README.md
│ └── src
│ ├── main
│ │ └── resources
│ │ └── log4j.properties
│ └── test
│ ├── java
│ │ └── com
│ │ └── neo4j
│ │ └── docker
│ │ ├── coredb
│ │ │ ├── configurations
│ │ │ │ ├── Configuration.java
│ │ │ │ ├── Setting.java
│ │ │ │ ├── TestConfSettings.java
│ │ │ │ ├── TestExtendedConf.java
│ │ │ │ └── TestJVMAdditionalConfig.java
│ │ │ ├── plugins
│ │ │ │ ├── Neo4jPluginEnv.java
│ │ │ │ ├── StubPluginHelper.java
│ │ │ │ ├── TestBundledPluginInstallation.java
│ │ │ │ ├── TestPluginInstallation.java
│ │ │ │ └── TestSemVerPluginMatching.java
│ │ │ ├── TestAdminReport.java
│ │ │ ├── TestAuthentication.java
│ │ │ ├── TestBasic.java
│ │ │ ├── TestCausalCluster.java
│ │ │ ├── TestMounting.java
│ │ │ └── TestUpgrade.java
│ │ ├── neo4jadmin
│ │ │ ├── TestAdminBasic.java
│ │ │ ├── TestBackupRestore.java
│ │ │ ├── TestBackupRestore44.java
│ │ │ ├── TestDumpLoad.java
│ │ │ ├── TestDumpLoad44.java
│ │ │ └── TestReport.java
│ │ ├── TestDeprecationWarning.java
│ │ ├── TestDockerComposeSecrets.java
│ │ └── utils
│ │ ├── DatabaseIO.java
│ │ ├── HostFileHttpHandler.java
│ │ ├── HttpServerTestExtension.java
│ │ ├── Neo4jVersion.java
│ │ ├── Neo4jVersionTest.java
│ │ ├── Network.java
│ │ ├── SetContainerUser.java
│ │ ├── TemporaryFolderManager.java
│ │ ├── TemporaryFolderManagerTest.java
│ │ ├── TestSettings.java
│ │ └── WaitStrategies.java
│ └── resources
│ ├── causal-cluster-compose.yml
│ ├── confs
│ │ ├── before50
│ │ │ ├── ConfsNotOverridden.conf
│ │ │ ├── ConfsReplaced.conf
│ │ │ ├── EnterpriseOnlyNotOverwritten.conf
│ │ │ ├── EnvVarsOverride.conf
│ │ │ ├── ExtendedConf.conf
│ │ │ ├── InvalidExtendedConf.conf
│ │ │ ├── JvmAdditionalNotOverridden.conf
│ │ │ ├── NoNewline.conf
│ │ │ └── ReadConf.conf
│ │ ├── ConfsNotOverridden.conf
│ │ ├── ConfsReplaced.conf
│ │ ├── EnterpriseOnlyNotOverwritten.conf
│ │ ├── EnvVarsOverride.conf
│ │ ├── ExtendedConf.conf
│ │ ├── InvalidExtendedConf.conf
│ │ ├── JvmAdditionalNotOverridden.conf
│ │ ├── NoNewline.conf
│ │ └── ReadConf.conf
│ ├── dockersecrets
│ │ ├── container-compose-with-incorrect-secrets.yml
│ │ ├── container-compose-with-secrets-override.yml
│ │ ├── container-compose-with-secrets.yml
│ │ ├── simple-container-compose-with-external-file-var.yml
│ │ └── simple-container-compose.yml
│ ├── ha-cluster-compose.yml
│ └── stubplugin
│ └── myPlugin.jar
├── pyproject.toml
├── README.md
├── src
│ ├── crawl4ai_mcp.py
│ └── utils.py
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/knowledge_graphs/knowledge_graph_validator.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Knowledge Graph Validator
3 |
4 | Validates AI-generated code against Neo4j knowledge graph containing
5 | repository information. Checks imports, methods, attributes, and parameters.
6 | """
7 |
8 | import asyncio
9 | import logging
10 | from typing import Dict, List, Optional, Set, Tuple, Any
11 | from dataclasses import dataclass, field
12 | from enum import Enum
13 | from neo4j import AsyncGraphDatabase
14 |
15 | from ai_script_analyzer import (
16 | AnalysisResult, ImportInfo, MethodCall, AttributeAccess,
17 | FunctionCall, ClassInstantiation
18 | )
19 |
20 | logger = logging.getLogger(__name__)
21 |
22 |
class ValidationStatus(Enum):
    """Outcome categories for a single validation check."""
    VALID = "VALID"          # confirmed against the knowledge graph
    INVALID = "INVALID"      # contradicts the knowledge graph (likely hallucination)
    UNCERTAIN = "UNCERTAIN"  # cannot be confirmed either way (e.g. external library)
    NOT_FOUND = "NOT_FOUND"  # element was searched for but is absent from the graph
28 |
29 |
@dataclass
class ValidationResult:
    """Result of validating a single element"""
    status: ValidationStatus
    confidence: float  # 0.0 to 1.0; higher means more certain of the status
    message: str  # human-readable explanation of the outcome
    details: Dict[str, Any] = field(default_factory=dict)  # extra context (e.g. matched files)
    suggestions: List[str] = field(default_factory=list)  # e.g. similar method names
38 |
39 |
@dataclass
class ImportValidation:
    """Validation result for an import"""
    import_info: ImportInfo  # the import statement as parsed by the analyzer
    validation: ValidationResult
    available_classes: List[str] = field(default_factory=list)  # classes exported by the matched module
    available_functions: List[str] = field(default_factory=list)  # functions exported by the matched module
47 |
48 |
@dataclass
class MethodValidation:
    """Validation result for a method call"""
    method_call: MethodCall
    validation: ValidationResult
    expected_params: List[str] = field(default_factory=list)  # signature recorded in the graph
    actual_params: List[str] = field(default_factory=list)  # positional args + keyword names used
    parameter_validation: Optional[ValidationResult] = None  # None when parameters were not checked
57 |
58 |
@dataclass
class AttributeValidation:
    """Validation result for attribute access"""
    attribute_access: AttributeAccess
    validation: ValidationResult
    expected_type: Optional[str] = None  # declared type of the attribute, or "method" for decorator-style access
65 |
66 |
@dataclass
class FunctionValidation:
    """Validation result for function call"""
    function_call: FunctionCall
    validation: ValidationResult
    expected_params: List[str] = field(default_factory=list)  # signature recorded in the graph
    actual_params: List[str] = field(default_factory=list)  # positional args + keyword names used
    parameter_validation: Optional[ValidationResult] = None  # None when parameters were not checked
75 |
76 |
@dataclass
class ClassValidation:
    """Validation result for class instantiation"""
    class_instantiation: ClassInstantiation
    validation: ValidationResult
    constructor_params: List[str] = field(default_factory=list)  # __init__ signature recorded in the graph
    parameter_validation: Optional[ValidationResult] = None  # None when constructor params were not checked
84 |
85 |
@dataclass
class ScriptValidationResult:
    """Complete validation results for a script"""
    script_path: str  # path of the analyzed script
    analysis_result: AnalysisResult  # raw AST analysis the validations are based on
    import_validations: List[ImportValidation] = field(default_factory=list)
    class_validations: List[ClassValidation] = field(default_factory=list)
    method_validations: List[MethodValidation] = field(default_factory=list)
    attribute_validations: List[AttributeValidation] = field(default_factory=list)
    function_validations: List[FunctionValidation] = field(default_factory=list)
    overall_confidence: float = 0.0  # aggregate confidence across all validations
    hallucinations_detected: List[Dict[str, Any]] = field(default_factory=list)  # summarized likely hallucinations
98 |
99 |
100 | class KnowledgeGraphValidator:
101 | """Validates code against Neo4j knowledge graph"""
102 |
    def __init__(self, neo4j_uri: str, neo4j_user: str, neo4j_password: str):
        """Store connection settings; the driver is created lazily in initialize().

        Args:
            neo4j_uri: Bolt/Neo4j URI of the knowledge-graph database.
            neo4j_user: Username for authentication.
            neo4j_password: Password for authentication.
        """
        self.neo4j_uri = neo4j_uri
        self.neo4j_user = neo4j_user
        self.neo4j_password = neo4j_password
        self.driver = None  # set by initialize(); None until then

        # Cache for performance
        self.module_cache: Dict[str, List[str]] = {}  # module name -> matched file paths
        self.class_cache: Dict[str, Dict[str, Any]] = {}
        self.method_cache: Dict[str, List[Dict[str, Any]]] = {}
        self.repo_cache: Dict[str, str] = {}  # module_name -> repo_name
        self.knowledge_graph_modules: Set[str] = set()  # Track modules in knowledge graph
115 |
116 | async def initialize(self):
117 | """Initialize Neo4j connection"""
118 | self.driver = AsyncGraphDatabase.driver(
119 | self.neo4j_uri,
120 | auth=(self.neo4j_user, self.neo4j_password)
121 | )
122 | logger.info("Knowledge graph validator initialized")
123 |
124 | async def close(self):
125 | """Close Neo4j connection"""
126 | if self.driver:
127 | await self.driver.close()
128 |
    async def validate_script(self, analysis_result: AnalysisResult) -> ScriptValidationResult:
        """Validate entire script analysis against knowledge graph.

        Runs each validation pass in a fixed order: imports must come first
        because they populate `knowledge_graph_modules`, which the later
        passes consult to decide whether an element should be validated at
        all. Finishes by computing the aggregate confidence and the list of
        detected hallucinations.

        Args:
            analysis_result: Parsed AST information for one script.

        Returns:
            A ScriptValidationResult with per-element validations filled in.
        """
        result = ScriptValidationResult(
            script_path=analysis_result.file_path,
            analysis_result=analysis_result
        )

        # Validate imports first (builds context for other validations)
        result.import_validations = await self._validate_imports(analysis_result.imports)

        # Validate class instantiations
        result.class_validations = await self._validate_class_instantiations(
            analysis_result.class_instantiations
        )

        # Validate method calls
        result.method_validations = await self._validate_method_calls(
            analysis_result.method_calls
        )

        # Validate attribute accesses
        result.attribute_validations = await self._validate_attribute_accesses(
            analysis_result.attribute_accesses
        )

        # Validate function calls
        result.function_validations = await self._validate_function_calls(
            analysis_result.function_calls
        )

        # Calculate overall confidence and detect hallucinations
        result.overall_confidence = self._calculate_overall_confidence(result)
        result.hallucinations_detected = self._detect_hallucinations(result)

        return result
164 |
165 | async def _validate_imports(self, imports: List[ImportInfo]) -> List[ImportValidation]:
166 | """Validate all imports against knowledge graph"""
167 | validations = []
168 |
169 | for import_info in imports:
170 | validation = await self._validate_single_import(import_info)
171 | validations.append(validation)
172 |
173 | return validations
174 |
175 | async def _validate_single_import(self, import_info: ImportInfo) -> ImportValidation:
176 | """Validate a single import"""
177 | # Determine module to search for
178 | search_module = import_info.module if import_info.is_from_import else import_info.name
179 |
180 | # Check cache first
181 | if search_module in self.module_cache:
182 | available_files = self.module_cache[search_module]
183 | else:
184 | # Query Neo4j for matching modules
185 | available_files = await self._find_modules(search_module)
186 | self.module_cache[search_module] = available_files
187 |
188 | if available_files:
189 | # Get available classes and functions from the module
190 | classes, functions = await self._get_module_contents(search_module)
191 |
192 | # Track this module as being in the knowledge graph
193 | self.knowledge_graph_modules.add(search_module)
194 |
195 | # Also track the base module for "from X.Y.Z import ..." patterns
196 | if '.' in search_module:
197 | base_module = search_module.split('.')[0]
198 | self.knowledge_graph_modules.add(base_module)
199 |
200 | validation = ValidationResult(
201 | status=ValidationStatus.VALID,
202 | confidence=0.9,
203 | message=f"Module '{search_module}' found in knowledge graph",
204 | details={"matched_files": available_files, "in_knowledge_graph": True}
205 | )
206 |
207 | return ImportValidation(
208 | import_info=import_info,
209 | validation=validation,
210 | available_classes=classes,
211 | available_functions=functions
212 | )
213 | else:
214 | # External library - mark as such but don't treat as error
215 | validation = ValidationResult(
216 | status=ValidationStatus.UNCERTAIN,
217 | confidence=0.8, # High confidence it's external, not an error
218 | message=f"Module '{search_module}' is external (not in knowledge graph)",
219 | details={"could_be_external": True, "in_knowledge_graph": False}
220 | )
221 |
222 | return ImportValidation(
223 | import_info=import_info,
224 | validation=validation
225 | )
226 |
227 | async def _validate_class_instantiations(self, instantiations: List[ClassInstantiation]) -> List[ClassValidation]:
228 | """Validate class instantiations"""
229 | validations = []
230 |
231 | for instantiation in instantiations:
232 | validation = await self._validate_single_class_instantiation(instantiation)
233 | validations.append(validation)
234 |
235 | return validations
236 |
    async def _validate_single_class_instantiation(self, instantiation: ClassInstantiation) -> ClassValidation:
        """Validate a single class instantiation.

        Steps, in order: (1) skip classes that were not imported from the
        knowledge graph, (2) look the class up, (3) if found, check the
        constructor arguments against the recorded __init__ signature.
        Parameter failures downgrade an otherwise-found class to INVALID.
        """
        # Prefer the fully-qualified name resolved by the analyzer, if any.
        class_name = instantiation.full_class_name or instantiation.class_name

        # Skip validation for classes not from knowledge graph
        if not self._is_from_knowledge_graph(class_name):
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.8,
                message=f"Skipping validation: '{class_name}' is not from knowledge graph"
            )
            return ClassValidation(
                class_instantiation=instantiation,
                validation=validation
            )

        # Find class in knowledge graph
        class_info = await self._find_class(class_name)

        if not class_info:
            validation = ValidationResult(
                status=ValidationStatus.NOT_FOUND,
                confidence=0.2,
                message=f"Class '{class_name}' not found in knowledge graph"
            )
            return ClassValidation(
                class_instantiation=instantiation,
                validation=validation
            )

        # Check constructor parameters (look for __init__ method)
        init_method = await self._find_method(class_name, "__init__")

        if init_method:
            param_validation = self._validate_parameters(
                expected_params=init_method.get('params_list', []),
                provided_args=instantiation.args,
                provided_kwargs=instantiation.kwargs
            )
        else:
            # No recorded __init__: cannot check args, so stay UNCERTAIN.
            param_validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.5,
                message="Constructor parameters not found"
            )

        # Use parameter validation result if it failed
        if param_validation.status == ValidationStatus.INVALID:
            validation = ValidationResult(
                status=ValidationStatus.INVALID,
                confidence=param_validation.confidence,
                message=f"Class '{class_name}' found but has invalid constructor parameters: {param_validation.message}",
                suggestions=param_validation.suggestions
            )
        else:
            validation = ValidationResult(
                status=ValidationStatus.VALID,
                confidence=0.8,
                message=f"Class '{class_name}' found in knowledge graph"
            )

        return ClassValidation(
            class_instantiation=instantiation,
            validation=validation,
            parameter_validation=param_validation
        )
303 |
304 | async def _validate_method_calls(self, method_calls: List[MethodCall]) -> List[MethodValidation]:
305 | """Validate method calls"""
306 | validations = []
307 |
308 | for method_call in method_calls:
309 | validation = await self._validate_single_method_call(method_call)
310 | validations.append(validation)
311 |
312 | return validations
313 |
    async def _validate_single_method_call(self, method_call: MethodCall) -> MethodValidation:
        """Validate a single method call.

        Steps, in order: (1) bail out UNCERTAIN when the receiver's type is
        unknown, (2) skip types not imported from the knowledge graph,
        (3) look the method up on the class — suggesting similar names when
        missing — and (4) check call arguments against the recorded
        signature. Parameter failures downgrade a found method to INVALID.
        """
        class_type = method_call.object_type

        if not class_type:
            # Analyzer could not infer the receiver type; nothing to check against.
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.3,
                message=f"Cannot determine object type for '{method_call.object_name}'"
            )
            return MethodValidation(
                method_call=method_call,
                validation=validation
            )

        # Skip validation for classes not from knowledge graph
        if not self._is_from_knowledge_graph(class_type):
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.8,
                message=f"Skipping validation: '{class_type}' is not from knowledge graph"
            )
            return MethodValidation(
                method_call=method_call,
                validation=validation
            )

        # Find method in knowledge graph
        method_info = await self._find_method(class_type, method_call.method_name)

        if not method_info:
            # Check for similar method names
            similar_methods = await self._find_similar_methods(class_type, method_call.method_name)

            validation = ValidationResult(
                status=ValidationStatus.NOT_FOUND,
                confidence=0.1,
                message=f"Method '{method_call.method_name}' not found on class '{class_type}'",
                suggestions=similar_methods
            )
            return MethodValidation(
                method_call=method_call,
                validation=validation
            )

        # Validate parameters
        expected_params = method_info.get('params_list', [])
        param_validation = self._validate_parameters(
            expected_params=expected_params,
            provided_args=method_call.args,
            provided_kwargs=method_call.kwargs
        )

        # Use parameter validation result if it failed
        if param_validation.status == ValidationStatus.INVALID:
            validation = ValidationResult(
                status=ValidationStatus.INVALID,
                confidence=param_validation.confidence,
                message=f"Method '{method_call.method_name}' found but has invalid parameters: {param_validation.message}",
                suggestions=param_validation.suggestions
            )
        else:
            validation = ValidationResult(
                status=ValidationStatus.VALID,
                confidence=0.9,
                message=f"Method '{method_call.method_name}' found on class '{class_type}'"
            )

        return MethodValidation(
            method_call=method_call,
            validation=validation,
            expected_params=expected_params,
            actual_params=method_call.args + list(method_call.kwargs.keys()),
            parameter_validation=param_validation
        )
389 |
390 | async def _validate_attribute_accesses(self, attribute_accesses: List[AttributeAccess]) -> List[AttributeValidation]:
391 | """Validate attribute accesses"""
392 | validations = []
393 |
394 | for attr_access in attribute_accesses:
395 | validation = await self._validate_single_attribute_access(attr_access)
396 | validations.append(validation)
397 |
398 | return validations
399 |
    async def _validate_single_attribute_access(self, attr_access: AttributeAccess) -> AttributeValidation:
        """Validate a single attribute access.

        Looks the name up first as an attribute, then falls back to a method
        lookup — that order matters because decorator usages like
        `@agent.tool` appear as attribute accesses but resolve to methods.
        """
        class_type = attr_access.object_type

        if not class_type:
            # Analyzer could not infer the receiver type; nothing to check against.
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.3,
                message=f"Cannot determine object type for '{attr_access.object_name}'"
            )
            return AttributeValidation(
                attribute_access=attr_access,
                validation=validation
            )

        # Skip validation for classes not from knowledge graph
        if not self._is_from_knowledge_graph(class_type):
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.8,
                message=f"Skipping validation: '{class_type}' is not from knowledge graph"
            )
            return AttributeValidation(
                attribute_access=attr_access,
                validation=validation
            )

        # Find attribute in knowledge graph
        attr_info = await self._find_attribute(class_type, attr_access.attribute_name)

        if not attr_info:
            # If not found as attribute, check if it's a method (for decorators like @agent.tool)
            method_info = await self._find_method(class_type, attr_access.attribute_name)

            if method_info:
                validation = ValidationResult(
                    status=ValidationStatus.VALID,
                    confidence=0.8,
                    message=f"'{attr_access.attribute_name}' found as method on class '{class_type}' (likely used as decorator)"
                )
                return AttributeValidation(
                    attribute_access=attr_access,
                    validation=validation,
                    expected_type="method"
                )

            validation = ValidationResult(
                status=ValidationStatus.NOT_FOUND,
                confidence=0.2,
                message=f"'{attr_access.attribute_name}' not found on class '{class_type}'"
            )
            return AttributeValidation(
                attribute_access=attr_access,
                validation=validation
            )

        validation = ValidationResult(
            status=ValidationStatus.VALID,
            confidence=0.8,
            message=f"Attribute '{attr_access.attribute_name}' found on class '{class_type}'"
        )

        return AttributeValidation(
            attribute_access=attr_access,
            validation=validation,
            expected_type=attr_info.get('type')
        )
467 |
468 | async def _validate_function_calls(self, function_calls: List[FunctionCall]) -> List[FunctionValidation]:
469 | """Validate function calls"""
470 | validations = []
471 |
472 | for func_call in function_calls:
473 | validation = await self._validate_single_function_call(func_call)
474 | validations.append(validation)
475 |
476 | return validations
477 |
    async def _validate_single_function_call(self, func_call: FunctionCall) -> FunctionValidation:
        """Validate a single function call.

        Only fully-qualified names are screened against the knowledge-graph
        module set; bare names are always looked up. Parameter failures
        downgrade a found function to INVALID.
        """
        # Prefer the fully-qualified name resolved by the analyzer, if any.
        func_name = func_call.full_name or func_call.function_name

        # Skip validation for functions not from knowledge graph
        if func_call.full_name and not self._is_from_knowledge_graph(func_call.full_name):
            validation = ValidationResult(
                status=ValidationStatus.UNCERTAIN,
                confidence=0.8,
                message=f"Skipping validation: '{func_name}' is not from knowledge graph"
            )
            return FunctionValidation(
                function_call=func_call,
                validation=validation
            )

        # Find function in knowledge graph
        func_info = await self._find_function(func_name)

        if not func_info:
            validation = ValidationResult(
                status=ValidationStatus.NOT_FOUND,
                confidence=0.2,
                message=f"Function '{func_name}' not found in knowledge graph"
            )
            return FunctionValidation(
                function_call=func_call,
                validation=validation
            )

        # Validate parameters
        expected_params = func_info.get('params_list', [])
        param_validation = self._validate_parameters(
            expected_params=expected_params,
            provided_args=func_call.args,
            provided_kwargs=func_call.kwargs
        )

        # Use parameter validation result if it failed
        if param_validation.status == ValidationStatus.INVALID:
            validation = ValidationResult(
                status=ValidationStatus.INVALID,
                confidence=param_validation.confidence,
                message=f"Function '{func_name}' found but has invalid parameters: {param_validation.message}",
                suggestions=param_validation.suggestions
            )
        else:
            validation = ValidationResult(
                status=ValidationStatus.VALID,
                confidence=0.8,
                message=f"Function '{func_name}' found in knowledge graph"
            )

        return FunctionValidation(
            function_call=func_call,
            validation=validation,
            expected_params=expected_params,
            actual_params=func_call.args + list(func_call.kwargs.keys()),
            parameter_validation=param_validation
        )
538 |
539 | def _validate_parameters(self, expected_params: List[str], provided_args: List[str],
540 | provided_kwargs: Dict[str, str]) -> ValidationResult:
541 | """Validate function/method parameters with comprehensive support"""
542 | if not expected_params:
543 | return ValidationResult(
544 | status=ValidationStatus.UNCERTAIN,
545 | confidence=0.5,
546 | message="Parameter information not available"
547 | )
548 |
549 | # Parse expected parameters - handle detailed format
550 | required_positional = []
551 | optional_positional = []
552 | keyword_only_required = []
553 | keyword_only_optional = []
554 | has_varargs = False
555 | has_varkwargs = False
556 |
557 | for param in expected_params:
558 | # Handle detailed format: "[keyword_only] name:type=default" or "name:type"
559 | param_clean = param.strip()
560 |
561 | # Check for parameter kind prefix
562 | kind = 'positional'
563 | if param_clean.startswith('['):
564 | end_bracket = param_clean.find(']')
565 | if end_bracket > 0:
566 | kind = param_clean[1:end_bracket]
567 | param_clean = param_clean[end_bracket+1:].strip()
568 |
569 | # Check for varargs/varkwargs
570 | if param_clean.startswith('*') and not param_clean.startswith('**'):
571 | has_varargs = True
572 | continue
573 | elif param_clean.startswith('**'):
574 | has_varkwargs = True
575 | continue
576 |
577 | # Parse name and check if optional
578 | if ':' in param_clean:
579 | param_name = param_clean.split(':')[0]
580 | is_optional = '=' in param_clean
581 |
582 | if kind == 'keyword_only':
583 | if is_optional:
584 | keyword_only_optional.append(param_name)
585 | else:
586 | keyword_only_required.append(param_name)
587 | else: # positional
588 | if is_optional:
589 | optional_positional.append(param_name)
590 | else:
591 | required_positional.append(param_name)
592 |
593 | # Count provided parameters
594 | provided_positional_count = len(provided_args)
595 | provided_keyword_names = set(provided_kwargs.keys())
596 |
597 | # Validate positional arguments
598 | min_required_positional = len(required_positional)
599 | max_allowed_positional = len(required_positional) + len(optional_positional)
600 |
601 | if not has_varargs and provided_positional_count > max_allowed_positional:
602 | return ValidationResult(
603 | status=ValidationStatus.INVALID,
604 | confidence=0.8,
605 | message=f"Too many positional arguments: provided {provided_positional_count}, max allowed {max_allowed_positional}"
606 | )
607 |
608 | if provided_positional_count < min_required_positional:
609 | return ValidationResult(
610 | status=ValidationStatus.INVALID,
611 | confidence=0.8,
612 | message=f"Too few positional arguments: provided {provided_positional_count}, required {min_required_positional}"
613 | )
614 |
615 | # Validate keyword arguments
616 | all_valid_kwarg_names = set(required_positional + optional_positional + keyword_only_required + keyword_only_optional)
617 | invalid_kwargs = provided_keyword_names - all_valid_kwarg_names
618 |
619 | if invalid_kwargs and not has_varkwargs:
620 | return ValidationResult(
621 | status=ValidationStatus.INVALID,
622 | confidence=0.7,
623 | message=f"Invalid keyword arguments: {list(invalid_kwargs)}",
624 | suggestions=[f"Valid parameters: {list(all_valid_kwarg_names)}"]
625 | )
626 |
627 | # Check required keyword-only arguments
628 | missing_required_kwargs = set(keyword_only_required) - provided_keyword_names
629 | if missing_required_kwargs:
630 | return ValidationResult(
631 | status=ValidationStatus.INVALID,
632 | confidence=0.8,
633 | message=f"Missing required keyword arguments: {list(missing_required_kwargs)}"
634 | )
635 |
636 | return ValidationResult(
637 | status=ValidationStatus.VALID,
638 | confidence=0.9,
639 | message="Parameters are valid"
640 | )
641 |
642 | # Neo4j Query Methods
643 |
    async def _find_modules(self, module_name: str) -> List[str]:
        """Find repository matching the module name, then return its files.

        Two-stage lookup: (1) match File.module_name (exact, dotted-prefix,
        or top-level segment), (2) fall back to fuzzy Repository.name
        matching with '-'/'_' swapped. Module-based matches are preferred;
        up to 50 file paths from the single best repository are returned,
        or [] when nothing matches.
        """
        async with self.driver.session() as session:
            # First, try to find files with module names that match or start with the search term
            module_query = """
            MATCH (r:Repository)-[:CONTAINS]->(f:File)
            WHERE f.module_name = $module_name
               OR f.module_name STARTS WITH $module_name + '.'
               OR split(f.module_name, '.')[0] = $module_name
            RETURN DISTINCT r.name as repo_name, count(f) as file_count
            ORDER BY file_count DESC
            LIMIT 5
            """

            result = await session.run(module_query, module_name=module_name)
            repos_from_modules = []
            async for record in result:
                repos_from_modules.append(record['repo_name'])

            # Also try repository name matching as fallback
            repo_query = """
            MATCH (r:Repository)
            WHERE toLower(r.name) = toLower($module_name)
               OR toLower(replace(r.name, '-', '_')) = toLower($module_name)
               OR toLower(replace(r.name, '_', '-')) = toLower($module_name)
            RETURN r.name as repo_name
            ORDER BY
                CASE
                    WHEN toLower(r.name) = toLower($module_name) THEN 1
                    WHEN toLower(replace(r.name, '-', '_')) = toLower($module_name) THEN 2
                    WHEN toLower(replace(r.name, '_', '-')) = toLower($module_name) THEN 3
                END
            LIMIT 5
            """

            result = await session.run(repo_query, module_name=module_name)
            repos_from_names = []
            async for record in result:
                repos_from_names.append(record['repo_name'])

            # Combine results, prioritizing module-based matches
            all_repos = repos_from_modules + [r for r in repos_from_names if r not in repos_from_modules]

            if not all_repos:
                return []

            # Get files from the best matching repository
            best_repo = all_repos[0]
            files_query = """
            MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)
            RETURN f.path, f.module_name
            LIMIT 50
            """

            result = await session.run(files_query, repo_name=best_repo)
            files = []
            async for record in result:
                # Un-aliased RETURN items are keyed by their expression text ("f.path").
                files.append(record['f.path'])

            return files
704 |
    async def _get_module_contents(self, module_name: str) -> Tuple[List[str], List[str]]:
        """Get classes and functions available in a repository matching the module name.

        Resolves the repository the same way as _find_modules (module-name
        match first, fuzzy repository-name match as fallback), then returns
        the distinct class names and function names defined across that
        repository's files. Returns ([], []) when no repository matches.
        """
        async with self.driver.session() as session:
            # First, try to find repository by module names in files
            module_query = """
            MATCH (r:Repository)-[:CONTAINS]->(f:File)
            WHERE f.module_name = $module_name
               OR f.module_name STARTS WITH $module_name + '.'
               OR split(f.module_name, '.')[0] = $module_name
            RETURN DISTINCT r.name as repo_name, count(f) as file_count
            ORDER BY file_count DESC
            LIMIT 1
            """

            result = await session.run(module_query, module_name=module_name)
            record = await result.single()

            if record:
                repo_name = record['repo_name']
            else:
                # Fallback to repository name matching
                repo_query = """
                MATCH (r:Repository)
                WHERE toLower(r.name) = toLower($module_name)
                   OR toLower(replace(r.name, '-', '_')) = toLower($module_name)
                   OR toLower(replace(r.name, '_', '-')) = toLower($module_name)
                RETURN r.name as repo_name
                ORDER BY
                    CASE
                        WHEN toLower(r.name) = toLower($module_name) THEN 1
                        WHEN toLower(replace(r.name, '-', '_')) = toLower($module_name) THEN 2
                        WHEN toLower(replace(r.name, '_', '-')) = toLower($module_name) THEN 3
                    END
                LIMIT 1
                """

                result = await session.run(repo_query, module_name=module_name)
                record = await result.single()

                if not record:
                    return [], []

                repo_name = record['repo_name']

            # Get classes from this repository
            class_query = """
            MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
            RETURN DISTINCT c.name as class_name
            """

            result = await session.run(class_query, repo_name=repo_name)
            classes = []
            async for record in result:
                classes.append(record['class_name'])

            # Get functions from this repository
            func_query = """
            MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(func:Function)
            RETURN DISTINCT func.name as function_name
            """

            result = await session.run(func_query, repo_name=repo_name)
            functions = []
            async for record in result:
                functions.append(record['function_name'])

            return classes, functions
772 |
    async def _find_repository_for_module(self, module_name: str) -> Optional[str]:
        """Find the repository name that matches a module name.

        Results are memoized in ``self.repo_cache`` — including misses
        (stored as None) — so repeated lookups during one validation run hit
        the database at most once per module.

        Args:
            module_name: Module name to resolve (e.g. "pydantic_ai").

        Returns:
            The matching repository name, or None when no repository matches.
        """
        if module_name in self.repo_cache:
            return self.repo_cache[module_name]

        async with self.driver.session() as session:
            # First, try to find repository by module names in files
            module_query = """
            MATCH (r:Repository)-[:CONTAINS]->(f:File)
            WHERE f.module_name = $module_name
            OR f.module_name STARTS WITH $module_name + '.'
            OR split(f.module_name, '.')[0] = $module_name
            RETURN DISTINCT r.name as repo_name, count(f) as file_count
            ORDER BY file_count DESC
            LIMIT 1
            """

            result = await session.run(module_query, module_name=module_name)
            record = await result.single()

            if record:
                repo_name = record['repo_name']
            else:
                # Fallback to repository name matching. Looser than the file
                # match above: also allows substring containment either way.
                query = """
                MATCH (r:Repository)
                WHERE toLower(r.name) = toLower($module_name)
                OR toLower(replace(r.name, '-', '_')) = toLower($module_name)
                OR toLower(replace(r.name, '_', '-')) = toLower($module_name)
                OR toLower(r.name) CONTAINS toLower($module_name)
                OR toLower($module_name) CONTAINS toLower(replace(r.name, '-', '_'))
                RETURN r.name as repo_name
                ORDER BY
                CASE
                WHEN toLower(r.name) = toLower($module_name) THEN 1
                WHEN toLower(replace(r.name, '-', '_')) = toLower($module_name) THEN 2
                ELSE 3
                END
                LIMIT 1
                """

                result = await session.run(query, module_name=module_name)
                record = await result.single()

                repo_name = record['repo_name'] if record else None

            # Cache even a miss so unknown modules are not re-queried.
            self.repo_cache[module_name] = repo_name
            return repo_name
821 |
    async def _find_class(self, class_name: str) -> Optional[Dict[str, Any]]:
        """Find class information in knowledge graph.

        Tries an exact match on ``name``/``full_name`` first; for dotted
        names (e.g. "pydantic_ai.Agent") it then resolves the module part to
        a repository and searches for the bare class name inside it.

        Args:
            class_name: Plain or dotted class name from the analyzed script.

        Returns:
            Dict with 'name' and 'full_name', or None when unknown.
        """
        async with self.driver.session() as session:
            # First try exact match
            query = """
            MATCH (c:Class)
            WHERE c.name = $class_name OR c.full_name = $class_name
            RETURN c.name as name, c.full_name as full_name
            LIMIT 1
            """

            result = await session.run(query, class_name=class_name)
            record = await result.single()

            if record:
                return {
                    'name': record['name'],
                    'full_name': record['full_name']
                }

            # If no exact match and class_name has dots, try repository-based search
            if '.' in class_name:
                parts = class_name.split('.')
                module_part = '.'.join(parts[:-1])  # e.g., "pydantic_ai"
                class_part = parts[-1]  # e.g., "Agent"

                # Find repository for the module
                repo_name = await self._find_repository_for_module(module_part)

                if repo_name:
                    # Search for class within this repository
                    repo_query = """
                    MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
                    WHERE c.name = $class_name
                    RETURN c.name as name, c.full_name as full_name
                    LIMIT 1
                    """

                    result = await session.run(repo_query, repo_name=repo_name, class_name=class_part)
                    record = await result.single()

                    if record:
                        return {
                            'name': record['name'],
                            'full_name': record['full_name']
                        }

            return None
870 |
    async def _find_method(self, class_name: str, method_name: str) -> Optional[Dict[str, Any]]:
        """Find method information for a class.

        Lookup order: exact class name/full_name match, then (for dotted
        class names) a repository-scoped search. Hits and misses are both
        memoized in ``self.method_cache`` keyed by "Class.method"; a miss is
        stored as an empty list.

        Returns:
            Dict with 'name', 'params_list', 'return_type' and 'args', or
            None when the method is unknown.
        """
        cache_key = f"{class_name}.{method_name}"
        if cache_key in self.method_cache:
            methods = self.method_cache[cache_key]
            return methods[0] if methods else None

        async with self.driver.session() as session:
            # First try exact match
            query = """
            MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
            WHERE (c.name = $class_name OR c.full_name = $class_name)
            AND m.name = $method_name
            RETURN m.name as name, m.params_list as params_list, m.params_detailed as params_detailed,
            m.return_type as return_type, m.args as args
            LIMIT 1
            """

            result = await session.run(query, class_name=class_name, method_name=method_name)
            record = await result.single()

            if record:
                # Use detailed params if available, fall back to simple params
                params_to_use = record['params_detailed'] or record['params_list'] or []

                method_info = {
                    'name': record['name'],
                    'params_list': params_to_use,
                    'return_type': record['return_type'],
                    'args': record['args'] or []
                }
                self.method_cache[cache_key] = [method_info]
                return method_info

            # If no exact match and class_name has dots, try repository-based search
            if '.' in class_name:
                parts = class_name.split('.')
                module_part = '.'.join(parts[:-1])  # e.g., "pydantic_ai"
                class_part = parts[-1]  # e.g., "Agent"

                # Find repository for the module
                repo_name = await self._find_repository_for_module(module_part)

                if repo_name:
                    # Search for method within this repository's classes
                    repo_query = """
                    MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_METHOD]->(m:Method)
                    WHERE c.name = $class_name AND m.name = $method_name
                    RETURN m.name as name, m.params_list as params_list, m.params_detailed as params_detailed,
                    m.return_type as return_type, m.args as args
                    LIMIT 1
                    """

                    result = await session.run(repo_query, repo_name=repo_name, class_name=class_part, method_name=method_name)
                    record = await result.single()

                    if record:
                        # Use detailed params if available, fall back to simple params
                        params_to_use = record['params_detailed'] or record['params_list'] or []

                        method_info = {
                            'name': record['name'],
                            'params_list': params_to_use,
                            'return_type': record['return_type'],
                            'args': record['args'] or []
                        }
                        self.method_cache[cache_key] = [method_info]
                        return method_info

            # Negative-cache the miss so repeated calls skip the database.
            self.method_cache[cache_key] = []
            return None
942 |
    async def _find_attribute(self, class_name: str, attr_name: str) -> Optional[Dict[str, Any]]:
        """Find attribute information for a class.

        Same two-step strategy as ``_find_method``: exact class match first,
        then a repository-scoped search for dotted class names. No caching.

        Returns:
            Dict with 'name' and 'type', or None when the attribute is unknown.
        """
        async with self.driver.session() as session:
            # First try exact match
            query = """
            MATCH (c:Class)-[:HAS_ATTRIBUTE]->(a:Attribute)
            WHERE (c.name = $class_name OR c.full_name = $class_name)
            AND a.name = $attr_name
            RETURN a.name as name, a.type as type
            LIMIT 1
            """

            result = await session.run(query, class_name=class_name, attr_name=attr_name)
            record = await result.single()

            if record:
                return {
                    'name': record['name'],
                    'type': record['type']
                }

            # If no exact match and class_name has dots, try repository-based search
            if '.' in class_name:
                parts = class_name.split('.')
                module_part = '.'.join(parts[:-1])  # e.g., "pydantic_ai"
                class_part = parts[-1]  # e.g., "Agent"

                # Find repository for the module
                repo_name = await self._find_repository_for_module(module_part)

                if repo_name:
                    # Search for attribute within this repository's classes
                    repo_query = """
                    MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_ATTRIBUTE]->(a:Attribute)
                    WHERE c.name = $class_name AND a.name = $attr_name
                    RETURN a.name as name, a.type as type
                    LIMIT 1
                    """

                    result = await session.run(repo_query, repo_name=repo_name, class_name=class_part, attr_name=attr_name)
                    record = await result.single()

                    if record:
                        return {
                            'name': record['name'],
                            'type': record['type']
                        }

            return None
992 |
    async def _find_function(self, func_name: str) -> Optional[Dict[str, Any]]:
        """Find function information.

        Exact match on ``name``/``full_name`` first; for dotted names the
        module part is resolved to a repository and the bare function name is
        searched within it.

        Returns:
            Dict with 'name', 'params_list', 'return_type' and 'args', or
            None when the function is unknown.
        """
        async with self.driver.session() as session:
            # First try exact match
            query = """
            MATCH (f:Function)
            WHERE f.name = $func_name OR f.full_name = $func_name
            RETURN f.name as name, f.params_list as params_list, f.params_detailed as params_detailed,
            f.return_type as return_type, f.args as args
            LIMIT 1
            """

            result = await session.run(query, func_name=func_name)
            record = await result.single()

            if record:
                # Use detailed params if available, fall back to simple params
                params_to_use = record['params_detailed'] or record['params_list'] or []

                return {
                    'name': record['name'],
                    'params_list': params_to_use,
                    'return_type': record['return_type'],
                    'args': record['args'] or []
                }

            # If no exact match and func_name has dots, try repository-based search
            if '.' in func_name:
                parts = func_name.split('.')
                module_part = '.'.join(parts[:-1])  # e.g., "pydantic_ai"
                func_part = parts[-1]  # e.g., "some_function"

                # Find repository for the module
                repo_name = await self._find_repository_for_module(module_part)

                if repo_name:
                    # Search for function within this repository
                    repo_query = """
                    MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(func:Function)
                    WHERE func.name = $func_name
                    RETURN func.name as name, func.params_list as params_list, func.params_detailed as params_detailed,
                    func.return_type as return_type, func.args as args
                    LIMIT 1
                    """

                    result = await session.run(repo_query, repo_name=repo_name, func_name=func_part)
                    record = await result.single()

                    if record:
                        # Use detailed params if available, fall back to simple params
                        params_to_use = record['params_detailed'] or record['params_list'] or []

                        return {
                            'name': record['name'],
                            'params_list': params_to_use,
                            'return_type': record['return_type'],
                            'args': record['args'] or []
                        }

            return None
1053 |
    async def _find_pydantic_ai_result_method(self, method_name: str) -> Optional[Dict[str, Any]]:
        """Find method information for pydantic_ai result objects.

        Special-case lookup for methods called on values returned by agent
        runs: searches classes in the hard-coded "pydantic_ai" repository
        whose names contain Result/Stream/Run.

        Returns:
            Method info dict (including the 'source_class' it was found on),
            or None.
        """
        # Look for methods on pydantic_ai classes that could be result objects
        async with self.driver.session() as session:
            # Search for common result methods in pydantic_ai repository
            query = """
            MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_METHOD]->(m:Method)
            WHERE m.name = $method_name
            AND (c.name CONTAINS 'Result' OR c.name CONTAINS 'Stream' OR c.name CONTAINS 'Run')
            RETURN m.name as name, m.params_list as params_list, m.params_detailed as params_detailed,
            m.return_type as return_type, m.args as args, c.name as class_name
            LIMIT 1
            """

            result = await session.run(query, repo_name="pydantic_ai", method_name=method_name)
            record = await result.single()

            if record:
                # Use detailed params if available, fall back to simple params
                params_to_use = record['params_detailed'] or record['params_list'] or []

                return {
                    'name': record['name'],
                    'params_list': params_to_use,
                    'return_type': record['return_type'],
                    'args': record['args'] or [],
                    'source_class': record['class_name']
                }

            return None
1084 |
1085 | async def _find_similar_modules(self, module_name: str) -> List[str]:
1086 | """Find similar repository names for suggestions"""
1087 | async with self.driver.session() as session:
1088 | query = """
1089 | MATCH (r:Repository)
1090 | WHERE toLower(r.name) CONTAINS toLower($partial_name)
1091 | OR toLower(replace(r.name, '-', '_')) CONTAINS toLower($partial_name)
1092 | OR toLower(replace(r.name, '_', '-')) CONTAINS toLower($partial_name)
1093 | RETURN r.name
1094 | LIMIT 5
1095 | """
1096 |
1097 | result = await session.run(query, partial_name=module_name[:3])
1098 | suggestions = []
1099 | async for record in result:
1100 | suggestions.append(record['name'])
1101 |
1102 | return suggestions
1103 |
    async def _find_similar_methods(self, class_name: str, method_name: str) -> List[str]:
        """Find similar method names for suggestions.

        Uses the first three characters of *method_name* as a substring probe
        against the class's methods — exact class match first, then the
        repository-scoped fallback for dotted class names.

        Returns:
            Up to five candidate method names (possibly empty).
        """
        async with self.driver.session() as session:
            # First try exact class match
            query = """
            MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
            WHERE (c.name = $class_name OR c.full_name = $class_name)
            AND m.name CONTAINS $partial_name
            RETURN m.name as name
            LIMIT 5
            """

            result = await session.run(query, class_name=class_name, partial_name=method_name[:3])
            suggestions = []
            async for record in result:
                suggestions.append(record['name'])

            # If no suggestions and class_name has dots, try repository-based search
            if not suggestions and '.' in class_name:
                parts = class_name.split('.')
                module_part = '.'.join(parts[:-1])  # e.g., "pydantic_ai"
                class_part = parts[-1]  # e.g., "Agent"

                # Find repository for the module
                repo_name = await self._find_repository_for_module(module_part)

                if repo_name:
                    repo_query = """
                    MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_METHOD]->(m:Method)
                    WHERE c.name = $class_name AND m.name CONTAINS $partial_name
                    RETURN m.name as name
                    LIMIT 5
                    """

                    result = await session.run(repo_query, repo_name=repo_name, class_name=class_part, partial_name=method_name[:3])
                    async for record in result:
                        suggestions.append(record['name'])

            return suggestions
1143 |
1144 | def _calculate_overall_confidence(self, result: ScriptValidationResult) -> float:
1145 | """Calculate overall confidence score for the validation (knowledge graph items only)"""
1146 | kg_validations = []
1147 |
1148 | # Only count validations from knowledge graph imports
1149 | for val in result.import_validations:
1150 | if val.validation.details.get('in_knowledge_graph', False):
1151 | kg_validations.append(val.validation.confidence)
1152 |
1153 | # Only count validations from knowledge graph classes
1154 | for val in result.class_validations:
1155 | class_name = val.class_instantiation.full_class_name or val.class_instantiation.class_name
1156 | if self._is_from_knowledge_graph(class_name):
1157 | kg_validations.append(val.validation.confidence)
1158 |
1159 | # Only count validations from knowledge graph methods
1160 | for val in result.method_validations:
1161 | if val.method_call.object_type and self._is_from_knowledge_graph(val.method_call.object_type):
1162 | kg_validations.append(val.validation.confidence)
1163 |
1164 | # Only count validations from knowledge graph attributes
1165 | for val in result.attribute_validations:
1166 | if val.attribute_access.object_type and self._is_from_knowledge_graph(val.attribute_access.object_type):
1167 | kg_validations.append(val.validation.confidence)
1168 |
1169 | # Only count validations from knowledge graph functions
1170 | for val in result.function_validations:
1171 | if val.function_call.full_name and self._is_from_knowledge_graph(val.function_call.full_name):
1172 | kg_validations.append(val.validation.confidence)
1173 |
1174 | if not kg_validations:
1175 | return 1.0 # No knowledge graph items to validate = perfect confidence
1176 |
1177 | return sum(kg_validations) / len(kg_validations)
1178 |
1179 | def _is_from_knowledge_graph(self, class_type: str) -> bool:
1180 | """Check if a class type comes from a module in the knowledge graph"""
1181 | if not class_type:
1182 | return False
1183 |
1184 | # For dotted names like "pydantic_ai.Agent" or "pydantic_ai.StreamedRunResult", check the base module
1185 | if '.' in class_type:
1186 | base_module = class_type.split('.')[0]
1187 | # Exact match only - "pydantic" should not match "pydantic_ai"
1188 | return base_module in self.knowledge_graph_modules
1189 |
1190 | # For simple names, check if any knowledge graph module matches exactly
1191 | # Don't use substring matching to avoid "pydantic" matching "pydantic_ai"
1192 | return class_type in self.knowledge_graph_modules
1193 |
1194 | def _detect_hallucinations(self, result: ScriptValidationResult) -> List[Dict[str, Any]]:
1195 | """Detect and categorize hallucinations"""
1196 | hallucinations = []
1197 | reported_items = set() # Track reported items to avoid duplicates
1198 |
1199 | # Check method calls (only for knowledge graph classes)
1200 | for val in result.method_validations:
1201 | if (val.validation.status == ValidationStatus.NOT_FOUND and
1202 | val.method_call.object_type and
1203 | self._is_from_knowledge_graph(val.method_call.object_type)):
1204 |
1205 | # Create unique key to avoid duplicates
1206 | key = (val.method_call.line_number, val.method_call.method_name, val.method_call.object_type)
1207 | if key not in reported_items:
1208 | reported_items.add(key)
1209 | hallucinations.append({
1210 | 'type': 'METHOD_NOT_FOUND',
1211 | 'location': f"line {val.method_call.line_number}",
1212 | 'description': f"Method '{val.method_call.method_name}' not found on class '{val.method_call.object_type}'",
1213 | 'suggestion': val.validation.suggestions[0] if val.validation.suggestions else None
1214 | })
1215 |
1216 | # Check attributes (only for knowledge graph classes) - but skip if already reported as method
1217 | for val in result.attribute_validations:
1218 | if (val.validation.status == ValidationStatus.NOT_FOUND and
1219 | val.attribute_access.object_type and
1220 | self._is_from_knowledge_graph(val.attribute_access.object_type)):
1221 |
1222 | # Create unique key - if this was already reported as a method, skip it
1223 | key = (val.attribute_access.line_number, val.attribute_access.attribute_name, val.attribute_access.object_type)
1224 | if key not in reported_items:
1225 | reported_items.add(key)
1226 | hallucinations.append({
1227 | 'type': 'ATTRIBUTE_NOT_FOUND',
1228 | 'location': f"line {val.attribute_access.line_number}",
1229 | 'description': f"Attribute '{val.attribute_access.attribute_name}' not found on class '{val.attribute_access.object_type}'"
1230 | })
1231 |
1232 | # Check parameter issues (only for knowledge graph methods)
1233 | for val in result.method_validations:
1234 | if (val.parameter_validation and
1235 | val.parameter_validation.status == ValidationStatus.INVALID and
1236 | val.method_call.object_type and
1237 | self._is_from_knowledge_graph(val.method_call.object_type)):
1238 | hallucinations.append({
1239 | 'type': 'INVALID_PARAMETERS',
1240 | 'location': f"line {val.method_call.line_number}",
1241 | 'description': f"Invalid parameters for method '{val.method_call.method_name}': {val.parameter_validation.message}"
1242 | })
1243 |
1244 | return hallucinations
```
--------------------------------------------------------------------------------
/src/crawl4ai_mcp.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | MCP server for web crawling with Crawl4AI.
3 |
4 | This server provides tools to crawl websites using Crawl4AI, automatically detecting
5 | the appropriate crawl method based on URL type (sitemap, txt file, or regular webpage).
6 | Also includes AI hallucination detection and repository parsing tools using Neo4j knowledge graphs.
7 | """
8 | from mcp.server.fastmcp import FastMCP, Context
9 | from sentence_transformers import CrossEncoder
10 | from contextlib import asynccontextmanager
11 | from collections.abc import AsyncIterator
12 | from dataclasses import dataclass
13 | from typing import List, Dict, Any, Optional
14 | from urllib.parse import urlparse, urldefrag
15 | from xml.etree import ElementTree
16 | from dotenv import load_dotenv
17 | from supabase import Client
18 | from pathlib import Path
19 | import requests
20 | import asyncio
21 | import json
22 | import os
23 | import re
24 | import concurrent.futures
25 | import sys
26 |
27 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
28 |
29 | # Add knowledge_graphs folder to path for importing knowledge graph modules
30 | knowledge_graphs_path = Path(__file__).resolve().parent.parent / 'knowledge_graphs'
31 | sys.path.append(str(knowledge_graphs_path))
32 |
33 | from utils import (
34 | get_supabase_client,
35 | add_documents_to_supabase,
36 | search_documents,
37 | extract_code_blocks,
38 | generate_code_example_summary,
39 | add_code_examples_to_supabase,
40 | update_source_info,
41 | extract_source_summary,
42 | search_code_examples
43 | )
44 |
45 | # Import knowledge graph modules
46 | from knowledge_graph_validator import KnowledgeGraphValidator
47 | from parse_repo_into_neo4j import DirectNeo4jExtractor
48 | from ai_script_analyzer import AIScriptAnalyzer
49 | from hallucination_reporter import HallucinationReporter
50 |
51 | # Load environment variables from the project root .env file
52 | project_root = Path(__file__).resolve().parent.parent
53 | dotenv_path = project_root / '.env'
54 |
55 | # Force override of existing environment variables
56 | load_dotenv(dotenv_path, override=True)
57 |
58 | # Helper functions for Neo4j validation and error handling
def validate_neo4j_connection() -> bool:
    """Report whether all Neo4j connection settings are present in the environment.

    Checks NEO4J_URI, NEO4J_USER and NEO4J_PASSWORD; an unset (or empty)
    variable counts as missing.
    """
    required = ("NEO4J_URI", "NEO4J_USER", "NEO4J_PASSWORD")
    return all(os.getenv(name) for name in required)
66 |
def format_neo4j_error(error: Exception) -> str:
    """Translate a raw Neo4j exception into an actionable, user-facing message."""
    text = str(error).lower()
    # Checks are ordered from most to least specific symptom.
    if "authentication" in text or "unauthorized" in text:
        return "Neo4j authentication failed. Check NEO4J_USER and NEO4J_PASSWORD."
    if "connection" in text or "refused" in text or "timeout" in text:
        return "Cannot connect to Neo4j. Check NEO4J_URI and ensure Neo4j is running."
    if "database" in text:
        return "Neo4j database error. Check if the database exists and is accessible."
    return f"Neo4j error: {str(error)}"
78 |
def validate_script_path(script_path: str) -> Dict[str, Any]:
    """Check that *script_path* points to a readable Python source file.

    Returns:
        {"valid": True} on success, otherwise {"valid": False, "error": reason}.
    """
    if not script_path or not isinstance(script_path, str):
        return {"valid": False, "error": "Script path is required"}

    if not os.path.exists(script_path):
        return {"valid": False, "error": f"Script not found: {script_path}"}

    if not script_path.endswith('.py'):
        return {"valid": False, "error": "Only Python (.py) files are supported"}

    try:
        # Probe readability (permissions, decoding) without loading the file.
        with open(script_path, 'r', encoding='utf-8') as handle:
            handle.read(1)
    except Exception as exc:
        return {"valid": False, "error": f"Cannot read script file: {str(exc)}"}
    return {"valid": True}
97 |
def validate_github_url(repo_url: str) -> Dict[str, Any]:
    """Validate a GitHub repository URL and derive the repository name.

    Accepts https:// and git@ URLs.

    Returns:
        {"valid": True, "repo_name": name} or {"valid": False, "error": reason}.
    """
    if not repo_url or not isinstance(repo_url, str):
        return {"valid": False, "error": "Repository URL is required"}

    repo_url = repo_url.strip()

    # Basic GitHub URL validation
    if not ("github.com" in repo_url.lower() or repo_url.endswith(".git")):
        return {"valid": False, "error": "Please provide a valid GitHub repository URL"}

    # Check URL format
    if not (repo_url.startswith("https://") or repo_url.startswith("git@")):
        return {"valid": False, "error": "Repository URL must start with https:// or git@"}

    # Derive the repo name robustly: ignore a trailing slash (previously it
    # produced an empty name) and strip only a trailing ".git" suffix —
    # str.replace('.git', '') would also mangle names containing ".git"
    # elsewhere in the segment.
    last_segment = repo_url.rstrip('/').split('/')[-1]
    if last_segment.endswith('.git'):
        last_segment = last_segment[:-4]
    return {"valid": True, "repo_name": last_segment}
114 |
# Create a dataclass for our application context
@dataclass
class Crawl4AIContext:
    """Lifespan-scoped shared resources for the Crawl4AI MCP server.

    Built once in ``crawl4ai_lifespan`` and handed to every tool invocation
    through the request context. The optional fields stay None when their
    feature flag is off or initialization failed.
    """
    crawler: AsyncWebCrawler  # browser-backed crawler, entered for the server's lifetime
    supabase_client: Client  # Supabase client used for document / code-example storage
    reranking_model: Optional[CrossEncoder] = None  # loaded only when USE_RERANKING=true
    knowledge_validator: Optional[Any] = None  # KnowledgeGraphValidator when available
    repo_extractor: Optional[Any] = None  # DirectNeo4jExtractor when available
124 |
@asynccontextmanager
async def crawl4ai_lifespan(server: FastMCP) -> AsyncIterator[Crawl4AIContext]:
    """
    Manages the Crawl4AI client lifecycle.

    Startup: opens the browser-backed crawler and the Supabase client, then
    conditionally loads the reranking model (USE_RERANKING=true) and the
    Neo4j knowledge-graph components (USE_KNOWLEDGE_GRAPH=true plus NEO4J_*
    credentials). Optional components that fail to initialize are logged and
    left as None instead of aborting startup.

    Shutdown: closes the crawler first, then each Neo4j component, logging
    (not propagating) individual close errors so one failure cannot block
    the remaining cleanup.

    Args:
        server: The FastMCP server instance

    Yields:
        Crawl4AIContext: The context containing the Crawl4AI crawler and Supabase client
    """
    # Create browser configuration
    browser_config = BrowserConfig(
        headless=True,
        verbose=False
    )

    # Initialize the crawler; entered manually so it spans the whole lifespan.
    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.__aenter__()

    # Initialize Supabase client
    supabase_client = get_supabase_client()

    # Initialize cross-encoder model for reranking if enabled
    reranking_model = None
    if os.getenv("USE_RERANKING", "false") == "true":
        try:
            reranking_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
        except Exception as e:
            # Reranking is optional; results fall back to vector order.
            print(f"Failed to load reranking model: {e}")
            reranking_model = None

    # Initialize Neo4j components if configured and enabled
    knowledge_validator = None
    repo_extractor = None

    # Check if knowledge graph functionality is enabled
    knowledge_graph_enabled = os.getenv("USE_KNOWLEDGE_GRAPH", "false") == "true"

    if knowledge_graph_enabled:
        neo4j_uri = os.getenv("NEO4J_URI")
        neo4j_user = os.getenv("NEO4J_USER")
        neo4j_password = os.getenv("NEO4J_PASSWORD")

        if neo4j_uri and neo4j_user and neo4j_password:
            try:
                print("Initializing knowledge graph components...")

                # Initialize knowledge graph validator
                knowledge_validator = KnowledgeGraphValidator(neo4j_uri, neo4j_user, neo4j_password)
                await knowledge_validator.initialize()
                print("✓ Knowledge graph validator initialized")

                # Initialize repository extractor
                repo_extractor = DirectNeo4jExtractor(neo4j_uri, neo4j_user, neo4j_password)
                await repo_extractor.initialize()
                print("✓ Repository extractor initialized")

            except Exception as e:
                # Knowledge-graph tools stay disabled; the core crawl/RAG
                # tools keep working without them.
                print(f"Failed to initialize Neo4j components: {format_neo4j_error(e)}")
                knowledge_validator = None
                repo_extractor = None
        else:
            print("Neo4j credentials not configured - knowledge graph tools will be unavailable")
    else:
        print("Knowledge graph functionality disabled - set USE_KNOWLEDGE_GRAPH=true to enable")

    try:
        # Small delay to ensure all components are fully initialized
        await asyncio.sleep(0.5)
        print("✓ All components initialized - server ready for requests")

        yield Crawl4AIContext(
            crawler=crawler,
            supabase_client=supabase_client,
            reranking_model=reranking_model,
            knowledge_validator=knowledge_validator,
            repo_extractor=repo_extractor
        )
    finally:
        # Clean up all components
        await crawler.__aexit__(None, None, None)
        if knowledge_validator:
            try:
                await knowledge_validator.close()
                print("✓ Knowledge graph validator closed")
            except Exception as e:
                print(f"Error closing knowledge validator: {e}")
        if repo_extractor:
            try:
                await repo_extractor.close()
                print("✓ Repository extractor closed")
            except Exception as e:
                print(f"Error closing repository extractor: {e}")
220 |
# Initialize FastMCP server
# Host/port come from the environment so the container can be remapped.
# NOTE(review): PORT is passed as a string here — FastMCP appears to accept
# it, but confirm whether int(os.getenv("PORT", "8051")) is required.
mcp = FastMCP(
    "mcp-crawl4ai-rag",
    description="MCP server for RAG and web crawling with Crawl4AI",
    lifespan=crawl4ai_lifespan,
    host=os.getenv("HOST", "0.0.0.0"),
    port=os.getenv("PORT", "8051")
)
229 |
def rerank_results(model: CrossEncoder, query: str, results: List[Dict[str, Any]], content_key: str = "content") -> List[Dict[str, Any]]:
    """
    Rerank search results using a cross-encoder model, best match first.

    Each result gains a ``rerank_score`` key. On any scoring failure — or
    when the model or results are absent — the input list is returned
    unchanged.

    Args:
        model: The cross-encoder model to use for reranking
        query: The search query
        results: List of search results
        content_key: The key in each result dict that contains the text content

    Returns:
        Reranked list of results
    """
    if not model or not results:
        return results

    try:
        # Score every (query, document) pair in one batch.
        pairs = [[query, item.get(content_key, "")] for item in results]
        scores = model.predict(pairs)

        # Attach scores in place, then order best-first.
        for item, score in zip(results, scores):
            item["rerank_score"] = float(score)

        return sorted(results, key=lambda item: item.get("rerank_score", 0), reverse=True)
    except Exception as e:
        print(f"Error during reranking: {e}")
        return results
267 |
def is_sitemap(url: str) -> bool:
    """
    Heuristically decide whether a URL points at an XML sitemap.

    Args:
        url: URL to check

    Returns:
        True when the URL ends in ``sitemap.xml`` or its path mentions
        "sitemap" anywhere, False otherwise
    """
    if url.endswith('sitemap.xml'):
        return True
    return 'sitemap' in urlparse(url).path
279 |
def is_txt(url: str) -> bool:
    """
    Report whether a URL names a plain-text resource.

    Args:
        url: URL to check

    Returns:
        True if the URL ends with the ``.txt`` suffix, False otherwise
    """
    # Pure suffix test; query strings are not stripped first.
    suffix = '.txt'
    return url.endswith(suffix)
291 |
def parse_sitemap(sitemap_url: str) -> List[str]:
    """
    Parse a sitemap and extract URLs.

    Args:
        sitemap_url: URL of the sitemap

    Returns:
        List of URLs found in the sitemap; empty when the fetch fails,
        returns a non-200 status, or the XML cannot be parsed.
    """
    urls: List[str] = []

    try:
        # BUG FIX: a timeout keeps a dead host from hanging the whole crawl
        # request indefinitely (requests.get has no default timeout).
        resp = requests.get(sitemap_url, timeout=30)
    except requests.RequestException as e:
        # Best-effort, matching the XML error handling below.
        print(f"Error fetching sitemap: {e}")
        return urls

    if resp.status_code == 200:
        try:
            tree = ElementTree.fromstring(resp.content)
            # '{*}loc' matches <loc> elements in any (or no) XML namespace.
            urls = [loc.text for loc in tree.findall('.//{*}loc')]
        except Exception as e:
            print(f"Error parsing sitemap XML: {e}")

    return urls
313 |
def smart_chunk_markdown(text: str, chunk_size: int = 5000) -> List[str]:
    """Split text into chunks, respecting code blocks and paragraphs.

    Break points are chosen, in order of preference, at a code fence (```),
    a paragraph break, or a sentence end — but only when the break falls
    past 30% of the chunk so chunks don't become degenerately small.

    Args:
        text: Markdown text to split.
        chunk_size: Target maximum characters per chunk.

    Returns:
        List of non-empty, whitespace-trimmed chunks.
    """
    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        # Calculate end position
        end = start + chunk_size

        # If we're at the end of the text, just take what's left.
        # BUG FIX: a whitespace-only tail previously appended an empty
        # string chunk; skip it instead.
        if end >= text_length:
            tail = text[start:].strip()
            if tail:
                chunks.append(tail)
            break

        # Try to find a code block boundary first (```)
        chunk = text[start:end]
        code_block = chunk.rfind('```')
        if code_block != -1 and code_block > chunk_size * 0.3:
            end = start + code_block

        # If no code block, try to break at a paragraph
        elif '\n\n' in chunk:
            # Find the last paragraph break
            last_break = chunk.rfind('\n\n')
            if last_break > chunk_size * 0.3:  # Only break if we're past 30% of chunk_size
                end = start + last_break

        # If no paragraph break, try to break at a sentence
        elif '. ' in chunk:
            # Find the last sentence break
            last_period = chunk.rfind('. ')
            if last_period > chunk_size * 0.3:  # Only break if we're past 30% of chunk_size
                end = start + last_period + 1

        # Extract chunk and clean it up
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # Move start position for next chunk
        start = end

    return chunks
358 |
def extract_section_info(chunk: str) -> Dict[str, Any]:
    """
    Extracts headers and stats from a chunk.

    Scans the chunk for markdown ATX headers ('#', '##', ...) and joins
    them into a single '; '-separated string, alongside basic size stats.

    Args:
        chunk: Markdown chunk

    Returns:
        Dictionary with headers and stats (char_count, word_count)
    """
    found = re.findall(r'^(#+)\s+(.+)$', chunk, re.MULTILINE)
    header_str = '; '.join(f'{hashes} {title}' for hashes, title in found) if found else ''

    return {
        "headers": header_str,
        "char_count": len(chunk),
        "word_count": len(chunk.split()),
    }
377 |
def process_code_example(args):
    """
    Process a single code example to generate its summary.
    This function is designed to be used with concurrent.futures.

    Args:
        args: Tuple containing (code, context_before, context_after)

    Returns:
        The generated summary
    """
    # Unpack the tuple straight into the summarizer's positional parameters.
    return generate_code_example_summary(*args)
391 |
@mcp.tool()
async def crawl_single_page(ctx: Context, url: str) -> str:
    """
    Crawl a single web page and store its content in Supabase.

    This tool is ideal for quickly retrieving content from a specific URL without following links.
    The content is stored in Supabase for later retrieval and querying.

    Args:
        ctx: The MCP server provided context
        url: URL of the web page to crawl

    Returns:
        JSON summary of the crawling operation and storage in Supabase
    """
    # Local import keeps the fix self-contained; used for the crawl timestamp below.
    from datetime import datetime, timezone

    try:
        # Get the crawler and Supabase client from the lifespan context
        crawler = ctx.request_context.lifespan_context.crawler
        supabase_client = ctx.request_context.lifespan_context.supabase_client

        # Configure the crawl: bypass the cache so the content is always fresh
        run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=False)

        # Crawl the page
        result = await crawler.arun(url=url, config=run_config)

        if result.success and result.markdown:
            # Extract source_id (domain, falling back to path for scheme-less URLs)
            parsed_url = urlparse(url)
            source_id = parsed_url.netloc or parsed_url.path

            # Chunk the content
            chunks = smart_chunk_markdown(result.markdown)

            # One timestamp shared by every chunk of this crawl.
            # (Previously this stored the coroutine name via
            # asyncio.current_task().get_coro().__name__, which is not a time.)
            crawl_time = datetime.now(timezone.utc).isoformat()

            # Prepare data for Supabase
            urls = []
            chunk_numbers = []
            contents = []
            metadatas = []
            total_word_count = 0

            for i, chunk in enumerate(chunks):
                urls.append(url)
                chunk_numbers.append(i)
                contents.append(chunk)

                # Extract metadata
                meta = extract_section_info(chunk)
                meta["chunk_index"] = i
                meta["url"] = url
                meta["source"] = source_id
                meta["crawl_time"] = crawl_time
                metadatas.append(meta)

                # Accumulate word count
                total_word_count += meta.get("word_count", 0)

            # Create url_to_full_document mapping
            url_to_full_document = {url: result.markdown}

            # Update source information FIRST (before inserting documents)
            source_summary = extract_source_summary(source_id, result.markdown[:5000])  # Use first 5000 chars for summary
            update_source_info(supabase_client, source_id, source_summary, total_word_count)

            # Add documentation chunks to Supabase (AFTER source exists)
            add_documents_to_supabase(supabase_client, urls, chunk_numbers, contents, metadatas, url_to_full_document)

            # Extract and process code examples only if enabled.
            # code_blocks is referenced in the summary below, so it must exist
            # even when USE_AGENTIC_RAG is off (previously a NameError that made
            # a successful crawl report failure).
            code_blocks = []
            extract_code_examples = os.getenv("USE_AGENTIC_RAG", "false") == "true"
            if extract_code_examples:
                code_blocks = extract_code_blocks(result.markdown)
                if code_blocks:
                    code_urls = []
                    code_chunk_numbers = []
                    code_examples = []
                    code_summaries = []
                    code_metadatas = []

                    # Process code examples in parallel
                    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
                        # Prepare arguments for parallel processing
                        summary_args = [(block['code'], block['context_before'], block['context_after'])
                                        for block in code_blocks]

                        # Generate summaries in parallel
                        summaries = list(executor.map(process_code_example, summary_args))

                    # Prepare code example data
                    for i, (block, summary) in enumerate(zip(code_blocks, summaries)):
                        code_urls.append(url)
                        code_chunk_numbers.append(i)
                        code_examples.append(block['code'])
                        code_summaries.append(summary)

                        # Create metadata for code example
                        code_meta = {
                            "chunk_index": i,
                            "url": url,
                            "source": source_id,
                            "char_count": len(block['code']),
                            "word_count": len(block['code'].split())
                        }
                        code_metadatas.append(code_meta)

                    # Add code examples to Supabase
                    add_code_examples_to_supabase(
                        supabase_client,
                        code_urls,
                        code_chunk_numbers,
                        code_examples,
                        code_summaries,
                        code_metadatas
                    )

            return json.dumps({
                "success": True,
                "url": url,
                "chunks_stored": len(chunks),
                "code_examples_stored": len(code_blocks),
                "content_length": len(result.markdown),
                "total_word_count": total_word_count,
                "source_id": source_id,
                "links_count": {
                    "internal": len(result.links.get("internal", [])),
                    "external": len(result.links.get("external", []))
                }
            }, indent=2)
        else:
            return json.dumps({
                "success": False,
                "url": url,
                "error": result.error_message
            }, indent=2)
    except Exception as e:
        return json.dumps({
            "success": False,
            "url": url,
            "error": str(e)
        }, indent=2)
531 |
@mcp.tool()
async def smart_crawl_url(ctx: Context, url: str, max_depth: int = 3, max_concurrent: int = 10, chunk_size: int = 5000) -> str:
    """
    Intelligently crawl a URL based on its type and store content in Supabase.

    This tool automatically detects the URL type and applies the appropriate crawling method:
    - For sitemaps: Extracts and crawls all URLs in parallel
    - For text files (llms.txt): Directly retrieves the content
    - For regular webpages: Recursively crawls internal links up to the specified depth

    All crawled content is chunked and stored in Supabase for later retrieval and querying.

    Args:
        ctx: The MCP server provided context
        url: URL to crawl (can be a regular webpage, sitemap.xml, or .txt file)
        max_depth: Maximum recursion depth for regular URLs (default: 3)
        max_concurrent: Maximum number of concurrent browser sessions (default: 10)
        chunk_size: Maximum size of each content chunk in characters (default: 5000)

    Returns:
        JSON string with crawl summary and storage information
    """
    # Local import keeps the fix self-contained; used for the crawl timestamp below.
    from datetime import datetime, timezone

    try:
        # Get the crawler and Supabase client from the lifespan context
        crawler = ctx.request_context.lifespan_context.crawler
        supabase_client = ctx.request_context.lifespan_context.supabase_client

        # Determine the crawl strategy
        crawl_results = []
        crawl_type = None

        if is_txt(url):
            # For text files, use simple crawl
            crawl_results = await crawl_markdown_file(crawler, url)
            crawl_type = "text_file"
        elif is_sitemap(url):
            # For sitemaps, extract URLs and crawl in parallel
            sitemap_urls = parse_sitemap(url)
            if not sitemap_urls:
                return json.dumps({
                    "success": False,
                    "url": url,
                    "error": "No URLs found in sitemap"
                }, indent=2)
            crawl_results = await crawl_batch(crawler, sitemap_urls, max_concurrent=max_concurrent)
            crawl_type = "sitemap"
        else:
            # For regular URLs, use recursive crawl
            crawl_results = await crawl_recursive_internal_links(crawler, [url], max_depth=max_depth, max_concurrent=max_concurrent)
            crawl_type = "webpage"

        if not crawl_results:
            return json.dumps({
                "success": False,
                "url": url,
                "error": "No content found"
            }, indent=2)

        # Process results and store in Supabase
        urls = []
        chunk_numbers = []
        contents = []
        metadatas = []
        chunk_count = 0

        # Track sources and their content
        source_content_map = {}
        source_word_counts = {}

        # One timestamp shared by every chunk of this crawl.
        # (Previously this stored the coroutine name via
        # asyncio.current_task().get_coro().__name__, which is not a time.)
        crawl_time = datetime.now(timezone.utc).isoformat()

        # Process documentation chunks
        for doc in crawl_results:
            source_url = doc['url']
            md = doc['markdown']
            chunks = smart_chunk_markdown(md, chunk_size=chunk_size)

            # Extract source_id (domain, falling back to path for scheme-less URLs)
            parsed_url = urlparse(source_url)
            source_id = parsed_url.netloc or parsed_url.path

            # Store content for source summary generation
            if source_id not in source_content_map:
                source_content_map[source_id] = md[:5000]  # Store first 5000 chars
                source_word_counts[source_id] = 0

            for i, chunk in enumerate(chunks):
                urls.append(source_url)
                chunk_numbers.append(i)
                contents.append(chunk)

                # Extract metadata
                meta = extract_section_info(chunk)
                meta["chunk_index"] = i
                meta["url"] = source_url
                meta["source"] = source_id
                meta["crawl_type"] = crawl_type
                meta["crawl_time"] = crawl_time
                metadatas.append(meta)

                # Accumulate word count
                source_word_counts[source_id] += meta.get("word_count", 0)

                chunk_count += 1

        # Create url_to_full_document mapping
        url_to_full_document = {}
        for doc in crawl_results:
            url_to_full_document[doc['url']] = doc['markdown']

        # Update source information for each unique source FIRST (before inserting documents)
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            source_summary_args = [(source_id, content) for source_id, content in source_content_map.items()]
            source_summaries = list(executor.map(lambda args: extract_source_summary(args[0], args[1]), source_summary_args))

        for (source_id, _), summary in zip(source_summary_args, source_summaries):
            word_count = source_word_counts.get(source_id, 0)
            update_source_info(supabase_client, source_id, summary, word_count)

        # Add documentation chunks to Supabase (AFTER sources exist)
        batch_size = 20
        add_documents_to_supabase(supabase_client, urls, chunk_numbers, contents, metadatas, url_to_full_document, batch_size=batch_size)

        # Extract and process code examples from all documents only if enabled.
        # code_examples is referenced in the summary below, so it must exist
        # even when USE_AGENTIC_RAG is off (previously a NameError that made
        # a successful crawl report failure).
        code_examples = []
        extract_code_examples_enabled = os.getenv("USE_AGENTIC_RAG", "false") == "true"
        if extract_code_examples_enabled:
            code_urls = []
            code_chunk_numbers = []
            code_summaries = []
            code_metadatas = []

            # Extract code blocks from all documents
            for doc in crawl_results:
                source_url = doc['url']
                md = doc['markdown']
                code_blocks = extract_code_blocks(md)

                if code_blocks:
                    # Process code examples in parallel
                    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
                        # Prepare arguments for parallel processing
                        summary_args = [(block['code'], block['context_before'], block['context_after'])
                                        for block in code_blocks]

                        # Generate summaries in parallel
                        summaries = list(executor.map(process_code_example, summary_args))

                    # Prepare code example data
                    parsed_url = urlparse(source_url)
                    source_id = parsed_url.netloc or parsed_url.path

                    for block, summary in zip(code_blocks, summaries):
                        code_urls.append(source_url)
                        code_chunk_numbers.append(len(code_examples))  # Use global code example index
                        code_examples.append(block['code'])
                        code_summaries.append(summary)

                        # Create metadata for code example
                        code_meta = {
                            "chunk_index": len(code_examples) - 1,
                            "url": source_url,
                            "source": source_id,
                            "char_count": len(block['code']),
                            "word_count": len(block['code'].split())
                        }
                        code_metadatas.append(code_meta)

            # Add all code examples to Supabase
            if code_examples:
                add_code_examples_to_supabase(
                    supabase_client,
                    code_urls,
                    code_chunk_numbers,
                    code_examples,
                    code_summaries,
                    code_metadatas,
                    batch_size=batch_size
                )

        return json.dumps({
            "success": True,
            "url": url,
            "crawl_type": crawl_type,
            "pages_crawled": len(crawl_results),
            "chunks_stored": chunk_count,
            "code_examples_stored": len(code_examples),
            "sources_updated": len(source_content_map),
            "urls_crawled": [doc['url'] for doc in crawl_results][:5] + (["..."] if len(crawl_results) > 5 else [])
        }, indent=2)
    except Exception as e:
        return json.dumps({
            "success": False,
            "url": url,
            "error": str(e)
        }, indent=2)
727 |
@mcp.tool()
async def get_available_sources(ctx: Context) -> str:
    """
    Get all available sources from the sources table.

    This tool returns a list of all unique sources (domains) that have been crawled and stored
    in the database, along with their summaries and statistics. This is useful for discovering
    what content is available for querying.

    Always use this tool before calling the RAG query or code example query tool
    with a specific source filter!

    Args:
        ctx: The MCP server provided context

    Returns:
        JSON string with the list of available sources and their details
    """
    try:
        # Supabase client comes from the server's lifespan context
        supabase_client = ctx.request_context.lifespan_context.supabase_client

        # Fetch every row of the sources table, ordered for stable output
        result = supabase_client.from_('sources')\
            .select('*')\
            .order('source_id')\
            .execute()

        # Project each row onto the fields we expose to the caller
        rows = result.data or []
        sources = [
            {
                "source_id": row.get("source_id"),
                "summary": row.get("summary"),
                "total_words": row.get("total_words"),
                "created_at": row.get("created_at"),
                "updated_at": row.get("updated_at"),
            }
            for row in rows
        ]

        return json.dumps({
            "success": True,
            "sources": sources,
            "count": len(sources)
        }, indent=2)
    except Exception as e:
        return json.dumps({
            "success": False,
            "error": str(e)
        }, indent=2)
778 |
@mcp.tool()
async def perform_rag_query(ctx: Context, query: str, source: str = None, match_count: int = 5) -> str:
    """
    Perform a RAG (Retrieval Augmented Generation) query on the stored content.

    This tool searches the vector database for content relevant to the query and returns
    the matching documents. Optionally filter by source domain.
    Get the source by using the get_available_sources tool before calling this search!

    When the USE_HYBRID_SEARCH environment flag is "true", results combine vector
    similarity with an ILIKE keyword search: items found by both are boosted and
    preferred, then remaining vector matches, then keyword-only matches. When the
    USE_RERANKING flag is "true" and a reranking model is loaded, the final list is
    re-ordered by a cross-encoder.

    Args:
        ctx: The MCP server provided context
        query: The search query
        source: Optional source domain to filter results (e.g., 'example.com')
        match_count: Maximum number of results to return (default: 5)

    Returns:
        JSON string with the search results
    """
    try:
        # Get the Supabase client from the context
        supabase_client = ctx.request_context.lifespan_context.supabase_client

        # Check if hybrid search is enabled
        use_hybrid_search = os.getenv("USE_HYBRID_SEARCH", "false") == "true"

        # Prepare filter if source is provided and not empty
        filter_metadata = None
        if source and source.strip():
            filter_metadata = {"source": source}

        if use_hybrid_search:
            # Hybrid search: combine vector and keyword search

            # 1. Get vector search results (get more to account for filtering)
            vector_results = search_documents(
                client=supabase_client,
                query=query,
                match_count=match_count * 2,  # Get double to have room for filtering
                filter_metadata=filter_metadata
            )

            # 2. Get keyword search results using ILIKE (case-insensitive substring)
            keyword_query = supabase_client.from_('crawled_pages')\
                .select('id, url, chunk_number, content, metadata, source_id')\
                .ilike('content', f'%{query}%')

            # Apply source filter if provided
            if source and source.strip():
                keyword_query = keyword_query.eq('source_id', source)

            # Execute keyword search
            keyword_response = keyword_query.limit(match_count * 2).execute()
            keyword_results = keyword_response.data if keyword_response.data else []

            # 3. Combine results with preference for items appearing in both.
            # Three ordered passes: both-match (boosted), vector-only, keyword-only.
            seen_ids = set()
            combined_results = []

            # First, add items that appear in both searches (these are the best matches)
            vector_ids = {r.get('id') for r in vector_results if r.get('id')}
            for kr in keyword_results:
                if kr['id'] in vector_ids and kr['id'] not in seen_ids:
                    # Find the vector result to get similarity score
                    # (linear scan; result lists are small — at most 2 * match_count)
                    for vr in vector_results:
                        if vr.get('id') == kr['id']:
                            # Boost similarity score for items in both results,
                            # capped at 1.0 so it stays a valid similarity
                            vr['similarity'] = min(1.0, vr.get('similarity', 0) * 1.2)
                            combined_results.append(vr)
                            seen_ids.add(kr['id'])
                            break

            # Then add remaining vector results (semantic matches without exact keyword)
            for vr in vector_results:
                if vr.get('id') and vr['id'] not in seen_ids and len(combined_results) < match_count:
                    combined_results.append(vr)
                    seen_ids.add(vr['id'])

            # Finally, add pure keyword matches if we still need more results
            for kr in keyword_results:
                if kr['id'] not in seen_ids and len(combined_results) < match_count:
                    # Convert keyword result to match vector result format
                    combined_results.append({
                        'id': kr['id'],
                        'url': kr['url'],
                        'chunk_number': kr['chunk_number'],
                        'content': kr['content'],
                        'metadata': kr['metadata'],
                        'source_id': kr['source_id'],
                        'similarity': 0.5  # Default similarity for keyword-only matches
                    })
                    seen_ids.add(kr['id'])

            # Use combined results, truncated to the requested count
            results = combined_results[:match_count]

        else:
            # Standard vector search only
            results = search_documents(
                client=supabase_client,
                query=query,
                match_count=match_count,
                filter_metadata=filter_metadata
            )

        # Apply reranking if enabled (requires the lifespan context's cross-encoder)
        use_reranking = os.getenv("USE_RERANKING", "false") == "true"
        if use_reranking and ctx.request_context.lifespan_context.reranking_model:
            results = rerank_results(ctx.request_context.lifespan_context.reranking_model, query, results, content_key="content")

        # Format the results for the JSON response
        formatted_results = []
        for result in results:
            formatted_result = {
                "url": result.get("url"),
                "content": result.get("content"),
                "metadata": result.get("metadata"),
                "similarity": result.get("similarity")
            }
            # Include rerank score if available
            if "rerank_score" in result:
                formatted_result["rerank_score"] = result["rerank_score"]
            formatted_results.append(formatted_result)

        return json.dumps({
            "success": True,
            "query": query,
            "source_filter": source,
            "search_mode": "hybrid" if use_hybrid_search else "vector",
            "reranking_applied": use_reranking and ctx.request_context.lifespan_context.reranking_model is not None,
            "results": formatted_results,
            "count": len(formatted_results)
        }, indent=2)
    except Exception as e:
        return json.dumps({
            "success": False,
            "query": query,
            "error": str(e)
        }, indent=2)
917 |
@mcp.tool()
async def search_code_examples(ctx: Context, query: str, source_id: str = None, match_count: int = 5) -> str:
    """
    Search for code examples relevant to the query.

    This tool searches the vector database for code examples relevant to the query and returns
    the matching examples with their summaries. Optionally filter by source_id.
    Get the source_id by using the get_available_sources tool before calling this search!

    Use the get_available_sources tool first to see what sources are available for filtering.

    Requires the USE_AGENTIC_RAG environment flag; when USE_HYBRID_SEARCH is "true",
    vector results are merged with an ILIKE keyword search over both code content and
    summaries (both-match items boosted, then vector-only, then keyword-only).

    Args:
        ctx: The MCP server provided context
        query: The search query
        source_id: Optional source ID to filter results (e.g., 'example.com')
        match_count: Maximum number of results to return (default: 5)

    Returns:
        JSON string with the search results
    """
    # Check if code example extraction is enabled; without it the table is empty
    extract_code_examples_enabled = os.getenv("USE_AGENTIC_RAG", "false") == "true"
    if not extract_code_examples_enabled:
        return json.dumps({
            "success": False,
            "error": "Code example extraction is disabled. Perform a normal RAG search."
        }, indent=2)

    try:
        # Get the Supabase client from the context
        supabase_client = ctx.request_context.lifespan_context.supabase_client

        # Check if hybrid search is enabled
        use_hybrid_search = os.getenv("USE_HYBRID_SEARCH", "false") == "true"

        # Prepare filter if source is provided and not empty
        filter_metadata = None
        if source_id and source_id.strip():
            filter_metadata = {"source": source_id}

        if use_hybrid_search:
            # Hybrid search: combine vector and keyword search

            # Import the search function from utils
            # (aliased to avoid shadowing this tool's own name)
            from utils import search_code_examples as search_code_examples_impl

            # 1. Get vector search results (get more to account for filtering)
            vector_results = search_code_examples_impl(
                client=supabase_client,
                query=query,
                match_count=match_count * 2,  # Get double to have room for filtering
                filter_metadata=filter_metadata
            )

            # 2. Get keyword search results using ILIKE on both content and summary
            keyword_query = supabase_client.from_('code_examples')\
                .select('id, url, chunk_number, content, summary, metadata, source_id')\
                .or_(f'content.ilike.%{query}%,summary.ilike.%{query}%')

            # Apply source filter if provided
            if source_id and source_id.strip():
                keyword_query = keyword_query.eq('source_id', source_id)

            # Execute keyword search
            keyword_response = keyword_query.limit(match_count * 2).execute()
            keyword_results = keyword_response.data if keyword_response.data else []

            # 3. Combine results with preference for items appearing in both.
            # Three ordered passes: both-match (boosted), vector-only, keyword-only.
            seen_ids = set()
            combined_results = []

            # First, add items that appear in both searches (these are the best matches)
            vector_ids = {r.get('id') for r in vector_results if r.get('id')}
            for kr in keyword_results:
                if kr['id'] in vector_ids and kr['id'] not in seen_ids:
                    # Find the vector result to get similarity score
                    # (linear scan; result lists are small — at most 2 * match_count)
                    for vr in vector_results:
                        if vr.get('id') == kr['id']:
                            # Boost similarity score for items in both results,
                            # capped at 1.0 so it stays a valid similarity
                            vr['similarity'] = min(1.0, vr.get('similarity', 0) * 1.2)
                            combined_results.append(vr)
                            seen_ids.add(kr['id'])
                            break

            # Then add remaining vector results (semantic matches without exact keyword)
            for vr in vector_results:
                if vr.get('id') and vr['id'] not in seen_ids and len(combined_results) < match_count:
                    combined_results.append(vr)
                    seen_ids.add(vr['id'])

            # Finally, add pure keyword matches if we still need more results
            for kr in keyword_results:
                if kr['id'] not in seen_ids and len(combined_results) < match_count:
                    # Convert keyword result to match vector result format
                    combined_results.append({
                        'id': kr['id'],
                        'url': kr['url'],
                        'chunk_number': kr['chunk_number'],
                        'content': kr['content'],
                        'summary': kr['summary'],
                        'metadata': kr['metadata'],
                        'source_id': kr['source_id'],
                        'similarity': 0.5  # Default similarity for keyword-only matches
                    })
                    seen_ids.add(kr['id'])

            # Use combined results, truncated to the requested count
            results = combined_results[:match_count]

        else:
            # Standard vector search only
            from utils import search_code_examples as search_code_examples_impl

            results = search_code_examples_impl(
                client=supabase_client,
                query=query,
                match_count=match_count,
                filter_metadata=filter_metadata
            )

        # Apply reranking if enabled (requires the lifespan context's cross-encoder)
        use_reranking = os.getenv("USE_RERANKING", "false") == "true"
        if use_reranking and ctx.request_context.lifespan_context.reranking_model:
            results = rerank_results(ctx.request_context.lifespan_context.reranking_model, query, results, content_key="content")

        # Format the results for the JSON response
        formatted_results = []
        for result in results:
            formatted_result = {
                "url": result.get("url"),
                "code": result.get("content"),
                "summary": result.get("summary"),
                "metadata": result.get("metadata"),
                "source_id": result.get("source_id"),
                "similarity": result.get("similarity")
            }
            # Include rerank score if available
            if "rerank_score" in result:
                formatted_result["rerank_score"] = result["rerank_score"]
            formatted_results.append(formatted_result)

        return json.dumps({
            "success": True,
            "query": query,
            "source_filter": source_id,
            "search_mode": "hybrid" if use_hybrid_search else "vector",
            "reranking_applied": use_reranking and ctx.request_context.lifespan_context.reranking_model is not None,
            "results": formatted_results,
            "count": len(formatted_results)
        }, indent=2)
    except Exception as e:
        return json.dumps({
            "success": False,
            "query": query,
            "error": str(e)
        }, indent=2)
1074 |
@mcp.tool()
async def check_ai_script_hallucinations(ctx: Context, script_path: str) -> str:
    """
    Check an AI-generated Python script for hallucinations using the knowledge graph.

    This tool analyzes a Python script for potential AI hallucinations by validating
    imports, method calls, class instantiations, and function calls against a Neo4j
    knowledge graph containing real repository data.

    The tool performs comprehensive analysis including:
    - Import validation against known repositories
    - Method call validation on classes from the knowledge graph
    - Class instantiation parameter validation
    - Function call parameter validation
    - Attribute access validation

    Requires USE_KNOWLEDGE_GRAPH=true and a configured Neo4j-backed knowledge
    validator in the server's lifespan context.

    Args:
        ctx: The MCP server provided context
        script_path: Absolute path to the Python script to analyze

    Returns:
        JSON string with hallucination detection results, confidence scores, and recommendations
    """
    try:
        # Check if knowledge graph functionality is enabled
        knowledge_graph_enabled = os.getenv("USE_KNOWLEDGE_GRAPH", "false") == "true"
        if not knowledge_graph_enabled:
            return json.dumps({
                "success": False,
                "error": "Knowledge graph functionality is disabled. Set USE_KNOWLEDGE_GRAPH=true in environment."
            }, indent=2)

        # Get the knowledge validator from context
        # (may be None if Neo4j was not reachable at startup)
        knowledge_validator = ctx.request_context.lifespan_context.knowledge_validator

        if not knowledge_validator:
            return json.dumps({
                "success": False,
                "error": "Knowledge graph validator not available. Check Neo4j configuration in environment variables."
            }, indent=2)

        # Validate script path before analysis
        # (validate_script_path returns a dict with "valid" and optional "error")
        validation = validate_script_path(script_path)
        if not validation["valid"]:
            return json.dumps({
                "success": False,
                "script_path": script_path,
                "error": validation["error"]
            }, indent=2)

        # Step 1: Analyze script structure using AST
        analyzer = AIScriptAnalyzer()
        analysis_result = analyzer.analyze_script(script_path)

        # Analysis errors are non-fatal: surface them and continue validating
        if analysis_result.errors:
            print(f"Analysis warnings for {script_path}: {analysis_result.errors}")

        # Step 2: Validate against knowledge graph
        validation_result = await knowledge_validator.validate_script(analysis_result)

        # Step 3: Generate comprehensive report
        reporter = HallucinationReporter()
        report = reporter.generate_comprehensive_report(validation_result)

        # Format response with comprehensive information.
        # The nested keys below mirror the report dict produced by
        # HallucinationReporter.generate_comprehensive_report.
        return json.dumps({
            "success": True,
            "script_path": script_path,
            "overall_confidence": validation_result.overall_confidence,
            "validation_summary": {
                "total_validations": report["validation_summary"]["total_validations"],
                "valid_count": report["validation_summary"]["valid_count"],
                "invalid_count": report["validation_summary"]["invalid_count"],
                "uncertain_count": report["validation_summary"]["uncertain_count"],
                "not_found_count": report["validation_summary"]["not_found_count"],
                "hallucination_rate": report["validation_summary"]["hallucination_rate"]
            },
            "hallucinations_detected": report["hallucinations_detected"],
            "recommendations": report["recommendations"],
            "analysis_metadata": {
                "total_imports": report["analysis_metadata"]["total_imports"],
                "total_classes": report["analysis_metadata"]["total_classes"],
                "total_methods": report["analysis_metadata"]["total_methods"],
                "total_attributes": report["analysis_metadata"]["total_attributes"],
                "total_functions": report["analysis_metadata"]["total_functions"]
            },
            "libraries_analyzed": report.get("libraries_analyzed", [])
        }, indent=2)

    except Exception as e:
        return json.dumps({
            "success": False,
            "script_path": script_path,
            "error": f"Analysis failed: {str(e)}"
        }, indent=2)
1170 |
@mcp.tool()
async def query_knowledge_graph(ctx: Context, command: str) -> str:
    """
    Query and explore the Neo4j knowledge graph containing repository data.

    This tool provides comprehensive access to the knowledge graph for exploring repositories,
    classes, methods, functions, and their relationships. Perfect for understanding what data
    is available for hallucination detection and debugging validation results.

    **⚠️ IMPORTANT: Always start with the `repos` command first!**
    Before using any other commands, run `repos` to see what repositories are available
    in your knowledge graph. This will help you understand what data you can explore.

    ## Available Commands:

    **Repository Commands:**
    - `repos` - **START HERE!** List all repositories in the knowledge graph
    - `explore <repo_name>` - Get detailed overview of a specific repository

    **Class Commands:**
    - `classes` - List all classes across all repositories (limited to 20)
    - `classes <repo_name>` - List classes in a specific repository
    - `class <class_name>` - Get detailed information about a specific class including methods and attributes

    **Method Commands:**
    - `method <method_name>` - Search for methods by name across all classes
    - `method <method_name> <class_name>` - Search for a method within a specific class

    **Custom Query:**
    - `query <cypher_query>` - Execute a custom Cypher query (results limited to 20 records)

    ## Knowledge Graph Schema:

    **Node Types:**
    - Repository: `(r:Repository {name: string})`
    - File: `(f:File {path: string, module_name: string})`
    - Class: `(c:Class {name: string, full_name: string})`
    - Method: `(m:Method {name: string, params_list: [string], params_detailed: [string], return_type: string, args: [string]})`
    - Function: `(func:Function {name: string, params_list: [string], params_detailed: [string], return_type: string, args: [string]})`
    - Attribute: `(a:Attribute {name: string, type: string})`

    **Relationships:**
    - `(r:Repository)-[:CONTAINS]->(f:File)`
    - `(f:File)-[:DEFINES]->(c:Class)`
    - `(c:Class)-[:HAS_METHOD]->(m:Method)`
    - `(c:Class)-[:HAS_ATTRIBUTE]->(a:Attribute)`
    - `(f:File)-[:DEFINES]->(func:Function)`

    ## Example Workflow:
    ```
    1. repos                                    # See what repositories are available
    2. explore pydantic-ai                      # Explore a specific repository
    3. classes pydantic-ai                      # List classes in that repository
    4. class Agent                              # Explore the Agent class
    5. method run_stream                        # Search for run_stream method
    6. method __init__ Agent                    # Find Agent constructor
    7. query "MATCH (c:Class)-[:HAS_METHOD]->(m:Method) WHERE m.name = 'run' RETURN c.name, m.name LIMIT 5"
    ```

    Args:
        ctx: The MCP server provided context
        command: Command string to execute (see available commands above)

    Returns:
        JSON string with query results, statistics, and metadata
    """
    try:
        # Feature flag: knowledge-graph tooling is opt-in via the environment.
        knowledge_graph_enabled = os.getenv("USE_KNOWLEDGE_GRAPH", "false") == "true"
        if not knowledge_graph_enabled:
            return json.dumps({
                "success": False,
                "error": "Knowledge graph functionality is disabled. Set USE_KNOWLEDGE_GRAPH=true in environment."
            }, indent=2)

        # Get Neo4j driver from the lifespan context (set up at server start).
        repo_extractor = ctx.request_context.lifespan_context.repo_extractor
        if not repo_extractor or not repo_extractor.driver:
            return json.dumps({
                "success": False,
                "error": "Neo4j connection not available. Check Neo4j configuration in environment variables."
            }, indent=2)

        # Parse the command string: first token selects the handler.
        command = command.strip()
        if not command:
            return json.dumps({
                "success": False,
                "command": "",
                "error": "Command cannot be empty. Available commands: repos, explore <repo>, classes [repo], class <name>, method <name> [class], query <cypher>"
            }, indent=2)

        parts = command.split()
        cmd = parts[0].lower()
        args = parts[1:] if len(parts) > 1 else []

        async with repo_extractor.driver.session() as session:
            # Route to the appropriate handler based on the command keyword.
            if cmd == "repos":
                return await _handle_repos_command(session, command)
            elif cmd == "explore":
                if not args:
                    return json.dumps({
                        "success": False,
                        "command": command,
                        "error": "Repository name required. Usage: explore <repo_name>"
                    }, indent=2)
                return await _handle_explore_command(session, command, args[0])
            elif cmd == "classes":
                # Repository filter is optional for the classes listing.
                repo_name = args[0] if args else None
                return await _handle_classes_command(session, command, repo_name)
            elif cmd == "class":
                if not args:
                    return json.dumps({
                        "success": False,
                        "command": command,
                        "error": "Class name required. Usage: class <class_name>"
                    }, indent=2)
                return await _handle_class_command(session, command, args[0])
            elif cmd == "method":
                if not args:
                    return json.dumps({
                        "success": False,
                        "command": command,
                        "error": "Method name required. Usage: method <method_name> [class_name]"
                    }, indent=2)
                method_name = args[0]
                class_name = args[1] if len(args) > 1 else None
                return await _handle_method_command(session, command, method_name, class_name)
            elif cmd == "query":
                if not args:
                    return json.dumps({
                        "success": False,
                        "command": command,
                        "error": "Cypher query required. Usage: query <cypher_query>"
                    }, indent=2)
                # BUGFIX: pass the raw remainder of the command line instead of
                # " ".join(args) -- re-joining the whitespace-split tokens
                # collapsed any run of whitespace inside quoted Cypher string
                # literals (e.g. WHERE c.name = 'foo  bar'), silently changing
                # the query the user wrote. split(None, 1) keeps it verbatim.
                cypher_query = command.split(None, 1)[1]
                return await _handle_query_command(session, command, cypher_query)
            else:
                return json.dumps({
                    "success": False,
                    "command": command,
                    "error": f"Unknown command '{cmd}'. Available commands: repos, explore <repo>, classes [repo], class <name>, method <name> [class], query <cypher>"
                }, indent=2)

    except Exception as e:
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Query execution failed: {str(e)}"
        }, indent=2)
1322 |
1323 |
async def _handle_repos_command(session, command: str) -> str:
    """Handle the 'repos' command: list every repository node by name."""
    cursor = await session.run(
        "MATCH (r:Repository) RETURN r.name as name ORDER BY r.name"
    )
    # Collect repository names in sorted order (ordering done by the query).
    names = [record['name'] async for record in cursor]

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "repositories": names
        },
        "metadata": {
            "total_results": len(names),
            "limited": False
        }
    }, indent=2)
1344 |
1345 |
async def _handle_explore_command(session, command: str, repo_name: str) -> str:
    """Handle 'explore <repo>': summarize a repository's file/class/function/method counts."""
    # Bail out early if the repository node does not exist.
    check_cursor = await session.run(
        "MATCH (r:Repository {name: $repo_name}) RETURN r.name as name",
        repo_name=repo_name,
    )
    if await check_cursor.single() is None:
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Repository '{repo_name}' not found in knowledge graph"
        }, indent=2)

    # One count query per statistic, run in a fixed order.
    count_specs = [
        ("files", 'file_count', """
        MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)
        RETURN count(f) as file_count
        """),
        ("classes", 'class_count', """
        MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
        RETURN count(DISTINCT c) as class_count
        """),
        ("functions", 'function_count', """
        MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(func:Function)
        RETURN count(DISTINCT func) as function_count
        """),
        ("methods", 'method_count', """
        MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_METHOD]->(m:Method)
        RETURN count(DISTINCT m) as method_count
        """),
    ]

    statistics = {}
    for stat_key, column, cypher in count_specs:
        cursor = await session.run(cypher, repo_name=repo_name)
        statistics[stat_key] = (await cursor.single())[column]

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "repository": repo_name,
            "statistics": statistics
        },
        "metadata": {
            "total_results": 1,
            "limited": False
        }
    }, indent=2)
1409 |
1410 |
async def _handle_classes_command(session, command: str, repo_name: str = None) -> str:
    """Handle 'classes [repo]': list classes, optionally scoped to one repository."""
    limit = 20  # cap the listing so responses stay manageable

    if repo_name:
        cypher = """
        MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
        RETURN c.name as name, c.full_name as full_name
        ORDER BY c.name
        LIMIT $limit
        """
        cursor = await session.run(cypher, repo_name=repo_name, limit=limit)
    else:
        cypher = """
        MATCH (c:Class)
        RETURN c.name as name, c.full_name as full_name
        ORDER BY c.name
        LIMIT $limit
        """
        cursor = await session.run(cypher, limit=limit)

    found_classes = [
        {'name': rec['name'], 'full_name': rec['full_name']}
        async for rec in cursor
    ]

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "classes": found_classes,
            "repository_filter": repo_name
        },
        "metadata": {
            "total_results": len(found_classes),
            # Hitting the cap implies there may be more classes than shown.
            "limited": len(found_classes) >= limit
        }
    }, indent=2)
1451 |
1452 |
async def _handle_class_command(session, command: str, class_name: str) -> str:
    """Handle 'class <name>': show one class with its methods and attributes."""
    # Resolve the class by short or fully-qualified name.
    lookup_cypher = """
    MATCH (c:Class)
    WHERE c.name = $class_name OR c.full_name = $class_name
    RETURN c.name as name, c.full_name as full_name
    LIMIT 1
    """
    cursor = await session.run(lookup_cypher, class_name=class_name)
    found = await cursor.single()

    if found is None:
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Class '{class_name}' not found in knowledge graph"
        }, indent=2)

    # Fetch the class's methods.
    methods_cypher = """
    MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
    WHERE c.name = $class_name OR c.full_name = $class_name
    RETURN m.name as name, m.params_list as params_list, m.params_detailed as params_detailed, m.return_type as return_type
    ORDER BY m.name
    """
    cursor = await session.run(methods_cypher, class_name=class_name)
    method_list = []
    async for rec in cursor:
        # Prefer rich parameter metadata; fall back to the simple list.
        method_list.append({
            'name': rec['name'],
            'parameters': rec['params_detailed'] or rec['params_list'] or [],
            'return_type': rec['return_type'] or 'Any'
        })

    # Fetch the class's attributes.
    attributes_cypher = """
    MATCH (c:Class)-[:HAS_ATTRIBUTE]->(a:Attribute)
    WHERE c.name = $class_name OR c.full_name = $class_name
    RETURN a.name as name, a.type as type
    ORDER BY a.name
    """
    cursor = await session.run(attributes_cypher, class_name=class_name)
    attribute_list = [
        {'name': rec['name'], 'type': rec['type'] or 'Any'}
        async for rec in cursor
    ]

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "class": {
                "name": found['name'],
                "full_name": found['full_name'],
                "methods": method_list,
                "attributes": attribute_list
            }
        },
        "metadata": {
            "total_results": 1,
            "methods_count": len(method_list),
            "attributes_count": len(attribute_list),
            "limited": False
        }
    }, indent=2)
1528 |
1529 |
async def _handle_method_command(session, command: str, method_name: str, class_name: str = None) -> str:
    """Handle 'method <name> [class]': find methods by name, optionally scoped to a class."""
    if class_name:
        # Exact-class search: no limit, a single class has few same-named methods.
        cypher = """
        MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
        WHERE (c.name = $class_name OR c.full_name = $class_name)
        AND m.name = $method_name
        RETURN c.name as class_name, c.full_name as class_full_name,
               m.name as method_name, m.params_list as params_list,
               m.params_detailed as params_detailed, m.return_type as return_type, m.args as args
        """
        cursor = await session.run(cypher, class_name=class_name, method_name=method_name)
    else:
        # Global search across all classes, capped at 20 matches.
        cypher = """
        MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
        WHERE m.name = $method_name
        RETURN c.name as class_name, c.full_name as class_full_name,
               m.name as method_name, m.params_list as params_list,
               m.params_detailed as params_detailed, m.return_type as return_type, m.args as args
        ORDER BY c.name
        LIMIT 20
        """
        cursor = await session.run(cypher, method_name=method_name)

    matches = []
    async for rec in cursor:
        # Prefer rich parameter metadata; fall back to the simple list.
        matches.append({
            'class_name': rec['class_name'],
            'class_full_name': rec['class_full_name'],
            'method_name': rec['method_name'],
            'parameters': rec['params_detailed'] or rec['params_list'] or [],
            'return_type': rec['return_type'] or 'Any',
            'legacy_args': rec['args'] or []
        })

    if not matches:
        scope = f" in class '{class_name}'" if class_name else ""
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Method '{method_name}'{scope} not found"
        }, indent=2)

    return json.dumps({
        "success": True,
        "command": command,
        "data": {
            "methods": matches,
            "class_filter": class_name
        },
        "metadata": {
            "total_results": len(matches),
            # Only the global (unscoped) search is capped at 20.
            "limited": len(matches) >= 20 and not class_name
        }
    }, indent=2)
1586 |
1587 |
async def _handle_query_command(session, command: str, cypher_query: str) -> str:
    """Handle 'query <cypher>': run a user-supplied Cypher query, capped at 20 rows."""
    try:
        cursor = await session.run(cypher_query)

        rows = []
        async for record in cursor:
            rows.append(dict(record))
            if len(rows) >= 20:
                # Hard cap so arbitrary queries cannot overwhelm the response.
                break

        return json.dumps({
            "success": True,
            "command": command,
            "data": {
                "query": cypher_query,
                "results": rows
            },
            "metadata": {
                "total_results": len(rows),
                "limited": len(rows) >= 20
            }
        }, indent=2)

    except Exception as e:
        # Surface Cypher syntax/runtime errors to the caller instead of raising.
        return json.dumps({
            "success": False,
            "command": command,
            "error": f"Cypher query error: {str(e)}",
            "data": {
                "query": cypher_query
            }
        }, indent=2)
1624 |
1625 |
@mcp.tool()
async def parse_github_repository(ctx: Context, repo_url: str) -> str:
    """
    Parse a GitHub repository into the Neo4j knowledge graph.

    This tool clones a GitHub repository, analyzes its Python files, and stores
    the code structure (classes, methods, functions, imports) in Neo4j for use
    in hallucination detection. The tool:

    - Clones the repository to a temporary location
    - Analyzes Python files to extract code structure
    - Stores classes, methods, functions, and imports in Neo4j
    - Provides detailed statistics about the parsing results
    - Automatically handles module name detection for imports

    Args:
        ctx: The MCP server provided context
        repo_url: GitHub repository URL (e.g., 'https://github.com/user/repo.git')

    Returns:
        JSON string with parsing results, statistics, and repository information
    """
    try:
        # Feature flag: knowledge-graph tooling is opt-in via the environment.
        knowledge_graph_enabled = os.getenv("USE_KNOWLEDGE_GRAPH", "false") == "true"
        if not knowledge_graph_enabled:
            return json.dumps({
                "success": False,
                "error": "Knowledge graph functionality is disabled. Set USE_KNOWLEDGE_GRAPH=true in environment."
            }, indent=2)

        # Get the repository extractor from the lifespan context (set up at server start).
        repo_extractor = ctx.request_context.lifespan_context.repo_extractor

        if not repo_extractor:
            return json.dumps({
                "success": False,
                "error": "Repository extractor not available. Check Neo4j configuration in environment variables."
            }, indent=2)

        # Validate the URL before doing any expensive work; validation also
        # yields the canonical repository name used as the Neo4j key.
        validation = validate_github_url(repo_url)
        if not validation["valid"]:
            return json.dumps({
                "success": False,
                "repo_url": repo_url,
                "error": validation["error"]
            }, indent=2)

        repo_name = validation["repo_name"]

        # Parse the repository (this includes cloning, analysis, and Neo4j storage).
        print(f"Starting repository analysis for: {repo_name}")
        await repo_extractor.analyze_repository(repo_url)
        print(f"Repository analysis completed for: {repo_name}")

        # Query Neo4j for statistics about the just-parsed repository so the
        # response reflects what actually landed in the graph.
        async with repo_extractor.driver.session() as session:
            # Single aggregate query: counts of files, classes, methods,
            # functions, attributes, plus up to 5 sample module names.
            stats_query = """
            MATCH (r:Repository {name: $repo_name})
            OPTIONAL MATCH (r)-[:CONTAINS]->(f:File)
            OPTIONAL MATCH (f)-[:DEFINES]->(c:Class)
            OPTIONAL MATCH (c)-[:HAS_METHOD]->(m:Method)
            OPTIONAL MATCH (f)-[:DEFINES]->(func:Function)
            OPTIONAL MATCH (c)-[:HAS_ATTRIBUTE]->(a:Attribute)
            WITH r,
                 count(DISTINCT f) as files_count,
                 count(DISTINCT c) as classes_count,
                 count(DISTINCT m) as methods_count,
                 count(DISTINCT func) as functions_count,
                 count(DISTINCT a) as attributes_count

            // Get some sample module names
            OPTIONAL MATCH (r)-[:CONTAINS]->(sample_f:File)
            WITH r, files_count, classes_count, methods_count, functions_count, attributes_count,
                 collect(DISTINCT sample_f.module_name)[0..5] as sample_modules

            RETURN
                r.name as repo_name,
                files_count,
                classes_count,
                methods_count,
                functions_count,
                attributes_count,
                sample_modules
            """

            result = await session.run(stats_query, repo_name=repo_name)
            record = await result.single()

            if record:
                stats = {
                    "repository": record['repo_name'],
                    "files_processed": record['files_count'],
                    "classes_created": record['classes_count'],
                    "methods_created": record['methods_count'],
                    "functions_created": record['functions_count'],
                    "attributes_created": record['attributes_count'],
                    "sample_modules": record['sample_modules'] or []
                }
            else:
                # Analysis reported success but no Repository node exists --
                # treat as a failure so callers don't assume data is present.
                return json.dumps({
                    "success": False,
                    "repo_url": repo_url,
                    "error": f"Repository '{repo_name}' not found in database after parsing"
                }, indent=2)

            return json.dumps({
                "success": True,
                "repo_url": repo_url,
                "repo_name": repo_name,
                "message": f"Successfully parsed repository '{repo_name}' into knowledge graph",
                "statistics": stats,
                "ready_for_validation": True,
                "next_steps": [
                    "Repository is now available for hallucination detection",
                    f"Use check_ai_script_hallucinations to validate scripts against {repo_name}",
                    "The knowledge graph contains classes, methods, and functions from this repository"
                ]
            }, indent=2)

    except Exception as e:
        return json.dumps({
            "success": False,
            "repo_url": repo_url,
            "error": f"Repository parsing failed: {str(e)}"
        }, indent=2)
1754 |
async def crawl_markdown_file(crawler: AsyncWebCrawler, url: str) -> List[Dict[str, Any]]:
    """
    Crawl a single .txt or markdown file.

    Args:
        crawler: AsyncWebCrawler instance
        url: URL of the file

    Returns:
        A one-element list of {'url', 'markdown'} on success, else an empty list.
    """
    result = await crawler.arun(url=url, config=CrawlerRunConfig())

    if not (result.success and result.markdown):
        print(f"Failed to crawl {url}: {result.error_message}")
        return []
    return [{'url': url, 'markdown': result.markdown}]
1774 |
async def crawl_batch(crawler: AsyncWebCrawler, urls: List[str], max_concurrent: int = 10) -> List[Dict[str, Any]]:
    """
    Batch crawl multiple URLs in parallel.

    Args:
        crawler: AsyncWebCrawler instance
        urls: List of URLs to crawl
        max_concurrent: Maximum number of concurrent browser sessions

    Returns:
        List of {'url', 'markdown'} dictionaries for the successful crawls only.
    """
    # Dispatcher throttles parallelism based on available memory.
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,
        check_interval=1.0,
        max_session_permit=max_concurrent
    )
    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=False)

    crawled = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher)

    successful = []
    for item in crawled:
        if item.success and item.markdown:
            successful.append({'url': item.url, 'markdown': item.markdown})
    return successful
1796 |
async def crawl_recursive_internal_links(crawler: AsyncWebCrawler, start_urls: List[str], max_depth: int = 3, max_concurrent: int = 10) -> List[Dict[str, Any]]:
    """
    Recursively crawl internal links from start URLs up to a maximum depth.

    Args:
        crawler: AsyncWebCrawler instance
        start_urls: List of starting URLs
        max_depth: Maximum recursion depth
        max_concurrent: Maximum number of concurrent browser sessions

    Returns:
        List of {'url', 'markdown'} dictionaries for every successfully crawled page.
    """
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=False)
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,
        check_interval=1.0,
        max_session_permit=max_concurrent
    )

    def strip_fragment(url):
        # URLs that differ only by #fragment refer to the same page.
        return urldefrag(url)[0]

    visited = set()
    frontier = {strip_fragment(u) for u in start_urls}
    collected = []

    # Breadth-first crawl, one batch of URLs per depth level.
    for _ in range(max_depth):
        pending = [strip_fragment(u) for u in frontier if strip_fragment(u) not in visited]
        if not pending:
            break

        batch = await crawler.arun_many(urls=pending, config=run_config, dispatcher=dispatcher)
        discovered = set()

        for res in batch:
            page = strip_fragment(res.url)
            # Mark even failed pages as visited so they are not retried.
            visited.add(page)

            if res.success and res.markdown:
                collected.append({'url': res.url, 'markdown': res.markdown})
                for link in res.links.get("internal", []):
                    target = strip_fragment(link["href"])
                    if target not in visited:
                        discovered.add(target)

        frontier = discovered

    return collected
1847 |
async def main():
    """Entry point: start the MCP server over the transport configured via TRANSPORT."""
    use_sse = os.getenv("TRANSPORT", "sse") == 'sse'
    if use_sse:
        # Server-Sent Events transport (HTTP-based, the default)
        await mcp.run_sse_async()
    else:
        # stdio transport for subprocess-style embedding
        await mcp.run_stdio_async()

if __name__ == "__main__":
    asyncio.run(main())
```