This is page 6 of 6. Use http://codebase.md/threatflux/yaraflux?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .dockerignore ├── .env ├── .env.example ├── .github │ ├── dependabot.yml │ └── workflows │ ├── ci.yml │ ├── codeql.yml │ ├── publish-release.yml │ ├── safety_scan.yml │ ├── update-actions.yml │ └── version-bump.yml ├── .gitignore ├── .pylintrc ├── .safety-project.ini ├── bandit.yaml ├── codecov.yml ├── docker-compose.yml ├── docker-entrypoint.sh ├── Dockerfile ├── docs │ ├── api_mcp_architecture.md │ ├── api.md │ ├── architecture_diagram.md │ ├── cli.md │ ├── examples.md │ ├── file_management.md │ ├── installation.md │ ├── mcp.md │ ├── README.md │ └── yara_rules.md ├── entrypoint.sh ├── examples │ ├── claude_desktop_config.json │ └── install_via_smithery.sh ├── glama.json ├── images │ ├── architecture.svg │ ├── architecture.txt │ ├── image copy.png │ └── image.png ├── LICENSE ├── Makefile ├── mypy.ini ├── pyproject.toml ├── pytest.ini ├── README.md ├── requirements-dev.txt ├── requirements.txt ├── SECURITY.md ├── setup.py ├── src │ └── yaraflux_mcp_server │ ├── __init__.py │ ├── __main__.py │ ├── app.py │ ├── auth.py │ ├── claude_mcp_tools.py │ ├── claude_mcp.py │ ├── config.py │ ├── mcp_server.py │ ├── mcp_tools │ │ ├── __init__.py │ │ ├── base.py │ │ ├── file_tools.py │ │ ├── rule_tools.py │ │ ├── scan_tools.py │ │ └── storage_tools.py │ ├── models.py │ ├── routers │ │ ├── __init__.py │ │ ├── auth.py │ │ ├── files.py │ │ ├── rules.py │ │ └── scan.py │ ├── run_mcp.py │ ├── storage │ │ ├── __init__.py │ │ ├── base.py │ │ ├── factory.py │ │ ├── local.py │ │ └── minio.py │ ├── utils │ │ ├── __init__.py │ │ ├── error_handling.py │ │ ├── logging_config.py │ │ ├── param_parsing.py │ │ └── wrapper_generator.py │ └── yara_service.py ├── test.txt ├── tests │ ├── conftest.py │ ├── functional │ │ └── __init__.py │ ├── integration │ │ └── __init__.py │ └── unit │ ├── __init__.py │ ├── test_app.py │ ├── test_auth_fixtures │ │ ├── test_token_auth.py │ │ └── test_user_management.py │ ├── test_auth.py │ ├── test_claude_mcp_tools.py │ ├── test_cli │ │ ├── __init__.py │ │ ├── test_main.py │ │ └── test_run_mcp.py │ ├── test_config.py │ ├── test_mcp_server.py │ ├── test_mcp_tools │ │ ├── test_file_tools_extended.py │ │ ├── test_file_tools.py │ │ ├── test_init.py │ │ ├── test_rule_tools_extended.py │ │ ├── test_rule_tools.py │ │ ├── test_scan_tools_extended.py │ │ ├── test_scan_tools.py │ │ ├── test_storage_tools_enhanced.py │ │ └── test_storage_tools.py │ ├── test_mcp_tools.py │ ├── test_routers │ │ ├── test_auth_router.py │ │ ├── test_files.py │ │ ├── test_rules.py │ │ └── test_scan.py │ ├── test_storage │ │ ├── test_factory.py │ │ ├── test_local_storage.py │ │ └── test_minio_storage.py │ ├── test_storage_base.py │ ├── test_utils │ │ ├── __init__.py │ │ ├── test_error_handling.py │ │ ├── test_logging_config.py │ │ ├── test_param_parsing.py │ │ └── test_wrapper_generator.py │ ├── test_yara_rule_compilation.py │ └── test_yara_service.py └── uv.lock ``` # Files -------------------------------------------------------------------------------- /src/yaraflux_mcp_server/yara_service.py: -------------------------------------------------------------------------------- ```python 1 | """YARA integration service for YaraFlux MCP Server. 
2 | 3 | This module provides functionality for working with YARA rules, including: 4 | - Rule compilation and validation 5 | - Rule management (add, update, delete) 6 | - File scanning with rules 7 | - Integration with ThreatFlux YARA-Rules repository 8 | """ 9 | 10 | import hashlib 11 | import logging 12 | import os 13 | import time 14 | from concurrent.futures import ThreadPoolExecutor 15 | from datetime import UTC, datetime 16 | from typing import Any, BinaryIO, Callable, Dict, List, Optional, Union 17 | from urllib.parse import urlparse 18 | 19 | import httpx 20 | import yara 21 | 22 | from yaraflux_mcp_server.config import settings 23 | from yaraflux_mcp_server.models import YaraMatch, YaraRuleMetadata, YaraScanResult 24 | from yaraflux_mcp_server.storage import StorageClient, StorageError, get_storage_client 25 | 26 | # Configure logging 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | class YaraError(Exception): 31 | """Custom exception for YARA-related errors.""" 32 | 33 | 34 | class YaraService: 35 | """Service for YARA rule compilation, management, and scanning.""" 36 | 37 | def __init__(self, storage_client: Optional[StorageClient] = None): 38 | """Initialize the YARA service. 39 | 40 | Args: 41 | storage_client: Optional storage client to use 42 | """ 43 | self.storage = storage_client or get_storage_client() 44 | self._rules_cache: Dict[str, yara.Rules] = {} 45 | self._rule_include_callbacks: Dict[str, Callable[[str, str], bytes]] = {} 46 | 47 | # Initialize executor for scanning 48 | self._executor = ThreadPoolExecutor(max_workers=4) 49 | 50 | logger.info("YARA service initialized") 51 | 52 | def load_rules(self, include_default_rules: bool = True) -> None: 53 | """Load all YARA rules from storage. 54 | 55 | Args: 56 | include_default_rules: Whether to include default ThreatFlux rules 57 | """ 58 | # Clear existing cache 59 | self._rules_cache.clear() 60 | 61 | # List all available rules 62 | rules_metadata = self.storage.list_rules() 63 | 64 | # Group rules by source 65 | rules_by_source: Dict[str, List[Dict[str, Any]]] = {} 66 | for rule in rules_metadata: 67 | source = rule.get("source", "custom") 68 | if source not in rules_by_source: 69 | rules_by_source[source] = [] 70 | rules_by_source[source].append(rule) 71 | 72 | # First, load all rules individually (this populates include callbacks) 73 | for rule in rules_metadata: 74 | try: 75 | source = rule.get("source", "custom") 76 | rule_name = rule.get("name") 77 | 78 | # Skip loading community rules individually if they'll be loaded as a whole 79 | if include_default_rules and source == "community": 80 | continue 81 | 82 | self._compile_rule(rule_name, source) 83 | logger.debug(f"Loaded rule: {rule_name} from {source}") 84 | except Exception as e: 85 | logger.warning(f"Failed to load rule {rule.get('name')}: {str(e)}") 86 | 87 | # Then, try to load community rules as a single ruleset if requested 88 | if include_default_rules and "community" in rules_by_source: 89 | try: 90 | self._compile_community_rules() 91 | logger.info("Loaded community rules as combined ruleset") 92 | except Exception as e: 93 | logger.warning(f"Failed to load community rules as combined ruleset: {str(e)}") 94 | 95 | logger.info(f"Loaded {len(self._rules_cache)} rule sets") 96 | 97 | def _compile_rule(self, rule_name: str, source: str = "custom") -> yara.Rules: 98 | """Compile a single YARA rule from storage. 
99 | 100 | Args: 101 | rule_name: Name of the rule 102 | source: Source of the rule 103 | 104 | Returns: 105 | Compiled YARA rules object 106 | 107 | Raises: 108 | YaraError: If rule compilation fails 109 | """ 110 | # Check for an existing compiled rule 111 | cache_key = f"{source}:{rule_name}" 112 | if cache_key in self._rules_cache: 113 | return self._rules_cache[cache_key] 114 | 115 | try: 116 | # Get the rule content from storage 117 | rule_content = self.storage.get_rule(rule_name, source) 118 | 119 | # Register an include callback for this rule 120 | self._register_include_callback(source, rule_name) 121 | 122 | # Compile the rule 123 | compiled_rule = yara.compile( 124 | source=rule_content, 125 | includes=True, 126 | include_callback=self._get_include_callback(source), 127 | error_on_warning=True, 128 | ) 129 | 130 | # Cache the compiled rule 131 | self._rules_cache[cache_key] = compiled_rule 132 | 133 | return compiled_rule 134 | except yara.Error as e: 135 | logger.error(f"YARA compilation error for rule {rule_name}: {str(e)}") 136 | raise YaraError(f"Failed to compile rule {rule_name}: {str(e)}") from e 137 | except StorageError as e: 138 | logger.error(f"Storage error getting rule {rule_name}: {str(e)}") 139 | raise YaraError(f"Failed to load rule {rule_name}: {str(e)}") from e 140 | 141 | def _compile_community_rules(self) -> yara.Rules: 142 | """Compile all community YARA rules as a single ruleset. 143 | 144 | Returns: 145 | Compiled YARA rules object 146 | 147 | Raises: 148 | YaraError: If rule compilation fails 149 | """ 150 | cache_key = "community:all" 151 | if cache_key in self._rules_cache: 152 | return self._rules_cache[cache_key] 153 | 154 | try: 155 | # Get all community rules 156 | rules_metadata = self.storage.list_rules("community") 157 | 158 | # Create a combined source with imports for all rules 159 | combined_source = "" 160 | for rule in rules_metadata: 161 | rule_name = rule.get("name") 162 | if not rule_name.endswith(".yar"): 163 | continue 164 | combined_source += f'include "{rule_name}"\n' 165 | 166 | # Skip if no rules found 167 | if not combined_source: 168 | raise YaraError("No community rules found") 169 | 170 | # Register include callbacks for all community rules 171 | for rule in rules_metadata: 172 | self._register_include_callback("community", rule.get("name")) 173 | 174 | # Compile the combined ruleset 175 | compiled_rule = yara.compile( 176 | source=combined_source, 177 | includes=True, 178 | include_callback=self._get_include_callback("community"), 179 | error_on_warning=True, 180 | ) 181 | 182 | # Cache the compiled rule 183 | self._rules_cache[cache_key] = compiled_rule 184 | 185 | return compiled_rule 186 | except yara.Error as e: 187 | logger.error(f"YARA compilation error for community rules: {str(e)}") 188 | raise YaraError(f"Failed to compile community rules: {str(e)}") from e 189 | except StorageError as e: 190 | logger.error(f"Storage error getting community rules: {str(e)}") 191 | raise YaraError(f"Failed to load community rules: {str(e)}") from e 192 | 193 | def _register_include_callback(self, source: str, rule_name: str) -> None: 194 | """Register an include callback for a rule. 195 | 196 | Args: 197 | source: Source of the rule 198 | rule_name: Name of the rule 199 | """ 200 | callback_key = f"{source}:{rule_name}" 201 | 202 | # Define the include callback for this rule 203 | def include_callback(requested_filename: str, namespace: str) -> bytes: 204 | """Include callback for YARA rules. 
205 | 206 | Args: 207 | requested_filename: Filename requested by the include directive 208 | namespace: Namespace for the included content 209 | 210 | Returns: 211 | Content of the included file 212 | 213 | Raises: 214 | yara.Error: If include file cannot be found 215 | """ 216 | logger.debug(f"Include requested: {requested_filename} in namespace {namespace}") 217 | 218 | try: 219 | # Try to load from the same source 220 | include_content = self.storage.get_rule(requested_filename, source) 221 | return include_content.encode("utf-8") 222 | except StorageError: 223 | # If not found in the same source, try custom rules 224 | try: 225 | if source != "custom": 226 | include_content = self.storage.get_rule(requested_filename, "custom") 227 | return include_content.encode("utf-8") 228 | except StorageError: 229 | # If not found in custom rules either, try community rules 230 | try: 231 | if source != "community": 232 | include_content = self.storage.get_rule(requested_filename, "community") 233 | return include_content.encode("utf-8") 234 | except StorageError as e: 235 | # If not found anywhere, raise an error 236 | logger.warning(f"Include file not found: {requested_filename}") 237 | raise yara.Error(f"Include file not found: {requested_filename}") from e 238 | 239 | # If all attempts fail, raise an error 240 | raise yara.Error(f"Include file not found: {requested_filename}") 241 | 242 | # Register the callback 243 | self._rule_include_callbacks[callback_key] = include_callback 244 | 245 | def _get_include_callback(self, source: str) -> Callable[[str, str], bytes]: 246 | """Get the include callback for a source. 247 | 248 | Args: 249 | source: Source of the rules 250 | 251 | Returns: 252 | Include callback function 253 | """ 254 | 255 | def combined_callback(requested_filename: str, namespace: str) -> bytes: 256 | """Combined include callback that tries all registered callbacks. 257 | 258 | Args: 259 | requested_filename: Filename requested by the include directive 260 | namespace: Namespace for the included content 261 | 262 | Returns: 263 | Content of the included file 264 | 265 | Raises: 266 | yara.Error: If include file cannot be found 267 | """ 268 | # Try all callbacks associated with this source 269 | for key, callback in self._rule_include_callbacks.items(): 270 | if key.startswith(f"{source}:"): 271 | try: 272 | return callback(requested_filename, namespace) 273 | except yara.Error: 274 | # Try the next callback 275 | continue 276 | 277 | # If no callback succeeds, raise an error 278 | logger.warning(f"Include file not found by any callback: {requested_filename}") 279 | raise yara.Error(f"Include file not found: {requested_filename}") 280 | 281 | return combined_callback 282 | 283 | def add_rule(self, rule_name: str, content: str, source: str = "custom") -> YaraRuleMetadata: 284 | """Add a new YARA rule. 
285 | 286 | Args: 287 | rule_name: Name of the rule 288 | content: YARA rule content 289 | source: Source of the rule 290 | 291 | Returns: 292 | Metadata for the added rule 293 | 294 | Raises: 295 | YaraError: If rule validation or compilation fails 296 | """ 297 | # Ensure rule_name has .yar extension 298 | if not rule_name.endswith(".yar"): 299 | rule_name = f"{rule_name}.yar" 300 | 301 | # Validate the rule by compiling it 302 | try: 303 | # Try to compile without includes first for basic validation 304 | yara.compile(source=content, error_on_warning=True) 305 | 306 | # Then compile with includes to validate imports 307 | yara.compile( 308 | source=content, 309 | includes=True, 310 | include_callback=self._get_include_callback(source), 311 | error_on_warning=True, 312 | ) 313 | except yara.Error as e: 314 | logger.error(f"YARA validation error for rule {rule_name}: {str(e)}") 315 | raise YaraError(f"Invalid YARA rule: {str(e)}") from e 316 | 317 | # Save the rule 318 | try: 319 | self.storage.save_rule(rule_name, content, source) 320 | logger.info(f"Added rule {rule_name} from {source}") 321 | 322 | # Compile and cache the rule 323 | compiled_rule = self._compile_rule(rule_name, source) 324 | if compiled_rule: 325 | cache_key = f"{source}:{rule_name}" 326 | self._rules_cache[cache_key] = compiled_rule 327 | # Return metadata 328 | return YaraRuleMetadata(name=rule_name, source=source, created=datetime.now(UTC), is_compiled=True) 329 | except StorageError as e: 330 | logger.error(f"Storage error saving rule {rule_name}: {str(e)}") 331 | raise YaraError(f"Failed to save rule: {str(e)}") from e 332 | 333 | def update_rule(self, rule_name: str, content: str, source: str = "custom") -> YaraRuleMetadata: 334 | """Update an existing YARA rule. 335 | 336 | Args: 337 | rule_name: Name of the rule 338 | content: Updated YARA rule content 339 | source: Source of the rule 340 | 341 | Returns: 342 | Metadata for the updated rule 343 | 344 | Raises: 345 | YaraError: If rule validation, compilation, or update fails 346 | """ 347 | # Ensure rule exists 348 | try: 349 | self.storage.get_rule(rule_name, source) 350 | except StorageError as e: 351 | logger.error(f"Rule not found: {rule_name} from {source}") 352 | raise YaraError(f"Rule not found: {rule_name}") from e 353 | 354 | # Add the rule (this will validate and save it) 355 | metadata = self.add_rule(rule_name, content, source) 356 | 357 | # Set modified timestamp 358 | metadata.modified = datetime.now(UTC) 359 | 360 | # Clear cache for this rule 361 | cache_key = f"{source}:{rule_name}" 362 | if cache_key in self._rules_cache: 363 | del self._rules_cache[cache_key] 364 | 365 | # Also clear combined community rules cache if this was a community rule 366 | if source == "community" and "community:all" in self._rules_cache: 367 | del self._rules_cache["community:all"] 368 | 369 | return metadata 370 | 371 | def delete_rule(self, rule_name: str, source: str = "custom") -> bool: 372 | """Delete a YARA rule. 
373 | 374 | Args: 375 | rule_name: Name of the rule 376 | source: Source of the rule 377 | 378 | Returns: 379 | True if rule was deleted, False if not found 380 | 381 | Raises: 382 | YaraError: If rule deletion fails 383 | """ 384 | try: 385 | result = self.storage.delete_rule(rule_name, source) 386 | 387 | if result: 388 | # Clear cache for this rule 389 | cache_key = f"{source}:{rule_name}" 390 | if cache_key in self._rules_cache: 391 | del self._rules_cache[cache_key] 392 | 393 | # Also clear combined community rules cache if this was a community rule 394 | if source == "community" and "community:all" in self._rules_cache: 395 | del self._rules_cache["community:all"] 396 | 397 | logger.info(f"Deleted rule {rule_name} from {source}") 398 | 399 | return result 400 | except StorageError as e: 401 | logger.error(f"Storage error deleting rule {rule_name}: {str(e)}") 402 | raise YaraError(f"Failed to delete rule: {str(e)}") from e 403 | 404 | def get_rule(self, rule_name: str, source: str = "custom") -> str: 405 | """Get a YARA rule's content. 406 | 407 | Args: 408 | rule_name: Name of the rule 409 | source: Source of the rule 410 | 411 | Returns: 412 | Rule content 413 | 414 | Raises: 415 | YaraError: If rule not found 416 | """ 417 | try: 418 | return self.storage.get_rule(rule_name, source) 419 | except StorageError as e: 420 | logger.error(f"Storage error getting rule {rule_name}: {str(e)}") 421 | raise YaraError(f"Failed to get rule: {str(e)}") from e 422 | 423 | def list_rules(self, source: Optional[str] = None) -> List[YaraRuleMetadata]: 424 | """List all YARA rules. 425 | 426 | Args: 427 | source: Optional filter by source 428 | 429 | Returns: 430 | List of rule metadata 431 | """ 432 | try: 433 | rules_data = self.storage.list_rules(source) 434 | 435 | # Convert to YaraRuleMetadata objects 436 | rules_metadata = [] 437 | for rule in rules_data: 438 | try: 439 | # Check if rule is compiled 440 | is_compiled = False 441 | rule_source = rule.get("source", "custom") 442 | rule_name = rule.get("name") 443 | cache_key = f"{rule_source}:{rule_name}" 444 | 445 | # Rule is compiled if it's in the cache 446 | is_compiled = cache_key in self._rules_cache 447 | 448 | # Rule is also compiled if it's a community rule and community:all is compiled 449 | if rule_source == "community" and "community:all" in self._rules_cache: 450 | is_compiled = True 451 | 452 | # Create metadata object 453 | created = rule.get("created") 454 | if isinstance(created, str): 455 | created = datetime.fromisoformat(created) 456 | elif not isinstance(created, datetime): 457 | created = datetime.now(UTC) 458 | 459 | modified = rule.get("modified") 460 | if isinstance(modified, str): 461 | modified = datetime.fromisoformat(modified) 462 | 463 | metadata = YaraRuleMetadata( 464 | name=rule.get("name"), 465 | source=rule.get("source", "custom"), 466 | created=created, 467 | modified=modified, 468 | is_compiled=is_compiled, 469 | ) 470 | 471 | rules_metadata.append(metadata) 472 | except Exception as e: 473 | logger.warning(f"Error processing rule metadata: {str(e)}") 474 | 475 | return rules_metadata 476 | except StorageError as e: 477 | logger.error(f"Storage error listing rules: {str(e)}") 478 | raise YaraError(f"Failed to list rules: {str(e)}") from e 479 | 480 | def match_file( 481 | self, 482 | file_path: str, 483 | *, 484 | rule_names: Optional[List[str]] = None, 485 | sources: Optional[List[str]] = None, 486 | timeout: Optional[int] = None, 487 | ) -> YaraScanResult: 488 | """Match YARA rules against a file. 
489 | 490 | Args: 491 | file_path: Path to the file to scan 492 | rule_names: Optional list of rule names to match (if None, match all) 493 | sources: Optional list of sources to match rules from (if None, match all) 494 | timeout: Optional timeout in seconds (if None, use default) 495 | 496 | Returns: 497 | Scan result 498 | 499 | Raises: 500 | YaraError: If scanning fails 501 | """ 502 | # Resolve timeout 503 | if timeout is None: 504 | timeout = settings.YARA_SCAN_TIMEOUT 505 | 506 | # Get file information 507 | try: 508 | file_size = os.path.getsize(file_path) 509 | if file_size > settings.YARA_MAX_FILE_SIZE: 510 | logger.warning(f"File too large: {file_path} ({file_size} bytes)") 511 | raise YaraError(f"File too large: {file_size} bytes (max {settings.YARA_MAX_FILE_SIZE} bytes)") 512 | 513 | # Calculate file hash 514 | with open(file_path, "rb") as f: 515 | file_hash = hashlib.sha256(f.read()).hexdigest() 516 | 517 | # Get filename from path 518 | file_name = os.path.basename(file_path) 519 | 520 | # Prepare the scan 521 | scan_start = time.time() 522 | timeout_reached = False 523 | error = None 524 | 525 | # Collect rules to match 526 | rules_to_match = self._collect_rules(rule_names, sources) 527 | 528 | # Match rules against the file 529 | matches: List[yara.Match] = [] 530 | for rule in rules_to_match: 531 | try: 532 | # Match with timeout 533 | rule_matches = rule.match(file_path, timeout=timeout) 534 | matches.extend(rule_matches) 535 | except yara.TimeoutError: 536 | logger.warning(f"YARA scan timeout for file {file_path}") 537 | timeout_reached = True 538 | break 539 | except yara.Error as e: 540 | logger.error(f"YARA scan error for file {file_path}: {str(e)}") 541 | error = str(e) 542 | break 543 | 544 | # Calculate scan time 545 | scan_time = time.time() - scan_start 546 | 547 | # Process matches 548 | yara_matches = self._process_matches(matches) 549 | 550 | # Create scan result 551 | result = YaraScanResult( 552 | file_name=file_name, 553 | file_size=file_size, 554 | file_hash=file_hash, 555 | matches=yara_matches, 556 | scan_time=scan_time, 557 | timeout_reached=timeout_reached, 558 | error=error, 559 | ) 560 | 561 | # Save the result 562 | result_id = result.scan_id 563 | self.storage.save_result(str(result_id), result.model_dump()) 564 | 565 | return result 566 | except (IOError, OSError) as e: 567 | logger.error(f"File error scanning {file_path}: {str(e)}") 568 | raise YaraError(f"Failed to scan file: {str(e)}") from e 569 | 570 | def match_data( 571 | self, 572 | data: Union[bytes, BinaryIO], 573 | file_name: str, 574 | *, 575 | rule_names: Optional[List[str]] = None, 576 | sources: Optional[List[str]] = None, 577 | timeout: Optional[int] = None, 578 | ) -> YaraScanResult: 579 | """Match YARA rules against in-memory data. 
580 | 581 | Args: 582 | data: Bytes or file-like object to scan 583 | file_name: Name of the file for reference 584 | rule_names: Optional list of rule names to match (if None, match all) 585 | sources: Optional list of sources to match rules from (if None, match all) 586 | timeout: Optional timeout in seconds (if None, use default) 587 | 588 | Returns: 589 | Scan result 590 | 591 | Raises: 592 | YaraError: If scanning fails 593 | """ 594 | # Resolve timeout 595 | if timeout is None: 596 | timeout = settings.YARA_SCAN_TIMEOUT 597 | 598 | # Ensure data is bytes 599 | if hasattr(data, "read"): 600 | # It's a file-like object, read it into memory 601 | data_bytes = data.read() 602 | if hasattr(data, "seek"): 603 | data.seek(0) # Reset for potential future reads 604 | else: 605 | data_bytes = data 606 | 607 | # Check file size 608 | file_size = len(data_bytes) 609 | if file_size > settings.YARA_MAX_FILE_SIZE: 610 | logger.warning(f"Data too large: {file_name} ({file_size} bytes)") 611 | raise YaraError(f"Data too large: {file_size} bytes (max {settings.YARA_MAX_FILE_SIZE} bytes)") 612 | 613 | # Calculate data hash 614 | file_hash = hashlib.sha256(data_bytes).hexdigest() 615 | 616 | try: 617 | # Prepare the scan 618 | scan_start = time.time() 619 | timeout_reached = False 620 | error = None 621 | 622 | # Collect rules to match 623 | rules_to_match = self._collect_rules(rule_names, sources) 624 | 625 | # Match rules against the data 626 | matches: List[yara.Match] = [] 627 | for rule in rules_to_match: 628 | try: 629 | # Match with timeout 630 | rule_matches = rule.match(data=data_bytes, timeout=timeout) 631 | matches.extend(rule_matches) 632 | except yara.TimeoutError: 633 | logger.warning(f"YARA scan timeout for data {file_name}") 634 | timeout_reached = True 635 | break 636 | except yara.Error as e: 637 | logger.error(f"YARA scan error for data {file_name}: {str(e)}") 638 | error = str(e) 639 | break 640 | 641 | # Calculate scan time 642 | scan_time = time.time() - scan_start 643 | 644 | # Process matches 645 | yara_matches = self._process_matches(matches) 646 | 647 | # Create scan result 648 | result = YaraScanResult( 649 | file_name=file_name, 650 | file_size=file_size, 651 | file_hash=file_hash, 652 | matches=yara_matches, 653 | scan_time=scan_time, 654 | timeout_reached=timeout_reached, 655 | error=error, 656 | ) 657 | 658 | # Save the result 659 | result_id = result.scan_id 660 | self.storage.save_result(str(result_id), result.model_dump()) 661 | 662 | return result 663 | except Exception as e: 664 | logger.error(f"Error scanning data {file_name}: {str(e)}") 665 | raise YaraError(f"Failed to scan data: {str(e)}") from e 666 | 667 | def fetch_and_scan( 668 | self, 669 | url: str, 670 | *, 671 | rule_names: Optional[List[str]] = None, 672 | sources: Optional[List[str]] = None, 673 | timeout: Optional[int] = None, 674 | download_timeout: int = 30, 675 | ) -> YaraScanResult: 676 | """Fetch a file from a URL and scan it with YARA rules. 
677 | 678 | Args: 679 | url: URL to fetch 680 | rule_names: Optional list of rule names to match (if None, match all) 681 | sources: Optional list of sources to match rules from (if None, match all) 682 | timeout: Optional timeout in seconds for YARA scan (if None, use default) 683 | download_timeout: Timeout in seconds for downloading the file 684 | 685 | Returns: 686 | Scan result 687 | 688 | Raises: 689 | YaraError: If fetching or scanning fails 690 | """ 691 | # Parse URL to get filename 692 | parsed_url = urlparse(url) 693 | file_name = os.path.basename(parsed_url.path) 694 | if not file_name: 695 | file_name = "downloaded_file" 696 | 697 | # Create a temporary file 698 | temp_file = None 699 | try: 700 | # Download the file 701 | logger.info(f"Fetching file from URL: {url}") 702 | with httpx.Client(timeout=download_timeout) as client: 703 | response = client.get(url, follow_redirects=True) 704 | response.raise_for_status() # Raise exception for error status codes 705 | 706 | # Get content 707 | content = response.content 708 | 709 | # Check file size 710 | file_size = len(content) 711 | if file_size > settings.YARA_MAX_FILE_SIZE: 712 | logger.warning(f"Downloaded file too large: {file_name} ({file_size} bytes)") 713 | raise YaraError( 714 | f"Downloaded file too large: {file_size} bytes (max {settings.YARA_MAX_FILE_SIZE} bytes)" 715 | ) from None 716 | 717 | # Try to get a better filename from Content-Disposition header if available 718 | content_disposition = response.headers.get("Content-Disposition") 719 | if content_disposition and "filename=" in content_disposition: 720 | import re # pylint: disable=import-outside-toplevel 721 | 722 | filename_match = re.search(r'filename="?([^";]+)"?', content_disposition) 723 | if filename_match: 724 | file_name = filename_match.group(1) 725 | 726 | # Save to storage 727 | file_path, file_hash = self.storage.save_sample(filename=file_name, content=content) 728 | logger.info("Downloaded file saved to storage with hash: %s", file_hash) 729 | # Scan the file 730 | if os.path.exists(file_path): 731 | # If file_path is a real file on disk, use match_file 732 | return self.match_file(file_path, rule_names=rule_names, sources=sources, timeout=timeout) 733 | # Otherwise, use match_data 734 | return self.match_data( 735 | data=content, file_name=file_name, rule_names=rule_names, sources=sources, timeout=timeout 736 | ) 737 | except httpx.RequestError as e: 738 | logger.error(f"HTTP request error fetching {url}: {str(e)}") 739 | raise YaraError(f"Failed to fetch file: {str(e)}") from e 740 | except httpx.HTTPStatusError as e: 741 | logger.error(f"HTTP error fetching {url}: {e.response.status_code}") 742 | raise YaraError(f"Failed to fetch file: HTTP {e.response.status_code}") from e 743 | finally: 744 | # Clean up temporary file if created 745 | if temp_file: 746 | try: 747 | temp_file.close() 748 | os.unlink(temp_file.name) 749 | except (IOError, OSError): 750 | pass 751 | 752 | def _collect_rules( 753 | self, rule_names: Optional[List[str]] = None, sources: Optional[List[str]] = None 754 | ) -> List[yara.Rules]: 755 | """Collect YARA rules to match. 
756 | 757 | Args: 758 | rule_names: Optional list of rule names to match (if None, match all) 759 | sources: Optional list of sources to match rules from (if None, match all) 760 | 761 | Returns: 762 | List of YARA rules objects 763 | 764 | Raises: 765 | YaraError: If no rules are found 766 | """ 767 | rules_to_match: List[yara.Rules] = [] 768 | 769 | # If specific rules are requested 770 | if rule_names: 771 | for rule_name in rule_names: 772 | # Try to find the rule in all sources if sources not specified 773 | if not sources: 774 | available_sources = ["custom", "community"] 775 | else: 776 | available_sources = sources 777 | 778 | found = False 779 | for source in available_sources: 780 | try: 781 | rule = self._compile_rule(rule_name, source) 782 | rules_to_match.append(rule) 783 | found = True 784 | break 785 | except YaraError: 786 | continue 787 | 788 | if not found: 789 | logger.warning(f"Rule not found: {rule_name}") 790 | 791 | if not rules_to_match: 792 | raise YaraError("No requested rules found") 793 | else: 794 | # No specific rules requested, use all available rules 795 | 796 | # Check if we have a community:all ruleset 797 | if not sources or "community" in sources: 798 | try: 799 | community_rules = self._compile_community_rules() 800 | rules_to_match.append(community_rules) 801 | except YaraError: 802 | # Community rules not available as combined set, try individual rules 803 | if not sources: 804 | sources = ["custom", "community"] 805 | 806 | # For each source, get all rules 807 | for source in sources: 808 | try: 809 | rules = self.list_rules(source) 810 | for rule in rules: 811 | try: 812 | compiled_rule = self._compile_rule(rule.name, rule.source) 813 | rules_to_match.append(compiled_rule) 814 | except YaraError: 815 | continue 816 | except YaraError: 817 | continue 818 | else: 819 | # Use only specified sources 820 | for source in sources: 821 | try: 822 | rules = self.list_rules(source) 823 | for rule in rules: 824 | try: 825 | compiled_rule = self._compile_rule(rule.name, rule.source) 826 | rules_to_match.append(compiled_rule) 827 | except YaraError: 828 | continue 829 | except YaraError: 830 | continue 831 | 832 | # Ensure we have at least one rule 833 | if not rules_to_match: 834 | raise YaraError("No YARA rules available") 835 | 836 | return rules_to_match 837 | 838 | def _process_matches(self, matches: List[yara.Match]) -> List[YaraMatch]: 839 | """Process YARA matches into YaraMatch objects. 840 | 841 | Args: 842 | matches: List of YARA match objects 843 | 844 | Returns: 845 | List of YaraMatch objects 846 | """ 847 | result: List[YaraMatch] = [] 848 | 849 | for match in matches: 850 | try: 851 | # Extract rule name 852 | rule_name = match.rule 853 | 854 | # Extract namespace 855 | namespace = match.namespace 856 | 857 | # Extract tags 858 | tags = match.tags 859 | 860 | # Extract metadata 861 | meta = match.meta 862 | 863 | # Create empty strings list - we're skipping string processing due to compatibility issues 864 | strings = [] 865 | 866 | # Create YaraMatch object 867 | yara_match = YaraMatch(rule=rule_name, namespace=namespace, tags=tags, meta=meta, strings=strings) 868 | 869 | result.append(yara_match) 870 | except Exception as e: 871 | logger.error(f"Error processing YARA match: {str(e)}") 872 | continue 873 | 874 | return result 875 | 876 | 877 | # Create a singleton instance 878 | yara_service = YaraService() 879 | ```