#
This is page 29 of 45 of a paginated Markdown dump of the repository `dicklesworthstone/llm_gateway_mcp_server` (30,853 of 50,000 tokens; file 1 of 207 shown on this page). Use http://codebase.md/dicklesworthstone/llm_gateway_mcp_server?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .cursorignore
├── .env.example
├── .envrc
├── .gitignore
├── additional_features.md
├── check_api_keys.py
├── completion_support.py
├── comprehensive_test.py
├── docker-compose.yml
├── Dockerfile
├── empirically_measured_model_speeds.json
├── error_handling.py
├── example_structured_tool.py
├── examples
│   ├── __init__.py
│   ├── advanced_agent_flows_using_unified_memory_system_demo.py
│   ├── advanced_extraction_demo.py
│   ├── advanced_unified_memory_system_demo.py
│   ├── advanced_vector_search_demo.py
│   ├── analytics_reporting_demo.py
│   ├── audio_transcription_demo.py
│   ├── basic_completion_demo.py
│   ├── cache_demo.py
│   ├── claude_integration_demo.py
│   ├── compare_synthesize_demo.py
│   ├── cost_optimization.py
│   ├── data
│   │   ├── sample_event.txt
│   │   ├── Steve_Jobs_Introducing_The_iPhone_compressed.md
│   │   └── Steve_Jobs_Introducing_The_iPhone_compressed.mp3
│   ├── docstring_refiner_demo.py
│   ├── document_conversion_and_processing_demo.py
│   ├── entity_relation_graph_demo.py
│   ├── filesystem_operations_demo.py
│   ├── grok_integration_demo.py
│   ├── local_text_tools_demo.py
│   ├── marqo_fused_search_demo.py
│   ├── measure_model_speeds.py
│   ├── meta_api_demo.py
│   ├── multi_provider_demo.py
│   ├── ollama_integration_demo.py
│   ├── prompt_templates_demo.py
│   ├── python_sandbox_demo.py
│   ├── rag_example.py
│   ├── research_workflow_demo.py
│   ├── sample
│   │   ├── article.txt
│   │   ├── backprop_paper.pdf
│   │   ├── buffett.pdf
│   │   ├── contract_link.txt
│   │   ├── legal_contract.txt
│   │   ├── medical_case.txt
│   │   ├── northwind.db
│   │   ├── research_paper.txt
│   │   ├── sample_data.json
│   │   └── text_classification_samples
│   │       ├── email_classification.txt
│   │       ├── news_samples.txt
│   │       ├── product_reviews.txt
│   │       └── support_tickets.txt
│   ├── sample_docs
│   │   └── downloaded
│   │       └── attention_is_all_you_need.pdf
│   ├── sentiment_analysis_demo.py
│   ├── simple_completion_demo.py
│   ├── single_shot_synthesis_demo.py
│   ├── smart_browser_demo.py
│   ├── sql_database_demo.py
│   ├── sse_client_demo.py
│   ├── test_code_extraction.py
│   ├── test_content_detection.py
│   ├── test_ollama.py
│   ├── text_classification_demo.py
│   ├── text_redline_demo.py
│   ├── tool_composition_examples.py
│   ├── tournament_code_demo.py
│   ├── tournament_text_demo.py
│   ├── unified_memory_system_demo.py
│   ├── vector_search_demo.py
│   ├── web_automation_instruction_packs.py
│   └── workflow_delegation_demo.py
├── LICENSE
├── list_models.py
├── marqo_index_config.json.example
├── mcp_protocol_schema_2025-03-25_version.json
├── mcp_python_lib_docs.md
├── mcp_tool_context_estimator.py
├── model_preferences.py
├── pyproject.toml
├── quick_test.py
├── README.md
├── resource_annotations.py
├── run_all_demo_scripts_and_check_for_errors.py
├── storage
│   └── smart_browser_internal
│       ├── locator_cache.db
│       ├── readability.js
│       └── storage_state.enc
├── test_client.py
├── test_connection.py
├── TEST_README.md
├── test_sse_client.py
├── test_stdio_client.py
├── tests
│   ├── __init__.py
│   ├── conftest.py
│   ├── integration
│   │   ├── __init__.py
│   │   └── test_server.py
│   ├── manual
│   │   ├── test_extraction_advanced.py
│   │   └── test_extraction.py
│   └── unit
│       ├── __init__.py
│       ├── test_cache.py
│       ├── test_providers.py
│       └── test_tools.py
├── TODO.md
├── tool_annotations.py
├── tools_list.json
├── ultimate_mcp_banner.webp
├── ultimate_mcp_logo.webp
├── ultimate_mcp_server
│   ├── __init__.py
│   ├── __main__.py
│   ├── cli
│   │   ├── __init__.py
│   │   ├── __main__.py
│   │   ├── commands.py
│   │   ├── helpers.py
│   │   └── typer_cli.py
│   ├── clients
│   │   ├── __init__.py
│   │   ├── completion_client.py
│   │   └── rag_client.py
│   ├── config
│   │   └── examples
│   │       └── filesystem_config.yaml
│   ├── config.py
│   ├── constants.py
│   ├── core
│   │   ├── __init__.py
│   │   ├── evaluation
│   │   │   ├── base.py
│   │   │   └── evaluators.py
│   │   ├── providers
│   │   │   ├── __init__.py
│   │   │   ├── anthropic.py
│   │   │   ├── base.py
│   │   │   ├── deepseek.py
│   │   │   ├── gemini.py
│   │   │   ├── grok.py
│   │   │   ├── ollama.py
│   │   │   ├── openai.py
│   │   │   └── openrouter.py
│   │   ├── server.py
│   │   ├── state_store.py
│   │   ├── tournaments
│   │   │   ├── manager.py
│   │   │   ├── tasks.py
│   │   │   └── utils.py
│   │   └── ums_api
│   │       ├── __init__.py
│   │       ├── ums_database.py
│   │       ├── ums_endpoints.py
│   │       ├── ums_models.py
│   │       └── ums_services.py
│   ├── exceptions.py
│   ├── graceful_shutdown.py
│   ├── services
│   │   ├── __init__.py
│   │   ├── analytics
│   │   │   ├── __init__.py
│   │   │   ├── metrics.py
│   │   │   └── reporting.py
│   │   ├── cache
│   │   │   ├── __init__.py
│   │   │   ├── cache_service.py
│   │   │   ├── persistence.py
│   │   │   ├── strategies.py
│   │   │   └── utils.py
│   │   ├── cache.py
│   │   ├── document.py
│   │   ├── knowledge_base
│   │   │   ├── __init__.py
│   │   │   ├── feedback.py
│   │   │   ├── manager.py
│   │   │   ├── rag_engine.py
│   │   │   ├── retriever.py
│   │   │   └── utils.py
│   │   ├── prompts
│   │   │   ├── __init__.py
│   │   │   ├── repository.py
│   │   │   └── templates.py
│   │   ├── prompts.py
│   │   └── vector
│   │       ├── __init__.py
│   │       ├── embeddings.py
│   │       └── vector_service.py
│   ├── tool_token_counter.py
│   ├── tools
│   │   ├── __init__.py
│   │   ├── audio_transcription.py
│   │   ├── base.py
│   │   ├── completion.py
│   │   ├── docstring_refiner.py
│   │   ├── document_conversion_and_processing.py
│   │   ├── enhanced-ums-lookbook.html
│   │   ├── entity_relation_graph.py
│   │   ├── excel_spreadsheet_automation.py
│   │   ├── extraction.py
│   │   ├── filesystem.py
│   │   ├── html_to_markdown.py
│   │   ├── local_text_tools.py
│   │   ├── marqo_fused_search.py
│   │   ├── meta_api_tool.py
│   │   ├── ocr_tools.py
│   │   ├── optimization.py
│   │   ├── provider.py
│   │   ├── pyodide_boot_template.html
│   │   ├── python_sandbox.py
│   │   ├── rag.py
│   │   ├── redline-compiled.css
│   │   ├── sentiment_analysis.py
│   │   ├── single_shot_synthesis.py
│   │   ├── smart_browser.py
│   │   ├── sql_databases.py
│   │   ├── text_classification.py
│   │   ├── text_redline_tools.py
│   │   ├── tournament.py
│   │   ├── ums_explorer.html
│   │   └── unified_memory_system.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── async_utils.py
│   │   ├── display.py
│   │   ├── logging
│   │   │   ├── __init__.py
│   │   │   ├── console.py
│   │   │   ├── emojis.py
│   │   │   ├── formatter.py
│   │   │   ├── logger.py
│   │   │   ├── panels.py
│   │   │   ├── progress.py
│   │   │   └── themes.py
│   │   ├── parse_yaml.py
│   │   ├── parsing.py
│   │   ├── security.py
│   │   └── text.py
│   └── working_memory_api.py
├── unified_memory_system_technical_analysis.md
└── uv.lock
```

# Files

--------------------------------------------------------------------------------
/ultimate_mcp_server/tools/local_text_tools.py:
--------------------------------------------------------------------------------

```python
   1 | # ultimate_mcp_server/tools/local_text_tools.py
   2 | """
   3 | Standalone, secure wrappers around local CLI text-processing utilities (rg, awk, sed, jq)
   4 | for the Ultimate MCP Server framework.
   5 | 
   6 | This module provides controlled execution of common command-line text tools within
   7 | a defined workspace, incorporating enhanced security checks, resource limits, performance
   8 | optimizations like caching and concurrency control, and robust error handling.
   9 | 
  10 | Key Features:
  11 | *   Standalone Functions: Tools exposed as individual async functions.
  12 | *   Workspace Confinement: All file/directory operations strictly enforced within WORKSPACE_DIR.
  13 | *   Security Hardening: Validates arguments against shell metacharacters, subshells,
  14 |     redirection, path traversal, and specific unsafe flags (e.g., `sed -i`). Uses `prctl`
  15 |     and `setsid` on Linux for further sandboxing.
  16 | *   Resource Limiting: Applies CPU time and memory limits (Unix only).
  17 | *   Input Flexibility: Handles input via stdin (`input_data`) or file/directory targets
  18 |     specified within the command arguments (`args_str`). Stdin size is capped.
  19 | *   Standardized Output: Returns consistent `ToolResult` TypedDict with stdout, stderr,
  20 |     exit_code, success status, timing, and truncation info. Output is truncated.
  21 | *   Command Integrity: Checks command availability and checksums (lazily, with re-verification).
  22 |     Enforces minimum versions.
  23 | *   Performance: Includes per-tool concurrency limits and optional disk-based caching
  24 |     of identical command invocations.
  25 | *   LLM-Friendly: Detailed docstrings, structured errors with codes, optional streaming modes,
  26 |     and a `dry_run` option enhance usability for AI agents.
  27 | """
  28 | 
  29 | import asyncio
  30 | import hashlib
  31 | import json
  32 | import os
  33 | import random
  34 | import re
  35 | import shlex
  36 | import shutil
  37 | import sys
  38 | import textwrap
  39 | import time
  40 | from dataclasses import (
  41 |     dataclass,
  42 | )  # Keep field for potential future use, though not used for update now
  43 | from enum import Enum
  44 | from pathlib import Path
  45 | from typing import (
  46 |     AsyncIterator,
  47 |     Dict,
  48 |     List,
  49 |     Optional,
  50 |     Sequence,
  51 |     Tuple,
  52 |     TypedDict,
  53 |     cast,
  54 | )
  55 | 
  56 | import aiofiles  # Needed for async checksum
  57 | 
  58 | from ultimate_mcp_server.exceptions import ToolExecutionError, ToolInputError
  59 | from ultimate_mcp_server.tools.base import with_error_handling, with_tool_metrics
  60 | from ultimate_mcp_server.utils import get_logger
  61 | 
  62 | logger = get_logger("ultimate_mcp_server.tools.local_text")
  63 | 
# Conditional import for resource limiting and sandboxing.
# HAS_RESOURCE / HAS_PRCTL serve as feature flags for the rest of the module:
# when False, the corresponding hardening is simply skipped.
try:
    import resource  # type: ignore [import-not-found]  # Unix-only stdlib module

    HAS_RESOURCE = True
except ImportError:
    HAS_RESOURCE = False
    logger.debug("`resource` module not found (likely non-Unix). Resource limits disabled.")

try:
    import prctl  # type: ignore [import-not-found]  # Linux-only

    HAS_PRCTL = True
except ImportError:
    HAS_PRCTL = False
    logger.debug("`prctl` module not found (likely non-Linux). Advanced sandboxing disabled.")
  80 | 
# --------------------------------------------------------------------------- #
# Configuration (Loaded from Environment or Defaults)
# --------------------------------------------------------------------------- #

#: Maximum bytes returned in stdout / stderr before truncation
MAX_OUTPUT_BYTES = int(os.getenv("MCP_TEXT_MAX_OUTPUT", "1_000_000"))  # 1 MiB default
#: Maximum bytes accepted via stdin (`input_data`)
MAX_INPUT_BYTES = int(os.getenv("MCP_TEXT_MAX_INPUT", "25_000_000"))  # 25 MiB default
#: Maximum seconds a command may run before being terminated
DEFAULT_TIMEOUT = float(os.getenv("MCP_TEXT_TIMEOUT", "30"))
#: Workspace root – **all file/directory arguments must resolve inside this tree**
# Falls back to the current directory if the configured path is missing,
# not a directory, or cannot be resolved.
try:
    WORKSPACE_DIR_STR = os.getenv("MCP_TEXT_WORKSPACE", ".")
    WORKSPACE_DIR = Path(WORKSPACE_DIR_STR).resolve()
    if not WORKSPACE_DIR.is_dir():
        logger.warning(
            f"MCP_TEXT_WORKSPACE ('{WORKSPACE_DIR_STR}' -> '{WORKSPACE_DIR}') is not a directory. Defaulting to current."
        )
        WORKSPACE_DIR = Path(".").resolve()
except Exception as e:
    logger.error(
        f"Error resolving MCP_TEXT_WORKSPACE ('{WORKSPACE_DIR_STR}'): {e}. Defaulting to current."
    )
    WORKSPACE_DIR = Path(".").resolve()
logger.info(f"LocalTextTools workspace confined to: {WORKSPACE_DIR}")

#: Disk cache directory for command results
CACHE_DIR_STR = os.getenv("MCP_TEXT_CACHE_DIR", "~/.cache/ultimate_mcp_server/local_text_tools")
CACHE_DIR = Path(CACHE_DIR_STR).expanduser().resolve()
CACHE_ENABLED = os.getenv("MCP_TEXT_CACHE_ENABLED", "true").lower() == "true"
CACHE_MAX_SIZE_MB = int(os.getenv("MCP_TEXT_CACHE_MAX_MB", "500"))
CACHE_MAX_AGE_DAYS = int(os.getenv("MCP_TEXT_CACHE_MAX_AGE_DAYS", "7"))

if CACHE_ENABLED:
    try:
        CACHE_DIR.mkdir(parents=True, exist_ok=True)
        logger.info(f"Using command invocation cache directory: {CACHE_DIR}")
    except OSError as e:
        logger.error(
            f"Failed to create command cache directory {CACHE_DIR}: {e}. Caching disabled."
        )
        CACHE_ENABLED = False  # Disable cache if directory fails

# Concurrency limits per command; each is individually overridable via env.
# rg gets a higher default (8) than the others (4).
DEFAULT_CONCURRENCY = 4
CONCURRENCY_LIMITS = {
    "rg": int(os.getenv("MCP_TEXT_CONCURRENCY_RG", "8")),
    "awk": int(os.getenv("MCP_TEXT_CONCURRENCY_AWK", str(DEFAULT_CONCURRENCY))),
    "sed": int(os.getenv("MCP_TEXT_CONCURRENCY_SED", str(DEFAULT_CONCURRENCY))),
    "jq": int(os.getenv("MCP_TEXT_CONCURRENCY_JQ", str(DEFAULT_CONCURRENCY))),
}

# Forbidden shell metacharacters (pre-compiled set for efficiency)
_FORBIDDEN_CHARS_SET = frozenset("&;`|><$()")
 135 | 
 136 | 
 137 | # --------------------------------------------------------------------------- #
 138 | # Error Codes Enum
 139 | # --------------------------------------------------------------------------- #
class ToolErrorCode(str, Enum):
    """Machine-readable error codes for local text tools.

    Subclasses ``str`` so members compare equal to their string values and
    serialize naturally (e.g. into ``ToolResult.error_code``).
    """

    # Input validation / security rejections
    PATH_TRAVERSAL = "PATH_TRAVERSAL"
    ABS_PATH_FORBIDDEN = "ABS_PATH_FORBIDDEN"
    WORKSPACE_VIOLATION = "WORKSPACE_VIOLATION"
    FORBIDDEN_FLAG = "FORBIDDEN_FLAG"
    FORBIDDEN_CHAR = "FORBIDDEN_CHAR"
    CMD_SUBSTITUTION = "CMD_SUBSTITUTION"  # Often related to $() or ``
    INVALID_ARGS = "INVALID_ARGS"
    INPUT_TOO_LARGE = "INPUT_TOO_LARGE"
    INVALID_JSON_INPUT = "INVALID_JSON_INPUT"
    # Environment / execution failures
    CMD_NOT_FOUND = "CMD_NOT_FOUND"
    TIMEOUT = "TIMEOUT"
    EXEC_ERROR = "EXEC_ERROR"
    COMMUNICATION_ERROR = "COMMUNICATION_ERROR"
    CHECKSUM_MISMATCH = "CHECKSUM_MISMATCH"
    VERSION_TOO_OLD = "VERSION_TOO_OLD"
    UNEXPECTED_FAILURE = "UNEXPECTED_FAILURE"
    CACHE_ERROR = "CACHE_ERROR"
 160 | 
 161 | 
 162 | # --------------------------------------------------------------------------- #
 163 | # Result Schema (TypedDict)
 164 | # --------------------------------------------------------------------------- #
class ToolResult(TypedDict, total=False):
    """Standardized result structure for local text tool executions.

    ``total=False`` makes every key optional: producers populate only the
    fields relevant to a given outcome (e.g. ``dry_run_cmdline`` is present
    only for dry runs, ``error``/``error_code`` only on failure).
    """

    stdout: Optional[str]  # Decoded standard output (potentially truncated)
    stderr: Optional[str]  # Decoded standard error (potentially truncated)
    exit_code: Optional[int]  # Process exit code
    success: bool  # True if execution considered successful (depends on tool/retcode)
    error: Optional[str]  # Human-readable error message if success is False
    error_code: Optional[ToolErrorCode]  # Machine-readable error code if success is False
    duration: float  # Execution duration in seconds
    stdout_truncated: bool  # True if stdout was truncated
    stderr_truncated: bool  # True if stderr was truncated
    cached_result: bool  # True if this result was served from cache
    dry_run_cmdline: Optional[List[str]]  # Populated only if dry_run=True
 179 | 
 180 | 
 181 | # --------------------------------------------------------------------------- #
 182 | # Command Metadata & Discovery
 183 | # --------------------------------------------------------------------------- #
 184 | 
 185 | 
@dataclass(slots=True, frozen=True)  # Make truly immutable
class CommandMeta:
    """Metadata for a command-line tool.

    Instances are frozen (and slotted), so dynamic fields such as
    ``checksum``/``mtime`` are updated by replacing the whole entry in
    ``_COMMAND_METADATA`` with a new instance rather than by mutation.
    """

    name: str
    path: Optional[Path] = None  # Store the resolved absolute path
    checksum: Optional[str] = None  # SHA-256 checksum of the executable (calculated lazily)
    mtime: Optional[float] = None  # Last modification time (for checksum re-verification)
    version: Optional[tuple[int, ...]] = None  # Parsed version tuple (e.g., (13, 0, 1))
    forbidden_flags: frozenset[str] = frozenset()  # Flags disallowed for security
    readonly: bool = True  # True if the command should not modify the filesystem
    min_version: Optional[tuple[int, ...]] = None  # Minimum required version tuple
 198 | 
 199 | 
# Store CommandMeta objects, keyed by command name.
# Paths/mtimes are filled in by _initial_command_discovery() at module load;
# checksums and versions are resolved lazily on first use.
_COMMAND_METADATA: Dict[str, CommandMeta] = {
    "rg": CommandMeta("rg", min_version=(13, 0, 0)),  # Example: Require ripgrep >= 13.0.0
    "awk": CommandMeta(
        "awk", forbidden_flags=frozenset({"-i", "--in-place"})
    ),  # AWK in-place is less common but exists (gawk)
    "sed": CommandMeta("sed", forbidden_flags=frozenset({"-i", "--in-place"})),
    "jq": CommandMeta("jq", min_version=(1, 6)),  # Example: Require jq >= 1.6
}
_COMMAND_VERSIONS_CACHE: Dict[str, Optional[tuple[int, ...]]] = {}  # Cache parsed versions
_checksum_lock = asyncio.Lock()  # Lock for lazy checksum calculation
_version_lock = asyncio.Lock()  # Lock for lazy version checking
 212 | 
 213 | 
 214 | async def _calculate_sha256sum_async(path: Path, chunk: int = 262_144) -> str:
 215 |     """Asynchronously calculates SHA256 checksum using aiofiles."""
 216 |     h = hashlib.sha256()
 217 |     try:
 218 |         async with aiofiles.open(path, "rb") as fh:
 219 |             while True:
 220 |                 blk = await fh.read(chunk)
 221 |                 if not blk:
 222 |                     break
 223 |                 h.update(blk)
 224 |         return h.hexdigest()
 225 |     except OSError as e:
 226 |         logger.error(f"Failed to calculate SHA256 for {path}: {e}")
 227 |         return "error_calculating_checksum"
 228 | 
 229 | 
 230 | async def _get_command_checksum(meta: CommandMeta) -> Optional[str]:
 231 |     """Lazily calculates and caches the command checksum, verifying mtime."""
 232 |     global _COMMAND_METADATA  # Allow modification (replacing entry)
 233 |     if not meta.path:
 234 |         return None  # Cannot checksum if path unknown
 235 | 
 236 |     async with _checksum_lock:
 237 |         # Re-fetch meta inside lock in case it was updated by another coroutine
 238 |         meta_locked = _COMMAND_METADATA.get(meta.name)
 239 |         if not meta_locked or not meta_locked.path:
 240 |             return None
 241 | 
 242 |         needs_recalc = False
 243 |         current_mtime = None
 244 |         try:
 245 |             # Use asyncio.to_thread for potentially blocking stat
 246 |             stat_res = await asyncio.to_thread(meta_locked.path.stat)
 247 |             current_mtime = stat_res.st_mtime
 248 |             if (
 249 |                 meta_locked.checksum is None
 250 |                 or meta_locked.mtime is None
 251 |                 or current_mtime != meta_locked.mtime
 252 |             ):
 253 |                 needs_recalc = True
 254 |         except OSError as e:
 255 |             logger.warning(
 256 |                 f"Could not stat {meta_locked.path} for checksum verification: {e}. Recalculating."
 257 |             )
 258 |             needs_recalc = True  # Force recalc if stat fails
 259 | 
 260 |         if needs_recalc:
 261 |             logger.debug(
 262 |                 f"Calculating checksum for {meta.name} (mtime changed or first calculation)..."
 263 |             )
 264 |             new_checksum = await _calculate_sha256sum_async(meta_locked.path)
 265 |             _COMMAND_METADATA[meta.name] = CommandMeta(
 266 |                 name=meta_locked.name,
 267 |                 path=meta_locked.path,
 268 |                 checksum=new_checksum,
 269 |                 mtime=current_mtime,  # Store the mtime when checksum was calculated
 270 |                 forbidden_flags=meta_locked.forbidden_flags,
 271 |                 readonly=meta_locked.readonly,
 272 |                 min_version=meta_locked.min_version,
 273 |             )
 274 |             logger.debug(f"Updated checksum for {meta.name}: {new_checksum[:12]}...")
 275 |             return new_checksum
 276 |         else:
 277 |             # Return cached checksum
 278 |             return meta_locked.checksum
 279 | 
 280 | 
 281 | async def _parse_version(cmd_path: Path) -> Optional[tuple[int, ...]]:
 282 |     """Runs '<tool> --version' and parses semantic version tuple."""
 283 |     try:
 284 |         proc = await asyncio.create_subprocess_exec(
 285 |             str(cmd_path),
 286 |             "--version",
 287 |             stdout=asyncio.subprocess.PIPE,
 288 |             stderr=asyncio.subprocess.PIPE,
 289 |         )
 290 |         stdout_b, stderr_b = await asyncio.wait_for(proc.communicate(), timeout=5.0)
 291 |         if proc.returncode != 0:
 292 |             logger.warning(
 293 |                 f"Command '{cmd_path.name} --version' failed with code {proc.returncode}: {stderr_b.decode(errors='ignore')[:100]}"
 294 |             )
 295 |             return None
 296 | 
 297 |         output = stdout_b.decode(errors="ignore").strip()
 298 |         # Common version patterns (adjust as needed for specific tools)
 299 |         match = re.search(r"(\d+)\.(\d+)(?:\.(\d+))?", output)
 300 |         if match:
 301 |             major = int(match.group(1))
 302 |             minor = int(match.group(2))
 303 |             patch = int(match.group(3) or 0)  # Default patch to 0
 304 |             return (major, minor, patch)
 305 |         else:
 306 |             logger.warning(f"Could not parse version from '{cmd_path.name}' output: {output[:100]}")
 307 |             return None
 308 |     except asyncio.TimeoutError:
 309 |         logger.warning(f"Timeout getting version for '{cmd_path.name}'.")
 310 |         return None
 311 |     except Exception as e:
 312 |         logger.error(f"Error getting version for '{cmd_path.name}': {e}")
 313 |         return None
 314 | 
 315 | 
 316 | async def _check_command_version(meta: CommandMeta) -> None:
 317 |     """Checks if the command meets the minimum version requirement."""
 318 |     global _COMMAND_VERSIONS_CACHE  # Allow modification
 319 |     if not meta.path or not meta.min_version:
 320 |         return  # Skip check if no path or no minimum defined
 321 | 
 322 |     async with _version_lock:
 323 |         # Check cache first
 324 |         if meta.name in _COMMAND_VERSIONS_CACHE:
 325 |             actual_version = _COMMAND_VERSIONS_CACHE[meta.name]
 326 |         else:
 327 |             # Parse version if not cached
 328 |             actual_version = await _parse_version(meta.path)
 329 |             _COMMAND_VERSIONS_CACHE[meta.name] = actual_version  # Cache result (even None)
 330 | 
 331 |     if actual_version is None:
 332 |         logger.warning(
 333 |             f"Could not determine version for '{meta.name}'. Skipping minimum version check."
 334 |         )
 335 |         return
 336 | 
 337 |     if actual_version < meta.min_version:
 338 |         actual_str = ".".join(map(str, actual_version))
 339 |         required_str = ".".join(map(str, meta.min_version))
 340 |         raise ToolExecutionError(
 341 |             f"Command '{meta.name}' version ({actual_str}) is older than required minimum ({required_str}). Please update.",
 342 |             error_code=ToolErrorCode.VERSION_TOO_OLD,
 343 |             details={"required": required_str, "actual": actual_str},
 344 |         )
 345 |     else:
 346 |         logger.debug(
 347 |             f"Version check passed for {meta.name} (found {''.join(map(str, actual_version))}, required >= {''.join(map(str, meta.min_version))})"
 348 |         )
 349 | 
 350 | 
 351 | def _initial_command_discovery() -> None:
 352 |     """Finds commands and stores paths. Logs warnings for missing commands."""
 353 |     global _COMMAND_METADATA  # Modify the global dict
 354 |     missing: list[str] = []
 355 |     updated_metadata = {}
 356 | 
 357 |     for name, meta in _COMMAND_METADATA.items():
 358 |         exe_path_str = shutil.which(name)
 359 |         if exe_path_str is None:
 360 |             missing.append(name)
 361 |             # Keep original meta without path if not found
 362 |             updated_metadata[name] = CommandMeta(
 363 |                 name=meta.name,
 364 |                 path=None,
 365 |                 checksum=None,
 366 |                 mtime=None,
 367 |                 version=None,  # Reset dynamic fields
 368 |                 forbidden_flags=meta.forbidden_flags,
 369 |                 readonly=meta.readonly,
 370 |                 min_version=meta.min_version,
 371 |             )
 372 |             continue
 373 | 
 374 |         resolved_path = Path(exe_path_str).resolve()
 375 |         current_mtime = None
 376 |         try:
 377 |             # Use sync stat here, module load time is acceptable
 378 |             current_mtime = resolved_path.stat().st_mtime
 379 |         except OSError as e:
 380 |             logger.warning(f"Could not stat {resolved_path} during initial discovery: {e}")
 381 | 
 382 |         # Create updated CommandMeta with path and mtime (checksum/version are lazy)
 383 |         updated_meta = CommandMeta(
 384 |             name=meta.name,
 385 |             path=resolved_path,
 386 |             checksum=None,  # Lazy loaded
 387 |             mtime=current_mtime,
 388 |             # Keep original version/checksum null, they are loaded async later
 389 |             version=None,
 390 |             forbidden_flags=meta.forbidden_flags,
 391 |             readonly=meta.readonly,
 392 |             min_version=meta.min_version,
 393 |         )
 394 |         updated_metadata[name] = updated_meta
 395 |         logger.debug(f"{name}: found at {resolved_path}")
 396 | 
 397 |     _COMMAND_METADATA = updated_metadata  # Replace global dict
 398 | 
 399 |     if missing:
 400 |         logger.warning(
 401 |             "Missing local text CLI tools: %s. Corresponding functions will fail.",
 402 |             ", ".join(missing),
 403 |             # emoji_key="warning", # Assuming logger supports this extra field
 404 |         )
 405 | 
 406 | 
 407 | # Run discovery when the module is loaded
 408 | _initial_command_discovery()
 409 | 
 410 | 
 411 | # --------------------------------------------------------------------------- #
 412 | # Argument Validation Placeholder
 413 | # --------------------------------------------------------------------------- #
 414 | 
 415 | 
 416 | def _validate_arguments(cmd_name: str, argv: List[str]) -> None:
 417 |     """
 418 |     Validates command arguments for security and workspace compliance.
 419 |     (Placeholder implementation - refine based on specific security needs)
 420 |     """
 421 |     meta = _COMMAND_METADATA.get(cmd_name)
 422 |     if not meta:
 423 |         # This should generally not happen if called after discovery
 424 |         raise ToolInputError(
 425 |             f"Metadata not found for command '{cmd_name}' during validation.",
 426 |             param_name="cmd_name",
 427 |             details={"unexpected_failure": True},
 428 |         )
 429 | 
 430 |     forbidden_flags = meta.forbidden_flags
 431 | 
 432 |     for i, arg in enumerate(argv):
 433 |         # Check for forbidden flags
 434 |         if arg in forbidden_flags:
 435 |             raise ToolInputError(
 436 |                 f"Forbidden flag '{arg}' is not allowed for command '{cmd_name}'.",
 437 |                 param_name="args_str",
 438 |                 details={
 439 |                     "argument": arg,
 440 |                     "index": i,
 441 |                     "command": cmd_name,
 442 |                     "error_code": ToolErrorCode.FORBIDDEN_FLAG.value,
 443 |                 },
 444 |             )
 445 | 
 446 |         # Command-specific validation rules
 447 |         if cmd_name == "rg":
 448 |             # For rg, allow regex metacharacters ( (), |, ?, +, *, { }, [, ], ^, $, . )
 449 |             forbidden_chars = {
 450 |                 char for char in arg if char in _FORBIDDEN_CHARS_SET and char not in "()|"
 451 |             }
 452 |             if forbidden_chars:
 453 |                 raise ToolInputError(
 454 |                     f"Argument '{arg}' contains forbidden shell metacharacter(s): {', '.join(sorted(forbidden_chars))}",
 455 |                     param_name="args_str",
 456 |                     details={
 457 |                         "argument": arg,
 458 |                         "index": i,
 459 |                         "forbidden_chars": sorted(list(forbidden_chars)),
 460 |                         "error_code": ToolErrorCode.FORBIDDEN_CHAR.value,
 461 |                     },
 462 |                 )
 463 | 
 464 |             # Basic check for command substitution patterns (can be complex)
 465 |             if "`" in arg or "$(" in arg:
 466 |                 raise ToolInputError(
 467 |                     f"Argument '{arg}' seems to contain command substitution, which is forbidden.",
 468 |                     param_name="args_str",
 469 |                     details={
 470 |                         "argument": arg,
 471 |                         "index": i,
 472 |                         "error_code": ToolErrorCode.CMD_SUBSTITUTION.value,
 473 |                     },
 474 |                 )
 475 | 
 476 |             continue  # Skip further checks for rg
 477 | 
 478 |         elif cmd_name == "jq":
 479 |             # For jq, allow (, ), |, > and < characters as they're essential for the query language
 480 |             forbidden_chars = {
 481 |                 char for char in arg if char in _FORBIDDEN_CHARS_SET and char not in "()|}><"
 482 |             }
 483 |             if forbidden_chars:
 484 |                 raise ToolInputError(
 485 |                     f"Argument '{arg}' contains forbidden shell metacharacter(s): {', '.join(sorted(forbidden_chars))}",
 486 |                     param_name="args_str",
 487 |                     details={
 488 |                         "argument": arg,
 489 |                         "index": i,
 490 |                         "forbidden_chars": sorted(list(forbidden_chars)),
 491 |                         "error_code": ToolErrorCode.FORBIDDEN_CHAR.value,
 492 |                     },
 493 |                 )
 494 | 
 495 |             # Basic check for command substitution patterns (can be complex)
 496 |             if "`" in arg or "$(" in arg:
 497 |                 raise ToolInputError(
 498 |                     f"Argument '{arg}' seems to contain command substitution, which is forbidden.",
 499 |                     param_name="args_str",
 500 |                     details={
 501 |                         "argument": arg,
 502 |                         "index": i,
 503 |                         "error_code": ToolErrorCode.CMD_SUBSTITUTION.value,
 504 |                     },
 505 |                 )
 506 | 
 507 |             continue  # Skip further checks for jq
 508 | 
 509 |         elif cmd_name == "awk":
 510 |             # For awk, allow $ character (for field references), / (for regex patterns),
 511 |             # and other characters needed for awk scripts like (, ), ;
 512 |             # Allow '>' and '<' for comparisons in AWK
 513 |             forbidden_chars = {
 514 |                 char for char in arg if char in _FORBIDDEN_CHARS_SET and char not in "$/();{}<>"
 515 |             }
 516 |             
 517 |             # Special check for file redirection (> followed by a string in quotes)
 518 |             # Pattern detects constructs like: print $1 > "file.txt" or print $1 > 'file.txt'
 519 |             if re.search(r'>\s*["\']', arg) or re.search(r'print.*>\s*["\']', arg):
 520 |                 raise ToolInputError(
 521 |                     f"Argument '{arg}' appears to contain file redirection, which is forbidden.",
 522 |                     param_name="args_str",
 523 |                     details={
 524 |                         "argument": arg,
 525 |                         "index": i,
 526 |                         "error_code": ToolErrorCode.FORBIDDEN_CHAR.value,
 527 |                     },
 528 |                 )
 529 |                 
 530 |             if forbidden_chars:
 531 |                 raise ToolInputError(
 532 |                     f"Argument '{arg}' contains forbidden shell metacharacter(s): {', '.join(sorted(forbidden_chars))}",
 533 |                     param_name="args_str",
 534 |                     details={
 535 |                         "argument": arg,
 536 |                         "index": i,
 537 |                         "forbidden_chars": sorted(list(forbidden_chars)),
 538 |                         "error_code": ToolErrorCode.FORBIDDEN_CHAR.value,
 539 |                     },
 540 |                 )
 541 | 
 542 |             # Basic check for command substitution patterns (can be complex)
 543 |             if "`" in arg or "$(" in arg:
 544 |                 raise ToolInputError(
 545 |                     f"Argument '{arg}' seems to contain command substitution, which is forbidden.",
 546 |                     param_name="args_str",
 547 |                     details={
 548 |                         "argument": arg,
 549 |                         "index": i,
 550 |                         "error_code": ToolErrorCode.CMD_SUBSTITUTION.value,
 551 |                     },
 552 |                 )
 553 | 
 554 |             # Don't treat awk patterns like /pattern/ as absolute paths
 555 |             if arg.startswith("/") and not (arg.count("/") >= 2 and arg[1:].find("/") > 0):
 556 |                 # Still check for absolute paths that aren't regex patterns
 557 |                 # A regex pattern would typically have at least one more / after the first character
 558 |                 try:
 559 |                     # Resolve the path relative to the workspace *without* accessing filesystem yet
 560 |                     # Use os.path.normpath and os.path.join for basic checks before full resolve
 561 |                     norm_path = os.path.normpath(os.path.join(str(WORKSPACE_DIR), arg))
 562 |                     if not norm_path.startswith(str(WORKSPACE_DIR)):
 563 |                         raise ToolInputError(
 564 |                             f"Path traversal or absolute path '{arg}' is forbidden.",
 565 |                             param_name="args_str",
 566 |                             details={
 567 |                                 "argument": arg,
 568 |                                 "index": i,
 569 |                                 "error_code": ToolErrorCode.PATH_TRAVERSAL.value,
 570 |                             },
 571 |                         )
 572 |                 except Exception as e:
 573 |                     logger.error(f"Error checking path '{arg}': {e}")
 574 |                     raise ToolInputError(
 575 |                         f"Invalid path argument '{arg}'.",
 576 |                         param_name="args_str",
 577 |                         details={
 578 |                             "argument": arg,
 579 |                             "index": i,
 580 |                             "error_code": ToolErrorCode.INVALID_ARGS.value,
 581 |                         },
 582 |                     ) from e
 583 | 
 584 |             continue  # Skip further checks for awk
 585 | 
 586 |         elif cmd_name == "sed":
 587 |             # For sed, allow / (for regex patterns), as well as |, (, ) for sed expressions
 588 |             forbidden_chars = {
 589 |                 char for char in arg if char in _FORBIDDEN_CHARS_SET and char not in "/|()"
 590 |             }
 591 |             if forbidden_chars:
 592 |                 raise ToolInputError(
 593 |                     f"Argument '{arg}' contains forbidden shell metacharacter(s): {', '.join(sorted(forbidden_chars))}",
 594 |                     param_name="args_str",
 595 |                     details={
 596 |                         "argument": arg,
 597 |                         "index": i,
 598 |                         "forbidden_chars": sorted(list(forbidden_chars)),
 599 |                         "error_code": ToolErrorCode.FORBIDDEN_CHAR.value,
 600 |                     },
 601 |                 )
 602 | 
 603 |             # Basic check for command substitution patterns (can be complex)
 604 |             if "`" in arg or "$(" in arg:
 605 |                 raise ToolInputError(
 606 |                     f"Argument '{arg}' seems to contain command substitution, which is forbidden.",
 607 |                     param_name="args_str",
 608 |                     details={
 609 |                         "argument": arg,
 610 |                         "index": i,
 611 |                         "error_code": ToolErrorCode.CMD_SUBSTITUTION.value,
 612 |                     },
 613 |                 )
 614 | 
 615 |             # Don't treat sed patterns like /pattern/ as absolute paths
 616 |             if (
 617 |                 arg.startswith("/")
 618 |                 and arg != "/"
 619 |                 and "/ " not in arg
 620 |                 and not (arg.count("/") >= 2 and arg[1:].find("/") > 0)
 621 |             ):
 622 |                 # Still check for absolute paths that aren't regex patterns
 623 |                 try:
 624 |                     # Resolve the path relative to the workspace *without* accessing filesystem yet
 625 |                     # Use os.path.normpath and os.path.join for basic checks before full resolve
 626 |                     norm_path = os.path.normpath(os.path.join(str(WORKSPACE_DIR), arg))
 627 |                     if not norm_path.startswith(str(WORKSPACE_DIR)):
 628 |                         raise ToolInputError(
 629 |                             f"Path traversal or absolute path '{arg}' is forbidden.",
 630 |                             param_name="args_str",
 631 |                             details={
 632 |                                 "argument": arg,
 633 |                                 "index": i,
 634 |                                 "error_code": ToolErrorCode.PATH_TRAVERSAL.value,
 635 |                             },
 636 |                         )
 637 |                 except Exception as e:
 638 |                     logger.error(f"Error checking path '{arg}': {e}")
 639 |                     raise ToolInputError(
 640 |                         f"Invalid path argument '{arg}'.",
 641 |                         param_name="args_str",
 642 |                         details={
 643 |                             "argument": arg,
 644 |                             "index": i,
 645 |                             "error_code": ToolErrorCode.INVALID_ARGS.value,
 646 |                         },
 647 |                     ) from e
 648 | 
 649 |             continue  # Skip further checks for sed
 650 | 
 651 |         # Standard checks for all other commands
 652 |         # Check for forbidden characters
 653 |         if any(char in _FORBIDDEN_CHARS_SET for char in arg):
 654 |             # Be more specific about which char if possible
 655 |             found_chars = {char for char in arg if char in _FORBIDDEN_CHARS_SET}
 656 |             raise ToolInputError(
 657 |                 f"Argument '{arg}' contains forbidden shell metacharacter(s): {', '.join(sorted(found_chars))}",
 658 |                 param_name="args_str",
 659 |                 details={
 660 |                     "argument": arg,
 661 |                     "index": i,
 662 |                     "forbidden_chars": sorted(list(found_chars)),
 663 |                     "error_code": ToolErrorCode.FORBIDDEN_CHAR.value,
 664 |                 },
 665 |             )
 666 | 
 667 |         # Basic check for command substitution patterns (can be complex)
 668 |         if "`" in arg or "$(" in arg:
 669 |             raise ToolInputError(
 670 |                 f"Argument '{arg}' seems to contain command substitution, which is forbidden.",
 671 |                 param_name="args_str",
 672 |                 details={
 673 |                     "argument": arg,
 674 |                     "index": i,
 675 |                     "error_code": ToolErrorCode.CMD_SUBSTITUTION.value,
 676 |                 },
 677 |             )
 678 | 
 679 |         # --- Path Validation ---
 680 |         # Heuristic: Does the argument look like a path that needs checking?
 681 |         # This is tricky. We might only check args that aren't flags (don't start with '-')
 682 |         # or args known to take paths for specific commands.
 683 |         # For simplicity here, we check any arg that contains '/' or could be a filename.
 684 |         # More robust: parse args properly (difficult) or check based on tool context.
 685 |         potential_path = False
 686 |         if not arg.startswith("-") and (os.sep in arg or "." in arg or Path(arg).suffix):
 687 |             potential_path = True
 688 |             # Allow '-' as a special argument representing stdin/stdout
 689 |             if arg == "-":
 690 |                 potential_path = False
 691 | 
 692 |         if potential_path:
 693 |             try:
 694 |                 # Disallow absolute paths
 695 |                 if Path(arg).is_absolute():
 696 |                     raise ToolInputError(
 697 |                         f"Absolute paths like '{arg}' are forbidden. Use paths relative to the workspace.",
 698 |                         param_name="args_str",
 699 |                         details={
 700 |                             "argument": arg,
 701 |                             "index": i,
 702 |                             "error_code": ToolErrorCode.ABS_PATH_FORBIDDEN.value,
 703 |                         },
 704 |                     )
 705 | 
 706 |                 # Resolve the path relative to the workspace *without* accessing filesystem yet
 707 |                 # Use os.path.normpath and os.path.join for basic checks before full resolve
 708 |                 norm_path = os.path.normpath(os.path.join(str(WORKSPACE_DIR), arg))
 709 | 
 710 |                 # Check for path traversal using normpath result
 711 |                 if not norm_path.startswith(str(WORKSPACE_DIR)):
 712 |                     # Check specifically for '..' components that might escape
 713 |                     if ".." in Path(arg).parts:
 714 |                         raise ToolInputError(
 715 |                             f"Path traversal ('..') is forbidden in argument '{arg}'.",
 716 |                             param_name="args_str",
 717 |                             details={
 718 |                                 "argument": arg,
 719 |                                 "index": i,
 720 |                                 "error_code": ToolErrorCode.PATH_TRAVERSAL.value,
 721 |                             },
 722 |                         )
 723 |                     else:
 724 |                         # Generic workspace violation if normpath doesn't match prefix (e.g., symlinks handled later)
 725 |                         raise ToolInputError(
 726 |                             f"Argument '{arg}' resolves outside the allowed workspace '{WORKSPACE_DIR}'.",
 727 |                             param_name="args_str",
 728 |                             details={
 729 |                                 "argument": arg,
 730 |                                 "index": i,
 731 |                                 "resolved_norm": norm_path,
 732 |                                 "error_code": ToolErrorCode.WORKSPACE_VIOLATION.value,
 733 |                             },
 734 |                         )
 735 | 
 736 |                 # More robust check: Resolve the path fully and check again
 737 |                 # This *does* access the filesystem but ensures symlinks are handled
 738 |                 resolved_arg_path = (WORKSPACE_DIR / arg).resolve()
 739 |                 if not resolved_arg_path.is_relative_to(WORKSPACE_DIR):
 740 |                     raise ToolInputError(
 741 |                         f"Argument '{arg}' resolves outside the allowed workspace '{WORKSPACE_DIR}' (checked after resolving symlinks).",
 742 |                         param_name="args_str",
 743 |                         details={
 744 |                             "argument": arg,
 745 |                             "index": i,
 746 |                             "resolved_absolute": str(resolved_arg_path),
 747 |                             "error_code": ToolErrorCode.WORKSPACE_VIOLATION.value,
 748 |                         },
 749 |                     )
 750 | 
 751 |             except OSError as e:
 752 |                 # Ignore errors resolving paths that might not exist yet (e.g., output files for some tools)
 753 |                 # But log a warning. More strict validation could forbid non-existent input paths.
 754 |                 logger.debug(
 755 |                     f"Could not fully resolve potential path argument '{arg}': {e}. Assuming OK if basic checks passed."
 756 |                 )
 757 |             except ToolInputError:
 758 |                 raise  # Re-raise our specific validation errors
 759 |             except Exception as e:
 760 |                 logger.error(f"Unexpected error validating argument '{arg}': {e}", exc_info=True)
 761 |                 raise ToolInputError(
 762 |                     f"Unexpected error during validation of argument '{arg}'.",
 763 |                     param_name="args_str",
 764 |                     details={
 765 |                         "argument": arg,
 766 |                         "index": i,
 767 |                         "error_code": ToolErrorCode.UNEXPECTED_FAILURE.value,
 768 |                     },
 769 |                 ) from e
 770 | 
 771 | 
 772 | # --------------------------------------------------------------------------- #
 773 | # Invocation Caching (Disk-based LRU-like)
 774 | # --------------------------------------------------------------------------- #
 775 | 
 776 | 
 777 | def _get_cache_key(cmd_name: str, argv: Sequence[str], input_data_bytes: Optional[bytes]) -> str:
 778 |     """Creates a hash key based on command, args, and input data bytes."""
 779 |     hasher = hashlib.sha256()
 780 |     hasher.update(cmd_name.encode())
 781 |     for arg in argv:
 782 |         hasher.update(arg.encode())
 783 |     if input_data_bytes is not None:
 784 |         hasher.update(b"\x00\x01")  # Separator for input data
 785 |         hasher.update(input_data_bytes)
 786 |     else:
 787 |         hasher.update(b"\x00\x00")  # Separator for no input data
 788 |     return hasher.hexdigest()
 789 | 
 790 | 
 791 | def _get_cache_path(key: str) -> Path:
 792 |     """Gets the file path for a cache key."""
 793 |     # Simple structure: cache_dir / key_prefix / key.json
 794 |     prefix = key[:2]
 795 |     return CACHE_DIR / prefix / f"{key}.json"
 796 | 
 797 | 
 798 | # --- Async Cache Get/Put using asyncio.to_thread for OS file IO ---
async def _cache_get_async(key: str) -> Optional[ToolResult]:
    """Asynchronously gets a result from the disk cache.

    Every failure mode — cache disabled, missing file, expired entry,
    unreadable or corrupt JSON — is treated as a miss and returns None.
    Blocking path operations (exists/stat/unlink) are offloaded via
    ``asyncio.to_thread``; the file body itself is read with aiofiles.

    Args:
        key: Cache key as produced by ``_get_cache_key``.

    Returns:
        The reconstructed ToolResult with ``cached_result`` forced to True,
        or None on any kind of miss.
    """
    if not CACHE_ENABLED:
        return None
    cache_path = _get_cache_path(key)
    try:
        if await asyncio.to_thread(cache_path.exists):
            # Check cache entry age (mtime-based; also what cleanup uses)
            stat_res = await asyncio.to_thread(cache_path.stat)
            age_seconds = time.time() - stat_res.st_mtime
            if age_seconds > (CACHE_MAX_AGE_DAYS * 24 * 3600):
                logger.debug(
                    f"Cache entry {key[:8]} expired (age {age_seconds:.0f}s > {CACHE_MAX_AGE_DAYS}d). Removing."
                )
                try:
                    await asyncio.to_thread(cache_path.unlink)
                except OSError as e:
                    logger.warning(f"Failed to remove expired cache file {cache_path}: {e}")
                return None

            # Read cached data (aiofiles is okay for read/write content)
            async with aiofiles.open(cache_path, mode="r", encoding="utf-8") as f:
                content = await f.read()
            data = json.loads(content)
            # --- Type Check and Reconstruction ---
            # Expected ToolResult keys mapped to the default substituted when
            # the stored payload lacks that key. "cached_result" is forced to
            # True here so callers can distinguish a hit from fresh execution.
            required_keys = {
                "stdout": None,
                "stderr": None,
                "exit_code": None,
                "success": False,
                "duration": 0.0,
                "stdout_truncated": False,
                "stderr_truncated": False,
                "error": None,
                "error_code": None,
                "cached_result": True,  # Set flag here
            }
            validated_data = {}
            for k, default_val in required_keys.items():
                validated_data[k] = data.get(k, default_val)

            # Ensure error_code is None or a valid ToolErrorCode enum member
            # (it is stored as its string value by _cache_put_async).
            raw_error_code = validated_data.get("error_code")
            if raw_error_code is not None:
                try:
                    validated_data["error_code"] = ToolErrorCode(raw_error_code)
                except ValueError:
                    logger.warning(
                        f"Invalid error_code '{raw_error_code}' found in cache for key {key[:8]}. Setting to None."
                    )
                    validated_data["error_code"] = None

            logger.info(f"Cache HIT for key {key[:8]}.")
            # Use cast to satisfy type checker, assuming validation ensures structure
            return cast(ToolResult, validated_data)
        else:
            # logger.debug(f"Cache MISS for key {key[:8]}.")
            return None
    except (OSError, json.JSONDecodeError, TypeError) as e:
        logger.warning(f"Cache read error for key {key[:8]}: {e}. Treating as miss.")
        # Attempt to remove potentially corrupt file so it won't fail again
        try:
            if await asyncio.to_thread(cache_path.exists):
                await asyncio.to_thread(cache_path.unlink)
        except OSError:
            pass
        return None
    except Exception as e:
        logger.error(f"Unexpected cache get error for key {key[:8]}: {e}", exc_info=True)
        return None
 869 | 
 870 | 
 871 | async def _cache_put_async(key: str, result: ToolResult):
 872 |     """Asynchronously puts a result into the disk cache."""
 873 |     if not CACHE_ENABLED:
 874 |         return
 875 |     cache_path = _get_cache_path(key)
 876 |     try:
 877 |         # Ensure parent directory exists
 878 |         await asyncio.to_thread(cache_path.parent.mkdir, parents=True, exist_ok=True)
 879 | 
 880 |         # Write data as JSON
 881 |         result_to_write = result.copy()
 882 |         result_to_write.pop("cached_result", None)  # Don't store cache flag itself
 883 |         # Ensure enum is converted to string for JSON
 884 |         if result_to_write.get("error_code") is not None:
 885 |             result_to_write["error_code"] = result_to_write["error_code"].value
 886 | 
 887 |         json_data = json.dumps(result_to_write, indent=2)  # Pretty print for readability
 888 |         async with aiofiles.open(cache_path, mode="w", encoding="utf-8") as f:
 889 |             await f.write(json_data)
 890 |         logger.debug(f"Cache PUT successful for key {key[:8]}.")
 891 | 
 892 |         # Trigger cleanup check periodically (simple random approach)
 893 |         if random.random() < 0.01:  # ~1% chance on write
 894 |             asyncio.create_task(_cleanup_cache_lru())
 895 |     except (OSError, TypeError, json.JSONDecodeError) as e:
 896 |         logger.error(f"Cache write error for key {key[:8]}: {e}")
 897 |     except Exception as e:
 898 |         logger.error(f"Unexpected cache put error for key {key[:8]}: {e}", exc_info=True)
 899 | 
 900 | 
 901 | # --- Cache Cleanup (LRU-like based on mtime) ---
 902 | async def _cleanup_cache_lru():
 903 |     """Removes cache files exceeding max size or age."""
 904 |     if not CACHE_ENABLED:
 905 |         return
 906 |     logger.debug("Running cache cleanup...")
 907 |     try:
 908 |         files: List[Tuple[Path, float, int]] = []
 909 |         total_size = 0
 910 |         now = time.time()
 911 |         max_age_seconds = CACHE_MAX_AGE_DAYS * 24 * 3600
 912 |         max_size_bytes = CACHE_MAX_SIZE_MB * 1024 * 1024
 913 | 
 914 |         # Use aiofiles.os.walk (designed for async iteration)
 915 |         # Note: aiofiles.os.walk might still be less performant than scandir + to_thread for large dirs
 916 |         async for root, _, filenames in aiofiles.os.walk(CACHE_DIR):
 917 |             for filename in filenames:
 918 |                 if filename.endswith(".json"):
 919 |                     filepath = Path(root) / filename
 920 |                     try:
 921 |                         stat_res = await asyncio.to_thread(filepath.stat)
 922 |                         files.append((filepath, stat_res.st_mtime, stat_res.st_size))
 923 |                         total_size += stat_res.st_size
 924 |                     except OSError:
 925 |                         continue  # Skip files we can't stat
 926 | 
 927 |         # Sort by modification time (oldest first)
 928 |         files.sort(key=lambda x: x[1])
 929 | 
 930 |         removed_count = 0
 931 |         removed_size = 0
 932 |         current_total_size = total_size  # Keep track of size as we remove
 933 | 
 934 |         # Remove files based on age or if total size exceeds limit
 935 |         for filepath, mtime, size in files:
 936 |             age = now - mtime
 937 |             # Check size limit against potentially reduced total size
 938 |             over_size_limit = current_total_size > max_size_bytes
 939 |             over_age_limit = age > max_age_seconds
 940 | 
 941 |             if over_age_limit or over_size_limit:
 942 |                 try:
 943 |                     await asyncio.to_thread(filepath.unlink)
 944 |                     removed_count += 1
 945 |                     removed_size += size
 946 |                     current_total_size -= size
 947 |                     logger.debug(
 948 |                         f"Cache cleanup removed: {filepath.name} (Reason: {'Age' if over_age_limit else 'Size'})"
 949 |                     )
 950 |                 except OSError as e:
 951 |                     logger.warning(f"Cache cleanup failed to remove {filepath}: {e}")
 952 |             # Optimization: If we are no longer over the size limit, we only need to continue checking for age limit
 953 |             elif not over_size_limit:
 954 |                 # We must continue iterating to check older files for age limit.
 955 |                 pass
 956 | 
 957 |         if removed_count > 0:
 958 |             logger.info(
 959 |                 f"Cache cleanup complete. Removed {removed_count} files ({removed_size / (1024 * 1024):.1f} MB). Current size: {current_total_size / (1024 * 1024):.1f} MB"
 960 |             )
 961 |         else:
 962 |             logger.debug(
 963 |                 f"Cache cleanup complete. No files removed. Current size: {current_total_size / (1024 * 1024):.1f} MB"
 964 |             )
 965 | 
 966 |     except Exception as e:
 967 |         logger.error(f"Error during cache cleanup: {e}", exc_info=True)
 968 | 
 969 | 
 970 | # --------------------------------------------------------------------------- #
 971 | # Resource Limiting Function (Module Scope)
 972 | # --------------------------------------------------------------------------- #
 973 | 
 974 | 
def _limit_resources(timeout: float, cmd_name: Optional[str] = None) -> None:
    """Sets resource limits for the child process (Unix only), potentially command-specific.

    Applies, best-effort and in order:

    * RLIMIT_CPU: ``int(timeout) + 1`` seconds (soft == hard).
    * RLIMIT_AS: soft/hard from ``MCP_TEXT_RLIMIT_AS`` /
      ``MCP_TEXT_RLIMIT_AS_HARD`` env vars (2 GiB soft default, +100 MiB
      hard buffer); a value <= 0 disables the limit.
    * RLIMIT_CORE: 0 (no core dumps).
    * RLIMIT_NPROC: raised (never lowered) per env vars, with larger
      defaults when *cmd_name* is "rg"; ``MCP_TEXT_RLIMIT_NPROC_DISABLE``
      skips it entirely.
    * prctl NO_NEW_PRIVS (Linux, only if python-prctl is available).
    * ``os.setsid()`` to detach from any controlling terminal.

    NOTE(review): calling ``os.setsid()`` implies this is meant to run in a
    freshly-forked child (e.g. as a subprocess ``preexec_fn``) — the caller
    is outside this view; confirm before reusing elsewhere.

    Args:
        timeout: Wall-clock timeout for the command; used to derive the
            CPU-seconds limit.
        cmd_name: Optional command name for command-specific tuning.

    Every failure is logged and swallowed; this function never raises.
    """
    try:
        if HAS_RESOURCE:
            # CPU seconds (add 1s buffer over the wall-clock timeout)
            cpu_limit = int(timeout) + 1
            resource.setrlimit(resource.RLIMIT_CPU, (cpu_limit, cpu_limit))

            # Address-space (bytes) - Virtual Memory
            soft_as_str = os.getenv("MCP_TEXT_RLIMIT_AS", "2_147_483_648")  # 2GiB default
            hard_as_str = os.getenv(
                "MCP_TEXT_RLIMIT_AS_HARD", str(int(soft_as_str) + 100 * 1024 * 1024)
            )  # +100MiB buffer default
            try:
                soft_as = int(soft_as_str)
                hard_as = int(hard_as_str)
                if soft_as > 0 and hard_as > 0:  # Allow disabling with 0 or negative
                    resource.setrlimit(resource.RLIMIT_AS, (soft_as, hard_as))
                    logger.debug(
                        f"Applied resource limit: AS={soft_as / (1024**3):.1f}GiB (soft), {hard_as / (1024**3):.1f}GiB (hard)"
                    )
            except ValueError:
                # Non-integer env var values; skip rather than crash the child.
                logger.warning(
                    "Invalid value for MCP_TEXT_RLIMIT_AS or MCP_TEXT_RLIMIT_AS_HARD. Skipping AS limit."
                )
            except resource.error as e:
                logger.warning(
                    f"Failed to set AS limit: {e}. Limit might be too low or too high for the system."
                )

            # No core dumps
            try:
                resource.setrlimit(resource.RLIMIT_CORE, (0, 0))
            except resource.error as e:
                logger.warning(f"Failed to disable core dumps: {e}")  # May fail in containers

            # --- RLIMIT_NPROC --------------------------------------------------
            try:
                # honour optional global opt-out
                if os.getenv("MCP_TEXT_RLIMIT_NPROC_DISABLE", "").lower() == "true":
                    logger.debug("RLIMIT_NPROC: disabled via env flag")
                else:
                    soft_req = int(os.getenv("MCP_TEXT_RLIMIT_NPROC_SOFT", "4096"))
                    hard_req = int(os.getenv("MCP_TEXT_RLIMIT_NPROC_HARD", "8192"))
                    if cmd_name == "rg":
                        # ripgrep spawns worker threads; give it more headroom
                        soft_req = int(os.getenv("MCP_TEXT_RLIMIT_NPROC_SOFT_RG", "16384"))
                        hard_req = int(os.getenv("MCP_TEXT_RLIMIT_NPROC_HARD_RG", "32768"))

                    cur_soft, cur_hard = resource.getrlimit(resource.RLIMIT_NPROC)

                    # translate "unlimited"/-1 (non-positive request keeps current)
                    req_soft = cur_soft if soft_req <= 0 else soft_req
                    req_hard = cur_hard if hard_req <= 0 else hard_req

                    # never *lower* an existing limit
                    new_soft = max(cur_soft, req_soft)
                    new_hard = max(cur_hard, req_hard)

                    # only call setrlimit if anything actually changes
                    if (new_soft, new_hard) != (cur_soft, cur_hard):
                        resource.setrlimit(resource.RLIMIT_NPROC, (new_soft, new_hard))
                        logger.debug(
                            f"RLIMIT_NPROC set to (soft={new_soft}, hard={new_hard}) "
                            f"(was {cur_soft}/{cur_hard}) for {cmd_name}"
                        )
            except (ValueError, resource.error) as e:
                logger.warning(f"RLIMIT_NPROC not applied: {e}")

            logger.debug(f"Applied resource limits: CPU={cpu_limit}s")  # Summary log

        if HAS_PRCTL and sys.platform == "linux":
            try:
                # Prevent privilege escalation (e.g. via setuid binaries)
                prctl.set_no_new_privs(True)
                logger.debug("Applied prctl NO_NEW_PRIVS.")
            except Exception as e:  # prctl might raise various errors
                logger.warning(f"Failed to set prctl NO_NEW_PRIVS: {e}")

        # Run in new session to isolate from controlling terminal (if any)
        # This helps ensure signals (like Ctrl+C) don't propagate unexpectedly.
        if sys.platform != "win32" and hasattr(os, "setsid"):
            try:
                os.setsid()
                logger.debug("Process started in new session ID.")
            except OSError as e:
                logger.warning(f"Failed to call setsid: {e}")  # May fail if already session leader

    except Exception as exc:
        # Catch-all for any unexpected issue during limit setting
        logger.warning(f"Failed to apply resource limits/sandboxing: {exc}")
1065 | 
1066 | 
1067 | # --------------------------------------------------------------------------- #
1068 | # Helper Functions
1069 | # --------------------------------------------------------------------------- #
1070 | 
1071 | 
def _truncate(data: bytes) -> tuple[str, bool]:
    """Decode *data* as UTF-8 (replacing invalid sequences), capping its size.

    Returns:
        A ``(text, truncated)`` pair; *truncated* is True when the input
        exceeded MAX_OUTPUT_BYTES and was cut short, in which case a
        "(output truncated)" marker is appended.
    """
    if len(data) <= MAX_OUTPUT_BYTES:
        return data.decode("utf-8", errors="replace"), False

    # Cut at the byte limit first, then decode; errors="replace" absorbs any
    # multibyte character the cut may have split. The marker is appended
    # after decoding so it can never be mangled by the byte-level cut.
    text = data[:MAX_OUTPUT_BYTES].decode("utf-8", errors="replace")
    return text + "\n... (output truncated)", True
1085 | 
1086 | 
1087 | def _is_json_or_json_lines(text: str) -> bool:
1088 |     """
1089 |     True  → text is a single JSON document        (json.loads ok)
1090 |     True  → text is newline-delimited JSON lines  (all lines load)
1091 |     False → otherwise
1092 |     """
1093 |     try:
1094 |         json.loads(text)
1095 |         return True                     # one big doc
1096 |     except json.JSONDecodeError:
1097 |         pass
1098 | 
1099 |     ok = True
1100 |     for ln in text.splitlines():
1101 |         ln = ln.strip()
1102 |         if not ln:                      # skip blanks
1103 |             continue
1104 |         try:
1105 |             json.loads(ln)
1106 |         except json.JSONDecodeError:
1107 |             ok = False
1108 |             break
1109 |     return ok
1110 | 
1111 | 
1112 | # --------------------------------------------------------------------------- #
1113 | # Secure Async Executor Core (with Caching, Checksum, Version Checks)
1114 | # --------------------------------------------------------------------------- #
# Dictionary of asyncio.Semaphore objects, one per command, sized from
# CONCURRENCY_LIMITS. _run_command_secure acquires the matching semaphore
# to bound how many instances of each tool may run concurrently.
_SEMAPHORES = {cmd: asyncio.Semaphore(limit) for cmd, limit in CONCURRENCY_LIMITS.items()}
1117 | 
1118 | 
async def _run_command_secure(
    cmd_name: str,
    args_str: str,
    *,
    input_data: Optional[str] = None,
    is_file_target: bool = False,
    is_dir_target: bool = False,
    timeout: float = DEFAULT_TIMEOUT,
    dry_run: bool = False,
) -> ToolResult:
    """Securely executes a validated local command with enhancements.

    Pipeline: metadata lookup → version check → checksum check → shlex parse →
    argument validation → optional dry run → stdin size check → cache lookup →
    per-command concurrency semaphore → sandboxed subprocess → result/caching.

    Args:
        cmd_name: Key into _COMMAND_METADATA identifying a validated executable.
        args_str: Raw argument string; parsed with shlex and validated.
        input_data: Optional text piped to the process via stdin.
        is_file_target: True when args_str references file path(s) (used for logging).
        is_dir_target: True when args_str references directory path(s) (used for logging).
        timeout: Wall-clock limit in seconds for the subprocess.
        dry_run: If True, validate everything and return the command line unexecuted.

    Returns:
        ToolResult describing the outcome (possibly served from cache).

    Raises:
        ToolInputError: Invalid/unsafe arguments or oversized stdin.
        ToolExecutionError: Missing command, version/checksum failure, timeout,
            communication failure, or unexpected execution failure.
    """

    # 1. Get Command Metadata and Path
    meta = _COMMAND_METADATA.get(cmd_name)
    if not meta or not meta.path:
        raise ToolExecutionError(
            f"Command '{cmd_name}' is not available or configured.",
            error_code=ToolErrorCode.CMD_NOT_FOUND,
        )

    # Version/checksum checks run before semaphore. This is generally okay,
    # potential for minor concurrent hashing/version checks on very first simultaneous calls.
    # 2. Check Minimum Version (async, cached)
    try:
        await _check_command_version(meta)
    except ToolExecutionError as e:
        # Propagate version error directly
        raise e

    # 3. Verify Checksum (async, lazy, cached)
    try:
        await _get_command_checksum(meta)  # Verifies if mtime changed, re-calcs if needed
        # Optionally, compare against a known-good checksum if available/needed
        # if meta.checksum != EXPECTED_CHECKSUMS[cmd_name]: raise ToolExecutionError(...)
    except ToolExecutionError as e:
        raise e  # Propagate checksum errors

    # 4. Parse Arguments using shlex
    try:
        argv: List[str] = shlex.split(args_str, posix=True)
    except ValueError as exc:
        raise ToolInputError(
            "Invalid quoting or escaping in args_str.",
            param_name="args_str",
            details={"error_code": ToolErrorCode.INVALID_ARGS.value},
        ) from exc

    # 5. Validate Arguments (Security & Workspace) using the placeholder/real validator
    try:
        _validate_arguments(cmd_name, argv)
    except ToolInputError as e:
        # Add command context to validation error
        raise ToolInputError(
            f"Argument validation failed for '{cmd_name}': {e}",
            param_name="args_str",
            details=e.details,
        ) from e

    # 6. Handle Dry Run
    cmd_path_str = str(meta.path)
    cmdline: List[str] = [cmd_path_str, *argv]
    if dry_run:
        logger.info(f"Dry run: Command validated successfully: {shlex.join(cmdline)}")
        # Ensure ToolResult structure is fully populated for dry run
        return ToolResult(
            success=True,
            dry_run_cmdline=cmdline,  # Return the command list
            stdout=None,
            stderr=None,
            exit_code=None,
            error=None,
            error_code=None,
            duration=0.0,
            stdout_truncated=False,
            stderr_truncated=False,
            cached_result=False,
        )

    # 7. Stdin Size Check and Encoding (Optimization)
    input_bytes: Optional[bytes] = None
    input_len = 0
    if input_data is not None:
        input_bytes = input_data.encode("utf-8", errors="ignore")
        input_len = len(input_bytes)
        if input_len > MAX_INPUT_BYTES:
            raise ToolInputError(
                f"input_data exceeds maximum allowed size of {MAX_INPUT_BYTES / (1024 * 1024):.1f} MB.",
                param_name="input_data",
                details={
                    "limit_bytes": MAX_INPUT_BYTES,
                    "actual_bytes": input_len,
                    "error_code": ToolErrorCode.INPUT_TOO_LARGE.value,
                },
            )

    # 8. Prepare for Caching
    cache_key = _get_cache_key(cmd_name, argv, input_bytes)
    cached_result = await _cache_get_async(cache_key)
    if cached_result:
        # Ensure cached_result flag is set correctly (should be done by _cache_get_async)
        cached_result["cached_result"] = True
        return cached_result

    # 9. Acquire Concurrency Semaphore
    semaphore = _SEMAPHORES.get(cmd_name)
    if not semaphore:
        logger.error(
            f"Internal error: No semaphore found for command '{cmd_name}'. Using fallback limit 1."
        )
        # Should not happen if CONCURRENCY_LIMITS is correct
        semaphore = asyncio.Semaphore(1)  # Fallback to concurrency of 1

    async with semaphore:
        # 10. Redacted Logging
        redacted_argv = [
            re.sub(r"(?i)(--?(?:password|token|key|secret)=)\S+", r"\1********", arg)
            for arg in argv
        ]
        log_payload = {
            "event": "execute_local_text_tool",
            "command": cmd_name,
            "args": redacted_argv,
            "input_source": "stdin"
            if input_data is not None
            else ("file" if is_file_target else ("dir" if is_dir_target else "args_only")),
            "timeout_s": timeout,
            "cache_key_prefix": cache_key[:8] if cache_key else None,
        }
        # Using extra assumes logger is configured to handle it
        logger.info(json.dumps(log_payload), extra={"json_fields": log_payload})

        # 11. Resource Limit Setup (Unix only) - Uses module-level function.
        # BUG FIX: subprocess raises ValueError on Windows if *any* callable is
        # supplied as preexec_fn, so the platform guard must decide whether to
        # pass a callable at all — a guard inside the callable is ineffective.
        if sys.platform != "win32":

            def preexec_fn_to_use():
                # Runs in the child between fork and exec; applies rlimits/setsid.
                return _limit_resources(timeout, cmd_name=cmd_name)
        else:
            preexec_fn_to_use = None  # type: ignore[assignment]

        # 12. Minimal Sanitized Environment
        safe_env = {"PATH": os.getenv("PATH", "/usr/bin:/bin:/usr/local/bin")}  # Add common paths
        # Preserve locale settings for correct text processing
        for safe_var in ["LANG", "LC_ALL", "LC_CTYPE", "LC_MESSAGES", "LC_COLLATE"]:
            if safe_var in os.environ:
                safe_env[safe_var] = os.environ[safe_var]
        # Add HOME for tools that might need it for config (like awk finding extensions)
        if "HOME" in os.environ:
            safe_env["HOME"] = os.environ["HOME"]
        # Add TMPDIR as some tools use it
        if "TMPDIR" in os.environ:
            safe_env["TMPDIR"] = os.environ["TMPDIR"]
        elif os.getenv("TEMP"):  # Windows variant
            safe_env["TEMP"] = os.getenv("TEMP")  # type: ignore[assignment]

        # 13. Launch Subprocess
        t0 = time.perf_counter()
        proc: Optional[asyncio.subprocess.Process] = None
        result: Optional[ToolResult] = None  # Ensure result is defined

        try:
            proc = await asyncio.create_subprocess_exec(
                *cmdline,
                stdin=asyncio.subprocess.PIPE
                if input_bytes is not None
                else asyncio.subprocess.DEVNULL,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
                env=safe_env,
                limit=MAX_OUTPUT_BYTES * 2,  # Limit buffer size slightly larger than max output
                preexec_fn=preexec_fn_to_use,
                # cwd=str(WORKSPACE_DIR) # Generally not needed if paths are validated relative to WORKSPACE_DIR
            )

            # 14. Communicate and Handle Timeout
            try:
                stdout_b, stderr_b = await asyncio.wait_for(
                    proc.communicate(input=input_bytes),
                    timeout=timeout,
                )
            except asyncio.TimeoutError as e:
                logger.warning(
                    f"Command '{cmd_name}' timed out after {timeout}s. Terminating.",
                    extra={"command": cmd_name, "timeout": timeout},
                )
                if proc and proc.returncode is None:
                    try:
                        proc.terminate()
                        # Give it a moment to terminate gracefully
                        await asyncio.wait_for(proc.wait(), timeout=1.0)
                    except asyncio.TimeoutError:
                        logger.warning(
                            f"Process {proc.pid} did not terminate gracefully after 1s, killing.",
                            extra={"pid": proc.pid},
                        )
                        try:
                            proc.kill()
                        except ProcessLookupError:
                            pass  # Already gone
                        await proc.wait()  # Wait for kill to complete
                    except ProcessLookupError:
                        pass  # Already gone
                    except Exception as term_err:
                        logger.error(
                            f"Error terminating process {proc.pid}: {term_err}",
                            extra={"pid": proc.pid},
                        )

                raise ToolExecutionError(
                    f"'{cmd_name}' exceeded timeout of {timeout}s.",
                    error_code=ToolErrorCode.TIMEOUT,
                    details={"timeout": timeout},
                ) from e
            except (BrokenPipeError, ConnectionResetError) as comm_err:
                # Handle cases where the process exits before consuming all input/producing output
                exit_code_on_comm_err = proc.returncode if proc else -1
                logger.warning(
                    f"Communication error with '{cmd_name}' (process likely exited early): {comm_err}. Exit code: {exit_code_on_comm_err}",
                    extra={
                        "command": cmd_name,
                        "error": str(comm_err),
                        "exit_code": exit_code_on_comm_err,
                    },
                )
                # Read any remaining output before raising
                stdout_b = b""
                stderr_b = b""
                try:
                    # Use readexactly or read with limit to avoid indefinite block
                    if proc and proc.stdout:
                        stdout_b = await asyncio.wait_for(
                            proc.stdout.read(MAX_OUTPUT_BYTES * 2), timeout=0.5
                        )
                    if proc and proc.stderr:
                        stderr_b = await asyncio.wait_for(
                            proc.stderr.read(MAX_OUTPUT_BYTES * 2), timeout=0.5
                        )
                except asyncio.TimeoutError:
                    pass
                except Exception as read_err:
                    logger.warning(f"Error reading remaining output after comm error: {read_err}")

                # Continue processing with potentially partial output, but mark as communication error
                # Let success be determined by exit code if available, otherwise assume failure
                duration = time.perf_counter() - t0
                exit_code = (
                    proc.returncode if proc and proc.returncode is not None else -1
                )  # Use -1 if unknown
                stdout, stdout_truncated = _truncate(stdout_b)
                stderr, stderr_truncated = _truncate(stderr_b)
                retcode_ok_map = RETCODE_OK.get(cmd_name, {0})
                success = exit_code in retcode_ok_map

                # Construct result indicating communication error
                result = ToolResult(
                    stdout=stdout,
                    stderr=stderr,
                    exit_code=exit_code,
                    success=success,  # May be true if exit code was OK despite pipe error
                    error=f"Communication error with '{cmd_name}': {comm_err}",
                    error_code=ToolErrorCode.COMMUNICATION_ERROR,
                    duration=round(duration, 3),
                    stdout_truncated=stdout_truncated,
                    stderr_truncated=stderr_truncated,
                    cached_result=False,
                )
                # Do not cache communication errors
                return result  # Return immediately, skip normal success/cache path

            except Exception as comm_err:  # Catch other potential communicate errors
                exit_code_on_comm_err = proc.returncode if proc else -1
                raise ToolExecutionError(
                    f"Communication error with '{cmd_name}': {comm_err}",
                    error_code=ToolErrorCode.COMMUNICATION_ERROR,
                    details={"exit_code": exit_code_on_comm_err},
                ) from comm_err

            # 15. Process Results
            duration = time.perf_counter() - t0
            exit_code = proc.returncode
            stdout, stdout_truncated = _truncate(stdout_b)
            stderr, stderr_truncated = _truncate(stderr_b)

            # Use normalization table for success check
            retcode_ok_map = RETCODE_OK.get(cmd_name, {0})  # Default to {0} if cmd not in map
            success = exit_code in retcode_ok_map
            error_message: Optional[str] = None
            error_code: Optional[ToolErrorCode] = None

            if not success:
                error_message = f"Command '{cmd_name}' failed with exit code {exit_code}."
                # Attempt to map common exit codes to specific error types if possible
                # e.g., if exit code 2 for rg means regex error -> ToolErrorCode.INVALID_ARGS
                error_code = ToolErrorCode.EXEC_ERROR  # Default execution error
                if stderr:
                    error_message += (
                        f" Stderr: '{textwrap.shorten(stderr, 150, placeholder='...')}'"
                    )

            # 16. Construct ToolResult TypedDict
            result = ToolResult(
                stdout=stdout,
                stderr=stderr,
                exit_code=exit_code,
                success=success,
                error=error_message,
                error_code=error_code,
                duration=round(duration, 3),
                stdout_truncated=stdout_truncated,
                stderr_truncated=stderr_truncated,
                cached_result=False,  # Will be set by caller if cached
            )

            # 17. Cache the result if successful and not too large? (optional check)
            # Consider not caching extremely large successful results if space is a concern
            # current logic caches all successful results
            if success:
                await _cache_put_async(cache_key, result)

            return result

        except (ToolInputError, ToolExecutionError) as e:
            # Propagate specific errors we raised
            raise e
        except FileNotFoundError as e:
            # Specifically catch if the command itself isn't found at exec time
            logger.error(
                f"Command '{cmd_name}' not found at path: {cmdline[0]}. Ensure it's installed and in PATH.",
                exc_info=True,
            )
            raise ToolExecutionError(
                f"Command '{cmd_name}' executable not found.",
                error_code=ToolErrorCode.CMD_NOT_FOUND,
            ) from e
        except PermissionError as e:
            logger.error(
                f"Permission denied executing command '{cmd_name}' at path: {cmdline[0]}.",
                exc_info=True,
            )
            raise ToolExecutionError(
                f"Permission denied executing '{cmd_name}'. Check file permissions.",
                error_code=ToolErrorCode.EXEC_ERROR,
            ) from e
        except Exception as e:
            # Catch unexpected errors during setup/execution
            logger.critical(f"Unexpected error running command '{cmd_name}': {e}", exc_info=True)
            raise ToolExecutionError(
                f"Unexpected failure executing '{cmd_name}': {e}",
                error_code=ToolErrorCode.UNEXPECTED_FAILURE,
            ) from e
        finally:
            # Ensure process is cleaned up if an exception occurred after creation
            if proc and proc.returncode is None:
                logger.warning(
                    f"Process {proc.pid} for '{cmd_name}' still running after exception, attempting kill."
                )
                try:
                    proc.kill()
                    await proc.wait()
                except ProcessLookupError:
                    pass
                except Exception as final_kill_err:
                    logger.error(
                        f"Error killing process {proc.pid} in finally block: {final_kill_err}"
                    )
1480 | 
1481 | 
1482 | # --------------------------------------------------------------------------- #
1483 | # Retcode Normalization
1484 | # --------------------------------------------------------------------------- #
1485 | RETCODE_OK: Dict[str, set[int]] = {
1486 |     "rg": {0, 1},  # 0 = matches found, 1 = no matches found (both OK for searching)
1487 |     "jq": {0},  # 0 = success. Other codes (e.g., 4 for no matches with -e) indicate issues.
1488 |     "awk": {0},  # 0 = success
1489 |     "sed": {0},  # 0 = success
1490 | }
1491 | 
1492 | # --------------------------------------------------------------------------- #
1493 | # Public Tool Functions (Standalone Wrappers)
1494 | # --------------------------------------------------------------------------- #
1495 | 
1496 | 
1497 | def _require_single_source(
1498 |     cmd: str,
1499 |     *,
1500 |     input_data: Optional[str],
1501 |     input_file: Optional[bool],
1502 |     input_dir: Optional[bool],
1503 | ) -> None:
1504 |     """Validates that exactly one input source mode is indicated."""
1505 |     # Convert bool flags to 0 or 1 for summing. None becomes 0.
1506 |     modes = [
1507 |         input_data is not None,
1508 |         input_file is True,  # Explicit True check
1509 |         input_dir is True,  # Explicit True check
1510 |     ]
1511 |     num_modes = sum(modes)
1512 |     if num_modes == 0:
1513 |         # Use a clearer error message
1514 |         raise ToolInputError(
1515 |             f"For '{cmd}', you must provide exactly one input: 'input_data' OR 'input_file=True' OR 'input_dir=True'.",
1516 |             param_name="inputs",
1517 |             details={"error_code": ToolErrorCode.INVALID_ARGS.value},
1518 |         )
1519 |     elif num_modes > 1:
1520 |         raise ToolInputError(
1521 |             f"For '{cmd}', specify exactly one input mode: provide 'input_data' OR set 'input_file=True' OR set 'input_dir=True'. Found multiple.",
1522 |             param_name="inputs",
1523 |             details={"error_code": ToolErrorCode.INVALID_ARGS.value},
1524 |         )
1525 | 
1526 | 
# --- run_ripgrep ---
# NOTE(review): the literal "{get_workspace_dir()}" inside the docstring below is
# not interpolated here — presumably it is substituted when the docstring is used
# as the tool description; confirm at registration time.
@with_tool_metrics
@with_error_handling
async def run_ripgrep(
    args_str: str,
    *,
    input_data: Optional[str] = None,
    input_file: Optional[bool] = False,  # Default to False for clarity
    input_dir: Optional[bool] = False,  # Default to False for clarity
    timeout: float = DEFAULT_TIMEOUT,
    dry_run: bool = False,
) -> ToolResult:
    """
    Executes the 'rg' (ripgrep) command for fast text pattern searching within the secure workspace.

    Searches recursively through directories or specified files (relative to the workspace)
    for lines matching a regular expression or fixed string pattern.

    Input Handling:
    - `input_data`: Provide text directly via stdin. Omit `input_file`/`input_dir` or set to `False`. Do *not* include a path in `args_str`. Max size controlled by MCP_TEXT_MAX_INPUT.
    - `input_file=True`: Indicates `args_str` contains target file path(s). Omit `input_data`/`input_dir` or set to `False`. Path(s) must be relative to the workspace and specified in `args_str`. Example: `args_str="'pattern' path/to/file.txt"`
    - `input_dir=True`: Indicates `args_str` contains target directory path(s). Omit `input_data`/`input_file` or set to `False`. Path(s) must be relative to the workspace and specified in `args_str`. Example: `args_str="'pattern' path/to/dir"`

    Common `rg` Arguments (include in `args_str`, use workspace-relative paths):
      `'pattern'`: Regex pattern or fixed string.
      `path`: Workspace-relative file or directory path(s).
      `-i`, `--ignore-case`: Case-insensitive search.
      `-v`, `--invert-match`: Select non-matching lines.
      `-l`, `--files-with-matches`: List files containing matches.
      `-c`, `--count`: Count matches per file.
      `-A NUM`, `-B NUM`, `-C NUM`: Context control.
      `--json`: JSON output format.
      `-t type`: Filter by file type (e.g., `py`, `md`).
      `-g glob`: Include/exclude files/dirs by glob (e.g., `-g '*.py' -g '!temp/'`).
      `-o, --only-matching`: Print only matching parts.
      `--follow`: Follow symbolic links (targets must also be within workspace).
      `--no-filename`, `-N`: Suppress filename/line numbers.

    Security: Enforces workspace boundary `"{get_workspace_dir()}"`. Blocks absolute paths, path traversal ('..'), unsafe flags, and shell metacharacters. Limited resources (CPU, Memory, Processes).

    Exit Codes & Success Field:
    - `0`: Matches found -> `success: True`, `error: null`
    - `1`: No matches found -> `success: True`, `error: null` (NOTE: Considered success by this tool wrapper)
    - `2+`: Error occurred -> `success: False`, `error: "..."`, `error_code: EXEC_ERROR`

    Args:
        args_str: Command-line arguments for `rg` (pattern, flags, workspace-relative paths).
        input_data: String data to pipe to `rg` via stdin. Omit/False for `input_file`/`input_dir`.
        input_file: Set to True if `args_str` includes target file path(s). Omit/False for `input_data`/`input_dir`.
        input_dir: Set to True if `args_str` includes target directory path(s). Omit/False for `input_data`/`input_file`.
        timeout: Max execution time in seconds.
        dry_run: If True, validate args and return command line without execution.

    Returns:
        ToolResult: Dictionary containing execution results or dry run info.
                    Includes stdout, stderr, exit_code, success, error, error_code, duration,
                    truncation flags, and cached_result status.

    Raises:
        ToolInputError: For invalid arguments, security violations, or incorrect input mode usage.
        ToolExecutionError: If `rg` is not found, times out, fails version/checksum checks, or fails unexpectedly.
    """
    # Enforce exactly one of: input_data / input_file=True / input_dir=True.
    _require_single_source("rg", input_data=input_data, input_file=input_file, input_dir=input_dir)
    # Delegate to the hardened executor; the target flags only affect logging metadata there.
    return await _run_command_secure(
        "rg",
        args_str,
        input_data=input_data,
        is_file_target=input_file,  # Pass the boolean flag directly
        is_dir_target=input_dir,  # Pass the boolean flag directly
        timeout=timeout,
        dry_run=dry_run,
    )
1599 | 
1600 | 
# --- run_awk ---
# NOTE(review): the literal "{get_workspace_dir()}" inside the docstring below is
# not interpolated here — presumably it is substituted when the docstring is used
# as the tool description; confirm at registration time.
@with_tool_metrics
@with_error_handling
async def run_awk(
    args_str: str,
    *,
    input_data: Optional[str] = None,
    input_file: Optional[bool] = False,  # Default to False for clarity
    timeout: float = DEFAULT_TIMEOUT,
    dry_run: bool = False,
) -> ToolResult:
    """
    Executes the 'awk' command for pattern scanning and field-based text processing within the secure workspace.

    Input Handling:
    - `input_data`: Provide text directly via stdin. Omit `input_file` or set to `False`. Do *not* include filename in `args_str`. Max size controlled by MCP_TEXT_MAX_INPUT.
    - `input_file=True`: Indicates `args_str` contains target file path(s). Omit `input_data` or set to `False`. Path(s) must be relative to the workspace and specified in `args_str`. Example: `args_str="'{print $1}' path/data.log"`

    Common `awk` Arguments (include in `args_str`, use workspace-relative paths):
      `'program'`: The AWK script (e.g., `'{ print $1, $3 }'`).
      `filename(s)`: Workspace-relative input filename(s).
      `-F fs`: Define input field separator (e.g., `-F ','`).
      `-v var=value`: Assign variable.

    Security: Enforces workspace boundary `"{get_workspace_dir()}"`. Blocks unsafe flags (`-i`/`--in-place` gawk), shell characters, absolute paths, traversal. Limited resources.

    Args:
        args_str: Command-line arguments for `awk` (script, flags, workspace-relative paths).
        input_data: String data to pipe to `awk` via stdin. Omit/False for `input_file`.
        input_file: Set to True if `args_str` includes target file path(s). Omit/False for `input_data`.
        timeout: Max execution time in seconds.
        dry_run: If True, validate args and return command line without execution.

    Returns:
        ToolResult: Dictionary with results. `success` is True only if exit code is 0.

    Raises:
        ToolInputError: For invalid arguments, security violations, or incorrect input mode usage.
        ToolExecutionError: If `awk` is not found, times out, fails version/checksum checks, or fails unexpectedly.
    """
    # Enforce exactly one of: input_data / input_file=True (awk has no dir mode).
    _require_single_source("awk", input_data=input_data, input_file=input_file, input_dir=False)
    # Delegate to the hardened executor; the target flag only affects logging metadata there.
    return await _run_command_secure(
        "awk",
        args_str,
        input_data=input_data,
        is_file_target=input_file,
        is_dir_target=False,  # awk typically doesn't take dir targets directly
        timeout=timeout,
        dry_run=dry_run,
    )
1651 | 
1652 | 
# --- run_sed ---
# NOTE(review): the literal "{get_workspace_dir()}" inside the docstring below is
# not interpolated here — presumably it is substituted when the docstring is used
# as the tool description; confirm at registration time.
@with_tool_metrics
@with_error_handling
async def run_sed(
    args_str: str,
    *,
    input_data: Optional[str] = None,
    input_file: Optional[bool] = False,  # Default to False for clarity
    timeout: float = DEFAULT_TIMEOUT,
    dry_run: bool = False,
) -> ToolResult:
    """
    Executes the 'sed' (Stream Editor) command for line-by-line text transformations within the secure workspace.

    Performs substitutions, deletions, insertions based on patterns. **In-place editing (`-i`) is disabled.**

    Input Handling:
    - `input_data`: Provide text directly via stdin. Omit `input_file` or set to `False`. Do *not* include filename in `args_str`. Max size controlled by MCP_TEXT_MAX_INPUT.
    - `input_file=True`: Indicates `args_str` contains target file path. Omit `input_data` or set to `False`. Path must be relative to the workspace and specified in `args_str`. Example: `args_str="'s/ERROR/WARN/g' path/app.log"`

    Common `sed` Arguments (include in `args_str`, use workspace-relative paths):
      `'script'`: The `sed` script or command (e.g., `'s/foo/bar/g'`, `'/^DEBUG/d'`).
      `filename`: Workspace-relative input filename.
      `-e script`: Add multiple scripts.
      `-f script-file`: Read commands from a workspace-relative file.
      `-n`: Suppress automatic printing.
      `-E` or `-r`: Use extended regular expressions.

    Security: Enforces workspace boundary `"{get_workspace_dir()}"`. Blocks `-i` flag, shell characters, absolute paths, traversal. Limited resources.

    Args:
        args_str: Command-line arguments for `sed` (script, flags, workspace-relative path).
        input_data: String data to pipe to `sed` via stdin. Omit/False for `input_file`.
        input_file: Set to True if `args_str` includes target file path. Omit/False for `input_data`.
        timeout: Max execution time in seconds.
        dry_run: If True, validate args and return command line without execution.

    Returns:
        ToolResult: Dictionary with results. `success` is True only if exit code is 0.

    Raises:
        ToolInputError: For invalid arguments, security violations (like using -i), or incorrect input mode usage.
        ToolExecutionError: If `sed` is not found, times out, fails version/checksum checks, or fails unexpectedly.
    """
    # Enforce exactly one of: input_data / input_file=True (sed has no dir mode).
    _require_single_source("sed", input_data=input_data, input_file=input_file, input_dir=False)
    # Delegate to the hardened executor; the target flag only affects logging metadata there.
    return await _run_command_secure(
        "sed",
        args_str,
        input_data=input_data,
        is_file_target=input_file,
        is_dir_target=False,  # sed operates on files or stdin
        timeout=timeout,
        dry_run=dry_run,
    )
1707 | 
1708 | 
1709 | # --- run_jq ---
# --- run_jq ---
@with_tool_metrics
@with_error_handling
async def run_jq(
    args_str: str,
    *,
    input_data: Optional[str] = None,
    input_file: Optional[bool] = False,  # Default to False for clarity
    timeout: float = DEFAULT_TIMEOUT,
    dry_run: bool = False,
) -> ToolResult:
    """
    Runs the 'jq' JSON processor inside the secure workspace sandbox.

    Use this to query, filter, or reshape JSON supplied either via stdin
    (`input_data`) or from a workspace-relative file (`input_file=True`,
    with the path embedded in `args_str`). Exactly one input mode must be
    chosen.

    Input Handling:
    - `input_data`: Valid JSON (or JSON-Lines) text piped to stdin. Leave
      `input_file` unset/False and do not put a filename in `args_str`.
      Maximum size is governed by MCP_TEXT_MAX_INPUT.
    - `input_file=True`: `args_str` names the target JSON file, which must be
      workspace-relative, e.g. `args_str="'.items[].name' data.json"`.

    Typical `jq` flags to place in `args_str` (workspace-relative paths only):
      `'filter'`            the jq filter expression
      `-c`                  compact output
      `-r`                  raw (unquoted) string output
      `-s`                  slurp the input stream into an array
      `--arg name value`    bind a string variable
      `--argjson name json` bind a JSON variable

    Security: workspace boundary `"{get_workspace_dir()}"` is enforced;
    unsafe flags, shell metacharacters, absolute paths, and traversal are
    rejected, resources are limited, and stdin data is validated as JSON
    before any process is spawned.

    Args:
        args_str: Command-line arguments for `jq` (filter, flags, relative path).
        input_data: Valid JSON text for stdin. Mutually exclusive with `input_file`.
        input_file: True when `args_str` carries the target file path.
        timeout: Maximum execution time in seconds.
        dry_run: If True, validate arguments and return the command line only.

    Returns:
        ToolResult: Result dictionary; `success` is True only on exit code 0.

    Raises:
        ToolInputError: Invalid JSON stdin, bad arguments, security violation,
            or conflicting input modes.
        ToolExecutionError: `jq` missing, timed out, failed integrity checks,
            or failed unexpectedly.
    """
    _require_single_source("jq", input_data=input_data, input_file=input_file, input_dir=False)

    # jq requires well-formed JSON on stdin; fail fast before spawning a process.
    if input_data is not None:
        if not _is_json_or_json_lines(input_data):
            raise ToolInputError(
                "input_data is not valid JSON or JSON-Lines",
                param_name="input_data",
                details={"error_code": ToolErrorCode.INVALID_JSON_INPUT.value},
            )

    result = await _run_command_secure(
        "jq",
        args_str,
        input_data=input_data,
        is_file_target=input_file,
        is_dir_target=False,  # jq never targets directories
        timeout=timeout,
        dry_run=dry_run,
    )
    return result
1773 | 
1774 | 
1775 | # --------------------------------------------------------------------------- #
1776 | # Streaming Core Executor
1777 | # --------------------------------------------------------------------------- #
1778 | 
1779 | 
async def _run_command_stream(
    cmd_name: str,
    args_str: str,
    *,
    input_data: Optional[str] = None,
    is_file_target: bool = False,
    is_dir_target: bool = False,
    timeout: float = DEFAULT_TIMEOUT,
) -> AsyncIterator[str]:
    """Securely executes a command and streams its stdout line by line.

    Shared core behind the public ``run_*_stream`` wrappers. Performs the
    same hardening as the buffered executor — availability lookup, version
    and checksum verification, shlex parsing plus argument validation, a
    sanitized environment, and POSIX resource limits — then yields decoded
    stdout lines as they arrive instead of buffering the whole output.

    Args:
        cmd_name: Key into _COMMAND_METADATA identifying the tool to run.
        args_str: Raw argument string; parsed with shlex (posix mode) and
            validated before execution.
        input_data: Optional text piped to the child's stdin (UTF-8 encoded,
            size-checked against MAX_INPUT_BYTES before the process starts).
        is_file_target: True when args_str names a target file (used here
            only for structured logging).
        is_dir_target: True when args_str names a target directory (logging
            only).
        timeout: Wall-clock budget in seconds for the entire stream.

    Yields:
        str: Each stdout line, decoded as UTF-8 with replacement. Truncation
        and read failures are signalled via in-band marker lines
        ("...(stdout truncated)", "##STREAM_STDOUT_READ_ERROR: ...").

    Raises:
        ToolInputError: Bad quoting/validation of args_str, or oversized
            input_data.
        ToolExecutionError: Command unavailable, timeout, non-OK exit code,
            or unexpected failure. Note that exit-code failures are raised
            only AFTER the stream has been fully consumed by the caller.
    """

    # 1. Get Command Metadata and Path
    meta = _COMMAND_METADATA.get(cmd_name)
    if not meta or not meta.path:
        raise ToolExecutionError(
            f"Command '{cmd_name}' is not available or configured.",
            error_code=ToolErrorCode.CMD_NOT_FOUND,
        )

    # 2. Version & Checksum Checks (Same as non-streaming)
    await _check_command_version(meta)
    await _get_command_checksum(meta)  # Verification happens here

    # 3. Parse & Validate Arguments
    try:
        argv: List[str] = shlex.split(args_str, posix=True)
        _validate_arguments(cmd_name, argv)
    except ValueError as exc:
        # shlex raises ValueError on unbalanced quotes/escapes.
        raise ToolInputError(
            "Invalid quoting or escaping in args_str.",
            param_name="args_str",
            details={"error_code": ToolErrorCode.INVALID_ARGS.value},
        ) from exc
    except ToolInputError as e:
        # Re-wrap to add stream context while preserving the original details.
        raise ToolInputError(
            f"Argument validation failed for '{cmd_name}' stream: {e}",
            param_name="args_str",
            details=e.details,
        ) from e

    # 4. Prepare command line and logging
    cmd_path_str = str(meta.path)
    cmdline: List[str] = [cmd_path_str, *argv]
    # Mask credential-looking flag values before they reach the logs.
    redacted_argv = [
        re.sub(r"(?i)(--?(?:password|token|key|secret)=)\S+", r"\1********", arg) for arg in argv
    ]
    log_payload = {
        "event": "execute_local_text_tool_stream",
        "command": cmd_name,
        "args": redacted_argv,
        "input_source": "stdin"
        if input_data is not None
        else ("file" if is_file_target else ("dir" if is_dir_target else "args_only")),
        "timeout_s": timeout,
    }
    logger.info(json.dumps(log_payload), extra={"json_fields": log_payload})

    # 5. Resource limits & Env (Same as non-streaming)
    # NOTE(review): this wrapper is passed as preexec_fn and thus runs in the
    # forked child; it CALLS _limit_resources(timeout) there. Assumes
    # _limit_resources applies limits when invoked rather than returning a
    # limiter to be invoked — TODO confirm against its definition.
    def preexec_fn_to_use():
        return _limit_resources(timeout) if sys.platform != "win32" else None

    # Minimal environment: PATH plus locale vars, HOME, and a temp dir only.
    safe_env = {"PATH": os.getenv("PATH", "/usr/bin:/bin:/usr/local/bin")}
    for safe_var in ["LANG", "LC_ALL", "LC_CTYPE", "LC_MESSAGES", "LC_COLLATE"]:
        if safe_var in os.environ:
            safe_env[safe_var] = os.environ[safe_var]
    if "HOME" in os.environ:
        safe_env["HOME"] = os.environ["HOME"]
    if "TMPDIR" in os.environ:
        safe_env["TMPDIR"] = os.environ["TMPDIR"]
    elif os.getenv("TEMP"):
        safe_env["TEMP"] = os.getenv("TEMP")  # type: ignore[assignment]

    input_bytes: Optional[bytes] = None
    if input_data is not None:
        input_bytes = input_data.encode("utf-8", errors="ignore")
        # Check size *before* starting process
        if len(input_bytes) > MAX_INPUT_BYTES:
            raise ToolInputError(
                f"input_data exceeds maximum allowed size of {MAX_INPUT_BYTES / (1024 * 1024):.1f} MB for streaming.",
                param_name="input_data",
                details={
                    "limit_bytes": MAX_INPUT_BYTES,
                    "actual_bytes": len(input_bytes),
                    "error_code": ToolErrorCode.INPUT_TOO_LARGE.value,
                },
            )

    # 6. Launch Subprocess and Stream Output
    proc: Optional[asyncio.subprocess.Process] = None
    stderr_lines: List[str] = []  # Collect stderr lines
    start_time = time.monotonic()

    try:
        proc = await asyncio.create_subprocess_exec(
            *cmdline,
            stdin=asyncio.subprocess.PIPE
            if input_bytes is not None
            else asyncio.subprocess.DEVNULL,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,  # Capture stderr
            env=safe_env,
            limit=MAX_OUTPUT_BYTES * 2,  # Apply buffer limit
            preexec_fn=preexec_fn_to_use,
        )

        # --- Define Helper Coroutines for IO ---
        # stdin writing and stderr draining run as concurrent tasks so the
        # main loop can keep consuming stdout without deadlocking on full pipes.
        async def write_stdin_task(proc: asyncio.subprocess.Process) -> None:
            """Writes input data to stdin if provided and closes stdin."""
            if input_bytes is not None and proc.stdin:
                try:
                    proc.stdin.write(input_bytes)
                    await proc.stdin.drain()
                except (BrokenPipeError, ConnectionResetError) as e:
                    # Child may legitimately exit before consuming all stdin.
                    logger.warning(
                        f"Stream: Error writing to stdin for '{cmd_name}' (pid {proc.pid}): {e}. Process might have exited."
                    )
                except Exception as e:
                    logger.error(
                        f"Stream: Unexpected error writing to stdin for '{cmd_name}' (pid {proc.pid}): {e}",
                        exc_info=True,
                    )
                finally:
                    if proc.stdin:
                        try:
                            proc.stdin.close()
                        except Exception:
                            pass  # Ignore errors closing stdin already closed/broken
            elif proc.stdin:
                # Close stdin immediately if no input_data
                try:
                    proc.stdin.close()
                except Exception:
                    pass

        async def read_stderr_task(proc: asyncio.subprocess.Process, lines_list: List[str]) -> None:
            """Reads stderr lines into the provided list."""
            if not proc.stderr:
                return
            stderr_bytes_read = 0
            while True:
                try:
                    # Read with a timeout? readline() could block if process hangs writing stderr
                    line_bytes = await proc.stderr.readline()
                    if not line_bytes:
                        break  # End of stream
                    stderr_bytes_read += len(line_bytes)
                    # Basic truncation within stderr collection to prevent memory issues
                    if stderr_bytes_read > MAX_OUTPUT_BYTES * 1.1:  # Allow slightly more for marker
                        if not lines_list or not lines_list[-1].endswith("...(stderr truncated)"):
                            lines_list.append("...(stderr truncated)")
                        continue  # Stop appending lines but keep reading to drain pipe
                    lines_list.append(line_bytes.decode("utf-8", errors="replace"))
                except Exception as e:
                    logger.warning(
                        f"Stream: Error reading stderr line for '{cmd_name}' (pid {proc.pid}): {e}"
                    )
                    lines_list.append(f"##STREAM_STDERR_READ_ERROR: {e}\n")
                    break  # Stop reading stderr on error

        # --- Create IO Tasks ---
        stdin_writer = asyncio.create_task(write_stdin_task(proc))
        stderr_reader = asyncio.create_task(read_stderr_task(proc, stderr_lines))

        # --- Yield stdout lines (Inlined async generator logic) ---
        stdout_line_count = 0
        stdout_bytes_read = 0
        stdout_truncated = False
        if proc.stdout:
            while True:
                # Check timeout explicitly within the loop
                if time.monotonic() - start_time > timeout:
                    raise asyncio.TimeoutError()

                try:
                    # Use readline with a timeout? Less efficient but prevents hangs.
                    # Or rely on overall wait_for timeout below.
                    line_bytes = await proc.stdout.readline()
                    if not line_bytes:
                        break  # End of stdout stream

                    stdout_bytes_read += len(line_bytes)
                    if stdout_bytes_read > MAX_OUTPUT_BYTES:
                        if not stdout_truncated:  # Add marker only once
                            yield "...(stdout truncated)\n"
                            stdout_truncated = True
                        continue  # Stop yielding lines but keep reading to drain pipe

                    if not stdout_truncated:
                        # Decode and yield the line
                        yield line_bytes.decode("utf-8", errors="replace")
                        stdout_line_count += 1

                except asyncio.TimeoutError as e:  # Catch timeout during readline if applied
                    raise asyncio.TimeoutError() from e  # Re-raise to be caught by outer handler
                except Exception as e:
                    logger.warning(
                        f"Stream: Error reading stdout line for '{cmd_name}' (pid {proc.pid}): {e}"
                    )
                    # Yield an error marker in the stream
                    yield f"##STREAM_STDOUT_READ_ERROR: {e}\n"
                    break  # Stop reading stdout on error
        else:
            logger.warning(
                f"Stream: Process for '{cmd_name}' has no stdout stream.",
                extra={"command": cmd_name},
            )

        # --- Wait for process and stderr/stdin tasks to complete ---
        # Wait for the process itself, applying the main timeout
        try:
            # Wait slightly longer than the timeout to allow for cleanup/exit signals
            exit_code = await asyncio.wait_for(proc.wait(), timeout=timeout + 5.0)
        except asyncio.TimeoutError as e:
            # If proc.wait() times out, it means the process didn't exit within timeout+5s
            # Re-raise the specific timeout error defined earlier
            raise ToolExecutionError(
                f"'{cmd_name}' stream process failed to exit within timeout of {timeout}s (+5s buffer).",
                error_code=ToolErrorCode.TIMEOUT,
                details={"timeout": timeout},
            ) from e

        # Ensure IO tasks have finished (they should have if process exited, but wait briefly)
        try:
            await asyncio.wait_for(stdin_writer, timeout=1.0)
            await asyncio.wait_for(stderr_reader, timeout=1.0)
        except asyncio.TimeoutError:
            logger.warning(
                f"Stream: IO tasks for '{cmd_name}' (pid {proc.pid}) did not complete quickly after process exit."
            )

        # --- Check final status ---
        duration = time.monotonic() - start_time
        # Some tools treat non-zero codes as benign (e.g. "no match"); RETCODE_OK maps those.
        retcode_ok_map = RETCODE_OK.get(cmd_name, {0})
        success = exit_code in retcode_ok_map
        stderr_full = "".join(stderr_lines)

        if not success:
            stderr_snip = textwrap.shorten(stderr_full.strip(), 150, placeholder="...")
            error_msg = f"Command '{cmd_name}' stream finished with failure exit code {exit_code}. Stderr: '{stderr_snip}'"
            logger.warning(
                error_msg,
                extra={"command": cmd_name, "exit_code": exit_code, "stderr": stderr_full},
            )
            # Raise error *after* iteration completes to signal failure to the caller
            raise ToolExecutionError(
                error_msg,
                error_code=ToolErrorCode.EXEC_ERROR,
                details={"exit_code": exit_code, "stderr": stderr_full},
            )
        else:
            logger.info(
                f"Stream: '{cmd_name}' (pid {proc.pid}) finished successfully in {duration:.3f}s (code={exit_code}, stdout_lines={stdout_line_count}, truncated={stdout_truncated})"
            )

    except asyncio.TimeoutError as e:
        # This catches the explicit timeout check within the stdout loop or the wait_for on proc.wait()
        logger.warning(
            f"Command '{cmd_name}' stream timed out after ~{timeout}s. Terminating.",
            extra={"command": cmd_name, "timeout": timeout},
        )
        # Escalate SIGTERM -> SIGKILL if the child ignores the polite request.
        if proc and proc.returncode is None:
            try:
                proc.terminate()
                await asyncio.wait_for(proc.wait(), timeout=1.0)
            except asyncio.TimeoutError:
                logger.warning(f"Killing unresponsive stream process {proc.pid}")
                proc.kill()
                await proc.wait()
            except ProcessLookupError:
                pass
            except Exception as term_err:
                logger.error(f"Error killing stream process {proc.pid}: {term_err}")
        # Raise the standard execution error for timeout
        raise ToolExecutionError(
            f"'{cmd_name}' stream exceeded timeout of {timeout}s.",
            error_code=ToolErrorCode.TIMEOUT,
            details={"timeout": timeout},
        ) from e
    except (ToolInputError, ToolExecutionError) as e:
        # Propagate specific errors raised during setup or validation
        raise e
    except FileNotFoundError as e:
        logger.error(
            f"Stream: Command '{cmd_name}' not found at path: {cmdline[0]}. Ensure it's installed and in PATH.",
            exc_info=True,
        )
        raise ToolExecutionError(
            f"Stream: Command '{cmd_name}' executable not found.",
            error_code=ToolErrorCode.CMD_NOT_FOUND,
        ) from e
    except PermissionError as e:
        logger.error(
            f"Stream: Permission denied executing command '{cmd_name}' at path: {cmdline[0]}.",
            exc_info=True,
        )
        raise ToolExecutionError(
            f"Stream: Permission denied executing '{cmd_name}'. Check file permissions.",
            error_code=ToolErrorCode.EXEC_ERROR,
        ) from e
    except Exception as e:
        logger.error(
            f"Stream: Unexpected error during command stream '{cmd_name}': {e}", exc_info=True
        )
        raise ToolExecutionError(
            f"Unexpected failure during '{cmd_name}' stream: {e}",
            error_code=ToolErrorCode.UNEXPECTED_FAILURE,
        ) from e
    finally:
        # Final cleanup check for the process in case of unexpected exit from the try block
        # (e.g. the consumer abandoned the generator mid-stream and it was closed).
        if proc and proc.returncode is None:
            logger.warning(
                f"Stream: Process {proc.pid} for '{cmd_name}' still running after stream processing finished unexpectedly, killing."
            )
            try:
                proc.kill()
                await proc.wait()
            except ProcessLookupError:
                pass
            except Exception as final_kill_err:
                logger.error(
                    f"Stream: Error killing process {proc.pid} in finally block: {final_kill_err}"
                )
2102 | 
2103 | 
2104 | # --- run_ripgrep_stream ---
# --- run_ripgrep_stream ---
@with_tool_metrics
@with_error_handling
async def run_ripgrep_stream(
    args_str: str,
    *,
    input_data: Optional[str] = None,
    input_file: Optional[bool] = False,  # Default False
    input_dir: Optional[bool] = False,  # Default False
    timeout: float = DEFAULT_TIMEOUT,
) -> AsyncIterator[str]:
    """
    Streaming variant of `run_ripgrep`: yields stdout lines as `rg` produces them.

    Refer to `run_ripgrep` for argument semantics and the security model.
    Prefer this variant when output may be large, since nothing is buffered.

    **Note:** The stream carries stdout only — final status and stderr are not
    part of it. Failures (timeout, exit codes other than the accepted set,
    e.g. 1 meaning 'no match') surface as a `ToolExecutionError` raised
    *after* iteration finishes, or immediately if setup fails.

    Args:
        args_str: Command-line arguments for `rg`.
        input_data: Text piped via stdin. Mutually exclusive with the path modes.
        input_file: True when `args_str` carries target file path(s).
        input_dir: True when `args_str` carries target directory path(s).
        timeout: Overall time budget in seconds.

    Yields:
        str: One line of `rg` standard output at a time.

    Raises:
        ToolInputError: Invalid arguments or security violations at setup.
        ToolExecutionError: Execution failure (timeout, exit code > 1) or
            streaming I/O errors; raised once iteration completes.
    """
    _require_single_source(
        "rg (stream)", input_data=input_data, input_file=input_file, input_dir=input_dir
    )
    # Delegate to the shared streaming executor and pass lines straight through.
    stream = _run_command_stream(
        "rg",
        args_str,
        input_data=input_data,
        is_file_target=input_file,
        is_dir_target=input_dir,
        timeout=timeout,
    )
    async for chunk in stream:
        yield chunk
2154 | 
2155 | 
2156 | # --- run_awk_stream ---
# --- run_awk_stream ---
@with_tool_metrics
@with_error_handling
async def run_awk_stream(
    args_str: str,
    *,
    input_data: Optional[str] = None,
    input_file: Optional[bool] = False,  # Default False
    timeout: float = DEFAULT_TIMEOUT,
) -> AsyncIterator[str]:
    """
    Streaming variant of `run_awk`: yields stdout lines as `awk` produces them.

    Refer to `run_awk` for argument semantics and the security model.

    Args:
        args_str: Command-line arguments for `awk`.
        input_data: Text piped via stdin. Mutually exclusive with `input_file`.
        input_file: True when `args_str` carries target file path(s).
        timeout: Overall time budget in seconds.

    Yields:
        str: One line of `awk` standard output at a time.

    Raises:
        ToolInputError: Invalid arguments or security violations.
        ToolExecutionError: Non-zero exit, timeout, or streaming I/O errors;
            raised once iteration completes.
    """
    _require_single_source(
        "awk (stream)", input_data=input_data, input_file=input_file, input_dir=False
    )
    # Delegate to the shared streaming executor and pass lines straight through.
    stream = _run_command_stream(
        "awk",
        args_str,
        input_data=input_data,
        is_file_target=input_file,
        is_dir_target=False,
        timeout=timeout,
    )
    async for chunk in stream:
        yield chunk
2197 | 
2198 | 
2199 | # --- run_sed_stream ---
# --- run_sed_stream ---
@with_tool_metrics
@with_error_handling
async def run_sed_stream(
    args_str: str,
    *,
    input_data: Optional[str] = None,
    input_file: Optional[bool] = False,  # Default False
    timeout: float = DEFAULT_TIMEOUT,
) -> AsyncIterator[str]:
    """
    Streaming variant of `run_sed`: yields stdout lines as `sed` produces them.

    Refer to `run_sed` for argument semantics and the security model.

    Args:
        args_str: Command-line arguments for `sed`.
        input_data: Text piped via stdin. Mutually exclusive with `input_file`.
        input_file: True when `args_str` carries the target file path.
        timeout: Overall time budget in seconds.

    Yields:
        str: One line of `sed` standard output at a time.

    Raises:
        ToolInputError: Invalid arguments or security violations.
        ToolExecutionError: Non-zero exit, timeout, or streaming I/O errors;
            raised once iteration completes.
    """
    _require_single_source(
        "sed (stream)", input_data=input_data, input_file=input_file, input_dir=False
    )
    # Delegate to the shared streaming executor and pass lines straight through.
    stream = _run_command_stream(
        "sed",
        args_str,
        input_data=input_data,
        is_file_target=input_file,
        is_dir_target=False,
        timeout=timeout,
    )
    async for chunk in stream:
        yield chunk
2240 | 
2241 | 
2242 | # --- run_jq_stream ---
# --- run_jq_stream ---
@with_tool_metrics
@with_error_handling
async def run_jq_stream(
    args_str: str,
    *,
    input_data: Optional[str] = None,
    input_file: Optional[bool] = False,  # Default False
    timeout: float = DEFAULT_TIMEOUT,
) -> AsyncIterator[str]:
    """
    Streaming variant of `run_jq`: yields stdout lines as `jq` produces them.

    Refer to `run_jq` for argument semantics and the security model. Each
    yielded line is typically one JSON value (one object per line with `-c`).

    Args:
        args_str: Command-line arguments for `jq`.
        input_data: Valid JSON text piped via stdin. Mutually exclusive with
            `input_file`.
        input_file: True when `args_str` carries the target JSON file path.
        timeout: Overall time budget in seconds.

    Yields:
        str: One line of `jq` standard output at a time.

    Raises:
        ToolInputError: Invalid JSON stdin, bad arguments, or security
            violations.
        ToolExecutionError: Non-zero exit, timeout, or streaming I/O errors;
            raised once iteration completes.
    """
    _require_single_source(
        "jq (stream)", input_data=input_data, input_file=input_file, input_dir=False
    )
    # jq requires well-formed JSON on stdin; fail fast before spawning a process.
    if input_data is not None:
        if not _is_json_or_json_lines(input_data):
            raise ToolInputError(
                "input_data is not valid JSON or JSON-Lines",
                param_name="input_data",
                details={"error_code": ToolErrorCode.INVALID_JSON_INPUT.value},
            )

    # Delegate to the shared streaming executor and pass lines straight through.
    stream = _run_command_stream(
        "jq",
        args_str,
        input_data=input_data,
        is_file_target=input_file,
        is_dir_target=False,
        timeout=timeout,
    )
    async for chunk in stream:
        yield chunk
2292 | 
2293 | 
2294 | # --------------------------------------------------------------------------- #
2295 | # Public API Exports
2296 | # --------------------------------------------------------------------------- #
2297 | 
2298 | 
def get_workspace_dir() -> str:
    """Return the absolute path of the workspace directory this module enforces.

    Returns:
        str: The workspace root as a string (stringified WORKSPACE_DIR).
    """
    workspace_root = WORKSPACE_DIR
    return str(workspace_root)
2302 | 
2303 | 
2304 | # Add command metadata access if needed?
2305 | # def get_command_meta(cmd_name: str) -> Optional[CommandMeta]:
2306 | #    """Returns the discovered metadata for a command, if available."""
2307 | #    return _COMMAND_METADATA.get(cmd_name)
2308 | 
2309 | 
# Explicit public API surface: names exported via `from <module> import *`.
__all__ = [
    # Buffered (collect-then-return) execution wrappers
    "run_ripgrep",
    "run_awk",
    "run_sed",
    "run_jq",
    # Streaming variants (async generators yielding stdout lines)
    "run_ripgrep_stream",
    "run_awk_stream",
    "run_sed_stream",
    "run_jq_stream",
    # Configuration info (sandbox root path)
    "get_workspace_dir",
    # Result/error types consumers need to handle tool output
    "ToolResult",
    "ToolErrorCode",
    # Maybe export CommandMeta if useful externally?
    # "CommandMeta",
]
2329 | 
```
Page 29/45FirstPrevNextLast