This is page 29 of 45. Use http://codebase.md/dicklesworthstone/llm_gateway_mcp_server?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .cursorignore
├── .env.example
├── .envrc
├── .gitignore
├── additional_features.md
├── check_api_keys.py
├── completion_support.py
├── comprehensive_test.py
├── docker-compose.yml
├── Dockerfile
├── empirically_measured_model_speeds.json
├── error_handling.py
├── example_structured_tool.py
├── examples
│ ├── __init__.py
│ ├── advanced_agent_flows_using_unified_memory_system_demo.py
│ ├── advanced_extraction_demo.py
│ ├── advanced_unified_memory_system_demo.py
│ ├── advanced_vector_search_demo.py
│ ├── analytics_reporting_demo.py
│ ├── audio_transcription_demo.py
│ ├── basic_completion_demo.py
│ ├── cache_demo.py
│ ├── claude_integration_demo.py
│ ├── compare_synthesize_demo.py
│ ├── cost_optimization.py
│ ├── data
│ │ ├── sample_event.txt
│ │ ├── Steve_Jobs_Introducing_The_iPhone_compressed.md
│ │ └── Steve_Jobs_Introducing_The_iPhone_compressed.mp3
│ ├── docstring_refiner_demo.py
│ ├── document_conversion_and_processing_demo.py
│ ├── entity_relation_graph_demo.py
│ ├── filesystem_operations_demo.py
│ ├── grok_integration_demo.py
│ ├── local_text_tools_demo.py
│ ├── marqo_fused_search_demo.py
│ ├── measure_model_speeds.py
│ ├── meta_api_demo.py
│ ├── multi_provider_demo.py
│ ├── ollama_integration_demo.py
│ ├── prompt_templates_demo.py
│ ├── python_sandbox_demo.py
│ ├── rag_example.py
│ ├── research_workflow_demo.py
│ ├── sample
│ │ ├── article.txt
│ │ ├── backprop_paper.pdf
│ │ ├── buffett.pdf
│ │ ├── contract_link.txt
│ │ ├── legal_contract.txt
│ │ ├── medical_case.txt
│ │ ├── northwind.db
│ │ ├── research_paper.txt
│ │ ├── sample_data.json
│ │ └── text_classification_samples
│ │ ├── email_classification.txt
│ │ ├── news_samples.txt
│ │ ├── product_reviews.txt
│ │ └── support_tickets.txt
│ ├── sample_docs
│ │ └── downloaded
│ │ └── attention_is_all_you_need.pdf
│ ├── sentiment_analysis_demo.py
│ ├── simple_completion_demo.py
│ ├── single_shot_synthesis_demo.py
│ ├── smart_browser_demo.py
│ ├── sql_database_demo.py
│ ├── sse_client_demo.py
│ ├── test_code_extraction.py
│ ├── test_content_detection.py
│ ├── test_ollama.py
│ ├── text_classification_demo.py
│ ├── text_redline_demo.py
│ ├── tool_composition_examples.py
│ ├── tournament_code_demo.py
│ ├── tournament_text_demo.py
│ ├── unified_memory_system_demo.py
│ ├── vector_search_demo.py
│ ├── web_automation_instruction_packs.py
│ └── workflow_delegation_demo.py
├── LICENSE
├── list_models.py
├── marqo_index_config.json.example
├── mcp_protocol_schema_2025-03-25_version.json
├── mcp_python_lib_docs.md
├── mcp_tool_context_estimator.py
├── model_preferences.py
├── pyproject.toml
├── quick_test.py
├── README.md
├── resource_annotations.py
├── run_all_demo_scripts_and_check_for_errors.py
├── storage
│ └── smart_browser_internal
│ ├── locator_cache.db
│ ├── readability.js
│ └── storage_state.enc
├── test_client.py
├── test_connection.py
├── TEST_README.md
├── test_sse_client.py
├── test_stdio_client.py
├── tests
│ ├── __init__.py
│ ├── conftest.py
│ ├── integration
│ │ ├── __init__.py
│ │ └── test_server.py
│ ├── manual
│ │ ├── test_extraction_advanced.py
│ │ └── test_extraction.py
│ └── unit
│ ├── __init__.py
│ ├── test_cache.py
│ ├── test_providers.py
│ └── test_tools.py
├── TODO.md
├── tool_annotations.py
├── tools_list.json
├── ultimate_mcp_banner.webp
├── ultimate_mcp_logo.webp
├── ultimate_mcp_server
│ ├── __init__.py
│ ├── __main__.py
│ ├── cli
│ │ ├── __init__.py
│ │ ├── __main__.py
│ │ ├── commands.py
│ │ ├── helpers.py
│ │ └── typer_cli.py
│ ├── clients
│ │ ├── __init__.py
│ │ ├── completion_client.py
│ │ └── rag_client.py
│ ├── config
│ │ └── examples
│ │ └── filesystem_config.yaml
│ ├── config.py
│ ├── constants.py
│ ├── core
│ │ ├── __init__.py
│ │ ├── evaluation
│ │ │ ├── base.py
│ │ │ └── evaluators.py
│ │ ├── providers
│ │ │ ├── __init__.py
│ │ │ ├── anthropic.py
│ │ │ ├── base.py
│ │ │ ├── deepseek.py
│ │ │ ├── gemini.py
│ │ │ ├── grok.py
│ │ │ ├── ollama.py
│ │ │ ├── openai.py
│ │ │ └── openrouter.py
│ │ ├── server.py
│ │ ├── state_store.py
│ │ ├── tournaments
│ │ │ ├── manager.py
│ │ │ ├── tasks.py
│ │ │ └── utils.py
│ │ └── ums_api
│ │ ├── __init__.py
│ │ ├── ums_database.py
│ │ ├── ums_endpoints.py
│ │ ├── ums_models.py
│ │ └── ums_services.py
│ ├── exceptions.py
│ ├── graceful_shutdown.py
│ ├── services
│ │ ├── __init__.py
│ │ ├── analytics
│ │ │ ├── __init__.py
│ │ │ ├── metrics.py
│ │ │ └── reporting.py
│ │ ├── cache
│ │ │ ├── __init__.py
│ │ │ ├── cache_service.py
│ │ │ ├── persistence.py
│ │ │ ├── strategies.py
│ │ │ └── utils.py
│ │ ├── cache.py
│ │ ├── document.py
│ │ ├── knowledge_base
│ │ │ ├── __init__.py
│ │ │ ├── feedback.py
│ │ │ ├── manager.py
│ │ │ ├── rag_engine.py
│ │ │ ├── retriever.py
│ │ │ └── utils.py
│ │ ├── prompts
│ │ │ ├── __init__.py
│ │ │ ├── repository.py
│ │ │ └── templates.py
│ │ ├── prompts.py
│ │ └── vector
│ │ ├── __init__.py
│ │ ├── embeddings.py
│ │ └── vector_service.py
│ ├── tool_token_counter.py
│ ├── tools
│ │ ├── __init__.py
│ │ ├── audio_transcription.py
│ │ ├── base.py
│ │ ├── completion.py
│ │ ├── docstring_refiner.py
│ │ ├── document_conversion_and_processing.py
│ │ ├── enhanced-ums-lookbook.html
│ │ ├── entity_relation_graph.py
│ │ ├── excel_spreadsheet_automation.py
│ │ ├── extraction.py
│ │ ├── filesystem.py
│ │ ├── html_to_markdown.py
│ │ ├── local_text_tools.py
│ │ ├── marqo_fused_search.py
│ │ ├── meta_api_tool.py
│ │ ├── ocr_tools.py
│ │ ├── optimization.py
│ │ ├── provider.py
│ │ ├── pyodide_boot_template.html
│ │ ├── python_sandbox.py
│ │ ├── rag.py
│ │ ├── redline-compiled.css
│ │ ├── sentiment_analysis.py
│ │ ├── single_shot_synthesis.py
│ │ ├── smart_browser.py
│ │ ├── sql_databases.py
│ │ ├── text_classification.py
│ │ ├── text_redline_tools.py
│ │ ├── tournament.py
│ │ ├── ums_explorer.html
│ │ └── unified_memory_system.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── async_utils.py
│ │ ├── display.py
│ │ ├── logging
│ │ │ ├── __init__.py
│ │ │ ├── console.py
│ │ │ ├── emojis.py
│ │ │ ├── formatter.py
│ │ │ ├── logger.py
│ │ │ ├── panels.py
│ │ │ ├── progress.py
│ │ │ └── themes.py
│ │ ├── parse_yaml.py
│ │ ├── parsing.py
│ │ ├── security.py
│ │ └── text.py
│ └── working_memory_api.py
├── unified_memory_system_technical_analysis.md
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/ultimate_mcp_server/tools/local_text_tools.py:
--------------------------------------------------------------------------------
```python
1 | # ultimate_mcp_server/tools/local_text_tools.py
2 | """
3 | Standalone, secure wrappers around local CLI text-processing utilities (rg, awk, sed, jq)
4 | for the Ultimate MCP Server framework.
5 |
6 | This module provides controlled execution of common command-line text tools within
7 | a defined workspace, incorporating enhanced security checks, resource limits, performance
8 | optimizations like caching and concurrency control, and robust error handling.
9 |
10 | Key Features:
11 | * Standalone Functions: Tools exposed as individual async functions.
12 | * Workspace Confinement: All file/directory operations strictly enforced within WORKSPACE_DIR.
13 | * Security Hardening: Validates arguments against shell metacharacters, subshells,
14 | redirection, path traversal, and specific unsafe flags (e.g., `sed -i`). Uses `prctl`
15 | and `setsid` on Linux for further sandboxing.
16 | * Resource Limiting: Applies CPU time and memory limits (Unix only).
17 | * Input Flexibility: Handles input via stdin (`input_data`) or file/directory targets
18 | specified within the command arguments (`args_str`). Stdin size is capped.
19 | * Standardized Output: Returns consistent `ToolResult` TypedDict with stdout, stderr,
20 | exit_code, success status, timing, and truncation info. Output is truncated.
21 | * Command Integrity: Checks command availability and checksums (lazily, with re-verification).
22 | Enforces minimum versions.
23 | * Performance: Includes per-tool concurrency limits and optional disk-based caching
24 | of identical command invocations.
25 | * LLM-Friendly: Detailed docstrings, structured errors with codes, optional streaming modes,
26 | and a `dry_run` option enhance usability for AI agents.
27 | """
28 |
29 | import asyncio
30 | import hashlib
31 | import json
32 | import os
33 | import random
34 | import re
35 | import shlex
36 | import shutil
37 | import sys
38 | import textwrap
39 | import time
40 | from dataclasses import (
41 | dataclass,
) # only `dataclass` is needed here; `field` is not currently used
43 | from enum import Enum
44 | from pathlib import Path
45 | from typing import (
46 | AsyncIterator,
47 | Dict,
48 | List,
49 | Optional,
50 | Sequence,
51 | Tuple,
52 | TypedDict,
53 | cast,
54 | )
55 |
56 | import aiofiles # Needed for async checksum
57 |
58 | from ultimate_mcp_server.exceptions import ToolExecutionError, ToolInputError
59 | from ultimate_mcp_server.tools.base import with_error_handling, with_tool_metrics
60 | from ultimate_mcp_server.utils import get_logger
61 |
62 | logger = get_logger("ultimate_mcp_server.tools.local_text")
63 |
64 | # Conditional import for resource limiting and sandboxing
65 | try:
66 | import resource # type: ignore [import-not-found]
67 |
68 | HAS_RESOURCE = True
69 | except ImportError:
70 | HAS_RESOURCE = False
71 | logger.debug("`resource` module not found (likely non-Unix). Resource limits disabled.")
72 |
73 | try:
74 | import prctl # type: ignore [import-not-found]
75 |
76 | HAS_PRCTL = True
77 | except ImportError:
78 | HAS_PRCTL = False
79 | logger.debug("`prctl` module not found (likely non-Linux). Advanced sandboxing disabled.")
80 |
81 | # --------------------------------------------------------------------------- #
82 | # Configuration (Loaded from Environment or Defaults)
83 | # --------------------------------------------------------------------------- #
84 |
85 | #: Maximum bytes returned in stdout / stderr before truncation
86 | MAX_OUTPUT_BYTES = int(os.getenv("MCP_TEXT_MAX_OUTPUT", "1_000_000")) # 1 MB default
87 | #: Maximum bytes accepted via stdin (`input_data`)
88 | MAX_INPUT_BYTES = int(os.getenv("MCP_TEXT_MAX_INPUT", "25_000_000")) # 25 MB default
89 | #: Maximum seconds a command may run before being terminated
90 | DEFAULT_TIMEOUT = float(os.getenv("MCP_TEXT_TIMEOUT", "30"))
91 | #: Workspace root – **all file/directory arguments must resolve inside this tree**
92 | try:
93 | WORKSPACE_DIR_STR = os.getenv("MCP_TEXT_WORKSPACE", ".")
94 | WORKSPACE_DIR = Path(WORKSPACE_DIR_STR).resolve()
95 | if not WORKSPACE_DIR.is_dir():
96 | logger.warning(
97 | f"MCP_TEXT_WORKSPACE ('{WORKSPACE_DIR_STR}' -> '{WORKSPACE_DIR}') is not a directory. Defaulting to current."
98 | )
99 | WORKSPACE_DIR = Path(".").resolve()
100 | except Exception as e:
101 | logger.error(
102 | f"Error resolving MCP_TEXT_WORKSPACE ('{WORKSPACE_DIR_STR}'): {e}. Defaulting to current."
103 | )
104 | WORKSPACE_DIR = Path(".").resolve()
105 | logger.info(f"LocalTextTools workspace confined to: {WORKSPACE_DIR}")
106 |
107 | #: Disk cache directory for command results
108 | CACHE_DIR_STR = os.getenv("MCP_TEXT_CACHE_DIR", "~/.cache/ultimate_mcp_server/local_text_tools")
109 | CACHE_DIR = Path(CACHE_DIR_STR).expanduser().resolve()
110 | CACHE_ENABLED = os.getenv("MCP_TEXT_CACHE_ENABLED", "true").lower() == "true"
111 | CACHE_MAX_SIZE_MB = int(os.getenv("MCP_TEXT_CACHE_MAX_MB", "500"))
112 | CACHE_MAX_AGE_DAYS = int(os.getenv("MCP_TEXT_CACHE_MAX_AGE_DAYS", "7"))
113 |
114 | if CACHE_ENABLED:
115 | try:
116 | CACHE_DIR.mkdir(parents=True, exist_ok=True)
117 | logger.info(f"Using command invocation cache directory: {CACHE_DIR}")
118 | except OSError as e:
119 | logger.error(
120 | f"Failed to create command cache directory {CACHE_DIR}: {e}. Caching disabled."
121 | )
122 | CACHE_ENABLED = False # Disable cache if directory fails
123 |
124 | # Concurrency limits per command
125 | DEFAULT_CONCURRENCY = 4
126 | CONCURRENCY_LIMITS = {
127 | "rg": int(os.getenv("MCP_TEXT_CONCURRENCY_RG", "8")),
128 | "awk": int(os.getenv("MCP_TEXT_CONCURRENCY_AWK", str(DEFAULT_CONCURRENCY))),
129 | "sed": int(os.getenv("MCP_TEXT_CONCURRENCY_SED", str(DEFAULT_CONCURRENCY))),
130 | "jq": int(os.getenv("MCP_TEXT_CONCURRENCY_JQ", str(DEFAULT_CONCURRENCY))),
131 | }
132 |
133 | # Forbidden shell metacharacters (pre-compiled set for efficiency)
134 | _FORBIDDEN_CHARS_SET = frozenset("&;`|><$()")
135 |
136 |
137 | # --------------------------------------------------------------------------- #
138 | # Error Codes Enum
139 | # --------------------------------------------------------------------------- #
140 | class ToolErrorCode(str, Enum):
141 | """Machine-readable error codes for local text tools."""
142 |
143 | PATH_TRAVERSAL = "PATH_TRAVERSAL"
144 | ABS_PATH_FORBIDDEN = "ABS_PATH_FORBIDDEN"
145 | WORKSPACE_VIOLATION = "WORKSPACE_VIOLATION"
146 | FORBIDDEN_FLAG = "FORBIDDEN_FLAG"
147 | FORBIDDEN_CHAR = "FORBIDDEN_CHAR"
148 | CMD_SUBSTITUTION = "CMD_SUBSTITUTION" # Often related to $() or ``
149 | INVALID_ARGS = "INVALID_ARGS"
150 | INPUT_TOO_LARGE = "INPUT_TOO_LARGE"
151 | INVALID_JSON_INPUT = "INVALID_JSON_INPUT"
152 | CMD_NOT_FOUND = "CMD_NOT_FOUND"
153 | TIMEOUT = "TIMEOUT"
154 | EXEC_ERROR = "EXEC_ERROR"
155 | COMMUNICATION_ERROR = "COMMUNICATION_ERROR"
156 | CHECKSUM_MISMATCH = "CHECKSUM_MISMATCH"
157 | VERSION_TOO_OLD = "VERSION_TOO_OLD"
158 | UNEXPECTED_FAILURE = "UNEXPECTED_FAILURE"
159 | CACHE_ERROR = "CACHE_ERROR"
160 |
161 |
162 | # --------------------------------------------------------------------------- #
163 | # Result Schema (TypedDict)
164 | # --------------------------------------------------------------------------- #
165 | class ToolResult(TypedDict, total=False):
166 | """Standardized result structure for local text tool executions."""
167 |
168 | stdout: Optional[str] # Decoded standard output (potentially truncated)
169 | stderr: Optional[str] # Decoded standard error (potentially truncated)
170 | exit_code: Optional[int] # Process exit code
171 | success: bool # True if execution considered successful (depends on tool/retcode)
172 | error: Optional[str] # Human-readable error message if success is False
173 | error_code: Optional[ToolErrorCode] # Machine-readable error code if success is False
174 | duration: float # Execution duration in seconds
175 | stdout_truncated: bool # True if stdout was truncated
176 | stderr_truncated: bool # True if stderr was truncated
177 | cached_result: bool # True if this result was served from cache
178 | dry_run_cmdline: Optional[List[str]] # Populated only if dry_run=True
179 |
180 |
181 | # --------------------------------------------------------------------------- #
182 | # Command Metadata & Discovery
183 | # --------------------------------------------------------------------------- #
184 |
185 |
186 | @dataclass(slots=True, frozen=True) # Make truly immutable
187 | class CommandMeta:
188 | """Metadata for a command-line tool."""
189 |
190 | name: str
191 | path: Optional[Path] = None # Store the resolved absolute path
192 | checksum: Optional[str] = None # SHA-256 checksum of the executable (calculated lazily)
193 | mtime: Optional[float] = None # Last modification time (for checksum re-verification)
194 | version: Optional[tuple[int, ...]] = None # Parsed version tuple (e.g., (13, 0, 1))
195 | forbidden_flags: frozenset[str] = frozenset() # Flags disallowed for security
196 | readonly: bool = True # True if the command should not modify the filesystem
197 | min_version: Optional[tuple[int, ...]] = None # Minimum required version tuple
198 |
199 |
200 | # Store CommandMeta objects, keyed by command name
201 | _COMMAND_METADATA: Dict[str, CommandMeta] = {
202 | "rg": CommandMeta("rg", min_version=(13, 0, 0)), # Example: Require ripgrep >= 13.0.0
203 | "awk": CommandMeta(
204 | "awk", forbidden_flags=frozenset({"-i", "--in-place"})
205 | ), # AWK in-place is less common but exists (gawk)
206 | "sed": CommandMeta("sed", forbidden_flags=frozenset({"-i", "--in-place"})),
207 | "jq": CommandMeta("jq", min_version=(1, 6)), # Example: Require jq >= 1.6
208 | }
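# Illustrative sketch: registering another read-only tool would reuse the same shape.
# The entry below is hypothetical and not part of the shipped table; discovery at
# import time fills in `path`/`mtime` for whatever is listed here.
#
#     "cut": CommandMeta("cut", forbidden_flags=frozenset()),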
209 | _COMMAND_VERSIONS_CACHE: Dict[str, Optional[tuple[int, ...]]] = {} # Cache parsed versions
210 | _checksum_lock = asyncio.Lock() # Lock for lazy checksum calculation
211 | _version_lock = asyncio.Lock() # Lock for lazy version checking
212 |
213 |
214 | async def _calculate_sha256sum_async(path: Path, chunk: int = 262_144) -> str:
215 | """Asynchronously calculates SHA256 checksum using aiofiles."""
216 | h = hashlib.sha256()
217 | try:
218 | async with aiofiles.open(path, "rb") as fh:
219 | while True:
220 | blk = await fh.read(chunk)
221 | if not blk:
222 | break
223 | h.update(blk)
224 | return h.hexdigest()
225 | except OSError as e:
226 | logger.error(f"Failed to calculate SHA256 for {path}: {e}")
227 | return "error_calculating_checksum"
228 |
229 |
230 | async def _get_command_checksum(meta: CommandMeta) -> Optional[str]:
231 | """Lazily calculates and caches the command checksum, verifying mtime."""
232 | global _COMMAND_METADATA # Allow modification (replacing entry)
233 | if not meta.path:
234 | return None # Cannot checksum if path unknown
235 |
236 | async with _checksum_lock:
237 | # Re-fetch meta inside lock in case it was updated by another coroutine
238 | meta_locked = _COMMAND_METADATA.get(meta.name)
239 | if not meta_locked or not meta_locked.path:
240 | return None
241 |
242 | needs_recalc = False
243 | current_mtime = None
244 | try:
245 | # Use asyncio.to_thread for potentially blocking stat
246 | stat_res = await asyncio.to_thread(meta_locked.path.stat)
247 | current_mtime = stat_res.st_mtime
248 | if (
249 | meta_locked.checksum is None
250 | or meta_locked.mtime is None
251 | or current_mtime != meta_locked.mtime
252 | ):
253 | needs_recalc = True
254 | except OSError as e:
255 | logger.warning(
256 | f"Could not stat {meta_locked.path} for checksum verification: {e}. Recalculating."
257 | )
258 | needs_recalc = True # Force recalc if stat fails
259 |
260 | if needs_recalc:
261 | logger.debug(
262 | f"Calculating checksum for {meta.name} (mtime changed or first calculation)..."
263 | )
264 | new_checksum = await _calculate_sha256sum_async(meta_locked.path)
265 | _COMMAND_METADATA[meta.name] = CommandMeta(
266 | name=meta_locked.name,
267 | path=meta_locked.path,
268 | checksum=new_checksum,
269 | mtime=current_mtime, # Store the mtime when checksum was calculated
270 | forbidden_flags=meta_locked.forbidden_flags,
271 | readonly=meta_locked.readonly,
272 | min_version=meta_locked.min_version,
273 | )
274 | logger.debug(f"Updated checksum for {meta.name}: {new_checksum[:12]}...")
275 | return new_checksum
276 | else:
277 | # Return cached checksum
278 | return meta_locked.checksum
279 |
280 |
281 | async def _parse_version(cmd_path: Path) -> Optional[tuple[int, ...]]:
282 | """Runs '<tool> --version' and parses semantic version tuple."""
283 | try:
284 | proc = await asyncio.create_subprocess_exec(
285 | str(cmd_path),
286 | "--version",
287 | stdout=asyncio.subprocess.PIPE,
288 | stderr=asyncio.subprocess.PIPE,
289 | )
290 | stdout_b, stderr_b = await asyncio.wait_for(proc.communicate(), timeout=5.0)
291 | if proc.returncode != 0:
292 | logger.warning(
293 | f"Command '{cmd_path.name} --version' failed with code {proc.returncode}: {stderr_b.decode(errors='ignore')[:100]}"
294 | )
295 | return None
296 |
297 | output = stdout_b.decode(errors="ignore").strip()
298 | # Common version patterns (adjust as needed for specific tools)
299 | match = re.search(r"(\d+)\.(\d+)(?:\.(\d+))?", output)
300 | if match:
301 | major = int(match.group(1))
302 | minor = int(match.group(2))
303 | patch = int(match.group(3) or 0) # Default patch to 0
304 | return (major, minor, patch)
305 | else:
306 | logger.warning(f"Could not parse version from '{cmd_path.name}' output: {output[:100]}")
307 | return None
308 | except asyncio.TimeoutError:
309 | logger.warning(f"Timeout getting version for '{cmd_path.name}'.")
310 | return None
311 | except Exception as e:
312 | logger.error(f"Error getting version for '{cmd_path.name}': {e}")
313 | return None
314 |
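# Version-regex behaviour, for reference (doctest-style, illustrative):
#
#     >>> re.search(r"(\d+)\.(\d+)(?:\.(\d+))?", "ripgrep 13.0.0 (rev abc)").groups()
#     ('13', '0', '0')
#     >>> re.search(r"(\d+)\.(\d+)(?:\.(\d+))?", "jq-1.6").groups()  # patch defaults to 0
#     ('1', '6', None)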
315 |
316 | async def _check_command_version(meta: CommandMeta) -> None:
317 | """Checks if the command meets the minimum version requirement."""
318 | global _COMMAND_VERSIONS_CACHE # Allow modification
319 | if not meta.path or not meta.min_version:
320 | return # Skip check if no path or no minimum defined
321 |
322 | async with _version_lock:
323 | # Check cache first
324 | if meta.name in _COMMAND_VERSIONS_CACHE:
325 | actual_version = _COMMAND_VERSIONS_CACHE[meta.name]
326 | else:
327 | # Parse version if not cached
328 | actual_version = await _parse_version(meta.path)
329 | _COMMAND_VERSIONS_CACHE[meta.name] = actual_version # Cache result (even None)
330 |
331 | if actual_version is None:
332 | logger.warning(
333 | f"Could not determine version for '{meta.name}'. Skipping minimum version check."
334 | )
335 | return
336 |
337 | if actual_version < meta.min_version:
338 | actual_str = ".".join(map(str, actual_version))
339 | required_str = ".".join(map(str, meta.min_version))
340 | raise ToolExecutionError(
341 | f"Command '{meta.name}' version ({actual_str}) is older than required minimum ({required_str}). Please update.",
342 | error_code=ToolErrorCode.VERSION_TOO_OLD,
343 | details={"required": required_str, "actual": actual_str},
344 | )
345 | else:
346 | logger.debug(
347 |                 f"Version check passed for {meta.name} (found {'.'.join(map(str, actual_version))}, required >= {'.'.join(map(str, meta.min_version))})"
348 | )
349 |
350 |
351 | def _initial_command_discovery() -> None:
352 | """Finds commands and stores paths. Logs warnings for missing commands."""
353 | global _COMMAND_METADATA # Modify the global dict
354 | missing: list[str] = []
355 | updated_metadata = {}
356 |
357 | for name, meta in _COMMAND_METADATA.items():
358 | exe_path_str = shutil.which(name)
359 | if exe_path_str is None:
360 | missing.append(name)
361 | # Keep original meta without path if not found
362 | updated_metadata[name] = CommandMeta(
363 | name=meta.name,
364 | path=None,
365 | checksum=None,
366 | mtime=None,
367 | version=None, # Reset dynamic fields
368 | forbidden_flags=meta.forbidden_flags,
369 | readonly=meta.readonly,
370 | min_version=meta.min_version,
371 | )
372 | continue
373 |
374 | resolved_path = Path(exe_path_str).resolve()
375 | current_mtime = None
376 | try:
377 | # Use sync stat here, module load time is acceptable
378 | current_mtime = resolved_path.stat().st_mtime
379 | except OSError as e:
380 | logger.warning(f"Could not stat {resolved_path} during initial discovery: {e}")
381 |
382 | # Create updated CommandMeta with path and mtime (checksum/version are lazy)
383 | updated_meta = CommandMeta(
384 | name=meta.name,
385 | path=resolved_path,
386 | checksum=None, # Lazy loaded
387 | mtime=current_mtime,
388 | # Keep original version/checksum null, they are loaded async later
389 | version=None,
390 | forbidden_flags=meta.forbidden_flags,
391 | readonly=meta.readonly,
392 | min_version=meta.min_version,
393 | )
394 | updated_metadata[name] = updated_meta
395 | logger.debug(f"{name}: found at {resolved_path}")
396 |
397 | _COMMAND_METADATA = updated_metadata # Replace global dict
398 |
399 | if missing:
400 | logger.warning(
401 | "Missing local text CLI tools: %s. Corresponding functions will fail.",
402 | ", ".join(missing),
403 | # emoji_key="warning", # Assuming logger supports this extra field
404 | )
405 |
406 |
407 | # Run discovery when the module is loaded
408 | _initial_command_discovery()
409 |
410 |
411 | # --------------------------------------------------------------------------- #
412 | # Argument Validation Placeholder
413 | # --------------------------------------------------------------------------- #
414 |
415 |
416 | def _validate_arguments(cmd_name: str, argv: List[str]) -> None:
417 | """
418 | Validates command arguments for security and workspace compliance.
419 | (Placeholder implementation - refine based on specific security needs)
420 | """
421 | meta = _COMMAND_METADATA.get(cmd_name)
422 | if not meta:
423 | # This should generally not happen if called after discovery
424 | raise ToolInputError(
425 | f"Metadata not found for command '{cmd_name}' during validation.",
426 | param_name="cmd_name",
427 | details={"unexpected_failure": True},
428 | )
429 |
430 | forbidden_flags = meta.forbidden_flags
431 |
432 | for i, arg in enumerate(argv):
433 | # Check for forbidden flags
434 | if arg in forbidden_flags:
435 | raise ToolInputError(
436 | f"Forbidden flag '{arg}' is not allowed for command '{cmd_name}'.",
437 | param_name="args_str",
438 | details={
439 | "argument": arg,
440 | "index": i,
441 | "command": cmd_name,
442 | "error_code": ToolErrorCode.FORBIDDEN_FLAG.value,
443 | },
444 | )
445 |
446 | # Command-specific validation rules
447 | if cmd_name == "rg":
448 |             # For rg, exempt '(', ')' and '|' from the forbidden set (regex grouping/alternation); other regex metacharacters are not blocked anyway
449 | forbidden_chars = {
450 | char for char in arg if char in _FORBIDDEN_CHARS_SET and char not in "()|"
451 | }
452 | if forbidden_chars:
453 | raise ToolInputError(
454 | f"Argument '{arg}' contains forbidden shell metacharacter(s): {', '.join(sorted(forbidden_chars))}",
455 | param_name="args_str",
456 | details={
457 | "argument": arg,
458 | "index": i,
459 | "forbidden_chars": sorted(list(forbidden_chars)),
460 | "error_code": ToolErrorCode.FORBIDDEN_CHAR.value,
461 | },
462 | )
463 |
464 | # Basic check for command substitution patterns (can be complex)
465 | if "`" in arg or "$(" in arg:
466 | raise ToolInputError(
467 | f"Argument '{arg}' seems to contain command substitution, which is forbidden.",
468 | param_name="args_str",
469 | details={
470 | "argument": arg,
471 | "index": i,
472 | "error_code": ToolErrorCode.CMD_SUBSTITUTION.value,
473 | },
474 | )
475 |
476 | continue # Skip further checks for rg
477 |
478 | elif cmd_name == "jq":
479 |             # For jq, allow (, ), |, >, < and } since they appear in jq's query language
480 | forbidden_chars = {
481 | char for char in arg if char in _FORBIDDEN_CHARS_SET and char not in "()|}><"
482 | }
483 | if forbidden_chars:
484 | raise ToolInputError(
485 | f"Argument '{arg}' contains forbidden shell metacharacter(s): {', '.join(sorted(forbidden_chars))}",
486 | param_name="args_str",
487 | details={
488 | "argument": arg,
489 | "index": i,
490 | "forbidden_chars": sorted(list(forbidden_chars)),
491 | "error_code": ToolErrorCode.FORBIDDEN_CHAR.value,
492 | },
493 | )
494 |
495 | # Basic check for command substitution patterns (can be complex)
496 | if "`" in arg or "$(" in arg:
497 | raise ToolInputError(
498 | f"Argument '{arg}' seems to contain command substitution, which is forbidden.",
499 | param_name="args_str",
500 | details={
501 | "argument": arg,
502 | "index": i,
503 | "error_code": ToolErrorCode.CMD_SUBSTITUTION.value,
504 | },
505 | )
506 |
507 | continue # Skip further checks for jq
508 |
509 | elif cmd_name == "awk":
510 | # For awk, allow $ character (for field references), / (for regex patterns),
511 | # and other characters needed for awk scripts like (, ), ;
512 | # Allow '>' and '<' for comparisons in AWK
513 | forbidden_chars = {
514 | char for char in arg if char in _FORBIDDEN_CHARS_SET and char not in "$/();{}<>"
515 | }
516 |
517 | # Special check for file redirection (> followed by a string in quotes)
518 | # Pattern detects constructs like: print $1 > "file.txt" or print $1 > 'file.txt'
519 | if re.search(r'>\s*["\']', arg) or re.search(r'print.*>\s*["\']', arg):
520 | raise ToolInputError(
521 | f"Argument '{arg}' appears to contain file redirection, which is forbidden.",
522 | param_name="args_str",
523 | details={
524 | "argument": arg,
525 | "index": i,
526 | "error_code": ToolErrorCode.FORBIDDEN_CHAR.value,
527 | },
528 | )
529 |
530 | if forbidden_chars:
531 | raise ToolInputError(
532 | f"Argument '{arg}' contains forbidden shell metacharacter(s): {', '.join(sorted(forbidden_chars))}",
533 | param_name="args_str",
534 | details={
535 | "argument": arg,
536 | "index": i,
537 | "forbidden_chars": sorted(list(forbidden_chars)),
538 | "error_code": ToolErrorCode.FORBIDDEN_CHAR.value,
539 | },
540 | )
541 |
542 | # Basic check for command substitution patterns (can be complex)
543 | if "`" in arg or "$(" in arg:
544 | raise ToolInputError(
545 | f"Argument '{arg}' seems to contain command substitution, which is forbidden.",
546 | param_name="args_str",
547 | details={
548 | "argument": arg,
549 | "index": i,
550 | "error_code": ToolErrorCode.CMD_SUBSTITUTION.value,
551 | },
552 | )
553 |
554 | # Don't treat awk patterns like /pattern/ as absolute paths
555 | if arg.startswith("/") and not (arg.count("/") >= 2 and arg[1:].find("/") > 0):
556 | # Still check for absolute paths that aren't regex patterns
557 | # A regex pattern would typically have at least one more / after the first character
558 | try:
559 | # Resolve the path relative to the workspace *without* accessing filesystem yet
560 | # Use os.path.normpath and os.path.join for basic checks before full resolve
561 | norm_path = os.path.normpath(os.path.join(str(WORKSPACE_DIR), arg))
562 | if not norm_path.startswith(str(WORKSPACE_DIR)):
563 | raise ToolInputError(
564 | f"Path traversal or absolute path '{arg}' is forbidden.",
565 | param_name="args_str",
566 | details={
567 | "argument": arg,
568 | "index": i,
569 | "error_code": ToolErrorCode.PATH_TRAVERSAL.value,
570 | },
571 | )
572 | except Exception as e:
573 | logger.error(f"Error checking path '{arg}': {e}")
574 | raise ToolInputError(
575 | f"Invalid path argument '{arg}'.",
576 | param_name="args_str",
577 | details={
578 | "argument": arg,
579 | "index": i,
580 | "error_code": ToolErrorCode.INVALID_ARGS.value,
581 | },
582 | ) from e
583 |
584 | continue # Skip further checks for awk
585 |
586 | elif cmd_name == "sed":
587 | # For sed, allow / (for regex patterns), as well as |, (, ) for sed expressions
588 | forbidden_chars = {
589 | char for char in arg if char in _FORBIDDEN_CHARS_SET and char not in "/|()"
590 | }
591 | if forbidden_chars:
592 | raise ToolInputError(
593 | f"Argument '{arg}' contains forbidden shell metacharacter(s): {', '.join(sorted(forbidden_chars))}",
594 | param_name="args_str",
595 | details={
596 | "argument": arg,
597 | "index": i,
598 | "forbidden_chars": sorted(list(forbidden_chars)),
599 | "error_code": ToolErrorCode.FORBIDDEN_CHAR.value,
600 | },
601 | )
602 |
603 | # Basic check for command substitution patterns (can be complex)
604 | if "`" in arg or "$(" in arg:
605 | raise ToolInputError(
606 | f"Argument '{arg}' seems to contain command substitution, which is forbidden.",
607 | param_name="args_str",
608 | details={
609 | "argument": arg,
610 | "index": i,
611 | "error_code": ToolErrorCode.CMD_SUBSTITUTION.value,
612 | },
613 | )
614 |
615 | # Don't treat sed patterns like /pattern/ as absolute paths
616 | if (
617 | arg.startswith("/")
618 | and arg != "/"
619 | and "/ " not in arg
620 | and not (arg.count("/") >= 2 and arg[1:].find("/") > 0)
621 | ):
622 | # Still check for absolute paths that aren't regex patterns
623 | try:
624 | # Resolve the path relative to the workspace *without* accessing filesystem yet
625 | # Use os.path.normpath and os.path.join for basic checks before full resolve
626 | norm_path = os.path.normpath(os.path.join(str(WORKSPACE_DIR), arg))
627 | if not norm_path.startswith(str(WORKSPACE_DIR)):
628 | raise ToolInputError(
629 | f"Path traversal or absolute path '{arg}' is forbidden.",
630 | param_name="args_str",
631 | details={
632 | "argument": arg,
633 | "index": i,
634 | "error_code": ToolErrorCode.PATH_TRAVERSAL.value,
635 | },
636 | )
637 | except Exception as e:
638 | logger.error(f"Error checking path '{arg}': {e}")
639 | raise ToolInputError(
640 | f"Invalid path argument '{arg}'.",
641 | param_name="args_str",
642 | details={
643 | "argument": arg,
644 | "index": i,
645 | "error_code": ToolErrorCode.INVALID_ARGS.value,
646 | },
647 | ) from e
648 |
649 | continue # Skip further checks for sed
650 |
651 | # Standard checks for all other commands
652 | # Check for forbidden characters
653 | if any(char in _FORBIDDEN_CHARS_SET for char in arg):
654 | # Be more specific about which char if possible
655 | found_chars = {char for char in arg if char in _FORBIDDEN_CHARS_SET}
656 | raise ToolInputError(
657 | f"Argument '{arg}' contains forbidden shell metacharacter(s): {', '.join(sorted(found_chars))}",
658 | param_name="args_str",
659 | details={
660 | "argument": arg,
661 | "index": i,
662 | "forbidden_chars": sorted(list(found_chars)),
663 | "error_code": ToolErrorCode.FORBIDDEN_CHAR.value,
664 | },
665 | )
666 |
667 | # Basic check for command substitution patterns (can be complex)
668 | if "`" in arg or "$(" in arg:
669 | raise ToolInputError(
670 | f"Argument '{arg}' seems to contain command substitution, which is forbidden.",
671 | param_name="args_str",
672 | details={
673 | "argument": arg,
674 | "index": i,
675 | "error_code": ToolErrorCode.CMD_SUBSTITUTION.value,
676 | },
677 | )
678 |
679 | # --- Path Validation ---
680 | # Heuristic: Does the argument look like a path that needs checking?
681 | # This is tricky. We might only check args that aren't flags (don't start with '-')
682 | # or args known to take paths for specific commands.
683 | # For simplicity here, we check any arg that contains '/' or could be a filename.
684 | # More robust: parse args properly (difficult) or check based on tool context.
685 | potential_path = False
686 | if not arg.startswith("-") and (os.sep in arg or "." in arg or Path(arg).suffix):
687 | potential_path = True
688 | # Allow '-' as a special argument representing stdin/stdout
689 | if arg == "-":
690 | potential_path = False
691 |
692 | if potential_path:
693 | try:
694 | # Disallow absolute paths
695 | if Path(arg).is_absolute():
696 | raise ToolInputError(
697 | f"Absolute paths like '{arg}' are forbidden. Use paths relative to the workspace.",
698 | param_name="args_str",
699 | details={
700 | "argument": arg,
701 | "index": i,
702 | "error_code": ToolErrorCode.ABS_PATH_FORBIDDEN.value,
703 | },
704 | )
705 |
706 | # Resolve the path relative to the workspace *without* accessing filesystem yet
707 | # Use os.path.normpath and os.path.join for basic checks before full resolve
708 | norm_path = os.path.normpath(os.path.join(str(WORKSPACE_DIR), arg))
709 |
710 | # Check for path traversal using normpath result
711 | if not norm_path.startswith(str(WORKSPACE_DIR)):
712 | # Check specifically for '..' components that might escape
713 | if ".." in Path(arg).parts:
714 | raise ToolInputError(
715 | f"Path traversal ('..') is forbidden in argument '{arg}'.",
716 | param_name="args_str",
717 | details={
718 | "argument": arg,
719 | "index": i,
720 | "error_code": ToolErrorCode.PATH_TRAVERSAL.value,
721 | },
722 | )
723 | else:
724 | # Generic workspace violation if normpath doesn't match prefix (e.g., symlinks handled later)
725 | raise ToolInputError(
726 | f"Argument '{arg}' resolves outside the allowed workspace '{WORKSPACE_DIR}'.",
727 | param_name="args_str",
728 | details={
729 | "argument": arg,
730 | "index": i,
731 | "resolved_norm": norm_path,
732 | "error_code": ToolErrorCode.WORKSPACE_VIOLATION.value,
733 | },
734 | )
735 |
736 | # More robust check: Resolve the path fully and check again
737 | # This *does* access the filesystem but ensures symlinks are handled
738 | resolved_arg_path = (WORKSPACE_DIR / arg).resolve()
739 | if not resolved_arg_path.is_relative_to(WORKSPACE_DIR):
740 | raise ToolInputError(
741 | f"Argument '{arg}' resolves outside the allowed workspace '{WORKSPACE_DIR}' (checked after resolving symlinks).",
742 | param_name="args_str",
743 | details={
744 | "argument": arg,
745 | "index": i,
746 | "resolved_absolute": str(resolved_arg_path),
747 | "error_code": ToolErrorCode.WORKSPACE_VIOLATION.value,
748 | },
749 | )
750 |
751 | except OSError as e:
752 | # Ignore errors resolving paths that might not exist yet (e.g., output files for some tools)
753 | # But log a warning. More strict validation could forbid non-existent input paths.
754 | logger.debug(
755 | f"Could not fully resolve potential path argument '{arg}': {e}. Assuming OK if basic checks passed."
756 | )
757 | except ToolInputError:
758 | raise # Re-raise our specific validation errors
759 | except Exception as e:
760 | logger.error(f"Unexpected error validating argument '{arg}': {e}", exc_info=True)
761 | raise ToolInputError(
762 | f"Unexpected error during validation of argument '{arg}'.",
763 | param_name="args_str",
764 | details={
765 | "argument": arg,
766 | "index": i,
767 | "error_code": ToolErrorCode.UNEXPECTED_FAILURE.value,
768 | },
769 | ) from e
770 |
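# Illustrative rejection (doctest-style sketch): the forbidden-flag check fires before
# any character or path validation, so e.g. a sed in-place edit is refused outright.
#
#     >>> _validate_arguments("sed", ["-i", "s/a/b/", "notes.txt"])
#     Traceback (most recent call last):
#         ...
#     ToolInputError: Forbidden flag '-i' is not allowed for command 'sed'.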
771 |
772 | # --------------------------------------------------------------------------- #
773 | # Invocation Caching (Disk-based LRU-like)
774 | # --------------------------------------------------------------------------- #
775 |
776 |
777 | def _get_cache_key(cmd_name: str, argv: Sequence[str], input_data_bytes: Optional[bytes]) -> str:
778 | """Creates a hash key based on command, args, and input data bytes."""
779 | hasher = hashlib.sha256()
780 | hasher.update(cmd_name.encode())
781 | for arg in argv:
782 | hasher.update(arg.encode())
783 | if input_data_bytes is not None:
784 | hasher.update(b"\x00\x01") # Separator for input data
785 | hasher.update(input_data_bytes)
786 | else:
787 | hasher.update(b"\x00\x00") # Separator for no input data
788 | return hasher.hexdigest()
789 |
790 |
791 | def _get_cache_path(key: str) -> Path:
792 | """Gets the file path for a cache key."""
793 | # Simple structure: cache_dir / key_prefix / key.json
794 | prefix = key[:2]
795 | return CACHE_DIR / prefix / f"{key}.json"
796 |
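# Cache-key behaviour, for reference (illustrative): identical command + args + stdin
# always map to the same 64-hex-char SHA-256 key, which is what makes reuse safe.
#
#     >>> key = _get_cache_key("jq", [".foo"], b'{"foo": 1}')
#     >>> key == _get_cache_key("jq", [".foo"], b'{"foo": 1}'), len(key)
#     (True, 64)
#     >>> _get_cache_path(key).suffix
#     '.json'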
797 |
798 | # --- Async Cache Get/Put using asyncio.to_thread for OS file IO ---
799 | async def _cache_get_async(key: str) -> Optional[ToolResult]:
800 | """Asynchronously gets a result from the disk cache."""
801 | if not CACHE_ENABLED:
802 | return None
803 | cache_path = _get_cache_path(key)
804 | try:
805 | if await asyncio.to_thread(cache_path.exists):
806 | # Check cache entry age
807 | stat_res = await asyncio.to_thread(cache_path.stat)
808 | age_seconds = time.time() - stat_res.st_mtime
809 | if age_seconds > (CACHE_MAX_AGE_DAYS * 24 * 3600):
810 | logger.debug(
811 | f"Cache entry {key[:8]} expired (age {age_seconds:.0f}s > {CACHE_MAX_AGE_DAYS}d). Removing."
812 | )
813 | try:
814 | await asyncio.to_thread(cache_path.unlink)
815 | except OSError as e:
816 | logger.warning(f"Failed to remove expired cache file {cache_path}: {e}")
817 | return None
818 |
819 | # Read cached data (aiofiles is okay for read/write content)
820 | async with aiofiles.open(cache_path, mode="r", encoding="utf-8") as f:
821 | content = await f.read()
822 | data = json.loads(content)
823 | # --- Type Check and Reconstruction ---
824 | required_keys = {
825 | "stdout": None,
826 | "stderr": None,
827 | "exit_code": None,
828 | "success": False,
829 | "duration": 0.0,
830 | "stdout_truncated": False,
831 | "stderr_truncated": False,
832 | "error": None,
833 | "error_code": None,
834 | "cached_result": True, # Set flag here
835 | }
836 | validated_data = {}
837 | for k, default_val in required_keys.items():
838 | validated_data[k] = data.get(k, default_val)
839 |
840 | # Ensure error_code is None or a valid ToolErrorCode enum member
841 | raw_error_code = validated_data.get("error_code")
842 | if raw_error_code is not None:
843 | try:
844 | validated_data["error_code"] = ToolErrorCode(raw_error_code)
845 | except ValueError:
846 | logger.warning(
847 | f"Invalid error_code '{raw_error_code}' found in cache for key {key[:8]}. Setting to None."
848 | )
849 | validated_data["error_code"] = None
850 |
851 | logger.info(f"Cache HIT for key {key[:8]}.")
852 | # Use cast to satisfy type checker, assuming validation ensures structure
853 | return cast(ToolResult, validated_data)
854 | else:
855 | # logger.debug(f"Cache MISS for key {key[:8]}.")
856 | return None
857 | except (OSError, json.JSONDecodeError, TypeError) as e:
858 | logger.warning(f"Cache read error for key {key[:8]}: {e}. Treating as miss.")
859 | # Attempt to remove potentially corrupt file
860 | try:
861 | if await asyncio.to_thread(cache_path.exists):
862 | await asyncio.to_thread(cache_path.unlink)
863 | except OSError:
864 | pass
865 | return None
866 | except Exception as e:
867 | logger.error(f"Unexpected cache get error for key {key[:8]}: {e}", exc_info=True)
868 | return None
869 |
870 |
871 | async def _cache_put_async(key: str, result: ToolResult):
872 | """Asynchronously puts a result into the disk cache."""
873 | if not CACHE_ENABLED:
874 | return
875 | cache_path = _get_cache_path(key)
876 | try:
877 | # Ensure parent directory exists
878 | await asyncio.to_thread(cache_path.parent.mkdir, parents=True, exist_ok=True)
879 |
880 | # Write data as JSON
881 | result_to_write = result.copy()
882 | result_to_write.pop("cached_result", None) # Don't store cache flag itself
883 | # Ensure enum is converted to string for JSON
884 | if result_to_write.get("error_code") is not None:
885 | result_to_write["error_code"] = result_to_write["error_code"].value
886 |
887 | json_data = json.dumps(result_to_write, indent=2) # Pretty print for readability
888 | async with aiofiles.open(cache_path, mode="w", encoding="utf-8") as f:
889 | await f.write(json_data)
890 | logger.debug(f"Cache PUT successful for key {key[:8]}.")
891 |
892 | # Trigger cleanup check periodically (simple random approach)
893 | if random.random() < 0.01: # ~1% chance on write
894 | asyncio.create_task(_cleanup_cache_lru())
895 | except (OSError, TypeError, json.JSONDecodeError) as e:
896 | logger.error(f"Cache write error for key {key[:8]}: {e}")
897 | except Exception as e:
898 | logger.error(f"Unexpected cache put error for key {key[:8]}: {e}", exc_info=True)
899 |
900 |
901 | # --- Cache Cleanup (LRU-like based on mtime) ---
902 | async def _cleanup_cache_lru():
903 | """Removes cache files exceeding max size or age."""
904 | if not CACHE_ENABLED:
905 | return
906 | logger.debug("Running cache cleanup...")
907 | try:
908 | files: List[Tuple[Path, float, int]] = []
909 | total_size = 0
910 | now = time.time()
911 | max_age_seconds = CACHE_MAX_AGE_DAYS * 24 * 3600
912 | max_size_bytes = CACHE_MAX_SIZE_MB * 1024 * 1024
913 |
914 |         # os.walk is blocking, so gather the entries in a worker thread; aiofiles
915 |         # does not ship an async walk (scandir + to_thread is another option for large dirs)
916 |         for root, _, filenames in await asyncio.to_thread(lambda: list(os.walk(CACHE_DIR))):
917 | for filename in filenames:
918 | if filename.endswith(".json"):
919 | filepath = Path(root) / filename
920 | try:
921 | stat_res = await asyncio.to_thread(filepath.stat)
922 | files.append((filepath, stat_res.st_mtime, stat_res.st_size))
923 | total_size += stat_res.st_size
924 | except OSError:
925 | continue # Skip files we can't stat
926 |
927 | # Sort by modification time (oldest first)
928 | files.sort(key=lambda x: x[1])
929 |
930 | removed_count = 0
931 | removed_size = 0
932 | current_total_size = total_size # Keep track of size as we remove
933 |
934 | # Remove files based on age or if total size exceeds limit
935 | for filepath, mtime, size in files:
936 | age = now - mtime
937 | # Check size limit against potentially reduced total size
938 | over_size_limit = current_total_size > max_size_bytes
939 | over_age_limit = age > max_age_seconds
940 |
941 | if over_age_limit or over_size_limit:
942 | try:
943 | await asyncio.to_thread(filepath.unlink)
944 | removed_count += 1
945 | removed_size += size
946 | current_total_size -= size
947 | logger.debug(
948 | f"Cache cleanup removed: {filepath.name} (Reason: {'Age' if over_age_limit else 'Size'})"
949 | )
950 | except OSError as e:
951 | logger.warning(f"Cache cleanup failed to remove {filepath}: {e}")
952 |                 # Once back under the size limit, only the age check can still remove
953 |                 # files, so keep iterating over the remaining (newer) entries.
954 |                 elif not over_size_limit:
955 |                     pass
956 |
957 | if removed_count > 0:
958 | logger.info(
959 | f"Cache cleanup complete. Removed {removed_count} files ({removed_size / (1024 * 1024):.1f} MB). Current size: {current_total_size / (1024 * 1024):.1f} MB"
960 | )
961 | else:
962 | logger.debug(
963 | f"Cache cleanup complete. No files removed. Current size: {current_total_size / (1024 * 1024):.1f} MB"
964 | )
965 |
966 | except Exception as e:
967 | logger.error(f"Error during cache cleanup: {e}", exc_info=True)
968 |
969 |
970 | # --------------------------------------------------------------------------- #
971 | # Resource Limiting Function (Module Scope)
972 | # --------------------------------------------------------------------------- #
973 |
974 |
975 | def _limit_resources(timeout: float, cmd_name: Optional[str] = None) -> None:
976 | """Sets resource limits for the child process (Unix only), potentially command-specific."""
977 | try:
978 | if HAS_RESOURCE:
979 | # CPU seconds (add 1s buffer)
980 | cpu_limit = int(timeout) + 1
981 | resource.setrlimit(resource.RLIMIT_CPU, (cpu_limit, cpu_limit))
982 |
983 | # Address-space (bytes) - Virtual Memory
984 | soft_as_str = os.getenv("MCP_TEXT_RLIMIT_AS", "2_147_483_648") # 2GiB default
985 | hard_as_str = os.getenv(
986 | "MCP_TEXT_RLIMIT_AS_HARD", str(int(soft_as_str) + 100 * 1024 * 1024)
987 | ) # +100MiB buffer default
988 | try:
989 | soft_as = int(soft_as_str)
990 | hard_as = int(hard_as_str)
991 | if soft_as > 0 and hard_as > 0: # Allow disabling with 0 or negative
992 | resource.setrlimit(resource.RLIMIT_AS, (soft_as, hard_as))
993 | logger.debug(
994 | f"Applied resource limit: AS={soft_as / (1024**3):.1f}GiB (soft), {hard_as / (1024**3):.1f}GiB (hard)"
995 | )
996 | except ValueError:
997 | logger.warning(
998 | "Invalid value for MCP_TEXT_RLIMIT_AS or MCP_TEXT_RLIMIT_AS_HARD. Skipping AS limit."
999 | )
1000 | except resource.error as e:
1001 | logger.warning(
1002 | f"Failed to set AS limit: {e}. Limit might be too low or too high for the system."
1003 | )
1004 |
1005 | # No core dumps
1006 | try:
1007 | resource.setrlimit(resource.RLIMIT_CORE, (0, 0))
1008 | except resource.error as e:
1009 | logger.warning(f"Failed to disable core dumps: {e}") # May fail in containers
1010 |
1011 | # --- RLIMIT_NPROC --------------------------------------------------
1012 | try:
1013 | # honour optional global opt-out
1014 | if os.getenv("MCP_TEXT_RLIMIT_NPROC_DISABLE", "").lower() == "true":
1015 | logger.debug("RLIMIT_NPROC: disabled via env flag")
1016 | else:
1017 | soft_req = int(os.getenv("MCP_TEXT_RLIMIT_NPROC_SOFT", "4096"))
1018 | hard_req = int(os.getenv("MCP_TEXT_RLIMIT_NPROC_HARD", "8192"))
1019 | if cmd_name == "rg":
1020 | soft_req = int(os.getenv("MCP_TEXT_RLIMIT_NPROC_SOFT_RG", "16384"))
1021 | hard_req = int(os.getenv("MCP_TEXT_RLIMIT_NPROC_HARD_RG", "32768"))
1022 |
1023 | cur_soft, cur_hard = resource.getrlimit(resource.RLIMIT_NPROC)
1024 |
1025 | # translate "unlimited"/-1
1026 | req_soft = cur_soft if soft_req <= 0 else soft_req
1027 | req_hard = cur_hard if hard_req <= 0 else hard_req
1028 |
1029 | # never *lower* an existing limit
1030 | new_soft = max(cur_soft, req_soft)
1031 | new_hard = max(cur_hard, req_hard)
1032 |
1033 | # only call setrlimit if anything actually changes
1034 | if (new_soft, new_hard) != (cur_soft, cur_hard):
1035 | resource.setrlimit(resource.RLIMIT_NPROC, (new_soft, new_hard))
1036 | logger.debug(
1037 | f"RLIMIT_NPROC set to (soft={new_soft}, hard={new_hard}) "
1038 | f"(was {cur_soft}/{cur_hard}) for {cmd_name}"
1039 | )
1040 | except (ValueError, resource.error) as e:
1041 | logger.warning(f"RLIMIT_NPROC not applied: {e}")
1042 |
1043 |             logger.debug(f"Applied resource limits: CPU={cpu_limit}s")  # Summary (inside HAS_RESOURCE guard)
1044 |
1045 | if HAS_PRCTL and sys.platform == "linux":
1046 | try:
1047 | # Prevent privilege escalation
1048 | prctl.set_no_new_privs(True)
1049 | logger.debug("Applied prctl NO_NEW_PRIVS.")
1050 | except Exception as e: # prctl might raise various errors
1051 | logger.warning(f"Failed to set prctl NO_NEW_PRIVS: {e}")
1052 |
1053 | # Run in new session to isolate from controlling terminal (if any)
1054 | # This helps ensure signals (like Ctrl+C) don't propagate unexpectedly.
1055 | if sys.platform != "win32" and hasattr(os, "setsid"):
1056 | try:
1057 | os.setsid()
1058 | logger.debug("Process started in new session ID.")
1059 | except OSError as e:
1060 | logger.warning(f"Failed to call setsid: {e}") # May fail if already session leader
1061 |
1062 | except Exception as exc:
1063 | # Catch-all for any unexpected issue during limit setting
1064 | logger.warning(f"Failed to apply resource limits/sandboxing: {exc}")
1065 |
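# For reference: the CPU ceiling is derived from the tool timeout above, e.g. a 30 s
# timeout yields RLIMIT_CPU = int(30) + 1 = 31 s, so the kernel reaps a runaway child
# shortly after the asyncio-level timeout would have fired anyway.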
1066 |
1067 | # --------------------------------------------------------------------------- #
1068 | # Helper Functions
1069 | # --------------------------------------------------------------------------- #
1070 |
1071 |
1072 | def _truncate(data: bytes) -> tuple[str, bool]:
1073 | """Decodes bytes to string and truncates if necessary."""
1074 | truncated = False
1075 | if len(data) > MAX_OUTPUT_BYTES:
1076 | data = data[:MAX_OUTPUT_BYTES]
1077 | truncated = True
1078 | # Try to decode truncated data, replacing errors
1079 | decoded = data.decode("utf-8", errors="replace")
1080 | # Add truncation marker AFTER decoding to avoid corrupting multibyte char
1081 | decoded += "\n... (output truncated)"
1082 | else:
1083 | decoded = data.decode("utf-8", errors="replace")
1084 | return decoded, truncated
1085 |
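# Truncation behaviour, for reference (doctest-style, illustrative):
#
#     >>> text, clipped = _truncate(b"x" * (MAX_OUTPUT_BYTES + 1))
#     >>> clipped, text.endswith("... (output truncated)")
#     (True, True)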
1086 |
1087 | def _is_json_or_json_lines(text: str) -> bool:
1088 | """
1089 | True → text is a single JSON document (json.loads ok)
1090 | True → text is newline-delimited JSON lines (all lines load)
1091 | False → otherwise
1092 | """
1093 | try:
1094 | json.loads(text)
1095 | return True # one big doc
1096 | except json.JSONDecodeError:
1097 | pass
1098 |
1099 | ok = True
1100 | for ln in text.splitlines():
1101 | ln = ln.strip()
1102 | if not ln: # skip blanks
1103 | continue
1104 | try:
1105 | json.loads(ln)
1106 | except json.JSONDecodeError:
1107 | ok = False
1108 | break
1109 | return ok
1110 |
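# Detection behaviour, for reference (doctest-style, illustrative):
#
#     >>> _is_json_or_json_lines('{"a": 1}')            # single document
#     True
#     >>> _is_json_or_json_lines('{"a": 1}\n{"b": 2}')  # JSON Lines
#     True
#     >>> _is_json_or_json_lines("plain text")
#     False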
1111 |
1112 | # --------------------------------------------------------------------------- #
1113 | # Secure Async Executor Core (with Caching, Checksum, Version Checks)
1114 | # --------------------------------------------------------------------------- #
1115 | # Dictionary of asyncio.Semaphore objects, one per command
1116 | _SEMAPHORES = {cmd: asyncio.Semaphore(limit) for cmd, limit in CONCURRENCY_LIMITS.items()}
1117 |
1118 |
1119 | async def _run_command_secure(
1120 | cmd_name: str,
1121 | args_str: str,
1122 | *,
1123 | input_data: Optional[str] = None,
1124 | is_file_target: bool = False,
1125 | is_dir_target: bool = False,
1126 | timeout: float = DEFAULT_TIMEOUT,
1127 | dry_run: bool = False,
1128 | ) -> ToolResult:
1129 | """Securely executes a validated local command with enhancements."""
1130 |
1131 | # 1. Get Command Metadata and Path
1132 | meta = _COMMAND_METADATA.get(cmd_name)
1133 | if not meta or not meta.path:
1134 | raise ToolExecutionError(
1135 | f"Command '{cmd_name}' is not available or configured.",
1136 | error_code=ToolErrorCode.CMD_NOT_FOUND,
1137 | )
1138 |
1139 |     # Version/checksum checks run before the semaphore. This is generally okay,
1140 |     # though the very first simultaneous calls may hash/version-check concurrently.
1141 | # 2. Check Minimum Version (async, cached)
1142 | try:
1143 | await _check_command_version(meta)
1144 | except ToolExecutionError as e:
1145 | # Propagate version error directly
1146 | raise e
1147 |
1148 | # 3. Verify Checksum (async, lazy, cached)
1149 | try:
1150 | await _get_command_checksum(meta) # Verifies if mtime changed, re-calcs if needed
1151 | # Optionally, compare against a known-good checksum if available/needed
1152 | # if meta.checksum != EXPECTED_CHECKSUMS[cmd_name]: raise ToolExecutionError(...)
1153 | except ToolExecutionError as e:
1154 | raise e # Propagate checksum errors
1155 |
1156 | # 4. Parse Arguments using shlex
1157 | try:
1158 | argv: List[str] = shlex.split(args_str, posix=True)
1159 | except ValueError as exc:
1160 | raise ToolInputError(
1161 | "Invalid quoting or escaping in args_str.",
1162 | param_name="args_str",
1163 | details={"error_code": ToolErrorCode.INVALID_ARGS.value},
1164 | ) from exc
1165 |
1166 | # 5. Validate Arguments (Security & Workspace) using the placeholder/real validator
1167 | try:
1168 | _validate_arguments(cmd_name, argv)
1169 | except ToolInputError as e:
1170 | # Add command context to validation error
1171 | raise ToolInputError(
1172 | f"Argument validation failed for '{cmd_name}': {e}",
1173 | param_name="args_str",
1174 | details=e.details,
1175 | ) from e
1176 |
1177 | # 6. Handle Dry Run
1178 | cmd_path_str = str(meta.path)
1179 | cmdline: List[str] = [cmd_path_str, *argv]
1180 | if dry_run:
1181 | logger.info(f"Dry run: Command validated successfully: {shlex.join(cmdline)}")
1182 | # Ensure ToolResult structure is fully populated for dry run
1183 | return ToolResult(
1184 | success=True,
1185 | dry_run_cmdline=cmdline, # Return the command list
1186 | stdout=None,
1187 | stderr=None,
1188 | exit_code=None,
1189 | error=None,
1190 | error_code=None,
1191 | duration=0.0,
1192 | stdout_truncated=False,
1193 | stderr_truncated=False,
1194 | cached_result=False,
1195 | )
1196 |
1197 | # 7. Stdin Size Check and Encoding (Optimization)
1198 | input_bytes: Optional[bytes] = None
1199 | input_len = 0
1200 | if input_data is not None:
1201 | input_bytes = input_data.encode("utf-8", errors="ignore")
1202 | input_len = len(input_bytes)
1203 | if input_len > MAX_INPUT_BYTES:
1204 | raise ToolInputError(
1205 | f"input_data exceeds maximum allowed size of {MAX_INPUT_BYTES / (1024 * 1024):.1f} MB.",
1206 | param_name="input_data",
1207 | details={
1208 | "limit_bytes": MAX_INPUT_BYTES,
1209 | "actual_bytes": input_len,
1210 | "error_code": ToolErrorCode.INPUT_TOO_LARGE.value,
1211 | },
1212 | )
1213 |
1214 | # 8. Prepare for Caching
1215 | cache_key = _get_cache_key(cmd_name, argv, input_bytes)
1216 | cached_result = await _cache_get_async(cache_key)
1217 | if cached_result:
1218 | # Ensure cached_result flag is set correctly (should be done by _cache_get_async)
1219 | cached_result["cached_result"] = True
1220 | return cached_result
1221 |
1222 | # 9. Acquire Concurrency Semaphore
1223 | semaphore = _SEMAPHORES.get(cmd_name)
1224 | if not semaphore:
1225 | logger.error(
1226 | f"Internal error: No semaphore found for command '{cmd_name}'. Using fallback limit 1."
1227 | )
1228 | # Should not happen if CONCURRENCY_LIMITS is correct
1229 | semaphore = asyncio.Semaphore(1) # Fallback to concurrency of 1
1230 |
1231 | async with semaphore:
1232 | # 10. Redacted Logging
1233 | redacted_argv = [
1234 | re.sub(r"(?i)(--?(?:password|token|key|secret)=)\S+", r"\1********", arg)
1235 | for arg in argv
1236 | ]
1237 | log_payload = {
1238 | "event": "execute_local_text_tool",
1239 | "command": cmd_name,
1240 | "args": redacted_argv,
1241 | "input_source": "stdin"
1242 | if input_data is not None
1243 | else ("file" if is_file_target else ("dir" if is_dir_target else "args_only")),
1244 | "timeout_s": timeout,
1245 | "cache_key_prefix": cache_key[:8] if cache_key else None,
1246 | }
1247 | # Using extra assumes logger is configured to handle it
1248 | logger.info(json.dumps(log_payload), extra={"json_fields": log_payload})
1249 |
1250 |         # 11. Resource Limit Setup (Unix only); runs in the child just before exec.
1251 |         # preexec_fn must be None on Windows, so the platform check is at the call site.
1252 |         def preexec_fn_to_use():
1253 |             _limit_resources(timeout, cmd_name=cmd_name)
1254 |
1255 | # 12. Minimal Sanitized Environment
1256 | safe_env = {"PATH": os.getenv("PATH", "/usr/bin:/bin:/usr/local/bin")} # Add common paths
1257 | # Preserve locale settings for correct text processing
1258 | for safe_var in ["LANG", "LC_ALL", "LC_CTYPE", "LC_MESSAGES", "LC_COLLATE"]:
1259 | if safe_var in os.environ:
1260 | safe_env[safe_var] = os.environ[safe_var]
1261 | # Add HOME for tools that might need it for config (like awk finding extensions)
1262 | if "HOME" in os.environ:
1263 | safe_env["HOME"] = os.environ["HOME"]
1264 | # Add TMPDIR as some tools use it
1265 | if "TMPDIR" in os.environ:
1266 | safe_env["TMPDIR"] = os.environ["TMPDIR"]
1267 | elif os.getenv("TEMP"): # Windows variant
1268 | safe_env["TEMP"] = os.getenv("TEMP") # type: ignore[assignment]
1269 |
1270 | # 13. Launch Subprocess
1271 | t0 = time.perf_counter()
1272 | proc: Optional[asyncio.subprocess.Process] = None
1273 | result: Optional[ToolResult] = None # Ensure result is defined
1274 |
1275 | try:
1276 | proc = await asyncio.create_subprocess_exec(
1277 | *cmdline,
1278 | stdin=asyncio.subprocess.PIPE
1279 | if input_bytes is not None
1280 | else asyncio.subprocess.DEVNULL,
1281 | stdout=asyncio.subprocess.PIPE,
1282 | stderr=asyncio.subprocess.PIPE,
1283 | env=safe_env,
1284 | limit=MAX_OUTPUT_BYTES * 2, # Stream buffer limit: double the output cap so truncation is detectable
1285 | preexec_fn=preexec_fn_to_use,
1286 | # cwd=str(WORKSPACE_DIR) # Generally not needed if paths are validated relative to WORKSPACE_DIR
1287 | )
1288 |
1289 | # 14. Communicate and Handle Timeout
1290 | try:
1291 | stdout_b, stderr_b = await asyncio.wait_for(
1292 | proc.communicate(input=input_bytes),
1293 | timeout=timeout,
1294 | )
1295 | except asyncio.TimeoutError as e:
1296 | logger.warning(
1297 | f"Command '{cmd_name}' timed out after {timeout}s. Terminating.",
1298 | extra={"command": cmd_name, "timeout": timeout},
1299 | )
1300 | if proc and proc.returncode is None:
1301 | try:
1302 | proc.terminate()
1303 | # Give it a moment to terminate gracefully
1304 | await asyncio.wait_for(proc.wait(), timeout=1.0)
1305 | except asyncio.TimeoutError:
1306 | logger.warning(
1307 | f"Process {proc.pid} did not terminate gracefully after 1s, killing.",
1308 | extra={"pid": proc.pid},
1309 | )
1310 | try:
1311 | proc.kill()
1312 | except ProcessLookupError:
1313 | pass # Already gone
1314 | await proc.wait() # Wait for kill to complete
1315 | except ProcessLookupError:
1316 | pass # Already gone
1317 | except Exception as term_err:
1318 | logger.error(
1319 | f"Error terminating process {proc.pid}: {term_err}",
1320 | extra={"pid": proc.pid},
1321 | )
1322 |
1323 | raise ToolExecutionError(
1324 | f"'{cmd_name}' exceeded timeout of {timeout}s.",
1325 | error_code=ToolErrorCode.TIMEOUT,
1326 | details={"timeout": timeout},
1327 | ) from e
1328 | except (BrokenPipeError, ConnectionResetError) as comm_err:
1329 | # Handle cases where the process exits before consuming all input/producing output
1330 | exit_code_on_comm_err = proc.returncode if proc else -1
1331 | logger.warning(
1332 | f"Communication error with '{cmd_name}' (process likely exited early): {comm_err}. Exit code: {exit_code_on_comm_err}",
1333 | extra={
1334 | "command": cmd_name,
1335 | "error": str(comm_err),
1336 | "exit_code": exit_code_on_comm_err,
1337 | },
1338 | )
1339 | # Read any remaining output before raising
1340 | stdout_b = b""
1341 | stderr_b = b""
1342 | try:
1343 | # Bounded read (size limit plus short timeout) to avoid blocking indefinitely
1344 | if proc and proc.stdout:
1345 | stdout_b = await asyncio.wait_for(
1346 | proc.stdout.read(MAX_OUTPUT_BYTES * 2), timeout=0.5
1347 | )
1348 | if proc and proc.stderr:
1349 | stderr_b = await asyncio.wait_for(
1350 | proc.stderr.read(MAX_OUTPUT_BYTES * 2), timeout=0.5
1351 | )
1352 | except asyncio.TimeoutError:
1353 | pass
1354 | except Exception as read_err:
1355 | logger.warning(f"Error reading remaining output after comm error: {read_err}")
1356 |
1357 | # Continue processing with potentially partial output, but mark as communication error
1358 | # Let success be determined by exit code if available, otherwise assume failure
1359 | duration = time.perf_counter() - t0
1360 | exit_code = (
1361 | proc.returncode if proc and proc.returncode is not None else -1
1362 | ) # Use -1 if unknown
1363 | stdout, stdout_truncated = _truncate(stdout_b)
1364 | stderr, stderr_truncated = _truncate(stderr_b)
1365 | retcode_ok_map = RETCODE_OK.get(cmd_name, {0})
1366 | success = exit_code in retcode_ok_map
1367 |
1368 | # Construct result indicating communication error
1369 | result = ToolResult(
1370 | stdout=stdout,
1371 | stderr=stderr,
1372 | exit_code=exit_code,
1373 | success=success, # May be true if exit code was OK despite pipe error
1374 | error=f"Communication error with '{cmd_name}': {comm_err}",
1375 | error_code=ToolErrorCode.COMMUNICATION_ERROR,
1376 | duration=round(duration, 3),
1377 | stdout_truncated=stdout_truncated,
1378 | stderr_truncated=stderr_truncated,
1379 | cached_result=False,
1380 | )
1381 | # Do not cache communication errors
1382 | return result # Return immediately, skip normal success/cache path
1383 |
1384 | except Exception as comm_err: # Catch other potential communicate errors
1385 | exit_code_on_comm_err = proc.returncode if proc else -1
1386 | raise ToolExecutionError(
1387 | f"Communication error with '{cmd_name}': {comm_err}",
1388 | error_code=ToolErrorCode.COMMUNICATION_ERROR,
1389 | details={"exit_code": exit_code_on_comm_err},
1390 | ) from comm_err
1391 |
1392 | # 15. Process Results
1393 | duration = time.perf_counter() - t0
1394 | exit_code = proc.returncode
1395 | stdout, stdout_truncated = _truncate(stdout_b)
1396 | stderr, stderr_truncated = _truncate(stderr_b)
1397 |
1398 | # Use normalization table for success check
1399 | retcode_ok_map = RETCODE_OK.get(cmd_name, {0}) # Default to {0} if cmd not in map
1400 | success = exit_code in retcode_ok_map
1401 | error_message: Optional[str] = None
1402 | error_code: Optional[ToolErrorCode] = None
1403 |
1404 | if not success:
1405 | error_message = f"Command '{cmd_name}' failed with exit code {exit_code}."
1406 | # Attempt to map common exit codes to specific error types if possible
1407 | # e.g., if exit code 2 for rg means regex error -> ToolErrorCode.INVALID_ARGS
1408 | error_code = ToolErrorCode.EXEC_ERROR # Default execution error
1409 | if stderr:
1410 | error_message += (
1411 | f" Stderr: '{textwrap.shorten(stderr, 150, placeholder='...')}'"
1412 | )
1413 |
1414 | # 16. Construct ToolResult TypedDict
1415 | result = ToolResult(
1416 | stdout=stdout,
1417 | stderr=stderr,
1418 | exit_code=exit_code,
1419 | success=success,
1420 | error=error_message,
1421 | error_code=error_code,
1422 | duration=round(duration, 3),
1423 | stdout_truncated=stdout_truncated,
1424 | stderr_truncated=stderr_truncated,
1425 | cached_result=False, # Will be set by caller if cached
1426 | )
1427 |
1428 | # 17. Cache the result on success.
1429 | # Note: all successful results are cached regardless of size; skip very
1430 | # large results here if cache space ever becomes a concern.
1431 | if success:
1432 | await _cache_put_async(cache_key, result)
1433 |
1434 | return result
1435 |
1436 | except (ToolInputError, ToolExecutionError) as e:
1437 | # Propagate specific errors we raised
1438 | raise e
1439 | except FileNotFoundError as e:
1440 | # Specifically catch if the command itself isn't found at exec time
1441 | logger.error(
1442 | f"Command '{cmd_name}' not found at path: {cmdline[0]}. Ensure it's installed and in PATH.",
1443 | exc_info=True,
1444 | )
1445 | raise ToolExecutionError(
1446 | f"Command '{cmd_name}' executable not found.",
1447 | error_code=ToolErrorCode.CMD_NOT_FOUND,
1448 | ) from e
1449 | except PermissionError as e:
1450 | logger.error(
1451 | f"Permission denied executing command '{cmd_name}' at path: {cmdline[0]}.",
1452 | exc_info=True,
1453 | )
1454 | raise ToolExecutionError(
1455 | f"Permission denied executing '{cmd_name}'. Check file permissions.",
1456 | error_code=ToolErrorCode.EXEC_ERROR,
1457 | ) from e
1458 | except Exception as e:
1459 | # Catch unexpected errors during setup/execution
1460 | logger.critical(f"Unexpected error running command '{cmd_name}': {e}", exc_info=True)
1461 | raise ToolExecutionError(
1462 | f"Unexpected failure executing '{cmd_name}': {e}",
1463 | error_code=ToolErrorCode.UNEXPECTED_FAILURE,
1464 | ) from e
1465 | finally:
1466 | # Ensure process is cleaned up if an exception occurred after creation
1467 | if proc and proc.returncode is None:
1468 | logger.warning(
1469 | f"Process {proc.pid} for '{cmd_name}' still running after exception, attempting kill."
1470 | )
1471 | try:
1472 | proc.kill()
1473 | await proc.wait()
1474 | except ProcessLookupError:
1475 | pass
1476 | except Exception as final_kill_err:
1477 | logger.error(
1478 | f"Error killing process {proc.pid} in finally block: {final_kill_err}"
1479 | )
1480 |
1481 |
1482 | # --------------------------------------------------------------------------- #
1483 | # Retcode Normalization
1484 | # --------------------------------------------------------------------------- #
1485 | RETCODE_OK: Dict[str, set[int]] = {
1486 | "rg": {0, 1}, # 0 = matches found, 1 = no matches found (both OK for searching)
1487 | "jq": {0}, # 0 = success. Other codes (e.g., 4 for no matches with -e) indicate issues.
1488 | "awk": {0}, # 0 = success
1489 | "sed": {0}, # 0 = success
1490 | }
1491 |
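# Example of how this table feeds the success checks in the executors
# (a sketch; the values follow directly from the mapping above):
#   RETCODE_OK.get("rg", {0})    # -> {0, 1}: rg's "no matches" (1) still counts as success
#   RETCODE_OK.get("grep", {0})  # -> {0}: commands absent from the table accept exit 0 only
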
1492 | # --------------------------------------------------------------------------- #
1493 | # Public Tool Functions (Standalone Wrappers)
1494 | # --------------------------------------------------------------------------- #
1495 |
1496 |
1497 | def _require_single_source(
1498 | cmd: str,
1499 | *,
1500 | input_data: Optional[str],
1501 | input_file: Optional[bool],
1502 | input_dir: Optional[bool],
1503 | ) -> None:
1504 | """Validates that exactly one input source mode is indicated."""
1505 | # Each entry is True when that input mode was provided; bools sum as 0/1 (None counts as 0).
1506 | modes = [
1507 | input_data is not None,
1508 | input_file is True, # Explicit True check
1509 | input_dir is True, # Explicit True check
1510 | ]
1511 | num_modes = sum(modes)
1512 | if num_modes == 0:
1513 | # Use a clearer error message
1514 | raise ToolInputError(
1515 | f"For '{cmd}', you must provide exactly one input: 'input_data' OR 'input_file=True' OR 'input_dir=True'.",
1516 | param_name="inputs",
1517 | details={"error_code": ToolErrorCode.INVALID_ARGS.value},
1518 | )
1519 | elif num_modes > 1:
1520 | raise ToolInputError(
1521 | f"For '{cmd}', specify exactly one input mode: provide 'input_data' OR set 'input_file=True' OR set 'input_dir=True'. Found multiple.",
1522 | param_name="inputs",
1523 | details={"error_code": ToolErrorCode.INVALID_ARGS.value},
1524 | )
1525 |
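# Illustrative calls (a sketch of the contract enforced above):
#   _require_single_source("rg", input_data="text", input_file=False, input_dir=False)  # OK
#   _require_single_source("rg", input_data=None, input_file=True, input_dir=False)     # OK
#   _require_single_source("rg", input_data="text", input_file=True, input_dir=False)   # raises ToolInputError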
1526 |
1527 | # --- run_ripgrep ---
1528 | @with_tool_metrics
1529 | @with_error_handling
1530 | async def run_ripgrep(
1531 | args_str: str,
1532 | *,
1533 | input_data: Optional[str] = None,
1534 | input_file: Optional[bool] = False, # Default to False for clarity
1535 | input_dir: Optional[bool] = False, # Default to False for clarity
1536 | timeout: float = DEFAULT_TIMEOUT,
1537 | dry_run: bool = False,
1538 | ) -> ToolResult:
1539 | """
1540 | Executes the 'rg' (ripgrep) command for fast text pattern searching within the secure workspace.
1541 |
1542 | Searches recursively through directories or specified files (relative to the workspace)
1543 | for lines matching a regular expression or fixed string pattern.
1544 |
1545 | Input Handling:
1546 | - `input_data`: Provide text directly via stdin. Omit `input_file`/`input_dir` or set to `False`. Do *not* include a path in `args_str`. Max size controlled by MCP_TEXT_MAX_INPUT.
1547 | - `input_file=True`: Indicates `args_str` contains target file path(s). Omit `input_data`/`input_dir` or set to `False`. Path(s) must be relative to the workspace and specified in `args_str`. Example: `args_str="'pattern' path/to/file.txt"`
1548 | - `input_dir=True`: Indicates `args_str` contains target directory path(s). Omit `input_data`/`input_file` or set to `False`. Path(s) must be relative to the workspace and specified in `args_str`. Example: `args_str="'pattern' path/to/dir"`
1549 |
1550 | Common `rg` Arguments (include in `args_str`, use workspace-relative paths):
1551 | `'pattern'`: Regex pattern or fixed string.
1552 | `path`: Workspace-relative file or directory path(s).
1553 | `-i`, `--ignore-case`: Case-insensitive search.
1554 | `-v`, `--invert-match`: Select non-matching lines.
1555 | `-l`, `--files-with-matches`: List files containing matches.
1556 | `-c`, `--count`: Count matches per file.
1557 | `-A NUM`, `-B NUM`, `-C NUM`: Context control.
1558 | `--json`: JSON output format.
1559 | `-t type`: Filter by file type (e.g., `py`, `md`).
1560 | `-g glob`: Include/exclude files/dirs by glob (e.g., `-g '*.py' -g '!temp/'`).
1561 | `-o, --only-matching`: Print only matching parts.
1562 | `--follow`: Follow symbolic links (targets must also be within workspace).
1563 | `--no-filename`: Suppress filenames. `-N`, `--no-line-number`: Suppress line numbers.
1564 |
1565 | Security: Enforces workspace boundary `"{get_workspace_dir()}"`. Blocks absolute paths, path traversal ('..'), unsafe flags, and shell metacharacters. Limited resources (CPU, Memory, Processes).
1566 |
1567 | Exit Codes & Success Field:
1568 | - `0`: Matches found -> `success: True`, `error: null`
1569 | - `1`: No matches found -> `success: True`, `error: null` (NOTE: Considered success by this tool wrapper)
1570 | - `2+`: Error occurred -> `success: False`, `error: "..."`, `error_code: EXEC_ERROR`
1571 |
1572 | Args:
1573 | args_str: Command-line arguments for `rg` (pattern, flags, workspace-relative paths).
1574 | input_data: String data to pipe to `rg` via stdin. Omit/False for `input_file`/`input_dir`.
1575 | input_file: Set to True if `args_str` includes target file path(s). Omit/False for `input_data`/`input_dir`.
1576 | input_dir: Set to True if `args_str` includes target directory path(s). Omit/False for `input_data`/`input_file`.
1577 | timeout: Max execution time in seconds.
1578 | dry_run: If True, validate args and return command line without execution.
1579 |
1580 | Returns:
1581 | ToolResult: Dictionary containing execution results or dry run info.
1582 | Includes stdout, stderr, exit_code, success, error, error_code, duration,
1583 | truncation flags, and cached_result status.
1584 |
1585 | Raises:
1586 | ToolInputError: For invalid arguments, security violations, or incorrect input mode usage.
1587 | ToolExecutionError: If `rg` is not found, times out, fails version/checksum checks, or fails unexpectedly.
1588 | """
1589 | _require_single_source("rg", input_data=input_data, input_file=input_file, input_dir=input_dir)
1590 | return await _run_command_secure(
1591 | "rg",
1592 | args_str,
1593 | input_data=input_data,
1594 | is_file_target=input_file, # Pass the boolean flag directly
1595 | is_dir_target=input_dir, # Pass the boolean flag directly
1596 | timeout=timeout,
1597 | dry_run=dry_run,
1598 | )
1599 |
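# Illustrative usage (assumes an async caller and a workspace containing src/):
#   hits = await run_ripgrep("-i --json 'todo' src/", input_dir=True)
#   if hits["success"]:  # True for exit codes 0 and 1 (see RETCODE_OK)
#       print(hits["stdout"])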
1600 |
1601 | # --- run_awk ---
1602 | @with_tool_metrics
1603 | @with_error_handling
1604 | async def run_awk(
1605 | args_str: str,
1606 | *,
1607 | input_data: Optional[str] = None,
1608 | input_file: Optional[bool] = False, # Default to False for clarity
1609 | timeout: float = DEFAULT_TIMEOUT,
1610 | dry_run: bool = False,
1611 | ) -> ToolResult:
1612 | """
1613 | Executes the 'awk' command for pattern scanning and field-based text processing within the secure workspace.
1614 |
1615 | Input Handling:
1616 | - `input_data`: Provide text directly via stdin. Omit `input_file` or set to `False`. Do *not* include filename in `args_str`. Max size controlled by MCP_TEXT_MAX_INPUT.
1617 | - `input_file=True`: Indicates `args_str` contains target file path(s). Omit `input_data` or set to `False`. Path(s) must be relative to the workspace and specified in `args_str`. Example: `args_str="'{print $1}' path/data.log"`
1618 |
1619 | Common `awk` Arguments (include in `args_str`, use workspace-relative paths):
1620 | `'program'`: The AWK script (e.g., `'{ print $1, $3 }'`).
1621 | `filename(s)`: Workspace-relative input filename(s).
1622 | `-F fs`: Define input field separator (e.g., `-F ','`).
1623 | `-v var=value`: Assign variable.
1624 |
1625 | Security: Enforces workspace boundary `"{get_workspace_dir()}"`. Blocks unsafe flags (`-i`/`--in-place` gawk), shell characters, absolute paths, traversal. Limited resources.
1626 |
1627 | Args:
1628 | args_str: Command-line arguments for `awk` (script, flags, workspace-relative paths).
1629 | input_data: String data to pipe to `awk` via stdin. Omit/False for `input_file`.
1630 | input_file: Set to True if `args_str` includes target file path(s). Omit/False for `input_data`.
1631 | timeout: Max execution time in seconds.
1632 | dry_run: If True, validate args and return command line without execution.
1633 |
1634 | Returns:
1635 | ToolResult: Dictionary with results. `success` is True only if exit code is 0.
1636 |
1637 | Raises:
1638 | ToolInputError: For invalid arguments, security violations, or incorrect input mode usage.
1639 | ToolExecutionError: If `awk` is not found, times out, fails version/checksum checks, or fails unexpectedly.
1640 | """
1641 | _require_single_source("awk", input_data=input_data, input_file=input_file, input_dir=False)
1642 | return await _run_command_secure(
1643 | "awk",
1644 | args_str,
1645 | input_data=input_data,
1646 | is_file_target=input_file,
1647 | is_dir_target=False, # awk typically doesn't take dir targets directly
1648 | timeout=timeout,
1649 | dry_run=dry_run,
1650 | )
1651 |
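# Illustrative usage (stdin mode; the CSV payload is an assumed example):
#   out = await run_awk("-F ',' '{print $2}'", input_data="a,b\nc,d\n")
#   out["stdout"]  # -> "b\nd\n" when awk exits 0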
1652 |
1653 | # --- run_sed ---
1654 | @with_tool_metrics
1655 | @with_error_handling
1656 | async def run_sed(
1657 | args_str: str,
1658 | *,
1659 | input_data: Optional[str] = None,
1660 | input_file: Optional[bool] = False, # Default to False for clarity
1661 | timeout: float = DEFAULT_TIMEOUT,
1662 | dry_run: bool = False,
1663 | ) -> ToolResult:
1664 | """
1665 | Executes the 'sed' (Stream Editor) command for line-by-line text transformations within the secure workspace.
1666 |
1667 | Performs substitutions, deletions, insertions based on patterns. **In-place editing (`-i`) is disabled.**
1668 |
1669 | Input Handling:
1670 | - `input_data`: Provide text directly via stdin. Omit `input_file` or set to `False`. Do *not* include filename in `args_str`. Max size controlled by MCP_TEXT_MAX_INPUT.
1671 | - `input_file=True`: Indicates `args_str` contains target file path. Omit `input_data` or set to `False`. Path must be relative to the workspace and specified in `args_str`. Example: `args_str="'s/ERROR/WARN/g' path/app.log"`
1672 |
1673 | Common `sed` Arguments (include in `args_str`, use workspace-relative paths):
1674 | `'script'`: The `sed` script or command (e.g., `'s/foo/bar/g'`, `'/^DEBUG/d'`).
1675 | `filename`: Workspace-relative input filename.
1676 | `-e script`: Add multiple scripts.
1677 | `-f script-file`: Read commands from a workspace-relative file.
1678 | `-n`: Suppress automatic printing.
1679 | `-E` or `-r`: Use extended regular expressions.
1680 |
1681 | Security: Enforces workspace boundary `"{get_workspace_dir()}"`. Blocks `-i` flag, shell characters, absolute paths, traversal. Limited resources.
1682 |
1683 | Args:
1684 | args_str: Command-line arguments for `sed` (script, flags, workspace-relative path).
1685 | input_data: String data to pipe to `sed` via stdin. Omit/False for `input_file`.
1686 | input_file: Set to True if `args_str` includes target file path. Omit/False for `input_data`.
1687 | timeout: Max execution time in seconds.
1688 | dry_run: If True, validate args and return command line without execution.
1689 |
1690 | Returns:
1691 | ToolResult: Dictionary with results. `success` is True only if exit code is 0.
1692 |
1693 | Raises:
1694 | ToolInputError: For invalid arguments, security violations (like using -i), or incorrect input mode usage.
1695 | ToolExecutionError: If `sed` is not found, times out, fails version/checksum checks, or fails unexpectedly.
1696 | """
1697 | _require_single_source("sed", input_data=input_data, input_file=input_file, input_dir=False)
1698 | return await _run_command_secure(
1699 | "sed",
1700 | args_str,
1701 | input_data=input_data,
1702 | is_file_target=input_file,
1703 | is_dir_target=False, # sed operates on files or stdin
1704 | timeout=timeout,
1705 | dry_run=dry_run,
1706 | )
1707 |
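# Illustrative usage (file mode; 'path/app.log' is an assumed workspace-relative file):
#   out = await run_sed("-n 's/ERROR/WARN/gp' path/app.log", input_file=True)
#   out["stdout"]  # substituted lines only, since -n suppresses default printing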
1708 |
1709 | # --- run_jq ---
1710 | @with_tool_metrics
1711 | @with_error_handling
1712 | async def run_jq(
1713 | args_str: str,
1714 | *,
1715 | input_data: Optional[str] = None,
1716 | input_file: Optional[bool] = False, # Default to False for clarity
1717 | timeout: float = DEFAULT_TIMEOUT,
1718 | dry_run: bool = False,
1719 | ) -> ToolResult:
1720 | """
1721 | Executes the 'jq' command for querying, filtering, and transforming JSON data within the secure workspace.
1722 |
1723 | Essential for extracting values, filtering arrays/objects, or restructuring JSON
1724 | provided via stdin (`input_data`) or from a file (`input_file=True`).
1725 |
1726 | Input Handling:
1727 | - `input_data`: Provide valid JSON text directly via stdin. Omit `input_file` or set to `False`. Do *not* include filename in `args_str`. Max size controlled by MCP_TEXT_MAX_INPUT.
1728 | - `input_file=True`: Indicates `args_str` contains target JSON file path. Omit `input_data` or set to `False`. Path must be relative to the workspace and specified in `args_str`. Example: `args_str="'.items[].name' data.json"`
1729 |
1730 | Common `jq` Arguments (include in `args_str`, use workspace-relative paths):
1731 | `'filter'`: The `jq` filter expression (e.g., `'.users[] | select(.active==true)'`).
1732 | `filename`: Workspace-relative input JSON filename.
1733 | `-c`: Compact output.
1734 | `-r`: Raw string output (no JSON quotes).
1735 | `-s`: Slurp mode (read input stream into an array).
1736 | `--arg name value`: Define string variable.
1737 | `--argjson name json_value`: Define JSON variable.
1738 |
1739 | Security: Enforces workspace boundary `"{get_workspace_dir()}"`. Blocks unsafe flags, shell characters, absolute paths, traversal. Limited resources. Validates `input_data` is JSON before execution.
1740 |
1741 | Args:
1742 | args_str: Command-line arguments for `jq` (filter, flags, workspace-relative path).
1743 | input_data: String containing valid JSON data to pipe to `jq` via stdin. Omit/False for `input_file`.
1744 | input_file: Set to True if `args_str` includes target JSON file path. Omit/False for `input_data`.
1745 | timeout: Max execution time in seconds.
1746 | dry_run: If True, validate args and return command line without execution.
1747 |
1748 | Returns:
1749 | ToolResult: Dictionary with results. `success` is True only if exit code is 0.
1750 |
1751 | Raises:
1752 | ToolInputError: If `input_data` is not valid JSON, arguments are invalid, security violations occur, or incorrect input mode usage.
1753 | ToolExecutionError: If `jq` is not found, times out, fails version/checksum checks, or fails unexpectedly.
1754 | """
1755 | _require_single_source("jq", input_data=input_data, input_file=input_file, input_dir=False)
1756 | # Extra check for jq: validate input_data is JSON before running
1757 | if input_data is not None and not _is_json_or_json_lines(input_data):
1758 | raise ToolInputError(
1759 | "input_data is not valid JSON or JSON-Lines",
1760 | param_name="input_data",
1761 | details={"error_code": ToolErrorCode.INVALID_JSON_INPUT.value},
1762 | )
1763 |
1764 | return await _run_command_secure(
1765 | "jq",
1766 | args_str,
1767 | input_data=input_data,
1768 | is_file_target=input_file,
1769 | is_dir_target=False, # jq operates on files or stdin
1770 | timeout=timeout,
1771 | dry_run=dry_run,
1772 | )
1773 |
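# Illustrative usage (stdin mode with raw output; the JSON payload is an assumed example):
#   out = await run_jq("-r '.items[].name'", input_data='{"items": [{"name": "a"}, {"name": "b"}]}')
#   out["stdout"]  # -> "a\nb\n" when jq exits 0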
1774 |
1775 | # --------------------------------------------------------------------------- #
1776 | # Streaming Core Executor
1777 | # --------------------------------------------------------------------------- #
1778 |
1779 |
1780 | async def _run_command_stream(
1781 | cmd_name: str,
1782 | args_str: str,
1783 | *,
1784 | input_data: Optional[str] = None,
1785 | is_file_target: bool = False,
1786 | is_dir_target: bool = False,
1787 | timeout: float = DEFAULT_TIMEOUT,
1788 | ) -> AsyncIterator[str]:
1789 | """Securely executes a command and streams its stdout line by line."""
1790 |
1791 | # 1. Get Command Metadata and Path
1792 | meta = _COMMAND_METADATA.get(cmd_name)
1793 | if not meta or not meta.path:
1794 | raise ToolExecutionError(
1795 | f"Command '{cmd_name}' is not available or configured.",
1796 | error_code=ToolErrorCode.CMD_NOT_FOUND,
1797 | )
1798 |
1799 | # 2. Version & Checksum Checks (Same as non-streaming)
1800 | await _check_command_version(meta)
1801 | await _get_command_checksum(meta) # Verification happens here
1802 |
1803 | # 3. Parse & Validate Arguments
1804 | try:
1805 | argv: List[str] = shlex.split(args_str, posix=True)
1806 | _validate_arguments(cmd_name, argv)
1807 | except ValueError as exc:
1808 | raise ToolInputError(
1809 | "Invalid quoting or escaping in args_str.",
1810 | param_name="args_str",
1811 | details={"error_code": ToolErrorCode.INVALID_ARGS.value},
1812 | ) from exc
1813 | except ToolInputError as e:
1814 | raise ToolInputError(
1815 | f"Argument validation failed for '{cmd_name}' stream: {e}",
1816 | param_name="args_str",
1817 | details=e.details,
1818 | ) from e
1819 |
1820 | # 4. Prepare command line and logging
1821 | cmd_path_str = str(meta.path)
1822 | cmdline: List[str] = [cmd_path_str, *argv]
1823 | redacted_argv = [
1824 | re.sub(r"(?i)(--?(?:password|token|key|secret)=)\S+", r"\1********", arg) for arg in argv
1825 | ]
1826 | log_payload = {
1827 | "event": "execute_local_text_tool_stream",
1828 | "command": cmd_name,
1829 | "args": redacted_argv,
1830 | "input_source": "stdin"
1831 | if input_data is not None
1832 | else ("file" if is_file_target else ("dir" if is_dir_target else "args_only")),
1833 | "timeout_s": timeout,
1834 | }
1835 | logger.info(json.dumps(log_payload), extra={"json_fields": log_payload})
1836 |
1837 | # 5. Resource limits & Env (Same as non-streaming)
1838 | # As in the non-streaming executor: None on Windows, command-specific limits on POSIX.
1839 | preexec_fn_to_use = None if sys.platform == "win32" else (lambda: _limit_resources(timeout, cmd_name=cmd_name))
1840 |
1841 | safe_env = {"PATH": os.getenv("PATH", "/usr/bin:/bin:/usr/local/bin")}
1842 | for safe_var in ["LANG", "LC_ALL", "LC_CTYPE", "LC_MESSAGES", "LC_COLLATE"]:
1843 | if safe_var in os.environ:
1844 | safe_env[safe_var] = os.environ[safe_var]
1845 | if "HOME" in os.environ:
1846 | safe_env["HOME"] = os.environ["HOME"]
1847 | if "TMPDIR" in os.environ:
1848 | safe_env["TMPDIR"] = os.environ["TMPDIR"]
1849 | elif os.getenv("TEMP"):
1850 | safe_env["TEMP"] = os.getenv("TEMP") # type: ignore[assignment]
1851 |
1852 | input_bytes: Optional[bytes] = None
1853 | if input_data is not None:
1854 | input_bytes = input_data.encode("utf-8", errors="ignore")
1855 | # Check size *before* starting process
1856 | if len(input_bytes) > MAX_INPUT_BYTES:
1857 | raise ToolInputError(
1858 | f"input_data exceeds maximum allowed size of {MAX_INPUT_BYTES / (1024 * 1024):.1f} MB for streaming.",
1859 | param_name="input_data",
1860 | details={
1861 | "limit_bytes": MAX_INPUT_BYTES,
1862 | "actual_bytes": len(input_bytes),
1863 | "error_code": ToolErrorCode.INPUT_TOO_LARGE.value,
1864 | },
1865 | )
1866 |
1867 | # 6. Launch Subprocess and Stream Output
1868 | proc: Optional[asyncio.subprocess.Process] = None
1869 | stderr_lines: List[str] = [] # Collect stderr lines
1870 | start_time = time.monotonic()
1871 |
1872 | try:
1873 | proc = await asyncio.create_subprocess_exec(
1874 | *cmdline,
1875 | stdin=asyncio.subprocess.PIPE
1876 | if input_bytes is not None
1877 | else asyncio.subprocess.DEVNULL,
1878 | stdout=asyncio.subprocess.PIPE,
1879 | stderr=asyncio.subprocess.PIPE, # Capture stderr
1880 | env=safe_env,
1881 | limit=MAX_OUTPUT_BYTES * 2, # Apply buffer limit
1882 | preexec_fn=preexec_fn_to_use,
1883 | )
1884 |
1885 | # --- Define Helper Coroutines for IO ---
1886 | async def write_stdin_task(proc: asyncio.subprocess.Process) -> None:
1887 | """Writes input data to stdin if provided and closes stdin."""
1888 | if input_bytes is not None and proc.stdin:
1889 | try:
1890 | proc.stdin.write(input_bytes)
1891 | await proc.stdin.drain()
1892 | except (BrokenPipeError, ConnectionResetError) as e:
1893 | logger.warning(
1894 | f"Stream: Error writing to stdin for '{cmd_name}' (pid {proc.pid}): {e}. Process might have exited."
1895 | )
1896 | except Exception as e:
1897 | logger.error(
1898 | f"Stream: Unexpected error writing to stdin for '{cmd_name}' (pid {proc.pid}): {e}",
1899 | exc_info=True,
1900 | )
1901 | finally:
1902 | if proc.stdin:
1903 | try:
1904 | proc.stdin.close()
1905 | except Exception:
1906 | pass # Ignore errors closing stdin already closed/broken
1907 | elif proc.stdin:
1908 | # Close stdin immediately if no input_data
1909 | try:
1910 | proc.stdin.close()
1911 | except Exception:
1912 | pass
1913 |
1914 | async def read_stderr_task(proc: asyncio.subprocess.Process, lines_list: List[str]) -> None:
1915 | """Reads stderr lines into the provided list."""
1916 | if not proc.stderr:
1917 | return
1918 | stderr_bytes_read = 0
1919 | while True:
1920 | try:
1921 | # readline() can block if the process hangs mid-write; the overall timeout handling below bounds the run
1922 | line_bytes = await proc.stderr.readline()
1923 | if not line_bytes:
1924 | break # End of stream
1925 | stderr_bytes_read += len(line_bytes)
1926 | # Basic truncation within stderr collection to prevent memory issues
1927 | if stderr_bytes_read > MAX_OUTPUT_BYTES * 1.1: # Allow slightly more for marker
1928 | if not lines_list or not lines_list[-1].endswith("...(stderr truncated)"):
1929 | lines_list.append("...(stderr truncated)")
1930 | continue # Stop appending lines but keep reading to drain pipe
1931 | lines_list.append(line_bytes.decode("utf-8", errors="replace"))
1932 | except Exception as e:
1933 | logger.warning(
1934 | f"Stream: Error reading stderr line for '{cmd_name}' (pid {proc.pid}): {e}"
1935 | )
1936 | lines_list.append(f"##STREAM_STDERR_READ_ERROR: {e}\n")
1937 | break # Stop reading stderr on error
1938 |
1939 | # --- Create IO Tasks ---
1940 | stdin_writer = asyncio.create_task(write_stdin_task(proc))
1941 | stderr_reader = asyncio.create_task(read_stderr_task(proc, stderr_lines))
1942 |
1943 | # --- Yield stdout lines (Inlined async generator logic) ---
1944 | stdout_line_count = 0
1945 | stdout_bytes_read = 0
1946 | stdout_truncated = False
1947 | if proc.stdout:
1948 | while True:
1949 | # Check timeout explicitly within the loop
1950 | if time.monotonic() - start_time > timeout:
1951 | raise asyncio.TimeoutError()
1952 |
1953 | try:
1954 | # readline() has no per-call timeout here; the explicit elapsed-time
1955 | # check above and the wait_for on proc.wait() below bound any hang.
1956 | line_bytes = await proc.stdout.readline()
1957 | if not line_bytes:
1958 | break # End of stdout stream
1959 |
1960 | stdout_bytes_read += len(line_bytes)
1961 | if stdout_bytes_read > MAX_OUTPUT_BYTES:
1962 | if not stdout_truncated: # Add marker only once
1963 | yield "...(stdout truncated)\n"
1964 | stdout_truncated = True
1965 | continue # Stop yielding lines but keep reading to drain pipe
1966 |
1967 | if not stdout_truncated:
1968 | # Decode and yield the line
1969 | yield line_bytes.decode("utf-8", errors="replace")
1970 | stdout_line_count += 1
1971 |
1972 | except asyncio.TimeoutError as e: # Catch timeout during readline if applied
1973 | raise asyncio.TimeoutError() from e # Re-raise to be caught by outer handler
1974 | except Exception as e:
1975 | logger.warning(
1976 | f"Stream: Error reading stdout line for '{cmd_name}' (pid {proc.pid}): {e}"
1977 | )
1978 | # Yield an error marker in the stream
1979 | yield f"##STREAM_STDOUT_READ_ERROR: {e}\n"
1980 | break # Stop reading stdout on error
1981 | else:
1982 | logger.warning(
1983 | f"Stream: Process for '{cmd_name}' has no stdout stream.",
1984 | extra={"command": cmd_name},
1985 | )
1986 |
1987 | # --- Wait for process and stderr/stdin tasks to complete ---
1988 | # Wait for the process itself, applying the main timeout
1989 | try:
1990 | # Wait slightly longer than the timeout to allow for cleanup/exit signals
1991 | exit_code = await asyncio.wait_for(proc.wait(), timeout=timeout + 5.0)
1992 | except asyncio.TimeoutError as e:
1993 | # If proc.wait() times out, it means the process didn't exit within timeout+5s
1994 | # Re-raise the specific timeout error defined earlier
1995 | raise ToolExecutionError(
1996 | f"'{cmd_name}' stream process failed to exit within timeout of {timeout}s (+5s buffer).",
1997 | error_code=ToolErrorCode.TIMEOUT,
1998 | details={"timeout": timeout},
1999 | ) from e
2000 |
2001 | # Ensure IO tasks have finished (they should have if process exited, but wait briefly)
2002 | try:
2003 | await asyncio.wait_for(stdin_writer, timeout=1.0)
2004 | await asyncio.wait_for(stderr_reader, timeout=1.0)
2005 | except asyncio.TimeoutError:
2006 | logger.warning(
2007 | f"Stream: IO tasks for '{cmd_name}' (pid {proc.pid}) did not complete quickly after process exit."
2008 | )
2009 |
2010 | # --- Check final status ---
2011 | duration = time.monotonic() - start_time
2012 | retcode_ok_map = RETCODE_OK.get(cmd_name, {0})
2013 | success = exit_code in retcode_ok_map
2014 | stderr_full = "".join(stderr_lines)
2015 |
2016 | if not success:
2017 | stderr_snip = textwrap.shorten(stderr_full.strip(), 150, placeholder="...")
2018 | error_msg = f"Command '{cmd_name}' stream finished with failure exit code {exit_code}. Stderr: '{stderr_snip}'"
2019 | logger.warning(
2020 | error_msg,
2021 | extra={"command": cmd_name, "exit_code": exit_code, "stderr": stderr_full},
2022 | )
2023 | # Raise error *after* iteration completes to signal failure to the caller
2024 | raise ToolExecutionError(
2025 | error_msg,
2026 | error_code=ToolErrorCode.EXEC_ERROR,
2027 | details={"exit_code": exit_code, "stderr": stderr_full},
2028 | )
2029 | else:
2030 | logger.info(
2031 | f"Stream: '{cmd_name}' (pid {proc.pid}) finished successfully in {duration:.3f}s (code={exit_code}, stdout_lines={stdout_line_count}, truncated={stdout_truncated})"
2032 | )
2033 |
2034 | except asyncio.TimeoutError as e:
2035 | # This catches the explicit timeout check within the stdout loop or the wait_for on proc.wait()
2036 | logger.warning(
2037 | f"Command '{cmd_name}' stream timed out after ~{timeout}s. Terminating.",
2038 | extra={"command": cmd_name, "timeout": timeout},
2039 | )
2040 | if proc and proc.returncode is None:
2041 | try:
2042 | proc.terminate()
2043 | await asyncio.wait_for(proc.wait(), timeout=1.0)
2044 | except asyncio.TimeoutError:
2045 | logger.warning(f"Killing unresponsive stream process {proc.pid}")
2046 | proc.kill()
2047 | await proc.wait()
2048 | except ProcessLookupError:
2049 | pass
2050 | except Exception as term_err:
2051 | logger.error(f"Error killing stream process {proc.pid}: {term_err}")
2052 | # Raise the standard execution error for timeout
2053 | raise ToolExecutionError(
2054 | f"'{cmd_name}' stream exceeded timeout of {timeout}s.",
2055 | error_code=ToolErrorCode.TIMEOUT,
2056 | details={"timeout": timeout},
2057 | ) from e
2058 | except (ToolInputError, ToolExecutionError) as e:
2059 | # Propagate specific errors raised during setup or validation
2060 | raise e
2061 | except FileNotFoundError as e:
2062 | logger.error(
2063 | f"Stream: Command '{cmd_name}' not found at path: {cmdline[0]}. Ensure it's installed and in PATH.",
2064 | exc_info=True,
2065 | )
2066 | raise ToolExecutionError(
2067 | f"Stream: Command '{cmd_name}' executable not found.",
2068 | error_code=ToolErrorCode.CMD_NOT_FOUND,
2069 | ) from e
2070 | except PermissionError as e:
2071 | logger.error(
2072 | f"Stream: Permission denied executing command '{cmd_name}' at path: {cmdline[0]}.",
2073 | exc_info=True,
2074 | )
2075 | raise ToolExecutionError(
2076 | f"Stream: Permission denied executing '{cmd_name}'. Check file permissions.",
2077 | error_code=ToolErrorCode.EXEC_ERROR,
2078 | ) from e
2079 | except Exception as e:
2080 | logger.error(
2081 | f"Stream: Unexpected error during command stream '{cmd_name}': {e}", exc_info=True
2082 | )
2083 | raise ToolExecutionError(
2084 | f"Unexpected failure during '{cmd_name}' stream: {e}",
2085 | error_code=ToolErrorCode.UNEXPECTED_FAILURE,
2086 | ) from e
2087 | finally:
2088 | # Final cleanup check for the process in case of unexpected exit from the try block
2089 | if proc and proc.returncode is None:
2090 | logger.warning(
2091 | f"Stream: Process {proc.pid} for '{cmd_name}' still running after stream processing finished unexpectedly, killing."
2092 | )
2093 | try:
2094 | proc.kill()
2095 | await proc.wait()
2096 | except ProcessLookupError:
2097 | pass
2098 | except Exception as final_kill_err:
2099 | logger.error(
2100 | f"Stream: Error killing process {proc.pid} in finally block: {final_kill_err}"
2101 | )
2102 |
2103 |
2104 | # --- run_ripgrep_stream ---
2105 | @with_tool_metrics
2106 | @with_error_handling
2107 | async def run_ripgrep_stream(
2108 | args_str: str,
2109 | *,
2110 | input_data: Optional[str] = None,
2111 | input_file: Optional[bool] = False, # Default False
2112 | input_dir: Optional[bool] = False, # Default False
2113 | timeout: float = DEFAULT_TIMEOUT,
2114 | ) -> AsyncIterator[str]:
2115 | """
2116 | Executes 'rg' and streams stdout lines asynchronously. Useful for large outputs.
2117 |
2118 | See `run_ripgrep` for detailed argument descriptions and security notes.
2119 | This variant yields each line of standard output as it becomes available.
2120 |
2121 | **Note:** The final success status and stderr are not part of the yielded stream.
2122 | Errors during execution (e.g., timeout, non-zero exit code other than 1 for 'no match')
2123 | will raise a `ToolExecutionError` *after* the stream iteration completes
2124 | (or immediately if setup fails). Wrap the iteration in `try...except ToolExecutionError` to observe these failures.
2125 |
2126 | Args:
2127 | args_str: Command-line arguments for `rg`.
2128 | input_data: String data to pipe via stdin. Omit/False for `input_file`/`input_dir`.
2129 | input_file: Set True if `args_str` includes target file path(s). Omit/False for `input_data`/`input_dir`.
2130 | input_dir: Set True if `args_str` includes target directory path(s). Omit/False for `input_data`/`input_file`.
2131 | timeout: Max execution time in seconds for the entire operation.
2132 |
2133 | Yields:
2134 | str: Each line of standard output from the `rg` command.
2135 |
2136 | Raises:
2137 | ToolInputError: For invalid arguments or security violations during setup.
2138 | ToolExecutionError: If `rg` fails execution (timeout, bad exit code > 1),
2139 | or if errors occur during streaming I/O. Raised *after* iteration.
2140 | """
2141 | _require_single_source(
2142 | "rg (stream)", input_data=input_data, input_file=input_file, input_dir=input_dir
2143 | )
2144 | # Use the streaming executor
2145 | async for line in _run_command_stream(
2146 | "rg",
2147 | args_str,
2148 | input_data=input_data,
2149 | is_file_target=input_file,
2150 | is_dir_target=input_dir,
2151 | timeout=timeout,
2152 | ):
2153 | yield line
2154 |
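# Illustrative consumption pattern (failures surface *after* iteration, per the note above;
# handle() is a hypothetical per-line consumer):
#   try:
#       async for line in run_ripgrep_stream("'ERROR' logs/", input_dir=True):
#           handle(line)
#   except ToolExecutionError:
#       ...  # timeout or failure exit code (rg's exit 1 "no matches" still streams as success)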
2155 |
2156 | # --- run_awk_stream ---
2157 | @with_tool_metrics
2158 | @with_error_handling
2159 | async def run_awk_stream(
2160 | args_str: str,
2161 | *,
2162 | input_data: Optional[str] = None,
2163 | input_file: Optional[bool] = False, # Default False
2164 | timeout: float = DEFAULT_TIMEOUT,
2165 | ) -> AsyncIterator[str]:
2166 | """
2167 | Executes 'awk' and streams stdout lines asynchronously.
2168 |
2169 | See `run_awk` for detailed argument descriptions and security notes.
2170 | Yields each line of standard output as it becomes available.
2171 |
2172 | Args:
2173 | args_str: Command-line arguments for `awk`.
2174 | input_data: String data to pipe via stdin. Omit/False for `input_file`.
2175 | input_file: Set True if `args_str` includes target file path(s). Omit/False for `input_data`.
2176 | timeout: Max execution time in seconds.
2177 |
2178 | Yields:
2179 | str: Each line of standard output from the `awk` command.
2180 |
2181 | Raises:
2182 | ToolInputError: For invalid arguments or security violations.
2183 | ToolExecutionError: If `awk` fails (non-zero exit), times out, or errors during streaming. Raised *after* iteration.
2184 | """
2185 | _require_single_source(
2186 | "awk (stream)", input_data=input_data, input_file=input_file, input_dir=False
2187 | )
2188 | async for line in _run_command_stream(
2189 | "awk",
2190 | args_str,
2191 | input_data=input_data,
2192 | is_file_target=input_file,
2193 | is_dir_target=False,
2194 | timeout=timeout,
2195 | ):
2196 | yield line
2197 |
2198 |
2199 | # --- run_sed_stream ---
2200 | @with_tool_metrics
2201 | @with_error_handling
2202 | async def run_sed_stream(
2203 | args_str: str,
2204 | *,
2205 | input_data: Optional[str] = None,
2206 | input_file: Optional[bool] = False, # Default False
2207 | timeout: float = DEFAULT_TIMEOUT,
2208 | ) -> AsyncIterator[str]:
2209 | """
2210 | Executes 'sed' and streams stdout lines asynchronously.
2211 |
2212 | See `run_sed` for detailed argument descriptions and security notes.
2213 | Yields each line of standard output as it becomes available.
2214 |
2215 | Args:
2216 | args_str: Command-line arguments for `sed`.
2217 | input_data: String data to pipe via stdin. Omit/False for `input_file`.
2218 | input_file: Set True if `args_str` includes target file path. Omit/False for `input_data`.
2219 | timeout: Max execution time in seconds.
2220 |
2221 | Yields:
2222 | str: Each line of standard output from the `sed` command.
2223 |
2224 | Raises:
2225 | ToolInputError: For invalid arguments or security violations.
2226 | ToolExecutionError: If `sed` fails (non-zero exit), times out, or errors during streaming. Raised *after* iteration.
2227 | """
2228 | _require_single_source(
2229 | "sed (stream)", input_data=input_data, input_file=input_file, input_dir=False
2230 | )
2231 | async for line in _run_command_stream(
2232 | "sed",
2233 | args_str,
2234 | input_data=input_data,
2235 | is_file_target=input_file,
2236 | is_dir_target=False,
2237 | timeout=timeout,
2238 | ):
2239 | yield line
2240 |
2241 |
2242 | # --- run_jq_stream ---
2243 | @with_tool_metrics
2244 | @with_error_handling
2245 | async def run_jq_stream(
2246 | args_str: str,
2247 | *,
2248 | input_data: Optional[str] = None,
2249 | input_file: Optional[bool] = False, # Default False
2250 | timeout: float = DEFAULT_TIMEOUT,
2251 | ) -> AsyncIterator[str]:
2252 | """
2253 | Executes 'jq' and streams stdout lines asynchronously.
2254 |
2255 | See `run_jq` for detailed argument descriptions and security notes.
2256 | Yields each line of standard output as it becomes available. With `-c`, each
2257 | line is a complete JSON value; without it, a single value may span many lines.
2258 |
2259 | Args:
2260 | args_str: Command-line arguments for `jq`.
2261 | input_data: String containing valid JSON data to pipe via stdin. Omit/False for `input_file`.
2262 | input_file: Set True if `args_str` includes target JSON file path. Omit/False for `input_data`.
2263 | timeout: Max execution time in seconds.
2264 |
2265 | Yields:
2266 | str: Each line of standard output from the `jq` command.
2267 |
2268 | Raises:
2269 | ToolInputError: If `input_data` is not valid JSON, arguments are invalid, or security violations occur.
2270 | ToolExecutionError: If `jq` fails (non-zero exit), times out, or errors during streaming. Raised *after* iteration.
2271 | """
2272 | _require_single_source(
2273 | "jq (stream)", input_data=input_data, input_file=input_file, input_dir=False
2274 | )
2275 | # Validate input_data is JSON if provided, before starting process
2276 | if input_data is not None and not _is_json_or_json_lines(input_data):
2277 | raise ToolInputError(
2278 | "input_data is not valid JSON or JSON-Lines",
2279 | param_name="input_data",
2280 | details={"error_code": ToolErrorCode.INVALID_JSON_INPUT.value},
2281 | )
2282 |
2283 | async for line in _run_command_stream(
2284 | "jq",
2285 | args_str,
2286 | input_data=input_data,
2287 | is_file_target=input_file,
2288 | is_dir_target=False,
2289 | timeout=timeout,
2290 | ):
2291 | yield line
2292 |
2293 |
2294 | # --------------------------------------------------------------------------- #
2295 | # Public API Exports
2296 | # --------------------------------------------------------------------------- #
2297 |
2298 |
2299 | def get_workspace_dir() -> str:
2300 | """Return the absolute workspace directory path enforced by this module."""
2301 | return str(WORKSPACE_DIR)
2302 |
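# Illustrative pre-flight check (a sketch; assumes `from pathlib import Path` at module top):
#   ws = Path(get_workspace_dir())
#   (ws / "logs" / "app.log").exists()  # verify before passing "logs/app.log" in args_str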
2303 |
2304 | # Add command metadata access if needed?
2305 | # def get_command_meta(cmd_name: str) -> Optional[CommandMeta]:
2306 | # """Returns the discovered metadata for a command, if available."""
2307 | # return _COMMAND_METADATA.get(cmd_name)
2308 |
2309 |
2310 | __all__ = [
2311 | # Standard execution
2312 | "run_ripgrep",
2313 | "run_awk",
2314 | "run_sed",
2315 | "run_jq",
2316 | # Streaming variants
2317 | "run_ripgrep_stream",
2318 | "run_awk_stream",
2319 | "run_sed_stream",
2320 | "run_jq_stream",
2321 | # Configuration info
2322 | "get_workspace_dir",
2323 | # Types (for consumers)
2324 | "ToolResult",
2325 | "ToolErrorCode",
2326 | # Maybe export CommandMeta if useful externally?
2327 | # "CommandMeta",
2328 | ]
2329 |
```