This is page 45 of 45. Use http://codebase.md/dicklesworthstone/llm_gateway_mcp_server?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .cursorignore
├── .env.example
├── .envrc
├── .gitignore
├── additional_features.md
├── check_api_keys.py
├── completion_support.py
├── comprehensive_test.py
├── docker-compose.yml
├── Dockerfile
├── empirically_measured_model_speeds.json
├── error_handling.py
├── example_structured_tool.py
├── examples
│ ├── __init__.py
│ ├── advanced_agent_flows_using_unified_memory_system_demo.py
│ ├── advanced_extraction_demo.py
│ ├── advanced_unified_memory_system_demo.py
│ ├── advanced_vector_search_demo.py
│ ├── analytics_reporting_demo.py
│ ├── audio_transcription_demo.py
│ ├── basic_completion_demo.py
│ ├── cache_demo.py
│ ├── claude_integration_demo.py
│ ├── compare_synthesize_demo.py
│ ├── cost_optimization.py
│ ├── data
│ │ ├── sample_event.txt
│ │ ├── Steve_Jobs_Introducing_The_iPhone_compressed.md
│ │ └── Steve_Jobs_Introducing_The_iPhone_compressed.mp3
│ ├── docstring_refiner_demo.py
│ ├── document_conversion_and_processing_demo.py
│ ├── entity_relation_graph_demo.py
│ ├── filesystem_operations_demo.py
│ ├── grok_integration_demo.py
│ ├── local_text_tools_demo.py
│ ├── marqo_fused_search_demo.py
│ ├── measure_model_speeds.py
│ ├── meta_api_demo.py
│ ├── multi_provider_demo.py
│ ├── ollama_integration_demo.py
│ ├── prompt_templates_demo.py
│ ├── python_sandbox_demo.py
│ ├── rag_example.py
│ ├── research_workflow_demo.py
│ ├── sample
│ │ ├── article.txt
│ │ ├── backprop_paper.pdf
│ │ ├── buffett.pdf
│ │ ├── contract_link.txt
│ │ ├── legal_contract.txt
│ │ ├── medical_case.txt
│ │ ├── northwind.db
│ │ ├── research_paper.txt
│ │ ├── sample_data.json
│ │ └── text_classification_samples
│ │   ├── email_classification.txt
│ │   ├── news_samples.txt
│ │   ├── product_reviews.txt
│ │   └── support_tickets.txt
│ ├── sample_docs
│ │ └── downloaded
│ │   └── attention_is_all_you_need.pdf
│ ├── sentiment_analysis_demo.py
│ ├── simple_completion_demo.py
│ ├── single_shot_synthesis_demo.py
│ ├── smart_browser_demo.py
│ ├── sql_database_demo.py
│ ├── sse_client_demo.py
│ ├── test_code_extraction.py
│ ├── test_content_detection.py
│ ├── test_ollama.py
│ ├── text_classification_demo.py
│ ├── text_redline_demo.py
│ ├── tool_composition_examples.py
│ ├── tournament_code_demo.py
│ ├── tournament_text_demo.py
│ ├── unified_memory_system_demo.py
│ ├── vector_search_demo.py
│ ├── web_automation_instruction_packs.py
│ └── workflow_delegation_demo.py
├── LICENSE
├── list_models.py
├── marqo_index_config.json.example
├── mcp_protocol_schema_2025-03-25_version.json
├── mcp_python_lib_docs.md
├── mcp_tool_context_estimator.py
├── model_preferences.py
├── pyproject.toml
├── quick_test.py
├── README.md
├── resource_annotations.py
├── run_all_demo_scripts_and_check_for_errors.py
├── storage
│ └── smart_browser_internal
│   ├── locator_cache.db
│   ├── readability.js
│   └── storage_state.enc
├── test_client.py
├── test_connection.py
├── TEST_README.md
├── test_sse_client.py
├── test_stdio_client.py
├── tests
│ ├── __init__.py
│ ├── conftest.py
│ ├── integration
│ │ ├── __init__.py
│ │ └── test_server.py
│ ├── manual
│ │ ├── test_extraction_advanced.py
│ │ └── test_extraction.py
│ └── unit
│   ├── __init__.py
│   ├── test_cache.py
│   ├── test_providers.py
│   └── test_tools.py
├── TODO.md
├── tool_annotations.py
├── tools_list.json
├── ultimate_mcp_banner.webp
├── ultimate_mcp_logo.webp
├── ultimate_mcp_server
│ ├── __init__.py
│ ├── __main__.py
│ ├── cli
│ │ ├── __init__.py
│ │ ├── __main__.py
│ │ ├── commands.py
│ │ ├── helpers.py
│ │ └── typer_cli.py
│ ├── clients
│ │ ├── __init__.py
│ │ ├── completion_client.py
│ │ └── rag_client.py
│ ├── config
│ │ └── examples
│ │   └── filesystem_config.yaml
│ ├── config.py
│ ├── constants.py
│ ├── core
│ │ ├── __init__.py
│ │ ├── evaluation
│ │ │ ├── base.py
│ │ │ └── evaluators.py
│ │ ├── providers
│ │ │ ├── __init__.py
│ │ │ ├── anthropic.py
│ │ │ ├── base.py
│ │ │ ├── deepseek.py
│ │ │ ├── gemini.py
│ │ │ ├── grok.py
│ │ │ ├── ollama.py
│ │ │ ├── openai.py
│ │ │ └── openrouter.py
│ │ ├── server.py
│ │ ├── state_store.py
│ │ ├── tournaments
│ │ │ ├── manager.py
│ │ │ ├── tasks.py
│ │ │ └── utils.py
│ │ └── ums_api
│ │   ├── __init__.py
│ │   ├── ums_database.py
│ │   ├── ums_endpoints.py
│ │   ├── ums_models.py
│ │   └── ums_services.py
│ ├── exceptions.py
│ ├── graceful_shutdown.py
│ ├── services
│ │ ├── __init__.py
│ │ ├── analytics
│ │ │ ├── __init__.py
│ │ │ ├── metrics.py
│ │ │ └── reporting.py
│ │ ├── cache
│ │ │ ├── __init__.py
│ │ │ ├── cache_service.py
│ │ │ ├── persistence.py
│ │ │ ├── strategies.py
│ │ │ └── utils.py
│ │ ├── cache.py
│ │ ├── document.py
│ │ ├── knowledge_base
│ │ │ ├── __init__.py
│ │ │ ├── feedback.py
│ │ │ ├── manager.py
│ │ │ ├── rag_engine.py
│ │ │ ├── retriever.py
│ │ │ └── utils.py
│ │ ├── prompts
│ │ │ ├── __init__.py
│ │ │ ├── repository.py
│ │ │ └── templates.py
│ │ ├── prompts.py
│ │ └── vector
│ │   ├── __init__.py
│ │   ├── embeddings.py
│ │   └── vector_service.py
│ ├── tool_token_counter.py
│ ├── tools
│ │ ├── __init__.py
│ │ ├── audio_transcription.py
│ │ ├── base.py
│ │ ├── completion.py
│ │ ├── docstring_refiner.py
│ │ ├── document_conversion_and_processing.py
│ │ ├── enhanced-ums-lookbook.html
│ │ ├── entity_relation_graph.py
│ │ ├── excel_spreadsheet_automation.py
│ │ ├── extraction.py
│ │ ├── filesystem.py
│ │ ├── html_to_markdown.py
│ │ ├── local_text_tools.py
│ │ ├── marqo_fused_search.py
│ │ ├── meta_api_tool.py
│ │ ├── ocr_tools.py
│ │ ├── optimization.py
│ │ ├── provider.py
│ │ ├── pyodide_boot_template.html
│ │ ├── python_sandbox.py
│ │ ├── rag.py
│ │ ├── redline-compiled.css
│ │ ├── sentiment_analysis.py
│ │ ├── single_shot_synthesis.py
│ │ ├── smart_browser.py
│ │ ├── sql_databases.py
│ │ ├── text_classification.py
│ │ ├── text_redline_tools.py
│ │ ├── tournament.py
│ │ ├── ums_explorer.html
│ │ └── unified_memory_system.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── async_utils.py
│ │ ├── display.py
│ │ ├── logging
│ │ │ ├── __init__.py
│ │ │ ├── console.py
│ │ │ ├── emojis.py
│ │ │ ├── formatter.py
│ │ │ ├── logger.py
│ │ │ ├── panels.py
│ │ │ ├── progress.py
│ │ │ └── themes.py
│ │ ├── parse_yaml.py
│ │ ├── parsing.py
│ │ ├── security.py
│ │ └── text.py
│ └── working_memory_api.py
├── unified_memory_system_technical_analysis.md
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/ultimate_mcp_server/tools/smart_browser.py:
--------------------------------------------------------------------------------
```python
1 | # ultimate_mcp_server/tools/smart_browser.py
2 | """
3 | Smart Browser - Standalone Playwright-powered web automation tools for Ultimate MCP Server.
4 |
5 | Provides enterprise-grade web automation with comprehensive features for scraping,
6 | testing, and browser automation tasks with built-in security, resilience, and ML capabilities.
7 |
8 | Refactored into standalone functions for compatibility with the MCP tool registration system.
9 | State and lifecycle are managed via global variables and explicit init/shutdown calls.
10 | """
11 |
12 | # Python Standard Library Imports
13 | import asyncio
14 | import atexit
15 | import base64
16 | import concurrent.futures
17 | import difflib
18 | import functools
19 | import hashlib
20 | import json
21 | import os
22 | import random
23 | import re
24 | import signal
25 | import sqlite3
26 | import subprocess
27 | import textwrap
28 | import threading
29 | import time
30 | import unicodedata
31 | import urllib.parse
32 |
33 | # Python Standard Library Type Hinting and Collections Imports
34 | from collections import deque
35 | from contextlib import asynccontextmanager, closing
36 | from datetime import datetime, timezone
37 | from pathlib import Path
38 | from typing import Any, Awaitable, Callable, Dict, List, Optional, Sequence, Set, Tuple, Union
39 | from urllib.parse import urlparse
40 |
41 | # Third-Party Library Imports
42 | import aiofiles
43 | import httpx
44 | from bs4 import BeautifulSoup
45 | from cryptography.exceptions import InvalidTag
46 | from cryptography.hazmat.primitives.ciphers.aead import AESGCM
47 | from playwright._impl._errors import Error as PlaywrightException
48 | from playwright._impl._errors import TimeoutError as PlaywrightTimeoutError
49 | from playwright.async_api import Browser, BrowserContext, Locator, Page, async_playwright
50 |
51 | # First-Party Library Imports (MCP Specific)
52 | from ultimate_mcp_server.config import SmartBrowserConfig, get_config
53 |
54 | # Assuming these are available and work standalone
55 | from ultimate_mcp_server.constants import Provider
56 | from ultimate_mcp_server.core.providers.base import get_provider, parse_model_string
57 | from ultimate_mcp_server.exceptions import ProviderError, ToolError, ToolInputError
58 |
59 | # Import STANDALONE filesystem and completion tools
60 | from ultimate_mcp_server.tools.completion import chat_completion
61 | from ultimate_mcp_server.tools.filesystem import (
62 | create_directory,
63 | get_unique_filepath,
64 | read_file,
65 | write_file,
66 | )
67 | from ultimate_mcp_server.utils import get_logger
68 |
69 | # For loop binding and forked process detection
70 | _pid = os.getpid()
71 |
72 | # --- Global Logger ---
73 | logger = get_logger("ultimate_mcp_server.tools.smart_browser")
74 |
75 | # --- Load External Tools Dynamically (Best Effort) ---
76 | # This allows using tools defined later without circular imports at top level
77 | # We'll look them up by name when needed in autopilot.
78 | _filesystem_tools_module = None
79 | _completion_tools_module = None
80 |
81 |
82 | def _get_filesystem_tool(name):
83 | global _filesystem_tools_module
84 | if _filesystem_tools_module is None:
85 | import ultimate_mcp_server.tools.filesystem as fs
86 |
87 | _filesystem_tools_module = fs
88 | tool_func = getattr(_filesystem_tools_module, name, None)
89 | return tool_func
90 |
91 |
92 | def _get_completion_tool(name):
93 | global _completion_tools_module
94 | if _completion_tools_module is None:
95 | import ultimate_mcp_server.tools.completion as cm
96 |
97 | _completion_tools_module = cm
98 | tool_func = getattr(_completion_tools_module, name, None)
99 | return tool_func
100 |
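# --- Illustrative sketch (not part of the original file) ---
# The lazy lookups above let later code (e.g. autopilot) resolve other MCP
# tools by name without creating circular imports at module load time.
# Hypothetical usage (the keyword arguments are assumptions, not the real
# filesystem tool signature):
#
#     write_file_tool = _get_filesystem_tool("write_file")
#     if write_file_tool:
#         await write_file_tool(path="report.md", content=report_text)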
101 |
102 | # --- Global Configuration Variables ---
103 | # (These will be populated by _ensure_initialized)
104 | _sb_state_key_b64_global: Optional[str] = None
105 | _sb_max_tabs_global: int = 5
106 | _sb_tab_timeout_global: int = 300
107 | _sb_inactivity_timeout_global: int = 600
108 | _headless_mode_global: bool = True
109 | _vnc_enabled_global: bool = False
110 | _vnc_password_global: Optional[str] = None
111 | _proxy_pool_str_global: str = ""
112 | _proxy_allowed_domains_str_global: str = "*"
113 | _vault_allowed_paths_str_global: str = "secret/data/,kv/data/"
114 | _max_widgets_global: int = 300
115 | _max_section_chars_global: int = 5000
116 | _dom_fp_limit_global: int = 20000
117 | _llm_model_locator_global: str = "openai/gpt-4.1-mini" # Updated default
118 | _retry_after_fail_global: int = 1
119 | _seq_cutoff_global: float = 0.72
120 | _area_min_global: int = 400
121 | _high_risk_domains_set_global: Set[str] = set()
122 | _SB_INTERNAL_BASE_PATH_STR: Optional[str] = None
123 | _STATE_FILE: Optional[Path] = None
124 | _LOG_FILE: Optional[Path] = None
125 | _CACHE_DB: Optional[Path] = None
126 | _READ_JS_CACHE: Optional[Path] = None
127 | _PROXY_CONFIG_DICT: Optional[Dict[str, Any]] = None
128 | _PROXY_ALLOWED_DOMAINS_LIST: Optional[List[str]] = None
129 | _ALLOWED_VAULT_PATHS: Set[str] = set()
130 |
131 | # --- Global State Variables ---
132 | _pw: Optional[async_playwright] = None
133 | _browser: Optional[Browser] = None
134 | _ctx: Optional[BrowserContext] = None # Shared context
135 | _vnc_proc: Optional[subprocess.Popen] = None
136 | _last_hash: str | None = None
137 | _js_lib_cached: Set[str] = set()
138 | _db_connection: sqlite3.Connection | None = None
139 | _locator_cache_cleanup_task_handle: Optional[asyncio.Task] = None
140 | _inactivity_monitor_task_handle: Optional[asyncio.Task] = None # New handle for monitor task
141 | _last_activity: float = 0.0 # Global last activity timestamp
142 |
143 | # --- Locks ---
144 | _init_lock = asyncio.Lock()
145 | _playwright_lock = asyncio.Lock()
146 | _js_lib_lock = asyncio.Lock()
147 | _audit_log_lock = asyncio.Lock()
148 | _db_conn_pool_lock = threading.RLock() # Keep RLock for sync DB access from async context
149 | _shutdown_lock = asyncio.Lock()
150 |
151 | # --- Flags ---
152 | _is_initialized = False
153 | _shutdown_initiated = False
154 |
155 | # --- Thread Pool ---
156 | _cpu_count = os.cpu_count() or 1
157 | _thread_pool = concurrent.futures.ThreadPoolExecutor(
158 | max_workers=min(32, _cpu_count * 2 + 4), thread_name_prefix="sb_worker"
159 | )
160 |
161 | # --- Helper Functions ---
162 |
163 |
164 | def _update_activity():
165 | """Updates the global activity timestamp. Should be called by user-facing tool functions."""
166 | global _last_activity
167 | now = time.monotonic()
168 | logger.debug(f"Updating last activity timestamp to {now}")
169 | _last_activity = now
170 |
171 |
172 | def _get_pool(): # Keep as is
173 | global _thread_pool, _pid
174 | if _pid != os.getpid():
175 | _thread_pool.shutdown(wait=False)
176 | pool_max_workers = min(32, _sb_max_tabs_global * 2)
177 | _thread_pool = concurrent.futures.ThreadPoolExecutor(
178 | max_workers=pool_max_workers, thread_name_prefix="sb_worker"
179 | )
180 | _pid = os.getpid()
181 | return _thread_pool
182 |
183 |
184 | # --- Encryption ---
185 | CIPHER_VERSION = b"SB1"
186 | AAD_TAG = b"smart-browser-state-v1"
187 |
188 |
189 | def _key() -> bytes | None: # Uses global _sb_state_key_b64_global
190 | """Get AES-GCM key from the globally set config value."""
191 | if not _sb_state_key_b64_global:
192 | return None
193 | try:
194 | decoded = base64.b64decode(_sb_state_key_b64_global)
195 | key_length = len(decoded)
196 | if key_length not in (16, 24, 32):
197 | logger.warning(f"Invalid SB State Key length: {key_length} bytes. Need 16, 24, or 32.")
198 | return None
199 | return decoded
200 | except (ValueError, TypeError) as e:
201 | logger.warning(f"Invalid base64 SB State Key: {e}")
202 | return None
203 |
204 |
205 | def _enc(buf: bytes) -> bytes: # Uses global _key
206 | """Encrypt data using AES-GCM with AAD if key is set."""
207 | k = _key()
208 | if not k:
209 | logger.debug("SB_STATE_KEY not set. Skipping encryption for state.")
210 | return buf
211 | try:
212 | nonce = os.urandom(12)
213 | cipher = AESGCM(k)
214 | encrypted_data = cipher.encrypt(nonce, buf, AAD_TAG)
215 | result = CIPHER_VERSION + nonce + encrypted_data
216 | return result
217 | except Exception as e:
218 | logger.error(f"Encryption failed: {e}", exc_info=True)
219 | raise RuntimeError(f"Encryption failed: {e}") from e
220 |
221 |
222 | def _dec(buf: bytes) -> bytes | None: # Uses global _key, _STATE_FILE
223 | """Decrypt data using AES-GCM with AAD if key is set and buffer looks encrypted."""
224 | k = _key()
225 | if not k:
226 | logger.debug("SB_STATE_KEY not set. Assuming state is unencrypted.")
227 | try:
228 | stripped_buf = buf.strip()
229 | if stripped_buf.startswith(b"{") or stripped_buf.startswith(b"["):
230 | return buf
231 | else:
232 | logger.warning("Unencrypted state file doesn't look like JSON. Ignoring.")
233 | return None
234 | except Exception:
235 | logger.warning("Error checking unencrypted state file format. Ignoring.")
236 | return None
237 |
238 | if not buf.startswith(CIPHER_VERSION):
239 | logger.warning(
240 | "State file exists but lacks expected encryption header. Treating as legacy/invalid."
241 | )
242 | if _STATE_FILE and _STATE_FILE.exists():
243 | try:
244 | _STATE_FILE.unlink()
245 | except Exception:
246 | pass
247 | return None
248 |
249 | hdr_len = len(CIPHER_VERSION)
250 | nonce_len = 12
251 | min_len = hdr_len + nonce_len + 1 # Header + Nonce + Tag(at least 1 byte)
252 | if len(buf) < min_len:
253 | logger.error("State file too short to be valid encrypted data")
254 | return None
255 |
256 | _hdr_start = 0
257 | _hdr_end = hdr_len
258 | _nonce_start = _hdr_end
259 | _nonce_end = _hdr_end + nonce_len
260 | _ciphertext_start = _nonce_end
261 |
262 | _HDR = buf[_hdr_start:_hdr_end]
263 | nonce = buf[_nonce_start:_nonce_end]
264 | ciphertext = buf[_ciphertext_start:]
265 |
266 | try:
267 | cipher = AESGCM(k)
268 | decrypted_data = cipher.decrypt(nonce, ciphertext, AAD_TAG)
269 | return decrypted_data
270 | except InvalidTag:
271 | logger.error("Decryption failed: Invalid tag (tampered/wrong key?)")
272 | if _STATE_FILE and _STATE_FILE.exists():
273 | try:
274 | _STATE_FILE.unlink()
275 | except Exception:
276 | pass
277 | raise RuntimeError("State-file authentication failed (InvalidTag)") from None
278 | except Exception as e:
279 | logger.error(f"Decryption failed: {e}.", exc_info=True)
280 | if _STATE_FILE and _STATE_FILE.exists():
281 | try:
282 | _STATE_FILE.unlink()
283 | except Exception:
284 | pass
285 | return None
286 |
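# --- Illustrative sketch (not part of the original file) ---
# With a key configured, the state blob produced by _enc() is laid out as
# CIPHER_VERSION (b"SB1") + 12-byte nonce + AES-GCM ciphertext authenticated
# with AAD_TAG. A round trip (assuming a base64-encoded 32-byte key) looks like:
#
#     _sb_state_key_b64_global = base64.b64encode(os.urandom(32)).decode()
#     blob = _enc(b'{"cookies": []}')
#     assert blob.startswith(CIPHER_VERSION)
#     assert _dec(blob) == b'{"cookies": []}'
#
# Without a key, _enc() returns the buffer unchanged and _dec() only accepts
# data that already looks like JSON.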
287 |
288 | # --- Locator Cache DB ---
289 | def _get_db_connection() -> sqlite3.Connection: # Uses global _db_connection, _CACHE_DB
290 | """Get or create the single shared SQLite connection."""
291 | global _db_connection
292 | with _db_conn_pool_lock:
293 | if _db_connection is None:
294 | if _CACHE_DB is None:
295 | raise RuntimeError("Database path (_CACHE_DB) not initialized before DB access.")
296 | try:
297 | conn = sqlite3.connect(
298 | _CACHE_DB,
299 | check_same_thread=False,
300 | isolation_level=None,
301 | timeout=10,
302 | )
303 | conn.execute("PRAGMA journal_mode=WAL")
304 | conn.execute("PRAGMA synchronous=FULL")
305 | conn.execute("PRAGMA busy_timeout = 10000")
306 | _db_connection = conn
307 | logger.info(f"Initialized SQLite DB connection to {_CACHE_DB}")
308 | except sqlite3.Error as e:
309 | logger.critical(
310 | f"Failed to connect/init SQLite DB at {_CACHE_DB}: {e}", exc_info=True
311 | )
312 | raise RuntimeError(f"Failed to initialize database: {e}") from e
313 | return _db_connection
314 |
315 |
316 | def _close_db_connection(): # Uses global _db_connection
317 | """Close the SQLite connection."""
318 | global _db_connection
319 | with _db_conn_pool_lock:
320 | if _db_connection is not None:
321 | conn_to_close = _db_connection
322 | _db_connection = None # Set to None first
323 | try:
324 | conn_to_close.execute("PRAGMA wal_checkpoint(TRUNCATE);")
325 | except sqlite3.Error as e:
326 | logger.warning(f"Error during WAL checkpoint before closing DB: {e}")
327 | try:
328 | conn_to_close.close()
329 | logger.info("Closed SQLite DB connection.")
330 | except sqlite3.Error as e:
331 | logger.error(f"Error closing SQLite DB connection: {e}")
332 |
333 |
334 | atexit.register(_close_db_connection) # Keep atexit hook
335 |
336 |
337 | def _init_locator_cache_db_sync(): # Uses global _CACHE_DB
338 | """Synchronous DB schema initialization for the locator cache."""
339 | conn = None
340 | if _CACHE_DB is None:
341 | logger.error("Cannot initialize locator DB: Path not set.")
342 | return # Cannot proceed without path
343 | try:
344 | conn = _get_db_connection()
345 | with closing(conn.cursor()) as cursor:
346 | create_table_sql = """CREATE TABLE IF NOT EXISTS selector_cache(
347 | key TEXT,
348 | selector TEXT NOT NULL,
349 | dom_fp TEXT NOT NULL,
350 | hits INTEGER DEFAULT 1,
351 | created_ts INTEGER DEFAULT (strftime('%s', 'now')),
352 | last_hit INTEGER DEFAULT (strftime('%s', 'now')),
353 | PRIMARY KEY (key, dom_fp)
354 | );"""
355 | cursor.execute(create_table_sql)
356 | try:
357 | cursor.execute("SELECT last_hit FROM selector_cache LIMIT 1")
358 | except sqlite3.OperationalError:
359 | logger.info("Adding last_hit column to selector_cache table...")
360 | alter_table_sql = "ALTER TABLE selector_cache ADD COLUMN last_hit INTEGER DEFAULT(strftime('%s','now'))"
361 | cursor.execute(alter_table_sql)
362 | logger.info(f"Enhanced Locator cache DB schema initialized/verified at {_CACHE_DB}")
363 | except sqlite3.Error as e:
364 | logger.critical(f"Failed to initialize locator cache DB schema: {e}", exc_info=True)
365 | raise RuntimeError(f"Failed to initialize locator cache database: {e}") from e
366 | except RuntimeError as e: # Catch error from _get_db_connection if path is missing
367 | logger.critical(f"Failed to get DB connection for schema init: {e}")
368 | raise
369 |
370 |
371 | def _cache_put_sync(key: str, selector: str, dom_fp: str) -> None: # Uses global _get_db_connection
372 | """Synchronous write/update to the locator cache."""
373 | try:
374 | conn = _get_db_connection()
375 | insert_sql = """INSERT INTO selector_cache(key, selector, dom_fp, created_ts, last_hit)
376 | VALUES (?, ?, ?, strftime('%s', 'now'), strftime('%s', 'now'))
377 | ON CONFLICT(key, dom_fp) DO UPDATE SET
378 | hits = hits + 1,
379 | last_hit = strftime('%s', 'now')
380 | WHERE key = excluded.key AND dom_fp = excluded.dom_fp;"""
381 | params = (key, selector, dom_fp)
382 | conn.execute(insert_sql, params)
383 | except sqlite3.Error as e:
384 | key_prefix = key[:8]
385 | logger.error(f"Failed to write to locator cache (key prefix={key_prefix}...): {e}")
386 | except RuntimeError as e:
387 | logger.error(f"Failed to get DB connection for cache put: {e}")
388 |
389 |
390 | def _cache_delete_sync(key: str) -> None: # Uses global _get_db_connection
391 | """Synchronously delete an entry from the locator cache by key."""
392 | key_prefix = key[:8]
393 | try:
394 | conn = _get_db_connection()
395 | logger.debug(f"Deleting stale cache entry with key prefix: {key_prefix}...")
396 | delete_sql = "DELETE FROM selector_cache WHERE key = ?"
397 | params = (key,)
398 | cursor = conn.execute(delete_sql, params)
399 | if cursor.rowcount > 0:
400 | logger.debug(f"Successfully deleted stale cache entry {key_prefix}...")
401 | except sqlite3.Error as e:
402 | logger.error(f"Failed to delete stale cache entry (key prefix={key_prefix}...): {e}")
403 | except RuntimeError as e:
404 | logger.error(f"Failed to get DB connection for cache delete: {e}")
405 | except Exception as e:
406 | logger.error(
407 | f"Unexpected error deleting cache entry (key prefix={key_prefix}...): {e}",
408 | exc_info=True,
409 | )
410 |
411 |
412 | def _cache_get_sync(key: str, dom_fp: str) -> Optional[str]: # Uses global _get_db_connection
413 | """Synchronous read from cache, checking fingerprint. Deletes stale entries."""
414 | row = None
415 | try:
416 | conn = _get_db_connection()
417 | with closing(conn.cursor()) as cursor:
418 | select_sql = "SELECT selector FROM selector_cache WHERE key=? AND dom_fp=?"
419 | params_select = (key, dom_fp)
420 | cursor.execute(select_sql, params_select)
421 | row = cursor.fetchone()
422 | if row:
423 | update_sql = "UPDATE selector_cache SET last_hit = strftime('%s', 'now') WHERE key=? AND dom_fp=?"
424 | params_update = (key, dom_fp)
425 | conn.execute(update_sql, params_update)
426 | selector = row[0]
427 | return selector
428 | # If row not found with matching key and fp, check if key exists at all
429 | check_key_sql = "SELECT 1 FROM selector_cache WHERE key=? LIMIT 1"
430 | params_check = (key,)
431 | cursor.execute(check_key_sql, params_check)
432 | key_exists = cursor.fetchone()
433 | if key_exists:
434 | key_prefix = key[:8]
435 | logger.debug(
436 | f"Cache key '{key_prefix}...' found but DOM fingerprint mismatch. Deleting."
437 | )
438 | _cache_delete_sync(key)
439 | except sqlite3.Error as e:
440 | logger.error(f"Failed to read from locator cache (key={key}): {e}")
441 | except RuntimeError as e:
442 | logger.error(f"Failed to get DB connection for cache get: {e}")
443 | return None
444 |
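# --- Illustrative sketch (not part of the original file) ---
# Typical locator-cache flow, where `key` identifies a task/element hint and
# `dom_fp` fingerprints the current page DOM (both computed elsewhere in this
# module):
#
#     _cache_put_sync(key, "css=button#submit", dom_fp)
#     _cache_get_sync(key, dom_fp)        # -> "css=button#submit" (hit count bumped)
#     _cache_get_sync(key, other_dom_fp)  # -> None; the stale entry for `key` is deleted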
445 |
446 | # --- Locator Cache Cleanup ---
447 | def _cleanup_locator_cache_db_sync(
448 | retention_days: int = 90,
449 | ) -> int: # Uses global _get_db_connection
450 | """Synchronously removes old entries from the locator cache DB."""
451 | deleted_count = 0
452 | if retention_days <= 0:
453 | logger.info("Locator cache cleanup skipped (retention_days <= 0).")
454 | return 0
455 | try:
456 | conn = _get_db_connection()
457 | # Note: f-string for time modification is safe as retention_days is an int
458 | cutoff_time_sql = f"strftime('%s', 'now', '-{retention_days} days')"
459 | logger.info(
460 | f"Running locator cache cleanup: Removing entries older than {retention_days} days or with hits=0..."
461 | )
462 | with closing(conn.cursor()) as cursor:
463 | # Use placeholder for the time comparison to be safer if possible, but strftime makes it tricky
464 | # For this controlled use case, f-string is acceptable.
465 | delete_sql = (
466 | f"DELETE FROM selector_cache WHERE created_ts < ({cutoff_time_sql}) OR hits = 0"
467 | )
468 | cursor.execute(delete_sql)
469 | deleted_count = cursor.rowcount
470 | # Vacuum only if significant changes were made
471 | if deleted_count > 500:
472 | logger.info(f"Vacuuming locator cache DB after deleting {deleted_count} entries...")
473 | cursor.execute("VACUUM;")
474 | logger.info(f"Locator cache cleanup finished. Removed {deleted_count} old entries.")
475 | return deleted_count
476 | except sqlite3.Error as e:
477 | logger.error(f"Error during locator cache cleanup: {e}")
478 | return -1
479 | except RuntimeError as e:
480 | logger.error(f"Failed to get DB connection for cache cleanup: {e}")
481 | return -1
482 | except Exception as e:
483 | logger.error(f"Unexpected error during locator cache cleanup: {e}", exc_info=True)
484 | return -1
485 |
486 |
487 | async def _locator_cache_cleanup_task(
488 | interval_seconds: int = 24 * 60 * 60,
489 | ): # Uses global _get_pool
490 | """Background task to periodically run locator cache cleanup."""
491 | if interval_seconds <= 0:
492 | logger.info("Locator cache cleanup task disabled (interval <= 0).")
493 | return
494 | logger.info(f"Locator cache cleanup task started. Running every {interval_seconds} seconds.")
495 | # Initial delay before first run
496 | await asyncio.sleep(interval_seconds)
497 | while True:
498 | try:
499 | loop = asyncio.get_running_loop()
500 | pool = _get_pool()
501 | result_count = await loop.run_in_executor(pool, _cleanup_locator_cache_db_sync)
502 | if result_count < 0:
503 | logger.warning("Locator cache cleanup run encountered an error.")
504 | await asyncio.sleep(interval_seconds)
505 | except asyncio.CancelledError:
506 | logger.info("Locator cache cleanup task cancelled.")
507 | break
508 | except Exception as e:
509 | logger.error(f"Error in locator cache cleanup task loop: {e}", exc_info=True)
510 | # Wait longer after an error before retrying
511 | await asyncio.sleep(60 * 5)
512 |
513 |
514 | # --- Audit Log ---
515 | _salt = os.urandom(16)
516 |
517 |
518 | def _sanitize_for_log(obj: Any) -> Any: # Keep as is
519 | # ... (implementation largely unchanged, but split multi-line expressions) ...
520 | if isinstance(obj, str):
521 | try:
522 | # Remove control characters
523 | s = re.sub(r"[\x00-\x1f\x7f]", "", obj)
524 | # JSON encode to handle quotes, backslashes etc.
525 | encoded = json.dumps(s)
526 | # Remove the outer quotes added by json.dumps
527 | if len(encoded) >= 2:
528 | return encoded[1:-1]
529 | else:
530 | return ""
531 | except TypeError:
532 | return "???" # Should not happen for str, but safety first
533 | elif isinstance(obj, dict):
534 | # Recursively sanitize dictionary values
535 | new_dict = {}
536 | for k, v in obj.items():
537 | sanitized_v = _sanitize_for_log(v)
538 | str_k = str(k) # Ensure keys are strings
539 | new_dict[str_k] = sanitized_v
540 | return new_dict
541 | elif isinstance(obj, list):
542 | # Recursively sanitize list items
543 | new_list = []
544 | for item in obj:
545 | sanitized_item = _sanitize_for_log(item)
546 | new_list.append(sanitized_item)
547 | return new_list
548 | elif isinstance(obj, (int, float, bool, type(None))):
549 | # Allow simple types directly
550 | return obj
551 | else:
552 | # Attempt to stringify, sanitize, and encode other types
553 | try:
554 | s = str(obj)
555 | s = re.sub(r"[\x00-\x1f\x7f]", "", s)
556 | encoded = json.dumps(s)
557 | if len(encoded) >= 2:
558 | return encoded[1:-1]
559 | else:
560 | return ""
561 | except Exception:
562 | # Fallback for types that fail stringification/encoding
563 | return "???"
564 |
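# --- Illustrative sketch (not part of the original file) ---
# _sanitize_for_log strips control characters and JSON-escapes string values so
# each audit record stays a single valid JSON line. For example:
#
#     _sanitize_for_log({"query": 'hello\n"world"', 7: None})
#     # -> {"query": 'hello\\"world\\"', "7": None}  (newline dropped, key stringified)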
565 |
566 | _EVENT_EMOJI_MAP = { # Keep as is
567 | # ... (emoji map unchanged) ...
568 | "browser_start": "🚀",
569 | "browser_shutdown": "🛑",
570 | "browser_shutdown_complete": "🏁",
571 | "browser_context_create": "➕",
572 | "browser_incognito_context": "🕶️",
573 | "browser_context_close_shared": "➖",
574 | "browser_close": "🚪",
575 | "page_open": "📄",
576 | "page_close": "덮",
577 | "page_error": "🔥",
578 | "tab_timeout": "⏱️",
579 | "tab_cancelled": "🚫",
580 | "tab_error": "💥",
581 | "navigate": "➡️",
582 | "navigate_start": "➡️",
583 | "navigate_success": "✅",
584 | "navigate_fail_playwright": "❌",
585 | "navigate_fail_unexpected": "💣",
586 | "navigate_wait_selector_ok": "👌",
587 | "navigate_wait_selector_timeout": "⏳",
588 | "page_state_extracted": "ℹ️",
589 | "browse_fail_proxy_disallowed": "🛡️",
590 | "click": "🖱️",
591 | "click_success": "🖱️✅",
592 | "click_fail_notfound": "🖱️❓",
593 | "click_fail_playwright": "🖱️❌",
594 | "click_fail_unexpected": "🖱️💣",
595 | "type": "⌨️",
596 | "type_success": "⌨️✅",
597 | "type_fail_secret": "⌨️🔑",
598 | "type_fail_notfound": "⌨️❓",
599 | "type_fail_playwright": "⌨️❌",
600 | "type_fail_unexpected": "⌨️💣",
601 | "scroll": "↕️",
602 | "locator_cache_hit": "⚡",
603 | "locator_heuristic_match": "🧠",
604 | "locator_llm_pick": "🤖🎯",
605 | "locator_fail_all": "❓❓",
606 | "locator_text_fallback": "✍️",
607 | "locator_success": "🎯",
608 | "locator_fail": "❓",
609 | "download": "💾",
610 | "download_navigate": "🚚",
611 | "download_success": "💾✅",
612 | "download_fail_notfound": "💾❓",
613 | "download_fail_timeout": "💾⏱️",
614 | "download_fail_playwright": "💾❌",
615 | "download_fail_unexpected": "💾💣",
616 | "download_pdf_http": "📄💾",
617 | "download_direct_success": "✨💾",
618 | "download_pdf_error": "📄🔥",
619 | "download_site_pdfs_complete": "📚✅",
620 | "table_extract_success": "📊✅",
621 | "table_extract_error": "📊❌",
622 | "docs_collected_success": "📖✅",
623 | "docs_harvest": "📖",
624 | "search": "🔍",
625 | "search_start": "🔍➡️",
626 | "search_complete": "🔍✅",
627 | "search_captcha": "🤖",
628 | "search_no_results_selector": "🤷",
629 | "search_error_playwright": "🔍❌",
630 | "search_error_unexpected": "🔍💣",
631 | "macro_plan": "📝",
632 | "macro_plan_generated": "📝✅",
633 | "macro_plan_empty": "📝🤷",
634 | "macro_step_result": "▶️",
635 | "macro_complete": "🎉",
636 | "macro_finish_action": "🏁",
637 | "macro_error": "💥",
638 | "macro_exceeded_rounds": "🔄",
639 | "macro_fail_step": "❌",
640 | "macro_error_tool": "🛠️💥",
641 | "macro_error_unexpected": "💣💥",
642 | "macro_navigate": "🗺️➡️",
643 | "click_extract_navigate": "🖱️🗺️",
644 | "click_extract_success": "🖱️✅✨",
645 | "fill_form_navigate": "✍️🗺️",
646 | "fill_form_field": "✍️",
647 | "fill_form_submit": "✔️",
648 | "fill_form_success": "✍️✅",
649 | "autopilot_run": "🧑✈️",
650 | "autopilot_step_start": "▶️",
651 | "autopilot_step_success": "✅",
652 | "autopilot_step_fail": "❌",
653 | "autopilot_replan_success": "🧠🔄",
654 | "autopilot_replan_fail": "🧠❌",
655 | "autopilot_max_steps": "🚫🔄",
656 | "autopilot_plan_end": "🏁",
657 | "autopilot_critical_error": "💥🧑✈️",
658 | "parallel_navigate": "🚦➡️",
659 | "parallel_url_error": "🚦🔥",
660 | "parallel_process_complete": "🚦🏁",
661 | "retry": "⏳",
662 | "retry_fail": "⚠️",
663 | "retry_fail_unexpected": "💣⚠️",
664 | "retry_unexpected": "⏳💣",
665 | "llm_call_complete": "🤖💬",
666 | }
667 |
668 |
669 | async def _log(event: str, **details): # Uses global _last_hash, _salt, _LOG_FILE
670 | """Append a hash-chained entry to the audit log asynchronously."""
671 | global _last_hash, _salt
672 | if _LOG_FILE is None: # Need to check if path is set
673 | logger.warning(f"Audit log skipped for event '{event}': Log file path not initialized.")
674 | return
675 | now_utc = datetime.now(timezone.utc)
676 | ts_iso = now_utc.isoformat()
677 | sanitized_details = _sanitize_for_log(details)
678 | emoji_key = _EVENT_EMOJI_MAP.get(event, "❓")
679 | async with _audit_log_lock:
680 | current_last_hash = _last_hash
681 | entry = {
682 | "ts": ts_iso,
683 | "event": event,
684 | "details": sanitized_details,
685 | "prev": current_last_hash,
686 | "emoji": emoji_key,
687 | }
688 | entry_json = json.dumps(entry, sort_keys=True, separators=(",", ":"))
689 | payload = _salt + entry_json.encode("utf-8")
690 | hasher = hashlib.sha256(payload)
691 | h = hasher.hexdigest()
692 | log_entry_data = {"hash": h, **entry}
693 | log_entry_line = json.dumps(log_entry_data, separators=(",", ":")) + "\n"
694 | try:
695 | async with aiofiles.open(_LOG_FILE, "a", encoding="utf-8") as f:
696 | await f.write(log_entry_line)
697 | await f.flush()
698 | # os.fsync is sync, run in executor if strict atomic persistence needed
699 | # loop = asyncio.get_running_loop()
700 | # await loop.run_in_executor(_get_pool(), os.fsync, f.fileno())
701 | _last_hash = h
702 | except IOError as e:
703 | logger.error(f"Failed to write to audit log {_LOG_FILE}: {e}")
704 | except Exception as e:
705 | logger.error(f"Unexpected error writing audit log: {e}", exc_info=True)
706 |
707 |
708 | def _init_last_hash(): # Uses global _LOG_FILE, _last_hash
709 | """Initializes the last hash from the audit log file."""
710 | global _last_hash
711 | if _LOG_FILE is None:
712 | logger.info("Audit log initialization skipped: _LOG_FILE path not set yet.")
713 | return
714 | if _LOG_FILE.exists():
715 | try:
716 | with open(_LOG_FILE, "rb") as f:
717 | f.seek(0, os.SEEK_END)
718 | file_size = f.tell()
719 | if file_size == 0: # Empty file
720 | _last_hash = None
721 | logger.info("Audit log file found but is empty.")
722 | return
723 |
724 | # Read backwards efficiently (simplified version)
725 | buffer_size = 4096
726 | last_line = b""
727 | read_pos = max(0, file_size - buffer_size)
728 |
729 | while read_pos >= 0:
730 | f.seek(read_pos)
731 | buffer = f.read(buffer_size)
732 | lines = buffer.splitlines() # Split by \n, \r, or \r\n
733 | if lines:
734 | # Find the last *complete* line in the buffer
735 | # A complete line will either be the last one if the buffer ends with newline,
736 | # or the second to last one otherwise.
737 | is_last_line_complete = buffer.endswith(b"\n") or buffer.endswith(b"\r")
738 | if is_last_line_complete:
739 | last_line_candidate = lines[-1]
740 | elif len(lines) > 1:
741 | last_line_candidate = lines[-2] # Use second-to-last if last is partial
742 | else: # File smaller than buffer or only one partial line
743 | last_line_candidate = b"" # Assume partial
744 |
745 | # Ensure candidate is not empty and potentially valid JSON before breaking
746 | if last_line_candidate.strip().startswith(b"{"):
747 | last_line = last_line_candidate
748 | break # Found a likely valid, complete line
749 |
750 | if read_pos == 0:
751 | # Reached beginning, check if the first line itself is the only one
752 | if len(lines) == 1 and lines[0].strip().startswith(b"{"):
753 | last_line = lines[0]
754 | break
755 |
756 | # Move back, overlapping slightly to ensure line endings are caught
757 | read_pos = max(0, read_pos - (buffer_size // 2))
758 |
759 | if last_line:
760 | try:
761 | decoded_line = last_line.decode("utf-8")
762 | last_entry = json.loads(decoded_line)
763 | found_hash = last_entry.get("hash")
764 | _last_hash = found_hash
765 | if _last_hash:
766 | hash_preview = _last_hash[:8]
767 | logger.info(
768 | f"Initialized audit log chain from last hash: {hash_preview}..."
769 | )
770 | else:
771 | logger.warning(
772 | "Last log entry parsed but missing 'hash'. Starting new chain."
773 | )
774 | _last_hash = None
775 | except (json.JSONDecodeError, UnicodeDecodeError) as e:
776 | logger.error(f"Error decoding last line of audit log: {e}. Starting new chain.")
777 | _last_hash = None
778 | else:
779 | logger.info("Could not read last complete line from audit log. Starting new chain.")
780 | _last_hash = None
781 | except Exception as e:
782 | logger.error(
783 | f"Failed to read last hash from audit log {_LOG_FILE}: {e}. Starting new chain.",
784 | exc_info=True,
785 | )
786 | _last_hash = None
787 | else:
788 | logger.info("No existing audit log found. Starting new chain.")
789 | _last_hash = None
790 |
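# --- Illustrative sketch (not part of the original file) ---
# Each audit record is hash-chained to the previous one: the stored "hash" is
# SHA-256(_salt + canonical JSON of {ts, event, details, prev, emoji}), and the
# next record carries that hash in its "prev" field. Conceptually:
#
#     entry = {"ts": ts_iso, "event": "click", "details": {...},
#              "prev": _last_hash, "emoji": "🖱️"}
#     payload = _salt + json.dumps(entry, sort_keys=True, separators=(",", ":")).encode("utf-8")
#     h = hashlib.sha256(payload).hexdigest()
#     # written to _LOG_FILE as one line: {"hash": h, **entry}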
791 |
792 | # --- Resilient Decorator ---
793 | def resilient(max_attempts: int = 3, backoff: float = 0.3): # Uses global _log
794 | """Decorator for async functions; retries on common transient errors."""
795 |
796 | def wrap(fn):
797 | import functools # Ensure functools is imported locally for the decorator
798 |
799 | @functools.wraps(fn)
800 | async def inner(*a, **kw):
801 | attempt = 0
802 | while True:
803 | try:
804 | if attempt > 0:
805 | # Calculate jittered delay before retrying
806 | delay_factor = 2 ** (attempt - 1)
807 | base_delay = backoff * delay_factor
808 | jitter = random.uniform(0.8, 1.2)
809 | jitter_delay = base_delay * jitter
810 | await asyncio.sleep(jitter_delay)
811 | result = await fn(*a, **kw)
812 | return result
813 | except (PlaywrightTimeoutError, httpx.RequestError, asyncio.TimeoutError) as e:
814 | attempt += 1
815 | func_name = getattr(fn, "__name__", "unknown_func")
816 | if attempt >= max_attempts:
817 | await _log(
818 | "retry_fail", func=func_name, attempts=max_attempts, error=str(e)
819 | )
820 | raise ToolError(
821 | f"Operation '{func_name}' failed after {max_attempts} attempts: {e}"
822 | ) from e
823 | # Calculate delay for logging purposes (actual sleep is at loop start)
824 | delay_factor_log = 2 ** (attempt - 1)
825 | base_delay_log = backoff * delay_factor_log
826 | jitter_log = random.uniform(
827 | 0.8, 1.2
828 | ) # Recalculate for log consistency, might differ slightly from sleep
829 | delay_log = base_delay_log * jitter_log
830 | rounded_delay = round(delay_log, 2)
831 | await _log(
832 | "retry",
833 | func=func_name,
834 | attempt=attempt,
835 | max_attempts=max_attempts,
836 | sleep=rounded_delay,
837 | error=str(e),
838 | )
839 | # Sleep moved to start of the next iteration
840 | except (
841 | ToolError,
842 | ValueError,
843 | TypeError,
844 | KeyError,
845 | KeyboardInterrupt,
846 | sqlite3.Error,
847 | ):
848 | # Non-retryable errors specific to the application or unrecoverable
849 | raise # Re-raise immediately
850 | except Exception as e:
851 | # Catch other unexpected exceptions and retry them
852 | attempt += 1
853 | func_name = getattr(fn, "__name__", "unknown_func")
854 | if attempt >= max_attempts:
855 | await _log(
856 | "retry_fail_unexpected",
857 | func=func_name,
858 | attempts=max_attempts,
859 | error=str(e),
860 | )
861 | raise ToolError(
862 | f"Operation '{func_name}' failed with unexpected error after {max_attempts} attempts: {e}"
863 | ) from e
864 | # Calculate delay for logging
865 | delay_factor_log = 2 ** (attempt - 1)
866 | base_delay_log = backoff * delay_factor_log
867 | jitter_log = random.uniform(0.8, 1.2)
868 | delay_log = base_delay_log * jitter_log
869 | rounded_delay = round(delay_log, 2)
870 | await _log(
871 | "retry_unexpected",
872 | func=func_name,
873 | attempt=attempt,
874 | max_attempts=max_attempts,
875 | sleep=rounded_delay,
876 | error=str(e),
877 | )
878 | # Sleep moved to start of the next iteration
879 |
880 | return inner
881 |
882 | return wrap
883 |
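# --- Illustrative sketch (not part of the original file) ---
# Example of decorating a hypothetical helper with @resilient. Retries cover
# Playwright timeouts, httpx request errors and asyncio timeouts, with
# exponential backoff (0.3s, 0.6s, ...) and +/-20% jitter; ToolError and the
# other listed exceptions propagate immediately:
#
#     @resilient(max_attempts=3, backoff=0.3)
#     async def _fetch_title(page: Page) -> str:
#         await page.wait_for_selector("h1", timeout=5_000)
#         return await page.title()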
884 |
885 | # --- Secret Vault ---
886 | def _update_vault_paths(): # Uses global _vault_allowed_paths_str_global, _ALLOWED_VAULT_PATHS
887 | """Parse the vault allowed paths string from global config into the global set."""
888 | global _ALLOWED_VAULT_PATHS
889 | new_set = set()
890 | path_list = _vault_allowed_paths_str_global.split(",")
891 | for path in path_list:
892 | stripped_path = path.strip()
893 | if stripped_path:
894 | # Ensure path ends with a slash for prefix matching
895 | formatted_path = stripped_path.rstrip("/") + "/"
896 | new_set.add(formatted_path)
897 | _ALLOWED_VAULT_PATHS = new_set
898 |
899 |
900 | def get_secret(path_key: str) -> str: # Uses global _ALLOWED_VAULT_PATHS
901 | """Retrieves secret from environment or HashiCorp Vault."""
902 | # ... (implementation largely unchanged, relies on _ALLOWED_VAULT_PATHS global, split multi-line expressions) ...
903 | if path_key.startswith("env:"):
904 | var = path_key[4:]
905 | val = os.getenv(var)
906 | if val is None:
907 | raise ToolInputError(f"Environment variable secret '{var}' not set.")
908 | return val
909 | if path_key.startswith("vault:"):
910 | try:
911 | import hvac
912 | except ImportError as e:
913 | raise RuntimeError("'hvac' library required for Vault access.") from e
914 | addr = os.getenv("VAULT_ADDR")
915 | token = os.getenv("VAULT_TOKEN")
916 | if not addr or not token:
917 | raise RuntimeError("VAULT_ADDR and VAULT_TOKEN environment variables must be set.")
918 |
919 | vault_uri_part = path_key[len("vault:") :]
920 | if "://" in vault_uri_part:
921 | raise ValueError("Vault path cannot contain '://'. Use format 'mount/path#key'.")
922 | if "#" not in vault_uri_part:
923 | raise ValueError("Vault path must include '#key'. Use format 'mount/path#key'.")
924 |
925 | path_part_raw, key_name = vault_uri_part.split("#", 1)
926 | path_part = path_part_raw.strip("/")
927 |
928 | if not _ALLOWED_VAULT_PATHS:
929 | _update_vault_paths() # Ensure allowed paths are populated
930 |
931 | # Check if the requested path is allowed
932 | path_to_check = path_part + "/" # Ensure trailing slash for prefix check
933 | found_prefix = False
934 | for prefix in _ALLOWED_VAULT_PATHS:
935 | if path_to_check.startswith(prefix):
936 | found_prefix = True
937 | break
938 | if not found_prefix:
939 | logger.warning(
940 | f"Access denied for Vault path '{path_part}'. Allowed prefixes: {_ALLOWED_VAULT_PATHS}"
941 | )
942 | raise ValueError(f"Access to Vault path '{path_part}' is not allowed.")
943 |
944 | client = hvac.Client(url=addr, token=token)
945 | if not client.is_authenticated():
946 | raise RuntimeError(f"Vault authentication failed for {addr}.")
947 |
948 | path_segments = path_part.split("/")
949 | if not path_segments:
950 | raise ValueError(f"Invalid Vault path format: '{path_part}'")
951 |
952 | mount_point = path_segments[0]
953 | rest_segments = path_segments[1:]
954 | secret_sub_path = "/".join(rest_segments)
955 |
956 | # Try KV v2 first
957 | try:
958 | resp_v2 = client.secrets.kv.v2.read_secret_version(
959 | mount_point=mount_point, path=secret_sub_path
960 | )
961 | data_v2 = resp_v2["data"]["data"]
962 | if key_name in data_v2:
963 | return data_v2[key_name]
964 | else:
965 | # Key not found in this v2 secret
966 | pass # Will proceed to check v1 or raise later
967 | except hvac.exceptions.InvalidPath:
968 | # Path doesn't exist in KV v2 mount, try KV v1
969 | pass
970 | except (KeyError, TypeError):
971 | # Error accessing nested data['data'], indicates issue with response structure
972 | logger.warning(
973 | f"Unexpected response structure from Vault KV v2 for path '{path_part}'."
974 | )
975 | pass
976 | except Exception as e:
977 | logger.error(f"Error reading Vault KV v2 secret '{path_part}': {e}")
978 | # Don't raise immediately, allow fallback to v1 if configured
979 | pass
980 |
981 | # Try KV v1
982 | try:
983 | resp_v1 = client.secrets.kv.v1.read_secret(
984 | mount_point=mount_point, path=secret_sub_path
985 | )
986 | data_v1 = resp_v1["data"]
987 | if key_name in data_v1:
988 | return data_v1[key_name]
989 | else:
990 | # Key not found in v1 either
991 | raise KeyError(
992 | f"Key '{key_name}' not found in Vault secret at '{path_part}' (tried KV v2 & v1)."
993 | )
994 | except hvac.exceptions.InvalidPath:
995 | # Path not found in v1 either (and wasn't found in v2 or errored)
996 | raise KeyError(
997 | f"Secret path '{path_part}' not found in Vault (tried KV v2 & v1)."
998 | ) from None
999 | except KeyError:
1000 | # Re-raise the KeyError from the v1 check if key wasn't found there
1001 | raise KeyError(f"Key '{key_name}' not found at '{path_part}' (KV v1).") from None
1002 | except Exception as e:
1003 | logger.error(f"Error reading Vault KV v1 secret '{path_part}': {e}")
1004 | raise RuntimeError(f"Failed to read Vault secret (KV v1): {e}") from e
1005 |
1006 | # If scheme is not 'env:' or 'vault:'
1007 | raise ValueError(f"Unknown secret scheme or invalid path format: {path_key}")
1008 |
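# --- Illustrative sketch (not part of the original file) ---
# get_secret accepts two schemes (the concrete names below are hypothetical):
#
#     get_secret("env:SB_VNC_PASSWORD")
#         # -> value of the SB_VNC_PASSWORD environment variable
#     get_secret("vault:secret/data/smart_browser#api_key")
#         # -> key "api_key" from the Vault secret at secret/data/smart_browser,
#         #    allowed because it matches the "secret/data/" prefix in
#         #    _ALLOWED_VAULT_PATHS (requires VAULT_ADDR/VAULT_TOKEN and hvac)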
1009 |
1010 | # --- Playwright Lifecycle ---
1011 | def _update_proxy_settings(): # Uses globals
1012 | """Parse global proxy config strings into usable dict/list."""
1013 | global _PROXY_CONFIG_DICT, _PROXY_ALLOWED_DOMAINS_LIST
1014 | _PROXY_CONFIG_DICT = None # Reset
1015 | if _proxy_pool_str_global:
1016 | # Split and filter empty strings
1017 | proxies_raw = _proxy_pool_str_global.split(";")
1018 | proxies = []
1019 | for p in proxies_raw:
1020 | stripped_p = p.strip()
1021 | if stripped_p:
1022 | proxies.append(stripped_p)
1023 |
1024 | if proxies:
1025 | chosen_proxy = random.choice(proxies)
1026 | try:
1027 | parsed = urlparse(chosen_proxy)
1028 | # Basic validation
1029 | is_valid_scheme = parsed.scheme in ("http", "https", "socks5", "socks5h")
1030 | has_netloc = bool(parsed.netloc)
1031 | no_fragment = "#" not in chosen_proxy # Fragments not allowed in proxy URL itself
1032 |
1033 | if is_valid_scheme and has_netloc and no_fragment:
1034 | # Construct base server URL without credentials
1035 | if parsed.port:
1036 | hostname_port = f"{parsed.hostname}:{parsed.port}"
1037 | else:
1038 | hostname_port = parsed.hostname
1039 | server_url = f"{parsed.scheme}://{hostname_port}"
1040 |
1041 | proxy_dict: Dict[str, Any] = {"server": server_url}
1042 | if parsed.username:
1043 | unquoted_username = urllib.parse.unquote(parsed.username)
1044 | proxy_dict["username"] = unquoted_username
1045 | if parsed.password:
1046 | unquoted_password = urllib.parse.unquote(parsed.password)
1047 | proxy_dict["password"] = unquoted_password
1048 |
1049 | _PROXY_CONFIG_DICT = proxy_dict
1050 | logger.info(f"Proxy settings parsed: Using {proxy_dict.get('server')}")
1051 | else:
1052 | logger.warning(f"Invalid proxy URL format/scheme: '{chosen_proxy}'. Skipping.")
1053 | except Exception as e:
1054 | logger.warning(f"Error parsing proxy URL '{chosen_proxy}': {e}")
1055 |
1056 | # Parse allowed domains
1057 | if not _proxy_allowed_domains_str_global or _proxy_allowed_domains_str_global == "*":
1058 | _PROXY_ALLOWED_DOMAINS_LIST = None # None means allow all
1059 | logger.info("Proxy allowed domains: * (all allowed)")
1060 | else:
1061 | domains_raw = _proxy_allowed_domains_str_global.split(",")
1062 | domains = []
1063 | for d in domains_raw:
1064 | stripped_d = d.strip()
1065 | if stripped_d:
1066 | lower_d = stripped_d.lower()
1067 | domains.append(lower_d)
1068 |
1069 | # Ensure domains start with a dot for proper suffix matching
1070 | new_domain_list = []
1071 | for d in domains:
1072 | if d.startswith("."):
1073 | new_domain_list.append(d)
1074 | else:
1075 | new_domain_list.append("." + d)
1076 | _PROXY_ALLOWED_DOMAINS_LIST = new_domain_list
1077 | logger.info(f"Proxy allowed domains parsed: {_PROXY_ALLOWED_DOMAINS_LIST}")
1078 |
1079 |
1080 | def _get_proxy_config() -> Optional[Dict[str, Any]]: # Uses global _PROXY_CONFIG_DICT
1081 | """Returns the globally cached parsed proxy dictionary."""
1082 | return _PROXY_CONFIG_DICT
1083 |
1084 |
1085 | def _is_domain_allowed_for_proxy(url: str) -> bool: # Uses global _PROXY_ALLOWED_DOMAINS_LIST
1086 | """Checks if the URL's domain is allowed based on globally cached list."""
1087 | if _PROXY_ALLOWED_DOMAINS_LIST is None:
1088 | return True # Allow all if list is None (wildcard)
1089 | try:
1090 | parsed_url = urlparse(url)
1091 | domain = parsed_url.netloc.lower()
1092 | if not domain:
1093 | return False # Cannot determine domain
1094 |
1095 | # Check domain and its superdomains against the allowed list
1096 | domain_parts = domain.split(".")
1097 | for i in range(len(domain_parts)):
1098 | sub_domain_check = "." + ".".join(domain_parts[i:])
1099 | if sub_domain_check in _PROXY_ALLOWED_DOMAINS_LIST:
1100 | return True
1101 | # Check exact domain match as well (if domain doesn't start with .)
1102 | # The logic above already covers this because we ensure allowed domains start with '.'
1103 | # e.g. if "example.com" is requested and ".example.com" is allowed, it matches.
1104 | return False # No allowed suffix matched
1105 | except Exception as e:
1106 | logger.warning(f"Error parsing URL '{url}' for proxy domain check: {e}")
1107 | return False # Deny on error
1108 |
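# --- Illustrative sketch (not part of the original file) ---
# With _proxy_allowed_domains_str_global = "example.com,api.example.org" the
# parsed list becomes [".example.com", ".api.example.org"], so:
#
#     _is_domain_allowed_for_proxy("https://www.example.com/page")  # True
#     _is_domain_allowed_for_proxy("https://example.com/")          # True
#     _is_domain_allowed_for_proxy("https://evil-example.net/")     # False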
1109 |
1110 | def _run_sync(coro): # Keep as is
1111 | try:
1112 | loop = asyncio.get_running_loop()
1113 | except RuntimeError:
1114 | # No running loop, run in a new one
1115 | return asyncio.run(coro)
1116 | else:
1117 | # Loop exists, run in threadsafe way if called from sync context
1118 | future = asyncio.run_coroutine_threadsafe(coro, loop) # noqa: F841
1119 | # If needing the result synchronously (careful with deadlocks):
1120 | # return future.result()
1121 | return None # Or return future if caller handles it
1122 |
1123 |
1124 | async def _try_close_browser(): # Uses global _browser
1125 | """Attempt to close the browser gracefully via atexit."""
1126 | global _browser
1127 | browser_to_close = _browser # Capture current browser instance
1128 | if browser_to_close and browser_to_close.is_connected():
1129 | logger.info("Attempting to close browser via atexit handler...")
1130 | try:
1131 | await browser_to_close.close()
1132 | logger.info("Browser closed successfully via atexit.")
1133 | except Exception as e:
1134 | logger.error(f"Error closing browser during atexit: {e}")
1135 | finally:
1136 | # Only reset global _browser if it hasn't changed in the meantime
1137 | if _browser == browser_to_close:
1138 | _browser = None
1139 |
1140 |
1141 | async def get_browser_context(
1142 | use_incognito: bool = False,
1143 | context_args: Optional[Dict[str, Any]] = None,
1144 | ) -> tuple[BrowserContext, Browser]: # Uses MANY globals
1145 | """Get or create a browser context using global config values."""
1146 | global _pw, _browser, _ctx
1147 | async with _playwright_lock:
1148 | # 1. Ensure Playwright is started
1149 | if not _pw:
1150 | try:
1151 | playwright_manager = async_playwright()
1152 | _pw = await playwright_manager.start()
1153 | logger.info("Playwright started.")
1154 | except Exception as e:
1155 | raise RuntimeError(f"Failed to start Playwright: {e}") from e
1156 |
1157 | # 2. Handle Headless Mode and VNC
1158 | is_headless = _headless_mode_global
1159 | if not is_headless:
1160 | _start_vnc() # Starts VNC if enabled and not already running
1161 |
1162 | # 3. Ensure Browser is launched and connected
1163 | if not _browser or not _browser.is_connected():
1164 | if _browser: # Close previous instance if disconnected
1165 | try:
1166 | await _browser.close()
1167 | except Exception as close_err:
1168 | logger.warning(
1169 | f"Error closing previous disconnected browser instance: {close_err}"
1170 | )
1171 | try:
1172 | browser_args = [
1173 | "--no-sandbox",
1174 | "--disable-dev-shm-usage",
1175 | "--disable-gpu",
1176 | "--window-size=1280,1024",
1177 | ]
1178 | launched_browser = await _pw.chromium.launch(
1179 | headless=is_headless,
1180 | args=browser_args,
1181 | )
1182 | _browser = launched_browser
1183 | logger.info(f"Browser launched (Headless: {is_headless}).")
1184 | # Register atexit handler *after* successful launch
1185 | atexit.register(lambda: _run_sync(_try_close_browser()))
1186 | except PlaywrightException as e:
1187 | raise RuntimeError(f"Failed to launch browser: {e}") from e
1188 |
1189 | # 4. Prepare Context Arguments
1190 | default_args = {
1191 | "viewport": {"width": 1280, "height": 1024},
1192 | "locale": "en-US",
1193 | "timezone_id": "UTC",
1194 | "accept_downloads": True,
1195 | }
1196 | if context_args:
1197 | default_args.update(context_args)
1198 |
1199 | # 5. Handle Incognito Context Request
1200 | if use_incognito:
1201 | try:
1202 | incog_ctx = await _browser.new_context(**default_args)
1203 | await _log("browser_incognito_context", args=default_args)
1204 | # Apply proxy routing rules if necessary for incognito context
1205 | proxy_cfg = _get_proxy_config()
1206 | if proxy_cfg:
1207 | await _add_proxy_routing_rule(incog_ctx, proxy_cfg)
1208 | return incog_ctx, _browser
1209 | except PlaywrightException as e:
1210 | raise ToolError(f"Failed to create incognito context: {e}") from e
1211 |
1212 | # 6. Handle Shared Context Request
1213 | if not _ctx or not _ctx.browser: # Check if shared context needs creation/recreation
1214 | if _ctx: # Close previous invalid context if any
1215 | try:
1216 | await _ctx.close()
1217 | except Exception as close_err:
1218 | logger.warning(f"Error closing previous invalid shared context: {close_err}")
1219 |
1220 | try:
1221 | # Load state before creating context
1222 | loaded_state = await _load_state()
1223 | proxy_cfg = _get_proxy_config()
1224 |
1225 | final_ctx_args = default_args.copy()
1226 | if loaded_state:
1227 | final_ctx_args["storage_state"] = loaded_state
1228 | if proxy_cfg:
1229 | # Note: Using context.route for proxy filtering now,
1230 | # but setting proxy here is still needed for Playwright to use it.
1231 | final_ctx_args["proxy"] = proxy_cfg
1232 |
1233 | # Create the new shared context
1234 | new_shared_ctx = await _browser.new_context(**final_ctx_args)
1235 | _ctx = new_shared_ctx
1236 |
1237 | # Log context creation details (excluding potentially large state)
1238 | log_args = {}
1239 | for k, v in final_ctx_args.items():
1240 | if k != "storage_state":
1241 | log_args[k] = v
1242 | await _log(
1243 | "browser_context_create",
1244 | headless=is_headless,
1245 | proxy=bool(proxy_cfg),
1246 | args=log_args,
1247 | )
1248 |
1249 | # Apply proxy routing rules if needed
1250 | if proxy_cfg:
1251 | await _add_proxy_routing_rule(_ctx, proxy_cfg)
1252 |
1253 | # Start maintenance loop for the *new* shared context
1254 | asyncio.create_task(_context_maintenance_loop(_ctx))
1255 |
1256 | except PlaywrightException as e:
1257 | raise RuntimeError(f"Failed to create shared context: {e}") from e
1258 | except Exception as e: # Catch errors during state load/save too
1259 | raise RuntimeError(f"Failed during shared context creation/state load: {e}") from e
1260 |
1261 | # 7. Return the valid shared context and browser
1262 | return _ctx, _browser
1263 |
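# --- Illustrative sketch (not part of the original file) ---
# Typical use from a tool function, assuming the module's init path
# (_ensure_initialized) has already populated the global config:
#
#     ctx, browser = await get_browser_context()           # shared context
#     page = await ctx.new_page()
#     await page.goto("https://example.com", wait_until="domcontentloaded")
#     ...
#     incog_ctx, _ = await get_browser_context(use_incognito=True)
#     try:
#         ...
#     finally:
#         await incog_ctx.close()   # incognito contexts are not tracked globally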
1264 |
1265 | async def _add_proxy_routing_rule(
1266 | context: BrowserContext, proxy_config: Dict[str, Any]
1267 | ): # Uses global _PROXY_ALLOWED_DOMAINS_LIST
1268 | """Adds routing rule to enforce proxy domain restrictions if enabled."""
1269 | # Check if domain restrictions are active
1270 | if _PROXY_ALLOWED_DOMAINS_LIST is None:
1271 | logger.debug("No proxy domain restrictions configured. Skipping routing rule.")
1272 | return
1273 |
1274 | async def handle_route(route):
1275 | request_url = route.request.url
1276 | if not _is_domain_allowed_for_proxy(request_url):
1277 | logger.warning(f"Proxy blocked for disallowed domain: {request_url}. Aborting request.")
1278 | try:
1279 | await route.abort("accessdenied")
1280 | except PlaywrightException as e:
1281 | # Log error but don't crash the handler
1282 | logger.error(f"Error aborting route for {request_url}: {e}")
1283 | else:
1284 | # Domain is allowed, let the request proceed (through the proxy set on the context)
1285 | try:
1286 | await route.continue_()
1287 | except PlaywrightException as e:
1288 | # Log error but don't crash the handler
1289 | logger.error(f"Error continuing route for {request_url}: {e}")
1290 |
1291 | try:
1292 | # Route all network requests ('**/*')
1293 | await context.route("**/*", handle_route)
1294 | logger.info("Proxy domain restriction routing rule added.")
1295 | except PlaywrightException as e:
1296 | logger.error(f"Failed to add proxy routing rule: {e}")
1297 |
1298 |
1299 | def _start_vnc(): # Uses globals
1300 | """Starts X11VNC if VNC enabled and password set."""
1301 | global _vnc_proc
1302 | # Check if already running or not enabled
1303 | if _vnc_proc or not _vnc_enabled_global:
1304 | return
1305 |
1306 | vnc_pass = _vnc_password_global
1307 | if not vnc_pass:
1308 | logger.debug("VNC start skipped: Password not set.")
1309 | return
1310 |
1311 | display = os.getenv("DISPLAY", ":0")
1312 | try:
1313 | # Check if x11vnc command exists
1314 | which_cmd = ["which", "x11vnc"]
1315 | result = subprocess.run(which_cmd, capture_output=True, text=True, check=False)
1316 | if result.returncode != 0:
1317 | logger.warning("x11vnc command not found in PATH. Cannot start VNC server.")
1318 | return
1319 |
1320 | # Prepare command arguments
1321 | cmd = [
1322 | "x11vnc",
1323 | "-display",
1324 | display,
1325 | "-passwd",
1326 | vnc_pass, # Use the password directly
1327 | "-forever", # Keep running until explicitly killed
1328 | "-localhost", # Only listen on localhost
1329 | "-quiet", # Reduce log output
1330 | "-noxdamage", # Compatibility option
1331 | ]
1332 |
1333 | # Use setsid to run in a new session, allowing clean termination
1334 | if hasattr(os, "setsid"):
1335 | preexec_fn = os.setsid
1336 | else:
1337 | preexec_fn = None # Not available on Windows
1338 |
1339 | # Start the process
1340 | vnc_process = subprocess.Popen(
1341 | cmd,
1342 | stdout=subprocess.DEVNULL, # Redirect stdout
1343 | stderr=subprocess.DEVNULL, # Redirect stderr
1344 | preexec_fn=preexec_fn, # Run in new session if possible
1345 | )
1346 | _vnc_proc = vnc_process
1347 | logger.info(
1348 | f"Password-protected VNC server started on display {display} (localhost only). PID: {_vnc_proc.pid}"
1349 | )
1350 |
1351 | # Register cleanup function to run on exit
1352 | atexit.register(_cleanup_vnc)
1353 |
1354 | except FileNotFoundError:
1355 | # This shouldn't happen if `which` check passed, but belts and suspenders
1356 | logger.warning("x11vnc command found by 'which' but Popen failed (FileNotFoundError).")
1357 | except Exception as e:
1358 | logger.error(f"Failed to start VNC server: {e}", exc_info=True)
1359 | _vnc_proc = None # Ensure proc is None if start failed
1360 |
1361 |
1362 | def _cleanup_vnc(): # Uses global _vnc_proc
1363 | """Terminates the VNC server process."""
1364 | global _vnc_proc
1365 | proc = _vnc_proc # Capture current process instance
1366 | if proc and proc.poll() is None: # Check if process exists and is running
1367 | logger.info(f"Terminating VNC server process (PID: {proc.pid})...")
1368 | try:
1369 | # Try to terminate the whole process group first (more reliable)
1370 | if hasattr(os, "getpgid") and hasattr(os, "killpg"):
1371 | try:
1372 | pgid = os.getpgid(proc.pid)
1373 | os.killpg(pgid, signal.SIGTERM)
1374 | logger.debug(f"Sent SIGTERM to process group {pgid}.")
1375 | except ProcessLookupError:
1376 | # Process group might already be gone
1377 | logger.debug("VNC process group not found, trying direct SIGTERM.")
1378 | proc.terminate()
1379 | except Exception as pg_err:
1380 | logger.warning(
1381 | f"Error sending SIGTERM to process group, trying direct SIGTERM: {pg_err}"
1382 | )
1383 | proc.terminate() # Fallback to terminating just the process
1384 | else:
1385 | # Fallback if killpg/getpgid not available
1386 | proc.terminate()
1387 | logger.debug("Sent SIGTERM directly to VNC process.")
1388 |
1389 | # Wait for termination with timeout
1390 | proc.wait(timeout=5)
1391 | logger.info("VNC server process terminated gracefully.")
1392 | except subprocess.TimeoutExpired:
1393 | logger.warning("VNC server did not terminate after SIGTERM. Sending SIGKILL.")
1394 | # Force kill if SIGTERM failed
1395 | if hasattr(os, "getpgid") and hasattr(os, "killpg"):
1396 | try:
1397 | pgid = os.getpgid(proc.pid)
1398 | os.killpg(pgid, signal.SIGKILL)
1399 | logger.debug(f"Sent SIGKILL to process group {pgid}.")
1400 | except ProcessLookupError:
1401 | logger.debug("VNC process group not found for SIGKILL, trying direct SIGKILL.")
1402 | proc.kill() # Fallback to killing just the process
1403 | except Exception as pg_kill_err:
1404 | logger.warning(
1405 | f"Error sending SIGKILL to process group, trying direct SIGKILL: {pg_kill_err}"
1406 | )
1407 | proc.kill() # Fallback
1408 | else:
1409 | proc.kill() # Fallback if killpg not available
1410 | logger.debug("Sent SIGKILL directly to VNC process.")
1411 | # Wait briefly after SIGKILL
1412 | try:
1413 | proc.wait(timeout=2)
1414 | except Exception:
1415 | # Ignore errors during wait after SIGKILL
1416 | pass
1417 | except ProcessLookupError:
1418 | # Process was already gone before we could signal it
1419 | logger.info("VNC process already terminated before cleanup.")
1420 | except Exception as e:
1421 | logger.error(f"Error during VNC cleanup: {e}")
1422 | finally:
1423 | # Ensure global state reflects VNC is stopped
1424 | if _vnc_proc == proc: # Avoid race condition if started again quickly
1425 | _vnc_proc = None
1426 |
1427 |
1428 | async def _load_state() -> dict[str, Any] | None: # Uses global _STATE_FILE, _get_pool, _dec
1429 | """Loads browser state asynchronously. Decryption runs in executor if needed."""
1430 | if _STATE_FILE is None or not _STATE_FILE.exists():
1431 | logger.info("Browser state file path not set or file not found. No state loaded.")
1432 | return None
1433 |
1434 | loop = asyncio.get_running_loop()
1435 | pool = _get_pool()
1436 | try:
1437 | # Read the potentially encrypted file content
1438 | async with aiofiles.open(_STATE_FILE, "rb") as f:
1439 | file_data = await f.read()
1440 |
1441 | # Decrypt if necessary (runs sync _dec in thread pool)
1442 | # _dec handles the check for whether encryption is active or not
1443 | try:
1444 | decrypted_data = await loop.run_in_executor(pool, _dec, file_data)
1445 | except RuntimeError as e:
1446 | if "cannot schedule new futures after shutdown" in str(e):
1447 | logger.warning(
1448 | "Thread pool is shutdown. Creating a temporary pool for state loading."
1449 | )
1450 | with concurrent.futures.ThreadPoolExecutor(max_workers=1) as temp_pool:
1451 | decrypted_data = await loop.run_in_executor(temp_pool, _dec, file_data)
1452 | else:
1453 | raise
1454 |
1455 | if decrypted_data is None:
1456 | # _dec logs specific reasons (invalid format, decryption failure, etc.)
1457 | logger.warning("Failed to load or decrypt state data. State file might be invalid.")
1458 | # Optionally remove the invalid file here if desired
1459 | # try: _STATE_FILE.unlink(); except Exception: pass
1460 | return None
1461 |
1462 | # Parse the decrypted JSON data
1463 | state_dict = json.loads(decrypted_data)
1464 | logger.info(f"Browser state loaded successfully from {_STATE_FILE}.")
1465 | return state_dict
1466 |
1467 | except FileNotFoundError:
1468 | # This case should be caught by the initial check, but handle defensively
1469 | logger.info(f"Browser state file {_STATE_FILE} not found during read.")
1470 | return None
1471 | except json.JSONDecodeError as e:
1472 | logger.error(
1473 | f"Failed to parse browser state JSON from {_STATE_FILE}: {e}. Removing corrupt file."
1474 | )
1475 | if _STATE_FILE:
1476 | try:
1477 | _STATE_FILE.unlink()
1478 | except Exception as unlink_e:
1479 | logger.error(f"Failed to remove corrupt state file {_STATE_FILE}: {unlink_e}")
1480 | return None
1481 | except RuntimeError as e: # Catch auth errors from _dec (InvalidTag)
1482 | logger.error(
1483 | f"Failed to authenticate/load browser state from {_STATE_FILE}: {e}", exc_info=True
1484 | )
1485 | if _STATE_FILE:
1486 | try:
1487 | _STATE_FILE.unlink()
1488 | except Exception as unlink_e:
1489 | logger.error(
1490 | f"Failed to remove unauthenticated state file {_STATE_FILE}: {unlink_e}"
1491 | )
1492 | return None
1493 | except Exception as e:
1494 | logger.error(f"Failed to load browser state from {_STATE_FILE}: {e}", exc_info=True)
1495 | # Optionally remove the problematic file
1496 | if _STATE_FILE:
1497 | try:
1498 | _STATE_FILE.unlink()
1499 | except Exception as unlink_e:
1500 | logger.error(f"Failed to remove problematic state file {_STATE_FILE}: {unlink_e}")
1501 | return None
1502 |
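# [Editor's sketch -- not part of the original module.] _load_state() above and
# _save_state() below share the same idiom: run a synchronous helper (_dec/_enc) in
# the shared thread pool, and fall back to a short-lived temporary pool if the shared
# one has already been shut down. Extracted as a generic helper it would look like:
async def _example_run_sync(fn, *args):
    loop = asyncio.get_running_loop()
    try:
        return await loop.run_in_executor(_get_pool(), fn, *args)
    except RuntimeError as e:
        if "cannot schedule new futures after shutdown" not in str(e):
            raise
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as temp_pool:
            return await loop.run_in_executor(temp_pool, fn, *args)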
1503 |
1504 | async def _save_state(ctx: BrowserContext): # Uses global _get_pool, _enc, _STATE_FILE, _key, _playwright_lock
1505 | """Saves browser state asynchronously using FileSystemTool's write_file."""
1506 | if _STATE_FILE is None:
1507 | logger.warning("Skipping save state: State file path (_STATE_FILE) not initialized.")
1508 | return
1509 |
1510 | # Acquire lock *before* checking context validity to prevent race with shutdown
1511 | async with _playwright_lock:
1512 | # Re-check context validity *after* acquiring the lock
1513 | if not ctx or not ctx.browser or not ctx.browser.is_connected():
1514 | logger.debug("Skipping save state: Context or browser became invalid/disconnected before save.")
1515 | return
1516 |
1517 | loop = asyncio.get_running_loop()
1518 | pool = _get_pool()
1519 | validated_fpath = str(_STATE_FILE)
1520 |
1521 | try:
1522 | # 1. Get the current storage state from Playwright (NOW protected by lock)
1523 | state = await ctx.storage_state()
1524 |
1525 | # 2. Serialize state to JSON bytes
1526 | state_json = json.dumps(state)
1527 | state_bytes = state_json.encode("utf-8")
1528 |
1529 | # 3. Encrypt the state bytes if key is configured (runs sync _enc in thread pool)
1530 | try:
1531 | data_to_write = await loop.run_in_executor(pool, _enc, state_bytes)
1532 | except RuntimeError as e:
1533 | if "cannot schedule new futures after shutdown" in str(e):
1534 | logger.warning("Thread pool is shutdown. Creating a temporary pool for state encryption.")
1535 | # Fallback pool creation remains useful
1536 | with concurrent.futures.ThreadPoolExecutor(max_workers=1) as temp_pool:
1537 | data_to_write = await loop.run_in_executor(temp_pool, _enc, state_bytes)
1538 | else:
1539 | raise # Re-raise other RuntimeErrors
1540 |
1541 | # 4. Write the (potentially encrypted) bytes using the standalone filesystem tool
1542 | logger.debug(f"Attempting to save state to: {validated_fpath} using filesystem tool.")
1543 | write_result = await write_file(path=validated_fpath, content=data_to_write) # Pass bytes
1544 |
1545 | # 5. Check result from filesystem tool
1546 | if not isinstance(write_result, dict) or not write_result.get("success"):
1547 | error_detail = "Invalid response"
1548 | if isinstance(write_result, dict):
1549 | error_detail = write_result.get("error", "Unknown")
1550 | logger.error(
1551 | f"Failed to save browser state using filesystem tool. Reason: {error_detail}"
1552 | )
1553 | # Log but don't raise ToolError here directly, let the maintenance loop handle logging it
1554 | return # Exit if write failed
1555 |
1556 | # 6. Log success
1557 | actual_path = write_result.get("path", validated_fpath)
1558 | logger.debug(f"Successfully saved state to file: {actual_path}") # Changed log level
1559 |
1560 | except PlaywrightException as e:
1561 | # Catch errors specifically from ctx.storage_state() if the context closed unexpectedly
1562 | # even with the lock (less likely now, but possible)
1563 | logger.warning(f"Playwright error during save state (context likely closed): {e}")
1564 | # Don't raise, let the loop continue/exit gracefully
1565 | except ToolError as e:
1566 | # Pass ToolError through (e.g., from write_file) - should be logged by caller
1567 | logger.error(f"ToolError during save state: {e}")
1568 | # Don't re-raise here, maintenance loop will log the error
1569 | except Exception as e:
1570 | logger.error(f"Unexpected error saving browser state (path: {validated_fpath}): {e}", exc_info=True)
1571 | # Don't raise ToolError here, let the maintenance loop log the failure
1572 |
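# [Editor's note.] Together with _load_state() above, this gives a simple persistence
# round trip: the maintenance loop periodically calls _save_state(ctx) to write the
# (optionally encrypted) storage state, and on the next startup the decrypted JSON is
# passed back into new_context(storage_state=...) during shared-context creation,
# as seen near the top of this page.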
1573 |
1574 | @asynccontextmanager
1575 | async def _tab_context(ctx: BrowserContext): # Uses global _log
1576 | """Async context manager for creating and cleaning up a Page."""
1577 | page = None
1578 | context_id = id(ctx) # Get ID for logging before potential errors
1579 | try:
1580 | page = await ctx.new_page()
1581 | await _log("page_open", context_id=context_id)
1582 | yield page
1583 | except PlaywrightException as e:
1584 | # Log the error before raising
1585 | await _log("page_error", context_id=context_id, action="create", error=str(e))
1586 | raise ToolError(f"Failed to create browser page: {e}") from e
1587 | finally:
1588 | if page and not page.is_closed():
1589 | try:
1590 | await page.close()
1591 | await _log("page_close", context_id=context_id)
1592 | except PlaywrightException as e:
1593 | # Log error during close, but don't prevent cleanup completion
1594 | logger.warning(f"Error closing page for context {context_id}: {e}")
1595 | await _log("page_error", context_id=context_id, action="close", error=str(e))
1596 |
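# [Editor's sketch -- not part of the original module.] Example use of the
# _tab_context() manager above; TabPool._run() further below follows the same pattern.
# The URL is a placeholder.
async def _example_fetch_title(ctx: BrowserContext) -> str:
    async with _tab_context(ctx) as page:
        await page.goto("https://example.com")  # placeholder URL
        return await page.title()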
1597 |
1598 | async def _context_maintenance_loop(ctx: BrowserContext): # Uses global _save_state
1599 | """Periodically saves state for the shared context."""
1600 | save_interval_seconds = 15 * 60 # Save every 15 minutes
1601 | context_id = id(ctx)
1602 | logger.info(f"Starting context maintenance loop for shared context {context_id}.")
1603 |
1604 | while True:
1605 | try:
1606 | # Check if context is still valid *before* sleeping
1607 | # Use is_connected() for a more robust check
1608 | if not ctx or not ctx.browser or not ctx.browser.is_connected():
1609 | logger.info(f"Shared context {context_id} seems invalid or disconnected. Stopping maintenance loop.")
1610 | break
1611 |
1612 | # Wait for the specified interval
1613 | await asyncio.sleep(save_interval_seconds)
1614 |
1615 | # Re-check context validity *after* sleeping, before saving
1616 | if not ctx or not ctx.browser or not ctx.browser.is_connected():
1617 | logger.info(f"Shared context {context_id} became invalid/disconnected during sleep. Stopping maintenance loop.")
1618 | break
1619 |
1620 | # Save the state (which now handles its own locking and errors more gracefully)
1621 | await _save_state(ctx)
1622 |
1623 | except asyncio.CancelledError:
1624 | logger.info(f"Context maintenance loop for {context_id} cancelled.")
1625 | break # Exit loop cleanly on cancellation
1626 | except Exception as e:
1627 | # Log unexpected errors in the loop itself (e.g., during the sleep?)
1628 | logger.error(f"Unexpected error in context maintenance loop for {context_id}: {e}", exc_info=True)
1629 | # Wait a bit longer before retrying after an unexpected loop error
1630 | await asyncio.sleep(60)
1631 |
1632 |
1633 | # --- Standalone Shutdown Function ---
1634 |
1635 | async def shutdown(): # Uses MANY globals
1636 | """Gracefully shut down Playwright, browser, context, VNC, and thread pool."""
1637 | global \
1638 | _pw, \
1639 | _browser, \
1640 | _ctx, \
1641 | _vnc_proc, \
1642 | _thread_pool, \
1643 | _locator_cache_cleanup_task_handle, \
1644 | _inactivity_monitor_task_handle, \
1645 | _is_initialized, \
1646 | _shutdown_initiated # Added missing global reference
1647 |
1648 | # Use lock to prevent concurrent shutdown calls
1649 | # Check _shutdown_initiated flag inside lock for atomicity
1650 | async with _shutdown_lock:
1651 | if _shutdown_initiated:
1652 | logger.debug("Shutdown already initiated or in progress. Skipping.")
1653 | return
1654 | if not _is_initialized:
1655 | logger.info("Shutdown called but Smart Browser was not initialized. Skipping.")
1656 | return
1657 | # Mark shutdown as initiated *inside* the lock
1658 | _shutdown_initiated = True
1659 |
1660 | logger.info("Initiating graceful shutdown for Smart Browser...")
1661 |
1662 | # Set a global shutdown timeout to prevent hanging
1663 | shutdown_timeout = 10.0 # 10 seconds to complete shutdown or we'll force through
1664 | shutdown_start_time = time.monotonic()
1665 |
1666 | # Function to check if shutdown is taking too long
1667 | def is_shutdown_timeout():
1668 | return (time.monotonic() - shutdown_start_time) > shutdown_timeout
1669 |
1670 | # 1. Cancel Background Tasks First
1671 | tasks_to_cancel = [
1672 | (_locator_cache_cleanup_task_handle, "Locator Cache Cleanup Task"),
1673 | (_inactivity_monitor_task_handle, "Inactivity Monitor Task"),
1674 | ]
1675 | for task_handle, task_name in tasks_to_cancel:
1676 | if task_handle and not task_handle.done():
1677 | logger.info(f"Cancelling {task_name}...")
1678 | task_handle.cancel()
1679 | try:
1680 | # Wait briefly for cancellation to complete
1681 | await asyncio.wait_for(task_handle, timeout=2.0)
1682 | logger.info(f"{task_name} cancellation confirmed.") # Changed log level
1683 | except asyncio.CancelledError:
1684 | logger.info(f"{task_name} cancellation confirmed.") # Expected outcome
1685 | except asyncio.TimeoutError:
1686 | logger.warning(f"Timeout waiting for {task_name} cancellation.")
1687 | except Exception as e:
1688 | err_type = type(e).__name__
1689 | logger.warning(f"Error waiting for {task_name} cancellation: {err_type}")
1690 |
1691 | # Reset task handles
1692 | _locator_cache_cleanup_task_handle = None
1693 | _inactivity_monitor_task_handle = None
1694 |
1695 | # 2. Cancel any active tab pool operations
1696 | await tab_pool.cancel_all() # Handles incognito contexts
1697 |
1698 | # 3. Close Playwright resources (under lock to prevent concurrent access)
1699 | async with _playwright_lock:
1700 | # Close Shared Context (save state first, if possible)
1701 | ctx_to_close = _ctx
1702 | _ctx = None # Immediately unset global reference
1703 |
1704 | # Skip state saving if we're already at the timeout
1705 | if is_shutdown_timeout():
1706 | logger.warning("Skipping state saving due to shutdown timeout")
1707 | # --- Robust Check and Save State ---
1708 | elif ctx_to_close and ctx_to_close.browser and ctx_to_close.browser.is_connected():
1709 | logger.info("Attempting to save state for shared browser context...")
1710 | try:
1711 | # Add timeout for state saving
1712 | await asyncio.wait_for(_save_state(ctx_to_close), timeout=3.0)
1713 | logger.info("State saving attempted for shared context.") # Log attempt, success logged within _save_state
1714 | except asyncio.TimeoutError:
1715 | logger.warning("State saving timed out after 3 seconds")
1716 | except Exception as e:
1717 | # Catch any unexpected error from _save_state itself (should be rare now)
1718 | logger.error(f"Unexpected error during final state save attempt: {e}", exc_info=True)
1719 | elif ctx_to_close:
1720 | logger.warning("Skipping final state save: Shared context or browser already closed/disconnected.")
1721 | else:
1722 | logger.debug("Skipping final state save: No shared context exists.")
1723 | # --- End Robust Check and Save State ---
1724 |
1725 | # Close the context object itself
1726 | if ctx_to_close and not is_shutdown_timeout():
1727 | logger.info("Closing shared browser context object...")
1728 | try:
1729 | # Add timeout for context closing
1730 | await asyncio.wait_for(ctx_to_close.close(), timeout=3.0)
1731 | await _log("browser_context_close_shared")
1732 | logger.info("Shared browser context closed.")
1733 | except asyncio.TimeoutError:
1734 | logger.warning("Browser context close timed out after 3 seconds")
1735 | except Exception as e:
1736 | # Log error but continue shutdown
1737 | logger.error(f"Error closing shared context object: {e}", exc_info=False) # Keep log less verbose
1738 | elif ctx_to_close:
1739 | logger.warning("Skipping browser context close due to shutdown timeout")
1740 |
1741 | # Close Browser
1742 | browser_to_close = _browser
1743 | _browser = None # Immediately unset global reference
1744 | if browser_to_close and browser_to_close.is_connected() and not is_shutdown_timeout():
1745 | logger.info("Closing browser instance...")
1746 | try:
1747 | # Add timeout for browser closing - shorter timeout to avoid hanging
1748 | await asyncio.wait_for(browser_to_close.close(), timeout=3.0)
1749 | await _log("browser_close")
1750 | logger.info("Browser instance closed.")
1751 | except asyncio.TimeoutError:
1752 | logger.warning("Browser close timed out after 3 seconds")
1753 | except Exception as e:
1754 | logger.error(f"Error closing browser: {e}", exc_info=False) # Keep log less verbose
1755 | elif browser_to_close and browser_to_close.is_connected():
1756 | logger.warning("Skipping browser close due to shutdown timeout")
1757 |
1758 | # Stop Playwright
1759 | pw_to_stop = _pw
1760 | _pw = None # Immediately unset global reference
1761 | if pw_to_stop and not is_shutdown_timeout():
1762 | logger.info("Stopping Playwright...")
1763 | try:
1764 | # Add timeout for playwright stop - shorter timeout
1765 | await asyncio.wait_for(pw_to_stop.stop(), timeout=2.0)
1766 | logger.info("Playwright stopped.")
1767 | except asyncio.TimeoutError:
1768 | logger.warning("Playwright stop timed out after 2 seconds")
1769 | except Exception as e:
1770 | logger.error(f"Error stopping Playwright: {e}", exc_info=False) # Keep log less verbose
1771 | elif pw_to_stop:
1772 | logger.warning("Skipping Playwright stop due to shutdown timeout")
1773 |
1774 | # 4. Cleanup Synchronous Resources - always do this regardless of timeout
1775 | _cleanup_vnc()
1776 | _close_db_connection()
1777 |
1778 | # 5. Log completion and reset flags
1779 | await _log("browser_shutdown_complete")
1780 | if is_shutdown_timeout():
1781 | logger.warning("Smart Browser shutdown reached timeout limit - some resources may not be fully released")
1782 | else:
1783 | logger.info("Smart Browser graceful shutdown complete.")
1784 | _is_initialized = False
1785 |
1786 | # 6. Shutdown Thread Pool (MOVED TO THE VERY END)
1787 | logger.info("Shutting down thread pool...")
1788 | pool_to_shutdown = _get_pool()
1789 | # Don't wait for tasks if we're already at timeout
1790 | if is_shutdown_timeout():
1791 | try:
1792 | pool_to_shutdown.shutdown(wait=False)
1793 | logger.info("Thread pool shutdown initiated without waiting")
1794 | except Exception as e:
1795 | logger.error(f"Error during thread pool non-waiting shutdown: {e}")
1796 | else:
1797 | # Give the pool a short timeout to avoid hanging
1798 | try:
1799 | time_left = max(0, shutdown_timeout - (time.monotonic() - shutdown_start_time))
1800 | # Use the minimum of 3 seconds or remaining time
1801 | wait_time = min(3.0, time_left)
1802 |
1803 | # Create a separate thread to shut down the pool with a timeout
1804 | import threading
1805 | shutdown_complete = threading.Event()
1806 |
1807 | def shutdown_pool_with_timeout():
1808 | try:
1809 | pool_to_shutdown.shutdown(wait=True)
1810 | shutdown_complete.set()
1811 | except Exception as e:
1812 | logger.error(f"Error in thread pool shutdown thread: {e}")
1813 |
1814 | # Start the shutdown in a separate thread
1815 | thread = threading.Thread(target=shutdown_pool_with_timeout)
1816 | thread.daemon = True
1817 | thread.start()
1818 |
1819 | # Wait for completion or timeout
1820 | if shutdown_complete.wait(wait_time):
1821 | logger.info("Thread pool shut down successfully.")
1822 | else:
1823 | logger.warning(f"Thread pool shutdown timed out after {wait_time} seconds")
1824 | # Try non-waiting shutdown as fallback
1825 | try:
1826 | pool_to_shutdown.shutdown(wait=False)
1827 | except Exception:
1828 | pass # Already logged above
1829 | except Exception as e:
1830 | logger.error(f"Error setting up thread pool shutdown: {e}")
1831 | # Fallback to non-waiting shutdown
1832 | pool_to_shutdown.shutdown(wait=False)
1833 |
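# [Editor's note -- not part of the original module.] shutdown() is guarded by
# _shutdown_initiated/_shutdown_lock, so calling it more than once is safe; a host
# application can simply await it (or _initiate_shutdown() below) from its own
# teardown path, e.g.:
async def _example_app_teardown():
    # Stop dispatching new Smart Browser tool calls first (application-specific),
    # then release Playwright, VNC, and thread-pool resources.
    await shutdown()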
1834 |
1835 | async def _initiate_shutdown():  # Uses globals _shutdown_lock, _shutdown_initiated
1836 |     """Ensures shutdown runs only once."""
1837 |     # shutdown() checks and sets _shutdown_initiated itself under _shutdown_lock, and
1838 |     # asyncio locks are not re-entrant, so do not hold the lock while awaiting it here.
1839 |     async with _shutdown_lock:
1840 |         if _shutdown_initiated:
1841 |             logger.debug("Shutdown already initiated. Ignoring duplicate request.")
1842 |             return
1843 |     await shutdown()
1844 |
1845 |
1846 | # --- Signal Handling (Keep top-level) ---
1847 | def _signal_handler(sig, frame):
1848 | """Handle termination signals gracefully."""
1849 | signal_name = signal.Signals(sig).name
1850 | logger.info(f"Received signal {signal_name} ({sig}). Initiating Smart Browser shutdown...")
1851 | try:
1852 | # Try to get the running event loop
1853 | loop = asyncio.get_running_loop()
1854 | if loop.is_running():
1855 | # Schedule shutdown in the running loop, don't block signal handler
1856 | asyncio.create_task(_initiate_shutdown())
1857 | else:
1858 | # No running loop, attempt synchronous run (best effort)
1859 | logger.warning(
1860 | "No running event loop found in signal handler. Attempting sync shutdown."
1861 | )
1862 | try:
1863 | asyncio.run(_initiate_shutdown())
1864 | except RuntimeError as e:
1865 | logger.error(f"Could not run async shutdown synchronously from signal handler: {e}")
1866 | except RuntimeError as e:
1867 | # Error getting the loop itself
1868 | logger.error(
1869 | f"Error getting event loop in signal handler: {e}. Shutdown might be incomplete."
1870 | )
1871 |
1872 |
1873 | # Register signal handlers in a try-except block
1874 | try:
1875 | signal.signal(signal.SIGTERM, _signal_handler)
1876 | signal.signal(signal.SIGINT, _signal_handler) # Handle Ctrl+C too
1877 | except ValueError:
1878 | # This can happen if not running in the main thread
1879 | logger.warning(
1880 | "Could not register signal handlers (not running in main thread?). Graceful shutdown on SIGTERM/SIGINT might not work."
1881 | )
1882 |
1883 |
1884 | # --- Tab Pool (Keep global instance) ---
1885 | class TabPool: # Keep class definition
1886 | """Runs async callables needing a Page in parallel, bounded by global config."""
1887 |
1888 | def __init__(self, max_tabs: int | None = None):
1889 | if max_tabs is not None:
1890 | self.max_tabs = max_tabs
1891 | else:
1892 | self.max_tabs = _sb_max_tabs_global
1893 |
1894 | if self.max_tabs <= 0:
1895 | logger.warning(f"TabPool max_tabs configured to {self.max_tabs}. Setting to 1.")
1896 | self.max_tabs = 1
1897 | self.sem = asyncio.Semaphore(self.max_tabs)
1898 | self._active_contexts: Set[BrowserContext] = set() # Store contexts being used
1899 | self._context_lock = asyncio.Lock() # Protect access to _active_contexts
1900 | logger.info(f"TabPool initialized with max_tabs={self.max_tabs}")
1901 |
1902 | async def _run(self, fn: Callable[[Page], Awaitable[Any]]) -> Any:
1903 | """Internal method to run a single function within a managed tab."""
1904 | timeout_seconds = _sb_tab_timeout_global
1905 | incognito_ctx: Optional[BrowserContext] = None
1906 | task = asyncio.current_task()
1907 | task_id = id(task)
1908 | func_name = getattr(fn, "__name__", "anon_tab_fn")
1909 |
1910 | try:
1911 | # Acquire semaphore before creating context/page
1912 | async with self.sem:
1913 | # Create a new incognito context for isolation
1914 | # Pass None for context_args to use defaults
1915 | incognito_ctx, _ = await get_browser_context(use_incognito=True, context_args=None)
1916 |
1917 | # Add context to active set under lock
1918 | async with self._context_lock:
1919 | self._active_contexts.add(incognito_ctx)
1920 |
1921 | # Use the async context manager for the page
1922 | async with _tab_context(incognito_ctx) as page:
1923 | # Run the provided function with timeout
1924 | result = await asyncio.wait_for(fn(page), timeout=timeout_seconds)
1925 | return result # Return the successful result
1926 |
1927 | except asyncio.TimeoutError:
1928 | await _log("tab_timeout", function=func_name, timeout=timeout_seconds, task_id=task_id)
1929 | # Return error structure on timeout
1930 | return {
1931 | "error": f"Tab operation '{func_name}' timed out after {timeout_seconds}s",
1932 | "success": False,
1933 | }
1934 | except asyncio.CancelledError:
1935 | # Log cancellation and re-raise
1936 | await _log("tab_cancelled", function=func_name, task_id=task_id)
1937 | raise # Important to propagate cancellation
1938 | except Exception as e:
1939 | # Log any other exceptions during execution
1940 | await _log(
1941 | "tab_error", function=func_name, error=str(e), task_id=task_id, exc_info=True
1942 | )
1943 | # Return error structure
1944 | return {"error": f"Tab operation '{func_name}' failed: {e}", "success": False}
1945 | finally:
1946 | # Cleanup: Remove context from active set and close it
1947 | if incognito_ctx:
1948 | incog_ctx_id = id(incognito_ctx) # Get ID before potential close error
1949 | async with self._context_lock:
1950 | self._active_contexts.discard(incognito_ctx)
1951 | try:
1952 | await incognito_ctx.close()
1953 | logger.debug(f"Incognito context {incog_ctx_id} closed for task {task_id}.")
1954 | except PlaywrightException as close_err:
1955 | # Log error but don't let it prevent other cleanup
1956 | logger.warning(
1957 | f"Error closing incognito context {incog_ctx_id} for task {task_id}: {close_err}"
1958 | )
1959 |
1960 | async def map(self, fns: Sequence[Callable[[Page], Awaitable[Any]]]) -> List[Any]:
1961 | """Runs multiple functions concurrently using the tab pool."""
1962 | if not fns:
1963 | return []
1964 |
1965 | # Create tasks for each function using the internal _run method
1966 | tasks = []
1967 | for fn in fns:
1968 | task = asyncio.create_task(self._run(fn))
1969 | tasks.append(task)
1970 |
1971 | # Wait for all tasks to complete
1972 | results = await asyncio.gather(*tasks, return_exceptions=True)
1973 |
1974 | # Process results, handling potential exceptions returned by gather
1975 | processed_results = []
1976 | for i, res in enumerate(results):
1977 | if isinstance(res, Exception):
1978 | # Log the exception if a task failed unexpectedly
1979 | func_name = getattr(fns[i], "__name__", f"fn_{i}")
1980 | logger.error(f"Error in TabPool.map for '{func_name}': {res}", exc_info=res)
1981 | # Append an error dictionary for failed tasks
1982 | processed_results.append(
1983 | {"error": f"Task '{func_name}' failed with exception: {res}", "success": False}
1984 | )
1985 | else:
1986 | # Append the result directly (which might be an error dict from _run)
1987 | processed_results.append(res)
1988 | return processed_results
1989 |
1990 | async def cancel_all(self):
1991 | """Attempts to close all currently active incognito contexts managed by the pool."""
1992 | contexts_to_close: List[BrowserContext] = []
1993 | # Safely get the list of active contexts and clear the set under lock
1994 | async with self._context_lock:
1995 | contexts_to_close = list(self._active_contexts)
1996 | self._active_contexts.clear()
1997 |
1998 | if not contexts_to_close:
1999 | logger.debug("TabPool cancel_all: No active contexts to close.")
2000 | return
2001 |
2002 | logger.info(
2003 | f"TabPool cancel_all: Attempting to close {len(contexts_to_close)} active incognito contexts."
2004 | )
2005 | # Create closing tasks for each context
2006 | close_tasks = []
2007 | for ctx in contexts_to_close:
2008 | task = asyncio.create_task(ctx.close())
2009 | close_tasks.append(task)
2010 |
2011 | # Wait for all close tasks to complete, collecting results/exceptions
2012 | results = await asyncio.gather(*close_tasks, return_exceptions=True)
2013 |
2014 | # Count and log errors during closure
2015 | errors = 0
2016 | for res in results:
2017 | if isinstance(res, Exception):
2018 | errors += 1
2019 | if errors:
2020 | logger.warning(
2021 | f"TabPool cancel_all: Encountered {errors} errors while closing contexts."
2022 | )
2023 | else:
2024 | logger.info(
2025 | f"TabPool cancel_all: Successfully closed {len(contexts_to_close)} contexts."
2026 | )
2027 |
2028 |
2029 | # Global instance of the TabPool
2030 | tab_pool = TabPool()
2031 |
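# [Editor's sketch -- not part of the original module.] Fanning work out through the
# global tab_pool: each callable receives its own isolated incognito Page, bounded by
# max_tabs. URLs are placeholders.
async def _example_parallel_titles() -> List[Any]:
    def make_task(url: str) -> Callable[[Page], Awaitable[Any]]:
        async def _task(page: Page) -> Dict[str, Any]:
            await page.goto(url)
            return {"url": url, "title": await page.title(), "success": True}
        return _task
    urls = ["https://example.com", "https://example.org"]
    return await tab_pool.map([make_task(u) for u in urls])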
2032 |
2033 | # --- Human Jitter ---
2034 | def _risk_factor(url: str) -> float: # Uses global _high_risk_domains_set_global
2035 | """Calculates risk factor based on URL's domain (higher for known tricky domains)."""
2036 | if not url:
2037 | return 1.0 # Default risk if URL is empty
2038 | try:
2039 | parsed_url = urlparse(url)
2040 | domain = parsed_url.netloc.lower()
2041 | # Remove common www prefix
2042 | if domain.startswith("www."):
2043 | domain = domain[4:]
2044 |
2045 | if not domain:
2046 | return 1.0 # Default risk if domain cannot be parsed
2047 |
2048 | # Check if domain or its parent domains are in the high-risk set
2049 | domain_parts = domain.split(".")
2050 | for i in range(len(domain_parts)):
2051 | # Construct subdomain like ".example.com", ".com"
2052 | sub_domain_check = "." + ".".join(domain_parts[i:])
2053 | if sub_domain_check in _high_risk_domains_set_global:
2054 | return 2.0 # High risk factor
2055 |
2056 | # No match found in high-risk set
2057 | return 1.0 # Standard risk factor
2058 | except Exception as e:
2059 | logger.warning(f"Error calculating risk factor for URL '{url}': {e}")
2060 | return 1.0 # Default risk on error
2061 |
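# [Editor's illustration -- values hypothetical.] The suffix walk above checks
# ".sub.example.com", ".example.com", ".com" in turn against the configured high-risk
# set. If that set contained ".google.com", then
# _risk_factor("https://www.google.com/search?q=x") would return 2.0, while
# _risk_factor("https://example.com/") would return 1.0.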
2062 |
2063 | async def _pause(
2064 | page: Page, base_ms_range: tuple[int, int] = (150, 500)
2065 | ): # Uses global _risk_factor
2066 | """Introduce a short, randomized pause, adjusted by URL risk factor and page complexity."""
2067 | if not page or page.is_closed():
2068 | return # Do nothing if page is invalid
2069 |
2070 | risk = _risk_factor(page.url)
2071 | min_ms, max_ms = base_ms_range
2072 | base_delay_ms = random.uniform(min_ms, max_ms)
2073 | adjusted_delay_ms = base_delay_ms * risk
2074 |
2075 | try:
2076 | # Estimate page complexity based on number of interactive elements
2077 | # Use a simpler selector for broad compatibility
2078 | selector = "a, button, input, select, textarea, [role=button], [role=link], [onclick]"
2079 | js_expr = f"() => document.querySelectorAll('{selector}').length"
2080 | element_count = await page.evaluate(js_expr)
2081 |
2082 |         # A count of 0 usually means the evaluation failed rather than a truly simple page
2083 |         if element_count == 0:
2084 |             element_count = 100  # Assume moderate complexity so the simple-page skip below isn't taken spuriously
2085 |
2086 | # Skip pauses for low-risk, very simple pages
2087 | is_low_risk = risk == 1.0
2088 | is_simple_page = element_count < 50
2089 | if is_low_risk and is_simple_page:
2090 | return # No pause needed
2091 |
2092 | # Increase delay slightly based on complexity, capping the factor
2093 | complexity_factor_base = 1.0 + (element_count / 500.0)
2094 | complexity_factor = min(complexity_factor_base, 1.5) # Cap factor at 1.5
2095 | adjusted_delay_ms *= complexity_factor
2096 |
2097 | except PlaywrightException as e:
2098 | # Ignore errors during element count evaluation, proceed with risk-adjusted delay
2099 | logger.debug(f"Could not evaluate element count for pause adjustment: {e}")
2100 | pass
2101 |
2102 | # Cap the final delay to avoid excessive pauses
2103 | final_delay_ms = min(adjusted_delay_ms, 3000) # Max 3 seconds pause
2104 |
2105 | # Convert ms to seconds and sleep
2106 | final_delay_sec = final_delay_ms / 1000.0
2107 | await asyncio.sleep(final_delay_sec)
2108 |
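# [Editor's note.] A caller would typically insert this jitter immediately before an
# interaction, e.g. `await _pause(page)` or, for a slower cadence on sensitive flows,
# `await _pause(page, base_ms_range=(300, 900))`.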
2109 |
2110 | # --- Enhanced Locator Helpers (Depend on globals, use Filesystem tools) ---
2111 | _READ_JS_WRAPPER = textwrap.dedent("""
2112 | (html) => {
2113 | // Ensure Readability library is loaded in the window scope
2114 | const R = window.__sbReadability;
2115 | if (!R || !html) {
2116 | console.warn('Readability object or HTML missing.');
2117 | return ""; // Cannot proceed without library or content
2118 | }
2119 | try {
2120 | // Create a DOM from the HTML string
2121 | const parser = new DOMParser();
2122 | const doc = parser.parseFromString(html, "text/html");
2123 |
2124 | // Basic validation of the parsed document
2125 | if (!doc || !doc.body || doc.body.innerHTML.trim() === '') {
2126 | console.warn('Parsed document is invalid or empty.');
2127 | return "";
2128 | }
2129 |
2130 | // Use Readability to parse the article content
2131 | const article = new R.Readability(doc).parse();
2132 |
2133 | // Return the text content if parsing was successful
2134 | return article ? article.textContent : "";
2135 |
2136 | } catch (e) {
2137 | // Log errors during parsing
2138 | console.warn('Readability parsing failed:', e);
2139 | return ""; // Return empty string on error
2140 | }
2141 | }
2142 | """)
2143 |
2144 |
2145 | async def _ensure_readability(page: Page) -> None: # Uses global _READ_JS_CACHE
2146 | """Ensures Readability.js is injected, using STANDALONE filesystem tools."""
2147 | # Check if already injected
2148 | is_injected_js = "() => window.__sbReadability !== undefined"
2149 | already_injected = await page.evaluate(is_injected_js)
2150 | if already_injected:
2151 | logger.debug("Readability.js already injected.")
2152 | return
2153 |
2154 | if _READ_JS_CACHE is None:
2155 | logger.warning("Readability cache path (_READ_JS_CACHE) not set. Cannot cache script.")
2156 | # Proceed to fetch, but won't cache
2157 | else:
2158 | cache_file_path = str(_READ_JS_CACHE)
2159 |
2160 | src: Optional[str] = None
2161 |
2162 | # Try reading from cache if path is set
2163 | if _READ_JS_CACHE:
2164 | try:
2165 | logger.debug(f"Attempting to load Readability.js from cache: {cache_file_path}")
2166 | read_result = await read_file(path=cache_file_path)
2167 | if isinstance(read_result, dict) and not read_result.get("success"):
2168 | error_msg = read_result.get("error", "Unknown read error")
2169 | error_code = read_result.get("error_code", "")
2170 | logger.warning(
2171 | f"Failed to read Readability.js cache {cache_file_path}: {error_msg} (Code: {error_code}). Full response: {read_result}. Will attempt fetch." # Log full dict
2172 | )
2173 | if isinstance(read_result, dict) and read_result.get("success"):
2174 | content_list = read_result.get("content", [])
2175 | if isinstance(content_list, list) and content_list:
2176 | # Assuming single file content for this cache
2177 | file_content = content_list[0]
2178 | if isinstance(file_content, dict):
2179 | src = file_content.get("text")
2180 | if src:
2181 | logger.debug(
2182 | f"Readability.js loaded successfully from cache: {cache_file_path}"
2183 | )
2184 | else:
2185 | logger.warning(
2186 | f"Readability cache file {cache_file_path} content missing 'text'."
2187 | )
2188 | else:
2189 | logger.warning(
2190 | f"Readability cache file {cache_file_path} content format unexpected."
2191 | )
2192 | else:
2193 | logger.info(
2194 | f"Readability cache file {cache_file_path} exists but is empty or has no content list."
2195 | )
2196 | # Handle specific file not found error (or other read errors) from standalone tool
2197 | elif isinstance(read_result, dict) and not read_result.get("success"):
2198 | error_msg = read_result.get("error", "Unknown read error")
2199 | error_code = read_result.get("error_code", "")
2200 | if "does not exist" in error_msg.lower() or "PATH_NOT_FOUND" in error_code:
2201 | logger.info(
2202 | f"Readability.js cache file not found ({cache_file_path}). Will attempt fetch."
2203 | )
2204 | else:
2205 | logger.warning(
2206 | f"Failed to read Readability.js cache {cache_file_path}: {error_msg}. Will attempt fetch."
2207 | )
2208 | else: # Unexpected response format
2209 | logger.warning(
2210 | f"Unexpected response from read_file for {cache_file_path}. Will attempt fetch."
2211 | )
2212 |
2213 | except ToolError as e: # Catch explicit ToolError if raised by read_file internally
2214 | if "does not exist" in str(e).lower() or "PATH_NOT_FOUND" in getattr(
2215 | e, "error_code", ""
2216 | ):
2217 | logger.info(
2218 | f"Readability.js cache file not found ({cache_file_path}). Will attempt fetch."
2219 | )
2220 | else:
2221 | logger.warning(
2222 | f"ToolError reading Readability.js cache {cache_file_path}: {e}. Will attempt fetch."
2223 | )
2224 | except Exception as e:
2225 | # Catch any other unexpected errors during cache read
2226 | logger.warning(
2227 | f"Unexpected error reading Readability.js cache {cache_file_path}: {e}. Will attempt fetch.",
2228 | exc_info=True,
2229 | )
2230 |
2231 | # Fetch from CDN if not loaded from cache
2232 | if src is None:
2233 | logger.info("Fetching Readability.js from CDN...")
2234 | try:
2235 | async with httpx.AsyncClient() as client:
2236 | # Use a reliable CDN link
2237 | cdn_url = "https://cdnjs.cloudflare.com/ajax/libs/readability/0.5.0/Readability.js"
2238 | response = await client.get(cdn_url, timeout=15.0)
2239 | response.raise_for_status() # Raise exception for bad status codes
2240 | fetched_src = response.text
2241 | fetched_size = len(fetched_src)
2242 | await _log("readability_js_fetch", url=cdn_url, size=fetched_size)
2243 |
2244 | if fetched_src:
2245 | # Try writing to cache if path is set
2246 | if _READ_JS_CACHE:
2247 | try:
2248 | logger.debug(
2249 | f"Attempting to save fetched Readability.js to cache: {cache_file_path}"
2250 | )
2251 | # Use STANDALONE write_file tool
2252 | write_res = await write_file(
2253 | path=cache_file_path, content=fetched_src
2254 | ) # Pass string content
2255 |
2256 | if isinstance(write_res, dict) and write_res.get("success"):
2257 | logger.info(f"Saved fetched Readability.js to cache: {cache_file_path}")
2258 | else:
2259 | error_msg = (
2260 | write_res.get("error", "Unknown write error")
2261 | if isinstance(write_res, dict)
2262 | else "Invalid write_file response"
2263 | )
2264 | logger.warning(
2265 | f"Failed to write Readability.js cache ({cache_file_path}): {error_msg}"
2266 | )
2267 | except Exception as write_err:
2268 | # Log error but proceed with injection using fetched source
2269 | logger.warning(
2270 | f"Error writing Readability.js cache ({cache_file_path}): {write_err}"
2271 | )
2272 |
2273 | # Use the fetched source for injection
2274 | src = fetched_src
2275 | else:
2276 | logger.warning("Fetched empty content for Readability.js from CDN.")
2277 |
2278 | except httpx.HTTPStatusError as fetch_err:
2279 | logger.error(
2280 | f"HTTP error fetching Readability.js from {fetch_err.request.url}: {fetch_err.response.status_code}"
2281 | )
2282 | except httpx.RequestError as fetch_err:
2283 | logger.error(f"Network error fetching Readability.js: {fetch_err}")
2284 | except Exception as fetch_err:
2285 | logger.error(f"Failed to fetch/cache Readability.js: {fetch_err}", exc_info=True)
2286 |
2287 | # Inject the script if source code was successfully obtained (from cache or fetch)
2288 | if src:
2289 | # Wrap the source code to assign the Readability class to a window property
2290 | wrapped_src = f"window.__sbReadability = (() => {{ {src}; return Readability; }})();"
2291 | try:
2292 | await page.add_script_tag(content=wrapped_src)
2293 | logger.debug("Readability.js injected successfully.")
2294 | except PlaywrightException as e:
2295 | # Handle potential injection errors (e.g., Content Security Policy)
2296 | err_str = str(e)
2297 | if "Content Security Policy" in err_str:
2298 | page_url = page.url # Get URL for context
2299 | logger.warning(
2300 | f"Could not inject Readability.js due to Content Security Policy on {page_url}."
2301 | )
2302 | else:
2303 | logger.error(f"Failed to inject Readability.js script tag: {e}", exc_info=True)
2304 | except Exception as e:
2305 | logger.error(f"Unexpected error injecting Readability.js: {e}", exc_info=True)
2306 | else:
2307 | # Log if source couldn't be obtained
2308 | logger.warning("Failed to load or fetch Readability.js source. Proceeding without it.")
2309 |
2310 |
2311 | async def _dom_fingerprint(page: Page) -> str: # Uses global _dom_fp_limit_global
2312 | """Calculates a fingerprint of the page's visible text content."""
2313 | try:
2314 | # Evaluate JS to get the initial part of the body's innerText
2315 | js_expr = f"() => document.body.innerText.slice(0, {_dom_fp_limit_global})"
2316 | txt_content = await page.main_frame.evaluate(js_expr)
2317 |
2318 | # Ensure text is not None and strip whitespace
2319 | cleaned_txt = (txt_content or "").strip()
2320 |
2321 | # Encode the text to bytes (ignoring errors) and hash it
2322 | txt_bytes = cleaned_txt.encode("utf-8", "ignore")
2323 | hasher = hashlib.sha256(txt_bytes)
2324 | fingerprint = hasher.hexdigest()
2325 | return fingerprint
2326 |
2327 | except PlaywrightException as e:
2328 | # Log error if evaluation fails, return hash of empty string
2329 | logger.warning(f"Could not get text for DOM fingerprint: {e}")
2330 | empty_hash = hashlib.sha256(b"").hexdigest()
2331 | return empty_hash
2332 | except Exception as e:
2333 | # Catch unexpected errors
2334 | logger.error(f"Unexpected error calculating DOM fingerprint: {e}", exc_info=True)
2335 | empty_hash = hashlib.sha256(b"").hexdigest()
2336 | return empty_hash
2337 |
2338 |
2339 | def _shadow_deep_js() -> str: # Uses globals _max_widgets_global, _area_min_global
2340 | """JS function string to find elements, traversing shadow DOM."""
2341 | # This JS function is complex but self-contained. Keep as multi-line f-string.
2342 | # Relies on _max_widgets_global and _area_min_global from Python scope.
2343 | return f"""
2344 | (prefix) => {{
2345 | const MAX_ELEMENTS = {_max_widgets_global};
2346 | const MIN_ELEMENT_AREA = {_area_min_global};
2347 |
2348 | // --- Helper Functions ---
2349 | const isVisible = (el) => {{
2350 | if (!el || typeof el.getBoundingClientRect !== 'function') {{ return false; }}
2351 | try {{
2352 | // Check CSS visibility properties
2353 | const style = window.getComputedStyle(el);
2354 | if (style.display === 'none' || style.visibility === 'hidden' || parseFloat(style.opacity) === 0 || el.hidden) {{
2355 | return false;
2356 | }}
2357 | // Check if it has an offset parent (not detached or position:fixed parent hidden)
2358 | if (!el.offsetParent && style.position !== 'fixed') {{
2359 | return false;
2360 | }}
2361 |
2362 | // Check bounding box dimensions and position
2363 | const rect = el.getBoundingClientRect();
2364 | const hasPositiveSize = rect.width > 1 && rect.height > 1; // Needs some dimensions
2365 | const hasSufficientArea = (rect.width * rect.height) >= MIN_ELEMENT_AREA;
2366 |
2367 | // Check if it's within the viewport bounds (partially is sufficient)
2368 | const viewportHeight = window.innerHeight || document.documentElement.clientHeight;
2369 | const viewportWidth = window.innerWidth || document.documentElement.clientWidth;
2370 | const isInViewportVertically = rect.bottom > 0 && rect.top < viewportHeight;
2371 | const isInViewportHorizontally = rect.right > 0 && rect.left < viewportWidth;
2372 | const isOnscreen = isInViewportVertically && isInViewportHorizontally;
2373 |
2374 | // Combine checks: Must have size, be on screen, and either have min area or be a link/button.
2375 | return hasPositiveSize && isOnscreen && (hasSufficientArea || el.tagName === 'A' || el.tagName === 'BUTTON');
2376 | }} catch (e) {{
2377 | // Errors during checks mean we can't be sure, assume not visible
2378 | console.warn('Error in isVisible check:', e);
2379 | return false;
2380 | }}
2381 | }};
2382 |
2383 | const isInteractiveOrSignificant = (el) => {{
2384 | const tag = el.tagName.toLowerCase();
2385 | const role = (el.getAttribute('role') || '').toLowerCase();
2386 |
2387 | // Common interactive HTML tags
2388 | const interactiveTags = ['a', 'button', 'input', 'select', 'textarea', 'option', 'label', 'form', 'fieldset', 'details', 'summary', 'dialog', 'menu', 'menuitem'];
2389 | // Common interactive ARIA roles
2390 | const interactiveRoles = ['button', 'link', 'checkbox', 'radio', 'menuitem', 'tab', 'switch', 'option', 'searchbox', 'textbox', 'dialog', 'slider', 'spinbutton', 'combobox', 'listbox'];
2391 |
2392 | if (interactiveTags.includes(tag) || interactiveRoles.includes(role)) {{
2393 | return true;
2394 | }}
2395 |
2396 | // Check for explicit interaction handlers or attributes
2397 | if (el.onclick || el.href || el.getAttribute('tabindex') !== null || el.getAttribute('contenteditable') === 'true') {{
2398 | return true;
2399 | }}
2400 |
2401 | // Consider non-interactive containers with text content if they have sufficient area
2402 | if ((tag === 'div' || tag === 'section' || tag === 'article' || tag === 'main' || tag === 'span') && el.innerText && el.innerText.trim().length > 0) {{
2403 | try {{ const rect = el.getBoundingClientRect(); if (rect.width * rect.height >= MIN_ELEMENT_AREA) return true; }} catch(e) {{}}
2404 | }}
2405 |
2406 | // Consider images with alt text if they have sufficient area
2407 | if (tag === 'img' && el.alt && el.alt.trim().length > 0) {{
2408 | try {{ const rect = el.getBoundingClientRect(); if (rect.width * rect.height >= MIN_ELEMENT_AREA) return true; }} catch(e) {{}}
2409 | }}
2410 |
2411 | return false; // Default to not significant
2412 | }};
2413 |
2414 | const getElementText = (el) => {{
2415 | try {{
2416 | // Handle specific input types
2417 | if (el.tagName === 'INPUT') {{
2418 | const inputType = el.type.toLowerCase();
2419 | if (inputType === 'button' || inputType === 'submit' || inputType === 'reset') return el.value || '';
2420 | if (inputType === 'password') return 'Password input field'; // Don't expose value
2421 | // For other inputs, prioritize placeholder, then name, then type
2422 | return el.placeholder || el.name || el.getAttribute('aria-label') || inputType || '';
2423 | }}
2424 | if (el.tagName === 'TEXTAREA') {{
2425 | return el.placeholder || el.name || el.getAttribute('aria-label') || '';
2426 | }}
2427 | if (el.tagName === 'SELECT') {{
2428 | // Try associated label first
2429 | if (el.id) {{
2430 | const labels = document.querySelectorAll(`label[for="${{el.id}}"]`);
2431 | if (labels.length > 0 && labels[0].textContent) return labels[0].textContent.trim();
2432 | }}
2433 | return el.name || el.getAttribute('aria-label') || '';
2434 | }}
2435 | if (el.tagName === 'IMG') {{
2436 | return el.alt || ''; // Use alt text for images
2437 | }}
2438 | // Prefer aria-label if present
2439 | const ariaLabel = el.getAttribute('aria-label');
2440 | if (ariaLabel) return ariaLabel.trim();
2441 |
2442 | // Look for associated label via `for` attribute (if not already handled for select)
2443 | if (el.id && el.tagName !== 'SELECT') {{
2444 | const labels = document.querySelectorAll(`label[for="${{el.id}}"]`);
2445 | if (labels.length > 0 && labels[0].textContent) return labels[0].textContent.trim();
2446 | }}
2447 |
2448 | // Fallback to combined text content of direct children text nodes
2449 | let textContent = '';
2450 | for (const node of el.childNodes) {{
2451 | // Only include direct text node children
2452 | if (node.nodeType === Node.TEXT_NODE) {{
2453 | textContent += node.textContent;
2454 | }}
2455 | }}
2456 | textContent = textContent.trim();
2457 |
2458 | // If text node content is empty, fallback to innerText (which includes descendants)
2459 | if (!textContent) {{
2460 | textContent = el.innerText ? el.innerText.trim() : '';
2461 | }}
2462 |
2463 | // Limit text length? Maybe not here, handle later.
2464 | return textContent;
2465 |
2466 | }} catch (e) {{
2467 | console.warn('Error in getElementText:', e);
2468 | return ''; // Return empty string on error
2469 | }}
2470 | }};
2471 |
2472 | // --- Traversal Logic ---
2473 | const outputElements = [];
2474 | const queue = [document.documentElement]; // Start traversal from root
2475 | const visited = new Set(); // Keep track of visited nodes
2476 | let elementIndex = 0; // Counter for unique element IDs
2477 |
2478 | while (queue.length > 0 && outputElements.length < MAX_ELEMENTS) {{
2479 | const node = queue.shift(); // Get next node from queue
2480 |
2481 | if (!node || visited.has(node)) {{
2482 | continue; // Skip if node is null or already visited
2483 | }}
2484 | visited.add(node);
2485 |
2486 | // Process the node if it's interactive/significant and visible
2487 | if (isInteractiveOrSignificant(node) && isVisible(node)) {{
2488 | try {{
2489 | const rect = node.getBoundingClientRect();
2490 | // Assign a unique ID for referencing later
2491 | const elementId = `${{prefix || ''}}el_${{elementIndex++}}`;
2492 | node.dataset.sbId = elementId; // Store ID on the element itself
2493 |
2494 | // Collect element information
2495 | outputElements.push({{
2496 | id: elementId,
2497 | tag: node.tagName.toLowerCase(),
2498 | role: node.getAttribute("role") || "", // Get ARIA role
2499 | text: getElementText(node), // Get representative text
2500 | bbox: [ // Bounding box coordinates
2501 | Math.round(rect.x),
2502 | Math.round(rect.y),
2503 | Math.round(rect.width),
2504 | Math.round(rect.height)
2505 | ]
2506 | }});
2507 | }} catch (e) {{
2508 | console.warn('Error processing element:', node, e);
2509 | }}
2510 | }}
2511 |
2512 | // --- Queue Children for Traversal ---
2513 | // Check for Shadow DOM children first
2514 | if (node.shadowRoot) {{
2515 | const shadowChildren = node.shadowRoot.children;
2516 | if (shadowChildren) {{
2517 | for (let i = 0; i < shadowChildren.length; i++) {{
2518 | if (!visited.has(shadowChildren[i])) {{
2519 | queue.push(shadowChildren[i]);
2520 | }}
2521 | }}
2522 | }}
2523 | }}
2524 | // Check for regular children
2525 | else if (node.children) {{
2526 | const children = node.children;
2527 | for (let i = 0; i < children.length; i++) {{
2528 | if (!visited.has(children[i])) {{
2529 | queue.push(children[i]);
2530 | }}
2531 | }}
2532 | }}
2533 |
2534 | // Check for IFRAME content document
2535 | if (node.tagName === 'IFRAME') {{
2536 | try {{
2537 | // Access contentDocument carefully due to potential cross-origin restrictions
2538 | if (node.contentDocument && node.contentDocument.documentElement) {{
2539 | if (!visited.has(node.contentDocument.documentElement)) {{
2540 | queue.push(node.contentDocument.documentElement);
2541 | }}
2542 | }}
2543 | }} catch (iframeError) {{
2544 | console.warn('Could not access iframe content:', node.src || '[no src]', iframeError.message);
2545 | }}
2546 | }}
2547 | }} // End while loop
2548 |
2549 | return outputElements; // Return the collected element data
2550 | }}
2551 | """
2552 |
2553 |
2554 | async def _build_page_map(
2555 | page: Page,
2556 | ) -> Tuple[
2557 | Dict[str, Any], str
2558 | ]: # Uses globals _max_section_chars_global, _max_widgets_global, _log
2559 | """Builds a structured representation (map) of the current page content and elements."""
2560 | # Calculate fingerprint first to check cache
2561 | fp = await _dom_fingerprint(page)
2562 |
2563 | # Check if cached map exists on the page object for the current fingerprint
2564 | if hasattr(page, "_sb_page_map") and hasattr(page, "_sb_fp") and page._sb_fp == fp:
2565 | logger.debug(f"Using cached page map for {page.url} (FP: {fp[:8]}...).")
2566 | cached_map = page._sb_page_map
2567 | return cached_map, fp
2568 |
2569 | logger.debug(f"Building new page map for {page.url} (FP: {fp[:8]}...).")
2570 | # Initialize map components
2571 | await _ensure_readability(page) # Ensure Readability.js is available
2572 | main_txt = ""
2573 | elems: List[Dict[str, Any]] = []
2574 | page_title = "[Error Getting Title]"
2575 |
2576 | try:
2577 | # 1. Extract Main Text Content
2578 | html_content = await page.content()
2579 | if html_content:
2580 | # Try Readability first
2581 | extracted_text = await page.evaluate(_READ_JS_WRAPPER, html_content)
2582 | main_txt = extracted_text or ""
2583 |
2584 | # Fallback if Readability yields short content
2585 | if len(main_txt) < 200:
2586 | logger.debug("Readability text short (<200 chars), trying basic text extraction.")
2587 |
2588 | # Define the synchronous extraction helper locally
2589 | def extract_basic_text(html_str):
2590 | try:
2591 | # Limit HTML size processed by BeautifulSoup
2592 | max_html_size = 3 * 1024 * 1024
2593 | limited_html = html_str[:max_html_size]
2594 | soup = BeautifulSoup(limited_html, "lxml")
2595 | # Remove common non-content tags before text extraction
2596 | tags_to_remove = [
2597 | "script",
2598 | "style",
2599 | "nav",
2600 | "footer",
2601 | "header",
2602 | "aside",
2603 | "form",
2604 | "figure",
2605 | ]
2606 | found_tags = soup(tags_to_remove)
2607 | for tag in found_tags:
2608 | tag.decompose()
2609 | # Get text, join with spaces, strip extra whitespace
2610 | basic_text = soup.get_text(" ", strip=True)
2611 | return basic_text
2612 | except Exception as bs_err:
2613 | logger.warning(f"Basic text extraction with BeautifulSoup failed: {bs_err}")
2614 | return "" # Return empty on error
2615 |
2616 | # Run the sync extraction in the thread pool
2617 | loop = asyncio.get_running_loop()
2618 | pool = _get_pool()
2619 | fallback_text = await loop.run_in_executor(pool, extract_basic_text, html_content)
2620 | main_txt = fallback_text # Use fallback result
2621 |
2622 | # Limit the length of the extracted main text
2623 | main_txt = main_txt[:_max_section_chars_global]
2624 | else:
2625 | logger.warning(f"Failed to get HTML content for page map on {page.url}.")
2626 |
2627 | # 2. Extract Interactive Elements (across all frames)
2628 | js_func = _shadow_deep_js() # Get the JS function string
2629 | all_extracted_elems = []
2630 | all_frames = page.frames
2631 | for i, frame in enumerate(all_frames):
2632 | if frame.is_detached():
2633 | logger.debug(f"Skipping detached frame {i}.")
2634 | continue
2635 | frame_url_short = (frame.url or "unknown")[:80]
2636 | try:
2637 | # Evaluate element extraction JS in the frame with timeout
2638 | frame_prefix = f"f{i}:" # Prefix IDs with frame index
2639 | frame_elems = await asyncio.wait_for(
2640 | frame.evaluate(js_func, frame_prefix), timeout=5.0
2641 | )
2642 | all_extracted_elems.extend(frame_elems)
2643 | # Log extraction count per frame *only if* elements were found
2644 | if frame_elems:
2645 | logger.debug(
2646 | f"Extracted {len(frame_elems)} elements from frame {i} ({frame_url_short})."
2647 | )
2648 | except PlaywrightTimeoutError:
2649 | logger.warning(f"Timeout evaluating elements in frame {i} ({frame_url_short})")
2650 | except PlaywrightException as e:
2651 | # Be more specific about error logging - avoid logging full exception in normal operation unless debug level
2652 | logger.warning(
2653 | f"Playwright error evaluating elements in frame {i} ({frame_url_short}): {type(e).__name__}"
2654 | )
2655 | logger.debug(
2656 | f"Full PlaywrightException in frame {i}: {e}", exc_info=False
2657 | ) # Log full exception only at debug
2658 | except Exception as e:
2659 | logger.error(
2660 | f"Unexpected error evaluating elements in frame {i} ({frame_url_short}): {e}",
2661 | exc_info=True, # Log full traceback for unexpected errors
2662 | )
2663 |
2664 | # Limit the total number of elements stored
2665 | elems = all_extracted_elems[:_max_widgets_global]
2666 | logger.debug(
2667 | f"Total elements extracted: {len(all_extracted_elems)}, stored (limited): {len(elems)}"
2668 | ) # Log total and limited count
2669 |
2670 | # 3. Get Page Title
2671 | try:
2672 | page_title_raw = await page.title()
2673 | page_title = page_title_raw.strip() if page_title_raw else "[No Title]"
2674 | except PlaywrightException as title_err:
2675 | logger.warning(f"Could not get page title for {page.url}: {title_err}")
2676 | # Keep default error title
2677 |
2678 | except PlaywrightException as e:
2679 | logger.error(
2680 | f"Could not build page map for {page.url}: Playwright error: {e}", exc_info=True
2681 | )
2682 | except Exception as e:
2683 | logger.error(f"Unexpected error building page map for {page.url}: {e}", exc_info=True)
2684 |
2685 | # Removed the specific logging block that depended on URL_BOOKSTORE
2686 |
2687 | # Assemble the final page map dictionary
2688 | page_map = {
2689 | "url": page.url,
2690 | "title": page_title,
2691 | "main_text": main_txt,
2692 | "elements": elems, # Contains the limited list of elements
2693 | }
2694 |
2695 | # Cache the newly built map and its fingerprint on the page object
2696 | page._sb_page_map = page_map
2697 | page._sb_fp = fp
2698 | logger.debug(f"Page map built and cached for {page.url}.")
2699 |
2700 | return page_map, fp
2701 |
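# A minimal sketch of the page-map structure assembled above, shown with invented
# values. Each "elements" entry carries the data-sb-id assigned during extraction
# plus a tag/role/text summary; frame-scoped IDs use an "f<index>:" prefix.
_EXAMPLE_PAGE_MAP: Dict[str, Any] = {
    "url": "https://example.com/",
    "title": "Example Domain",
    "main_text": "Example Domain. This domain is for use in illustrative examples.",
    "elements": [
        {"id": "f0:el_0", "tag": "a", "role": "link", "text": "More information"},
        {"id": "f1:el_3", "tag": "button", "role": "button", "text": "Accept"},
    ],
}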
2702 |
2703 | _SM_GLOBAL = difflib.SequenceMatcher(autojunk=False)
2704 |
2705 |
2706 | def _ratio(a: str, b: str) -> float: # Keep as is
2707 | """Calculate similarity ratio between two strings using SequenceMatcher."""
2708 | if not a or not b:
2709 | return 0.0
2710 | # Set sequences for the global matcher instance
2711 | _SM_GLOBAL.set_seqs(a, b)
2712 | # Calculate and return the ratio
2713 | similarity_ratio = _SM_GLOBAL.ratio()
2714 | return similarity_ratio
2715 |
2716 |
2717 | def _heuristic_pick(
2718 | pm: Dict[str, Any], hint: str, role: Optional[str]
2719 | ) -> Optional[str]: # Uses global _seq_cutoff_global
2720 | """Finds the best element ID based on text similarity and heuristics."""
2721 | # Basic validation
2722 | if not hint or not pm or not pm.get("elements"):
2723 | return None
2724 |
2725 | # Normalize hint text (Unicode normalization and lowercase)
2726 | h_norm = unicodedata.normalize("NFC", hint).lower()
2727 | best_id: Optional[str] = None
2728 | best_score: float = -1.0
2729 | target_role_lower = role.lower() if role else None
2730 |
2731 | elements_list = pm.get("elements", [])
2732 | for e in elements_list:
2733 | if not e or not isinstance(e, dict):
2734 | continue # Skip invalid element entries
2735 |
2736 | el_id = e.get("id")
2737 | el_text_raw = e.get("text", "")
2738 | el_role_raw = e.get("role", "")
2739 | el_tag_raw = e.get("tag", "")
2740 |
2741 | if not el_id:
2742 | continue # Skip elements without our assigned ID
2743 |
2744 | # Normalize element text
2745 | el_text_norm = unicodedata.normalize("NFC", el_text_raw).lower()
2746 | el_role_lower = el_role_raw.lower()
2747 | el_tag_lower = el_tag_raw.lower()
2748 |
2749 | # Role filtering (if role specified)
2750 | # Allow matching button tag if role is button
2751 | is_role_match = target_role_lower == el_role_lower
2752 | is_button_match = target_role_lower == "button" and el_tag_lower == "button"
2753 | if target_role_lower and not is_role_match and not is_button_match:
2754 | continue # Skip if role doesn't match
2755 |
2756 | # --- Calculate Score ---
2757 | # Base score: Text similarity
2758 | score = _ratio(h_norm, el_text_norm)
2759 |
2760 | # Bonus: Exact role match
2761 | if target_role_lower and is_role_match:
2762 | score += 0.1
2763 |
2764 | # Bonus: Keyword matching (e.g., hint mentions "button" and element is button/role=button)
2765 | hint_keywords = {
2766 | "button",
2767 | "submit",
2768 | "link",
2769 | "input",
2770 | "download",
2771 | "checkbox",
2772 | "radio",
2773 | "tab",
2774 | "menu",
2775 | }
2776 | element_keywords = {el_role_lower, el_tag_lower}
2777 | # Find hint keywords present in the hint text itself
2778 | hint_words_in_hint = set()
2779 | split_hint = h_norm.split()
2780 | for w in split_hint:
2781 | if w in hint_keywords:
2782 | hint_words_in_hint.add(w)
2783 | # Check for intersection between keywords in hint and element's keywords
2784 | common_keywords = hint_words_in_hint.intersection(element_keywords)
2785 | if common_keywords:
2786 | score += 0.15
2787 |
2788 | # Bonus: Hint likely refers to label/placeholder and element seems related
2789 | has_label_hints = "label for" in h_norm or "placeholder" in h_norm
2790 | if has_label_hints and score > 0.6: # Apply only if base similarity is decent
2791 | score += 0.1
2792 |
2793 | # Penalty: Very short element text compared to a long hint
2794 | is_short_text = len(el_text_raw) < 5
2795 | is_long_hint = len(hint) > 10
2796 | if is_short_text and is_long_hint:
2797 | score -= 0.1
2798 |
2799 | # Penalty: Generic container tags without a specific role
2800 | is_generic_container = el_tag_lower in ("div", "span")
2801 | has_no_role = not el_role_lower
2802 | if is_generic_container and has_no_role:
2803 | score -= 0.05
2804 | # --- End Score Calculation ---
2805 |
2806 | # Update best match if current score is higher
2807 | if score > best_score:
2808 | best_id = el_id
2809 | best_score = score
2810 |
2811 | # Return the best ID found if the score meets the cutoff threshold
2812 | if best_score >= _seq_cutoff_global:
2813 | return best_id
2814 | else:
2815 | return None
2816 |
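# A minimal sketch of how _heuristic_pick scores an invented page map. The element
# data and hint below are examples only; whether an ID is actually returned depends
# on the configured _seq_cutoff_global threshold.
def _example_heuristic_pick() -> Optional[str]:
    example_map = {
        "elements": [
            {"id": "el_1", "tag": "a", "role": "link", "text": "Home"},
            {"id": "el_2", "tag": "button", "role": "button", "text": "Submit order"},
        ]
    }
    # "el_2" gets the role-match and keyword bonuses on top of its text similarity,
    # so it is the expected pick for this hint if its score clears the cutoff.
    return _heuristic_pick(example_map, "click the submit order button", "button")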
2817 |
2818 | async def _llm_pick(
2819 | pm: Dict[str, Any], task_hint: str, attempt: int
2820 | ) -> Optional[str]: # Uses global _llm_model_locator_global
2821 | """Asks the LLM to pick the best element ID for a given task hint."""
2822 | if not pm or not task_hint:
2823 | logger.warning("LLM pick skipped: Missing page map or task hint.")
2824 | return None
2825 |
2826 | # Prepare summary of elements for the LLM prompt
2827 | elements_summary = []
2828 | elements_list = pm.get("elements", [])
2829 | for el in elements_list:
2830 | el_id = el.get("id")
2831 | el_tag = el.get("tag")
2832 | el_role = el.get("role", " ") # Use space if empty for formatting
2833 | el_text = el.get("text", " ") # Use space if empty
2834 | # Truncate long text for the prompt
2835 | max_text_len = 80
2836 | truncated_text = el_text[:max_text_len] + ("..." if len(el_text) > max_text_len else "")
2837 | # Format summary string
2838 | summary_str = f"id={el_id} tag={el_tag} role='{el_role}' text='{truncated_text}'"
2839 | elements_summary.append(summary_str)
2840 |
2841 | # System prompt defining the task
2842 | system_prompt = textwrap.dedent("""
2843 | You are an expert web automation assistant. Your task is to identify the single best HTML element ID from the provided list that corresponds to the user's request.
2844 | Analyze the user's task hint and the list of elements (with their ID, tag, role, and text).
2845 | Choose the element ID (e.g., "el_12" or "f0:el_5") that is the most likely target for the user's action.
2846 | Consider the element's text, role, tag, and the user's likely intent.
2847 | If multiple elements seem possible, prioritize elements with clear interactive roles (button, link, input, etc.) or specific text matches.
2848 | If no element is a clear match for the task hint, respond with `{"id": null}`.
2849 | Respond ONLY with a JSON object containing the chosen element ID under the key "id". Example: `{"id": "el_42"}` or `{"id": "f1:el_10"}` or `{"id": null}`. Do NOT include explanations or markdown formatting.
2850 | """).strip()
2851 |
2852 | # User prompt containing the context and request
2853 | elements_str = "\n".join(elements_summary)
2854 | user_prompt = textwrap.dedent(f"""
2855 | Page Title: {pm.get("title", "[No Title]")}
2856 | Page URL: {pm.get("url", "[No URL]")}
2857 |
2858 | Available Elements:
2859 | {elements_str}
2860 |
2861 | User Task Hint: "{task_hint}"
2862 | Attempt Number: {attempt}
2863 |
2864 | Based on the task hint and element list, which element ID should be targeted?
2865 | Respond ONLY with a JSON object containing the 'id' (string or null).
2866 | """).strip()
2867 |
2868 | # Prepare messages for the LLM call
2869 | msgs = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
2870 |
2871 | # Call the LLM, expecting a JSON response
2872 | res = await _call_llm(
2873 | msgs,
2874 | model=_llm_model_locator_global, # Use configured model
2875 | expect_json=True,
2876 | temperature=0.0, # Low temperature for deterministic selection
2877 | max_tokens=100, # Should be enough for {"id": "fX:el_YYY"}
2878 | )
2879 |
2880 | # Process the LLM response
2881 | if isinstance(res, dict):
2882 | if "id" in res:
2883 | el_id = res.get("id")
2884 | # Validate the format of the returned ID (string starting with el_ or f*:el_, or null)
2885 | is_valid_null = el_id is None
2886 | is_valid_string_format = isinstance(el_id, str) and re.match(
2887 | r"^(?:f\d+:)?el_\d+$", el_id
2888 | )
2889 | if is_valid_null or is_valid_string_format:
2890 | if el_id:
2891 | logger.debug(
2892 | f"LLM picked ID: {el_id} for hint '{task_hint}' (Attempt {attempt})"
2893 | )
2894 | else:
2895 | logger.debug(
2896 | f"LLM explicitly picked null ID for hint '{task_hint}' (Attempt {attempt})"
2897 | )
2898 | return el_id
2899 | else:
2900 | # Log warning if ID format is invalid
2901 | logger.warning(
2902 | f"LLM returned invalid ID format: {el_id} for hint '{task_hint}' (Attempt {attempt})"
2903 | )
2904 | return None # Treat invalid format as no pick
2905 | elif "error" in res:
2906 | # Log error if LLM call failed
2907 | error_msg = res["error"]
2908 | logger.warning(
2909 | f"LLM picker failed for hint '{task_hint}' (Attempt {attempt}): {error_msg}"
2910 | )
2911 | return None # Treat LLM error as no pick
2912 | else:
2913 | # Log warning if response dictionary format is unexpected
2914 | logger.warning(
2915 | f"LLM picker returned unexpected dict format: {res.keys()} for hint '{task_hint}' (Attempt {attempt})"
2916 | )
2917 | return None # Treat unexpected format as no pick
2918 | else:
2919 | # Log warning if the response is not a dictionary
2920 | res_type = type(res).__name__
2921 | logger.warning(
2922 | f"LLM picker returned unexpected response type: {res_type} for hint '{task_hint}' (Attempt {attempt})"
2923 | )
2924 | return None # Treat unexpected type as no pick
2925 |
2926 |
2927 | async def _loc_from_id(page: Page, el_id: str) -> Locator: # Keep as is
2928 | """Gets a Playwright Locator object from a data-sb-id attribute."""
2929 | if not el_id:
2930 | raise ValueError("Element ID cannot be empty when creating locator.")
2931 |
2932 | # Escape the ID for use in CSS selector (esp. if ID contains quotes or backslashes)
2933 | # Double backslashes for Python string literal, then double again for CSS escaping
2934 | escaped_id_inner = el_id.replace("\\", "\\\\").replace('"', '\\"')
2935 | selector = f'[data-sb-id="{escaped_id_inner}"]'
2936 |
2937 | # Check if the ID indicates a specific frame (e.g., "f0:el_12")
2938 | if ":" in el_id and el_id.startswith("f"):
2939 | try:
2940 | frame_prefix, element_part = el_id.split(":", 1)
2941 | frame_index_str = frame_prefix[1:] # Get the number part after 'f'
2942 | frame_index = int(frame_index_str)
2943 | all_frames = page.frames
2944 | if 0 <= frame_index < len(all_frames):
2945 | target_frame = all_frames[frame_index]
2946 | # Return the locator within the specified frame
2947 | locator_in_frame = target_frame.locator(selector).first
2948 | return locator_in_frame
2949 | else:
2950 | # Log warning if frame index is out of bounds, fallback to main frame
2951 | logger.warning(
2952 | f"Frame index {frame_index} from ID '{el_id}' is out of bounds (0-{len(all_frames) - 1}). Falling back to main frame search."
2953 | )
2954 | except (ValueError, IndexError) as e:
2955 | # Log warning if parsing fails, fallback to main frame
2956 | logger.warning(
2957 | f"Could not parse frame index from ID '{el_id}'. Falling back to main frame search. Error: {e}"
2958 | )
2959 |
2960 | # Default: return locator in the main frame
2961 | locator_in_main = page.locator(selector).first
2962 | return locator_in_main
2963 |
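# A minimal sketch of how frame-scoped IDs resolve. "f2:el_7" targets the element
# tagged data-sb-id="f2:el_7" inside page.frames[2]; a plain "el_3" is looked up in
# the main frame. The IDs here are invented examples.
async def _example_loc_from_id(page: Page) -> None:
    main_frame_locator = await _loc_from_id(page, "el_3")
    iframe_locator = await _loc_from_id(page, "f2:el_7")
    # Both are ordinary Playwright Locators supporting the usual actions.
    await main_frame_locator.hover()
    await iframe_locator.click()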
2964 |
2965 | # --- Enhanced Locator (as a helper class, not a tool itself) ---
2966 | class EnhancedLocator: # Keep class, but it's used INTERNALLY by standalone functions
2967 | """Unified locator using cache, heuristics, and LLM fallback."""
2968 |
2969 | def __init__(self, page: Page):
2970 | self.page = page
2971 | # Determine site identifier from URL for caching
2972 | self.site = "unknown"
2973 | try:
2974 | page_url = page.url or "" # Handle case where URL might be None/empty
2975 | parsed = urlparse(page_url)
2976 | netloc_raw = parsed.netloc.lower()
2977 | # Remove www. prefix if present
2978 | netloc_clean = netloc_raw.replace("www.", "")
2979 | # Use cleaned netloc, fallback to 'unknown' if empty
2980 | self.site = netloc_clean or "unknown"
2981 | except Exception as e:
2982 | logger.warning(f"Error parsing site from URL '{page.url}' for EnhancedLocator: {e}")
2983 | # Keep self.site as "unknown"
2984 | pass
2985 | # Internal cache for page map and fingerprint for current instance lifecycle
2986 | self._pm: Optional[Dict[str, Any]] = None
2987 | self._pm_fp: Optional[str] = None
2988 | # Timestamp for throttling network idle checks
2989 | self._last_idle_check: float = 0.0
2990 |
2991 |     async def _maybe_wait_for_idle(self, timeout: float = 1.5):  # Throttled via self._last_idle_check
2992 | """Waits for network idle state, throttled to avoid excessive waits."""
2993 | now = time.monotonic()
2994 | time_since_last_check = now - self._last_idle_check
2995 | # Only check if enough time has passed since the last check
2996 | if time_since_last_check > 1.0: # Check at most once per second
2997 | try:
2998 | # Wait for network to be idle for a short period
2999 | timeout_ms = int(timeout * 1000)
3000 | await self.page.wait_for_load_state("networkidle", timeout=timeout_ms)
3001 | self._last_idle_check = time.monotonic() # Update timestamp on success
3002 | except PlaywrightException:
3003 | # Ignore timeout or other errors, just update timestamp
3004 | self._last_idle_check = time.monotonic()
3005 |
3006 | async def _get_page_map(self) -> Tuple[Dict[str, Any], str]: # Calls global _build_page_map
3007 | """Gets the current page map, potentially building it if needed."""
3008 | # Short wait/idle check before building map to allow dynamic content to settle
3009 | await self._maybe_wait_for_idle()
3010 | sleep_duration = random.uniform(0.1, 0.25) # Small random delay
3011 | await asyncio.sleep(sleep_duration)
3012 |
3013 | # Build the page map (which includes fingerprint check internally)
3014 | pm, fp = await _build_page_map(self.page)
3015 |
3016 | # Store locally for potential reuse within this instance lifecycle
3017 | self._pm = pm
3018 | self._pm_fp = fp
3019 | return pm, fp
3020 |
3021 | async def _selector_cached(
3022 | self, key: str, fp: str
3023 | ) -> Optional[Locator]: # Calls global _cache_get_sync, _log
3024 | """Checks cache for a selector, validates it, and returns Locator if valid."""
3025 | loop = asyncio.get_running_loop()
3026 | pool = _get_pool()
3027 | # Perform synchronous cache read in thread pool
3028 | sel = await loop.run_in_executor(pool, _cache_get_sync, key, fp)
3029 |
3030 | if sel:
3031 | logger.debug(f"Cache hit for key prefix {key[:8]}. Selector: '{sel}'")
3032 | try:
3033 | # Extract element ID from selector string like '[data-sb-id="f0:el_12"]'
3034 | match = re.search(r'data-sb-id="([^"]+)"', sel)
3035 | if not match:
3036 | logger.warning(
3037 | f"Cached selector '{sel}' has unexpected format. Ignoring cache."
3038 | )
3039 | return None
3040 | loc_id = match.group(1)
3041 |
3042 | # Get the Playwright Locator object using the ID
3043 | loc = await _loc_from_id(self.page, loc_id)
3044 |
3045 | # Quick check if the element is visible (short timeout)
3046 | await loc.wait_for(state="visible", timeout=500) # 500ms check
3047 |
3048 | # Log cache hit and return the valid locator
3049 | log_key = key[:8]
3050 | await _log("locator_cache_hit", selector=sel, key=log_key)
3051 | return loc
3052 | except (PlaywrightException, ValueError) as e:
3053 | # Log if cached selector is no longer valid/visible or ID parsing fails
3054 | logger.debug(
3055 | f"Cached selector '{sel}' failed visibility/location check. Error: {e}"
3056 | )
3057 | # Consider deleting the stale cache entry here?
3058 | # await loop.run_in_executor(pool, _cache_delete_sync, key) # Potentially aggressive
3059 | return None # Cache miss or invalid cached selector
3060 |
3061 | async def locate(
3062 | self, task_hint: str, *, role: Optional[str] = None, timeout: int = 5000
3063 | ) -> (
3064 | Locator
3065 | ): # Uses globals _retry_after_fail_global, _log, _get_pool, _cache_put_sync, _llm_pick
3066 | """
3067 | Finds the best Locator for a task hint using cache, heuristics, LLM, and smarter fallbacks.
3068 |
3069 | Args:
3070 | task_hint: Natural language description of the element to locate.
3071 | role: Optional specific ARIA role to filter potential matches.
3072 | timeout: Maximum time in milliseconds to find the element.
3073 |
3074 | Returns:
3075 | A Playwright Locator object pointing to the best match found.
3076 |
3077 | Raises:
3078 | ValueError: If task_hint is empty.
3079 | PlaywrightTimeoutError: If no suitable element is found within the timeout across all methods.
3080 | ToolError: For internal errors during location.
3081 | """
3082 | if not task_hint or not task_hint.strip():
3083 | raise ValueError("locate requires a non-empty 'task_hint'")
3084 |
3085 | start_time = time.monotonic()
3086 | timeout_sec = timeout / 1000.0
3087 | loop = asyncio.get_running_loop()
3088 | pool = _get_pool()
3089 |
3090 | # --- 1. Generate Cache Key ---
3091 | page_url = self.page.url or ""
3092 | parsed_url = urlparse(page_url)
3093 | path = parsed_url.path or "/"
3094 | # Normalize hint and role for cache key stability
3095 | normalized_hint = unicodedata.normalize("NFC", task_hint).lower().strip()
3096 | normalized_role = role.lower().strip() if role else None
3097 | key_data = {
3098 | "site": self.site,
3099 | "path": path,
3100 | "hint": normalized_hint,
3101 | "role": normalized_role,
3102 | }
3103 | key_src = json.dumps(key_data, sort_keys=True)
3104 | key_src_bytes = key_src.encode("utf-8")
3105 | cache_key = hashlib.sha256(key_src_bytes).hexdigest()
3106 | key_preview = cache_key[:8]
3107 | log_prefix = (
3108 | f"EnhancedLocator(key={key_preview}, hint='{task_hint[:50]}...', role='{role}')"
3109 | )
3110 | logger.debug(f"{log_prefix}: Initiating locate.")
3111 |
3112 | # --- 2. Check Cache with Current DOM Fingerprint ---
3113 | logger.debug(f"{log_prefix}: Checking cache...")
3114 | current_dom_fp = await _dom_fingerprint(self.page)
3115 | logger.debug(f"{log_prefix}: Current DOM FP: {current_dom_fp[:12]}...")
3116 | try:
3117 | cached_loc = await self._selector_cached(cache_key, current_dom_fp)
3118 | if cached_loc:
3119 | logger.info(f"{log_prefix}: Cache HIT.")
3120 | await _log(
3121 | "locator_success", hint=task_hint, role=role, method="cache", key=key_preview
3122 | )
3123 | return cached_loc
3124 | else:
3125 | logger.debug(f"{log_prefix}: Cache MISS.")
3126 | except Exception as cache_err:
3127 | logger.warning(f"{log_prefix}: Error checking cache: {cache_err}")
3128 |
3129 | # --- 3. Cache Miss: Get Page Map and Try Heuristics ---
3130 | logger.debug(f"{log_prefix}: Trying heuristics...")
3131 | try:
3132 | (
3133 | pm,
3134 | current_dom_fp,
3135 | ) = await self._get_page_map() # Get map (updates fingerprint if changed)
3136 | map_keys = list(pm.keys()) if pm else []
3137 | num_elements = len(pm.get("elements", [])) if pm else 0
3138 | logger.debug(
3139 | f"{log_prefix}: Page map obtained. FP={current_dom_fp[:8]}, Keys={map_keys}, Elements={num_elements}"
3140 | )
3141 |
3142 | heuristic_id = _heuristic_pick(pm, task_hint, role)
3143 | logger.debug(f"{log_prefix}: Heuristic pick result ID: '{heuristic_id}'")
3144 |
3145 | if heuristic_id:
3146 | try:
3147 | logger.debug(f"{log_prefix}: Validating heuristic pick ID '{heuristic_id}'...")
3148 | loc = await _loc_from_id(self.page, heuristic_id)
3149 | await loc.scroll_into_view_if_needed(timeout=2000)
3150 | wait_timeout_heur = max(1000, timeout // 3) # Use portion of timeout
3151 | logger.debug(
3152 | f"{log_prefix}: Waiting for heuristic element visibility ({wait_timeout_heur}ms)..."
3153 | )
3154 | await loc.wait_for(state="visible", timeout=wait_timeout_heur)
3155 | logger.info(f"{log_prefix}: Heuristic pick VALIDATED (ID: {heuristic_id}).")
3156 |
3157 | # Cache the successful heuristic result
3158 | selector_str = f'[data-sb-id="{heuristic_id}"]'
3159 | await loop.run_in_executor(
3160 | pool, _cache_put_sync, cache_key, selector_str, current_dom_fp
3161 | )
3162 | await _log(
3163 | "locator_heuristic_match", selector=heuristic_id, hint=task_hint, role=role
3164 | )
3165 | await _log(
3166 | "locator_success",
3167 | hint=task_hint,
3168 | role=role,
3169 | method="heuristic",
3170 | selector=heuristic_id,
3171 | )
3172 | return loc
3173 | except (PlaywrightException, ValueError) as e_heur_val:
3174 | logger.debug(
3175 | f"{log_prefix}: Heuristic pick '{heuristic_id}' validation FAILED. Error: {e_heur_val}"
3176 | )
3177 | # Continue to LLM fallback
3178 | except Exception as map_heur_err:
3179 | logger.warning(
3180 | f"{log_prefix}: Error during page map or heuristic processing: {map_heur_err}"
3181 | )
3182 | # Ensure pm is defined for LLM step, even if empty
3183 | pm = pm if "pm" in locals() else {}
3184 | current_dom_fp = (
3185 | current_dom_fp
3186 | if "current_dom_fp" in locals()
3187 | else await _dom_fingerprint(self.page)
3188 | )
3189 |
3190 | # --- 4. Heuristic Failed: Try LLM Picker (with retries) ---
3191 | logger.debug(f"{log_prefix}: Trying LLM picker...")
3192 | num_llm_attempts = 1 + _retry_after_fail_global
3193 | for att in range(1, num_llm_attempts + 1):
3194 | elapsed_sec = time.monotonic() - start_time
3195 | if elapsed_sec >= timeout_sec:
3196 | logger.warning(f"{log_prefix}: Timeout reached before completing LLM attempts.")
3197 | break # Break loop, proceed to fallback
3198 |
3199 | logger.debug(f"{log_prefix}: LLM pick attempt {att}/{num_llm_attempts}...")
3200 | # Ensure page map 'pm' is available from heuristic step or refreshed
3201 | if not pm or (
3202 | "error" in pm and att > 1
3203 | ): # Refresh if map invalid or after first attempt
3204 | logger.debug(f"{log_prefix}: Refreshing page map before LLM attempt {att}...")
3205 | try:
3206 | pm, current_dom_fp = await self._get_page_map()
3207 | logger.debug(f"{log_prefix}: Page map refreshed. FP={current_dom_fp[:8]}.")
3208 | except Exception as map_refresh_err:
3209 | logger.warning(
3210 | f"{log_prefix}: Failed to refresh page map for LLM attempt {att}: {map_refresh_err}"
3211 | )
3212 | # Try proceeding without map refresh? Or break? Let's break to avoid confusing LLM.
3213 | break
3214 |
3215 | llm_id = await _llm_pick(pm, task_hint, att)
3216 | logger.debug(f"{log_prefix}: LLM pick result (Attempt {att}): ID='{llm_id}'")
3217 |
3218 | if not llm_id:
3219 | logger.debug(f"{log_prefix}: LLM pick attempt {att} returned no ID.")
3220 | if att < num_llm_attempts:
3221 | continue # Refresh happens at start of next loop iteration if needed
3222 | else:
3223 | break # Last LLM attempt failed, proceed to fallback
3224 |
3225 | # LLM returned an ID, try to validate it
3226 | try:
3227 | logger.debug(f"{log_prefix}: Validating LLM pick ID '{llm_id}' (Attempt {att})...")
3228 | loc = await _loc_from_id(self.page, llm_id)
3229 | try: # Log outerHTML for debugging LLM picks
3230 | loc_llm_outer_html = await loc.evaluate(
3231 | "element => element.outerHTML", timeout=500
3232 | )
3233 | logger.debug(
3234 | f"{log_prefix}: LLM picked element outerHTML: {loc_llm_outer_html[:200]}..."
3235 | )
3236 | except Exception as eval_err:
3237 | logger.debug(
3238 | f"{log_prefix}: Error getting outerHTML for LLM pick {llm_id}: {eval_err}"
3239 | )
3240 |
3241 | await loc.scroll_into_view_if_needed(timeout=2000)
3242 | elapsed_now_sec = time.monotonic() - start_time
3243 | remaining_timeout_ms = max(500, timeout - int(elapsed_now_sec * 1000))
3244 | if remaining_timeout_ms <= 0:
3245 | raise PlaywrightTimeoutError("Timeout before LLM validation wait.")
3246 | logger.debug(
3247 | f"{log_prefix}: Waiting for LLM element visibility ({remaining_timeout_ms}ms)..."
3248 | )
3249 | await loc.wait_for(state="visible", timeout=remaining_timeout_ms)
3250 | logger.info(f"{log_prefix}: LLM pick VALIDATED (ID: {llm_id}, Attempt {att}).")
3251 |
3252 | # Cache the successful LLM result
3253 | selector_str = f'[data-sb-id="{llm_id}"]'
3254 | await loop.run_in_executor(
3255 | pool, _cache_put_sync, cache_key, selector_str, current_dom_fp
3256 | )
3257 | await _log(
3258 | "locator_llm_pick", selector=llm_id, attempt=att, hint=task_hint, role=role
3259 | )
3260 | await _log(
3261 | "locator_success",
3262 | hint=task_hint,
3263 | role=role,
3264 | method="llm",
3265 | selector=llm_id,
3266 | attempt=att,
3267 | )
3268 | return loc
3269 | except (PlaywrightException, ValueError) as e_llm_val:
3270 | logger.debug(
3271 | f"{log_prefix}: LLM pick '{llm_id}' (attempt {att}) validation FAILED. Error: {e_llm_val}"
3272 | )
3273 | # Continue to next LLM attempt loop iteration (map refresh handled at loop start)
3274 |
3275 | # --- 5. LLM Failed: Try Fallback Selectors ---
3276 | logger.debug(f"{log_prefix}: Trying fallback selectors...")
3277 |
3278 | fallback_strategies = [
3279 | (
3280 | "placeholder",
3281 | f'[placeholder*="{task_hint}" i]',
3282 | ), # Case-insensitive placeholder contains hint
3283 | (
3284 | "aria-label",
3285 | f'[aria-label*="{task_hint}" i]',
3286 | ), # Case-insensitive aria-label contains hint
3287 | ("exact_text", f'text="{task_hint}"'), # Exact text match
3288 | (
3289 | "contains_text",
3290 |             f':text("{task_hint}")',
3291 | ), # Case-insensitive text contains hint (use cautiously)
3292 | ]
3293 |
3294 | for name, selector in fallback_strategies:
3295 | elapsed_sec_fb = time.monotonic() - start_time
3296 | remaining_timeout_ms_fb = max(500, timeout - int(elapsed_sec_fb * 1000))
3297 | if remaining_timeout_ms_fb <= 500 and elapsed_sec_fb >= timeout_sec: # Check both
3298 | logger.warning(
3299 | f"{log_prefix}: Timeout reached before trying fallback selector '{name}'."
3300 | )
3301 | break # Stop trying fallbacks if time is up
3302 |
3303 | logger.debug(
3304 | f"{log_prefix}: Trying fallback strategy '{name}' with selector: {selector}"
3305 | )
3306 | try:
3307 | loc = self.page.locator(selector).first
3308 | # Adjust scroll/wait timeout based on remaining time
3309 | scroll_timeout_fb = max(500, remaining_timeout_ms_fb // 3)
3310 | wait_timeout_fb = max(500, remaining_timeout_ms_fb // 2)
3311 |
3312 | await loc.scroll_into_view_if_needed(timeout=scroll_timeout_fb)
3313 | logger.debug(
3314 | f"{log_prefix}: Waiting for fallback '{name}' visibility ({wait_timeout_fb}ms)..."
3315 | )
3316 | await loc.wait_for(state="visible", timeout=wait_timeout_fb)
3317 |
3318 | # Fallback succeeded
3319 | logger.info(f"{log_prefix}: Locator found via fallback strategy '{name}'.")
3320 | await _log(
3321 | "locator_text_fallback",
3322 | selector=selector,
3323 | hint=task_hint,
3324 | role=role,
3325 | strategy=name,
3326 | )
3327 | await _log(
3328 | "locator_success",
3329 | hint=task_hint,
3330 | role=role,
3331 | method="fallback",
3332 | strategy=name,
3333 | selector=selector,
3334 | )
3335 | return loc
3336 | except PlaywrightTimeoutError:
3337 | logger.debug(
3338 | f"{log_prefix}: Fallback strategy '{name}' (selector: {selector}) failed (Timeout)."
3339 | )
3340 | except PlaywrightException as text_fallback_err:
3341 | logger.debug(
3342 | f"{log_prefix}: Fallback strategy '{name}' (selector: {selector}) failed (Playwright Error: {text_fallback_err})."
3343 | )
3344 | except Exception as fallback_unexpected:
3345 | logger.warning(
3346 | f"{log_prefix}: Unexpected error during fallback strategy '{name}': {fallback_unexpected}"
3347 | )
3348 |
3349 | # --- 6. All Methods Failed ---
3350 | final_elapsed_sec = time.monotonic() - start_time
3351 | log_hint = task_hint[:120]
3352 | log_duration = round(final_elapsed_sec, 1)
3353 | await _log("locator_fail_all", hint=log_hint, duration_s=log_duration, role=role)
3354 | logger.error(
3355 | f"{log_prefix}: FAILED to find element within {timeout_sec:.1f}s using all methods."
3356 | )
3357 | raise PlaywrightTimeoutError(
3358 | f"EnhancedLocator failed to find element for hint: '{task_hint}' within {timeout_sec:.1f}s using all methods (cache, heuristic, LLM, fallbacks)."
3359 | )
3360 |
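# A minimal usage sketch for the EnhancedLocator helper, assuming an already
# navigated Playwright page; the hint text below is an invented example.
async def _example_enhanced_locate(page: Page) -> None:
    locator_helper = EnhancedLocator(page)
    # locate() tries the selector cache, then heuristics, then the LLM picker, then
    # plain-text fallbacks, and raises PlaywrightTimeoutError if all of them fail.
    element = await locator_helper.locate(
        "search input at the top of the page", role="textbox", timeout=8000
    )
    await element.fill("playwright locators")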
3361 |
3362 | # --- Smart Actions (Helpers using EnhancedLocator) ---
3363 | async def _detect_web_obstacles(page: Page) -> Dict[str, Any]:
3364 | """Detect common web obstacles that might interfere with automation."""
3365 | obstacles = {
3366 | "captcha_detected": False,
3367 | "cookie_banner": False,
3368 | "cloudflare_challenge": False,
3369 | "login_required": False,
3370 | "details": []
3371 | }
3372 |
3373 | try:
3374 | # Comprehensive CAPTCHA detection
3375 | captcha_js = """() => {
3376 | const indicators = [];
3377 |
3378 | // Text-based detection
3379 | if (document.body.innerText.toLowerCase().includes('captcha') ||
3380 | document.body.innerText.toLowerCase().includes('recaptcha') ||
3381 |                 document.body.innerText.toLowerCase().includes("i'm not a robot")) {
3382 | indicators.push('captcha_text_found');
3383 | }
3384 |
3385 | // Element-based detection
3386 | if (document.querySelector('iframe[title*="captcha" i]') ||
3387 | document.querySelector('iframe[src*="captcha" i]') ||
3388 | document.querySelector('[id*="captcha" i]') ||
3389 | document.querySelector('[class*="captcha" i]') ||
3390 | document.querySelector('div[class*="recaptcha" i]') ||
3391 | document.querySelector('.g-recaptcha') ||
3392 | document.querySelector('#recaptcha')) {
3393 | indicators.push('captcha_element_found');
3394 | }
3395 |
3396 | // Cookie banner detection
3397 | if (document.querySelector('[class*="cookie" i]') ||
3398 | document.querySelector('[id*="cookie" i]') ||
3399 | document.body.innerText.toLowerCase().includes('accept cookies') ||
3400 | document.body.innerText.toLowerCase().includes('cookie policy')) {
3401 | indicators.push('cookie_banner_found');
3402 | }
3403 |
3404 | // Cloudflare detection
3405 | if (document.body.innerText.includes('Cloudflare') &&
3406 | (document.body.innerText.includes('checking') ||
3407 | document.body.innerText.includes('security'))) {
3408 | indicators.push('cloudflare_challenge');
3409 | }
3410 |
3411 | // Login detection
3412 | if (document.querySelector('input[type="password"]') &&
3413 | (document.body.innerText.toLowerCase().includes('sign in') ||
3414 | document.body.innerText.toLowerCase().includes('log in') ||
3415 | document.body.innerText.toLowerCase().includes('login'))) {
3416 | indicators.push('login_required');
3417 | }
3418 |
3419 | return indicators;
3420 | }"""
3421 |
3422 | detected_indicators = await page.evaluate(captcha_js)
3423 |
3424 | # Process results
3425 | for indicator in detected_indicators:
3426 | if 'captcha' in indicator:
3427 | obstacles["captcha_detected"] = True
3428 | obstacles["details"].append(f"CAPTCHA detected: {indicator}")
3429 | elif 'cookie' in indicator:
3430 | obstacles["cookie_banner"] = True
3431 | obstacles["details"].append(f"Cookie banner detected: {indicator}")
3432 | elif 'cloudflare' in indicator:
3433 | obstacles["cloudflare_challenge"] = True
3434 | obstacles["details"].append(f"Cloudflare challenge detected: {indicator}")
3435 | elif 'login' in indicator:
3436 | obstacles["login_required"] = True
3437 | obstacles["details"].append(f"Login requirement detected: {indicator}")
3438 |
3439 | return obstacles
3440 |
3441 | except Exception as e:
3442 | logger.warning(f"Error detecting web obstacles: {e}")
3443 | return obstacles
3444 |
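# The shape of the dictionary returned by _detect_web_obstacles, shown with invented
# values: callers branch on the boolean flags and log the human-readable "details".
_EXAMPLE_OBSTACLES_RESULT: Dict[str, Any] = {
    "captcha_detected": False,
    "cookie_banner": True,
    "cloudflare_challenge": False,
    "login_required": False,
    "details": ["Cookie banner detected: cookie_banner_found"],
}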
3445 |
3446 | @resilient(max_attempts=3, backoff=0.5)
3447 | async def smart_click(
3448 | page: Page, task_hint: str, *, target_kwargs: Optional[Dict] = None, timeout_ms: int = 5000
3449 | ) -> bool: # Uses global _log, _get_pool, _cache_put_sync
3450 | """Locates an element using a hint and clicks it."""
3451 | # Validate or generate task_hint
3452 | effective_task_hint = task_hint
3453 | if not task_hint or not task_hint.strip():
3454 | if target_kwargs:
3455 | name = target_kwargs.get("name", "")
3456 | role = target_kwargs.get("role", "")
3457 | if name or role:
3458 | role_part = role or "element"
3459 | name_part = f" named '{name}'" if name else ""
3460 | effective_task_hint = f"Click the {role_part}{name_part}"
3461 | logger.warning(f"smart_click missing hint, generated: '{effective_task_hint}'")
3462 | else:
3463 | # Neither name nor role provided in target_kwargs
3464 | raise ToolInputError(
3465 | "smart_click requires a non-empty 'task_hint' or a 'target' dictionary with 'name' or 'role'."
3466 | )
3467 | else:
3468 | # No target_kwargs provided either
3469 | raise ToolInputError("smart_click requires a non-empty 'task_hint'.")
3470 |
3471 | # First, detect web obstacles that might interfere with automation
3472 | try:
3473 | obstacles = await _detect_web_obstacles(page)
3474 |
3475 | # Handle CAPTCHA detection - fail early if trying to click CAPTCHA
3476 | if obstacles["captcha_detected"] and ("captcha" in effective_task_hint.lower() or "recaptcha" in effective_task_hint.lower()):
3477 | logger.error(f"Cannot click CAPTCHA element: '{effective_task_hint}'. CAPTCHAs are designed to prevent automation.")
3478 | raise ToolError(
3479 | f"CAPTCHA interaction blocked for task: '{effective_task_hint}'. "
3480 | "Manual intervention required. CAPTCHAs cannot be automatically solved."
3481 | )
3482 |
3483 | # Log any obstacles detected for diagnostic purposes
3484 | if any([obstacles["captcha_detected"], obstacles["cookie_banner"], obstacles["cloudflare_challenge"], obstacles["login_required"]]):
3485 | await _log("smart_click_obstacles_detected", task_hint=effective_task_hint, obstacles=obstacles)
3486 | logger.info(f"Web obstacles detected before click attempt: {obstacles['details']}")
3487 |
3488 | # Try to handle cookie banners automatically
3489 | if obstacles["cookie_banner"]:
3490 | logger.info("Attempting to dismiss cookie banner before main click action...")
3491 | cookie_selectors = [
3492 | 'button:has-text("Accept")', 'button:has-text("Accept All")',
3493 | 'button:has-text("OK")', 'button:has-text("Allow")',
3494 | '[id*="accept" i]', '[class*="accept" i]'
3495 | ]
3496 | for selector in cookie_selectors:
3497 | try:
3498 | cookie_btn = page.locator(selector).first
3499 | await cookie_btn.click(timeout=2000)
3500 | logger.info(f"Successfully dismissed cookie banner using: {selector}")
3501 | await asyncio.sleep(0.5) # Brief pause after dismissal
3502 | break
3503 | except Exception:
3504 | continue
3505 |
3506 | # Give Cloudflare challenges a moment to complete
3507 | if obstacles["cloudflare_challenge"]:
3508 | logger.info("Cloudflare challenge detected, waiting briefly...")
3509 | await asyncio.sleep(3)
3510 |
3511 | except Exception as obstacle_err:
3512 | logger.warning(f"Error during obstacle detection: {obstacle_err}. Proceeding with click attempt.")
3513 |
3514 | loc_helper = EnhancedLocator(page)
3515 | # Prepare log details, prioritizing target_kwargs if available
3516 | log_target = {}
3517 | if target_kwargs:
3518 | log_target.update(target_kwargs)
3519 | else:
3520 | log_target["hint"] = effective_task_hint # Log the hint used
3521 |
3522 | try:
3523 | # Locate the element using the enhanced locator
3524 | element = await loc_helper.locate(task_hint=effective_task_hint, timeout=timeout_ms)
3525 | element_id_for_cache = await element.get_attribute("data-sb-id")
3526 |
3527 | # Prepare and execute the click
3528 | await element.scroll_into_view_if_needed(timeout=3000) # Scroll with timeout
3529 | await _pause(page) # Add jitter before click
3530 | click_timeout = max(1000, timeout_ms // 2) # Use portion of overall timeout
3531 | await element.click(timeout=click_timeout)
3532 |
3533 | # Update cache if successful and ID was retrieved
3534 | if element_id_for_cache:
3535 | fp = await _dom_fingerprint(
3536 | page
3537 | ) # Get current fingerprint after click potentially changed DOM
3538 | # Generate cache key again (could be helper function)
3539 | page_url_after_click = page.url or ""
3540 | parsed_url_after_click = urlparse(page_url_after_click)
3541 | path_after_click = parsed_url_after_click.path or "/"
3542 | key_data_after_click = {
3543 | "site": loc_helper.site,
3544 | "path": path_after_click, # Use path *after* click
3545 |                 "hint": unicodedata.normalize("NFC", effective_task_hint).lower().strip(),
3546 |                 "role": None}  # Mirror EnhancedLocator.locate's cache-key fields so this entry can be re-read
3547 | key_src_after_click = json.dumps(key_data_after_click, sort_keys=True)
3548 | cache_key_after_click = hashlib.sha256(key_src_after_click.encode()).hexdigest()
3549 | selector_str = f'[data-sb-id="{element_id_for_cache}"]'
3550 | loop_after_click = asyncio.get_running_loop()
3551 | pool_after_click = _get_pool()
3552 | await loop_after_click.run_in_executor(
3553 | pool_after_click, _cache_put_sync, cache_key_after_click, selector_str, fp
3554 | )
3555 |
3556 | # Log success
3557 | await _log("click_success", target=log_target)
3558 | return True
3559 |
3560 | except PlaywrightTimeoutError as e:
3561 | # Element not found or visible within timeout
3562 | await _log("click_fail_notfound", target=log_target, error=str(e))
3563 | raise ToolError(
3564 | f"Click failed: Element not found/visible for hint '{effective_task_hint}'. {e}",
3565 | details=log_target,
3566 | ) from e
3567 | except PlaywrightException as e:
3568 | # Other Playwright errors during click/scroll/locate
3569 | await _log("click_fail_playwright", target=log_target, error=str(e))
3570 | raise ToolError(f"Click failed due to Playwright error: {e}", details=log_target) from e
3571 | except Exception as e:
3572 | # Unexpected errors
3573 | await _log("click_fail_unexpected", target=log_target, error=str(e))
3574 | raise ToolError(f"Unexpected error during click: {e}", details=log_target) from e
3575 |
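# A minimal smart_click sketch on an already open page. The hint is an invented
# example; a ToolError is raised if no matching element becomes clickable in time.
async def _example_smart_click(page: Page) -> None:
    await smart_click(page, "the 'Add to cart' button", timeout_ms=8000)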
3576 |
3577 | @resilient(max_attempts=3, backoff=0.5)
3578 | async def smart_type(
3579 | page: Page,
3580 | task_hint: str,
3581 | text: str,
3582 | *,
3583 | press_enter: bool = False,
3584 | clear_before: bool = True,
3585 | target_kwargs: Optional[Dict] = None,
3586 | timeout_ms: int = 5000,
3587 | ) -> bool: # Uses global _log, get_secret, _get_pool, _cache_put_sync
3588 | """Locates an element using a hint and types text into it."""
3589 | # Validate or generate task_hint
3590 | effective_task_hint = task_hint
3591 | if not task_hint or not task_hint.strip():
3592 | if target_kwargs:
3593 | name = target_kwargs.get("name", "")
3594 | role = target_kwargs.get("role", "input") # Default role to input for type
3595 | if name or role:
3596 | role_part = role or "element"
3597 | name_part = f" named '{name}'" if name else ""
3598 | effective_task_hint = f"Type into the {role_part}{name_part}"
3599 | logger.warning(f"smart_type missing hint, generated: '{effective_task_hint}'")
3600 | else:
3601 | raise ToolInputError(
3602 | "smart_type requires a non-empty 'task_hint' or a 'target' dictionary with 'name' or 'role'."
3603 | )
3604 | else:
3605 | raise ToolInputError("smart_type requires a non-empty 'task_hint'.")
3606 |
3607 | loc_helper = EnhancedLocator(page)
3608 | # Prepare log details
3609 | log_target = {}
3610 | if target_kwargs:
3611 | log_target.update(target_kwargs)
3612 | else:
3613 | log_target["hint"] = effective_task_hint
3614 |
3615 | resolved_text = text
3616 | log_value = "***SECRET***" # Default log value for secrets
3617 | # Resolve secrets if needed
3618 | if text.startswith("secret:"):
3619 | secret_path = text[len("secret:") :]
3620 | try:
3621 | resolved_text = get_secret(secret_path)
3622 | # Keep log_value as "***SECRET***"
3623 | except (KeyError, ValueError, RuntimeError) as e:
3624 | await _log("type_fail_secret", target=log_target, secret_ref=secret_path, error=str(e))
3625 | raise ToolInputError(f"Failed to resolve secret '{secret_path}': {e}") from e
3626 | else:
3627 | # Create safe log value for non-secrets (truncate if long)
3628 | if len(text) > 23:
3629 | log_value = text[:20] + "..."
3630 | else:
3631 | log_value = text
3632 |
3633 | try:
3634 | # Locate the element
3635 | element = await loc_helper.locate(task_hint=effective_task_hint, timeout=timeout_ms)
3636 | element_id_for_cache = await element.get_attribute("data-sb-id")
3637 |
3638 | # Prepare and perform the typing action
3639 | await element.scroll_into_view_if_needed(timeout=3000)
3640 | await _pause(page) # Jitter before interaction
3641 |
3642 | if clear_before:
3643 | await element.fill("") # Clear the field first
3644 |
3645 | # Type the resolved text with human-like delay
3646 | type_delay = random.uniform(30, 80)
3647 | await element.type(resolved_text, delay=type_delay)
3648 |
3649 | # Optionally press Enter
3650 | if press_enter:
3651 | await _pause(page, (50, 150)) # Short pause before Enter
3652 | try:
3653 | # Try pressing Enter directly
3654 | await element.press(
3655 |                         "Enter", timeout=1000, no_wait_after=True
3656 | ) # Don't wait for navigation here
3657 | except PlaywrightException as e:
3658 | # Fallback: If Enter press fails (e.g., on non-input), try clicking the element again
3659 | # This might trigger submission if it's also a button or linked element.
3660 | logger.warning(
3661 | f"Enter key press failed for hint '{effective_task_hint}', trying smart_click fallback: {e}"
3662 | )
3663 | try:
3664 | await smart_click(
3665 | page, task_hint=effective_task_hint, target_kwargs=target_kwargs
3666 | )
3667 | except Exception as click_e:
3668 | logger.warning(
3669 | f"Fallback smart_click after failed Enter press also failed: {click_e}"
3670 | )
3671 | # Decide if this should re-raise or just log. Logging for now.
3672 |
3673 | # Update cache if successful
3674 | if element_id_for_cache:
3675 | fp = await _dom_fingerprint(page)
3676 | page_url_after_type = page.url or ""
3677 | parsed_url_after_type = urlparse(page_url_after_type)
3678 | path_after_type = parsed_url_after_type.path or "/"
3679 | key_data_after_type = {
3680 | "site": loc_helper.site,
3681 | "path": path_after_type,
3682 |                 "hint": unicodedata.normalize("NFC", effective_task_hint).lower().strip(),
3683 |                 "role": None}  # Mirror EnhancedLocator.locate's cache-key fields so this entry can be re-read
3684 | key_src_after_type = json.dumps(key_data_after_type, sort_keys=True)
3685 | cache_key_after_type = hashlib.sha256(key_src_after_type.encode()).hexdigest()
3686 | selector_str = f'[data-sb-id="{element_id_for_cache}"]'
3687 | loop_after_type = asyncio.get_running_loop()
3688 | pool_after_type = _get_pool()
3689 | await loop_after_type.run_in_executor(
3690 | pool_after_type, _cache_put_sync, cache_key_after_type, selector_str, fp
3691 | )
3692 |
3693 | # Log success
3694 | await _log("type_success", target=log_target, value=log_value, entered=press_enter)
3695 | return True
3696 |
3697 | except PlaywrightTimeoutError as e:
3698 | # Element not found or visible
3699 | await _log("type_fail_notfound", target=log_target, value=log_value, error=str(e))
3700 | raise ToolError(
3701 | f"Type failed: Element not found/visible for hint '{effective_task_hint}'. {e}",
3702 | details=log_target,
3703 | ) from e
3704 | except PlaywrightException as e:
3705 | # Other Playwright errors
3706 | await _log("type_fail_playwright", target=log_target, value=log_value, error=str(e))
3707 | raise ToolError(f"Type failed due to Playwright error: {e}", details=log_target) from e
3708 | except Exception as e:
3709 | # Unexpected errors
3710 | await _log("type_fail_unexpected", target=log_target, value=log_value, error=str(e))
3711 | raise ToolError(f"Unexpected error during type: {e}", details=log_target) from e
3712 |
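# A minimal smart_type sketch: one literal value and one secret-resolved value. The
# "secret:" prefix hands the rest of the string to get_secret(); the field hints and
# the secret path shown here are invented examples.
async def _example_smart_type(page: Page) -> None:
    await smart_type(page, "username field", "alice@example.com")
    await smart_type(page, "password field", "secret:example_site/password", press_enter=True)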
3713 |
3714 | # --- LATE IMPORT TO BREAK CYCLE ---
3715 | # Import the decorators here, just before they are needed for the tool functions.
3716 | # This assumes the rest of the module has been initialized by the time Python reaches here.
3717 | try:
3718 | from ultimate_mcp_server.tools.base import with_error_handling, with_tool_metrics
3719 | except ImportError as e:
3720 | # This indicates the cycle might still exist or base failed to load for other reasons
3721 | logger.critical(f"CRITICAL: Failed to late-import base decorators needed for Smart Browser tools: {e}")
3722 | raise
3723 |
3724 |
3725 | @with_tool_metrics
3726 | @with_error_handling
3727 | async def browse(
3728 | url: str, wait_for_selector: Optional[str] = None, wait_for_navigation: bool = True
3729 | ) -> Dict[str, Any]:
3730 | """
3731 | Navigates to a URL using a dedicated browser tab, waits for load state
3732 | (and optionally a selector), then extracts and returns the page state.
3733 |
3734 | Args:
3735 | url: The URL to navigate to (scheme will be added if missing).
3736 | wait_for_selector: Optional CSS selector to wait for after navigation.
3737 | wait_for_navigation: Whether to wait for 'networkidle' (True) or
3738 | 'domcontentloaded' (False).
3739 |
3740 | Returns:
3741 | A dictionary containing success status and the final page state.
3742 | """
3743 | await _ensure_initialized()
3744 | _update_activity()
3745 |
3746 | # --- Input Validation ---
3747 | if not isinstance(url, str) or not url.strip():
3748 | raise ToolInputError("URL cannot be empty.")
3749 | # Add scheme if missing
3750 | if not url.startswith(("http://", "https://")):
3751 | url = "https://" + url
3752 | logger.debug(f"Prepended 'https://' to URL: {url}")
3753 |
3754 | # --- Proxy Check ---
3755 | proxy_cfg = _get_proxy_config()
3756 | if proxy_cfg and _PROXY_ALLOWED_DOMAINS_LIST is not None:
3757 | if not _is_domain_allowed_for_proxy(url):
3758 | proxy_server = proxy_cfg.get("server", "Configured Proxy")
3759 | error_msg = f"Navigation blocked by proxy domain rules for '{url}' via {proxy_server}."
3760 | await _log("browse_fail_proxy_disallowed", url=url, proxy=proxy_server)
3761 | raise ToolError(error_msg, error_code="proxy_domain_disallowed")
3762 |
3763 | # --- Execution ---
3764 | ctx, _ = await get_browser_context() # Get shared context
3765 | async with _tab_context(ctx) as page: # Use temp page from shared context
3766 | await _log("navigate_start", url=url)
3767 | try:
3768 | # Determine wait state based on argument
3769 | wait_until_state = "networkidle" if wait_for_navigation else "domcontentloaded"
3770 | nav_timeout = 60000 # 60 seconds
3771 | await page.goto(url, wait_until=wait_until_state, timeout=nav_timeout)
3772 |
3773 | # Optionally wait for a specific selector
3774 | if wait_for_selector:
3775 | selector_timeout = 15000 # 15 seconds
3776 | try:
3777 | await page.wait_for_selector(
3778 | wait_for_selector, state="visible", timeout=selector_timeout
3779 | )
3780 | await _log("navigate_wait_selector_ok", url=url, selector=wait_for_selector)
3781 | except PlaywrightTimeoutError:
3782 | # Log timeout but proceed, might still be usable
3783 | logger.warning(
3784 | f"Timeout waiting for selector '{wait_for_selector}' at {url} after navigation."
3785 | )
3786 | await _log(
3787 | "navigate_wait_selector_timeout", url=url, selector=wait_for_selector
3788 | )
3789 |
3790 | # Pause and get final state
3791 | await _pause(page, (50, 200))
3792 | state = await get_page_state(page) # Use helper to get structured state
3793 | await _log("navigate_success", url=url, title=state.get("title"))
3794 |
3795 | # Return success and page state
3796 | return {"success": True, "page_state": state}
3797 |
3798 | except PlaywrightException as e:
3799 | # Handle Playwright-specific navigation errors
3800 | await _log("navigate_fail_playwright", url=url, error=str(e))
3801 | # Decorator will wrap this in ToolError
3802 | raise ToolError(f"Navigation failed for {url}: {e}") from e
3803 | except Exception as e:
3804 | # Handle unexpected errors during navigation/state extraction
3805 | await _log("navigate_fail_unexpected", url=url, error=str(e))
3806 | # Decorator will wrap this in ToolError
3807 | raise ToolError(f"Unexpected error browsing {url}: {e}") from e
3808 |
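# A minimal sketch of calling the browse tool directly. The URL and selector are
# invented examples; on success the result is {"success": True, "page_state": {...}}.
async def _example_browse_tool() -> Dict[str, Any]:
    return await browse("example.com/docs", wait_for_selector="main")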
3809 |
3810 | @with_tool_metrics
3811 | @with_error_handling
3812 | async def click(
3813 | url: str,
3814 | target: Optional[Dict[str, Any]] = None,
3815 | task_hint: Optional[str] = None,
3816 | wait_ms: int = 1000,
3817 | ) -> Dict[str, Any]:
3818 | """
3819 | Navigates to a URL, clicks an element identified by task_hint or target,
3820 | waits, and returns the resulting page state.
3821 |
3822 | Args:
3823 | url: The URL to navigate to first.
3824 | target: Optional dictionary (like Plan-Step target) used to generate hint if task_hint missing.
3825 | task_hint: Natural language description of the element to click.
3826 | wait_ms: Milliseconds to wait after the click action completes.
3827 |
3828 | Returns:
3829 | A dictionary containing success status and the final page state after the click.
3830 | """
3831 | await _ensure_initialized()
3832 | _update_activity()
3833 |
3834 | # --- Input Validation: Determine task_hint ---
3835 | effective_task_hint = task_hint
3836 | if not effective_task_hint:
3837 | if target and (target.get("name") or target.get("role")):
3838 | name = target.get("name", "")
3839 | role = target.get("role", "")
3840 | role_part = role or "element"
3841 | name_part = f" named '{name}'" if name else ""
3842 | effective_task_hint = f"Click the {role_part}{name_part}"
3843 | logger.debug(f"click tool generated task_hint: '{effective_task_hint}'")
3844 | else:
3845 | raise ToolInputError(
3846 | "click tool requires 'task_hint', or 'target' dict with 'name' or 'role'."
3847 | )
3848 |
3849 | # --- Execution ---
3850 | ctx, _ = await get_browser_context()
3851 | async with _tab_context(ctx) as page:
3852 | await _log("click_extract_navigate", url=url, hint=effective_task_hint)
3853 | # Navigate to the page
3854 | try:
3855 | nav_timeout = 60000
3856 | await page.goto(url, wait_until="networkidle", timeout=nav_timeout)
3857 | except PlaywrightException as e:
3858 | raise ToolError(f"Navigation to '{url}' failed before click attempt: {e}") from e
3859 |
3860 | # Perform the click using the smart helper
3861 | # smart_click handles EnhancedLocator, interaction, logging, and errors
3862 | await smart_click(
3863 | page,
3864 | task_hint=effective_task_hint,
3865 | target_kwargs=target, # Pass target for logging inside smart_click
3866 | timeout_ms=10000, # Timeout for locating the element
3867 | )
3868 |
3869 | # Wait after click if specified
3870 | if wait_ms > 0:
3871 | await page.wait_for_timeout(wait_ms)
3872 |
3873 | # Wait for network to potentially settle after click (best effort)
3874 | try:
3875 | idle_timeout = 10000
3876 | await page.wait_for_load_state("networkidle", timeout=idle_timeout)
3877 | except PlaywrightTimeoutError:
3878 | logger.debug("Network idle wait timeout after click action.")
3879 |
3880 | # Pause and get final state
3881 | await _pause(page, (50, 200))
3882 | final_state = await get_page_state(page)
3883 | await _log("click_extract_success", url=page.url, hint=effective_task_hint)
3884 |
3885 | # Return success and the state after the click
3886 | return {"success": True, "page_state": final_state}
3887 |
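# A minimal sketch of the click tool: identify the target either with a natural
# language task_hint (as here) or with a target dict carrying "name"/"role". The URL
# and hint are invented examples.
async def _example_click_tool() -> Dict[str, Any]:
    return await click("https://example.com/login", task_hint="the 'Sign in' button", wait_ms=500)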
3888 |
3889 | @with_tool_metrics
3890 | @with_error_handling
3891 | async def type_text(
3892 | url: str,
3893 | fields: List[Dict[str, Any]],
3894 | submit_hint: Optional[str] = None,
3895 | submit_target: Optional[Dict[str, Any]] = None,
3896 | wait_after_submit_ms: int = 2000,
3897 | ) -> Dict[str, Any]:
3898 | """
3899 | Navigates to a URL, fills specified form fields using task hints,
3900 | optionally clicks a submit element, waits, and returns the final page state.
3901 |
3902 | Args:
3903 | url: The URL containing the form.
3904 | fields: A list of dictionaries, each specifying a field to type into.
3905 | Required keys per dict: 'task_hint' (or 'target') and 'text'.
3906 | Optional keys: 'enter' (bool), 'clear_before' (bool).
3907 | submit_hint: Optional natural language description of the submit element.
3908 | submit_target: Optional target dictionary for the submit element.
3909 | wait_after_submit_ms: Milliseconds to wait after submission.
3910 |
3911 | Returns:
3912 | A dictionary containing success status and the final page state.
3913 | """
3914 | await _ensure_initialized()
3915 | _update_activity()
3916 |
3917 | # --- Input Validation ---
3918 | if not fields or not isinstance(fields, list):
3919 | raise ToolInputError("'fields' must be a non-empty list of dictionaries.")
3920 | if submit_hint and submit_target:
3921 | logger.warning("Both submit_hint and submit_target provided; submit_hint will be used.")
3922 | elif not submit_hint and not submit_target:
3923 | logger.debug("No submit_hint or submit_target provided; form will not be submitted.")
3924 |
3925 | # --- Execution ---
3926 | ctx, _ = await get_browser_context()
3927 | async with _tab_context(ctx) as page:
3928 | await _log("fill_form_navigate", url=url)
3929 | # Navigate to the form page
3930 | try:
3931 | nav_timeout = 60000
3932 | await page.goto(url, wait_until="networkidle", timeout=nav_timeout)
3933 | except PlaywrightException as e:
3934 | raise ToolError(f"Navigation to '{url}' failed before filling form: {e}") from e
3935 |
3936 | # Wait briefly for form elements to likely appear (best effort)
3937 | try:
3938 | form_wait_timeout = 5000
3939 | await page.wait_for_selector(
3940 | "form, input, textarea, select", state="visible", timeout=form_wait_timeout
3941 | )
3942 | logger.debug("Form elements found, proceeding with field filling.")
3943 | except PlaywrightTimeoutError:
3944 | logger.warning("Did not quickly find typical form elements. Proceeding anyway.")
3945 |
3946 | # Loop through fields and type text
3947 | for i, field in enumerate(fields):
3948 | if not isinstance(field, dict):
3949 | raise ToolInputError(f"Item at index {i} in 'fields' is not a dictionary.")
3950 |
3951 | # Determine hint for the field
3952 | field_hint = field.get("task_hint")
3953 | field_target = field.get("target")
3954 | if not field_hint:
3955 | if field_target and (field_target.get("name") or field_target.get("role")):
3956 | name = field_target.get("name", "")
3957 | role = field_target.get("role", "input")
3958 | field_hint = (
3959 | f"{role or 'Input field'} '{name}'" if name else f"{role or 'Input field'}"
3960 | )
3961 | else:
3962 | raise ToolInputError(
3963 | f"Field at index {i} requires 'task_hint' or 'target' with name/role."
3964 | )
3965 |
3966 | # Get text to type
3967 | text_to_type = field.get("text")
3968 | if text_to_type is None: # Allow empty string, but not None
3969 | raise ToolInputError(
3970 | f"Field at index {i} ('{field_hint}') missing required 'text'."
3971 | )
3972 |
3973 | # Log the action for this field
3974 | await _log("fill_form_field", index=i, hint=field_hint)
3975 |
3976 | # Use smart_type helper for the actual typing
3977 | await smart_type(
3978 | page,
3979 | task_hint=field_hint,
3980 | text=text_to_type,
3981 | press_enter=field.get("enter", False),
3982 | clear_before=field.get("clear_before", True),
3983 | target_kwargs=field_target, # Pass target for logging inside smart_type
3984 | timeout_ms=5000,
3985 | )
3986 | await _pause(page, (50, 150)) # Short pause between fields
3987 |
3988 | # Handle optional submission
3989 | final_submit_hint = submit_hint
3990 | if not final_submit_hint and submit_target: # Generate hint from target if needed
3991 | if submit_target.get("name") or submit_target.get("role"):
3992 | name = submit_target.get("name", "")
3993 | role = submit_target.get("role", "button")
3994 | final_submit_hint = f"Submit {role or 'button'}" + (f" '{name}'" if name else "")
3995 | else:
3996 | logger.warning(
3997 | "submit_target provided but lacks 'name' or 'role'; cannot generate hint. Skipping submit."
3998 | )
3999 | final_submit_hint = None # Ensure submit doesn't happen
4000 |
4001 | if final_submit_hint:
4002 | await _log("fill_form_submit", hint=final_submit_hint)
4003 | # Use smart_click helper for submission
4004 | await smart_click(
4005 | page,
4006 | task_hint=final_submit_hint,
4007 | target_kwargs=submit_target,
4008 | timeout_ms=10000,
4009 | )
4010 | # Wait after submission
4011 | try:
4012 | submit_idle_timeout = 15000
4013 | await page.wait_for_load_state("networkidle", timeout=submit_idle_timeout)
4014 | except PlaywrightTimeoutError:
4015 | logger.debug("Network idle wait timeout after form submission.")
4016 | if wait_after_submit_ms > 0:
4017 | await page.wait_for_timeout(wait_after_submit_ms)
4018 |
4019 | # Get final page state
4020 | await _pause(page, (100, 300))
4021 | final_state = await get_page_state(page)
4022 | await _log(
4023 | "fill_form_success",
4024 | url=page.url,
4025 | num_fields=len(fields),
4026 | submitted=bool(final_submit_hint),
4027 | )
4028 |
4029 | return {"success": True, "page_state": final_state}
4030 |
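# Usage sketch (illustrative only): the shape of the `fields` payload consumed by the loop
# above. Key names (`task_hint`, `target`, `text`, `enter`, `clear_before`) mirror the lookups
# performed in the code; the concrete values, and the exact name/signature of the enclosing
# tool (shown here as `fill_form`), are assumptions for illustration.
#
#   fields = [
#       {"task_hint": "Username input", "text": "jane.doe@example.com"},
#       {"target": {"name": "password", "role": "textbox"}, "text": "s3cret!", "clear_before": True},
#       {"task_hint": "Search box", "text": "annual report", "enter": True},
#   ]
#   result = await fill_form(
#       url="https://example.com/login", fields=fields, submit_hint="Sign in button"
#   )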
4031 |
4032 | @with_tool_metrics
4033 | @with_error_handling
4034 | async def parallel(
4035 | urls: List[str], action: str = "get_state", max_tabs: Optional[int] = None
4036 | ) -> Dict[str, Any]:
4037 | """
4038 | Processes multiple URLs in parallel using isolated browser tabs via TabPool.
4039 | Currently only supports the 'get_state' action for each URL.
4040 |
4041 | Args:
4042 | urls: A list of URLs to process.
4043 | action: The action to perform on each URL (currently only 'get_state').
4044 | max_tabs: Optional override for the maximum number of concurrent tabs.
4045 | If None, uses the globally configured limit.
4046 |
4047 | Returns:
4048 | A dictionary containing success status, a list of results for each URL,
4049 | and counts of processed and successful URLs.
4050 | """
4051 | await _ensure_initialized()
4052 | _update_activity()
4053 |
4054 | # --- Input Validation ---
4055 | if not urls or not isinstance(urls, list):
4056 | raise ToolInputError("'urls' must be a non-empty list.")
4057 | if not all(isinstance(u, str) and u.strip() for u in urls):
4058 | raise ToolInputError("All items in 'urls' list must be non-empty strings.")
4059 | if action != "get_state":
4060 | raise ToolInputError(
4061 | f"Unsupported action '{action}'. Currently only 'get_state' is allowed."
4062 | )
4063 |     # Reject overrides that are not positive integers; None means "use the global limit".
4064 |     if max_tabs is not None and (
4065 |         not isinstance(max_tabs, int)
4066 |         or max_tabs <= 0
4067 |     ):
4068 |         raise ToolInputError("'max_tabs' override must be a positive integer if provided.")
4069 |
4070 | # --- Setup Tab Pool ---
4071 | # Use global pool unless max_tabs override is provided
4072 | pool_to_use = tab_pool
4073 | if max_tabs is not None:
4074 | logger.info(f"Using temporary TabPool with max_tabs override: {max_tabs}")
4075 | pool_to_use = TabPool(max_tabs=max_tabs)
4076 |
4077 | # --- Define Per-URL Processing Function ---
4078 | # This function runs inside the TabPool's managed page context
4079 | async def process_url_action(page: Page, *, url_to_process: str) -> Dict[str, Any]:
4080 | # Ensure URL has scheme
4081 | full_url = (
4082 | url_to_process
4083 | if url_to_process.startswith(("http://", "https://"))
4084 | else f"https://{url_to_process}"
4085 | )
4086 | result = {"url": url_to_process, "success": False} # Default result structure
4087 |
4088 | try:
4089 | await _log("parallel_navigate", url=full_url, action=action)
4090 | # Navigate to the URL
4091 | nav_timeout = 45000 # Shorter timeout for parallel tasks
4092 | await page.goto(full_url, wait_until="networkidle", timeout=nav_timeout)
4093 |
4094 | # Perform the specified action
4095 | if action == "get_state":
4096 | page_state = await get_page_state(page)
4097 | result["success"] = True
4098 | result["page_state"] = page_state
4099 | # Add other actions here if needed in the future
4100 | # elif action == "some_other_action":
4101 | # # ... perform other action ...
4102 | # result["success"] = True
4103 | # result["details"] = ...
4104 |
4105 | return result
4106 |
4107 | except PlaywrightException as e:
4108 | error_msg = f"Playwright error processing {full_url}: {e}"
4109 | logger.warning(error_msg)
4110 | await _log("parallel_url_error", url=full_url, action=action, error=str(e))
4111 | result["error"] = error_msg
4112 | return result
4113 | except Exception as e:
4114 | error_msg = f"Unexpected error processing {full_url}: {e}"
4115 | logger.error(error_msg, exc_info=True) # Log traceback for unexpected
4116 | await _log("parallel_url_error", url=full_url, action=action, error=str(e))
4117 | result["error"] = error_msg
4118 | return result
4119 |
4120 | # --- Create Tasks for TabPool ---
4121 | # Use functools.partial to pass the specific URL to each task instance
4122 | tasks_to_run = []
4123 | for u in urls:
4124 | # Create a partial function that captures the url_to_process kwarg
4125 | task_func = functools.partial(process_url_action, url_to_process=u)
4126 | tasks_to_run.append(task_func)
4127 |
4128 | # --- Run Tasks Concurrently using TabPool ---
4129 | logger.info(f"Starting parallel processing of {len(urls)} URLs with action '{action}'...")
4130 | # pool.map handles concurrency, semaphore, context/page creation/cleanup
4131 | results = await pool_to_use.map(tasks_to_run)
4132 | logger.info("Parallel processing complete.")
4133 |
4134 | # --- Process Results ---
4135 | successful_count = sum(1 for r in results if isinstance(r, dict) and r.get("success"))
4136 | processed_count = len(results)
4137 | await _log(
4138 | "parallel_process_complete",
4139 | total=len(urls),
4140 | processed=processed_count,
4141 | successful=successful_count,
4142 | action=action,
4143 | )
4144 |
4145 | # --- Return Final Summary ---
4146 | return {
4147 | "success": True, # Indicates the overall parallel orchestration completed
4148 | "results": results, # List containing result dict for each URL
4149 | "processed_count": processed_count,
4150 | "successful_count": successful_count,
4151 | }
4152 |
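# Usage sketch (illustrative only): a minimal call to `parallel` from an async context,
# assuming the browser has been initialized and the placeholder URLs are reachable.
#
#   summary = await parallel(
#       urls=["https://example.com", "https://example.org/docs"],
#       action="get_state",
#       max_tabs=4,
#   )
#   for item in summary["results"]:
#       status = "ok" if item.get("success") else item.get("error", "failed")
#       print(item["url"], "->", status)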
4153 |
4154 | # --- Download Helpers ---
4155 | async def _run_in_thread(func, *args): # Keep as is
4156 | """Runs a synchronous function in the thread pool."""
4157 | loop = asyncio.get_running_loop()
4158 | pool = _get_pool()
4159 | try:
4160 | result = await loop.run_in_executor(pool, func, *args)
4161 | return result
4162 | except RuntimeError as e:
4163 | if "cannot schedule new futures after shutdown" in str(e):
4164 | logger.warning("Thread pool is shutdown. Creating a temporary pool for operation.")
4165 | with concurrent.futures.ThreadPoolExecutor(max_workers=1) as temp_pool:
4166 | result = await loop.run_in_executor(temp_pool, func, *args)
4167 | return result
4168 | else:
4169 | raise
4170 |
4171 |
4172 | async def _compute_hash_async(data: bytes) -> str: # Keep as is
4173 | """Computes SHA256 hash of bytes data asynchronously in a thread."""
4174 |
4175 | # Define the synchronous hashing function locally
4176 | def sync_hash(d):
4177 | hasher = hashlib.sha256()
4178 | hasher.update(d)
4179 | return hasher.hexdigest()
4180 |
4181 | # Run the sync function in the thread pool
4182 | hex_digest = await _run_in_thread(sync_hash, data)
4183 | return hex_digest
4184 |
4185 |
4186 | async def _read_file_async(path: Path) -> bytes: # Keep as is
4187 | """Reads file content asynchronously using aiofiles."""
4188 | async with aiofiles.open(path, mode="rb") as f:
4189 | content = await f.read()
4190 | return content
4191 |
4192 |
4193 | async def _write_file_async(path: Path, data: bytes): # Keep as is
4194 | """Writes bytes data to a file asynchronously using aiofiles."""
4195 | async with aiofiles.open(path, mode="wb") as f:
4196 | await f.write(data)
4197 |
4198 |
4199 | def _extract_tables_sync(path: Path) -> List[Dict]: # Keep as is
4200 | """Synchronously extracts tables from PDF, Excel, or CSV files."""
4201 | ext = path.suffix.lower()
4202 | results: List[Dict] = []
4203 | try:
4204 | if ext == ".pdf":
4205 | try:
4206 | import tabula # Optional dependency
4207 |
4208 | # Read all tables from all pages, keep data as strings
4209 | dfs = tabula.read_pdf(
4210 | str(path),
4211 | pages="all",
4212 | multiple_tables=True,
4213 | pandas_options={"dtype": str},
4214 | silent=True,
4215 | )
4216 | if dfs: # If tables were found
4217 | table_list = []
4218 | for i, df in enumerate(dfs):
4219 | # Convert DataFrame to list of dicts (rows)
4220 | rows_data = df.to_dict(orient="records")
4221 | table_entry = {"type": "pdf_table", "page": i + 1, "rows": rows_data}
4222 | table_list.append(table_entry)
4223 | results = table_list
4224 | except ImportError:
4225 | logger.debug("tabula-py library not installed. Skipping PDF table extraction.")
4226 | except Exception as pdf_err:
4227 | # Catch errors during Tabula processing
4228 | logger.warning(f"Tabula PDF table extraction failed for {path.name}: {pdf_err}")
4229 |
4230 | elif ext in (".xls", ".xlsx"):
4231 | try:
4232 | import pandas as pd # Optional dependency
4233 |
4234 | # Read all sheets, keep data as strings
4235 | xl_dict = pd.read_excel(str(path), sheet_name=None, dtype=str)
4236 | sheet_list = []
4237 | for sheet_name, df in xl_dict.items():
4238 | rows_data = df.to_dict(orient="records")
4239 | sheet_entry = {
4240 | "type": "excel_sheet",
4241 | "sheet_name": sheet_name,
4242 | "rows": rows_data,
4243 | }
4244 | sheet_list.append(sheet_entry)
4245 | results = sheet_list
4246 | except ImportError:
4247 | logger.debug(
4248 | "pandas/openpyxl/xlrd library not installed. Skipping Excel table extraction."
4249 | )
4250 | except Exception as excel_err:
4251 | logger.warning(f"Pandas Excel table extraction failed for {path.name}: {excel_err}")
4252 |
4253 | elif ext == ".csv":
4254 | try:
4255 | import pandas as pd # Optional dependency
4256 |
4257 | # Read CSV, keep data as strings
4258 | df = pd.read_csv(str(path), dtype=str)
4259 | rows_data = df.to_dict(orient="records")
4260 | # Create a list containing the single table representation
4261 | results = [{"type": "csv_table", "rows": rows_data}]
4262 | except ImportError:
4263 | logger.debug("pandas library not installed. Skipping CSV table extraction.")
4264 | except Exception as csv_err:
4265 | logger.warning(f"Pandas CSV table extraction failed for {path.name}: {csv_err}")
4266 |
4267 | except Exception as outer_err:
4268 | # Catch errors during import or setup
4269 | logger.error(f"Error during table extraction setup for {path.name}: {outer_err}")
4270 |
4271 | return results
4272 |
4273 |
4274 | async def _extract_tables_async(path: Path) -> list: # Uses global _log
4275 | """Asynchronously extracts tables by running sync helper in thread pool."""
4276 | try:
4277 | # Run the synchronous extraction function in the thread pool
4278 | tables = await asyncio.to_thread(_extract_tables_sync, path)
4279 | if tables:
4280 | num_tables = len(tables)
4281 | await _log("table_extract_success", file=str(path), num_tables=num_tables)
4282 | # Return the list of tables (or empty list if none found/error)
4283 | return tables
4284 | except Exception as e:
4285 | # Log error during async execution/threading
4286 | await _log("table_extract_error", file=str(path), error=str(e))
4287 | return [] # Return empty list on error
4288 |
4289 |
4290 | @resilient() # Keep the retry decorator if desired
4291 | async def smart_download(
4292 | page: Page,
4293 | task_hint: str,
4294 | dest_dir: Optional[Union[str, Path]] = None,
4295 | target_kwargs: Optional[Dict] = None,
4296 | ) -> Dict[str, Any]:
4297 |     """
4298 |     Initiates a download by clicking the element matching `task_hint`, saves it via Playwright,
4299 |     then reads the file back for analysis (SHA-256 hash, table extraction), managing paths via the FileSystem tools.
4300 |     """
4301 | final_dl_dir_path_str = "Unknown" # For logging context, default value
4302 | out_path: Optional[Path] = None # Define earlier for clarity, default None
4303 |
4304 | # --- Determine and Prepare Download Directory using FileSystemTool ---
4305 | try:
4306 | # Determine the target directory path string
4307 | if dest_dir:
4308 | download_dir_path_str = str(dest_dir)
4309 | else:
4310 | # Default: Use a path relative to the allowed 'storage' base directory
4311 | default_dl_subdir = "smart_browser_downloads"
4312 | download_dir_path_str = f"storage/{default_dl_subdir}"
4313 |
4314 | logger.info(
4315 | f"Ensuring download directory exists: '{download_dir_path_str}' using filesystem tool."
4316 | )
4317 | # Use STANDALONE create_directory tool
4318 | create_dir_result = await create_directory(path=download_dir_path_str)
4319 |
4320 | # Validate the result from the filesystem tool
4321 | if not isinstance(create_dir_result, dict) or not create_dir_result.get("success"):
4322 | error_detail = "Invalid response"
4323 | if isinstance(create_dir_result, dict):
4324 | error_detail = create_dir_result.get("error", "Unknown")
4325 | raise ToolError(
4326 | f"Failed to prepare download directory '{download_dir_path_str}'. Filesystem tool error: {error_detail}"
4327 | )
4328 |
4329 | # Use the actual absolute path returned by the tool
4330 | final_dl_dir_path_str = create_dir_result.get(
4331 | "path", download_dir_path_str
4332 | ) # Use path from result, fallback to input
4333 | final_dl_dir_path = Path(final_dl_dir_path_str) # Convert to Path object for local use
4334 | logger.info(f"Download directory confirmed/created at: {final_dl_dir_path}")
4335 |
4336 | except ToolError as e:
4337 | logger.error(
4338 | f"ToolError preparing download directory '{download_dir_path_str}': {e}", exc_info=True
4339 | )
4340 | raise # Re-raise ToolError
4341 | except Exception as e:
4342 | # Catch any other unexpected errors during directory prep
4343 | logger.error(
4344 | f"Unexpected error preparing download directory '{download_dir_path_str}': {e}",
4345 | exc_info=True,
4346 | )
4347 | raise ToolError(
4348 | f"An unexpected error occurred preparing download directory: {str(e)}"
4349 | ) from e
4350 | # --- End Directory Preparation ---
4351 |
4352 | # Prepare log details
4353 | log_target = {}
4354 | if target_kwargs:
4355 | log_target.update(target_kwargs)
4356 | else:
4357 | log_target["hint"] = task_hint
4358 |
4359 | try:
4360 | # --- Initiate Download ---
4361 | # Wait for the download event to occur after the click
4362 | download_timeout_ms = 60000 # 60 seconds for download to start
4363 | async with page.expect_download(timeout=download_timeout_ms) as dl_info:
4364 | # Use the smart_click helper function to trigger the download
4365 | click_timeout_ms = 10000 # 10 seconds for the click itself
4366 | await smart_click(
4367 | page, task_hint=task_hint, target_kwargs=target_kwargs, timeout_ms=click_timeout_ms
4368 | )
4369 | logger.debug(
4370 | f"Click initiated for download hint: '{task_hint}'. Waiting for download start..."
4371 | )
4372 |
4373 | # Get the Download object
4374 | dl = await dl_info.value
4375 | logger.info(
4376 | f"Download started. Suggested filename: '{dl.suggested_filename}', URL: {dl.url}"
4377 | )
4378 |
4379 | # Sanitize filename provided by browser
4380 | suggested_fname_raw = dl.suggested_filename
4381 | default_fname = f"download_{int(time.time())}.dat"
4382 | suggested_fname = suggested_fname_raw or default_fname
4383 |
4384 | # Remove potentially harmful characters
4385 | safe_fname_chars = re.sub(r"[^\w.\- ]", "_", suggested_fname)
4386 | # Replace whitespace with underscores
4387 | safe_fname_spaces = re.sub(r"\s+", "_", safe_fname_chars)
4388 | # Remove leading/trailing problematic characters
4389 | safe_fname_strip = safe_fname_spaces.strip("._-")
4390 | # Ensure filename is not empty after sanitization
4391 | safe_fname = safe_fname_strip or default_fname
4392 |
4393 | # --- Construct initial desired path (within the verified directory) ---
4394 | initial_desired_path = final_dl_dir_path / safe_fname
4395 |
4396 | # --- Get Unique Path using FileSystemTool ---
4397 | logger.debug(f"Requesting unique path based on initial suggestion: {initial_desired_path}")
4398 | try:
4399 | # Use STANDALONE get_unique_filepath tool
4400 | unique_path_result = await get_unique_filepath(path=str(initial_desired_path))
4401 | if not isinstance(unique_path_result, dict) or not unique_path_result.get("success"):
4402 | error_detail = "Invalid response"
4403 | if isinstance(unique_path_result, dict):
4404 | error_detail = unique_path_result.get("error", "Unknown")
4405 | raise ToolError(
4406 | f"Failed to get unique download path. Filesystem tool error: {error_detail}"
4407 | )
4408 |
4409 | final_unique_path_str = unique_path_result.get("path")
4410 | if not final_unique_path_str:
4411 | raise ToolError(
4412 | "Filesystem tool get_unique_filepath succeeded but did not return a path."
4413 | )
4414 |
4415 | out_path = Path(final_unique_path_str) # Use the unique path for saving
4416 | logger.info(f"Determined unique download save path: {out_path}")
4417 |
4418 | except ToolError as e:
4419 | logger.error(
4420 | f"Error determining unique download path based on '{initial_desired_path}': {e}",
4421 | exc_info=True,
4422 | )
4423 | raise # Re-raise ToolError
4424 | except Exception as e:
4425 | logger.error(
4426 | f"Unexpected error getting unique download path for '{initial_desired_path}': {e}",
4427 | exc_info=True,
4428 | )
4429 | raise ToolError(
4430 | f"An unexpected error occurred finding a unique save path: {str(e)}"
4431 | ) from e
4432 | # --- End Getting Unique Path ---
4433 |
4434 | # --- Save Download using Playwright ---
4435 | logger.info(f"Playwright saving download from '{dl.url}' to unique path: {out_path}")
4436 | # Playwright handles the actual streaming and saving to the specified path
4437 | await dl.save_as(out_path)
4438 | logger.info(f"Playwright download save complete: {out_path}")
4439 |
4440 | # --- Read back file DIRECTLY for Analysis (using out_path) ---
4441 | file_data: Optional[bytes] = None
4442 | file_size = -1
4443 | sha256_hash = None
4444 | read_back_error = None
4445 |
4446 | try:
4447 | # Read the file content using our async helper
4448 | logger.debug(f"Reading back downloaded file directly from {out_path} for analysis...")
4449 | file_data = await _read_file_async(out_path)
4450 | file_size = len(file_data)
4451 | logger.debug(f"Successfully read back {file_size} bytes from {out_path} directly.")
4452 |
4453 | # Handle potential errors during the direct read-back
4454 | except FileNotFoundError:
4455 | read_back_error = f"Downloaded file {out_path} disappeared before read-back."
4456 | # Optionally try to delete the potentially incomplete entry if FS allows
4457 | # try: await delete_path(str(out_path)) # Needs delete_path tool
4458 | # except Exception as del_e: logger.warning(f"Failed to cleanup missing file {out_path}: {del_e}")
4459 | except IOError as e:
4460 | read_back_error = f"IO error reading back downloaded file {out_path}: {e}"
4461 | except Exception as e:
4462 | read_back_error = f"Unexpected error reading back downloaded file {out_path}: {e}"
4463 | # Log full traceback for unexpected errors
4464 | logger.error(f"Unexpected error reading back {out_path}: {e}", exc_info=True)
4465 |
4466 | # If read-back failed, log and raise ToolError indicating partial success/failure
4467 | if read_back_error:
4468 | logger.error(read_back_error)
4469 | # Prepare info about the failed read-back
4470 | partial_info = {
4471 | "success": False, # Mark overall operation as failed due to analysis failure
4472 | "file_path": str(out_path),
4473 | "file_name": out_path.name,
4474 | "error": f"Download saved, but failed to read back for analysis: {read_back_error}",
4475 | "url": dl.url,
4476 | }
4477 | await _log("download_success_readback_fail", target=log_target, **partial_info)
4478 | # Raise ToolError to signal failure clearly to the caller
4479 | raise ToolError(partial_info["error"], details=partial_info)
4480 |
4481 | # --- Hashing and Table Extraction (if read-back succeeded) ---
4482 | # Compute hash from the bytes read directly
4483 | if file_data is not None: # Should always be true if read_back_error is None
4484 | sha256_hash = await _compute_hash_async(file_data)
4485 | logger.debug(f"Computed SHA256 hash for {out_path.name}: {sha256_hash[:8]}...")
4486 | else:
4487 | # This case should technically not be reachable if read_back_error is None
4488 | logger.error(
4489 | f"Internal state error: file_data is None after successful read back for {out_path}."
4490 | )
4491 | # Fallback hash or handle as error? For now, hash will be None.
4492 |
4493 | tables = []
4494 | # Check file extension to decide if table extraction is applicable
4495 | file_extension = out_path.suffix.lower()
4496 | is_table_extractable = file_extension in (".pdf", ".xls", ".xlsx", ".csv")
4497 |
4498 | if is_table_extractable:
4499 | logger.debug(f"Attempting table extraction for {out_path.name}...")
4500 | try:
4501 | # Use the async helper which runs sync extraction in a thread
4502 | # _extract_tables_async reads the file itself from out_path
4503 | table_extraction_task = asyncio.create_task(_extract_tables_async(out_path))
4504 | # Wait for extraction with a timeout
4505 | extraction_timeout = 120 # seconds
4506 | tables = await asyncio.wait_for(table_extraction_task, timeout=extraction_timeout)
4507 | if tables:
4508 | logger.info(
4509 | f"Successfully extracted {len(tables)} table(s) from {out_path.name}"
4510 | )
4511 | else:
4512 | logger.debug(f"No tables found or extracted from {out_path.name}")
4513 |
4514 | except asyncio.TimeoutError:
4515 | logger.warning(
4516 | f"Table extraction timed out after {extraction_timeout}s for {out_path.name}"
4517 | )
4518 | # Ensure the task is cancelled if it timed out
4519 | if "table_extraction_task" in locals() and not table_extraction_task.done():
4520 | table_extraction_task.cancel()
4521 | try:
4522 | # Give cancellation a moment to propagate (best effort)
4523 | await asyncio.wait_for(table_extraction_task, timeout=1.0)
4524 | except asyncio.CancelledError:
4525 | pass # Expected outcome of cancellation
4526 | except asyncio.TimeoutError:
4527 | logger.warning(
4528 | "Timeout waiting for table extraction task cancellation after initial timeout."
4529 | )
4530 | except Exception as cancel_err:
4531 | logger.warning(
4532 | f"Error during table extraction task cancellation: {cancel_err}"
4533 | )
4534 | # Continue, tables will remain empty list
4535 | except Exception as extract_err:
4536 | # Catch other errors during extraction process
4537 | logger.error(
4538 | f"Table extraction failed unexpectedly for {out_path.name}: {extract_err}",
4539 | exc_info=True,
4540 | )
4541 | # Continue, tables will remain empty list
4542 |
4543 | # --- Success ---
4544 | # Prepare the success result dictionary
4545 | info = {
4546 | "success": True,
4547 | "file_path": str(out_path), # Return the final unique absolute path
4548 | "file_name": out_path.name,
4549 | "sha256": sha256_hash, # Use the hash computed from read-back data
4550 | "size_bytes": file_size, # Use the size from read-back data
4551 | "url": dl.url, # URL the download originated from
4552 | "tables_extracted": bool(tables), # Indicate if tables were extracted
4553 | "tables": tables[:5], # Include a preview of first 5 tables (if any)
4554 | }
4555 | # Log success event (exclude large tables data from log details)
4556 | log_info_safe = info.copy()
4557 | if "tables" in log_info_safe:
4558 | del log_info_safe["tables"] # Remove tables for cleaner log
4559 | log_info_safe["num_tables"] = len(tables) if tables else 0
4560 | await _log("download_success", target=log_target, **log_info_safe)
4561 | return info
4562 |
4563 | # --- Error Handling (Catch errors from download initiation or Playwright saving) ---
4564 | except (ToolInputError, ToolError) as e:
4565 | # These errors are raised explicitly above (e.g., dir prep, unique path, read-back) or by smart_click
4566 | # Log the specific error type and message
4567 | error_path_context = str(out_path) if out_path else "N/A"
4568 | await _log("download_fail_other", target=log_target, error=str(e), path=error_path_context)
4569 | raise # Re-raise the specific ToolError/InputError
4570 | except PlaywrightTimeoutError as e:
4571 | # Timeout occurred during page.expect_download or within smart_click
4572 | error_path_context = str(out_path) if out_path else "N/A"
4573 | await _log(
4574 | "download_fail_timeout", target=log_target, error=str(e), path=error_path_context
4575 | )
4576 | raise ToolError(f"Download operation timed out: {e}") from e
4577 | except PlaywrightException as e:
4578 | # Other playwright errors during expect_download, save_as, or smart_click
4579 | error_path_context = str(out_path) if out_path else "N/A"
4580 | await _log(
4581 | "download_fail_playwright", target=log_target, error=str(e), path=error_path_context
4582 | )
4583 | raise ToolError(f"Download failed due to Playwright error: {e}") from e
4584 | except Exception as e:
4585 | # Catch-all for unexpected errors during the download process
4586 | error_path_context = str(out_path) if out_path else "N/A"
4587 | await _log(
4588 | "download_fail_unexpected", target=log_target, error=str(e), path=error_path_context
4589 | )
4590 | logger.error(
4591 | f"Unexpected error during smart_download for hint '{task_hint}': {e}", exc_info=True
4592 | ) # Log traceback
4593 | raise ToolError(f"Unexpected error during download: {e}") from e
4594 |
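# Usage sketch (illustrative only): driving `smart_download` from code that already holds a
# Playwright `page`. The hint text and destination directory are placeholders; `dest_dir`
# must resolve inside a directory the filesystem tools are allowed to write to.
#
#   info = await smart_download(
#       page,
#       task_hint="Download annual report PDF link",
#       dest_dir="storage/smart_browser_downloads/reports",
#   )
#   print(info["file_name"], info["size_bytes"], info["sha256"][:8])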
4595 |
4596 | # --- PDF/Docs Crawler Helpers (Keep as is, minor splits) ---
4597 | _SLUG_RE = re.compile(r"[^a-z0-9\-_]+")
4598 |
4599 |
4600 | def _slugify(text: str, max_len: int = 60) -> str:
4601 | """Converts text to a URL-friendly slug."""
4602 | if not text:
4603 | return "file" # Default slug for empty input
4604 |
4605 | # Normalize Unicode characters (e.g., accents to base letters)
4606 | normalized_text = unicodedata.normalize("NFKD", text)
4607 | # Encode to ASCII, ignoring characters that cannot be represented
4608 | ascii_bytes = normalized_text.encode("ascii", "ignore")
4609 | # Decode back to string
4610 | ascii_text = ascii_bytes.decode()
4611 | # Convert to lowercase
4612 | lower_text = ascii_text.lower()
4613 | # Replace non-alphanumeric (excluding '-', '_') with hyphens
4614 | slug_hyphens = _SLUG_RE.sub("-", lower_text)
4615 | # Remove leading/trailing hyphens
4616 | slug_trimmed = slug_hyphens.strip("-")
4617 | # Replace multiple consecutive hyphens with a single hyphen
4618 | slug_single_hyphens = re.sub(r"-{2,}", "-", slug_trimmed)
4619 | # Truncate to maximum length
4620 | slug_truncated = slug_single_hyphens[:max_len]
4621 | # Trim hyphens again after potential truncation
4622 | final_slug = slug_truncated.strip("-")
4623 |
4624 | # Ensure slug is not empty after all operations
4625 | return final_slug or "file" # Return default if empty
4626 |
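# Behaviour sketch (illustrative only): expected outputs of `_slugify`, traced from the
# steps above; outputs are indicative.
#
#   _slugify("My Report (Final).PDF")  ->  "my-report-final-pdf"
#   _slugify("Résumé 2024")            ->  "resume-2024"
#   _slugify("")                       ->  "file"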
4627 |
4628 | def _get_dir_slug(url: str) -> str:
4629 | """Creates a slug based on the last path components or domain of a URL."""
4630 | try:
4631 | parsed_url = urlparse(url)
4632 | # Split path into components, filtering out empty strings and root slash
4633 | path_obj = Path(parsed_url.path)
4634 | path_parts = []
4635 | for part in path_obj.parts:
4636 | if part and part != "/":
4637 | path_parts.append(part)
4638 |
4639 | # Create slug based on path components
4640 | num_parts = len(path_parts)
4641 | if num_parts >= 2:
4642 | # Use last two path parts if available
4643 | part_minus_2_slug = _slugify(path_parts[-2], 20)
4644 | part_minus_1_slug = _slugify(path_parts[-1], 20)
4645 | dir_slug = f"{part_minus_2_slug}-{part_minus_1_slug}"
4646 | return dir_slug
4647 | elif num_parts == 1:
4648 | # Use the single path part
4649 | part_slug = _slugify(path_parts[-1], 40)
4650 | return part_slug
4651 | else:
4652 | # Fallback to domain name if path is empty or just '/'
4653 | domain_slug = _slugify(parsed_url.netloc, 40)
4654 | return domain_slug or "domain" # Use 'domain' if netloc is also empty
4655 |
4656 | except Exception as e:
4657 | logger.warning(f"Error creating directory slug for URL '{url}': {e}")
4658 | return "path" # Fallback slug on error
4659 |
4660 |
4661 | async def _fetch_html(
4662 | client: httpx.AsyncClient, url: str, rate_limiter: Optional["RateLimiter"] = None
4663 | ) -> Optional[str]:
4664 | """Fetches HTML content from a URL using httpx, respecting rate limits."""
4665 | try:
4666 | # Acquire rate limit permit if limiter is provided
4667 | if rate_limiter:
4668 | await rate_limiter.acquire()
4669 |
4670 | # Make GET request with streaming response
4671 | request_timeout = 20.0
4672 | async with client.stream(
4673 | "GET", url, follow_redirects=True, timeout=request_timeout
4674 | ) as response:
4675 | # Check for non-success status codes
4676 | response.raise_for_status() # Raises HTTPStatusError for 4xx/5xx
4677 |
4678 | # Handle No Content response
4679 | if response.status_code == 204:
4680 | logger.debug(f"Received HTTP 204 No Content for {url}")
4681 | return None
4682 |
4683 | # Check content type - must be HTML
4684 | content_type_header = response.headers.get("content-type", "")
4685 | content_type = content_type_header.lower()
4686 | if "text/html" not in content_type:
4687 | logger.debug(f"Skipping non-HTML content type '{content_type}' for {url}")
4688 | return None
4689 |
4690 | # Check content length limit
4691 | max_html_size = 5 * 1024 * 1024 # 5 MiB
4692 | content_length_header = response.headers.get("content-length")
4693 | if content_length_header:
4694 | try:
4695 | content_length = int(content_length_header)
4696 | if content_length > max_html_size:
4697 | logger.debug(
4698 | f"Skipping large HTML content ({content_length} bytes) for {url}"
4699 | )
4700 | return None
4701 | except ValueError:
4702 | logger.warning(
4703 | f"Invalid Content-Length header '{content_length_header}' for {url}"
4704 | )
4705 | # Proceed cautiously without length check
4706 |
4707 | # Read the response body bytes
4708 | html_bytes = await response.aread()
4709 |
4710 | # Decode HTML bytes to string (try UTF-8, then fallback)
4711 | decoded_html: Optional[str] = None
4712 | try:
4713 | decoded_html = html_bytes.decode("utf-8")
4714 | except UnicodeDecodeError:
4715 | try:
4716 | # Fallback to Latin-1 if UTF-8 fails
4717 | decoded_html = html_bytes.decode("iso-8859-1")
4718 | logger.debug(f"Decoded HTML from {url} using iso-8859-1 fallback.")
4719 | except UnicodeDecodeError:
4720 | # Log warning if both decodings fail
4721 | logger.warning(f"Could not decode HTML from {url} using utf-8 or iso-8859-1.")
4722 | return None # Cannot process undecodable content
4723 |
4724 | return decoded_html
4725 |
4726 | except httpx.HTTPStatusError as e:
4727 | # Log client/server errors (4xx/5xx)
4728 | status_code = e.response.status_code
4729 | logger.debug(f"HTTP error {status_code} fetching {url}: {e}")
4730 | return None
4731 | except httpx.RequestError as e:
4732 | # Log network-related errors (DNS, connection, timeout etc.)
4733 | logger.warning(f"Network error fetching {url}: {e}")
4734 | return None
4735 | except Exception as e:
4736 | # Log other unexpected errors during fetch
4737 | logger.error(f"Unexpected error fetching {url}: {e}", exc_info=True)
4738 | return None
4739 |
4740 |
4741 | def _extract_links(base_url: str, html: str) -> Tuple[List[str], List[str]]:
4742 | """Extracts absolute PDF and internal HTML page links from HTML content."""
4743 | pdfs: Set[str] = set()
4744 | pages: Set[str] = set()
4745 | try:
4746 | soup = BeautifulSoup(html, "html.parser") # Use default parser
4747 | parsed_base_url = urlparse(base_url)
4748 | base_netloc = parsed_base_url.netloc
4749 |
4750 | # Find all <a> tags with an href attribute
4751 | anchor_tags = soup.find_all("a", href=True)
4752 |
4753 | for a in anchor_tags:
4754 | href_raw = a["href"]
4755 | # Skip empty, fragment, mailto, tel, or javascript links
4756 | if not href_raw or href_raw.startswith(("#", "mailto:", "tel:", "javascript:")):
4757 | continue
4758 |
4759 | try:
4760 | # Resolve relative URLs to absolute URLs
4761 | abs_url = urllib.parse.urljoin(base_url, href_raw)
4762 | parsed_url = urlparse(abs_url)
4763 |
4764 | # Clean URL by removing fragment identifier
4765 | clean_url = parsed_url._replace(fragment="").geturl()
4766 | path_lower = parsed_url.path.lower()
4767 |
4768 | # Check if it's a PDF link
4769 | if path_lower.endswith(".pdf"):
4770 | pdfs.add(clean_url)
4771 | # Check if it's an internal HTML page link
4772 | elif parsed_url.netloc == base_netloc:
4773 | # Check if path seems like HTML or directory listing
4774 | is_html_like = path_lower.endswith((".html", ".htm", "/"))
4775 | # Or if it has no file extension in the last path segment
4776 | path_name = Path(parsed_url.path).name
4777 | has_no_ext = "." not in path_name
4778 | # Ensure it's not mistakenly identified as PDF again
4779 | not_pdf = not path_lower.endswith(".pdf")
4780 |
4781 | if (is_html_like or has_no_ext) and not_pdf:
4782 | pages.add(clean_url)
4783 |
4784 | except ValueError:
4785 | # Ignore errors resolving invalid URLs (e.g., bad characters)
4786 | pass
4787 | except Exception as link_err:
4788 | # Log other errors during link processing
4789 | logger.warning(f"Error processing link '{href_raw}' on page {base_url}: {link_err}")
4790 |
4791 | except Exception as soup_err:
4792 | # Log errors during BeautifulSoup parsing
4793 | logger.error(f"Error parsing HTML for links on {base_url}: {soup_err}", exc_info=True)
4794 |
4795 | # Return lists of unique PDF and page URLs found
4796 | return list(pdfs), list(pages)
4797 |
4798 |
4799 | class RateLimiter: # Keep class definition
4800 | """Simple asynchronous rate limiter using asyncio.Lock."""
4801 |
4802 | def __init__(self, rate_limit: float = 1.0):
4803 | if rate_limit <= 0:
4804 | raise ValueError("Rate limit must be positive.")
4805 | # Calculate the minimum interval between requests in seconds
4806 | self.interval = 1.0 / rate_limit
4807 | self.last_request_time: float = 0 # Time of the last request completion
4808 | self.lock = asyncio.Lock() # Lock to ensure atomic check/wait/update
4809 |
4810 | async def acquire(self):
4811 | """Acquires a permit, sleeping if necessary to maintain the rate limit."""
4812 | async with self.lock:
4813 | now = time.monotonic()
4814 | time_since_last = now - self.last_request_time
4815 | # Calculate how long we need to wait
4816 | time_to_wait = self.interval - time_since_last
4817 |
4818 | if time_to_wait > 0:
4819 | # Sleep for the required duration
4820 | await asyncio.sleep(time_to_wait)
4821 | # Update 'now' after sleeping
4822 | now = time.monotonic()
4823 |
4824 | # Update the last request time to the current time
4825 | self.last_request_time = now
4826 |
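# Usage sketch (illustrative only): throttling fetches to ~2 requests/second with the
# RateLimiter above. `fetch_one` is a hypothetical coroutine, not part of this module.
#
#   limiter = RateLimiter(rate_limit=2.0)
#
#   async def fetch_one(client: httpx.AsyncClient, url: str) -> Optional[str]:
#       await limiter.acquire()  # sleeps as needed to keep ~0.5s between requests
#       resp = await client.get(url)
#       return resp.text if resp.status_code == 200 else None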
4827 |
4828 | async def crawl_for_pdfs(
4829 | start_url: str,
4830 | include_regex: Optional[str] = None,
4831 | max_depth: int = 2,
4832 | max_pdfs: int = 100,
4833 | max_pages_crawl: int = 500,
4834 | rate_limit_rps: float = 2.0,
4835 | ) -> List[str]:
4836 | """Crawls a website to find PDF links."""
4837 | # Compile include regex if provided
4838 | inc_re: Optional[re.Pattern] = None
4839 | if include_regex:
4840 | try:
4841 | inc_re = re.compile(include_regex, re.IGNORECASE)
4842 | except re.error as e:
4843 | raise ToolInputError(f"Invalid include_regex provided: {e}") from e
4844 |
4845 | # Initialize crawl state
4846 | seen_urls: Set[str] = set()
4847 | pdf_urls_found: Set[str] = set()
4848 | # Queue stores tuples of (url, depth)
4849 | queue: deque[tuple[str, int]] = deque()
4850 | queue.append((start_url, 0)) # Start at depth 0
4851 | seen_urls.add(start_url)
4852 | visit_count = 0
4853 | rate_limiter = RateLimiter(rate_limit_rps)
4854 | base_netloc = urlparse(start_url).netloc
4855 | # Basic user agent for politeness
4856 | headers = {
4857 | "User-Agent": "Mozilla/5.0 (compatible; SmartBrowserBot/1.0; +http://example.com/bot)"
4858 | }
4859 |
4860 | # Use httpx.AsyncClient for connection pooling
4861 | client_timeout = 30.0
4862 | async with httpx.AsyncClient(
4863 | follow_redirects=True, timeout=client_timeout, headers=headers
4864 | ) as client:
4865 | # Main crawl loop
4866 | while queue:
4867 | # Check stopping conditions
4868 | if len(pdf_urls_found) >= max_pdfs:
4869 | logger.info(f"PDF crawl stopped: Max PDFs ({max_pdfs}) reached.")
4870 | break
4871 | if visit_count >= max_pages_crawl:
4872 | logger.warning(f"PDF crawl stopped: Max pages crawled ({max_pages_crawl}) reached.")
4873 | break
4874 |
4875 | # Get next URL and depth from queue
4876 | current_url, current_depth = queue.popleft()
4877 | visit_count += 1
4878 | logger.debug(f"Crawling [Depth {current_depth}, Visit {visit_count}]: {current_url}")
4879 |
4880 | # Fetch HTML content for the current page
4881 | html = await _fetch_html(client, current_url, rate_limiter)
4882 | if not html:
4883 | continue # Skip if fetch failed or not HTML
4884 |
4885 | # Extract links from the fetched HTML
4886 | pdfs, pages = _extract_links(current_url, html)
4887 |
4888 | # Process found PDF links
4889 | for pdf_url in pdfs:
4890 | if pdf_url not in pdf_urls_found:
4891 | # Apply include regex if specified
4892 | if inc_re is None or inc_re.search(pdf_url):
4893 | pdf_urls_found.add(pdf_url)
4894 | logger.info(f"PDF found: {pdf_url} (Total: {len(pdf_urls_found)})")
4895 | # Check if max PDFs reached after adding
4896 | if len(pdf_urls_found) >= max_pdfs:
4897 | break # Exit inner loop
4898 |
4899 | # Check max PDFs again after processing all PDFs on page
4900 | if len(pdf_urls_found) >= max_pdfs:
4901 | break # Exit outer loop
4902 |
4903 | # Process found HTML page links for further crawling
4904 | if current_depth < max_depth:
4905 | for page_url in pages:
4906 | try:
4907 | parsed_page_url = urlparse(page_url)
4908 | # Only crawl pages on the same domain and not seen before
4909 | is_same_domain = parsed_page_url.netloc == base_netloc
4910 | is_not_seen = page_url not in seen_urls
4911 | if is_same_domain and is_not_seen:
4912 | seen_urls.add(page_url)
4913 | # Add to queue with incremented depth
4914 | queue.append((page_url, current_depth + 1))
4915 | except ValueError:
4916 | # Ignore errors parsing potential page URLs
4917 | pass
4918 |
4919 | # Log final counts after loop finishes
4920 | logger.info(
4921 | f"PDF crawl finished. Found {len(pdf_urls_found)} matching PDFs after visiting {visit_count} pages."
4922 | )
4923 | return list(pdf_urls_found)
4924 |
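# Usage sketch (illustrative only): crawling a site for PDFs. The start URL and regex are
# placeholders; results depend entirely on the live site.
#
#   pdf_links = await crawl_for_pdfs(
#       start_url="https://example.org/docs/",
#       include_regex=r"whitepaper|report",
#       max_depth=2,
#       max_pdfs=25,
#   )
#   for link in pdf_links:
#       print(link)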
4925 |
4926 | async def _download_file_direct(
4927 | url: str, dest_dir_str: str, seq: int = 1
4928 | ) -> Dict: # Uses Filesystem Tools
4929 | """Downloads a file directly using httpx and saves using filesystem tools."""
4930 | final_output_path_str: Optional[str] = None # Path where file is ultimately saved
4931 | downloaded_content: Optional[bytes] = None
4932 | initial_filename = "" # Keep track for error reporting
4933 |
4934 | try:
4935 | # --- Determine Initial Filename ---
4936 | parsed_url = urlparse(url)
4937 | path_basename = os.path.basename(parsed_url.path) if parsed_url.path else ""
4938 |
4939 | # Create a filename if URL path is empty or root, or has no extension
4940 | use_generated_name = not path_basename or path_basename == "/" or "." not in path_basename
4941 |
4942 | if use_generated_name:
4943 | dir_slug = _get_dir_slug(url) # Slug based on parent path or domain
4944 | base_name = f"{seq:03d}_{dir_slug}_{_slugify(path_basename or 'download')}"
4945 | # Add appropriate extension (default .dat)
4946 | file_ext = ".pdf" if url.lower().endswith(".pdf") else ".dat"
4947 | initial_filename = base_name + file_ext
4948 | else:
4949 | # Use and sanitize the filename from the URL path
4950 | sanitized_basename = _slugify(path_basename)
4951 | initial_filename = f"{seq:03d}_{sanitized_basename}"
4952 |
4953 | # Initial desired path within the destination directory
4954 | initial_desired_path = os.path.join(dest_dir_str, initial_filename)
4955 | refined_desired_path = initial_desired_path # Start with initial path
4956 |
4957 | # --- Fetch File Content ---
4958 | headers = {
4959 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", # Standard UA
4960 | "Accept": "*/*",
4961 | "Accept-Encoding": "gzip, deflate, br",
4962 | "Connection": "keep-alive",
4963 | }
4964 | download_timeout = 120.0 # Allow 2 minutes for download
4965 | async with httpx.AsyncClient(
4966 | follow_redirects=True, timeout=download_timeout, headers=headers
4967 | ) as client:
4968 | async with client.stream("GET", url) as response:
4969 | # Check for successful status code
4970 | if response.status_code != 200:
4971 | error_msg = f"HTTP {response.status_code} {response.reason_phrase}"
4972 | status_code = response.status_code
4973 | # Return error dictionary immediately
4974 | return {
4975 | "url": url,
4976 | "error": error_msg,
4977 | "status_code": status_code,
4978 | "success": False,
4979 | "path": initial_desired_path, # Report intended path on error
4980 | }
4981 |
4982 | # --- Refine Filename based on Headers (Content-Disposition, Content-Type) ---
4983 | # Check Content-Disposition header for filename suggestion
4984 | content_disposition = response.headers.get("content-disposition")
4985 | if content_disposition:
4986 | # Simple regex to find filename*= or filename=
4987 | match = re.search(r'filename\*?="?([^"]+)"?', content_disposition)
4988 | if match:
4989 | header_filename_raw = match.group(1)
4990 | # Try URL decoding potential encoding
4991 | try:
4992 | header_filename_decoded = urllib.parse.unquote(header_filename_raw)
4993 | except Exception:
4994 | header_filename_decoded = header_filename_raw # Fallback
4995 | # Sanitize and prepend sequence number
4996 | refined_filename = f"{seq:03d}_{_slugify(header_filename_decoded)}"
4997 | refined_desired_path = os.path.join(dest_dir_str, refined_filename)
4998 | logger.debug(
4999 | f"Refined filename from Content-Disposition: {refined_filename}"
5000 | )
5001 |
5002 | # Check Content-Type header to potentially correct extension
5003 | content_type_header = response.headers.get("content-type", "")
5004 | content_type = content_type_header.split(";")[0].strip().lower()
5005 | current_stem, current_ext = os.path.splitext(refined_desired_path)
5006 | # Correct extension if Content-Type is PDF and current ext isn't
5007 | if content_type == "application/pdf" and current_ext.lower() != ".pdf":
5008 | refined_desired_path = current_stem + ".pdf"
5009 | logger.debug("Corrected file extension to .pdf based on Content-Type.")
5010 |
5011 | # Read the downloaded content
5012 | downloaded_content = await response.aread()
5013 | bytes_read = len(downloaded_content)
5014 | logger.debug(f"Downloaded {bytes_read} bytes for {url}.")
5015 |
5016 | # Ensure content was downloaded
5017 | if downloaded_content is None:
5018 | raise ToolError(
5019 | "Downloaded content is unexpectedly None after successful HTTP request."
5020 | )
5021 |
5022 | # --- Get Unique Save Path using Filesystem Tool ---
5023 | try:
5024 | unique_path_result = await get_unique_filepath(
5025 | path=refined_desired_path
5026 | ) # STANDALONE call
5027 | if not isinstance(unique_path_result, dict) or not unique_path_result.get("success"):
5028 | error_msg = (
5029 | unique_path_result.get("error", "Unknown")
5030 | if isinstance(unique_path_result, dict)
5031 | else "Invalid response"
5032 | )
5033 | raise ToolError(f"Failed to get unique download path. Error: {error_msg}")
5034 |
5035 | final_output_path_str = unique_path_result.get("path")
5036 | if not final_output_path_str:
5037 | raise ToolError(
5038 | "Filesystem tool get_unique_filepath succeeded but did not return path."
5039 | )
5040 | logger.info(f"Determined unique download save path: {final_output_path_str}")
5041 | except Exception as e:
5042 | # Wrap error getting unique path
5043 | raise ToolError(
5044 | f"Could not determine unique save path based on '{refined_desired_path}': {str(e)}"
5045 | ) from e
5046 |
5047 | # --- Write File using Filesystem Tool ---
5048 | try:
5049 | write_result = await write_file(
5050 | path=final_output_path_str, content=downloaded_content
5051 | ) # STANDALONE call
5052 | if not isinstance(write_result, dict) or not write_result.get("success"):
5053 | error_msg = (
5054 | write_result.get("error", "Unknown")
5055 | if isinstance(write_result, dict)
5056 | else "Invalid response"
5057 | )
5058 | raise ToolError(
5059 | f"Filesystem tool failed to write downloaded file to '{final_output_path_str}'. Error: {error_msg}"
5060 | )
5061 | logger.info(f"Successfully saved file to: {final_output_path_str}")
5062 | except Exception as e:
5063 | # Wrap error during file write
5064 | raise ToolError(
5065 | f"Could not write downloaded file to '{final_output_path_str}': {str(e)}"
5066 | ) from e
5067 |
5068 | # --- Calculate Hash ---
5069 | hasher = hashlib.sha256()
5070 | hasher.update(downloaded_content)
5071 | file_hash = hasher.hexdigest()
5072 |
5073 | # --- Log and Return Success ---
5074 | await _log(
5075 | "download_direct_success",
5076 | url=url,
5077 | file=final_output_path_str,
5078 | size=bytes_read,
5079 | sha256=file_hash,
5080 | )
5081 | return {
5082 | "url": url,
5083 | "file": final_output_path_str, # The actual saved path
5084 | "size": bytes_read,
5085 | "sha256": file_hash,
5086 | "success": True,
5087 | }
5088 |
5089 | except httpx.RequestError as e:
5090 | # Handle network errors during download attempt
5091 | logger.warning(f"Network error downloading {url}: {e}")
5092 | return {
5093 | "url": url,
5094 | "error": f"Network error: {e}",
5095 | "success": False,
5096 | "path": final_output_path_str or initial_filename,
5097 | } # Report final path if available
5098 | except (ToolError, ToolInputError) as e:
5099 | # Handle errors raised explicitly during path/write operations
5100 | logger.error(f"Tool error downloading {url} directly: {e}", exc_info=True)
5101 | return {
5102 | "url": url,
5103 | "error": f"Download failed: {e}",
5104 | "success": False,
5105 | "path": final_output_path_str or initial_filename,
5106 | }
5107 | except Exception as e:
5108 | # Catch any other unexpected errors
5109 | logger.error(f"Unexpected error downloading {url} directly: {e}", exc_info=True)
5110 | return {
5111 | "url": url,
5112 | "error": f"Download failed unexpectedly: {e}",
5113 | "success": False,
5114 | "path": final_output_path_str or initial_filename,
5115 | }
5116 |
5117 |
5118 | # --- OSS Documentation Crawler Helpers ---
5119 | _DOC_EXTS = (".html", ".htm", "/") # Common extensions/endings for HTML pages
5120 | _DOC_STOP_PAT = re.compile(
5121 | r"\.(png|jpg|jpeg|gif|svg|css|js|zip|tgz|gz|whl|exe|dmg|ico|woff|woff2|map|json|xml|txt|pdf|md)$", # Added pdf, md
5122 | re.IGNORECASE,
5123 | ) # File extensions to ignore during crawl
5124 |
5125 |
5126 | def _looks_like_docs_url(url: str) -> bool:
5127 | """
5128 | Heuristically checks if a URL looks like a documentation page.
5129 |
5130 | Args:
5131 | url: The URL string to check.
5132 |
5133 | Returns:
5134 | True if the URL appears to be a documentation page, False otherwise.
5135 | """
5136 | if not url or not isinstance(url, str):
5137 | return False
5138 |
5139 | try:
5140 | url_low = url.lower()
5141 | parsed = urllib.parse.urlparse(url_low)
5142 |
5143 | # 1. Penalize URLs with query strings (often dynamic/non-doc pages)
5144 | if parsed.query:
5145 | return False
5146 |
5147 | # 2. Penalize common non-doc paths explicitly
5148 | common_non_doc_paths = [
5149 | # Common application paths
5150 | "/api/", # Sometimes docs, but often API endpoints themselves
5151 | "/blog/",
5152 | "/news/",
5153 | "/community/",
5154 | "/forum/",
5155 | "/support/",
5156 | "/contact/",
5157 | "/about/",
5158 | "/pricing/",
5159 | "/login/",
5160 | "/register/",
5161 | "/signup/",
5162 | "/signin/",
5163 | "/account/",
5164 | "/profile/",
5165 | "/cart/",
5166 | "/checkout/",
5167 | # Common asset/download paths
5168 | "/download/",
5169 | "/install/",
5170 | "/_static/",
5171 | "/_images/",
5172 | "/assets/",
5173 | "/media/",
5174 | "/static/",
5175 | "/vendor/",
5176 | "/node_modules/",
5177 | # Specific framework/site paths unlikely to be main docs
5178 | "/wp-content/",
5179 | "/wp-admin/",
5180 | "/sites/default/files/",
5181 | ]
5182 | # Use a generator expression for slightly better efficiency
5183 | if any(non_doc_path in parsed.path for non_doc_path in common_non_doc_paths):
5184 | return False
5185 |
5186 | # 3. Check for keywords indicating documentation in URL or path
5187 | doc_keywords = [
5188 | "docs",
5189 | "doc",
5190 | "documentation",
5191 | "guide",
5192 | "manual",
5193 | "tutorial",
5194 | "tuto",
5195 | "reference",
5196 | "ref",
5197 | "api",
5198 | "faq",
5199 | "howto",
5200 | "userguide",
5201 | "develop",
5202 | "example",
5203 | "usage",
5204 | "getting-started",
5205 | "quickstart",
5206 | ]
5207 | # Check in netloc (e.g., docs.example.com) and path
5208 | has_doc_keyword = any(
5209 | keyword in parsed.netloc or keyword in parsed.path for keyword in doc_keywords
5210 | )
5211 |
5212 | # 4. Check if URL ends with typical HTML extension or directory slash
5213 | ends_with_doc_ext = url_low.endswith(_DOC_EXTS)
5214 |
5215 | # 5. Check if URL is hosted on a common documentation platform
5216 | common_doc_hosts = [
5217 | "readthedocs.io",
5218 | "netlify.app",
5219 | "vercel.app",
5220 | "github.io",
5221 | "gitlab.io",
5222 | "pages.dev", # Cloudflare Pages
5223 | "gitbook.io",
5224 | "docusaurus.io", # Often custom domains, but sometimes subdomains
5225 | ]
5226 | is_common_host = any(host in parsed.netloc for host in common_doc_hosts)
5227 |
5228 | # 6. Check if URL path contains a file extension we want to stop at
5229 | path_has_stop_ext = bool(_DOC_STOP_PAT.search(parsed.path))
5230 |
5231 | # Combine checks:
5232 | # - MUST NOT have a stop extension
5233 | # - MUST satisfy one of the positive indicators:
5234 | # - Contains a documentation keyword
5235 | # - Ends like an HTML page or directory
5236 | # - Is hosted on a common documentation platform
5237 | is_likely_doc = not path_has_stop_ext and (
5238 | has_doc_keyword or ends_with_doc_ext or is_common_host
5239 | )
5240 |
5241 | # Log decision process if debugging needed
5242 | # logger.debug(f"URL Check: {url_low} -> StopExt:{path_has_stop_ext}, Keyword:{has_doc_keyword}, DocExt:{ends_with_doc_ext}, CommonHost:{is_common_host} => LikelyDoc:{is_likely_doc}")
5243 |
5244 | return is_likely_doc
5245 |
5246 | except ValueError: # Handle potential errors from urlparse
5247 | logger.warning(f"Error parsing URL for documentation check: {url}", exc_info=True)
5248 | return False
5249 | except Exception as e: # Catch any other unexpected errors
5250 | logger.error(f"Unexpected error in _looks_like_docs_url for {url}: {e}", exc_info=True)
5251 | return False
5252 |
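# Behaviour sketch (illustrative only): indicative results of the heuristic above; the
# URLs are placeholders and the outcomes track the rules as currently written.
#
#   _looks_like_docs_url("https://docs.example.com/en/latest/")        ->  True   (doc keyword, trailing slash)
#   _looks_like_docs_url("https://example.readthedocs.io/en/stable/")  ->  True   (common docs host)
#   _looks_like_docs_url("https://example.com/blog/announcement.html") ->  False  (penalized /blog/ path)
#   _looks_like_docs_url("https://example.com/files/release.zip")      ->  False  (stop extension .zip)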
5253 |
5254 | async def _pick_docs_root(pkg_name: str) -> Optional[str]:
5255 | """
5256 | Attempts to find the root documentation URL for a package using web search.
5257 |
5258 | Uses multiple search queries and engines, then applies heuristics (_looks_like_docs_url)
5259 | to find the most likely documentation root URL.
5260 |
5261 | Args:
5262 | pkg_name: The name of the package to find documentation for.
5263 |
5264 | Returns:
5265 | The most likely documentation root URL as a string, or None if not found.
5266 |
5267 | Raises:
5268 | ToolInputError: If the package name is invalid.
5269 | ToolError: If the web search fails critically or no suitable URL is found.
5270 | """
5271 | if not pkg_name or not isinstance(pkg_name, str):
5272 | raise ToolInputError("Package name must be a non-empty string.")
5273 |
5274 | try:
5275 | logger.info(f"Searching for documentation root for package: '{pkg_name}'")
5276 |
5277 | # --- Prepare search queries and engines ---
5278 | queries = [
5279 | f'"{pkg_name}" official documentation website', # More precise
5280 | f"{pkg_name} documentation",
5281 | f"{pkg_name} python library docs", # Specific to python
5282 | f"{pkg_name} user guide",
5283 | f"how to use {pkg_name}",
5284 | ]
5285 | # Cycle engines to mitigate potential blocks/bias or differing results
5286 | engines = ["duckduckgo", "bing"]
5287 | all_search_hits: List[Dict[str, Any]] = []
5288 | MAX_RESULTS_PER_QUERY = 3 # Get fewer results per query, but run more queries
5289 |
5290 | # --- Run searches ---
5291 | for i, query in enumerate(queries):
5292 | engine = engines[i % len(engines)]
5293 | logger.debug(f"Trying search query [{i + 1}/{len(queries)}]: '{query}' on {engine}")
5294 | try:
5295 | await asyncio.sleep(0.2) # Small delay between searches
5296 | # Assuming search_web returns a list of dicts directly now
5297 | search_res_list = await search_web(
5298 | query, engine=engine, max_results=MAX_RESULTS_PER_QUERY
5299 | )
5300 | if isinstance(search_res_list, list):
5301 | all_search_hits.extend(search_res_list)
5302 | else:
5303 | # Log if search_web returns unexpected format (it shouldn't based on its definition)
5304 | logger.warning(
5305 | f"Search query '{query}' on {engine} returned unexpected format: {type(search_res_list)}. Expected list."
5306 | )
5307 |
5308 | except ToolError as search_err:
5309 | # Log specific tool errors from search_web but continue trying other queries
5310 | logger.warning(f"Web search query '{query}' failed on {engine}: {search_err}")
5311 | except Exception as e:
5312 | # Log unexpected errors during a specific search call but continue
5313 | logger.error(
5314 | f"Unexpected error during web search for query '{query}': {e}", exc_info=True
5315 | )
5316 |
5317 | # Check if any results were gathered at all
5318 | if not all_search_hits:
5319 | raise ToolError(
5320 | f"Web search yielded no results for documentation queries related to '{pkg_name}'."
5321 | )
5322 |
5323 | # --- Evaluate results ---
5324 | logger.debug(
5325 | f"Evaluating {len(all_search_hits)} potential documentation URLs for '{pkg_name}'."
5326 | )
5327 | best_candidate: Optional[str] = None
5328 | candidate_urls_considered: Set[str] = set()
5329 |
5330 | for i, hit in enumerate(all_search_hits): # Add index for logging
5331 | url = hit.get("url")
5332 | title = hit.get("title", "N/A") # Get title for context
5333 | logger.debug(
5334 | f" Hit [{i + 1}/{len(all_search_hits)}]: URL='{url}', Title='{title}'"
5335 | ) # Log the hit being processed
5336 |
5337 | if not url:
5338 | logger.debug(" -> Skipping hit (no URL)")
5339 | continue
5340 |
5341 | # Basic URL cleaning: normalize scheme, netloc, path; remove fragment
5342 | try:
5343 | parsed_hit = urllib.parse.urlparse(url)
5344 | # Remove www. prefix for easier comparison
5345 | cleaned_netloc = parsed_hit.netloc.lower().replace("www.", "")
5346 | # Reconstruct URL without fragment, using cleaned netloc
5347 | cleaned_url = parsed_hit._replace(fragment="", netloc=cleaned_netloc).geturl()
5348 |
5349 | # Ensure URL is not already processed (avoids redundant checks)
5350 | if cleaned_url in candidate_urls_considered:
5351 | logger.debug(f" -> Skipping hit (already considered: {cleaned_url})")
5352 | continue
5353 | candidate_urls_considered.add(cleaned_url)
5354 |
5355 | except ValueError:
5356 | # Handle potential errors during URL parsing
5357 | logger.warning(f" -> Skipping hit (invalid URL): {url}")
5358 | continue
5359 |
5360 | # Apply the heuristic check (_looks_like_docs_url assumes it's defined elsewhere)
5361 | is_likely = _looks_like_docs_url(cleaned_url)
5362 | logger.debug(
5363 | f" -> Heuristic check for '{cleaned_url}': {is_likely}"
5364 | ) # Log heuristic result
5365 |
5366 | if is_likely:
5367 | logger.info(
5368 | f"Found likely documentation page via search: {cleaned_url} (Original: {url})"
5369 | )
5370 | # Simple strategy: take the *first* likely candidate found.
5371 | best_candidate = cleaned_url
5372 | break # Stop after finding the first likely candidate
5373 |
5374 | # --- Fallback if heuristic finds nothing ---
5375 | if not best_candidate and all_search_hits:
5376 | # Fallback: Take the first result URL, clean it, and hope for the best.
5377 | first_url_original = all_search_hits[0].get("url")
5378 | if first_url_original:
5379 | try:
5380 | parsed_first = urllib.parse.urlparse(first_url_original)
5381 | # Perform the same cleaning as above for consistency
5382 | cleaned_first_netloc = parsed_first.netloc.lower().replace("www.", "")
5383 | cleaned_first_url = parsed_first._replace(
5384 | fragment="", netloc=cleaned_first_netloc
5385 | ).geturl()
5386 | logger.warning(
5387 | f"_looks_like_docs_url heuristic failed. Falling back to first search result: {cleaned_first_url}"
5388 | )
5389 | best_candidate = cleaned_first_url
5390 | except ValueError:
5391 | logger.error(f"Could not parse fallback first URL: {first_url_original}")
5392 | # best_candidate remains None, error will be raised below
5393 |
5394 | # --- Final Check and Root Derivation ---
5395 | if not best_candidate:
5396 | logger.error(
5397 | f"Could not find any suitable documentation URL for '{pkg_name}' after evaluating {len(candidate_urls_considered)} candidates."
5398 | )
5399 | # Optionally log considered URLs if helpful for debugging
5400 | # logger.debug(f"Considered URLs: {candidate_urls_considered}")
5401 | raise ToolError(
5402 | f"Could not automatically find a likely documentation site for package '{pkg_name}'. Web search did not yield a suitable URL."
5403 | )
5404 |
5405 | # Try to derive a more "root" URL from the best candidate found
5406 | final_root_url: str
5407 | try:
5408 | parsed_candidate = urllib.parse.urlparse(best_candidate)
5409 | path_segments = [seg for seg in parsed_candidate.path.split("/") if seg]
5410 |
5411 | # If the path has multiple segments, try going up one level
5412 | # Only do this if the parent path still looks like documentation
5413 | if len(path_segments) > 1:
5414 | parent_path = "/".join(path_segments[:-1])
5415 | # Ensure trailing slash for derived root URL, clear query/fragment
5416 | root_derived = parsed_candidate._replace(
5417 | path=f"/{parent_path}/", query="", fragment=""
5418 | ).geturl()
5419 |
5420 | # Check if the derived parent path still looks like docs
5421 | if _looks_like_docs_url(root_derived):
5422 | logger.info(
5423 | f"Derived potential docs root by going up one level: {root_derived}"
5424 | )
5425 | final_root_url = root_derived
5426 | else:
5427 | # Parent doesn't look like docs, stick with the cleaned candidate URL
5428 | final_root_url = parsed_candidate._replace(query="", fragment="").geturl()
5429 | logger.info(
5430 | f"Parent path '{parent_path}/' didn't seem like docs root. Using original candidate (cleaned): {final_root_url}"
5431 | )
5432 | else:
5433 | # Only one path segment or root path, use the cleaned candidate URL as is
5434 | final_root_url = parsed_candidate._replace(query="", fragment="").geturl()
5435 | logger.info(
5436 | f"Candidate URL is shallow or root. Using cleaned candidate as root: {final_root_url}"
5437 | )
5438 |
5439 | except Exception as parse_err:
5440 | # Handle errors during parsing or root derivation
5441 | logger.warning(
5442 | f"Error parsing/deriving root from best candidate URL {best_candidate}: {parse_err}. Using candidate as is (cleaned)."
5443 | )
5444 | # Fallback: Clean the best candidate URL (remove query/fragment) and return it
5445 | try:
5446 | parsed_fallback = urllib.parse.urlparse(best_candidate)
5447 | final_root_url = parsed_fallback._replace(query="", fragment="").geturl()
5448 | except ValueError:
5449 | # Should not happen if best_candidate was parseable before, but handle defensively
5450 | logger.error(
5451 | f"Failed to parse even the fallback candidate {best_candidate}. Returning original candidate."
5452 | )
5453 | final_root_url = best_candidate # Last resort
5454 |
5455 | return final_root_url
5456 |
5457 | # Note: ToolError is raised explicitly above if no candidate found or web search fails.
5458 | # This catch block handles unexpected errors during the process.
5459 | except Exception as e:
5460 | logger.error(
5461 | f"Unexpected error finding documentation root for '{pkg_name}': {e}", exc_info=True
5462 | )
5463 | # Raise a generic ToolError indicating the failure cause
5464 | raise ToolError(
5465 | f"An unexpected error occurred while finding documentation for '{pkg_name}': {str(e)}"
5466 | ) from e
5467 |
5468 |
5469 | # Import optional libraries for summarization, handle missing imports
5470 | try:
5471 | import trafilatura
5472 | except ImportError:
5473 | trafilatura = None
5474 | logger.debug("trafilatura library not found, summarization quality may be reduced.")
5475 | try:
5476 | from readability import Document # Using python-readability (lxml based)
5477 | except ImportError:
5478 | Document = None
5479 | logger.debug("readability-lxml library not found, summarization quality may be reduced.")
5480 |
5481 |
5482 | def _summarize_html_sync(html: str, max_len: int = 10000) -> str:
5483 | """Synchronously extracts main text content from HTML using multiple libraries."""
5484 | if not html:
5485 | return ""
5486 |
5487 | # Limit input HTML size to prevent excessive memory/CPU usage
5488 | MAX_HTML_SIZE = 3 * 1024 * 1024 # 3 MiB
5489 | if len(html) > MAX_HTML_SIZE:
5490 | logger.warning(f"HTML content truncated to {MAX_HTML_SIZE} characters for summarization.")
5491 | html = html[:MAX_HTML_SIZE]
5492 |
5493 | text = ""
5494 |
5495 | # 1. Try Trafilatura (often good for articles/main content)
5496 | if trafilatura is not None:
5497 | try:
5498 | # Favor precision over recall, exclude comments/tables
5499 | extracted = trafilatura.extract(
5500 | html, include_comments=False, include_tables=False, favor_precision=True
5501 | )
5502 | if (
5503 | extracted and len(extracted) > 100
5504 | ): # Basic check if extraction yielded substantial text
5505 | text = extracted
5506 | logger.debug("Summarized HTML using Trafilatura.")
5507 | except Exception as e:
5508 | logger.warning(f"Trafilatura failed during HTML summarization: {e}")
5509 | # Continue to next method if it fails
5510 |
5511 | # 2. Try Readability-lxml if Trafilatura failed or yielded short text
5512 | if (not text or len(text) < 200) and Document is not None:
5513 | try:
5514 | doc = Document(html)
5515 | # Get summary HTML (main content block)
5516 | summary_html = doc.summary(html_partial=True)
5517 | # Parse the summary HTML and extract text
5518 | soup = BeautifulSoup(
5519 | summary_html, "html.parser"
5520 | ) # Use html.parser for potentially partial HTML
5521 | extracted_text = soup.get_text(" ", strip=True)
5522 | if extracted_text and len(extracted_text) > 50: # Lower threshold for readability
5523 | text = extracted_text
5524 | logger.debug("Summarized HTML using Readability-lxml.")
5525 | except Exception as e:
5526 | logger.warning(f"Readability-lxml failed during HTML summarization: {e}")
5527 | # Continue to fallback if it fails
5528 |
5529 | # 3. Fallback: BeautifulSoup basic text extraction (if others failed/short)
5530 | if not text or len(text) < 100:
5531 | logger.debug("Using BeautifulSoup fallback for HTML summarization.")
5532 | try:
5533 | soup = BeautifulSoup(html, "lxml") # Use lxml for robustness
5534 | # Remove common non-content tags before text extraction
5535 | tags_to_remove = [
5536 | "script",
5537 | "style",
5538 | "nav",
5539 | "header",
5540 | "footer",
5541 | "aside",
5542 | "form",
5543 | "figure",
5544 | "figcaption",
5545 | "noscript",
5546 | ]
5547 | found_tags = soup(tags_to_remove)
5548 | for tag in found_tags:
5549 | tag.decompose()
5550 | # Get remaining text, join with spaces, strip extra whitespace
5551 | extracted_text = soup.get_text(" ", strip=True)
5552 | text = extracted_text # Use BS result even if short
5553 | except Exception as e:
5554 | logger.warning(f"BeautifulSoup fallback failed during HTML summarization: {e}")
5555 | # text might remain empty if BS also fails
5556 |
5557 | # Final cleanup: normalize whitespace and truncate
5558 | cleaned_text = re.sub(r"\s+", " ", text).strip()
5559 | final_text = cleaned_text[:max_len]
5560 | return final_text
5561 |
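# A minimal illustrative sketch (not part of the original module): how the tiered
# extraction above is expected to behave on a small page. The HTML sample is
# hypothetical, and this helper is never called at import time.
def _example_summarize_html_sync() -> str:
    sample_html = (
        "<html><body><nav>Site menu</nav>"
        "<article><p>Main article text that should survive extraction.</p></article>"
        "</body></html>"
    )
    # Boilerplate tags such as <nav> are stripped; the article text is returned,
    # truncated to max_len characters.
    return _summarize_html_sync(sample_html, max_len=500)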
5562 |
5563 | async def _grab_readable(
5564 | client: httpx.AsyncClient, url: str, rate_limiter: RateLimiter
5565 | ) -> Optional[str]:
5566 | """Fetches HTML and extracts readable text content asynchronously."""
5567 | # Fetch HTML using the helper function
5568 | html = await _fetch_html(client, url, rate_limiter)
5569 | if html:
5570 | # Run the synchronous summarization function in the thread pool
5571 | readable_text = await _run_in_thread(_summarize_html_sync, html)
5572 | return readable_text
5573 | else:
5574 | # Return None if HTML fetch failed
5575 | return None
5576 |
5577 |
5578 | async def crawl_docs_site(
5579 | root_url: str, max_pages: int = 40, rate_limit_rps: float = 3.0
5580 | ) -> List[Tuple[str, str]]:
5581 | """Crawls a documentation site starting from root_url and extracts readable text."""
5582 | # Validate root URL and get starting domain
5583 | try:
5584 | parsed_start_url = urlparse(root_url)
5585 | start_netloc = parsed_start_url.netloc
5586 | if not start_netloc:
5587 | raise ValueError("Root URL must have a valid domain name.")
5588 | except (ValueError, AssertionError) as e:
5589 | raise ToolInputError(
5590 | f"Invalid root URL provided for documentation crawl: '{root_url}'. Error: {e}"
5591 | ) from e
5592 |
5593 | # Initialize crawl state
5594 | seen_urls: Set[str] = set()
5595 | queue: deque[str] = deque()
5596 | queue.append(root_url) # Start with the root URL
5597 | seen_urls.add(root_url)
5598 | # List to store tuples of (url, extracted_text)
5599 | output_pages: List[Tuple[str, str]] = []
5600 | visit_count = 0
5601 | # Set a max number of visits to prevent infinite loops on large/cyclic sites
5602 | max_visits = max(max_pages * 5, 200) # Visit more URLs than pages needed
5603 | rate_limiter = RateLimiter(rate_limit_rps)
5604 | headers = {"User-Agent": "Mozilla/5.0 (compatible; SmartBrowserDocBot/1.0)"}
5605 | logger.info(
5606 | f"Starting documentation crawl from: {root_url} (Max pages: {max_pages}, Max visits: {max_visits})"
5607 | )
5608 |
5609 | # Use httpx.AsyncClient for connection pooling
5610 | client_timeout = 30.0
5611 | async with httpx.AsyncClient(
5612 | follow_redirects=True, timeout=client_timeout, headers=headers
5613 | ) as client:
5614 | # Main crawl loop
5615 | while queue:
5616 | # Check stopping conditions
5617 | if len(output_pages) >= max_pages:
5618 | logger.info(f"Doc crawl stopped: Reached max pages ({max_pages}).")
5619 | break
5620 | if visit_count >= max_visits:
5621 | logger.warning(f"Doc crawl stopped: Reached max visits ({max_visits}).")
5622 | break
5623 |
5624 | # Get next URL from queue
5625 | current_url = queue.popleft()
5626 | visit_count += 1
5627 | logger.debug(
5628 | f"Doc Crawl [Visit {visit_count}/{max_visits}, Found {len(output_pages)}/{max_pages}]: {current_url}"
5629 | )
5630 |
5631 | # Grab readable text content from the URL
5632 | readable_text = await _grab_readable(client, current_url, rate_limiter)
5633 |
5634 | # If readable text was extracted, add it to results
5635 | if readable_text:
5636 | output_pages.append((current_url, readable_text))
5637 | logger.debug(
5638 | f"Collected readable content from: {current_url} (Length: {len(readable_text)})"
5639 | )
5640 |
5641 | # Check if max pages reached after adding
5642 | if len(output_pages) >= max_pages:
5643 | break # Exit loop early
5644 |
5645 | # Fetch the HTML again to extract links for further crawling. _grab_readable
5646 | # returns only simplified text, so the raw HTML is needed here; passing the
5647 | # HTML between the two calls would avoid this second network request.
5648 | html_for_links = await _fetch_html(client, current_url, rate_limiter)
5649 | if html_for_links:
5650 | _, page_links = _extract_links(current_url, html_for_links)
5651 | # Process found page links
5652 | for link_url in page_links:
5653 | try:
5654 | parsed_link = urlparse(link_url)
5655 | # Check if link is on the same domain
5656 | is_same_domain = parsed_link.netloc == start_netloc
5657 | # Check if it looks like a doc page we haven't seen
5658 | is_doc_link = _looks_like_docs_url(link_url)
5659 | is_not_seen = link_url not in seen_urls
5660 |
5661 | if is_same_domain and is_doc_link and is_not_seen:
5662 | seen_urls.add(link_url)
5663 | queue.append(link_url) # Add to crawl queue
5664 | except ValueError:
5665 | # Ignore errors parsing potential link URLs
5666 | pass
5667 | else:
5668 | logger.debug(f"No readable content extracted from: {current_url}")
5669 |
5670 | # Log final results after loop finishes
5671 | logger.info(
5672 | f"Documentation crawl finished. Collected content from {len(output_pages)} pages after {visit_count} visits."
5673 | )
5674 | return output_pages
5675 |
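# Illustrative only: one way the crawler above might be driven once a docs root
# URL has been discovered. The URL below is a hypothetical placeholder.
async def _example_crawl_docs_site() -> None:
    pages = await crawl_docs_site("https://example-project.readthedocs.io/en/latest/", max_pages=5)
    for url, text in pages:
        logger.info(f"Crawled {url}: {len(text)} characters of readable text")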
5676 |
5677 | # --- Page State Extraction ---
5678 | async def get_page_state(
5679 | page: Page, max_elements: Optional[int] = None
5680 | ) -> dict[str, Any]: # Uses global _log
5681 | """Extracts the current state of the page using the page map functionality."""
5682 | if max_elements is not None:
5683 | # Note: _max_widgets_global now controls element count in _build_page_map
5684 | logger.warning(
5685 | "get_page_state 'max_elements' argument is deprecated and has no effect. Use global config 'max_widgets' instead."
5686 | )
5687 |
5688 | # Check if page is valid
5689 | if not page or page.is_closed():
5690 | logger.warning("get_page_state called on closed or invalid page.")
5691 | return {
5692 | "error": "Page is closed or invalid",
5693 | "url": getattr(page, "url", "unknown"), # Try to get URL even if closed
5694 | "title": "[Error: Page Closed]",
5695 | "elements": [],
5696 | "main_text": "",
5697 | }
5698 |
5699 | start_time = time.monotonic()
5700 | try:
5701 | # Use the helper function to build (or retrieve cached) page map
5702 | page_map, fingerprint = await _build_page_map(page)
5703 | duration = time.monotonic() - start_time
5704 | duration_ms = int(duration * 1000)
5705 | num_elements = len(page_map.get("elements", []))
5706 | page_url = page_map.get("url")
5707 | page_title = page_map.get("title")
5708 |
5709 | # Log successful extraction
5710 | await _log(
5711 | "page_state_extracted",
5712 | url=page_url,
5713 | title=page_title,
5714 | duration_ms=duration_ms,
5715 | num_elements=num_elements,
5716 | fp=fingerprint[:8],
5717 | )
5718 |
5719 | # Return the constructed page map
5720 | return page_map
5721 |
5722 | except Exception as e:
5723 | # Catch any unexpected errors during state extraction
5724 | duration = time.monotonic() - start_time
5725 | duration_ms = int(duration * 1000)
5726 | page_url = page.url or "unknown" # Get URL directly from page on error
5727 | logger.error(f"Error getting page state for {page_url}: {e}", exc_info=True)
5728 | # Log error event
5729 | await _log(
5730 | "page_error", action="get_state", url=page_url, error=str(e), duration_ms=duration_ms
5731 | )
5732 | # Return error structure
5733 | return {
5734 | "error": f"Failed to get page state: {e}",
5735 | "url": page_url,
5736 | "title": "[Error Getting State]",
5737 | "elements": [],
5738 | "main_text": "",
5739 | }
5740 |
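# Illustrative only: a sketch of how the extracted page state feeds the macro
# planner defined later in this module (assumes an open Playwright Page).
async def _example_state_to_plan(page: Page) -> List[Dict[str, Any]]:
    state = await get_page_state(page)
    return await _plan_macro(state, task="Accept the cookie consent banner")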
5741 |
5742 | # --- LLM Bridge ---
5743 | def _extract_json_block(text: str) -> Optional[str]:
5744 | """Extracts the first JSON code block (markdown or bare) from text."""
5745 | # Try finding markdown code block first ```json ... ```
5746 | pattern_md = r"```json\s*(\{.*\}|\[.*\])\s*```"
5747 | match_markdown = re.search(pattern_md, text, re.DOTALL)
5748 | if match_markdown:
5749 | json_str = match_markdown.group(1).strip()
5750 | return json_str
5751 |
5752 | # Try finding bare JSON object or array { ... } or [ ... ]
5753 | # This is less reliable, might match partial structures
5754 | pattern_bare = r"(\{.*\}|\[.*\])"
5755 | match_bare = re.search(pattern_bare, text, re.DOTALL)
5756 | if match_bare:
5757 | block = match_bare.group(0)
5758 | # Basic sanity check for balanced braces/brackets
5759 | has_balanced_braces = block.count("{") == block.count("}")
5760 | has_balanced_brackets = block.count("[") == block.count("]")
5761 | if has_balanced_braces and has_balanced_brackets:
5762 | return block.strip() # Return the matched bare block
5763 |
5764 | # No JSON block found
5765 | return None
5766 |
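# Illustrative only: the two extraction paths handled above, shown on
# hypothetical LLM replies (fenced markdown block vs. bare JSON).
def _example_extract_json_block() -> None:
    fenced = 'Sure, here is the plan:\n```json\n{"action": "finish"}\n```'
    bare = "The result is [1, 2, 3] as requested."
    logger.debug(_extract_json_block(fenced))  # -> '{"action": "finish"}'
    logger.debug(_extract_json_block(bare))  # -> '[1, 2, 3]'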
5767 |
5768 | def _llm_resilient(max_attempts: int = 3, backoff: float = 1.0):
5769 | """Decorator for LLM calls, retrying on rate limits and transient errors."""
5770 |
5771 | def wrap(fn):
5772 | @functools.wraps(fn)
5773 | async def inner(*a, **kw):
5774 | attempt = 0
5775 | while True:
5776 | try:
5777 | # Add delay before retrying (not on first attempt)
5778 | if attempt > 0:
5779 | delay_factor = 2 ** (attempt - 1)
5780 | base_delay = backoff * delay_factor
5781 | jitter = random.uniform(0.8, 1.2)
5782 | jitter_delay = base_delay * jitter
5783 | logger.debug(
5784 | f"LLM resilient retry {attempt}: Sleeping for {jitter_delay:.2f}s..."
5785 | )
5786 | await asyncio.sleep(jitter_delay)
5787 | # Call the wrapped LLM function
5788 | result = await fn(*a, **kw)
5789 | return result
5790 |
5791 | except ProviderError as e:
5792 | # Check if it's a rate limit error (common for 429 status)
5793 | err_str_lower = str(e).lower()
5794 | is_rate_limit = (
5795 | "429" in str(e) # Check status code in error message
5796 | or "rate limit" in err_str_lower
5797 | or "too many requests" in err_str_lower
5798 | or "quota" in err_str_lower
5799 | )
5800 | if is_rate_limit:
5801 | attempt += 1
5802 | func_name = getattr(fn, "__name__", "?")
5803 | if attempt >= max_attempts:
5804 | logger.error(
5805 | f"LLM rate limit: '{func_name}' failed after {max_attempts} attempts: {e}"
5806 | )
5807 | raise ToolError(
5808 | f"LLM rate-limit exceeded after {max_attempts} attempts: {e}"
5809 | ) from e
5810 |
5811 | # Check for Retry-After header suggestion in error
5812 | retry_after_seconds = None
5813 | retry_after_match = re.search(r"retry[- ]after[: ]+(\d+)", err_str_lower)
5814 | if retry_after_match:
5815 | try:
5816 | retry_after_seconds = int(retry_after_match.group(1))
5817 | except ValueError:
5818 | pass # Ignore if number parsing fails
5819 |
5820 | # Honor Retry-After if the provider supplied one; otherwise rely on
5821 | # the exponential-backoff sleep at the top of the next loop iteration.
5822 | if retry_after_seconds:
5823 | delay = retry_after_seconds
5824 | logger.warning(
5825 | f"LLM rate limit for '{func_name}'. Waiting suggested {delay:.2f}s before retry (attempt {attempt}/{max_attempts})"
5826 | )
5827 | # Sleep the suggested time now; the generic backoff sleep at the
5828 | # loop start adds a small extra cushion on top of it.
5829 | await asyncio.sleep(delay)
5830 | else:
5831 | delay_factor = 2 ** (attempt - 1)
5832 | base_delay = backoff * delay_factor
5833 | jitter = random.uniform(0.8, 1.2)
5834 | delay = base_delay * jitter
5835 | logger.warning(
5836 | f"LLM rate limit for '{func_name}'. Retrying in ~{delay:.2f}s (attempt {attempt}/{max_attempts})"
5837 | )
5838 |
5839 | continue  # Go to next iteration to retry
5840 | else:
5841 | # Different ProviderError, re-raise
5842 | raise
5843 | except (httpx.RequestError, asyncio.TimeoutError) as e:
5844 | # Handle transient network errors or timeouts
5845 | attempt += 1
5846 | func_name = getattr(fn, "__name__", "?")
5847 | if attempt >= max_attempts:
5848 | logger.error(
5849 | f"LLM call: '{func_name}' failed due to transient error after {max_attempts} attempts: {e}"
5850 | )
5851 | raise ToolError(
5852 | f"LLM call failed after {max_attempts} attempts: {e}"
5853 | ) from e
5854 | # The exponential-backoff sleep happens at the top of the next loop iteration.
5861 | logger.warning(
5862 | f"LLM transient error for '{func_name}'. Retrying (attempt {attempt}/{max_attempts}). Error: {e}"
5863 | )
5864 | continue # Go to next iteration
5865 | except Exception:
5866 | # For any other unexpected errors, re-raise immediately
5867 | raise
5868 |
5869 | return inner
5870 |
5871 | return wrap
5872 |
5873 |
5874 | @_llm_resilient(max_attempts=3, backoff=1.0)
5875 | async def _call_llm(
5876 | messages: Sequence[Dict[str, str]],
5877 | model: str = _llm_model_locator_global,
5878 | expect_json: bool = False,
5879 | temperature: float = 0.1,
5880 | max_tokens: int = 1024,
5881 | ) -> Union[Dict[str, Any], List[Any]]: # Uses global _log
5882 | """Makes a call to the LLM using the standalone chat_completion tool."""
5883 | if not messages:
5884 | logger.error("_call_llm received empty messages list.")
5885 | return {"error": "No messages provided to LLM."}
5886 |
5887 | # Determine provider and model name
5888 | llm_provider = Provider.OPENAI.value # Default provider
5889 | llm_model_name = model # Default model name
5890 | if model:
5891 | try:
5892 | extracted_provider, extracted_model = parse_model_string(model)
5893 | if extracted_provider:
5894 | llm_provider = extracted_provider
5895 | if extracted_model:
5896 | llm_model_name = extracted_model
5897 | except Exception as parse_err:
5898 | logger.warning(f"Could not parse model string '{model}': {parse_err}. Using defaults.")
5899 |
5900 | # Prepare arguments for chat_completion
5901 | llm_args: Dict[str, Any] = {
5902 | "provider": llm_provider,
5903 | "model": llm_model_name,
5904 | "messages": list(messages), # Ensure it's a mutable list
5905 | "temperature": temperature,
5906 | "max_tokens": max_tokens,
5907 | "additional_params": {}, # For provider-specific params like response_format
5908 | }
5909 |
5910 | # Handle JSON mode expectation
5911 | use_json_instruction = (
5912 | False # Flag to add manual instruction if native JSON mode fails/unsupported
5913 | )
5914 | if expect_json:
5915 | try:
5916 | # Check if the provider/model combination supports native JSON response format
5917 | provider_instance = await get_provider(llm_provider)
5918 | # Example check (adapt based on actual provider capabilities)
5919 | supports_native_json = False
5920 | if llm_provider == Provider.OPENAI.value and llm_model_name.startswith(
5921 | ("gpt-4", "gpt-3.5-turbo")
5922 | ): # Check specific OpenAI models known to support it
5923 | supports_native_json = True
5924 | # Or use a generic check if provider interface defines it
5925 | elif hasattr(provider_instance, "supports_json_response_format"):
5926 | supports_native_json = await provider_instance.supports_json_response_format(
5927 | llm_model_name
5928 | )
5929 |
5930 | if supports_native_json:
5931 | logger.debug(
5932 | f"Provider '{llm_provider}' model '{llm_model_name}' supports native JSON mode."
5933 | )
5934 | # Add the provider-specific parameter for JSON mode
5935 | # This varies by provider (e.g., OpenAI uses response_format)
5936 | if llm_provider == Provider.OPENAI.value:
5937 | llm_args["additional_params"]["response_format"] = {"type": "json_object"}
5938 | # Add other providers' JSON format params here if needed
5939 | use_json_instruction = False # Native mode used
5940 | else:
5941 | logger.debug(
5942 | f"Provider '{llm_provider}' model '{llm_model_name}' does not natively support JSON mode. Using manual instruction."
5943 | )
5944 | use_json_instruction = True # Need manual instruction
5945 | except Exception as e:
5946 | logger.warning(
5947 | f"Could not determine native JSON support for provider '{llm_provider}': {e}. Assuming manual instruction needed."
5948 | )
5949 | use_json_instruction = True
5950 |
5951 | # Add manual JSON instruction if needed
5952 | if use_json_instruction:
5953 | json_instruction = "\n\nIMPORTANT: Respond ONLY with valid JSON. Your entire response must start with `{` or `[` and end with `}` or `]`. Do not include ```json markers, comments, or any explanatory text before or after the JSON structure."
5954 | modified_messages = list(llm_args["messages"]) # Work on a copy
5955 | # Append instruction to the last user message, or add a new user message
5956 | if modified_messages and modified_messages[-1]["role"] == "user":
5957 | modified_messages[-1]["content"] += json_instruction
5958 | else:
5959 | # Add a new user message if last wasn't 'user' or list was empty
5960 | modified_messages.append(
5961 | {
5962 | "role": "user",
5963 | "content": "Provide the response based on the previous messages."
5964 | + json_instruction,
5965 | }
5966 | )
5967 | llm_args["messages"] = modified_messages # Update args with modified messages
5968 |
5969 | # Make the actual call to the standalone chat_completion tool
5970 | try:
5971 | start_time = time.monotonic()
5972 | resp = await chat_completion(**llm_args)
5973 | duration = time.monotonic() - start_time
5974 | duration_ms = int(duration * 1000)
5975 | model_returned = resp.get(
5976 | "model", llm_model_name
5977 | ) # Use model returned in response if available
5978 | is_success = resp.get("success", False)
5979 | is_cached = resp.get("cached_result", False)
5980 |
5981 | # Log the completion details
5982 | await _log(
5983 | "llm_call_complete",
5984 | model=model_returned,
5985 | duration_ms=duration_ms,
5986 | success=is_success,
5987 | cached=is_cached,
5988 | provider=llm_provider,
5989 | )
5990 |
5991 | # Process the response
5992 | if not is_success:
5993 | error_msg = resp.get("error", "LLM call failed with no specific error message.")
5994 | # Try to get raw response details for debugging
5995 | raw_resp_detail = None
5996 | if isinstance(resp.get("details"), dict):
5997 | raw_resp_detail = resp["details"].get("raw_response")
5998 | if not raw_resp_detail:
5999 | raw_resp_detail = resp.get("raw_response") # Fallback check
6000 | logger.warning(
6001 | f"LLM call failed: {error_msg}. Raw response preview: {str(raw_resp_detail)[:200]}"
6002 | )
6003 | return {"error": f"LLM API Error: {error_msg}", "raw_response": raw_resp_detail}
6004 |
6005 | # Extract content from the successful response message
6006 | assistant_message = resp.get("message", {})
6007 | content = assistant_message.get("content")
6008 | raw_text = content.strip() if isinstance(content, str) else ""
6009 |
6010 | if not raw_text:
6011 | logger.warning("LLM returned empty response content.")
6012 | return {"error": "LLM returned empty response content."}
6013 |
6014 | # Handle based on whether JSON was expected
6015 | if not expect_json:
6016 | # Return the raw text directly
6017 | return {"text": raw_text}
6018 | else:
6019 | # Attempt to parse the response as JSON
6020 | try:
6021 | # Try direct JSON parsing first
6022 | parsed_json = json.loads(raw_text)
6023 | return parsed_json
6024 | except json.JSONDecodeError:
6025 | # If direct parsing fails, try extracting a JSON block
6026 | logger.warning(
6027 | "LLM response was not valid JSON directly. Trying to extract JSON block..."
6028 | )
6029 | json_block = _extract_json_block(raw_text)
6030 | if json_block:
6031 | try:
6032 | parsed_block = json.loads(json_block)
6033 | logger.warning(
6034 | "Successfully parsed JSON block extracted from LLM response."
6035 | )
6036 | return parsed_block
6037 | except json.JSONDecodeError as e:
6038 | # Error parsing the extracted block
6039 | block_preview = json_block[:500]
6040 | error_msg = f"Could not parse extracted JSON block: {e}. Block preview: {block_preview}..."
6041 | logger.error(error_msg)
6042 | return {
6043 | "error": error_msg,
6044 | "raw_response": raw_text[:1000],
6045 | } # Return raw text for debugging
6046 | else:
6047 | # No valid JSON block found within the text
6048 | error_msg = "Could not parse JSON from LLM response (no valid block found)."
6049 | logger.error(error_msg)
6050 | return {
6051 | "error": error_msg,
6052 | "raw_response": raw_text[:1000],
6053 | } # Return raw text for debugging
6054 |
6055 | except ProviderError as e:
6056 | # Catch errors raised by the chat_completion tool itself (e.g., auth, config)
6057 | logger.error(f"LLM Provider error during chat_completion call: {e}")
6058 | raw_resp_detail = None
6059 | if hasattr(e, "details") and isinstance(getattr(e, "details", None), dict):
6060 | raw_resp_detail = e.details.get("raw_response")
6061 | return {"error": f"LLM Provider Error: {e}", "raw_response": raw_resp_detail}
6062 | except Exception as e:
6063 | # Catch any other unexpected errors during the call or processing
6064 | logger.error(f"Unexpected error during LLM call: {e}", exc_info=True)
6065 | return {"error": f"LLM call failed unexpectedly: {e}"}
6066 |
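# Illustrative only (hypothetical prompt and limits): what a JSON-mode call
# through _call_llm looks like from a caller's perspective. On success the
# return value is the parsed JSON; on failure it is a dict with an "error" key.
async def _example_call_llm_json() -> Union[Dict[str, Any], List[Any]]:
    msgs = [{"role": "user", "content": "List two primary colors as a JSON array of strings."}]
    return await _call_llm(msgs, expect_json=True, temperature=0.0, max_tokens=64)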
6067 |
6068 | # --- Macro/Autopilot Planners ---
6069 | ALLOWED_ACTIONS = {"click", "type", "wait", "download", "extract", "finish", "scroll"}
6070 |
6071 |
6072 | async def _plan_macro(
6073 | page_state: Dict[str, Any], task: str, model: str = _llm_model_locator_global
6074 | ) -> List[Dict[str, Any]]: # Uses global _llm_model_locator_global
6075 | """Generates a sequence of browser actions (macro steps) based on page state and a task."""
6076 | # Detailed description of allowed actions for the LLM
6077 | action_details = """
6078 | Allowed Actions:
6079 | - `click`: Clicks an element. Requires `task_hint` (description of the element to click).
6080 | - `type`: Types text into an input field. Requires `task_hint` (description of the field) and `text` (the text to type). Optional: `enter: true` to press Enter after typing, `clear_before: false` to avoid clearing field first.
6081 | - `wait`: Pauses execution. Requires `ms` (milliseconds to wait). Use sparingly for unavoidable dynamic content delays.
6082 | - `download`: Clicks a link/button to initiate a download. Requires `task_hint` (description of download element). Optional: `dest` (destination directory path relative to storage).
6083 | - `extract`: Extracts text from elements matching a CSS selector. Requires `selector`. Returns a list of strings.
6084 | - `scroll`: Scrolls the page. Requires `direction` ('up', 'down', 'top', 'bottom'). Optional: `amount_px` (pixels for 'up'/'down', default 500).
6085 | - `finish`: Indicates the task is complete. No arguments needed. Should be the last step if the task goal is achieved.
6086 | """
6087 |
6088 | # Prepare summary of elements for the LLM prompt
6089 | elements_summary = []
6090 | elements_list = page_state.get("elements", [])
6091 | for el in elements_list:
6092 | el_id = el.get("id")
6093 | el_tag = el.get("tag")
6094 | el_role = el.get("role", " ")
6095 | el_text = el.get("text", " ")
6096 | max_text_len = 80
6097 | truncated_text = el_text[:max_text_len] + ("..." if len(el_text) > max_text_len else "")
6098 | summary_str = f"id={el_id} tag={el_tag} role='{el_role}' text='{truncated_text}'"
6099 | elements_summary.append(summary_str)
6100 |
6101 | # System prompt for the macro planner LLM
6102 | system_prompt = textwrap.dedent(f"""
6103 | You are an expert web automation assistant. Your goal is to create a sequence of steps (a macro) to accomplish a user's task on the current web page.
6104 | You will be given the current page state (URL, Title, main text content, and a list of interactive elements with their IDs, tags, roles, and text).
6105 | You will also be given the user's task.
6106 | Based on the page state and task, generate a JSON list of action steps.
6107 |
6108 | EACH step in the list MUST be a JSON object containing an "action" key specifying the action name (e.g., "click", "type").
6109 | Other keys in the object should be the required arguments for that action (e.g., "task_hint", "text", "ms", "selector", "direction").
6110 |
6111 | {action_details}
6112 |
6113 | Generate ONLY the JSON list of steps following this structure: `[ {{"action": "action_name", "arg1": "value1", ...}}, ... ]`.
6114 |
6115 | DO NOT include explanations or markdown formatting!
6116 |
6117 | If the task seems impossible or cannot be mapped to the available actions/elements, return an empty list `[]`.
6118 |
6119 | If the task is already complete based on the current state (e.g., "find the price" and price is visible), you can return a `finish` step or an empty list.
6120 | """).strip()
6121 |
6122 | # User prompt with page state and task
6123 | elements_str = "\n".join(elements_summary)
6124 | main_text_preview = page_state.get("main_text", "")[:500] # Preview main text
6125 | user_prompt = textwrap.dedent(f"""
6126 | Current Page State:
6127 | URL: {page_state.get("url", "[No URL]")}
6128 | Title: {page_state.get("title", "[No Title]")}
6129 | Main Text (Preview): {main_text_preview}...
6130 | Elements:
6131 | {elements_str}
6132 |
6133 | User Task: "{task}"
6134 |
6135 | Generate the JSON list of steps to accomplish this task. Respond ONLY with the JSON list.
6136 | """).strip()
6137 |
6138 | # Prepare messages and call LLM
6139 | messages = [
6140 | {"role": "system", "content": system_prompt},
6141 | {"role": "user", "content": user_prompt},
6142 | ]
6143 | result = await _call_llm(
6144 | messages,
6145 | model=model,
6146 | expect_json=True,
6147 | temperature=0.0,
6148 | max_tokens=2048, # Allow reasonable size for plan
6149 | )
6150 |
6151 | # Process and validate the LLM response (Revised to handle single dict)
6152 | plan_list: Optional[List[Dict[str, Any]]] = None
6153 | if isinstance(result, list):
6154 | plan_list = result
6155 | elif isinstance(result, dict) and "error" in result:
6156 | # Handle errors reported by the LLM call itself
6157 | error_detail = result.get("raw_response", result["error"])
6158 | raise ToolError(f"Macro planner LLM call failed: {result['error']}", details=error_detail)
6159 | elif isinstance(result, dict):
6160 | # --- Handling case where LLM returns a single step dict ---
6161 | if "action" in result: # Check if it looks like a valid step
6162 | logger.warning(
6163 | "LLM returned a single step dictionary instead of a list for macro plan. Wrapping it in a list."
6164 | )
6165 | plan_list = [result]
6166 | elif "steps" in result and isinstance(result["steps"], list):
6167 | # Handle cases where LLM wraps the list in a "steps" key (existing logic)
6168 | logger.warning("LLM wrapped macro plan in 'steps' key. Extracting list.")
6169 | plan_list = result["steps"]
6170 | else:
6171 | # It's a dict, but doesn't look like a step or contain 'steps'
6172 | response_type = type(result).__name__
6173 | response_preview = str(result)[:500]
6174 | raise ToolError(
6175 | f"Macro planner returned unexpected dictionary format: {response_type}. Preview: '{response_preview}...'",
6176 | details={"raw_response": response_preview},
6177 | )
6178 | else:
6179 | # Handle other unexpected response formats
6180 | response_type = type(result).__name__
6181 | response_preview = str(result)[:500]
6182 | raise ToolError(
6183 | f"Macro planner returned unexpected format: {response_type}. Expected list or dict. Preview: '{response_preview}...'",
6184 | details={"raw_response": response_preview},
6185 | )
6186 |
6187 | # Validate individual steps in the plan
6188 | validated_plan = []
6189 | if plan_list is not None: # Check if we have a list to validate (could be empty list)
6190 | for i, step in enumerate(plan_list):
6191 | if not isinstance(step, dict) or "action" not in step:
6192 | # Log raw response preview on validation error
6193 | logger.warning(
6194 | f"Macro plan step {i + 1} invalid format (not dict or missing 'action'): {step}. RAW LLM RESPONSE PREVIEW: {str(result)[:500]}"
6195 | )
6196 | continue # Skip invalid step format
6197 |
6198 | action = step.get("action")
6199 | if action not in ALLOWED_ACTIONS:
6200 | logger.warning(f"Macro plan step {i + 1} has invalid action '{action}': {step}")
6201 | continue # Skip step with unknown action
6202 |
6203 | # --- Basic argument checks ---
6204 | error_flag = False
6205 | if action in ("click", "download") and not step.get("task_hint"):
6206 | logger.warning(f"Macro plan step {i + 1} '{action}' missing 'task_hint': {step}")
6207 | error_flag = True
6208 | if action == "type":
6209 | if not step.get("task_hint"):
6210 | logger.warning(f"Macro plan step {i + 1} 'type' missing 'task_hint': {step}")
6211 | error_flag = True
6212 | if step.get("text") is None: # Allow empty string, but not None
6213 | logger.warning(f"Macro plan step {i + 1} 'type' missing 'text': {step}")
6214 | error_flag = True
6215 | if action == "wait" and step.get("ms") is None:
6216 | logger.warning(f"Macro plan step {i + 1} 'wait' missing 'ms': {step}")
6217 | error_flag = True
6218 | if action == "extract" and not step.get("selector"):
6219 | logger.warning(f"Macro plan step {i + 1} 'extract' missing 'selector': {step}")
6220 | error_flag = True
6221 | if action == "scroll" and step.get("direction") not in ("up", "down", "top", "bottom"):
6222 | logger.warning(
6223 | f"Macro plan step {i + 1} 'scroll' has invalid or missing 'direction': {step}"
6224 | )
6225 | error_flag = True
6226 | # Add more specific checks as needed...
6227 |
6228 | if not error_flag:
6229 | validated_plan.append(step) # Add valid step to the final plan
6230 | else:
6231 | logger.warning(
6232 | f"Skipping invalid macro step {i + 1} due to missing/invalid arguments."
6233 | )
6234 |
6235 | # --- Final check and logging/error based on validation outcome ---
6236 | if not validated_plan: # If plan is empty after validation
6237 | response_preview = str(result)[:500] if result else "None"
6238 | # Distinguish between LLM intentionally returning [] and validation failing all steps
6239 | if plan_list is not None and len(plan_list) > 0:
6240 | # LLM returned steps, but all were invalid
6241 | raise ToolError(
6242 | "Macro planner generated plan, but all steps were invalid.",
6243 | details={"raw_response": response_preview, "original_plan_length": len(plan_list)},
6244 | )
6245 | elif plan_list is None:
6246 | # This case should ideally be caught earlier by the type checking
6247 | raise ToolError(
6248 | "Macro planner failed to generate a valid list or dictionary of steps.",
6249 | details={"raw_response": response_preview},
6250 | )
6251 | else: # LLM returned [], which is valid
6252 | logger.info(
6253 | "Macro planner returned an empty list, indicating task completion or impossibility."
6254 | )
6255 | # Return the empty list in this case
6256 | return []
6257 |
6258 | logger.debug(f"Validated macro plan has {len(validated_plan)} steps.")
6259 | return validated_plan
6260 |
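# Illustrative only: the kind of plan _plan_macro is expected to return for a
# simple login task. Element hints and values here are hypothetical.
_EXAMPLE_MACRO_PLAN: List[Dict[str, Any]] = [
    {"action": "type", "task_hint": "Username or email field", "text": "user@example.com"},
    {"action": "type", "task_hint": "Password field", "text": "hunter2", "enter": True},
    {"action": "click", "task_hint": "Sign in button"},
    {"action": "finish"},
]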
6261 |
6262 | _AVAILABLE_TOOLS = {
6263 | # Tool Name: (Standalone Function Name, {Arg Name: Arg Type Hint})
6264 | "search_web": (
6265 | "search",
6266 | {
6267 | "query": "str",
6268 | "engine": "Optional[str: bing|duckduckgo|yandex]",
6269 | "max_results": "Optional[int]",
6270 | },
6271 | ),
6272 | "browse_page": (
6273 | "browse",
6274 | {
6275 | "url": "str",
6276 | "wait_for_selector": "Optional[str]",
6277 | "wait_for_navigation": "Optional[bool]",
6278 | },
6279 | ), # Updated browse args
6280 | "click_element": (
6281 | "click",
6282 | {
6283 | "url": "str",
6284 | "task_hint": "Optional[str]",
6285 | "target": "Optional[dict]",
6286 | "wait_ms": "Optional[int]",
6287 | },
6288 | ), # Updated click args
6289 | "type_into_fields": (
6290 | "type_text",
6291 | {
6292 | "url": "str",
6293 | "fields": "List[dict{'task_hint':str,'text':str,'enter':bool?,'clear_before':bool?}]",
6294 | "submit_hint": "Optional[str]",
6295 | "submit_target": "Optional[dict]",
6296 | "wait_after_submit_ms": "Optional[int]",
6297 | },
6298 | ), # Updated type_text args
6299 | "download_file_via_click": (
6300 | "download",
6301 | {
6302 | "url": "str",
6303 | "task_hint": "Optional[str]",
6304 | "target": "Optional[dict]",
6305 | "dest_dir": "Optional[str]",
6306 | },
6307 | ), # Updated download args
6308 | "run_page_macro": (
6309 | "run_macro",
6310 | {
6311 | "url": "str",
6312 | "task": "str",
6313 | "model": "Optional[str]",
6314 | "max_rounds": "Optional[int]",
6315 | "timeout_seconds": "Optional[int]",
6316 | },
6317 | ), # Updated run_macro args
6318 | "download_all_pdfs_from_site": (
6319 | "download_site_pdfs",
6320 | {
6321 | "start_url": "str",
6322 | "dest_subfolder": "Optional[str]",
6323 | "include_regex": "Optional[str]",
6324 | "max_depth": "Optional[int]",
6325 | "max_pdfs": "Optional[int]",
6326 | "rate_limit_rps": "Optional[float]",
6327 | },
6328 | ), # Updated download_site_pdfs args
6329 | "collect_project_documentation": (
6330 | "collect_documentation",
6331 | {"package": "str", "max_pages": "Optional[int]", "rate_limit_rps": "Optional[float]"},
6332 | ), # Updated collect_documentation args
6333 | "process_urls_in_parallel": (
6334 | "parallel",
6335 | {"urls": "List[str]", "action": "str('get_state')", "max_tabs": "Optional[int]"},
6336 | ), # Updated parallel args
6337 | "get_filesystem_status": ("filesystem_status", {}), # Example Filesystem tool
6338 | "read_file": ("read_file", {"path": "str"}), # Example Filesystem tool
6339 | "write_file": (
6340 | "write_file",
6341 | {"path": "str", "content": "Union[str, bytes]", "append": "Optional[bool]"},
6342 | ), # Example Filesystem tool
6343 | }
6344 |
6345 | _PLANNER_SYS = textwrap.dedent("""
6346 | You are an AI assistant acting as the central planner for a web automation and information retrieval system.
6347 | Your goal is to achieve the user's complex task by selecting the appropriate tool and providing the correct arguments for each step.
6348 | You will be given the user's overall task and a summary of results from previous steps (if any).
6349 | You have access to a set of tools, described below with their names and argument schemas (use JSON format for args).
6350 | Select ONE tool to execute next that will make progress towards the user's goal.
6351 | Carefully consider the user's task and the previous results to choose the best tool and arguments.
6352 | If a previous step failed, analyze the error and decide whether to retry, try a different approach, or ask for clarification (if interaction allowed). For now, focus on selecting the next best tool.
6353 | If the task requires information from the web, use `search_web` first unless a specific URL is provided or implied.
6354 | If the task involves interacting with a specific webpage (clicking, typing, downloading), use the appropriate browser tool (`browse_page`, `click_element`, `type_into_fields`, `download_file_via_click`, `run_page_macro`). Use the URL from previous steps if available.
6355 | For filesystem operations, use the filesystem tools like `read_file`, `write_file`.
6356 | Use `run_page_macro` for multi-step interactions on a single page described in natural language.
6357 | Use `collect_project_documentation` or `download_all_pdfs_from_site` for specialized crawling tasks.
6358 | Use `process_urls_in_parallel` only when needing the *same* simple action (like getting state) on *multiple* distinct URLs.
6359 |
6360 | Respond ONLY with a JSON list containing a single step object. The object must have:
6361 | - "tool": The name of the selected tool (string).
6362 | - "args": A JSON object containing the arguments for the tool (matching the schema).
6363 |
6364 | Example Response:
6365 | ```json
6366 | [
6367 | {
6368 | "tool": "search_web",
6369 | "args": {
6370 | "query": "latest news on AI regulation",
6371 | "engine": "duckduckgo"
6372 | }
6373 | }
6374 | ]
6375 | ```
6376 | If you determine the task is complete based on the prior results, respond with an empty JSON list `[]`.
6377 | """).strip()
6378 |
6379 |
6380 | async def _plan_autopilot(
6381 | task: str, prior_results: Optional[List[Dict]] = None
6382 | ) -> List[Dict[str, Any]]: # Uses global _AVAILABLE_TOOLS, _PLANNER_SYS, _call_llm
6383 | """Generates the next step (tool call) for the Autopilot based on task and history."""
6384 | # Describe available tools for the LLM prompt
6385 | tools_desc = {}
6386 | for name, data in _AVAILABLE_TOOLS.items():
6387 | func_name, schema = data
6388 | tools_desc[name] = schema
6389 |
6390 | # Summarize prior results concisely
6391 | prior_summary = "None"
6392 | if prior_results:
6393 | summaries = []
6394 | # Summarize last 3 steps for context, or fewer if less than 3 executed
6395 | start_index = max(0, len(prior_results) - 3)
6396 | for i, res in enumerate(prior_results[start_index:], start=start_index + 1):
6397 | tool_used = res.get("tool", "?")
6398 | was_success = res.get("success", False)
6399 | outcome_marker = "[OK]" if was_success else "[FAIL]"
6400 | # Get result summary or error message - prefer 'message' if present, else result/error
6401 | result_data = res.get("message", res.get("result", res.get("error", "")))
6402 | # Handle dict results slightly better
6403 | if isinstance(result_data, dict):
6404 | # Extract key info or just summarize keys
6405 | dict_preview = str(list(result_data.keys()))
6406 | details_str = f"Dict{dict_preview[:130]}" + (
6407 | "..." if len(dict_preview) > 130 else ""
6408 | )
6409 | else:
6410 | details_str = str(result_data)[:150] + (
6411 | "..." if len(str(result_data)) > 150 else ""
6412 | ) # Truncate long results/errors
6413 |
6414 | summary_line = f"Step {i}: Ran {tool_used} -> {outcome_marker} ({details_str})"
6415 | summaries.append(summary_line)
6416 | prior_summary = "\n".join(summaries)
6417 |
6418 | # Construct the user prompt
6419 | tools_json_str = json.dumps(tools_desc, indent=2)
6420 | # Use the same _PLANNER_SYS prompt, as it requests a list with one step
6421 | user_prompt = (
6422 | f"AVAILABLE TOOLS (Schema):\n{tools_json_str}\n\n"
6423 | f"PRIOR RESULTS SUMMARY (Last {len(summaries) if prior_results else 0} steps):\n{prior_summary}\n\n"
6424 | f"USER TASK:\n{task}\n\n"
6425 | "Select the single best tool and arguments for the *next* step to achieve the user task. "
6426 | "Respond ONLY with a JSON list containing exactly one step object (tool, args), or an empty list [] if the task is complete or cannot proceed."
6427 | )
6428 |
6429 | # Prepare messages and call the LLM planner
6430 | messages = [
6431 | {"role": "system", "content": _PLANNER_SYS}, # Use the standardized system prompt
6432 | {"role": "user", "content": user_prompt},
6433 | ]
6434 | response = await _call_llm(
6435 | messages,
6436 | expect_json=True,
6437 | temperature=0.0,
6438 | max_tokens=2048,
6439 | )
6440 |
6441 | # --- Process and validate the LLM response (Revised) ---
6442 | if isinstance(response, dict) and "error" in response:
6443 | raise ToolError(f"Autopilot planner LLM call failed: {response['error']}")
6444 |
6445 | current_plan_list: List[Dict[str, Any]] = [] # Initialize as empty list
6446 |
6447 | if isinstance(response, list):
6448 | current_plan_list = response # LLM returned the expected list
6449 | elif isinstance(response, dict):
6450 | # --- Handling case where LLM returns a single step dict ---
6451 | if "tool" in response and "args" in response: # Check if it looks like a valid step
6452 | logger.warning(
6453 | "Autopilot planner returned a single step dictionary instead of a list. Wrapping it."
6454 | )
6455 | current_plan_list = [response]
6456 | else:
6457 | # It's a dict, but doesn't look like a valid step
6458 | response_type = type(response).__name__
6459 | raise ToolError(
6460 | f"Autopilot planner returned unexpected dictionary format: {response_type}. Expected a JSON list or a valid step dict."
6461 | )
6462 | else:
6463 | # Handle other unexpected response formats
6464 | response_type = type(response).__name__
6465 | raise ToolError(
6466 | f"Autopilot planner returned unexpected format: {response_type}. Expected a JSON list."
6467 | )
6468 |
6469 | # --- Validate the structure and content of the step(s) ---
6470 | validated_plan: List[Dict[str, Any]] = []
6471 | if len(current_plan_list) > 1:
6472 | logger.warning(
6473 | f"Autopilot planner returned multiple steps ({len(current_plan_list)}). Only using the first one."
6474 | )
6475 | elif len(current_plan_list) == 0:
6476 | logger.info(
6477 | "Autopilot planner returned an empty list, indicating task completion or inability to proceed."
6478 | )
6479 | return [] # Return empty list as intended
6480 |
6481 | # Process the first (and only expected) step
6482 | if len(current_plan_list) >= 1:
6483 | step = current_plan_list[0]
6484 | if not isinstance(step, dict):
6485 | logger.warning(f"Autopilot planner step is not a dictionary: {step}")
6486 | return [] # Return empty plan if format is wrong
6487 |
6488 | tool_name = step.get("tool")
6489 | tool_args = step.get("args")
6490 |
6491 | if not tool_name or not isinstance(tool_args, dict):
6492 | logger.warning(
6493 | f"Autopilot planner step missing 'tool' or 'args' (must be dict): {step}"
6494 | )
6495 | return [] # Return empty plan if structure is wrong
6496 |
6497 | if tool_name not in _AVAILABLE_TOOLS:
6498 | logger.warning(f"Autopilot planner selected unknown tool '{tool_name}': {step}")
6499 | return [] # Return empty plan if tool is unknown
6500 |
6501 | # Optional: Add deeper validation of args based on _AVAILABLE_TOOLS schema if needed
6502 |
6503 | # If validation passes, add the single step to the plan
6504 | validated_plan.append(step)
6505 |
6506 | # Return the validated plan (containing 0 or 1 step)
6507 | return validated_plan
6508 |
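# Illustrative only: the plan/execute/re-plan loop that _plan_autopilot is meant
# to drive. Tool dispatch is stubbed out here; a real caller would invoke the
# standalone function registered for step["tool"] with step["args"].
async def _example_autopilot_loop(task: str, max_rounds: int = 5) -> List[Dict[str, Any]]:
    history: List[Dict[str, Any]] = []
    for _ in range(max_rounds):
        plan = await _plan_autopilot(task, history)
        if not plan:
            break  # planner returned [], i.e. task complete or cannot proceed
        step = plan[0]
        # ... dispatch step["tool"] with step["args"] here ...
        history.append({"tool": step["tool"], "args": step["args"], "success": True, "result": "stub"})
    return history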
6509 |
6510 | # --- Step Runner (for Macro) ---
6511 | async def run_steps(
6512 | page: Page, steps: Sequence[Dict[str, Any]]
6513 | ) -> List[Dict[str, Any]]: # Uses global smart_click, smart_type, smart_download
6514 | """Executes a sequence of predefined macro steps on a given page."""
6515 | results: List[Dict[str, Any]] = [] # Stores results of each step
6516 |
6517 | for i, step in enumerate(steps):
6518 | action = step.get("action")
6519 | step_result = step.copy() # Start with original step data
6520 | step_result["success"] = False # Default to failure
6521 | start_time = time.monotonic()
6522 | step_num = i + 1
6523 | should_break = False # Initialize break flag for this step
6524 |
6525 | if not action:
6526 | step_result["error"] = f"Step {step_num}: Missing 'action' key."
6527 | logger.warning(step_result["error"])
6528 | results.append(step_result)
6529 | continue # Skip to next step
6530 |
6531 | try:
6532 | logger.debug(
6533 | f"Executing Macro Step {step_num}: Action='{action}', Args={ {k: v for k, v in step.items() if k != 'action'} }"
6534 | )
6535 | # --- Execute Action ---
6536 | if action == "click":
6537 | hint = step.get("task_hint")
6538 | target_fallback = step.get("target") # Optional fallback args
6539 | if not hint:
6540 | raise ToolInputError(
6541 | f"Step {step_num} ('click'): Missing required argument 'task_hint'."
6542 | )
6543 |
6544 | # Check for and handle common obstacles like reCAPTCHA
6545 | if "recaptcha" in hint.lower() or "captcha" in hint.lower():
6546 | # Try to detect CAPTCHA presence first
6547 | captcha_js = """() => {
6548 | return document.body.innerText.toLowerCase().includes('captcha') ||
6549 | document.querySelector('iframe[title*=captcha]') !== null ||
6550 | document.querySelector('[id*=captcha]') !== null ||
6551 | document.querySelector('[class*=captcha]') !== null ||
6552 | document.querySelector('div[class*="recaptcha"]') !== null;
6553 | }"""
6554 | captcha_detected = await page.evaluate(captcha_js)
6555 | if captcha_detected:
6556 | logger.warning(f"Step {step_num}: CAPTCHA detected but cannot be automatically solved. Marking as failed.")
6557 | step_result["error"] = "CAPTCHA detected - requires manual intervention"
6558 | step_result["success"] = False
6559 | # Continue to finally block without raising exception
6560 | else:
6561 | # Use the smart_click helper
6562 | click_success = await smart_click(
6563 | page, task_hint=hint, target_kwargs=target_fallback
6564 | )
6565 | step_result["success"] = click_success
6566 | else:
6567 | # Use the smart_click helper
6568 | click_success = await smart_click(
6569 | page, task_hint=hint, target_kwargs=target_fallback
6570 | )
6571 | step_result["success"] = click_success # Should be True if no exception
6572 |
6573 | elif action == "type":
6574 | hint = step.get("task_hint")
6575 | target_fallback = step.get("target")
6576 | text = step.get("text")
6577 | if not hint:
6578 | raise ToolInputError(
6579 | f"Step {step_num} ('type'): Missing required argument 'task_hint'."
6580 | )
6581 | if text is None: # Allow empty string, but not None
6582 | raise ToolInputError(
6583 | f"Step {step_num} ('type'): Missing required argument 'text'."
6584 | )
6585 | # Get optional arguments
6586 | press_enter = step.get("enter", False) # Default False
6587 | clear_before = step.get("clear_before", True) # Default True
6588 | # Use the smart_type helper
6589 | type_success = await smart_type(
6590 | page,
6591 | task_hint=hint,
6592 | text=text,
6593 | press_enter=press_enter,
6594 | clear_before=clear_before,
6595 | target_kwargs=target_fallback,
6596 | timeout_ms=5000,
6597 | )
6598 | step_result["success"] = type_success
6599 |
6600 | elif action == "wait":
6601 | ms = step.get("ms")
6602 | if ms is None:
6603 | raise ToolInputError(
6604 | f"Step {step_num} ('wait'): Missing required argument 'ms'."
6605 | )
6606 | try:
6607 | wait_ms = int(ms)
6608 | if wait_ms < 0:
6609 | raise ValueError("Wait time must be non-negative")
6610 | await page.wait_for_timeout(wait_ms)
6611 | step_result["success"] = True
6612 | except (ValueError, TypeError) as e:
6613 | raise ToolInputError(
6614 | f"Step {step_num} ('wait'): Invalid 'ms' value '{ms}'. {e}"
6615 | ) from e
6616 |
6617 | elif action == "download":
6618 | hint = step.get("task_hint")
6619 | target_fallback = step.get("target")
6620 | if not hint:
6621 | raise ToolInputError(
6622 | f"Step {step_num} ('download'): Missing required argument 'task_hint'."
6623 | )
6624 | dest_dir = step.get("dest") # Optional destination directory
6625 | # Use the smart_download helper
6626 | download_outcome = await smart_download(
6627 | page, task_hint=hint, dest_dir=dest_dir, target_kwargs=target_fallback
6628 | )
6629 | step_result["result"] = download_outcome # Store full download result
6630 | # Success is determined by the helper's output
6631 | step_result["success"] = download_outcome.get("success", False)
6632 |
6633 | elif action == "extract":
6634 | selector = step.get("selector")
6635 | if not selector:
6636 | raise ToolInputError(
6637 | f"Step {step_num} ('extract'): Missing required argument 'selector'."
6638 | )
6639 | # Use Playwright's evaluate_all to get text from matching elements
6640 | js_func = "(elements => elements.map(el => el.innerText || el.textContent || ''))"
6641 | extracted_texts_raw = await page.locator(selector).evaluate_all(js_func)
6642 | # Clean up results: filter empty strings and strip whitespace
6643 | extracted_texts_clean = []
6644 | for t in extracted_texts_raw:
6645 | stripped_t = t.strip()
6646 | if stripped_t:
6647 | extracted_texts_clean.append(stripped_t)
6648 | step_result["result"] = extracted_texts_clean
6649 | step_result["success"] = True # Extraction itself succeeds if selector is valid
6650 |
6651 | elif action == "scroll":
6652 | direction = step.get("direction")
6653 | amount = step.get("amount_px")
6654 | if not direction or direction not in ["up", "down", "top", "bottom"]:
6655 | error_msg = f"Step {step_num} ('scroll'): Invalid or missing scroll direction: '{direction}'. Must be 'up', 'down', 'top', or 'bottom'."
6656 | step_result["error"] = error_msg
6657 | step_result["success"] = False
6658 | logger.warning(error_msg)
6659 | # Continue to finally block without raising, as scroll failure might not be critical
6660 | else:
6661 | if direction == "top":
6662 | js_scroll = "() => window.scrollTo(0, 0)"
6663 | await page.evaluate(js_scroll)
6664 | elif direction == "bottom":
6665 | js_scroll = "() => window.scrollTo(0, document.body.scrollHeight)"
6666 | await page.evaluate(js_scroll)
6667 | elif direction == "up":
6668 | scroll_amount = int(amount or 500) # Default 500px
6669 | js_scroll = "(px) => window.scrollBy(0, -px)"
6670 | await page.evaluate(js_scroll, scroll_amount)
6671 | elif direction == "down":
6672 | scroll_amount = int(amount or 500) # Default 500px
6673 | js_scroll = "(px) => window.scrollBy(0, px)"
6674 | await page.evaluate(js_scroll, scroll_amount)
6675 | step_result["success"] = True
6676 |
6677 | elif action == "finish":
6678 | logger.info(f"Macro execution Step {step_num}: Reached 'finish' action.")
6679 | step_result["success"] = True
6680 | # No Playwright action needed
6681 |
6682 | else:
6683 | # Should not happen if _plan_macro validates actions, but safety check
6684 | raise ValueError(
6685 | f"Step {step_num}: Unknown action '{action}' encountered during execution."
6686 | )
6687 |
6688 | # Record duration on success or handled failure (like scroll direction)
6689 | duration_ms = int((time.monotonic() - start_time) * 1000)
6690 | step_result["duration_ms"] = duration_ms
6691 |
6692 | except (
6693 | PlaywrightTimeoutError,
6694 | ToolError,
6695 | ToolInputError,
6696 | ValueError,
6697 | AssertionError,
6698 | Exception,
6699 | ) as e:
6700 | # Catch errors during action execution
6701 | err_type = type(e).__name__
6702 | error_msg = f"{err_type} during action '{action}': {e}"
6703 | step_result["error"] = error_msg
6704 | step_result["success"] = False # Ensure success is false on error
6705 | logger.warning(f"Macro Step {step_num} ('{action}') failed: {error_msg}")
6706 |
6707 | # Special handling for CAPTCHA-related errors
6708 | if "captcha" in str(e).lower() or "recaptcha" in str(e).lower():
6709 | logger.warning(f"Step {step_num}: CAPTCHA-related error detected. Suggesting manual intervention.")
6710 | step_result["error"] = f"CAPTCHA-related error: {error_msg}. Manual intervention may be required."
6711 |
6712 | # Record duration even on failure
6713 | duration_ms = int((time.monotonic() - start_time) * 1000)
6714 | step_result["duration_ms"] = duration_ms
6715 |
6716 | finally:
6717 | # Always log the step result and append to the list
6718 | log_details = step_result.copy() # Create copy for logging
6719 | # Avoid logging potentially large results directly
6720 | if "result" in log_details:
6721 | log_details["result_summary"] = str(log_details["result"])[:200] + "..."
6722 | del log_details["result"]
6723 | await _log("macro_step_result", **log_details)
6724 | results.append(step_result)
6725 |
6726 | # If a 'finish' action succeeded, stop executing further steps
6727 | if action == "finish" and step_result.get("success", False):
6728 | logger.info(
6729 | f"Stopping macro execution after successful 'finish' action at step {step_num}."
6730 | )
6731 | should_break = True
6732 |
6733 |         # Check the break flag set by a successful 'finish' action
6734 | if should_break:
6735 | logger.info(f"Breaking execution loop after step {step_num}")
6736 | break
6737 |
6738 | return results # Return list of results for all executed steps
6739 |
6740 |
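# Illustrative sketch (not part of the original tool surface): the step-dict format that
# run_steps() above consumes, inferred from the scroll/finish handlers. Field names other
# than "action", "direction", and "amount_px" are assumptions, not guarantees.
async def _example_scroll_macro(page):
    example_plan = [
        {"action": "scroll", "direction": "down", "amount_px": 800},  # scroll down 800px
        {"action": "scroll", "direction": "top"},  # jump back to the top of the page
        {"action": "finish"},  # stop the step loop
    ]
    # Each result dict records "success", "duration_ms" and, on failure, "error".
    return await run_steps(page, example_plan)
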
6741 | # --- Universal Search ---
6742 | _ua_rotation_count = 0
6743 | _user_agent_pools = {  # Per-engine pools of realistic User-Agent strings, rotated periodically in search_web
6744 | "bing": deque(
6745 | [
6746 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.51",
6747 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15",
6748 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
6749 | ]
6750 | ),
6751 | "duckduckgo": deque(
6752 | [
6753 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
6754 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
6755 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
6756 | ]
6757 | ),
6758 | "yandex": deque(
6759 | [ # Yandex might be more sensitive, use diverse UAs
6760 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 YaBrowser/23.5.2.625 Yowser/2.5 Safari/537.36",
6761 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 YaBrowser/23.5.2.625 Yowser/2.5 Safari/537.36",
6762 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 YaBrowser/23.5.2.625 Yowser/2.5 Safari/537.36",
6763 | ]
6764 | ),
6765 | }
6766 |
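# Illustrative note (not part of the original module): search_web() below rotates each pool
# every 20th query by popping the first UA and appending it to the back; deque.rotate(-1)
# is an equivalent formulation, sketched here purely for reference.
def _example_rotate_ua_pool(pool):
    pool.rotate(-1)  # same effect as pool.append(pool.popleft())
    return pool[0]  # the UA that search_web() would pick next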
6767 |
6768 | @resilient(max_attempts=2, backoff=1.0)
6769 | async def search_web(
6770 | query: str, engine: str = "bing", max_results: int = 10
6771 | ) -> List[Dict[str, str]]: # Uses global _log, _ua_rotation_count, _user_agent_pools
6772 | """Performs a web search using a specified engine via browser automation."""
6773 | global _ua_rotation_count
6774 | engine_lower = engine.lower()
6775 | if engine_lower not in ("bing", "duckduckgo", "yandex"):
6776 | raise ToolInputError(
6777 | f"Invalid search engine specified: '{engine}'. Use 'bing', 'duckduckgo', or 'yandex'."
6778 | )
6779 |
6780 | # Sanitize query (basic removal of non-alphanumeric/space/hyphen/dot)
6781 | safe_query_chars = re.sub(r"[^\w\s\-\.]", "", query)
6782 | safe_query = safe_query_chars.strip()
6783 | if not safe_query:
6784 | raise ToolInputError("Search query cannot be empty or contain only invalid characters.")
6785 |
6786 | # URL encode the safe query
6787 | qs = urllib.parse.quote_plus(safe_query)
6788 | nonce = random.randint(1000, 9999) # Simple nonce/cache buster
6789 |
6790 | # Search URLs and CSS Selectors per engine
6791 | search_details = {
6792 | "bing": {
6793 | "url": f"https://www.bing.com/search?q={qs}&form=QBLH&nc={nonce}",
6794 | "selectors": {
6795 |                 "item": "li.b_algo",  # Main result container
6796 |                 "title": "h2 > a",  # Link inside the H2 provides the title text
6797 |                 "link": "h2 > a",  # Same element provides the href
6798 |                 "snippet": "div.b_caption p, .TextContainer.OrganicText",  # Standard captions and organic text containers
6799 |                 "snippet_alt": ".b_caption",  # General caption container as fallback
6800 | },
6801 | },
6802 | "duckduckgo": {
6803 | "url": f"https://html.duckduckgo.com/html/?q={qs}&nc={nonce}",
6804 | "selectors": {
6805 |                 # Main result item container (more specific than a generic result div)
6806 |                 "item": "div.web-result",
6807 |                 # Title link inside the result heading
6808 |                 "title": "h2.result__title > a.result__a",
6809 |                 "link": "h2.result__title > a.result__a",
6810 |                 # Primary snippet selector
6811 |                 "snippet": "a.result__snippet",
6812 |                 # Fallback in case the snippet is not rendered as a link
6813 |                 "snippet_alt": "div.result__snippet",
6814 | },
6815 | },
6816 | "yandex": {
6817 | # Yandex search results structure
6818 |             "url": f"https://yandex.com/search/?text={qs}&nc={nonce}&lr=202",  # lr parameter keeps the result region consistent
6819 | "selectors": {
6820 |                 "item": "li.serp-item",  # Main result container
6821 |                 "title": "a.OrganicTitle-Link",  # Main organic link provides the title text
6822 |                 "link": "a.OrganicTitle-Link",  # Same link provides the href attribute
6823 |                 "snippet": ".TextContainer.OrganicText",  # Specific snippet container
6824 |                 "snippet_alt": ".Organic-ContentWrapper",  # Parent container as fallback
6825 | },
6826 | },
6827 | }
6828 |
6829 | engine_info = search_details[engine_lower]
6830 | search_url = engine_info["url"]
6831 | sel = engine_info["selectors"]
6832 |
6833 | # Rotate User Agent
6834 | _ua_rotation_count += 1
6835 | ua_pool = _user_agent_pools[engine_lower]
6836 | if _ua_rotation_count % 20 == 0 and len(ua_pool) > 1:
6837 | # Rotate deque periodically
6838 | first_ua = ua_pool.popleft()
6839 | ua_pool.append(first_ua)
6840 | ua = ua_pool[0] # Use the current first UA
6841 |
6842 | # Get incognito context with specific UA
6843 | context_args = {"user_agent": ua, "locale": "en-US"} # Set UA and locale
6844 | ctx, _ = await get_browser_context(use_incognito=True, context_args=context_args)
6845 | page = None # Initialize page variable
6846 |
6847 | try:
6848 | page = await ctx.new_page()
6849 | await _log("search_start", engine=engine_lower, query=query, url=search_url, ua=ua)
6850 | # Navigate to search URL
6851 | nav_timeout = 30000 # 30 seconds
6852 | await page.goto(search_url, wait_until="domcontentloaded", timeout=nav_timeout)
6853 |
6854 | # Handle DuckDuckGo HTML meta refresh if present
6855 | if engine_lower == "duckduckgo":
6856 | try:
6857 | meta_refresh_selector = 'meta[http-equiv="refresh"]'
6858 | meta_refresh = await page.query_selector(meta_refresh_selector)
6859 | if meta_refresh:
6860 | content_attr = await meta_refresh.get_attribute("content")
6861 | if content_attr and "url=" in content_attr.lower():
6862 | # Extract redirect URL
6863 | match = re.search(r'url=([^"]+)', content_attr, re.IGNORECASE)
6864 | if match:
6865 | redirect_url_raw = match.group(1)
6866 | # Basic clean up of URL just in case
6867 | redirect_url = redirect_url_raw.strip("'\" ")
6868 | logger.info(
6869 | f"Following meta refresh redirect on DDG HTML: {redirect_url}"
6870 | )
6871 | await page.goto(
6872 | redirect_url, wait_until="domcontentloaded", timeout=20000
6873 | )
6874 | await asyncio.sleep(0.5) # Brief pause after redirect
6875 | except PlaywrightException as e:
6876 | logger.warning(f"Error checking/following meta refresh on DDG HTML: {e}")
6877 |
6878 | # Wait for results container to be visible
6879 | wait_selector_timeout = 10000 # 10 seconds
6880 | try:
6881 | await page.wait_for_selector(
6882 | sel["item"], state="visible", timeout=wait_selector_timeout
6883 | )
6884 | except PlaywrightTimeoutError as e:
6885 | # Check for CAPTCHA before assuming no results
6886 | captcha_js = "() => document.body.innerText.toLowerCase().includes('captcha') || document.querySelector('iframe[title*=captcha]') || document.querySelector('[id*=captcha]')"
6887 | captcha_found = await page.evaluate(captcha_js)
6888 | if captcha_found:
6889 | await _log("search_captcha", engine=engine_lower, query=query)
6890 | raise ToolError(
6891 | f"CAPTCHA detected on {engine_lower} search.", error_code="captcha_detected"
6892 | ) from e
6893 | else:
6894 | # No results selector found, and no obvious CAPTCHA
6895 | await _log(
6896 | "search_no_results_selector",
6897 | engine=engine_lower,
6898 | query=query,
6899 | selector=sel["item"],
6900 | )
6901 | return [] # Return empty list for no results
6902 |
6903 | # Brief pause and try to accept consent cookies (best effort)
6904 | await asyncio.sleep(random.uniform(0.5, 1.5))
6905 | consent_selectors = [
6906 | 'button:has-text("Accept")',
6907 | 'button:has-text("Agree")',
6908 | 'button[id*="consent"]',
6909 | 'button[class*="consent"]',
6910 | ]
6911 | for btn_sel in consent_selectors:
6912 | try:
6913 | consent_button = page.locator(btn_sel).first
6914 | await consent_button.click(timeout=1000) # Short timeout for consent click
6915 | logger.debug(f"Clicked potential consent button: {btn_sel}")
6916 | await asyncio.sleep(0.3) # Pause after click
6917 | break # Stop after first successful click
6918 | except PlaywrightException:
6919 | pass # Ignore if selector not found or click fails
6920 |
6921 | # Extract results using page.evaluate
6922 | extract_js = """
6923 | (args) => {
6924 | const results = [];
6925 | const items = document.querySelectorAll(args.sel.item);
6926 | for (let i = 0; i < Math.min(items.length, args.max_results); i++) {
6927 | const item = items[i];
6928 | const titleEl = item.querySelector(args.sel.title);
6929 | const linkEl = item.querySelector(args.sel.link);
6930 | let snippetEl = item.querySelector(args.sel.snippet);
6931 | // Use fallback snippet selector if primary not found
6932 | if (!snippetEl && args.sel.snippet_alt) {
6933 | snippetEl = item.querySelector(args.sel.snippet_alt);
6934 | }
6935 |
6936 | const title = titleEl ? titleEl.innerText.trim() : '';
6937 | let link = linkEl ? linkEl.href : '';
6938 | // Clean DDG HTML links
6939 | if (link && link.includes('uddg=')) {
6940 | try {
6941 | const urlParams = new URLSearchParams(link.split('?')[1]);
6942 | link = urlParams.get('uddg') || link;
6943 | } catch (e) { /* ignore URL parsing errors */ }
6944 | }
6945 | const snippet = snippetEl ? snippetEl.innerText.trim() : '';
6946 |
6947 | // Only add if essential parts (link and title or snippet) are present
6948 | if (link && (title || snippet)) {
6949 | results.push({ title, link, snippet });
6950 | }
6951 | }
6952 | return results;
6953 | }
6954 | """
6955 | eval_args = {"sel": sel, "max_results": max_results}
6956 | results = await page.evaluate(extract_js, eval_args)
6957 |
6958 | # Log completion and return results
6959 | num_results = len(results)
6960 | await _log("search_complete", engine=engine_lower, query=query, num_results=num_results)
6961 | return results
6962 |
6963 | except PlaywrightException as e:
6964 | # Handle Playwright errors during navigation or interaction
6965 | await _log("search_error_playwright", engine=engine_lower, query=query, error=str(e))
6966 | raise ToolError(f"Playwright error during {engine_lower} search for '{query}': {e}") from e
6967 | except Exception as e:
6968 | # Handle unexpected errors
6969 | await _log("search_error_unexpected", engine=engine_lower, query=query, error=str(e))
6970 | raise ToolError(f"Unexpected error during {engine_lower} search for '{query}': {e}") from e
6971 | finally:
6972 | # Ensure page and context are closed
6973 | if page and not page.is_closed():
6974 | await page.close()
6975 | if ctx:
6976 | await ctx.close()
6977 |
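# Illustrative sketch: calling the search_web() helper directly. The list-of-dicts return
# shape ({"title", "link", "snippet"}) follows from the extraction JS above; the query used
# here is only a placeholder.
async def _example_search_web():
    hits = await search_web("playwright python tutorial", engine="duckduckgo", max_results=5)
    for hit in hits:
        print(f"{hit['title']} -> {hit['link']}")
    return hits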
6978 |
6979 | # --- Initialization Function ---
6980 | async def _ensure_initialized(): # Uses MANY globals
6981 | """Main initialization sequence for standalone Smart Browser tools."""
6982 | global \
6983 | _is_initialized, \
6984 | _thread_pool, \
6985 | _locator_cache_cleanup_task_handle, \
6986 | _inactivity_monitor_task_handle
6987 | global _SB_INTERNAL_BASE_PATH_STR, _STATE_FILE, _LOG_FILE, _CACHE_DB, _READ_JS_CACHE
6988 | # Globals for config values
6989 | global _sb_state_key_b64_global, _sb_max_tabs_global, _sb_tab_timeout_global
6990 | global _sb_inactivity_timeout_global, _headless_mode_global, _vnc_enabled_global
6991 | global _vnc_password_global, _proxy_pool_str_global, _proxy_allowed_domains_str_global
6992 | global _vault_allowed_paths_str_global, _max_widgets_global, _max_section_chars_global
6993 | global _dom_fp_limit_global, _llm_model_locator_global, _retry_after_fail_global
6994 | global _seq_cutoff_global, _area_min_global, _high_risk_domains_set_global
6995 | global _cpu_count, _pw, _browser, _ctx
6996 | global _pid, _last_activity
6997 |
6998 | # Ensure _last_activity has a valid monotonic time ASAP if it's still at its module-load default.
6999 | # This is a defensive measure.
7000 | if _last_activity == 0.0:
7001 | _last_activity = time.monotonic()
7002 | logger.debug(
7003 | f"Defensively setting initial _last_activity in _ensure_initialized: {_last_activity}"
7004 | )
7005 |
7006 | # Quick check if already initialized
7007 | if _is_initialized:
7008 | return
7009 |
7010 | # Use lock to prevent concurrent initialization
7011 | async with _init_lock:
7012 | # Double-check after acquiring lock
7013 | if _is_initialized:
7014 | return
7015 | logger.info("Performing first-time async initialization of SmartBrowser tools...")
7016 |
7017 | # --- Step 1: Load Config into Globals ---
7018 | try:
7019 | config = get_config()
7020 | sb_config: SmartBrowserConfig = config.smart_browser # Access nested config
7021 |
7022 | # Assign config values to globals, using defaults if config value is None/missing
7023 | _sb_state_key_b64_global = sb_config.sb_state_key_b64 or _sb_state_key_b64_global
7024 | _sb_max_tabs_global = sb_config.sb_max_tabs or _sb_max_tabs_global
7025 | _sb_tab_timeout_global = sb_config.sb_tab_timeout or _sb_tab_timeout_global
7026 | _sb_inactivity_timeout_global = (
7027 | sb_config.sb_inactivity_timeout or _sb_inactivity_timeout_global
7028 | )
7029 | # Handle booleans carefully (check for None, not just falsiness)
7030 | if sb_config.headless_mode is not None:
7031 | _headless_mode_global = sb_config.headless_mode
7032 | if sb_config.vnc_enabled is not None:
7033 | _vnc_enabled_global = sb_config.vnc_enabled
7034 | _vnc_password_global = sb_config.vnc_password or _vnc_password_global
7035 | _proxy_pool_str_global = sb_config.proxy_pool_str or _proxy_pool_str_global
7036 | _proxy_allowed_domains_str_global = (
7037 | sb_config.proxy_allowed_domains_str or _proxy_allowed_domains_str_global
7038 | )
7039 | _vault_allowed_paths_str_global = (
7040 | sb_config.vault_allowed_paths_str or _vault_allowed_paths_str_global
7041 | )
7042 | _max_widgets_global = sb_config.max_widgets or _max_widgets_global
7043 | _max_section_chars_global = sb_config.max_section_chars or _max_section_chars_global
7044 | _dom_fp_limit_global = sb_config.dom_fp_limit or _dom_fp_limit_global
7045 | _llm_model_locator_global = sb_config.llm_model_locator or _llm_model_locator_global
7046 | if sb_config.retry_after_fail is not None:
7047 | _retry_after_fail_global = sb_config.retry_after_fail
7048 | if sb_config.seq_cutoff is not None:
7049 | _seq_cutoff_global = sb_config.seq_cutoff
7050 | _area_min_global = sb_config.area_min or _area_min_global
7051 | # Handle set carefully (assign if present in config)
7052 | if sb_config.high_risk_domains_set is not None:
7053 | _high_risk_domains_set_global = sb_config.high_risk_domains_set
7054 |
7055 | logger.info("Smart Browser configuration loaded into global variables.")
7056 | # Update derived settings from config strings
7057 | _update_proxy_settings()
7058 | _update_vault_paths()
7059 |
7060 | # --- Reconfigure thread pool based on loaded config ---
7061 | # Get current max_workers (handle potential attribute absence)
7062 | current_max_workers = getattr(
7063 | _thread_pool, "_max_workers", min(32, (_cpu_count or 1) * 2 + 4)
7064 | )
7065 | # Calculate desired based on *loaded* max tabs config
7066 | desired_max_workers = min(32, _sb_max_tabs_global * 2)
7067 | # Recreate pool only if worker count needs to change
7068 | if current_max_workers != desired_max_workers:
7069 | logger.info(
7070 | f"Reconfiguring thread pool max_workers from {current_max_workers} to {desired_max_workers} based on config."
7071 | )
7072 | _thread_pool.shutdown(wait=True) # Wait for existing tasks
7073 | _thread_pool = concurrent.futures.ThreadPoolExecutor(
7074 | max_workers=desired_max_workers, thread_name_prefix="sb_worker"
7075 | )
7076 |
7077 | except Exception as e:
7078 | logger.error(
7079 | f"Error loading Smart Browser config: {e}. Using default global values.",
7080 | exc_info=True,
7081 | )
7082 | # Ensure derived settings are updated even if config load fails
7083 | _update_proxy_settings()
7084 | _update_vault_paths()
7085 |
7086 | # --- Step 2: Prepare Internal Storage Directory ---
7087 | try:
7088 | # Define relative path for internal storage (within the main storage area)
7089 | internal_storage_relative_path = "storage/smart_browser_internal"
7090 | logger.info(
7091 | f"Ensuring internal storage directory exists: '{internal_storage_relative_path}' using filesystem tool."
7092 | )
7093 | # Use STANDALONE create_directory tool
7094 | create_dir_result = await create_directory(path=internal_storage_relative_path)
7095 | # Validate result
7096 | if not isinstance(create_dir_result, dict) or not create_dir_result.get("success"):
7097 | error_msg = (
7098 | create_dir_result.get("error", "Unknown")
7099 | if isinstance(create_dir_result, dict)
7100 | else "Invalid response"
7101 | )
7102 | raise ToolError(
7103 | f"Filesystem tool failed to create internal directory '{internal_storage_relative_path}'. Error: {error_msg}"
7104 | )
7105 |
7106 | resolved_base_path_str = create_dir_result.get("path")
7107 | if not resolved_base_path_str:
7108 | raise ToolError(
7109 | "Filesystem tool create_directory succeeded but did not return the absolute path."
7110 | )
7111 |
7112 | # Set global path variables based on the resolved absolute path
7113 | _SB_INTERNAL_BASE_PATH_STR = resolved_base_path_str
7114 | internal_base_path = Path(_SB_INTERNAL_BASE_PATH_STR)
7115 | _STATE_FILE = internal_base_path / "storage_state.enc"
7116 | _LOG_FILE = internal_base_path / "audit.log"
7117 |             _CACHE_DB = internal_base_path / "locator_cache.db"  # SQLite cache of resolved element locators
7118 | _READ_JS_CACHE = internal_base_path / "readability.js"
7119 | logger.info(
7120 | f"Smart Browser internal file paths configured within: {internal_base_path}"
7121 | )
7122 |
7123 | # Initialize components that depend on these paths
7124 | _init_last_hash() # Initialize audit log hash chain (sync)
7125 | _init_locator_cache_db_sync() # Initialize DB schema (sync)
7126 |
7127 | except Exception as e:
7128 | # If storage setup fails, it's critical, stop initialization
7129 | logger.critical(
7130 | f"CRITICAL FAILURE: Could not initialize Smart Browser internal storage at '{internal_storage_relative_path}': {e}",
7131 | )
7132 | return # Do not proceed
7133 |
7134 | # --- Step 3: Initialize Browser Context (triggers Playwright launch if needed) ---
7135 | try:
7136 | logger.info("Initializing Playwright browser and shared context...")
7137 | await get_browser_context() # Call helper to ensure PW, browser, shared context exist
7138 | logger.info("Playwright browser and shared context initialized successfully.")
7139 | except Exception as e:
7140 | logger.critical(
7141 | f"CRITICAL FAILURE: Failed to initialize Playwright components: {e}", exc_info=True
7142 | )
7143 |             # Cleanup is deferred to the shutdown handler.
7144 | return # Stop initialization
7145 |
7146 | # --- Step 4: Start Background Tasks ---
7147 | # Start Inactivity Monitor
7148 | timeout_sec = _sb_inactivity_timeout_global
7149 | if timeout_sec > 0:
7150 | if _inactivity_monitor_task_handle is None or _inactivity_monitor_task_handle.done():
7151 | logger.info(
7152 | f"Starting browser inactivity monitor task (Timeout: {timeout_sec}s)..."
7153 | )
7154 | _inactivity_monitor_task_handle = asyncio.create_task(
7155 | _inactivity_monitor(timeout_sec)
7156 | )
7157 | else:
7158 | logger.debug("Inactivity monitor task already running.")
7159 | else:
7160 | logger.info("Browser inactivity monitor disabled (timeout <= 0).")
7161 |
7162 | # Start Locator Cache Cleanup Task
7163 | cleanup_interval_sec = 24 * 60 * 60 # Run daily
7164 | if _locator_cache_cleanup_task_handle is None or _locator_cache_cleanup_task_handle.done():
7165 | logger.info(
7166 | f"Starting locator cache cleanup task (Interval: {cleanup_interval_sec}s)..."
7167 | )
7168 | _locator_cache_cleanup_task_handle = asyncio.create_task(
7169 | _locator_cache_cleanup_task(interval_seconds=cleanup_interval_sec)
7170 | )
7171 | else:
7172 | logger.debug("Locator cache cleanup task already running.")
7173 |
7174 | # --- Finalize ---
7175 | _is_initialized = True
7176 | _last_activity = time.monotonic() # Set initial activity time after successful init
7177 | logger.info("SmartBrowser tools async components initialized successfully.")
7178 |
7179 |
7180 | # --- Helper: Inactivity Monitor ---
7181 | async def _inactivity_monitor(timeout_seconds: int): # Uses globals _browser, _last_activity
7182 | """Monitors browser inactivity and triggers shutdown if idle for too long."""
7183 | check_interval = 60 # Check every 60 seconds
7184 | logger.info(
7185 | f"Inactivity monitor started. Timeout: {timeout_seconds}s, Check Interval: {check_interval}s."
7186 | )
7187 | while True:
7188 | await asyncio.sleep(check_interval)
7189 | browser_active = False
7190 | try:
7191 | # Safely check browser status under lock
7192 | async with _playwright_lock:
7193 | if _browser is not None and _browser.is_connected():
7194 | browser_active = True
7195 | except Exception as check_err:
7196 | logger.warning(f"Error checking browser status in inactivity monitor: {check_err}")
7197 |             # Assume the browser is still active so a failed status check does not trigger a premature shutdown.
7198 |             browser_active = True
7199 |
7200 | if not browser_active:
7201 | logger.info("Inactivity monitor: Browser is closed or disconnected. Stopping monitor.")
7202 | break # Exit monitor loop if browser is gone
7203 |
7204 | # Calculate idle time
7205 | current_time = time.monotonic()
7206 | idle_time = current_time - _last_activity
7207 |
7208 | logger.debug(
7209 | f"Inactivity check: Idle time = {idle_time:.1f}s (Timeout = {timeout_seconds}s)"
7210 | )
7211 |
7212 | # Check if idle time exceeds timeout
7213 | if idle_time > timeout_seconds:
7214 | logger.info(
7215 | f"Browser inactive for {idle_time:.1f}s (exceeds {timeout_seconds}s timeout). Initiating automatic shutdown."
7216 | )
7217 |             # The monitor loop exits after the shutdown attempt below; note that it is stopping.
7218 | logger.info("Inactivity monitor stopped.")
7219 | try:
7220 | # Initiate shutdown (ensures it runs only once)
7221 | await _initiate_shutdown()
7222 | except Exception as e:
7223 | # Log error during shutdown attempt, but break anyway
7224 | logger.error(
7225 | f"Error during automatic shutdown initiated by inactivity monitor: {e}",
7226 | exc_info=True,
7227 | )
7228 | # Exit monitor loop after attempting shutdown
7229 | break
7230 |
7231 |
7232 | @with_tool_metrics
7233 | @with_error_handling
7234 | async def search(query: str, engine: str = "bing", max_results: int = 10) -> Dict[str, Any]:
7235 | """Performs a web search using the helper function and returns results."""
7236 | # Ensure SB is initialized
7237 | await _ensure_initialized()
7238 | # Update activity timestamp
7239 | _update_activity()
7240 |
7241 | # --- Input Validation ---
7242 | if max_results <= 0:
7243 | logger.warning(f"max_results was {max_results}. Setting to default 10.")
7244 | max_results = 10
7245 | # Engine validation happens within search_web helper
7246 |
7247 | # --- Execute Search ---
7248 | # Call the underlying search_web helper function
7249 | results = await search_web(query, engine=engine, max_results=max_results)
7250 | result_count = len(results)
7251 |
7252 | # --- Return Result ---
7253 | return {
7254 | "success": True,
7255 | "query": query,
7256 | "engine": engine.lower(), # Return normalized engine name
7257 | "results": results,
7258 | "result_count": result_count,
7259 | }
7260 |
7261 |
7262 | @with_tool_metrics
7263 | @with_error_handling
7264 | async def download( # This is the exported tool function
7265 | url: str,
7266 | target: Optional[Dict[str, Any]] = None,
7267 | task_hint: Optional[str] = None,
7268 | dest_dir: Optional[str] = None,
7269 | ) -> Dict[str, Any]:
7270 | """Navigates, clicks (using hint/target) to download, saves file, returns info."""
7271 | # Ensure SB is initialized
7272 | await _ensure_initialized()
7273 | # Update activity timestamp
7274 | _update_activity()
7275 |
7276 | # --- Input Validation: Determine task_hint ---
7277 | effective_task_hint = task_hint
7278 | if not effective_task_hint: # Generate hint if missing
7279 | if target and (target.get("name") or target.get("role")):
7280 | name = target.get("name", "")
7281 | role = target.get("role", "") # Default role empty if not specified
7282 | hint_base = "Download link/button"
7283 | target_desc = f"{name or role}".strip() # Use name or role
7284 | if target_desc:
7285 | effective_task_hint = f"{hint_base} '{target_desc}'"
7286 | else:
7287 | effective_task_hint = hint_base # Fallback if target has no name/role
7288 | logger.debug(f"download tool generated task_hint: '{effective_task_hint}'")
7289 | else:
7290 | raise ToolInputError(
7291 | "download tool requires 'task_hint', or 'target' dict containing 'name' or 'role'."
7292 | )
7293 |
7294 | # --- Get Context and Execute ---
7295 | ctx, _ = await get_browser_context()
7296 | async with _tab_context(ctx) as page:
7297 | # Navigate to the page containing the download link
7298 | await _log("download_navigate", url=url, hint=effective_task_hint)
7299 | try:
7300 | nav_timeout = 60000
7301 | await page.goto(url, wait_until="networkidle", timeout=nav_timeout)
7302 | except PlaywrightException as e:
7303 | # Use f-string for cleaner message concatenation
7304 | raise ToolError(
7305 | f"Navigation failed for URL '{url}' before download attempt: {e}"
7306 | ) from e
7307 |
7308 | # Call the underlying smart_download helper function
7309 | # This helper now handles the click, waiting for download, saving, and analysis
7310 | download_info = await smart_download(
7311 | page,
7312 | task_hint=effective_task_hint,
7313 | dest_dir=dest_dir, # Pass optional destination directory
7314 | target_kwargs=target, # Pass optional target details
7315 | )
7316 |
7317 | # smart_download raises ToolError on failure, so this check is mostly redundant
7318 | # but kept as a safeguard. The result structure is also slightly different now.
7319 | if not download_info.get("success"):
7320 | error_msg = download_info.get("error", "Download failed with unknown error.")
7321 | raise ToolError(f"Download failed: {error_msg}", details=download_info)
7322 |
7323 | # Return success structure containing the download details
7324 | return {"success": True, "download": download_info}
7325 |
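# Illustrative sketch: two equivalent ways to invoke the download tool above. Either a
# task_hint or a target dict containing "name"/"role" must be supplied; the URL, element
# names, and dest_dir used here are placeholders, not values from the original code.
async def _example_download_usage():
    # Explicit hint describing what to click:
    by_hint = await download(
        url="https://example.com/reports",
        task_hint="Download link 'Annual Report PDF'",
    )
    # Or a target dict, from which the tool derives the hint automatically:
    by_target = await download(
        url="https://example.com/reports",
        target={"name": "Annual Report PDF", "role": "link"},
        dest_dir="storage/smart_browser_downloads",  # assumed destination path
    )
    return by_hint, by_target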
7326 |
7327 | @with_tool_metrics
7328 | @with_error_handling
7329 | async def download_site_pdfs(
7330 | start_url: str,
7331 | dest_subfolder: Optional[str] = None,
7332 | include_regex: Optional[str] = None,
7333 | max_depth: int = 2,
7334 | max_pdfs: int = 100,
7335 | max_pages_crawl: int = 500,
7336 | rate_limit_rps: float = 1.0,
7337 | ) -> Dict[str, Any]:
7338 | """Crawls site, finds PDFs, downloads them directly using httpx and FileSystemTool."""
7339 | # Ensure SB is initialized
7340 | await _ensure_initialized()
7341 | # Update activity timestamp
7342 | _update_activity()
7343 |
7344 | # --- Validate Inputs ---
7345 | if not start_url:
7346 | raise ToolInputError("start_url cannot be empty.")
7347 | if max_depth < 0:
7348 | raise ToolInputError("max_depth cannot be negative.")
7349 | if max_pdfs <= 0:
7350 | raise ToolInputError("max_pdfs must be positive.")
7351 | if max_pages_crawl <= 0:
7352 | raise ToolInputError("max_pages_crawl must be positive.")
7353 | if rate_limit_rps <= 0:
7354 | raise ToolInputError("rate_limit_rps must be positive.")
7355 |
7356 | # --- Prepare Download Directory ---
7357 | final_dest_dir_str: Optional[str] = None
7358 | try:
7359 | # Generate a safe subfolder name from input or domain
7360 | if dest_subfolder:
7361 | safe_subfolder = _slugify(dest_subfolder, 50)
7362 | else:
7363 | try:
7364 | parsed_start = urlparse(start_url)
7365 | domain_slug = _slugify(parsed_start.netloc, 50)
7366 | safe_subfolder = domain_slug or "downloaded_pdfs" # Fallback if domain is empty
7367 | except Exception:
7368 | safe_subfolder = "downloaded_pdfs" # Fallback on URL parse error
7369 |
7370 | # Define relative path within the main storage area
7371 | dest_dir_relative_path = f"storage/smart_browser_site_pdfs/{safe_subfolder}"
7372 | logger.info(
7373 | f"Ensuring download directory exists for PDF crawl: '{dest_dir_relative_path}' using filesystem tool."
7374 | )
7375 | # Use STANDALONE create_directory tool
7376 | create_dir_result = await create_directory(path=dest_dir_relative_path)
7377 | if not isinstance(create_dir_result, dict) or not create_dir_result.get("success"):
7378 | error_msg = (
7379 | create_dir_result.get("error", "Unknown")
7380 | if isinstance(create_dir_result, dict)
7381 | else "Invalid response"
7382 | )
7383 | raise ToolError(
7384 | f"Filesystem tool failed to create directory '{dest_dir_relative_path}'. Error: {error_msg}"
7385 | )
7386 |
7387 | # Get the absolute path returned by the tool
7388 | final_dest_dir_str = create_dir_result.get("path")
7389 | if not final_dest_dir_str:
7390 | raise ToolError(
7391 | f"Filesystem tool create_directory succeeded for '{dest_dir_relative_path}' but did not return an absolute path."
7392 | )
7393 | logger.info(f"PDF crawl download directory confirmed/created at: {final_dest_dir_str}")
7394 | except Exception as e:
7395 | # Wrap directory preparation errors
7396 | raise ToolError(
7397 |             f"Could not prepare download directory for PDF crawl of '{start_url}': {str(e)}"
7398 | ) from e
7399 |
7400 | # --- Crawl for PDF URLs ---
7401 | logger.info(
7402 | f"Starting PDF crawl from: {start_url} (Max Depth: {max_depth}, Max PDFs: {max_pdfs}, Max Pages: {max_pages_crawl})"
7403 | )
7404 | try:
7405 | # Use the helper function to find PDF URLs
7406 | pdf_urls = await crawl_for_pdfs(
7407 | start_url,
7408 | include_regex,
7409 | max_depth,
7410 | max_pdfs,
7411 | max_pages_crawl,
7412 |             rate_limit_rps=5.0,  # Crawl phase uses a fixed higher rate; the user-supplied limit throttles the downloads below
7413 | )
7414 | except Exception as crawl_err:
7415 | raise ToolError(
7416 | f"Error during PDF crawl phase from '{start_url}': {crawl_err}"
7417 | ) from crawl_err
7418 |
7419 | if not pdf_urls:
7420 | logger.info("No matching PDF URLs found during crawl.")
7421 | return {
7422 | "success": True,
7423 | "pdf_count": 0,
7424 | "failed_count": 0,
7425 | "dest_dir": final_dest_dir_str,
7426 | "files": [], # Empty list as no files were downloaded
7427 | }
7428 |
7429 | # --- Download Found PDFs ---
7430 | num_found = len(pdf_urls)
7431 | logger.info(
7432 | f"Found {num_found} PDF URLs. Starting downloads to '{final_dest_dir_str}' (Rate Limit: {rate_limit_rps}/s)..."
7433 | )
7434 | # Use the specified rate limit for downloads
7435 | limiter = RateLimiter(rate_limit_rps)
7436 |
7437 | # Define the async task for downloading a single file
7438 | async def download_task(url, seq):
7439 | await limiter.acquire() # Wait for rate limit permit
7440 | # Use the direct download helper
7441 | result = await _download_file_direct(url, final_dest_dir_str, seq)
7442 | return result
7443 |
7444 | # Create and run download tasks concurrently
7445 | download_tasks = []
7446 | for i, url in enumerate(pdf_urls):
7447 | task = asyncio.create_task(download_task(url, i + 1))
7448 | download_tasks.append(task)
7449 |
7450 | results = await asyncio.gather(*download_tasks) # Wait for all downloads
7451 |
7452 | # Process results
7453 | successful_downloads = []
7454 | failed_downloads = []
7455 | for r in results:
7456 | if isinstance(r, dict) and r.get("success"):
7457 | successful_downloads.append(r)
7458 | else:
7459 | failed_downloads.append(r) # Includes non-dict results or dicts with success=False
7460 |
7461 | num_successful = len(successful_downloads)
7462 | num_failed = len(failed_downloads)
7463 |
7464 | # Log summary
7465 | log_details = {
7466 | "start_url": start_url,
7467 | "found": num_found,
7468 | "successful": num_successful,
7469 | "failed": num_failed,
7470 | "dest_dir": final_dest_dir_str,
7471 | }
7472 | if failed_downloads:
7473 | # Log preview of failed download errors
7474 | errors_preview = []
7475 | for res in failed_downloads[:3]: # Log first 3 errors
7476 | err_url = res.get("url", "N/A") if isinstance(res, dict) else "N/A"
7477 | err_msg = res.get("error", "Unknown error") if isinstance(res, dict) else str(res)
7478 | errors_preview.append(f"{err_url}: {err_msg}")
7479 | log_details["errors_preview"] = errors_preview
7480 | await _log("download_site_pdfs_complete", **log_details)
7481 |
7482 | # Return final result
7483 | return {
7484 | "success": True, # Overall tool execution success
7485 | "pdf_count": num_successful,
7486 | "failed_count": num_failed,
7487 | "dest_dir": final_dest_dir_str,
7488 | "files": results, # Return list of all result dicts (success and failure)
7489 | }
7490 |
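# Illustrative sketch: crawling a site for PDFs with the tool above. The start URL and regex
# are placeholders; note that the crawl phase runs at its own fixed rate, while
# rate_limit_rps throttles only the PDF downloads.
async def _example_download_site_pdfs():
    summary = await download_site_pdfs(
        start_url="https://example.com/docs",
        include_regex=r"whitepaper|datasheet",
        max_depth=2,
        max_pdfs=25,
        rate_limit_rps=1.0,
    )
    # summary["pdf_count"] / summary["failed_count"] summarize the per-file entries returned
    # in summary["files"]; summary["dest_dir"] is the resolved output folder.
    return summary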
7491 |
7492 | @with_tool_metrics
7493 | @with_error_handling
7494 | async def collect_documentation(
7495 | package: str, max_pages: int = 40, rate_limit_rps: float = 2.0
7496 | ) -> Dict[str, Any]:
7497 | """Finds docs site, crawls, extracts text, saves using FileSystemTool."""
7498 | # Ensure SB is initialized
7499 | await _ensure_initialized()
7500 | # Update activity timestamp
7501 | _update_activity()
7502 |
7503 | # --- Validate Inputs ---
7504 | if not package:
7505 | raise ToolInputError("Package name cannot be empty.")
7506 | if max_pages <= 0:
7507 | raise ToolInputError("max_pages must be positive.")
7508 | if rate_limit_rps <= 0:
7509 | raise ToolInputError("rate_limit_rps must be positive.")
7510 |
7511 | # --- Find Documentation Root URL ---
7512 | try:
7513 | docs_root = await _pick_docs_root(package)
7514 | if not docs_root:
7515 | raise ToolError(
7516 | f"Could not automatically find a likely documentation site for package '{package}'."
7517 | )
7518 | except Exception as e:
7519 | # Wrap errors during root finding
7520 | raise ToolError(f"Error finding documentation root for '{package}': {str(e)}") from e
7521 |
7522 | # --- Crawl Documentation Site ---
7523 | logger.info(f"Found potential docs root: {docs_root}. Starting documentation crawl...")
7524 | try:
7525 | # Use the helper function to crawl and extract content
7526 | pages_content = await crawl_docs_site(
7527 | docs_root, max_pages=max_pages, rate_limit_rps=rate_limit_rps
7528 | )
7529 | except Exception as e:
7530 | # Wrap errors during crawling
7531 | raise ToolError(
7532 | f"Error crawling documentation site starting from {docs_root}: {str(e)}"
7533 | ) from e
7534 |
7535 | # Check if content was collected
7536 | num_pages_collected = len(pages_content)
7537 | if num_pages_collected == 0:
7538 | logger.info(f"No readable content collected from documentation site for '{package}'.")
7539 | return {
7540 | "success": True, # Tool ran successfully, but found no content
7541 | "package": package,
7542 | "pages_collected": 0,
7543 | "file_path": None, # No file saved
7544 | "root_url": docs_root,
7545 | "message": "No readable content pages were collected from the documentation site.",
7546 | }
7547 | logger.info(f"Collected readable content from {num_pages_collected} pages for '{package}'.")
7548 |
7549 | # --- Prepare Output Directory ---
7550 | output_dir_relative_path = "storage/smart_browser_docs_collected"
7551 | created_dir_path: Optional[str] = None
7552 | try:
7553 | logger.info(
7554 | f"Ensuring documentation output directory exists: '{output_dir_relative_path}' using filesystem tool."
7555 | )
7556 | create_result = await create_directory(path=output_dir_relative_path) # STANDALONE call
7557 | if not isinstance(create_result, dict) or not create_result.get("success"):
7558 | error_msg = (
7559 | create_result.get("error", "Unknown")
7560 | if isinstance(create_result, dict)
7561 | else "Invalid response"
7562 | )
7563 | raise ToolError(
7564 | f"Filesystem tool failed to create directory '{output_dir_relative_path}'. Error: {error_msg}"
7565 | )
7566 | created_dir_path = create_result.get("path") # Get absolute path
7567 | if not created_dir_path:
7568 | raise ToolError(
7569 | f"Filesystem tool create_directory for '{output_dir_relative_path}' did not return an absolute path."
7570 | )
7571 | logger.info(f"Ensured output directory exists at: '{created_dir_path}'")
7572 | except Exception as e:
7573 | # Wrap directory preparation errors
7574 | raise ToolError(
7575 | f"Could not prepare output directory '{output_dir_relative_path}': {str(e)}"
7576 | ) from e
7577 |
7578 | # --- Format Content and Determine Filename ---
7579 | # Create a unique filename based on package and timestamp
7580 | now_utc_str = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
7581 | safe_pkg_name = _slugify(package, 40)
7582 | filename = f"{safe_pkg_name}_docs_{now_utc_str}.txt"
7583 | # Construct relative path for writing (FS tool handles base path resolution)
7584 | fpath_relative = f"{output_dir_relative_path}/{filename}"
7585 |
7586 | # Combine collected content into a single string
7587 | separator = "\n\n" + ("=" * 80) + "\n\n" # Separator between pages
7588 | header = f"# Documentation for: {package}\n# Crawl Root: {docs_root}\n{separator}"
7589 | combined_content = header
7590 | try:
7591 | page_texts = []
7592 | for i, (url, text) in enumerate(pages_content):
7593 | page_header = f"## Page {i + 1}: {str(url)}\n\n"
7594 | page_body = str(text).strip() # Ensure text is string and stripped
7595 | page_texts.append(page_header + page_body)
7596 | # Join all page sections with the separator
7597 | combined_content += separator.join(page_texts)
7598 | except Exception as e:
7599 | # Handle potential errors during string formatting/joining
7600 | raise ToolError(f"Error formatting collected documentation content: {str(e)}") from e
7601 |
7602 | # --- Write Combined Content using Filesystem Tool ---
7603 | final_absolute_fpath: Optional[str] = None
7604 | try:
7605 | logger.info(f"Writing combined documentation content to relative path: {fpath_relative}")
7606 | write_result = await write_file(
7607 | path=fpath_relative, content=combined_content
7608 | ) # STANDALONE call
7609 | if not isinstance(write_result, dict) or not write_result.get("success"):
7610 | error_msg = (
7611 | write_result.get("error", "Unknown")
7612 | if isinstance(write_result, dict)
7613 | else "Invalid response"
7614 | )
7615 | raise ToolError(
7616 | f"Filesystem tool failed to write documentation file '{fpath_relative}'. Error: {error_msg}"
7617 | )
7618 |
7619 | # Get the absolute path where the file was actually written
7620 | final_absolute_fpath = write_result.get("path")
7621 | if not final_absolute_fpath:
7622 | logger.warning(
7623 | f"Filesystem tool write_file for '{fpath_relative}' did not return an absolute path. Using relative path in result."
7624 | )
7625 | final_absolute_fpath = fpath_relative # Fallback for logging/return value
7626 |
7627 | logger.info(f"Successfully wrote combined documentation to: {final_absolute_fpath}")
7628 | except Exception as e:
7629 | # Wrap errors during file write
7630 | raise ToolError(f"Could not write documentation file '{fpath_relative}': {str(e)}") from e
7631 |
7632 | # --- Log Success and Return Result ---
7633 | await _log(
7634 | "docs_collected_success",
7635 | package=package,
7636 | root=docs_root,
7637 | pages=num_pages_collected,
7638 | file=str(final_absolute_fpath),
7639 | )
7640 | return {
7641 | "success": True,
7642 | "package": package,
7643 | "pages_collected": num_pages_collected,
7644 | "file_path": str(final_absolute_fpath), # Return the absolute path
7645 | "root_url": docs_root,
7646 | "message": f"Collected and saved content from {num_pages_collected} pages for '{package}'.",
7647 | }
7648 |
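# Illustrative sketch: collecting documentation for a package with the tool above. The
# package name is a placeholder; on success the combined text is written under
# storage/smart_browser_docs_collected and its absolute path is returned as "file_path".
async def _example_collect_documentation():
    result = await collect_documentation(package="httpx", max_pages=20, rate_limit_rps=2.0)
    if result["pages_collected"]:
        print(f"Saved {result['pages_collected']} pages to {result['file_path']}")
    return result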
7649 |
7650 | @with_tool_metrics
7651 | @with_error_handling
7652 | async def run_macro(  # Exposed tool; formerly named execute_macro
7653 | url: str,
7654 | task: str,
7655 | model: str = _llm_model_locator_global,
7656 | max_rounds: int = 7,
7657 | timeout_seconds: int = 600,
7658 | ) -> Dict[str, Any]:
7659 | """Navigates to URL and executes a natural language task using LLM planner and step runner."""
7660 | # Ensure SB is initialized
7661 | await _ensure_initialized()
7662 | # Update activity timestamp
7663 | _update_activity()
7664 |
7665 | # --- Input Validation ---
7666 | if not url:
7667 | raise ToolInputError("URL cannot be empty.")
7668 | if not task:
7669 | raise ToolInputError("Task description cannot be empty.")
7670 | if max_rounds <= 0:
7671 | raise ToolInputError("max_rounds must be positive.")
7672 | if timeout_seconds <= 0:
7673 | raise ToolInputError("timeout_seconds must be positive.")
7674 |
7675 | # Define the inner function to run with timeout
7676 | async def run_macro_inner():
7677 | ctx, _ = await get_browser_context()
7678 | async with _tab_context(ctx) as page:
7679 | # Navigate to the starting URL
7680 | await _log("macro_navigate", url=url, task=task)
7681 | try:
7682 | nav_timeout = 60000
7683 | await page.goto(url, wait_until="networkidle", timeout=nav_timeout)
7684 | except PlaywrightException as e:
7685 | # Use f-string for cleaner message
7686 | raise ToolError(f"Navigation to '{url}' failed before starting macro: {e}") from e
7687 |
7688 | # Call the helper function that contains the plan-act loop
7689 | # This helper handles planning, running steps, and logging rounds/errors
7690 | step_results = await _run_macro_execution_loop(page, task, max_rounds, model)
7691 |
7692 | # Get final page state after macro execution
7693 | final_state = {} # Initialize as empty dict
7694 | try:
7695 | final_state = await get_page_state(page)
7696 | except Exception as e:
7697 | logger.error(f"Failed to get final page state after macro execution: {e}")
7698 | final_state = {"error": f"Failed to get final page state: {e}"}
7699 |
7700 | # Determine overall macro success
7701 |             # Success if either:
7702 |             # 1. a 'finish' step executed successfully, OR
7703 |             # 2. all executed non-passive steps (excluding wait/finish/extract/scroll) succeeded.
7704 | finished_successfully = any(
7705 | s.get("action") == "finish" and s.get("success") for s in step_results
7706 | )
7707 | # Check if all non-finish/wait/extract steps succeeded (if any exist)
7708 | all_other_steps_succeeded = True
7709 | non_terminal_steps_exist = False
7710 | for s in step_results:
7711 | action = s.get("action")
7712 | # Consider steps other than these potentially "passive" ones for failure check
7713 | if action not in ("finish", "wait", "extract", "scroll", "error"):
7714 | non_terminal_steps_exist = True # noqa: F841
7715 | if not s.get("success", False):
7716 | all_other_steps_succeeded = False
7717 | break # Found a failed critical step
7718 |
7719 | # Macro succeeds if finished explicitly or if all critical steps passed (and at least one step ran)
7720 | macro_success = finished_successfully or (
7721 | bool(step_results) and all_other_steps_succeeded
7722 | )
7723 |
7724 | # Return final results
7725 | return {
7726 | "success": macro_success,
7727 | "task": task,
7728 | "steps": step_results, # List of results for each step executed
7729 | "final_page_state": final_state,
7730 | }
7731 |
7732 | # Run the inner function with an overall timeout
7733 | try:
7734 | result = await asyncio.wait_for(run_macro_inner(), timeout=timeout_seconds)
7735 | return result
7736 | except asyncio.TimeoutError:
7737 | # Handle overall macro timeout
7738 | await _log("macro_timeout", url=url, task=task, timeout=timeout_seconds)
7739 | return {
7740 | "success": False,
7741 | "task": task,
7742 | "error": f"Macro execution timed out after {timeout_seconds}s.",
7743 | "steps": [], # No steps completed within timeout (or results lost)
7744 | "final_page_state": {"error": "Macro timed out"},
7745 | }
7746 |
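# Illustrative sketch: driving a natural-language macro with run_macro above. The URL and
# task are placeholders; the returned dict carries per-step results plus the final page
# state captured after the last round.
async def _example_run_macro():
    outcome = await run_macro(
        url="https://example.com/login",
        task="Log in with the stored credentials and open the billing page",
        max_rounds=5,
        timeout_seconds=300,
    )
    for step in outcome["steps"]:
        print(step.get("action"), "ok" if step.get("success") else step.get("error"))
    return outcome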
7747 |
7748 | async def _run_macro_execution_loop(
7749 | page: Page, task: str, max_rounds: int, model: str
7750 | ) -> List[Dict[str, Any]]:
7751 | """Internal helper containing the plan-and-execute loop for run_macro."""
7752 | all_step_results: List[Dict[str, Any]] = []
7753 | current_task_description = task # Initial task
7754 |
7755 | for i in range(max_rounds):
7756 | round_num = i + 1
7757 | logger.info(f"--- Macro Round {round_num}/{max_rounds} ---")
7758 | task_preview = current_task_description[:100] + (
7759 | "..." if len(current_task_description) > 100 else ""
7760 | )
7761 | logger.info(f"Current Task: {task_preview}")
7762 |
7763 | try:
7764 | # 1. Get Current Page State
7765 | logger.debug(f"Macro Round {round_num}: Getting page state...")
7766 | state = await get_page_state(page)
7767 | if "error" in state: # Handle error getting state
7768 | error_msg = (
7769 | f"Failed to get page state before planning round {round_num}: {state['error']}"
7770 | )
7771 | logger.error(error_msg)
7772 | # Add error step and stop
7773 | all_step_results.append(
7774 | {"action": "error", "success": False, "error": error_msg, "round": round_num}
7775 | )
7776 | return all_step_results
7777 |
7778 | # 2. Plan Next Steps using LLM
7779 | logger.debug(f"Macro Round {round_num}: Planning steps with LLM...")
7780 | plan = await _plan_macro(state, current_task_description, model)
7781 | await _log(
7782 | "macro_plan_generated",
7783 | round=round_num,
7784 | task=current_task_description,
7785 | plan_length=len(plan),
7786 | plan_preview=plan[:2],
7787 | )
7788 |
7789 | # Check if plan is empty (task complete or impossible)
7790 | if not plan:
7791 | logger.info(
7792 | f"Macro Round {round_num}: Planner returned empty plan. Assuming task complete or impossible."
7793 | )
7794 | await _log("macro_plan_empty", round=round_num, task=current_task_description)
7795 | break # Exit loop
7796 |
7797 | # 3. Execute Planned Steps
7798 | logger.info(f"Macro Round {round_num}: Executing {len(plan)} planned steps...")
7799 | step_results_this_round = await run_steps(page, plan)
7800 | all_step_results.extend(step_results_this_round) # Add results to overall list
7801 |
7802 | # 4. Check Round Outcome
7803 | finished_this_round = any(
7804 | s.get("action") == "finish" and s.get("success") for s in step_results_this_round
7805 | )
7806 | last_step_failed = False
7807 | if step_results_this_round:
7808 | last_step = step_results_this_round[-1]
7809 | # Check if the *last* step failed and wasn't a passive action
7810 | is_passive_action = last_step.get("action") in (
7811 | "wait",
7812 | "finish",
7813 | "extract",
7814 | "scroll",
7815 | "error",
7816 | )
7817 | if not last_step.get("success", False) and not is_passive_action:
7818 | last_step_failed = True
7819 | error_info = last_step.get("error", "?")
7820 | failed_action = last_step.get("action", "?")
7821 | await _log(
7822 | "macro_fail_step", round=round_num, action=failed_action, error=error_info
7823 | )
7824 | logger.warning(
7825 | f"Macro Round {round_num} stopped due to failed critical step: Action='{failed_action}', Error='{error_info}'"
7826 | )
7827 |
7828 | # Exit loop if 'finish' action succeeded or last critical step failed
7829 | if finished_this_round:
7830 | await _log("macro_finish_action", round=round_num)
7831 | logger.info(
7832 | f"Macro finished successfully via 'finish' action in round {round_num}."
7833 | )
7834 | return all_step_results # Return immediately after successful finish
7835 | if last_step_failed:
7836 | logger.info(f"Stopping macro execution after failed step in round {round_num}.")
7837 | return all_step_results # Return results up to the failure
7838 |
7839 |             # If the loop continues, the task description could be refined for the next round;
7840 |             # currently it stays the same throughout. Example modification point:
7841 |             # current_task_description = "Refine based on results..."
7842 |
7843 | except ToolError as e:
7844 | # Handle errors during planning or state retrieval specifically
7845 | await _log(
7846 | "macro_error_tool", round=round_num, task=current_task_description, error=str(e)
7847 | )
7848 | logger.error(f"Macro Round {round_num} failed with ToolError: {e}")
7849 | all_step_results.append(
7850 | {
7851 | "action": "error",
7852 | "success": False,
7853 | "error": f"ToolError in Round {round_num}: {e}",
7854 | "round": round_num,
7855 | }
7856 | )
7857 | return all_step_results # Stop execution on tool errors
7858 | except Exception as e:
7859 | # Handle unexpected errors during the round
7860 | await _log(
7861 | "macro_error_unexpected",
7862 | round=round_num,
7863 | task=current_task_description,
7864 | error=str(e),
7865 | )
7866 | logger.error(f"Macro Round {round_num} failed unexpectedly: {e}", exc_info=True)
7867 | all_step_results.append(
7868 | {
7869 | "action": "error",
7870 | "success": False,
7871 | "error": f"Unexpected Error in Round {round_num}: {e}",
7872 | "round": round_num,
7873 | }
7874 | )
7875 | return all_step_results # Stop execution on unexpected errors
7876 |
7877 | # If loop finishes due to max_rounds
7878 | await _log("macro_exceeded_rounds", max_rounds=max_rounds, task=task)
7879 | logger.warning(f"Macro stopped after reaching maximum rounds ({max_rounds}) for task: {task}")
7880 | return all_step_results # Return all collected results
7881 |
7882 |
7883 | @with_tool_metrics
7884 | @with_error_handling
7885 | async def autopilot(
7886 | task: str,
7887 | scratch_subdir: str = "autopilot_runs",
7888 | max_steps: int = 10,
7889 | timeout_seconds: int = 1800,
7890 | ) -> Dict[str, Any]:
7891 | """Executes a complex multi-step task using LLM planning and available tools."""
7892 | # Ensure SB is initialized
7893 | await _ensure_initialized()
7894 |
7895 | # --- Validate Inputs ---
7896 | if not task:
7897 | raise ToolInputError("Task description cannot be empty.")
7898 | if max_steps <= 0:
7899 | raise ToolInputError("max_steps must be positive.")
7900 | if timeout_seconds <= 0:
7901 | raise ToolInputError("timeout_seconds must be positive.")
7902 |
7903 | # --- Prepare Scratch Directory and Logging ---
7904 | final_scratch_dir_str: Optional[str] = None
7905 | log_path: Optional[Path] = None
7906 | try:
7907 | # Define base path for scratch files
7908 | scratch_base_relative = "storage/smart_browser_scratch"
7909 | # Sanitize user-provided subdir name
7910 | safe_subdir = _slugify(scratch_subdir, 50) or "autopilot_run" # Fallback name
7911 | scratch_dir_relative_path = f"{scratch_base_relative}/{safe_subdir}"
7912 |
7913 | logger.info(
7914 | f"Ensuring autopilot scratch directory exists: '{scratch_dir_relative_path}' using filesystem tool."
7915 | )
7916 | # Use STANDALONE create_directory tool
7917 | create_dir_result = await create_directory(path=scratch_dir_relative_path)
7918 | if not isinstance(create_dir_result, dict) or not create_dir_result.get("success"):
7919 | error_msg = (
7920 | create_dir_result.get("error", "Unknown")
7921 | if isinstance(create_dir_result, dict)
7922 | else "Invalid response"
7923 | )
7924 | raise ToolError(
7925 | f"Filesystem tool failed to create scratch directory '{scratch_dir_relative_path}'. Error: {error_msg}"
7926 | )
7927 |
7928 | # Get the absolute path
7929 | final_scratch_dir_str = create_dir_result.get("path")
7930 | if not final_scratch_dir_str:
7931 | raise ToolError(
7932 | f"Filesystem tool create_directory for '{scratch_dir_relative_path}' did not return an absolute path."
7933 | )
7934 | final_scratch_dir_path = Path(final_scratch_dir_str)
7935 | logger.info(f"Autopilot scratch directory confirmed/created at: {final_scratch_dir_path}")
7936 |
7937 | # Prepare log file path within the scratch directory
7938 | run_id = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
7939 | log_filename = f"autopilot_run_{run_id}.jsonl"
7940 | log_path = final_scratch_dir_path / log_filename
7941 | logger.info(f"Autopilot run '{run_id}' started. Execution log: {log_path}")
7942 |
7943 | except Exception as e:
7944 | # Wrap directory preparation errors
7945 | raise ToolError(
7946 | f"Could not prepare scratch directory '{scratch_dir_relative_path}': {str(e)}"
7947 | ) from e
7948 |
7949 | # Define the inner function to run with timeout
7950 | async def autopilot_inner():
7951 | all_results: List[Dict] = [] # Stores results of each step
7952 | current_task_description = task # Initial task
7953 |
7954 | try:
7955 | # --- Initial Planning ---
7956 | logger.info("Autopilot: Generating initial plan...")
7957 | current_plan = await _plan_autopilot(
7958 | current_task_description, None
7959 | ) # Initial plan has no prior results
7960 | step_num = 0
7961 |
7962 | # --- Execution Loop ---
7963 | while step_num < max_steps and current_plan:
7964 | step_num += 1
7965 | step_to_execute = current_plan[0] # Get the next step
7966 | tool_name = step_to_execute.get("tool")
7967 | args = step_to_execute.get("args", {})
7968 | # Initialize log entry for this step
7969 | step_log = {
7970 | "step": step_num,
7971 | "tool": tool_name,
7972 | "args": args,
7973 | "success": False,
7974 | "result": None,
7975 | "error": None,
7976 | }
7977 | logger.info(
7978 | f"--- Autopilot Step {step_num}/{max_steps}: Executing Tool '{tool_name}' ---"
7979 | )
7980 | logger.debug(f"Step {step_num} Args: {args}")
7981 |
7982 | # Validate tool exists
7983 | if tool_name not in _AVAILABLE_TOOLS:
7984 | error_msg = f"Planner selected unknown tool '{tool_name}'."
7985 | step_log["error"] = error_msg
7986 | logger.error(error_msg)
7987 | current_plan = [] # Stop execution if tool is unknown
7988 | else:
7989 | # --- Tool Lookup and Execution ---
7990 | method_name = _AVAILABLE_TOOLS[tool_name][0] # Get function name string
7991 | # Look up the actual function object
7992 | tool_func = globals().get(method_name) # Check current module globals first
7993 | if not tool_func or not callable(tool_func):
7994 | # Try external tool lookups if not found locally
7995 | tool_func = _get_filesystem_tool(method_name) or _get_completion_tool(
7996 | method_name
7997 | )
7998 |
7999 | if not tool_func or not callable(tool_func):
8000 | # Tool function implementation not found
8001 | error_msg = f"Internal error: Could not find function implementation for tool '{tool_name}' (expected function: '{method_name}')."
8002 | step_log["error"] = error_msg
8003 | logger.error(error_msg)
8004 | current_plan = [] # Stop execution
8005 | else:
8006 | # --- Execute the Found Tool Function ---
8007 | try:
8008 | await _log(
8009 | "autopilot_step_start", step=step_num, tool=tool_name, args=args
8010 | )
8011 | _update_activity() # Update activity before long tool call
8012 | # Call the standalone tool function with its arguments
8013 | outcome = await tool_func(**args)
8014 | _update_activity() # Update activity after tool call returns
8015 |
8016 | # Record outcome in step log
8017 | step_log["success"] = outcome.get("success", False)
8018 | step_log["result"] = outcome # Store the full result dict
8019 |
8020 | # --- Plan for Next Step (or Replan on Failure) ---
8021 | if step_log["success"]:
8022 | await _log(
8023 | "autopilot_step_success",
8024 | step=step_num,
8025 | tool=tool_name,
8026 | result_summary=str(outcome)[:200],
8027 | )
8028 | logger.info(
8029 | f"Autopilot Step {step_num} ({tool_name}) completed successfully."
8030 | )
8031 | # Remove completed step and plan next based on success
8032 | current_plan.pop(0) # Remove executed step
8033 | if current_plan: # If plan wasn't just one step
8034 | logger.debug("Plan has remaining steps, continuing...")
8035 | elif not current_plan: # Plan is now empty after successful step
8036 | logger.info(
8037 | "Autopilot: Attempting to generate next plan step..."
8038 | )
8039 | try:
8040 | current_plan = await _plan_autopilot(
8041 | current_task_description, all_results + [step_log]
8042 | )
8043 | plan_count = len(current_plan)
8044 | logger.info(f"Generated next plan ({plan_count} step(s)).")
8045 | await _log(
8046 | "autopilot_replan_success",
8047 | reason="step_complete",
8048 | new_steps=plan_count,
8049 | )
8050 | except Exception as replan_err:
8051 | logger.error(
8052 | f"Autopilot replanning after step success failed: {replan_err}"
8053 | )
8054 | await _log(
8055 | "autopilot_replan_fail",
8056 | reason="step_complete",
8057 | error=str(replan_err),
8058 | )
8059 | current_plan = [] # Stop if replanning fails
8060 | else:
8061 | # Step failed
8062 | step_log["error"] = outcome.get(
8063 | "error", f"Tool '{tool_name}' failed without specific error."
8064 | )
8065 | await _log(
8066 | "autopilot_step_fail",
8067 | step=step_num,
8068 | tool=tool_name,
8069 | error=step_log["error"],
8070 | )
8071 | logger.warning(
8072 | f"Autopilot Step {step_num} ({tool_name}) failed: {step_log['error']}"
8073 | )
8074 | logger.info(f"Attempting replan after failed step {step_num}...")
8075 | try:
8076 | # Replan based on the failure
8077 | new_plan_tail = await _plan_autopilot(
8078 | current_task_description, all_results + [step_log]
8079 | )
8080 | current_plan = new_plan_tail # Replace old plan with new one
8081 | plan_count = len(current_plan)
8082 | logger.info(
8083 | f"Replanning successful after failure. New plan has {plan_count} step(s)."
8084 | )
8085 | await _log(
8086 | "autopilot_replan_success",
8087 | reason="step_fail",
8088 | new_steps=plan_count,
8089 | )
8090 | except Exception as replan_err:
8091 | logger.error(
8092 | f"Autopilot replanning after step failure failed: {replan_err}"
8093 | )
8094 | await _log(
8095 | "autopilot_replan_fail",
8096 | reason="step_fail",
8097 | error=str(replan_err),
8098 | )
8099 | current_plan = [] # Stop if replanning fails
8100 |
8101 | except (
8102 | ToolInputError,
8103 | ToolError,
8104 | ValueError,
8105 | TypeError,
8106 | AssertionError,
8107 | ) as e:
8108 | # Catch errors raised *during* tool execution (e.g., arguments that passed validation but still failed inside the tool)
8109 | error_msg = f"{type(e).__name__} executing '{tool_name}': {e}"
8110 | step_log["error"] = error_msg
8111 | step_log["success"] = False
8112 | logger.error(
8113 | f"Autopilot Step {step_num} ({tool_name}) execution failed: {error_msg}",
8114 | exc_info=True,
8115 | )
8116 | current_plan = [] # Stop execution on tool error
8117 | except Exception as e:
8118 | # Catch unexpected errors during tool execution
8119 | error_msg = f"Unexpected error executing '{tool_name}': {e}"
8120 | step_log["error"] = error_msg
8121 | step_log["success"] = False
8122 | logger.critical(
8123 | f"Autopilot Step {step_num} ({tool_name}) failed unexpectedly: {error_msg}",
8124 | exc_info=True,
8125 | )
8126 | current_plan = [] # Stop execution
8127 |
8128 | # Append the result of this step to the overall results
8129 | all_results.append(step_log)
8130 | # --- Log Step Result to File ---
8131 | if log_path:
8132 | try:
8133 | log_entry = (
8134 | json.dumps(step_log, default=str) + "\n"
8135 | ) # Use default=str for non-serializable types
8136 | async with aiofiles.open(log_path, "a", encoding="utf-8") as log_f:
8137 | await log_f.write(log_entry)
8138 | except IOError as log_e:
8139 | logger.error(f"Failed to write autopilot step log to {log_path}: {log_e}")
8140 | except Exception as json_e:
8141 | logger.error(f"Failed to serialize step log for writing: {json_e}")
8142 |
8143 | # --- Loop End Conditions ---
8144 | if step_num >= max_steps:
8145 | logger.warning(f"Autopilot stopped: Reached maximum step limit ({max_steps}).")
8146 | await _log("autopilot_max_steps", task=task, steps=step_num)
8147 | elif not current_plan and step_num > 0:
8148 | # Plan became empty (either task finished or replan failed/returned empty)
8149 | final_step_success = all_results[-1].get("success", False) if all_results else False
8150 | if final_step_success:
8151 | logger.info(f"Autopilot plan complete after {step_num} steps.")
8152 | await _log("autopilot_plan_end", task=task, steps=step_num, status="completed")
8153 | else:
8154 | logger.warning(
8155 | f"Autopilot stopped after {step_num} steps due to a failure or an inability to plan the next step."
8156 | )
8157 | await _log(
8158 | "autopilot_plan_end", task=task, steps=step_num, status="failed_or_stuck"
8159 | )
8160 | elif step_num == 0:
8161 | # Initial plan was empty
8162 | logger.warning("Autopilot: Initial plan was empty. No steps executed.")
8163 | await _log("autopilot_plan_end", task=task, steps=0, status="no_plan")
8164 |
8165 | # Determine overall success based on the success of the *last* executed step
8166 | overall_success = bool(all_results) and all_results[-1].get("success", False)
8167 | # Return final summary
8168 | return {
8169 | "success": overall_success,
8170 | "steps_executed": step_num,
8171 | "run_log": str(log_path) if log_path else None,
8172 | "final_results": all_results[-3:], # Return the last three step logs as a summary
8173 | }
8174 | except Exception as autopilot_err:
8175 | # Catch critical errors during planning or loop setup
8176 | logger.critical(
8177 | f"Autopilot run failed critically before or during execution loop: {autopilot_err}",
8178 | exc_info=True,
8179 | )
8180 | await _log("autopilot_critical_error", task=task, error=str(autopilot_err))
8181 | # Log error to file if possible
8182 | error_entry = {
8183 | "step": 0,
8184 | "success": False,
8185 | "error": f"Autopilot critical failure: {autopilot_err}",
8186 | }
8187 | if log_path:
8188 | try:
8189 | log_entry = json.dumps(error_entry, default=str) + "\n"
8190 | async with aiofiles.open(log_path, "a", encoding="utf-8") as log_f:
8191 | await log_f.write(log_entry)
8192 | except Exception as final_log_e:
8193 | logger.error(
8194 | f"Failed to write final critical error log to {log_path}: {final_log_e}"
8195 | )
8196 | # Raise ToolError to indicate autopilot failure
8197 | raise ToolError(f"Autopilot failed critically: {autopilot_err}") from autopilot_err
8198 |
8199 | # --- Run with Timeout ---
8200 | try:
8201 | result = await asyncio.wait_for(autopilot_inner(), timeout=timeout_seconds)
8202 | return result
8203 | except asyncio.TimeoutError:
8204 | error_msg = f"Autopilot execution timed out after {timeout_seconds}s."
8205 | logger.error(error_msg)
8206 | await _log("autopilot_timeout", task=task, timeout=timeout_seconds)
8207 | # Log timeout to file if possible
8208 | if log_path:
8209 | try:
8210 | timeout_entry = {"step": -1, "success": False, "error": error_msg}
8211 | log_entry = json.dumps(timeout_entry, default=str) + "\n"
8212 | async with aiofiles.open(log_path, "a", encoding="utf-8") as log_f:
8213 | await log_f.write(log_entry)
8214 | except Exception as timeout_log_e:
8215 | logger.error(f"Failed to write timeout log entry to {log_path}: {timeout_log_e}")
8216 | # Return timeout failure
8217 | return {
8218 | "success": False,
8219 | "error": error_msg,
8220 | "steps_executed": -1, # Indicate timeout before completion
8221 | "run_log": str(log_path) if log_path else None,
8222 | "final_results": [], # No final results available on timeout
8223 | }
8224 |
```
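
The autopilot tail above combines two reusable patterns: appending one JSON object per step to a JSONL run log via `aiofiles`, and wrapping the whole inner coroutine in `asyncio.wait_for` so a timeout comes back as a structured failure dict rather than an unhandled exception. The following is a minimal, self-contained sketch of those two patterns only; `append_jsonl` and `run_with_timeout` are hypothetical names invented for illustration and are not part of this repository's API.

```python
# Minimal sketch (hypothetical helpers, not the project's actual API): shows the
# same two patterns the autopilot uses above -- a JSONL step log appended with
# aiofiles, and an inner coroutine wrapped in asyncio.wait_for so a timeout is
# reported as a structured failure dict instead of propagating an exception.
import asyncio
import json
from pathlib import Path

import aiofiles  # same async file library the autopilot uses for its run log


async def append_jsonl(log_path: Path, entry: dict) -> None:
    """Append one JSON object per line; default=str handles non-serializable values."""
    line = json.dumps(entry, default=str) + "\n"
    async with aiofiles.open(log_path, "a", encoding="utf-8") as f:
        await f.write(line)


async def run_with_timeout(task: str, timeout_seconds: float, log_path: Path) -> dict:
    async def inner() -> dict:
        # Stand-in for the plan/execute/replan loop; here it just logs one step.
        step_log = {"step": 1, "tool": "noop", "task": task, "success": True}
        await append_jsonl(log_path, step_log)
        return {"success": True, "steps_executed": 1, "run_log": str(log_path)}

    try:
        return await asyncio.wait_for(inner(), timeout=timeout_seconds)
    except asyncio.TimeoutError:
        error_msg = f"Execution timed out after {timeout_seconds}s."
        await append_jsonl(log_path, {"step": -1, "success": False, "error": error_msg})
        return {
            "success": False,
            "error": error_msg,
            "steps_executed": -1,  # mirrors the sentinel the autopilot returns on timeout
            "run_log": str(log_path),
        }


if __name__ == "__main__":
    print(asyncio.run(run_with_timeout("demo task", 5.0, Path("autopilot_run.jsonl"))))
```

The design choice mirrored here is that both paths (success and timeout) return the same dict shape, so callers of the tool never need to distinguish "raised" from "failed"; the `-1` step sentinel and the JSONL log entry keep the timeout visible in the run log as well.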