This is page 45 of 45. Use http://codebase.md/dicklesworthstone/llm_gateway_mcp_server?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .cursorignore
├── .env.example
├── .envrc
├── .gitignore
├── additional_features.md
├── check_api_keys.py
├── completion_support.py
├── comprehensive_test.py
├── docker-compose.yml
├── Dockerfile
├── empirically_measured_model_speeds.json
├── error_handling.py
├── example_structured_tool.py
├── examples
│ ├── __init__.py
│ ├── advanced_agent_flows_using_unified_memory_system_demo.py
│ ├── advanced_extraction_demo.py
│ ├── advanced_unified_memory_system_demo.py
│ ├── advanced_vector_search_demo.py
│ ├── analytics_reporting_demo.py
│ ├── audio_transcription_demo.py
│ ├── basic_completion_demo.py
│ ├── cache_demo.py
│ ├── claude_integration_demo.py
│ ├── compare_synthesize_demo.py
│ ├── cost_optimization.py
│ ├── data
│ │ ├── sample_event.txt
│ │ ├── Steve_Jobs_Introducing_The_iPhone_compressed.md
│ │ └── Steve_Jobs_Introducing_The_iPhone_compressed.mp3
│ ├── docstring_refiner_demo.py
│ ├── document_conversion_and_processing_demo.py
│ ├── entity_relation_graph_demo.py
│ ├── filesystem_operations_demo.py
│ ├── grok_integration_demo.py
│ ├── local_text_tools_demo.py
│ ├── marqo_fused_search_demo.py
│ ├── measure_model_speeds.py
│ ├── meta_api_demo.py
│ ├── multi_provider_demo.py
│ ├── ollama_integration_demo.py
│ ├── prompt_templates_demo.py
│ ├── python_sandbox_demo.py
│ ├── rag_example.py
│ ├── research_workflow_demo.py
│ ├── sample
│ │ ├── article.txt
│ │ ├── backprop_paper.pdf
│ │ ├── buffett.pdf
│ │ ├── contract_link.txt
│ │ ├── legal_contract.txt
│ │ ├── medical_case.txt
│ │ ├── northwind.db
│ │ ├── research_paper.txt
│ │ ├── sample_data.json
│ │ └── text_classification_samples
│ │   ├── email_classification.txt
│ │   ├── news_samples.txt
│ │   ├── product_reviews.txt
│ │   └── support_tickets.txt
│ ├── sample_docs
│ │ └── downloaded
│ │   └── attention_is_all_you_need.pdf
│ ├── sentiment_analysis_demo.py
│ ├── simple_completion_demo.py
│ ├── single_shot_synthesis_demo.py
│ ├── smart_browser_demo.py
│ ├── sql_database_demo.py
│ ├── sse_client_demo.py
│ ├── test_code_extraction.py
│ ├── test_content_detection.py
│ ├── test_ollama.py
│ ├── text_classification_demo.py
│ ├── text_redline_demo.py
│ ├── tool_composition_examples.py
│ ├── tournament_code_demo.py
│ ├── tournament_text_demo.py
│ ├── unified_memory_system_demo.py
│ ├── vector_search_demo.py
│ ├── web_automation_instruction_packs.py
│ └── workflow_delegation_demo.py
├── LICENSE
├── list_models.py
├── marqo_index_config.json.example
├── mcp_protocol_schema_2025-03-25_version.json
├── mcp_python_lib_docs.md
├── mcp_tool_context_estimator.py
├── model_preferences.py
├── pyproject.toml
├── quick_test.py
├── README.md
├── resource_annotations.py
├── run_all_demo_scripts_and_check_for_errors.py
├── storage
│ └── smart_browser_internal
│   ├── locator_cache.db
│   ├── readability.js
│   └── storage_state.enc
├── test_client.py
├── test_connection.py
├── TEST_README.md
├── test_sse_client.py
├── test_stdio_client.py
├── tests
│ ├── __init__.py
│ ├── conftest.py
│ ├── integration
│ │ ├── __init__.py
│ │ └── test_server.py
│ ├── manual
│ │ ├── test_extraction_advanced.py
│ │ └── test_extraction.py
│ └── unit
│   ├── __init__.py
│   ├── test_cache.py
│   ├── test_providers.py
│   └── test_tools.py
├── TODO.md
├── tool_annotations.py
├── tools_list.json
├── ultimate_mcp_banner.webp
├── ultimate_mcp_logo.webp
├── ultimate_mcp_server
│ ├── __init__.py
│ ├── __main__.py
│ ├── cli
│ │ ├── __init__.py
│ │ ├── __main__.py
│ │ ├── commands.py
│ │ ├── helpers.py
│ │ └── typer_cli.py
│ ├── clients
│ │ ├── __init__.py
│ │ ├── completion_client.py
│ │ └── rag_client.py
│ ├── config
│ │ └── examples
│ │   └── filesystem_config.yaml
│ ├── config.py
│ ├── constants.py
│ ├── core
│ │ ├── __init__.py
│ │ ├── evaluation
│ │ │ ├── base.py
│ │ │ └── evaluators.py
│ │ ├── providers
│ │ │ ├── __init__.py
│ │ │ ├── anthropic.py
│ │ │ ├── base.py
│ │ │ ├── deepseek.py
│ │ │ ├── gemini.py
│ │ │ ├── grok.py
│ │ │ ├── ollama.py
│ │ │ ├── openai.py
│ │ │ └── openrouter.py
│ │ ├── server.py
│ │ ├── state_store.py
│ │ ├── tournaments
│ │ │ ├── manager.py
│ │ │ ├── tasks.py
│ │ │ └── utils.py
│ │ └── ums_api
│ │   ├── __init__.py
│ │   ├── ums_database.py
│ │   ├── ums_endpoints.py
│ │   ├── ums_models.py
│ │   └── ums_services.py
│ ├── exceptions.py
│ ├── graceful_shutdown.py
│ ├── services
│ │ ├── __init__.py
│ │ ├── analytics
│ │ │ ├── __init__.py
│ │ │ ├── metrics.py
│ │ │ └── reporting.py
│ │ ├── cache
│ │ │ ├── __init__.py
│ │ │ ├── cache_service.py
│ │ │ ├── persistence.py
│ │ │ ├── strategies.py
│ │ │ └── utils.py
│ │ ├── cache.py
│ │ ├── document.py
│ │ ├── knowledge_base
│ │ │ ├── __init__.py
│ │ │ ├── feedback.py
│ │ │ ├── manager.py
│ │ │ ├── rag_engine.py
│ │ │ ├── retriever.py
│ │ │ └── utils.py
│ │ ├── prompts
│ │ │ ├── __init__.py
│ │ │ ├── repository.py
│ │ │ └── templates.py
│ │ ├── prompts.py
│ │ └── vector
│ │   ├── __init__.py
│ │   ├── embeddings.py
│ │   └── vector_service.py
│ ├── tool_token_counter.py
│ ├── tools
│ │ ├── __init__.py
│ │ ├── audio_transcription.py
│ │ ├── base.py
│ │ ├── completion.py
│ │ ├── docstring_refiner.py
│ │ ├── document_conversion_and_processing.py
│ │ ├── enhanced-ums-lookbook.html
│ │ ├── entity_relation_graph.py
│ │ ├── excel_spreadsheet_automation.py
│ │ ├── extraction.py
│ │ ├── filesystem.py
│ │ ├── html_to_markdown.py
│ │ ├── local_text_tools.py
│ │ ├── marqo_fused_search.py
│ │ ├── meta_api_tool.py
│ │ ├── ocr_tools.py
│ │ ├── optimization.py
│ │ ├── provider.py
│ │ ├── pyodide_boot_template.html
│ │ ├── python_sandbox.py
│ │ ├── rag.py
│ │ ├── redline-compiled.css
│ │ ├── sentiment_analysis.py
│ │ ├── single_shot_synthesis.py
│ │ ├── smart_browser.py
│ │ ├── sql_databases.py
│ │ ├── text_classification.py
│ │ ├── text_redline_tools.py
│ │ ├── tournament.py
│ │ ├── ums_explorer.html
│ │ └── unified_memory_system.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── async_utils.py
│ │ ├── display.py
│ │ ├── logging
│ │ │ ├── __init__.py
│ │ │ ├── console.py
│ │ │ ├── emojis.py
│ │ │ ├── formatter.py
│ │ │ ├── logger.py
│ │ │ ├── panels.py
│ │ │ ├── progress.py
│ │ │ └── themes.py
│ │ ├── parse_yaml.py
│ │ ├── parsing.py
│ │ ├── security.py
│ │ └── text.py
│ └── working_memory_api.py
├── unified_memory_system_technical_analysis.md
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/ultimate_mcp_server/tools/smart_browser.py:
--------------------------------------------------------------------------------
```python
1 | # ultimate_mcp_server/tools/smart_browser.py
2 | """
3 | Smart Browser - Standalone Playwright-powered web automation tools for Ultimate MCP Server.
4 |
5 | Provides enterprise-grade web automation with comprehensive features for scraping,
6 | testing, and browser automation tasks with built-in security, resilience, and ML capabilities.
7 |
8 | Refactored into standalone functions for compatibility with the MCP tool registration system.
9 | State and lifecycle are managed via global variables and explicit init/shutdown calls.
10 | """
11 |
12 | # Python Standard Library Imports
13 | import asyncio
14 | import atexit
15 | import base64
16 | import concurrent.futures
17 | import difflib
18 | import functools
19 | import hashlib
20 | import json
21 | import os
22 | import random
23 | import re
24 | import signal
25 | import sqlite3
26 | import subprocess
27 | import textwrap
28 | import threading
29 | import time
30 | import unicodedata
31 | import urllib.parse
32 |
33 | # Python Standard Library Type Hinting and Collections Imports
34 | from collections import deque
35 | from contextlib import asynccontextmanager, closing
36 | from datetime import datetime, timezone
37 | from pathlib import Path
38 | from typing import Any, Awaitable, Callable, Dict, List, Optional, Sequence, Set, Tuple, Union
39 | from urllib.parse import urlparse
40 |
41 | # Third-Party Library Imports
42 | import aiofiles
43 | import httpx
44 | from bs4 import BeautifulSoup
45 | from cryptography.exceptions import InvalidTag
46 | from cryptography.hazmat.primitives.ciphers.aead import AESGCM
47 | from playwright._impl._errors import Error as PlaywrightException
48 | from playwright._impl._errors import TimeoutError as PlaywrightTimeoutError
49 | from playwright.async_api import Browser, BrowserContext, Locator, Page, async_playwright
50 |
51 | # First-Party Library Imports (MCP Specific)
52 | from ultimate_mcp_server.config import SmartBrowserConfig, get_config
53 |
54 | # Assuming these are available and work standalone
55 | from ultimate_mcp_server.constants import Provider
56 | from ultimate_mcp_server.core.providers.base import get_provider, parse_model_string
57 | from ultimate_mcp_server.exceptions import ProviderError, ToolError, ToolInputError
58 |
59 | # Import STANDALONE filesystem and completion tools
60 | from ultimate_mcp_server.tools.completion import chat_completion
61 | from ultimate_mcp_server.tools.filesystem import (
62 | create_directory,
63 | get_unique_filepath,
64 | read_file,
65 | write_file,
66 | )
67 | from ultimate_mcp_server.utils import get_logger
68 |
69 | # For loop binding and forked process detection
70 | _pid = os.getpid()
71 |
72 | # --- Global Logger ---
73 | logger = get_logger("ultimate_mcp_server.tools.smart_browser")
74 |
75 | # --- Load External Tools Dynamically (Best Effort) ---
76 | # This allows using tools defined later without circular imports at top level
77 | # We'll look them up by name when needed in autopilot.
78 | _filesystem_tools_module = None
79 | _completion_tools_module = None
80 |
81 |
82 | def _get_filesystem_tool(name):
83 | global _filesystem_tools_module
84 | if _filesystem_tools_module is None:
85 | import ultimate_mcp_server.tools.filesystem as fs
86 |
87 | _filesystem_tools_module = fs
88 | tool_func = getattr(_filesystem_tools_module, name, None)
89 | return tool_func
90 |
91 |
92 | def _get_completion_tool(name):
93 | global _completion_tools_module
94 | if _completion_tools_module is None:
95 | import ultimate_mcp_server.tools.completion as cm
96 |
97 | _completion_tools_module = cm
98 | tool_func = getattr(_completion_tools_module, name, None)
99 | return tool_func
100 |
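# --- Illustrative sketch (not part of the original file) ---
# The lazy lookups above let later code (e.g. autopilot) resolve other MCP
# tools by name without creating circular imports at module load time.
# Hypothetical usage (the keyword arguments are assumptions, not the real
# filesystem tool signature):
#
#     write_file_tool = _get_filesystem_tool("write_file")
#     if write_file_tool:
#         await write_file_tool(path="report.md", content=report_text)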
101 |
102 | # --- Global Configuration Variables ---
103 | # (These will be populated by _ensure_initialized)
104 | _sb_state_key_b64_global: Optional[str] = None
105 | _sb_max_tabs_global: int = 5
106 | _sb_tab_timeout_global: int = 300
107 | _sb_inactivity_timeout_global: int = 600
108 | _headless_mode_global: bool = True
109 | _vnc_enabled_global: bool = False
110 | _vnc_password_global: Optional[str] = None
111 | _proxy_pool_str_global: str = ""
112 | _proxy_allowed_domains_str_global: str = "*"
113 | _vault_allowed_paths_str_global: str = "secret/data/,kv/data/"
114 | _max_widgets_global: int = 300
115 | _max_section_chars_global: int = 5000
116 | _dom_fp_limit_global: int = 20000
117 | _llm_model_locator_global: str = "openai/gpt-4.1-mini" # Updated default
118 | _retry_after_fail_global: int = 1
119 | _seq_cutoff_global: float = 0.72
120 | _area_min_global: int = 400
121 | _high_risk_domains_set_global: Set[str] = set()
122 | _SB_INTERNAL_BASE_PATH_STR: Optional[str] = None
123 | _STATE_FILE: Optional[Path] = None
124 | _LOG_FILE: Optional[Path] = None
125 | _CACHE_DB: Optional[Path] = None
126 | _READ_JS_CACHE: Optional[Path] = None
127 | _PROXY_CONFIG_DICT: Optional[Dict[str, Any]] = None
128 | _PROXY_ALLOWED_DOMAINS_LIST: Optional[List[str]] = None
129 | _ALLOWED_VAULT_PATHS: Set[str] = set()
130 |
131 | # --- Global State Variables ---
132 | _pw: Optional[async_playwright] = None
133 | _browser: Optional[Browser] = None
134 | _ctx: Optional[BrowserContext] = None # Shared context
135 | _vnc_proc: Optional[subprocess.Popen] = None
136 | _last_hash: str | None = None
137 | _js_lib_cached: Set[str] = set()
138 | _db_connection: sqlite3.Connection | None = None
139 | _locator_cache_cleanup_task_handle: Optional[asyncio.Task] = None
140 | _inactivity_monitor_task_handle: Optional[asyncio.Task] = None # New handle for monitor task
141 | _last_activity: float = 0.0 # Global last activity timestamp
142 |
143 | # --- Locks ---
144 | _init_lock = asyncio.Lock()
145 | _playwright_lock = asyncio.Lock()
146 | _js_lib_lock = asyncio.Lock()
147 | _audit_log_lock = asyncio.Lock()
148 | _db_conn_pool_lock = threading.RLock() # Keep RLock for sync DB access from async context
149 | _shutdown_lock = asyncio.Lock()
150 |
151 | # --- Flags ---
152 | _is_initialized = False
153 | _shutdown_initiated = False
154 |
155 | # --- Thread Pool ---
156 | _cpu_count = os.cpu_count() or 1
157 | _thread_pool = concurrent.futures.ThreadPoolExecutor(
158 | max_workers=min(32, _cpu_count * 2 + 4), thread_name_prefix="sb_worker"
159 | )
160 |
161 | # --- Helper Functions ---
162 |
163 |
164 | def _update_activity():
165 | """Updates the global activity timestamp. Should be called by user-facing tool functions."""
166 | global _last_activity
167 | now = time.monotonic()
168 | logger.debug(f"Updating last activity timestamp to {now}")
169 | _last_activity = now
170 |
171 |
172 | def _get_pool(): # Keep as is
173 | global _thread_pool, _pid
174 | if _pid != os.getpid():
175 | _thread_pool.shutdown(wait=False)
176 | pool_max_workers = min(32, _sb_max_tabs_global * 2)
177 | _thread_pool = concurrent.futures.ThreadPoolExecutor(
178 | max_workers=pool_max_workers, thread_name_prefix="sb_worker"
179 | )
180 | _pid = os.getpid()
181 | return _thread_pool
182 |
183 |
184 | # --- Encryption ---
185 | CIPHER_VERSION = b"SB1"
186 | AAD_TAG = b"smart-browser-state-v1"
187 |
188 |
189 | def _key() -> bytes | None: # Uses global _sb_state_key_b64_global
190 | """Get AES-GCM key from the globally set config value."""
191 | if not _sb_state_key_b64_global:
192 | return None
193 | try:
194 | decoded = base64.b64decode(_sb_state_key_b64_global)
195 | key_length = len(decoded)
196 | if key_length not in (16, 24, 32):
197 | logger.warning(f"Invalid SB State Key length: {key_length} bytes. Need 16, 24, or 32.")
198 | return None
199 | return decoded
200 | except (ValueError, TypeError) as e:
201 | logger.warning(f"Invalid base64 SB State Key: {e}")
202 | return None
203 |
204 |
205 | def _enc(buf: bytes) -> bytes: # Uses global _key
206 | """Encrypt data using AES-GCM with AAD if key is set."""
207 | k = _key()
208 | if not k:
209 | logger.debug("SB_STATE_KEY not set. Skipping encryption for state.")
210 | return buf
211 | try:
212 | nonce = os.urandom(12)
213 | cipher = AESGCM(k)
214 | encrypted_data = cipher.encrypt(nonce, buf, AAD_TAG)
215 | result = CIPHER_VERSION + nonce + encrypted_data
216 | return result
217 | except Exception as e:
218 | logger.error(f"Encryption failed: {e}", exc_info=True)
219 | raise RuntimeError(f"Encryption failed: {e}") from e
220 |
221 |
222 | def _dec(buf: bytes) -> bytes | None: # Uses global _key, _STATE_FILE
223 | """Decrypt data using AES-GCM with AAD if key is set and buffer looks encrypted."""
224 | k = _key()
225 | if not k:
226 | logger.debug("SB_STATE_KEY not set. Assuming state is unencrypted.")
227 | try:
228 | stripped_buf = buf.strip()
229 | if stripped_buf.startswith(b"{") or stripped_buf.startswith(b"["):
230 | return buf
231 | else:
232 | logger.warning("Unencrypted state file doesn't look like JSON. Ignoring.")
233 | return None
234 | except Exception:
235 | logger.warning("Error checking unencrypted state file format. Ignoring.")
236 | return None
237 |
238 | if not buf.startswith(CIPHER_VERSION):
239 | logger.warning(
240 | "State file exists but lacks expected encryption header. Treating as legacy/invalid."
241 | )
242 | if _STATE_FILE and _STATE_FILE.exists():
243 | try:
244 | _STATE_FILE.unlink()
245 | except Exception:
246 | pass
247 | return None
248 |
249 | hdr_len = len(CIPHER_VERSION)
250 | nonce_len = 12
251 | min_len = hdr_len + nonce_len + 1 # Header + Nonce + Tag(at least 1 byte)
252 | if len(buf) < min_len:
253 | logger.error("State file too short to be valid encrypted data")
254 | return None
255 |
256 | _hdr_start = 0
257 | _hdr_end = hdr_len
258 | _nonce_start = _hdr_end
259 | _nonce_end = _hdr_end + nonce_len
260 | _ciphertext_start = _nonce_end
261 |
262 | _HDR = buf[_hdr_start:_hdr_end]
263 | nonce = buf[_nonce_start:_nonce_end]
264 | ciphertext = buf[_ciphertext_start:]
265 |
266 | try:
267 | cipher = AESGCM(k)
268 | decrypted_data = cipher.decrypt(nonce, ciphertext, AAD_TAG)
269 | return decrypted_data
270 | except InvalidTag:
271 | logger.error("Decryption failed: Invalid tag (tampered/wrong key?)")
272 | if _STATE_FILE and _STATE_FILE.exists():
273 | try:
274 | _STATE_FILE.unlink()
275 | except Exception:
276 | pass
277 | raise RuntimeError("State-file authentication failed (InvalidTag)") from None
278 | except Exception as e:
279 | logger.error(f"Decryption failed: {e}.", exc_info=True)
280 | if _STATE_FILE and _STATE_FILE.exists():
281 | try:
282 | _STATE_FILE.unlink()
283 | except Exception:
284 | pass
285 | return None
286 |
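# --- Illustrative sketch (not part of the original file) ---
# With a key configured, the state blob produced by _enc() is laid out as
# CIPHER_VERSION (b"SB1") + 12-byte nonce + AES-GCM ciphertext authenticated
# with AAD_TAG. A round trip (assuming a base64-encoded 32-byte key) looks like:
#
#     _sb_state_key_b64_global = base64.b64encode(os.urandom(32)).decode()
#     blob = _enc(b'{"cookies": []}')
#     assert blob.startswith(CIPHER_VERSION)
#     assert _dec(blob) == b'{"cookies": []}'
#
# Without a key, _enc() returns the buffer unchanged and _dec() only accepts
# data that already looks like JSON.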
287 |
288 | # --- Locator Cache DB ---
289 | def _get_db_connection() -> sqlite3.Connection: # Uses global _db_connection, _CACHE_DB
290 | """Get or create the single shared SQLite connection."""
291 | global _db_connection
292 | with _db_conn_pool_lock:
293 | if _db_connection is None:
294 | if _CACHE_DB is None:
295 | raise RuntimeError("Database path (_CACHE_DB) not initialized before DB access.")
296 | try:
297 | conn = sqlite3.connect(
298 | _CACHE_DB,
299 | check_same_thread=False,
300 | isolation_level=None,
301 | timeout=10,
302 | )
303 | conn.execute("PRAGMA journal_mode=WAL")
304 | conn.execute("PRAGMA synchronous=FULL")
305 | conn.execute("PRAGMA busy_timeout = 10000")
306 | _db_connection = conn
307 | logger.info(f"Initialized SQLite DB connection to {_CACHE_DB}")
308 | except sqlite3.Error as e:
309 | logger.critical(
310 | f"Failed to connect/init SQLite DB at {_CACHE_DB}: {e}", exc_info=True
311 | )
312 | raise RuntimeError(f"Failed to initialize database: {e}") from e
313 | return _db_connection
314 |
315 |
316 | def _close_db_connection(): # Uses global _db_connection
317 | """Close the SQLite connection."""
318 | global _db_connection
319 | with _db_conn_pool_lock:
320 | if _db_connection is not None:
321 | conn_to_close = _db_connection
322 | _db_connection = None # Set to None first
323 | try:
324 | conn_to_close.execute("PRAGMA wal_checkpoint(TRUNCATE);")
325 | except sqlite3.Error as e:
326 | logger.warning(f"Error during WAL checkpoint before closing DB: {e}")
327 | try:
328 | conn_to_close.close()
329 | logger.info("Closed SQLite DB connection.")
330 | except sqlite3.Error as e:
331 | logger.error(f"Error closing SQLite DB connection: {e}")
332 |
333 |
334 | atexit.register(_close_db_connection) # Keep atexit hook
335 |
336 |
337 | def _init_locator_cache_db_sync(): # Uses global _CACHE_DB
338 | """Synchronous DB schema initialization for the locator cache."""
339 | conn = None
340 | if _CACHE_DB is None:
341 | logger.error("Cannot initialize locator DB: Path not set.")
342 | return # Cannot proceed without path
343 | try:
344 | conn = _get_db_connection()
345 | with closing(conn.cursor()) as cursor:
346 | create_table_sql = """CREATE TABLE IF NOT EXISTS selector_cache(
347 | key TEXT,
348 | selector TEXT NOT NULL,
349 | dom_fp TEXT NOT NULL,
350 | hits INTEGER DEFAULT 1,
351 | created_ts INTEGER DEFAULT (strftime('%s', 'now')),
352 | last_hit INTEGER DEFAULT (strftime('%s', 'now')),
353 | PRIMARY KEY (key, dom_fp)
354 | );"""
355 | cursor.execute(create_table_sql)
356 | try:
357 | cursor.execute("SELECT last_hit FROM selector_cache LIMIT 1")
358 | except sqlite3.OperationalError:
359 | logger.info("Adding last_hit column to selector_cache table...")
360 | alter_table_sql = "ALTER TABLE selector_cache ADD COLUMN last_hit INTEGER DEFAULT(strftime('%s','now'))"
361 | cursor.execute(alter_table_sql)
362 | logger.info(f"Enhanced Locator cache DB schema initialized/verified at {_CACHE_DB}")
363 | except sqlite3.Error as e:
364 | logger.critical(f"Failed to initialize locator cache DB schema: {e}", exc_info=True)
365 | raise RuntimeError(f"Failed to initialize locator cache database: {e}") from e
366 | except RuntimeError as e: # Catch error from _get_db_connection if path is missing
367 | logger.critical(f"Failed to get DB connection for schema init: {e}")
368 | raise
369 |
370 |
371 | def _cache_put_sync(key: str, selector: str, dom_fp: str) -> None: # Uses global _get_db_connection
372 | """Synchronous write/update to the locator cache."""
373 | try:
374 | conn = _get_db_connection()
375 | insert_sql = """INSERT INTO selector_cache(key, selector, dom_fp, created_ts, last_hit)
376 | VALUES (?, ?, ?, strftime('%s', 'now'), strftime('%s', 'now'))
377 | ON CONFLICT(key, dom_fp) DO UPDATE SET
378 | hits = hits + 1,
379 | last_hit = strftime('%s', 'now')
380 | WHERE key = excluded.key AND dom_fp = excluded.dom_fp;"""
381 | params = (key, selector, dom_fp)
382 | conn.execute(insert_sql, params)
383 | except sqlite3.Error as e:
384 | key_prefix = key[:8]
385 | logger.error(f"Failed to write to locator cache (key prefix={key_prefix}...): {e}")
386 | except RuntimeError as e:
387 | logger.error(f"Failed to get DB connection for cache put: {e}")
388 |
389 |
390 | def _cache_delete_sync(key: str) -> None: # Uses global _get_db_connection
391 | """Synchronously delete an entry from the locator cache by key."""
392 | key_prefix = key[:8]
393 | try:
394 | conn = _get_db_connection()
395 | logger.debug(f"Deleting stale cache entry with key prefix: {key_prefix}...")
396 | delete_sql = "DELETE FROM selector_cache WHERE key = ?"
397 | params = (key,)
398 | cursor = conn.execute(delete_sql, params)
399 | if cursor.rowcount > 0:
400 | logger.debug(f"Successfully deleted stale cache entry {key_prefix}...")
401 | except sqlite3.Error as e:
402 | logger.error(f"Failed to delete stale cache entry (key prefix={key_prefix}...): {e}")
403 | except RuntimeError as e:
404 | logger.error(f"Failed to get DB connection for cache delete: {e}")
405 | except Exception as e:
406 | logger.error(
407 | f"Unexpected error deleting cache entry (key prefix={key_prefix}...): {e}",
408 | exc_info=True,
409 | )
410 |
411 |
412 | def _cache_get_sync(key: str, dom_fp: str) -> Optional[str]: # Uses global _get_db_connection
413 | """Synchronous read from cache, checking fingerprint. Deletes stale entries."""
414 | row = None
415 | try:
416 | conn = _get_db_connection()
417 | with closing(conn.cursor()) as cursor:
418 | select_sql = "SELECT selector FROM selector_cache WHERE key=? AND dom_fp=?"
419 | params_select = (key, dom_fp)
420 | cursor.execute(select_sql, params_select)
421 | row = cursor.fetchone()
422 | if row:
423 | update_sql = "UPDATE selector_cache SET last_hit = strftime('%s', 'now') WHERE key=? AND dom_fp=?"
424 | params_update = (key, dom_fp)
425 | conn.execute(update_sql, params_update)
426 | selector = row[0]
427 | return selector
428 | # If row not found with matching key and fp, check if key exists at all
429 | check_key_sql = "SELECT 1 FROM selector_cache WHERE key=? LIMIT 1"
430 | params_check = (key,)
431 | cursor.execute(check_key_sql, params_check)
432 | key_exists = cursor.fetchone()
433 | if key_exists:
434 | key_prefix = key[:8]
435 | logger.debug(
436 | f"Cache key '{key_prefix}...' found but DOM fingerprint mismatch. Deleting."
437 | )
438 | _cache_delete_sync(key)
439 | except sqlite3.Error as e:
440 | logger.error(f"Failed to read from locator cache (key={key}): {e}")
441 | except RuntimeError as e:
442 | logger.error(f"Failed to get DB connection for cache get: {e}")
443 | return None
444 |
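# --- Illustrative sketch (not part of the original file) ---
# Typical locator-cache flow, where `key` identifies a task/element hint and
# `dom_fp` fingerprints the current page DOM (both computed elsewhere in this
# module):
#
#     _cache_put_sync(key, "css=button#submit", dom_fp)
#     _cache_get_sync(key, dom_fp)        # -> "css=button#submit" (hit count bumped)
#     _cache_get_sync(key, other_dom_fp)  # -> None; the stale entry for `key` is deleted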
445 |
446 | # --- Locator Cache Cleanup ---
447 | def _cleanup_locator_cache_db_sync(
448 | retention_days: int = 90,
449 | ) -> int: # Uses global _get_db_connection
450 | """Synchronously removes old entries from the locator cache DB."""
451 | deleted_count = 0
452 | if retention_days <= 0:
453 | logger.info("Locator cache cleanup skipped (retention_days <= 0).")
454 | return 0
455 | try:
456 | conn = _get_db_connection()
457 | # Note: f-string for time modification is safe as retention_days is an int
458 | cutoff_time_sql = f"strftime('%s', 'now', '-{retention_days} days')"
459 | logger.info(
460 | f"Running locator cache cleanup: Removing entries older than {retention_days} days or with hits=0..."
461 | )
462 | with closing(conn.cursor()) as cursor:
463 | # Use placeholder for the time comparison to be safer if possible, but strftime makes it tricky
464 | # For this controlled use case, f-string is acceptable.
465 | delete_sql = (
466 | f"DELETE FROM selector_cache WHERE created_ts < ({cutoff_time_sql}) OR hits = 0"
467 | )
468 | cursor.execute(delete_sql)
469 | deleted_count = cursor.rowcount
470 | # Vacuum only if significant changes were made
471 | if deleted_count > 500:
472 | logger.info(f"Vacuuming locator cache DB after deleting {deleted_count} entries...")
473 | cursor.execute("VACUUM;")
474 | logger.info(f"Locator cache cleanup finished. Removed {deleted_count} old entries.")
475 | return deleted_count
476 | except sqlite3.Error as e:
477 | logger.error(f"Error during locator cache cleanup: {e}")
478 | return -1
479 | except RuntimeError as e:
480 | logger.error(f"Failed to get DB connection for cache cleanup: {e}")
481 | return -1
482 | except Exception as e:
483 | logger.error(f"Unexpected error during locator cache cleanup: {e}", exc_info=True)
484 | return -1
485 |
486 |
487 | async def _locator_cache_cleanup_task(
488 | interval_seconds: int = 24 * 60 * 60,
489 | ): # Uses global _get_pool
490 | """Background task to periodically run locator cache cleanup."""
491 | if interval_seconds <= 0:
492 | logger.info("Locator cache cleanup task disabled (interval <= 0).")
493 | return
494 | logger.info(f"Locator cache cleanup task started. Running every {interval_seconds} seconds.")
495 | # Initial delay before first run
496 | await asyncio.sleep(interval_seconds)
497 | while True:
498 | try:
499 | loop = asyncio.get_running_loop()
500 | pool = _get_pool()
501 | result_count = await loop.run_in_executor(pool, _cleanup_locator_cache_db_sync)
502 | if result_count < 0:
503 | logger.warning("Locator cache cleanup run encountered an error.")
504 | await asyncio.sleep(interval_seconds)
505 | except asyncio.CancelledError:
506 | logger.info("Locator cache cleanup task cancelled.")
507 | break
508 | except Exception as e:
509 | logger.error(f"Error in locator cache cleanup task loop: {e}", exc_info=True)
510 | # Wait longer after an error before retrying
511 | await asyncio.sleep(60 * 5)
512 |
513 |
514 | # --- Audit Log ---
515 | _salt = os.urandom(16)
516 |
517 |
518 | def _sanitize_for_log(obj: Any) -> Any: # Keep as is
519 | # ... (implementation largely unchanged, but split multi-line expressions) ...
520 | if isinstance(obj, str):
521 | try:
522 | # Remove control characters
523 | s = re.sub(r"[\x00-\x1f\x7f]", "", obj)
524 | # JSON encode to handle quotes, backslashes etc.
525 | encoded = json.dumps(s)
526 | # Remove the outer quotes added by json.dumps
527 | if len(encoded) >= 2:
528 | return encoded[1:-1]
529 | else:
530 | return ""
531 | except TypeError:
532 | return "???" # Should not happen for str, but safety first
533 | elif isinstance(obj, dict):
534 | # Recursively sanitize dictionary values
535 | new_dict = {}
536 | for k, v in obj.items():
537 | sanitized_v = _sanitize_for_log(v)
538 | str_k = str(k) # Ensure keys are strings
539 | new_dict[str_k] = sanitized_v
540 | return new_dict
541 | elif isinstance(obj, list):
542 | # Recursively sanitize list items
543 | new_list = []
544 | for item in obj:
545 | sanitized_item = _sanitize_for_log(item)
546 | new_list.append(sanitized_item)
547 | return new_list
548 | elif isinstance(obj, (int, float, bool, type(None))):
549 | # Allow simple types directly
550 | return obj
551 | else:
552 | # Attempt to stringify, sanitize, and encode other types
553 | try:
554 | s = str(obj)
555 | s = re.sub(r"[\x00-\x1f\x7f]", "", s)
556 | encoded = json.dumps(s)
557 | if len(encoded) >= 2:
558 | return encoded[1:-1]
559 | else:
560 | return ""
561 | except Exception:
562 | # Fallback for types that fail stringification/encoding
563 | return "???"
564 |
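# --- Illustrative sketch (not part of the original file) ---
# _sanitize_for_log strips control characters and JSON-escapes string values so
# each audit record stays a single valid JSON line. For example:
#
#     _sanitize_for_log({"query": 'hello\n"world"', 7: None})
#     # -> {"query": 'hello\\"world\\"', "7": None}  (newline dropped, key stringified)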
565 |
566 | _EVENT_EMOJI_MAP = { # Keep as is
567 | # ... (emoji map unchanged) ...
568 | "browser_start": "🚀",
569 | "browser_shutdown": "🛑",
570 | "browser_shutdown_complete": "🏁",
571 | "browser_context_create": "➕",
572 | "browser_incognito_context": "🕶️",
573 | "browser_context_close_shared": "➖",
574 | "browser_close": "🚪",
575 | "page_open": "📄",
576 | "page_close": "덮",
577 | "page_error": "🔥",
578 | "tab_timeout": "⏱️",
579 | "tab_cancelled": "🚫",
580 | "tab_error": "💥",
581 | "navigate": "➡️",
582 | "navigate_start": "➡️",
583 | "navigate_success": "✅",
584 | "navigate_fail_playwright": "❌",
585 | "navigate_fail_unexpected": "💣",
586 | "navigate_wait_selector_ok": "👌",
587 | "navigate_wait_selector_timeout": "⏳",
588 | "page_state_extracted": "ℹ️",
589 | "browse_fail_proxy_disallowed": "🛡️",
590 | "click": "🖱️",
591 | "click_success": "🖱️✅",
592 | "click_fail_notfound": "🖱️❓",
593 | "click_fail_playwright": "🖱️❌",
594 | "click_fail_unexpected": "🖱️💣",
595 | "type": "⌨️",
596 | "type_success": "⌨️✅",
597 | "type_fail_secret": "⌨️🔑",
598 | "type_fail_notfound": "⌨️❓",
599 | "type_fail_playwright": "⌨️❌",
600 | "type_fail_unexpected": "⌨️💣",
601 | "scroll": "↕️",
602 | "locator_cache_hit": "⚡",
603 | "locator_heuristic_match": "🧠",
604 | "locator_llm_pick": "🤖🎯",
605 | "locator_fail_all": "❓❓",
606 | "locator_text_fallback": "✍️",
607 | "locator_success": "🎯",
608 | "locator_fail": "❓",
609 | "download": "💾",
610 | "download_navigate": "🚚",
611 | "download_success": "💾✅",
612 | "download_fail_notfound": "💾❓",
613 | "download_fail_timeout": "💾⏱️",
614 | "download_fail_playwright": "💾❌",
615 | "download_fail_unexpected": "💾💣",
616 | "download_pdf_http": "📄💾",
617 | "download_direct_success": "✨💾",
618 | "download_pdf_error": "📄🔥",
619 | "download_site_pdfs_complete": "📚✅",
620 | "table_extract_success": "📊✅",
621 | "table_extract_error": "📊❌",
622 | "docs_collected_success": "📖✅",
623 | "docs_harvest": "📖",
624 | "search": "🔍",
625 | "search_start": "🔍➡️",
626 | "search_complete": "🔍✅",
627 | "search_captcha": "🤖",
628 | "search_no_results_selector": "🤷",
629 | "search_error_playwright": "🔍❌",
630 | "search_error_unexpected": "🔍💣",
631 | "macro_plan": "📝",
632 | "macro_plan_generated": "📝✅",
633 | "macro_plan_empty": "📝🤷",
634 | "macro_step_result": "▶️",
635 | "macro_complete": "🎉",
636 | "macro_finish_action": "🏁",
637 | "macro_error": "💥",
638 | "macro_exceeded_rounds": "🔄",
639 | "macro_fail_step": "❌",
640 | "macro_error_tool": "🛠️💥",
641 | "macro_error_unexpected": "💣💥",
642 | "macro_navigate": "🗺️➡️",
643 | "click_extract_navigate": "🖱️🗺️",
644 | "click_extract_success": "🖱️✅✨",
645 | "fill_form_navigate": "✍️🗺️",
646 | "fill_form_field": "✍️",
647 | "fill_form_submit": "✔️",
648 | "fill_form_success": "✍️✅",
649 | "autopilot_run": "🧑✈️",
650 | "autopilot_step_start": "▶️",
651 | "autopilot_step_success": "✅",
652 | "autopilot_step_fail": "❌",
653 | "autopilot_replan_success": "🧠🔄",
654 | "autopilot_replan_fail": "🧠❌",
655 | "autopilot_max_steps": "🚫🔄",
656 | "autopilot_plan_end": "🏁",
657 | "autopilot_critical_error": "💥🧑✈️",
658 | "parallel_navigate": "🚦➡️",
659 | "parallel_url_error": "🚦🔥",
660 | "parallel_process_complete": "🚦🏁",
661 | "retry": "⏳",
662 | "retry_fail": "⚠️",
663 | "retry_fail_unexpected": "💣⚠️",
664 | "retry_unexpected": "⏳💣",
665 | "llm_call_complete": "🤖💬",
666 | }
667 |
668 |
669 | async def _log(event: str, **details): # Uses global _last_hash, _salt, _LOG_FILE
670 | """Append a hash-chained entry to the audit log asynchronously."""
671 | global _last_hash, _salt
672 | if _LOG_FILE is None: # Need to check if path is set
673 | logger.warning(f"Audit log skipped for event '{event}': Log file path not initialized.")
674 | return
675 | now_utc = datetime.now(timezone.utc)
676 | ts_iso = now_utc.isoformat()
677 | sanitized_details = _sanitize_for_log(details)
678 | emoji_key = _EVENT_EMOJI_MAP.get(event, "❓")
679 | async with _audit_log_lock:
680 | current_last_hash = _last_hash
681 | entry = {
682 | "ts": ts_iso,
683 | "event": event,
684 | "details": sanitized_details,
685 | "prev": current_last_hash,
686 | "emoji": emoji_key,
687 | }
688 | entry_json = json.dumps(entry, sort_keys=True, separators=(",", ":"))
689 | payload = _salt + entry_json.encode("utf-8")
690 | hasher = hashlib.sha256(payload)
691 | h = hasher.hexdigest()
692 | log_entry_data = {"hash": h, **entry}
693 | log_entry_line = json.dumps(log_entry_data, separators=(",", ":")) + "\n"
694 | try:
695 | async with aiofiles.open(_LOG_FILE, "a", encoding="utf-8") as f:
696 | await f.write(log_entry_line)
697 | await f.flush()
698 | # os.fsync is sync, run in executor if strict atomic persistence needed
699 | # loop = asyncio.get_running_loop()
700 | # await loop.run_in_executor(_get_pool(), os.fsync, f.fileno())
701 | _last_hash = h
702 | except IOError as e:
703 | logger.error(f"Failed to write to audit log {_LOG_FILE}: {e}")
704 | except Exception as e:
705 | logger.error(f"Unexpected error writing audit log: {e}", exc_info=True)
706 |
707 |
708 | def _init_last_hash(): # Uses global _LOG_FILE, _last_hash
709 | """Initializes the last hash from the audit log file."""
710 | global _last_hash
711 | if _LOG_FILE is None:
712 | logger.info("Audit log initialization skipped: _LOG_FILE path not set yet.")
713 | return
714 | if _LOG_FILE.exists():
715 | try:
716 | with open(_LOG_FILE, "rb") as f:
717 | f.seek(0, os.SEEK_END)
718 | file_size = f.tell()
719 | if file_size == 0: # Empty file
720 | _last_hash = None
721 | logger.info("Audit log file found but is empty.")
722 | return
723 |
724 | # Read backwards efficiently (simplified version)
725 | buffer_size = 4096
726 | last_line = b""
727 | read_pos = max(0, file_size - buffer_size)
728 |
729 | while read_pos >= 0:
730 | f.seek(read_pos)
731 | buffer = f.read(buffer_size)
732 | lines = buffer.splitlines() # Split by \n, \r, or \r\n
733 | if lines:
734 | # Find the last *complete* line in the buffer
735 | # A complete line will either be the last one if the buffer ends with newline,
736 | # or the second to last one otherwise.
737 | is_last_line_complete = buffer.endswith(b"\n") or buffer.endswith(b"\r")
738 | if is_last_line_complete:
739 | last_line_candidate = lines[-1]
740 | elif len(lines) > 1:
741 | last_line_candidate = lines[-2] # Use second-to-last if last is partial
742 | else: # File smaller than buffer or only one partial line
743 | last_line_candidate = b"" # Assume partial
744 |
745 | # Ensure candidate is not empty and potentially valid JSON before breaking
746 | if last_line_candidate.strip().startswith(b"{"):
747 | last_line = last_line_candidate
748 | break # Found a likely valid, complete line
749 |
750 | if read_pos == 0:
751 | # Reached beginning, check if the first line itself is the only one
752 | if len(lines) == 1 and lines[0].strip().startswith(b"{"):
753 | last_line = lines[0]
754 | break
755 |
756 | # Move back, overlapping slightly to ensure line endings are caught
757 | read_pos = max(0, read_pos - (buffer_size // 2))
758 |
759 | if last_line:
760 | try:
761 | decoded_line = last_line.decode("utf-8")
762 | last_entry = json.loads(decoded_line)
763 | found_hash = last_entry.get("hash")
764 | _last_hash = found_hash
765 | if _last_hash:
766 | hash_preview = _last_hash[:8]
767 | logger.info(
768 | f"Initialized audit log chain from last hash: {hash_preview}..."
769 | )
770 | else:
771 | logger.warning(
772 | "Last log entry parsed but missing 'hash'. Starting new chain."
773 | )
774 | _last_hash = None
775 | except (json.JSONDecodeError, UnicodeDecodeError) as e:
776 | logger.error(f"Error decoding last line of audit log: {e}. Starting new chain.")
777 | _last_hash = None
778 | else:
779 | logger.info("Could not read last complete line from audit log. Starting new chain.")
780 | _last_hash = None
781 | except Exception as e:
782 | logger.error(
783 | f"Failed to read last hash from audit log {_LOG_FILE}: {e}. Starting new chain.",
784 | exc_info=True,
785 | )
786 | _last_hash = None
787 | else:
788 | logger.info("No existing audit log found. Starting new chain.")
789 | _last_hash = None
790 |
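# --- Illustrative sketch (not part of the original file) ---
# Each audit record is hash-chained to the previous one: the stored "hash" is
# SHA-256(_salt + canonical JSON of {ts, event, details, prev, emoji}), and the
# next record carries that hash in its "prev" field. Conceptually:
#
#     entry = {"ts": ts_iso, "event": "click", "details": {...},
#              "prev": _last_hash, "emoji": "🖱️"}
#     payload = _salt + json.dumps(entry, sort_keys=True, separators=(",", ":")).encode("utf-8")
#     h = hashlib.sha256(payload).hexdigest()
#     # written to _LOG_FILE as one line: {"hash": h, **entry}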
791 |
792 | # --- Resilient Decorator ---
793 | def resilient(max_attempts: int = 3, backoff: float = 0.3): # Uses global _log
794 | """Decorator for async functions; retries on common transient errors."""
795 |
796 | def wrap(fn):
797 | import functools # Ensure functools is imported locally for the decorator
798 |
799 | @functools.wraps(fn)
800 | async def inner(*a, **kw):
801 | attempt = 0
802 | while True:
803 | try:
804 | if attempt > 0:
805 | # Calculate jittered delay before retrying
806 | delay_factor = 2 ** (attempt - 1)
807 | base_delay = backoff * delay_factor
808 | jitter = random.uniform(0.8, 1.2)
809 | jitter_delay = base_delay * jitter
810 | await asyncio.sleep(jitter_delay)
811 | result = await fn(*a, **kw)
812 | return result
813 | except (PlaywrightTimeoutError, httpx.RequestError, asyncio.TimeoutError) as e:
814 | attempt += 1
815 | func_name = getattr(fn, "__name__", "unknown_func")
816 | if attempt >= max_attempts:
817 | await _log(
818 | "retry_fail", func=func_name, attempts=max_attempts, error=str(e)
819 | )
820 | raise ToolError(
821 | f"Operation '{func_name}' failed after {max_attempts} attempts: {e}"
822 | ) from e
823 | # Calculate delay for logging purposes (actual sleep is at loop start)
824 | delay_factor_log = 2 ** (attempt - 1)
825 | base_delay_log = backoff * delay_factor_log
826 | jitter_log = random.uniform(
827 | 0.8, 1.2
828 | ) # Recalculate for log consistency, might differ slightly from sleep
829 | delay_log = base_delay_log * jitter_log
830 | rounded_delay = round(delay_log, 2)
831 | await _log(
832 | "retry",
833 | func=func_name,
834 | attempt=attempt,
835 | max_attempts=max_attempts,
836 | sleep=rounded_delay,
837 | error=str(e),
838 | )
839 | # Sleep moved to start of the next iteration
840 | except (
841 | ToolError,
842 | ValueError,
843 | TypeError,
844 | KeyError,
845 | KeyboardInterrupt,
846 | sqlite3.Error,
847 | ):
848 | # Non-retryable errors specific to the application or unrecoverable
849 | raise # Re-raise immediately
850 | except Exception as e:
851 | # Catch other unexpected exceptions and retry them
852 | attempt += 1
853 | func_name = getattr(fn, "__name__", "unknown_func")
854 | if attempt >= max_attempts:
855 | await _log(
856 | "retry_fail_unexpected",
857 | func=func_name,
858 | attempts=max_attempts,
859 | error=str(e),
860 | )
861 | raise ToolError(
862 | f"Operation '{func_name}' failed with unexpected error after {max_attempts} attempts: {e}"
863 | ) from e
864 | # Calculate delay for logging
865 | delay_factor_log = 2 ** (attempt - 1)
866 | base_delay_log = backoff * delay_factor_log
867 | jitter_log = random.uniform(0.8, 1.2)
868 | delay_log = base_delay_log * jitter_log
869 | rounded_delay = round(delay_log, 2)
870 | await _log(
871 | "retry_unexpected",
872 | func=func_name,
873 | attempt=attempt,
874 | max_attempts=max_attempts,
875 | sleep=rounded_delay,
876 | error=str(e),
877 | )
878 | # Sleep moved to start of the next iteration
879 |
880 | return inner
881 |
882 | return wrap
883 |
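# --- Illustrative sketch (not part of the original file) ---
# Example of decorating a hypothetical helper with @resilient. Retries cover
# Playwright timeouts, httpx request errors and asyncio timeouts, with
# exponential backoff (0.3s, 0.6s, ...) and +/-20% jitter; ToolError and the
# other listed exceptions propagate immediately:
#
#     @resilient(max_attempts=3, backoff=0.3)
#     async def _fetch_title(page: Page) -> str:
#         await page.wait_for_selector("h1", timeout=5_000)
#         return await page.title()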
884 |
885 | # --- Secret Vault ---
886 | def _update_vault_paths(): # Uses global _vault_allowed_paths_str_global, _ALLOWED_VAULT_PATHS
887 | """Parse the vault allowed paths string from global config into the global set."""
888 | global _ALLOWED_VAULT_PATHS
889 | new_set = set()
890 | path_list = _vault_allowed_paths_str_global.split(",")
891 | for path in path_list:
892 | stripped_path = path.strip()
893 | if stripped_path:
894 | # Ensure path ends with a slash for prefix matching
895 | formatted_path = stripped_path.rstrip("/") + "/"
896 | new_set.add(formatted_path)
897 | _ALLOWED_VAULT_PATHS = new_set
898 |
899 |
900 | def get_secret(path_key: str) -> str: # Uses global _ALLOWED_VAULT_PATHS
901 | """Retrieves secret from environment or HashiCorp Vault."""
902 | # ... (implementation largely unchanged, relies on _ALLOWED_VAULT_PATHS global, split multi-line expressions) ...
903 | if path_key.startswith("env:"):
904 | var = path_key[4:]
905 | val = os.getenv(var)
906 | if val is None:
907 | raise ToolInputError(f"Environment variable secret '{var}' not set.")
908 | return val
909 | if path_key.startswith("vault:"):
910 | try:
911 | import hvac
912 | except ImportError as e:
913 | raise RuntimeError("'hvac' library required for Vault access.") from e
914 | addr = os.getenv("VAULT_ADDR")
915 | token = os.getenv("VAULT_TOKEN")
916 | if not addr or not token:
917 | raise RuntimeError("VAULT_ADDR and VAULT_TOKEN environment variables must be set.")
918 |
919 | vault_uri_part = path_key[len("vault:") :]
920 | if "://" in vault_uri_part:
921 | raise ValueError("Vault path cannot contain '://'. Use format 'mount/path#key'.")
922 | if "#" not in vault_uri_part:
923 | raise ValueError("Vault path must include '#key'. Use format 'mount/path#key'.")
924 |
925 | path_part_raw, key_name = vault_uri_part.split("#", 1)
926 | path_part = path_part_raw.strip("/")
927 |
928 | if not _ALLOWED_VAULT_PATHS:
929 | _update_vault_paths() # Ensure allowed paths are populated
930 |
931 | # Check if the requested path is allowed
932 | path_to_check = path_part + "/" # Ensure trailing slash for prefix check
933 | found_prefix = False
934 | for prefix in _ALLOWED_VAULT_PATHS:
935 | if path_to_check.startswith(prefix):
936 | found_prefix = True
937 | break
938 | if not found_prefix:
939 | logger.warning(
940 | f"Access denied for Vault path '{path_part}'. Allowed prefixes: {_ALLOWED_VAULT_PATHS}"
941 | )
942 | raise ValueError(f"Access to Vault path '{path_part}' is not allowed.")
943 |
944 | client = hvac.Client(url=addr, token=token)
945 | if not client.is_authenticated():
946 | raise RuntimeError(f"Vault authentication failed for {addr}.")
947 |
948 | path_segments = path_part.split("/")
949 | if not path_segments:
950 | raise ValueError(f"Invalid Vault path format: '{path_part}'")
951 |
952 | mount_point = path_segments[0]
953 | rest_segments = path_segments[1:]
954 | secret_sub_path = "/".join(rest_segments)
955 |
956 | # Try KV v2 first
957 | try:
958 | resp_v2 = client.secrets.kv.v2.read_secret_version(
959 | mount_point=mount_point, path=secret_sub_path
960 | )
961 | data_v2 = resp_v2["data"]["data"]
962 | if key_name in data_v2:
963 | return data_v2[key_name]
964 | else:
965 | # Key not found in this v2 secret
966 | pass # Will proceed to check v1 or raise later
967 | except hvac.exceptions.InvalidPath:
968 | # Path doesn't exist in KV v2 mount, try KV v1
969 | pass
970 | except (KeyError, TypeError):
971 | # Error accessing nested data['data'], indicates issue with response structure
972 | logger.warning(
973 | f"Unexpected response structure from Vault KV v2 for path '{path_part}'."
974 | )
975 | pass
976 | except Exception as e:
977 | logger.error(f"Error reading Vault KV v2 secret '{path_part}': {e}")
978 | # Don't raise immediately, allow fallback to v1 if configured
979 | pass
980 |
981 | # Try KV v1
982 | try:
983 | resp_v1 = client.secrets.kv.v1.read_secret(
984 | mount_point=mount_point, path=secret_sub_path
985 | )
986 | data_v1 = resp_v1["data"]
987 | if key_name in data_v1:
988 | return data_v1[key_name]
989 | else:
990 | # Key not found in v1 either
991 | raise KeyError(
992 | f"Key '{key_name}' not found in Vault secret at '{path_part}' (tried KV v2 & v1)."
993 | )
994 | except hvac.exceptions.InvalidPath:
995 | # Path not found in v1 either (and wasn't found in v2 or errored)
996 | raise KeyError(
997 | f"Secret path '{path_part}' not found in Vault (tried KV v2 & v1)."
998 | ) from None
999 | except KeyError:
1000 | # Re-raise the KeyError from the v1 check if key wasn't found there
1001 | raise KeyError(f"Key '{key_name}' not found at '{path_part}' (KV v1).") from None
1002 | except Exception as e:
1003 | logger.error(f"Error reading Vault KV v1 secret '{path_part}': {e}")
1004 | raise RuntimeError(f"Failed to read Vault secret (KV v1): {e}") from e
1005 |
1006 | # If scheme is not 'env:' or 'vault:'
1007 | raise ValueError(f"Unknown secret scheme or invalid path format: {path_key}")
1008 |
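# --- Illustrative sketch (not part of the original file) ---
# get_secret accepts two schemes (the concrete names below are hypothetical):
#
#     get_secret("env:SB_VNC_PASSWORD")
#         # -> value of the SB_VNC_PASSWORD environment variable
#     get_secret("vault:secret/data/smart_browser#api_key")
#         # -> key "api_key" from the Vault secret at secret/data/smart_browser,
#         #    allowed because it matches the "secret/data/" prefix in
#         #    _ALLOWED_VAULT_PATHS (requires VAULT_ADDR/VAULT_TOKEN and hvac)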
1009 |
1010 | # --- Playwright Lifecycle ---
1011 | def _update_proxy_settings(): # Uses globals
1012 | """Parse global proxy config strings into usable dict/list."""
1013 | global _PROXY_CONFIG_DICT, _PROXY_ALLOWED_DOMAINS_LIST
1014 | _PROXY_CONFIG_DICT = None # Reset
1015 | if _proxy_pool_str_global:
1016 | # Split and filter empty strings
1017 | proxies_raw = _proxy_pool_str_global.split(";")
1018 | proxies = []
1019 | for p in proxies_raw:
1020 | stripped_p = p.strip()
1021 | if stripped_p:
1022 | proxies.append(stripped_p)
1023 |
1024 | if proxies:
1025 | chosen_proxy = random.choice(proxies)
1026 | try:
1027 | parsed = urlparse(chosen_proxy)
1028 | # Basic validation
1029 | is_valid_scheme = parsed.scheme in ("http", "https", "socks5", "socks5h")
1030 | has_netloc = bool(parsed.netloc)
1031 | no_fragment = "#" not in chosen_proxy # Fragments not allowed in proxy URL itself
1032 |
1033 | if is_valid_scheme and has_netloc and no_fragment:
1034 | # Construct base server URL without credentials
1035 | if parsed.port:
1036 | hostname_port = f"{parsed.hostname}:{parsed.port}"
1037 | else:
1038 | hostname_port = parsed.hostname
1039 | server_url = f"{parsed.scheme}://{hostname_port}"
1040 |
1041 | proxy_dict: Dict[str, Any] = {"server": server_url}
1042 | if parsed.username:
1043 | unquoted_username = urllib.parse.unquote(parsed.username)
1044 | proxy_dict["username"] = unquoted_username
1045 | if parsed.password:
1046 | unquoted_password = urllib.parse.unquote(parsed.password)
1047 | proxy_dict["password"] = unquoted_password
1048 |
1049 | _PROXY_CONFIG_DICT = proxy_dict
1050 | logger.info(f"Proxy settings parsed: Using {proxy_dict.get('server')}")
1051 | else:
1052 | logger.warning(f"Invalid proxy URL format/scheme: '{chosen_proxy}'. Skipping.")
1053 | except Exception as e:
1054 | logger.warning(f"Error parsing proxy URL '{chosen_proxy}': {e}")
1055 |
1056 | # Parse allowed domains
1057 | if not _proxy_allowed_domains_str_global or _proxy_allowed_domains_str_global == "*":
1058 | _PROXY_ALLOWED_DOMAINS_LIST = None # None means allow all
1059 | logger.info("Proxy allowed domains: * (all allowed)")
1060 | else:
1061 | domains_raw = _proxy_allowed_domains_str_global.split(",")
1062 | domains = []
1063 | for d in domains_raw:
1064 | stripped_d = d.strip()
1065 | if stripped_d:
1066 | lower_d = stripped_d.lower()
1067 | domains.append(lower_d)
1068 |
1069 | # Ensure domains start with a dot for proper suffix matching
1070 | new_domain_list = []
1071 | for d in domains:
1072 | if d.startswith("."):
1073 | new_domain_list.append(d)
1074 | else:
1075 | new_domain_list.append("." + d)
1076 | _PROXY_ALLOWED_DOMAINS_LIST = new_domain_list
1077 | logger.info(f"Proxy allowed domains parsed: {_PROXY_ALLOWED_DOMAINS_LIST}")
1078 |
1079 |
1080 | def _get_proxy_config() -> Optional[Dict[str, Any]]: # Uses global _PROXY_CONFIG_DICT
1081 | """Returns the globally cached parsed proxy dictionary."""
1082 | return _PROXY_CONFIG_DICT
1083 |
1084 |
1085 | def _is_domain_allowed_for_proxy(url: str) -> bool: # Uses global _PROXY_ALLOWED_DOMAINS_LIST
1086 | """Checks if the URL's domain is allowed based on globally cached list."""
1087 | if _PROXY_ALLOWED_DOMAINS_LIST is None:
1088 | return True # Allow all if list is None (wildcard)
1089 | try:
1090 | parsed_url = urlparse(url)
1091 | domain = parsed_url.netloc.lower()
1092 | if not domain:
1093 | return False # Cannot determine domain
1094 |
1095 | # Check domain and its superdomains against the allowed list
1096 | domain_parts = domain.split(".")
1097 | for i in range(len(domain_parts)):
1098 | sub_domain_check = "." + ".".join(domain_parts[i:])
1099 | if sub_domain_check in _PROXY_ALLOWED_DOMAINS_LIST:
1100 | return True
1101 | # Check exact domain match as well (if domain doesn't start with .)
1102 | # The logic above already covers this because we ensure allowed domains start with '.'
1103 | # e.g. if "example.com" is requested and ".example.com" is allowed, it matches.
1104 | return False # No allowed suffix matched
1105 | except Exception as e:
1106 | logger.warning(f"Error parsing URL '{url}' for proxy domain check: {e}")
1107 | return False # Deny on error
1108 |
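# --- Illustrative sketch (not part of the original file) ---
# With _proxy_allowed_domains_str_global = "example.com,api.example.org" the
# parsed list becomes [".example.com", ".api.example.org"], so:
#
#     _is_domain_allowed_for_proxy("https://www.example.com/page")  # True
#     _is_domain_allowed_for_proxy("https://example.com/")          # True
#     _is_domain_allowed_for_proxy("https://evil-example.net/")     # False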
1109 |
1110 | def _run_sync(coro): # Keep as is
1111 | try:
1112 | loop = asyncio.get_running_loop()
1113 | except RuntimeError:
1114 | # No running loop, run in a new one
1115 | return asyncio.run(coro)
1116 | else:
1117 | # Loop exists, run in threadsafe way if called from sync context
1118 | future = asyncio.run_coroutine_threadsafe(coro, loop) # noqa: F841
1119 | # If needing the result synchronously (careful with deadlocks):
1120 | # return future.result()
1121 | return None # Or return future if caller handles it
1122 |
1123 |
1124 | async def _try_close_browser(): # Uses global _browser
1125 | """Attempt to close the browser gracefully via atexit."""
1126 | global _browser
1127 | browser_to_close = _browser # Capture current browser instance
1128 | if browser_to_close and browser_to_close.is_connected():
1129 | logger.info("Attempting to close browser via atexit handler...")
1130 | try:
1131 | await browser_to_close.close()
1132 | logger.info("Browser closed successfully via atexit.")
1133 | except Exception as e:
1134 | logger.error(f"Error closing browser during atexit: {e}")
1135 | finally:
1136 | # Only reset global _browser if it hasn't changed in the meantime
1137 | if _browser == browser_to_close:
1138 | _browser = None
1139 |
1140 |
1141 | async def get_browser_context(
1142 | use_incognito: bool = False,
1143 | context_args: Optional[Dict[str, Any]] = None,
1144 | ) -> tuple[BrowserContext, Browser]: # Uses MANY globals
1145 | """Get or create a browser context using global config values."""
1146 | global _pw, _browser, _ctx
1147 | async with _playwright_lock:
1148 | # 1. Ensure Playwright is started
1149 | if not _pw:
1150 | try:
1151 | playwright_manager = async_playwright()
1152 | _pw = await playwright_manager.start()
1153 | logger.info("Playwright started.")
1154 | except Exception as e:
1155 | raise RuntimeError(f"Failed to start Playwright: {e}") from e
1156 |
1157 | # 2. Handle Headless Mode and VNC
1158 | is_headless = _headless_mode_global
1159 | if not is_headless:
1160 | _start_vnc() # Starts VNC if enabled and not already running
1161 |
1162 | # 3. Ensure Browser is launched and connected
1163 | if not _browser or not _browser.is_connected():
1164 | if _browser: # Close previous instance if disconnected
1165 | try:
1166 | await _browser.close()
1167 | except Exception as close_err:
1168 | logger.warning(
1169 | f"Error closing previous disconnected browser instance: {close_err}"
1170 | )
1171 | try:
1172 | browser_args = [
1173 | "--no-sandbox",
1174 | "--disable-dev-shm-usage",
1175 | "--disable-gpu",
1176 | "--window-size=1280,1024",
1177 | ]
1178 | launched_browser = await _pw.chromium.launch(
1179 | headless=is_headless,
1180 | args=browser_args,
1181 | )
1182 | _browser = launched_browser
1183 | logger.info(f"Browser launched (Headless: {is_headless}).")
1184 | # Register atexit handler *after* successful launch
1185 | atexit.register(lambda: _run_sync(_try_close_browser()))
1186 | except PlaywrightException as e:
1187 | raise RuntimeError(f"Failed to launch browser: {e}") from e
1188 |
1189 | # 4. Prepare Context Arguments
1190 | default_args = {
1191 | "viewport": {"width": 1280, "height": 1024},
1192 | "locale": "en-US",
1193 | "timezone_id": "UTC",
1194 | "accept_downloads": True,
1195 | }
1196 | if context_args:
1197 | default_args.update(context_args)
1198 |
1199 | # 5. Handle Incognito Context Request
1200 | if use_incognito:
1201 | try:
1202 | incog_ctx = await _browser.new_context(**default_args)
1203 | await _log("browser_incognito_context", args=default_args)
1204 | # Apply proxy routing rules if necessary for incognito context
1205 | proxy_cfg = _get_proxy_config()
1206 | if proxy_cfg:
1207 | await _add_proxy_routing_rule(incog_ctx, proxy_cfg)
1208 | return incog_ctx, _browser
1209 | except PlaywrightException as e:
1210 | raise ToolError(f"Failed to create incognito context: {e}") from e
1211 |
1212 | # 6. Handle Shared Context Request
1213 | if not _ctx or not _ctx.browser: # Check if shared context needs creation/recreation
1214 | if _ctx: # Close previous invalid context if any
1215 | try:
1216 | await _ctx.close()
1217 | except Exception as close_err:
1218 | logger.warning(f"Error closing previous invalid shared context: {close_err}")
1219 |
1220 | try:
1221 | # Load state before creating context
1222 | loaded_state = await _load_state()
1223 | proxy_cfg = _get_proxy_config()
1224 |
1225 | final_ctx_args = default_args.copy()
1226 | if loaded_state:
1227 | final_ctx_args["storage_state"] = loaded_state
1228 | if proxy_cfg:
1229 | # Note: Using context.route for proxy filtering now,
1230 | # but setting proxy here is still needed for Playwright to use it.
1231 | final_ctx_args["proxy"] = proxy_cfg
1232 |
1233 | # Create the new shared context
1234 | new_shared_ctx = await _browser.new_context(**final_ctx_args)
1235 | _ctx = new_shared_ctx
1236 |
1237 | # Log context creation details (excluding potentially large state)
1238 | log_args = {}
1239 | for k, v in final_ctx_args.items():
1240 | if k != "storage_state":
1241 | log_args[k] = v
1242 | await _log(
1243 | "browser_context_create",
1244 | headless=is_headless,
1245 | proxy=bool(proxy_cfg),
1246 | args=log_args,
1247 | )
1248 |
1249 | # Apply proxy routing rules if needed
1250 | if proxy_cfg:
1251 | await _add_proxy_routing_rule(_ctx, proxy_cfg)
1252 |
1253 | # Start maintenance loop for the *new* shared context
1254 | asyncio.create_task(_context_maintenance_loop(_ctx))
1255 |
1256 | except PlaywrightException as e:
1257 | raise RuntimeError(f"Failed to create shared context: {e}") from e
1258 | except Exception as e: # Catch errors during state load/save too
1259 | raise RuntimeError(f"Failed during shared context creation/state load: {e}") from e
1260 |
1261 | # 7. Return the valid shared context and browser
1262 | return _ctx, _browser
1263 |
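# --- Illustrative sketch (not part of the original file) ---
# Typical use from a tool function, assuming the module's init path
# (_ensure_initialized) has already populated the global config:
#
#     ctx, browser = await get_browser_context()           # shared context
#     page = await ctx.new_page()
#     await page.goto("https://example.com", wait_until="domcontentloaded")
#     ...
#     incog_ctx, _ = await get_browser_context(use_incognito=True)
#     try:
#         ...
#     finally:
#         await incog_ctx.close()   # incognito contexts are not tracked globally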
1264 |
1265 | async def _add_proxy_routing_rule(
1266 | context: BrowserContext, proxy_config: Dict[str, Any]
1267 | ): # Uses global _PROXY_ALLOWED_DOMAINS_LIST
1268 | """Adds routing rule to enforce proxy domain restrictions if enabled."""
1269 | # Check if domain restrictions are active
1270 | if _PROXY_ALLOWED_DOMAINS_LIST is None:
1271 | logger.debug("No proxy domain restrictions configured. Skipping routing rule.")
1272 | return
1273 |
1274 | async def handle_route(route):
1275 | request_url = route.request.url
1276 | if not _is_domain_allowed_for_proxy(request_url):
1277 | logger.warning(f"Proxy blocked for disallowed domain: {request_url}. Aborting request.")
1278 | try:
1279 | await route.abort("accessdenied")
1280 | except PlaywrightException as e:
1281 | # Log error but don't crash the handler
1282 | logger.error(f"Error aborting route for {request_url}: {e}")
1283 | else:
1284 | # Domain is allowed, let the request proceed (through the proxy set on the context)
1285 | try:
1286 | await route.continue_()
1287 | except PlaywrightException as e:
1288 | # Log error but don't crash the handler
1289 | logger.error(f"Error continuing route for {request_url}: {e}")
1290 |
1291 | try:
1292 | # Route all network requests ('**/*')
1293 | await context.route("**/*", handle_route)
1294 | logger.info("Proxy domain restriction routing rule added.")
1295 | except PlaywrightException as e:
1296 | logger.error(f"Failed to add proxy routing rule: {e}")
1297 |
1298 |
1299 | def _start_vnc(): # Uses globals
1300 | """Starts X11VNC if VNC enabled and password set."""
1301 | global _vnc_proc
1302 | # Check if already running or not enabled
1303 | if _vnc_proc or not _vnc_enabled_global:
1304 | return
1305 |
1306 | vnc_pass = _vnc_password_global
1307 | if not vnc_pass:
1308 | logger.debug("VNC start skipped: Password not set.")
1309 | return
1310 |
1311 | display = os.getenv("DISPLAY", ":0")
1312 | try:
1313 | # Check if x11vnc command exists
1314 | which_cmd = ["which", "x11vnc"]
1315 | result = subprocess.run(which_cmd, capture_output=True, text=True, check=False)
1316 | if result.returncode != 0:
1317 | logger.warning("x11vnc command not found in PATH. Cannot start VNC server.")
1318 | return
1319 |
1320 | # Prepare command arguments
1321 | cmd = [
1322 | "x11vnc",
1323 | "-display",
1324 | display,
1325 | "-passwd",
1326 | vnc_pass, # Use the password directly
1327 | "-forever", # Keep running until explicitly killed
1328 | "-localhost", # Only listen on localhost
1329 | "-quiet", # Reduce log output
1330 | "-noxdamage", # Compatibility option
1331 | ]
1332 |
1333 | # Use setsid to run in a new session, allowing clean termination
1334 | if hasattr(os, "setsid"):
1335 | preexec_fn = os.setsid
1336 | else:
1337 | preexec_fn = None # Not available on Windows
1338 |
1339 | # Start the process
1340 | vnc_process = subprocess.Popen(
1341 | cmd,
1342 | stdout=subprocess.DEVNULL, # Redirect stdout
1343 | stderr=subprocess.DEVNULL, # Redirect stderr
1344 | preexec_fn=preexec_fn, # Run in new session if possible
1345 | )
1346 | _vnc_proc = vnc_process
1347 | logger.info(
1348 | f"Password-protected VNC server started on display {display} (localhost only). PID: {_vnc_proc.pid}"
1349 | )
1350 |
1351 | # Register cleanup function to run on exit
1352 | atexit.register(_cleanup_vnc)
1353 |
1354 | except FileNotFoundError:
1355 | # This shouldn't happen if `which` check passed, but belts and suspenders
1356 | logger.warning("x11vnc command found by 'which' but Popen failed (FileNotFoundError).")
1357 | except Exception as e:
1358 | logger.error(f"Failed to start VNC server: {e}", exc_info=True)
1359 | _vnc_proc = None # Ensure proc is None if start failed
1360 |
1361 |
1362 | def _cleanup_vnc(): # Uses global _vnc_proc
1363 | """Terminates the VNC server process."""
1364 | global _vnc_proc
1365 | proc = _vnc_proc # Capture current process instance
1366 | if proc and proc.poll() is None: # Check if process exists and is running
1367 | logger.info(f"Terminating VNC server process (PID: {proc.pid})...")
1368 | try:
1369 | # Try to terminate the whole process group first (more reliable)
1370 | if hasattr(os, "getpgid") and hasattr(os, "killpg"):
1371 | try:
1372 | pgid = os.getpgid(proc.pid)
1373 | os.killpg(pgid, signal.SIGTERM)
1374 | logger.debug(f"Sent SIGTERM to process group {pgid}.")
1375 | except ProcessLookupError:
1376 | # Process group might already be gone
1377 | logger.debug("VNC process group not found, trying direct SIGTERM.")
1378 | proc.terminate()
1379 | except Exception as pg_err:
1380 | logger.warning(
1381 | f"Error sending SIGTERM to process group, trying direct SIGTERM: {pg_err}"
1382 | )
1383 | proc.terminate() # Fallback to terminating just the process
1384 | else:
1385 | # Fallback if killpg/getpgid not available
1386 | proc.terminate()
1387 | logger.debug("Sent SIGTERM directly to VNC process.")
1388 |
1389 | # Wait for termination with timeout
1390 | proc.wait(timeout=5)
1391 | logger.info("VNC server process terminated gracefully.")
1392 | except subprocess.TimeoutExpired:
1393 | logger.warning("VNC server did not terminate after SIGTERM. Sending SIGKILL.")
1394 | # Force kill if SIGTERM failed
1395 | if hasattr(os, "getpgid") and hasattr(os, "killpg"):
1396 | try:
1397 | pgid = os.getpgid(proc.pid)
1398 | os.killpg(pgid, signal.SIGKILL)
1399 | logger.debug(f"Sent SIGKILL to process group {pgid}.")
1400 | except ProcessLookupError:
1401 | logger.debug("VNC process group not found for SIGKILL, trying direct SIGKILL.")
1402 | proc.kill() # Fallback to killing just the process
1403 | except Exception as pg_kill_err:
1404 | logger.warning(
1405 | f"Error sending SIGKILL to process group, trying direct SIGKILL: {pg_kill_err}"
1406 | )
1407 | proc.kill() # Fallback
1408 | else:
1409 | proc.kill() # Fallback if killpg not available
1410 | logger.debug("Sent SIGKILL directly to VNC process.")
1411 | # Wait briefly after SIGKILL
1412 | try:
1413 | proc.wait(timeout=2)
1414 | except Exception:
1415 | # Ignore errors during wait after SIGKILL
1416 | pass
1417 | except ProcessLookupError:
1418 | # Process was already gone before we could signal it
1419 | logger.info("VNC process already terminated before cleanup.")
1420 | except Exception as e:
1421 | logger.error(f"Error during VNC cleanup: {e}")
1422 | finally:
1423 | # Ensure global state reflects VNC is stopped
1424 | if _vnc_proc == proc: # Avoid race condition if started again quickly
1425 | _vnc_proc = None
1426 |
1427 |
1428 | async def _load_state() -> dict[str, Any] | None: # Uses global _STATE_FILE, _get_pool, _dec
1429 | """Loads browser state asynchronously. Decryption runs in executor if needed."""
1430 | if _STATE_FILE is None or not _STATE_FILE.exists():
1431 | logger.info("Browser state file path not set or file not found. No state loaded.")
1432 | return None
1433 |
1434 | loop = asyncio.get_running_loop()
1435 | pool = _get_pool()
1436 | try:
1437 | # Read the potentially encrypted file content
1438 | async with aiofiles.open(_STATE_FILE, "rb") as f:
1439 | file_data = await f.read()
1440 |
1441 | # Decrypt if necessary (runs sync _dec in thread pool)
1442 | # _dec handles the check for whether encryption is active or not
1443 | try:
1444 | decrypted_data = await loop.run_in_executor(pool, _dec, file_data)
1445 | except RuntimeError as e:
1446 | if "cannot schedule new futures after shutdown" in str(e):
1447 | logger.warning(
1448 | "Thread pool is shutdown. Creating a temporary pool for state loading."
1449 | )
1450 | with concurrent.futures.ThreadPoolExecutor(max_workers=1) as temp_pool:
1451 | decrypted_data = await loop.run_in_executor(temp_pool, _dec, file_data)
1452 | else:
1453 | raise
1454 |
1455 | if decrypted_data is None:
1456 | # _dec logs specific reasons (invalid format, decryption failure, etc.)
1457 | logger.warning("Failed to load or decrypt state data. State file might be invalid.")
1458 | # Optionally remove the invalid file here if desired
1459 | # try: _STATE_FILE.unlink(); except Exception: pass
1460 | return None
1461 |
1462 | # Parse the decrypted JSON data
1463 | state_dict = json.loads(decrypted_data)
1464 | logger.info(f"Browser state loaded successfully from {_STATE_FILE}.")
1465 | return state_dict
1466 |
1467 | except FileNotFoundError:
1468 | # This case should be caught by the initial check, but handle defensively
1469 | logger.info(f"Browser state file {_STATE_FILE} not found during read.")
1470 | return None
1471 | except json.JSONDecodeError as e:
1472 | logger.error(
1473 | f"Failed to parse browser state JSON from {_STATE_FILE}: {e}. Removing corrupt file."
1474 | )
1475 | if _STATE_FILE:
1476 | try:
1477 | _STATE_FILE.unlink()
1478 | except Exception as unlink_e:
1479 | logger.error(f"Failed to remove corrupt state file {_STATE_FILE}: {unlink_e}")
1480 | return None
1481 | except RuntimeError as e: # Catch auth errors from _dec (InvalidTag)
1482 | logger.error(
1483 | f"Failed to authenticate/load browser state from {_STATE_FILE}: {e}", exc_info=True
1484 | )
1485 | if _STATE_FILE:
1486 | try:
1487 | _STATE_FILE.unlink()
1488 | except Exception as unlink_e:
1489 | logger.error(
1490 | f"Failed to remove unauthenticated state file {_STATE_FILE}: {unlink_e}"
1491 | )
1492 | return None
1493 | except Exception as e:
1494 | logger.error(f"Failed to load browser state from {_STATE_FILE}: {e}", exc_info=True)
1495 | # Optionally remove the problematic file
1496 | if _STATE_FILE:
1497 | try:
1498 | _STATE_FILE.unlink()
1499 | except Exception as unlink_e:
1500 | logger.error(f"Failed to remove problematic state file {_STATE_FILE}: {unlink_e}")
1501 | return None
1502 |
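# [Editor's sketch -- not part of the original module.] _load_state() above and
# _save_state() below share the same idiom: run a synchronous helper (_dec/_enc) in
# the shared thread pool, and fall back to a short-lived temporary pool if the shared
# one has already been shut down. Extracted as a generic helper it would look like:
async def _example_run_sync(fn, *args):
    loop = asyncio.get_running_loop()
    try:
        return await loop.run_in_executor(_get_pool(), fn, *args)
    except RuntimeError as e:
        if "cannot schedule new futures after shutdown" not in str(e):
            raise
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as temp_pool:
            return await loop.run_in_executor(temp_pool, fn, *args)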
1503 |
1504 | async def _save_state(ctx: BrowserContext): # Uses global _get_pool, _enc, _STATE_FILE, _key, _playwright_lock
1505 | """Saves browser state asynchronously using FileSystemTool's write_file."""
1506 | if _STATE_FILE is None:
1507 | logger.warning("Skipping save state: State file path (_STATE_FILE) not initialized.")
1508 | return
1509 |
1510 | # Acquire lock *before* checking context validity to prevent race with shutdown
1511 | async with _playwright_lock:
1512 | # Re-check context validity *after* acquiring the lock
1513 | if not ctx or not ctx.browser or not ctx.browser.is_connected():
1514 | logger.debug("Skipping save state: Context or browser became invalid/disconnected before save.")
1515 | return
1516 |
1517 | loop = asyncio.get_running_loop()
1518 | pool = _get_pool()
1519 | validated_fpath = str(_STATE_FILE)
1520 |
1521 | try:
1522 | # 1. Get the current storage state from Playwright (NOW protected by lock)
1523 | state = await ctx.storage_state()
1524 |
1525 | # 2. Serialize state to JSON bytes
1526 | state_json = json.dumps(state)
1527 | state_bytes = state_json.encode("utf-8")
1528 |
1529 | # 3. Encrypt the state bytes if key is configured (runs sync _enc in thread pool)
1530 | try:
1531 | data_to_write = await loop.run_in_executor(pool, _enc, state_bytes)
1532 | except RuntimeError as e:
1533 | if "cannot schedule new futures after shutdown" in str(e):
1534 | logger.warning("Thread pool is shutdown. Creating a temporary pool for state encryption.")
1535 | # Fallback pool creation remains useful
1536 | with concurrent.futures.ThreadPoolExecutor(max_workers=1) as temp_pool:
1537 | data_to_write = await loop.run_in_executor(temp_pool, _enc, state_bytes)
1538 | else:
1539 | raise # Re-raise other RuntimeErrors
1540 |
1541 | # 4. Write the (potentially encrypted) bytes using the standalone filesystem tool
1542 | logger.debug(f"Attempting to save state to: {validated_fpath} using filesystem tool.")
1543 | write_result = await write_file(path=validated_fpath, content=data_to_write) # Pass bytes
1544 |
1545 | # 5. Check result from filesystem tool
1546 | if not isinstance(write_result, dict) or not write_result.get("success"):
1547 | error_detail = "Invalid response"
1548 | if isinstance(write_result, dict):
1549 | error_detail = write_result.get("error", "Unknown")
1550 | logger.error(
1551 | f"Failed to save browser state using filesystem tool. Reason: {error_detail}"
1552 | )
1553 | # Log but don't raise ToolError here directly, let the maintenance loop handle logging it
1554 | return # Exit if write failed
1555 |
1556 | # 6. Log success
1557 | actual_path = write_result.get("path", validated_fpath)
1558 | logger.debug(f"Successfully saved state to file: {actual_path}") # Changed log level
1559 |
1560 | except PlaywrightException as e:
1561 | # Catch errors specifically from ctx.storage_state() if the context closed unexpectedly
1562 | # even with the lock (less likely now, but possible)
1563 | logger.warning(f"Playwright error during save state (context likely closed): {e}")
1564 | # Don't raise, let the loop continue/exit gracefully
1565 | except ToolError as e:
1566 | # Pass ToolError through (e.g., from write_file) - should be logged by caller
1567 | logger.error(f"ToolError during save state: {e}")
1568 | # Don't re-raise here, maintenance loop will log the error
1569 | except Exception as e:
1570 | logger.error(f"Unexpected error saving browser state (path: {validated_fpath}): {e}", exc_info=True)
1571 | # Don't raise ToolError here, let the maintenance loop log the failure
1572 |
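# [Editor's note.] Together with _load_state() above, this gives a simple persistence
# round trip: the maintenance loop periodically calls _save_state(ctx) to write the
# (optionally encrypted) storage state, and on the next startup the decrypted JSON is
# passed back into new_context(storage_state=...) during shared-context creation,
# as seen near the top of this page.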
1573 |
1574 | @asynccontextmanager
1575 | async def _tab_context(ctx: BrowserContext): # Uses global _log
1576 | """Async context manager for creating and cleaning up a Page."""
1577 | page = None
1578 | context_id = id(ctx) # Get ID for logging before potential errors
1579 | try:
1580 | page = await ctx.new_page()
1581 | await _log("page_open", context_id=context_id)
1582 | yield page
1583 | except PlaywrightException as e:
1584 | # Log the error before raising
1585 | await _log("page_error", context_id=context_id, action="create", error=str(e))
1586 | raise ToolError(f"Failed to create browser page: {e}") from e
1587 | finally:
1588 | if page and not page.is_closed():
1589 | try:
1590 | await page.close()
1591 | await _log("page_close", context_id=context_id)
1592 | except PlaywrightException as e:
1593 | # Log error during close, but don't prevent cleanup completion
1594 | logger.warning(f"Error closing page for context {context_id}: {e}")
1595 | await _log("page_error", context_id=context_id, action="close", error=str(e))
1596 |
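# [Editor's sketch -- not part of the original module.] Example use of the
# _tab_context() manager above; TabPool._run() further below follows the same pattern.
# The URL is a placeholder.
async def _example_fetch_title(ctx: BrowserContext) -> str:
    async with _tab_context(ctx) as page:
        await page.goto("https://example.com")  # placeholder URL
        return await page.title()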
1597 |
1598 | async def _context_maintenance_loop(ctx: BrowserContext): # Uses global _save_state
1599 | """Periodically saves state for the shared context."""
1600 | save_interval_seconds = 15 * 60 # Save every 15 minutes
1601 | context_id = id(ctx)
1602 | logger.info(f"Starting context maintenance loop for shared context {context_id}.")
1603 |
1604 | while True:
1605 | try:
1606 | # Check if context is still valid *before* sleeping
1607 | # Use is_connected() for a more robust check
1608 | if not ctx or not ctx.browser or not ctx.browser.is_connected():
1609 | logger.info(f"Shared context {context_id} seems invalid or disconnected. Stopping maintenance loop.")
1610 | break
1611 |
1612 | # Wait for the specified interval
1613 | await asyncio.sleep(save_interval_seconds)
1614 |
1615 | # Re-check context validity *after* sleeping, before saving
1616 | if not ctx or not ctx.browser or not ctx.browser.is_connected():
1617 | logger.info(f"Shared context {context_id} became invalid/disconnected during sleep. Stopping maintenance loop.")
1618 | break
1619 |
1620 | # Save the state (which now handles its own locking and errors more gracefully)
1621 | await _save_state(ctx)
1622 |
1623 | except asyncio.CancelledError:
1624 | logger.info(f"Context maintenance loop for {context_id} cancelled.")
1625 | break # Exit loop cleanly on cancellation
1626 | except Exception as e:
1627 | # Log unexpected errors in the loop itself (e.g., during the sleep?)
1628 | logger.error(f"Unexpected error in context maintenance loop for {context_id}: {e}", exc_info=True)
1629 | # Wait a bit longer before retrying after an unexpected loop error
1630 | await asyncio.sleep(60)
1631 |
1632 |
1633 | # --- Standalone Shutdown Function ---
1634 |
1635 | async def shutdown(): # Uses MANY globals
1636 | """Gracefully shut down Playwright, browser, context, VNC, and thread pool."""
1637 | global \
1638 | _pw, \
1639 | _browser, \
1640 | _ctx, \
1641 | _vnc_proc, \
1642 | _thread_pool, \
1643 | _locator_cache_cleanup_task_handle, \
1644 | _inactivity_monitor_task_handle, \
1645 | _is_initialized, \
1646 | _shutdown_initiated # Added missing global reference
1647 |
1648 | # Use lock to prevent concurrent shutdown calls
1649 | # Check _shutdown_initiated flag inside lock for atomicity
1650 | async with _shutdown_lock:
1651 | if _shutdown_initiated:
1652 | logger.debug("Shutdown already initiated or in progress. Skipping.")
1653 | return
1654 | if not _is_initialized:
1655 | logger.info("Shutdown called but Smart Browser was not initialized. Skipping.")
1656 | return
1657 | # Mark shutdown as initiated *inside* the lock
1658 | _shutdown_initiated = True
1659 |
1660 | logger.info("Initiating graceful shutdown for Smart Browser...")
1661 |
1662 | # Set a global shutdown timeout to prevent hanging
1663 | shutdown_timeout = 10.0 # 10 seconds to complete shutdown or we'll force through
1664 | shutdown_start_time = time.monotonic()
1665 |
1666 | # Function to check if shutdown is taking too long
1667 | def is_shutdown_timeout():
1668 | return (time.monotonic() - shutdown_start_time) > shutdown_timeout
1669 |
1670 | # 1. Cancel Background Tasks First
1671 | tasks_to_cancel = [
1672 | (_locator_cache_cleanup_task_handle, "Locator Cache Cleanup Task"),
1673 | (_inactivity_monitor_task_handle, "Inactivity Monitor Task"),
1674 | ]
1675 | for task_handle, task_name in tasks_to_cancel:
1676 | if task_handle and not task_handle.done():
1677 | logger.info(f"Cancelling {task_name}...")
1678 | task_handle.cancel()
1679 | try:
1680 | # Wait briefly for cancellation to complete
1681 | await asyncio.wait_for(task_handle, timeout=2.0)
1682 | logger.info(f"{task_name} cancellation confirmed.") # Changed log level
1683 | except asyncio.CancelledError:
1684 | logger.info(f"{task_name} cancellation confirmed.") # Expected outcome
1685 | except asyncio.TimeoutError:
1686 | logger.warning(f"Timeout waiting for {task_name} cancellation.")
1687 | except Exception as e:
1688 | err_type = type(e).__name__
1689 | logger.warning(f"Error waiting for {task_name} cancellation: {err_type}")
1690 |
1691 | # Reset task handles
1692 | _locator_cache_cleanup_task_handle = None
1693 | _inactivity_monitor_task_handle = None
1694 |
1695 | # 2. Cancel any active tab pool operations
1696 | await tab_pool.cancel_all() # Handles incognito contexts
1697 |
1698 | # 3. Close Playwright resources (under lock to prevent concurrent access)
1699 | async with _playwright_lock:
1700 | # Close Shared Context (save state first, if possible)
1701 | ctx_to_close = _ctx
1702 | _ctx = None # Immediately unset global reference
1703 |
1704 | # Skip state saving if we're already at the timeout
1705 | if is_shutdown_timeout():
1706 | logger.warning("Skipping state saving due to shutdown timeout")
1707 | # --- Robust Check and Save State ---
1708 | elif ctx_to_close and ctx_to_close.browser and ctx_to_close.browser.is_connected():
1709 | logger.info("Attempting to save state for shared browser context...")
1710 | try:
1711 | # Add timeout for state saving
1712 | await asyncio.wait_for(_save_state(ctx_to_close), timeout=3.0)
1713 | logger.info("State saving attempted for shared context.") # Log attempt, success logged within _save_state
1714 | except asyncio.TimeoutError:
1715 | logger.warning("State saving timed out after 3 seconds")
1716 | except Exception as e:
1717 | # Catch any unexpected error from _save_state itself (should be rare now)
1718 | logger.error(f"Unexpected error during final state save attempt: {e}", exc_info=True)
1719 | elif ctx_to_close:
1720 | logger.warning("Skipping final state save: Shared context or browser already closed/disconnected.")
1721 | else:
1722 | logger.debug("Skipping final state save: No shared context exists.")
1723 | # --- End Robust Check and Save State ---
1724 |
1725 | # Close the context object itself
1726 | if ctx_to_close and not is_shutdown_timeout():
1727 | logger.info("Closing shared browser context object...")
1728 | try:
1729 | # Add timeout for context closing
1730 | await asyncio.wait_for(ctx_to_close.close(), timeout=3.0)
1731 | await _log("browser_context_close_shared")
1732 | logger.info("Shared browser context closed.")
1733 | except asyncio.TimeoutError:
1734 | logger.warning("Browser context close timed out after 3 seconds")
1735 | except Exception as e:
1736 | # Log error but continue shutdown
1737 | logger.error(f"Error closing shared context object: {e}", exc_info=False) # Keep log less verbose
1738 | elif ctx_to_close:
1739 | logger.warning("Skipping browser context close due to shutdown timeout")
1740 |
1741 | # Close Browser
1742 | browser_to_close = _browser
1743 | _browser = None # Immediately unset global reference
1744 | if browser_to_close and browser_to_close.is_connected() and not is_shutdown_timeout():
1745 | logger.info("Closing browser instance...")
1746 | try:
1747 | # Add timeout for browser closing - shorter timeout to avoid hanging
1748 | await asyncio.wait_for(browser_to_close.close(), timeout=3.0)
1749 | await _log("browser_close")
1750 | logger.info("Browser instance closed.")
1751 | except asyncio.TimeoutError:
1752 | logger.warning("Browser close timed out after 3 seconds")
1753 | except Exception as e:
1754 | logger.error(f"Error closing browser: {e}", exc_info=False) # Keep log less verbose
1755 | elif browser_to_close and browser_to_close.is_connected():
1756 | logger.warning("Skipping browser close due to shutdown timeout")
1757 |
1758 | # Stop Playwright
1759 | pw_to_stop = _pw
1760 | _pw = None # Immediately unset global reference
1761 | if pw_to_stop and not is_shutdown_timeout():
1762 | logger.info("Stopping Playwright...")
1763 | try:
1764 | # Add timeout for playwright stop - shorter timeout
1765 | await asyncio.wait_for(pw_to_stop.stop(), timeout=2.0)
1766 | logger.info("Playwright stopped.")
1767 | except asyncio.TimeoutError:
1768 | logger.warning("Playwright stop timed out after 2 seconds")
1769 | except Exception as e:
1770 | logger.error(f"Error stopping Playwright: {e}", exc_info=False) # Keep log less verbose
1771 | elif pw_to_stop:
1772 | logger.warning("Skipping Playwright stop due to shutdown timeout")
1773 |
1774 | # 4. Cleanup Synchronous Resources - always do this regardless of timeout
1775 | _cleanup_vnc()
1776 | _close_db_connection()
1777 |
1778 | # 5. Log completion and reset flags
1779 | await _log("browser_shutdown_complete")
1780 | if is_shutdown_timeout():
1781 | logger.warning("Smart Browser shutdown reached timeout limit - some resources may not be fully released")
1782 | else:
1783 | logger.info("Smart Browser graceful shutdown complete.")
1784 | _is_initialized = False
1785 |
1786 | # 6. Shutdown Thread Pool (MOVED TO THE VERY END)
1787 | logger.info("Shutting down thread pool...")
1788 | pool_to_shutdown = _get_pool()
1789 | # Don't wait for tasks if we're already at timeout
1790 | if is_shutdown_timeout():
1791 | try:
1792 | pool_to_shutdown.shutdown(wait=False)
1793 | logger.info("Thread pool shutdown initiated without waiting")
1794 | except Exception as e:
1795 | logger.error(f"Error during thread pool non-waiting shutdown: {e}")
1796 | else:
1797 | # Give the pool a short timeout to avoid hanging
1798 | try:
1799 | time_left = max(0, shutdown_timeout - (time.monotonic() - shutdown_start_time))
1800 | # Use the minimum of 3 seconds or remaining time
1801 | wait_time = min(3.0, time_left)
1802 |
1803 | # Create a separate thread to shut down the pool with a timeout
1804 | import threading
1805 | shutdown_complete = threading.Event()
1806 |
1807 | def shutdown_pool_with_timeout():
1808 | try:
1809 | pool_to_shutdown.shutdown(wait=True)
1810 | shutdown_complete.set()
1811 | except Exception as e:
1812 | logger.error(f"Error in thread pool shutdown thread: {e}")
1813 |
1814 | # Start the shutdown in a separate thread
1815 | thread = threading.Thread(target=shutdown_pool_with_timeout)
1816 | thread.daemon = True
1817 | thread.start()
1818 |
1819 | # Wait for completion or timeout
1820 | if shutdown_complete.wait(wait_time):
1821 | logger.info("Thread pool shut down successfully.")
1822 | else:
1823 | logger.warning(f"Thread pool shutdown timed out after {wait_time} seconds")
1824 | # Try non-waiting shutdown as fallback
1825 | try:
1826 | pool_to_shutdown.shutdown(wait=False)
1827 | except Exception:
1828 | pass # Already logged above
1829 | except Exception as e:
1830 | logger.error(f"Error setting up thread pool shutdown: {e}")
1831 | # Fallback to non-waiting shutdown
1832 | pool_to_shutdown.shutdown(wait=False)
1833 |
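# [Editor's note -- not part of the original module.] shutdown() is guarded by
# _shutdown_initiated/_shutdown_lock, so calling it more than once is safe; a host
# application can simply await it (or _initiate_shutdown() below) from its own
# teardown path, e.g.:
async def _example_app_teardown():
    # Stop dispatching new Smart Browser tool calls first (application-specific),
    # then release Playwright, VNC, and thread-pool resources.
    await shutdown()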
1834 |
1835 | async def _initiate_shutdown():  # Uses globals _shutdown_lock, _shutdown_initiated
1836 |     """Ensures shutdown runs only once."""
1837 |     # shutdown() checks and sets _shutdown_initiated itself under _shutdown_lock, and
1838 |     # asyncio locks are not re-entrant, so do not hold the lock while awaiting it here.
1839 |     async with _shutdown_lock:
1840 |         if _shutdown_initiated:
1841 |             logger.debug("Shutdown already initiated. Ignoring duplicate request.")
1842 |             return
1843 |     await shutdown()
1844 |
1845 |
1846 | # --- Signal Handling (Keep top-level) ---
1847 | def _signal_handler(sig, frame):
1848 | """Handle termination signals gracefully."""
1849 | signal_name = signal.Signals(sig).name
1850 | logger.info(f"Received signal {signal_name} ({sig}). Initiating Smart Browser shutdown...")
1851 | try:
1852 | # Try to get the running event loop
1853 | loop = asyncio.get_running_loop()
1854 | if loop.is_running():
1855 | # Schedule shutdown in the running loop, don't block signal handler
1856 | asyncio.create_task(_initiate_shutdown())
1857 | else:
1858 | # No running loop, attempt synchronous run (best effort)
1859 | logger.warning(
1860 | "No running event loop found in signal handler. Attempting sync shutdown."
1861 | )
1862 | try:
1863 | asyncio.run(_initiate_shutdown())
1864 | except RuntimeError as e:
1865 | logger.error(f"Could not run async shutdown synchronously from signal handler: {e}")
1866 | except RuntimeError as e:
1867 | # Error getting the loop itself
1868 | logger.error(
1869 | f"Error getting event loop in signal handler: {e}. Shutdown might be incomplete."
1870 | )
1871 |
1872 |
1873 | # Register signal handlers in a try-except block
1874 | try:
1875 | signal.signal(signal.SIGTERM, _signal_handler)
1876 | signal.signal(signal.SIGINT, _signal_handler) # Handle Ctrl+C too
1877 | except ValueError:
1878 | # This can happen if not running in the main thread
1879 | logger.warning(
1880 | "Could not register signal handlers (not running in main thread?). Graceful shutdown on SIGTERM/SIGINT might not work."
1881 | )
1882 |
1883 |
1884 | # --- Tab Pool (Keep global instance) ---
1885 | class TabPool: # Keep class definition
1886 | """Runs async callables needing a Page in parallel, bounded by global config."""
1887 |
1888 | def __init__(self, max_tabs: int | None = None):
1889 | if max_tabs is not None:
1890 | self.max_tabs = max_tabs
1891 | else:
1892 | self.max_tabs = _sb_max_tabs_global
1893 |
1894 | if self.max_tabs <= 0:
1895 | logger.warning(f"TabPool max_tabs configured to {self.max_tabs}. Setting to 1.")
1896 | self.max_tabs = 1
1897 | self.sem = asyncio.Semaphore(self.max_tabs)
1898 | self._active_contexts: Set[BrowserContext] = set() # Store contexts being used
1899 | self._context_lock = asyncio.Lock() # Protect access to _active_contexts
1900 | logger.info(f"TabPool initialized with max_tabs={self.max_tabs}")
1901 |
1902 | async def _run(self, fn: Callable[[Page], Awaitable[Any]]) -> Any:
1903 | """Internal method to run a single function within a managed tab."""
1904 | timeout_seconds = _sb_tab_timeout_global
1905 | incognito_ctx: Optional[BrowserContext] = None
1906 | task = asyncio.current_task()
1907 | task_id = id(task)
1908 | func_name = getattr(fn, "__name__", "anon_tab_fn")
1909 |
1910 | try:
1911 | # Acquire semaphore before creating context/page
1912 | async with self.sem:
1913 | # Create a new incognito context for isolation
1914 | # Pass None for context_args to use defaults
1915 | incognito_ctx, _ = await get_browser_context(use_incognito=True, context_args=None)
1916 |
1917 | # Add context to active set under lock
1918 | async with self._context_lock:
1919 | self._active_contexts.add(incognito_ctx)
1920 |
1921 | # Use the async context manager for the page
1922 | async with _tab_context(incognito_ctx) as page:
1923 | # Run the provided function with timeout
1924 | result = await asyncio.wait_for(fn(page), timeout=timeout_seconds)
1925 | return result # Return the successful result
1926 |
1927 | except asyncio.TimeoutError:
1928 | await _log("tab_timeout", function=func_name, timeout=timeout_seconds, task_id=task_id)
1929 | # Return error structure on timeout
1930 | return {
1931 | "error": f"Tab operation '{func_name}' timed out after {timeout_seconds}s",
1932 | "success": False,
1933 | }
1934 | except asyncio.CancelledError:
1935 | # Log cancellation and re-raise
1936 | await _log("tab_cancelled", function=func_name, task_id=task_id)
1937 | raise # Important to propagate cancellation
1938 | except Exception as e:
1939 | # Log any other exceptions during execution
1940 | await _log(
1941 | "tab_error", function=func_name, error=str(e), task_id=task_id, exc_info=True
1942 | )
1943 | # Return error structure
1944 | return {"error": f"Tab operation '{func_name}' failed: {e}", "success": False}
1945 | finally:
1946 | # Cleanup: Remove context from active set and close it
1947 | if incognito_ctx:
1948 | incog_ctx_id = id(incognito_ctx) # Get ID before potential close error
1949 | async with self._context_lock:
1950 | self._active_contexts.discard(incognito_ctx)
1951 | try:
1952 | await incognito_ctx.close()
1953 | logger.debug(f"Incognito context {incog_ctx_id} closed for task {task_id}.")
1954 | except PlaywrightException as close_err:
1955 | # Log error but don't let it prevent other cleanup
1956 | logger.warning(
1957 | f"Error closing incognito context {incog_ctx_id} for task {task_id}: {close_err}"
1958 | )
1959 |
1960 | async def map(self, fns: Sequence[Callable[[Page], Awaitable[Any]]]) -> List[Any]:
1961 | """Runs multiple functions concurrently using the tab pool."""
1962 | if not fns:
1963 | return []
1964 |
1965 | # Create tasks for each function using the internal _run method
1966 | tasks = []
1967 | for fn in fns:
1968 | task = asyncio.create_task(self._run(fn))
1969 | tasks.append(task)
1970 |
1971 | # Wait for all tasks to complete
1972 | results = await asyncio.gather(*tasks, return_exceptions=True)
1973 |
1974 | # Process results, handling potential exceptions returned by gather
1975 | processed_results = []
1976 | for i, res in enumerate(results):
1977 | if isinstance(res, Exception):
1978 | # Log the exception if a task failed unexpectedly
1979 | func_name = getattr(fns[i], "__name__", f"fn_{i}")
1980 | logger.error(f"Error in TabPool.map for '{func_name}': {res}", exc_info=res)
1981 | # Append an error dictionary for failed tasks
1982 | processed_results.append(
1983 | {"error": f"Task '{func_name}' failed with exception: {res}", "success": False}
1984 | )
1985 | else:
1986 | # Append the result directly (which might be an error dict from _run)
1987 | processed_results.append(res)
1988 | return processed_results
1989 |
1990 | async def cancel_all(self):
1991 | """Attempts to close all currently active incognito contexts managed by the pool."""
1992 | contexts_to_close: List[BrowserContext] = []
1993 | # Safely get the list of active contexts and clear the set under lock
1994 | async with self._context_lock:
1995 | contexts_to_close = list(self._active_contexts)
1996 | self._active_contexts.clear()
1997 |
1998 | if not contexts_to_close:
1999 | logger.debug("TabPool cancel_all: No active contexts to close.")
2000 | return
2001 |
2002 | logger.info(
2003 | f"TabPool cancel_all: Attempting to close {len(contexts_to_close)} active incognito contexts."
2004 | )
2005 | # Create closing tasks for each context
2006 | close_tasks = []
2007 | for ctx in contexts_to_close:
2008 | task = asyncio.create_task(ctx.close())
2009 | close_tasks.append(task)
2010 |
2011 | # Wait for all close tasks to complete, collecting results/exceptions
2012 | results = await asyncio.gather(*close_tasks, return_exceptions=True)
2013 |
2014 | # Count and log errors during closure
2015 | errors = 0
2016 | for res in results:
2017 | if isinstance(res, Exception):
2018 | errors += 1
2019 | if errors:
2020 | logger.warning(
2021 | f"TabPool cancel_all: Encountered {errors} errors while closing contexts."
2022 | )
2023 | else:
2024 | logger.info(
2025 | f"TabPool cancel_all: Successfully closed {len(contexts_to_close)} contexts."
2026 | )
2027 |
2028 |
2029 | # Global instance of the TabPool
2030 | tab_pool = TabPool()
2031 |
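# [Editor's sketch -- not part of the original module.] Fanning work out through the
# global tab_pool: each callable receives its own isolated incognito Page, bounded by
# max_tabs. URLs are placeholders.
async def _example_parallel_titles() -> List[Any]:
    def make_task(url: str) -> Callable[[Page], Awaitable[Any]]:
        async def _task(page: Page) -> Dict[str, Any]:
            await page.goto(url)
            return {"url": url, "title": await page.title(), "success": True}
        return _task
    urls = ["https://example.com", "https://example.org"]
    return await tab_pool.map([make_task(u) for u in urls])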
2032 |
2033 | # --- Human Jitter ---
2034 | def _risk_factor(url: str) -> float: # Uses global _high_risk_domains_set_global
2035 | """Calculates risk factor based on URL's domain (higher for known tricky domains)."""
2036 | if not url:
2037 | return 1.0 # Default risk if URL is empty
2038 | try:
2039 | parsed_url = urlparse(url)
2040 | domain = parsed_url.netloc.lower()
2041 | # Remove common www prefix
2042 | if domain.startswith("www."):
2043 | domain = domain[4:]
2044 |
2045 | if not domain:
2046 | return 1.0 # Default risk if domain cannot be parsed
2047 |
2048 | # Check if domain or its parent domains are in the high-risk set
2049 | domain_parts = domain.split(".")
2050 | for i in range(len(domain_parts)):
2051 | # Construct subdomain like ".example.com", ".com"
2052 | sub_domain_check = "." + ".".join(domain_parts[i:])
2053 | if sub_domain_check in _high_risk_domains_set_global:
2054 | return 2.0 # High risk factor
2055 |
2056 | # No match found in high-risk set
2057 | return 1.0 # Standard risk factor
2058 | except Exception as e:
2059 | logger.warning(f"Error calculating risk factor for URL '{url}': {e}")
2060 | return 1.0 # Default risk on error
2061 |
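# [Editor's illustration -- values hypothetical.] The suffix walk above checks
# ".sub.example.com", ".example.com", ".com" in turn against the configured high-risk
# set. If that set contained ".google.com", then
# _risk_factor("https://www.google.com/search?q=x") would return 2.0, while
# _risk_factor("https://example.com/") would return 1.0.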
2062 |
2063 | async def _pause(
2064 | page: Page, base_ms_range: tuple[int, int] = (150, 500)
2065 | ): # Uses global _risk_factor
2066 | """Introduce a short, randomized pause, adjusted by URL risk factor and page complexity."""
2067 | if not page or page.is_closed():
2068 | return # Do nothing if page is invalid
2069 |
2070 | risk = _risk_factor(page.url)
2071 | min_ms, max_ms = base_ms_range
2072 | base_delay_ms = random.uniform(min_ms, max_ms)
2073 | adjusted_delay_ms = base_delay_ms * risk
2074 |
2075 | try:
2076 | # Estimate page complexity based on number of interactive elements
2077 | # Use a simpler selector for broad compatibility
2078 | selector = "a, button, input, select, textarea, [role=button], [role=link], [onclick]"
2079 | js_expr = f"() => document.querySelectorAll('{selector}').length"
2080 | element_count = await page.evaluate(js_expr)
2081 |
2082 |         # A count of 0 usually means the evaluation failed rather than a truly simple page
2083 |         if element_count == 0:
2084 |             element_count = 100  # Assume moderate complexity so the simple-page skip below isn't taken spuriously
2085 |
2086 | # Skip pauses for low-risk, very simple pages
2087 | is_low_risk = risk == 1.0
2088 | is_simple_page = element_count < 50
2089 | if is_low_risk and is_simple_page:
2090 | return # No pause needed
2091 |
2092 | # Increase delay slightly based on complexity, capping the factor
2093 | complexity_factor_base = 1.0 + (element_count / 500.0)
2094 | complexity_factor = min(complexity_factor_base, 1.5) # Cap factor at 1.5
2095 | adjusted_delay_ms *= complexity_factor
2096 |
2097 | except PlaywrightException as e:
2098 | # Ignore errors during element count evaluation, proceed with risk-adjusted delay
2099 | logger.debug(f"Could not evaluate element count for pause adjustment: {e}")
2100 | pass
2101 |
2102 | # Cap the final delay to avoid excessive pauses
2103 | final_delay_ms = min(adjusted_delay_ms, 3000) # Max 3 seconds pause
2104 |
2105 | # Convert ms to seconds and sleep
2106 | final_delay_sec = final_delay_ms / 1000.0
2107 | await asyncio.sleep(final_delay_sec)
2108 |
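# [Editor's note.] A caller would typically insert this jitter immediately before an
# interaction, e.g. `await _pause(page)` or, for a slower cadence on sensitive flows,
# `await _pause(page, base_ms_range=(300, 900))`.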
2109 |
2110 | # --- Enhanced Locator Helpers (Depend on globals, use Filesystem tools) ---
2111 | _READ_JS_WRAPPER = textwrap.dedent("""
2112 | (html) => {
2113 | // Ensure Readability library is loaded in the window scope
2114 | const R = window.__sbReadability;
2115 | if (!R || !html) {
2116 | console.warn('Readability object or HTML missing.');
2117 | return ""; // Cannot proceed without library or content
2118 | }
2119 | try {
2120 | // Create a DOM from the HTML string
2121 | const parser = new DOMParser();
2122 | const doc = parser.parseFromString(html, "text/html");
2123 |
2124 | // Basic validation of the parsed document
2125 | if (!doc || !doc.body || doc.body.innerHTML.trim() === '') {
2126 | console.warn('Parsed document is invalid or empty.');
2127 | return "";
2128 | }
2129 |
2130 | // Use Readability to parse the article content
2131 | const article = new R.Readability(doc).parse();
2132 |
2133 | // Return the text content if parsing was successful
2134 | return article ? article.textContent : "";
2135 |
2136 | } catch (e) {
2137 | // Log errors during parsing
2138 | console.warn('Readability parsing failed:', e);
2139 | return ""; // Return empty string on error
2140 | }
2141 | }
2142 | """)
2143 |
2144 |
2145 | async def _ensure_readability(page: Page) -> None: # Uses global _READ_JS_CACHE
2146 | """Ensures Readability.js is injected, using STANDALONE filesystem tools."""
2147 | # Check if already injected
2148 | is_injected_js = "() => window.__sbReadability !== undefined"
2149 | already_injected = await page.evaluate(is_injected_js)
2150 | if already_injected:
2151 | logger.debug("Readability.js already injected.")
2152 | return
2153 |
2154 | if _READ_JS_CACHE is None:
2155 | logger.warning("Readability cache path (_READ_JS_CACHE) not set. Cannot cache script.")
2156 | # Proceed to fetch, but won't cache
2157 | else:
2158 | cache_file_path = str(_READ_JS_CACHE)
2159 |
2160 | src: Optional[str] = None
2161 |
2162 | # Try reading from cache if path is set
2163 | if _READ_JS_CACHE:
2164 | try:
2165 | logger.debug(f"Attempting to load Readability.js from cache: {cache_file_path}")
2166 | read_result = await read_file(path=cache_file_path)
2167 | if isinstance(read_result, dict) and not read_result.get("success"):
2168 | error_msg = read_result.get("error", "Unknown read error")
2169 | error_code = read_result.get("error_code", "")
2170 | logger.warning(
2171 | f"Failed to read Readability.js cache {cache_file_path}: {error_msg} (Code: {error_code}). Full response: {read_result}. Will attempt fetch." # Log full dict
2172 | )
2173 | if isinstance(read_result, dict) and read_result.get("success"):
2174 | content_list = read_result.get("content", [])
2175 | if isinstance(content_list, list) and content_list:
2176 | # Assuming single file content for this cache
2177 | file_content = content_list[0]
2178 | if isinstance(file_content, dict):
2179 | src = file_content.get("text")
2180 | if src:
2181 | logger.debug(
2182 | f"Readability.js loaded successfully from cache: {cache_file_path}"
2183 | )
2184 | else:
2185 | logger.warning(
2186 | f"Readability cache file {cache_file_path} content missing 'text'."
2187 | )
2188 | else:
2189 | logger.warning(
2190 | f"Readability cache file {cache_file_path} content format unexpected."
2191 | )
2192 | else:
2193 | logger.info(
2194 | f"Readability cache file {cache_file_path} exists but is empty or has no content list."
2195 | )
2196 | # Handle specific file not found error (or other read errors) from standalone tool
2197 | elif isinstance(read_result, dict) and not read_result.get("success"):
2198 | error_msg = read_result.get("error", "Unknown read error")
2199 | error_code = read_result.get("error_code", "")
2200 | if "does not exist" in error_msg.lower() or "PATH_NOT_FOUND" in error_code:
2201 | logger.info(
2202 | f"Readability.js cache file not found ({cache_file_path}). Will attempt fetch."
2203 | )
2204 | else:
2205 | logger.warning(
2206 | f"Failed to read Readability.js cache {cache_file_path}: {error_msg}. Will attempt fetch."
2207 | )
2208 | else: # Unexpected response format
2209 | logger.warning(
2210 | f"Unexpected response from read_file for {cache_file_path}. Will attempt fetch."
2211 | )
2212 |
2213 | except ToolError as e: # Catch explicit ToolError if raised by read_file internally
2214 | if "does not exist" in str(e).lower() or "PATH_NOT_FOUND" in getattr(
2215 | e, "error_code", ""
2216 | ):
2217 | logger.info(
2218 | f"Readability.js cache file not found ({cache_file_path}). Will attempt fetch."
2219 | )
2220 | else:
2221 | logger.warning(
2222 | f"ToolError reading Readability.js cache {cache_file_path}: {e}. Will attempt fetch."
2223 | )
2224 | except Exception as e:
2225 | # Catch any other unexpected errors during cache read
2226 | logger.warning(
2227 | f"Unexpected error reading Readability.js cache {cache_file_path}: {e}. Will attempt fetch.",
2228 | exc_info=True,
2229 | )
2230 |
2231 | # Fetch from CDN if not loaded from cache
2232 | if src is None:
2233 | logger.info("Fetching Readability.js from CDN...")
2234 | try:
2235 | async with httpx.AsyncClient() as client:
2236 | # Use a reliable CDN link
2237 | cdn_url = "https://cdnjs.cloudflare.com/ajax/libs/readability/0.5.0/Readability.js"
2238 | response = await client.get(cdn_url, timeout=15.0)
2239 | response.raise_for_status() # Raise exception for bad status codes
2240 | fetched_src = response.text
2241 | fetched_size = len(fetched_src)
2242 | await _log("readability_js_fetch", url=cdn_url, size=fetched_size)
2243 |
2244 | if fetched_src:
2245 | # Try writing to cache if path is set
2246 | if _READ_JS_CACHE:
2247 | try:
2248 | logger.debug(
2249 | f"Attempting to save fetched Readability.js to cache: {cache_file_path}"
2250 | )
2251 | # Use STANDALONE write_file tool
2252 | write_res = await write_file(
2253 | path=cache_file_path, content=fetched_src
2254 | ) # Pass string content
2255 |
2256 | if isinstance(write_res, dict) and write_res.get("success"):
2257 | logger.info(f"Saved fetched Readability.js to cache: {cache_file_path}")
2258 | else:
2259 | error_msg = (
2260 | write_res.get("error", "Unknown write error")
2261 | if isinstance(write_res, dict)
2262 | else "Invalid write_file response"
2263 | )
2264 | logger.warning(
2265 | f"Failed to write Readability.js cache ({cache_file_path}): {error_msg}"
2266 | )
2267 | except Exception as write_err:
2268 | # Log error but proceed with injection using fetched source
2269 | logger.warning(
2270 | f"Error writing Readability.js cache ({cache_file_path}): {write_err}"
2271 | )
2272 |
2273 | # Use the fetched source for injection
2274 | src = fetched_src
2275 | else:
2276 | logger.warning("Fetched empty content for Readability.js from CDN.")
2277 |
2278 | except httpx.HTTPStatusError as fetch_err:
2279 | logger.error(
2280 | f"HTTP error fetching Readability.js from {fetch_err.request.url}: {fetch_err.response.status_code}"
2281 | )
2282 | except httpx.RequestError as fetch_err:
2283 | logger.error(f"Network error fetching Readability.js: {fetch_err}")
2284 | except Exception as fetch_err:
2285 | logger.error(f"Failed to fetch/cache Readability.js: {fetch_err}", exc_info=True)
2286 |
2287 | # Inject the script if source code was successfully obtained (from cache or fetch)
2288 | if src:
2289 | # Wrap the source code to assign the Readability class to a window property
2290 | wrapped_src = f"window.__sbReadability = (() => {{ {src}; return Readability; }})();"
2291 | try:
2292 | await page.add_script_tag(content=wrapped_src)
2293 | logger.debug("Readability.js injected successfully.")
2294 | except PlaywrightException as e:
2295 | # Handle potential injection errors (e.g., Content Security Policy)
2296 | err_str = str(e)
2297 | if "Content Security Policy" in err_str:
2298 | page_url = page.url # Get URL for context
2299 | logger.warning(
2300 | f"Could not inject Readability.js due to Content Security Policy on {page_url}."
2301 | )
2302 | else:
2303 | logger.error(f"Failed to inject Readability.js script tag: {e}", exc_info=True)
2304 | except Exception as e:
2305 | logger.error(f"Unexpected error injecting Readability.js: {e}", exc_info=True)
2306 | else:
2307 | # Log if source couldn't be obtained
2308 | logger.warning("Failed to load or fetch Readability.js source. Proceeding without it.")
2309 |
2310 |
2311 | async def _dom_fingerprint(page: Page) -> str: # Uses global _dom_fp_limit_global
2312 | """Calculates a fingerprint of the page's visible text content."""
2313 | try:
2314 | # Evaluate JS to get the initial part of the body's innerText
2315 | js_expr = f"() => document.body.innerText.slice(0, {_dom_fp_limit_global})"
2316 | txt_content = await page.main_frame.evaluate(js_expr)
2317 |
2318 | # Ensure text is not None and strip whitespace
2319 | cleaned_txt = (txt_content or "").strip()
2320 |
2321 | # Encode the text to bytes (ignoring errors) and hash it
2322 | txt_bytes = cleaned_txt.encode("utf-8", "ignore")
2323 | hasher = hashlib.sha256(txt_bytes)
2324 | fingerprint = hasher.hexdigest()
2325 | return fingerprint
2326 |
2327 | except PlaywrightException as e:
2328 | # Log error if evaluation fails, return hash of empty string
2329 | logger.warning(f"Could not get text for DOM fingerprint: {e}")
2330 | empty_hash = hashlib.sha256(b"").hexdigest()
2331 | return empty_hash
2332 | except Exception as e:
2333 | # Catch unexpected errors
2334 | logger.error(f"Unexpected error calculating DOM fingerprint: {e}", exc_info=True)
2335 | empty_hash = hashlib.sha256(b"").hexdigest()
2336 | return empty_hash
2337 |
2338 |
2339 | def _shadow_deep_js() -> str: # Uses globals _max_widgets_global, _area_min_global
2340 | """JS function string to find elements, traversing shadow DOM."""
2341 | # This JS function is complex but self-contained. Keep as multi-line f-string.
2342 | # Relies on _max_widgets_global and _area_min_global from Python scope.
2343 | return f"""
2344 | (prefix) => {{
2345 | const MAX_ELEMENTS = {_max_widgets_global};
2346 | const MIN_ELEMENT_AREA = {_area_min_global};
2347 |
2348 | // --- Helper Functions ---
2349 | const isVisible = (el) => {{
2350 | if (!el || typeof el.getBoundingClientRect !== 'function') {{ return false; }}
2351 | try {{
2352 | // Check CSS visibility properties
2353 | const style = window.getComputedStyle(el);
2354 | if (style.display === 'none' || style.visibility === 'hidden' || parseFloat(style.opacity) === 0 || el.hidden) {{
2355 | return false;
2356 | }}
2357 | // Check if it has an offset parent (not detached or position:fixed parent hidden)
2358 | if (!el.offsetParent && style.position !== 'fixed') {{
2359 | return false;
2360 | }}
2361 |
2362 | // Check bounding box dimensions and position
2363 | const rect = el.getBoundingClientRect();
2364 | const hasPositiveSize = rect.width > 1 && rect.height > 1; // Needs some dimensions
2365 | const hasSufficientArea = (rect.width * rect.height) >= MIN_ELEMENT_AREA;
2366 |
2367 | // Check if it's within the viewport bounds (partially is sufficient)
2368 | const viewportHeight = window.innerHeight || document.documentElement.clientHeight;
2369 | const viewportWidth = window.innerWidth || document.documentElement.clientWidth;
2370 | const isInViewportVertically = rect.bottom > 0 && rect.top < viewportHeight;
2371 | const isInViewportHorizontally = rect.right > 0 && rect.left < viewportWidth;
2372 | const isOnscreen = isInViewportVertically && isInViewportHorizontally;
2373 |
2374 | // Combine checks: Must have size, be on screen, and either have min area or be a link/button.
2375 | return hasPositiveSize && isOnscreen && (hasSufficientArea || el.tagName === 'A' || el.tagName === 'BUTTON');
2376 | }} catch (e) {{
2377 | // Errors during checks mean we can't be sure, assume not visible
2378 | console.warn('Error in isVisible check:', e);
2379 | return false;
2380 | }}
2381 | }};
2382 |
2383 | const isInteractiveOrSignificant = (el) => {{
2384 | const tag = el.tagName.toLowerCase();
2385 | const role = (el.getAttribute('role') || '').toLowerCase();
2386 |
2387 | // Common interactive HTML tags
2388 | const interactiveTags = ['a', 'button', 'input', 'select', 'textarea', 'option', 'label', 'form', 'fieldset', 'details', 'summary', 'dialog', 'menu', 'menuitem'];
2389 | // Common interactive ARIA roles
2390 | const interactiveRoles = ['button', 'link', 'checkbox', 'radio', 'menuitem', 'tab', 'switch', 'option', 'searchbox', 'textbox', 'dialog', 'slider', 'spinbutton', 'combobox', 'listbox'];
2391 |
2392 | if (interactiveTags.includes(tag) || interactiveRoles.includes(role)) {{
2393 | return true;
2394 | }}
2395 |
2396 | // Check for explicit interaction handlers or attributes
2397 | if (el.onclick || el.href || el.getAttribute('tabindex') !== null || el.getAttribute('contenteditable') === 'true') {{
2398 | return true;
2399 | }}
2400 |
2401 | // Consider non-interactive containers with text content if they have sufficient area
2402 | if ((tag === 'div' || tag === 'section' || tag === 'article' || tag === 'main' || tag === 'span') && el.innerText && el.innerText.trim().length > 0) {{
2403 | try {{ const rect = el.getBoundingClientRect(); if (rect.width * rect.height >= MIN_ELEMENT_AREA) return true; }} catch(e) {{}}
2404 | }}
2405 |
2406 | // Consider images with alt text if they have sufficient area
2407 | if (tag === 'img' && el.alt && el.alt.trim().length > 0) {{
2408 | try {{ const rect = el.getBoundingClientRect(); if (rect.width * rect.height >= MIN_ELEMENT_AREA) return true; }} catch(e) {{}}
2409 | }}
2410 |
2411 | return false; // Default to not significant
2412 | }};
2413 |
2414 | const getElementText = (el) => {{
2415 | try {{
2416 | // Handle specific input types
2417 | if (el.tagName === 'INPUT') {{
2418 | const inputType = el.type.toLowerCase();
2419 | if (inputType === 'button' || inputType === 'submit' || inputType === 'reset') return el.value || '';
2420 | if (inputType === 'password') return 'Password input field'; // Don't expose value
2421 | // For other inputs, prioritize placeholder, then name, then type
2422 | return el.placeholder || el.name || el.getAttribute('aria-label') || inputType || '';
2423 | }}
2424 | if (el.tagName === 'TEXTAREA') {{
2425 | return el.placeholder || el.name || el.getAttribute('aria-label') || '';
2426 | }}
2427 | if (el.tagName === 'SELECT') {{
2428 | // Try associated label first
2429 | if (el.id) {{
2430 | const labels = document.querySelectorAll(`label[for="${{el.id}}"]`);
2431 | if (labels.length > 0 && labels[0].textContent) return labels[0].textContent.trim();
2432 | }}
2433 | return el.name || el.getAttribute('aria-label') || '';
2434 | }}
2435 | if (el.tagName === 'IMG') {{
2436 | return el.alt || ''; // Use alt text for images
2437 | }}
2438 | // Prefer aria-label if present
2439 | const ariaLabel = el.getAttribute('aria-label');
2440 | if (ariaLabel) return ariaLabel.trim();
2441 |
2442 | // Look for associated label via `for` attribute (if not already handled for select)
2443 | if (el.id && el.tagName !== 'SELECT') {{
2444 | const labels = document.querySelectorAll(`label[for="${{el.id}}"]`);
2445 | if (labels.length > 0 && labels[0].textContent) return labels[0].textContent.trim();
2446 | }}
2447 |
2448 | // Fallback to combined text content of direct children text nodes
2449 | let textContent = '';
2450 | for (const node of el.childNodes) {{
2451 | // Only include direct text node children
2452 | if (node.nodeType === Node.TEXT_NODE) {{
2453 | textContent += node.textContent;
2454 | }}
2455 | }}
2456 | textContent = textContent.trim();
2457 |
2458 | // If text node content is empty, fallback to innerText (which includes descendants)
2459 | if (!textContent) {{
2460 | textContent = el.innerText ? el.innerText.trim() : '';
2461 | }}
2462 |
2463 | // Limit text length? Maybe not here, handle later.
2464 | return textContent;
2465 |
2466 | }} catch (e) {{
2467 | console.warn('Error in getElementText:', e);
2468 | return ''; // Return empty string on error
2469 | }}
2470 | }};
2471 |
2472 | // --- Traversal Logic ---
2473 | const outputElements = [];
2474 | const queue = [document.documentElement]; // Start traversal from root
2475 | const visited = new Set(); // Keep track of visited nodes
2476 | let elementIndex = 0; // Counter for unique element IDs
2477 |
2478 | while (queue.length > 0 && outputElements.length < MAX_ELEMENTS) {{
2479 | const node = queue.shift(); // Get next node from queue
2480 |
2481 | if (!node || visited.has(node)) {{
2482 | continue; // Skip if node is null or already visited
2483 | }}
2484 | visited.add(node);
2485 |
2486 | // Process the node if it's interactive/significant and visible
2487 | if (isInteractiveOrSignificant(node) && isVisible(node)) {{
2488 | try {{
2489 | const rect = node.getBoundingClientRect();
2490 | // Assign a unique ID for referencing later
2491 | const elementId = `${{prefix || ''}}el_${{elementIndex++}}`;
2492 | node.dataset.sbId = elementId; // Store ID on the element itself
2493 |
2494 | // Collect element information
2495 | outputElements.push({{
2496 | id: elementId,
2497 | tag: node.tagName.toLowerCase(),
2498 | role: node.getAttribute("role") || "", // Get ARIA role
2499 | text: getElementText(node), // Get representative text
2500 | bbox: [ // Bounding box coordinates
2501 | Math.round(rect.x),
2502 | Math.round(rect.y),
2503 | Math.round(rect.width),
2504 | Math.round(rect.height)
2505 | ]
2506 | }});
2507 | }} catch (e) {{
2508 | console.warn('Error processing element:', node, e);
2509 | }}
2510 | }}
2511 |
2512 | // --- Queue Children for Traversal ---
2513 | // Check for Shadow DOM children first
2514 | if (node.shadowRoot) {{
2515 | const shadowChildren = node.shadowRoot.children;
2516 | if (shadowChildren) {{
2517 | for (let i = 0; i < shadowChildren.length; i++) {{
2518 | if (!visited.has(shadowChildren[i])) {{
2519 | queue.push(shadowChildren[i]);
2520 | }}
2521 | }}
2522 | }}
2523 | }}
2524 | // Check for regular children
2525 | else if (node.children) {{
2526 | const children = node.children;
2527 | for (let i = 0; i < children.length; i++) {{
2528 | if (!visited.has(children[i])) {{
2529 | queue.push(children[i]);
2530 | }}
2531 | }}
2532 | }}
2533 |
2534 | // Check for IFRAME content document
2535 | if (node.tagName === 'IFRAME') {{
2536 | try {{
2537 | // Access contentDocument carefully due to potential cross-origin restrictions
2538 | if (node.contentDocument && node.contentDocument.documentElement) {{
2539 | if (!visited.has(node.contentDocument.documentElement)) {{
2540 | queue.push(node.contentDocument.documentElement);
2541 | }}
2542 | }}
2543 | }} catch (iframeError) {{
2544 | console.warn('Could not access iframe content:', node.src || '[no src]', iframeError.message);
2545 | }}
2546 | }}
2547 | }} // End while loop
2548 |
2549 | return outputElements; // Return the collected element data
2550 | }}
2551 | """
2552 |
2553 |
2554 | async def _build_page_map(
2555 | page: Page,
2556 | ) -> Tuple[
2557 | Dict[str, Any], str
2558 | ]: # Uses globals _max_section_chars_global, _max_widgets_global, _log
2559 | """Builds a structured representation (map) of the current page content and elements."""
2560 | # Calculate fingerprint first to check cache
2561 | fp = await _dom_fingerprint(page)
2562 |
2563 | # Check if cached map exists on the page object for the current fingerprint
2564 | if hasattr(page, "_sb_page_map") and hasattr(page, "_sb_fp") and page._sb_fp == fp:
2565 | logger.debug(f"Using cached page map for {page.url} (FP: {fp[:8]}...).")
2566 | cached_map = page._sb_page_map
2567 | return cached_map, fp
2568 |
2569 | logger.debug(f"Building new page map for {page.url} (FP: {fp[:8]}...).")
2570 | # Initialize map components
2571 | await _ensure_readability(page) # Ensure Readability.js is available
2572 | main_txt = ""
2573 | elems: List[Dict[str, Any]] = []
2574 | page_title = "[Error Getting Title]"
2575 |
2576 | try:
2577 | # 1. Extract Main Text Content
2578 | html_content = await page.content()
2579 | if html_content:
2580 | # Try Readability first
2581 | extracted_text = await page.evaluate(_READ_JS_WRAPPER, html_content)
2582 | main_txt = extracted_text or ""
2583 |
2584 | # Fallback if Readability yields short content
2585 | if len(main_txt) < 200:
2586 | logger.debug("Readability text short (<200 chars), trying basic text extraction.")
2587 |
2588 | # Define the synchronous extraction helper locally
2589 | def extract_basic_text(html_str):
2590 | try:
2591 | # Limit HTML size processed by BeautifulSoup
2592 | max_html_size = 3 * 1024 * 1024
2593 | limited_html = html_str[:max_html_size]
2594 | soup = BeautifulSoup(limited_html, "lxml")
2595 | # Remove common non-content tags before text extraction
2596 | tags_to_remove = [
2597 | "script",
2598 | "style",
2599 | "nav",
2600 | "footer",
2601 | "header",
2602 | "aside",
2603 | "form",
2604 | "figure",
2605 | ]
2606 | found_tags = soup(tags_to_remove)
2607 | for tag in found_tags:
2608 | tag.decompose()
2609 | # Get text, join with spaces, strip extra whitespace
2610 | basic_text = soup.get_text(" ", strip=True)
2611 | return basic_text
2612 | except Exception as bs_err:
2613 | logger.warning(f"Basic text extraction with BeautifulSoup failed: {bs_err}")
2614 | return "" # Return empty on error
2615 |
2616 | # Run the sync extraction in the thread pool
2617 | loop = asyncio.get_running_loop()
2618 | pool = _get_pool()
2619 | fallback_text = await loop.run_in_executor(pool, extract_basic_text, html_content)
2620 | main_txt = fallback_text # Use fallback result
2621 |
2622 | # Limit the length of the extracted main text
2623 | main_txt = main_txt[:_max_section_chars_global]
2624 | else:
2625 | logger.warning(f"Failed to get HTML content for page map on {page.url}.")
2626 |
2627 | # 2. Extract Interactive Elements (across all frames)
2628 | js_func = _shadow_deep_js() # Get the JS function string
2629 | all_extracted_elems = []
2630 | all_frames = page.frames
2631 | for i, frame in enumerate(all_frames):
2632 | if frame.is_detached():
2633 | logger.debug(f"Skipping detached frame {i}.")
2634 | continue
2635 | frame_url_short = (frame.url or "unknown")[:80]
2636 | try:
2637 | # Evaluate element extraction JS in the frame with timeout
2638 | frame_prefix = f"f{i}:" # Prefix IDs with frame index
2639 | frame_elems = await asyncio.wait_for(
2640 | frame.evaluate(js_func, frame_prefix), timeout=5.0
2641 | )
2642 | all_extracted_elems.extend(frame_elems)
2643 | # Log extraction count per frame *only if* elements were found
2644 | if frame_elems:
2645 | logger.debug(
2646 | f"Extracted {len(frame_elems)} elements from frame {i} ({frame_url_short})."
2647 | )
2648 | except PlaywrightTimeoutError:
2649 | logger.warning(f"Timeout evaluating elements in frame {i} ({frame_url_short})")
2650 | except PlaywrightException as e:
2651 | # Be more specific about error logging - avoid logging full exception in normal operation unless debug level
2652 | logger.warning(
2653 | f"Playwright error evaluating elements in frame {i} ({frame_url_short}): {type(e).__name__}"
2654 | )
2655 | logger.debug(
2656 | f"Full PlaywrightException in frame {i}: {e}", exc_info=False
2657 | ) # Log full exception only at debug
2658 | except Exception as e:
2659 | logger.error(
2660 | f"Unexpected error evaluating elements in frame {i} ({frame_url_short}): {e}",
2661 | exc_info=True, # Log full traceback for unexpected errors
2662 | )
2663 |
2664 | # Limit the total number of elements stored
2665 | elems = all_extracted_elems[:_max_widgets_global]
2666 | logger.debug(
2667 | f"Total elements extracted: {len(all_extracted_elems)}, stored (limited): {len(elems)}"
2668 | ) # Log total and limited count
2669 |
2670 | # 3. Get Page Title
2671 | try:
2672 | page_title_raw = await page.title()
2673 | page_title = page_title_raw.strip() if page_title_raw else "[No Title]"
2674 | except PlaywrightException as title_err:
2675 | logger.warning(f"Could not get page title for {page.url}: {title_err}")
2676 | # Keep default error title
2677 |
2678 | except PlaywrightException as e:
2679 | logger.error(
2680 | f"Could not build page map for {page.url}: Playwright error: {e}", exc_info=True
2681 | )
2682 | except Exception as e:
2683 | logger.error(f"Unexpected error building page map for {page.url}: {e}", exc_info=True)
2684 |
2685 | # Removed the specific logging block that depended on URL_BOOKSTORE
2686 |
2687 | # Assemble the final page map dictionary
2688 | page_map = {
2689 | "url": page.url,
2690 | "title": page_title,
2691 | "main_text": main_txt,
2692 | "elements": elems, # Contains the limited list of elements
2693 | }
2694 |
2695 | # Cache the newly built map and its fingerprint on the page object
2696 | page._sb_page_map = page_map
2697 | page._sb_fp = fp
2698 | logger.debug(f"Page map built and cached for {page.url}.")
2699 |
2700 | return page_map, fp
2701 |
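# A minimal sketch of the page-map structure assembled above, shown with invented
# values. Each "elements" entry carries the data-sb-id assigned during extraction
# plus a tag/role/text summary; frame-scoped IDs use an "f<index>:" prefix.
_EXAMPLE_PAGE_MAP: Dict[str, Any] = {
    "url": "https://example.com/",
    "title": "Example Domain",
    "main_text": "Example Domain. This domain is for use in illustrative examples.",
    "elements": [
        {"id": "f0:el_0", "tag": "a", "role": "link", "text": "More information"},
        {"id": "f1:el_3", "tag": "button", "role": "button", "text": "Accept"},
    ],
}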
2702 |
2703 | _SM_GLOBAL = difflib.SequenceMatcher(autojunk=False)
2704 |
2705 |
2706 | def _ratio(a: str, b: str) -> float: # Keep as is
2707 | """Calculate similarity ratio between two strings using SequenceMatcher."""
2708 | if not a or not b:
2709 | return 0.0
2710 | # Set sequences for the global matcher instance
2711 | _SM_GLOBAL.set_seqs(a, b)
2712 | # Calculate and return the ratio
2713 | similarity_ratio = _SM_GLOBAL.ratio()
2714 | return similarity_ratio
2715 |
2716 |
2717 | def _heuristic_pick(
2718 | pm: Dict[str, Any], hint: str, role: Optional[str]
2719 | ) -> Optional[str]: # Uses global _seq_cutoff_global
2720 | """Finds the best element ID based on text similarity and heuristics."""
2721 | # Basic validation
2722 | if not hint or not pm or not pm.get("elements"):
2723 | return None
2724 |
2725 | # Normalize hint text (Unicode normalization and lowercase)
2726 | h_norm = unicodedata.normalize("NFC", hint).lower()
2727 | best_id: Optional[str] = None
2728 | best_score: float = -1.0
2729 | target_role_lower = role.lower() if role else None
2730 |
2731 | elements_list = pm.get("elements", [])
2732 | for e in elements_list:
2733 | if not e or not isinstance(e, dict):
2734 | continue # Skip invalid element entries
2735 |
2736 | el_id = e.get("id")
2737 | el_text_raw = e.get("text", "")
2738 | el_role_raw = e.get("role", "")
2739 | el_tag_raw = e.get("tag", "")
2740 |
2741 | if not el_id:
2742 | continue # Skip elements without our assigned ID
2743 |
2744 | # Normalize element text
2745 | el_text_norm = unicodedata.normalize("NFC", el_text_raw).lower()
2746 | el_role_lower = el_role_raw.lower()
2747 | el_tag_lower = el_tag_raw.lower()
2748 |
2749 | # Role filtering (if role specified)
2750 | # Allow matching button tag if role is button
2751 | is_role_match = target_role_lower == el_role_lower
2752 | is_button_match = target_role_lower == "button" and el_tag_lower == "button"
2753 | if target_role_lower and not is_role_match and not is_button_match:
2754 | continue # Skip if role doesn't match
2755 |
2756 | # --- Calculate Score ---
2757 | # Base score: Text similarity
2758 | score = _ratio(h_norm, el_text_norm)
2759 |
2760 | # Bonus: Exact role match
2761 | if target_role_lower and is_role_match:
2762 | score += 0.1
2763 |
2764 | # Bonus: Keyword matching (e.g., hint mentions "button" and element is button/role=button)
2765 | hint_keywords = {
2766 | "button",
2767 | "submit",
2768 | "link",
2769 | "input",
2770 | "download",
2771 | "checkbox",
2772 | "radio",
2773 | "tab",
2774 | "menu",
2775 | }
2776 | element_keywords = {el_role_lower, el_tag_lower}
2777 | # Find hint keywords present in the hint text itself
2778 | hint_words_in_hint = set()
2779 | split_hint = h_norm.split()
2780 | for w in split_hint:
2781 | if w in hint_keywords:
2782 | hint_words_in_hint.add(w)
2783 | # Check for intersection between keywords in hint and element's keywords
2784 | common_keywords = hint_words_in_hint.intersection(element_keywords)
2785 | if common_keywords:
2786 | score += 0.15
2787 |
2788 | # Bonus: Hint likely refers to label/placeholder and element seems related
2789 | has_label_hints = "label for" in h_norm or "placeholder" in h_norm
2790 | if has_label_hints and score > 0.6: # Apply only if base similarity is decent
2791 | score += 0.1
2792 |
2793 | # Penalty: Very short element text compared to a long hint
2794 | is_short_text = len(el_text_raw) < 5
2795 | is_long_hint = len(hint) > 10
2796 | if is_short_text and is_long_hint:
2797 | score -= 0.1
2798 |
2799 | # Penalty: Generic container tags without a specific role
2800 | is_generic_container = el_tag_lower in ("div", "span")
2801 | has_no_role = not el_role_lower
2802 | if is_generic_container and has_no_role:
2803 | score -= 0.05
2804 | # --- End Score Calculation ---
2805 |
2806 | # Update best match if current score is higher
2807 | if score > best_score:
2808 | best_id = el_id
2809 | best_score = score
2810 |
2811 | # Return the best ID found if the score meets the cutoff threshold
2812 | if best_score >= _seq_cutoff_global:
2813 | return best_id
2814 | else:
2815 | return None
2816 |
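# A minimal sketch of how _heuristic_pick scores an invented page map. The element
# data and hint below are examples only; whether an ID is actually returned depends
# on the configured _seq_cutoff_global threshold.
def _example_heuristic_pick() -> Optional[str]:
    example_map = {
        "elements": [
            {"id": "el_1", "tag": "a", "role": "link", "text": "Home"},
            {"id": "el_2", "tag": "button", "role": "button", "text": "Submit order"},
        ]
    }
    # "el_2" gets the role-match and keyword bonuses on top of its text similarity,
    # so it is the expected pick for this hint if its score clears the cutoff.
    return _heuristic_pick(example_map, "click the submit order button", "button")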
2817 |
2818 | async def _llm_pick(
2819 | pm: Dict[str, Any], task_hint: str, attempt: int
2820 | ) -> Optional[str]: # Uses global _llm_model_locator_global
2821 | """Asks the LLM to pick the best element ID for a given task hint."""
2822 | if not pm or not task_hint:
2823 | logger.warning("LLM pick skipped: Missing page map or task hint.")
2824 | return None
2825 |
2826 | # Prepare summary of elements for the LLM prompt
2827 | elements_summary = []
2828 | elements_list = pm.get("elements", [])
2829 | for el in elements_list:
2830 | el_id = el.get("id")
2831 | el_tag = el.get("tag")
2832 | el_role = el.get("role", " ") # Use space if empty for formatting
2833 | el_text = el.get("text", " ") # Use space if empty
2834 | # Truncate long text for the prompt
2835 | max_text_len = 80
2836 | truncated_text = el_text[:max_text_len] + ("..." if len(el_text) > max_text_len else "")
2837 | # Format summary string
2838 | summary_str = f"id={el_id} tag={el_tag} role='{el_role}' text='{truncated_text}'"
2839 | elements_summary.append(summary_str)
2840 |
2841 | # System prompt defining the task
2842 | system_prompt = textwrap.dedent("""
2843 | You are an expert web automation assistant. Your task is to identify the single best HTML element ID from the provided list that corresponds to the user's request.
2844 | Analyze the user's task hint and the list of elements (with their ID, tag, role, and text).
2845 | Choose the element ID (e.g., "el_12" or "f0:el_5") that is the most likely target for the user's action.
2846 | Consider the element's text, role, tag, and the user's likely intent.
2847 | If multiple elements seem possible, prioritize elements with clear interactive roles (button, link, input, etc.) or specific text matches.
2848 | If no element is a clear match for the task hint, respond with `{"id": null}`.
2849 | Respond ONLY with a JSON object containing the chosen element ID under the key "id". Example: `{"id": "el_42"}` or `{"id": "f1:el_10"}` or `{"id": null}`. Do NOT include explanations or markdown formatting.
2850 | """).strip()
2851 |
2852 | # User prompt containing the context and request
2853 | elements_str = "\n".join(elements_summary)
2854 | user_prompt = textwrap.dedent(f"""
2855 | Page Title: {pm.get("title", "[No Title]")}
2856 | Page URL: {pm.get("url", "[No URL]")}
2857 |
2858 | Available Elements:
2859 | {elements_str}
2860 |
2861 | User Task Hint: "{task_hint}"
2862 | Attempt Number: {attempt}
2863 |
2864 | Based on the task hint and element list, which element ID should be targeted?
2865 | Respond ONLY with a JSON object containing the 'id' (string or null).
2866 | """).strip()
2867 |
2868 | # Prepare messages for the LLM call
2869 | msgs = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
2870 |
2871 | # Call the LLM, expecting a JSON response
2872 | res = await _call_llm(
2873 | msgs,
2874 | model=_llm_model_locator_global, # Use configured model
2875 | expect_json=True,
2876 | temperature=0.0, # Low temperature for deterministic selection
2877 | max_tokens=100, # Should be enough for {"id": "fX:el_YYY"}
2878 | )
2879 |
2880 | # Process the LLM response
2881 | if isinstance(res, dict):
2882 | if "id" in res:
2883 | el_id = res.get("id")
2884 | # Validate the format of the returned ID (string starting with el_ or f*:el_, or null)
2885 | is_valid_null = el_id is None
2886 | is_valid_string_format = isinstance(el_id, str) and re.match(
2887 | r"^(?:f\d+:)?el_\d+$", el_id
2888 | )
2889 | if is_valid_null or is_valid_string_format:
2890 | if el_id:
2891 | logger.debug(
2892 | f"LLM picked ID: {el_id} for hint '{task_hint}' (Attempt {attempt})"
2893 | )
2894 | else:
2895 | logger.debug(
2896 | f"LLM explicitly picked null ID for hint '{task_hint}' (Attempt {attempt})"
2897 | )
2898 | return el_id
2899 | else:
2900 | # Log warning if ID format is invalid
2901 | logger.warning(
2902 | f"LLM returned invalid ID format: {el_id} for hint '{task_hint}' (Attempt {attempt})"
2903 | )
2904 | return None # Treat invalid format as no pick
2905 | elif "error" in res:
2906 | # Log error if LLM call failed
2907 | error_msg = res["error"]
2908 | logger.warning(
2909 | f"LLM picker failed for hint '{task_hint}' (Attempt {attempt}): {error_msg}"
2910 | )
2911 | return None # Treat LLM error as no pick
2912 | else:
2913 | # Log warning if response dictionary format is unexpected
2914 | logger.warning(
2915 | f"LLM picker returned unexpected dict format: {res.keys()} for hint '{task_hint}' (Attempt {attempt})"
2916 | )
2917 | return None # Treat unexpected format as no pick
2918 | else:
2919 | # Log warning if the response is not a dictionary
2920 | res_type = type(res).__name__
2921 | logger.warning(
2922 | f"LLM picker returned unexpected response type: {res_type} for hint '{task_hint}' (Attempt {attempt})"
2923 | )
2924 | return None # Treat unexpected type as no pick
2925 |
2926 |
2927 | async def _loc_from_id(page: Page, el_id: str) -> Locator: # Keep as is
2928 | """Gets a Playwright Locator object from a data-sb-id attribute."""
2929 | if not el_id:
2930 | raise ValueError("Element ID cannot be empty when creating locator.")
2931 |
2932 | # Escape the ID for use in CSS selector (esp. if ID contains quotes or backslashes)
2933 | # Double backslashes for Python string literal, then double again for CSS escaping
2934 | escaped_id_inner = el_id.replace("\\", "\\\\").replace('"', '\\"')
2935 | selector = f'[data-sb-id="{escaped_id_inner}"]'
2936 |
2937 | # Check if the ID indicates a specific frame (e.g., "f0:el_12")
2938 | if ":" in el_id and el_id.startswith("f"):
2939 | try:
2940 | frame_prefix, element_part = el_id.split(":", 1)
2941 | frame_index_str = frame_prefix[1:] # Get the number part after 'f'
2942 | frame_index = int(frame_index_str)
2943 | all_frames = page.frames
2944 | if 0 <= frame_index < len(all_frames):
2945 | target_frame = all_frames[frame_index]
2946 | # Return the locator within the specified frame
2947 | locator_in_frame = target_frame.locator(selector).first
2948 | return locator_in_frame
2949 | else:
2950 | # Log warning if frame index is out of bounds, fallback to main frame
2951 | logger.warning(
2952 | f"Frame index {frame_index} from ID '{el_id}' is out of bounds (0-{len(all_frames) - 1}). Falling back to main frame search."
2953 | )
2954 | except (ValueError, IndexError) as e:
2955 | # Log warning if parsing fails, fallback to main frame
2956 | logger.warning(
2957 | f"Could not parse frame index from ID '{el_id}'. Falling back to main frame search. Error: {e}"
2958 | )
2959 |
2960 | # Default: return locator in the main frame
2961 | locator_in_main = page.locator(selector).first
2962 | return locator_in_main
2963 |
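# A minimal sketch of how frame-scoped IDs resolve. "f2:el_7" targets the element
# tagged data-sb-id="f2:el_7" inside page.frames[2]; a plain "el_3" is looked up in
# the main frame. The IDs here are invented examples.
async def _example_loc_from_id(page: Page) -> None:
    main_frame_locator = await _loc_from_id(page, "el_3")
    iframe_locator = await _loc_from_id(page, "f2:el_7")
    # Both are ordinary Playwright Locators supporting the usual actions.
    await main_frame_locator.hover()
    await iframe_locator.click()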
2964 |
2965 | # --- Enhanced Locator (as a helper class, not a tool itself) ---
2966 | class EnhancedLocator: # Keep class, but it's used INTERNALLY by standalone functions
2967 | """Unified locator using cache, heuristics, and LLM fallback."""
2968 |
2969 | def __init__(self, page: Page):
2970 | self.page = page
2971 | # Determine site identifier from URL for caching
2972 | self.site = "unknown"
2973 | try:
2974 | page_url = page.url or "" # Handle case where URL might be None/empty
2975 | parsed = urlparse(page_url)
2976 | netloc_raw = parsed.netloc.lower()
2977 | # Remove www. prefix if present
2978 | netloc_clean = netloc_raw.replace("www.", "")
2979 | # Use cleaned netloc, fallback to 'unknown' if empty
2980 | self.site = netloc_clean or "unknown"
2981 | except Exception as e:
2982 | logger.warning(f"Error parsing site from URL '{page.url}' for EnhancedLocator: {e}")
2983 | # Keep self.site as "unknown"
2984 | pass
2985 | # Internal cache for page map and fingerprint for current instance lifecycle
2986 | self._pm: Optional[Dict[str, Any]] = None
2987 | self._pm_fp: Optional[str] = None
2988 | # Timestamp for throttling network idle checks
2989 | self._last_idle_check: float = 0.0
2990 |
2991 |     async def _maybe_wait_for_idle(self, timeout: float = 1.5):  # Throttled via self._last_idle_check
2992 | """Waits for network idle state, throttled to avoid excessive waits."""
2993 | now = time.monotonic()
2994 | time_since_last_check = now - self._last_idle_check
2995 | # Only check if enough time has passed since the last check
2996 | if time_since_last_check > 1.0: # Check at most once per second
2997 | try:
2998 | # Wait for network to be idle for a short period
2999 | timeout_ms = int(timeout * 1000)
3000 | await self.page.wait_for_load_state("networkidle", timeout=timeout_ms)
3001 | self._last_idle_check = time.monotonic() # Update timestamp on success
3002 | except PlaywrightException:
3003 | # Ignore timeout or other errors, just update timestamp
3004 | self._last_idle_check = time.monotonic()
3005 |
3006 | async def _get_page_map(self) -> Tuple[Dict[str, Any], str]: # Calls global _build_page_map
3007 | """Gets the current page map, potentially building it if needed."""
3008 | # Short wait/idle check before building map to allow dynamic content to settle
3009 | await self._maybe_wait_for_idle()
3010 | sleep_duration = random.uniform(0.1, 0.25) # Small random delay
3011 | await asyncio.sleep(sleep_duration)
3012 |
3013 | # Build the page map (which includes fingerprint check internally)
3014 | pm, fp = await _build_page_map(self.page)
3015 |
3016 | # Store locally for potential reuse within this instance lifecycle
3017 | self._pm = pm
3018 | self._pm_fp = fp
3019 | return pm, fp
3020 |
3021 | async def _selector_cached(
3022 | self, key: str, fp: str
3023 | ) -> Optional[Locator]: # Calls global _cache_get_sync, _log
3024 | """Checks cache for a selector, validates it, and returns Locator if valid."""
3025 | loop = asyncio.get_running_loop()
3026 | pool = _get_pool()
3027 | # Perform synchronous cache read in thread pool
3028 | sel = await loop.run_in_executor(pool, _cache_get_sync, key, fp)
3029 |
3030 | if sel:
3031 | logger.debug(f"Cache hit for key prefix {key[:8]}. Selector: '{sel}'")
3032 | try:
3033 | # Extract element ID from selector string like '[data-sb-id="f0:el_12"]'
3034 | match = re.search(r'data-sb-id="([^"]+)"', sel)
3035 | if not match:
3036 | logger.warning(
3037 | f"Cached selector '{sel}' has unexpected format. Ignoring cache."
3038 | )
3039 | return None
3040 | loc_id = match.group(1)
3041 |
3042 | # Get the Playwright Locator object using the ID
3043 | loc = await _loc_from_id(self.page, loc_id)
3044 |
3045 | # Quick check if the element is visible (short timeout)
3046 | await loc.wait_for(state="visible", timeout=500) # 500ms check
3047 |
3048 | # Log cache hit and return the valid locator
3049 | log_key = key[:8]
3050 | await _log("locator_cache_hit", selector=sel, key=log_key)
3051 | return loc
3052 | except (PlaywrightException, ValueError) as e:
3053 | # Log if cached selector is no longer valid/visible or ID parsing fails
3054 | logger.debug(
3055 | f"Cached selector '{sel}' failed visibility/location check. Error: {e}"
3056 | )
3057 | # Consider deleting the stale cache entry here?
3058 | # await loop.run_in_executor(pool, _cache_delete_sync, key) # Potentially aggressive
3059 | return None # Cache miss or invalid cached selector
3060 |
3061 | async def locate(
3062 | self, task_hint: str, *, role: Optional[str] = None, timeout: int = 5000
3063 | ) -> (
3064 | Locator
3065 | ): # Uses globals _retry_after_fail_global, _log, _get_pool, _cache_put_sync, _llm_pick
3066 | """
3067 | Finds the best Locator for a task hint using cache, heuristics, LLM, and smarter fallbacks.
3068 |
3069 | Args:
3070 | task_hint: Natural language description of the element to locate.
3071 | role: Optional specific ARIA role to filter potential matches.
3072 | timeout: Maximum time in milliseconds to find the element.
3073 |
3074 | Returns:
3075 | A Playwright Locator object pointing to the best match found.
3076 |
3077 | Raises:
3078 | ValueError: If task_hint is empty.
3079 | PlaywrightTimeoutError: If no suitable element is found within the timeout across all methods.
3080 | ToolError: For internal errors during location.
3081 | """
3082 | if not task_hint or not task_hint.strip():
3083 | raise ValueError("locate requires a non-empty 'task_hint'")
3084 |
3085 | start_time = time.monotonic()
3086 | timeout_sec = timeout / 1000.0
3087 | loop = asyncio.get_running_loop()
3088 | pool = _get_pool()
3089 |
3090 | # --- 1. Generate Cache Key ---
3091 | page_url = self.page.url or ""
3092 | parsed_url = urlparse(page_url)
3093 | path = parsed_url.path or "/"
3094 | # Normalize hint and role for cache key stability
3095 | normalized_hint = unicodedata.normalize("NFC", task_hint).lower().strip()
3096 | normalized_role = role.lower().strip() if role else None
3097 | key_data = {
3098 | "site": self.site,
3099 | "path": path,
3100 | "hint": normalized_hint,
3101 | "role": normalized_role,
3102 | }
3103 | key_src = json.dumps(key_data, sort_keys=True)
3104 | key_src_bytes = key_src.encode("utf-8")
3105 | cache_key = hashlib.sha256(key_src_bytes).hexdigest()
3106 | key_preview = cache_key[:8]
3107 | log_prefix = (
3108 | f"EnhancedLocator(key={key_preview}, hint='{task_hint[:50]}...', role='{role}')"
3109 | )
3110 | logger.debug(f"{log_prefix}: Initiating locate.")
3111 |
3112 | # --- 2. Check Cache with Current DOM Fingerprint ---
3113 | logger.debug(f"{log_prefix}: Checking cache...")
3114 | current_dom_fp = await _dom_fingerprint(self.page)
3115 | logger.debug(f"{log_prefix}: Current DOM FP: {current_dom_fp[:12]}...")
3116 | try:
3117 | cached_loc = await self._selector_cached(cache_key, current_dom_fp)
3118 | if cached_loc:
3119 | logger.info(f"{log_prefix}: Cache HIT.")
3120 | await _log(
3121 | "locator_success", hint=task_hint, role=role, method="cache", key=key_preview
3122 | )
3123 | return cached_loc
3124 | else:
3125 | logger.debug(f"{log_prefix}: Cache MISS.")
3126 | except Exception as cache_err:
3127 | logger.warning(f"{log_prefix}: Error checking cache: {cache_err}")
3128 |
3129 | # --- 3. Cache Miss: Get Page Map and Try Heuristics ---
3130 | logger.debug(f"{log_prefix}: Trying heuristics...")
3131 | try:
3132 | (
3133 | pm,
3134 | current_dom_fp,
3135 | ) = await self._get_page_map() # Get map (updates fingerprint if changed)
3136 | map_keys = list(pm.keys()) if pm else []
3137 | num_elements = len(pm.get("elements", [])) if pm else 0
3138 | logger.debug(
3139 | f"{log_prefix}: Page map obtained. FP={current_dom_fp[:8]}, Keys={map_keys}, Elements={num_elements}"
3140 | )
3141 |
3142 | heuristic_id = _heuristic_pick(pm, task_hint, role)
3143 | logger.debug(f"{log_prefix}: Heuristic pick result ID: '{heuristic_id}'")
3144 |
3145 | if heuristic_id:
3146 | try:
3147 | logger.debug(f"{log_prefix}: Validating heuristic pick ID '{heuristic_id}'...")
3148 | loc = await _loc_from_id(self.page, heuristic_id)
3149 | await loc.scroll_into_view_if_needed(timeout=2000)
3150 | wait_timeout_heur = max(1000, timeout // 3) # Use portion of timeout
3151 | logger.debug(
3152 | f"{log_prefix}: Waiting for heuristic element visibility ({wait_timeout_heur}ms)..."
3153 | )
3154 | await loc.wait_for(state="visible", timeout=wait_timeout_heur)
3155 | logger.info(f"{log_prefix}: Heuristic pick VALIDATED (ID: {heuristic_id}).")
3156 |
3157 | # Cache the successful heuristic result
3158 | selector_str = f'[data-sb-id="{heuristic_id}"]'
3159 | await loop.run_in_executor(
3160 | pool, _cache_put_sync, cache_key, selector_str, current_dom_fp
3161 | )
3162 | await _log(
3163 | "locator_heuristic_match", selector=heuristic_id, hint=task_hint, role=role
3164 | )
3165 | await _log(
3166 | "locator_success",
3167 | hint=task_hint,
3168 | role=role,
3169 | method="heuristic",
3170 | selector=heuristic_id,
3171 | )
3172 | return loc
3173 | except (PlaywrightException, ValueError) as e_heur_val:
3174 | logger.debug(
3175 | f"{log_prefix}: Heuristic pick '{heuristic_id}' validation FAILED. Error: {e_heur_val}"
3176 | )
3177 | # Continue to LLM fallback
3178 | except Exception as map_heur_err:
3179 | logger.warning(
3180 | f"{log_prefix}: Error during page map or heuristic processing: {map_heur_err}"
3181 | )
3182 | # Ensure pm is defined for LLM step, even if empty
3183 | pm = pm if "pm" in locals() else {}
3184 | current_dom_fp = (
3185 | current_dom_fp
3186 | if "current_dom_fp" in locals()
3187 | else await _dom_fingerprint(self.page)
3188 | )
3189 |
3190 | # --- 4. Heuristic Failed: Try LLM Picker (with retries) ---
3191 | logger.debug(f"{log_prefix}: Trying LLM picker...")
3192 | num_llm_attempts = 1 + _retry_after_fail_global
3193 | for att in range(1, num_llm_attempts + 1):
3194 | elapsed_sec = time.monotonic() - start_time
3195 | if elapsed_sec >= timeout_sec:
3196 | logger.warning(f"{log_prefix}: Timeout reached before completing LLM attempts.")
3197 | break # Break loop, proceed to fallback
3198 |
3199 | logger.debug(f"{log_prefix}: LLM pick attempt {att}/{num_llm_attempts}...")
3200 | # Ensure page map 'pm' is available from heuristic step or refreshed
3201 | if not pm or (
3202 | "error" in pm and att > 1
3203 | ): # Refresh if map invalid or after first attempt
3204 | logger.debug(f"{log_prefix}: Refreshing page map before LLM attempt {att}...")
3205 | try:
3206 | pm, current_dom_fp = await self._get_page_map()
3207 | logger.debug(f"{log_prefix}: Page map refreshed. FP={current_dom_fp[:8]}.")
3208 | except Exception as map_refresh_err:
3209 | logger.warning(
3210 | f"{log_prefix}: Failed to refresh page map for LLM attempt {att}: {map_refresh_err}"
3211 | )
3212 | # Try proceeding without map refresh? Or break? Let's break to avoid confusing LLM.
3213 | break
3214 |
3215 | llm_id = await _llm_pick(pm, task_hint, att)
3216 | logger.debug(f"{log_prefix}: LLM pick result (Attempt {att}): ID='{llm_id}'")
3217 |
3218 | if not llm_id:
3219 | logger.debug(f"{log_prefix}: LLM pick attempt {att} returned no ID.")
3220 | if att < num_llm_attempts:
3221 | continue # Refresh happens at start of next loop iteration if needed
3222 | else:
3223 | break # Last LLM attempt failed, proceed to fallback
3224 |
3225 | # LLM returned an ID, try to validate it
3226 | try:
3227 | logger.debug(f"{log_prefix}: Validating LLM pick ID '{llm_id}' (Attempt {att})...")
3228 | loc = await _loc_from_id(self.page, llm_id)
3229 | try: # Log outerHTML for debugging LLM picks
3230 | loc_llm_outer_html = await loc.evaluate(
3231 | "element => element.outerHTML", timeout=500
3232 | )
3233 | logger.debug(
3234 | f"{log_prefix}: LLM picked element outerHTML: {loc_llm_outer_html[:200]}..."
3235 | )
3236 | except Exception as eval_err:
3237 | logger.debug(
3238 | f"{log_prefix}: Error getting outerHTML for LLM pick {llm_id}: {eval_err}"
3239 | )
3240 |
3241 | await loc.scroll_into_view_if_needed(timeout=2000)
3242 | elapsed_now_sec = time.monotonic() - start_time
3243 | remaining_timeout_ms = max(500, timeout - int(elapsed_now_sec * 1000))
3244 | if remaining_timeout_ms <= 0:
3245 | raise PlaywrightTimeoutError("Timeout before LLM validation wait.")
3246 | logger.debug(
3247 | f"{log_prefix}: Waiting for LLM element visibility ({remaining_timeout_ms}ms)..."
3248 | )
3249 | await loc.wait_for(state="visible", timeout=remaining_timeout_ms)
3250 | logger.info(f"{log_prefix}: LLM pick VALIDATED (ID: {llm_id}, Attempt {att}).")
3251 |
3252 | # Cache the successful LLM result
3253 | selector_str = f'[data-sb-id="{llm_id}"]'
3254 | await loop.run_in_executor(
3255 | pool, _cache_put_sync, cache_key, selector_str, current_dom_fp
3256 | )
3257 | await _log(
3258 | "locator_llm_pick", selector=llm_id, attempt=att, hint=task_hint, role=role
3259 | )
3260 | await _log(
3261 | "locator_success",
3262 | hint=task_hint,
3263 | role=role,
3264 | method="llm",
3265 | selector=llm_id,
3266 | attempt=att,
3267 | )
3268 | return loc
3269 | except (PlaywrightException, ValueError) as e_llm_val:
3270 | logger.debug(
3271 | f"{log_prefix}: LLM pick '{llm_id}' (attempt {att}) validation FAILED. Error: {e_llm_val}"
3272 | )
3273 | # Continue to next LLM attempt loop iteration (map refresh handled at loop start)
3274 |
3275 | # --- 5. LLM Failed: Try Fallback Selectors ---
3276 | logger.debug(f"{log_prefix}: Trying fallback selectors...")
3277 |
3278 | fallback_strategies = [
3279 | (
3280 | "placeholder",
3281 | f'[placeholder*="{task_hint}" i]',
3282 | ), # Case-insensitive placeholder contains hint
3283 | (
3284 | "aria-label",
3285 | f'[aria-label*="{task_hint}" i]',
3286 | ), # Case-insensitive aria-label contains hint
3287 | ("exact_text", f'text="{task_hint}"'), # Exact text match
3288 | (
3289 | "contains_text",
3290 |             f':text("{task_hint}")',
3291 | ), # Case-insensitive text contains hint (use cautiously)
3292 | ]
3293 |
3294 | for name, selector in fallback_strategies:
3295 | elapsed_sec_fb = time.monotonic() - start_time
3296 | remaining_timeout_ms_fb = max(500, timeout - int(elapsed_sec_fb * 1000))
3297 | if remaining_timeout_ms_fb <= 500 and elapsed_sec_fb >= timeout_sec: # Check both
3298 | logger.warning(
3299 | f"{log_prefix}: Timeout reached before trying fallback selector '{name}'."
3300 | )
3301 | break # Stop trying fallbacks if time is up
3302 |
3303 | logger.debug(
3304 | f"{log_prefix}: Trying fallback strategy '{name}' with selector: {selector}"
3305 | )
3306 | try:
3307 | loc = self.page.locator(selector).first
3308 | # Adjust scroll/wait timeout based on remaining time
3309 | scroll_timeout_fb = max(500, remaining_timeout_ms_fb // 3)
3310 | wait_timeout_fb = max(500, remaining_timeout_ms_fb // 2)
3311 |
3312 | await loc.scroll_into_view_if_needed(timeout=scroll_timeout_fb)
3313 | logger.debug(
3314 | f"{log_prefix}: Waiting for fallback '{name}' visibility ({wait_timeout_fb}ms)..."
3315 | )
3316 | await loc.wait_for(state="visible", timeout=wait_timeout_fb)
3317 |
3318 | # Fallback succeeded
3319 | logger.info(f"{log_prefix}: Locator found via fallback strategy '{name}'.")
3320 | await _log(
3321 | "locator_text_fallback",
3322 | selector=selector,
3323 | hint=task_hint,
3324 | role=role,
3325 | strategy=name,
3326 | )
3327 | await _log(
3328 | "locator_success",
3329 | hint=task_hint,
3330 | role=role,
3331 | method="fallback",
3332 | strategy=name,
3333 | selector=selector,
3334 | )
3335 | return loc
3336 | except PlaywrightTimeoutError:
3337 | logger.debug(
3338 | f"{log_prefix}: Fallback strategy '{name}' (selector: {selector}) failed (Timeout)."
3339 | )
3340 | except PlaywrightException as text_fallback_err:
3341 | logger.debug(
3342 | f"{log_prefix}: Fallback strategy '{name}' (selector: {selector}) failed (Playwright Error: {text_fallback_err})."
3343 | )
3344 | except Exception as fallback_unexpected:
3345 | logger.warning(
3346 | f"{log_prefix}: Unexpected error during fallback strategy '{name}': {fallback_unexpected}"
3347 | )
3348 |
3349 | # --- 6. All Methods Failed ---
3350 | final_elapsed_sec = time.monotonic() - start_time
3351 | log_hint = task_hint[:120]
3352 | log_duration = round(final_elapsed_sec, 1)
3353 | await _log("locator_fail_all", hint=log_hint, duration_s=log_duration, role=role)
3354 | logger.error(
3355 | f"{log_prefix}: FAILED to find element within {timeout_sec:.1f}s using all methods."
3356 | )
3357 | raise PlaywrightTimeoutError(
3358 | f"EnhancedLocator failed to find element for hint: '{task_hint}' within {timeout_sec:.1f}s using all methods (cache, heuristic, LLM, fallbacks)."
3359 | )
3360 |
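# A minimal usage sketch for the EnhancedLocator helper, assuming an already
# navigated Playwright page; the hint text below is an invented example.
async def _example_enhanced_locate(page: Page) -> None:
    locator_helper = EnhancedLocator(page)
    # locate() tries the selector cache, then heuristics, then the LLM picker, then
    # plain-text fallbacks, and raises PlaywrightTimeoutError if all of them fail.
    element = await locator_helper.locate(
        "search input at the top of the page", role="textbox", timeout=8000
    )
    await element.fill("playwright locators")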
3361 |
3362 | # --- Smart Actions (Helpers using EnhancedLocator) ---
3363 | async def _detect_web_obstacles(page: Page) -> Dict[str, Any]:
3364 | """Detect common web obstacles that might interfere with automation."""
3365 | obstacles = {
3366 | "captcha_detected": False,
3367 | "cookie_banner": False,
3368 | "cloudflare_challenge": False,
3369 | "login_required": False,
3370 | "details": []
3371 | }
3372 |
3373 | try:
3374 | # Comprehensive CAPTCHA detection
3375 | captcha_js = """() => {
3376 | const indicators = [];
3377 |
3378 | // Text-based detection
3379 | if (document.body.innerText.toLowerCase().includes('captcha') ||
3380 | document.body.innerText.toLowerCase().includes('recaptcha') ||
3381 |                 document.body.innerText.toLowerCase().includes("i'm not a robot")) {
3382 | indicators.push('captcha_text_found');
3383 | }
3384 |
3385 | // Element-based detection
3386 | if (document.querySelector('iframe[title*="captcha" i]') ||
3387 | document.querySelector('iframe[src*="captcha" i]') ||
3388 | document.querySelector('[id*="captcha" i]') ||
3389 | document.querySelector('[class*="captcha" i]') ||
3390 | document.querySelector('div[class*="recaptcha" i]') ||
3391 | document.querySelector('.g-recaptcha') ||
3392 | document.querySelector('#recaptcha')) {
3393 | indicators.push('captcha_element_found');
3394 | }
3395 |
3396 | // Cookie banner detection
3397 | if (document.querySelector('[class*="cookie" i]') ||
3398 | document.querySelector('[id*="cookie" i]') ||
3399 | document.body.innerText.toLowerCase().includes('accept cookies') ||
3400 | document.body.innerText.toLowerCase().includes('cookie policy')) {
3401 | indicators.push('cookie_banner_found');
3402 | }
3403 |
3404 | // Cloudflare detection
3405 | if (document.body.innerText.includes('Cloudflare') &&
3406 | (document.body.innerText.includes('checking') ||
3407 | document.body.innerText.includes('security'))) {
3408 | indicators.push('cloudflare_challenge');
3409 | }
3410 |
3411 | // Login detection
3412 | if (document.querySelector('input[type="password"]') &&
3413 | (document.body.innerText.toLowerCase().includes('sign in') ||
3414 | document.body.innerText.toLowerCase().includes('log in') ||
3415 | document.body.innerText.toLowerCase().includes('login'))) {
3416 | indicators.push('login_required');
3417 | }
3418 |
3419 | return indicators;
3420 | }"""
3421 |
3422 | detected_indicators = await page.evaluate(captcha_js)
3423 |
3424 | # Process results
3425 | for indicator in detected_indicators:
3426 | if 'captcha' in indicator:
3427 | obstacles["captcha_detected"] = True
3428 | obstacles["details"].append(f"CAPTCHA detected: {indicator}")
3429 | elif 'cookie' in indicator:
3430 | obstacles["cookie_banner"] = True
3431 | obstacles["details"].append(f"Cookie banner detected: {indicator}")
3432 | elif 'cloudflare' in indicator:
3433 | obstacles["cloudflare_challenge"] = True
3434 | obstacles["details"].append(f"Cloudflare challenge detected: {indicator}")
3435 | elif 'login' in indicator:
3436 | obstacles["login_required"] = True
3437 | obstacles["details"].append(f"Login requirement detected: {indicator}")
3438 |
3439 | return obstacles
3440 |
3441 | except Exception as e:
3442 | logger.warning(f"Error detecting web obstacles: {e}")
3443 | return obstacles
3444 |
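# The shape of the dictionary returned by _detect_web_obstacles, shown with invented
# values: callers branch on the boolean flags and log the human-readable "details".
_EXAMPLE_OBSTACLES_RESULT: Dict[str, Any] = {
    "captcha_detected": False,
    "cookie_banner": True,
    "cloudflare_challenge": False,
    "login_required": False,
    "details": ["Cookie banner detected: cookie_banner_found"],
}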
3445 |
3446 | @resilient(max_attempts=3, backoff=0.5)
3447 | async def smart_click(
3448 | page: Page, task_hint: str, *, target_kwargs: Optional[Dict] = None, timeout_ms: int = 5000
3449 | ) -> bool: # Uses global _log, _get_pool, _cache_put_sync
3450 | """Locates an element using a hint and clicks it."""
3451 | # Validate or generate task_hint
3452 | effective_task_hint = task_hint
3453 | if not task_hint or not task_hint.strip():
3454 | if target_kwargs:
3455 | name = target_kwargs.get("name", "")
3456 | role = target_kwargs.get("role", "")
3457 | if name or role:
3458 | role_part = role or "element"
3459 | name_part = f" named '{name}'" if name else ""
3460 | effective_task_hint = f"Click the {role_part}{name_part}"
3461 | logger.warning(f"smart_click missing hint, generated: '{effective_task_hint}'")
3462 | else:
3463 | # Neither name nor role provided in target_kwargs
3464 | raise ToolInputError(
3465 | "smart_click requires a non-empty 'task_hint' or a 'target' dictionary with 'name' or 'role'."
3466 | )
3467 | else:
3468 | # No target_kwargs provided either
3469 | raise ToolInputError("smart_click requires a non-empty 'task_hint'.")
3470 |
3471 | # First, detect web obstacles that might interfere with automation
3472 | try:
3473 | obstacles = await _detect_web_obstacles(page)
3474 |
3475 | # Handle CAPTCHA detection - fail early if trying to click CAPTCHA
3476 | if obstacles["captcha_detected"] and ("captcha" in effective_task_hint.lower() or "recaptcha" in effective_task_hint.lower()):
3477 | logger.error(f"Cannot click CAPTCHA element: '{effective_task_hint}'. CAPTCHAs are designed to prevent automation.")
3478 | raise ToolError(
3479 | f"CAPTCHA interaction blocked for task: '{effective_task_hint}'. "
3480 | "Manual intervention required. CAPTCHAs cannot be automatically solved."
3481 | )
3482 |
3483 | # Log any obstacles detected for diagnostic purposes
3484 | if any([obstacles["captcha_detected"], obstacles["cookie_banner"], obstacles["cloudflare_challenge"], obstacles["login_required"]]):
3485 | await _log("smart_click_obstacles_detected", task_hint=effective_task_hint, obstacles=obstacles)
3486 | logger.info(f"Web obstacles detected before click attempt: {obstacles['details']}")
3487 |
3488 | # Try to handle cookie banners automatically
3489 | if obstacles["cookie_banner"]:
3490 | logger.info("Attempting to dismiss cookie banner before main click action...")
3491 | cookie_selectors = [
3492 | 'button:has-text("Accept")', 'button:has-text("Accept All")',
3493 | 'button:has-text("OK")', 'button:has-text("Allow")',
3494 | '[id*="accept" i]', '[class*="accept" i]'
3495 | ]
3496 | for selector in cookie_selectors:
3497 | try:
3498 | cookie_btn = page.locator(selector).first
3499 | await cookie_btn.click(timeout=2000)
3500 | logger.info(f"Successfully dismissed cookie banner using: {selector}")
3501 | await asyncio.sleep(0.5) # Brief pause after dismissal
3502 | break
3503 | except Exception:
3504 | continue
3505 |
3506 | # Give Cloudflare challenges a moment to complete
3507 | if obstacles["cloudflare_challenge"]:
3508 | logger.info("Cloudflare challenge detected, waiting briefly...")
3509 | await asyncio.sleep(3)
3510 |
3511 | except Exception as obstacle_err:
3512 | logger.warning(f"Error during obstacle detection: {obstacle_err}. Proceeding with click attempt.")
3513 |
3514 | loc_helper = EnhancedLocator(page)
3515 | # Prepare log details, prioritizing target_kwargs if available
3516 | log_target = {}
3517 | if target_kwargs:
3518 | log_target.update(target_kwargs)
3519 | else:
3520 | log_target["hint"] = effective_task_hint # Log the hint used
3521 |
3522 | try:
3523 | # Locate the element using the enhanced locator
3524 | element = await loc_helper.locate(task_hint=effective_task_hint, timeout=timeout_ms)
3525 | element_id_for_cache = await element.get_attribute("data-sb-id")
3526 |
3527 | # Prepare and execute the click
3528 | await element.scroll_into_view_if_needed(timeout=3000) # Scroll with timeout
3529 | await _pause(page) # Add jitter before click
3530 | click_timeout = max(1000, timeout_ms // 2) # Use portion of overall timeout
3531 | await element.click(timeout=click_timeout)
3532 |
3533 | # Update cache if successful and ID was retrieved
3534 | if element_id_for_cache:
3535 | fp = await _dom_fingerprint(
3536 | page
3537 | ) # Get current fingerprint after click potentially changed DOM
3538 | # Generate cache key again (could be helper function)
3539 | page_url_after_click = page.url or ""
3540 | parsed_url_after_click = urlparse(page_url_after_click)
3541 | path_after_click = parsed_url_after_click.path or "/"
3542 | key_data_after_click = {
3543 | "site": loc_helper.site,
3544 | "path": path_after_click, # Use path *after* click
3545 |                 "hint": unicodedata.normalize("NFC", effective_task_hint).lower().strip(),
3546 |                 "role": None}  # Mirror EnhancedLocator.locate's cache-key fields so this entry can be re-read
3547 | key_src_after_click = json.dumps(key_data_after_click, sort_keys=True)
3548 | cache_key_after_click = hashlib.sha256(key_src_after_click.encode()).hexdigest()
3549 | selector_str = f'[data-sb-id="{element_id_for_cache}"]'
3550 | loop_after_click = asyncio.get_running_loop()
3551 | pool_after_click = _get_pool()
3552 | await loop_after_click.run_in_executor(
3553 | pool_after_click, _cache_put_sync, cache_key_after_click, selector_str, fp
3554 | )
3555 |
3556 | # Log success
3557 | await _log("click_success", target=log_target)
3558 | return True
3559 |
3560 | except PlaywrightTimeoutError as e:
3561 | # Element not found or visible within timeout
3562 | await _log("click_fail_notfound", target=log_target, error=str(e))
3563 | raise ToolError(
3564 | f"Click failed: Element not found/visible for hint '{effective_task_hint}'. {e}",
3565 | details=log_target,
3566 | ) from e
3567 | except PlaywrightException as e:
3568 | # Other Playwright errors during click/scroll/locate
3569 | await _log("click_fail_playwright", target=log_target, error=str(e))
3570 | raise ToolError(f"Click failed due to Playwright error: {e}", details=log_target) from e
3571 | except Exception as e:
3572 | # Unexpected errors
3573 | await _log("click_fail_unexpected", target=log_target, error=str(e))
3574 | raise ToolError(f"Unexpected error during click: {e}", details=log_target) from e
3575 |
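# A minimal smart_click sketch on an already open page. The hint is an invented
# example; a ToolError is raised if no matching element becomes clickable in time.
async def _example_smart_click(page: Page) -> None:
    await smart_click(page, "the 'Add to cart' button", timeout_ms=8000)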
3576 |
3577 | @resilient(max_attempts=3, backoff=0.5)
3578 | async def smart_type(
3579 | page: Page,
3580 | task_hint: str,
3581 | text: str,
3582 | *,
3583 | press_enter: bool = False,
3584 | clear_before: bool = True,
3585 | target_kwargs: Optional[Dict] = None,
3586 | timeout_ms: int = 5000,
3587 | ) -> bool: # Uses global _log, get_secret, _get_pool, _cache_put_sync
3588 | """Locates an element using a hint and types text into it."""
3589 | # Validate or generate task_hint
3590 | effective_task_hint = task_hint
3591 | if not task_hint or not task_hint.strip():
3592 | if target_kwargs:
3593 | name = target_kwargs.get("name", "")
3594 | role = target_kwargs.get("role", "input") # Default role to input for type
3595 | if name or role:
3596 | role_part = role or "element"
3597 | name_part = f" named '{name}'" if name else ""
3598 | effective_task_hint = f"Type into the {role_part}{name_part}"
3599 | logger.warning(f"smart_type missing hint, generated: '{effective_task_hint}'")
3600 | else:
3601 | raise ToolInputError(
3602 | "smart_type requires a non-empty 'task_hint' or a 'target' dictionary with 'name' or 'role'."
3603 | )
3604 | else:
3605 | raise ToolInputError("smart_type requires a non-empty 'task_hint'.")
3606 |
3607 | loc_helper = EnhancedLocator(page)
3608 | # Prepare log details
3609 | log_target = {}
3610 | if target_kwargs:
3611 | log_target.update(target_kwargs)
3612 | else:
3613 | log_target["hint"] = effective_task_hint
3614 |
3615 | resolved_text = text
3616 | log_value = "***SECRET***" # Default log value for secrets
3617 | # Resolve secrets if needed
3618 | if text.startswith("secret:"):
3619 | secret_path = text[len("secret:") :]
3620 | try:
3621 | resolved_text = get_secret(secret_path)
3622 | # Keep log_value as "***SECRET***"
3623 | except (KeyError, ValueError, RuntimeError) as e:
3624 | await _log("type_fail_secret", target=log_target, secret_ref=secret_path, error=str(e))
3625 | raise ToolInputError(f"Failed to resolve secret '{secret_path}': {e}") from e
3626 | else:
3627 | # Create safe log value for non-secrets (truncate if long)
3628 | if len(text) > 23:
3629 | log_value = text[:20] + "..."
3630 | else:
3631 | log_value = text
3632 |
3633 | try:
3634 | # Locate the element
3635 | element = await loc_helper.locate(task_hint=effective_task_hint, timeout=timeout_ms)
3636 | element_id_for_cache = await element.get_attribute("data-sb-id")
3637 |
3638 | # Prepare and perform the typing action
3639 | await element.scroll_into_view_if_needed(timeout=3000)
3640 | await _pause(page) # Jitter before interaction
3641 |
3642 | if clear_before:
3643 | await element.fill("") # Clear the field first
3644 |
3645 | # Type the resolved text with human-like delay
3646 | type_delay = random.uniform(30, 80)
3647 | await element.type(resolved_text, delay=type_delay)
3648 |
3649 | # Optionally press Enter
3650 | if press_enter:
3651 | await _pause(page, (50, 150)) # Short pause before Enter
3652 | try:
3653 | # Try pressing Enter directly
3654 | await element.press(
3655 |                         "Enter", timeout=1000, no_wait_after=True
3656 | ) # Don't wait for navigation here
3657 | except PlaywrightException as e:
3658 | # Fallback: If Enter press fails (e.g., on non-input), try clicking the element again
3659 | # This might trigger submission if it's also a button or linked element.
3660 | logger.warning(
3661 | f"Enter key press failed for hint '{effective_task_hint}', trying smart_click fallback: {e}"
3662 | )
3663 | try:
3664 | await smart_click(
3665 | page, task_hint=effective_task_hint, target_kwargs=target_kwargs
3666 | )
3667 | except Exception as click_e:
3668 | logger.warning(
3669 | f"Fallback smart_click after failed Enter press also failed: {click_e}"
3670 | )
3671 | # Decide if this should re-raise or just log. Logging for now.
3672 |
3673 | # Update cache if successful
3674 | if element_id_for_cache:
3675 | fp = await _dom_fingerprint(page)
3676 | page_url_after_type = page.url or ""
3677 | parsed_url_after_type = urlparse(page_url_after_type)
3678 | path_after_type = parsed_url_after_type.path or "/"
3679 | key_data_after_type = {
3680 | "site": loc_helper.site,
3681 | "path": path_after_type,
3682 |                 "hint": unicodedata.normalize("NFC", effective_task_hint).lower().strip(),
3683 |                 "role": None}  # Mirror EnhancedLocator.locate's cache-key fields so this entry can be re-read
3684 | key_src_after_type = json.dumps(key_data_after_type, sort_keys=True)
3685 | cache_key_after_type = hashlib.sha256(key_src_after_type.encode()).hexdigest()
3686 | selector_str = f'[data-sb-id="{element_id_for_cache}"]'
3687 | loop_after_type = asyncio.get_running_loop()
3688 | pool_after_type = _get_pool()
3689 | await loop_after_type.run_in_executor(
3690 | pool_after_type, _cache_put_sync, cache_key_after_type, selector_str, fp
3691 | )
3692 |
3693 | # Log success
3694 | await _log("type_success", target=log_target, value=log_value, entered=press_enter)
3695 | return True
3696 |
3697 | except PlaywrightTimeoutError as e:
3698 | # Element not found or visible
3699 | await _log("type_fail_notfound", target=log_target, value=log_value, error=str(e))
3700 | raise ToolError(
3701 | f"Type failed: Element not found/visible for hint '{effective_task_hint}'. {e}",
3702 | details=log_target,
3703 | ) from e
3704 | except PlaywrightException as e:
3705 | # Other Playwright errors
3706 | await _log("type_fail_playwright", target=log_target, value=log_value, error=str(e))
3707 | raise ToolError(f"Type failed due to Playwright error: {e}", details=log_target) from e
3708 | except Exception as e:
3709 | # Unexpected errors
3710 | await _log("type_fail_unexpected", target=log_target, value=log_value, error=str(e))
3711 | raise ToolError(f"Unexpected error during type: {e}", details=log_target) from e
3712 |
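# A minimal smart_type sketch: one literal value and one secret-resolved value. The
# "secret:" prefix hands the rest of the string to get_secret(); the field hints and
# the secret path shown here are invented examples.
async def _example_smart_type(page: Page) -> None:
    await smart_type(page, "username field", "alice@example.com")
    await smart_type(page, "password field", "secret:example_site/password", press_enter=True)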
3713 |
3714 | # --- LATE IMPORT TO BREAK CYCLE ---
3715 | # Import the decorators here, just before they are needed for the tool functions.
3716 | # This assumes the rest of the module has been initialized by the time Python reaches here.
3717 | try:
3718 | from ultimate_mcp_server.tools.base import with_error_handling, with_tool_metrics
3719 | except ImportError as e:
3720 | # This indicates the cycle might still exist or base failed to load for other reasons
3721 | logger.critical(f"CRITICAL: Failed to late-import base decorators needed for Smart Browser tools: {e}")
3722 | raise
3723 |
3724 |
3725 | @with_tool_metrics
3726 | @with_error_handling
3727 | async def browse(
3728 | url: str, wait_for_selector: Optional[str] = None, wait_for_navigation: bool = True
3729 | ) -> Dict[str, Any]:
3730 | """
3731 | Navigates to a URL using a dedicated browser tab, waits for load state
3732 | (and optionally a selector), then extracts and returns the page state.
3733 |
3734 | Args:
3735 | url: The URL to navigate to (scheme will be added if missing).
3736 | wait_for_selector: Optional CSS selector to wait for after navigation.
3737 | wait_for_navigation: Whether to wait for 'networkidle' (True) or
3738 | 'domcontentloaded' (False).
3739 |
3740 | Returns:
3741 | A dictionary containing success status and the final page state.
3742 | """
3743 | await _ensure_initialized()
3744 | _update_activity()
3745 |
3746 | # --- Input Validation ---
3747 | if not isinstance(url, str) or not url.strip():
3748 | raise ToolInputError("URL cannot be empty.")
3749 | # Add scheme if missing
3750 | if not url.startswith(("http://", "https://")):
3751 | url = "https://" + url
3752 | logger.debug(f"Prepended 'https://' to URL: {url}")
3753 |
3754 | # --- Proxy Check ---
3755 | proxy_cfg = _get_proxy_config()
3756 | if proxy_cfg and _PROXY_ALLOWED_DOMAINS_LIST is not None:
3757 | if not _is_domain_allowed_for_proxy(url):
3758 | proxy_server = proxy_cfg.get("server", "Configured Proxy")
3759 | error_msg = f"Navigation blocked by proxy domain rules for '{url}' via {proxy_server}."
3760 | await _log("browse_fail_proxy_disallowed", url=url, proxy=proxy_server)
3761 | raise ToolError(error_msg, error_code="proxy_domain_disallowed")
3762 |
3763 | # --- Execution ---
3764 | ctx, _ = await get_browser_context() # Get shared context
3765 | async with _tab_context(ctx) as page: # Use temp page from shared context
3766 | await _log("navigate_start", url=url)
3767 | try:
3768 | # Determine wait state based on argument
3769 | wait_until_state = "networkidle" if wait_for_navigation else "domcontentloaded"
3770 | nav_timeout = 60000 # 60 seconds
3771 | await page.goto(url, wait_until=wait_until_state, timeout=nav_timeout)
3772 |
3773 | # Optionally wait for a specific selector
3774 | if wait_for_selector:
3775 | selector_timeout = 15000 # 15 seconds
3776 | try:
3777 | await page.wait_for_selector(
3778 | wait_for_selector, state="visible", timeout=selector_timeout
3779 | )
3780 | await _log("navigate_wait_selector_ok", url=url, selector=wait_for_selector)
3781 | except PlaywrightTimeoutError:
3782 | # Log timeout but proceed, might still be usable
3783 | logger.warning(
3784 | f"Timeout waiting for selector '{wait_for_selector}' at {url} after navigation."
3785 | )
3786 | await _log(
3787 | "navigate_wait_selector_timeout", url=url, selector=wait_for_selector
3788 | )
3789 |
3790 | # Pause and get final state
3791 | await _pause(page, (50, 200))
3792 | state = await get_page_state(page) # Use helper to get structured state
3793 | await _log("navigate_success", url=url, title=state.get("title"))
3794 |
3795 | # Return success and page state
3796 | return {"success": True, "page_state": state}
3797 |
3798 | except PlaywrightException as e:
3799 | # Handle Playwright-specific navigation errors
3800 | await _log("navigate_fail_playwright", url=url, error=str(e))
3801 | # Decorator will wrap this in ToolError
3802 | raise ToolError(f"Navigation failed for {url}: {e}") from e
3803 | except Exception as e:
3804 | # Handle unexpected errors during navigation/state extraction
3805 | await _log("navigate_fail_unexpected", url=url, error=str(e))
3806 | # Decorator will wrap this in ToolError
3807 | raise ToolError(f"Unexpected error browsing {url}: {e}") from e
3808 |
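# A minimal sketch of calling the browse tool directly. The URL and selector are
# invented examples; on success the result is {"success": True, "page_state": {...}}.
async def _example_browse_tool() -> Dict[str, Any]:
    return await browse("example.com/docs", wait_for_selector="main")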
3809 |
3810 | @with_tool_metrics
3811 | @with_error_handling
3812 | async def click(
3813 | url: str,
3814 | target: Optional[Dict[str, Any]] = None,
3815 | task_hint: Optional[str] = None,
3816 | wait_ms: int = 1000,
3817 | ) -> Dict[str, Any]:
3818 | """
3819 | Navigates to a URL, clicks an element identified by task_hint or target,
3820 | waits, and returns the resulting page state.
3821 |
3822 | Args:
3823 | url: The URL to navigate to first.
3824 | target: Optional dictionary (like Plan-Step target) used to generate hint if task_hint missing.
3825 | task_hint: Natural language description of the element to click.
3826 | wait_ms: Milliseconds to wait after the click action completes.
3827 |
3828 | Returns:
3829 | A dictionary containing success status and the final page state after the click.
3830 | """
3831 | await _ensure_initialized()
3832 | _update_activity()
3833 |
3834 | # --- Input Validation: Determine task_hint ---
3835 | effective_task_hint = task_hint
3836 | if not effective_task_hint:
3837 | if target and (target.get("name") or target.get("role")):
3838 | name = target.get("name", "")
3839 | role = target.get("role", "")
3840 | role_part = role or "element"
3841 | name_part = f" named '{name}'" if name else ""
3842 | effective_task_hint = f"Click the {role_part}{name_part}"
3843 | logger.debug(f"click tool generated task_hint: '{effective_task_hint}'")
3844 | else:
3845 | raise ToolInputError(
3846 | "click tool requires 'task_hint', or 'target' dict with 'name' or 'role'."
3847 | )
3848 |
3849 | # --- Execution ---
3850 | ctx, _ = await get_browser_context()
3851 | async with _tab_context(ctx) as page:
3852 | await _log("click_extract_navigate", url=url, hint=effective_task_hint)
3853 | # Navigate to the page
3854 | try:
3855 | nav_timeout = 60000
3856 | await page.goto(url, wait_until="networkidle", timeout=nav_timeout)
3857 | except PlaywrightException as e:
3858 | raise ToolError(f"Navigation to '{url}' failed before click attempt: {e}") from e
3859 |
3860 | # Perform the click using the smart helper
3861 | # smart_click handles EnhancedLocator, interaction, logging, and errors
3862 | await smart_click(
3863 | page,
3864 | task_hint=effective_task_hint,
3865 | target_kwargs=target, # Pass target for logging inside smart_click
3866 | timeout_ms=10000, # Timeout for locating the element
3867 | )
3868 |
3869 | # Wait after click if specified
3870 | if wait_ms > 0:
3871 | await page.wait_for_timeout(wait_ms)
3872 |
3873 | # Wait for network to potentially settle after click (best effort)
3874 | try:
3875 | idle_timeout = 10000
3876 | await page.wait_for_load_state("networkidle", timeout=idle_timeout)
3877 | except PlaywrightTimeoutError:
3878 | logger.debug("Network idle wait timeout after click action.")
3879 |
3880 | # Pause and get final state
3881 | await _pause(page, (50, 200))
3882 | final_state = await get_page_state(page)
3883 | await _log("click_extract_success", url=page.url, hint=effective_task_hint)
3884 |
3885 | # Return success and the state after the click
3886 | return {"success": True, "page_state": final_state}
3887 |
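# A minimal sketch of the click tool: identify the target either with a natural
# language task_hint (as here) or with a target dict carrying "name"/"role". The URL
# and hint are invented examples.
async def _example_click_tool() -> Dict[str, Any]:
    return await click("https://example.com/login", task_hint="the 'Sign in' button", wait_ms=500)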
3888 |
3889 | @with_tool_metrics
3890 | @with_error_handling
3891 | async def type_text(
3892 | url: str,
3893 | fields: List[Dict[str, Any]],
3894 | submit_hint: Optional[str] = None,
3895 | submit_target: Optional[Dict[str, Any]] = None,
3896 | wait_after_submit_ms: int = 2000,
3897 | ) -> Dict[str, Any]:
3898 | """
3899 | Navigates to a URL, fills specified form fields using task hints,
3900 | optionally clicks a submit element, waits, and returns the final page state.
3901 |
3902 | Args:
3903 | url: The URL containing the form.
3904 | fields: A list of dictionaries, each specifying a field to type into.
3905 | Required keys per dict: 'task_hint' (or 'target') and 'text'.
3906 | Optional keys: 'enter' (bool), 'clear_before' (bool).
3907 | submit_hint: Optional natural language description of the submit element.
3908 | submit_target: Optional target dictionary for the submit element.
3909 | wait_after_submit_ms: Milliseconds to wait after submission.
3910 |
3911 | Returns:
3912 | A dictionary containing success status and the final page state.
3913 | """
3914 | await _ensure_initialized()
3915 | _update_activity()
3916 |
3917 | # --- Input Validation ---
3918 | if not fields or not isinstance(fields, list):
3919 | raise ToolInputError("'fields' must be a non-empty list of dictionaries.")
3920 | if submit_hint and submit_target:
3921 | logger.warning("Both submit_hint and submit_target provided; submit_hint will be used.")
3922 | elif not submit_hint and not submit_target:
3923 | logger.debug("No submit_hint or submit_target provided; form will not be submitted.")
3924 |
3925 | # --- Execution ---
3926 | ctx, _ = await get_browser_context()
3927 | async with _tab_context(ctx) as page:
3928 | await _log("fill_form_navigate", url=url)
3929 | # Navigate to the form page
3930 | try:
3931 | nav_timeout = 60000
3932 | await page.goto(url, wait_until="networkidle", timeout=nav_timeout)
3933 | except PlaywrightException as e:
3934 | raise ToolError(f"Navigation to '{url}' failed before filling form: {e}") from e
3935 |
3936 | # Wait briefly for form elements to likely appear (best effort)
3937 | try:
3938 | form_wait_timeout = 5000
3939 | await page.wait_for_selector(
3940 | "form, input, textarea, select", state="visible", timeout=form_wait_timeout
3941 | )
3942 | logger.debug("Form elements found, proceeding with field filling.")
3943 | except PlaywrightTimeoutError:
3944 | logger.warning("Did not quickly find typical form elements. Proceeding anyway.")
3945 |
3946 | # Loop through fields and type text
3947 | for i, field in enumerate(fields):
3948 | if not isinstance(field, dict):
3949 | raise ToolInputError(f"Item at index {i} in 'fields' is not a dictionary.")
3950 |
3951 | # Determine hint for the field
3952 | field_hint = field.get("task_hint")
3953 | field_target = field.get("target")
3954 | if not field_hint:
3955 | if field_target and (field_target.get("name") or field_target.get("role")):
3956 | name = field_target.get("name", "")
3957 | role = field_target.get("role", "input")
3958 | field_hint = (
3959 | f"{role or 'Input field'} '{name}'" if name else f"{role or 'Input field'}"
3960 | )
3961 | else:
3962 | raise ToolInputError(
3963 | f"Field at index {i} requires 'task_hint' or 'target' with name/role."
3964 | )
3965 |
3966 | # Get text to type
3967 | text_to_type = field.get("text")
3968 | if text_to_type is None: # Allow empty string, but not None
3969 | raise ToolInputError(
3970 | f"Field at index {i} ('{field_hint}') missing required 'text'."
3971 | )
3972 |
3973 | # Log the action for this field
3974 | await _log("fill_form_field", index=i, hint=field_hint)
3975 |
3976 | # Use smart_type helper for the actual typing
3977 | await smart_type(
3978 | page,
3979 | task_hint=field_hint,
3980 | text=text_to_type,
3981 | press_enter=field.get("enter", False),
3982 | clear_before=field.get("clear_before", True),
3983 | target_kwargs=field_target, # Pass target for logging inside smart_type
3984 | timeout_ms=5000,
3985 | )
3986 | await _pause(page, (50, 150)) # Short pause between fields
3987 |
3988 | # Handle optional submission
3989 | final_submit_hint = submit_hint
3990 | if not final_submit_hint and submit_target: # Generate hint from target if needed
3991 | if submit_target.get("name") or submit_target.get("role"):
3992 | name = submit_target.get("name", "")
3993 | role = submit_target.get("role", "button")
3994 | final_submit_hint = f"Submit {role or 'button'}" + (f" '{name}'" if name else "")
3995 | else:
3996 | logger.warning(
3997 | "submit_target provided but lacks 'name' or 'role'; cannot generate hint. Skipping submit."
3998 | )
3999 | final_submit_hint = None # Ensure submit doesn't happen
4000 |
4001 | if final_submit_hint:
4002 | await _log("fill_form_submit", hint=final_submit_hint)
4003 | # Use smart_click helper for submission
4004 | await smart_click(
4005 | page,
4006 | task_hint=final_submit_hint,
4007 | target_kwargs=submit_target,
4008 | timeout_ms=10000,
4009 | )
4010 | # Wait after submission
4011 | try:
4012 | submit_idle_timeout = 15000
4013 | await page.wait_for_load_state("networkidle", timeout=submit_idle_timeout)
4014 | except PlaywrightTimeoutError:
4015 | logger.debug("Network idle wait timeout after form submission.")
4016 | if wait_after_submit_ms > 0:
4017 | await page.wait_for_timeout(wait_after_submit_ms)
4018 |
4019 | # Get final page state
4020 | await _pause(page, (100, 300))
4021 | final_state = await get_page_state(page)
4022 | await _log(
4023 | "fill_form_success",
4024 | url=page.url,
4025 | num_fields=len(fields),
4026 | submitted=bool(final_submit_hint),
4027 | )
4028 |
4029 | return {"success": True, "page_state": final_state}
4030 |
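# Usage sketch (illustrative only): the shape of the `fields` payload consumed by the loop
# above. Key names (`task_hint`, `target`, `text`, `enter`, `clear_before`) mirror the lookups
# performed in the code; the concrete values, and the exact name/signature of the enclosing
# tool (shown here as `fill_form`), are assumptions for illustration.
#
#   fields = [
#       {"task_hint": "Username input", "text": "jane.doe@example.com"},
#       {"target": {"name": "password", "role": "textbox"}, "text": "s3cret!", "clear_before": True},
#       {"task_hint": "Search box", "text": "annual report", "enter": True},
#   ]
#   result = await fill_form(
#       url="https://example.com/login", fields=fields, submit_hint="Sign in button"
#   )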
4031 |
4032 | @with_tool_metrics
4033 | @with_error_handling
4034 | async def parallel(
4035 | urls: List[str], action: str = "get_state", max_tabs: Optional[int] = None
4036 | ) -> Dict[str, Any]:
4037 | """
4038 | Processes multiple URLs in parallel using isolated browser tabs via TabPool.
4039 | Currently only supports the 'get_state' action for each URL.
4040 |
4041 | Args:
4042 | urls: A list of URLs to process.
4043 | action: The action to perform on each URL (currently only 'get_state').
4044 | max_tabs: Optional override for the maximum number of concurrent tabs.
4045 | If None, uses the globally configured limit.
4046 |
4047 | Returns:
4048 | A dictionary containing success status, a list of results for each URL,
4049 | and counts of processed and successful URLs.
4050 | """
4051 | await _ensure_initialized()
4052 | _update_activity()
4053 |
4054 | # --- Input Validation ---
4055 | if not urls or not isinstance(urls, list):
4056 | raise ToolInputError("'urls' must be a non-empty list.")
4057 | if not all(isinstance(u, str) and u.strip() for u in urls):
4058 | raise ToolInputError("All items in 'urls' list must be non-empty strings.")
4059 | if action != "get_state":
4060 | raise ToolInputError(
4061 | f"Unsupported action '{action}'. Currently only 'get_state' is allowed."
4062 | )
4063 |     # Reject overrides that are not positive integers; None means "use the global limit".
4064 |     if max_tabs is not None and (
4065 |         not isinstance(max_tabs, int)
4066 |         or max_tabs <= 0
4067 |     ):
4068 |         raise ToolInputError("'max_tabs' override must be a positive integer if provided.")
4069 |
4070 | # --- Setup Tab Pool ---
4071 | # Use global pool unless max_tabs override is provided
4072 | pool_to_use = tab_pool
4073 | if max_tabs is not None:
4074 | logger.info(f"Using temporary TabPool with max_tabs override: {max_tabs}")
4075 | pool_to_use = TabPool(max_tabs=max_tabs)
4076 |
4077 | # --- Define Per-URL Processing Function ---
4078 | # This function runs inside the TabPool's managed page context
4079 | async def process_url_action(page: Page, *, url_to_process: str) -> Dict[str, Any]:
4080 | # Ensure URL has scheme
4081 | full_url = (
4082 | url_to_process
4083 | if url_to_process.startswith(("http://", "https://"))
4084 | else f"https://{url_to_process}"
4085 | )
4086 | result = {"url": url_to_process, "success": False} # Default result structure
4087 |
4088 | try:
4089 | await _log("parallel_navigate", url=full_url, action=action)
4090 | # Navigate to the URL
4091 | nav_timeout = 45000 # Shorter timeout for parallel tasks
4092 | await page.goto(full_url, wait_until="networkidle", timeout=nav_timeout)
4093 |
4094 | # Perform the specified action
4095 | if action == "get_state":
4096 | page_state = await get_page_state(page)
4097 | result["success"] = True
4098 | result["page_state"] = page_state
4099 | # Add other actions here if needed in the future
4100 | # elif action == "some_other_action":
4101 | # # ... perform other action ...
4102 | # result["success"] = True
4103 | # result["details"] = ...
4104 |
4105 | return result
4106 |
4107 | except PlaywrightException as e:
4108 | error_msg = f"Playwright error processing {full_url}: {e}"
4109 | logger.warning(error_msg)
4110 | await _log("parallel_url_error", url=full_url, action=action, error=str(e))
4111 | result["error"] = error_msg
4112 | return result
4113 | except Exception as e:
4114 | error_msg = f"Unexpected error processing {full_url}: {e}"
4115 | logger.error(error_msg, exc_info=True) # Log traceback for unexpected
4116 | await _log("parallel_url_error", url=full_url, action=action, error=str(e))
4117 | result["error"] = error_msg
4118 | return result
4119 |
4120 | # --- Create Tasks for TabPool ---
4121 | # Use functools.partial to pass the specific URL to each task instance
4122 | tasks_to_run = []
4123 | for u in urls:
4124 | # Create a partial function that captures the url_to_process kwarg
4125 | task_func = functools.partial(process_url_action, url_to_process=u)
4126 | tasks_to_run.append(task_func)
4127 |
4128 | # --- Run Tasks Concurrently using TabPool ---
4129 | logger.info(f"Starting parallel processing of {len(urls)} URLs with action '{action}'...")
4130 | # pool.map handles concurrency, semaphore, context/page creation/cleanup
4131 | results = await pool_to_use.map(tasks_to_run)
4132 | logger.info("Parallel processing complete.")
4133 |
4134 | # --- Process Results ---
4135 | successful_count = sum(1 for r in results if isinstance(r, dict) and r.get("success"))
4136 | processed_count = len(results)
4137 | await _log(
4138 | "parallel_process_complete",
4139 | total=len(urls),
4140 | processed=processed_count,
4141 | successful=successful_count,
4142 | action=action,
4143 | )
4144 |
4145 | # --- Return Final Summary ---
4146 | return {
4147 | "success": True, # Indicates the overall parallel orchestration completed
4148 | "results": results, # List containing result dict for each URL
4149 | "processed_count": processed_count,
4150 | "successful_count": successful_count,
4151 | }
4152 |
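# Usage sketch (illustrative only): a minimal call to `parallel` from an async context,
# assuming the browser has been initialized and the placeholder URLs are reachable.
#
#   summary = await parallel(
#       urls=["https://example.com", "https://example.org/docs"],
#       action="get_state",
#       max_tabs=4,
#   )
#   for item in summary["results"]:
#       status = "ok" if item.get("success") else item.get("error", "failed")
#       print(item["url"], "->", status)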
4153 |
4154 | # --- Download Helpers ---
4155 | async def _run_in_thread(func, *args): # Keep as is
4156 | """Runs a synchronous function in the thread pool."""
4157 | loop = asyncio.get_running_loop()
4158 | pool = _get_pool()
4159 | try:
4160 | result = await loop.run_in_executor(pool, func, *args)
4161 | return result
4162 | except RuntimeError as e:
4163 | if "cannot schedule new futures after shutdown" in str(e):
4164 | logger.warning("Thread pool is shutdown. Creating a temporary pool for operation.")
4165 | with concurrent.futures.ThreadPoolExecutor(max_workers=1) as temp_pool:
4166 | result = await loop.run_in_executor(temp_pool, func, *args)
4167 | return result
4168 | else:
4169 | raise
4170 |
4171 |
4172 | async def _compute_hash_async(data: bytes) -> str: # Keep as is
4173 | """Computes SHA256 hash of bytes data asynchronously in a thread."""
4174 |
4175 | # Define the synchronous hashing function locally
4176 | def sync_hash(d):
4177 | hasher = hashlib.sha256()
4178 | hasher.update(d)
4179 | return hasher.hexdigest()
4180 |
4181 | # Run the sync function in the thread pool
4182 | hex_digest = await _run_in_thread(sync_hash, data)
4183 | return hex_digest
4184 |
4185 |
4186 | async def _read_file_async(path: Path) -> bytes: # Keep as is
4187 | """Reads file content asynchronously using aiofiles."""
4188 | async with aiofiles.open(path, mode="rb") as f:
4189 | content = await f.read()
4190 | return content
4191 |
4192 |
4193 | async def _write_file_async(path: Path, data: bytes): # Keep as is
4194 | """Writes bytes data to a file asynchronously using aiofiles."""
4195 | async with aiofiles.open(path, mode="wb") as f:
4196 | await f.write(data)
4197 |
4198 |
4199 | def _extract_tables_sync(path: Path) -> List[Dict]: # Keep as is
4200 | """Synchronously extracts tables from PDF, Excel, or CSV files."""
4201 | ext = path.suffix.lower()
4202 | results: List[Dict] = []
4203 | try:
4204 | if ext == ".pdf":
4205 | try:
4206 | import tabula # Optional dependency
4207 |
4208 | # Read all tables from all pages, keep data as strings
4209 | dfs = tabula.read_pdf(
4210 | str(path),
4211 | pages="all",
4212 | multiple_tables=True,
4213 | pandas_options={"dtype": str},
4214 | silent=True,
4215 | )
4216 | if dfs: # If tables were found
4217 | table_list = []
4218 | for i, df in enumerate(dfs):
4219 | # Convert DataFrame to list of dicts (rows)
4220 | rows_data = df.to_dict(orient="records")
4221 | table_entry = {"type": "pdf_table", "page": i + 1, "rows": rows_data}
4222 | table_list.append(table_entry)
4223 | results = table_list
4224 | except ImportError:
4225 | logger.debug("tabula-py library not installed. Skipping PDF table extraction.")
4226 | except Exception as pdf_err:
4227 | # Catch errors during Tabula processing
4228 | logger.warning(f"Tabula PDF table extraction failed for {path.name}: {pdf_err}")
4229 |
4230 | elif ext in (".xls", ".xlsx"):
4231 | try:
4232 | import pandas as pd # Optional dependency
4233 |
4234 | # Read all sheets, keep data as strings
4235 | xl_dict = pd.read_excel(str(path), sheet_name=None, dtype=str)
4236 | sheet_list = []
4237 | for sheet_name, df in xl_dict.items():
4238 | rows_data = df.to_dict(orient="records")
4239 | sheet_entry = {
4240 | "type": "excel_sheet",
4241 | "sheet_name": sheet_name,
4242 | "rows": rows_data,
4243 | }
4244 | sheet_list.append(sheet_entry)
4245 | results = sheet_list
4246 | except ImportError:
4247 | logger.debug(
4248 | "pandas/openpyxl/xlrd library not installed. Skipping Excel table extraction."
4249 | )
4250 | except Exception as excel_err:
4251 | logger.warning(f"Pandas Excel table extraction failed for {path.name}: {excel_err}")
4252 |
4253 | elif ext == ".csv":
4254 | try:
4255 | import pandas as pd # Optional dependency
4256 |
4257 | # Read CSV, keep data as strings
4258 | df = pd.read_csv(str(path), dtype=str)
4259 | rows_data = df.to_dict(orient="records")
4260 | # Create a list containing the single table representation
4261 | results = [{"type": "csv_table", "rows": rows_data}]
4262 | except ImportError:
4263 | logger.debug("pandas library not installed. Skipping CSV table extraction.")
4264 | except Exception as csv_err:
4265 | logger.warning(f"Pandas CSV table extraction failed for {path.name}: {csv_err}")
4266 |
4267 | except Exception as outer_err:
4268 | # Catch errors during import or setup
4269 | logger.error(f"Error during table extraction setup for {path.name}: {outer_err}")
4270 |
4271 | return results
4272 |
4273 |
4274 | async def _extract_tables_async(path: Path) -> list: # Uses global _log
4275 | """Asynchronously extracts tables by running sync helper in thread pool."""
4276 | try:
4277 | # Run the synchronous extraction function in the thread pool
4278 | tables = await asyncio.to_thread(_extract_tables_sync, path)
4279 | if tables:
4280 | num_tables = len(tables)
4281 | await _log("table_extract_success", file=str(path), num_tables=num_tables)
4282 | # Return the list of tables (or empty list if none found/error)
4283 | return tables
4284 | except Exception as e:
4285 | # Log error during async execution/threading
4286 | await _log("table_extract_error", file=str(path), error=str(e))
4287 | return [] # Return empty list on error
4288 |
4289 |
4290 | @resilient() # Keep the retry decorator if desired
4291 | async def smart_download(
4292 | page: Page,
4293 | task_hint: str,
4294 | dest_dir: Optional[Union[str, Path]] = None,
4295 | target_kwargs: Optional[Dict] = None,
4296 | ) -> Dict[str, Any]:
4297 |     """
4298 |     Initiates a download by clicking the element matching `task_hint`, saves it via Playwright,
4299 |     then reads the file back for analysis (SHA-256 hash, table extraction), managing paths via the FileSystem tools.
4300 |     """
4301 | final_dl_dir_path_str = "Unknown" # For logging context, default value
4302 | out_path: Optional[Path] = None # Define earlier for clarity, default None
4303 |
4304 | # --- Determine and Prepare Download Directory using FileSystemTool ---
4305 | try:
4306 | # Determine the target directory path string
4307 | if dest_dir:
4308 | download_dir_path_str = str(dest_dir)
4309 | else:
4310 | # Default: Use a path relative to the allowed 'storage' base directory
4311 | default_dl_subdir = "smart_browser_downloads"
4312 | download_dir_path_str = f"storage/{default_dl_subdir}"
4313 |
4314 | logger.info(
4315 | f"Ensuring download directory exists: '{download_dir_path_str}' using filesystem tool."
4316 | )
4317 | # Use STANDALONE create_directory tool
4318 | create_dir_result = await create_directory(path=download_dir_path_str)
4319 |
4320 | # Validate the result from the filesystem tool
4321 | if not isinstance(create_dir_result, dict) or not create_dir_result.get("success"):
4322 | error_detail = "Invalid response"
4323 | if isinstance(create_dir_result, dict):
4324 | error_detail = create_dir_result.get("error", "Unknown")
4325 | raise ToolError(
4326 | f"Failed to prepare download directory '{download_dir_path_str}'. Filesystem tool error: {error_detail}"
4327 | )
4328 |
4329 | # Use the actual absolute path returned by the tool
4330 | final_dl_dir_path_str = create_dir_result.get(
4331 | "path", download_dir_path_str
4332 | ) # Use path from result, fallback to input
4333 | final_dl_dir_path = Path(final_dl_dir_path_str) # Convert to Path object for local use
4334 | logger.info(f"Download directory confirmed/created at: {final_dl_dir_path}")
4335 |
4336 | except ToolError as e:
4337 | logger.error(
4338 | f"ToolError preparing download directory '{download_dir_path_str}': {e}", exc_info=True
4339 | )
4340 | raise # Re-raise ToolError
4341 | except Exception as e:
4342 | # Catch any other unexpected errors during directory prep
4343 | logger.error(
4344 | f"Unexpected error preparing download directory '{download_dir_path_str}': {e}",
4345 | exc_info=True,
4346 | )
4347 | raise ToolError(
4348 | f"An unexpected error occurred preparing download directory: {str(e)}"
4349 | ) from e
4350 | # --- End Directory Preparation ---
4351 |
4352 | # Prepare log details
4353 | log_target = {}
4354 | if target_kwargs:
4355 | log_target.update(target_kwargs)
4356 | else:
4357 | log_target["hint"] = task_hint
4358 |
4359 | try:
4360 | # --- Initiate Download ---
4361 | # Wait for the download event to occur after the click
4362 | download_timeout_ms = 60000 # 60 seconds for download to start
4363 | async with page.expect_download(timeout=download_timeout_ms) as dl_info:
4364 | # Use the smart_click helper function to trigger the download
4365 | click_timeout_ms = 10000 # 10 seconds for the click itself
4366 | await smart_click(
4367 | page, task_hint=task_hint, target_kwargs=target_kwargs, timeout_ms=click_timeout_ms
4368 | )
4369 | logger.debug(
4370 | f"Click initiated for download hint: '{task_hint}'. Waiting for download start..."
4371 | )
4372 |
4373 | # Get the Download object
4374 | dl = await dl_info.value
4375 | logger.info(
4376 | f"Download started. Suggested filename: '{dl.suggested_filename}', URL: {dl.url}"
4377 | )
4378 |
4379 | # Sanitize filename provided by browser
4380 | suggested_fname_raw = dl.suggested_filename
4381 | default_fname = f"download_{int(time.time())}.dat"
4382 | suggested_fname = suggested_fname_raw or default_fname
4383 |
4384 | # Remove potentially harmful characters
4385 | safe_fname_chars = re.sub(r"[^\w.\- ]", "_", suggested_fname)
4386 | # Replace whitespace with underscores
4387 | safe_fname_spaces = re.sub(r"\s+", "_", safe_fname_chars)
4388 | # Remove leading/trailing problematic characters
4389 | safe_fname_strip = safe_fname_spaces.strip("._-")
4390 | # Ensure filename is not empty after sanitization
4391 | safe_fname = safe_fname_strip or default_fname
4392 |
4393 | # --- Construct initial desired path (within the verified directory) ---
4394 | initial_desired_path = final_dl_dir_path / safe_fname
4395 |
4396 | # --- Get Unique Path using FileSystemTool ---
4397 | logger.debug(f"Requesting unique path based on initial suggestion: {initial_desired_path}")
4398 | try:
4399 | # Use STANDALONE get_unique_filepath tool
4400 | unique_path_result = await get_unique_filepath(path=str(initial_desired_path))
4401 | if not isinstance(unique_path_result, dict) or not unique_path_result.get("success"):
4402 | error_detail = "Invalid response"
4403 | if isinstance(unique_path_result, dict):
4404 | error_detail = unique_path_result.get("error", "Unknown")
4405 | raise ToolError(
4406 | f"Failed to get unique download path. Filesystem tool error: {error_detail}"
4407 | )
4408 |
4409 | final_unique_path_str = unique_path_result.get("path")
4410 | if not final_unique_path_str:
4411 | raise ToolError(
4412 | "Filesystem tool get_unique_filepath succeeded but did not return a path."
4413 | )
4414 |
4415 | out_path = Path(final_unique_path_str) # Use the unique path for saving
4416 | logger.info(f"Determined unique download save path: {out_path}")
4417 |
4418 | except ToolError as e:
4419 | logger.error(
4420 | f"Error determining unique download path based on '{initial_desired_path}': {e}",
4421 | exc_info=True,
4422 | )
4423 | raise # Re-raise ToolError
4424 | except Exception as e:
4425 | logger.error(
4426 | f"Unexpected error getting unique download path for '{initial_desired_path}': {e}",
4427 | exc_info=True,
4428 | )
4429 | raise ToolError(
4430 | f"An unexpected error occurred finding a unique save path: {str(e)}"
4431 | ) from e
4432 | # --- End Getting Unique Path ---
4433 |
4434 | # --- Save Download using Playwright ---
4435 | logger.info(f"Playwright saving download from '{dl.url}' to unique path: {out_path}")
4436 | # Playwright handles the actual streaming and saving to the specified path
4437 | await dl.save_as(out_path)
4438 | logger.info(f"Playwright download save complete: {out_path}")
4439 |
4440 | # --- Read back file DIRECTLY for Analysis (using out_path) ---
4441 | file_data: Optional[bytes] = None
4442 | file_size = -1
4443 | sha256_hash = None
4444 | read_back_error = None
4445 |
4446 | try:
4447 | # Read the file content using our async helper
4448 | logger.debug(f"Reading back downloaded file directly from {out_path} for analysis...")
4449 | file_data = await _read_file_async(out_path)
4450 | file_size = len(file_data)
4451 | logger.debug(f"Successfully read back {file_size} bytes from {out_path} directly.")
4452 |
4453 | # Handle potential errors during the direct read-back
4454 | except FileNotFoundError:
4455 | read_back_error = f"Downloaded file {out_path} disappeared before read-back."
4456 | # Optionally try to delete the potentially incomplete entry if FS allows
4457 | # try: await delete_path(str(out_path)) # Needs delete_path tool
4458 | # except Exception as del_e: logger.warning(f"Failed to cleanup missing file {out_path}: {del_e}")
4459 | except IOError as e:
4460 | read_back_error = f"IO error reading back downloaded file {out_path}: {e}"
4461 | except Exception as e:
4462 | read_back_error = f"Unexpected error reading back downloaded file {out_path}: {e}"
4463 | # Log full traceback for unexpected errors
4464 | logger.error(f"Unexpected error reading back {out_path}: {e}", exc_info=True)
4465 |
4466 | # If read-back failed, log and raise ToolError indicating partial success/failure
4467 | if read_back_error:
4468 | logger.error(read_back_error)
4469 | # Prepare info about the failed read-back
4470 | partial_info = {
4471 | "success": False, # Mark overall operation as failed due to analysis failure
4472 | "file_path": str(out_path),
4473 | "file_name": out_path.name,
4474 | "error": f"Download saved, but failed to read back for analysis: {read_back_error}",
4475 | "url": dl.url,
4476 | }
4477 | await _log("download_success_readback_fail", target=log_target, **partial_info)
4478 | # Raise ToolError to signal failure clearly to the caller
4479 | raise ToolError(partial_info["error"], details=partial_info)
4480 |
4481 | # --- Hashing and Table Extraction (if read-back succeeded) ---
4482 | # Compute hash from the bytes read directly
4483 | if file_data is not None: # Should always be true if read_back_error is None
4484 | sha256_hash = await _compute_hash_async(file_data)
4485 | logger.debug(f"Computed SHA256 hash for {out_path.name}: {sha256_hash[:8]}...")
4486 | else:
4487 | # This case should technically not be reachable if read_back_error is None
4488 | logger.error(
4489 | f"Internal state error: file_data is None after successful read back for {out_path}."
4490 | )
4491 | # Fallback hash or handle as error? For now, hash will be None.
4492 |
4493 | tables = []
4494 | # Check file extension to decide if table extraction is applicable
4495 | file_extension = out_path.suffix.lower()
4496 | is_table_extractable = file_extension in (".pdf", ".xls", ".xlsx", ".csv")
4497 |
4498 | if is_table_extractable:
4499 | logger.debug(f"Attempting table extraction for {out_path.name}...")
4500 | try:
4501 | # Use the async helper which runs sync extraction in a thread
4502 | # _extract_tables_async reads the file itself from out_path
4503 | table_extraction_task = asyncio.create_task(_extract_tables_async(out_path))
4504 | # Wait for extraction with a timeout
4505 | extraction_timeout = 120 # seconds
4506 | tables = await asyncio.wait_for(table_extraction_task, timeout=extraction_timeout)
4507 | if tables:
4508 | logger.info(
4509 | f"Successfully extracted {len(tables)} table(s) from {out_path.name}"
4510 | )
4511 | else:
4512 | logger.debug(f"No tables found or extracted from {out_path.name}")
4513 |
4514 | except asyncio.TimeoutError:
4515 | logger.warning(
4516 | f"Table extraction timed out after {extraction_timeout}s for {out_path.name}"
4517 | )
4518 | # Ensure the task is cancelled if it timed out
4519 | if "table_extraction_task" in locals() and not table_extraction_task.done():
4520 | table_extraction_task.cancel()
4521 | try:
4522 | # Give cancellation a moment to propagate (best effort)
4523 | await asyncio.wait_for(table_extraction_task, timeout=1.0)
4524 | except asyncio.CancelledError:
4525 | pass # Expected outcome of cancellation
4526 | except asyncio.TimeoutError:
4527 | logger.warning(
4528 | "Timeout waiting for table extraction task cancellation after initial timeout."
4529 | )
4530 | except Exception as cancel_err:
4531 | logger.warning(
4532 | f"Error during table extraction task cancellation: {cancel_err}"
4533 | )
4534 | # Continue, tables will remain empty list
4535 | except Exception as extract_err:
4536 | # Catch other errors during extraction process
4537 | logger.error(
4538 | f"Table extraction failed unexpectedly for {out_path.name}: {extract_err}",
4539 | exc_info=True,
4540 | )
4541 | # Continue, tables will remain empty list
4542 |
4543 | # --- Success ---
4544 | # Prepare the success result dictionary
4545 | info = {
4546 | "success": True,
4547 | "file_path": str(out_path), # Return the final unique absolute path
4548 | "file_name": out_path.name,
4549 | "sha256": sha256_hash, # Use the hash computed from read-back data
4550 | "size_bytes": file_size, # Use the size from read-back data
4551 | "url": dl.url, # URL the download originated from
4552 | "tables_extracted": bool(tables), # Indicate if tables were extracted
4553 | "tables": tables[:5], # Include a preview of first 5 tables (if any)
4554 | }
4555 | # Log success event (exclude large tables data from log details)
4556 | log_info_safe = info.copy()
4557 | if "tables" in log_info_safe:
4558 | del log_info_safe["tables"] # Remove tables for cleaner log
4559 | log_info_safe["num_tables"] = len(tables) if tables else 0
4560 | await _log("download_success", target=log_target, **log_info_safe)
4561 | return info
4562 |
4563 | # --- Error Handling (Catch errors from download initiation or Playwright saving) ---
4564 | except (ToolInputError, ToolError) as e:
4565 | # These errors are raised explicitly above (e.g., dir prep, unique path, read-back) or by smart_click
4566 | # Log the specific error type and message
4567 | error_path_context = str(out_path) if out_path else "N/A"
4568 | await _log("download_fail_other", target=log_target, error=str(e), path=error_path_context)
4569 | raise # Re-raise the specific ToolError/InputError
4570 | except PlaywrightTimeoutError as e:
4571 | # Timeout occurred during page.expect_download or within smart_click
4572 | error_path_context = str(out_path) if out_path else "N/A"
4573 | await _log(
4574 | "download_fail_timeout", target=log_target, error=str(e), path=error_path_context
4575 | )
4576 | raise ToolError(f"Download operation timed out: {e}") from e
4577 | except PlaywrightException as e:
4578 | # Other playwright errors during expect_download, save_as, or smart_click
4579 | error_path_context = str(out_path) if out_path else "N/A"
4580 | await _log(
4581 | "download_fail_playwright", target=log_target, error=str(e), path=error_path_context
4582 | )
4583 | raise ToolError(f"Download failed due to Playwright error: {e}") from e
4584 | except Exception as e:
4585 | # Catch-all for unexpected errors during the download process
4586 | error_path_context = str(out_path) if out_path else "N/A"
4587 | await _log(
4588 | "download_fail_unexpected", target=log_target, error=str(e), path=error_path_context
4589 | )
4590 | logger.error(
4591 | f"Unexpected error during smart_download for hint '{task_hint}': {e}", exc_info=True
4592 | ) # Log traceback
4593 | raise ToolError(f"Unexpected error during download: {e}") from e
4594 |
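# Usage sketch (illustrative only): driving `smart_download` from code that already holds a
# Playwright `page`. The hint text and destination directory are placeholders; `dest_dir`
# must resolve inside a directory the filesystem tools are allowed to write to.
#
#   info = await smart_download(
#       page,
#       task_hint="Download annual report PDF link",
#       dest_dir="storage/smart_browser_downloads/reports",
#   )
#   print(info["file_name"], info["size_bytes"], info["sha256"][:8])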
4595 |
4596 | # --- PDF/Docs Crawler Helpers (Keep as is, minor splits) ---
4597 | _SLUG_RE = re.compile(r"[^a-z0-9\-_]+")
4598 |
4599 |
4600 | def _slugify(text: str, max_len: int = 60) -> str:
4601 | """Converts text to a URL-friendly slug."""
4602 | if not text:
4603 | return "file" # Default slug for empty input
4604 |
4605 | # Normalize Unicode characters (e.g., accents to base letters)
4606 | normalized_text = unicodedata.normalize("NFKD", text)
4607 | # Encode to ASCII, ignoring characters that cannot be represented
4608 | ascii_bytes = normalized_text.encode("ascii", "ignore")
4609 | # Decode back to string
4610 | ascii_text = ascii_bytes.decode()
4611 | # Convert to lowercase
4612 | lower_text = ascii_text.lower()
4613 | # Replace non-alphanumeric (excluding '-', '_') with hyphens
4614 | slug_hyphens = _SLUG_RE.sub("-", lower_text)
4615 | # Remove leading/trailing hyphens
4616 | slug_trimmed = slug_hyphens.strip("-")
4617 | # Replace multiple consecutive hyphens with a single hyphen
4618 | slug_single_hyphens = re.sub(r"-{2,}", "-", slug_trimmed)
4619 | # Truncate to maximum length
4620 | slug_truncated = slug_single_hyphens[:max_len]
4621 | # Trim hyphens again after potential truncation
4622 | final_slug = slug_truncated.strip("-")
4623 |
4624 | # Ensure slug is not empty after all operations
4625 | return final_slug or "file" # Return default if empty
4626 |
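# Behaviour sketch (illustrative only): expected outputs of `_slugify`, traced from the
# steps above; outputs are indicative.
#
#   _slugify("My Report (Final).PDF")  ->  "my-report-final-pdf"
#   _slugify("Résumé 2024")            ->  "resume-2024"
#   _slugify("")                       ->  "file"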
4627 |
4628 | def _get_dir_slug(url: str) -> str:
4629 | """Creates a slug based on the last path components or domain of a URL."""
4630 | try:
4631 | parsed_url = urlparse(url)
4632 | # Split path into components, filtering out empty strings and root slash
4633 | path_obj = Path(parsed_url.path)
4634 | path_parts = []
4635 | for part in path_obj.parts:
4636 | if part and part != "/":
4637 | path_parts.append(part)
4638 |
4639 | # Create slug based on path components
4640 | num_parts = len(path_parts)
4641 | if num_parts >= 2:
4642 | # Use last two path parts if available
4643 | part_minus_2_slug = _slugify(path_parts[-2], 20)
4644 | part_minus_1_slug = _slugify(path_parts[-1], 20)
4645 | dir_slug = f"{part_minus_2_slug}-{part_minus_1_slug}"
4646 | return dir_slug
4647 | elif num_parts == 1:
4648 | # Use the single path part
4649 | part_slug = _slugify(path_parts[-1], 40)
4650 | return part_slug
4651 | else:
4652 | # Fallback to domain name if path is empty or just '/'
4653 | domain_slug = _slugify(parsed_url.netloc, 40)
4654 | return domain_slug or "domain" # Use 'domain' if netloc is also empty
4655 |
4656 | except Exception as e:
4657 | logger.warning(f"Error creating directory slug for URL '{url}': {e}")
4658 | return "path" # Fallback slug on error
4659 |
4660 |
4661 | async def _fetch_html(
4662 | client: httpx.AsyncClient, url: str, rate_limiter: Optional["RateLimiter"] = None
4663 | ) -> Optional[str]:
4664 | """Fetches HTML content from a URL using httpx, respecting rate limits."""
4665 | try:
4666 | # Acquire rate limit permit if limiter is provided
4667 | if rate_limiter:
4668 | await rate_limiter.acquire()
4669 |
4670 | # Make GET request with streaming response
4671 | request_timeout = 20.0
4672 | async with client.stream(
4673 | "GET", url, follow_redirects=True, timeout=request_timeout
4674 | ) as response:
4675 | # Check for non-success status codes
4676 | response.raise_for_status() # Raises HTTPStatusError for 4xx/5xx
4677 |
4678 | # Handle No Content response
4679 | if response.status_code == 204:
4680 | logger.debug(f"Received HTTP 204 No Content for {url}")
4681 | return None
4682 |
4683 | # Check content type - must be HTML
4684 | content_type_header = response.headers.get("content-type", "")
4685 | content_type = content_type_header.lower()
4686 | if "text/html" not in content_type:
4687 | logger.debug(f"Skipping non-HTML content type '{content_type}' for {url}")
4688 | return None
4689 |
4690 | # Check content length limit
4691 | max_html_size = 5 * 1024 * 1024 # 5 MiB
4692 | content_length_header = response.headers.get("content-length")
4693 | if content_length_header:
4694 | try:
4695 | content_length = int(content_length_header)
4696 | if content_length > max_html_size:
4697 | logger.debug(
4698 | f"Skipping large HTML content ({content_length} bytes) for {url}"
4699 | )
4700 | return None
4701 | except ValueError:
4702 | logger.warning(
4703 | f"Invalid Content-Length header '{content_length_header}' for {url}"
4704 | )
4705 | # Proceed cautiously without length check
4706 |
4707 | # Read the response body bytes
4708 | html_bytes = await response.aread()
4709 |
4710 | # Decode HTML bytes to string (try UTF-8, then fallback)
4711 | decoded_html: Optional[str] = None
4712 | try:
4713 | decoded_html = html_bytes.decode("utf-8")
4714 | except UnicodeDecodeError:
4715 | try:
4716 | # Fallback to Latin-1 if UTF-8 fails
4717 | decoded_html = html_bytes.decode("iso-8859-1")
4718 | logger.debug(f"Decoded HTML from {url} using iso-8859-1 fallback.")
4719 | except UnicodeDecodeError:
4720 | # Log warning if both decodings fail
4721 | logger.warning(f"Could not decode HTML from {url} using utf-8 or iso-8859-1.")
4722 | return None # Cannot process undecodable content
4723 |
4724 | return decoded_html
4725 |
4726 | except httpx.HTTPStatusError as e:
4727 | # Log client/server errors (4xx/5xx)
4728 | status_code = e.response.status_code
4729 | logger.debug(f"HTTP error {status_code} fetching {url}: {e}")
4730 | return None
4731 | except httpx.RequestError as e:
4732 | # Log network-related errors (DNS, connection, timeout etc.)
4733 | logger.warning(f"Network error fetching {url}: {e}")
4734 | return None
4735 | except Exception as e:
4736 | # Log other unexpected errors during fetch
4737 | logger.error(f"Unexpected error fetching {url}: {e}", exc_info=True)
4738 | return None
4739 |
4740 |
4741 | def _extract_links(base_url: str, html: str) -> Tuple[List[str], List[str]]:
4742 | """Extracts absolute PDF and internal HTML page links from HTML content."""
4743 | pdfs: Set[str] = set()
4744 | pages: Set[str] = set()
4745 | try:
4746 | soup = BeautifulSoup(html, "html.parser") # Use default parser
4747 | parsed_base_url = urlparse(base_url)
4748 | base_netloc = parsed_base_url.netloc
4749 |
4750 | # Find all <a> tags with an href attribute
4751 | anchor_tags = soup.find_all("a", href=True)
4752 |
4753 | for a in anchor_tags:
4754 | href_raw = a["href"]
4755 | # Skip empty, fragment, mailto, tel, or javascript links
4756 | if not href_raw or href_raw.startswith(("#", "mailto:", "tel:", "javascript:")):
4757 | continue
4758 |
4759 | try:
4760 | # Resolve relative URLs to absolute URLs
4761 | abs_url = urllib.parse.urljoin(base_url, href_raw)
4762 | parsed_url = urlparse(abs_url)
4763 |
4764 | # Clean URL by removing fragment identifier
4765 | clean_url = parsed_url._replace(fragment="").geturl()
4766 | path_lower = parsed_url.path.lower()
4767 |
4768 | # Check if it's a PDF link
4769 | if path_lower.endswith(".pdf"):
4770 | pdfs.add(clean_url)
4771 | # Check if it's an internal HTML page link
4772 | elif parsed_url.netloc == base_netloc:
4773 | # Check if path seems like HTML or directory listing
4774 | is_html_like = path_lower.endswith((".html", ".htm", "/"))
4775 | # Or if it has no file extension in the last path segment
4776 | path_name = Path(parsed_url.path).name
4777 | has_no_ext = "." not in path_name
4778 | # Ensure it's not mistakenly identified as PDF again
4779 | not_pdf = not path_lower.endswith(".pdf")
4780 |
4781 | if (is_html_like or has_no_ext) and not_pdf:
4782 | pages.add(clean_url)
4783 |
4784 | except ValueError:
4785 | # Ignore errors resolving invalid URLs (e.g., bad characters)
4786 | pass
4787 | except Exception as link_err:
4788 | # Log other errors during link processing
4789 | logger.warning(f"Error processing link '{href_raw}' on page {base_url}: {link_err}")
4790 |
4791 | except Exception as soup_err:
4792 | # Log errors during BeautifulSoup parsing
4793 | logger.error(f"Error parsing HTML for links on {base_url}: {soup_err}", exc_info=True)
4794 |
4795 | # Return lists of unique PDF and page URLs found
4796 | return list(pdfs), list(pages)
4797 |
4798 |
4799 | class RateLimiter: # Keep class definition
4800 | """Simple asynchronous rate limiter using asyncio.Lock."""
4801 |
4802 | def __init__(self, rate_limit: float = 1.0):
4803 | if rate_limit <= 0:
4804 | raise ValueError("Rate limit must be positive.")
4805 | # Calculate the minimum interval between requests in seconds
4806 | self.interval = 1.0 / rate_limit
4807 | self.last_request_time: float = 0 # Time of the last request completion
4808 | self.lock = asyncio.Lock() # Lock to ensure atomic check/wait/update
4809 |
4810 | async def acquire(self):
4811 | """Acquires a permit, sleeping if necessary to maintain the rate limit."""
4812 | async with self.lock:
4813 | now = time.monotonic()
4814 | time_since_last = now - self.last_request_time
4815 | # Calculate how long we need to wait
4816 | time_to_wait = self.interval - time_since_last
4817 |
4818 | if time_to_wait > 0:
4819 | # Sleep for the required duration
4820 | await asyncio.sleep(time_to_wait)
4821 | # Update 'now' after sleeping
4822 | now = time.monotonic()
4823 |
4824 | # Update the last request time to the current time
4825 | self.last_request_time = now
4826 |
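# Usage sketch (illustrative only): throttling fetches to ~2 requests/second with the
# RateLimiter above. `fetch_one` is a hypothetical coroutine, not part of this module.
#
#   limiter = RateLimiter(rate_limit=2.0)
#
#   async def fetch_one(client: httpx.AsyncClient, url: str) -> Optional[str]:
#       await limiter.acquire()  # sleeps as needed to keep ~0.5s between requests
#       resp = await client.get(url)
#       return resp.text if resp.status_code == 200 else None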
4827 |
4828 | async def crawl_for_pdfs(
4829 | start_url: str,
4830 | include_regex: Optional[str] = None,
4831 | max_depth: int = 2,
4832 | max_pdfs: int = 100,
4833 | max_pages_crawl: int = 500,
4834 | rate_limit_rps: float = 2.0,
4835 | ) -> List[str]:
4836 | """Crawls a website to find PDF links."""
4837 | # Compile include regex if provided
4838 | inc_re: Optional[re.Pattern] = None
4839 | if include_regex:
4840 | try:
4841 | inc_re = re.compile(include_regex, re.IGNORECASE)
4842 | except re.error as e:
4843 | raise ToolInputError(f"Invalid include_regex provided: {e}") from e
4844 |
4845 | # Initialize crawl state
4846 | seen_urls: Set[str] = set()
4847 | pdf_urls_found: Set[str] = set()
4848 | # Queue stores tuples of (url, depth)
4849 | queue: deque[tuple[str, int]] = deque()
4850 | queue.append((start_url, 0)) # Start at depth 0
4851 | seen_urls.add(start_url)
4852 | visit_count = 0
4853 | rate_limiter = RateLimiter(rate_limit_rps)
4854 | base_netloc = urlparse(start_url).netloc
4855 | # Basic user agent for politeness
4856 | headers = {
4857 | "User-Agent": "Mozilla/5.0 (compatible; SmartBrowserBot/1.0; +http://example.com/bot)"
4858 | }
4859 |
4860 | # Use httpx.AsyncClient for connection pooling
4861 | client_timeout = 30.0
4862 | async with httpx.AsyncClient(
4863 | follow_redirects=True, timeout=client_timeout, headers=headers
4864 | ) as client:
4865 | # Main crawl loop
4866 | while queue:
4867 | # Check stopping conditions
4868 | if len(pdf_urls_found) >= max_pdfs:
4869 | logger.info(f"PDF crawl stopped: Max PDFs ({max_pdfs}) reached.")
4870 | break
4871 | if visit_count >= max_pages_crawl:
4872 | logger.warning(f"PDF crawl stopped: Max pages crawled ({max_pages_crawl}) reached.")
4873 | break
4874 |
4875 | # Get next URL and depth from queue
4876 | current_url, current_depth = queue.popleft()
4877 | visit_count += 1
4878 | logger.debug(f"Crawling [Depth {current_depth}, Visit {visit_count}]: {current_url}")
4879 |
4880 | # Fetch HTML content for the current page
4881 | html = await _fetch_html(client, current_url, rate_limiter)
4882 | if not html:
4883 | continue # Skip if fetch failed or not HTML
4884 |
4885 | # Extract links from the fetched HTML
4886 | pdfs, pages = _extract_links(current_url, html)
4887 |
4888 | # Process found PDF links
4889 | for pdf_url in pdfs:
4890 | if pdf_url not in pdf_urls_found:
4891 | # Apply include regex if specified
4892 | if inc_re is None or inc_re.search(pdf_url):
4893 | pdf_urls_found.add(pdf_url)
4894 | logger.info(f"PDF found: {pdf_url} (Total: {len(pdf_urls_found)})")
4895 | # Check if max PDFs reached after adding
4896 | if len(pdf_urls_found) >= max_pdfs:
4897 | break # Exit inner loop
4898 |
4899 | # Check max PDFs again after processing all PDFs on page
4900 | if len(pdf_urls_found) >= max_pdfs:
4901 | break # Exit outer loop
4902 |
4903 | # Process found HTML page links for further crawling
4904 | if current_depth < max_depth:
4905 | for page_url in pages:
4906 | try:
4907 | parsed_page_url = urlparse(page_url)
4908 | # Only crawl pages on the same domain and not seen before
4909 | is_same_domain = parsed_page_url.netloc == base_netloc
4910 | is_not_seen = page_url not in seen_urls
4911 | if is_same_domain and is_not_seen:
4912 | seen_urls.add(page_url)
4913 | # Add to queue with incremented depth
4914 | queue.append((page_url, current_depth + 1))
4915 | except ValueError:
4916 | # Ignore errors parsing potential page URLs
4917 | pass
4918 |
4919 | # Log final counts after loop finishes
4920 | logger.info(
4921 | f"PDF crawl finished. Found {len(pdf_urls_found)} matching PDFs after visiting {visit_count} pages."
4922 | )
4923 | return list(pdf_urls_found)
4924 |
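# Usage sketch (illustrative only): crawling a site for PDFs. The start URL and regex are
# placeholders; results depend entirely on the live site.
#
#   pdf_links = await crawl_for_pdfs(
#       start_url="https://example.org/docs/",
#       include_regex=r"whitepaper|report",
#       max_depth=2,
#       max_pdfs=25,
#   )
#   for link in pdf_links:
#       print(link)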
4925 |
4926 | async def _download_file_direct(
4927 | url: str, dest_dir_str: str, seq: int = 1
4928 | ) -> Dict: # Uses Filesystem Tools
4929 | """Downloads a file directly using httpx and saves using filesystem tools."""
4930 | final_output_path_str: Optional[str] = None # Path where file is ultimately saved
4931 | downloaded_content: Optional[bytes] = None
4932 | initial_filename = "" # Keep track for error reporting
4933 |
4934 | try:
4935 | # --- Determine Initial Filename ---
4936 | parsed_url = urlparse(url)
4937 | path_basename = os.path.basename(parsed_url.path) if parsed_url.path else ""
4938 |
4939 | # Create a filename if URL path is empty or root, or has no extension
4940 | use_generated_name = not path_basename or path_basename == "/" or "." not in path_basename
4941 |
4942 | if use_generated_name:
4943 | dir_slug = _get_dir_slug(url) # Slug based on parent path or domain
4944 | base_name = f"{seq:03d}_{dir_slug}_{_slugify(path_basename or 'download')}"
4945 | # Add appropriate extension (default .dat)
4946 | file_ext = ".pdf" if url.lower().endswith(".pdf") else ".dat"
4947 | initial_filename = base_name + file_ext
4948 | else:
4949 | # Use and sanitize the filename from the URL path
4950 | sanitized_basename = _slugify(path_basename)
4951 | initial_filename = f"{seq:03d}_{sanitized_basename}"
4952 |
4953 | # Initial desired path within the destination directory
4954 | initial_desired_path = os.path.join(dest_dir_str, initial_filename)
4955 | refined_desired_path = initial_desired_path # Start with initial path
4956 |
4957 | # --- Fetch File Content ---
4958 | headers = {
4959 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", # Standard UA
4960 | "Accept": "*/*",
4961 | "Accept-Encoding": "gzip, deflate, br",
4962 | "Connection": "keep-alive",
4963 | }
4964 | download_timeout = 120.0 # Allow 2 minutes for download
4965 | async with httpx.AsyncClient(
4966 | follow_redirects=True, timeout=download_timeout, headers=headers
4967 | ) as client:
4968 | async with client.stream("GET", url) as response:
4969 | # Check for successful status code
4970 | if response.status_code != 200:
4971 | error_msg = f"HTTP {response.status_code} {response.reason_phrase}"
4972 | status_code = response.status_code
4973 | # Return error dictionary immediately
4974 | return {
4975 | "url": url,
4976 | "error": error_msg,
4977 | "status_code": status_code,
4978 | "success": False,
4979 | "path": initial_desired_path, # Report intended path on error
4980 | }
4981 |
4982 | # --- Refine Filename based on Headers (Content-Disposition, Content-Type) ---
4983 | # Check Content-Disposition header for filename suggestion
4984 | content_disposition = response.headers.get("content-disposition")
4985 | if content_disposition:
4986 | # Simple regex to find filename*= or filename=
4987 | match = re.search(r'filename\*?="?([^"]+)"?', content_disposition)
4988 | if match:
4989 | header_filename_raw = match.group(1)
4990 | # Try URL decoding potential encoding
4991 | try:
4992 | header_filename_decoded = urllib.parse.unquote(header_filename_raw)
4993 | except Exception:
4994 | header_filename_decoded = header_filename_raw # Fallback
4995 | # Sanitize and prepend sequence number
4996 | refined_filename = f"{seq:03d}_{_slugify(header_filename_decoded)}"
4997 | refined_desired_path = os.path.join(dest_dir_str, refined_filename)
4998 | logger.debug(
4999 | f"Refined filename from Content-Disposition: {refined_filename}"
5000 | )
5001 |
5002 | # Check Content-Type header to potentially correct extension
5003 | content_type_header = response.headers.get("content-type", "")
5004 | content_type = content_type_header.split(";")[0].strip().lower()
5005 | current_stem, current_ext = os.path.splitext(refined_desired_path)
5006 | # Correct extension if Content-Type is PDF and current ext isn't
5007 | if content_type == "application/pdf" and current_ext.lower() != ".pdf":
5008 | refined_desired_path = current_stem + ".pdf"
5009 | logger.debug("Corrected file extension to .pdf based on Content-Type.")
5010 |
5011 | # Read the downloaded content
5012 | downloaded_content = await response.aread()
5013 | bytes_read = len(downloaded_content)
5014 | logger.debug(f"Downloaded {bytes_read} bytes for {url}.")
5015 |
5016 | # Ensure content was downloaded
5017 | if downloaded_content is None:
5018 | raise ToolError(
5019 | "Downloaded content is unexpectedly None after successful HTTP request."
5020 | )
5021 |
5022 | # --- Get Unique Save Path using Filesystem Tool ---
5023 | try:
5024 | unique_path_result = await get_unique_filepath(
5025 | path=refined_desired_path
5026 | ) # STANDALONE call
5027 | if not isinstance(unique_path_result, dict) or not unique_path_result.get("success"):
5028 | error_msg = (
5029 | unique_path_result.get("error", "Unknown")
5030 | if isinstance(unique_path_result, dict)
5031 | else "Invalid response"
5032 | )
5033 | raise ToolError(f"Failed to get unique download path. Error: {error_msg}")
5034 |
5035 | final_output_path_str = unique_path_result.get("path")
5036 | if not final_output_path_str:
5037 | raise ToolError(
5038 | "Filesystem tool get_unique_filepath succeeded but did not return path."
5039 | )
5040 | logger.info(f"Determined unique download save path: {final_output_path_str}")
5041 | except Exception as e:
5042 | # Wrap error getting unique path
5043 | raise ToolError(
5044 | f"Could not determine unique save path based on '{refined_desired_path}': {str(e)}"
5045 | ) from e
5046 |
5047 | # --- Write File using Filesystem Tool ---
5048 | try:
5049 | write_result = await write_file(
5050 | path=final_output_path_str, content=downloaded_content
5051 | ) # STANDALONE call
5052 | if not isinstance(write_result, dict) or not write_result.get("success"):
5053 | error_msg = (
5054 | write_result.get("error", "Unknown")
5055 | if isinstance(write_result, dict)
5056 | else "Invalid response"
5057 | )
5058 | raise ToolError(
5059 | f"Filesystem tool failed to write downloaded file to '{final_output_path_str}'. Error: {error_msg}"
5060 | )
5061 | logger.info(f"Successfully saved file to: {final_output_path_str}")
5062 | except Exception as e:
5063 | # Wrap error during file write
5064 | raise ToolError(
5065 | f"Could not write downloaded file to '{final_output_path_str}': {str(e)}"
5066 | ) from e
5067 |
5068 | # --- Calculate Hash ---
5069 | hasher = hashlib.sha256()
5070 | hasher.update(downloaded_content)
5071 | file_hash = hasher.hexdigest()
5072 |
5073 | # --- Log and Return Success ---
5074 | await _log(
5075 | "download_direct_success",
5076 | url=url,
5077 | file=final_output_path_str,
5078 | size=bytes_read,
5079 | sha256=file_hash,
5080 | )
5081 | return {
5082 | "url": url,
5083 | "file": final_output_path_str, # The actual saved path
5084 | "size": bytes_read,
5085 | "sha256": file_hash,
5086 | "success": True,
5087 | }
5088 |
5089 | except httpx.RequestError as e:
5090 | # Handle network errors during download attempt
5091 | logger.warning(f"Network error downloading {url}: {e}")
5092 | return {
5093 | "url": url,
5094 | "error": f"Network error: {e}",
5095 | "success": False,
5096 | "path": final_output_path_str or initial_filename,
5097 | } # Report final path if available
5098 | except (ToolError, ToolInputError) as e:
5099 | # Handle errors raised explicitly during path/write operations
5100 | logger.error(f"Tool error downloading {url} directly: {e}", exc_info=True)
5101 | return {
5102 | "url": url,
5103 | "error": f"Download failed: {e}",
5104 | "success": False,
5105 | "path": final_output_path_str or initial_filename,
5106 | }
5107 | except Exception as e:
5108 | # Catch any other unexpected errors
5109 | logger.error(f"Unexpected error downloading {url} directly: {e}", exc_info=True)
5110 | return {
5111 | "url": url,
5112 | "error": f"Download failed unexpectedly: {e}",
5113 | "success": False,
5114 | "path": final_output_path_str or initial_filename,
5115 | }
5116 |
5117 |
5118 | # --- OSS Documentation Crawler Helpers ---
5119 | _DOC_EXTS = (".html", ".htm", "/") # Common extensions/endings for HTML pages
5120 | _DOC_STOP_PAT = re.compile(
5121 | r"\.(png|jpg|jpeg|gif|svg|css|js|zip|tgz|gz|whl|exe|dmg|ico|woff|woff2|map|json|xml|txt|pdf|md)$", # Added pdf, md
5122 | re.IGNORECASE,
5123 | ) # File extensions to ignore during crawl
5124 |
5125 |
5126 | def _looks_like_docs_url(url: str) -> bool:
5127 | """
5128 | Heuristically checks if a URL looks like a documentation page.
5129 |
5130 | Args:
5131 | url: The URL string to check.
5132 |
5133 | Returns:
5134 | True if the URL appears to be a documentation page, False otherwise.
5135 | """
5136 | if not url or not isinstance(url, str):
5137 | return False
5138 |
5139 | try:
5140 | url_low = url.lower()
5141 | parsed = urllib.parse.urlparse(url_low)
5142 |
5143 | # 1. Penalize URLs with query strings (often dynamic/non-doc pages)
5144 | if parsed.query:
5145 | return False
5146 |
5147 | # 2. Penalize common non-doc paths explicitly
5148 | common_non_doc_paths = [
5149 | # Common application paths
5150 | "/api/", # Sometimes docs, but often API endpoints themselves
5151 | "/blog/",
5152 | "/news/",
5153 | "/community/",
5154 | "/forum/",
5155 | "/support/",
5156 | "/contact/",
5157 | "/about/",
5158 | "/pricing/",
5159 | "/login/",
5160 | "/register/",
5161 | "/signup/",
5162 | "/signin/",
5163 | "/account/",
5164 | "/profile/",
5165 | "/cart/",
5166 | "/checkout/",
5167 | # Common asset/download paths
5168 | "/download/",
5169 | "/install/",
5170 | "/_static/",
5171 | "/_images/",
5172 | "/assets/",
5173 | "/media/",
5174 | "/static/",
5175 | "/vendor/",
5176 | "/node_modules/",
5177 | # Specific framework/site paths unlikely to be main docs
5178 | "/wp-content/",
5179 | "/wp-admin/",
5180 | "/sites/default/files/",
5181 | ]
5182 | # Use a generator expression for slightly better efficiency
5183 | if any(non_doc_path in parsed.path for non_doc_path in common_non_doc_paths):
5184 | return False
5185 |
5186 | # 3. Check for keywords indicating documentation in URL or path
5187 | doc_keywords = [
5188 | "docs",
5189 | "doc",
5190 | "documentation",
5191 | "guide",
5192 | "manual",
5193 | "tutorial",
5194 | "tuto",
5195 | "reference",
5196 | "ref",
5197 | "api",
5198 | "faq",
5199 | "howto",
5200 | "userguide",
5201 | "develop",
5202 | "example",
5203 | "usage",
5204 | "getting-started",
5205 | "quickstart",
5206 | ]
5207 | # Check in netloc (e.g., docs.example.com) and path
5208 | has_doc_keyword = any(
5209 | keyword in parsed.netloc or keyword in parsed.path for keyword in doc_keywords
5210 | )
5211 |
5212 | # 4. Check if URL ends with typical HTML extension or directory slash
5213 | ends_with_doc_ext = url_low.endswith(_DOC_EXTS)
5214 |
5215 | # 5. Check if URL is hosted on a common documentation platform
5216 | common_doc_hosts = [
5217 | "readthedocs.io",
5218 | "netlify.app",
5219 | "vercel.app",
5220 | "github.io",
5221 | "gitlab.io",
5222 | "pages.dev", # Cloudflare Pages
5223 | "gitbook.io",
5224 | "docusaurus.io", # Often custom domains, but sometimes subdomains
5225 | ]
5226 | is_common_host = any(host in parsed.netloc for host in common_doc_hosts)
5227 |
5228 | # 6. Check if URL path contains a file extension we want to stop at
5229 | path_has_stop_ext = bool(_DOC_STOP_PAT.search(parsed.path))
5230 |
5231 | # Combine checks:
5232 | # - MUST NOT have a stop extension
5233 | # - MUST satisfy one of the positive indicators:
5234 | # - Contains a documentation keyword
5235 | # - Ends like an HTML page or directory
5236 | # - Is hosted on a common documentation platform
5237 | is_likely_doc = not path_has_stop_ext and (
5238 | has_doc_keyword or ends_with_doc_ext or is_common_host
5239 | )
5240 |
5241 | # Log decision process if debugging needed
5242 | # logger.debug(f"URL Check: {url_low} -> StopExt:{path_has_stop_ext}, Keyword:{has_doc_keyword}, DocExt:{ends_with_doc_ext}, CommonHost:{is_common_host} => LikelyDoc:{is_likely_doc}")
5243 |
5244 | return is_likely_doc
5245 |
5246 | except ValueError: # Handle potential errors from urlparse
5247 | logger.warning(f"Error parsing URL for documentation check: {url}", exc_info=True)
5248 | return False
5249 | except Exception as e: # Catch any other unexpected errors
5250 | logger.error(f"Unexpected error in _looks_like_docs_url for {url}: {e}", exc_info=True)
5251 | return False
5252 |
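# Behaviour sketch (illustrative only): indicative results of the heuristic above; the
# URLs are placeholders and the outcomes track the rules as currently written.
#
#   _looks_like_docs_url("https://docs.example.com/en/latest/")        ->  True   (doc keyword, trailing slash)
#   _looks_like_docs_url("https://example.readthedocs.io/en/stable/")  ->  True   (common docs host)
#   _looks_like_docs_url("https://example.com/blog/announcement.html") ->  False  (penalized /blog/ path)
#   _looks_like_docs_url("https://example.com/files/release.zip")      ->  False  (stop extension .zip)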
5253 |
5254 | async def _pick_docs_root(pkg_name: str) -> Optional[str]:
5255 | """
5256 | Attempts to find the root documentation URL for a package using web search.
5257 |
5258 | Uses multiple search queries and engines, then applies heuristics (_looks_like_docs_url)
5259 | to find the most likely documentation root URL.
5260 |
5261 | Args:
5262 | pkg_name: The name of the package to find documentation for.
5263 |
5264 | Returns:
5265 | The most likely documentation root URL as a string, or None if not found.
5266 |
5267 | Raises:
5268 | ToolInputError: If the package name is invalid.
5269 | ToolError: If the web search fails critically or no suitable URL is found.
5270 | """
5271 | if not pkg_name or not isinstance(pkg_name, str):
5272 | raise ToolInputError("Package name must be a non-empty string.")
5273 |
5274 | try:
5275 | logger.info(f"Searching for documentation root for package: '{pkg_name}'")
5276 |
5277 | # --- Prepare search queries and engines ---
5278 | queries = [
5279 | f'"{pkg_name}" official documentation website', # More precise
5280 | f"{pkg_name} documentation",
5281 | f"{pkg_name} python library docs", # Specific to python
5282 | f"{pkg_name} user guide",
5283 | f"how to use {pkg_name}",
5284 | ]
5285 | # Cycle engines to mitigate potential blocks/bias or differing results
5286 | engines = ["duckduckgo", "bing"]
5287 | all_search_hits: List[Dict[str, Any]] = []
5288 | MAX_RESULTS_PER_QUERY = 3 # Get fewer results per query, but run more queries
5289 |
5290 | # --- Run searches ---
5291 | for i, query in enumerate(queries):
5292 | engine = engines[i % len(engines)]
5293 | logger.debug(f"Trying search query [{i + 1}/{len(queries)}]: '{query}' on {engine}")
5294 | try:
5295 | await asyncio.sleep(0.2) # Small delay between searches
5296 | # Assuming search_web returns a list of dicts directly now
5297 | search_res_list = await search_web(
5298 | query, engine=engine, max_results=MAX_RESULTS_PER_QUERY
5299 | )
5300 | if isinstance(search_res_list, list):
5301 | all_search_hits.extend(search_res_list)
5302 | else:
5303 | # Log if search_web returns unexpected format (it shouldn't based on its definition)
5304 | logger.warning(
5305 | f"Search query '{query}' on {engine} returned unexpected format: {type(search_res_list)}. Expected list."
5306 | )
5307 |
5308 | except ToolError as search_err:
5309 | # Log specific tool errors from search_web but continue trying other queries
5310 | logger.warning(f"Web search query '{query}' failed on {engine}: {search_err}")
5311 | except Exception as e:
5312 | # Log unexpected errors during a specific search call but continue
5313 | logger.error(
5314 | f"Unexpected error during web search for query '{query}': {e}", exc_info=True
5315 | )
5316 |
5317 | # Check if any results were gathered at all
5318 | if not all_search_hits:
5319 | raise ToolError(
5320 | f"Web search yielded no results for documentation queries related to '{pkg_name}'."
5321 | )
5322 |
5323 | # --- Evaluate results ---
5324 | logger.debug(
5325 | f"Evaluating {len(all_search_hits)} potential documentation URLs for '{pkg_name}'."
5326 | )
5327 | best_candidate: Optional[str] = None
5328 | candidate_urls_considered: Set[str] = set()
5329 |
5330 | for i, hit in enumerate(all_search_hits): # Add index for logging
5331 | url = hit.get("url")
5332 | title = hit.get("title", "N/A") # Get title for context
5333 | logger.debug(
5334 | f" Hit [{i + 1}/{len(all_search_hits)}]: URL='{url}', Title='{title}'"
5335 | ) # Log the hit being processed
5336 |
5337 | if not url:
5338 | logger.debug(" -> Skipping hit (no URL)")
5339 | continue
5340 |
5341 | # Basic URL cleaning: normalize scheme, netloc, path; remove fragment
5342 | try:
5343 | parsed_hit = urllib.parse.urlparse(url)
5344 | # Remove www. prefix for easier comparison
5345 | cleaned_netloc = parsed_hit.netloc.lower().replace("www.", "")
5346 | # Reconstruct URL without fragment, using cleaned netloc
5347 | cleaned_url = parsed_hit._replace(fragment="", netloc=cleaned_netloc).geturl()
5348 |
5349 | # Ensure URL is not already processed (avoids redundant checks)
5350 | if cleaned_url in candidate_urls_considered:
5351 | logger.debug(f" -> Skipping hit (already considered: {cleaned_url})")
5352 | continue
5353 | candidate_urls_considered.add(cleaned_url)
5354 |
5355 | except ValueError:
5356 | # Handle potential errors during URL parsing
5357 | logger.warning(f" -> Skipping hit (invalid URL): {url}")
5358 | continue
5359 |
5360 | # Apply the heuristic check (_looks_like_docs_url assumes it's defined elsewhere)
5361 | is_likely = _looks_like_docs_url(cleaned_url)
5362 | logger.debug(
5363 | f" -> Heuristic check for '{cleaned_url}': {is_likely}"
5364 | ) # Log heuristic result
5365 |
5366 | if is_likely:
5367 | logger.info(
5368 | f"Found likely documentation page via search: {cleaned_url} (Original: {url})"
5369 | )
5370 | # Simple strategy: take the *first* likely candidate found.
5371 | best_candidate = cleaned_url
5372 | break # Stop after finding the first likely candidate
5373 |
5374 | # --- Fallback if heuristic finds nothing ---
5375 | if not best_candidate and all_search_hits:
5376 | # Fallback: Take the first result URL, clean it, and hope for the best.
5377 | first_url_original = all_search_hits[0].get("url")
5378 | if first_url_original:
5379 | try:
5380 | parsed_first = urllib.parse.urlparse(first_url_original)
5381 | # Perform the same cleaning as above for consistency
5382 | cleaned_first_netloc = parsed_first.netloc.lower().replace("www.", "")
5383 | cleaned_first_url = parsed_first._replace(
5384 | fragment="", netloc=cleaned_first_netloc
5385 | ).geturl()
5386 | logger.warning(
5387 | f"_looks_like_docs_url heuristic failed. Falling back to first search result: {cleaned_first_url}"
5388 | )
5389 | best_candidate = cleaned_first_url
5390 | except ValueError:
5391 | logger.error(f"Could not parse fallback first URL: {first_url_original}")
5392 | # best_candidate remains None, error will be raised below
5393 |
5394 | # --- Final Check and Root Derivation ---
5395 | if not best_candidate:
5396 | logger.error(
5397 | f"Could not find any suitable documentation URL for '{pkg_name}' after evaluating {len(candidate_urls_considered)} candidates."
5398 | )
5399 | # Optionally log considered URLs if helpful for debugging
5400 | # logger.debug(f"Considered URLs: {candidate_urls_considered}")
5401 | raise ToolError(
5402 | f"Could not automatically find a likely documentation site for package '{pkg_name}'. Web search did not yield a suitable URL."
5403 | )
5404 |
5405 | # Try to derive a more "root" URL from the best candidate found
5406 | final_root_url: str
5407 | try:
5408 | parsed_candidate = urllib.parse.urlparse(best_candidate)
5409 | path_segments = [seg for seg in parsed_candidate.path.split("/") if seg]
5410 |
5411 | # If the path has multiple segments, try going up one level
5412 | # Only do this if the parent path still looks like documentation
5413 | if len(path_segments) > 1:
5414 | parent_path = "/".join(path_segments[:-1])
5415 | # Ensure trailing slash for derived root URL, clear query/fragment
5416 | root_derived = parsed_candidate._replace(
5417 | path=f"/{parent_path}/", query="", fragment=""
5418 | ).geturl()
5419 |
5420 | # Check if the derived parent path still looks like docs
5421 | if _looks_like_docs_url(root_derived):
5422 | logger.info(
5423 | f"Derived potential docs root by going up one level: {root_derived}"
5424 | )
5425 | final_root_url = root_derived
5426 | else:
5427 | # Parent doesn't look like docs, stick with the cleaned candidate URL
5428 | final_root_url = parsed_candidate._replace(query="", fragment="").geturl()
5429 | logger.info(
5430 | f"Parent path '{parent_path}/' didn't seem like docs root. Using original candidate (cleaned): {final_root_url}"
5431 | )
5432 | else:
5433 | # Only one path segment or root path, use the cleaned candidate URL as is
5434 | final_root_url = parsed_candidate._replace(query="", fragment="").geturl()
5435 | logger.info(
5436 | f"Candidate URL is shallow or root. Using cleaned candidate as root: {final_root_url}"
5437 | )
5438 |
5439 | except Exception as parse_err:
5440 | # Handle errors during parsing or root derivation
5441 | logger.warning(
5442 | f"Error parsing/deriving root from best candidate URL {best_candidate}: {parse_err}. Using candidate as is (cleaned)."
5443 | )
5444 | # Fallback: Clean the best candidate URL (remove query/fragment) and return it
5445 | try:
5446 | parsed_fallback = urllib.parse.urlparse(best_candidate)
5447 | final_root_url = parsed_fallback._replace(query="", fragment="").geturl()
5448 | except ValueError:
5449 | # Should not happen if best_candidate was parseable before, but handle defensively
5450 | logger.error(
5451 | f"Failed to parse even the fallback candidate {best_candidate}. Returning original candidate."
5452 | )
5453 | final_root_url = best_candidate # Last resort
5454 |
5455 | return final_root_url
5456 |
5457 | # Note: ToolError is raised explicitly above if no candidate found or web search fails.
5458 | # This catch block handles unexpected errors during the process.
5459 | except Exception as e:
5460 | logger.error(
5461 | f"Unexpected error finding documentation root for '{pkg_name}': {e}", exc_info=True
5462 | )
5463 | # Raise a generic ToolError indicating the failure cause
5464 | raise ToolError(
5465 | f"An unexpected error occurred while finding documentation for '{pkg_name}': {str(e)}"
5466 | ) from e
5467 |
5468 |
5469 | # Import optional libraries for summarization, handle missing imports
5470 | try:
5471 | import trafilatura
5472 | except ImportError:
5473 | trafilatura = None
5474 | logger.debug("trafilatura library not found, summarization quality may be reduced.")
5475 | try:
5476 | from readability import Document # Using python-readability (lxml based)
5477 | except ImportError:
5478 | Document = None
5479 | logger.debug("readability-lxml library not found, summarization quality may be reduced.")
5480 |
5481 |
5482 | def _summarize_html_sync(html: str, max_len: int = 10000) -> str:
5483 | """Synchronously extracts main text content from HTML using multiple libraries."""
5484 | if not html:
5485 | return ""
5486 |
5487 | # Limit input HTML size to prevent excessive memory/CPU usage
5488 | MAX_HTML_SIZE = 3 * 1024 * 1024 # 3 MiB
5489 | if len(html) > MAX_HTML_SIZE:
5490 | logger.warning(f"HTML content truncated to {MAX_HTML_SIZE} characters for summarization.")
5491 | html = html[:MAX_HTML_SIZE]
5492 |
5493 | text = ""
5494 |
5495 | # 1. Try Trafilatura (often good for articles/main content)
5496 | if trafilatura is not None:
5497 | try:
5498 | # Favor precision over recall, exclude comments/tables
5499 | extracted = trafilatura.extract(
5500 | html, include_comments=False, include_tables=False, favor_precision=True
5501 | )
5502 | if (
5503 | extracted and len(extracted) > 100
5504 | ): # Basic check if extraction yielded substantial text
5505 | text = extracted
5506 | logger.debug("Summarized HTML using Trafilatura.")
5507 | except Exception as e:
5508 | logger.warning(f"Trafilatura failed during HTML summarization: {e}")
5509 | # Continue to next method if it fails
5510 |
5511 | # 2. Try Readability-lxml if Trafilatura failed or yielded short text
5512 | if (not text or len(text) < 200) and Document is not None:
5513 | try:
5514 | doc = Document(html)
5515 | # Get summary HTML (main content block)
5516 | summary_html = doc.summary(html_partial=True)
5517 | # Parse the summary HTML and extract text
5518 | soup = BeautifulSoup(
5519 | summary_html, "html.parser"
5520 | ) # Use html.parser for potentially partial HTML
5521 | extracted_text = soup.get_text(" ", strip=True)
5522 | if extracted_text and len(extracted_text) > 50: # Lower threshold for readability
5523 | text = extracted_text
5524 | logger.debug("Summarized HTML using Readability-lxml.")
5525 | except Exception as e:
5526 | logger.warning(f"Readability-lxml failed during HTML summarization: {e}")
5527 | # Continue to fallback if it fails
5528 |
5529 | # 3. Fallback: BeautifulSoup basic text extraction (if others failed/short)
5530 | if not text or len(text) < 100:
5531 | logger.debug("Using BeautifulSoup fallback for HTML summarization.")
5532 | try:
5533 | soup = BeautifulSoup(html, "lxml") # Use lxml for robustness
5534 | # Remove common non-content tags before text extraction
5535 | tags_to_remove = [
5536 | "script",
5537 | "style",
5538 | "nav",
5539 | "header",
5540 | "footer",
5541 | "aside",
5542 | "form",
5543 | "figure",
5544 | "figcaption",
5545 | "noscript",
5546 | ]
5547 | found_tags = soup(tags_to_remove)
5548 | for tag in found_tags:
5549 | tag.decompose()
5550 | # Get remaining text, join with spaces, strip extra whitespace
5551 | extracted_text = soup.get_text(" ", strip=True)
5552 | text = extracted_text # Use BS result even if short
5553 | except Exception as e:
5554 | logger.warning(f"BeautifulSoup fallback failed during HTML summarization: {e}")
5555 | # text might remain empty if BS also fails
5556 |
5557 | # Final cleanup: normalize whitespace and truncate
5558 | cleaned_text = re.sub(r"\s+", " ", text).strip()
5559 | final_text = cleaned_text[:max_len]
5560 | return final_text
5561 |
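# A minimal illustrative sketch (not part of the original module): how the tiered
# extraction above is expected to behave on a small page. The HTML sample is
# hypothetical, and this helper is never called at import time.
def _example_summarize_html_sync() -> str:
    sample_html = (
        "<html><body><nav>Site menu</nav>"
        "<article><p>Main article text that should survive extraction.</p></article>"
        "</body></html>"
    )
    # Boilerplate tags such as <nav> are stripped; the article text is returned,
    # truncated to max_len characters.
    return _summarize_html_sync(sample_html, max_len=500)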
5562 |
5563 | async def _grab_readable(
5564 | client: httpx.AsyncClient, url: str, rate_limiter: RateLimiter
5565 | ) -> Optional[str]:
5566 | """Fetches HTML and extracts readable text content asynchronously."""
5567 | # Fetch HTML using the helper function
5568 | html = await _fetch_html(client, url, rate_limiter)
5569 | if html:
5570 | # Run the synchronous summarization function in the thread pool
5571 | readable_text = await _run_in_thread(_summarize_html_sync, html)
5572 | return readable_text
5573 | else:
5574 | # Return None if HTML fetch failed
5575 | return None
5576 |
5577 |
5578 | async def crawl_docs_site(
5579 | root_url: str, max_pages: int = 40, rate_limit_rps: float = 3.0
5580 | ) -> List[Tuple[str, str]]:
5581 | """Crawls a documentation site starting from root_url and extracts readable text."""
5582 | # Validate root URL and get starting domain
5583 | try:
5584 | parsed_start_url = urlparse(root_url)
5585 | start_netloc = parsed_start_url.netloc
5586 | if not start_netloc:
5587 | raise ValueError("Root URL must have a valid domain name.")
5588 | except (ValueError, AssertionError) as e:
5589 | raise ToolInputError(
5590 | f"Invalid root URL provided for documentation crawl: '{root_url}'. Error: {e}"
5591 | ) from e
5592 |
5593 | # Initialize crawl state
5594 | seen_urls: Set[str] = set()
5595 | queue: deque[str] = deque()
5596 | queue.append(root_url) # Start with the root URL
5597 | seen_urls.add(root_url)
5598 | # List to store tuples of (url, extracted_text)
5599 | output_pages: List[Tuple[str, str]] = []
5600 | visit_count = 0
5601 | # Set a max number of visits to prevent infinite loops on large/cyclic sites
5602 | max_visits = max(max_pages * 5, 200) # Visit more URLs than pages needed
5603 | rate_limiter = RateLimiter(rate_limit_rps)
5604 | headers = {"User-Agent": "Mozilla/5.0 (compatible; SmartBrowserDocBot/1.0)"}
5605 | logger.info(
5606 | f"Starting documentation crawl from: {root_url} (Max pages: {max_pages}, Max visits: {max_visits})"
5607 | )
5608 |
5609 | # Use httpx.AsyncClient for connection pooling
5610 | client_timeout = 30.0
5611 | async with httpx.AsyncClient(
5612 | follow_redirects=True, timeout=client_timeout, headers=headers
5613 | ) as client:
5614 | # Main crawl loop
5615 | while queue:
5616 | # Check stopping conditions
5617 | if len(output_pages) >= max_pages:
5618 | logger.info(f"Doc crawl stopped: Reached max pages ({max_pages}).")
5619 | break
5620 | if visit_count >= max_visits:
5621 | logger.warning(f"Doc crawl stopped: Reached max visits ({max_visits}).")
5622 | break
5623 |
5624 | # Get next URL from queue
5625 | current_url = queue.popleft()
5626 | visit_count += 1
5627 | logger.debug(
5628 | f"Doc Crawl [Visit {visit_count}/{max_visits}, Found {len(output_pages)}/{max_pages}]: {current_url}"
5629 | )
5630 |
5631 | # Grab readable text content from the URL
5632 | readable_text = await _grab_readable(client, current_url, rate_limiter)
5633 |
5634 | # If readable text was extracted, add it to results
5635 | if readable_text:
5636 | output_pages.append((current_url, readable_text))
5637 | logger.debug(
5638 | f"Collected readable content from: {current_url} (Length: {len(readable_text)})"
5639 | )
5640 |
5641 | # Check if max pages reached after adding
5642 | if len(output_pages) >= max_pages:
5643 | break # Exit loop early
5644 |
5645 | # Fetch the HTML again to extract links for further crawling. _grab_readable
5646 | # returns only simplified text, so the raw HTML is needed here; passing the
5647 | # HTML between the two calls would avoid this second network request.
5648 | html_for_links = await _fetch_html(client, current_url, rate_limiter)
5649 | if html_for_links:
5650 | _, page_links = _extract_links(current_url, html_for_links)
5651 | # Process found page links
5652 | for link_url in page_links:
5653 | try:
5654 | parsed_link = urlparse(link_url)
5655 | # Check if link is on the same domain
5656 | is_same_domain = parsed_link.netloc == start_netloc
5657 | # Check if it looks like a doc page we haven't seen
5658 | is_doc_link = _looks_like_docs_url(link_url)
5659 | is_not_seen = link_url not in seen_urls
5660 |
5661 | if is_same_domain and is_doc_link and is_not_seen:
5662 | seen_urls.add(link_url)
5663 | queue.append(link_url) # Add to crawl queue
5664 | except ValueError:
5665 | # Ignore errors parsing potential link URLs
5666 | pass
5667 | else:
5668 | logger.debug(f"No readable content extracted from: {current_url}")
5669 |
5670 | # Log final results after loop finishes
5671 | logger.info(
5672 | f"Documentation crawl finished. Collected content from {len(output_pages)} pages after {visit_count} visits."
5673 | )
5674 | return output_pages
5675 |
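# Illustrative only: one way the crawler above might be driven once a docs root
# URL has been discovered. The URL below is a hypothetical placeholder.
async def _example_crawl_docs_site() -> None:
    pages = await crawl_docs_site("https://example-project.readthedocs.io/en/latest/", max_pages=5)
    for url, text in pages:
        logger.info(f"Crawled {url}: {len(text)} characters of readable text")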
5676 |
5677 | # --- Page State Extraction ---
5678 | async def get_page_state(
5679 | page: Page, max_elements: Optional[int] = None
5680 | ) -> dict[str, Any]: # Uses global _log
5681 | """Extracts the current state of the page using the page map functionality."""
5682 | if max_elements is not None:
5683 | # Note: _max_widgets_global now controls element count in _build_page_map
5684 | logger.warning(
5685 | "get_page_state 'max_elements' argument is deprecated and has no effect. Use global config 'max_widgets' instead."
5686 | )
5687 |
5688 | # Check if page is valid
5689 | if not page or page.is_closed():
5690 | logger.warning("get_page_state called on closed or invalid page.")
5691 | return {
5692 | "error": "Page is closed or invalid",
5693 | "url": getattr(page, "url", "unknown"), # Try to get URL even if closed
5694 | "title": "[Error: Page Closed]",
5695 | "elements": [],
5696 | "main_text": "",
5697 | }
5698 |
5699 | start_time = time.monotonic()
5700 | try:
5701 | # Use the helper function to build (or retrieve cached) page map
5702 | page_map, fingerprint = await _build_page_map(page)
5703 | duration = time.monotonic() - start_time
5704 | duration_ms = int(duration * 1000)
5705 | num_elements = len(page_map.get("elements", []))
5706 | page_url = page_map.get("url")
5707 | page_title = page_map.get("title")
5708 |
5709 | # Log successful extraction
5710 | await _log(
5711 | "page_state_extracted",
5712 | url=page_url,
5713 | title=page_title,
5714 | duration_ms=duration_ms,
5715 | num_elements=num_elements,
5716 | fp=fingerprint[:8],
5717 | )
5718 |
5719 | # Return the constructed page map
5720 | return page_map
5721 |
5722 | except Exception as e:
5723 | # Catch any unexpected errors during state extraction
5724 | duration = time.monotonic() - start_time
5725 | duration_ms = int(duration * 1000)
5726 | page_url = page.url or "unknown" # Get URL directly from page on error
5727 | logger.error(f"Error getting page state for {page_url}: {e}", exc_info=True)
5728 | # Log error event
5729 | await _log(
5730 | "page_error", action="get_state", url=page_url, error=str(e), duration_ms=duration_ms
5731 | )
5732 | # Return error structure
5733 | return {
5734 | "error": f"Failed to get page state: {e}",
5735 | "url": page_url,
5736 | "title": "[Error Getting State]",
5737 | "elements": [],
5738 | "main_text": "",
5739 | }
5740 |
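# Illustrative only: a sketch of how the extracted page state feeds the macro
# planner defined later in this module (assumes an open Playwright Page).
async def _example_state_to_plan(page: Page) -> List[Dict[str, Any]]:
    state = await get_page_state(page)
    return await _plan_macro(state, task="Accept the cookie consent banner")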
5741 |
5742 | # --- LLM Bridge ---
5743 | def _extract_json_block(text: str) -> Optional[str]:
5744 | """Extracts the first JSON code block (markdown or bare) from text."""
5745 | # Try finding markdown code block first ```json ... ```
5746 | pattern_md = r"```json\s*(\{.*\}|\[.*\])\s*```"
5747 | match_markdown = re.search(pattern_md, text, re.DOTALL)
5748 | if match_markdown:
5749 | json_str = match_markdown.group(1).strip()
5750 | return json_str
5751 |
5752 | # Try finding bare JSON object or array { ... } or [ ... ]
5753 | # This is less reliable, might match partial structures
5754 | pattern_bare = r"(\{.*\}|\[.*\])"
5755 | match_bare = re.search(pattern_bare, text, re.DOTALL)
5756 | if match_bare:
5757 | block = match_bare.group(0)
5758 | # Basic sanity check for balanced braces/brackets
5759 | has_balanced_braces = block.count("{") == block.count("}")
5760 | has_balanced_brackets = block.count("[") == block.count("]")
5761 | if has_balanced_braces and has_balanced_brackets:
5762 | return block.strip() # Return the matched bare block
5763 |
5764 | # No JSON block found
5765 | return None
5766 |
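# Illustrative only: the two extraction paths handled above, shown on
# hypothetical LLM replies (fenced markdown block vs. bare JSON).
def _example_extract_json_block() -> None:
    fenced = 'Sure, here is the plan:\n```json\n{"action": "finish"}\n```'
    bare = "The result is [1, 2, 3] as requested."
    logger.debug(_extract_json_block(fenced))  # -> '{"action": "finish"}'
    logger.debug(_extract_json_block(bare))  # -> '[1, 2, 3]'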
5767 |
5768 | def _llm_resilient(max_attempts: int = 3, backoff: float = 1.0):
5769 | """Decorator for LLM calls, retrying on rate limits and transient errors."""
5770 |
5771 | def wrap(fn):
5772 | @functools.wraps(fn)
5773 | async def inner(*a, **kw):
5774 | attempt = 0
5775 | while True:
5776 | try:
5777 | # Add delay before retrying (not on first attempt)
5778 | if attempt > 0:
5779 | delay_factor = 2 ** (attempt - 1)
5780 | base_delay = backoff * delay_factor
5781 | jitter = random.uniform(0.8, 1.2)
5782 | jitter_delay = base_delay * jitter
5783 | logger.debug(
5784 | f"LLM resilient retry {attempt}: Sleeping for {jitter_delay:.2f}s..."
5785 | )
5786 | await asyncio.sleep(jitter_delay)
5787 | # Call the wrapped LLM function
5788 | result = await fn(*a, **kw)
5789 | return result
5790 |
5791 | except ProviderError as e:
5792 | # Check if it's a rate limit error (common for 429 status)
5793 | err_str_lower = str(e).lower()
5794 | is_rate_limit = (
5795 | "429" in str(e) # Check status code in error message
5796 | or "rate limit" in err_str_lower
5797 | or "too many requests" in err_str_lower
5798 | or "quota" in err_str_lower
5799 | )
5800 | if is_rate_limit:
5801 | attempt += 1
5802 | func_name = getattr(fn, "__name__", "?")
5803 | if attempt >= max_attempts:
5804 | logger.error(
5805 | f"LLM rate limit: '{func_name}' failed after {max_attempts} attempts: {e}"
5806 | )
5807 | raise ToolError(
5808 | f"LLM rate-limit exceeded after {max_attempts} attempts: {e}"
5809 | ) from e
5810 |
5811 | # Check for Retry-After header suggestion in error
5812 | retry_after_seconds = None
5813 | retry_after_match = re.search(r"retry[- ]after[: ]+(\d+)", err_str_lower)
5814 | if retry_after_match:
5815 | try:
5816 | retry_after_seconds = int(retry_after_match.group(1))
5817 | except ValueError:
5818 | pass # Ignore if number parsing fails
5819 |
5820 | # Honor Retry-After if the provider supplied one; otherwise rely on
5821 | # the exponential-backoff sleep at the top of the next loop iteration.
5822 | if retry_after_seconds:
5823 | delay = retry_after_seconds
5824 | logger.warning(
5825 | f"LLM rate limit for '{func_name}'. Waiting suggested {delay:.2f}s before retry (attempt {attempt}/{max_attempts})"
5826 | )
5827 | # Sleep the suggested time now; the generic backoff sleep at the
5828 | # loop start adds a small extra cushion on top of it.
5829 | await asyncio.sleep(delay)
5830 | else:
5831 | delay_factor = 2 ** (attempt - 1)
5832 | base_delay = backoff * delay_factor
5833 | jitter = random.uniform(0.8, 1.2)
5834 | delay = base_delay * jitter
5835 | logger.warning(
5836 | f"LLM rate limit for '{func_name}'. Retrying in ~{delay:.2f}s (attempt {attempt}/{max_attempts})"
5837 | )
5838 |
5839 | continue  # Go to next iteration to retry
5840 | else:
5841 | # Different ProviderError, re-raise
5842 | raise
5843 | except (httpx.RequestError, asyncio.TimeoutError) as e:
5844 | # Handle transient network errors or timeouts
5845 | attempt += 1
5846 | func_name = getattr(fn, "__name__", "?")
5847 | if attempt >= max_attempts:
5848 | logger.error(
5849 | f"LLM call: '{func_name}' failed due to transient error after {max_attempts} attempts: {e}"
5850 | )
5851 | raise ToolError(
5852 | f"LLM call failed after {max_attempts} attempts: {e}"
5853 | ) from e
5854 | # The exponential-backoff sleep happens at the top of the next loop iteration.
5861 | logger.warning(
5862 | f"LLM transient error for '{func_name}'. Retrying (attempt {attempt}/{max_attempts}). Error: {e}"
5863 | )
5864 | continue # Go to next iteration
5865 | except Exception:
5866 | # For any other unexpected errors, re-raise immediately
5867 | raise
5868 |
5869 | return inner
5870 |
5871 | return wrap
5872 |
5873 |
5874 | @_llm_resilient(max_attempts=3, backoff=1.0)
5875 | async def _call_llm(
5876 | messages: Sequence[Dict[str, str]],
5877 | model: str = _llm_model_locator_global,
5878 | expect_json: bool = False,
5879 | temperature: float = 0.1,
5880 | max_tokens: int = 1024,
5881 | ) -> Union[Dict[str, Any], List[Any]]: # Uses global _log
5882 | """Makes a call to the LLM using the standalone chat_completion tool."""
5883 | if not messages:
5884 | logger.error("_call_llm received empty messages list.")
5885 | return {"error": "No messages provided to LLM."}
5886 |
5887 | # Determine provider and model name
5888 | llm_provider = Provider.OPENAI.value # Default provider
5889 | llm_model_name = model # Default model name
5890 | if model:
5891 | try:
5892 | extracted_provider, extracted_model = parse_model_string(model)
5893 | if extracted_provider:
5894 | llm_provider = extracted_provider
5895 | if extracted_model:
5896 | llm_model_name = extracted_model
5897 | except Exception as parse_err:
5898 | logger.warning(f"Could not parse model string '{model}': {parse_err}. Using defaults.")
5899 |
5900 | # Prepare arguments for chat_completion
5901 | llm_args: Dict[str, Any] = {
5902 | "provider": llm_provider,
5903 | "model": llm_model_name,
5904 | "messages": list(messages), # Ensure it's a mutable list
5905 | "temperature": temperature,
5906 | "max_tokens": max_tokens,
5907 | "additional_params": {}, # For provider-specific params like response_format
5908 | }
5909 |
5910 | # Handle JSON mode expectation
5911 | use_json_instruction = (
5912 | False # Flag to add manual instruction if native JSON mode fails/unsupported
5913 | )
5914 | if expect_json:
5915 | try:
5916 | # Check if the provider/model combination supports native JSON response format
5917 | provider_instance = await get_provider(llm_provider)
5918 | # Example check (adapt based on actual provider capabilities)
5919 | supports_native_json = False
5920 | if llm_provider == Provider.OPENAI.value and llm_model_name.startswith(
5921 | ("gpt-4", "gpt-3.5-turbo")
5922 | ): # Check specific OpenAI models known to support it
5923 | supports_native_json = True
5924 | # Or use a generic check if provider interface defines it
5925 | elif hasattr(provider_instance, "supports_json_response_format"):
5926 | supports_native_json = await provider_instance.supports_json_response_format(
5927 | llm_model_name
5928 | )
5929 |
5930 | if supports_native_json:
5931 | logger.debug(
5932 | f"Provider '{llm_provider}' model '{llm_model_name}' supports native JSON mode."
5933 | )
5934 | # Add the provider-specific parameter for JSON mode
5935 | # This varies by provider (e.g., OpenAI uses response_format)
5936 | if llm_provider == Provider.OPENAI.value:
5937 | llm_args["additional_params"]["response_format"] = {"type": "json_object"}
5938 | # Add other providers' JSON format params here if needed
5939 | use_json_instruction = False # Native mode used
5940 | else:
5941 | logger.debug(
5942 | f"Provider '{llm_provider}' model '{llm_model_name}' does not natively support JSON mode. Using manual instruction."
5943 | )
5944 | use_json_instruction = True # Need manual instruction
5945 | except Exception as e:
5946 | logger.warning(
5947 | f"Could not determine native JSON support for provider '{llm_provider}': {e}. Assuming manual instruction needed."
5948 | )
5949 | use_json_instruction = True
5950 |
5951 | # Add manual JSON instruction if needed
5952 | if use_json_instruction:
5953 | json_instruction = "\n\nIMPORTANT: Respond ONLY with valid JSON. Your entire response must start with `{` or `[` and end with `}` or `]`. Do not include ```json markers, comments, or any explanatory text before or after the JSON structure."
5954 | modified_messages = list(llm_args["messages"]) # Work on a copy
5955 | # Append instruction to the last user message, or add a new user message
5956 | if modified_messages and modified_messages[-1]["role"] == "user":
5957 | modified_messages[-1]["content"] += json_instruction
5958 | else:
5959 | # Add a new user message if last wasn't 'user' or list was empty
5960 | modified_messages.append(
5961 | {
5962 | "role": "user",
5963 | "content": "Provide the response based on the previous messages."
5964 | + json_instruction,
5965 | }
5966 | )
5967 | llm_args["messages"] = modified_messages # Update args with modified messages
5968 |
5969 | # Make the actual call to the standalone chat_completion tool
5970 | try:
5971 | start_time = time.monotonic()
5972 | resp = await chat_completion(**llm_args)
5973 | duration = time.monotonic() - start_time
5974 | duration_ms = int(duration * 1000)
5975 | model_returned = resp.get(
5976 | "model", llm_model_name
5977 | ) # Use model returned in response if available
5978 | is_success = resp.get("success", False)
5979 | is_cached = resp.get("cached_result", False)
5980 |
5981 | # Log the completion details
5982 | await _log(
5983 | "llm_call_complete",
5984 | model=model_returned,
5985 | duration_ms=duration_ms,
5986 | success=is_success,
5987 | cached=is_cached,
5988 | provider=llm_provider,
5989 | )
5990 |
5991 | # Process the response
5992 | if not is_success:
5993 | error_msg = resp.get("error", "LLM call failed with no specific error message.")
5994 | # Try to get raw response details for debugging
5995 | raw_resp_detail = None
5996 | if isinstance(resp.get("details"), dict):
5997 | raw_resp_detail = resp["details"].get("raw_response")
5998 | if not raw_resp_detail:
5999 | raw_resp_detail = resp.get("raw_response") # Fallback check
6000 | logger.warning(
6001 | f"LLM call failed: {error_msg}. Raw response preview: {str(raw_resp_detail)[:200]}"
6002 | )
6003 | return {"error": f"LLM API Error: {error_msg}", "raw_response": raw_resp_detail}
6004 |
6005 | # Extract content from the successful response message
6006 | assistant_message = resp.get("message", {})
6007 | content = assistant_message.get("content")
6008 | raw_text = content.strip() if isinstance(content, str) else ""
6009 |
6010 | if not raw_text:
6011 | logger.warning("LLM returned empty response content.")
6012 | return {"error": "LLM returned empty response content."}
6013 |
6014 | # Handle based on whether JSON was expected
6015 | if not expect_json:
6016 | # Return the raw text directly
6017 | return {"text": raw_text}
6018 | else:
6019 | # Attempt to parse the response as JSON
6020 | try:
6021 | # Try direct JSON parsing first
6022 | parsed_json = json.loads(raw_text)
6023 | return parsed_json
6024 | except json.JSONDecodeError:
6025 | # If direct parsing fails, try extracting a JSON block
6026 | logger.warning(
6027 | "LLM response was not valid JSON directly. Trying to extract JSON block..."
6028 | )
6029 | json_block = _extract_json_block(raw_text)
6030 | if json_block:
6031 | try:
6032 | parsed_block = json.loads(json_block)
6033 | logger.warning(
6034 | "Successfully parsed JSON block extracted from LLM response."
6035 | )
6036 | return parsed_block
6037 | except json.JSONDecodeError as e:
6038 | # Error parsing the extracted block
6039 | block_preview = json_block[:500]
6040 | error_msg = f"Could not parse extracted JSON block: {e}. Block preview: {block_preview}..."
6041 | logger.error(error_msg)
6042 | return {
6043 | "error": error_msg,
6044 | "raw_response": raw_text[:1000],
6045 | } # Return raw text for debugging
6046 | else:
6047 | # No valid JSON block found within the text
6048 | error_msg = "Could not parse JSON from LLM response (no valid block found)."
6049 | logger.error(error_msg)
6050 | return {
6051 | "error": error_msg,
6052 | "raw_response": raw_text[:1000],
6053 | } # Return raw text for debugging
6054 |
6055 | except ProviderError as e:
6056 | # Catch errors raised by the chat_completion tool itself (e.g., auth, config)
6057 | logger.error(f"LLM Provider error during chat_completion call: {e}")
6058 | raw_resp_detail = None
6059 | if hasattr(e, "details") and isinstance(getattr(e, "details", None), dict):
6060 | raw_resp_detail = e.details.get("raw_response")
6061 | return {"error": f"LLM Provider Error: {e}", "raw_response": raw_resp_detail}
6062 | except Exception as e:
6063 | # Catch any other unexpected errors during the call or processing
6064 | logger.error(f"Unexpected error during LLM call: {e}", exc_info=True)
6065 | return {"error": f"LLM call failed unexpectedly: {e}"}
6066 |
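# Illustrative only (hypothetical prompt and limits): what a JSON-mode call
# through _call_llm looks like from a caller's perspective. On success the
# return value is the parsed JSON; on failure it is a dict with an "error" key.
async def _example_call_llm_json() -> Union[Dict[str, Any], List[Any]]:
    msgs = [{"role": "user", "content": "List two primary colors as a JSON array of strings."}]
    return await _call_llm(msgs, expect_json=True, temperature=0.0, max_tokens=64)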
6067 |
6068 | # --- Macro/Autopilot Planners ---
6069 | ALLOWED_ACTIONS = {"click", "type", "wait", "download", "extract", "finish", "scroll"}
6070 |
6071 |
6072 | async def _plan_macro(
6073 | page_state: Dict[str, Any], task: str, model: str = _llm_model_locator_global
6074 | ) -> List[Dict[str, Any]]: # Uses global _llm_model_locator_global
6075 | """Generates a sequence of browser actions (macro steps) based on page state and a task."""
6076 | # Detailed description of allowed actions for the LLM
6077 | action_details = """
6078 | Allowed Actions:
6079 | - `click`: Clicks an element. Requires `task_hint` (description of the element to click).
6080 | - `type`: Types text into an input field. Requires `task_hint` (description of the field) and `text` (the text to type). Optional: `enter: true` to press Enter after typing, `clear_before: false` to avoid clearing field first.
6081 | - `wait`: Pauses execution. Requires `ms` (milliseconds to wait). Use sparingly for unavoidable dynamic content delays.
6082 | - `download`: Clicks a link/button to initiate a download. Requires `task_hint` (description of download element). Optional: `dest` (destination directory path relative to storage).
6083 | - `extract`: Extracts text from elements matching a CSS selector. Requires `selector`. Returns a list of strings.
6084 | - `scroll`: Scrolls the page. Requires `direction` ('up', 'down', 'top', 'bottom'). Optional: `amount_px` (pixels for 'up'/'down', default 500).
6085 | - `finish`: Indicates the task is complete. No arguments needed. Should be the last step if the task goal is achieved.
6086 | """
6087 |
6088 | # Prepare summary of elements for the LLM prompt
6089 | elements_summary = []
6090 | elements_list = page_state.get("elements", [])
6091 | for el in elements_list:
6092 | el_id = el.get("id")
6093 | el_tag = el.get("tag")
6094 | el_role = el.get("role", " ")
6095 | el_text = el.get("text", " ")
6096 | max_text_len = 80
6097 | truncated_text = el_text[:max_text_len] + ("..." if len(el_text) > max_text_len else "")
6098 | summary_str = f"id={el_id} tag={el_tag} role='{el_role}' text='{truncated_text}'"
6099 | elements_summary.append(summary_str)
6100 |
6101 | # System prompt for the macro planner LLM
6102 | system_prompt = textwrap.dedent(f"""
6103 | You are an expert web automation assistant. Your goal is to create a sequence of steps (a macro) to accomplish a user's task on the current web page.
6104 | You will be given the current page state (URL, Title, main text content, and a list of interactive elements with their IDs, tags, roles, and text).
6105 | You will also be given the user's task.
6106 | Based on the page state and task, generate a JSON list of action steps.
6107 |
6108 | EACH step in the list MUST be a JSON object containing an "action" key specifying the action name (e.g., "click", "type").
6109 | Other keys in the object should be the required arguments for that action (e.g., "task_hint", "text", "ms", "selector", "direction").
6110 |
6111 | {action_details}
6112 |
6113 | Generate ONLY the JSON list of steps following this structure: `[ {{"action": "action_name", "arg1": "value1", ...}}, ... ]`.
6114 |
6115 | DO NOT include explanations or markdown formatting!
6116 |
6117 | If the task seems impossible or cannot be mapped to the available actions/elements, return an empty list `[]`.
6118 |
6119 | If the task is already complete based on the current state (e.g., "find the price" and price is visible), you can return a `finish` step or an empty list.
6120 | """).strip()
6121 |
6122 | # User prompt with page state and task
6123 | elements_str = "\n".join(elements_summary)
6124 | main_text_preview = page_state.get("main_text", "")[:500] # Preview main text
6125 | user_prompt = textwrap.dedent(f"""
6126 | Current Page State:
6127 | URL: {page_state.get("url", "[No URL]")}
6128 | Title: {page_state.get("title", "[No Title]")}
6129 | Main Text (Preview): {main_text_preview}...
6130 | Elements:
6131 | {elements_str}
6132 |
6133 | User Task: "{task}"
6134 |
6135 | Generate the JSON list of steps to accomplish this task. Respond ONLY with the JSON list.
6136 | """).strip()
6137 |
6138 | # Prepare messages and call LLM
6139 | messages = [
6140 | {"role": "system", "content": system_prompt},
6141 | {"role": "user", "content": user_prompt},
6142 | ]
6143 | result = await _call_llm(
6144 | messages,
6145 | model=model,
6146 | expect_json=True,
6147 | temperature=0.0,
6148 | max_tokens=2048, # Allow reasonable size for plan
6149 | )
6150 |
6151 | # Process and validate the LLM response (Revised to handle single dict)
6152 | plan_list: Optional[List[Dict[str, Any]]] = None
6153 | if isinstance(result, list):
6154 | plan_list = result
6155 | elif isinstance(result, dict) and "error" in result:
6156 | # Handle errors reported by the LLM call itself
6157 | error_detail = result.get("raw_response", result["error"])
6158 | raise ToolError(f"Macro planner LLM call failed: {result['error']}", details=error_detail)
6159 | elif isinstance(result, dict):
6160 | # --- Handling case where LLM returns a single step dict ---
6161 | if "action" in result: # Check if it looks like a valid step
6162 | logger.warning(
6163 | "LLM returned a single step dictionary instead of a list for macro plan. Wrapping it in a list."
6164 | )
6165 | plan_list = [result]
6166 | elif "steps" in result and isinstance(result["steps"], list):
6167 | # Handle cases where LLM wraps the list in a "steps" key (existing logic)
6168 | logger.warning("LLM wrapped macro plan in 'steps' key. Extracting list.")
6169 | plan_list = result["steps"]
6170 | else:
6171 | # It's a dict, but doesn't look like a step or contain 'steps'
6172 | response_type = type(result).__name__
6173 | response_preview = str(result)[:500]
6174 | raise ToolError(
6175 | f"Macro planner returned unexpected dictionary format: {response_type}. Preview: '{response_preview}...'",
6176 | details={"raw_response": response_preview},
6177 | )
6178 | else:
6179 | # Handle other unexpected response formats
6180 | response_type = type(result).__name__
6181 | response_preview = str(result)[:500]
6182 | raise ToolError(
6183 | f"Macro planner returned unexpected format: {response_type}. Expected list or dict. Preview: '{response_preview}...'",
6184 | details={"raw_response": response_preview},
6185 | )
6186 |
6187 | # Validate individual steps in the plan
6188 | validated_plan = []
6189 | if plan_list is not None: # Check if we have a list to validate (could be empty list)
6190 | for i, step in enumerate(plan_list):
6191 | if not isinstance(step, dict) or "action" not in step:
6192 | # Log raw response preview on validation error
6193 | logger.warning(
6194 | f"Macro plan step {i + 1} invalid format (not dict or missing 'action'): {step}. RAW LLM RESPONSE PREVIEW: {str(result)[:500]}"
6195 | )
6196 | continue # Skip invalid step format
6197 |
6198 | action = step.get("action")
6199 | if action not in ALLOWED_ACTIONS:
6200 | logger.warning(f"Macro plan step {i + 1} has invalid action '{action}': {step}")
6201 | continue # Skip step with unknown action
6202 |
6203 | # --- Basic argument checks ---
6204 | error_flag = False
6205 | if action in ("click", "download") and not step.get("task_hint"):
6206 | logger.warning(f"Macro plan step {i + 1} '{action}' missing 'task_hint': {step}")
6207 | error_flag = True
6208 | if action == "type":
6209 | if not step.get("task_hint"):
6210 | logger.warning(f"Macro plan step {i + 1} 'type' missing 'task_hint': {step}")
6211 | error_flag = True
6212 | if step.get("text") is None: # Allow empty string, but not None
6213 | logger.warning(f"Macro plan step {i + 1} 'type' missing 'text': {step}")
6214 | error_flag = True
6215 | if action == "wait" and step.get("ms") is None:
6216 | logger.warning(f"Macro plan step {i + 1} 'wait' missing 'ms': {step}")
6217 | error_flag = True
6218 | if action == "extract" and not step.get("selector"):
6219 | logger.warning(f"Macro plan step {i + 1} 'extract' missing 'selector': {step}")
6220 | error_flag = True
6221 | if action == "scroll" and step.get("direction") not in ("up", "down", "top", "bottom"):
6222 | logger.warning(
6223 | f"Macro plan step {i + 1} 'scroll' has invalid or missing 'direction': {step}"
6224 | )
6225 | error_flag = True
6226 | # Add more specific checks as needed...
6227 |
6228 | if not error_flag:
6229 | validated_plan.append(step) # Add valid step to the final plan
6230 | else:
6231 | logger.warning(
6232 | f"Skipping invalid macro step {i + 1} due to missing/invalid arguments."
6233 | )
6234 |
6235 | # --- Final check and logging/error based on validation outcome ---
6236 | if not validated_plan: # If plan is empty after validation
6237 | response_preview = str(result)[:500] if result else "None"
6238 | # Distinguish between LLM intentionally returning [] and validation failing all steps
6239 | if plan_list is not None and len(plan_list) > 0:
6240 | # LLM returned steps, but all were invalid
6241 | raise ToolError(
6242 | "Macro planner generated plan, but all steps were invalid.",
6243 | details={"raw_response": response_preview, "original_plan_length": len(plan_list)},
6244 | )
6245 | elif plan_list is None:
6246 | # This case should ideally be caught earlier by the type checking
6247 | raise ToolError(
6248 | "Macro planner failed to generate a valid list or dictionary of steps.",
6249 | details={"raw_response": response_preview},
6250 | )
6251 | else: # LLM returned [], which is valid
6252 | logger.info(
6253 | "Macro planner returned an empty list, indicating task completion or impossibility."
6254 | )
6255 | # Return the empty list in this case
6256 | return []
6257 |
6258 | logger.debug(f"Validated macro plan has {len(validated_plan)} steps.")
6259 | return validated_plan
6260 |
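# Illustrative only: the kind of plan _plan_macro is expected to return for a
# simple login task. Element hints and values here are hypothetical.
_EXAMPLE_MACRO_PLAN: List[Dict[str, Any]] = [
    {"action": "type", "task_hint": "Username or email field", "text": "user@example.com"},
    {"action": "type", "task_hint": "Password field", "text": "hunter2", "enter": True},
    {"action": "click", "task_hint": "Sign in button"},
    {"action": "finish"},
]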
6261 |
6262 | _AVAILABLE_TOOLS = {
6263 | # Tool Name: (Standalone Function Name, {Arg Name: Arg Type Hint})
6264 | "search_web": (
6265 | "search",
6266 | {
6267 | "query": "str",
6268 | "engine": "Optional[str: bing|duckduckgo|yandex]",
6269 | "max_results": "Optional[int]",
6270 | },
6271 | ),
6272 | "browse_page": (
6273 | "browse",
6274 | {
6275 | "url": "str",
6276 | "wait_for_selector": "Optional[str]",
6277 | "wait_for_navigation": "Optional[bool]",
6278 | },
6279 | ), # Updated browse args
6280 | "click_element": (
6281 | "click",
6282 | {
6283 | "url": "str",
6284 | "task_hint": "Optional[str]",
6285 | "target": "Optional[dict]",
6286 | "wait_ms": "Optional[int]",
6287 | },
6288 | ), # Updated click args
6289 | "type_into_fields": (
6290 | "type_text",
6291 | {
6292 | "url": "str",
6293 | "fields": "List[dict{'task_hint':str,'text':str,'enter':bool?,'clear_before':bool?}]",
6294 | "submit_hint": "Optional[str]",
6295 | "submit_target": "Optional[dict]",
6296 | "wait_after_submit_ms": "Optional[int]",
6297 | },
6298 | ), # Updated type_text args
6299 | "download_file_via_click": (
6300 | "download",
6301 | {
6302 | "url": "str",
6303 | "task_hint": "Optional[str]",
6304 | "target": "Optional[dict]",
6305 | "dest_dir": "Optional[str]",
6306 | },
6307 | ), # Updated download args
6308 | "run_page_macro": (
6309 | "run_macro",
6310 | {
6311 | "url": "str",
6312 | "task": "str",
6313 | "model": "Optional[str]",
6314 | "max_rounds": "Optional[int]",
6315 | "timeout_seconds": "Optional[int]",
6316 | },
6317 | ), # Updated run_macro args
6318 | "download_all_pdfs_from_site": (
6319 | "download_site_pdfs",
6320 | {
6321 | "start_url": "str",
6322 | "dest_subfolder": "Optional[str]",
6323 | "include_regex": "Optional[str]",
6324 | "max_depth": "Optional[int]",
6325 | "max_pdfs": "Optional[int]",
6326 | "rate_limit_rps": "Optional[float]",
6327 | },
6328 | ), # Updated download_site_pdfs args
6329 | "collect_project_documentation": (
6330 | "collect_documentation",
6331 | {"package": "str", "max_pages": "Optional[int]", "rate_limit_rps": "Optional[float]"},
6332 | ), # Updated collect_documentation args
6333 | "process_urls_in_parallel": (
6334 | "parallel",
6335 | {"urls": "List[str]", "action": "str('get_state')", "max_tabs": "Optional[int]"},
6336 | ), # Updated parallel args
6337 | "get_filesystem_status": ("filesystem_status", {}), # Example Filesystem tool
6338 | "read_file": ("read_file", {"path": "str"}), # Example Filesystem tool
6339 | "write_file": (
6340 | "write_file",
6341 | {"path": "str", "content": "Union[str, bytes]", "append": "Optional[bool]"},
6342 | ), # Example Filesystem tool
6343 | }
6344 |
6345 | _PLANNER_SYS = textwrap.dedent("""
6346 | You are an AI assistant acting as the central planner for a web automation and information retrieval system.
6347 | Your goal is to achieve the user's complex task by selecting the appropriate tool and providing the correct arguments for each step.
6348 | You will be given the user's overall task and a summary of results from previous steps (if any).
6349 | You have access to a set of tools, described below with their names and argument schemas (use JSON format for args).
6350 | Select ONE tool to execute next that will make progress towards the user's goal.
6351 | Carefully consider the user's task and the previous results to choose the best tool and arguments.
6352 | If a previous step failed, analyze the error and decide whether to retry, try a different approach, or ask for clarification (if interaction allowed). For now, focus on selecting the next best tool.
6353 | If the task requires information from the web, use `search_web` first unless a specific URL is provided or implied.
6354 | If the task involves interacting with a specific webpage (clicking, typing, downloading), use the appropriate browser tool (`browse_page`, `click_element`, `type_into_fields`, `download_file_via_click`, `run_page_macro`). Use the URL from previous steps if available.
6355 | For filesystem operations, use the filesystem tools like `read_file`, `write_file`.
6356 | Use `run_page_macro` for multi-step interactions on a single page described in natural language.
6357 | Use `collect_project_documentation` or `download_all_pdfs_from_site` for specialized crawling tasks.
6358 | Use `process_urls_in_parallel` only when needing the *same* simple action (like getting state) on *multiple* distinct URLs.
6359 |
6360 | Respond ONLY with a JSON list containing a single step object. The object must have:
6361 | - "tool": The name of the selected tool (string).
6362 | - "args": A JSON object containing the arguments for the tool (matching the schema).
6363 |
6364 | Example Response:
6365 | ```json
6366 | [
6367 | {
6368 | "tool": "search_web",
6369 | "args": {
6370 | "query": "latest news on AI regulation",
6371 | "engine": "duckduckgo"
6372 | }
6373 | }
6374 | ]
6375 | ```
6376 | If you determine the task is complete based on the prior results, respond with an empty JSON list `[]`.
6377 | """).strip()
6378 |
6379 |
6380 | async def _plan_autopilot(
6381 | task: str, prior_results: Optional[List[Dict]] = None
6382 | ) -> List[Dict[str, Any]]: # Uses global _AVAILABLE_TOOLS, _PLANNER_SYS, _call_llm
6383 | """Generates the next step (tool call) for the Autopilot based on task and history."""
6384 | # Describe available tools for the LLM prompt
6385 | tools_desc = {}
6386 | for name, data in _AVAILABLE_TOOLS.items():
6387 | func_name, schema = data
6388 | tools_desc[name] = schema
6389 |
6390 | # Summarize prior results concisely
6391 | prior_summary = "None"
6392 | if prior_results:
6393 | summaries = []
6394 | # Summarize last 3 steps for context, or fewer if less than 3 executed
6395 | start_index = max(0, len(prior_results) - 3)
6396 | for i, res in enumerate(prior_results[start_index:], start=start_index + 1):
6397 | tool_used = res.get("tool", "?")
6398 | was_success = res.get("success", False)
6399 | outcome_marker = "[OK]" if was_success else "[FAIL]"
6400 | # Get result summary or error message - prefer 'message' if present, else result/error
6401 | result_data = res.get("message", res.get("result", res.get("error", "")))
6402 | # Handle dict results slightly better
6403 | if isinstance(result_data, dict):
6404 | # Extract key info or just summarize keys
6405 | dict_preview = str(list(result_data.keys()))
6406 | details_str = f"Dict{dict_preview[:130]}" + (
6407 | "..." if len(dict_preview) > 130 else ""
6408 | )
6409 | else:
6410 | details_str = str(result_data)[:150] + (
6411 | "..." if len(str(result_data)) > 150 else ""
6412 | ) # Truncate long results/errors
6413 |
6414 | summary_line = f"Step {i}: Ran {tool_used} -> {outcome_marker} ({details_str})"
6415 | summaries.append(summary_line)
6416 | prior_summary = "\n".join(summaries)
6417 |
6418 | # Construct the user prompt
6419 | tools_json_str = json.dumps(tools_desc, indent=2)
6420 | # Use the same _PLANNER_SYS prompt, as it requests a list with one step
6421 | user_prompt = (
6422 | f"AVAILABLE TOOLS (Schema):\n{tools_json_str}\n\n"
6423 | f"PRIOR RESULTS SUMMARY (Last {len(summaries) if prior_results else 0} steps):\n{prior_summary}\n\n"
6424 | f"USER TASK:\n{task}\n\n"
6425 | "Select the single best tool and arguments for the *next* step to achieve the user task. "
6426 | "Respond ONLY with a JSON list containing exactly one step object (tool, args), or an empty list [] if the task is complete or cannot proceed."
6427 | )
6428 |
6429 | # Prepare messages and call the LLM planner
6430 | messages = [
6431 | {"role": "system", "content": _PLANNER_SYS}, # Use the standardized system prompt
6432 | {"role": "user", "content": user_prompt},
6433 | ]
6434 | response = await _call_llm(
6435 | messages,
6436 | expect_json=True,
6437 | temperature=0.0,
6438 | max_tokens=2048,
6439 | )
6440 |
6441 | # --- Process and validate the LLM response (Revised) ---
6442 | if isinstance(response, dict) and "error" in response:
6443 | raise ToolError(f"Autopilot planner LLM call failed: {response['error']}")
6444 |
6445 | current_plan_list: List[Dict[str, Any]] = [] # Initialize as empty list
6446 |
6447 | if isinstance(response, list):
6448 | current_plan_list = response # LLM returned the expected list
6449 | elif isinstance(response, dict):
6450 | # --- Handling case where LLM returns a single step dict ---
6451 | if "tool" in response and "args" in response: # Check if it looks like a valid step
6452 | logger.warning(
6453 | "Autopilot planner returned a single step dictionary instead of a list. Wrapping it."
6454 | )
6455 | current_plan_list = [response]
6456 | else:
6457 | # It's a dict, but doesn't look like a valid step
6458 | response_type = type(response).__name__
6459 | raise ToolError(
6460 | f"Autopilot planner returned unexpected dictionary format: {response_type}. Expected a JSON list or a valid step dict."
6461 | )
6462 | else:
6463 | # Handle other unexpected response formats
6464 | response_type = type(response).__name__
6465 | raise ToolError(
6466 | f"Autopilot planner returned unexpected format: {response_type}. Expected a JSON list."
6467 | )
6468 |
6469 | # --- Validate the structure and content of the step(s) ---
6470 | validated_plan: List[Dict[str, Any]] = []
6471 | if len(current_plan_list) > 1:
6472 | logger.warning(
6473 | f"Autopilot planner returned multiple steps ({len(current_plan_list)}). Only using the first one."
6474 | )
6475 | elif len(current_plan_list) == 0:
6476 | logger.info(
6477 | "Autopilot planner returned an empty list, indicating task completion or inability to proceed."
6478 | )
6479 | return [] # Return empty list as intended
6480 |
6481 | # Process the first (and only expected) step
6482 | if len(current_plan_list) >= 1:
6483 | step = current_plan_list[0]
6484 | if not isinstance(step, dict):
6485 | logger.warning(f"Autopilot planner step is not a dictionary: {step}")
6486 | return [] # Return empty plan if format is wrong
6487 |
6488 | tool_name = step.get("tool")
6489 | tool_args = step.get("args")
6490 |
6491 | if not tool_name or not isinstance(tool_args, dict):
6492 | logger.warning(
6493 | f"Autopilot planner step missing 'tool' or 'args' (must be dict): {step}"
6494 | )
6495 | return [] # Return empty plan if structure is wrong
6496 |
6497 | if tool_name not in _AVAILABLE_TOOLS:
6498 | logger.warning(f"Autopilot planner selected unknown tool '{tool_name}': {step}")
6499 | return [] # Return empty plan if tool is unknown
6500 |
6501 | # Optional: Add deeper validation of args based on _AVAILABLE_TOOLS schema if needed
6502 |
6503 | # If validation passes, add the single step to the plan
6504 | validated_plan.append(step)
6505 |
6506 | # Return the validated plan (containing 0 or 1 step)
6507 | return validated_plan
6508 |
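# Illustrative only: the plan/execute/re-plan loop that _plan_autopilot is meant
# to drive. Tool dispatch is stubbed out here; a real caller would invoke the
# standalone function registered for step["tool"] with step["args"].
async def _example_autopilot_loop(task: str, max_rounds: int = 5) -> List[Dict[str, Any]]:
    history: List[Dict[str, Any]] = []
    for _ in range(max_rounds):
        plan = await _plan_autopilot(task, history)
        if not plan:
            break  # planner returned [], i.e. task complete or cannot proceed
        step = plan[0]
        # ... dispatch step["tool"] with step["args"] here ...
        history.append({"tool": step["tool"], "args": step["args"], "success": True, "result": "stub"})
    return history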
6509 |
6510 | # --- Step Runner (for Macro) ---
6511 | async def run_steps(
6512 | page: Page, steps: Sequence[Dict[str, Any]]
6513 | ) -> List[Dict[str, Any]]: # Uses global smart_click, smart_type, smart_download
6514 | """Executes a sequence of predefined macro steps on a given page."""
6515 | results: List[Dict[str, Any]] = [] # Stores results of each step
6516 |
6517 | for i, step in enumerate(steps):
6518 | action = step.get("action")
6519 | step_result = step.copy() # Start with original step data
6520 | step_result["success"] = False # Default to failure
6521 | start_time = time.monotonic()
6522 | step_num = i + 1
6523 | should_break = False # Initialize break flag for this step
6524 |
6525 | if not action:
6526 | step_result["error"] = f"Step {step_num}: Missing 'action' key."
6527 | logger.warning(step_result["error"])
6528 | results.append(step_result)
6529 | continue # Skip to next step
6530 |
6531 | try:
6532 | logger.debug(
6533 | f"Executing Macro Step {step_num}: Action='{action}', Args={ {k: v for k, v in step.items() if k != 'action'} }"
6534 | )
6535 | # --- Execute Action ---
6536 | if action == "click":
6537 | hint = step.get("task_hint")
6538 | target_fallback = step.get("target") # Optional fallback args
6539 | if not hint:
6540 | raise ToolInputError(
6541 | f"Step {step_num} ('click'): Missing required argument 'task_hint'."
6542 | )
6543 |
6544 | # Check for and handle common obstacles like reCAPTCHA
6545 | if "recaptcha" in hint.lower() or "captcha" in hint.lower():
6546 | # Try to detect CAPTCHA presence first
6547 | captcha_js = """() => {
6548 | return document.body.innerText.toLowerCase().includes('captcha') ||
6549 | document.querySelector('iframe[title*=captcha]') !== null ||
6550 | document.querySelector('[id*=captcha]') !== null ||
6551 | document.querySelector('[class*=captcha]') !== null ||
6552 | document.querySelector('div[class*="recaptcha"]') !== null;
6553 | }"""
6554 | captcha_detected = await page.evaluate(captcha_js)
6555 | if captcha_detected:
6556 | logger.warning(f"Step {step_num}: CAPTCHA detected but cannot be automatically solved. Marking as failed.")
6557 | step_result["error"] = "CAPTCHA detected - requires manual intervention"
6558 | step_result["success"] = False
6559 | # Continue to finally block without raising exception
6560 | else:
6561 | # Use the smart_click helper
6562 | click_success = await smart_click(
6563 | page, task_hint=hint, target_kwargs=target_fallback
6564 | )
6565 | step_result["success"] = click_success
6566 | else:
6567 | # Use the smart_click helper
6568 | click_success = await smart_click(
6569 | page, task_hint=hint, target_kwargs=target_fallback
6570 | )
6571 | step_result["success"] = click_success # Should be True if no exception
6572 |
6573 | elif action == "type":
6574 | hint = step.get("task_hint")
6575 | target_fallback = step.get("target")
6576 | text = step.get("text")
6577 | if not hint:
6578 | raise ToolInputError(
6579 | f"Step {step_num} ('type'): Missing required argument 'task_hint'."
6580 | )
6581 | if text is None: # Allow empty string, but not None
6582 | raise ToolInputError(
6583 | f"Step {step_num} ('type'): Missing required argument 'text'."
6584 | )
6585 | # Get optional arguments
6586 | press_enter = step.get("enter", False) # Default False
6587 | clear_before = step.get("clear_before", True) # Default True
6588 | # Use the smart_type helper
6589 | type_success = await smart_type(
6590 | page,
6591 | task_hint=hint,
6592 | text=text,
6593 | press_enter=press_enter,
6594 | clear_before=clear_before,
6595 | target_kwargs=target_fallback,
6596 | timeout_ms=5000,
6597 | )
6598 | step_result["success"] = type_success
6599 |
6600 | elif action == "wait":
6601 | ms = step.get("ms")
6602 | if ms is None:
6603 | raise ToolInputError(
6604 | f"Step {step_num} ('wait'): Missing required argument 'ms'."
6605 | )
6606 | try:
6607 | wait_ms = int(ms)
6608 | if wait_ms < 0:
6609 | raise ValueError("Wait time must be non-negative")
6610 | await page.wait_for_timeout(wait_ms)
6611 | step_result["success"] = True
6612 | except (ValueError, TypeError) as e:
6613 | raise ToolInputError(
6614 | f"Step {step_num} ('wait'): Invalid 'ms' value '{ms}'. {e}"
6615 | ) from e
6616 |
6617 | elif action == "download":
6618 | hint = step.get("task_hint")
6619 | target_fallback = step.get("target")
6620 | if not hint:
6621 | raise ToolInputError(
6622 | f"Step {step_num} ('download'): Missing required argument 'task_hint'."
6623 | )
6624 | dest_dir = step.get("dest") # Optional destination directory
6625 | # Use the smart_download helper
6626 | download_outcome = await smart_download(
6627 | page, task_hint=hint, dest_dir=dest_dir, target_kwargs=target_fallback
6628 | )
6629 | step_result["result"] = download_outcome # Store full download result
6630 | # Success is determined by the helper's output
6631 | step_result["success"] = download_outcome.get("success", False)
6632 |
6633 | elif action == "extract":
6634 | selector = step.get("selector")
6635 | if not selector:
6636 | raise ToolInputError(
6637 | f"Step {step_num} ('extract'): Missing required argument 'selector'."
6638 | )
6639 | # Use Playwright's evaluate_all to get text from matching elements
6640 | js_func = "(elements => elements.map(el => el.innerText || el.textContent || ''))"
6641 | extracted_texts_raw = await page.locator(selector).evaluate_all(js_func)
6642 | # Clean up results: filter empty strings and strip whitespace
6643 | extracted_texts_clean = []
6644 | for t in extracted_texts_raw:
6645 | stripped_t = t.strip()
6646 | if stripped_t:
6647 | extracted_texts_clean.append(stripped_t)
6648 | step_result["result"] = extracted_texts_clean
6649 | step_result["success"] = True # Extraction itself succeeds if selector is valid
6650 |
6651 | elif action == "scroll":
6652 | direction = step.get("direction")
6653 | amount = step.get("amount_px")
6654 | if not direction or direction not in ["up", "down", "top", "bottom"]:
6655 | error_msg = f"Step {step_num} ('scroll'): Invalid or missing scroll direction: '{direction}'. Must be 'up', 'down', 'top', or 'bottom'."
6656 | step_result["error"] = error_msg
6657 | step_result["success"] = False
6658 | logger.warning(error_msg)
6659 | # Continue to finally block without raising, as scroll failure might not be critical
6660 | else:
6661 | if direction == "top":
6662 | js_scroll = "() => window.scrollTo(0, 0)"
6663 | await page.evaluate(js_scroll)
6664 | elif direction == "bottom":
6665 | js_scroll = "() => window.scrollTo(0, document.body.scrollHeight)"
6666 | await page.evaluate(js_scroll)
6667 | elif direction == "up":
6668 | scroll_amount = int(amount or 500) # Default 500px
6669 | js_scroll = "(px) => window.scrollBy(0, -px)"
6670 | await page.evaluate(js_scroll, scroll_amount)
6671 | elif direction == "down":
6672 | scroll_amount = int(amount or 500) # Default 500px
6673 | js_scroll = "(px) => window.scrollBy(0, px)"
6674 | await page.evaluate(js_scroll, scroll_amount)
6675 | step_result["success"] = True
6676 |
6677 | elif action == "finish":
6678 | logger.info(f"Macro execution Step {step_num}: Reached 'finish' action.")
6679 | step_result["success"] = True
6680 | # No Playwright action needed
6681 |
6682 | else:
6683 | # Should not happen if _plan_macro validates actions, but safety check
6684 | raise ValueError(
6685 | f"Step {step_num}: Unknown action '{action}' encountered during execution."
6686 | )
6687 |
6688 | # Record duration on success or handled failure (like scroll direction)
6689 | duration_ms = int((time.monotonic() - start_time) * 1000)
6690 | step_result["duration_ms"] = duration_ms
6691 |
6692 | except (
6693 | PlaywrightTimeoutError,
6694 | ToolError,
6695 | ToolInputError,
6696 | ValueError,
6697 | AssertionError,
6698 | Exception,
6699 | ) as e:
6700 | # Catch errors during action execution
6701 | err_type = type(e).__name__
6702 | error_msg = f"{err_type} during action '{action}': {e}"
6703 | step_result["error"] = error_msg
6704 | step_result["success"] = False # Ensure success is false on error
6705 | logger.warning(f"Macro Step {step_num} ('{action}') failed: {error_msg}")
6706 |
6707 | # Special handling for CAPTCHA-related errors
6708 | if "captcha" in str(e).lower() or "recaptcha" in str(e).lower():
6709 | logger.warning(f"Step {step_num}: CAPTCHA-related error detected. Suggesting manual intervention.")
6710 | step_result["error"] = f"CAPTCHA-related error: {error_msg}. Manual intervention may be required."
6711 |
6712 | # Record duration even on failure
6713 | duration_ms = int((time.monotonic() - start_time) * 1000)
6714 | step_result["duration_ms"] = duration_ms
6715 |
6716 | finally:
6717 | # Always log the step result and append to the list
6718 | log_details = step_result.copy() # Create copy for logging
6719 | # Avoid logging potentially large results directly
6720 | if "result" in log_details:
6721 | log_details["result_summary"] = str(log_details["result"])[:200] + "..."
6722 | del log_details["result"]
6723 | await _log("macro_step_result", **log_details)
6724 | results.append(step_result)
6725 |
6726 | # If a 'finish' action succeeded, stop executing further steps
6727 | if action == "finish" and step_result.get("success", False):
6728 | logger.info(
6729 | f"Stopping macro execution after successful 'finish' action at step {step_num}."
6730 | )
6731 | should_break = True
6732 |
6733 |         # Check the break flag set by a successful 'finish' action
6734 | if should_break:
6735 | logger.info(f"Breaking execution loop after step {step_num}")
6736 | break
6737 |
6738 | return results # Return list of results for all executed steps
6739 |
6740 |
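# Illustrative sketch (not part of the original tool surface): the step-dict format that
# run_steps() above consumes, inferred from the scroll/finish handlers. Field names other
# than "action", "direction", and "amount_px" are assumptions, not guarantees.
async def _example_scroll_macro(page):
    example_plan = [
        {"action": "scroll", "direction": "down", "amount_px": 800},  # scroll down 800px
        {"action": "scroll", "direction": "top"},  # jump back to the top of the page
        {"action": "finish"},  # stop the step loop
    ]
    # Each result dict records "success", "duration_ms" and, on failure, "error".
    return await run_steps(page, example_plan)
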
6741 | # --- Universal Search ---
6742 | _ua_rotation_count = 0
6743 | _user_agent_pools = {  # Per-engine pools of realistic User-Agent strings, rotated periodically in search_web
6744 | "bing": deque(
6745 | [
6746 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.51",
6747 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15",
6748 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
6749 | ]
6750 | ),
6751 | "duckduckgo": deque(
6752 | [
6753 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
6754 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
6755 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
6756 | ]
6757 | ),
6758 | "yandex": deque(
6759 | [ # Yandex might be more sensitive, use diverse UAs
6760 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 YaBrowser/23.5.2.625 Yowser/2.5 Safari/537.36",
6761 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 YaBrowser/23.5.2.625 Yowser/2.5 Safari/537.36",
6762 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 YaBrowser/23.5.2.625 Yowser/2.5 Safari/537.36",
6763 | ]
6764 | ),
6765 | }
6766 |
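# Illustrative note (not part of the original module): search_web() below rotates each pool
# every 20th query by popping the first UA and appending it to the back; deque.rotate(-1)
# is an equivalent formulation, sketched here purely for reference.
def _example_rotate_ua_pool(pool):
    pool.rotate(-1)  # same effect as pool.append(pool.popleft())
    return pool[0]  # the UA that search_web() would pick next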
6767 |
6768 | @resilient(max_attempts=2, backoff=1.0)
6769 | async def search_web(
6770 | query: str, engine: str = "bing", max_results: int = 10
6771 | ) -> List[Dict[str, str]]: # Uses global _log, _ua_rotation_count, _user_agent_pools
6772 | """Performs a web search using a specified engine via browser automation."""
6773 | global _ua_rotation_count
6774 | engine_lower = engine.lower()
6775 | if engine_lower not in ("bing", "duckduckgo", "yandex"):
6776 | raise ToolInputError(
6777 | f"Invalid search engine specified: '{engine}'. Use 'bing', 'duckduckgo', or 'yandex'."
6778 | )
6779 |
6780 | # Sanitize query (basic removal of non-alphanumeric/space/hyphen/dot)
6781 | safe_query_chars = re.sub(r"[^\w\s\-\.]", "", query)
6782 | safe_query = safe_query_chars.strip()
6783 | if not safe_query:
6784 | raise ToolInputError("Search query cannot be empty or contain only invalid characters.")
6785 |
6786 | # URL encode the safe query
6787 | qs = urllib.parse.quote_plus(safe_query)
6788 | nonce = random.randint(1000, 9999) # Simple nonce/cache buster
6789 |
6790 | # Search URLs and CSS Selectors per engine
6791 | search_details = {
6792 | "bing": {
6793 | "url": f"https://www.bing.com/search?q={qs}&form=QBLH&nc={nonce}",
6794 | "selectors": {
6795 |                 "item": "li.b_algo",  # Main result container
6796 |                 "title": "h2 > a",  # Link inside the H2 provides the title text
6797 |                 "link": "h2 > a",  # Same element provides the href
6798 |                 "snippet": "div.b_caption p, .TextContainer.OrganicText",  # Standard captions and organic text containers
6799 |                 "snippet_alt": ".b_caption",  # General caption container as fallback
6800 | },
6801 | },
6802 | "duckduckgo": {
6803 | "url": f"https://html.duckduckgo.com/html/?q={qs}&nc={nonce}",
6804 | "selectors": {
6805 |                 # Main result item container (more specific than a generic result div)
6806 |                 "item": "div.web-result",
6807 |                 # Title link inside the result heading
6808 |                 "title": "h2.result__title > a.result__a",
6809 |                 "link": "h2.result__title > a.result__a",
6810 |                 # Primary snippet selector
6811 |                 "snippet": "a.result__snippet",
6812 |                 # Fallback in case the snippet is not rendered as a link
6813 |                 "snippet_alt": "div.result__snippet",
6814 | },
6815 | },
6816 | "yandex": {
6817 | # Yandex search results structure
6818 |             "url": f"https://yandex.com/search/?text={qs}&nc={nonce}&lr=202",  # lr parameter keeps the result region consistent
6819 | "selectors": {
6820 |                 "item": "li.serp-item",  # Main result container
6821 |                 "title": "a.OrganicTitle-Link",  # Main organic link provides the title text
6822 |                 "link": "a.OrganicTitle-Link",  # Same link provides the href attribute
6823 |                 "snippet": ".TextContainer.OrganicText",  # Specific snippet container
6824 |                 "snippet_alt": ".Organic-ContentWrapper",  # Parent container as fallback
6825 | },
6826 | },
6827 | }
6828 |
6829 | engine_info = search_details[engine_lower]
6830 | search_url = engine_info["url"]
6831 | sel = engine_info["selectors"]
6832 |
6833 | # Rotate User Agent
6834 | _ua_rotation_count += 1
6835 | ua_pool = _user_agent_pools[engine_lower]
6836 | if _ua_rotation_count % 20 == 0 and len(ua_pool) > 1:
6837 | # Rotate deque periodically
6838 | first_ua = ua_pool.popleft()
6839 | ua_pool.append(first_ua)
6840 | ua = ua_pool[0] # Use the current first UA
6841 |
6842 | # Get incognito context with specific UA
6843 | context_args = {"user_agent": ua, "locale": "en-US"} # Set UA and locale
6844 | ctx, _ = await get_browser_context(use_incognito=True, context_args=context_args)
6845 | page = None # Initialize page variable
6846 |
6847 | try:
6848 | page = await ctx.new_page()
6849 | await _log("search_start", engine=engine_lower, query=query, url=search_url, ua=ua)
6850 | # Navigate to search URL
6851 | nav_timeout = 30000 # 30 seconds
6852 | await page.goto(search_url, wait_until="domcontentloaded", timeout=nav_timeout)
6853 |
6854 | # Handle DuckDuckGo HTML meta refresh if present
6855 | if engine_lower == "duckduckgo":
6856 | try:
6857 | meta_refresh_selector = 'meta[http-equiv="refresh"]'
6858 | meta_refresh = await page.query_selector(meta_refresh_selector)
6859 | if meta_refresh:
6860 | content_attr = await meta_refresh.get_attribute("content")
6861 | if content_attr and "url=" in content_attr.lower():
6862 | # Extract redirect URL
6863 | match = re.search(r'url=([^"]+)', content_attr, re.IGNORECASE)
6864 | if match:
6865 | redirect_url_raw = match.group(1)
6866 | # Basic clean up of URL just in case
6867 | redirect_url = redirect_url_raw.strip("'\" ")
6868 | logger.info(
6869 | f"Following meta refresh redirect on DDG HTML: {redirect_url}"
6870 | )
6871 | await page.goto(
6872 | redirect_url, wait_until="domcontentloaded", timeout=20000
6873 | )
6874 | await asyncio.sleep(0.5) # Brief pause after redirect
6875 | except PlaywrightException as e:
6876 | logger.warning(f"Error checking/following meta refresh on DDG HTML: {e}")
6877 |
6878 | # Wait for results container to be visible
6879 | wait_selector_timeout = 10000 # 10 seconds
6880 | try:
6881 | await page.wait_for_selector(
6882 | sel["item"], state="visible", timeout=wait_selector_timeout
6883 | )
6884 | except PlaywrightTimeoutError as e:
6885 | # Check for CAPTCHA before assuming no results
6886 | captcha_js = "() => document.body.innerText.toLowerCase().includes('captcha') || document.querySelector('iframe[title*=captcha]') || document.querySelector('[id*=captcha]')"
6887 | captcha_found = await page.evaluate(captcha_js)
6888 | if captcha_found:
6889 | await _log("search_captcha", engine=engine_lower, query=query)
6890 | raise ToolError(
6891 | f"CAPTCHA detected on {engine_lower} search.", error_code="captcha_detected"
6892 | ) from e
6893 | else:
6894 | # No results selector found, and no obvious CAPTCHA
6895 | await _log(
6896 | "search_no_results_selector",
6897 | engine=engine_lower,
6898 | query=query,
6899 | selector=sel["item"],
6900 | )
6901 | return [] # Return empty list for no results
6902 |
6903 | # Brief pause and try to accept consent cookies (best effort)
6904 | await asyncio.sleep(random.uniform(0.5, 1.5))
6905 | consent_selectors = [
6906 | 'button:has-text("Accept")',
6907 | 'button:has-text("Agree")',
6908 | 'button[id*="consent"]',
6909 | 'button[class*="consent"]',
6910 | ]
6911 | for btn_sel in consent_selectors:
6912 | try:
6913 | consent_button = page.locator(btn_sel).first
6914 | await consent_button.click(timeout=1000) # Short timeout for consent click
6915 | logger.debug(f"Clicked potential consent button: {btn_sel}")
6916 | await asyncio.sleep(0.3) # Pause after click
6917 | break # Stop after first successful click
6918 | except PlaywrightException:
6919 | pass # Ignore if selector not found or click fails
6920 |
6921 | # Extract results using page.evaluate
6922 | extract_js = """
6923 | (args) => {
6924 | const results = [];
6925 | const items = document.querySelectorAll(args.sel.item);
6926 | for (let i = 0; i < Math.min(items.length, args.max_results); i++) {
6927 | const item = items[i];
6928 | const titleEl = item.querySelector(args.sel.title);
6929 | const linkEl = item.querySelector(args.sel.link);
6930 | let snippetEl = item.querySelector(args.sel.snippet);
6931 | // Use fallback snippet selector if primary not found
6932 | if (!snippetEl && args.sel.snippet_alt) {
6933 | snippetEl = item.querySelector(args.sel.snippet_alt);
6934 | }
6935 |
6936 | const title = titleEl ? titleEl.innerText.trim() : '';
6937 | let link = linkEl ? linkEl.href : '';
6938 | // Clean DDG HTML links
6939 | if (link && link.includes('uddg=')) {
6940 | try {
6941 | const urlParams = new URLSearchParams(link.split('?')[1]);
6942 | link = urlParams.get('uddg') || link;
6943 | } catch (e) { /* ignore URL parsing errors */ }
6944 | }
6945 | const snippet = snippetEl ? snippetEl.innerText.trim() : '';
6946 |
6947 | // Only add if essential parts (link and title or snippet) are present
6948 | if (link && (title || snippet)) {
6949 | results.push({ title, link, snippet });
6950 | }
6951 | }
6952 | return results;
6953 | }
6954 | """
6955 | eval_args = {"sel": sel, "max_results": max_results}
6956 | results = await page.evaluate(extract_js, eval_args)
6957 |
6958 | # Log completion and return results
6959 | num_results = len(results)
6960 | await _log("search_complete", engine=engine_lower, query=query, num_results=num_results)
6961 | return results
6962 |
6963 | except PlaywrightException as e:
6964 | # Handle Playwright errors during navigation or interaction
6965 | await _log("search_error_playwright", engine=engine_lower, query=query, error=str(e))
6966 | raise ToolError(f"Playwright error during {engine_lower} search for '{query}': {e}") from e
6967 | except Exception as e:
6968 | # Handle unexpected errors
6969 | await _log("search_error_unexpected", engine=engine_lower, query=query, error=str(e))
6970 | raise ToolError(f"Unexpected error during {engine_lower} search for '{query}': {e}") from e
6971 | finally:
6972 | # Ensure page and context are closed
6973 | if page and not page.is_closed():
6974 | await page.close()
6975 | if ctx:
6976 | await ctx.close()
6977 |
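# Illustrative sketch: calling the search_web() helper directly. The list-of-dicts return
# shape ({"title", "link", "snippet"}) follows from the extraction JS above; the query used
# here is only a placeholder.
async def _example_search_web():
    hits = await search_web("playwright python tutorial", engine="duckduckgo", max_results=5)
    for hit in hits:
        print(f"{hit['title']} -> {hit['link']}")
    return hits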
6978 |
6979 | # --- Initialization Function ---
6980 | async def _ensure_initialized(): # Uses MANY globals
6981 | """Main initialization sequence for standalone Smart Browser tools."""
6982 | global \
6983 | _is_initialized, \
6984 | _thread_pool, \
6985 | _locator_cache_cleanup_task_handle, \
6986 | _inactivity_monitor_task_handle
6987 | global _SB_INTERNAL_BASE_PATH_STR, _STATE_FILE, _LOG_FILE, _CACHE_DB, _READ_JS_CACHE
6988 | # Globals for config values
6989 | global _sb_state_key_b64_global, _sb_max_tabs_global, _sb_tab_timeout_global
6990 | global _sb_inactivity_timeout_global, _headless_mode_global, _vnc_enabled_global
6991 | global _vnc_password_global, _proxy_pool_str_global, _proxy_allowed_domains_str_global
6992 | global _vault_allowed_paths_str_global, _max_widgets_global, _max_section_chars_global
6993 | global _dom_fp_limit_global, _llm_model_locator_global, _retry_after_fail_global
6994 | global _seq_cutoff_global, _area_min_global, _high_risk_domains_set_global
6995 | global _cpu_count, _pw, _browser, _ctx
6996 | global _pid, _last_activity
6997 |
6998 | # Ensure _last_activity has a valid monotonic time ASAP if it's still at its module-load default.
6999 | # This is a defensive measure.
7000 | if _last_activity == 0.0:
7001 | _last_activity = time.monotonic()
7002 | logger.debug(
7003 | f"Defensively setting initial _last_activity in _ensure_initialized: {_last_activity}"
7004 | )
7005 |
7006 | # Quick check if already initialized
7007 | if _is_initialized:
7008 | return
7009 |
7010 | # Use lock to prevent concurrent initialization
7011 | async with _init_lock:
7012 | # Double-check after acquiring lock
7013 | if _is_initialized:
7014 | return
7015 | logger.info("Performing first-time async initialization of SmartBrowser tools...")
7016 |
7017 | # --- Step 1: Load Config into Globals ---
7018 | try:
7019 | config = get_config()
7020 | sb_config: SmartBrowserConfig = config.smart_browser # Access nested config
7021 |
7022 | # Assign config values to globals, using defaults if config value is None/missing
7023 | _sb_state_key_b64_global = sb_config.sb_state_key_b64 or _sb_state_key_b64_global
7024 | _sb_max_tabs_global = sb_config.sb_max_tabs or _sb_max_tabs_global
7025 | _sb_tab_timeout_global = sb_config.sb_tab_timeout or _sb_tab_timeout_global
7026 | _sb_inactivity_timeout_global = (
7027 | sb_config.sb_inactivity_timeout or _sb_inactivity_timeout_global
7028 | )
7029 | # Handle booleans carefully (check for None, not just falsiness)
7030 | if sb_config.headless_mode is not None:
7031 | _headless_mode_global = sb_config.headless_mode
7032 | if sb_config.vnc_enabled is not None:
7033 | _vnc_enabled_global = sb_config.vnc_enabled
7034 | _vnc_password_global = sb_config.vnc_password or _vnc_password_global
7035 | _proxy_pool_str_global = sb_config.proxy_pool_str or _proxy_pool_str_global
7036 | _proxy_allowed_domains_str_global = (
7037 | sb_config.proxy_allowed_domains_str or _proxy_allowed_domains_str_global
7038 | )
7039 | _vault_allowed_paths_str_global = (
7040 | sb_config.vault_allowed_paths_str or _vault_allowed_paths_str_global
7041 | )
7042 | _max_widgets_global = sb_config.max_widgets or _max_widgets_global
7043 | _max_section_chars_global = sb_config.max_section_chars or _max_section_chars_global
7044 | _dom_fp_limit_global = sb_config.dom_fp_limit or _dom_fp_limit_global
7045 | _llm_model_locator_global = sb_config.llm_model_locator or _llm_model_locator_global
7046 | if sb_config.retry_after_fail is not None:
7047 | _retry_after_fail_global = sb_config.retry_after_fail
7048 | if sb_config.seq_cutoff is not None:
7049 | _seq_cutoff_global = sb_config.seq_cutoff
7050 | _area_min_global = sb_config.area_min or _area_min_global
7051 | # Handle set carefully (assign if present in config)
7052 | if sb_config.high_risk_domains_set is not None:
7053 | _high_risk_domains_set_global = sb_config.high_risk_domains_set
7054 |
7055 | logger.info("Smart Browser configuration loaded into global variables.")
7056 | # Update derived settings from config strings
7057 | _update_proxy_settings()
7058 | _update_vault_paths()
7059 |
7060 | # --- Reconfigure thread pool based on loaded config ---
7061 | # Get current max_workers (handle potential attribute absence)
7062 | current_max_workers = getattr(
7063 | _thread_pool, "_max_workers", min(32, (_cpu_count or 1) * 2 + 4)
7064 | )
7065 | # Calculate desired based on *loaded* max tabs config
7066 | desired_max_workers = min(32, _sb_max_tabs_global * 2)
7067 | # Recreate pool only if worker count needs to change
7068 | if current_max_workers != desired_max_workers:
7069 | logger.info(
7070 | f"Reconfiguring thread pool max_workers from {current_max_workers} to {desired_max_workers} based on config."
7071 | )
7072 | _thread_pool.shutdown(wait=True) # Wait for existing tasks
7073 | _thread_pool = concurrent.futures.ThreadPoolExecutor(
7074 | max_workers=desired_max_workers, thread_name_prefix="sb_worker"
7075 | )
7076 |
7077 | except Exception as e:
7078 | logger.error(
7079 | f"Error loading Smart Browser config: {e}. Using default global values.",
7080 | exc_info=True,
7081 | )
7082 | # Ensure derived settings are updated even if config load fails
7083 | _update_proxy_settings()
7084 | _update_vault_paths()
7085 |
7086 | # --- Step 2: Prepare Internal Storage Directory ---
7087 | try:
7088 | # Define relative path for internal storage (within the main storage area)
7089 | internal_storage_relative_path = "storage/smart_browser_internal"
7090 | logger.info(
7091 | f"Ensuring internal storage directory exists: '{internal_storage_relative_path}' using filesystem tool."
7092 | )
7093 | # Use STANDALONE create_directory tool
7094 | create_dir_result = await create_directory(path=internal_storage_relative_path)
7095 | # Validate result
7096 | if not isinstance(create_dir_result, dict) or not create_dir_result.get("success"):
7097 | error_msg = (
7098 | create_dir_result.get("error", "Unknown")
7099 | if isinstance(create_dir_result, dict)
7100 | else "Invalid response"
7101 | )
7102 | raise ToolError(
7103 | f"Filesystem tool failed to create internal directory '{internal_storage_relative_path}'. Error: {error_msg}"
7104 | )
7105 |
7106 | resolved_base_path_str = create_dir_result.get("path")
7107 | if not resolved_base_path_str:
7108 | raise ToolError(
7109 | "Filesystem tool create_directory succeeded but did not return the absolute path."
7110 | )
7111 |
7112 | # Set global path variables based on the resolved absolute path
7113 | _SB_INTERNAL_BASE_PATH_STR = resolved_base_path_str
7114 | internal_base_path = Path(_SB_INTERNAL_BASE_PATH_STR)
7115 | _STATE_FILE = internal_base_path / "storage_state.enc"
7116 | _LOG_FILE = internal_base_path / "audit.log"
7117 |             _CACHE_DB = internal_base_path / "locator_cache.db"  # SQLite cache of resolved element locators
7118 | _READ_JS_CACHE = internal_base_path / "readability.js"
7119 | logger.info(
7120 | f"Smart Browser internal file paths configured within: {internal_base_path}"
7121 | )
7122 |
7123 | # Initialize components that depend on these paths
7124 | _init_last_hash() # Initialize audit log hash chain (sync)
7125 | _init_locator_cache_db_sync() # Initialize DB schema (sync)
7126 |
7127 | except Exception as e:
7128 | # If storage setup fails, it's critical, stop initialization
7129 | logger.critical(
7130 | f"CRITICAL FAILURE: Could not initialize Smart Browser internal storage at '{internal_storage_relative_path}': {e}",
7131 | )
7132 | return # Do not proceed
7133 |
7134 | # --- Step 3: Initialize Browser Context (triggers Playwright launch if needed) ---
7135 | try:
7136 | logger.info("Initializing Playwright browser and shared context...")
7137 | await get_browser_context() # Call helper to ensure PW, browser, shared context exist
7138 | logger.info("Playwright browser and shared context initialized successfully.")
7139 | except Exception as e:
7140 | logger.critical(
7141 | f"CRITICAL FAILURE: Failed to initialize Playwright components: {e}", exc_info=True
7142 | )
7143 |             # Cleanup is deferred to the shutdown handler.
7144 | return # Stop initialization
7145 |
7146 | # --- Step 4: Start Background Tasks ---
7147 | # Start Inactivity Monitor
7148 | timeout_sec = _sb_inactivity_timeout_global
7149 | if timeout_sec > 0:
7150 | if _inactivity_monitor_task_handle is None or _inactivity_monitor_task_handle.done():
7151 | logger.info(
7152 | f"Starting browser inactivity monitor task (Timeout: {timeout_sec}s)..."
7153 | )
7154 | _inactivity_monitor_task_handle = asyncio.create_task(
7155 | _inactivity_monitor(timeout_sec)
7156 | )
7157 | else:
7158 | logger.debug("Inactivity monitor task already running.")
7159 | else:
7160 | logger.info("Browser inactivity monitor disabled (timeout <= 0).")
7161 |
7162 | # Start Locator Cache Cleanup Task
7163 | cleanup_interval_sec = 24 * 60 * 60 # Run daily
7164 | if _locator_cache_cleanup_task_handle is None or _locator_cache_cleanup_task_handle.done():
7165 | logger.info(
7166 | f"Starting locator cache cleanup task (Interval: {cleanup_interval_sec}s)..."
7167 | )
7168 | _locator_cache_cleanup_task_handle = asyncio.create_task(
7169 | _locator_cache_cleanup_task(interval_seconds=cleanup_interval_sec)
7170 | )
7171 | else:
7172 | logger.debug("Locator cache cleanup task already running.")
7173 |
7174 | # --- Finalize ---
7175 | _is_initialized = True
7176 | _last_activity = time.monotonic() # Set initial activity time after successful init
7177 | logger.info("SmartBrowser tools async components initialized successfully.")
7178 |
7179 |
7180 | # --- Helper: Inactivity Monitor ---
7181 | async def _inactivity_monitor(timeout_seconds: int): # Uses globals _browser, _last_activity
7182 | """Monitors browser inactivity and triggers shutdown if idle for too long."""
7183 | check_interval = 60 # Check every 60 seconds
7184 | logger.info(
7185 | f"Inactivity monitor started. Timeout: {timeout_seconds}s, Check Interval: {check_interval}s."
7186 | )
7187 | while True:
7188 | await asyncio.sleep(check_interval)
7189 | browser_active = False
7190 | try:
7191 | # Safely check browser status under lock
7192 | async with _playwright_lock:
7193 | if _browser is not None and _browser.is_connected():
7194 | browser_active = True
7195 | except Exception as check_err:
7196 | logger.warning(f"Error checking browser status in inactivity monitor: {check_err}")
7197 |             # Assume the browser is still active so a failed status check does not trigger a premature shutdown.
7198 |             browser_active = True
7199 |
7200 | if not browser_active:
7201 | logger.info("Inactivity monitor: Browser is closed or disconnected. Stopping monitor.")
7202 | break # Exit monitor loop if browser is gone
7203 |
7204 | # Calculate idle time
7205 | current_time = time.monotonic()
7206 | idle_time = current_time - _last_activity
7207 |
7208 | logger.debug(
7209 | f"Inactivity check: Idle time = {idle_time:.1f}s (Timeout = {timeout_seconds}s)"
7210 | )
7211 |
7212 | # Check if idle time exceeds timeout
7213 | if idle_time > timeout_seconds:
7214 | logger.info(
7215 | f"Browser inactive for {idle_time:.1f}s (exceeds {timeout_seconds}s timeout). Initiating automatic shutdown."
7216 | )
7217 |             # The monitor loop exits after the shutdown attempt below; note that it is stopping.
7218 | logger.info("Inactivity monitor stopped.")
7219 | try:
7220 | # Initiate shutdown (ensures it runs only once)
7221 | await _initiate_shutdown()
7222 | except Exception as e:
7223 | # Log error during shutdown attempt, but break anyway
7224 | logger.error(
7225 | f"Error during automatic shutdown initiated by inactivity monitor: {e}",
7226 | exc_info=True,
7227 | )
7228 | # Exit monitor loop after attempting shutdown
7229 | break
7230 |
7231 |
7232 | @with_tool_metrics
7233 | @with_error_handling
7234 | async def search(query: str, engine: str = "bing", max_results: int = 10) -> Dict[str, Any]:
7235 | """Performs a web search using the helper function and returns results."""
7236 | # Ensure SB is initialized
7237 | await _ensure_initialized()
7238 | # Update activity timestamp
7239 | _update_activity()
7240 |
7241 | # --- Input Validation ---
7242 | if max_results <= 0:
7243 | logger.warning(f"max_results was {max_results}. Setting to default 10.")
7244 | max_results = 10
7245 | # Engine validation happens within search_web helper
7246 |
7247 | # --- Execute Search ---
7248 | # Call the underlying search_web helper function
7249 | results = await search_web(query, engine=engine, max_results=max_results)
7250 | result_count = len(results)
7251 |
7252 | # --- Return Result ---
7253 | return {
7254 | "success": True,
7255 | "query": query,
7256 | "engine": engine.lower(), # Return normalized engine name
7257 | "results": results,
7258 | "result_count": result_count,
7259 | }
7260 |
7261 |
7262 | @with_tool_metrics
7263 | @with_error_handling
7264 | async def download( # This is the exported tool function
7265 | url: str,
7266 | target: Optional[Dict[str, Any]] = None,
7267 | task_hint: Optional[str] = None,
7268 | dest_dir: Optional[str] = None,
7269 | ) -> Dict[str, Any]:
7270 | """Navigates, clicks (using hint/target) to download, saves file, returns info."""
7271 | # Ensure SB is initialized
7272 | await _ensure_initialized()
7273 | # Update activity timestamp
7274 | _update_activity()
7275 |
7276 | # --- Input Validation: Determine task_hint ---
7277 | effective_task_hint = task_hint
7278 | if not effective_task_hint: # Generate hint if missing
7279 | if target and (target.get("name") or target.get("role")):
7280 | name = target.get("name", "")
7281 | role = target.get("role", "") # Default role empty if not specified
7282 | hint_base = "Download link/button"
7283 | target_desc = f"{name or role}".strip() # Use name or role
7284 | if target_desc:
7285 | effective_task_hint = f"{hint_base} '{target_desc}'"
7286 | else:
7287 | effective_task_hint = hint_base # Fallback if target has no name/role
7288 | logger.debug(f"download tool generated task_hint: '{effective_task_hint}'")
7289 | else:
7290 | raise ToolInputError(
7291 | "download tool requires 'task_hint', or 'target' dict containing 'name' or 'role'."
7292 | )
7293 |
7294 | # --- Get Context and Execute ---
7295 | ctx, _ = await get_browser_context()
7296 | async with _tab_context(ctx) as page:
7297 | # Navigate to the page containing the download link
7298 | await _log("download_navigate", url=url, hint=effective_task_hint)
7299 | try:
7300 | nav_timeout = 60000
7301 | await page.goto(url, wait_until="networkidle", timeout=nav_timeout)
7302 | except PlaywrightException as e:
7303 | # Use f-string for cleaner message concatenation
7304 | raise ToolError(
7305 | f"Navigation failed for URL '{url}' before download attempt: {e}"
7306 | ) from e
7307 |
7308 | # Call the underlying smart_download helper function
7309 | # This helper now handles the click, waiting for download, saving, and analysis
7310 | download_info = await smart_download(
7311 | page,
7312 | task_hint=effective_task_hint,
7313 | dest_dir=dest_dir, # Pass optional destination directory
7314 | target_kwargs=target, # Pass optional target details
7315 | )
7316 |
7317 | # smart_download raises ToolError on failure, so this check is mostly redundant
7318 | # but kept as a safeguard. The result structure is also slightly different now.
7319 | if not download_info.get("success"):
7320 | error_msg = download_info.get("error", "Download failed with unknown error.")
7321 | raise ToolError(f"Download failed: {error_msg}", details=download_info)
7322 |
7323 | # Return success structure containing the download details
7324 | return {"success": True, "download": download_info}
7325 |
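# Illustrative sketch: two equivalent ways to invoke the download tool above. Either a
# task_hint or a target dict containing "name"/"role" must be supplied; the URL, element
# names, and dest_dir used here are placeholders, not values from the original code.
async def _example_download_usage():
    # Explicit hint describing what to click:
    by_hint = await download(
        url="https://example.com/reports",
        task_hint="Download link 'Annual Report PDF'",
    )
    # Or a target dict, from which the tool derives the hint automatically:
    by_target = await download(
        url="https://example.com/reports",
        target={"name": "Annual Report PDF", "role": "link"},
        dest_dir="storage/smart_browser_downloads",  # assumed destination path
    )
    return by_hint, by_target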
7326 |
7327 | @with_tool_metrics
7328 | @with_error_handling
7329 | async def download_site_pdfs(
7330 | start_url: str,
7331 | dest_subfolder: Optional[str] = None,
7332 | include_regex: Optional[str] = None,
7333 | max_depth: int = 2,
7334 | max_pdfs: int = 100,
7335 | max_pages_crawl: int = 500,
7336 | rate_limit_rps: float = 1.0,
7337 | ) -> Dict[str, Any]:
7338 | """Crawls site, finds PDFs, downloads them directly using httpx and FileSystemTool."""
7339 | # Ensure SB is initialized
7340 | await _ensure_initialized()
7341 | # Update activity timestamp
7342 | _update_activity()
7343 |
7344 | # --- Validate Inputs ---
7345 | if not start_url:
7346 | raise ToolInputError("start_url cannot be empty.")
7347 | if max_depth < 0:
7348 | raise ToolInputError("max_depth cannot be negative.")
7349 | if max_pdfs <= 0:
7350 | raise ToolInputError("max_pdfs must be positive.")
7351 | if max_pages_crawl <= 0:
7352 | raise ToolInputError("max_pages_crawl must be positive.")
7353 | if rate_limit_rps <= 0:
7354 | raise ToolInputError("rate_limit_rps must be positive.")
7355 |
7356 | # --- Prepare Download Directory ---
7357 | final_dest_dir_str: Optional[str] = None
7358 | try:
7359 | # Generate a safe subfolder name from input or domain
7360 | if dest_subfolder:
7361 | safe_subfolder = _slugify(dest_subfolder, 50)
7362 | else:
7363 | try:
7364 | parsed_start = urlparse(start_url)
7365 | domain_slug = _slugify(parsed_start.netloc, 50)
7366 | safe_subfolder = domain_slug or "downloaded_pdfs" # Fallback if domain is empty
7367 | except Exception:
7368 | safe_subfolder = "downloaded_pdfs" # Fallback on URL parse error
7369 |
7370 | # Define relative path within the main storage area
7371 | dest_dir_relative_path = f"storage/smart_browser_site_pdfs/{safe_subfolder}"
7372 | logger.info(
7373 | f"Ensuring download directory exists for PDF crawl: '{dest_dir_relative_path}' using filesystem tool."
7374 | )
7375 | # Use STANDALONE create_directory tool
7376 | create_dir_result = await create_directory(path=dest_dir_relative_path)
7377 | if not isinstance(create_dir_result, dict) or not create_dir_result.get("success"):
7378 | error_msg = (
7379 | create_dir_result.get("error", "Unknown")
7380 | if isinstance(create_dir_result, dict)
7381 | else "Invalid response"
7382 | )
7383 | raise ToolError(
7384 | f"Filesystem tool failed to create directory '{dest_dir_relative_path}'. Error: {error_msg}"
7385 | )
7386 |
7387 | # Get the absolute path returned by the tool
7388 | final_dest_dir_str = create_dir_result.get("path")
7389 | if not final_dest_dir_str:
7390 | raise ToolError(
7391 | f"Filesystem tool create_directory succeeded for '{dest_dir_relative_path}' but did not return an absolute path."
7392 | )
7393 | logger.info(f"PDF crawl download directory confirmed/created at: {final_dest_dir_str}")
7394 | except Exception as e:
7395 | # Wrap directory preparation errors
7396 | raise ToolError(
7397 |             f"Could not prepare download directory for PDF crawl of '{start_url}': {str(e)}"
7398 | ) from e
7399 |
7400 | # --- Crawl for PDF URLs ---
7401 | logger.info(
7402 | f"Starting PDF crawl from: {start_url} (Max Depth: {max_depth}, Max PDFs: {max_pdfs}, Max Pages: {max_pages_crawl})"
7403 | )
7404 | try:
7405 | # Use the helper function to find PDF URLs
7406 | pdf_urls = await crawl_for_pdfs(
7407 | start_url,
7408 | include_regex,
7409 | max_depth,
7410 | max_pdfs,
7411 | max_pages_crawl,
7412 |             rate_limit_rps=5.0,  # Crawl phase uses a fixed higher rate; the user-supplied limit throttles the downloads below
7413 | )
7414 | except Exception as crawl_err:
7415 | raise ToolError(
7416 | f"Error during PDF crawl phase from '{start_url}': {crawl_err}"
7417 | ) from crawl_err
7418 |
7419 | if not pdf_urls:
7420 | logger.info("No matching PDF URLs found during crawl.")
7421 | return {
7422 | "success": True,
7423 | "pdf_count": 0,
7424 | "failed_count": 0,
7425 | "dest_dir": final_dest_dir_str,
7426 | "files": [], # Empty list as no files were downloaded
7427 | }
7428 |
7429 | # --- Download Found PDFs ---
7430 | num_found = len(pdf_urls)
7431 | logger.info(
7432 | f"Found {num_found} PDF URLs. Starting downloads to '{final_dest_dir_str}' (Rate Limit: {rate_limit_rps}/s)..."
7433 | )
7434 | # Use the specified rate limit for downloads
7435 | limiter = RateLimiter(rate_limit_rps)
7436 |
7437 | # Define the async task for downloading a single file
7438 | async def download_task(url, seq):
7439 | await limiter.acquire() # Wait for rate limit permit
7440 | # Use the direct download helper
7441 | result = await _download_file_direct(url, final_dest_dir_str, seq)
7442 | return result
7443 |
7444 | # Create and run download tasks concurrently
7445 | download_tasks = []
7446 | for i, url in enumerate(pdf_urls):
7447 | task = asyncio.create_task(download_task(url, i + 1))
7448 | download_tasks.append(task)
7449 |
7450 | results = await asyncio.gather(*download_tasks) # Wait for all downloads
7451 |
7452 | # Process results
7453 | successful_downloads = []
7454 | failed_downloads = []
7455 | for r in results:
7456 | if isinstance(r, dict) and r.get("success"):
7457 | successful_downloads.append(r)
7458 | else:
7459 | failed_downloads.append(r) # Includes non-dict results or dicts with success=False
7460 |
7461 | num_successful = len(successful_downloads)
7462 | num_failed = len(failed_downloads)
7463 |
7464 | # Log summary
7465 | log_details = {
7466 | "start_url": start_url,
7467 | "found": num_found,
7468 | "successful": num_successful,
7469 | "failed": num_failed,
7470 | "dest_dir": final_dest_dir_str,
7471 | }
7472 | if failed_downloads:
7473 | # Log preview of failed download errors
7474 | errors_preview = []
7475 | for res in failed_downloads[:3]: # Log first 3 errors
7476 | err_url = res.get("url", "N/A") if isinstance(res, dict) else "N/A"
7477 | err_msg = res.get("error", "Unknown error") if isinstance(res, dict) else str(res)
7478 | errors_preview.append(f"{err_url}: {err_msg}")
7479 | log_details["errors_preview"] = errors_preview
7480 | await _log("download_site_pdfs_complete", **log_details)
7481 |
7482 | # Return final result
7483 | return {
7484 | "success": True, # Overall tool execution success
7485 | "pdf_count": num_successful,
7486 | "failed_count": num_failed,
7487 | "dest_dir": final_dest_dir_str,
7488 | "files": results, # Return list of all result dicts (success and failure)
7489 | }
7490 |
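# Illustrative sketch: crawling a site for PDFs with the tool above. The start URL and regex
# are placeholders; note that the crawl phase runs at its own fixed rate, while
# rate_limit_rps throttles only the PDF downloads.
async def _example_download_site_pdfs():
    summary = await download_site_pdfs(
        start_url="https://example.com/docs",
        include_regex=r"whitepaper|datasheet",
        max_depth=2,
        max_pdfs=25,
        rate_limit_rps=1.0,
    )
    # summary["pdf_count"] / summary["failed_count"] summarize the per-file entries returned
    # in summary["files"]; summary["dest_dir"] is the resolved output folder.
    return summary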
7491 |
7492 | @with_tool_metrics
7493 | @with_error_handling
7494 | async def collect_documentation(
7495 | package: str, max_pages: int = 40, rate_limit_rps: float = 2.0
7496 | ) -> Dict[str, Any]:
7497 | """Finds docs site, crawls, extracts text, saves using FileSystemTool."""
7498 | # Ensure SB is initialized
7499 | await _ensure_initialized()
7500 | # Update activity timestamp
7501 | _update_activity()
7502 |
7503 | # --- Validate Inputs ---
7504 | if not package:
7505 | raise ToolInputError("Package name cannot be empty.")
7506 | if max_pages <= 0:
7507 | raise ToolInputError("max_pages must be positive.")
7508 | if rate_limit_rps <= 0:
7509 | raise ToolInputError("rate_limit_rps must be positive.")
7510 |
7511 | # --- Find Documentation Root URL ---
7512 | try:
7513 | docs_root = await _pick_docs_root(package)
7514 | if not docs_root:
7515 | raise ToolError(
7516 | f"Could not automatically find a likely documentation site for package '{package}'."
7517 | )
7518 | except Exception as e:
7519 | # Wrap errors during root finding
7520 | raise ToolError(f"Error finding documentation root for '{package}': {str(e)}") from e
7521 |
7522 | # --- Crawl Documentation Site ---
7523 | logger.info(f"Found potential docs root: {docs_root}. Starting documentation crawl...")
7524 | try:
7525 | # Use the helper function to crawl and extract content
7526 | pages_content = await crawl_docs_site(
7527 | docs_root, max_pages=max_pages, rate_limit_rps=rate_limit_rps
7528 | )
7529 | except Exception as e:
7530 | # Wrap errors during crawling
7531 | raise ToolError(
7532 | f"Error crawling documentation site starting from {docs_root}: {str(e)}"
7533 | ) from e
7534 |
7535 | # Check if content was collected
7536 | num_pages_collected = len(pages_content)
7537 | if num_pages_collected == 0:
7538 | logger.info(f"No readable content collected from documentation site for '{package}'.")
7539 | return {
7540 | "success": True, # Tool ran successfully, but found no content
7541 | "package": package,
7542 | "pages_collected": 0,
7543 | "file_path": None, # No file saved
7544 | "root_url": docs_root,
7545 | "message": "No readable content pages were collected from the documentation site.",
7546 | }
7547 | logger.info(f"Collected readable content from {num_pages_collected} pages for '{package}'.")
7548 |
7549 | # --- Prepare Output Directory ---
7550 | output_dir_relative_path = "storage/smart_browser_docs_collected"
7551 | created_dir_path: Optional[str] = None
7552 | try:
7553 | logger.info(
7554 | f"Ensuring documentation output directory exists: '{output_dir_relative_path}' using filesystem tool."
7555 | )
7556 | create_result = await create_directory(path=output_dir_relative_path) # STANDALONE call
7557 | if not isinstance(create_result, dict) or not create_result.get("success"):
7558 | error_msg = (
7559 | create_result.get("error", "Unknown")
7560 | if isinstance(create_result, dict)
7561 | else "Invalid response"
7562 | )
7563 | raise ToolError(
7564 | f"Filesystem tool failed to create directory '{output_dir_relative_path}'. Error: {error_msg}"
7565 | )
7566 | created_dir_path = create_result.get("path") # Get absolute path
7567 | if not created_dir_path:
7568 | raise ToolError(
7569 | f"Filesystem tool create_directory for '{output_dir_relative_path}' did not return an absolute path."
7570 | )
7571 | logger.info(f"Ensured output directory exists at: '{created_dir_path}'")
7572 | except Exception as e:
7573 | # Wrap directory preparation errors
7574 | raise ToolError(
7575 | f"Could not prepare output directory '{output_dir_relative_path}': {str(e)}"
7576 | ) from e
7577 |
7578 | # --- Format Content and Determine Filename ---
7579 | # Create a unique filename based on package and timestamp
7580 | now_utc_str = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
7581 | safe_pkg_name = _slugify(package, 40)
7582 | filename = f"{safe_pkg_name}_docs_{now_utc_str}.txt"
7583 | # Construct relative path for writing (FS tool handles base path resolution)
7584 | fpath_relative = f"{output_dir_relative_path}/{filename}"
7585 |
7586 | # Combine collected content into a single string
7587 | separator = "\n\n" + ("=" * 80) + "\n\n" # Separator between pages
7588 | header = f"# Documentation for: {package}\n# Crawl Root: {docs_root}\n{separator}"
7589 | combined_content = header
7590 | try:
7591 | page_texts = []
7592 | for i, (url, text) in enumerate(pages_content):
7593 | page_header = f"## Page {i + 1}: {str(url)}\n\n"
7594 | page_body = str(text).strip() # Ensure text is string and stripped
7595 | page_texts.append(page_header + page_body)
7596 | # Join all page sections with the separator
7597 | combined_content += separator.join(page_texts)
7598 | except Exception as e:
7599 | # Handle potential errors during string formatting/joining
7600 | raise ToolError(f"Error formatting collected documentation content: {str(e)}") from e
7601 |
7602 | # --- Write Combined Content using Filesystem Tool ---
7603 | final_absolute_fpath: Optional[str] = None
7604 | try:
7605 | logger.info(f"Writing combined documentation content to relative path: {fpath_relative}")
7606 | write_result = await write_file(
7607 | path=fpath_relative, content=combined_content
7608 | ) # STANDALONE call
7609 | if not isinstance(write_result, dict) or not write_result.get("success"):
7610 | error_msg = (
7611 | write_result.get("error", "Unknown")
7612 | if isinstance(write_result, dict)
7613 | else "Invalid response"
7614 | )
7615 | raise ToolError(
7616 | f"Filesystem tool failed to write documentation file '{fpath_relative}'. Error: {error_msg}"
7617 | )
7618 |
7619 | # Get the absolute path where the file was actually written
7620 | final_absolute_fpath = write_result.get("path")
7621 | if not final_absolute_fpath:
7622 | logger.warning(
7623 | f"Filesystem tool write_file for '{fpath_relative}' did not return an absolute path. Using relative path in result."
7624 | )
7625 | final_absolute_fpath = fpath_relative # Fallback for logging/return value
7626 |
7627 | logger.info(f"Successfully wrote combined documentation to: {final_absolute_fpath}")
7628 | except Exception as e:
7629 | # Wrap errors during file write
7630 | raise ToolError(f"Could not write documentation file '{fpath_relative}': {str(e)}") from e
7631 |
7632 | # --- Log Success and Return Result ---
7633 | await _log(
7634 | "docs_collected_success",
7635 | package=package,
7636 | root=docs_root,
7637 | pages=num_pages_collected,
7638 | file=str(final_absolute_fpath),
7639 | )
7640 | return {
7641 | "success": True,
7642 | "package": package,
7643 | "pages_collected": num_pages_collected,
7644 | "file_path": str(final_absolute_fpath), # Return the absolute path
7645 | "root_url": docs_root,
7646 | "message": f"Collected and saved content from {num_pages_collected} pages for '{package}'.",
7647 | }
7648 |
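# Illustrative sketch: collecting documentation for a package with the tool above. The
# package name is a placeholder; on success the combined text is written under
# storage/smart_browser_docs_collected and its absolute path is returned as "file_path".
async def _example_collect_documentation():
    result = await collect_documentation(package="httpx", max_pages=20, rate_limit_rps=2.0)
    if result["pages_collected"]:
        print(f"Saved {result['pages_collected']} pages to {result['file_path']}")
    return result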
7649 |
7650 | @with_tool_metrics
7651 | @with_error_handling
7652 | async def run_macro(  # Exposed tool; formerly named execute_macro
7653 | url: str,
7654 | task: str,
7655 | model: str = _llm_model_locator_global,
7656 | max_rounds: int = 7,
7657 | timeout_seconds: int = 600,
7658 | ) -> Dict[str, Any]:
7659 | """Navigates to URL and executes a natural language task using LLM planner and step runner."""
7660 | # Ensure SB is initialized
7661 | await _ensure_initialized()
7662 | # Update activity timestamp
7663 | _update_activity()
7664 |
7665 | # --- Input Validation ---
7666 | if not url:
7667 | raise ToolInputError("URL cannot be empty.")
7668 | if not task:
7669 | raise ToolInputError("Task description cannot be empty.")
7670 | if max_rounds <= 0:
7671 | raise ToolInputError("max_rounds must be positive.")
7672 | if timeout_seconds <= 0:
7673 | raise ToolInputError("timeout_seconds must be positive.")
7674 |
7675 | # Define the inner function to run with timeout
7676 | async def run_macro_inner():
7677 | ctx, _ = await get_browser_context()
7678 | async with _tab_context(ctx) as page:
7679 | # Navigate to the starting URL
7680 | await _log("macro_navigate", url=url, task=task)
7681 | try:
7682 | nav_timeout = 60000
7683 | await page.goto(url, wait_until="networkidle", timeout=nav_timeout)
7684 | except PlaywrightException as e:
7685 | # Use f-string for cleaner message
7686 | raise ToolError(f"Navigation to '{url}' failed before starting macro: {e}") from e
7687 |
7688 | # Call the helper function that contains the plan-act loop
7689 | # This helper handles planning, running steps, and logging rounds/errors
7690 | step_results = await _run_macro_execution_loop(page, task, max_rounds, model)
7691 |
7692 | # Get final page state after macro execution
7693 | final_state = {} # Initialize as empty dict
7694 | try:
7695 | final_state = await get_page_state(page)
7696 | except Exception as e:
7697 | logger.error(f"Failed to get final page state after macro execution: {e}")
7698 | final_state = {"error": f"Failed to get final page state: {e}"}
7699 |
7700 | # Determine overall macro success
7701 |             # Success if either:
7702 |             # 1. a 'finish' step executed successfully, OR
7703 |             # 2. all executed non-passive steps (excluding wait/finish/extract/scroll) succeeded.
7704 | finished_successfully = any(
7705 | s.get("action") == "finish" and s.get("success") for s in step_results
7706 | )
7707 | # Check if all non-finish/wait/extract steps succeeded (if any exist)
7708 | all_other_steps_succeeded = True
7709 | non_terminal_steps_exist = False
7710 | for s in step_results:
7711 | action = s.get("action")
7712 | # Consider steps other than these potentially "passive" ones for failure check
7713 | if action not in ("finish", "wait", "extract", "scroll", "error"):
7714 | non_terminal_steps_exist = True # noqa: F841
7715 | if not s.get("success", False):
7716 | all_other_steps_succeeded = False
7717 | break # Found a failed critical step
7718 |
7719 | # Macro succeeds if finished explicitly or if all critical steps passed (and at least one step ran)
7720 | macro_success = finished_successfully or (
7721 | bool(step_results) and all_other_steps_succeeded
7722 | )
7723 |
7724 | # Return final results
7725 | return {
7726 | "success": macro_success,
7727 | "task": task,
7728 | "steps": step_results, # List of results for each step executed
7729 | "final_page_state": final_state,
7730 | }
7731 |
7732 | # Run the inner function with an overall timeout
7733 | try:
7734 | result = await asyncio.wait_for(run_macro_inner(), timeout=timeout_seconds)
7735 | return result
7736 | except asyncio.TimeoutError:
7737 | # Handle overall macro timeout
7738 | await _log("macro_timeout", url=url, task=task, timeout=timeout_seconds)
7739 | return {
7740 | "success": False,
7741 | "task": task,
7742 | "error": f"Macro execution timed out after {timeout_seconds}s.",
7743 | "steps": [], # No steps completed within timeout (or results lost)
7744 | "final_page_state": {"error": "Macro timed out"},
7745 | }
7746 |
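# Illustrative sketch: driving a natural-language macro with run_macro above. The URL and
# task are placeholders; the returned dict carries per-step results plus the final page
# state captured after the last round.
async def _example_run_macro():
    outcome = await run_macro(
        url="https://example.com/login",
        task="Log in with the stored credentials and open the billing page",
        max_rounds=5,
        timeout_seconds=300,
    )
    for step in outcome["steps"]:
        print(step.get("action"), "ok" if step.get("success") else step.get("error"))
    return outcome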
7747 |
7748 | async def _run_macro_execution_loop(
7749 | page: Page, task: str, max_rounds: int, model: str
7750 | ) -> List[Dict[str, Any]]:
7751 | """Internal helper containing the plan-and-execute loop for run_macro."""
7752 | all_step_results: List[Dict[str, Any]] = []
7753 | current_task_description = task # Initial task
7754 |
7755 | for i in range(max_rounds):
7756 | round_num = i + 1
7757 | logger.info(f"--- Macro Round {round_num}/{max_rounds} ---")
7758 | task_preview = current_task_description[:100] + (
7759 | "..." if len(current_task_description) > 100 else ""
7760 | )
7761 | logger.info(f"Current Task: {task_preview}")
7762 |
7763 | try:
7764 | # 1. Get Current Page State
7765 | logger.debug(f"Macro Round {round_num}: Getting page state...")
7766 | state = await get_page_state(page)
7767 | if "error" in state: # Handle error getting state
7768 | error_msg = (
7769 | f"Failed to get page state before planning round {round_num}: {state['error']}"
7770 | )
7771 | logger.error(error_msg)
7772 | # Add error step and stop
7773 | all_step_results.append(
7774 | {"action": "error", "success": False, "error": error_msg, "round": round_num}
7775 | )
7776 | return all_step_results
7777 |
7778 | # 2. Plan Next Steps using LLM
7779 | logger.debug(f"Macro Round {round_num}: Planning steps with LLM...")
7780 | plan = await _plan_macro(state, current_task_description, model)
7781 | await _log(
7782 | "macro_plan_generated",
7783 | round=round_num,
7784 | task=current_task_description,
7785 | plan_length=len(plan),
7786 | plan_preview=plan[:2],
7787 | )
7788 |
7789 | # Check if plan is empty (task complete or impossible)
7790 | if not plan:
7791 | logger.info(
7792 | f"Macro Round {round_num}: Planner returned empty plan. Assuming task complete or impossible."
7793 | )
7794 | await _log("macro_plan_empty", round=round_num, task=current_task_description)
7795 | break # Exit loop
7796 |
7797 | # 3. Execute Planned Steps
7798 | logger.info(f"Macro Round {round_num}: Executing {len(plan)} planned steps...")
7799 | step_results_this_round = await run_steps(page, plan)
7800 | all_step_results.extend(step_results_this_round) # Add results to overall list
7801 |
7802 | # 4. Check Round Outcome
7803 | finished_this_round = any(
7804 | s.get("action") == "finish" and s.get("success") for s in step_results_this_round
7805 | )
7806 | last_step_failed = False
7807 | if step_results_this_round:
7808 | last_step = step_results_this_round[-1]
7809 | # Check if the *last* step failed and wasn't a passive action
7810 | is_passive_action = last_step.get("action") in (
7811 | "wait",
7812 | "finish",
7813 | "extract",
7814 | "scroll",
7815 | "error",
7816 | )
7817 | if not last_step.get("success", False) and not is_passive_action:
7818 | last_step_failed = True
7819 | error_info = last_step.get("error", "?")
7820 | failed_action = last_step.get("action", "?")
7821 | await _log(
7822 | "macro_fail_step", round=round_num, action=failed_action, error=error_info
7823 | )
7824 | logger.warning(
7825 | f"Macro Round {round_num} stopped due to failed critical step: Action='{failed_action}', Error='{error_info}'"
7826 | )
7827 |
7828 | # Exit loop if 'finish' action succeeded or last critical step failed
7829 | if finished_this_round:
7830 | await _log("macro_finish_action", round=round_num)
7831 | logger.info(
7832 | f"Macro finished successfully via 'finish' action in round {round_num}."
7833 | )
7834 | return all_step_results # Return immediately after successful finish
7835 | if last_step_failed:
7836 | logger.info(f"Stopping macro execution after failed step in round {round_num}.")
7837 | return all_step_results # Return results up to the failure
7838 |
7839 |             # If the loop continues, the task description could be refined for the next round;
7840 |             # currently it stays the same throughout. Example modification point:
7841 |             # current_task_description = "Refine based on results..."
7842 |
7843 | except ToolError as e:
7844 | # Handle errors during planning or state retrieval specifically
7845 | await _log(
7846 | "macro_error_tool", round=round_num, task=current_task_description, error=str(e)
7847 | )
7848 | logger.error(f"Macro Round {round_num} failed with ToolError: {e}")
7849 | all_step_results.append(
7850 | {
7851 | "action": "error",
7852 | "success": False,
7853 | "error": f"ToolError in Round {round_num}: {e}",
7854 | "round": round_num,
7855 | }
7856 | )
7857 | return all_step_results # Stop execution on tool errors
7858 | except Exception as e:
7859 | # Handle unexpected errors during the round
7860 | await _log(
7861 | "macro_error_unexpected",
7862 | round=round_num,
7863 | task=current_task_description,
7864 | error=str(e),
7865 | )
7866 | logger.error(f"Macro Round {round_num} failed unexpectedly: {e}", exc_info=True)
7867 | all_step_results.append(
7868 | {
7869 | "action": "error",
7870 | "success": False,
7871 | "error": f"Unexpected Error in Round {round_num}: {e}",
7872 | "round": round_num,
7873 | }
7874 | )
7875 | return all_step_results # Stop execution on unexpected errors
7876 |
7877 | # If loop finishes due to max_rounds
7878 | await _log("macro_exceeded_rounds", max_rounds=max_rounds, task=task)
7879 | logger.warning(f"Macro stopped after reaching maximum rounds ({max_rounds}) for task: {task}")
7880 | return all_step_results # Return all collected results
7881 |
7882 |
7883 | @with_tool_metrics
7884 | @with_error_handling
7885 | async def autopilot(
7886 | task: str,
7887 | scratch_subdir: str = "autopilot_runs",
7888 | max_steps: int = 10,
7889 | timeout_seconds: int = 1800,
7890 | ) -> Dict[str, Any]:
7891 | """Executes a complex multi-step task using LLM planning and available tools."""
7892 | # Ensure SB is initialized
7893 | await _ensure_initialized()
7894 |
7895 | # --- Validate Inputs ---
7896 | if not task:
7897 | raise ToolInputError("Task description cannot be empty.")
7898 | if max_steps <= 0:
7899 | raise ToolInputError("max_steps must be positive.")
7900 | if timeout_seconds <= 0:
7901 | raise ToolInputError("timeout_seconds must be positive.")
7902 |
7903 | # --- Prepare Scratch Directory and Logging ---
7904 | final_scratch_dir_str: Optional[str] = None
7905 | log_path: Optional[Path] = None
7906 | try:
7907 | # Define base path for scratch files
7908 | scratch_base_relative = "storage/smart_browser_scratch"
7909 | # Sanitize user-provided subdir name
7910 | safe_subdir = _slugify(scratch_subdir, 50) or "autopilot_run" # Fallback name
7911 | scratch_dir_relative_path = f"{scratch_base_relative}/{safe_subdir}"
7912 |
7913 | logger.info(
7914 | f"Ensuring autopilot scratch directory exists: '{scratch_dir_relative_path}' using filesystem tool."
7915 | )
7916 | # Use STANDALONE create_directory tool
7917 | create_dir_result = await create_directory(path=scratch_dir_relative_path)
7918 | if not isinstance(create_dir_result, dict) or not create_dir_result.get("success"):
7919 | error_msg = (
7920 | create_dir_result.get("error", "Unknown")
7921 | if isinstance(create_dir_result, dict)
7922 | else "Invalid response"
7923 | )
7924 | raise ToolError(
7925 | f"Filesystem tool failed to create scratch directory '{scratch_dir_relative_path}'. Error: {error_msg}"
7926 | )
7927 |
7928 | # Get the absolute path
7929 | final_scratch_dir_str = create_dir_result.get("path")
7930 | if not final_scratch_dir_str:
7931 | raise ToolError(
7932 | f"Filesystem tool create_directory for '{scratch_dir_relative_path}' did not return an absolute path."
7933 | )
7934 | final_scratch_dir_path = Path(final_scratch_dir_str)
7935 | logger.info(f"Autopilot scratch directory confirmed/created at: {final_scratch_dir_path}")
7936 |
7937 | # Prepare log file path within the scratch directory
7938 | run_id = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
7939 | log_filename = f"autopilot_run_{run_id}.jsonl"
7940 | log_path = final_scratch_dir_path / log_filename
7941 | logger.info(f"Autopilot run '{run_id}' started. Execution log: {log_path}")
7942 |
7943 | except Exception as e:
7944 | # Wrap directory preparation errors
7945 | raise ToolError(
7946 | f"Could not prepare scratch directory '{scratch_dir_relative_path}': {str(e)}"
7947 | ) from e
7948 |
7949 | # Define the inner function to run with timeout
7950 | async def autopilot_inner():
7951 | all_results: List[Dict] = [] # Stores results of each step
7952 | current_task_description = task # Initial task
7953 |
7954 | try:
7955 | # --- Initial Planning ---
7956 | logger.info("Autopilot: Generating initial plan...")
7957 | current_plan = await _plan_autopilot(
7958 | current_task_description, None
7959 | ) # Initial plan has no prior results
7960 | step_num = 0
7961 |
7962 | # --- Execution Loop ---
7963 | while step_num < max_steps and current_plan:
7964 | step_num += 1
7965 | step_to_execute = current_plan[0] # Get the next step
7966 | tool_name = step_to_execute.get("tool")
7967 | args = step_to_execute.get("args", {})
7968 | # Initialize log entry for this step
7969 | step_log = {
7970 | "step": step_num,
7971 | "tool": tool_name,
7972 | "args": args,
7973 | "success": False,
7974 | "result": None,
7975 | "error": None,
7976 | }
7977 | logger.info(
7978 | f"--- Autopilot Step {step_num}/{max_steps}: Executing Tool '{tool_name}' ---"
7979 | )
7980 | logger.debug(f"Step {step_num} Args: {args}")
7981 |
7982 | # Validate tool exists
7983 | if tool_name not in _AVAILABLE_TOOLS:
7984 | error_msg = f"Planner selected unknown tool '{tool_name}'."
7985 | step_log["error"] = error_msg
7986 | logger.error(error_msg)
7987 | current_plan = [] # Stop execution if tool is unknown
7988 | else:
7989 | # --- Tool Lookup and Execution ---
7990 | method_name = _AVAILABLE_TOOLS[tool_name][0] # Get function name string
7991 | # Look up the actual function object
7992 | tool_func = globals().get(method_name) # Check current module globals first
7993 | if not tool_func or not callable(tool_func):
7994 | # Try external tool lookups if not found locally
7995 | tool_func = _get_filesystem_tool(method_name) or _get_completion_tool(
7996 | method_name
7997 | )
7998 |
7999 | if not tool_func or not callable(tool_func):
8000 | # Tool function implementation not found
8001 | error_msg = f"Internal error: Could not find function implementation for tool '{tool_name}' (expected function: '{method_name}')."
8002 | step_log["error"] = error_msg
8003 | logger.error(error_msg)
8004 | current_plan = [] # Stop execution
8005 | else:
8006 | # --- Execute the Found Tool Function ---
8007 | try:
8008 | await _log(
8009 | "autopilot_step_start", step=step_num, tool=tool_name, args=args
8010 | )
8011 | _update_activity() # Update activity before long tool call
8012 | # Call the standalone tool function with its arguments
8013 | outcome = await tool_func(**args)
8014 | _update_activity() # Update activity after tool call returns
8015 |
8016 | # Record outcome in step log
8017 | step_log["success"] = outcome.get("success", False)
8018 | step_log["result"] = outcome # Store the full result dict
8019 |
8020 | # --- Plan for Next Step (or Replan on Failure) ---
8021 | if step_log["success"]:
8022 | await _log(
8023 | "autopilot_step_success",
8024 | step=step_num,
8025 | tool=tool_name,
8026 | result_summary=str(outcome)[:200],
8027 | )
8028 | logger.info(
8029 | f"Autopilot Step {step_num} ({tool_name}) completed successfully."
8030 | )
8031 | # Remove completed step and plan next based on success
8032 | current_plan.pop(0) # Remove executed step
8033 | if current_plan: # If plan wasn't just one step
8034 | logger.debug("Plan has remaining steps, continuing...")
8035 | elif not current_plan: # Plan is now empty after successful step
8036 | logger.info(
8037 | "Autopilot: Attempting to generate next plan step..."
8038 | )
8039 | try:
8040 | current_plan = await _plan_autopilot(
8041 | current_task_description, all_results + [step_log]
8042 | )
8043 | plan_count = len(current_plan)
8044 | logger.info(f"Generated next plan ({plan_count} step(s)).")
8045 | await _log(
8046 | "autopilot_replan_success",
8047 | reason="step_complete",
8048 | new_steps=plan_count,
8049 | )
8050 | except Exception as replan_err:
8051 | logger.error(
8052 | f"Autopilot replanning after step success failed: {replan_err}"
8053 | )
8054 | await _log(
8055 | "autopilot_replan_fail",
8056 | reason="step_complete",
8057 | error=str(replan_err),
8058 | )
8059 | current_plan = [] # Stop if replanning fails
8060 | else:
8061 | # Step failed
8062 | step_log["error"] = outcome.get(
8063 | "error", f"Tool '{tool_name}' failed without specific error."
8064 | )
8065 | await _log(
8066 | "autopilot_step_fail",
8067 | step=step_num,
8068 | tool=tool_name,
8069 | error=step_log["error"],
8070 | )
8071 | logger.warning(
8072 | f"Autopilot Step {step_num} ({tool_name}) failed: {step_log['error']}"
8073 | )
8074 | logger.info(f"Attempting replan after failed step {step_num}...")
8075 | try:
8076 | # Replan based on the failure
8077 | new_plan_tail = await _plan_autopilot(
8078 | current_task_description, all_results + [step_log]
8079 | )
8080 | current_plan = new_plan_tail # Replace old plan with new one
8081 | plan_count = len(current_plan)
8082 | logger.info(
8083 | f"Replanning successful after failure. New plan has {plan_count} step(s)."
8084 | )
8085 | await _log(
8086 | "autopilot_replan_success",
8087 | reason="step_fail",
8088 | new_steps=plan_count,
8089 | )
8090 | except Exception as replan_err:
8091 | logger.error(
8092 | f"Autopilot replanning after step failure failed: {replan_err}"
8093 | )
8094 | await _log(
8095 | "autopilot_replan_fail",
8096 | reason="step_fail",
8097 | error=str(replan_err),
8098 | )
8099 | current_plan = [] # Stop if replanning fails
8100 |
8101 | except (
8102 | ToolInputError,
8103 | ToolError,
8104 | ValueError,
8105 | TypeError,
8106 | AssertionError,
8107 | ) as e:
8108 | # Catch errors raised *during* tool execution (e.g., arguments that passed validation but still failed inside the tool)
8109 | error_msg = f"{type(e).__name__} executing '{tool_name}': {e}"
8110 | step_log["error"] = error_msg
8111 | step_log["success"] = False
8112 | logger.error(
8113 | f"Autopilot Step {step_num} ({tool_name}) execution failed: {error_msg}",
8114 | exc_info=True,
8115 | )
8116 | current_plan = [] # Stop execution on tool error
8117 | except Exception as e:
8118 | # Catch unexpected errors during tool execution
8119 | error_msg = f"Unexpected error executing '{tool_name}': {e}"
8120 | step_log["error"] = error_msg
8121 | step_log["success"] = False
8122 | logger.critical(
8123 | f"Autopilot Step {step_num} ({tool_name}) failed unexpectedly: {error_msg}",
8124 | exc_info=True,
8125 | )
8126 | current_plan = [] # Stop execution
8127 |
8128 | # Append the result of this step to the overall results
8129 | all_results.append(step_log)
8130 | # --- Log Step Result to File ---
8131 | if log_path:
8132 | try:
8133 | log_entry = (
8134 | json.dumps(step_log, default=str) + "\n"
8135 | ) # Use default=str for non-serializable types
8136 | async with aiofiles.open(log_path, "a", encoding="utf-8") as log_f:
8137 | await log_f.write(log_entry)
8138 | except IOError as log_e:
8139 | logger.error(f"Failed to write autopilot step log to {log_path}: {log_e}")
8140 | except Exception as json_e:
8141 | logger.error(f"Failed to serialize step log for writing: {json_e}")
8142 |
8143 | # --- Loop End Conditions ---
8144 | if step_num >= max_steps:
8145 | logger.warning(f"Autopilot stopped: Reached maximum step limit ({max_steps}).")
8146 | await _log("autopilot_max_steps", task=task, steps=step_num)
8147 | elif not current_plan and step_num > 0:
8148 | # Plan became empty (either task finished or replan failed/returned empty)
8149 | final_step_success = all_results[-1].get("success", False) if all_results else False
8150 | if final_step_success:
8151 | logger.info(f"Autopilot plan complete after {step_num} steps.")
8152 | await _log("autopilot_plan_end", task=task, steps=step_num, status="completed")
8153 | else:
8154 | logger.warning(
8155 | f"Autopilot stopped after {step_num} steps due to a failure or an inability to plan the next step."
8156 | )
8157 | await _log(
8158 | "autopilot_plan_end", task=task, steps=step_num, status="failed_or_stuck"
8159 | )
8160 | elif step_num == 0:
8161 | # Initial plan was empty
8162 | logger.warning("Autopilot: Initial plan was empty. No steps executed.")
8163 | await _log("autopilot_plan_end", task=task, steps=0, status="no_plan")
8164 |
8165 | # Determine overall success based on the success of the *last* executed step
8166 | overall_success = bool(all_results) and all_results[-1].get("success", False)
8167 | # Return final summary
8168 | return {
8169 | "success": overall_success,
8170 | "steps_executed": step_num,
8171 | "run_log": str(log_path) if log_path else None,
8172 | "final_results": all_results[-3:], # Return the last three step logs as a summary
8173 | }
8174 | except Exception as autopilot_err:
8175 | # Catch critical errors during planning or loop setup
8176 | logger.critical(
8177 | f"Autopilot run failed critically before or during execution loop: {autopilot_err}",
8178 | exc_info=True,
8179 | )
8180 | await _log("autopilot_critical_error", task=task, error=str(autopilot_err))
8181 | # Log error to file if possible
8182 | error_entry = {
8183 | "step": 0,
8184 | "success": False,
8185 | "error": f"Autopilot critical failure: {autopilot_err}",
8186 | }
8187 | if log_path:
8188 | try:
8189 | log_entry = json.dumps(error_entry, default=str) + "\n"
8190 | async with aiofiles.open(log_path, "a", encoding="utf-8") as log_f:
8191 | await log_f.write(log_entry)
8192 | except Exception as final_log_e:
8193 | logger.error(
8194 | f"Failed to write final critical error log to {log_path}: {final_log_e}"
8195 | )
8196 | # Raise ToolError to indicate autopilot failure
8197 | raise ToolError(f"Autopilot failed critically: {autopilot_err}") from autopilot_err
8198 |
8199 | # --- Run with Timeout ---
8200 | try:
8201 | result = await asyncio.wait_for(autopilot_inner(), timeout=timeout_seconds)
8202 | return result
8203 | except asyncio.TimeoutError:
8204 | error_msg = f"Autopilot execution timed out after {timeout_seconds}s."
8205 | logger.error(error_msg)
8206 | await _log("autopilot_timeout", task=task, timeout=timeout_seconds)
8207 | # Log timeout to file if possible
8208 | if log_path:
8209 | try:
8210 | timeout_entry = {"step": -1, "success": False, "error": error_msg}
8211 | log_entry = json.dumps(timeout_entry, default=str) + "\n"
8212 | async with aiofiles.open(log_path, "a", encoding="utf-8") as log_f:
8213 | await log_f.write(log_entry)
8214 | except Exception as timeout_log_e:
8215 | logger.error(f"Failed to write timeout log entry to {log_path}: {timeout_log_e}")
8216 | # Return timeout failure
8217 | return {
8218 | "success": False,
8219 | "error": error_msg,
8220 | "steps_executed": -1, # Indicate timeout before completion
8221 | "run_log": str(log_path) if log_path else None,
8222 | "final_results": [], # No final results available on timeout
8223 | }
8224 |
```
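
The autopilot tail above combines two reusable patterns: appending one JSON object per step to a JSONL run log via `aiofiles`, and wrapping the whole inner coroutine in `asyncio.wait_for` so a timeout comes back as a structured failure dict rather than an unhandled exception. The following is a minimal, self-contained sketch of those two patterns only; `append_jsonl` and `run_with_timeout` are hypothetical names invented for illustration and are not part of this repository's API.

```python
# Minimal sketch (hypothetical helpers, not the project's actual API): shows the
# same two patterns the autopilot uses above -- a JSONL step log appended with
# aiofiles, and an inner coroutine wrapped in asyncio.wait_for so a timeout is
# reported as a structured failure dict instead of propagating an exception.
import asyncio
import json
from pathlib import Path

import aiofiles  # same async file library the autopilot uses for its run log


async def append_jsonl(log_path: Path, entry: dict) -> None:
    """Append one JSON object per line; default=str handles non-serializable values."""
    line = json.dumps(entry, default=str) + "\n"
    async with aiofiles.open(log_path, "a", encoding="utf-8") as f:
        await f.write(line)


async def run_with_timeout(task: str, timeout_seconds: float, log_path: Path) -> dict:
    async def inner() -> dict:
        # Stand-in for the plan/execute/replan loop; here it just logs one step.
        step_log = {"step": 1, "tool": "noop", "task": task, "success": True}
        await append_jsonl(log_path, step_log)
        return {"success": True, "steps_executed": 1, "run_log": str(log_path)}

    try:
        return await asyncio.wait_for(inner(), timeout=timeout_seconds)
    except asyncio.TimeoutError:
        error_msg = f"Execution timed out after {timeout_seconds}s."
        await append_jsonl(log_path, {"step": -1, "success": False, "error": error_msg})
        return {
            "success": False,
            "error": error_msg,
            "steps_executed": -1,  # mirrors the sentinel the autopilot returns on timeout
            "run_log": str(log_path),
        }


if __name__ == "__main__":
    print(asyncio.run(run_with_timeout("demo task", 5.0, Path("autopilot_run.jsonl"))))
```

The design choice mirrored here is that both paths (success and timeout) return the same dict shape, so callers of the tool never need to distinguish "raised" from "failed"; the `-1` step sentinel and the JSONL log entry keep the timeout visible in the run log as well.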