#
tokens: 67628/50000 1/207 files (page 43/45)
lines: on (toggle) GitHub
raw markdown copy reset
This is page 43 of 45. Use http://codebase.md/dicklesworthstone/llm_gateway_mcp_server?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .cursorignore
├── .env.example
├── .envrc
├── .gitignore
├── additional_features.md
├── check_api_keys.py
├── completion_support.py
├── comprehensive_test.py
├── docker-compose.yml
├── Dockerfile
├── empirically_measured_model_speeds.json
├── error_handling.py
├── example_structured_tool.py
├── examples
│   ├── __init__.py
│   ├── advanced_agent_flows_using_unified_memory_system_demo.py
│   ├── advanced_extraction_demo.py
│   ├── advanced_unified_memory_system_demo.py
│   ├── advanced_vector_search_demo.py
│   ├── analytics_reporting_demo.py
│   ├── audio_transcription_demo.py
│   ├── basic_completion_demo.py
│   ├── cache_demo.py
│   ├── claude_integration_demo.py
│   ├── compare_synthesize_demo.py
│   ├── cost_optimization.py
│   ├── data
│   │   ├── sample_event.txt
│   │   ├── Steve_Jobs_Introducing_The_iPhone_compressed.md
│   │   └── Steve_Jobs_Introducing_The_iPhone_compressed.mp3
│   ├── docstring_refiner_demo.py
│   ├── document_conversion_and_processing_demo.py
│   ├── entity_relation_graph_demo.py
│   ├── filesystem_operations_demo.py
│   ├── grok_integration_demo.py
│   ├── local_text_tools_demo.py
│   ├── marqo_fused_search_demo.py
│   ├── measure_model_speeds.py
│   ├── meta_api_demo.py
│   ├── multi_provider_demo.py
│   ├── ollama_integration_demo.py
│   ├── prompt_templates_demo.py
│   ├── python_sandbox_demo.py
│   ├── rag_example.py
│   ├── research_workflow_demo.py
│   ├── sample
│   │   ├── article.txt
│   │   ├── backprop_paper.pdf
│   │   ├── buffett.pdf
│   │   ├── contract_link.txt
│   │   ├── legal_contract.txt
│   │   ├── medical_case.txt
│   │   ├── northwind.db
│   │   ├── research_paper.txt
│   │   ├── sample_data.json
│   │   └── text_classification_samples
│   │       ├── email_classification.txt
│   │       ├── news_samples.txt
│   │       ├── product_reviews.txt
│   │       └── support_tickets.txt
│   ├── sample_docs
│   │   └── downloaded
│   │       └── attention_is_all_you_need.pdf
│   ├── sentiment_analysis_demo.py
│   ├── simple_completion_demo.py
│   ├── single_shot_synthesis_demo.py
│   ├── smart_browser_demo.py
│   ├── sql_database_demo.py
│   ├── sse_client_demo.py
│   ├── test_code_extraction.py
│   ├── test_content_detection.py
│   ├── test_ollama.py
│   ├── text_classification_demo.py
│   ├── text_redline_demo.py
│   ├── tool_composition_examples.py
│   ├── tournament_code_demo.py
│   ├── tournament_text_demo.py
│   ├── unified_memory_system_demo.py
│   ├── vector_search_demo.py
│   ├── web_automation_instruction_packs.py
│   └── workflow_delegation_demo.py
├── LICENSE
├── list_models.py
├── marqo_index_config.json.example
├── mcp_protocol_schema_2025-03-25_version.json
├── mcp_python_lib_docs.md
├── mcp_tool_context_estimator.py
├── model_preferences.py
├── pyproject.toml
├── quick_test.py
├── README.md
├── resource_annotations.py
├── run_all_demo_scripts_and_check_for_errors.py
├── storage
│   └── smart_browser_internal
│       ├── locator_cache.db
│       ├── readability.js
│       └── storage_state.enc
├── test_client.py
├── test_connection.py
├── TEST_README.md
├── test_sse_client.py
├── test_stdio_client.py
├── tests
│   ├── __init__.py
│   ├── conftest.py
│   ├── integration
│   │   ├── __init__.py
│   │   └── test_server.py
│   ├── manual
│   │   ├── test_extraction_advanced.py
│   │   └── test_extraction.py
│   └── unit
│       ├── __init__.py
│       ├── test_cache.py
│       ├── test_providers.py
│       └── test_tools.py
├── TODO.md
├── tool_annotations.py
├── tools_list.json
├── ultimate_mcp_banner.webp
├── ultimate_mcp_logo.webp
├── ultimate_mcp_server
│   ├── __init__.py
│   ├── __main__.py
│   ├── cli
│   │   ├── __init__.py
│   │   ├── __main__.py
│   │   ├── commands.py
│   │   ├── helpers.py
│   │   └── typer_cli.py
│   ├── clients
│   │   ├── __init__.py
│   │   ├── completion_client.py
│   │   └── rag_client.py
│   ├── config
│   │   └── examples
│   │       └── filesystem_config.yaml
│   ├── config.py
│   ├── constants.py
│   ├── core
│   │   ├── __init__.py
│   │   ├── evaluation
│   │   │   ├── base.py
│   │   │   └── evaluators.py
│   │   ├── providers
│   │   │   ├── __init__.py
│   │   │   ├── anthropic.py
│   │   │   ├── base.py
│   │   │   ├── deepseek.py
│   │   │   ├── gemini.py
│   │   │   ├── grok.py
│   │   │   ├── ollama.py
│   │   │   ├── openai.py
│   │   │   └── openrouter.py
│   │   ├── server.py
│   │   ├── state_store.py
│   │   ├── tournaments
│   │   │   ├── manager.py
│   │   │   ├── tasks.py
│   │   │   └── utils.py
│   │   └── ums_api
│   │       ├── __init__.py
│   │       ├── ums_database.py
│   │       ├── ums_endpoints.py
│   │       ├── ums_models.py
│   │       └── ums_services.py
│   ├── exceptions.py
│   ├── graceful_shutdown.py
│   ├── services
│   │   ├── __init__.py
│   │   ├── analytics
│   │   │   ├── __init__.py
│   │   │   ├── metrics.py
│   │   │   └── reporting.py
│   │   ├── cache
│   │   │   ├── __init__.py
│   │   │   ├── cache_service.py
│   │   │   ├── persistence.py
│   │   │   ├── strategies.py
│   │   │   └── utils.py
│   │   ├── cache.py
│   │   ├── document.py
│   │   ├── knowledge_base
│   │   │   ├── __init__.py
│   │   │   ├── feedback.py
│   │   │   ├── manager.py
│   │   │   ├── rag_engine.py
│   │   │   ├── retriever.py
│   │   │   └── utils.py
│   │   ├── prompts
│   │   │   ├── __init__.py
│   │   │   ├── repository.py
│   │   │   └── templates.py
│   │   ├── prompts.py
│   │   └── vector
│   │       ├── __init__.py
│   │       ├── embeddings.py
│   │       └── vector_service.py
│   ├── tool_token_counter.py
│   ├── tools
│   │   ├── __init__.py
│   │   ├── audio_transcription.py
│   │   ├── base.py
│   │   ├── completion.py
│   │   ├── docstring_refiner.py
│   │   ├── document_conversion_and_processing.py
│   │   ├── enhanced-ums-lookbook.html
│   │   ├── entity_relation_graph.py
│   │   ├── excel_spreadsheet_automation.py
│   │   ├── extraction.py
│   │   ├── filesystem.py
│   │   ├── html_to_markdown.py
│   │   ├── local_text_tools.py
│   │   ├── marqo_fused_search.py
│   │   ├── meta_api_tool.py
│   │   ├── ocr_tools.py
│   │   ├── optimization.py
│   │   ├── provider.py
│   │   ├── pyodide_boot_template.html
│   │   ├── python_sandbox.py
│   │   ├── rag.py
│   │   ├── redline-compiled.css
│   │   ├── sentiment_analysis.py
│   │   ├── single_shot_synthesis.py
│   │   ├── smart_browser.py
│   │   ├── sql_databases.py
│   │   ├── text_classification.py
│   │   ├── text_redline_tools.py
│   │   ├── tournament.py
│   │   ├── ums_explorer.html
│   │   └── unified_memory_system.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── async_utils.py
│   │   ├── display.py
│   │   ├── logging
│   │   │   ├── __init__.py
│   │   │   ├── console.py
│   │   │   ├── emojis.py
│   │   │   ├── formatter.py
│   │   │   ├── logger.py
│   │   │   ├── panels.py
│   │   │   ├── progress.py
│   │   │   └── themes.py
│   │   ├── parse_yaml.py
│   │   ├── parsing.py
│   │   ├── security.py
│   │   └── text.py
│   └── working_memory_api.py
├── unified_memory_system_technical_analysis.md
└── uv.lock
```

# Files

--------------------------------------------------------------------------------
/ultimate_mcp_server/tools/document_conversion_and_processing.py:
--------------------------------------------------------------------------------

```python
   1 | # ultimate_mcp_server/tools/document_conversion_and_processing.py
   2 | """Standalone Document Processing Toolkit functions for MCP Server.
   3 | 
   4 | A comprehensive, fault-tolerant toolkit for document processing, providing:
   5 | (Functionality remains the same as the original class docstring)
   6 | """
   7 | 
   8 | ###############################################################################
   9 | # Imports                                                                     #
  10 | ###############################################################################
  11 | # Standard library imports
  12 | import asyncio
  13 | import base64
  14 | import csv
  15 | import functools
  16 | import hashlib
  17 | import html
  18 | import io
  19 | import json
  20 | import math
  21 | import os
  22 | import re
  23 | import tempfile
  24 | import textwrap
  25 | import time
  26 | from contextlib import contextmanager
  27 | from io import StringIO
  28 | from pathlib import Path
  29 | from typing import (
  30 |     TYPE_CHECKING,
  31 |     Any,
  32 |     Awaitable,
  33 |     Callable,
  34 |     Dict,
  35 |     List,
  36 |     Optional,
  37 |     Pattern,
  38 |     Sequence,
  39 |     Set,
  40 |     Tuple,
  41 |     Union,
  42 | )
  43 | 
  44 | # Third-party imports
  45 | import html2text
  46 | from bs4 import BeautifulSoup, Tag
  47 | from rapidfuzz import fuzz
  48 | 
  49 | # Local application imports
  50 | from ultimate_mcp_server.constants import Provider
  51 | from ultimate_mcp_server.exceptions import ProviderError, ToolError, ToolInputError
  52 | from ultimate_mcp_server.tools.base import (
  53 |     with_error_handling,
  54 |     with_retry,
  55 |     with_tool_metrics,
  56 | )
  57 | from ultimate_mcp_server.tools.completion import generate_completion
  58 | from ultimate_mcp_server.utils import get_logger
  59 | 
  60 | # Type checking imports
  61 | if TYPE_CHECKING:
  62 |     import numpy as np
  63 |     import pandas as pd
  64 |     import tiktoken
  65 |     from docling.datamodel.pipeline_options import AcceleratorDevice as _AcceleratorDeviceType
  66 |     from docling_core.types.doc import DoclingDocument as _DoclingDocumentType
  67 |     from docling_core.types.doc import ImageRefMode as _ImageRefModeType
  68 |     from PIL import Image as PILImage
  69 |     from tiktoken import Encoding
  70 | 
  71 | # ───────────────────── Optional Dependency Check & Initialization ───────────────────
  72 | _DOCLING_AVAILABLE = False
  73 | try:
  74 |     from docling.datamodel.base_models import InputFormat
  75 |     from docling.datamodel.pipeline_options import (
  76 |         AcceleratorDevice,
  77 |         AcceleratorOptions,
  78 |         PdfPipelineOptions,
  79 |     )
  80 |     from docling.document_converter import DocumentConverter, PdfFormatOption
  81 |     from docling_core.types.doc import DoclingDocument, ImageRefMode
  82 | 
  83 |     _DOCLING_AVAILABLE = True
  84 |     _DoclingDocumentType = DoclingDocument
  85 |     _ImageRefModeType = ImageRefMode
  86 |     _AcceleratorDeviceType = AcceleratorDevice
  87 | 
  88 | except ImportError:
  89 |     _DoclingDocumentType = Any
  90 |     _ImageRefModeType = Any
  91 |     _AcceleratorDeviceType = Any
  92 |     InputFormat = None
  93 |     AcceleratorDevice = None
  94 |     AcceleratorOptions = None
  95 |     PdfPipelineOptions = None
  96 |     DocumentConverter = None
  97 |     PdfFormatOption = None
  98 |     pass
  99 | 
 100 | _PANDAS_AVAILABLE = False
 101 | try:
 102 |     import pandas as pd
 103 | 
 104 |     _PANDAS_AVAILABLE = True
 105 | except ModuleNotFoundError:
 106 |     pd = None
 107 | 
 108 | _TIKTOKEN_AVAILABLE = False
 109 | try:
 110 |     import tiktoken
 111 | 
 112 |     _TIKTOKEN_AVAILABLE = True
 113 | except ModuleNotFoundError:
 114 |     tiktoken = None
 115 | 
 116 | _PYPDF2_AVAILABLE = False
 117 | try:
 118 |     import PyPDF2
 119 | 
 120 |     _PYPDF2_AVAILABLE = True
 121 | except ImportError:
 122 |     PyPDF2 = None
 123 | 
 124 | _DOCX_AVAILABLE = False
 125 | try:
 126 |     import docx
 127 | 
 128 |     _DOCX_AVAILABLE = True
 129 | except ImportError:
 130 |     docx = None
 131 | 
 132 | _NUMPY_AVAILABLE = False
 133 | try:
 134 |     import numpy as np
 135 | 
 136 |     _NUMPY_AVAILABLE = True
 137 | except ImportError:
 138 |     np = None
 139 | 
 140 | _PIL_AVAILABLE = False
 141 | try:
 142 |     from PIL import Image, ImageEnhance, ImageFilter
 143 | 
 144 |     _PIL_AVAILABLE = True
 145 | except ImportError:
 146 |     Image, ImageEnhance, ImageFilter = None, None, None
 147 | 
 148 | _CV2_AVAILABLE = False
 149 | try:
 150 |     import cv2
 151 | 
 152 |     _CV2_AVAILABLE = True
 153 | except ImportError:
 154 |     cv2 = None
 155 | 
 156 | _PYTESSERACT_AVAILABLE = False
 157 | try:
 158 |     import pytesseract
 159 | 
 160 |     _PYTESSERACT_AVAILABLE = True
 161 | except ImportError:
 162 |     pytesseract = None
 163 | 
 164 | _PDF2IMAGE_AVAILABLE = False
 165 | try:
 166 |     from pdf2image import convert_from_bytes, convert_from_path
 167 | 
 168 |     _PDF2IMAGE_AVAILABLE = True
 169 | except ImportError:
 170 |     convert_from_bytes, convert_from_path = None, None
 171 | 
 172 | _PDFPLUMBER_AVAILABLE = False
 173 | try:
 174 |     import pdfplumber
 175 | 
 176 |     _PDFPLUMBER_AVAILABLE = True
 177 | except ImportError:
 178 |     pdfplumber = None
 179 | 
 180 | _PYMUPDF_AVAILABLE = False
 181 | try:
 182 |     import pymupdf  # PyMuPDF
 183 | 
 184 |     _PYMUPDF_AVAILABLE = True
 185 | except ImportError:
 186 |     pymupdf = None
 187 | 
 188 | _TRAFILATURA_AVAILABLE = False
 189 | try:
 190 |     import trafilatura
 191 | 
 192 |     _TRAFILATURA_AVAILABLE = True
 193 | except ImportError:
 194 |     trafilatura = None
 195 | 
 196 | _READABILITY_AVAILABLE = False
 197 | try:
 198 |     import readability
 199 | 
 200 |     _READABILITY_AVAILABLE = True
 201 | except ImportError:
 202 |     readability = None
 203 | 
 204 | try:
 205 |     from markdownify import markdownify as _markdownify_fallback
 206 | except ModuleNotFoundError:
 207 |     _markdownify_fallback = None
 208 | 
 209 | # ───────────────────── Module Level Logger ─────────────────────────
 210 | logger = get_logger("ultimate_mcp_server.tools.document_processing")
 211 | 
 212 | # ───────────────────── Module Level Config & State ──────────────────
 213 | DEFAULT_EXTRACTION_STRATEGY = "hybrid_direct_ocr"
 214 | _VALID_FORMATS = {"markdown", "text", "html", "json", "doctags"}
 215 | _OCR_COMPATIBLE_FORMATS = {"text", "markdown"}
 216 | _VALID_EXTRACTION_STRATEGIES = {
 217 |     "docling",
 218 |     "direct_text",
 219 |     "ocr",
 220 |     "hybrid_direct_ocr",
 221 | }
 222 | 
 223 | # Acceleration Device Mapping (Docling)
 224 | if _DOCLING_AVAILABLE and AcceleratorDevice:
 225 |     _ACCEL_MAP = {
 226 |         "auto": AcceleratorDevice.AUTO,
 227 |         "cpu": AcceleratorDevice.CPU,
 228 |         "cuda": AcceleratorDevice.CUDA,
 229 |         "mps": AcceleratorDevice.MPS,
 230 |     }
 231 | else:
 232 |     _ACCEL_MAP = {"auto": "auto", "cpu": "cpu", "cuda": "cuda", "mps": "mps"}
 233 | 
 234 | # HTML Detection Patterns
 235 | _RE_FLAGS = re.MULTILINE | re.IGNORECASE
 236 | _HTML_PATTERNS: Sequence[Pattern] = [
 237 |     re.compile(p, _RE_FLAGS)
 238 |     for p in (
 239 |         r"<\s*[a-zA-Z]+[^>]*>",
 240 |         r"<\s*/\s*[a-zA-Z]+\s*>",
 241 |         r"&[a-zA-Z]+;",
 242 |         r"&#[0-9]+;",
 243 |         r"<!\s*DOCTYPE",
 244 |         r"<!\s*--",
 245 |     )
 246 | ]
 247 | 
 248 | # Content Type Patterns (Used by detect_content_type)
 249 | _CONTENT_PATTERNS: Dict[str, List[Tuple[Pattern, float]]] = {
 250 |     "html": [
 251 |         (re.compile(r"<html", re.I), 5.0),
 252 |         (re.compile(r"<head", re.I), 4.0),
 253 |         (re.compile(r"<body", re.I), 4.0),
 254 |         (re.compile(r"</(div|p|span|a|li)>", re.I), 1.0),
 255 |         (re.compile(r"<[a-z][a-z0-9]*\s+[^>]*>", re.I), 0.8),
 256 |         (re.compile(r"<!DOCTYPE", re.I), 5.0),
 257 |         (re.compile(r"&\w+;"), 0.5),
 258 |     ],
 259 |     "markdown": [
 260 |         (re.compile(r"^#{1,6}\s+", re.M), 4.0),
 261 |         (re.compile(r"^\s*[-*+]\s+", re.M), 2.0),
 262 |         (re.compile(r"^\s*\d+\.\s+", re.M), 2.0),
 263 |         (re.compile(r"`[^`]+`"), 1.5),
 264 |         (re.compile(r"^```", re.M), 5.0),
 265 |         (re.compile(r"\*{1,2}[^*\s]+?\*{1,2}"), 1.0),
 266 |         (re.compile(r"!\[.*?\]\(.*?\)", re.M), 3.0),
 267 |         (re.compile(r"\[.*?\]\(.*?\)", re.M), 2.5),
 268 |         (re.compile(r"^>.*", re.M), 2.0),
 269 |         (re.compile(r"^-{3,}$", re.M), 3.0),
 270 |     ],
 271 |     "code": [
 272 |         (re.compile(r"def\s+\w+\(.*\):"), 3.0),
 273 |         (re.compile(r"class\s+\w+"), 3.0),
 274 |         (re.compile(r"import\s+|from\s+"), 3.0),
 275 |         (
 276 |             re.compile(r"((function\s+\w+\(|const|let|var)\s*.*?=>|\b(document|console|window)\.)"),
 277 |             3.0,
 278 |         ),
 279 |         (re.compile(r"public\s+|private\s+|static\s+"), 2.5),
 280 |         (re.compile(r"#include"), 3.0),
 281 |         (re.compile(r"<\?php"), 4.0),
 282 |         (re.compile(r"console\.log"), 2.0),
 283 |         (re.compile(r";\s*$"), 1.0),
 284 |         (re.compile(r"\b(var|let|const|int|float|string|bool)\b"), 1.5),
 285 |         (re.compile(r"//.*$"), 1.0),
 286 |         (re.compile(r"/\*.*?\*/", re.S), 1.5),
 287 |     ],
 288 | }
 289 | _LANG_PATTERNS: List[Tuple[Pattern, str]] = [
 290 |     (re.compile(r"(def\s+\w+\(.*?\):|import\s+|from\s+\S+\s+import)"), "python"),
 291 |     (
 292 |         re.compile(r"((function\s+\w+\(|const|let|var)\s*.*?=>|\b(document|console|window)\.)"),
 293 |         "javascript",
 294 |     ),
 295 |     (re.compile(r"<(\w+)(.*?)>.*?</\1>", re.S), "html"),
 296 |     (re.compile(r"<\?php"), "php"),
 297 |     (re.compile(r"(public|private|protected)\s+(static\s+)?(void|int|String)"), "java"),
 298 |     (re.compile(r"#include\s+<"), "c/c++"),
 299 |     (re.compile(r"using\s+System;"), "c#"),
 300 |     (re.compile(r"(SELECT|INSERT|UPDATE|DELETE)\s+.*FROM", re.I), "sql"),
 301 |     (re.compile(r":\s+\w+\s*\{"), "css"),
 302 |     (re.compile(r"^[^:]+:\s* # YAML key-value", re.M | re.X), "yaml"),
 303 |     (re.compile(r"\$\w+"), "shell/bash"),
 304 | ]
 305 | 
 306 | # Markdown processing regex
 307 | _BULLET_RX = re.compile(r"^[•‣▪◦‧﹒∙·] ?", re.MULTILINE)
 308 | 
 309 | # Lazy Loading State
 310 | _tiktoken_enc_instance: Union["Encoding", bool, None] = None
 311 | 
 312 | # OCR Caching (Simple in-memory - can be extended)
 313 | _OCR_CACHE: Dict[str, Any] = {}
 314 | 
 315 | # Domain Rules and Compiled Regex (Loaded Lazily)
 316 | _DOMAIN_RULES_CACHE: Optional[Dict] = None
 317 | _ACTIVE_DOMAIN: Optional[str] = None
 318 | _BOUND_RX: Optional[re.Pattern] = None
 319 | _CUSTOM_SECT_RX: Optional[List[Tuple[re.Pattern, str]]] = None
 320 | _METRIC_RX: Optional[List[Tuple[str, re.Pattern]]] = None
 321 | _RISK_RX: Optional[Dict[str, re.Pattern]] = None
 322 | _DOC_LABELS: Optional[List[str]] = None
 323 | _CLASS_PROMPT_PREFIX: Optional[str] = None
 324 | 
 325 | ###############################################################################
 326 | # Utility & Helper Functions (Private Module Level)                           #
 327 | ###############################################################################
 328 | 
 329 | 
 330 | def _log_dependency_warnings():
 331 |     """Logs warnings for missing optional dependencies on first use."""
 332 |     if not _DOCLING_AVAILABLE:
 333 |         logger.warning(
 334 |             "Docling library not available. Advanced PDF/Office conversion features disabled."
 335 |         )
 336 |     if not _PYPDF2_AVAILABLE:
 337 |         logger.warning("PyPDF2 not available. Basic PDF fallback conversion disabled.")
 338 |     if not _DOCX_AVAILABLE:
 339 |         logger.warning("python-docx not available. Basic DOCX fallback conversion disabled.")
 340 |     if not _PANDAS_AVAILABLE:
 341 |         logger.warning("Pandas not available. Pandas output format for tables disabled.")
 342 |     if not _TIKTOKEN_AVAILABLE:
 343 |         logger.warning(
 344 |             "Tiktoken not available. Token-based chunking will fallback to character chunking."
 345 |         )
 346 |     ocr_deps = {
 347 |         "Pillow": _PIL_AVAILABLE,
 348 |         "numpy": _NUMPY_AVAILABLE,
 349 |         "opencv-python": _CV2_AVAILABLE,
 350 |         "pytesseract": _PYTESSERACT_AVAILABLE,
 351 |         "pdf2image": _PDF2IMAGE_AVAILABLE,
 352 |     }
 353 |     missing_ocr = [name for name, avail in ocr_deps.items() if not avail]
 354 |     if missing_ocr:
 355 |         logger.warning(
 356 |             f"Missing OCR dependencies: {', '.join(missing_ocr)}. OCR functionality limited/disabled."
 357 |         )
 358 |     if not _PDFPLUMBER_AVAILABLE and not _PYMUPDF_AVAILABLE:
 359 |         logger.warning(
 360 |             "Missing direct PDF text extraction libraries (pdfplumber/pymupdf). Direct text extraction disabled."
 361 |         )
 362 |     elif not _PDFPLUMBER_AVAILABLE:
 363 |         logger.warning(
 364 |             "pdfplumber not available. Will rely solely on PyMuPDF for direct text extraction."
 365 |         )
 366 |     elif not _PYMUPDF_AVAILABLE:
 367 |         logger.warning(
 368 |             "PyMuPDF not available. Will rely solely on pdfplumber for direct text extraction."
 369 |         )
 370 |     if not _TRAFILATURA_AVAILABLE:
 371 |         logger.warning("Trafilatura not installed. Trafilatura HTML extraction disabled.")
 372 |     if not _READABILITY_AVAILABLE:
 373 |         logger.warning("Readability-lxml not installed. Readability HTML extraction disabled.")
 374 |     if not _markdownify_fallback:
 375 |         logger.warning("Markdownify not installed. HTML to Markdown fallback disabled.")
 376 | 
 377 | 
 378 | # Call once on import to log status
 379 | _log_dependency_warnings()
 380 | 
 381 | 
 382 | def _load_and_compile_domain_rules():
 383 |     """Loads domain rules from config and compiles regex patterns."""
 384 |     global _DOMAIN_RULES_CACHE, _ACTIVE_DOMAIN, _BOUND_RX, _CUSTOM_SECT_RX
 385 |     global _METRIC_RX, _RISK_RX, _DOC_LABELS, _CLASS_PROMPT_PREFIX
 386 | 
 387 |     if _DOMAIN_RULES_CACHE is not None:  # Already loaded
 388 |         return
 389 | 
 390 |     logger.debug("Lazily loading and compiling domain rules...")
 391 |     default_rules = {
 392 |         "generic": {
 393 |             "classification": {
 394 |                 "labels": ["Report", "Contract", "Presentation", "Memo", "Email", "Manual"],
 395 |                 "prompt_prefix": "Classify the document into exactly one of: ",
 396 |             },
 397 |             "sections": {
 398 |                 "boundary_regex": r"^\s*(chapter\s+\d+|section\s+\d+|[A-Z][A-Za-z\s]{3,80})$",
 399 |                 "custom": [],
 400 |             },
 401 |             "metrics": {
 402 |                 "metric_1": {"aliases": ["metric one", "m1"]},
 403 |                 "metric_2": {"aliases": ["metric two", "m2"]},
 404 |             },
 405 |             "risks": {"Risk_A": r"risk a", "Risk_B": r"risk b"},
 406 |         },
 407 |         "finance": {
 408 |             "classification": {
 409 |                 "labels": [
 410 |                     "10-K",
 411 |                     "Credit Agreement",
 412 |                     "Investor Deck",
 413 |                     "Press Release",
 414 |                     "Board Minutes",
 415 |                     "NDA",
 416 |                     "LPA",
 417 |                     "CIM",
 418 |                 ],
 419 |                 "prompt_prefix": "Identify the document type (finance domain): ",
 420 |             },
 421 |             "sections": {
 422 |                 "boundary_regex": r"^\s*(item\s+\d+[a-z]?\.|[A-Z][A-Za-z\s]{3,80})$",
 423 |                 "custom": [
 424 |                     {"regex": r"item\s+1a?\.? .*business", "label": "Business"},
 425 |                     {"regex": r"item\s+1a\.? .*risk factors", "label": "Risk Factors"},
 426 |                     {"regex": r"item\s+7\.? .*management'?s discussion", "label": "MD&A"},
 427 |                     {"regex": r"covena[nv]ts", "label": "Covenants"},
 428 |                 ],
 429 |             },
 430 |             "metrics": {
 431 |                 "revenue": {
 432 |                     "aliases": ["revenue", "net sales", "total sales", "sales revenue", "turnover"]
 433 |                 },
 434 |                 "ebitda": {
 435 |                     "aliases": ["ebitda", "adj. ebitda", "operating profit", "operating income"]
 436 |                 },
 437 |                 "gross_profit": {"aliases": ["gross profit"]},
 438 |                 "net_income": {"aliases": ["net income", "net profit", "earnings"]},
 439 |                 "capex": {"aliases": ["capital expenditures", "capex"]},
 440 |                 "debt": {"aliases": ["total debt", "net debt", "long-term debt"]},
 441 |             },
 442 |             "risks": {
 443 |                 "Change_of_Control": r"change\s+of\s+control",
 444 |                 "ESG_Risk": r"(child\s+labor|environmental\s+violation|scope\s+3)",
 445 |                 "PII": r"(\bSSN\b|social security number|passport no)",
 446 |             },
 447 |         },
 448 |         "legal": {
 449 |             "classification": {
 450 |                 "labels": ["Contract", "NDA", "Lease", "Policy", "License", "Settlement"],
 451 |                 "prompt_prefix": "Classify the legal document into exactly one of: ",
 452 |             },
 453 |             "sections": {
 454 |                 "boundary_regex": r"^\s*(article\s+\d+|section\s+\d+|[A-Z][A-Za-z\s]{3,80})$",
 455 |                 "custom": [
 456 |                     {"regex": r"definitions", "label": "Definitions"},
 457 |                     {"regex": r"termination", "label": "Termination"},
 458 |                     {"regex": r"confidentiality", "label": "Confidentiality"},
 459 |                 ],
 460 |             },
 461 |             "metrics": {},
 462 |             "risks": {
 463 |                 "Indemnity": r"indemnif(y|ication)",
 464 |                 "Liquidated_Damages": r"liquidated damages",
 465 |                 "Governing_Law_NY": r"governing law.*new york",
 466 |                 "Governing_Law_DE": r"governing law.*delaware",
 467 |             },
 468 |         },
 469 |         "medical": {
 470 |             "classification": {
 471 |                 "labels": [
 472 |                     "Clinical Study",
 473 |                     "Patient Report",
 474 |                     "Lab Results",
 475 |                     "Prescription",
 476 |                     "Care Plan",
 477 |                 ],
 478 |                 "prompt_prefix": "Classify the medical document: ",
 479 |             },
 480 |             "sections": {
 481 |                 "boundary_regex": r"^\s*(section\s+\d+|[A-Z][A-Za-z\s]{3,80})$",
 482 |                 "custom": [
 483 |                     {"regex": r"diagnosis", "label": "Diagnosis"},
 484 |                     {"regex": r"treatment", "label": "Treatment"},
 485 |                     {"regex": r"medications", "label": "Medications"},
 486 |                     {"regex": r"allergies", "label": "Allergies"},
 487 |                 ],
 488 |             },
 489 |             "metrics": {
 490 |                 "blood_pressure": {"aliases": ["blood pressure", "bp"]},
 491 |                 "heart_rate": {"aliases": ["heart rate", "hr"]},
 492 |                 "temperature": {"aliases": ["temperature", "temp"]},
 493 |                 "bmi": {"aliases": ["bmi", "body mass index"]},
 494 |             },
 495 |             "risks": {
 496 |                 "Allergy": r"allergic reaction",
 497 |                 "Contraindication": r"contraindicat(ed|ion)",
 498 |                 "Adverse_Event": r"adverse event",
 499 |             },
 500 |         },
 501 |     }
 502 |     _DOMAIN_RULES_CACHE = default_rules
 503 | 
 504 |     _ACTIVE_DOMAIN = os.getenv("DOC_DOMAIN", "generic")
 505 |     # Config loading if needed:
 506 |     # from ultimate_mcp_server.config import get_config
 507 |     # try:
 508 |     #     cfg = get_config()
 509 |     #     _ACTIVE_DOMAIN = cfg.document_processing.domain if cfg and hasattr(cfg, 'document_processing') else "generic"
 510 |     # except Exception as e:
 511 |     #     logger.warning(f"Failed to load document processing domain from config: {e}. Defaulting to 'generic'.")
 512 |     #     _ACTIVE_DOMAIN = "generic"
 513 | 
 514 |     if _ACTIVE_DOMAIN not in _DOMAIN_RULES_CACHE:
 515 |         logger.warning(f"Unknown DOC_DOMAIN '{_ACTIVE_DOMAIN}', defaulting to 'generic'.")
 516 |         _ACTIVE_DOMAIN = "generic"
 517 | 
 518 |     instruction_json = _DOMAIN_RULES_CACHE[_ACTIVE_DOMAIN]
 519 | 
 520 |     try:
 521 |         _BOUND_RX = re.compile(instruction_json["sections"].get("boundary_regex", r"$^"), re.M)
 522 |     except re.error as e:
 523 |         logger.error(f"Invalid boundary regex for domain {_ACTIVE_DOMAIN}: {e}")
 524 |         _BOUND_RX = re.compile(r"$^")
 525 | 
 526 |     _CUSTOM_SECT_RX = []
 527 |     for d in instruction_json["sections"].get("custom", []):
 528 |         try:
 529 |             _CUSTOM_SECT_RX.append((re.compile(d["regex"], re.I), d["label"]))
 530 |         except re.error as e:
 531 |             logger.error(
 532 |                 f"Invalid custom section regex '{d['regex']}' for domain {_ACTIVE_DOMAIN}: {e}"
 533 |             )
 534 | 
 535 |     _METRIC_RX = []
 536 |     for key, cfg in instruction_json.get("metrics", {}).items():
 537 |         aliases = cfg.get("aliases", [])
 538 |         if aliases:
 539 |             try:
 540 |                 sorted_aliases = sorted(aliases, key=len, reverse=True)
 541 |                 joined = "|".join(re.escape(a) for a in sorted_aliases)
 542 |                 if joined:
 543 |                     pattern = re.compile(
 544 |                         rf"""(?i)\b({joined})\b[\s:–-]*([$€£]?\s?-?\d[\d,.]*)""",
 545 |                         re.VERBOSE | re.MULTILINE,
 546 |                     )
 547 |                     _METRIC_RX.append((key, pattern))
 548 |             except re.error as e:
 549 |                 logger.error(
 550 |                     f"Invalid metric regex for alias group '{key}' in domain {_ACTIVE_DOMAIN}: {e}"
 551 |                 )
 552 | 
 553 |     _RISK_RX = {}
 554 |     for t, pat_str in instruction_json.get("risks", {}).items():
 555 |         try:
 556 |             _RISK_RX[t] = re.compile(pat_str, re.I)
 557 |         except re.error as e:
 558 |             logger.error(
 559 |                 f"Invalid risk regex for '{t}' in domain {_ACTIVE_DOMAIN}: '{pat_str}'. Error: {e}"
 560 |             )
 561 | 
 562 |     _DOC_LABELS = instruction_json["classification"].get("labels", [])
 563 |     _CLASS_PROMPT_PREFIX = instruction_json["classification"].get("prompt_prefix", "")
 564 |     logger.info(f"Domain rules loaded and compiled for domain: '{_ACTIVE_DOMAIN}'")
 565 | 
 566 | 
 567 | def _get_active_domain_rules():
 568 |     """Ensures domain rules are loaded and returns them."""
 569 |     if _DOMAIN_RULES_CACHE is None:
 570 |         _load_and_compile_domain_rules()
 571 |     return {
 572 |         "active_domain": _ACTIVE_DOMAIN,
 573 |         "bound_rx": _BOUND_RX,
 574 |         "custom_sect_rx": _CUSTOM_SECT_RX,
 575 |         "metric_rx": _METRIC_RX,
 576 |         "risk_rx": _RISK_RX,
 577 |         "doc_labels": _DOC_LABELS,
 578 |         "class_prompt_prefix": _CLASS_PROMPT_PREFIX,
 579 |     }
 580 | 
 581 | 
 582 | def _get_tiktoken_encoder() -> Optional["Encoding"]:
 583 |     """Lazy load and return the tiktoken encoder instance."""
 584 |     global _tiktoken_enc_instance
 585 |     if _tiktoken_enc_instance is not None:
 586 |         return (
 587 |             _tiktoken_enc_instance
 588 |             if isinstance(_tiktoken_enc_instance, tiktoken.Encoding)
 589 |             else None
 590 |         )
 591 |     if not _TIKTOKEN_AVAILABLE:
 592 |         _tiktoken_enc_instance = False
 593 |         return None
 594 |     try:
 595 |         encoding_name = os.getenv("TIKTOKEN_ENCODING", "cl100k_base")
 596 |         logger.info(f"Lazy-loading tiktoken encoding: {encoding_name}")
 597 |         _tiktoken_enc_instance = tiktoken.get_encoding(encoding_name)  # type: ignore
 598 |         logger.info("Successfully lazy-loaded tiktoken encoder.")
 599 |         return _tiktoken_enc_instance  # type: ignore
 600 |     except Exception as e:
 601 |         logger.error(f"Failed to lazy-load tiktoken: {e}", exc_info=True)
 602 |         _tiktoken_enc_instance = False
 603 |         return None
 604 | 
 605 | 
 606 | async def _standalone_llm_call(
 607 |     *,
 608 |     prompt: str,
 609 |     provider: str = Provider.OPENAI.value,
 610 |     model: str | None = None,
 611 |     temperature: float = 0.3,
 612 |     max_tokens: int | None = None,
 613 |     extra: Dict[str, Any] | None = None,
 614 | ) -> str:
 615 |     """Standalone wrapper to make LLM calls using the completion tool."""
 616 |     if not callable(generate_completion):
 617 |         logger.error("LLM generation function 'generate_completion' is not available.")
 618 |         raise ToolError("LLM_UNAVAILABLE", details={"reason": "generate_completion not available"})
 619 | 
 620 |     chosen_provider = provider
 621 |     try:
 622 |         additional_params = extra or {}
 623 |         response_dict = await generate_completion(
 624 |             prompt=prompt,
 625 |             provider=chosen_provider,
 626 |             model=model,
 627 |             temperature=temperature,
 628 |             max_tokens=max_tokens,
 629 |             additional_params=additional_params,
 630 |         )
 631 |         if isinstance(response_dict, dict):
 632 |             if response_dict.get("isError", False) or not response_dict.get("success", True):
 633 |                 err_detail = response_dict.get("error", {})
 634 |                 err_msg = err_detail.get("message", "Unknown LLM Error")
 635 |                 err_code = err_detail.get("type", "LLM_CALL_FAILED")
 636 |                 logger.error(
 637 |                     f"LLM call failed [{err_code}]: {err_msg}. Raw Response: {response_dict}"
 638 |                 )
 639 |                 raise ToolError(
 640 |                     err_code,
 641 |                     details={
 642 |                         "provider": chosen_provider,
 643 |                         "error": err_msg,
 644 |                         "raw_response": str(response_dict),
 645 |                     },
 646 |                 )
 647 |             llm_content = response_dict.get("text") or response_dict.get("content")
 648 |             if llm_content is None:
 649 |                 logger.error(f"LLM response missing 'text'/'content': {response_dict}")
 650 |                 raise ToolError(
 651 |                     "LLM_INVALID_RESPONSE",
 652 |                     details={"reason": "Missing content", "response_received": str(response_dict)},
 653 |                 )
 654 |             if isinstance(llm_content, str):
 655 |                 return llm_content.strip()
 656 |             else:
 657 |                 logger.warning(f"LLM content not string: {type(llm_content)}. Converting.")
 658 |                 return str(llm_content).strip()
 659 |         else:
 660 |             logger.error(f"LLM response unexpected format: {response_dict}")
 661 |             raise ToolError(
 662 |                 "LLM_INVALID_RESPONSE", details={"response_received": str(response_dict)}
 663 |             )
 664 |     except ProviderError as pe:
 665 |         logger.error(f"LLM provider error ({chosen_provider}): {pe}", exc_info=True)
 666 |         raise ToolError(
 667 |             "LLM_PROVIDER_ERROR",
 668 |             details={"provider": chosen_provider, "error_code": pe.error_code, "error": str(pe)},
 669 |         ) from pe
 670 |     except ToolError as te:
 671 |         raise te
 672 |     except Exception as e:
 673 |         logger.error(f"LLM call failed ({chosen_provider}): {e}", exc_info=True)
 674 |         raise ToolError(
 675 |             "LLM_CALL_FAILED", details={"provider": chosen_provider, "error": str(e)}
 676 |         ) from e
 677 | 
 678 | 
 679 | @contextmanager
 680 | def _span(label: str):
 681 |     """Context manager for timing operations (module level)."""
 682 |     st = time.perf_counter()
 683 |     logger.debug(f"Starting span: {label}")
 684 |     try:
 685 |         yield
 686 |     finally:
 687 |         elapsed = time.perf_counter() - st
 688 |         logger.debug(f"Finished span: {label} ({elapsed:.3f}s)")
 689 | 
 690 | 
 691 | def _get_docling_converter(device, threads: int):
 692 |     """Create a Docling DocumentConverter."""
 693 |     if not _DOCLING_AVAILABLE:
 694 |         raise ToolError("DEPENDENCY_MISSING", details={"dependency": "docling"})
 695 |     if (
 696 |         not PdfPipelineOptions
 697 |         or not AcceleratorOptions
 698 |         or not InputFormat
 699 |         or not PdfFormatOption
 700 |         or not DocumentConverter
 701 |     ):
 702 |         raise ToolError(
 703 |             "INTERNAL_ERROR", details={"reason": "Docling partially imported but types missing"}
 704 |         )
 705 |     opts = PdfPipelineOptions()
 706 |     opts.do_ocr = False
 707 |     opts.generate_page_images = False
 708 |     opts.do_table_extraction = True # Explicitly enable table extraction in the pipeline options
 709 |     opts.accelerator_options = AcceleratorOptions(num_threads=threads, device=device)
 710 |     try:
 711 |         converter_options = {InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}
 712 |         return DocumentConverter(format_options=converter_options)
 713 |     except Exception as e:
 714 |         logger.error(f"Failed to initialize Docling DocumentConverter: {e}", exc_info=True)
 715 |         raise ToolError(
 716 |             "INITIALIZATION_FAILED", details={"component": "DocumentConverter", "error": str(e)}
 717 |         ) from e
 718 | 
 719 | 
 720 | def _get_input_path_or_temp(
 721 |     document_path: Optional[str], document_data: Optional[bytes]
 722 | ) -> Tuple[Path, bool]:
 723 |     """Gets a valid Path object for input. Saves data to temp file if needed."""
 724 |     is_temp = False
 725 |     if document_path:
 726 |         path = Path(document_path)
 727 |         if not path.is_file():
 728 |             raise ToolInputError(
 729 |                 f"Input file not found: {document_path}", param_name="document_path"
 730 |             )
 731 |         return path, is_temp
 732 |     elif document_data:
 733 |         try:
 734 |             suffix = ".bin"
 735 |             if document_data.startswith(b"%PDF"):
 736 |                 suffix = ".pdf"
 737 |             elif len(document_data) > 10 and document_data[6:10] in (b"JFIF", b"Exif"):
 738 |                 suffix = ".jpg"
 739 |             elif document_data.startswith(b"\x89PNG\r\n\x1a\n"):
 740 |                 suffix = ".png"
 741 |             elif document_data.startswith((b"II*\x00", b"MM\x00*")):
 742 |                 suffix = ".tiff"
 743 |             elif document_data.startswith(b"PK\x03\x04"):
 744 |                 suffix = ".zip"
 745 |             with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
 746 |                 tmp_file.write(document_data)
 747 |                 path = Path(tmp_file.name)
 748 |             is_temp = True
 749 |             logger.debug(f"Saved input data to temporary file: {path}")
 750 |             return path, is_temp
 751 |         except Exception as e:
 752 |             raise ToolError(
 753 |                 "TEMP_FILE_ERROR",
 754 |                 details={"error": f"Failed to save input data to temporary file: {e}"},
 755 |             ) from e
 756 |     else:
 757 |         raise ToolInputError("Either 'document_path' or 'document_data' must be provided.")
 758 | 
 759 | 
 760 | @contextmanager
 761 | def _handle_temp_file(path: Path, is_temp: bool):
 762 |     """Context manager to clean up temporary file."""
 763 |     try:
 764 |         yield path
 765 |     finally:
 766 |         if is_temp and path.exists():
 767 |             try:
 768 |                 path.unlink()
 769 |                 logger.debug(f"Cleaned up temporary file: {path}")
 770 |             except OSError as e:
 771 |                 logger.warning(f"Failed to delete temporary file {path}: {e}")
 772 | 
 773 | 
 774 | def _tmp_path(src: str, fmt: str) -> Path:
 775 |     """Generate a temporary file path for output."""
 776 |     src_path = Path(src.split("?")[0])
 777 |     stem = src_path.stem or "document"
 778 |     ext = "md" if fmt == "markdown" else fmt
 779 |     timestamp = int(time.time() * 1000)
 780 |     temp_dir = Path(tempfile.gettempdir())
 781 |     temp_dir.mkdir(parents=True, exist_ok=True)
 782 |     return temp_dir / f"{stem}_{timestamp}.{ext}"
 783 | 
 784 | 
 785 | def _get_docling_metadata(doc: Any) -> dict[str, Any]:
 786 |     """Extract metadata from a Docling document."""
 787 |     if not _DOCLING_AVAILABLE or not doc:
 788 |         return {"error": "Docling not available or document object missing"}
 789 |     num_pages = 0
 790 |     try:
 791 |         num_pages = doc.num_pages() if callable(getattr(doc, "num_pages", None)) else 0
 792 |         has_tables = False
 793 |         has_figures = False
 794 |         has_sections = False
 795 |         if hasattr(doc, "pages") and isinstance(doc.pages, list):
 796 |             for page in doc.pages:
 797 |                 if hasattr(page, "content") and page.content:
 798 |                     if (
 799 |                         callable(getattr(page.content, "has_tables", None))
 800 |                         and page.content.has_tables()
 801 |                     ):
 802 |                         has_tables = True
 803 |                     if (
 804 |                         callable(getattr(page.content, "has_figures", None))
 805 |                         and page.content.has_figures()
 806 |                     ):
 807 |                         has_figures = True
 808 |                 if has_tables and has_figures:
 809 |                     break
 810 |         if hasattr(doc, "texts") and isinstance(doc.texts, list):
 811 |             for item in doc.texts:
 812 |                 if hasattr(item, "__class__") and item.__class__.__name__ == "SectionHeaderItem":
 813 |                     has_sections = True
 814 |                     break
 815 |                 elif hasattr(item, "label") and getattr(item, "label", None) == "section_header":
 816 |                     has_sections = True
 817 |                     break
 818 |         return {
 819 |             "num_pages": num_pages,
 820 |             "has_tables": has_tables,
 821 |             "has_figures": has_figures,
 822 |             "has_sections": has_sections,
 823 |         }
 824 |     except Exception as e:
 825 |         logger.warning(f"Docling metadata collection failed: {e}", exc_info=True)
 826 |         return {
 827 |             "num_pages": num_pages,
 828 |             "has_tables": False,
 829 |             "has_figures": False,
 830 |             "has_sections": False,
 831 |             "metadata_error": str(e),
 832 |         }
 833 | 
 834 | 
 835 | def _get_basic_metadata(text_content: str, num_pages: int = 0) -> dict[str, Any]:
 836 |     """Generate basic metadata for non-Docling content."""
 837 |     has_tables = "| --- |" in text_content or "\t" in text_content
 838 |     has_figures = "![" in text_content
 839 |     has_sections = bool(re.search(r"^#{1,6}\s+", text_content, re.M))
 840 |     return {
 841 |         "num_pages": num_pages,
 842 |         "has_tables": has_tables,
 843 |         "has_figures": has_figures,
 844 |         "has_sections": has_sections,
 845 |     }
 846 | 
 847 | 
 848 | def _json(obj: Any) -> str:
 849 |     """Utility to serialize objects to JSON."""
 850 |     return json.dumps(obj, ensure_ascii=False, separators=(",", ":"))
 851 | 
 852 | 
 853 | def _hash(txt: str) -> str:
 854 |     """Generate SHA-1 hash of text."""
 855 |     return hashlib.sha1(txt.encode("utf-8", "ignore")).hexdigest()
 856 | 
 857 | 
 858 | # --- HTML Helpers ---
 859 | def _is_html_fragment(text: str) -> bool:
 860 |     """Check if text contains likely HTML markup using precompiled patterns."""
 861 |     check_len = min(len(text), 5000)
 862 |     sample = text[:check_len]
 863 |     return any(p.search(sample) for p in _HTML_PATTERNS)
 864 | 
 865 | 
 866 | def _best_soup(html_txt: str) -> Tuple[BeautifulSoup, str]:
 867 |     """Try progressively more forgiving parsers; fall back to empty soup."""
 868 |     parsers = ("html.parser", "lxml", "html5lib")
 869 |     last_exception = None
 870 |     for p_name in parsers:
 871 |         try:
 872 |             # Use ModuleNotFoundError for library availability checks
 873 |             return BeautifulSoup(html_txt, p_name), p_name
 874 |         except ModuleNotFoundError:
 875 |             logger.debug(f"HTML parser '{p_name}' not installed, skipping.")
 876 |             continue
 877 |         except Exception as e_parse:
 878 |             last_exception = e_parse
 879 |             logger.debug(f"HTML parsing with '{p_name}' failed: {e_parse}")
 880 |             continue
 881 | 
 882 |     if last_exception:
 883 |         logger.warning(
 884 |             f"All standard HTML parsers failed ({last_exception}), attempting fragment parsing."
 885 |         )
 886 |     wrapped_html = (
 887 |         f"<!DOCTYPE html><html><head><title>Fragment</title></head><body>{html_txt}</body></html>"
 888 |     )
 889 |     try:
 890 |         return BeautifulSoup(wrapped_html, "html.parser"), "html.parser-fragment"
 891 |     except Exception as e_frag:
 892 |         logger.error(
 893 |             f"Fragment parsing also failed: {e_frag}. Returning empty soup.", exc_info=True
 894 |         )
 895 |         return BeautifulSoup("", "html.parser"), "failed"
 896 | 
 897 | 
 898 | def _clean_html(html_txt: str) -> Tuple[str, str]:
 899 |     """Remove dangerous/pointless elements & attempt structural repair."""
 900 |     soup, parser_used = _best_soup(html_txt)
 901 |     if parser_used == "failed":
 902 |         logger.warning("HTML cleaning skipped due to parsing failure.")
 903 |         return html_txt, parser_used
 904 | 
 905 |     tags_to_remove = [
 906 |         "script",
 907 |         "style",
 908 |         "svg",
 909 |         "iframe",
 910 |         "canvas",
 911 |         "noscript",
 912 |         "meta",
 913 |         "link",
 914 |         "form",
 915 |         "input",
 916 |         "button",
 917 |         "select",
 918 |         "textarea",
 919 |         "nav",
 920 |         "aside",
 921 |         "header",
 922 |         "footer",
 923 |         "video",
 924 |         "audio",
 925 |     ]
 926 |     for el in soup(tags_to_remove):
 927 |         el.decompose()
 928 | 
 929 |     unsafe_attrs = ["style", "onclick", "onload", "onerror", "onmouseover", "onmouseout", "target"]
 930 |     for tag in soup.find_all(True):
 931 |         current_attrs = list(tag.attrs.keys())
 932 |         for attr in current_attrs:
 933 |             attr_val_str = str(tag.get(attr, "")).lower()
 934 |             is_unsafe = (
 935 |                 attr in unsafe_attrs
 936 |                 or attr.startswith("on")
 937 |                 or attr.startswith("data-")
 938 |                 or (attr == "src" and ("javascript:" in attr_val_str or "data:" in attr_val_str))
 939 |                 or (attr == "href" and attr_val_str.startswith("javascript:"))
 940 |             )
 941 |             if is_unsafe and attr in tag.attrs:
 942 |                 del tag[attr]
 943 |     try:
 944 |         text = str(soup)
 945 |         text = html.unescape(text)
 946 |         text = re.sub(r"[ \t\r\f\v]+", " ", text)
 947 |         text = re.sub(r"\n\s*\n", "\n\n", text)
 948 |         text = text.strip()
 949 |     except Exception as e:
 950 |         logger.error(f"Error during HTML text processing (unescape/regex): {e}", exc_info=True)
 951 |         try:
 952 |             return str(soup), parser_used
 953 |         except Exception as stringify_error:
 954 |             logger.error(f"Could not stringify soup after error: {stringify_error}")
 955 |             return html_txt, parser_used
 956 | 
 957 |     return text, parser_used
 958 | 
 959 | 
 960 | # --- Markdown Helpers ---
 961 | def _sanitize(md: str) -> str:
 962 |     """Basic Markdown sanitization."""
 963 |     if not md:
 964 |         return ""
 965 |     md = md.replace("\u00a0", " ")
 966 |     md = _BULLET_RX.sub("- ", md)
 967 |     md = re.sub(r"\n{3,}", "\n\n", md)
 968 |     md = re.sub(r"[ \t]+$", "", md, flags=re.MULTILINE)
 969 |     md = re.sub(r"^[ \t]+", "", md, flags=re.MULTILINE)
 970 |     md = re.sub(r"(^|\n)(#{1,6})([^#\s])", r"\1\2 \3", md)
 971 |     md = re.sub(r"```\s*\n", "```\n", md)
 972 |     md = re.sub(r"\n\s*```", "\n```", md)
 973 |     md = re.sub(r"^[*+]\s", "- ", md, flags=re.MULTILINE)
 974 |     md = re.sub(r"^\d+\.\s", lambda m: f"{m.group(0).strip()} ", md, flags=re.MULTILINE)
 975 |     return md.strip()
 976 | 
 977 | 
 978 | def _improve(md: str) -> str:
 979 |     """Apply structural improvements to Markdown text."""
 980 |     if not md:
 981 |         return ""
 982 |     # Ensure blank lines around major block elements
 983 |     md = re.sub(r"(?<=\S)\n(#{1,6}\s)", r"\n\n\1", md)
 984 |     md = re.sub(r"(^#{1,6}\s.*\S)\n(?!\n|#|```|>|\s*[-*+]|\s*\d+\.)", r"\1\n\n", md, flags=re.M)
 985 |     md = re.sub(r"(?<=\S)\n(```)", r"\n\n\1", md)
 986 |     md = re.sub(r"(```)\n(?!\n)", r"\1\n\n", md)
 987 |     md = re.sub(r"(?<=\S)\n(> )", r"\n\n\1", md)
 988 |     md = re.sub(r"(\n> .*\S)\n(?!\n|>\s)", r"\1\n\n", md, flags=re.M)
 989 |     md = re.sub(r"(?<=\S)\n(\s*([-*+]|\d+\.)\s)", r"\n\n\1", md)
 990 |     md = re.sub(
 991 |         r"(\n(\s*[-*+]\s+|\s*\d+\.\s+).*\S)\n(?!\n|\s*([-*+]|\d+\.)\s)", r"\1\n\n", md, flags=re.M
 992 |     )
 993 |     md = re.sub(r"(?<=\S)\n(-{3,}|\*{3,}|_{3,})$", r"\n\n\1", md, flags=re.M)
 994 |     md = re.sub(r"(^-{3,}|\*{3,}|_{3,})\n(?!\n)", r"\1\n\n", md, flags=re.M)
 995 |     md = re.sub(r"\n{3,}", "\n\n", md)
 996 |     return md.strip()
 997 | 
 998 | 
 999 | def _convert_html_table_to_markdown(table_tag: Tag) -> str:
1000 |     """Converts a single BeautifulSoup table Tag to a Markdown string."""
1001 |     md_rows = []
1002 |     num_cols = 0
1003 |     header_row_tag = table_tag.find("thead")
1004 |     header_cells_tags = []
1005 |     if header_row_tag:
1006 |         header_cells_tags = header_row_tag.find_all(["th", "td"], recursive=False)
1007 |         if not header_cells_tags:
1008 |             header_row_tr = header_row_tag.find("tr")
1009 |             if header_row_tr:
1010 |                 header_cells_tags = header_row_tr.find_all(["th", "td"])
1011 |     if not header_cells_tags:
1012 |         first_row = table_tag.find("tr")
1013 |         if first_row:
1014 |             temp_cells = first_row.find_all(["th", "td"])
1015 |             is_header = any(c.name == "th" for c in temp_cells) or (
1016 |                 len(temp_cells) > 0
1017 |                 and not any(re.match(r"^\s*[\d.,-]+\s*$", c.get_text()) for c in temp_cells)
1018 |             )
1019 |             if is_header:
1020 |                 header_cells_tags = temp_cells
1021 | 
1022 |     if header_cells_tags:
1023 |         num_cols = len(header_cells_tags)
1024 |         hdr = [
1025 |             " ".join(c.get_text(" ", strip=True).replace("|", "\\|").split())
1026 |             for c in header_cells_tags
1027 |         ]
1028 |         md_rows.append("| " + " | ".join(hdr) + " |")
1029 |         md_rows.append("| " + " | ".join(["---"] * num_cols) + " |")
1030 |     else:
1031 |         body_rows_tags = (
1032 |             table_tag.find("tbody").find_all("tr")
1033 |             if table_tag.find("tbody")
1034 |             else table_tag.find_all("tr")
1035 |         )
1036 |         if not body_rows_tags:
1037 |             return ""
1038 |         for r in body_rows_tags:
1039 |             num_cols = max(num_cols, len(r.find_all(["th", "td"])))
1040 |         if num_cols == 0:
1041 |             return ""
1042 |         logger.debug(f"Table has no clear header, assuming {num_cols} columns.")
1043 |         md_rows.append("| " + " | ".join([f"Col {i + 1}" for i in range(num_cols)]) + " |")
1044 |         md_rows.append("| " + " | ".join(["---"] * num_cols) + " |")
1045 | 
1046 |     body_rows_tags = []
1047 |     tbody = table_tag.find("tbody")
1048 |     if tbody:
1049 |         body_rows_tags = tbody.find_all("tr")
1050 |     else:
1051 |         all_trs = table_tag.find_all("tr")
1052 |         start_index = (
1053 |             1
1054 |             if header_cells_tags
1055 |             and all_trs
1056 |             and header_cells_tags[0].find_parent("tr") == all_trs[0]
1057 |             else 0
1058 |         )
1059 |         body_rows_tags = all_trs[start_index:]
1060 | 
1061 |     for r in body_rows_tags:
1062 |         cells = r.find_all(["td", "th"])
1063 |         cell_texts = [
1064 |             " ".join(c.get_text(" ", strip=True).replace("|", "\\|").split()) for c in cells
1065 |         ]
1066 |         cell_texts.extend([""] * (num_cols - len(cells)))
1067 |         cell_texts = cell_texts[:num_cols]
1068 |         md_rows.append("| " + " | ".join(cell_texts) + " |")
1069 | 
1070 |     return "\n".join(md_rows)
1071 | 
1072 | 
1073 | def _convert_html_tables_to_markdown(html_txt: str) -> str:
1074 |     """Finds HTML tables and replaces them with Markdown format within the HTML string."""
1075 |     soup, parser_used = _best_soup(html_txt)
1076 |     if parser_used == "failed":
1077 |         logger.warning("Skipping HTML table conversion due to parsing failure.")
1078 |         return html_txt
1079 |     tables = soup.find_all("table")
1080 |     if not tables:
1081 |         return html_txt
1082 |     logger.debug(f"Found {len(tables)} HTML tables to convert to Markdown.")
1083 |     for table_tag in tables:
1084 |         try:
1085 |             md_table_str = _convert_html_table_to_markdown(table_tag)
1086 |             if md_table_str:
1087 |                 placeholder = soup.new_string(f"\n\n{md_table_str}\n\n")
1088 |                 table_tag.replace_with(placeholder)
1089 |             else:
1090 |                 table_tag.decompose()
1091 |         except Exception as e:
1092 |             logger.error(f"Failed to convert a table to Markdown: {e}", exc_info=True)
1093 |     return str(soup)
1094 | 
1095 | 
1096 | def _html_to_md_core(html_txt: str, links: bool, imgs: bool, tbls: bool, width: int) -> str:
1097 |     """Convert HTML to Markdown using primary and fallback libraries."""
1098 |     try:
1099 |         h = html2text.HTML2Text()
1100 |         h.ignore_links = not links
1101 |         h.ignore_images = not imgs
1102 |         processed_html = html_txt
1103 |         if tbls:
1104 |             processed_html = _convert_html_tables_to_markdown(html_txt)
1105 |             h.ignore_tables = True
1106 |         else:
1107 |             h.ignore_tables = True
1108 | 
1109 |         h.body_width = width if width > 0 else 0
1110 |         h.unicode_snob = True
1111 |         h.escape_snob = True
1112 |         h.skip_internal_links = True
1113 |         h.single_line_break = True
1114 | 
1115 |         md_text = h.handle(processed_html)
1116 |         logger.debug("html2text conversion successful.")
1117 |         return md_text.strip()
1118 |     except Exception as e_html2text:
1119 |         logger.warning(f"html2text failed ({e_html2text}); attempting fallback with markdownify")
1120 |         if _markdownify_fallback and callable(_markdownify_fallback):
1121 |             try:
1122 |                 md_opts = {
1123 |                     "strip": [
1124 |                         "script",
1125 |                         "style",
1126 |                         "meta",
1127 |                         "link",
1128 |                         "head",
1129 |                         "iframe",
1130 |                         "form",
1131 |                         "button",
1132 |                         "input",
1133 |                         "select",
1134 |                         "textarea",
1135 |                         "nav",
1136 |                         "aside",
1137 |                         "header",
1138 |                         "footer",
1139 |                         "svg",
1140 |                         "canvas",
1141 |                         "video",
1142 |                         "audio",
1143 |                     ],
1144 |                     "convert": [
1145 |                         "a",
1146 |                         "p",
1147 |                         "img",
1148 |                         "br",
1149 |                         "hr",
1150 |                         "h1",
1151 |                         "h2",
1152 |                         "h3",
1153 |                         "h4",
1154 |                         "h5",
1155 |                         "h6",
1156 |                         "li",
1157 |                         "ul",
1158 |                         "ol",
1159 |                         "blockquote",
1160 |                         "code",
1161 |                         "pre",
1162 |                         "strong",
1163 |                         "em",
1164 |                         "b",
1165 |                         "i",
1166 |                         "table",
1167 |                         "tr",
1168 |                         "td",
1169 |                         "th",
1170 |                     ],
1171 |                     "heading_style": "ATX",
1172 |                     "bullets": "-",
1173 |                     "strong_em_symbol": "*",
1174 |                     "autolinks": False,
1175 |                 }
1176 |                 if not links:
1177 |                     md_opts["convert"] = [tag for tag in md_opts["convert"] if tag != "a"]
1178 |                 if not imgs:
1179 |                     md_opts["convert"] = [tag for tag in md_opts["convert"] if tag != "img"]
1180 |                 if not tbls:
1181 |                     md_opts["convert"] = [
1182 |                         tag for tag in md_opts["convert"] if tag not in ["table", "tr", "td", "th"]
1183 |                     ]
1184 |                 md_text = _markdownify_fallback(html_txt, **md_opts)
1185 |                 logger.debug("Markdownify fallback conversion successful.")
1186 |                 return md_text.strip()
1187 |             except Exception as e_markdownify:
1188 |                 logger.error(f"Markdownify fallback also failed: {e_markdownify}", exc_info=True)
1189 |                 raise ToolError(
1190 |                     "MARKDOWN_CONVERSION_FAILED",
1191 |                     details={
1192 |                         "reason": "Both failed",
1193 |                         "html2text_error": str(e_html2text),
1194 |                         "markdownify_error": str(e_markdownify),
1195 |                     },
1196 |                 ) from e_markdownify
1197 |         else:
1198 |             logger.error("html2text failed and markdownify fallback is not available.")
1199 |             raise ToolError(
1200 |                 "MARKDOWN_CONVERSION_FAILED",
1201 |                 details={"reason": "html2text failed, no fallback", "error": str(e_html2text)},
1202 |             ) from e_html2text
1203 | 
1204 | 
1205 | ###############################################################################
1206 | # Core OCR & PDF Helper Functions (Standalone)                                #
1207 | ###############################################################################
1208 | 
1209 | 
1210 | def _ocr_check_dep(dep_name: str, is_available: bool, feature: str):
1211 |     """Checks if a required dependency is available, raising ToolError if not."""
1212 |     if not is_available:
1213 |         logger.error(f"Missing required dependency '{dep_name}' for feature '{feature}'.")
1214 |         raise ToolError("DEPENDENCY_MISSING", details={"dependency": dep_name, "feature": feature})
1215 | 
1216 | 
1217 | def _ocr_extract_text_from_pdf_direct(
1218 |     file_path: Path, start_page: int = 0, max_pages: int = 0
1219 | ) -> Tuple[List[str], bool]:
1220 |     """
1221 |     Extracts text directly from PDF using PyMuPDF or PDFPlumber (sync function).
1222 | 
1223 |     Args:
1224 |         file_path: Path to the PDF file.
1225 |         start_page: 0-based starting page index.
1226 |         max_pages: Maximum number of pages to extract (0 for all from start_page).
1227 | 
1228 |     Returns:
1229 |         Tuple containing:
1230 |         - List of strings, one per extracted page (or error marker).
1231 |         - Boolean indicating if meaningful text was found on at least one page.
1232 |     """
1233 |     texts: List[str] = []
1234 |     has_text = False
1235 |     min_chars = 50  # Threshold for considering a page to have meaningful text
1236 | 
1237 |     # --- Try PyMuPDF First ---
1238 |     if _PYMUPDF_AVAILABLE and pymupdf:
1239 |         logger.debug(f"Attempting direct text extraction with PyMuPDF for {file_path}")
1240 |         try:
1241 |             with pymupdf.open(file_path) as doc:  # type: ignore
1242 |                 total_pages = len(doc)
1243 |                 # Calculate 0-based end page index (exclusive)
1244 |                 end_page = (
1245 |                     total_pages if max_pages <= 0 else min(start_page + max_pages, total_pages)
1246 |                 )
1247 |                 # Ensure start_page is valid
1248 |                 start_page = min(start_page, total_pages)
1249 |                 end_page = max(start_page, end_page)  # Ensure end is not before start
1250 | 
1251 |                 for i in range(start_page, end_page):
1252 |                     try:
1253 |                         page = doc.load_page(i)  # Use load_page for clarity
1254 |                         page_text = page.get_text("text") or ""  # Specify text format
1255 |                         texts.append(page_text)
1256 |                         if len(page_text.strip()) >= min_chars:
1257 |                             has_text = True
1258 |                     except Exception as e_page:
1259 |                         logger.warning(
1260 |                             f"PyMuPDF: Error extracting text from page {i + 1}: {e_page}"
1261 |                         )
1262 |                         texts.append(f"[Page {i + 1} Extraction Error: PyMuPDF]")
1263 |                 logger.debug(
1264 |                     f"PyMuPDF extracted {len(texts)} pages. Found meaningful text: {has_text}"
1265 |                 )
1266 |                 return texts, has_text
1267 |         except Exception as e_pymupdf:
1268 |             logger.warning(
1269 |                 f"PyMuPDF direct text extraction failed: {e_pymupdf}. Trying PDFPlumber..."
1270 |             )
1271 |             # Fall through to PDFPlumber if PyMuPDF failed
1272 | 
1273 |     # --- Try PDFPlumber as Fallback ---
1274 |     if _PDFPLUMBER_AVAILABLE and pdfplumber:
1275 |         logger.debug(f"Attempting direct text extraction with PDFPlumber for {file_path}")
1276 |         try:
1277 |             # pdfplumber might require explicit closing
1278 |             pdf = pdfplumber.open(file_path)  # type: ignore
1279 |             try:
1280 |                 total_pages = len(pdf.pages)
1281 |                 end_page = (
1282 |                     total_pages if max_pages <= 0 else min(start_page + max_pages, total_pages)
1283 |                 )
1284 |                 start_page = min(start_page, total_pages)
1285 |                 end_page = max(start_page, end_page)
1286 | 
1287 |                 for i in range(start_page, end_page):
1288 |                     try:
1289 |                         page = pdf.pages[i]
1290 |                         # Use slightly more tolerant settings
1291 |                         page_text = (
1292 |                             page.extract_text(x_tolerance=2, y_tolerance=2, keep_blank_chars=True)
1293 |                             or ""
1294 |                         )
1295 |                         texts.append(page_text)
1296 |                         if len(page_text.strip()) >= min_chars:
1297 |                             has_text = True
1298 |                     except Exception as e_page:
1299 |                         logger.warning(
1300 |                             f"PDFPlumber: Error extracting text from page {i + 1}: {e_page}"
1301 |                         )
1302 |                         texts.append(f"[Page {i + 1} Extraction Error: PDFPlumber]")
1303 |                 logger.debug(
1304 |                     f"PDFPlumber extracted {len(texts)} pages. Found meaningful text: {has_text}."
1305 |                 )
1306 |                 return texts, has_text
1307 |             finally:
1308 |                 pdf.close()  # Ensure file handle is closed
1309 |         except Exception as e_plumber:
1310 |             logger.error(f"PDFPlumber direct text extraction failed: {e_plumber}", exc_info=True)
1311 |             # If PyMuPDF also failed (or wasn't available), raise the final error
1312 |             if (
1313 |                 not _PYMUPDF_AVAILABLE
1314 |             ):  # Only raise if it was the only option tried or PyMuPDF failed before
1315 |                 raise ToolError(
1316 |                     "DIRECT_EXTRACTION_FAILED",
1317 |                     details={"reason": "PDFPlumber failed", "error": str(e_plumber)},
1318 |                 ) from e_plumber
1319 |             else:  # PyMuPDF failed first, now PDFPlumber failed
1320 |                 raise ToolError(
1321 |                     "DIRECT_EXTRACTION_FAILED",
1322 |                     details={
1323 |                         "reason": "Both PyMuPDF and PDFPlumber failed",
1324 |                         "error": str(e_plumber),
1325 |                     },
1326 |                 ) from e_plumber
1327 | 
1328 |     # --- If neither library worked ---
1329 |     logger.error(
1330 |         "No functional direct PDF text extraction library (PyMuPDF or PDFPlumber) available or both failed."
1331 |     )
1332 |     raise ToolError("DIRECT_EXTRACTION_FAILED", details={"reason": "No available/working library"})
1333 | 
1334 | 
1335 | def _ocr_convert_pdf_to_images(
1336 |     file_path: Path, start_page: int = 0, max_pages: int = 0, dpi: int = 300
1337 | ) -> List["PILImage.Image"]:
1338 |     """Converts PDF path pages to PIL Images using 0-based indexing internally (sync function)."""
1339 |     _ocr_check_dep("pdf2image", _PDF2IMAGE_AVAILABLE, "PDF->Image Conversion")
1340 |     _ocr_check_dep("Pillow", _PIL_AVAILABLE, "PDF->Image Conversion")
1341 |     if convert_from_path is None:
1342 |         raise ToolError("INTERNAL_ERROR", details={"reason": "pdf2image.convert_from_path is None"})
1343 | 
1344 |     try:
1345 |         # pdf2image uses 1-based indexing for first_page/last_page args
1346 |         first_page_1based = start_page + 1
1347 |         last_page_1based = None if max_pages <= 0 else first_page_1based + max_pages - 1
1348 |         logger.debug(
1349 |             f"Converting PDF {file_path} (pages {first_page_1based}-{last_page_1based or 'end'}, dpi={dpi})"
1350 |         )
1351 | 
1352 |         with _span(f"pdf2image_path_p{first_page_1based}-{last_page_1based or 'end'}"):
1353 |             # pdf2image handles its own temporary files internally if output_folder=None
1354 |             # Using a TemporaryDirectory might be slightly less efficient but ensures cleanup
1355 |             with tempfile.TemporaryDirectory() as temp_dir:
1356 |                 images = convert_from_path(  # type: ignore
1357 |                     file_path,
1358 |                     dpi=dpi,
1359 |                     first_page=first_page_1based,
1360 |                     last_page=last_page_1based,
1361 |                     output_folder=temp_dir,  # Recommended for multi-threading stability
1362 |                     fmt="png",
1363 |                     thread_count=max(1, os.cpu_count() // 2 if os.cpu_count() else 1),
1364 |                     use_pdftocairo=True,  # Often more reliable than pdftoppm
1365 |                 )
1366 |         logger.info(f"Converted {len(images)} pages from PDF path.")
1367 |         return images  # type: ignore
1368 |     except Exception as e:  # Catch specific pdf2image errors if library defines them
1369 |         logger.error(f"PDF path to image conversion failed: {e}", exc_info=True)
1370 |         raise ToolError(
1371 |             "PDF_CONVERSION_FAILED", details={"reason": "pdf2image path failed", "error": str(e)}
1372 |         ) from e
1373 | 
1374 | 
1375 | def _ocr_preprocess_image(
1376 |     image: "PILImage.Image", preprocessing_options: Optional[Dict[str, Any]] = None
1377 | ) -> "PILImage.Image":
1378 |     """Preprocesses an image for better OCR results (sync function)."""
1379 |     if not _PIL_AVAILABLE:
1380 |         logger.warning("Pillow (PIL) not available. Skipping preprocessing.")
1381 |         return image
1382 |     if not ImageEnhance or not ImageFilter:  # Check specifically for submodules
1383 |         logger.warning("PIL ImageEnhance or ImageFilter missing. Some enhancements skipped.")
1384 | 
1385 |     can_use_cv2 = _CV2_AVAILABLE and _NUMPY_AVAILABLE and cv2 is not None and np is not None
1386 |     if (
1387 |         not can_use_cv2
1388 |         and preprocessing_options
1389 |         and any(k in preprocessing_options for k in ["denoise", "threshold", "deskew"])
1390 |     ):
1391 |         logger.warning("OpenCV/NumPy missing. Advanced preprocessing disabled.")
1392 | 
1393 |     prep_opts = {
1394 |         "denoise": True,
1395 |         "threshold": "otsu",
1396 |         "deskew": True,
1397 |         "enhance_contrast": True,
1398 |         "enhance_brightness": False,
1399 |         "enhance_sharpness": False,
1400 |         "apply_filters": [],
1401 |         "resize_factor": 1.0,
1402 |         **(preprocessing_options or {}),
1403 |     }
1404 |     logger.debug(f"Applying preprocessing with options: {prep_opts}")
1405 | 
1406 |     img_pil = image.copy()
1407 |     # Apply PIL enhancements first
1408 |     if ImageEnhance:
1409 |         if prep_opts.get("enhance_brightness"):
1410 |             img_pil = ImageEnhance.Brightness(img_pil).enhance(1.3)
1411 |         if prep_opts.get("enhance_contrast") and not can_use_cv2:
1412 |             img_pil = ImageEnhance.Contrast(img_pil).enhance(1.4)
1413 |         if prep_opts.get("enhance_sharpness"):
1414 |             img_pil = ImageEnhance.Sharpness(img_pil).enhance(1.5)
1415 |     if ImageFilter:
1416 |         filters = prep_opts.get("apply_filters", [])
1417 |         for filter_name in filters:
1418 |             try:
1419 |                 if filter_name == "unsharp_mask":
1420 |                     img_pil = img_pil.filter(ImageFilter.UnsharpMask(radius=2, percent=150))
1421 |                 elif filter_name == "detail":
1422 |                     img_pil = img_pil.filter(ImageFilter.DETAIL)
1423 |                 elif filter_name == "edge_enhance":
1424 |                     img_pil = img_pil.filter(ImageFilter.EDGE_ENHANCE)
1425 |                 elif filter_name == "smooth":
1426 |                     img_pil = img_pil.filter(ImageFilter.SMOOTH)
1427 |                 else:
1428 |                     logger.warning(f"Unknown PIL filter: {filter_name}")
1429 |             except Exception as e:
1430 |                 logger.warning(f"PIL filter '{filter_name}' failed: {e}")
1431 | 
1432 |     if not can_use_cv2:
1433 |         return img_pil  # Return PIL-enhanced if CV2 unavailable
1434 | 
1435 |     # OpenCV Processing
1436 |     try:
1437 |         img_cv = np.array(img_pil)
1438 |         if len(img_cv.shape) == 3 and img_cv.shape[2] == 3:
1439 |             gray = cv2.cvtColor(img_cv, cv2.COLOR_RGB2GRAY)
1440 |         elif len(img_cv.shape) == 3 and img_cv.shape[2] == 4:
1441 |             gray = cv2.cvtColor(img_cv, cv2.COLOR_RGBA2GRAY)
1442 |         else:
1443 |             gray = img_cv
1444 | 
1445 |         original_height, original_width = gray.shape[:2]
1446 |         deskewed_gray = gray.copy()  # Operate on this copy
1447 | 
1448 |         # Deskewing (best on grayscale before thresholding might change shapes)
1449 |         if prep_opts.get("deskew", True):
1450 |             try:
1451 |                 # Use inverted image for finding text blocks if background is light
1452 |                 mean_intensity = np.mean(gray)
1453 |                 invert_for_deskew = mean_intensity > 128
1454 |                 deskew_input = cv2.bitwise_not(gray) if invert_for_deskew else gray
1455 | 
1456 |                 # Use a less aggressive threshold for finding angle
1457 |                 _, angle_thresh = cv2.threshold(
1458 |                     deskew_input, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
1459 |                 )
1460 |                 coords = cv2.findNonZero(angle_thresh)
1461 |                 if coords is not None and len(coords) > 10:
1462 |                     angle = cv2.minAreaRect(coords)[-1]
1463 |                     if angle < -45:
1464 |                         angle = -(90 + angle)
1465 |                     else:
1466 |                         angle = -angle
1467 | 
1468 |                     if abs(angle) > 0.1:
1469 |                         (h, w) = gray.shape[:2]
1470 |                         center = (w // 2, h // 2)
1471 |                         M = cv2.getRotationMatrix2D(center, angle, 1.0)
1472 |                         # Rotate original grayscale image
1473 |                         deskewed_gray = cv2.warpAffine(
1474 |                             gray, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE
1475 |                         )
1476 |                         logger.debug(f"Deskewed image by {angle:.2f} degrees.")
1477 |             except Exception as e_deskew:
1478 |                 logger.warning(f"Deskewing failed: {e_deskew}. Using original orientation.")
1479 |                 deskewed_gray = gray  # Reset to original gray if deskew fails
1480 | 
1481 |         processed_img = deskewed_gray  # Start processing from (potentially) deskewed gray
1482 | 
1483 |         # Adaptive scaling calculation (applied later)
1484 |         resize_factor = prep_opts.get("resize_factor", 1.0)
1485 |         if resize_factor == 1.0:
1486 |             longest_edge = max(original_width, original_height)
1487 |             target_low, target_high = 1500, 3500
1488 |             if 0 < longest_edge < target_low:
1489 |                 resize_factor = math.ceil(target_low / longest_edge * 10) / 10
1490 |             elif longest_edge > target_high:
1491 |                 resize_factor = math.floor(target_high / longest_edge * 10) / 10
1492 |             resize_factor = max(0.5, min(3.0, resize_factor))
1493 | 
1494 |         # Contrast enhancement on grayscale
1495 |         if prep_opts.get("enhance_contrast", True):
1496 |             clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
1497 |             processed_img = clahe.apply(processed_img)
1498 | 
1499 |         # Denoising grayscale (before thresholding)
1500 |         if prep_opts.get("denoise", True):
1501 |             # Adjust h based on image size? Might be overkill.
1502 |             # h_param = math.ceil(10 * math.log10(max(10, min(original_width, original_height))))
1503 |             processed_img = cv2.fastNlMeansDenoising(processed_img, None, 10, 7, 21)
1504 | 
1505 |         # Thresholding
1506 |         threshold_method = prep_opts.get("threshold", "otsu")
1507 |         if threshold_method == "otsu":
1508 |             # No need for blur if denoised already
1509 |             _, processed_img = cv2.threshold(
1510 |                 processed_img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
1511 |             )
1512 |         elif threshold_method == "adaptive":
1513 |             block_size = max(11, math.floor(min(processed_img.shape[:2]) / 20) * 2 + 1)
1514 |             processed_img = cv2.adaptiveThreshold(
1515 |                 processed_img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, block_size, 5
1516 |             )
1517 |         # If no threshold, check background and invert if needed for Tesseract
1518 |         elif np.mean(processed_img) < 128:
1519 |             processed_img = cv2.bitwise_not(processed_img)
1520 | 
1521 |         # Resizing (applied last)
1522 |         if resize_factor != 1.0:
1523 |             current_h, current_w = processed_img.shape[:2]
1524 |             new_w = math.ceil(current_w * resize_factor)
1525 |             new_h = math.ceil(current_h * resize_factor)
1526 |             processed_img = cv2.resize(processed_img, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
1527 |             logger.debug(f"Resized image by factor {resize_factor:.2f} to {new_w}x{new_h}")
1528 | 
1529 |         final_pil_image = Image.fromarray(processed_img)
1530 |         return final_pil_image
1531 |     except Exception as e_cv:
1532 |         logger.error(f"OpenCV preprocessing failed: {e_cv}", exc_info=True)
1533 |         return img_pil  # Fallback to PIL-processed image
1534 | 
1535 | 
1536 | def _ocr_run_tesseract(
1537 |     image: "PILImage.Image", ocr_language: str = "eng", ocr_config: str = ""
1538 | ) -> str:
1539 |     """Extracts text from an image using Tesseract OCR (sync function)."""
1540 |     _ocr_check_dep("pytesseract", _PYTESSERACT_AVAILABLE, "OCR Text Extraction")
1541 |     _ocr_check_dep("Pillow", _PIL_AVAILABLE, "OCR Text Extraction")
1542 |     if pytesseract is None:
1543 |         raise ToolError("INTERNAL_ERROR", details={"reason": "pytesseract is None"})
1544 |     try:
1545 |         # Combine language and custom config flags
1546 |         custom_config = f"-l {ocr_language} {ocr_config}".strip()
1547 |         logger.debug(f"Running Tesseract with config: '{custom_config}'")
1548 |         with _span(f"pytesseract_ocr_{ocr_language}"):
1549 |             # Use a timeout? Tesseract can sometimes hang. Requires subprocess handling.
1550 |             # For simplicity, no timeout implemented here.
1551 |             text = pytesseract.image_to_string(
1552 |                 image, config=custom_config, timeout=60
1553 |             )  # Add 60s timeout
1554 |         logger.debug(f"Tesseract extracted {len(text)} characters.")
1555 |         return text or ""  # Ensure string return
1556 |     except pytesseract.TesseractNotFoundError as e:
1557 |         logger.error("Tesseract executable not found or not in PATH.")
1558 |         raise ToolError("DEPENDENCY_MISSING", details={"dependency": "Tesseract OCR Engine"}) from e
1559 |     except RuntimeError as e_runtime:  # Catch Tesseract runtime errors (like timeout)
1560 |         logger.error(f"Tesseract runtime error: {e_runtime}", exc_info=True)
1561 |         raise ToolError(
1562 |             "OCR_FAILED", details={"engine": "Tesseract", "error": f"Runtime error: {e_runtime}"}
1563 |         ) from e_runtime
1564 |     except Exception as e:
1565 |         logger.error(f"Tesseract OCR extraction failed: {e}", exc_info=True)
1566 |         raise ToolError("OCR_FAILED", details={"engine": "Tesseract", "error": str(e)}) from e
1567 | 
1568 | 
1569 | def _ocr_is_text_mostly_noise(text: str, noise_threshold: float = 0.4) -> bool:
1570 |     """
1571 |     Determine if extracted text is mostly noise based on character distribution.
1572 |     Considers alphanumeric, whitespace, and common punctuation as 'valid'.
1573 | 
1574 |     Args:
1575 |         text: The text string to analyze.
1576 |         noise_threshold: The ratio (0.0 to 1.0) of non-valid characters above which
1577 |                          the text is considered noisy. Default is 0.4 (40%).
1578 | 
1579 |     Returns:
1580 |         True if the text is considered mostly noise, False otherwise.
1581 |     """
1582 |     if not text or not isinstance(text, str):
1583 |         return False  # Empty or invalid input is not noise
1584 | 
1585 |     text_length = len(text)
1586 |     if text_length < 20:  # Don't evaluate very short strings
1587 |         return False
1588 | 
1589 |     # Define a set of characters generally expected in non-noisy text
1590 |     # (alphanumeric, whitespace, common punctuation/symbols)
1591 |     # Adding more symbols that might appear legitimately in documents
1592 |     valid_char_pattern = re.compile(r"[a-zA-Z0-9\s.,;:!?\"'()\[\]{}%/$£€¥₽₹#@&*+=<>~|_^-]")
1593 | 
1594 |     valid_chars_count = len(valid_char_pattern.findall(text))
1595 | 
1596 |     # Calculate the ratio of characters *not* matching the valid pattern
1597 |     noise_ratio = 1.0 - (valid_chars_count / text_length)
1598 | 
1599 |     is_noise = noise_ratio > noise_threshold
1600 |     if is_noise:
1601 |         # Log only a snippet to avoid flooding logs with potentially large noisy text
1602 |         snippet = text.replace("\n", " ")[:100]  # Replace newlines for cleaner log output
1603 |         logger.debug(
1604 |             f"Text flagged as noisy (Ratio: {noise_ratio:.2f} > {noise_threshold}): '{snippet}...'"
1605 |         )
1606 | 
1607 |     return is_noise
1608 | 
1609 | 
1610 | def _ocr_is_likely_header_or_footer(text: str, line_length_threshold: int = 80) -> bool:
1611 |     """
1612 |     Determine if a single line of text is likely a header or footer based on common patterns.
1613 | 
1614 |     Args:
1615 |         text: The line of text to evaluate.
1616 |         line_length_threshold: Lines longer than this are less likely to be headers/footers. Default 80.
1617 | 
1618 |     Returns:
1619 |         True if the line matches common header/footer patterns, False otherwise.
1620 |     """
1621 |     text = text.strip()
1622 |     if not text or len(text) > line_length_threshold:
1623 |         return False
1624 | 
1625 |     # --- Pattern Checks ---
1626 |     # 1. Page Number patterns (robust check)
1627 |     #    - "Page X", "P. X", "X / Y", "- X -", etc.
1628 |     #    - Allows for variations in spacing and separators
1629 |     if re.search(r"(?i)\b(page|p[ag]{1,2}\.?|seite|s\.?)\s*\d+", text):
1630 |         return True
1631 |     if re.match(r"^\s*[-–—]?\s*\d+\s*[/of\s]+\s*\d+\s*[-–—]?\s*$", text):
1632 |         return True  # e.g., "1 / 10", "1 of 10"
1633 |     if re.match(r"^\s*[-–—]?\s*\d+\s*[-–—]?\s*$", text):
1634 |         return True  # Just a number, possibly bracketed
1635 | 
1636 |     # 2. Date patterns
1637 |     #    - "Month Day, Year", "DD/MM/YYYY", "YYYY-MM-DD", etc.
1638 |     if re.search(
1639 |         r"\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}", text, re.I
1640 |     ):
1641 |         return True
1642 |     if re.search(r"\b\d{1,2}[./-]\d{1,2}[./-]\d{2,4}\b", text):
1643 |         return True
1644 |     if re.search(r"\b\d{4}[./-]\d{1,2}[./-]\d{1,2}\b", text):
1645 |         return True  # ISO-like
1646 | 
1647 |     # 3. Common keywords (case-insensitive start of line)
1648 |     if re.match(
1649 |         r"^(confidential|internal use only|draft|proprietary|for discussion purposes)", text, re.I
1650 |     ):
1651 |         return True
1652 |     if re.match(r"^(copyright|\(c\)|©)\s*\d*", text, re.I):
1653 |         return True
1654 | 
1655 |     # 4. Repeated characters (often used as separators)
1656 |     #    - Check if the line consists mostly of one or two non-alphanumeric characters
1657 |     non_alnum_chars = re.sub(r"[a-zA-Z0-9\s]", "", text)
1658 |     if len(non_alnum_chars) > 5 and len(set(non_alnum_chars)) <= 2:
1659 |         return True
1660 | 
1661 |     # 5. Company Names / Document Titles (Heuristic - might be too broad)
1662 |     #    - Check if it's short, title-cased, and doesn't end in punctuation?
1663 |     # if len(text.split()) < 7 and text == text.title() and not text.endswith(('.', '?', '!')):
1664 |     #     # Further check: Is this text repeated elsewhere? (Needs broader context)
1665 |     #     pass # This heuristic is often unreliable without more context.
1666 | 
1667 |     # 6. All Caps Short Lines (Potential titles/headers)
1668 |     if text.isupper() and len(text.split()) < 7 and len(text) > 3:
1669 |         return True
1670 | 
1671 |     return False  # Default: Not a header/footer
1672 | 
1673 | 
1674 | def _ocr_remove_headers_and_footers(text: str, max_lines_check: int = 5) -> str:
1675 |     """
1676 |     Removes likely headers and footers from the top/bottom of the text block.
1677 | 
1678 |     Args:
1679 |         text: The block of text (potentially multiple pages concatenated).
1680 |         max_lines_check: How many lines from the top and bottom to examine. Default 5.
1681 | 
1682 |     Returns:
1683 |         The text with potential header/footer lines removed.
1684 |     """
1685 |     if not text or not isinstance(text, str):
1686 |         return ""
1687 | 
1688 |     lines = text.splitlines()
1689 |     num_lines = len(lines)
1690 | 
1691 |     # Don't process if text is too short to reliably identify headers/footers
1692 |     if num_lines < max_lines_check * 2:
1693 |         return text
1694 | 
1695 |     lines_to_remove_indices: Set[int] = set()
1696 | 
1697 |     # Check top lines
1698 |     for i in range(max_lines_check):
1699 |         if i < num_lines:  # Ensure index is valid
1700 |             line_text = lines[i]
1701 |             # Also check if the line is very short (e.g., just whitespace remnants)
1702 |             if _ocr_is_likely_header_or_footer(line_text) or len(line_text.strip()) <= 2:
1703 |                 lines_to_remove_indices.add(i)
1704 |             # Stop checking top lines if a probable content line is found early
1705 |             elif len(line_text) > 80 and i < max_lines_check // 2:  # Heuristic for content line
1706 |                 break
1707 |         else:  # Should not happen given initial num_lines check, but safety
1708 |             break
1709 | 
1710 |     # Check bottom lines
1711 |     for i in range(max_lines_check):
1712 |         idx = num_lines - 1 - i
1713 |         # Ensure index is valid and not already marked for removal from top scan
1714 |         if idx >= 0 and idx not in lines_to_remove_indices:
1715 |             line_text = lines[idx]
1716 |             if _ocr_is_likely_header_or_footer(line_text) or len(line_text.strip()) <= 2:
1717 |                 lines_to_remove_indices.add(idx)
1718 |             # Stop checking bottom lines if a probable content line is found early
1719 |             elif len(line_text) > 80 and i < max_lines_check // 2:
1720 |                 break
1721 |         elif idx < 0:  # Reached top of file during bottom check
1722 |             break
1723 | 
1724 |     if not lines_to_remove_indices:
1725 |         return text  # No lines identified for removal
1726 | 
1727 |     logger.debug(f"Removing {len(lines_to_remove_indices)} potential header/footer lines.")
1728 | 
1729 |     # Build the result, skipping removed lines
1730 |     result_lines = [line for i, line in enumerate(lines) if i not in lines_to_remove_indices]
1731 | 
1732 |     # Remove leading/trailing blank lines potentially left after removal
1733 |     # This needs care: find first/last non-blank line indices
1734 |     first_content_line = -1
1735 |     last_content_line = -1
1736 |     for i, line in enumerate(result_lines):
1737 |         if line.strip():
1738 |             if first_content_line == -1:
1739 |                 first_content_line = i
1740 |             last_content_line = i
1741 | 
1742 |     if first_content_line == -1:  # All lines were removed or blank
1743 |         return ""
1744 |     else:
1745 |         # Join only the content lines, preserving internal blank lines
1746 |         cleaned_text = "\n".join(result_lines[first_content_line : last_content_line + 1])
1747 |         return cleaned_text
1748 | 
1749 | 
1750 | async def _ocr_enhance_text_chunk(
1751 |     chunk: str, output_format: str = "markdown", remove_headers: bool = False
1752 | ) -> str:
1753 |     """Enhances OCR text chunk using LLM (standalone internal helper)."""
1754 |     # --- Apply Basic Rule-based Cleaning First ---
1755 |     cleaned_text = chunk.strip()  # Work on a copy
1756 | 
1757 |     # Join words incorrectly split across lines (common OCR artifact)
1758 |     cleaned_text = re.sub(r"(\w+)-\s*\n\s*(\w+)", r"\1\2", cleaned_text)
1759 | 
1760 |     # Normalize multiple whitespace characters (including newlines within paragraphs)
1761 |     # This is aggressive and might merge intended line breaks within code/poetry
1762 |     # Consider a less aggressive approach if preserving specific line breaks is crucial.
1763 |     # cleaned_text = re.sub(r"\s+", " ", cleaned_text) # Too aggressive
1764 | 
1765 |     # Normalize space/tab characters to single space
1766 |     cleaned_text = re.sub(r"[ \t]+", " ", cleaned_text)
1767 |     # Collapse multiple blank lines (2+ newlines) into exactly two newlines
1768 |     cleaned_text = re.sub(r"\n\s*\n", "\n\n", cleaned_text)
1769 | 
1770 |     # Optional header/footer removal using rules *before* LLM
1771 |     if remove_headers:
1772 |         original_len = len(cleaned_text)
1773 |         cleaned_text = _ocr_remove_headers_and_footers(cleaned_text)
1774 |         if len(cleaned_text) < original_len:
1775 |             logger.debug("Applied rule-based header/footer removal pre-LLM.")
1776 | 
1777 |     # Check for noise after initial cleaning
1778 |     if _ocr_is_text_mostly_noise(cleaned_text):
1779 |         logger.warning(
1780 |             "Text chunk noisy after basic cleaning, LLM enhancement might be less effective."
1781 |         )
1782 |         # Decide whether to proceed or return early based on noise level?
1783 |         # For now, proceed with LLM enhancement.
1784 | 
1785 |     # --- LLM Prompt Generation ---
1786 |     format_instruction = ""
1787 |     if output_format == "markdown":
1788 |         format_instruction = """
1789 | 2. Format as clean, readable markdown:
1790 |    - Use appropriate heading levels (#, ##, etc.). Infer structure where possible.
1791 |    - Format lists correctly (bulleted or numbered).
1792 |    - Apply emphasis (*italic*) and strong (**bold**) sparingly where appropriate.
1793 |    - Represent tabular data using markdown table syntax IF table structure is clearly identifiable.
1794 |    - Use code blocks (```) for code snippets or equations if detected."""
1795 |     else:  # output_format == "text"
1796 |         format_instruction = """
1797 | 2. Format as clean, readable plain text:
1798 |    - Ensure clear paragraph separation (double newline).
1799 |    - Maintain list structures with standard markers (e.g., -, 1.).
1800 |    - Avoid markdown syntax like #, *, _, ```, etc."""
1801 | 
1802 |     header_footer_instruction = (
1803 |         "Remove any remaining headers, footers, and page numbers."
1804 |         if remove_headers
1805 |         else "Preserve all content including potential headers/footers."
1806 |     )
1807 |     prompt = f"""You are an expert text processor specialized in correcting OCR errors from scanned documents. Please process the following text according to these instructions:
1808 | 
1809 | 1. Fix OCR-induced errors:
1810 |    - Correct character recognition errors (e.g., 'rn' vs 'm', 'O' vs '0', 'l' vs '1', 'S' vs '5').
1811 |    - Join words incorrectly split across lines (e.g., "hyphen-\nation").
1812 |    - Merge paragraphs that were artificially split by page breaks or scanning artifacts.
1813 |    - Split run-on paragraphs where a clear topic shift or structural break (like a list starting) occurs.
1814 |    - Use context to resolve ambiguities and reconstruct the original meaning accurately.
1815 | {format_instruction}
1816 | 3. Clean up formatting:
1817 |    - Remove redundant spaces within lines.
1818 |    - Ensure consistent paragraph spacing (double newline between paragraphs).
1819 |    - {header_footer_instruction}
1820 | 
1821 | 4. IMPORTANT: Preserve all meaningful content and the original structure as much as possible. Do not add information or summaries. Do not change the substance of the text. Focus solely on fixing OCR errors and applying the requested formatting based *only* on the input text provided.
1822 | 
1823 | Input Text:
1824 | ```text
1825 | {cleaned_text}
1826 | ```
1827 | 
1828 | Corrected Output ({output_format}):"""
1829 | 
1830 |     try:
1831 |         logger.debug(
1832 |             f"Sending chunk (len={len(cleaned_text)}) to LLM for enhancement (format={output_format}, rm_hdrs={remove_headers})."
1833 |         )
1834 |         # Use a capable model (adjust model name as needed)
1835 |         provider = Provider.OPENAI.value
1836 |         model = "gpt-4o-mini"
1837 | 
1838 |         # Estimate max tokens needed
1839 |         estimated_input_tokens = len(cleaned_text) // 3
1840 |         buffer_factor = 1.4 if output_format == "markdown" else 1.2  # Slightly more buffer
1841 |         llm_max_tokens = int(estimated_input_tokens * buffer_factor) + 500
1842 |         # Cap based on typical context window limits (e.g., ~16k tokens for GPT-4 Turbo input, allow ample output)
1843 |         llm_max_tokens = max(1000, min(llm_max_tokens, 8000))
1844 | 
1845 |         # Assume _standalone_llm_call is defined elsewhere
1846 |         enhanced_text = await _standalone_llm_call(
1847 |             prompt=prompt,
1848 |             provider=provider,
1849 |             model=model,
1850 |             temperature=0.1,  # Very low temperature for factual correction
1851 |             max_tokens=llm_max_tokens,
1852 |         )
1853 | 
1854 |         # --- Post-processing LLM Output ---
1855 |         # Remove potential preamble/apologies
1856 |         enhanced_text = re.sub(
1857 |             r"^(Okay, |Here is |Sure, |Here['’]s |Certainly, |Based on the text provided.*?\n)[:\n]?\s*",
1858 |             "",
1859 |             enhanced_text,
1860 |             flags=re.IGNORECASE | re.DOTALL,
1861 |         )
1862 |         # Remove potential markdown fences around the whole output
1863 |         enhanced_text = re.sub(
1864 |             r"^\s*```(?:\w+\n)?([\s\S]*?)\n?```\s*$", r"\1", enhanced_text
1865 |         ).strip()
1866 | 
1867 |         logger.debug(f"LLM enhancement returned text (len={len(enhanced_text)}).")
1868 |         return enhanced_text
1869 | 
1870 |     except ToolError as e:
1871 |         # Log the specific ToolError and fallback
1872 |         logger.error(
1873 |             f"LLM text enhancement failed with ToolError: {e.error_code} - {str(e)}. Returning pre-LLM cleaned text."
1874 |         )
1875 |         return cleaned_text
1876 |     except Exception as e:
1877 |         # Log unexpected errors and fallback
1878 |         logger.error(f"Unexpected error during LLM text enhancement: {e}", exc_info=True)
1879 |         return cleaned_text
1880 | 
1881 | 
1882 | def _ocr_validate_file_path(file_path: str, expected_extension: Optional[str] = None) -> Path:
1883 |     """Validates a file path exists and optionally has the expected extension."""
1884 |     if not file_path or not isinstance(file_path, str):
1885 |         raise ToolInputError("File path cannot be empty or non-string", param_name="file_path")
1886 | 
1887 |     try:
1888 |         # Expand user directory and normalize path separators
1889 |         path = Path(os.path.expanduser(os.path.normpath(file_path)))
1890 |     except Exception as e:
1891 |         raise ToolInputError(
1892 |             f"Invalid file path format: {file_path}. Error: {e}", param_name="file_path"
1893 |         ) from e
1894 | 
1895 |     if not path.exists():
1896 |         raise ToolInputError(f"File not found at path: {path}", param_name="file_path")
1897 |     if not path.is_file():
1898 |         raise ToolInputError(f"Path exists but is not a file: {path}", param_name="file_path")
1899 |     # Check extension case-insensitively
1900 |     if expected_extension and not path.suffix.lower() == expected_extension.lower():
1901 |         raise ToolInputError(
1902 |             f"File does not have the expected extension ({expected_extension}): {path}",
1903 |             param_name="file_path",
1904 |         )
1905 |     # Optional: Check read permissions?
1906 |     # if not os.access(path, os.R_OK):
1907 |     #     raise ToolInputError(f"Cannot read file (permission denied): {path}", param_name="file_path")
1908 | 
1909 |     return path
1910 | 
1911 | 
1912 | def _ocr_detect_tables(image: "PILImage.Image") -> List[Tuple[int, int, int, int]]:
1913 |     """Detects potential tables in an image using OpenCV (sync function)."""
1914 |     # Check dependencies first
1915 |     if not _CV2_AVAILABLE or not _NUMPY_AVAILABLE or not _PIL_AVAILABLE:
1916 |         logger.warning("Cannot detect tables: OpenCV, NumPy, or Pillow not available.")
1917 |         return []
1918 |     # Ensure library objects are valid
1919 |     if cv2 is None or np is None:
1920 |         logger.warning("Cannot detect tables: OpenCV or NumPy object is None.")
1921 |         return []
1922 | 
1923 |     try:
1924 |         img = np.array(image)
1925 |         # Convert to grayscale if necessary
1926 |         if len(img.shape) == 3:
1927 |             gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
1928 |         elif len(img.shape) == 2:
1929 |             gray = img
1930 |         else:
1931 |             logger.warning(f"Unexpected image shape for table detection: {img.shape}")
1932 |             return []
1933 | 
1934 |         # --- Table Detection Logic (Example using line detection) ---
1935 |         # 1. Thresholding (Adaptive often works well for lines)
1936 |         thresh_inv = cv2.adaptiveThreshold(
1937 |             gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV, 15, 5
1938 |         )
1939 | 
1940 |         # 2. Detect Horizontal Lines
1941 |         horizontal_kernel = cv2.getStructuringElement(
1942 |             cv2.MORPH_RECT, (min(40, gray.shape[1] // 10), 1)
1943 |         )  # Kernel size relative to width
1944 |         detected_horizontal = cv2.morphologyEx(
1945 |             thresh_inv, cv2.MORPH_OPEN, horizontal_kernel, iterations=2
1946 |         )
1947 |         cnts_h, _ = cv2.findContours(
1948 |             detected_horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
1949 |         )
1950 | 
1951 |         # 3. Detect Vertical Lines
1952 |         vertical_kernel = cv2.getStructuringElement(
1953 |             cv2.MORPH_RECT, (1, min(40, gray.shape[0] // 10))
1954 |         )  # Kernel size relative to height
1955 |         detected_vertical = cv2.morphologyEx(
1956 |             thresh_inv, cv2.MORPH_OPEN, vertical_kernel, iterations=2
1957 |         )
1958 |         cnts_v, _ = cv2.findContours(detected_vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
1959 | 
1960 |         # 4. Combine contours or find bounding boxes of large contours containing lines
1961 |         # Strategy: Find large contours in the original inverted threshold image,
1962 |         # then check if those contours contain significant horiz/vert lines.
1963 |         contours, _ = cv2.findContours(thresh_inv, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
1964 | 
1965 |         table_regions = []
1966 |         img_area = img.shape[0] * img.shape[1]
1967 |         min_table_area = img_area * 0.01  # Lower threshold slightly (1%)
1968 |         min_dimension = 50  # Min width/height for a contour to be considered
1969 | 
1970 |         for contour in contours:
1971 |             x, y, w, h = cv2.boundingRect(contour)
1972 |             area = w * h
1973 |             aspect_ratio = w / max(1, h)
1974 | 
1975 |             # Basic filtering based on size and aspect ratio
1976 |             if (
1977 |                 area > min_table_area
1978 |                 and w > min_dimension
1979 |                 and h > min_dimension
1980 |                 and 0.1 < aspect_ratio < 10.0
1981 |             ):
1982 |                 # Check for significant presence of detected lines within this bounding box
1983 |                 roi_h = detected_horizontal[y : y + h, x : x + w]
1984 |                 roi_v = detected_vertical[y : y + h, x : x + w]
1985 |                 # Heuristic: Check if non-zero pixels (lines) exceed a small fraction of the ROI area or length
1986 |                 min_line_pixels_h = w * 0.3  # Require horizontal lines covering ~30% width
1987 |                 min_line_pixels_v = h * 0.3  # Require vertical lines covering ~30% height
1988 |                 if (
1989 |                     cv2.countNonZero(roi_h) > min_line_pixels_h
1990 |                     and cv2.countNonZero(roi_v) > min_line_pixels_v
1991 |                 ):
1992 |                     table_regions.append((x, y, w, h))
1993 |                 # else:
1994 |                 #    logger.debug(f"Contour rejected: area={area}, w={w}, h={h}, h_px={cv2.countNonZero(roi_h)}, v_px={cv2.countNonZero(roi_v)}")
1995 | 
1996 |         # Optional: Merge overlapping bounding boxes (omitted for simplicity)
1997 |         # merged_regions = merge_overlapping_boxes(table_regions) # Needs implementation
1998 | 
1999 |         logger.debug(f"Detected {len(table_regions)} potential table regions.")
2000 |         return table_regions
2001 | 
2002 |     except Exception as e:
2003 |         logger.error(f"OpenCV Table detection failed: {e}", exc_info=True)
2004 |         return []
2005 | 
2006 | 
2007 | def _ocr_process_toc(toc: List) -> List[Dict[str, Any]]:
2008 |     """Processes a PDF table of contents (from PyMuPDF) into a nested structure."""
2009 |     if not toc:
2010 |         return []
2011 |     result: List[Dict[str, Any]] = []
2012 |     # Stack stores tuples: (level, parent_list_to_append_to)
2013 |     stack: List[Tuple[int, List]] = [(-1, result)]
2014 |     for item in toc:
2015 |         # PyMuPDF TOC item format: [level, title, page, ?dest_dict]
2016 |         if not isinstance(item, (list, tuple)) or len(item) < 3:
2017 |             logger.warning(f"Skipping malformed TOC item: {item}")
2018 |             continue
2019 |         try:
2020 |             level = int(item[0])
2021 |             title = str(item[1])
2022 |             page = int(item[2])
2023 |         except (ValueError, TypeError, IndexError) as e:
2024 |             logger.warning(f"Error parsing TOC item '{item}': {e}")
2025 |             continue
2026 | 
2027 |         # Pop stack until parent level is found
2028 |         while stack[-1][0] >= level:
2029 |             stack.pop()
2030 |             if not stack:  # Should not happen with initial (-1, result)
2031 |                 logger.error("TOC stack became empty unexpectedly.")
2032 |                 return result  # Return what we have so far
2033 | 
2034 |         # Create new entry and add to parent's children list
2035 |         entry: Dict[str, Any] = {"title": title, "page": page, "children": []}
2036 |         stack[-1][1].append(entry)
2037 |         # Push current entry onto stack for potential children
2038 |         stack.append((level, entry["children"]))
2039 |     return result
2040 | 
2041 | 
2042 | def _ocr_split_text_into_chunks(
2043 |     text: str, max_chunk_size: int = 8000, overlap: int = 200
2044 | ) -> List[str]:
2045 |     """Splits text into chunks, trying to respect paragraphs and sentences (sync function)."""
2046 |     if not text or not isinstance(text, str):
2047 |         return []
2048 | 
2049 |     max_chunk_size = max(1000, min(max_chunk_size, 15000))  # Sensible limits
2050 |     overlap = max(50, min(overlap, max_chunk_size // 4))
2051 |     # Ensure min_chunk_size is reasonable, at least larger than overlap
2052 |     min_chunk_size = max(overlap * 2, 100)
2053 | 
2054 |     chunks = []
2055 |     start_index = 0
2056 |     text_len = len(text)
2057 | 
2058 |     while start_index < text_len:
2059 |         end_index = min(start_index + max_chunk_size, text_len)
2060 | 
2061 |         # Handle the last chunk directly
2062 |         if end_index == text_len:
2063 |             chunk = text[start_index:end_index]
2064 |             # Only add if it has meaningful content (more than just whitespace)
2065 |             if chunk.strip():
2066 |                 chunks.append(chunk)
2067 |             break  # End of text reached
2068 | 
2069 |         best_split_index = -1
2070 |         # Prefer double newline (paragraph break)
2071 |         split_point_para = text.rfind("\n\n", max(start_index, end_index - overlap * 2), end_index)
2072 |         if split_point_para != -1 and split_point_para > start_index:  # Ensure split is after start
2073 |             # Check if this split results in a reasonably sized chunk
2074 |             if (split_point_para + 2 - start_index) >= min_chunk_size:
2075 |                 best_split_index = split_point_para + 2
2076 | 
2077 |         # If no good paragraph break, try sentence breaks
2078 |         if best_split_index == -1:
2079 |             sentence_break_pattern = r"[.?!]['\"]?(\s|\n|$)"  # Include end of string
2080 |             # Search within a reasonable lookback window
2081 |             search_region_start = max(start_index, end_index - overlap)
2082 |             search_region = text[search_region_start:end_index]
2083 |             matches = list(re.finditer(sentence_break_pattern, search_region))
2084 |             if matches:
2085 |                 # Find the offset of the last match within the search region
2086 |                 last_match_end_offset = matches[-1].end()
2087 |                 # Calculate the split point relative to the original string
2088 |                 split_point_sentence = search_region_start + last_match_end_offset
2089 |                 # Check if this split is valid and creates a reasonably sized chunk
2090 |                 if (
2091 |                     split_point_sentence > start_index
2092 |                     and (split_point_sentence - start_index) >= min_chunk_size
2093 |                 ):
2094 |                     best_split_index = split_point_sentence
2095 | 
2096 |         # Fallback to single newline or space if still no good break
2097 |         if best_split_index == -1:
2098 |             split_point_newline = text.rfind("\n", max(start_index, end_index - overlap), end_index)
2099 |             split_point_space = text.rfind(" ", max(start_index, end_index - overlap), end_index)
2100 |             # Choose the latest valid break (newline or space)
2101 |             split_point_fallback = max(split_point_newline, split_point_space)
2102 |             if (
2103 |                 split_point_fallback > start_index
2104 |                 and (split_point_fallback + 1 - start_index) >= min_chunk_size
2105 |             ):
2106 |                 best_split_index = split_point_fallback + 1
2107 | 
2108 |         # Force split at max_chunk_size boundary if no suitable break found,
2109 |         # or if the best found break is too early (making the chunk too small)
2110 |         if (
2111 |             best_split_index <= start_index
2112 |             or (best_split_index - start_index) < min_chunk_size // 2
2113 |         ):
2114 |             # Check if simply taking end_index results in a valid chunk start for next iteration
2115 |             potential_next_start = max(start_index + 1, end_index - overlap)
2116 |             if potential_next_start < text_len:  # Avoid forcing if it's the last chunk anyway
2117 |                 best_split_index = end_index
2118 |             else:  # If forcing split here would make the loop end, try a slightly earlier hard split?
2119 |                 # For simplicity, let's stick to end_index, the loop termination handles the last part.
2120 |                 best_split_index = end_index
2121 | 
2122 |         # Extract the chunk
2123 |         chunk = text[start_index:best_split_index]
2124 |         if chunk.strip():  # Only add non-empty chunks
2125 |             chunks.append(chunk)
2126 | 
2127 |         # Calculate the start index for the next chunk
2128 |         next_start = max(start_index + 1, best_split_index - overlap)
2129 | 
2130 |         # Ensure substantial forward progress to avoid infinite loops on edge cases
2131 |         # Use max_chunk_size here instead of the undefined 'size'
2132 |         min_progress = min(max_chunk_size // 10, 50)  # Ensure we advance by at least a small amount
2133 |         next_start = max(next_start, start_index + min_progress)
2134 | 
2135 |         # Safety check: don't let next_start go beyond the text length
2136 |         start_index = min(next_start, text_len)
2137 | 
2138 |     # Filter out any potential empty strings added during edge cases
2139 |     final_chunks = [c for c in chunks if c]
2140 | 
2141 |     logger.debug(f"Split text ({text_len} chars) into {len(final_chunks)} chunks")
2142 |     return final_chunks
2143 | 
2144 | 
2145 | async def _ocr_assess_text_quality(original_text: str, enhanced_text: str) -> Dict[str, Any]:
2146 |     """Assesses the quality of OCR enhancement using LLM (Standalone)."""
2147 |     if not original_text and not enhanced_text:
2148 |         return {"score": 0, "explanation": "No text provided for assessment.", "examples": []}
2149 |     if not original_text:
2150 |         return {
2151 |             "score": 100,
2152 |             "explanation": "Original text was empty, enhanced text provided.",
2153 |             "examples": [],
2154 |         }
2155 |     if not enhanced_text:
2156 |         return {
2157 |             "score": 0,
2158 |             "explanation": "Enhanced text is empty, original text was not.",
2159 |             "examples": [],
2160 |         }
2161 | 
2162 |     max_sample = 4000
2163 |     original_sample = original_text[:max_sample] + (
2164 |         "..." if len(original_text) > max_sample else ""
2165 |     )
2166 |     enhanced_sample = enhanced_text[:max_sample] + (
2167 |         "..." if len(enhanced_text) > max_sample else ""
2168 |     )
2169 | 
2170 |     prompt = f"""Please assess the quality improvement from the 'Original OCR Text' to the 'Enhanced Text'. Focus on:
2171 | 1. Correction of OCR errors (typos, spacing, broken words).
2172 | 2. Improvement in formatting and readability (paragraphs, lists, structure).
2173 | 3. Accuracy in preserving the original meaning and content.
2174 | 4. Effectiveness of removing noise (like headers/footers if applicable).
2175 | 
2176 | Original OCR Text:
2177 | ```
2178 | {original_sample}
2179 | ```
2180 | 
2181 | Enhanced Text:
2182 | ```
2183 | {enhanced_sample}
2184 | ```
2185 | 
2186 | Provide your assessment ONLY in the following JSON format:
2187 | {{
2188 |   "score": <integer score 0-100, where 100 is perfect enhancement>,
2189 |   "explanation": "<brief explanation of the score, highlighting key improvements or remaining issues>",
2190 |   "examples": [
2191 |     "<example 1 of a specific correction or improvement>",
2192 |     "<example 2>",
2193 |     "<example 3 (optional)>"
2194 |   ]
2195 | }}
2196 | Do not add any text before or after the JSON object.
2197 | """
2198 | 
2199 |     try:
2200 |         logger.debug("Requesting LLM quality assessment.")
2201 |         assessment_json_str = await _standalone_llm_call(
2202 |             prompt=prompt, max_tokens=500, temperature=0.2
2203 |         )
2204 |         try:
2205 |             json_match = re.search(r"```(?:json)?\s*([\s\S]+?)\s*```", assessment_json_str)
2206 |             json_str = json_match.group(1).strip() if json_match else assessment_json_str.strip()
2207 |             start_brace = json_str.find("{")
2208 |             end_brace = json_str.rfind("}")
2209 |             if start_brace != -1 and end_brace != -1 and start_brace < end_brace:
2210 |                 json_str = json_str[start_brace : end_brace + 1]
2211 |             elif not json_str.startswith("{"):
2212 |                 raise ValueError("Could not find JSON object boundaries.")
2213 | 
2214 |             assessment_data = json.loads(json_str)
2215 |             if (
2216 |                 not isinstance(assessment_data, dict)
2217 |                 or "score" not in assessment_data
2218 |                 or "explanation" not in assessment_data
2219 |                 or "examples" not in assessment_data
2220 |                 or not isinstance(assessment_data["examples"], list)
2221 |             ):
2222 |                 raise ValueError("Parsed JSON has incorrect structure.")
2223 |             try:
2224 |                 assessment_data["score"] = (
2225 |                     int(assessment_data["score"]) if assessment_data["score"] is not None else None
2226 |                 )
2227 |             except (ValueError, TypeError):
2228 |                 assessment_data["score"] = None
2229 |             assessment_data["explanation"] = str(assessment_data["explanation"])
2230 |             assessment_data["examples"] = [str(ex) for ex in assessment_data["examples"]]
2231 |             logger.debug(
2232 |                 f"Quality assessment received: Score {assessment_data.get('score', 'N/A')}"
2233 |             )
2234 |             return assessment_data
2235 |         except (json.JSONDecodeError, ValueError, TypeError) as e:
2236 |             logger.error(
2237 |                 f"Failed to parse quality assessment JSON: {e}. Raw:\n{assessment_json_str}"
2238 |             )
2239 |             return {
2240 |                 "score": None,
2241 |                 "explanation": f"Parse failed: {e}",
2242 |                 "examples": [],
2243 |                 "raw_response": assessment_json_str,
2244 |             }
2245 |     except Exception as e:
2246 |         logger.error(f"Error during LLM quality assessment call: {e}", exc_info=True)
2247 |         return {"score": None, "explanation": f"LLM call failed: {e}", "examples": []}
2248 | 
2249 | 
2250 | # --- Fallback Conversion Helpers (module level) ---
2251 | async def _fallback_convert_pdf(file_path: Path) -> Dict[str, Any]:
2252 |     """Basic PDF conversion using PyPDF2."""
2253 |     _ocr_check_dep("PyPDF2", _PYPDF2_AVAILABLE, "Basic PDF Fallback Conversion")
2254 |     try:
2255 |         logger.info(f"Using PyPDF2 fallback for PDF: {file_path}")
2256 |         content = ""
2257 |         metadata: Dict[str, Any] = {"is_fallback": True}
2258 |         num_pages = 0
2259 |         if PyPDF2 is None:
2260 |             raise ImportError("PyPDF2 object is None despite _PYPDF2_AVAILABLE=True")
2261 |         with open(file_path, "rb") as f:
2262 |             try:
2263 |                 reader = PyPDF2.PdfReader(f)
2264 |                 num_pages = len(reader.pages)
2265 |                 metadata["num_pages"] = num_pages
2266 |                 pages = []
2267 |                 for i in range(num_pages):
2268 |                     try:
2269 |                         page_text = reader.pages[i].extract_text() or ""
2270 |                         pages.append(page_text)
2271 |                     except Exception as page_err:
2272 |                         logger.warning(
2273 |                             f"PyPDF2 failed to extract text from page {i + 1}: {page_err}"
2274 |                         )
2275 |                         pages.append(f"[Page {i + 1} Extraction Error]")
2276 |                 content = "\n\n".join(pages)
2277 |             except PyPDF2.errors.PdfReadError as pdf_err:
2278 |                 logger.error(f"PyPDF2 could not read PDF {file_path}: {pdf_err}")
2279 |                 raise ToolError(
2280 |                     "PDF_READ_ERROR", details={"library": "PyPDF2", "error": str(pdf_err)}
2281 |                 ) from pdf_err
2282 |         metadata.update(_get_basic_metadata(content, num_pages))
2283 |         return {"content": content, "metadata": metadata}
2284 |     except Exception as e:
2285 |         logger.error(f"PyPDF2 fallback failed unexpectedly: {e}", exc_info=True)
2286 |         raise ToolError(
2287 |             "CONVERSION_FAILED",
2288 |             details={"file": str(file_path), "method": "PyPDF2 Fallback", "error": str(e)},
2289 |         ) from e
2290 | 
2291 | 
2292 | async def _fallback_convert_docx(file_path: Path) -> Dict[str, Any]:
2293 |     """Basic DOCX conversion using python-docx."""
2294 |     _ocr_check_dep("python-docx", _DOCX_AVAILABLE, "DOCX Fallback Conversion")
2295 |     try:
2296 |         logger.info(f"Using python-docx fallback for DOCX: {file_path}")
2297 |         if docx is None:
2298 |             raise ImportError("docx object is None despite _DOCX_AVAILABLE=True")
2299 |         doc = docx.Document(file_path)
2300 |         paragraphs = [para.text for para in doc.paragraphs if para.text]
2301 |         content = "\n\n".join(paragraphs)
2302 |         metadata: Dict[str, Any] = {
2303 |             "num_pages": 0,
2304 |             "has_tables": len(doc.tables) > 0,
2305 |             "has_figures": len(doc.inline_shapes) > 0,
2306 |             "has_sections": len(doc.sections) > 0,
2307 |             "is_fallback": True,
2308 |         }
2309 |         metadata.update(_get_basic_metadata(content))
2310 |         return {"content": content, "metadata": metadata}
2311 |     except Exception as e:
2312 |         logger.error(f"python-docx fallback failed: {e}", exc_info=True)
2313 |         raise ToolError(
2314 |             "CONVERSION_FAILED",
2315 |             details={"file": str(file_path), "method": "python-docx Fallback", "error": str(e)},
2316 |         ) from e
2317 | 
2318 | 
2319 | async def _fallback_convert_text(file_path: Path) -> Dict[str, Any]:
2320 |     """Simple text file reading."""
2321 |     try:
2322 |         logger.info(f"Reading text file directly: {file_path}")
2323 |         content = file_path.read_text(encoding="utf-8", errors="replace")
2324 |         line_count = content.count("\n") + 1
2325 |         page_estimate = max(1, int(line_count / 50))
2326 |         metadata = {"num_pages": page_estimate, "is_fallback": True}
2327 |         metadata.update(_get_basic_metadata(content, page_estimate))
2328 |         return {"content": content, "metadata": metadata}
2329 |     except Exception as e:
2330 |         logger.error(f"Text file reading failed: {e}", exc_info=True)
2331 |         raise ToolError(
2332 |             "CONVERSION_FAILED",
2333 |             details={"file": str(file_path), "method": "Direct Text Read", "error": str(e)},
2334 |         ) from e
2335 | 
2336 | 
2337 | ###############################################################################
2338 | # Standalone Tool Functions (Exportable)                                      #
2339 | ###############################################################################
2340 | 
2341 | 
2342 | # ------------------------ Document Conversion -----------------------------
2343 | @with_tool_metrics
2344 | @with_error_handling
2345 | async def convert_document(
2346 |     document_path: Optional[str] = None,
2347 |     document_data: Optional[bytes] = None,
2348 |     output_format: str = "markdown",
2349 |     extraction_strategy: str = DEFAULT_EXTRACTION_STRATEGY,
2350 |     enhance_with_llm: bool = True,
2351 |     ocr_options: Optional[Dict] = None,
2352 |     output_path: Optional[str] = None,
2353 |     save_to_file: bool = False,
2354 |     page_range: Optional[str] = None,
2355 |     section_filter: Optional[str] = None,
2356 |     accelerator_device: str = "auto",
2357 |     num_threads: int = 4,
2358 | ) -> Dict[str, Any]:
2359 |     """
2360 |     Convert documents (PDF, Office formats, Images) to various formats (Standalone Function).
2361 |     (Args/Returns docs same as original class method)
2362 |     """
2363 |     t0 = time.time()
2364 |     strategy = extraction_strategy.lower()
2365 |     output_format = output_format.lower()
2366 |     ocr_options = ocr_options or {}
2367 | 
2368 |     # --- Input Validation ---
2369 |     if not document_path and not document_data:
2370 |         raise ToolInputError("Either 'document_path' or 'document_data' must be provided.")
2371 |     if document_path and document_data:
2372 |         raise ToolInputError("Provide either 'document_path' or 'document_data', not both.")
2373 |     if strategy not in _VALID_EXTRACTION_STRATEGIES:
2374 |         raise ToolInputError(
2375 |             f"Invalid extraction_strategy. Choose from: {', '.join(_VALID_EXTRACTION_STRATEGIES)}",
2376 |             param_name="extraction_strategy",
2377 |             provided_value=strategy,
2378 |         )
2379 |     if output_format not in _VALID_FORMATS:
2380 |         raise ToolInputError(
2381 |             f"Invalid output_format. Choose from: {', '.join(_VALID_FORMATS)}",
2382 |             param_name="output_format",
2383 |             provided_value=output_format,
2384 |         )
2385 | 
2386 |     # --- Dependency Checks based on strategy ---
2387 |     if strategy == "docling":
2388 |         _ocr_check_dep("docling", _DOCLING_AVAILABLE, "Docling extraction strategy")
2389 |     if strategy in ["direct_text", "hybrid_direct_ocr"]:
2390 |         if not (_PYMUPDF_AVAILABLE or _PDFPLUMBER_AVAILABLE):
2391 |             raise ToolError(
2392 |                 "DEPENDENCY_MISSING",
2393 |                 details={
2394 |                     "dependency": "PyMuPDF or PDFPlumber",
2395 |                     "feature": "Direct Text strategy",
2396 |                 },
2397 |             )
2398 |     if strategy in ["ocr", "hybrid_direct_ocr"]:
2399 |         _ocr_check_dep("pdf2image", _PDF2IMAGE_AVAILABLE, "OCR strategy")
2400 |         _ocr_check_dep("pytesseract", _PYTESSERACT_AVAILABLE, "OCR strategy")
2401 |         _ocr_check_dep("Pillow", _PIL_AVAILABLE, "OCR strategy")
2402 |         if ocr_options.get("preprocessing") and not (_CV2_AVAILABLE and _NUMPY_AVAILABLE):
2403 |             logger.warning(
2404 |                 "Preprocessing options provided but OpenCV/NumPy missing. Preprocessing limited."
2405 |             )
2406 | 
2407 |     # Adjust output format compatibility
2408 |     effective_output_format = output_format
2409 |     if strategy != "docling" and output_format not in _OCR_COMPATIBLE_FORMATS:
2410 |         logger.warning(
2411 |             f"Output format '{output_format}' is not directly supported by strategy '{strategy}'. Defaulting to 'markdown'."
2412 |         )
2413 |         effective_output_format = "markdown"
2414 | 
2415 |     # --- Prepare Input ---
2416 |     input_path_obj: Optional[Path] = None
2417 |     is_temp_file = False
2418 |     input_name: str = "input_data"
2419 | 
2420 |     try:
2421 |         input_path_obj, is_temp_file = _get_input_path_or_temp(document_path, document_data)
2422 |         input_name = input_path_obj.name
2423 | 
2424 |         with _handle_temp_file(input_path_obj, is_temp_file) as current_input_path:
2425 |             input_suffix = current_input_path.suffix.lower()
2426 |             is_pdf = input_suffix == ".pdf"
2427 |             is_image = input_suffix in [".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif"]
2428 |             is_office = input_suffix in [
2429 |                 ".docx",
2430 |                 ".pptx",
2431 |                 ".xlsx",
2432 |                 ".zip",
2433 |             ]  # Treat zip as potential office
2434 |             is_text = input_suffix in [".txt", ".md", ".html", ".xml", ".json"]
2435 | 
2436 |             # Validate strategy vs input type & adjust strategy if needed
2437 |             if not is_pdf and strategy in ["direct_text", "hybrid_direct_ocr"]:
2438 |                 if is_image:
2439 |                     logger.warning(f"Strategy '{strategy}' needs PDF. Input is image. Using 'ocr'.")
2440 |                     strategy = "ocr"
2441 |                 else:
2442 |                     raise ToolInputError(
2443 |                         f"Strategy '{strategy}' requires PDF input, got '{input_suffix}'."
2444 |                     )
2445 |             if not is_pdf and not is_image and strategy == "ocr":
2446 |                 raise ToolInputError(
2447 |                     f"OCR strategy needs PDF/Image, got '{input_suffix}'. Use 'docling' or handle as text."
2448 |                 )
2449 |             if is_office and strategy != "docling":
2450 |                 if input_suffix == ".docx" and _DOCX_AVAILABLE:
2451 |                     logger.warning("Input is DOCX without 'docling'. Using fallback.")
2452 |                     strategy = "fallback_docx"
2453 |                 # Add other office fallbacks here if needed
2454 |                 else:
2455 |                     raise ToolInputError(
2456 |                         f"Office file ('{input_suffix}') requires 'docling' strategy or specific fallback library."
2457 |                     )
2458 |             if is_text and strategy != "docling":
2459 |                 logger.info(f"Input is text ('{input_suffix}'). Using direct text handling.")
2460 |                 strategy = "fallback_text"
2461 | 
2462 |             # --- Parse Page Range ---
2463 |             pages_to_process: Optional[List[int]] = None
2464 |             total_doc_pages = 0
2465 |             if page_range:
2466 |                 try:
2467 |                     pages_set: Set[int] = set()
2468 |                     parts = page_range.split(",")
2469 |                     for part in parts:
2470 |                         part = part.strip()
2471 |                         if "-" in part:
2472 |                             start_str, end_str = part.split("-", 1)
2473 |                             start, end = int(start_str), int(end_str)
2474 |                             if start < 1 or end < start:
2475 |                                 raise ValueError(f"Invalid range: {start}-{end}")
2476 |                             pages_set.update(range(start - 1, end))
2477 |                         else:
2478 |                             page_num = int(part)
2479 |                             if page_num < 1:
2480 |                                 raise ValueError(f"Page number must be positive: {page_num}")
2481 |                             pages_set.add(page_num - 1)
2482 |                     if not pages_set:
2483 |                         raise ValueError("No valid pages selected.")
2484 |                     pages_to_process = sorted(list(pages_set))
2485 |                     logger.debug(
2486 |                         f"Parsed page range: {page_range} -> 0-based indices: {pages_to_process}"
2487 |                     )
2488 |                 except ValueError as e:
2489 |                     raise ToolInputError(
2490 |                         f"Invalid page_range format: '{page_range}'. Error: {e}",
2491 |                         param_name="page_range",
2492 |                     ) from e
2493 | 
2494 |             # --- Result Structure Defaults ---
2495 |             result_content: Union[str, Dict] = ""
2496 |             doc_metadata: Dict[str, Any] = {}
2497 |             raw_text_pages: List[str] = []
2498 |             final_raw_text: Optional[str] = None
2499 |             quality_metrics: Optional[Dict] = None
2500 |             strategy_used = strategy
2501 | 
2502 |             # ======================== EXTRACTION STRATEGIES ========================
2503 | 
2504 |             if strategy == "docling":
2505 |                 logger.info(f"Using 'docling' strategy for {input_name}")
2506 |                 _ocr_check_dep("docling", _DOCLING_AVAILABLE, "Docling strategy")
2507 |                 device_str = accelerator_device.lower()
2508 |                 if device_str not in _ACCEL_MAP:
2509 |                     logger.warning(f"Invalid device '{device_str}', using 'auto'.")
2510 |                     device_str = "auto"
2511 |                 device = _ACCEL_MAP[device_str]
2512 |                 conv = _get_docling_converter(device, num_threads)
2513 |                 loop = asyncio.get_running_loop()
2514 |                 with _span("docling_conversion"):
2515 |                     docling_result = await loop.run_in_executor(
2516 |                         None, conv.convert, current_input_path
2517 |                     )
2518 |                 if not docling_result or not docling_result.document:
2519 |                     raise ToolError("CONVERSION_FAILED", details={"reason": "Docling empty result"})
2520 |                 doc_obj = docling_result.document
2521 |                 doc_metadata = _get_docling_metadata(doc_obj)
2522 |                 total_doc_pages = doc_metadata.get("num_pages", 0)
2523 | 
2524 |                 if effective_output_format == "markdown":
2525 |                     result_content = doc_obj.export_to_markdown()
2526 |                 elif effective_output_format == "text":
2527 |                     result_content = doc_obj.export_to_text()
2528 |                 elif effective_output_format == "html":
2529 |                     result_content = doc_obj.export_to_html()
2530 |                 elif effective_output_format == "json":
2531 |                     result_content = _json(doc_obj.export_to_dict())
2532 |                 elif effective_output_format == "doctags":
2533 |                     result_content = doc_obj.export_to_doctags()
2534 |                 else:
2535 |                     logger.warning(
2536 |                         f"Unsupported format '{effective_output_format}' for Docling, using markdown."
2537 |                     )
2538 |                     result_content = doc_obj.export_to_markdown()
2539 |                     effective_output_format = "markdown"
2540 | 
2541 |                 if save_to_file:
2542 |                     fp = (
2543 |                         Path(output_path)
2544 |                         if output_path
2545 |                         else _tmp_path(str(current_input_path), effective_output_format)
2546 |                     )
2547 |                     fp.parent.mkdir(parents=True, exist_ok=True)
2548 |                     img_mode = (
2549 |                         _ImageRefModeType.PLACEHOLDER
2550 |                         if effective_output_format in ["text", "json"]
2551 |                         else _ImageRefModeType.REFERENCED
2552 |                     )
2553 |                     save_func_map = {
2554 |                         "markdown": functools.partial(
2555 |                             doc_obj.save_as_markdown, image_mode=img_mode
2556 |                         ),
2557 |                         "text": functools.partial(doc_obj.save_as_markdown(strict_text=True)),
2558 |                         "html": functools.partial(doc_obj.save_as_html, image_mode=img_mode),
2559 |                         "json": functools.partial(doc_obj.save_as_json, image_mode=img_mode),
2560 |                         "doctags": functools.partial(doc_obj.save_as_doctags),
2561 |                     }
2562 |                     save_func = save_func_map.get(effective_output_format)
2563 |                     if save_func and callable(save_func):
2564 |                         with _span(f"docling_save_{effective_output_format}"):
2565 |                             save_func(fp)
2566 |                         logger.info(f"Saved Docling output ({effective_output_format}) to {fp}")
2567 |                         doc_metadata["saved_output_path"] = str(fp)
2568 |                     else:
2569 |                         fp.write_text(str(result_content), encoding="utf-8")
2570 |                         logger.info(f"Saved Docling output (generic text write) to {fp}")
2571 |                         doc_metadata["saved_output_path"] = str(fp)
2572 | 
2573 |             elif strategy.startswith("fallback_"):
2574 |                 fallback_type = strategy.split("_", 1)[1]
2575 |                 logger.info(f"Using fallback strategy for: {fallback_type}")
2576 |                 fallback_result: Optional[Dict[str, Any]] = None
2577 |                 if fallback_type == "docx":
2578 |                     fallback_result = await _fallback_convert_docx(current_input_path)
2579 |                 elif fallback_type == "pdf":
2580 |                     fallback_result = await _fallback_convert_pdf(current_input_path)
2581 |                 elif fallback_type == "text":
2582 |                     fallback_result = await _fallback_convert_text(current_input_path)
2583 |                 if fallback_result:
2584 |                     raw_text_pages = [fallback_result.get("content", "")]
2585 |                     doc_metadata = fallback_result.get("metadata", {})
2586 |                     total_doc_pages = doc_metadata.get("num_pages", 1)
2587 |                     strategy_used = f"fallback_{fallback_type}"
2588 |                 else:
2589 |                     raise ToolError(
2590 |                         "CONVERSION_FAILED",
2591 |                         details={"reason": f"Fallback '{fallback_type}' failed."},
2592 |                     )
2593 | 
2594 |             else:  # Text/OCR strategies
2595 |                 run_ocr = False
2596 |                 run_direct = False
2597 |                 if strategy == "direct_text":
2598 |                     run_direct = True
2599 |                 elif strategy == "ocr":
2600 |                     run_ocr = True
2601 |                 elif strategy == "hybrid_direct_ocr":
2602 |                     if not is_pdf:
2603 |                         run_ocr = True
2604 |                         strategy_used = "ocr"
2605 |                         logger.info("Input is image, using 'ocr'.")
2606 |                     else:
2607 |                         run_direct = True
2608 |                 extract_start_page = pages_to_process[0] if pages_to_process else 0
2609 |                 extract_page_count = len(pages_to_process) if pages_to_process else 0
2610 | 
2611 |                 if run_direct:
2612 |                     logger.info(f"Attempting 'direct_text' strategy for {input_name}")
2613 |                     try:
2614 |                         with _span("direct_text_extraction"):
2615 |                             (
2616 |                                 extracted_pages,
2617 |                                 has_meaningful_text,
2618 |                             ) = await asyncio.to_thread(  # Use helper defined above
2619 |                                 _ocr_extract_text_from_pdf_direct,
2620 |                                 current_input_path,
2621 |                                 start_page=extract_start_page,
2622 |                                 max_pages=extract_page_count,
2623 |                             )
2624 |                         total_doc_pages = len(
2625 |                             extracted_pages
2626 |                         )  # Page count reflects extracted range
2627 |                         if strategy == "hybrid_direct_ocr" and not has_meaningful_text:
2628 |                             logger.warning("Direct text minimal. Falling back to OCR.")
2629 |                             run_ocr = True
2630 |                             strategy_used = "ocr"
2631 |                         elif not has_meaningful_text and strategy == "direct_text":
2632 |                             raise ToolError(
2633 |                                 "DIRECT_EXTRACTION_FAILED",
2634 |                                 details={"reason": "No meaningful text found."},
2635 |                             )
2636 |                         else:
2637 |                             raw_text_pages = extracted_pages
2638 |                             logger.info(f"Direct text success: {len(raw_text_pages)} pages.")
2639 |                     except ToolError as e:
2640 |                         if strategy == "hybrid_direct_ocr":
2641 |                             logger.warning(f"Direct failed ({e.error_code}). Falling back to OCR.")
2642 |                             run_ocr = True
2643 |                             strategy_used = "ocr"
2644 |                         else:
2645 |                             raise e
2646 |                     except Exception as e_direct:
2647 |                         logger.error(f"Unexpected direct text error: {e_direct}", exc_info=True)
2648 |                         if strategy == "hybrid_direct_ocr":
2649 |                             logger.warning("Direct failed. Falling back to OCR.")
2650 |                             run_ocr = True
2651 |                             strategy_used = "ocr"
2652 |                         else:
2653 |                             raise ToolError(
2654 |                                 "DIRECT_EXTRACTION_FAILED", details={"error": str(e_direct)}
2655 |                             ) from e_direct
2656 | 
2657 |                 if run_ocr:
2658 |                     logger.info(f"Using 'ocr' strategy for {input_name}")
2659 |                     strategy_used = "ocr"
2660 |                     ocr_lang = ocr_options.get("language", "eng")
2661 |                     ocr_dpi = ocr_options.get("dpi", 300)
2662 |                     ocr_prep_opts = ocr_options.get("preprocessing")
2663 |                     images: List["PILImage.Image"] = []
2664 |                     if is_pdf:
2665 |                         convert_func = _ocr_convert_pdf_to_images  # Use helper defined above
2666 |                         with _span("pdf_to_images"):
2667 |                             images = await asyncio.to_thread(
2668 |                                 convert_func,
2669 |                                 current_input_path,
2670 |                                 start_page=extract_start_page,
2671 |                                 max_pages=extract_page_count,
2672 |                                 dpi=ocr_dpi,
2673 |                             )
2674 |                         total_doc_pages = len(images)
2675 |                     elif is_image:
2676 |                         _ocr_check_dep("Pillow", _PIL_AVAILABLE, "Image loading")
2677 |                         if Image is None:
2678 |                             raise ToolError(
2679 |                                 "INTERNAL_ERROR", details={"reason": "PIL.Image is None"}
2680 |                             )
2681 |                         with _span(f"load_image_{input_name}"):
2682 |                             img = Image.open(current_input_path)  # type: ignore
2683 |                         images = [img.convert("RGB")]
2684 |                         total_doc_pages = 1
2685 |                         img.close()  # Close after converting
2686 |                     if not images:
2687 |                         raise ToolError("OCR_FAILED", details={"reason": "No images for OCR."})
2688 | 
2689 |                     processed_pages_text: List[str] = [""] * len(images)
2690 | 
2691 |                     async def _process_ocr_page_worker(
2692 |                         idx: int, img: "PILImage.Image"
2693 |                     ) -> Tuple[int, str]:
2694 |                         try:
2695 |                             loop = asyncio.get_running_loop()
2696 |                             with _span(f"ocr_page_{idx}_preprocess"):
2697 |                                 prep_img = await loop.run_in_executor(
2698 |                                     None, _ocr_preprocess_image, img, ocr_prep_opts
2699 |                                 )
2700 |                             with _span(f"ocr_page_{idx}_tesseract"):
2701 |                                 text = await loop.run_in_executor(
2702 |                                     None,
2703 |                                     _ocr_run_tesseract,
2704 |                                     prep_img,
2705 |                                     ocr_lang,
2706 |                                     ocr_options.get("tesseract_config", ""),
2707 |                                 )  # Use helper defined above
2708 |                             if prep_img != img:
2709 |                                 prep_img.close()  # Close preprocessed image if different
2710 |                             return idx, text
2711 |                         except Exception as page_err:
2712 |                             logger.error(
2713 |                                 f"OCR page {idx + extract_start_page} error: {page_err}",
2714 |                                 exc_info=True,
2715 |                             )
2716 |                             return idx, f"[Page {idx + extract_start_page + 1} OCR Error]"
2717 |                         finally:
2718 |                             img.close()  # Close the original image passed to worker
2719 | 
2720 |                     tasks = [_process_ocr_page_worker(i, img) for i, img in enumerate(images)]
2721 |                     page_results = await asyncio.gather(*tasks)
2722 |                     for idx, text in page_results:
2723 |                         processed_pages_text[idx] = text
2724 |                     raw_text_pages = processed_pages_text
2725 |                     logger.info(f"OCR extraction successful for {len(raw_text_pages)} pages.")
2726 | 
2727 |             # --- Stage 2 & 3 (Post-processing for non-Docling) ---
2728 |             if strategy != "docling":
2729 |                 if not raw_text_pages:
2730 |                     raise ToolError(
2731 |                         "EXTRACTION_FAILED",
2732 |                         details={"reason": f"Strategy '{strategy_used}' yielded no text."},
2733 |                     )
2734 |                 final_raw_text = "\n\n".join(raw_text_pages).strip()
2735 |                 if section_filter and final_raw_text:
2736 |                     try:
2737 |                         pat = re.compile(section_filter, re.I | re.M)
2738 |                         blocks = re.split(r"(\n\s*\n)", final_raw_text)
2739 |                         kept_content = ""
2740 |                         for i in range(0, len(blocks), 2):
2741 |                             block = blocks[i]
2742 |                             separator = blocks[i + 1] if i + 1 < len(blocks) else ""
2743 |                         if block and pat.search(block):
2744 |                             kept_content += block + separator
2745 |                         final_raw_text = kept_content.strip()
2746 |                         if not final_raw_text:
2747 |                             logger.warning(
2748 |                                 f"Section filter '{section_filter}' removed all content."
2749 |                             )
2750 |                         else:
2751 |                             logger.info(f"Applied section filter: '{section_filter}'")
2752 |                     except Exception as e_filter:
2753 |                         logger.warning(f"Failed to apply section filter: {e_filter}")
2754 | 
2755 |                 if enhance_with_llm and final_raw_text:
2756 |                     logger.info("Applying LLM enhancement.")
2757 |                     result_content = ""
2758 |                     with _span("llm_text_enhancement"):
2759 |                         chunks = _ocr_split_text_into_chunks(
2760 |                             final_raw_text
2761 |                         )  # Use helper defined above
2762 |                         if chunks:
2763 |                             enhancement_tasks = [
2764 |                                 _ocr_enhance_text_chunk(
2765 |                                     chunk,
2766 |                                     output_format=effective_output_format,
2767 |                                     remove_headers=ocr_options.get("remove_headers", False),
2768 |                                 )
2769 |                                 for chunk in chunks
2770 |                             ]  # Use helper defined above
2771 |                             enhanced_chunks = await asyncio.gather(*enhancement_tasks)
2772 |                             result_content = "\n\n".join(enhanced_chunks).strip()
2773 |                         else:
2774 |                             logger.warning("Text empty pre-LLM.")
2775 |                 else:
2776 |                     result_content = final_raw_text or ""
2777 |                 if not doc_metadata or doc_metadata.get("is_fallback"):
2778 |                     doc_metadata = _get_basic_metadata(str(result_content), total_doc_pages)
2779 |                 if enhance_with_llm and final_raw_text and ocr_options.get("assess_quality", False):
2780 |                     logger.info("Performing OCR quality assessment.")
2781 |                     with _span("ocr_quality_assessment"):
2782 |                         quality_metrics = await _ocr_assess_text_quality(
2783 |                             final_raw_text, str(result_content)
2784 |                         )  # Use helper defined above
2785 | 
2786 |             # ======================== POST-PROCESSING & RETURN ========================
2787 |             final_content = result_content
2788 |             if save_to_file and strategy != "docling":  # Docling saving handled above
2789 |                 fp = (
2790 |                     Path(output_path)
2791 |                     if output_path
2792 |                     else _tmp_path(input_name, effective_output_format)
2793 |                 )
2794 |                 fp.parent.mkdir(parents=True, exist_ok=True)
2795 |                 try:
2796 |                     content_to_save = (
2797 |                         _json(final_content)
2798 |                         if isinstance(final_content, dict)
2799 |                         else str(final_content)
2800 |                     )
2801 |                     fp.write_text(content_to_save, encoding="utf-8")
2802 |                     logger.info(
2803 |                         f"Saved output ({effective_output_format}, strategy: {strategy_used}) to {fp}"
2804 |                     )
2805 |                     doc_metadata["saved_output_path"] = str(fp)
2806 |                 except Exception as e_save:
2807 |                     logger.error(f"Failed to save output file to {fp}: {e_save}", exc_info=True)
2808 |                     doc_metadata["save_error"] = f"Failed to save: {e_save}"
2809 | 
2810 |             elapsed = round(time.time() - t0, 3)
2811 |             response: Dict[str, Any] = {
2812 |                 "success": True,
2813 |                 "content": final_content,
2814 |                 "output_format": effective_output_format,
2815 |                 "processing_time": elapsed,
2816 |                 "document_metadata": doc_metadata,
2817 |                 "extraction_strategy_used": strategy_used,
2818 |             }
2819 |             if final_raw_text is not None and strategy != "docling":
2820 |                 response["raw_text"] = final_raw_text
2821 |             if quality_metrics is not None:
2822 |                 response["ocr_quality_metrics"] = quality_metrics
2823 |             if "saved_output_path" in doc_metadata:
2824 |                 response["file_path"] = doc_metadata["saved_output_path"]
2825 |             logger.info(
2826 |                 f"Completed conversion '{input_name}' -> {effective_output_format} (strategy: {strategy_used}) in {elapsed}s"
2827 |             )
2828 |             return response
2829 |     except Exception as e:
2830 |         logger.error(f"Error in convert_document for '{input_name}': {e}", exc_info=True)
2831 |         if isinstance(e, (ToolInputError, ToolError)):
2832 |             raise e
2833 |         raise ToolError("CONVERSION_FAILED", details={"input": input_name, "error": str(e)}) from e
2834 | 
2835 | 
2836 | # <<< Part 1 code goes here >>>
2837 | 
2838 | ###############################################################################
2839 | # Chunking Helpers (Internal)                                                 #
2840 | ###############################################################################
2841 | 
2842 | 
2843 | async def _internal_token_chunks(doc: str, size: int, overlap: int) -> List[str]:
2844 |     """Chunk document by tokens, respecting sentence boundaries (Internal Helper)."""
2845 |     enc = _get_tiktoken_encoder()
2846 |     if not enc:
2847 |         logger.warning("Tiktoken not available, falling back to character chunking.")
2848 |         char_size = size * 4
2849 |         char_overlap = overlap * 4
2850 |         return await _internal_char_chunks(doc, char_size, char_overlap)
2851 |     if not doc:
2852 |         return []
2853 |     try:
2854 |         tokens = enc.encode(doc, disallowed_special=())
2855 |     except Exception as e:
2856 |         logger.error(f"Tiktoken encoding failed: {e}. Falling back to char.", exc_info=True)
2857 |         return await _internal_char_chunks(doc, size * 4, overlap * 4)
2858 |     if not tokens:
2859 |         return []
2860 |     chunks: List[str] = []
2861 |     current_pos = 0
2862 |     n_tokens = len(tokens)
2863 |     try:
2864 |         sentence_end_tokens = {enc.encode(p)[0] for p in (".", "?", "!", "\n")}
2865 |     except Exception as e:
2866 |         encoding_name = getattr(enc, "name", "unknown")
2867 |         if encoding_name == "cl100k_base":
2868 |             sentence_end_tokens = {13, 30, 106, 198}
2869 |         else:
2870 |             try:
2871 |                 sentence_end_tokens = {enc.encode("\n")[0]}
2872 |             except Exception:  # If even newline encoding fails
2873 |                 logger.error(
2874 |                     f"Cannot encode even newline token for encoding '{encoding_name}'. Using empty set for sentence ends."
2875 |                 )
2876 |                 sentence_end_tokens = set()
2877 |         logger.warning(
2878 |             f"Could not encode sentence ends: {e}. Using fallback tokens: {sentence_end_tokens}"
2879 |         )
2880 |     while current_pos < n_tokens:
2881 |         end_pos = min(current_pos + size, n_tokens)
2882 |         best_split_pos = end_pos
2883 |         if end_pos < n_tokens:
2884 |             lookback_distance = min(overlap, size // 4, end_pos - current_pos)
2885 |             search_start = max(current_pos, end_pos - lookback_distance)
2886 |             for k in range(end_pos - 1, search_start - 1, -1):
2887 |                 if tokens[k] in sentence_end_tokens:
2888 |                     best_split_pos = k + 1
2889 |                     break
2890 |         chunk_token_ids = tokens[current_pos:best_split_pos]
2891 |         if not chunk_token_ids:
2892 |             if current_pos >= n_tokens:
2893 |                 break
2894 |             current_pos += 1
2895 |             continue
2896 |         try:
2897 |             chunk_text = enc.decode(chunk_token_ids).strip()
2898 |             if chunk_text:
2899 |                 chunks.append(chunk_text)
2900 |         except Exception as decode_err:  # Keep variable for logging
2901 |             logger.error(
2902 |                 f"Tiktoken decode failed for {current_pos}:{best_split_pos}: {decode_err}",
2903 |                 exc_info=False,
2904 |             )
2905 |         next_start_pos = best_split_pos - overlap
2906 |         current_pos = max(current_pos + 1, next_start_pos)
2907 |         if current_pos <= best_split_pos - size:
2908 |             current_pos = best_split_pos
2909 |     return chunks
2910 | 
2911 | 
2912 | async def _internal_char_chunks(doc: str, size: int, overlap: int) -> List[str]:
2913 |     """Chunk document by characters, respecting sentence/paragraph boundaries (Internal Helper)."""
2914 |     if not doc:
2915 |         return []
2916 |     chunks: List[str] = []
2917 |     current_pos = 0
2918 |     n_chars = len(doc)
2919 |     sentence_ends = (". ", "? ", "! ", "\n\n")
2920 |     softer_breaks = ("\n", "; ", ": ", ", ", ".)", "?)", "!)", "\t", " ")
2921 |     while current_pos < n_chars:
2922 |         end_pos = min(current_pos + size, n_chars)
2923 |         best_split_pos = end_pos
2924 |         if end_pos < n_chars:
2925 |             lookback_window_start = max(current_pos, end_pos - int(size * 0.2), end_pos - 150)
2926 |             best_found_pos = -1
2927 |             for marker in sentence_ends:
2928 |                 found_pos = doc.rfind(marker, lookback_window_start, end_pos)
2929 |                 if found_pos != -1:
2930 |                     best_found_pos = max(best_found_pos, found_pos + len(marker))
2931 |             if best_found_pos == -1:
2932 |                 for marker in softer_breaks:
2933 |                     found_pos = doc.rfind(marker, lookback_window_start, end_pos)
2934 |                     if found_pos != -1:
2935 |                         best_found_pos = max(best_found_pos, found_pos + len(marker))
2936 |             if best_found_pos > current_pos:
2937 |                 best_split_pos = best_found_pos
2938 |         actual_chunk_text = doc[current_pos:best_split_pos].strip()
2939 |         if actual_chunk_text:
2940 |             chunks.append(actual_chunk_text)
2941 |         next_start_pos = best_split_pos - overlap
2942 |         current_pos = max(current_pos + 1, next_start_pos)
2943 |         if current_pos <= best_split_pos - size:
2944 |             current_pos = best_split_pos
2945 |     return chunks
2946 | 
2947 | 
2948 | async def _internal_paragraph_chunks(doc: str, size: int, overlap: int) -> List[str]:
2949 |     """Chunk document by paragraphs, combining small ones (Internal Helper)."""
2950 |     if not doc:
2951 |         return []
2952 |     paragraphs = [p.strip() for p in re.split(r"\n\s*\n+", doc) if p.strip()]
2953 |     if not paragraphs:
2954 |         return []
2955 |     chunks = []
2956 |     current_chunk_paragraphs: List[str] = []
2957 |     current_chunk_len = 0
2958 | 
2959 |     def get_len(text: str) -> int:
2960 |         return len(text)
2961 | 
2962 |     def is_markdown_table(text: str) -> bool:
2963 |         lines = text.strip().split("\n")
2964 |         return (
2965 |             len(lines) >= 2
2966 |             and all(line.strip().startswith("|") for line in lines[:2])
2967 |             and "|" in lines[0]
2968 |             and re.search(r"\|.*?(-{3,}|:{1,2}-{1,}:?).*?\|", lines[1]) is not None
2969 |         )
2970 | 
2971 |     for p in paragraphs:
2972 |         p_len = get_len(p)
2973 |         potential_new_len = (
2974 |             current_chunk_len + (get_len("\n\n") if current_chunk_paragraphs else 0) + p_len
2975 |         )
2976 |         is_table = is_markdown_table(p)
2977 |         if current_chunk_paragraphs and potential_new_len > size and not is_table:
2978 |             chunks.append("\n\n".join(current_chunk_paragraphs))
2979 |             current_chunk_paragraphs = [p]
2980 |             current_chunk_len = p_len
2981 |         elif p_len > size and not is_table:
2982 |             logger.warning(
2983 |                 f"Paragraph (len {p_len}) starting '{p[:50]}...' exceeds size {size}. Splitting."
2984 |             )
2985 |             if current_chunk_paragraphs:
2986 |                 chunks.append("\n\n".join(current_chunk_paragraphs))
2987 |             sub_chunks = await _internal_char_chunks(p, size, overlap)
2988 |             chunks.extend(sub_chunks)
2989 |             current_chunk_paragraphs = []
2990 |             current_chunk_len = 0
2991 |         else:
2992 |             current_chunk_paragraphs.append(p)
2993 |             current_chunk_len = potential_new_len
2994 |     if current_chunk_paragraphs:
2995 |         chunks.append("\n\n".join(current_chunk_paragraphs))
2996 |     logger.info(f"Chunked into {len(chunks)} paragraphs/groups.")
2997 |     return chunks
2998 | 
2999 | 
3000 | async def _internal_section_chunks(doc: str, size: int, overlap: int) -> List[str]:
3001 |     """Chunk document by identified sections (Internal Helper). Falls back to paragraphs."""
3002 |     try:
3003 |         # Call main tool function (defined in Part 3)
3004 |         section_result = await identify_sections(document=doc)
3005 |         if (
3006 |             isinstance(section_result, dict)
3007 |             and section_result.get("success")
3008 |             and isinstance(section_result.get("sections"), list)
3009 |         ):
3010 |             sections = section_result["sections"]
3011 |         else:
3012 |             logger.warning(
3013 |                 "identify_sections failed/unexpected format. Falling back to paragraphs."
3014 |             )
3015 |             return await _internal_paragraph_chunks(doc, size, overlap)
3016 |         if not sections:
3017 |             logger.info("No sections identified, using paragraph fallback.")
3018 |             return await _internal_paragraph_chunks(doc, size, overlap)
3019 |         section_texts: List[str] = []
3020 |         for s in sections:
3021 |             title = s.get("title", "").strip()
3022 |             text = s.get("text", "").strip()
3023 |             if text:
3024 |                 use_title = title and title.lower() not in [
3025 |                     "introduction",
3026 |                     "main content",
3027 |                     "body",
3028 |                     "abstract",
3029 |                     "summary",
3030 |                 ]
3031 |                 full_section_text = f"## {title}\n\n{text}" if use_title else text
3032 |                 section_texts.append(full_section_text.strip())
3033 | 
3034 |         def contains_markdown_table(text: str) -> bool:
3035 |             lines = text.strip().split("\n")
3036 |             return (
3037 |                 len(lines) >= 2
3038 |                 and all(line.strip().startswith("|") for line in lines[:2])
3039 |                 and "|" in lines[0]
3040 |                 and re.search(r"\|.*?(-{3,}|:{1,2}-{1,}:?).*?\|", lines[1]) is not None
3041 |             )
3042 | 
3043 |         final_chunks = []
3044 |         for text in section_texts:
3045 |             text_len = len(text)
3046 |             has_table = contains_markdown_table(text)
3047 |             should_split = text_len > size * 1.1 and (not has_table or text_len > size * 2)
3048 |             if should_split:
3049 |                 logger.warning(
3050 |                     f"Section chunk (len {text_len}) starting '{text[:50]}...' exceeds size {size}. Sub-chunking."
3051 |                 )
3052 |                 sub_chunks = await _internal_paragraph_chunks(text, size, overlap)
3053 |                 final_chunks.extend(sub_chunks)
3054 |             elif text:
3055 |                 final_chunks.append(text)
3056 |         return final_chunks
3057 |     except Exception as e:
3058 |         logger.error(f"Section chunking failed: {e}. Falling back to paragraphs.", exc_info=True)
3059 |         return await _internal_paragraph_chunks(doc, size, overlap)
3060 | 
3061 | 
3062 | ###############################################################################
3063 | # Standalone Tool Functions (Continued)                                       #
3064 | ###############################################################################
3065 | 
3066 | 
3067 | # ------------------------ Chunking Tool Function (Merged) --------------------
3068 | @with_tool_metrics
3069 | @with_error_handling
3070 | async def chunk_document(
3071 |     document: str,
3072 |     *,
3073 |     chunk_size: int = 1000,
3074 |     chunk_method: str = "paragraph",
3075 |     chunk_overlap: int = 0,
3076 |     chunk_strategy: Optional[str] = None,  # Keep alias for compatibility
3077 | ) -> Dict[str, Any]:
3078 |     """
3079 |     Split document text into chunks using various strategies (Standalone Tool Function).
3080 | 
3081 |     Args:
3082 |         document: Text content to chunk.
3083 |         chunk_size: Target maximum size of each chunk (meaning depends on method: tokens or characters).
3084 |         chunk_method: Chunking method ('token', 'character', 'section', 'paragraph').
3085 |         chunk_overlap: Number of tokens/characters to overlap between chunks (for token/char methods).
3086 |                        Overlap logic for paragraph/section is heuristic/simplified.
3087 |         chunk_strategy: Alias for chunk_method (for backward compatibility).
3088 | 
3089 |     Returns:
3090 |         Dictionary containing list of chunked text strings.
3091 |         Example: {"chunks": ["chunk 1 text...", "chunk 2 text..."], "success": True}
3092 |     """
3093 |     # Use module logger directly
3094 |     _logger = logger
3095 | 
3096 |     if not document or not isinstance(document, str):
3097 |         _logger.warning("Chunking called with empty or invalid document input.")
3098 |         return {"chunks": [], "success": True}
3099 | 
3100 |     size = max(100, int(chunk_size))
3101 |     overlap = max(0, min(int(chunk_overlap), size // 3))
3102 |     method = (chunk_strategy or chunk_method or "paragraph").lower()
3103 | 
3104 |     # Map to internal helpers
3105 |     chunker_map = {
3106 |         "token": _internal_token_chunks,
3107 |         "character": _internal_char_chunks,
3108 |         "section": _internal_section_chunks,  # Relies on identify_sections tool
3109 |         "paragraph": _internal_paragraph_chunks,
3110 |     }
3111 | 
3112 |     strat_func = chunker_map.get(method)
3113 |     if not strat_func:
3114 |         _logger.warning(f"Unknown chunk_method '{method}'. Defaulting to 'paragraph'.")
3115 |         strat_func = _internal_paragraph_chunks
3116 |         method = "paragraph"
3117 | 
3118 |     _logger.info(f"Chunking document using method='{method}', size={size}, overlap={overlap}")
3119 |     chunks: List[str] = []
3120 | 
3121 |     try:
3122 |         t0_chunk = time.time()
3123 |         chunks = await strat_func(document, size, overlap)
3124 |         elapsed_chunk = time.time() - t0_chunk
3125 |         _logger.info(f"Chunking completed in {elapsed_chunk:.3f}s")
3126 |     except Exception as e:
3127 |         _logger.error(f"Error during chunking operation ({method}): {e}", exc_info=True)
3128 |         raise ToolError("CHUNKING_FAILED", details={"method": method, "error": str(e)}) from e
3129 | 
3130 |     final_chunks = [c for c in chunks if isinstance(c, str) and c]
3131 |     _logger.info(f"Generated {len(final_chunks)} chunks.")
3132 |     return {"chunks": final_chunks, "success": True}
3133 | 
3134 | 
3135 | ###############################################################################
3136 | # HTML Processing Helpers & Tools                                             #
3137 | ###############################################################################
3138 | 
3139 | 
3140 | # --- HTML Extraction Helpers ---
3141 | def _extract_readability(html_txt: str) -> str:
3142 |     """Extract main content using readability-lxml (Standalone)."""
3143 |     if not _READABILITY_AVAILABLE or not readability:
3144 |         logger.warning("Readability-lxml not installed. Cannot use readability extraction.")
3145 |         return ""
3146 |     try:
3147 |         # Adjust readability settings for better extraction
3148 |         # Use setdefault to avoid modifying the original regexes if called multiple times
3149 |         # Ensure the default regexes exist before modifying
3150 |         default_unlikely = readability.htmls.DEFAULT_REGEXES.get(
3151 |             "unlikelyCandidates", re.compile(r"$^")
3152 |         )  # Default to matching nothing
3153 |         readability.htmls.DEFAULT_REGEXES["unlikelyCandidates"] = re.compile(
3154 |             default_unlikely.pattern
3155 |             + "|aside|footer|nav|sidebar|footnote|advertisement|related|recommend|share|social|comment|meta",
3156 |             re.I,
3157 |         )
3158 | 
3159 |         default_positive = readability.htmls.DEFAULT_REGEXES.get("positive", re.compile(r"$^"))
3160 |         readability.htmls.DEFAULT_REGEXES["positive"] = re.compile(
3161 |             default_positive.pattern + "|article|main|content|post|entry|body", re.I
3162 |         )
3163 | 
3164 |         default_negative = readability.htmls.DEFAULT_REGEXES.get("negative", re.compile(r"$^"))
3165 |         readability.htmls.DEFAULT_REGEXES["negative"] = re.compile(
3166 |             default_negative.pattern + "|widget|menu|legal|promo|disclaimer", re.I
3167 |         )
3168 | 
3169 |         doc = readability.Document(html_txt)
3170 |         summary_html = doc.summary(html_partial=True)
3171 |         return summary_html
3172 |     except Exception as e:
3173 |         logger.warning(f"Readability extraction failed: {e}", exc_info=True)
3174 |         return ""
3175 | 
3176 | 
3177 | def _extract_trafilatura(html_txt: str) -> str:
3178 |     """Extract main content using trafilatura (Standalone)."""
3179 |     if not _TRAFILATURA_AVAILABLE or not trafilatura:
3180 |         logger.warning("Trafilatura not installed. Cannot use trafilatura extraction.")
3181 |         return ""
3182 |     try:
3183 |         extracted = trafilatura.extract(
3184 |             html_txt,
3185 |             include_comments=False,
3186 |             include_tables=True,
3187 |             favor_precision=True,
3188 |             deduplicate=True,
3189 |             target_language=None,
3190 |             include_formatting=True,
3191 |             output_format="html",
3192 |         )
3193 |         return extracted or ""
3194 |     except Exception as e:
3195 |         logger.warning(f"Trafilatura extraction failed: {e}", exc_info=True)
3196 |         return ""
3197 | 
3198 | 
3199 | # --- HTML Processing Tools ---
3200 | 
3201 | 
3202 | @with_tool_metrics
3203 | @with_error_handling
3204 | async def clean_and_format_text_as_markdown(
3205 |     text: str,
3206 |     force_markdown_conversion: bool = False,
3207 |     extraction_method: str = "auto",  # auto, readability, trafilatura, none
3208 |     preserve_tables: bool = True,
3209 |     preserve_links: bool = True,
3210 |     preserve_images: bool = False,
3211 |     max_line_length: int = 0,  # 0 means no wrapping
3212 | ) -> Dict[str, Any]:
3213 |     """
3214 |     Convert plain text or HTML to clean Markdown, optionally extracting main content (Standalone Tool).
3215 |     """
3216 |     start_time = time.time()
3217 |     if not text or not isinstance(text, str):
3218 |         raise ToolInputError("Input text must be a non-empty string", param_name="text")
3219 | 
3220 |     # Detect content type (function defined in Part 3)
3221 |     content_type_result = await detect_content_type(text)
3222 |     input_type = content_type_result.get("content_type", "unknown")
3223 |     input_confidence = content_type_result.get("confidence", 0.0)
3224 | 
3225 |     was_html = (input_type == "html" and input_confidence > 0.3) or (
3226 |         input_type != "markdown" and input_type != "code" and _is_html_fragment(text)
3227 |     )
3228 | 
3229 |     extraction_method_used = "none"
3230 |     processed_text = text
3231 | 
3232 |     logger.debug(f"Input content type detected as: {input_type}, treating as HTML: {was_html}")
3233 | 
3234 |     if was_html or force_markdown_conversion:
3235 |         was_html = True
3236 |         actual_extraction = extraction_method.lower()
3237 |         if actual_extraction == "auto":
3238 |             if _TRAFILATURA_AVAILABLE:
3239 |                 actual_extraction = "trafilatura"
3240 |             elif _READABILITY_AVAILABLE:
3241 |                 actual_extraction = "readability"
3242 |             else:
3243 |                 actual_extraction = "none"
3244 |             logger.debug(f"Auto-selected extraction method: {actual_extraction}")
3245 | 
3246 |         extraction_method_used = actual_extraction
3247 | 
3248 |         if actual_extraction != "none":
3249 |             extracted_html = ""
3250 |             logger.info(f"Attempting HTML content extraction using: {actual_extraction}")
3251 |             try:
3252 |                 if actual_extraction == "readability":
3253 |                     extracted_html = _extract_readability(processed_text)
3254 |                 elif actual_extraction == "trafilatura":
3255 |                     extracted_html = _extract_trafilatura(processed_text)
3256 | 
3257 |                 if extracted_html and len(extracted_html.strip()) > 50:
3258 |                     processed_text = extracted_html
3259 |                     logger.info(f"Successfully extracted content using {actual_extraction}")
3260 |                 else:
3261 |                     logger.warning(
3262 |                         f"{actual_extraction.capitalize()} extraction yielded minimal content. Using original."
3263 |                     )
3264 |                     extraction_method_used = f"{actual_extraction} (failed)"
3265 |             except Exception as e_extract:
3266 |                 logger.error(
3267 |                     f"Error during {actual_extraction} extraction: {e_extract}", exc_info=True
3268 |                 )
3269 |                 extraction_method_used = f"{actual_extraction} (error)"
3270 | 
3271 |         try:
3272 |             logger.info(
3273 |                 f"Converting HTML (extracted: {extraction_method_used != 'none'}) to Markdown..."
3274 |             )
3275 |             md_text = _html_to_md_core(
3276 |                 processed_text,
3277 |                 links=preserve_links,
3278 |                 imgs=preserve_images,
3279 |                 tbls=preserve_tables,
3280 |                 width=0,  # Disable html2text wrapping here
3281 |             )
3282 |             md_text = _sanitize(md_text)
3283 |             md_text = _improve(md_text)
3284 |             processed_text = md_text
3285 |         except Exception as e_conv:
3286 |             logger.error(f"Error converting HTML to Markdown: {e_conv}", exc_info=True)
3287 |             processed_text = _sanitize(processed_text)
3288 |             logger.warning("HTML to Markdown conversion failed, returning sanitized input.")
3289 | 
3290 |     elif input_type == "markdown" and not force_markdown_conversion:
3291 |         logger.debug("Input detected as Markdown, applying cleanup.")
3292 |         processed_text = _sanitize(text)
3293 |         processed_text = _improve(processed_text)
3294 | 
3295 |     elif input_type == "text" or input_type == "unknown":
3296 |         logger.debug(f"Input detected as {input_type}, applying basic text formatting.")
3297 |         processed_text = re.sub(r"\n{2,}", "<TEMP_PARA_BREAK>", text)
3298 |         processed_text = re.sub(r"\n", " ", processed_text)
3299 |         processed_text = processed_text.replace("<TEMP_PARA_BREAK>", "\n\n")
3300 |         processed_text = _sanitize(processed_text)
3301 |         processed_text = _improve(processed_text)
3302 | 
3303 |     # Apply line wrapping if requested (using textwrap module)
3304 |     if max_line_length > 0:
3305 |         try:
3306 |             wrapped_lines = []
3307 |             current_block = ""
3308 |             in_code_block = False
3309 |             for line in processed_text.split("\n"):
3310 |                 line_stripped = line.strip()
3311 |                 if line_stripped.startswith("```"):
3312 |                     in_code_block = not in_code_block
3313 |                     if current_block:
3314 |                         wrapped_lines.extend(
3315 |                             textwrap.wrap(
3316 |                                 current_block.strip(),
3317 |                                 width=max_line_length,
3318 |                                 break_long_words=False,
3319 |                                 break_on_hyphens=False,
3320 |                             )
3321 |                         )
3322 |                     current_block = ""
3323 |                     wrapped_lines.append(line)
3324 |                 elif (
3325 |                     in_code_block
3326 |                     or line_stripped.startswith(("#", ">", "- ", "* ", "+ "))
3327 |                     or re.match(r"^\d+\.\s", line_stripped)
3328 |                     or line_stripped == ""
3329 |                     or line_stripped.startswith("|")
3330 |                 ):
3331 |                     if current_block:
3332 |                         wrapped_lines.extend(
3333 |                             textwrap.wrap(
3334 |                                 current_block.strip(),
3335 |                                 width=max_line_length,
3336 |                                 break_long_words=False,
3337 |                                 break_on_hyphens=False,
3338 |                             )
3339 |                         )
3340 |                     current_block = ""
3341 |                     wrapped_lines.append(line)
3342 |                 else:
3343 |                     current_block += line + " "
3344 |             if current_block:
3345 |                 wrapped_lines.extend(
3346 |                     textwrap.wrap(
3347 |                         current_block.strip(),
3348 |                         width=max_line_length,
3349 |                         break_long_words=False,
3350 |                         break_on_hyphens=False,
3351 |                     )
3352 |                 )
3353 |             processed_text = "\n".join(wrapped_lines)
3354 |         except Exception as e_wrap:
3355 |             logger.error(f"Error during line wrapping: {e_wrap}")
3356 | 
3357 |     processing_time = time.time() - start_time
3358 |     return {
3359 |         "success": True,
3360 |         "markdown_text": processed_text.strip(),
3361 |         "original_content_type": input_type,
3362 |         "was_html": was_html,
3363 |         "extraction_method_used": extraction_method_used,
3364 |         "processing_time": processing_time,
3365 |     }
3366 | 
3367 | 
3368 | @with_tool_metrics
3369 | @with_error_handling
3370 | async def detect_content_type(text: str) -> Dict[str, Any]:
3371 |     """
3372 |     Detect if text is primarily HTML, Markdown, code, or plain text (Standalone Tool).
3373 |     """
3374 |     t0 = time.time()
3375 |     if not text or not isinstance(text, str):
3376 |         raise ToolInputError("Input text must be a non-empty string", param_name="text")
3377 | 
3378 |     sample_size = 4000
3379 |     if len(text) > sample_size * 2:
3380 |         sample = text[:sample_size] + "\n" + text[-sample_size:]
3381 |     else:
3382 |         sample = text
3383 | 
3384 |     scores = {"html": 0.0, "markdown": 0.0, "code": 0.0, "text": 1.0}
3385 |     detection_criteria: Dict[str, List[str]] = {"html": [], "markdown": [], "code": [], "text": []}
3386 |     max_score = 0.0
3387 | 
3388 |     for type_name, patterns in _CONTENT_PATTERNS.items():
3389 |         type_score = 0.0
3390 |         for pattern, weight in patterns:
3391 |             matches = pattern.findall(sample)
3392 |             if matches:
3393 |                 type_score += weight * 0.2
3394 |                 density_score = min(1.0, len(matches) / 10.0)
3395 |                 type_score += weight * density_score * 0.8
3396 |                 detection_criteria[type_name].append(
3397 |                     f"Pattern matched ({len(matches)}x): {pattern.pattern[:50]}..."
3398 |                 )
3399 | 
3400 |         scores[type_name] = min(scores[type_name] + type_score, 5.0)
3401 |         max_score = max(max_score, scores[type_name])
3402 | 
3403 |     if scores["html"] > 0.1 and any(
3404 |         p[0].pattern in ["<html", "<head", "<body", "<!DOCTYPE"] for p in _CONTENT_PATTERNS["html"]
3405 |     ):
3406 |         scores["html"] *= 1.5
3407 | 
3408 |     if max_score < 0.5:
3409 |         if (
3410 |             len(re.findall(r"\b(the|a|is|was|in|on|at)\b", sample, re.I)) > 10
3411 |             and len(re.findall(r"[.?!]\s", sample)) > 3
3412 |         ):
3413 |             detection_criteria["text"].append("Natural language indicators found")
3414 |             scores["text"] += 0.5
3415 | 
3416 |     if max_score > 0.2:
3417 |         scores["text"] *= 0.8
3418 | 
3419 |     primary_type = max(scores, key=lambda k: scores[k])
3420 | 
3421 |     confidence = min(1.0, scores[primary_type] / max(1.0, max_score * 0.8))
3422 |     sorted_scores = sorted(scores.values(), reverse=True)
3423 |     if len(sorted_scores) > 1 and sorted_scores[0] > sorted_scores[1] * 2:
3424 |         confidence = min(1.0, confidence * 1.2)
3425 |     confidence = min(0.95, confidence) if scores[primary_type] < 3.0 else confidence
3426 |     confidence = min(1.0, confidence)
3427 | 
3428 |     processing_time = time.time() - t0
3429 |     return {
3430 |         "success": True,
3431 |         "content_type": primary_type,
3432 |         "confidence": round(confidence, 3),
3433 |         "detection_criteria": detection_criteria[primary_type],
3434 |         "all_scores": {k: round(v, 2) for k, v in scores.items()},
3435 |         "processing_time": round(processing_time, 3),
3436 |     }
3437 | 
3438 | 
3439 | @with_tool_metrics
3440 | @with_error_handling
3441 | async def batch_format_texts(
3442 |     texts: List[str],
3443 |     force_markdown_conversion: bool = False,
3444 |     extraction_method: str = "auto",
3445 |     max_concurrency: int = 5,
3446 |     preserve_tables: bool = True,
3447 |     preserve_links: bool = True,
3448 |     preserve_images: bool = False,
3449 | ) -> Dict[str, Any]:
3450 |     """Applies 'clean_and_format_text_as_markdown' to texts concurrently (Standalone Tool)."""
3451 |     if not texts or not isinstance(texts, list):
3452 |         raise ToolInputError("Input must be a non-empty list", param_name="texts")
3453 |     if not all(isinstance(t, str) for t in texts):
3454 |         raise ToolInputError("All items in 'texts' list must be strings", param_name="texts")
3455 |     max_concurrency = max(1, max_concurrency)
3456 |     sem = asyncio.Semaphore(max_concurrency)
3457 |     tasks = []
3458 | 
3459 |     async def _process_one_standalone(idx: int, txt: str):
3460 |         async with sem:
3461 |             logger.debug(f"Starting batch formatting for text index {idx}")
3462 |             result_dict = {"original_index": idx}
3463 |             try:
3464 |                 res = await clean_and_format_text_as_markdown(
3465 |                     text=txt,
3466 |                     force_markdown_conversion=force_markdown_conversion,
3467 |                     extraction_method=extraction_method,
3468 |                     preserve_tables=preserve_tables,
3469 |                     preserve_links=preserve_links,
3470 |                     preserve_images=preserve_images,
3471 |                     max_line_length=0,
3472 |                 )
3473 |                 result_dict.update(res)
3474 |                 result_dict["success"] = bool(res.get("success", False))
3475 |                 if result_dict["success"]:
3476 |                     logger.debug(f"Successfully batch formatted text index {idx}")
3477 |             except ToolInputError as e_input:
3478 |                 logger.warning(f"Input error formatting text index {idx}: {e_input}")
3479 |                 result_dict.update(
3480 |                     {
3481 |                         "error": str(e_input),
3482 |                         "success": False,
3483 |                         "error_type": "ToolInputError",
3484 |                         "error_code": e_input.error_code,
3485 |                     }
3486 |                 )
3487 |             except ToolError as e_tool:
3488 |                 logger.warning(
3489 |                     f"Processing error formatting text index {idx}: {e_tool.error_code} - {str(e_tool)}"
3490 |                 )
3491 |                 result_dict.update(
3492 |                     {
3493 |                         "error": str(e_tool),
3494 |                         "success": False,
3495 |                         "error_code": e_tool.error_code,
3496 |                         "error_type": "ToolError",
3497 |                     }
3498 |                 )
3499 |             except Exception as e:
3500 |                 logger.error(f"Unexpected error formatting text index {idx}: {e}", exc_info=True)
3501 |                 result_dict.update({"error": str(e), "success": False, "error_type": "Exception"})
3502 |             return result_dict
3503 | 
3504 |     tic = time.perf_counter()
3505 |     logger.info(
3506 |         f"Starting batch formatting for {len(texts)} texts with concurrency {max_concurrency}..."
3507 |     )
3508 |     for i, t in enumerate(texts):
3509 |         tasks.append(_process_one_standalone(i, t))
3510 |     all_results = await asyncio.gather(*tasks)
3511 |     toc = time.perf_counter()
3512 |     logger.info(f"Batch formatting completed in {toc - tic:.3f}s")
3513 | 
3514 |     all_results.sort(key=lambda r: r.get("original_index", -1))
3515 |     final_results = []
3516 |     success_count = 0
3517 |     failure_count = 0
3518 |     for r in all_results:
3519 |         if r.get("success"):
3520 |             success_count += 1
3521 |         else:
3522 |             failure_count += 1
3523 |         r.pop("original_index", None)
3524 |         final_results.append(r)
3525 |     return {
3526 |         "results": final_results,
3527 |         "total_processing_time": round(toc - tic, 3),
3528 |         "success_count": success_count,
3529 |         "failure_count": failure_count,
3530 |         "success": True,
3531 |     }
3532 | 
3533 | 
3534 | @with_tool_metrics
3535 | @with_error_handling
3536 | async def optimize_markdown_formatting(
3537 |     markdown: str,
3538 |     normalize_headings: bool = False,
3539 |     fix_lists: bool = True,
3540 |     fix_links: bool = True,
3541 |     add_line_breaks: bool = True,
3542 |     compact_mode: bool = False,
3543 |     max_line_length: int = 0,
3544 | ) -> Dict[str, Any]:
3545 |     """
3546 |     Clean up and standardize existing Markdown text (Standalone Tool).
3547 |     Note: `normalize_headings` is currently a basic implementation.
3548 |     """
3549 |     t0 = time.time()
3550 |     if not markdown or not isinstance(markdown, str):
3551 |         raise ToolInputError("Input markdown must be a non-empty string", param_name="markdown")
3552 | 
3553 |     content_type_result = await detect_content_type(markdown)
3554 |     input_type = content_type_result.get("content_type", "unknown")
3555 |     actual_markdown = markdown
3556 |     conversion_note = ""
3557 | 
3558 |     if input_type == "html":
3559 |         logger.info("Input detected as HTML, converting to Markdown before optimizing.")
3560 |         conversion_note = "⚠️ Input was detected as HTML and automatically converted. "
3561 |         try:
3562 |             conversion_result = await clean_and_format_text_as_markdown(
3563 |                 text=markdown, force_markdown_conversion=True, extraction_method="none"
3564 |             )
3565 |             if conversion_result.get("success", False):
3566 |                 actual_markdown = conversion_result.get("markdown_text", "")
3567 |             else:
3568 |                 raise ToolError(
3569 |                     "FORMAT_CONVERSION_FAILED",
3570 |                     details={"error": conversion_result.get("error", "Unknown")},
3571 |                 )
3572 |         except Exception as e_conv:
3573 |             logger.error(f"Failed to convert HTML input for optimization: {e_conv}", exc_info=True)
3574 |             return {
3575 |                 "success": False,
3576 |                 "error": f"Input HTML conversion failed: {e_conv}",
3577 |                 "error_code": "FORMAT_CONVERSION_FAILED",
3578 |             }
3579 |     elif input_type != "markdown":
3580 |         logger.warning(
3581 |             f"Input detected as {input_type}, applying Markdown optimization rules anyway."
3582 |         )
3583 |         conversion_note = f"⚠️ Input detected as {input_type}, not Markdown. "
3584 | 
3585 |     optimized = actual_markdown
3586 |     changes = []
3587 | 
3588 |     if normalize_headings:
3589 |         logger.warning("normalize_headings is not fully implemented.")
3590 |         changes.append("Attempted heading normalization (basic)")
3591 | 
3592 |     if fix_lists:
3593 |         optimized = _sanitize(optimized)  # Handles basic list marker normalization
3594 |         changes.append("Standardized list formatting via sanitize")
3595 | 
3596 |     if fix_links:
3597 |         optimized = re.sub(r"\[\s*([^\]]+?)\s*\]\s*\(\s*([^)]+?)\s*\)", r"[\1](\2)", optimized)
3598 |         changes.append("Cleaned link formatting")
3599 | 
3600 |     if compact_mode:
3601 |         optimized = re.sub(r"\n{3,}", "\n\n", optimized)
3602 |         optimized = re.sub(r"[ \t]+$", "", optimized, flags=re.MULTILINE)
3603 |         changes.append("Applied compact formatting")
3604 |     elif add_line_breaks:
3605 |         optimized = _improve(optimized)
3606 |         changes.append("Added standard line breaks")
3607 | 
3608 |     if max_line_length > 0:
3609 |         try:
3610 |             wrapped_lines = []
3611 |             current_block = ""
3612 |             in_code_block = False
3613 |             for line in optimized.split("\n"):
3614 |                 line_stripped = line.strip()
3615 |                 if line_stripped.startswith("```"):
3616 |                     in_code_block = not in_code_block
3617 |                     if current_block:
3618 |                         wrapped_lines.extend(
3619 |                             textwrap.wrap(
3620 |                                 current_block.strip(),
3621 |                                 width=max_line_length,
3622 |                                 break_long_words=False,
3623 |                                 break_on_hyphens=False,
3624 |                             )
3625 |                         )
3626 |                     current_block = ""
3627 |                     wrapped_lines.append(line)
3628 |                 elif (
3629 |                     in_code_block
3630 |                     or line_stripped.startswith(("#", ">", "- ", "* ", "+ "))
3631 |                     or re.match(r"^\d+\.\s", line_stripped)
3632 |                     or line_stripped == ""
3633 |                     or line_stripped.startswith("|")
3634 |                 ):
3635 |                     if current_block:
3636 |                         wrapped_lines.extend(
3637 |                             textwrap.wrap(
3638 |                                 current_block.strip(),
3639 |                                 width=max_line_length,
3640 |                                 break_long_words=False,
3641 |                                 break_on_hyphens=False,
3642 |                             )
3643 |                         )
3644 |                     current_block = ""
3645 |                     wrapped_lines.append(line)
3646 |                 else:
3647 |                     current_block += line + " "
3648 |             if current_block:
3649 |                 wrapped_lines.extend(
3650 |                     textwrap.wrap(
3651 |                         current_block.strip(),
3652 |                         width=max_line_length,
3653 |                         break_long_words=False,
3654 |                         break_on_hyphens=False,
3655 |                     )
3656 |                 )
3657 |             optimized = "\n".join(wrapped_lines)
3658 |             changes.append(f"Wrapped lines at {max_line_length} chars")
3659 |         except Exception as e_wrap:
3660 |             logger.error(f"Error during line wrapping: {e_wrap}")
3661 | 
3662 |     return {
3663 |         "success": True,
3664 |         "optimized_markdown": optimized.strip(),
3665 |         "changes_summary": conversion_note
3666 |         + (", ".join(changes) if changes else "No specific optimizations applied."),
3667 |         "processing_time": time.time() - t0,
3668 |     }
3669 | 
3670 | 
3671 | # <<< Part 1 and 2 code goes here >>>
3672 | 
3673 | ###############################################################################
3674 | # Document Analysis Tools (Standalone)                                        #
3675 | ###############################################################################
3676 | 
3677 | 
3678 | @with_tool_metrics
3679 | @with_error_handling
3680 | async def identify_sections(document: str) -> Dict[str, Any]:
3681 |     """Identifies logical sections in a document using regex patterns (Standalone Tool)."""
3682 |     if not document or not isinstance(document, str):
3683 |         logger.warning("identify_sections called with empty or invalid input.")
3684 |         return {"sections": [], "success": True}
3685 | 
3686 |     domain_rules = _get_active_domain_rules()
3687 |     bound_rx = domain_rules.get("bound_rx")
3688 |     custom_sect_rx = domain_rules.get("custom_sect_rx", [])
3689 |     if not bound_rx or not isinstance(bound_rx, re.Pattern):
3690 |         raise ToolError(
3691 |             "INITIALIZATION_ERROR", details={"reason": "Section boundary regex not loaded/compiled"}
3692 |         )
3693 | 
3694 |     sections_found: List[Dict[str, Any]] = []
3695 |     last_section_end = 0
3696 |     try:
3697 |         matches = list(bound_rx.finditer(document))
3698 |         if not matches:
3699 |             logger.info(
3700 |                 "No regex-based section boundaries found. Treating document as single section."
3701 |             )
3702 |             if document.strip():
3703 |                 sections_found.append(
3704 |                     {
3705 |                         "title": "Main Content",
3706 |                         "text": document.strip(),
3707 |                         "position": 0,
3708 |                         "start_char": 0,
3709 |                         "end_char": len(document),
3710 |                     }
3711 |                 )
3712 |         else:
3713 |             logger.info(f"Found {len(matches)} potential section boundaries based on regex.")
3714 |             first_match_start = matches[0].start()
3715 |             if first_match_start > 0:
3716 |                 initial_text = document[last_section_end:first_match_start].strip()
3717 |                 if initial_text:
3718 |                     sections_found.append(
3719 |                         {
3720 |                             "title": "Introduction",
3721 |                             "text": initial_text,
3722 |                             "position": 0,
3723 |                             "start_char": last_section_end,
3724 |                             "end_char": first_match_start,
3725 |                         }
3726 |                     )
3727 |                     last_section_end = first_match_start
3728 |             for i, match in enumerate(matches):
3729 |                 title_raw = match.group(0).strip()
3730 |                 title_start_char = match.start()
3731 |                 title_end_char = match.end()
3732 |                 section_content_start = title_end_char
3733 |                 section_content_end = (
3734 |                     matches[i + 1].start() if i < len(matches) - 1 else len(document)
3735 |                 )
3736 |                 section_text = document[section_content_start:section_content_end].strip()
3737 |                 section_title = title_raw
3738 |                 if custom_sect_rx:
3739 |                     for pat, label in custom_sect_rx:
3740 |                         if isinstance(pat, re.Pattern) and pat.search(title_raw):
3741 |                             section_title = label
3742 |                             logger.debug(
3743 |                                 f"Applied custom label '{label}' to section '{title_raw}'."
3744 |                             )
3745 |                             break
3746 |                 if section_text:
3747 |                     sections_found.append(
3748 |                         {
3749 |                             "title": section_title,
3750 |                             "text": section_text,
3751 |                             "position": len(sections_found),
3752 |                             "start_char": title_start_char,
3753 |                             "end_char": section_content_end,
3754 |                         }
3755 |                     )
3756 |                 else:
3757 |                     logger.debug(
3758 |                         f"Skipping section '{section_title}' (no content)."
3759 |                     )  # Corrected typo
3760 |                 last_section_end = section_content_end
3761 |     except Exception as e:
3762 |         logger.error(f"Error during section identification: {e}", exc_info=True)
3763 |         raise ToolError("SECTION_IDENTIFICATION_FAILED", details={"error": str(e)}) from e
3764 |     return {"sections": sections_found, "success": True}
3765 | 
3766 | 
3767 | @with_tool_metrics
3768 | @with_error_handling
3769 | async def extract_entities(
3770 |     document: str, entity_types: Optional[List[str]] = None
3771 | ) -> Dict[str, Any]:
3772 |     """Extracts named entities from document text using an LLM (Standalone Tool)."""
3773 |     if not document or not isinstance(document, str):
3774 |         raise ToolInputError("Input document must be non-empty string", param_name="document")
3775 |     max_context = 3800
3776 |     context = document[:max_context] + ("..." if len(document) > max_context else "")
3777 |     if len(document) > max_context:
3778 |         logger.warning(f"Doc truncated to ~{max_context} chars for entity extraction.")
3779 |     entity_focus = (
3780 |         f"Extract only: {', '.join(entity_types)}."
3781 |         if entity_types
3782 |         else "Extract common types (PERSON, ORG, LOC, DATE, MONEY...)."
3783 |     )
3784 |     prompt = f"""Analyze text, extract entities. {entity_focus} Output ONLY valid JSON object (keys=TYPE, values=list of unique strings).
3785 | Text:
3786 | \"\"\"
3787 | {context}
3788 | \"\"\"
3789 | JSON Output:
3790 | """
3791 |     logger.info(f"Requesting entity extraction. Focus: {entity_types or 'common'}")
3792 |     llm_response_raw = ""
3793 |     try:
3794 |         llm_response_raw = await _standalone_llm_call(
3795 |             prompt=prompt, max_tokens=1500, temperature=0.1
3796 |         )
3797 |         logger.debug(f"LLM raw response for entities:\n{llm_response_raw}")
3798 |         json_str = llm_response_raw
3799 |         json_match = re.search(r"```(?:json)?\s*([\s\S]+?)\s*```", json_str)
3800 |         if json_match:
3801 |             json_str = json_match.group(1).strip()
3802 |         start_brace = json_str.find("{")
3803 |         end_brace = json_str.rfind("}")
3804 |         if start_brace != -1 and end_brace != -1 and start_brace < end_brace:
3805 |             json_str = json_str[start_brace : end_brace + 1]
3806 |         else:
3807 |             logger.warning("Could not find JSON object boundaries for entities.")
3808 |         try:
3809 |             entities_dict = json.loads(json_str)
3810 |         except json.JSONDecodeError as json_e:
3811 |             logger.warning(f"Initial JSON parse failed: {json_e}. Trying lenient find.")
3812 |             match = re.search(r"\{.*\}", llm_response_raw, re.DOTALL)
3813 |             if match:
3814 |                 try:
3815 |                     entities_dict = json.loads(match.group(0))
3816 |                 except json.JSONDecodeError as final_json_e:
3817 |                     raise ValueError(f"Could not parse JSON: {final_json_e}") from final_json_e
3818 |             else:
3819 |                 raise ValueError("No JSON object found") from json_e
3820 |         if not isinstance(entities_dict, dict):
3821 |             raise ValueError("LLM response is not JSON object.")
3822 |         validated_entities: Dict[str, List[str]] = {}
3823 |         for key, value in entities_dict.items():
3824 |             entity_type = str(key).upper().strip()
3825 |             if not entity_type:
3826 |                 continue
3827 |             sanitized_values: Set[str] = set()
3828 |             items_to_process = value if isinstance(value, list) else [value]
3829 |             for item in items_to_process:
3830 |                 text_val = None
3831 |                 if isinstance(item, str) and item.strip():
3832 |                     text_val = item.strip()
3833 |                 elif (
3834 |                     isinstance(item, dict)
3835 |                     and isinstance(item.get("text"), str)
3836 |                     and item["text"].strip()
3837 |                 ):
3838 |                     text_val = item["text"].strip()
3839 |                 if text_val:
3840 |                     text_val = re.sub(r"^[.,!?;:'\"\(] +|[ .,!?;:'\"\) ]+$", "", text_val)
3841 |                 if text_val:
3842 |                     sanitized_values.add(text_val)
3843 |             if sanitized_values:
3844 |                 validated_entities[entity_type] = sorted(list(sanitized_values))
3845 |         logger.info(f"Successfully extracted entities for types: {list(validated_entities.keys())}")
3846 |         return {
3847 |             "entities": validated_entities,
3848 |             "success": True,
3849 |             "raw_llm_response": llm_response_raw,
3850 |         }
3851 |     except (json.JSONDecodeError, ValueError) as e:
3852 |         logger.error(f"Failed to parse LLM response for entities: {e}")
3853 |         return {
3854 |             "entities": {},
3855 |             "error": f"Parse fail: {e}",
3856 |             "raw_llm_response": llm_response_raw,
3857 |             "success": False,
3858 |             "error_code": "LLM_INVALID_RESPONSE",
3859 |         }
3860 |     except ToolError as e:
3861 |         logger.error(f"LLM call failed during entity extraction: {e}", exc_info=False)
3862 |         return {
3863 |             "entities": {},
3864 |             "error": f"LLM fail: {str(e)}",
3865 |             "raw_llm_response": llm_response_raw,
3866 |             "success": False,
3867 |             "error_code": e.error_code,
3868 |         }
3869 |     except Exception as e:
3870 |         logger.error(f"Unexpected entity extraction error: {e}", exc_info=True)
3871 |         return {
3872 |             "entities": {},
3873 |             "error": f"Unexpected: {e}",
3874 |             "raw_llm_response": llm_response_raw,
3875 |             "success": False,
3876 |             "error_code": "ENTITY_EXTRACTION_FAILED",
3877 |         }
3878 | 
3879 | 
3880 | @with_tool_metrics
3881 | @with_error_handling
3882 | async def generate_qa_pairs(document: str, num_questions: int = 5) -> Dict[str, Any]:
3883 |     """Generates question-answer pairs based on the document content using an LLM (Standalone Tool)."""
3884 |     if not document or not isinstance(document, str):
3885 |         raise ToolInputError("Input must be non-empty string", param_name="document")
3886 |     if not isinstance(num_questions, int) or num_questions <= 0:
3887 |         raise ToolInputError("num_questions must be positive int")
3888 |     max_context = 3800
3889 |     context = document[:max_context] + ("..." if len(document) > max_context else "")
3890 |     if len(document) > max_context:
3891 |         logger.warning(f"Doc truncated to ~{max_context} chars for QA generation.")
3892 |     prompt = f"""Based ONLY on text, generate {num_questions} relevant QA pairs. Output ONLY JSON list of objects (keys "question", "answer").
3893 | Text:
3894 | \"\"\"
3895 | {context}
3896 | \"\"\"
3897 | JSON Output:
3898 | """
3899 |     logger.info(f"Requesting {num_questions} QA pairs.")
3900 |     llm_response_raw = ""
3901 |     try:
3902 |         llm_max_tokens = num_questions * 150 + 200
3903 |         llm_response_raw = await _standalone_llm_call(
3904 |             prompt=prompt, max_tokens=llm_max_tokens, temperature=0.4
3905 |         )
3906 |         logger.debug(f"LLM raw response for QA pairs:\n{llm_response_raw}")
3907 |         json_str = llm_response_raw
3908 |         json_match = re.search(r"```(?:json)?\s*([\s\S]+?)\s*```", json_str, re.I)
3909 |         if json_match:
3910 |             json_str = json_match.group(1).strip()
3911 |         start_bracket = json_str.find("[")
3912 |         end_bracket = json_str.rfind("]")
3913 |         if start_bracket != -1 and end_bracket != -1 and start_bracket < end_bracket:
3914 |             json_str = json_str[start_bracket : end_bracket + 1]
3915 |         else:
3916 |             logger.warning("Could not find JSON list boundaries for QA.")
3917 |         try:
3918 |             qa_list = json.loads(json_str)
3919 |         except json.JSONDecodeError as json_e:
3920 |             logger.warning(f"JSON parse for QA failed: {json_e}. Trying regex.")
3921 |             pairs = []
3922 |             extracted = re.findall(
3923 |                 r'\{\s*"question":\s*"(.*?)",\s*"answer":\s*"(.*?)"\s*\}',
3924 |                 llm_response_raw,
3925 |                 re.DOTALL | re.I,
3926 |             )
3927 |             pairs = [
3928 |                 {
3929 |                     "question": q.strip().replace('\\"', '"'),
3930 |                     "answer": a.strip().replace('\\"', '"'),
3931 |                 }
3932 |                 for q, a in extracted
3933 |                 if q.strip() and a.strip()
3934 |             ]
3935 |             if pairs:
3936 |                 logger.info(f"Regex fallback extracted {len(pairs)} pairs.")
3937 |                 return {
3938 |                     "qa_pairs": pairs[:num_questions],
3939 |                     "success": True,
3940 |                     "warning": "Used regex fallback.",
3941 |                     "raw_llm_response": llm_response_raw,
3942 |                 }
3943 |             else:
3944 |                 raise ValueError("Regex fallback found no QA pairs.") from json_e
3945 |         if not isinstance(qa_list, list):
3946 |             raise ValueError("LLM response is not JSON list.")
3947 |         validated_pairs: List[Dict[str, str]] = []
3948 |         for item in qa_list:
3949 |             if isinstance(item, dict):
3950 |                 q = item.get("question")
3951 |                 a = item.get("answer")
3952 |                 if isinstance(q, str) and q.strip() and isinstance(a, str) and a.strip():
3953 |                     validated_pairs.append({"question": q.strip(), "answer": a.strip()})
3954 |                 else:
3955 |                     logger.warning(f"Skipping invalid QA item: {item}")
3956 |             else:
3957 |                 logger.warning(f"Skipping non-dict item in QA list: {item}")
3958 |         if not validated_pairs:
3959 |             logger.warning("LLM response parsed but no valid QA pairs found.")
3960 |         else:
3961 |             logger.info(f"Successfully generated {len(validated_pairs)} valid QA pairs.")
3962 |         return {
3963 |             "qa_pairs": validated_pairs[:num_questions],
3964 |             "success": True,
3965 |             "raw_llm_response": llm_response_raw,
3966 |         }
3967 |     except (json.JSONDecodeError, ValueError) as e:
3968 |         logger.error(f"Failed to parse LLM response for QA pairs: {e}")
3969 |         return {
3970 |             "qa_pairs": [],
3971 |             "error": f"Parse fail: {e}",
3972 |             "raw_llm_response": llm_response_raw,
3973 |             "success": False,
3974 |             "error_code": "LLM_INVALID_RESPONSE",
3975 |         }
3976 |     except ToolError as e:
3977 |         logger.error(f"LLM call failed during QA generation: {e}", exc_info=False)
3978 |         return {
3979 |             "qa_pairs": [],
3980 |             "error": f"LLM fail: {str(e)}",
3981 |             "raw_llm_response": llm_response_raw,
3982 |             "success": False,
3983 |             "error_code": e.error_code,
3984 |         }
3985 |     except Exception as e:
3986 |         logger.error(f"Unexpected QA generation error: {e}", exc_info=True)
3987 |         return {
3988 |             "qa_pairs": [],
3989 |             "error": f"Unexpected: {e}",
3990 |             "raw_llm_response": llm_response_raw,
3991 |             "success": False,
3992 |             "error_code": "QA_GENERATION_FAILED",
3993 |         }
3994 | 
3995 | 
3996 | @with_tool_metrics
3997 | @with_error_handling
3998 | async def summarize_document(
3999 |     document: str, max_length: int = 150, focus: Optional[str] = None
4000 | ) -> Dict[str, Any]:
4001 |     """
4002 |     Generates a concise summary of the document text using an LLM (Standalone Tool).
4003 |     """
4004 |     _logger = logger
4005 |     if not document or not isinstance(document, str):
4006 |         raise ToolInputError("Input document must be a non-empty string", param_name="document")
4007 |     if not isinstance(max_length, int) or max_length <= 10:
4008 |         raise ToolInputError("max_length must be a positive integer > 10", param_name="max_length")
4009 | 
4010 |     llm_caller = _standalone_llm_call
4011 |     max_context = 8000
4012 |     context = document[:max_context] + ("..." if len(document) > max_context else "")
4013 |     if len(document) > max_context:
4014 |         _logger.warning(f"Document truncated to ~{max_context} chars for summarization.")
4015 | 
4016 |     focus_instruction = (
4017 |         f" Focus particularly on aspects related to: {focus}." if focus and focus.strip() else ""
4018 |     )
4019 |     prompt = f"""Generate a concise, coherent summary of the following text, about {max_length} words long.{focus_instruction}
4020 | Capture the main points and key information accurately based ONLY on the provided text. Do not add external information or opinions.
4021 | Output ONLY the summary text itself, without any introductory phrases like "Here is the summary:".
4022 | 
4023 | Text:
4024 | \"\"\"
4025 | {context}
4026 | \"\"\"
4027 | 
4028 | Summary:
4029 | """
4030 |     _logger.info(
4031 |         f"Requesting summary from LLM (max_length≈{max_length}, focus='{focus or 'none'}')."
4032 |     )
4033 |     llm_response_raw = ""
4034 |     try:
4035 |         llm_max_tokens = max(50, min(4000, int(max_length / 0.6)))
4036 |         summary_text = await llm_caller(prompt=prompt, max_tokens=llm_max_tokens, temperature=0.5)
4037 |         llm_response_raw = summary_text
4038 |         summary_text = re.sub(
4039 |             r"^(Here is a summary:|Summary:|The text discusses|This document is about)\s*:?\s*",
4040 |             "",
4041 |             summary_text,
4042 |             flags=re.I,
4043 |         ).strip()
4044 |         word_count = len(summary_text.split())
4045 |         _logger.info(f"Generated summary with {word_count} words (target: {max_length}).")
4046 |         if word_count < max_length * 0.5:
4047 |             _logger.warning(f"Summary ({word_count} words) shorter than requested ({max_length}).")
4048 |         elif word_count > max_length * 1.5:
4049 |             _logger.warning(f"Summary ({word_count} words) longer than requested ({max_length}).")
4050 |         return {
4051 |             "summary": summary_text,
4052 |             "word_count": word_count,
4053 |             "success": True,
4054 |             "raw_llm_response": llm_response_raw,
4055 |         }
4056 |     except ToolError as te:
4057 |         _logger.error(f"ToolError during summarization: {te.error_code} - {str(te)}", exc_info=True)
4058 |         return {
4059 |             "summary": "",
4060 |             "word_count": 0,
4061 |             "error": str(te),
4062 |             "success": False,
4063 |             "raw_llm_response": llm_response_raw,
4064 |             "error_code": te.error_code,
4065 |         }
4066 |     except Exception as e:
4067 |         _logger.error(f"Unexpected error during summarization: {str(e)}", exc_info=True)
4068 |         return {
4069 |             "summary": "",
4070 |             "word_count": 0,
4071 |             "error": f"Unexpected error: {e}",
4072 |             "success": False,
4073 |             "raw_llm_response": llm_response_raw,
4074 |             "error_code": "SUMMARIZATION_FAILED",
4075 |         }
4076 | 
4077 | 
4078 | @with_tool_metrics
4079 | @with_error_handling
4080 | async def extract_metrics(document: str) -> Dict[str, Any]:
4081 |     """Extracts numeric metrics based on domain patterns (Standalone Tool)."""
4082 |     if not document or not isinstance(document, str):
4083 |         raise ToolInputError("Input must be non-empty string", param_name="document")
4084 |     domain_rules = _get_active_domain_rules()
4085 |     metric_rx_list = domain_rules.get("metric_rx")
4086 |     if not metric_rx_list or not isinstance(metric_rx_list, list):
4087 |         logger.warning(
4088 |             f"No metric patterns found for domain '{domain_rules.get('active_domain', 'unknown')}'."
4089 |         )
4090 |         return {"metrics": {}, "success": True}
4091 |     extracted_metrics: Dict[str, List[float]] = {}
4092 |     logger.info(
4093 |         f"Starting metric extraction for domain '{domain_rules.get('active_domain')}' ({len(metric_rx_list)} types)."
4094 |     )
4095 |     for metric_name, pattern in metric_rx_list:
4096 |         if not isinstance(pattern, re.Pattern):
4097 |             logger.warning(f"Skipping invalid pattern type for metric '{metric_name}'.")
4098 |             continue
4099 |         found_values: Set[float] = set()
4100 |         try:
4101 |             matches = pattern.findall(document)
4102 |             if matches:
4103 |                 logger.debug(f"Found {len(matches)} potential matches for metric '{metric_name}'")
4104 |             for match_groups in matches:
4105 |                 val_str = None
4106 |                 if isinstance(match_groups, tuple) and len(match_groups) >= 2:
4107 |                     val_str = str(match_groups[1]).strip()
4108 |                 elif isinstance(match_groups, str):
4109 |                     val_str = match_groups.strip()
4110 |                 if val_str is None:
4111 |                     continue
4112 |                 val_str_cleaned = re.sub(r"[$,€£\s,]", "", val_str)
4113 |                 if val_str_cleaned.endswith("."):
4114 |                     val_str_cleaned = val_str_cleaned[:-1]
4115 |                 if not val_str_cleaned or val_str_cleaned == "-":
4116 |                     continue
4117 |                 try:
4118 |                     found_values.add(float(val_str_cleaned))
4119 |                 except ValueError:
4120 |                     logger.debug(
4121 |                         f"Could not convert value '{val_str_cleaned}' for metric '{metric_name}'."
4122 |                     )
4123 |         except Exception as e:
4124 |             logger.error(f"Error processing regex for metric '{metric_name}': {e}", exc_info=True)
4125 |         if found_values:
4126 |             unique_values = sorted(list(found_values))
4127 |             extracted_metrics[metric_name] = unique_values
4128 |             logger.info(
4129 |                 f"Extracted {len(unique_values)} unique value(s) for metric '{metric_name}': {unique_values}"
4130 |             )
4131 |     return {"metrics": extracted_metrics, "success": True}
4132 | 
4133 | 
4134 | @with_tool_metrics
4135 | @with_error_handling
4136 | async def flag_risks(document: str) -> Dict[str, Any]:
4137 |     """Flags potential risks using domain patterns (Standalone Tool)."""
4138 |     if not document or not isinstance(document, str):
4139 |         raise ToolInputError("Input must be non-empty string", param_name="document")
4140 |     domain_rules = _get_active_domain_rules()
4141 |     risk_rx_dict = domain_rules.get("risk_rx")
4142 |     if not risk_rx_dict or not isinstance(risk_rx_dict, dict):
4143 |         logger.warning(
4144 |             f"No risk patterns found for domain '{domain_rules.get('active_domain', 'unknown')}'."
4145 |         )
4146 |         return {"risks": {}, "success": True}
4147 |     flagged_risks: Dict[str, Dict[str, Any]] = {}
4148 |     logger.info(
4149 |         f"Starting risk flagging for domain '{domain_rules.get('active_domain')}' ({len(risk_rx_dict)} types)."
4150 |     )
4151 |     context_window = 50
4152 |     max_samples = 3
4153 |     for risk_type, pattern in risk_rx_dict.items():
4154 |         if not isinstance(pattern, re.Pattern):
4155 |             logger.warning(f"Skipping invalid pattern type for risk '{risk_type}'.")
4156 |             continue
4157 |         match_contexts: List[str] = []
4158 |         match_count = 0
4159 |         try:
4160 |             for match in pattern.finditer(document):
4161 |                 match_count += 1
4162 |                 if len(match_contexts) < max_samples:
4163 |                     start, end = match.start(), match.end()
4164 |                     ctx_start = max(0, start - context_window)
4165 |                     ctx_end = min(len(document), end + context_window)
4166 |                     snippet = document[ctx_start:ctx_end].replace("\n", " ").strip()
4167 |                     prefix = "..." if ctx_start > 0 else ""
4168 |                     suffix = "..." if ctx_end < len(document) else ""
4169 |                     hl_start = start - ctx_start + len(prefix)
4170 |                     hl_end = end - ctx_start + len(prefix)
4171 |                     formatted_snippet = f"{prefix}{snippet[:hl_start]}**{snippet[hl_start:hl_end]}**{snippet[hl_end:]}{suffix}"
4172 |                     match_contexts.append(formatted_snippet)
4173 |             if match_count > 0:
4174 |                 logger.info(f"Flagged risk '{risk_type}' {match_count} time(s).")
4175 |                 flagged_risks[risk_type] = {"count": match_count, "sample_contexts": match_contexts}
4176 |         except Exception as e:
4177 |             logger.error(f"Error processing regex for risk '{risk_type}': {e}", exc_info=True)
4178 |     return {"risks": flagged_risks, "success": True}
4179 | 
4180 | 
4181 | @with_tool_metrics
4182 | @with_error_handling
4183 | async def canonicalise_entities(entities_input: Dict[str, Any]) -> Dict[str, Any]:
4184 |     """Normalizes and attempts to merge similar entities using fuzzy matching (Standalone Tool)."""
4185 |     entities_list: List[Dict[str, Any]] = []
4186 |     raw_entities = entities_input.get("entities")
4187 |     if isinstance(raw_entities, dict):
4188 |         for etype, text_list in raw_entities.items():
4189 |             entity_type_str = str(etype).upper().strip()
4190 |             if isinstance(text_list, list) and entity_type_str:
4191 |                 for text in text_list:
4192 |                     if isinstance(text, str) and text.strip():
4193 |                         entities_list.append({"text": text.strip(), "type": entity_type_str})
4194 |             else:
4195 |                 logger.warning(
4196 |                     f"Expected list for entity type '{etype}', got {type(text_list)}. Skipping."
4197 |                 )
4198 |     elif isinstance(raw_entities, list):
4199 |         for item in raw_entities:
4200 |             if (
4201 |                 isinstance(item, dict)
4202 |                 and isinstance(item.get("text"), str)
4203 |                 and item["text"].strip()
4204 |                 and isinstance(item.get("type"), str)
4205 |                 and item["type"].strip()
4206 |             ):
4207 |                 entities_list.append(
4208 |                     {
4209 |                         "text": item["text"].strip(),
4210 |                         "type": item["type"].upper().strip(),
4211 |                         "metadata": {k: v for k, v in item.items() if k not in ["text", "type"]},
4212 |                     }
4213 |                 )
4214 |             else:
4215 |                 logger.warning(f"Skipping invalid item in entity list: {item}")
4216 |     else:
4217 |         raise ToolInputError(
4218 |             'Input dict must contain "entities" key with either Dict[str, List[str]] or List[Dict[str, Any]].',
4219 |             param_name="entities_input",
4220 |         )
4221 |     if not entities_list:
4222 |         logger.info("No entities provided for canonicalization.")
4223 |         return {"canonicalized": {}, "success": True}
4224 |     entities_by_type: Dict[str, List[Dict[str, Any]]] = {}
4225 |     for entity in entities_list:
4226 |         etype = entity.get("type", "UNKNOWN")
4227 |         entities_by_type.setdefault(etype, []).append(entity)
4228 |     canonicalized_output: Dict[str, List[Dict[str, Any]]] = {}
4229 |     similarity_threshold = 85
4230 |     for entity_type, entity_group in entities_by_type.items():
4231 |         logger.debug(f"Canonicalising {len(entity_group)} entities of type '{entity_type}'...")
4232 |         entity_group.sort(key=lambda x: len(x.get("text", "")), reverse=True)
4233 |         merged_entities: List[Dict[str, Any]] = []
4234 |         processed_indices = set()
4235 |         for i in range(len(entity_group)):
4236 |             if i in processed_indices:
4237 |                 continue
4238 |             current_entity = entity_group[i]
4239 |             canonical_form = current_entity.get("text", "")
4240 |             if not canonical_form:
4241 |                 processed_indices.add(i)
4242 |                 continue
4243 |             cluster_variants_data = [current_entity]
4244 |             processed_indices.add(i)
4245 |             for j in range(i + 1, len(entity_group)):
4246 |                 if j in processed_indices:
4247 |                     continue
4248 |                 other_entity = entity_group[j]
4249 |                 other_text = other_entity.get("text", "")
4250 |                 if not other_text:
4251 |                     processed_indices.add(j)
4252 |                     continue
4253 |                 score = fuzz.token_sort_ratio(canonical_form.lower(), other_text.lower())
4254 |                 if score >= similarity_threshold:
4255 |                     cluster_variants_data.append(other_entity)
4256 |                     processed_indices.add(j)
4257 |                     logger.debug(
4258 |                         f"  Merging '{other_text}' into '{canonical_form}' (score: {score:.0f})"
4259 |                     )
4260 |             if cluster_variants_data:
4261 |                 canonical_text = cluster_variants_data[0].get("text", "")
4262 |                 variant_texts = sorted(
4263 |                     list({ent.get("text", "") for ent in cluster_variants_data if ent.get("text")})
4264 |                 )
4265 |                 merged_metadata = {}
4266 |                 scores = [
4267 |                     ent.get("metadata", {}).get("score")
4268 |                     for ent in cluster_variants_data
4269 |                     if ent.get("metadata", {}).get("score") is not None
4270 |                 ]
4271 |                 if scores:
4272 |                     merged_metadata["scores"] = scores
4273 |                 merged_entities.append(
4274 |                     {
4275 |                         "text": canonical_text,
4276 |                         "count": len(cluster_variants_data),
4277 |                         "type": entity_type,
4278 |                         "variants": variant_texts,
4279 |                         "metadata": merged_metadata,
4280 |                     }
4281 |                 )
4282 |         merged_entities.sort(key=lambda x: (-x.get("count", 0), x.get("text", "")))
4283 |         canonicalized_output[entity_type] = merged_entities
4284 |         logger.info(
4285 |             f"Canonicalised type '{entity_type}': {len(entity_group)} input -> {len(merged_entities)} unique entities."
4286 |         )
4287 |     return {"canonicalized": canonicalized_output, "success": True}
4288 | 
4289 | 
4290 | ###############################################################################
4291 | # OCR-Specific Tools (Standalone)                                             #
4292 | ###############################################################################
4293 | 
4294 | 
4295 | @with_tool_metrics
4296 | @with_retry(max_retries=2, retry_delay=1.5)
4297 | @with_error_handling
4298 | async def ocr_image(
4299 |     image_path: Optional[str] = None,
4300 |     image_data: Optional[str] = None,  # Base64 encoded string
4301 |     ocr_options: Optional[Dict] = None,
4302 |     enhance_with_llm: bool = True,
4303 |     output_format: str = "markdown",
4304 | ) -> Dict[str, Any]:
4305 |     """
4306 |     Performs OCR on a single image and optionally enhances the text with an LLM (Standalone Tool).
4307 | 
4308 |     Args:
4309 |         image_path: Path to the image file (e.g., PNG, JPG). Mutually exclusive with image_data.
4310 |         image_data: Base64-encoded image data string. Mutually exclusive with image_path.
4311 |         ocr_options: Dictionary of options for OCR/Enhancement:
4312 |             - language (str): Tesseract language(s). Default: "eng".
4313 |             - preprocessing (dict): Image preprocessing options.
4314 |             - remove_headers (bool): Attempt header/footer removal (less effective on single images). Default: False.
4315 |             - assess_quality (bool): Run LLM quality assessment. Default: False.
4316 |             - detect_tables (bool): Attempt to detect tables in the image (used for metadata). Default: True.
4317 |             - tesseract_config (str): Additional Tesseract config options (e.g., '--psm 6'). Default: "".
4318 |         enhance_with_llm: If True (default), enhance the raw OCR text using an LLM.
4319 |         output_format: Target format ('markdown' or 'text').
4320 | 
4321 |     Returns:
4322 |         Dictionary with OCR results (see convert_document return structure).
4323 |     """
4324 |     t0 = time.time()
4325 |     ocr_opts = ocr_options or {}
4326 |     output_format = output_format.lower()
4327 |     if output_format not in _OCR_COMPATIBLE_FORMATS:
4328 |         logger.warning(
4329 |             f"Output format '{output_format}' not ideal for image OCR. Using 'markdown'."
4330 |         )
4331 |         output_format = "markdown"
4332 | 
4333 |     # --- Dependency Checks ---
4334 |     _ocr_check_dep("Pillow", _PIL_AVAILABLE, "Image OCR")
4335 |     _ocr_check_dep("pytesseract", _PYTESSERACT_AVAILABLE, "Image OCR")
4336 |     can_use_cv2 = _CV2_AVAILABLE and _NUMPY_AVAILABLE
4337 |     if ocr_opts.get("preprocessing") and not can_use_cv2:
4338 |         logger.warning(
4339 |             "Preprocessing options provided but OpenCV/NumPy missing. Preprocessing limited."
4340 |         )
4341 |     if ocr_opts.get("detect_tables", True) and not can_use_cv2:
4342 |         logger.warning(
4343 |             "Table detection requires OpenCV/NumPy. Disabling table detection for metadata."
4344 |         )
4345 |         ocr_opts["detect_tables"] = False
4346 | 
4347 |     # --- Input Handling ---
4348 |     if not image_path and not image_data:
4349 |         raise ToolInputError("Either 'image_path' or 'image_data' must be provided.")
4350 |     if image_path and image_data:
4351 |         raise ToolInputError("Provide either 'image_path' or 'image_data', not both.")
4352 | 
4353 |     img: Optional["PILImage.Image"] = None
4354 |     preprocessed_img: Optional["PILImage.Image"] = None
4355 |     input_name = "image_data"
4356 | 
4357 |     try:
4358 |         if image_path:
4359 |             img_path_obj = _ocr_validate_file_path(image_path)
4360 |             input_name = img_path_obj.name
4361 |             with _span(f"load_image_{input_name}"):
4362 |                 img = Image.open(img_path_obj)  # type: ignore
4363 |         elif image_data:
4364 |             if not isinstance(image_data, str):
4365 |                 raise ToolInputError("image_data must be a base64 encoded string.")
4366 |             try:
4367 |                 if image_data.startswith("data:image"):
4368 |                     image_data = image_data.split(";base64,", 1)[1]
4369 |                 img_bytes = base64.b64decode(image_data)
4370 |                 with _span("load_image_bytes"):
4371 |                     img = Image.open(io.BytesIO(img_bytes))  # type: ignore
4372 |                 input_name = f"base64_input_{_hash(image_data[:100])}"
4373 |             except (base64.binascii.Error, ValueError, TypeError) as e_b64:
4374 |                 raise ToolInputError(
4375 |                     f"Invalid base64 image data: {e_b64}", param_name="image_data"
4376 |                 ) from e_b64
4377 |             except Exception as e_img_open:
4378 |                 raise ToolError(
4379 |                     "IMAGE_LOAD_FAILED",
4380 |                     details={"error": f"Failed to open image from bytes: {e_img_open}"},
4381 |                 ) from e_img_open
4382 | 
4383 |         if img is None or not _PIL_AVAILABLE:  # Added check for _PIL_AVAILABLE
4384 |             raise ToolError(
4385 |                 "IMAGE_LOAD_FAILED",
4386 |                 details={"reason": "Image object is None or Pillow unavailable."},
4387 |             )
4388 | 
4389 |         img = img.convert("RGB")
4390 | 
4391 |         # --- OCR Pipeline ---
4392 |         loop = asyncio.get_running_loop()
4393 |         with _span("image_preprocessing"):
4394 |             preprocessed_img = await loop.run_in_executor(
4395 |                 None, _ocr_preprocess_image, img, ocr_opts.get("preprocessing")
4396 |             )
4397 | 
4398 |         ocr_lang = ocr_opts.get("language", "eng")
4399 |         ocr_config_str = ocr_opts.get("tesseract_config", "")
4400 |         with _span("tesseract_ocr"):
4401 |             raw_text = await loop.run_in_executor(
4402 |                 None, _ocr_run_tesseract, preprocessed_img, ocr_lang, ocr_config_str
4403 |             )
4404 | 
4405 |         # --- LLM Enhancement ---
4406 |         final_content = raw_text
4407 |         quality_metrics = None
4408 |         if enhance_with_llm and raw_text.strip():
4409 |             with _span("llm_image_text_enhancement"):
4410 |                 remove_headers = ocr_opts.get("remove_headers", False)
4411 |                 final_content = await _ocr_enhance_text_chunk(
4412 |                     raw_text, output_format=output_format, remove_headers=remove_headers
4413 |                 )
4414 | 
4415 |             if ocr_opts.get("assess_quality", False):
4416 |                 with _span("ocr_quality_assessment"):
4417 |                     quality_metrics = await _ocr_assess_text_quality(raw_text, final_content)
4418 |         else:
4419 |             final_content = raw_text
4420 | 
4421 |         # --- Metadata ---
4422 |         tables_detected = False
4423 |         if ocr_opts.get("detect_tables", True) and can_use_cv2:
4424 |             # Run detection on the preprocessed image
4425 |             detected_regions = await loop.run_in_executor(
4426 |                 None, _ocr_detect_tables, preprocessed_img
4427 |             )
4428 |             tables_detected = len(detected_regions) > 0
4429 |         elif ocr_opts.get("detect_tables", True):
4430 |             logger.warning("Table detection requested but OpenCV/Numpy unavailable.")
4431 | 
4432 |         doc_metadata = {
4433 |             "num_pages": 1,
4434 |             "has_tables": tables_detected,
4435 |             "has_figures": True,
4436 |             "has_sections": bool(re.search(r"^#{1,6}\s+", final_content, re.M))
4437 |             if output_format == "markdown"
4438 |             else False,
4439 |             "image_width": img.width,
4440 |             "image_height": img.height,
4441 |             "ocr_language": ocr_lang,
4442 |         }
4443 | 
4444 |         # --- Construct Response ---
4445 |         elapsed = round(time.time() - t0, 3)
4446 |         response = {
4447 |             "success": True,
4448 |             "content": final_content,
4449 |             "output_format": output_format,
4450 |             "processing_time": elapsed,
4451 |             "document_metadata": doc_metadata,
4452 |             "extraction_strategy_used": "ocr",
4453 |         }
4454 |         if enhance_with_llm:
4455 |             response["raw_text"] = raw_text
4456 |         if quality_metrics:
4457 |             response["ocr_quality_metrics"] = quality_metrics
4458 | 
4459 |         logger.info(f"Completed OCR for '{input_name}' in {elapsed}s")
4460 |         return response
4461 | 
4462 |     except Exception as e:
4463 |         logger.error(f"Error during image OCR for '{input_name}': {e}", exc_info=True)
4464 |         if isinstance(e, (ToolInputError, ToolError)):
4465 |             raise e
4466 |         raise ToolError("IMAGE_OCR_FAILED", details={"input": input_name, "error": str(e)}) from e
4467 |     finally:
4468 |         if img:
4469 |             img.close()
4470 |         if preprocessed_img and preprocessed_img != img:
4471 |             preprocessed_img.close()
4472 | 
4473 | 
4474 | @with_tool_metrics
4475 | @with_retry(max_retries=2, retry_delay=1.0)
4476 | @with_error_handling
4477 | async def enhance_ocr_text(
4478 |     text: str,
4479 |     output_format: str = "markdown",
4480 |     enhancement_options: Optional[Dict] = None,
4481 | ) -> Dict[str, Any]:
4482 |     """
4483 |     Enhances existing OCR text using an LLM to correct errors and improve formatting (Standalone Tool).
4484 | 
4485 |     Args:
4486 |         text: The raw OCR text to enhance.
4487 |         output_format: Target format ('markdown' or 'text').
4488 |         enhancement_options: Dictionary of options:
4489 |             - remove_headers (bool): Attempt to remove headers/footers. Default: False.
4490 |             - assess_quality (bool): Run LLM quality assessment comparing input vs output. Default: False.
4491 | 
4492 |     Returns:
4493 |         Dictionary containing enhanced text and metadata.
4494 |     """
4495 |     t0 = time.time()
4496 |     if not text or not isinstance(text, str):
4497 |         raise ToolInputError("Input 'text' must be a non-empty string", param_name="text")
4498 | 
4499 |     options = enhancement_options or {}
4500 |     output_format = output_format.lower()
4501 |     if output_format not in _OCR_COMPATIBLE_FORMATS:
4502 |         logger.warning(
4503 |             f"Output format '{output_format}' not ideal for text enhancement. Using 'markdown'."
4504 |         )
4505 |         output_format = "markdown"
4506 | 
4507 |     try:
4508 |         final_content = ""
4509 |         quality_metrics = None
4510 | 
4511 |         with _span("llm_text_enhancement"):
4512 |             max_direct_process_len = 15000
4513 |             if len(text) > max_direct_process_len:
4514 |                 logger.info(f"Splitting large text ({len(text)} chars) for enhancement.")
4515 |                 chunks = _ocr_split_text_into_chunks(text)  # Helper defined earlier
4516 |             else:
4517 |                 chunks = [text]
4518 | 
4519 |             if not chunks:
4520 |                 logger.warning("Input text resulted in zero chunks for enhancement.")
4521 |             else:
4522 |                 enhancement_tasks = [
4523 |                     _ocr_enhance_text_chunk(
4524 |                         chunk,
4525 |                         output_format=output_format,
4526 |                         remove_headers=options.get("remove_headers", False),
4527 |                     )
4528 |                     for chunk in chunks
4529 |                 ]  # Helper defined earlier
4530 |                 enhanced_chunks = await asyncio.gather(*enhancement_tasks)
4531 |                 final_content = "\n\n".join(enhanced_chunks).strip()
4532 | 
4533 |         if options.get("assess_quality", False):
4534 |             with _span("ocr_quality_assessment"):
4535 |                 quality_metrics = await _ocr_assess_text_quality(
4536 |                     text, final_content
4537 |                 )  # Helper defined earlier
4538 | 
4539 |         elapsed = round(time.time() - t0, 3)
4540 |         response = {
4541 |             "success": True,
4542 |             "content": final_content,
4543 |             "output_format": output_format,
4544 |             "processing_time": elapsed,
4545 |             "raw_text": text,
4546 |         }
4547 |         if quality_metrics:
4548 |             response["ocr_quality_metrics"] = quality_metrics
4549 |         logger.info(f"Completed OCR text enhancement in {elapsed}s")
4550 |         return response
4551 | 
4552 |     except Exception as e:
4553 |         logger.error(f"Error during OCR text enhancement: {e}", exc_info=True)
4554 |         if isinstance(e, (ToolInputError, ToolError)):
4555 |             raise e
4556 |         raise ToolError("TEXT_ENHANCEMENT_FAILED", details={"error": str(e)}) from e
4557 | 
4558 | 
4559 | @with_tool_metrics
4560 | @with_retry(max_retries=2, retry_delay=1.0)
4561 | @with_error_handling
4562 | async def analyze_pdf_structure(
4563 |     file_path: Optional[str] = None,
4564 |     document_data: Optional[bytes] = None,
4565 |     extract_metadata: bool = True,
4566 |     extract_outline: bool = True,
4567 |     extract_fonts: bool = False,
4568 |     extract_images: bool = False,
4569 |     estimate_ocr_needs: bool = True,
4570 | ) -> Dict[str, Any]:
4571 |     """
4572 |     Analyzes PDF structure (metadata, outline, fonts, images, OCR needs) without full text extraction (Standalone Tool).
4573 |     Requires either PyMuPDF or PDFPlumber.
4574 |     """
4575 |     t0 = time.time()
4576 |     if not _PYMUPDF_AVAILABLE and not _PDFPLUMBER_AVAILABLE:
4577 |         raise ToolError("DEPENDENCY_MISSING", details={"dependency": "PyMuPDF or PDFPlumber"})
4578 | 
4579 |     pdf_lib = "pymupdf" if _PYMUPDF_AVAILABLE else "pdfplumber"
4580 |     logger.info(f"Analyzing PDF structure using {pdf_lib}.")
4581 | 
4582 |     input_path_obj: Optional[Path] = None
4583 |     is_temp_file = False
4584 |     input_name = "input_data"
4585 |     try:
4586 |         input_path_obj, is_temp_file = _get_input_path_or_temp(file_path, document_data)
4587 |         input_name = input_path_obj.name
4588 |         if not input_path_obj.suffix.lower() == ".pdf":
4589 |             raise ToolInputError(f"Input must be PDF, got: {input_path_obj.suffix}")
4590 | 
4591 |         with _handle_temp_file(input_path_obj, is_temp_file) as current_input_path:
4592 |             result: Dict[str, Any] = {
4593 |                 "success": False,
4594 |                 "file_info": input_name,
4595 |                 "analysis_engine": pdf_lib,
4596 |                 "processing_time": 0.0,
4597 |             }
4598 |             loop = asyncio.get_running_loop()
4599 | 
4600 |             if pdf_lib == "pymupdf":
4601 | 
4602 |                 def _analyze_with_pymupdf_sync():
4603 |                     analysis_data = {}
4604 |                     _ocr_check_dep("PyMuPDF", _PYMUPDF_AVAILABLE, "PDF Analysis")
4605 |                     if pymupdf is None:
4606 |                         raise ToolError("INTERNAL_ERROR", details={"reason": "pymupdf is None"})
4607 |                     with pymupdf.open(current_input_path) as doc:  # type: ignore
4608 |                         analysis_data["page_count"] = len(doc)
4609 |                         if extract_metadata:
4610 |                             analysis_data["metadata"] = {
4611 |                                 k: doc.metadata.get(k, "")
4612 |                                 for k in [
4613 |                                     "title",
4614 |                                     "author",
4615 |                                     "subject",
4616 |                                     "keywords",
4617 |                                     "creator",
4618 |                                     "producer",
4619 |                                     "creationDate",
4620 |                                     "modDate",
4621 |                                 ]
4622 |                             }
4623 |                         if extract_outline:
4624 |                             toc = doc.get_toc()
4625 |                             analysis_data["outline"] = (
4626 |                                 _ocr_process_toc(toc) if toc else []
4627 |                             )  # Helper defined earlier
4628 |                         if extract_fonts:
4629 |                             fonts: Set[str] = set()
4630 |                             embedded_fonts: Set[str] = set()
4631 |                             limit = min(10, len(doc))
4632 |                             for i in range(limit):
4633 |                                 for font_info in doc.get_page_fonts(i):
4634 |                                     fonts.add(font_info[3])
4635 |                                     embedded_fonts.add(font_info[3]) if font_info[4] else None
4636 |                             analysis_data["font_info"] = {
4637 |                                 "total_fonts": len(fonts),
4638 |                                 "embedded_fonts": len(embedded_fonts),
4639 |                                 "font_names": sorted(list(fonts)),
4640 |                             }
4641 |                         if extract_images:
4642 |                             img_count = 0
4643 |                             img_types: Dict[str, int] = {}
4644 |                             total_size = 0
4645 |                             limit = min(5, len(doc))
4646 |                             for i in range(limit):
4647 |                                 for img in doc.get_page_images(i, full=True):
4648 |                                     img_count += 1
4649 |                                     xref = img[0]
4650 |                                     try:
4651 |                                         img_info = doc.extract_image(xref)
4652 |                                         img_ext = img_info["ext"]
4653 |                                         img_size = len(img_info["image"])
4654 |                                     except Exception:
4655 |                                         img_ext = "unknown"
4656 |                                         img_size = 0
4657 |                                     img_types[img_ext] = img_types.get(img_ext, 0) + 1
4658 |                                     total_size += img_size
4659 |                             est_total = (
4660 |                                 int(img_count * (len(doc) / max(1, limit)))
4661 |                                 if limit > 0
4662 |                                 else img_count
4663 |                             )
4664 |                             avg_size_kb = (
4665 |                                 int(total_size / max(1, img_count) / 1024) if img_count > 0 else 0
4666 |                             )
4667 |                             analysis_data["image_info"] = {
4668 |                                 "sampled_images": img_count,
4669 |                                 "estimated_total_images": est_total,
4670 |                                 "image_types": img_types,
4671 |                                 "average_size_kb": avg_size_kb,
4672 |                             }
4673 |                         if estimate_ocr_needs:
4674 |                             text_pages = 0
4675 |                             sample_size = min(10, len(doc))
4676 |                             min_chars = 50
4677 |                             for i in range(sample_size):
4678 |                                 if len(doc[i].get_text("text").strip()) > min_chars:
4679 |                                     text_pages += 1
4680 |                             text_ratio = text_pages / max(1, sample_size)
4681 |                             needs_ocr = text_ratio < 0.8
4682 |                             confidence = (
4683 |                                 "high" if text_ratio < 0.2 or text_ratio > 0.95 else "medium"
4684 |                             )
4685 |                             reason = (
4686 |                                 "Likely scanned or image-based."
4687 |                                 if needs_ocr and confidence == "high"
4688 |                                 else "Likely contains extractable text."
4689 |                                 if not needs_ocr and confidence == "high"
4690 |                                 else "Mix of text/image pages likely."
4691 |                             )
4692 |                             analysis_data["ocr_assessment"] = {
4693 |                                 "needs_ocr": needs_ocr,
4694 |                                 "confidence": confidence,
4695 |                                 "reason": reason,
4696 |                                 "text_coverage_ratio": round(text_ratio, 2),
4697 |                             }
4698 |                     return analysis_data
4699 | 
4700 |                 result.update(await loop.run_in_executor(None, _analyze_with_pymupdf_sync))
4701 |             elif pdf_lib == "pdfplumber":
4702 | 
4703 |                 def _analyze_with_pdfplumber_sync():
4704 |                     analysis_data = {}
4705 |                     _ocr_check_dep("pdfplumber", _PDFPLUMBER_AVAILABLE, "PDF Analysis")
4706 |                     if pdfplumber is None:
4707 |                         raise ToolError("INTERNAL_ERROR", details={"reason": "pdfplumber is None"})
4708 |                     with pdfplumber.open(current_input_path) as pdf:  # type: ignore
4709 |                         analysis_data["page_count"] = len(pdf.pages)
4710 |                         if extract_metadata:
4711 |                             analysis_data["metadata"] = {
4712 |                                 k: pdf.metadata.get(k.capitalize(), "")
4713 |                                 for k in [
4714 |                                     "title",
4715 |                                     "author",
4716 |                                     "subject",
4717 |                                     "keywords",
4718 |                                     "creator",
4719 |                                     "producer",
4720 |                                     "creationDate",
4721 |                                     "modDate",
4722 |                                 ]
4723 |                             }
4724 |                         if extract_outline:
4725 |                             analysis_data["outline"] = {
4726 |                                 "error": "Outline extraction not supported by pdfplumber."
4727 |                             }
4728 |                         if extract_fonts:
4729 |                             analysis_data["font_info"] = {
4730 |                                 "error": "Font extraction not supported by pdfplumber."
4731 |                             }
4732 |                         if extract_images:
4733 |                             analysis_data["image_info"] = {
4734 |                                 "error": "Image info not supported by pdfplumber."
4735 |                             }
4736 |                         if estimate_ocr_needs:
4737 |                             text_pages = 0
4738 |                             sample_size = min(10, len(pdf.pages))
4739 |                             min_chars = 50
4740 |                             for i in range(sample_size):
4741 |                                 if len((pdf.pages[i].extract_text() or "").strip()) > min_chars:
4742 |                                     text_pages += 1
4743 |                             text_ratio = text_pages / max(1, sample_size)
4744 |                             needs_ocr = text_ratio < 0.8
4745 |                             confidence = (
4746 |                                 "high" if text_ratio < 0.2 or text_ratio > 0.95 else "medium"
4747 |                             )
4748 |                             reason = (
4749 |                                 "Likely scanned or image-based."
4750 |                                 if needs_ocr and confidence == "high"
4751 |                                 else "Likely contains extractable text."
4752 |                                 if not needs_ocr and confidence == "high"
4753 |                                 else "Mix of text/image pages likely."
4754 |                             )
4755 |                             analysis_data["ocr_assessment"] = {
4756 |                                 "needs_ocr": needs_ocr,
4757 |                                 "confidence": confidence,
4758 |                                 "reason": reason,
4759 |                                 "text_coverage_ratio": round(text_ratio, 2),
4760 |                             }
4761 |                     return analysis_data
4762 | 
4763 |                 result.update(await loop.run_in_executor(None, _analyze_with_pdfplumber_sync))
4764 | 
4765 |             result["success"] = True
4766 |             result["processing_time"] = round(time.time() - t0, 3)
4767 |             logger.info(
4768 |                 f"PDF structure analysis for '{input_name}' completed in {result['processing_time']:.3f}s using {pdf_lib}"
4769 |             )
4770 |             return result
4771 |     except Exception as e:
4772 |         logger.error(f"Error during PDF structure analysis for '{input_name}': {e}", exc_info=True)
4773 |         if isinstance(e, (ToolInputError, ToolError)):
4774 |             raise e
4775 |         raise ToolError(
4776 |             "PDF_ANALYSIS_FAILED", details={"input": input_name, "error": str(e)}
4777 |         ) from e
4778 | 
4779 | 
4780 | @with_tool_metrics
4781 | @with_error_handling
4782 | async def extract_tables(
4783 |     document_path: Optional[str] = None,
4784 |     document_data: Optional[bytes] = None,
4785 |     *,
4786 |     table_mode: str = "csv",
4787 |     output_dir: Optional[str] = None,
4788 |     accelerator_device: str = "auto",
4789 |     num_threads: int = 4,
4790 | ) -> Dict[str, Any]:
4791 |     """
4792 |     Extracts tables found in a document using Docling and returns them (Standalone Tool).
4793 |     NOTE: This tool currently *requires* the 'docling' extraction strategy implicitly.
4794 |     """
4795 |     _ocr_check_dep("docling", _DOCLING_AVAILABLE, "Table Extraction (extract_tables tool)")
4796 | 
4797 |     valid_modes = {"csv", "json", "pandas"}
4798 |     table_mode = table_mode.lower()
4799 |     if table_mode not in valid_modes:
4800 |         raise ToolInputError(
4801 |             f"table_mode must be one of {', '.join(valid_modes)}", param_name="table_mode"
4802 |         )
4803 |     if table_mode == "pandas":
4804 |         _ocr_check_dep("pandas", _PANDAS_AVAILABLE, "extract_tables(mode='pandas')")
4805 |         if pd is None:
4806 |             raise ToolError(
4807 |                 "INTERNAL_ERROR", details={"reason": "Pandas check passed but pd is None."}
4808 |             )
4809 | 
4810 |     input_path_obj: Optional[Path] = None
4811 |     is_temp_file = False
4812 |     input_name = "input_data"
4813 |     try:
4814 |         input_path_obj, is_temp_file = _get_input_path_or_temp(document_path, document_data)
4815 |         input_name = input_path_obj.name
4816 |         logger.info(f"Starting Docling table extraction from {input_name}, mode='{table_mode}'")
4817 | 
4818 |         with _handle_temp_file(input_path_obj, is_temp_file) as current_input_path:
4819 |             try:
4820 |                 device_str = accelerator_device.lower()
4821 |                 if device_str not in _ACCEL_MAP:
4822 |                     logger.warning(f"Invalid device '{device_str}', using 'auto'.")
4823 |                     device_str = "auto"
4824 |                 device = _ACCEL_MAP[device_str]
4825 |                 conv = _get_docling_converter(device, num_threads)
4826 |                 loop = asyncio.get_running_loop()
4827 |                 with _span("docling_table_conversion"):
4828 |                     result = await loop.run_in_executor(None, conv.convert, current_input_path)
4829 |                 if result and result.document:
4830 |                     doc_obj = result.document
4831 |                     logger.info("Docling conversion successful.")
4832 |                 else:
4833 |                     raise ToolError(
4834 |                         "CONVERSION_FAILED", details={"reason": "Docling returned empty result"}
4835 |                     )
4836 |             except Exception as e:
4837 |                 logger.error(f"Error during Docling conversion: {e}", exc_info=True)
4838 |                 raise ToolError(
4839 |                     "CONVERSION_FAILED", details={"doc": str(current_input_path), "error": str(e)}
4840 |                 ) from e
4841 | 
4842 |             if not doc_obj:
4843 |                 return {
4844 |                     "tables": [],
4845 |                     "saved_files": [],
4846 |                     "success": False,
4847 |                     "error": "Conversion failed.",
4848 |                     "error_code": "CONVERSION_FAILED",
4849 |                 }
4850 | 
4851 |             tables_raw_data: List[List[List[str]]] = []
4852 |             try:
4853 |                 with _span("docling_table_extraction"):
4854 |                     if hasattr(doc_obj, "get_tables") and callable(doc_obj.get_tables):
4855 |                         tables_raw_data = doc_obj.get_tables() or []
4856 |                     elif hasattr(doc_obj, "pages") and isinstance(doc_obj.pages, list):
4857 |                         logger.warning("Using page iteration for tables.")
4858 |                         for page in doc_obj.pages:
4859 |                             if (
4860 |                                 hasattr(page, "content")
4861 |                                 and page.content
4862 |                                 and callable(getattr(page.content, "has_tables", None))
4863 |                                 and page.content.has_tables()
4864 |                             ):
4865 |                                 page_tables = page.content.get_tables()
4866 |                                 if page_tables and isinstance(page_tables, list):
4867 |                                     tables_raw_data.extend(
4868 |                                         pt for pt in page_tables if isinstance(pt, list)
4869 |                                     )
4870 |                     else:
4871 |                         logger.error("Cannot extract tables: Missing get_tables/pages.")
4872 |                 sanitized_tables = []
4873 |                 for tbl in tables_raw_data:
4874 |                     if isinstance(tbl, list) and all(isinstance(row, list) for row in tbl):
4875 |                         sanitized_tables.append(
4876 |                             [[str(cell) if cell is not None else "" for cell in row] for row in tbl]
4877 |                         )
4878 |                     else:
4879 |                         logger.warning(f"Skipping malformed table: {type(tbl)}")
4880 |                 tables_raw_data = sanitized_tables
4881 |             except Exception as e:
4882 |                 logger.error(f"Error accessing tables: {e}", exc_info=True)
4883 | 
4884 |             if not tables_raw_data:
4885 |                 logger.warning(f"No tables found in {input_name}.")
4886 |                 return {"tables": [], "saved_files": [], "success": True}
4887 |             logger.info(f"Extracted {len(tables_raw_data)} raw tables.")
4888 | 
4889 |             output_tables: List[Any] = []
4890 |             saved_files: List[str] = []
4891 |             output_dir_path = Path(output_dir) if output_dir else None
4892 |             if output_dir_path:
4893 |                 output_dir_path.mkdir(parents=True, exist_ok=True)
4894 | 
4895 |             with _span("table_formatting_saving"):
4896 |                 for i, raw_table in enumerate(tables_raw_data):
4897 |                     processed_table: Any = None
4898 |                     file_ext = ""
4899 |                     try:
4900 |                         if table_mode == "csv":
4901 |                             output = StringIO()
4902 |                             writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL)
4903 |                             writer.writerows(raw_table)
4904 |                             processed_table = output.getvalue()
4905 |                             file_ext = "csv"
4906 |                             save_content = processed_table
4907 |                         elif table_mode == "json":
4908 |                             processed_table = raw_table
4909 |                             file_ext = "json"
4910 |                             save_content = _json(processed_table)
4911 |                         elif table_mode == "pandas":
4912 |                             df = pd.DataFrame(raw_table)
4913 |                             if not df.empty and len(df) > 1:
4914 |                                 first_row = df.iloc[0]
4915 |                                 is_header = (
4916 |                                     sum(
4917 |                                         1
4918 |                                         for cell in first_row
4919 |                                         if not str(cell).replace(".", "", 1).strip("-").isdigit()
4920 |                                     )
4921 |                                     > len(first_row) / 2
4922 |                                 )
4923 |                                 if is_header:
4924 |                                     df.columns = first_row
4925 |                                     df = df[1:].reset_index(drop=True)
4926 |                             processed_table = df
4927 |                             file_ext = "csv"
4928 |                             save_content = df
4929 |                         output_tables.append(processed_table)
4930 |                         if output_dir_path and file_ext and save_content is not None:
4931 |                             base_name = Path(input_name).stem
4932 |                             fp = output_dir_path / f"{base_name}_table_{i + 1}.{file_ext}"
4933 |                             try:
4934 |                                 if isinstance(save_content, str):
4935 |                                     fp.write_text(save_content, encoding="utf-8")
4936 |                                 elif isinstance(save_content, pd.DataFrame):
4937 |                                     save_content.to_csv(fp, index=False, encoding="utf-8")
4938 |                                 saved_files.append(str(fp))
4939 |                                 logger.debug(f"Saved table {i + 1} to {fp}")
4940 |                             except Exception as e_save:
4941 |                                 logger.error(
4942 |                                     f"Failed to save table {i + 1} to {fp}: {e_save}", exc_info=True
4943 |                                 )
4944 |                     except Exception as e_format:
4945 |                         logger.error(
4946 |                             f"Failed to format table {i} into '{table_mode}': {e_format}",
4947 |                             exc_info=True,
4948 |                         )
4949 |             logger.info(f"Processed {len(output_tables)} tables into '{table_mode}'.")
4950 |             return {"tables": output_tables, "saved_files": saved_files, "success": True}
4951 |     except Exception as e:
4952 |         logger.error(f"Error in extract_tables for '{input_name}': {e}", exc_info=True)
4953 |         if isinstance(e, (ToolInputError, ToolError)):
4954 |             raise e
4955 |         raise ToolError(
4956 |             "TABLE_EXTRACTION_FAILED", details={"input": input_name, "error": str(e)}
4957 |         ) from e
4958 | 
4959 | 
4960 | ###############################################################################
4961 | # Batch Processing Tool (Standalone)                                          #
4962 | ###############################################################################
4963 | 
4964 | 
4965 | # Map operation names to the standalone functions
4966 | # Placed here to ensure all target functions are defined above
4967 | _OP_MAP: Dict[str, Callable[..., Awaitable[Any]]] = {
4968 |     "convert_document": convert_document,
4969 |     "ocr_image": ocr_image,
4970 |     "enhance_ocr_text": enhance_ocr_text,
4971 |     "clean_and_format_text_as_markdown": clean_and_format_text_as_markdown,
4972 |     "optimize_markdown_formatting": optimize_markdown_formatting,
4973 |     "detect_content_type": detect_content_type,
4974 |     "chunk_document": chunk_document,
4975 |     "summarize_document": summarize_document,
4976 |     "extract_entities": extract_entities,
4977 |     "generate_qa_pairs": generate_qa_pairs,
4978 |     "identify_sections": identify_sections,
4979 |     "extract_metrics": extract_metrics,
4980 |     "flag_risks": flag_risks,
4981 |     "canonicalise_entities": canonicalise_entities,
4982 |     "analyze_pdf_structure": analyze_pdf_structure,
4983 |     "extract_tables": extract_tables,
4984 |     "batch_format_texts": batch_format_texts,
4985 | }
4986 | 
4987 | # Assume necessary imports and _OP_MAP are defined above
4988 | 
4989 | 
4990 | @with_tool_metrics
4991 | @with_error_handling  # Catch errors setting up the batch itself
4992 | async def process_document_batch(
4993 |     inputs: List[Dict[str, Any]],
4994 |     operations: List[Dict[str, Any]],
4995 |     max_concurrency: int = 5,
4996 | ) -> List[Dict[str, Any]]:
4997 |     """
4998 |     Processes a list of input items through a sequence of operations concurrently (Standalone Tool).
4999 | 
5000 |     Args:
5001 |         inputs: List of input dictionaries. Each dict represents an item (e.g., {"document_path": "..."}).
5002 |         operations: List of operation specifications. Each dict defines:
5003 |             - operation (str): Name of the tool function to call (from _OP_MAP).
5004 |             - output_key (str): Key to store the operation's result under in the item's state.
5005 |             - params (dict): Fixed parameters for the operation.
5006 |             Optional:
5007 |             - input_key (str): Key in item state holding primary input (default conventions apply).
5008 |             - input_keys_map (dict): Map function parameters to item state keys.
5009 |             - promote_output (str): Key in result dict to promote to top-level "content".
5010 |         max_concurrency: Max parallel items per operation step.
5011 | 
5012 |     Returns:
5013 |         List of dictionaries, representing the final state of each input item.
5014 |     """
5015 |     # --- Input Validation ---
5016 |     if not isinstance(inputs, list):
5017 |         raise ToolInputError("'inputs' must be a list.")
5018 |     if not isinstance(operations, list):
5019 |         raise ToolInputError("'operations' must be a list.")
5020 |     if not all(isinstance(item, dict) for item in inputs):
5021 |         raise ToolInputError("All items in 'inputs' must be dictionaries.")
5022 |     if not all(isinstance(op, dict) for op in operations):
5023 |         raise ToolInputError("All items in 'operations' must be dictionaries.")
5024 |     max_concurrency = max(1, max_concurrency)
5025 |     if not inputs:
5026 |         logger.warning("Input list is empty.")
5027 |         return []
5028 | 
5029 |     # --- Initialize Results State ---
5030 |     results_state: List[Dict[str, Any]] = []
5031 |     for i, item in enumerate(inputs):
5032 |         state_item = item.copy()
5033 |         state_item["_original_index"] = i
5034 |         state_item["_error_log"] = []
5035 |         state_item["_status"] = "pending"
5036 |         results_state.append(state_item)
5037 | 
5038 |     logger.info(f"Starting batch processing: {len(inputs)} items, {len(operations)} operations.")
5039 | 
5040 |     # --- Define Worker Outside the Loop ---
5041 |     # This worker now takes all potentially changing parameters explicitly
5042 |     async def _apply_op_to_item_worker(
5043 |         item_state: Dict[str, Any],
5044 |         semaphore: asyncio.Semaphore,
5045 |         op_func: Callable,
5046 |         op_name: str,
5047 |         step_label: str,
5048 |         op_output_key: str,
5049 |         op_input_key: Optional[str],
5050 |         op_params: Dict,
5051 |         op_input_map: Dict,
5052 |         op_promote: Optional[str],
5053 |     ) -> Dict[str, Any]:
5054 |         item_idx = item_state["_original_index"]
5055 |         if item_state["_status"] == "failed":
5056 |             return item_state  # Don't process failed items
5057 | 
5058 |         async with semaphore:
5059 |             logger.debug(f"Applying {step_label} to item {item_idx}")
5060 |             call_kwargs = {}
5061 |             primary_input_arg_name = None
5062 |             input_source_key = None
5063 | 
5064 |             try:
5065 |                 # 1. Determine Primary Input Source Key
5066 |                 if op_input_key and op_input_key in item_state:
5067 |                     input_source_key = op_input_key
5068 |                 else:
5069 |                     potential_keys = []
5070 |                     if op_name.startswith(
5071 |                         ("convert_document", "ocr_image", "analyze_pdf_structure", "extract_tables")
5072 |                     ):
5073 |                         potential_keys = [
5074 |                             "document_path",
5075 |                             "image_path",
5076 |                             "file_path",
5077 |                             "document_data",
5078 |                             "image_data",
5079 |                         ]
5080 |                     elif op_name == "canonicalise_entities":
5081 |                         potential_keys = ["entities_input"]
5082 |                     elif op_name == "batch_format_texts":
5083 |                         potential_keys = ["texts"]
5084 |                     else:
5085 |                         potential_keys = ["content", "document", "text"]
5086 | 
5087 |                     for key in potential_keys:
5088 |                         if key in item_state:
5089 |                             input_source_key = key
5090 |                             break
5091 |                     if not input_source_key:
5092 |                         if "content" in item_state:
5093 |                             input_source_key = "content"
5094 |                         elif "document" in item_state:
5095 |                             input_source_key = "document"
5096 |                         else:
5097 |                             raise ToolInputError(
5098 |                                 f"Cannot determine input for op '{op_name}' for item {item_idx}."
5099 |                             )
5100 | 
5101 |                 primary_input_value = item_state[input_source_key]
5102 | 
5103 |                 # 2. Determine Primary Input Argument Name
5104 |                 primary_param_map = {
5105 |                     "document_path": "document_path",
5106 |                     "image_path": "image_path",
5107 |                     "file_path": "file_path",
5108 |                     "document_data": "document_data",
5109 |                     "image_data": "image_data",
5110 |                     "text": "text",
5111 |                     "entities_input": "entities_input",
5112 |                     "texts": "texts",
5113 |                     "document": "document",
5114 |                     "content": "document",
5115 |                 }
5116 |                 primary_input_arg_name = primary_param_map.get(input_source_key)
5117 |                 if not primary_input_arg_name:
5118 |                     # Inspect function signature to find the likely primary argument
5119 |                     try:
5120 |                         func_vars = op_func.__code__.co_varnames[: op_func.__code__.co_argcount]
5121 |                         primary_input_arg_name = (
5122 |                             "document"
5123 |                             if "document" in func_vars
5124 |                             else "text"
5125 |                             if "text" in func_vars
5126 |                             else func_vars[0]
5127 |                             if func_vars
5128 |                             else "input"
5129 |                         )
5130 |                     except (
5131 |                         AttributeError
5132 |                     ):  # Handle cases where introspection fails (e.g., built-ins)
5133 |                         primary_input_arg_name = "document"  # Default guess
5134 |                     logger.warning(
5135 |                         f"Assuming primary arg for op '{op_name}' is '{primary_input_arg_name}'."
5136 |                     )
5137 | 
5138 |                 call_kwargs[primary_input_arg_name] = primary_input_value
5139 | 
5140 |                 # 3. Handle Mapped Inputs
5141 |                 if isinstance(op_input_map, dict):
5142 |                     for param_name, state_key in op_input_map.items():
5143 |                         if state_key not in item_state:
5144 |                             raise ToolInputError(
5145 |                                 f"Mapped key '{state_key}' not found for item {item_idx}.",
5146 |                                 param_name=state_key,
5147 |                             )
5148 |                         if param_name != primary_input_arg_name:
5149 |                             call_kwargs[param_name] = item_state[state_key]
5150 |                         elif call_kwargs[primary_input_arg_name] != item_state[state_key]:
5151 |                             logger.warning(
5152 |                                 f"Mapped input '{param_name}' overrides primary input for item {item_idx}."
5153 |                             )
5154 |                             call_kwargs[primary_input_arg_name] = item_state[state_key]
5155 | 
5156 |                 # 4. Add Fixed Params
5157 |                 if isinstance(op_params, dict):
5158 |                     for p_name, p_value in op_params.items():
5159 |                         if p_name == primary_input_arg_name and p_name in call_kwargs:
5160 |                             logger.warning(
5161 |                                 f"Fixed param '{p_name}' overrides dynamic input for item {item_idx}."
5162 |                             )
5163 |                         call_kwargs[p_name] = p_value
5164 | 
5165 |                 # --- Execute Operation ---
5166 |                 logger.debug(
5167 |                     f"Calling {op_name} for item {item_idx} with args: {list(call_kwargs.keys())}"
5168 |                 )
5169 |                 op_result = await op_func(**call_kwargs)  # Call the standalone function
5170 | 
5171 |                 # --- Process Result ---
5172 |                 if not isinstance(op_result, dict):
5173 |                     raise ToolError(
5174 |                         "INVALID_RESULT_FORMAT",
5175 |                         details={"operation": op_name, "result_type": type(op_result).__name__},
5176 |                     )
5177 |                 item_state[op_output_key] = op_result  # Store full result
5178 | 
5179 |                 # Promote output if requested
5180 |                 if op_promote and isinstance(op_promote, str):
5181 |                     if op_promote in op_result:
5182 |                         item_state["content"] = op_result[op_promote]
5183 |                         logger.debug(f"Promoted '{op_promote}' to 'content' for item {item_idx}")
5184 |                     else:
5185 |                         logger.warning(
5186 |                             f"Cannot promote key '{op_promote}' for item {item_idx}: key not found in result."
5187 |                         )
5188 | 
5189 |                 # Update status based on success flag
5190 |                 if not op_result.get("success", False):
5191 |                     err_msg = op_result.get("error", f"Op '{op_name}' failed.")
5192 |                     err_code = op_result.get("error_code", "PROCESSING_ERROR")
5193 |                     log_entry = f"{step_label} Failed: [{err_code}] {err_msg}"
5194 |                     item_state["_error_log"].append(log_entry)
5195 |                     item_state["_status"] = "failed"
5196 |                     logger.warning(f"Op '{op_name}' failed for item {item_idx}: {err_msg}")
5197 |                 elif item_state["_status"] != "failed":
5198 |                     item_state["_status"] = "processed"
5199 | 
5200 |             # --- Error Handling for Worker ---
5201 |             except ToolInputError as tie:
5202 |                 error_msg = f"{step_label} Input Error: [{tie.error_code}] {str(tie)}"
5203 |                 logger.error(f"{error_msg} for item {item_idx}", exc_info=False)
5204 |                 item_state["_error_log"].append(error_msg)
5205 |                 item_state["_status"] = "failed"
5206 |                 item_state[op_output_key] = {
5207 |                     "error": str(tie),
5208 |                     "error_code": tie.error_code,
5209 |                     "success": False,
5210 |                 }
5211 |             except ToolError as te:
5212 |                 error_msg = f"{step_label} Tool Error: [{te.error_code}] {str(te)}"
5213 |                 logger.error(f"{error_msg} for item {item_idx}", exc_info=True)
5214 |                 item_state["_error_log"].append(error_msg)
5215 |                 item_state["_status"] = "failed"
5216 |                 item_state[op_output_key] = {
5217 |                     "error": str(te),
5218 |                     "error_code": te.error_code,
5219 |                     "success": False,
5220 |                 }
5221 |             except Exception as e:
5222 |                 error_msg = f"{step_label} Unexpected Error: {type(e).__name__}: {str(e)}"
5223 |                 logger.error(f"{error_msg} for item {item_idx}", exc_info=True)
5224 |                 item_state["_error_log"].append(error_msg)
5225 |                 item_state["_status"] = "failed"
5226 |                 item_state[op_output_key] = {
5227 |                     "error": str(e),
5228 |                     "error_type": type(e).__name__,
5229 |                     "success": False,
5230 |                 }
5231 |             return item_state
5232 | 
5233 |     # --- Apply Operations Sequentially ---
5234 |     for op_index, op_spec in enumerate(operations):
5235 |         op_name = op_spec.get("operation")
5236 |         op_output_key = op_spec.get("output_key")
5237 |         op_params = op_spec.get("params", {})
5238 |         op_input_key = op_spec.get("input_key")
5239 |         op_input_map = op_spec.get("input_keys_map", {})
5240 |         op_promote = op_spec.get("promote_output")
5241 | 
5242 |         # --- Validate Operation Spec (robust checks) ---
5243 |         if not op_name or not isinstance(op_name, str) or op_name not in _OP_MAP:
5244 |             error_msg = f"Invalid/unknown operation '{op_name}' at step {op_index + 1}."
5245 |             logger.error(error_msg + " Skipping step for all items.")
5246 |             for item_state in results_state:
5247 |                 if item_state["_status"] != "failed":
5248 |                     item_state["_error_log"].append(error_msg + " (Skipped)")
5249 |                     item_state["_status"] = "failed"
5250 |             continue
5251 |         if not op_output_key or not isinstance(op_output_key, str):
5252 |             error_msg = f"Missing/invalid 'output_key' for '{op_name}' at step {op_index + 1}."
5253 |             logger.error(error_msg + " Skipping step for all items.")
5254 |             for item_state in results_state:
5255 |                 if item_state["_status"] != "failed":
5256 |                     item_state["_error_log"].append(error_msg + " (Skipped)")
5257 |                     item_state["_status"] = "failed"
5258 |             continue
5259 |         if not isinstance(op_params, dict):
5260 |             error_msg = f"Invalid 'params' (must be dict) for '{op_name}' at step {op_index + 1}."
5261 |             logger.error(error_msg + " Skipping step for all items.")
5262 |             for item_state in results_state:
5263 |                 if item_state["_status"] != "failed":
5264 |                     item_state["_error_log"].append(error_msg + " (Skipped)")
5265 |                     item_state["_status"] = "failed"
5266 |             continue
5267 | 
5268 |         # Get the actual function from the map
5269 |         current_op_func = _OP_MAP[op_name]
5270 |         current_step_label = f"Step {op_index + 1}/{len(operations)}: '{op_name}'"
5271 |         logger.info(f"--- Starting {current_step_label} (Concurrency: {max_concurrency}) ---")
5272 | 
5273 |         # --- Run Tasks for Current Step ---
5274 |         step_semaphore = asyncio.Semaphore(max_concurrency)
5275 |         step_tasks = [
5276 |             # Call the single worker function, passing the current loop's values
5277 |             _apply_op_to_item_worker(
5278 |                 item_state=item_state,
5279 |                 semaphore=step_semaphore,
5280 |                 op_func=current_op_func,
5281 |                 op_name=op_name,
5282 |                 step_label=current_step_label,
5283 |                 op_output_key=op_output_key,
5284 |                 op_input_key=op_input_key,
5285 |                 op_params=op_params,
5286 |                 op_input_map=op_input_map,
5287 |                 op_promote=op_promote,
5288 |             )
5289 |             for item_state in results_state
5290 |         ]
5291 |         updated_states = await asyncio.gather(*step_tasks)
5292 |         results_state = updated_states  # Update the main state list
5293 | 
5294 |         # Log summary after step
5295 |         step_processed_count = sum(1 for s in results_state if s.get("_status") == "processed")
5296 |         step_fail_count = sum(1 for s in results_state if s.get("_status") == "failed")
5297 |         logger.info(
5298 |             f"--- Finished {current_step_label} (Processed: {step_processed_count}, Failed: {step_fail_count}) ---"
5299 |         )
5300 | 
5301 |     # --- Final Cleanup ---
5302 |     final_results = []
5303 |     for item_state in results_state:
5304 |         final_item = item_state.copy()
5305 |         final_item.pop("_original_index", None)
5306 |         # Keep _status and _error_log for visibility
5307 |         final_results.append(final_item)
5308 | 
5309 |     logger.info(f"Batch processing finished for {len(inputs)} items.")
5310 |     return final_results
5311 | 
```
Page 43/45FirstPrevNextLast