This is page 27 of 45. Use http://codebase.md/dicklesworthstone/llm_gateway_mcp_server?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .cursorignore
├── .env.example
├── .envrc
├── .gitignore
├── additional_features.md
├── check_api_keys.py
├── completion_support.py
├── comprehensive_test.py
├── docker-compose.yml
├── Dockerfile
├── empirically_measured_model_speeds.json
├── error_handling.py
├── example_structured_tool.py
├── examples
│   ├── __init__.py
│   ├── advanced_agent_flows_using_unified_memory_system_demo.py
│   ├── advanced_extraction_demo.py
│   ├── advanced_unified_memory_system_demo.py
│   ├── advanced_vector_search_demo.py
│   ├── analytics_reporting_demo.py
│   ├── audio_transcription_demo.py
│   ├── basic_completion_demo.py
│   ├── cache_demo.py
│   ├── claude_integration_demo.py
│   ├── compare_synthesize_demo.py
│   ├── cost_optimization.py
│   ├── data
│   │   ├── sample_event.txt
│   │   ├── Steve_Jobs_Introducing_The_iPhone_compressed.md
│   │   └── Steve_Jobs_Introducing_The_iPhone_compressed.mp3
│   ├── docstring_refiner_demo.py
│   ├── document_conversion_and_processing_demo.py
│   ├── entity_relation_graph_demo.py
│   ├── filesystem_operations_demo.py
│   ├── grok_integration_demo.py
│   ├── local_text_tools_demo.py
│   ├── marqo_fused_search_demo.py
│   ├── measure_model_speeds.py
│   ├── meta_api_demo.py
│   ├── multi_provider_demo.py
│   ├── ollama_integration_demo.py
│   ├── prompt_templates_demo.py
│   ├── python_sandbox_demo.py
│   ├── rag_example.py
│   ├── research_workflow_demo.py
│   ├── sample
│   │   ├── article.txt
│   │   ├── backprop_paper.pdf
│   │   ├── buffett.pdf
│   │   ├── contract_link.txt
│   │   ├── legal_contract.txt
│   │   ├── medical_case.txt
│   │   ├── northwind.db
│   │   ├── research_paper.txt
│   │   ├── sample_data.json
│   │   └── text_classification_samples
│   │       ├── email_classification.txt
│   │       ├── news_samples.txt
│   │       ├── product_reviews.txt
│   │       └── support_tickets.txt
│   ├── sample_docs
│   │   └── downloaded
│   │       └── attention_is_all_you_need.pdf
│   ├── sentiment_analysis_demo.py
│   ├── simple_completion_demo.py
│   ├── single_shot_synthesis_demo.py
│   ├── smart_browser_demo.py
│   ├── sql_database_demo.py
│   ├── sse_client_demo.py
│   ├── test_code_extraction.py
│   ├── test_content_detection.py
│   ├── test_ollama.py
│   ├── text_classification_demo.py
│   ├── text_redline_demo.py
│   ├── tool_composition_examples.py
│   ├── tournament_code_demo.py
│   ├── tournament_text_demo.py
│   ├── unified_memory_system_demo.py
│   ├── vector_search_demo.py
│   ├── web_automation_instruction_packs.py
│   └── workflow_delegation_demo.py
├── LICENSE
├── list_models.py
├── marqo_index_config.json.example
├── mcp_protocol_schema_2025-03-25_version.json
├── mcp_python_lib_docs.md
├── mcp_tool_context_estimator.py
├── model_preferences.py
├── pyproject.toml
├── quick_test.py
├── README.md
├── resource_annotations.py
├── run_all_demo_scripts_and_check_for_errors.py
├── storage
│   └── smart_browser_internal
│       ├── locator_cache.db
│       ├── readability.js
│       └── storage_state.enc
├── test_client.py
├── test_connection.py
├── TEST_README.md
├── test_sse_client.py
├── test_stdio_client.py
├── tests
│   ├── __init__.py
│   ├── conftest.py
│   ├── integration
│   │   ├── __init__.py
│   │   └── test_server.py
│   ├── manual
│   │   ├── test_extraction_advanced.py
│   │   └── test_extraction.py
│   └── unit
│       ├── __init__.py
│       ├── test_cache.py
│       ├── test_providers.py
│       └── test_tools.py
├── TODO.md
├── tool_annotations.py
├── tools_list.json
├── ultimate_mcp_banner.webp
├── ultimate_mcp_logo.webp
├── ultimate_mcp_server
│   ├── __init__.py
│   ├── __main__.py
│   ├── cli
│   │   ├── __init__.py
│   │   ├── __main__.py
│   │   ├── commands.py
│   │   ├── helpers.py
│   │   └── typer_cli.py
│   ├── clients
│   │   ├── __init__.py
│   │   ├── completion_client.py
│   │   └── rag_client.py
│   ├── config
│   │   └── examples
│   │       └── filesystem_config.yaml
│   ├── config.py
│   ├── constants.py
│   ├── core
│   │   ├── __init__.py
│   │   ├── evaluation
│   │   │   ├── base.py
│   │   │   └── evaluators.py
│   │   ├── providers
│   │   │   ├── __init__.py
│   │   │   ├── anthropic.py
│   │   │   ├── base.py
│   │   │   ├── deepseek.py
│   │   │   ├── gemini.py
│   │   │   ├── grok.py
│   │   │   ├── ollama.py
│   │   │   ├── openai.py
│   │   │   └── openrouter.py
│   │   ├── server.py
│   │   ├── state_store.py
│   │   ├── tournaments
│   │   │   ├── manager.py
│   │   │   ├── tasks.py
│   │   │   └── utils.py
│   │   └── ums_api
│   │       ├── __init__.py
│   │       ├── ums_database.py
│   │       ├── ums_endpoints.py
│   │       ├── ums_models.py
│   │       └── ums_services.py
│   ├── exceptions.py
│   ├── graceful_shutdown.py
│   ├── services
│   │   ├── __init__.py
│   │   ├── analytics
│   │   │   ├── __init__.py
│   │   │   ├── metrics.py
│   │   │   └── reporting.py
│   │   ├── cache
│   │   │   ├── __init__.py
│   │   │   ├── cache_service.py
│   │   │   ├── persistence.py
│   │   │   ├── strategies.py
│   │   │   └── utils.py
│   │   ├── cache.py
│   │   ├── document.py
│   │   ├── knowledge_base
│   │   │   ├── __init__.py
│   │   │   ├── feedback.py
│   │   │   ├── manager.py
│   │   │   ├── rag_engine.py
│   │   │   ├── retriever.py
│   │   │   └── utils.py
│   │   ├── prompts
│   │   │   ├── __init__.py
│   │   │   ├── repository.py
│   │   │   └── templates.py
│   │   ├── prompts.py
│   │   └── vector
│   │       ├── __init__.py
│   │       ├── embeddings.py
│   │       └── vector_service.py
│   ├── tool_token_counter.py
│   ├── tools
│   │   ├── __init__.py
│   │   ├── audio_transcription.py
│   │   ├── base.py
│   │   ├── completion.py
│   │   ├── docstring_refiner.py
│   │   ├── document_conversion_and_processing.py
│   │   ├── enhanced-ums-lookbook.html
│   │   ├── entity_relation_graph.py
│   │   ├── excel_spreadsheet_automation.py
│   │   ├── extraction.py
│   │   ├── filesystem.py
│   │   ├── html_to_markdown.py
│   │   ├── local_text_tools.py
│   │   ├── marqo_fused_search.py
│   │   ├── meta_api_tool.py
│   │   ├── ocr_tools.py
│   │   ├── optimization.py
│   │   ├── provider.py
│   │   ├── pyodide_boot_template.html
│   │   ├── python_sandbox.py
│   │   ├── rag.py
│   │   ├── redline-compiled.css
│   │   ├── sentiment_analysis.py
│   │   ├── single_shot_synthesis.py
│   │   ├── smart_browser.py
│   │   ├── sql_databases.py
│   │   ├── text_classification.py
│   │   ├── text_redline_tools.py
│   │   ├── tournament.py
│   │   ├── ums_explorer.html
│   │   └── unified_memory_system.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── async_utils.py
│   │   ├── display.py
│   │   ├── logging
│   │   │   ├── __init__.py
│   │   │   ├── console.py
│   │   │   ├── emojis.py
│   │   │   ├── formatter.py
│   │   │   ├── logger.py
│   │   │   ├── panels.py
│   │   │   ├── progress.py
│   │   │   └── themes.py
│   │   ├── parse_yaml.py
│   │   ├── parsing.py
│   │   ├── security.py
│   │   └── text.py
│   └── working_memory_api.py
├── unified_memory_system_technical_analysis.md
└── uv.lock
```

# Files

--------------------------------------------------------------------------------
/ultimate_mcp_server/tools/ocr_tools.py:
--------------------------------------------------------------------------------

```python
   1 | """OCR Tools for Ultimate MCP Server.
   2 | 
   3 | This module provides tools for OCR (Optical Character Recognition) processing, 
   4 | leveraging LLMs to improve the quality of extracted text from PDFs and images.
   5 | 
   6 | Features:
   7 | - PDF to image conversion with optimized preprocessing
   8 | - Multiple extraction methods (OCR, direct text extraction, hybrid approach)
   9 | - Intelligent text segmentation and processing for large documents
  10 | - LLM-based error correction and formatting
  11 | - Table detection and formatting
  12 | - Multi-language support
  13 | - Quality assessment with detailed metrics
  14 | - PDF structure analysis
  15 | - Batch processing with concurrency control
  16 | - Sophisticated caching for improved performance
  17 | 
  18 | Example usage:
  19 | ```python
  20 | # Extract text from a PDF file with LLM correction
  21 | result = await client.tools.extract_text_from_pdf(
  22 |     file_path="document.pdf",
  23 |     extraction_method="hybrid",  # Try direct text extraction first, fall back to OCR if needed
  24 |     max_pages=5,
  25 |     skip_pages=0,
  26 |     reformat_as_markdown=True,
  27 |     suppress_headers=True
  28 | )
  29 | 
  30 | # Process an image file with custom preprocessing
  31 | result = await client.tools.process_image_ocr(
  32 |     image_path="scan.jpg",
  33 |     preprocessing_options={
  34 |         "denoise": True,
  35 |         "threshold": "adaptive",
  36 |         "deskew": True
  37 |     },
  38 |     ocr_language="eng+fra",  # Multi-language support
  39 |     assess_quality=True
  40 | )
  41 | 
  42 | # Enhance existing OCR text with LLM
  43 | result = await client.tools.enhance_ocr_text(
  44 |     ocr_text="Text with OCK errors and broken lin- es",
  45 |     reformat_as_markdown=True,
  46 |     remove_headers=True
  47 | )
  48 | 
  49 | # Analyze PDF structure without full extraction
  50 | info = await client.tools.analyze_pdf_structure(
  51 |     file_path="document.pdf",
  52 |     extract_metadata=True,
  53 |     extract_outline=True,
  54 |     extract_fonts=True
  55 | )
  56 | 
  57 | # Batch process multiple PDFs
  58 | result = await client.tools.batch_process_documents(
  59 |     folder_path="/path/to/documents",
  60 |     file_pattern="*.pdf",
  61 |     output_folder="/path/to/output",
  62 |     max_concurrency=3
  63 | )
  64 | ```
  65 | """
  66 | import asyncio
  67 | import base64
  68 | import functools
  69 | import hashlib
  70 | import io
  71 | import json
  72 | import math
  73 | import os
  74 | import re
  75 | import tempfile
  76 | import time
  77 | import traceback
  78 | import uuid
  79 | from concurrent.futures import ThreadPoolExecutor
  80 | from pathlib import Path
  81 | from typing import Any, Dict, List, Optional, Set, Tuple, Union
  82 | 
  83 | # Try importing required libraries with fallbacks
  84 | try:
  85 |     import numpy as np
  86 |     HAS_NUMPY = True
  87 | except ImportError:
  88 |     HAS_NUMPY = False
  89 | 
  90 | try:
  91 |     from PIL import Image, ImageEnhance, ImageFilter
  92 |     HAS_PIL = True
  93 | except ImportError:
  94 |     HAS_PIL = False
  95 | 
  96 | try:
  97 |     import cv2
  98 |     HAS_CV2 = True
  99 | except ImportError:
 100 |     HAS_CV2 = False
 101 | 
 102 | try:
 103 |     import pytesseract
 104 |     HAS_PYTESSERACT = True
 105 | except ImportError:
 106 |     HAS_PYTESSERACT = False
 107 | 
 108 | try:
 109 |     from pdf2image import convert_from_bytes, convert_from_path
 110 |     HAS_PDF2IMAGE = True
 111 | except ImportError:
 112 |     HAS_PDF2IMAGE = False
 113 | 
 114 | try:
 115 |     import pdfplumber
 116 |     HAS_PDFPLUMBER = True
 117 | except ImportError:
 118 |     HAS_PDFPLUMBER = False
 119 | 
 120 | try:
 121 |     import pymupdf  # PyMuPDF
 122 |     HAS_PYMUPDF = True
 123 | except ImportError:
 124 |     HAS_PYMUPDF = False
 125 | 
  126 | # Import tools and helpers from ultimate_mcp_server
 127 | from ultimate_mcp_server.constants import Provider, TaskType
 128 | from ultimate_mcp_server.exceptions import ProviderError, ToolError, ToolInputError
 129 | from ultimate_mcp_server.tools.base import (
 130 |     with_cache,
 131 |     with_error_handling,
 132 |     with_retry,
 133 |     with_tool_metrics,
 134 | )
 135 | from ultimate_mcp_server.tools.completion import generate_completion
 136 | from ultimate_mcp_server.utils import get_logger
 137 | 
 138 | logger = get_logger("ultimate_mcp_server.tools.ocr")
 139 | 
 140 | # Cache for storing preprocessed images and extracted text
 141 | OCR_CACHE = {}
 142 | 
 143 | # Check if required dependencies are available
 144 | def _check_ocr_dependencies():
 145 |     """Checks if OCR dependencies are available and returns a dictionary of requirements."""
 146 |     requirements = {
 147 |         "numpy": HAS_NUMPY,
 148 |         "PIL": HAS_PIL,
 149 |         "cv2": HAS_CV2,
 150 |         "pytesseract": HAS_PYTESSERACT,
 151 |         "pdf2image": HAS_PDF2IMAGE,
 152 |         "pdfplumber": HAS_PDFPLUMBER,
 153 |         "pymupdf": HAS_PYMUPDF
 154 |     }
 155 |     
 156 |     missing = [lib for lib, available in requirements.items() if not available]
 157 |     
 158 |     if missing:
 159 |         logger.warning(f"Some OCR dependencies are missing: {', '.join(missing)}")
 160 |         logger.warning("OCR functionality may be limited. Install required packages with:")
 161 |         packages = {
 162 |             "numpy": "numpy",
 163 |             "PIL": "pillow",
 164 |             "cv2": "opencv-python-headless",
 165 |             "pytesseract": "pytesseract",
 166 |             "pdf2image": "pdf2image",
 167 |             "pdfplumber": "pdfplumber",
 168 |             "pymupdf": "pymupdf"
 169 |         }
 170 |         
 171 |         pip_command = f"pip install {' '.join(packages[lib] for lib in missing)}"
 172 |         logger.warning(f"  {pip_command}")
 173 |     
 174 |     return requirements, missing
 175 | 
 176 | # Check dependencies early
 177 | OCR_REQUIREMENTS, MISSING_REQUIREMENTS = _check_ocr_dependencies()
 178 | 
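# Illustrative one-shot install covering every optional dependency checked
# above (package names taken from the `packages` mapping in
# _check_ocr_dependencies; adjust to your environment):
#
#     pip install numpy pillow opencv-python-headless pytesseract pdf2image pdfplumber pymupdf
#
# Note that pytesseract additionally requires the system `tesseract` binary,
# and pdf2image requires the poppler utilities, installed separately.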
 179 | # --- Helper functions for OCR processing ---
 180 | 
 181 | def _validate_file_path(file_path: str, expected_extension: Optional[str] = None) -> None:
 182 |     """
 183 |     Validates a file path exists and optionally has the expected extension.
 184 |     
 185 |     Args:
 186 |         file_path: Path to the file to validate
 187 |         expected_extension: Optional file extension to check (e.g., '.pdf')
 188 |         
 189 |     Raises:
 190 |         ToolInputError: If validation fails
 191 |     """
 192 |     if not file_path:
 193 |         raise ToolInputError("File path cannot be empty")
 194 |     
 195 |     file_path = os.path.expanduser(os.path.normpath(file_path))
 196 |     
 197 |     if not os.path.exists(file_path):
 198 |         raise ToolInputError(f"File not found: {file_path}")
 199 |     
 200 |     if not os.path.isfile(file_path):
 201 |         raise ToolInputError(f"Path is not a file: {file_path}")
 202 |     
 203 |     if expected_extension and not file_path.lower().endswith(expected_extension.lower()):
 204 |         raise ToolInputError(f"File does not have the expected extension ({expected_extension}): {file_path}")
 205 | 
 206 | def _get_task_type_for_ocr(extraction_method: str = "hybrid") -> str:
 207 |     """
 208 |     Returns the appropriate TaskType for OCR operations based on extraction method.
 209 |     
 210 |     Args:
 211 |         extraction_method: The extraction method being used
 212 |         
 213 |     Returns:
 214 |         The TaskType value as a string
 215 |     """
 216 |     if extraction_method == "direct":
 217 |         return TaskType.TEXT_EXTRACTION.value
 218 |     elif extraction_method == "ocr":
 219 |         return TaskType.OCR.value
 220 |     else:  # hybrid
 221 |         return TaskType.OCR.value
 222 | 
 223 | def _handle_provider_error(e: Exception, operation: str) -> ToolError:
 224 |     """
 225 |     Handles provider-specific errors and converts them to tool errors.
 226 |     
 227 |     Args:
 228 |         e: The exception that was raised
 229 |         operation: Description of the operation that failed
 230 |         
 231 |     Returns:
 232 |         A ToolError with appropriate message
 233 |     """
 234 |     if isinstance(e, ProviderError):
 235 |         # Handle specific provider errors
 236 |         return ToolError(f"Provider error during {operation}: {str(e)}")
 237 |     else:
 238 |         # Handle generic errors
 239 |         return ToolError(f"Error during {operation}: {str(e)}")
 240 | 
 241 | def _preprocess_image(image: Image.Image, preprocessing_options: Optional[Dict[str, Any]] = None) -> Image.Image:
 242 |     """
 243 |     Preprocesses an image for better OCR results.
 244 |     
 245 |     Args:
 246 |         image: PIL Image object
 247 |         preprocessing_options: Dictionary of preprocessing options
 248 |             - denoise: Whether to apply denoising (default: True)
 249 |             - threshold: Thresholding method ('otsu', 'adaptive', 'none') (default: 'otsu')
 250 |             - deskew: Whether to deskew the image (default: True)
 251 |             - enhance_contrast: Whether to enhance contrast (default: True)
 252 |             - enhance_brightness: Whether to enhance brightness (default: False)
 253 |             - enhance_sharpness: Whether to enhance sharpness (default: False)
 254 |             - apply_filters: List of filters to apply (default: [])
 255 |             - resize_factor: Factor to resize the image by (default: 1.0)
 256 |         
 257 |     Returns:
 258 |         Preprocessed PIL Image object
 259 |     """
 260 |     if not HAS_CV2 or not HAS_NUMPY or not HAS_PIL:
 261 |         logger.warning("Image preprocessing requires opencv-python, numpy, and pillow. Using original image.")
 262 |         return image
 263 |     
 264 |     # Default preprocessing options
 265 |     if preprocessing_options is None:
 266 |         preprocessing_options = {
 267 |             "denoise": True,
 268 |             "threshold": "otsu",
 269 |             "deskew": True,
 270 |             "enhance_contrast": True,
 271 |             "enhance_brightness": False,
 272 |             "enhance_sharpness": False,
 273 |             "apply_filters": [],
 274 |             "resize_factor": 1.0
 275 |         }
 276 |     
 277 |     # Apply PIL enhancements before OpenCV processing if enabled
 278 |     if HAS_PIL:
 279 |         # Enhance brightness if requested
 280 |         if preprocessing_options.get("enhance_brightness", False):
 281 |             enhancer = ImageEnhance.Brightness(image)
 282 |             # Increase brightness by 30%
 283 |             image = enhancer.enhance(1.3)
 284 |         
 285 |         # Enhance contrast if requested using PIL (in addition to OpenCV method)
 286 |         if preprocessing_options.get("enhance_contrast", True):
 287 |             enhancer = ImageEnhance.Contrast(image)
 288 |             # Increase contrast by 40%
 289 |             image = enhancer.enhance(1.4)
 290 |         
 291 |         # Enhance sharpness if requested
 292 |         if preprocessing_options.get("enhance_sharpness", False):
 293 |             enhancer = ImageEnhance.Sharpness(image)
 294 |             # Increase sharpness by 50%
 295 |             image = enhancer.enhance(1.5)
 296 |             
 297 |         # Apply filters if specified
 298 |         filters = preprocessing_options.get("apply_filters", [])
 299 |         for filter_name in filters:
 300 |             if filter_name == "unsharp_mask":
 301 |                 image = image.filter(ImageFilter.UnsharpMask(radius=2, percent=150))
 302 |             elif filter_name == "detail":
 303 |                 image = image.filter(ImageFilter.DETAIL)
 304 |             elif filter_name == "edge_enhance":
 305 |                 image = image.filter(ImageFilter.EDGE_ENHANCE)
 306 |             elif filter_name == "smooth":
 307 |                 image = image.filter(ImageFilter.SMOOTH)
 308 |     
 309 |     # Convert PIL Image to OpenCV format
 310 |     img = np.array(image)
 311 |     if len(img.shape) == 3 and img.shape[2] == 3:
 312 |         gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
 313 |     else:
 314 |         gray = img
 315 |     
 316 |     # Calculate optimal scaling based on image size and content
 317 |     original_height, original_width = gray.shape[:2]
 318 |     resize_factor = preprocessing_options.get("resize_factor", 1.0)
 319 |     
 320 |     # Adaptive scaling based on image dimensions for optimal OCR
 321 |     # For very small images, increase size; for very large images, reduce
 322 |     if resize_factor == 1.0:  # Only auto-adjust if user didn't specify
 323 |         # Calculate the ideal size range for OCR (1500-3500 pixels on longest edge)
 324 |         longest_edge = max(original_width, original_height)
 325 |         if longest_edge < 1500:
 326 |             # For small images, scale up to improve OCR
 327 |             resize_factor = math.ceil(1500 / longest_edge * 10) / 10  # Round to nearest 0.1
 328 |         elif longest_edge > 3500:
 329 |             # For large images, scale down to improve performance
 330 |             resize_factor = math.floor(3500 / longest_edge * 10) / 10  # Round to nearest 0.1
 331 |     
 332 |     # Enhance contrast
 333 |     if preprocessing_options.get("enhance_contrast", True):
 334 |         gray = cv2.equalizeHist(gray)
 335 |     
 336 |     # Apply thresholding
 337 |     threshold_method = preprocessing_options.get("threshold", "otsu")
 338 |     if threshold_method == "otsu":
 339 |         _, img_thresholded = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
 340 |     elif threshold_method == "adaptive":
 341 |         # Calculate optimal block size based on image dimensions (odd number)
 342 |         block_size = math.floor(min(gray.shape) / 30)
 343 |         block_size = max(3, block_size)
 344 |         if block_size % 2 == 0:
 345 |             block_size += 1
 346 |         img_thresholded = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, block_size, 2)
 347 |     else:
 348 |         img_thresholded = gray
 349 |     
 350 |     # Denoise
 351 |     if preprocessing_options.get("denoise", True):
 352 |         # Calculate optimal denoising parameters based on image size
 353 |         h_param = math.ceil(10 * math.log10(min(original_width, original_height)))
 354 |         img_denoised = cv2.fastNlMeansDenoising(img_thresholded, None, h_param, 7, 21)
 355 |     else:
 356 |         img_denoised = img_thresholded
 357 |     
 358 |     # Deskew
 359 |     if preprocessing_options.get("deskew", True) and HAS_NUMPY:
 360 |         try:
 361 |             coords = np.column_stack(np.where(img_denoised > 0))
 362 |             angle = cv2.minAreaRect(coords)[-1]
 363 |             
 364 |             if angle < -45:
 365 |                 angle = -(90 + angle)
 366 |             else:
 367 |                 angle = -angle
 368 |                 
 369 |             # Rotate to correct skew if significant skew detected
 370 |             if abs(angle) > 0.5:
 371 |                 (h, w) = img_denoised.shape[:2]
 372 |                 center = (w // 2, h // 2)
 373 |                 M = cv2.getRotationMatrix2D(center, angle, 1.0)
 374 |                 img_deskewed = cv2.warpAffine(img_denoised, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
 375 |             else:
 376 |                 img_deskewed = img_denoised
 377 |         except Exception as e:
 378 |             logger.warning(f"Deskewing failed: {str(e)}. Using non-deskewed image.")
 379 |             img_deskewed = img_denoised
 380 |     else:
 381 |         img_deskewed = img_denoised
 382 |     
 383 |     # Resize if needed
 384 |     if resize_factor != 1.0:
 385 |         # Use ceiling to ensure we don't lose pixels in important small details
 386 |         new_w = math.ceil(original_width * resize_factor)
 387 |         new_h = math.ceil(original_height * resize_factor)
 388 |         img_resized = cv2.resize(img_deskewed, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
 389 |     else:
 390 |         img_resized = img_deskewed
 391 |     
 392 |     # Convert back to PIL Image
 393 |     return Image.fromarray(img_resized)
 394 | 
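# Sketch of a direct call to the helper above (assumes Pillow is available
# and that "scan.jpg" is a real file; option values are just an example):
#
#     page = Image.open("scan.jpg")
#     cleaned = _preprocess_image(page, {
#         "denoise": True,
#         "threshold": "adaptive",
#         "deskew": True,
#         "enhance_contrast": True,
#         "resize_factor": 1.0,
#     })
#     # `cleaned` is a new PIL Image ready to be passed to _extract_text_with_ocr.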
 395 | def _extract_text_with_ocr(image: Image.Image, ocr_language: str = "eng", ocr_config: str = "") -> str:
 396 |     """
 397 |     Extracts text from an image using OCR.
 398 |     
 399 |     Args:
 400 |         image: PIL Image object
 401 |         ocr_language: Language(s) for OCR (default: "eng")
 402 |         ocr_config: Additional configuration for Tesseract
 403 |         
 404 |     Returns:
 405 |         Extracted text
 406 |     """
 407 |     if not HAS_PYTESSERACT:
 408 |         raise ToolError("pytesseract is required for OCR text extraction")
 409 |     
 410 |     try:
 411 |         custom_config = f"-l {ocr_language} {ocr_config}"
 412 |         return pytesseract.image_to_string(image, config=custom_config)
 413 |     except Exception as e:
 414 |         logger.error(f"OCR extraction failed: {str(e)}")
 415 |         raise ToolError(f"OCR extraction failed: {str(e)}") from e
 416 | 
 417 | def _extract_text_from_pdf_direct(file_path: str, start_page: int = 0, max_pages: int = 0) -> Tuple[List[str], bool]:
 418 |     """
 419 |     Extracts text directly from a PDF file without OCR.
 420 |     
 421 |     Args:
 422 |         file_path: Path to the PDF file
 423 |         start_page: First page to extract (0-indexed)
 424 |         max_pages: Maximum number of pages to extract (0 = all)
 425 |         
 426 |     Returns:
 427 |         Tuple of (extracted_text_list, has_text)
 428 |     """
 429 |     texts = []
 430 |     has_text = False
 431 |     
 432 |     if HAS_PDFPLUMBER:
 433 |         try:
 434 |             with pdfplumber.open(file_path) as pdf:
 435 |                 total_pages = len(pdf.pages)
 436 |                 end_page = total_pages if max_pages == 0 else min(start_page + max_pages, total_pages)
 437 |                 
 438 |                 for i in range(start_page, end_page):
 439 |                     try:
 440 |                         page = pdf.pages[i]
 441 |                         text = page.extract_text(x_tolerance=3, y_tolerance=3)
 442 |                         if text and text.strip():
 443 |                             has_text = True
 444 |                         texts.append(text or "")
 445 |                     except Exception as e:
 446 |                         logger.warning(f"Error extracting text from page {i+1}: {str(e)}")
 447 |                         texts.append("")
 448 |         except Exception as e:
 449 |             logger.error(f"Error extracting text directly from PDF: {str(e)}")
 450 |             raise ToolError(f"Failed to extract text directly from PDF: {str(e)}") from e
 451 |     
 452 |     elif HAS_PYMUPDF:
 453 |         try:
 454 |             with pymupdf.open(file_path) as doc:
 455 |                 total_pages = len(doc)
 456 |                 end_page = total_pages if max_pages == 0 else min(start_page + max_pages, total_pages)
 457 |                 
 458 |                 for i in range(start_page, end_page):
 459 |                     try:
 460 |                         page = doc[i]
 461 |                         text = page.get_text()
 462 |                         if text and text.strip():
 463 |                             has_text = True
 464 |                         texts.append(text or "")
 465 |                     except Exception as e:
 466 |                         logger.warning(f"Error extracting text from page {i+1}: {str(e)}")
 467 |                         texts.append("")
 468 |         except Exception as e:
 469 |             logger.error(f"Error extracting text directly from PDF: {str(e)}")
 470 |             raise ToolError(f"Failed to extract text directly from PDF: {str(e)}") from e
 471 |     
 472 |     else:
 473 |         logger.warning("No PDF text extraction library available (pdfplumber or PyMuPDF)")
 474 |         raise ToolError("No PDF text extraction library available. Install pdfplumber or PyMuPDF.")
 475 |     
 476 |     return texts, has_text
 477 | 
 478 | def _convert_pdf_to_images(file_path, start_page=0, max_pages=0, dpi=300):
 479 |     """
 480 |     Converts pages of a PDF file to PIL Image objects.
 481 |     
 482 |     Args:
 483 |         file_path: Path to the PDF file
 484 |         start_page: First page to convert (0-indexed)
 485 |         max_pages: Maximum number of pages to convert (0 = all)
 486 |         dpi: DPI for rendering (default: 300)
 487 |         
 488 |     Returns:
 489 |         List of PIL Image objects
 490 |     """
 491 |     if not HAS_PDF2IMAGE:
 492 |         raise ToolError("pdf2image is required for PDF to image conversion")
 493 |     
 494 |     try:
 495 |         # Create a temporary directory to store intermediate images
 496 |         # This helps with memory management for large PDFs
 497 |         with tempfile.TemporaryDirectory() as temp_dir:
 498 |             # pdf2image uses 1-based indexing
 499 |             first_page = start_page + 1
 500 |             last_page = None if max_pages == 0 else first_page + max_pages - 1
 501 |             
 502 |             # Use the temp directory for output_folder
 503 |             images = convert_from_path(
 504 |                 file_path,
 505 |                 dpi=dpi,
 506 |                 first_page=first_page,
 507 |                 last_page=last_page,
 508 |                 output_folder=temp_dir
 509 |             )
 510 |             
 511 |             return images
 512 |     except Exception as e:
 513 |         logger.error(f"PDF to image conversion failed: {str(e)}")
 514 |         raise ToolError(f"Failed to convert PDF to images: {str(e)}") from e
 515 | 
 516 | def _convert_pdf_bytes_to_images(pdf_bytes, start_page=0, max_pages=0, dpi=300):
 517 |     """
 518 |     Converts pages of a PDF from bytes to PIL Image objects.
 519 |     
 520 |     Args:
 521 |         pdf_bytes: PDF content as bytes
 522 |         start_page: First page to convert (0-indexed)
 523 |         max_pages: Maximum number of pages to convert (0 = all)
 524 |         dpi: DPI for rendering (default: 300)
 525 |         
 526 |     Returns:
 527 |         List of PIL Image objects
 528 |     """
 529 |     if not HAS_PDF2IMAGE:
 530 |         raise ToolError("pdf2image is required for PDF to image conversion")
 531 |     
 532 |     try:
 533 |         # Create a temporary directory to store intermediate images
 534 |         # This helps with memory management for large PDFs
 535 |         with tempfile.TemporaryDirectory() as temp_dir:
 536 |             # pdf2image uses 1-based indexing
 537 |             first_page = start_page + 1
 538 |             last_page = None if max_pages == 0 else first_page + max_pages - 1
 539 |             
 540 |             # Use the temp directory for output_folder
 541 |             images = convert_from_bytes(
 542 |                 pdf_bytes,
 543 |                 dpi=dpi,
 544 |                 first_page=first_page,
 545 |                 last_page=last_page,
 546 |                 output_folder=temp_dir
 547 |             )
 548 |             
 549 |             return images
 550 |     except Exception as e:
 551 |         logger.error(f"PDF bytes to image conversion failed: {str(e)}")
 552 |         raise ToolError(f"Failed to convert PDF bytes to images: {str(e)}") from e
 553 | 
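# Sketch combining the conversion and OCR helpers above (hypothetical file
# path; requires pdf2image and pytesseract to be installed):
#
#     pages = _convert_pdf_to_images("document.pdf", start_page=0, max_pages=3, dpi=300)
#     page_texts = [_extract_text_with_ocr(p, ocr_language="eng") for p in pages]
#     full_text = "\n\n".join(page_texts)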
 554 | def _generate_cache_key(data, prefix="ocr"):
 555 |     """Generate a cache key for the given data."""
 556 |     if isinstance(data, str) and os.path.exists(data):
 557 |         # For file paths, use mtime and size
 558 |         stat = os.stat(data)
 559 |         key_data = f"{data}:{stat.st_mtime}:{stat.st_size}"
 560 |     elif isinstance(data, Image.Image):
 561 |         # For PIL images, convert to bytes and hash
 562 |         img_bytes = io.BytesIO()
 563 |         data.save(img_bytes, format=data.format or 'PNG')
 564 |         key_data = img_bytes.getvalue()
 565 |     elif isinstance(data, dict):
 566 |         # For dictionaries, convert to JSON
 567 |         key_data = json.dumps(data, sort_keys=True)
 568 |     else:
 569 |         # For other data, use string representation
 570 |         key_data = str(data)
 571 |     
 572 |     # Generate hash
 573 |     h = hashlib.md5(key_data.encode() if isinstance(key_data, str) else key_data)
 574 |     
 575 |     # Add a UUID component for uniqueness across process restarts
 576 |     unique_id = str(uuid.uuid4())[:8]
 577 |     
 578 |     return f"{prefix}_{h.hexdigest()}_{unique_id}"
 579 | 
 580 | def _split_text_into_chunks(text, max_chunk_size=8000, overlap=200):
 581 |     """
 582 |     Splits text into chunks of specified maximum size with overlap.
 583 |     
 584 |     Args:
 585 |         text: Text to split
 586 |         max_chunk_size: Maximum chunk size in characters
 587 |         overlap: Overlap between chunks in characters
 588 |         
 589 |     Returns:
 590 |         List of text chunks
 591 |     """
 592 |     if not text:
 593 |         return []
 594 |     
 595 |     # Ensure reasonable values
 596 |     max_chunk_size = max(1000, min(max_chunk_size, 15000))
 597 |     overlap = max(50, min(overlap, max_chunk_size // 4))
 598 |     
 599 |     # Split by paragraphs first
 600 |     paragraphs = re.split(r'\n\s*\n', text)
 601 |     
 602 |     chunks = []
 603 |     current_chunk = []
 604 |     current_length = 0
 605 |     
 606 |     for paragraph in paragraphs:
 607 |         para_length = len(paragraph)
 608 |         
 609 |         if current_length + para_length <= max_chunk_size:
 610 |             # Paragraph fits in current chunk
 611 |             current_chunk.append(paragraph)
 612 |             current_length += para_length + 2  # +2 for the newlines
 613 |         else:
 614 |             # Paragraph doesn't fit
 615 |             if current_chunk:
 616 |                 # Save current chunk
 617 |                 chunks.append("\n\n".join(current_chunk))
 618 |             
 619 |             if para_length <= max_chunk_size:
 620 |                 # Start new chunk with this paragraph
 621 |                 current_chunk = [paragraph]
 622 |                 current_length = para_length + 2
 623 |             else:
 624 |                 # Paragraph too large, split into sentences
 625 |                 sentences = re.split(r'(?<=[.!?])\s+', paragraph)
 626 |                 current_chunk = []
 627 |                 current_length = 0
 628 |                 
 629 |                 for sentence in sentences:
 630 |                     sentence_length = len(sentence)
 631 |                     
 632 |                     if current_length + sentence_length <= max_chunk_size:
 633 |                         # Sentence fits in current chunk
 634 |                         current_chunk.append(sentence)
 635 |                         current_length += sentence_length + 1  # +1 for the space
 636 |                     else:
 637 |                         # Sentence doesn't fit
 638 |                         if current_chunk:
 639 |                             # Save current chunk
 640 |                             chunks.append(" ".join(current_chunk))
 641 |                         
 642 |                         if sentence_length <= max_chunk_size:
 643 |                             # Start new chunk with this sentence
 644 |                             current_chunk = [sentence]
 645 |                             current_length = sentence_length + 1
 646 |                         else:
 647 |                             # Sentence too large, split by words
 648 |                             words = sentence.split()
 649 |                             current_chunk = []
 650 |                             current_length = 0
 651 |                             current_part = []
 652 |                             part_length = 0
 653 |                             
 654 |                             for word in words:
 655 |                                 word_length = len(word)
 656 |                                 
 657 |                                 if part_length + word_length + 1 <= max_chunk_size:
 658 |                                     current_part.append(word)
 659 |                                     part_length += word_length + 1  # +1 for the space
 660 |                                 else:
 661 |                                     if current_part:
 662 |                                         chunks.append(" ".join(current_part))
 663 |                                     current_part = [word]
 664 |                                     part_length = word_length + 1
 665 |                             
 666 |                             if current_part:
 667 |                                 current_chunk = current_part
 668 |                                 current_length = part_length
 669 |     
 670 |     # Add the last chunk if it exists
 671 |     if current_chunk:
 672 |         chunks.append("\n\n".join(current_chunk) if len(current_chunk) > 1 else current_chunk[0])
 673 |     
 674 |     # Add overlap between chunks
 675 |     result = []
 676 |     prev_end = ""
 677 |     
 678 |     for i, chunk in enumerate(chunks):
 679 |         if i > 0 and prev_end:
 680 |             # Find a good overlap point (try to break at paragraph or sentence)
 681 |             overlap_text = prev_end
 682 |             if "\n\n" in overlap_text:
 683 |                 parts = overlap_text.split("\n\n")
 684 |                 if len(parts) > 1:
 685 |                     overlap_text = parts[-1]
 686 |             
 687 |             # Prepend overlap to current chunk
 688 |             chunk = overlap_text + " " + chunk
 689 |         
 690 |         # Save end of current chunk for next iteration
 691 |         prev_end = chunk[-overlap:] if len(chunk) > overlap else chunk
 692 |         
 693 |         result.append(chunk)
 694 |     
 695 |     return result
 696 | 
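# Illustrative chunking call (the input text is assumed to be a multi-page
# OCR dump; the sizes shown are the defaults):
#
#     chunks = _split_text_into_chunks(full_text, max_chunk_size=8000, overlap=200)
#     # Each chunk is at most ~8000 characters and repeats roughly the last
#     # 200 characters of the previous chunk so context is carried across
#     # per-chunk LLM calls.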
 697 | def _detect_tables(image: Image.Image) -> List[Tuple[int, int, int, int]]:
 698 |     """
 699 |     Detects potential tables in an image.
 700 |     
 701 |     Args:
 702 |         image: PIL Image object
 703 |         
 704 |     Returns:
 705 |         List of detected table regions as (x, y, width, height) tuples
 706 |     """
 707 |     if not HAS_CV2 or not HAS_NUMPY:
 708 |         return []
 709 |     
 710 |     # Convert PIL Image to OpenCV format
 711 |     img = np.array(image)
 712 |     if len(img.shape) == 3 and img.shape[2] == 3:
 713 |         gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
 714 |     else:
 715 |         gray = img
 716 |     
 717 |     # Apply thresholding and morphological operations
 718 |     _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
 719 |     
 720 |     # Create a kernel for dilation
 721 |     kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
 722 |     dilated = cv2.dilate(thresh, kernel, iterations=5)
 723 |     
 724 |     # Find contours
 725 |     contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
 726 |     
 727 |     # Filter contours to find potential tables
 728 |     table_regions = []
 729 |     for contour in contours:
 730 |         x, y, w, h = cv2.boundingRect(contour)
 731 |         
 732 |         # Tables usually have a certain aspect ratio and size
 733 |         aspect_ratio = w / h
 734 |         area = w * h
 735 |         img_area = img.shape[0] * img.shape[1]
 736 |         
 737 |         if 0.5 <= aspect_ratio <= 3.0 and area > img_area * 0.05:
 738 |             table_regions.append((x, y, w, h))
 739 |     
 740 |     return table_regions
 741 | 
 742 | def _crop_image(image: Image.Image, region: Tuple[int, int, int, int]) -> Image.Image:
 743 |     """
 744 |     Crops an image to the specified region.
 745 |     
 746 |     Args:
 747 |         image: PIL Image object
 748 |         region: Tuple of (x, y, width, height)
 749 |         
 750 |     Returns:
 751 |         Cropped PIL Image object
 752 |     """
 753 |     x, y, width, height = region
 754 |     return image.crop((x, y, x + width, y + height))
 755 | 
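# Sketch chaining the two helpers above to isolate candidate tables from a
# page image (requires opencv-python and numpy; `page_image` is assumed to
# be a PIL Image of a rendered PDF page):
#
#     regions = _detect_tables(page_image)
#     table_images = [_crop_image(page_image, region) for region in regions]
#     table_texts = [_extract_text_with_ocr(img, ocr_language="eng") for img in table_images]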
 756 | def _is_text_mostly_noise(text, noise_threshold=0.3):
 757 |     """Determine if extracted text is mostly noise based on character distribution."""
 758 |     if not text or len(text) < 10:
 759 |         return False
 760 |     
 761 |     # Calculate the ratio of non-alphanumeric and non-punctuation characters
 762 |     total_chars = len(text)
 763 |     valid_chars = sum(1 for c in text if c.isalnum() or c.isspace() or c in '.,;:!?"-\'()[]{}')
 764 |     
 765 |     noise_ratio = 1 - (valid_chars / total_chars)
 766 |     return noise_ratio > noise_threshold
 767 | 
 768 | def _is_likely_header_or_footer(text, line_length_threshold=50):
 769 |     """Determine if a text line is likely a header or footer."""
 770 |     text = text.strip()
 771 |     if len(text) == 0:
 772 |         return False
 773 |         
 774 |     # Short lines with page numbers
 775 |     if len(text) < line_length_threshold and re.search(r'\b\d+\b', text):
 776 |         return True
 777 |     
 778 |     # Common header/footer patterns
 779 |     patterns = [
 780 |         r'^\d+$',  # Just a page number
 781 |         r'^Page\s+\d+(\s+of\s+\d+)?$',  # Page X of Y
 782 |         r'^[\w\s]+\s+\|\s+\d+$',  # Title | Page
 783 |         r'^\w+\s+\d{1,2},?\s+\d{4}$',  # Date format
 784 |         r'^Copyright',  # Copyright notices
 785 |         r'^\w+\s+\d{1,2}(st|nd|rd|th)?,?\s+\d{4}$',  # Date with ordinal
 786 |         r'^\d{1,2}/\d{1,2}/\d{2,4}$'  # Date in MM/DD/YY format
 787 |     ]
 788 |     
 789 |     for pattern in patterns:
 790 |         if re.search(pattern, text, re.IGNORECASE):
 791 |             return True
 792 |     
 793 |     return False
 794 | 
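# Expected behaviour of the heuristic above on a few representative lines:
#
#     _is_likely_header_or_footer("Page 3 of 12")                # True  (short line with a page number)
#     _is_likely_header_or_footer("Copyright Acme Corporation")  # True  (matches the Copyright pattern)
#     _is_likely_header_or_footer("Quarterly revenue rose compared with the prior year.")  # False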
 795 | def _remove_headers_and_footers(text, max_line_length=70):
 796 |     """
 797 |     Removes headers and footers from text.
 798 |     
 799 |     Args:
 800 |         text: Text to process
 801 |         max_line_length: Maximum length for a line to be considered a header/footer
 802 |         
 803 |     Returns:
 804 |         Text with headers and footers removed
 805 |     """
 806 |     if not text:
 807 |         return text
 808 |     
 809 |     # Split text into lines
 810 |     lines = text.splitlines()
 811 |     result = []
 812 |     
 813 |     for _i, line in enumerate(lines):
 814 |         # Skip empty lines
 815 |         if not line.strip():
 816 |             result.append(line)
 817 |             continue
 818 |         
 819 |         # Check if line is likely a header or footer
 820 |         if len(line.strip()) <= max_line_length and _is_likely_header_or_footer(line):
 821 |             # Replace with empty line to maintain spacing
 822 |             result.append("")
 823 |             continue
 824 |         
 825 |         result.append(line)
 826 |     
 827 |     # Join lines back together
 828 |     return "\n".join(result)
 829 | 
 830 | async def _process_text_chunk(chunk: str, reformat_as_markdown: bool = False, remove_headers: bool = False) -> str:
 831 |     """
 832 |     Processes a chunk of OCR text with LLM enhancement.
 833 |     
 834 |     Args:
 835 |         chunk: Text chunk to process
 836 |         reformat_as_markdown: Whether to format as markdown
 837 |         remove_headers: Whether to remove headers and footers
 838 |         
 839 |     Returns:
 840 |         Enhanced text chunk
 841 |     """
 842 |     if not chunk.strip():
 843 |         return ""
 844 |     
 845 |     # First apply simple rule-based fixes
 846 |     cleaned_text = chunk
 847 |     
 848 |     # Fix hyphenated words at line breaks
 849 |     cleaned_text = re.sub(r'(\w+)-\s*\n\s*(\w+)', lambda m: f"{m.group(1)}{m.group(2)}", cleaned_text)
 850 |     
 851 |     # Remove obvious noise
 852 |     if _is_text_mostly_noise(cleaned_text):
 853 |         logger.warning("Text chunk appears to be mostly noise, applying aggressive cleaning")
 854 |         # Replace unusual characters with spaces
 855 |         cleaned_text = re.sub(r'[^\w\s.,;:!?"\'\(\)\[\]\{\}-]', ' ', cleaned_text)
 856 |         # Normalize spaces
 857 |         cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
 858 |     
 859 |     # Remove headers and footers if requested
 860 |     if remove_headers:
 861 |         cleaned_text = _remove_headers_and_footers(cleaned_text)
 862 |     
 863 |     # Prepare LLM enhancement prompt
 864 |     if reformat_as_markdown:
 865 |         prompt = f"""Correct OCR errors in this text and format it as markdown. Follow these instructions:
 866 | 
 867 | 1. Fix OCR-induced errors:
 868 |    - Correct words split across line breaks (e.g., "cor- rect" → "correct")
 869 |    - Fix typos like 'rn' misread as 'm', '0' misread as 'O', etc.
 870 |    - Merge split paragraphs but preserve intentional paragraph breaks
 871 |    - Use context and common sense to correct errors
 872 | 
 873 | 2. Format as markdown:
 874 |    - Convert headings to markdown headings (# for main title, ## for subtitles, etc.)
 875 |    - Format lists as proper markdown lists
 876 |    - Use emphasis (*italic*) and strong (**bold**) where appropriate
 877 |    - Create tables using markdown syntax if tabular data is detected
 878 |    - For code or equations, use appropriate markdown formatting
 879 | 
 880 | 3. Clean up formatting:
 881 |    - Remove unnecessary line breaks within paragraphs
 882 |    - Preserve paragraph structure
 883 |    - Remove duplicated text
 884 |    - {"Remove headers, footers, and page numbers" if remove_headers else "Preserve all content including headers/footers"}
 885 | 
 886 | 4. Preserve the original content's meaning and information.
 887 | 
 888 | Here is the text to correct and format:
 889 | 
 890 | ```
 891 | {cleaned_text}
 892 | ```
 893 | 
 894 | Provide ONLY the corrected markdown text with no explanations or comments.
 895 | """
 896 |     else:
 897 |         prompt = f"""Correct OCR errors in this text. Follow these instructions:
 898 | 
 899 | 1. Fix OCR-induced errors:
 900 |    - Correct words split across line breaks (e.g., "cor- rect" → "correct")
 901 |    - Fix typos like 'rn' misread as 'm', '0' misread as 'O', etc.
 902 |    - Merge split paragraphs but preserve intentional paragraph breaks
 903 |    - Use context and common sense to correct errors
 904 | 
 905 | 2. Clean up formatting:
 906 |    - Remove unnecessary line breaks within paragraphs
 907 |    - Preserve paragraph structure
 908 |    - Remove duplicated text
 909 |    - {"Remove headers, footers, and page numbers" if remove_headers else "Preserve all content including headers/footers"}
 910 | 
 911 | 3. Preserve the original content's meaning and information.
 912 | 
 913 | Here is the text to correct:
 914 | 
 915 | ```
 916 | {cleaned_text}
 917 | ```
 918 | 
 919 | Provide ONLY the corrected text with no explanations or comments.
 920 | """
 921 |     
 922 |     try:
 923 |         # Use generate_completion to process the text
 924 |         task_type = TaskType.TEXT_ENHANCEMENT.value
 925 |         
 926 |         result = await generate_completion(
 927 |             prompt=prompt,
 928 |             provider=Provider.ANTHROPIC.value,  # Default to Anthropic for high-quality text processing
 929 |             temperature=0.2,  # Low temperature for consistent results
 930 |             max_tokens=len(cleaned_text) + 1000,  # Allow some expansion for formatting
 931 |             task_type=task_type
 932 |         )
 933 |         
 934 |         if not result or not result.get("text"):
 935 |             logger.warning("LLM text enhancement returned empty result")
 936 |             return cleaned_text
 937 |         
 938 |         enhanced_text = result["text"]
 939 |         
 940 |         # Remove any "Here is the corrected..." prefixes that LLMs sometimes add
 941 |         enhanced_text = re.sub(r'^(Here is|The corrected|Here\'s)[^:]*:?\s*', '', enhanced_text, flags=re.IGNORECASE)
 942 |         
 943 |         return enhanced_text
 944 |     except ProviderError as e:
 945 |         logger.error(f"Provider error during text enhancement: {str(e)}")
 946 |         # Fall back to the cleaned text
 947 |         return cleaned_text
 948 |     except Exception as e:
 949 |         logger.error(f"Error during LLM text enhancement: {str(e)}")
 950 |         # Fall back to the cleaned text
 951 |         return cleaned_text
 952 | 
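# Sketch of how the chunk enhancer above is typically driven (this mirrors
# the asyncio.gather call used inside extract_text_from_pdf below;
# `raw_chunks` is assumed to come from _split_text_into_chunks):
#
#     enhanced_chunks = await asyncio.gather(*[
#         _process_text_chunk(chunk, reformat_as_markdown=True, remove_headers=True)
#         for chunk in raw_chunks
#     ])
#     enhanced_text = "\n\n".join(enhanced_chunks)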
 953 | # --- Main OCR tool functions ---
 954 | 
 955 | @with_cache(ttl=24 * 60 * 60) # Cache for 24 hours
 956 | @with_tool_metrics
 957 | @with_retry(max_retries=3, retry_delay=1)
 958 | @with_error_handling
 959 | async def extract_text_from_pdf(
 960 |     file_path: str,
 961 |     extraction_method: str = "hybrid",
 962 |     max_pages: int = 0,
 963 |     skip_pages: int = 0,
 964 |     preprocessing_options: Optional[Dict[str, Any]] = None,
 965 |     ocr_language: str = "eng",
 966 |     reformat_as_markdown: bool = False,
 967 |     suppress_headers: bool = False,
 968 |     assess_quality: bool = False,
 969 |     dpi: int = 300
 970 | ) -> Dict[str, Any]:
 971 |     """
 972 |     Extracts and enhances text from a PDF document.
 973 |     
 974 |     This tool can use multiple extraction methods: direct text extraction from the PDF,
 975 |     OCR-based extraction, or a hybrid approach that uses direct extraction when possible
 976 |     and falls back to OCR when necessary. The extracted text is then enhanced using an 
 977 |     LLM to correct OCR errors and optionally format the output as markdown.
 978 |     
 979 |     Args:
 980 |         file_path: Path to the PDF file
 981 |         extraction_method: Method to use for text extraction:
 982 |             - "direct": Extract text directly from the PDF (fastest, but may fail for scanned PDFs)
 983 |             - "ocr": Always use OCR (slower but works for scanned PDFs)
 984 |             - "hybrid": Try direct extraction first, fall back to OCR if needed (default)
 985 |         max_pages: Maximum number of pages to process (0 = all pages)
 986 |         skip_pages: Number of pages to skip from the beginning (0-indexed)
 987 |         preprocessing_options: Dictionary of options for image preprocessing:
 988 |             - denoise: Whether to apply denoising (default: True)
 989 |             - threshold: Thresholding method ('otsu', 'adaptive', 'none') (default: 'otsu')
 990 |             - deskew: Whether to deskew the image (default: True)
 991 |             - enhance_contrast: Whether to enhance contrast (default: True)
 992 |             - resize_factor: Factor to resize the image (default: 1.0)
 993 |         ocr_language: Language(s) for OCR, e.g., "eng" or "eng+fra" (default: "eng")
 994 |         reformat_as_markdown: Whether to format the output as markdown (default: False)
 995 |         suppress_headers: Whether to remove headers, footers, and page numbers (default: False)
 996 |         assess_quality: Whether to assess the quality of the OCR improvement (default: False)
 997 |         dpi: DPI for PDF rendering when using OCR (default: 300)
 998 |     
 999 |     Returns:
1000 |         A dictionary containing:
1001 |         {
1002 |             "success": true,
1003 |             "text": "The extracted and enhanced text...",
1004 |             "raw_text": "The original OCR text before enhancement...",
1005 |             "pages_processed": 5,
1006 |             "extraction_method_used": "hybrid",
1007 |             "file_path": "/path/to/document.pdf",
1008 |             "quality_metrics": {  # Only if assess_quality=True
1009 |                 "score": 85,
1010 |                 "explanation": "Explanation of quality score..."
1011 |             },
1012 |             "processing_time": 12.34  # Seconds
1013 |         }
1014 |     
1015 |     Raises:
1016 |         ToolInputError: If the file path is invalid or the file is not a PDF
1017 |         ToolError: If text extraction fails
1018 |     """
1019 |     start_time = time.time()
1020 |     
1021 |     # Validate file path
1022 |     _validate_file_path(file_path, expected_extension=".pdf")
1023 |     
1024 |     # Check extraction method
1025 |     valid_methods = ["direct", "ocr", "hybrid"]
1026 |     if extraction_method not in valid_methods:
1027 |         raise ToolInputError(
1028 |             f"Invalid extraction method: '{extraction_method}'. Must be one of: {', '.join(valid_methods)}"
1029 |         )
1030 |     
1031 |     # Check dependencies based on extraction method
1032 |     if extraction_method in ["ocr", "hybrid"]:
1033 |         if not HAS_PDF2IMAGE or not HAS_PYTESSERACT:
 1034 |             logger.warning(f"OCR extraction requires pdf2image and pytesseract; the '{extraction_method}' method may fail.")
1035 |     
1036 |     if extraction_method in ["direct", "hybrid"]:
1037 |         if not HAS_PDFPLUMBER and not HAS_PYMUPDF:
1038 |             logger.warning("Direct extraction requires pdfplumber or PyMuPDF.")
1039 |     
1040 |     # Initialize result
1041 |     result = {
1042 |         "success": False,
1043 |         "file_path": file_path,
1044 |         "pages_processed": 0,
1045 |         "extraction_method_used": extraction_method
1046 |     }
1047 |     
1048 |     method_used = extraction_method
1049 |     raw_text_list = []
1050 |     extracted_text_list = []
1051 |     has_direct_text = False
1052 |     
1053 |     try:
1054 |         # Step 1: Extract text
1055 |         if extraction_method in ["direct", "hybrid"]:
1056 |             try:
1057 |                 logger.info(f"Attempting direct text extraction from PDF: {file_path}")
1058 |                 direct_text_list, has_direct_text = _extract_text_from_pdf_direct(
1059 |                     file_path,
1060 |                     start_page=skip_pages,
1061 |                     max_pages=max_pages
1062 |                 )
1063 |                 
1064 |                 raw_text_list = direct_text_list
1065 |                 logger.info(f"Direct text extraction {'succeeded' if has_direct_text else 'failed'}")
1066 |                 
1067 |                 if has_direct_text and extraction_method == "direct":
1068 |                     # If direct extraction found text and that's the requested method, we're done
1069 |                     method_used = "direct"
1070 |                     extracted_text_list = direct_text_list
1071 |                     logger.info(f"Using direct extraction result with {len(extracted_text_list)} pages")
1072 |                 
1073 |                 elif has_direct_text and extraction_method == "hybrid":
1074 |                     # If hybrid mode and direct extraction worked, use it
1075 |                     method_used = "direct"
1076 |                     extracted_text_list = direct_text_list
1077 |                     logger.info(f"Using direct extraction result with {len(extracted_text_list)} pages (hybrid mode)")
1078 |                 
1079 |                 elif extraction_method == "direct" and not has_direct_text:
1080 |                     # If direct mode but no text found, we fail
1081 |                     raise ToolError("Direct text extraction failed to find text in the PDF")
1082 |                 
1083 |                 # If hybrid mode and no text found, fall back to OCR
1084 |                 if extraction_method == "hybrid" and not has_direct_text:
1085 |                     logger.info("No text found via direct extraction, falling back to OCR (hybrid mode)")
1086 |                     method_used = "ocr"
1087 |                     # Continue to OCR extraction below
1088 |             
1089 |             except Exception as e:
1090 |                 logger.error(f"Direct text extraction failed: {str(e)}")
1091 |                 if extraction_method == "direct":
1092 |                     raise ToolError(f"Direct text extraction failed: {str(e)}") from e
1093 |                 
1094 |                 logger.info("Falling back to OCR extraction")
1095 |                 method_used = "ocr"
1096 |         
1097 |         # Step 2: OCR extraction if needed
1098 |         if method_used == "ocr" or extraction_method == "ocr":
1099 |             method_used = "ocr"
1100 |             logger.info(f"Performing OCR-based text extraction on PDF: {file_path}")
1101 |             
1102 |             # Convert PDF to images
1103 |             images = _convert_pdf_to_images(
1104 |                 file_path,
1105 |                 start_page=skip_pages,
1106 |                 max_pages=max_pages,
1107 |                 dpi=dpi
1108 |             )
1109 |             
1110 |             # Extract text using OCR
1111 |             raw_text_list = []
1112 |             with ThreadPoolExecutor() as executor:
1113 |                 # Preprocess images in parallel
1114 |                 preprocessed_images = list(executor.map(
1115 |                     lambda img: _preprocess_image(img, preprocessing_options),
1116 |                     images
1117 |                 ))
1118 |                 
1119 |                 # Extract text in parallel
1120 |                 ocr_config = ""
1121 |                 ocr_results = list(executor.map(
1122 |                     lambda img: _extract_text_with_ocr(img, ocr_language, ocr_config),
1123 |                     preprocessed_images
1124 |                 ))
1125 |             
1126 |             extracted_text_list = ocr_results
1127 |             raw_text_list = ocr_results
1128 |             logger.info(f"OCR extraction completed for {len(extracted_text_list)} pages")
1129 |         
1130 |         # Step 3: Process extracted text
1131 |         logger.info("Processing extracted text with LLM enhancement")
1132 |         
1133 |         # Combine text from pages
1134 |         full_raw_text = "\n\n".join(raw_text_list)
1135 |         
1136 |         # Split into chunks for LLM processing
1137 |         chunks = _split_text_into_chunks(full_raw_text)
1138 |         logger.info(f"Text split into {len(chunks)} chunks for LLM processing")
1139 |         
1140 |         # Process chunks in parallel
1141 |         enhanced_chunks = await asyncio.gather(*[
1142 |             _process_text_chunk(chunk, reformat_as_markdown, suppress_headers)
1143 |             for chunk in chunks
1144 |         ])
1145 |         
1146 |         # Combine chunks
1147 |         enhanced_text = "\n\n".join(enhanced_chunks)
1148 |         
1149 |         # Step 4: Assess quality if requested
1150 |         quality_metrics = None
1151 |         if assess_quality:
1152 |             logger.info("Assessing quality of text enhancement")
1153 |             quality_metrics = await _assess_text_quality(full_raw_text, enhanced_text)
1154 |         
1155 |         # Prepare final result
1156 |         processing_time = time.time() - start_time
1157 |         result.update({
1158 |             "success": True,
1159 |             "text": enhanced_text,
1160 |             "raw_text": full_raw_text,
1161 |             "pages_processed": len(raw_text_list),
1162 |             "extraction_method_used": method_used,
1163 |             "processing_time": processing_time
1164 |         })
1165 |         
1166 |         if quality_metrics:
1167 |             result["quality_metrics"] = quality_metrics
1168 |         
1169 |         logger.info(f"Text extraction and enhancement completed successfully in {processing_time:.2f}s")
1170 |         return result
1171 |     
1172 |     except Exception as e:
1173 |         logger.error(f"Error in extract_text_from_pdf: {str(e)}")
1174 |         logger.error(traceback.format_exc())
1175 |         raise ToolError(f"Failed to extract and enhance text from PDF: {str(e)}") from e
1176 | 
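# --- Illustrative usage sketch ---
# A minimal example of calling extract_text_from_pdf from async code; the file path and
# option values are placeholder assumptions, and the result keys follow the docstring above.
async def _example_extract_text_from_pdf() -> str:
    result = await extract_text_from_pdf(
        file_path="/path/to/document.pdf",  # hypothetical input file
        extraction_method="hybrid",
        reformat_as_markdown=True,
        assess_quality=True,
    )
    logger.info(
        f"Extracted {result['pages_processed']} pages via {result['extraction_method_used']} "
        f"in {result['processing_time']:.2f}s"
    )
    return result["text"]
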
1177 | @with_cache(ttl=24 * 60 * 60) # Cache for 24 hours
1178 | @with_tool_metrics
1179 | @with_retry(max_retries=3, retry_delay=1)
1180 | @with_error_handling
1181 | async def extract_text_from_pdf_bytes(
1182 |     pdf_bytes: bytes,
1183 |     extraction_method: str = "hybrid",
1184 |     max_pages: int = 0,
1185 |     skip_pages: int = 0,
1186 |     preprocessing_options: Optional[Dict[str, Any]] = None,
1187 |     ocr_language: str = "eng",
1188 |     reformat_as_markdown: bool = False,
1189 |     suppress_headers: bool = False,
1190 |     assess_quality: bool = False,
1191 |     dpi: int = 300
1192 | ) -> Dict[str, Any]:
1193 |     """
1194 |     Extracts and enhances text from PDF bytes data.
1195 |     
1196 |     This tool works like extract_text_from_pdf but accepts PDF data as bytes instead of a file path.
1197 |     It can use multiple extraction methods and enhance the extracted text using an LLM.
1198 |     
1199 |     Args:
1200 |         pdf_bytes: PDF content as bytes
1201 |         extraction_method: Method to use for text extraction:
1202 |             - "direct": Extract text directly from the PDF (fastest, but may fail for scanned PDFs)
1203 |             - "ocr": Always use OCR (slower but works for scanned PDFs)
1204 |             - "hybrid": Try direct extraction first, fall back to OCR if needed (default)
1205 |         max_pages: Maximum number of pages to process (0 = all pages)
1206 |         skip_pages: Number of pages to skip from the beginning of the document (default: 0)
1207 |         preprocessing_options: Dictionary of options for image preprocessing
1208 |         ocr_language: Language(s) for OCR, e.g., "eng" or "eng+fra" (default: "eng")
1209 |         reformat_as_markdown: Whether to format the output as markdown (default: False)
1210 |         suppress_headers: Whether to remove headers, footers, and page numbers (default: False)
1211 |         assess_quality: Whether to assess the quality of the OCR improvement (default: False)
1212 |         dpi: DPI for PDF rendering when using OCR (default: 300)
1213 |     
1214 |     Returns:
1215 |         A dictionary with the extracted and enhanced text, same format as extract_text_from_pdf
1216 |     
1217 |     Raises:
1218 |         ToolInputError: If the PDF bytes are invalid
1219 |         ToolError: If text extraction fails
1220 |     """
1221 |     start_time = time.time()
1222 |     
1223 |     # Validate input
1224 |     if not pdf_bytes:
1225 |         raise ToolInputError("PDF bytes cannot be empty")
1226 |     
1227 |     # Check extraction method
1228 |     valid_methods = ["direct", "ocr", "hybrid"]
1229 |     if extraction_method not in valid_methods:
1230 |         raise ToolInputError(
1231 |             f"Invalid extraction method: '{extraction_method}'. Must be one of: {', '.join(valid_methods)}"
1232 |         )
1233 |     
1234 |     # Check dependencies based on extraction method
1235 |     if extraction_method in ["ocr", "hybrid"]:
1236 |         if not HAS_PDF2IMAGE or not HAS_PYTESSERACT:
1237 |             logger.warning(f"OCR extraction requires pdf2image and pytesseract; '{extraction_method}' extraction may fail.")
1238 |     
1239 |     if extraction_method in ["direct", "hybrid"]:
1240 |         if not HAS_PDFPLUMBER and not HAS_PYMUPDF:
1241 |             logger.warning("Direct extraction requires pdfplumber or PyMuPDF.")
1242 |     
1243 |     # Initialize result
1244 |     result = {
1245 |         "success": False,
1246 |         "pages_processed": 0,
1247 |         "extraction_method_used": extraction_method
1248 |     }
1249 |     
1250 |     method_used = extraction_method
1251 |     raw_text_list = []
1252 |     extracted_text_list = []
1253 |     has_direct_text = False
1254 |     
1255 |     try:
1256 |         # Create a temporary file for processing
1257 |         with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
1258 |             temp_path = temp_pdf.name
1259 |             temp_pdf.write(pdf_bytes)
1260 |             temp_pdf.flush()
1261 |         
1262 |         try:
1263 |             # Step 1: Extract text
1264 |             if extraction_method in ["direct", "hybrid"]:
1265 |                 try:
1266 |                     logger.info("Attempting direct text extraction from PDF bytes")
1267 |                     direct_text_list, has_direct_text = _extract_text_from_pdf_direct(
1268 |                         temp_path,
1269 |                         start_page=skip_pages,
1270 |                         max_pages=max_pages
1271 |                     )
1272 |                     
1273 |                     raw_text_list = direct_text_list
1274 |                     logger.info(f"Direct text extraction {'succeeded' if has_direct_text else 'failed'}")
1275 |                     
1276 |                     if has_direct_text and extraction_method == "direct":
1277 |                         method_used = "direct"
1278 |                         extracted_text_list = direct_text_list
1279 |                         logger.info(f"Using direct extraction result with {len(extracted_text_list)} pages")
1280 |                     
1281 |                     elif has_direct_text and extraction_method == "hybrid":
1282 |                         method_used = "direct"
1283 |                         extracted_text_list = direct_text_list
1284 |                         logger.info(f"Using direct extraction result with {len(extracted_text_list)} pages (hybrid mode)")
1285 |                     
1286 |                     elif extraction_method == "direct" and not has_direct_text:
1287 |                         raise ToolError("Direct text extraction failed to find text in the PDF")
1288 |                     
1289 |                     if extraction_method == "hybrid" and not has_direct_text:
1290 |                         logger.info("No text found via direct extraction, falling back to OCR (hybrid mode)")
1291 |                         method_used = "ocr"
1292 |                 
1293 |                 except Exception as e:
1294 |                     logger.error(f"Direct text extraction failed: {str(e)}")
1295 |                     if extraction_method == "direct":
1296 |                         raise ToolError(f"Direct text extraction failed: {str(e)}") from e
1297 |                     
1298 |                     logger.info("Falling back to OCR extraction")
1299 |                     method_used = "ocr"
1300 |             
1301 |             # Step 2: OCR extraction if needed
1302 |             if method_used == "ocr" or extraction_method == "ocr":
1303 |                 method_used = "ocr"
1304 |                 logger.info("Performing OCR-based text extraction on PDF bytes")
1305 |                 
1306 |                 # Convert PDF bytes to images
1307 |                 images = _convert_pdf_bytes_to_images(
1308 |                     pdf_bytes,
1309 |                     start_page=skip_pages,
1310 |                     max_pages=max_pages,
1311 |                     dpi=dpi
1312 |                 )
1313 |                 
1314 |                 # Extract text using OCR
1315 |                 raw_text_list = []
1316 |                 with ThreadPoolExecutor() as executor:
1317 |                     # Preprocess images in parallel
1318 |                     preprocessed_images = list(executor.map(
1319 |                         lambda img: _preprocess_image(img, preprocessing_options),
1320 |                         images
1321 |                     ))
1322 |                     
1323 |                     # Extract text in parallel
1324 |                     ocr_config = ""
1325 |                     ocr_results = list(executor.map(
1326 |                         lambda img: _extract_text_with_ocr(img, ocr_language, ocr_config),
1327 |                         preprocessed_images
1328 |                     ))
1329 |                 
1330 |                 extracted_text_list = ocr_results
1331 |                 raw_text_list = ocr_results
1332 |                 logger.info(f"OCR extraction completed for {len(extracted_text_list)} pages")
1333 |             
1334 |             # Step 3: Process extracted text
1335 |             logger.info("Processing extracted text with LLM enhancement")
1336 |             
1337 |             # Combine text from pages
1338 |             full_raw_text = "\n\n".join(raw_text_list)
1339 |             
1340 |             # Split into chunks for LLM processing
1341 |             chunks = _split_text_into_chunks(full_raw_text)
1342 |             logger.info(f"Text split into {len(chunks)} chunks for LLM processing")
1343 |             
1344 |             # Process chunks in parallel
1345 |             enhanced_chunks = await asyncio.gather(*[
1346 |                 _process_text_chunk(chunk, reformat_as_markdown, suppress_headers)
1347 |                 for chunk in chunks
1348 |             ])
1349 |             
1350 |             # Combine chunks
1351 |             enhanced_text = "\n\n".join(enhanced_chunks)
1352 |             
1353 |             # Step 4: Assess quality if requested
1354 |             quality_metrics = None
1355 |             if assess_quality:
1356 |                 logger.info("Assessing quality of text enhancement")
1357 |                 quality_metrics = await _assess_text_quality(full_raw_text, enhanced_text)
1358 |             
1359 |             # Prepare final result
1360 |             processing_time = time.time() - start_time
1361 |             result.update({
1362 |                 "success": True,
1363 |                 "text": enhanced_text,
1364 |                 "raw_text": full_raw_text,
1365 |                 "pages_processed": len(raw_text_list),
1366 |                 "extraction_method_used": method_used,
1367 |                 "processing_time": processing_time
1368 |             })
1369 |             
1370 |             if quality_metrics:
1371 |                 result["quality_metrics"] = quality_metrics
1372 |             
1373 |             logger.info(f"Text extraction and enhancement completed successfully in {processing_time:.2f}s")
1374 |             return result
1375 |         
1376 |         finally:
1377 |             # Clean up temporary file
1378 |             try:
1379 |                 os.unlink(temp_path)
1380 |             except Exception as e:
1381 |                 logger.warning(f"Failed to remove temporary file: {str(e)}")
1382 |     
1383 |     except Exception as e:
1384 |         logger.error(f"Error in extract_text_from_pdf_bytes: {str(e)}")
1385 |         logger.error(traceback.format_exc())
1386 |         raise ToolError(f"Failed to extract and enhance text from PDF bytes: {str(e)}") from e
1387 | 
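# --- Illustrative usage sketch ---
# A minimal example of passing in-memory PDF content (e.g. an uploaded file) to
# extract_text_from_pdf_bytes; the path below is a placeholder assumption.
async def _example_extract_text_from_pdf_bytes() -> Dict[str, Any]:
    with open("/path/to/scanned.pdf", "rb") as f:  # hypothetical source of the bytes
        pdf_bytes = f.read()
    return await extract_text_from_pdf_bytes(
        pdf_bytes=pdf_bytes,
        extraction_method="ocr",       # force OCR, e.g. for scanned documents
        ocr_language="eng+fra",        # multiple OCR languages, as in the docstring
        reformat_as_markdown=True,
    )
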
1388 | @with_cache(ttl=24 * 60 * 60) # Cache for 24 hours
1389 | @with_tool_metrics
1390 | @with_retry(max_retries=2, retry_delay=1)
1391 | @with_error_handling
1392 | async def process_image_ocr(
1393 |     image_path: Optional[str] = None,
1394 |     image_data: Optional[str] = None,
1395 |     preprocessing_options: Optional[Dict[str, Any]] = None,
1396 |     ocr_language: str = "eng",
1397 |     reformat_as_markdown: bool = False,
1398 |     assess_quality: bool = False
1399 | ) -> Dict[str, Any]:
1400 |     """
1401 |     Processes an image with OCR and enhances the extracted text.
1402 |     
1403 |     This tool accepts either a path to an image file or base64-encoded image data,
1404 |     performs OCR on the image, and then enhances the extracted text using an LLM.
1405 |     
1406 |     Args:
1407 |         image_path: Path to the image file (mutually exclusive with image_data)
1408 |         image_data: Base64-encoded image data (mutually exclusive with image_path)
1409 |         preprocessing_options: Dictionary of options for image preprocessing:
1410 |             - denoise: Whether to apply denoising (default: True)
1411 |             - threshold: Thresholding method ('otsu', 'adaptive', 'none') (default: 'otsu')
1412 |             - deskew: Whether to deskew the image (default: True)
1413 |             - enhance_contrast: Whether to enhance contrast (default: True)
1414 |             - resize_factor: Factor to resize the image (default: 1.0)
1415 |         ocr_language: Language(s) for OCR, e.g., "eng" or "eng+fra" (default: "eng")
1416 |         reformat_as_markdown: Whether to format the output as markdown (default: False)
1417 |         assess_quality: Whether to assess the quality of the OCR improvement (default: False)
1418 |     
1419 |     Returns:
1420 |         A dictionary containing:
1421 |         {
1422 |             "success": true,
1423 |             "text": "The extracted and enhanced text...",
1424 |             "raw_text": "The original OCR text before enhancement...",
1425 |             "table_detected": false,  # Whether a table was detected in the image
1426 |             "quality_metrics": {  # Only if assess_quality=True
1427 |                 "score": 85,
1428 |                 "explanation": "Explanation of quality score..."
1429 |             },
1430 |             "processing_time": 3.45  # Seconds
1431 |         }
1432 |     
1433 |     Raises:
1434 |         ToolInputError: If input is invalid
1435 |         ToolError: If processing fails
1436 |     """
1437 |     start_time = time.time()
1438 |     
1439 |     # Check dependencies
1440 |     if not HAS_PIL or not HAS_PYTESSERACT:
1441 |         missing = []
1442 |         if not HAS_PIL: 
1443 |             missing.append("pillow")
1444 |         if not HAS_PYTESSERACT: 
1445 |             missing.append("pytesseract")
1446 |         raise ToolError(f"Required dependencies missing: {', '.join(missing)}")
1447 |     
1448 |     # Validate input
1449 |     if not image_path and not image_data:
1450 |         raise ToolInputError("Either image_path or image_data must be provided")
1451 |     
1452 |     if image_path and image_data:
1453 |         raise ToolInputError("Only one of image_path or image_data should be provided")
1454 |     
1455 |     try:
1456 |         # Load image
1457 |         if image_path:
1458 |             _validate_file_path(image_path)
1459 |             image = Image.open(image_path)
1460 |         else:
1461 |             # Decode base64 image data
1462 |             try:
1463 |                 image_bytes = base64.b64decode(image_data)
1464 |                 image = Image.open(io.BytesIO(image_bytes))
1465 |             except Exception as e:
1466 |                 raise ToolInputError(f"Invalid base64 image data: {str(e)}") from e
1467 |         
1468 |         # Preprocess image
1469 |         logger.info("Preprocessing image for OCR")
1470 |         preprocessed_image = _preprocess_image(image, preprocessing_options)
1471 |         
1472 |         # Detect tables
1473 |         table_regions = _detect_tables(preprocessed_image)
1474 |         table_detected = len(table_regions) > 0
1475 |         logger.info(f"Table detection: {len(table_regions)} potential tables found")
1476 |         
1477 |         # Extract text with OCR
1478 |         logger.info(f"Performing OCR with language(s): {ocr_language}")
1479 |         raw_text = _extract_text_with_ocr(preprocessed_image, ocr_language)
1480 |         
1481 |         # Process tables separately if detected
1482 |         table_texts = []
1483 |         if table_detected and HAS_CV2:
1484 |             logger.info("Processing detected tables separately")
1485 |             for i, region in enumerate(table_regions):
1486 |                 try:
1487 |                     table_image = _crop_image(preprocessed_image, region)
1488 |                     # Use a different preprocessing for tables (less aggressive)
1489 |                     table_options = {"denoise": True, "threshold": "adaptive", "deskew": False}
1490 |                     processed_table_image = _preprocess_image(table_image, table_options)
1491 |                     table_text = _extract_text_with_ocr(processed_table_image, ocr_language)
1492 |                     if table_text.strip():
1493 |                         table_texts.append(f"\n\nTable {i+1}:\n{table_text}\n")
1494 |                 except Exception as e:
1495 |                     logger.warning(f"Error processing table {i+1}: {str(e)}")
1496 |         
1497 |         # Include table texts with the main text
1498 |         if table_texts:
1499 |             raw_text += "\n\n" + "\n".join(table_texts)
1500 |         
1501 |         # Process with LLM
1502 |         logger.info("Processing extracted text with LLM enhancement")
1503 |         enhanced_text = await _process_text_chunk(raw_text, reformat_as_markdown, suppress_headers=False)
1504 |         
1505 |         # Assess quality if requested
1506 |         quality_metrics = None
1507 |         if assess_quality:
1508 |             logger.info("Assessing quality of text enhancement")
1509 |             quality_metrics = await _assess_text_quality(raw_text, enhanced_text)
1510 |         
1511 |         # Prepare result
1512 |         processing_time = time.time() - start_time
1513 |         result = {
1514 |             "success": True,
1515 |             "text": enhanced_text,
1516 |             "raw_text": raw_text,
1517 |             "table_detected": table_detected,
1518 |             "processing_time": processing_time
1519 |         }
1520 |         
1521 |         if quality_metrics:
1522 |             result["quality_metrics"] = quality_metrics
1523 |         
1524 |         logger.info(f"Image OCR processing completed in {processing_time:.2f}s")
1525 |         return result
1526 |     
1527 |     except Exception as e:
1528 |         logger.error(f"Error in process_image_ocr: {str(e)}")
1529 |         logger.error(traceback.format_exc())
1530 |         raise ToolError(f"Failed to process image with OCR: {str(e)}") from e
1531 | 
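# --- Illustrative usage sketch ---
# A minimal example of running process_image_ocr on base64-encoded image bytes with
# explicit preprocessing options; the option values are placeholder assumptions.
async def _example_process_image_ocr(image_bytes: bytes) -> str:
    result = await process_image_ocr(
        image_data=base64.b64encode(image_bytes).decode("utf-8"),
        preprocessing_options={"denoise": True, "threshold": "adaptive", "deskew": True},
        ocr_language="eng",
        reformat_as_markdown=True,
    )
    if result["table_detected"]:
        logger.info("At least one table region was processed separately")
    return result["text"]
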
1532 | @with_cache(ttl=24 * 60 * 60) # Cache for 24 hours
1533 | @with_tool_metrics
1534 | @with_retry(max_retries=2, retry_delay=1)
1535 | @with_error_handling
1536 | async def enhance_ocr_text(
1537 |     ocr_text: str,
1538 |     reformat_as_markdown: bool = False,
1539 |     remove_headers: bool = False,
1540 |     detect_tables: bool = True,
1541 |     assess_quality: bool = False
1542 | ) -> Dict[str, Any]:
1543 |     """
1544 |     Enhances existing OCR text using an LLM to correct errors and improve formatting.
1545 |     
1546 |     This tool takes OCR text (e.g., from a different OCR engine) and uses an LLM to
1547 |     correct errors, improve formatting, and optionally convert to markdown.
1548 |     
1549 |     Args:
1550 |         ocr_text: The OCR text to enhance
1551 |         reformat_as_markdown: Whether to format the output as markdown (default: False)
1552 |         remove_headers: Whether to remove headers, footers, and page numbers (default: False)
1553 |         detect_tables: Whether to attempt to detect and format tables (default: True)
1554 |         assess_quality: Whether to assess the quality of the OCR improvement (default: False)
1555 |     
1556 |     Returns:
1557 |         A dictionary containing:
1558 |         {
1559 |             "success": true,
1560 |             "text": "The enhanced text...",
1561 |             "raw_text": "The original OCR text...",
1562 |             "quality_metrics": {  # Only if assess_quality=True
1563 |                 "score": 85,
1564 |                 "explanation": "Explanation of quality score..."
1565 |             },
1566 |             "processing_time": 2.34  # Seconds
1567 |         }
1568 |     
1569 |     Raises:
1570 |         ToolInputError: If the OCR text is empty
1571 |         ToolError: If enhancement fails
1572 |     """
1573 |     start_time = time.time()
1574 |     
1575 |     # Validate input
1576 |     if not ocr_text or not isinstance(ocr_text, str):
1577 |         raise ToolInputError("OCR text must be a non-empty string")
1578 |     
1579 |     try:
1580 |         # Split into chunks if large
1581 |         if len(ocr_text) > 10000:
1582 |             logger.info(f"Splitting large OCR text ({len(ocr_text)} chars) into chunks")
1583 |             chunks = _split_text_into_chunks(ocr_text)
1584 |             
1585 |             # Process chunks in parallel
1586 |             enhanced_chunks = await asyncio.gather(*[
1587 |                 _process_text_chunk(chunk, reformat_as_markdown, remove_headers)
1588 |                 for chunk in chunks
1589 |             ])
1590 |             
1591 |             # Combine chunks
1592 |             enhanced_text = "\n\n".join(enhanced_chunks)
1593 |             logger.info(f"Processed {len(chunks)} text chunks")
1594 |         else:
1595 |             # Process directly if small enough
1596 |             enhanced_text = await _process_text_chunk(ocr_text, reformat_as_markdown, remove_headers)
1597 |         
1598 |         # Detect and format tables if requested
1599 |         if detect_tables and reformat_as_markdown:
1600 |             logger.info("Attempting table detection and formatting")
1601 |             enhanced_text = await _format_tables_in_text(enhanced_text)
1602 |         
1603 |         # Assess quality if requested
1604 |         quality_metrics = None
1605 |         if assess_quality:
1606 |             logger.info("Assessing quality of text enhancement")
1607 |             quality_metrics = await _assess_text_quality(ocr_text, enhanced_text)
1608 |         
1609 |         # Prepare result
1610 |         processing_time = time.time() - start_time
1611 |         result = {
1612 |             "success": True,
1613 |             "text": enhanced_text,
1614 |             "raw_text": ocr_text,
1615 |             "processing_time": processing_time
1616 |         }
1617 |         
1618 |         if quality_metrics:
1619 |             result["quality_metrics"] = quality_metrics
1620 |         
1621 |         logger.info(f"OCR text enhancement completed in {processing_time:.2f}s")
1622 |         return result
1623 |     
1624 |     except Exception as e:
1625 |         logger.error(f"Error in enhance_ocr_text: {str(e)}")
1626 |         logger.error(traceback.format_exc())
1627 |         raise ToolError(f"Failed to enhance OCR text: {str(e)}") from e
1628 | 
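# --- Illustrative usage sketch ---
# A minimal example of cleaning up OCR output produced elsewhere; the noisy sample text
# is made up to show the kind of errors the LLM pass is meant to correct.
async def _example_enhance_ocr_text() -> str:
    noisy = "Tl1e quick brow11 fox jumped ovcr the lazy d0g.\nPage 3 of 17"
    result = await enhance_ocr_text(
        ocr_text=noisy,
        reformat_as_markdown=True,
        remove_headers=True,   # drop the "Page 3 of 17" style footer
        assess_quality=True,
    )
    return result["text"]
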
1629 | @with_tool_metrics
1630 | @with_retry(max_retries=2, retry_delay=1.0)
1631 | @with_error_handling
1632 | async def analyze_pdf_structure(
1633 |     file_path: str,
1634 |     extract_metadata: bool = True,
1635 |     extract_outline: bool = True,
1636 |     extract_fonts: bool = False,
1637 |     extract_images: bool = False,
1638 |     estimate_ocr_needs: bool = True
1639 | ) -> Dict[str, Any]:
1640 |     """
1641 |     Analyzes the structure of a PDF file without performing full text extraction.
1642 |     
1643 |     This tool examines a PDF file and provides information about its structure,
1644 |     including metadata, outline (table of contents), fonts, embedded images,
1645 |     and an assessment of whether OCR would be beneficial.
1646 |     
1647 |     Args:
1648 |         file_path: Path to the PDF file
1649 |         extract_metadata: Whether to extract document metadata (default: True)
1650 |         extract_outline: Whether to extract the document outline/TOC (default: True)
1651 |         extract_fonts: Whether to extract font information (default: False)
1652 |         extract_images: Whether to extract information about embedded images (default: False)
1653 |         estimate_ocr_needs: Whether to estimate if OCR would benefit this PDF (default: True)
1654 |     
1655 |     Returns:
1656 |         A dictionary containing:
1657 |         {
1658 |             "success": true,
1659 |             "file_path": "/path/to/document.pdf",
1660 |             "page_count": 42,
1661 |             "metadata": {  # Only if extract_metadata=True
1662 |                 "title": "Document Title",
1663 |                 "author": "Author Name",
1664 |                 "subject": "Document Subject",
1665 |                 "keywords": "keyword1, keyword2",
1666 |                 "creator": "Creator Application",
1667 |                 "producer": "Producer Application",
1668 |                 "creation_date": "2023-01-01T12:00:00",
1669 |                 "modification_date": "2023-02-01T13:00:00"
1670 |             },
1671 |             "outline": [  # Only if extract_outline=True
1672 |                 {
1673 |                     "title": "Chapter 1",
1674 |                     "page": 5,
1675 |                     "children": [
1676 |                         {"title": "Section 1.1", "page": 6, "children": []}
1677 |                     ]
1678 |                 },
1679 |                 {"title": "Chapter 2", "page": 15, "children": []}
1680 |             ],
1681 |             "font_info": {  # Only if extract_fonts=True
1682 |                 "total_fonts": 3,
1683 |                 "embedded_fonts": 2,
1684 |                 "font_names": ["Arial", "Times New Roman", "Courier"]
1685 |             },
1686 |             "image_info": {  # Only if extract_images=True
1687 |                 "total_images": 12,
1688 |                 "image_types": {"jpeg": 8, "png": 4},
1689 |                 "average_size": "120kb"
1690 |             },
1691 |             "ocr_assessment": {  # Only if estimate_ocr_needs=True
1692 |                 "needs_ocr": false,
1693 |                 "confidence": "high",
1694 |                 "reason": "PDF contains extractable text throughout"
1695 |             },
1696 |             "processing_time": 1.23  # Seconds
1697 |         }
1698 |     
1699 |     Raises:
1700 |         ToolInputError: If the file path is invalid or the file is not a PDF
1701 |         ToolError: If analysis fails
1702 |     """
1703 |     start_time = time.time()
1704 |     
1705 |     # Validate file path
1706 |     _validate_file_path(file_path, expected_extension=".pdf")
1707 |     
1708 |     # Check for required libraries
1709 |     pdf_lib_available = False
1710 |     if HAS_PYMUPDF:
1711 |         pdf_lib = "pymupdf"
1712 |         pdf_lib_available = True
1713 |     elif HAS_PDFPLUMBER:
1714 |         pdf_lib = "pdfplumber"
1715 |         pdf_lib_available = True
1716 |     
1717 |     if not pdf_lib_available:
1718 |         raise ToolError("PDF analysis requires PyMuPDF or pdfplumber")
1719 |     
1720 |     try:
1721 |         result = {
1722 |             "success": False,
1723 |             "file_path": file_path,
1724 |             "processing_time": 0
1725 |         }
1726 |         
1727 |         if pdf_lib == "pymupdf":
1728 |             # Use PyMuPDF for analysis
1729 |             with pymupdf.open(file_path) as doc:
1730 |                 # Basic information
1731 |                 result["page_count"] = len(doc)
1732 |                 
1733 |                 # Extract metadata if requested
1734 |                 if extract_metadata:
1735 |                     metadata = doc.metadata
1736 |                     if metadata:
1737 |                         result["metadata"] = {
1738 |                             "title": metadata.get("title", ""),
1739 |                             "author": metadata.get("author", ""),
1740 |                             "subject": metadata.get("subject", ""),
1741 |                             "keywords": metadata.get("keywords", ""),
1742 |                             "creator": metadata.get("creator", ""),
1743 |                             "producer": metadata.get("producer", ""),
1744 |                             "creation_date": metadata.get("creationDate", ""),
1745 |                             "modification_date": metadata.get("modDate", "")
1746 |                         }
1747 |                 
1748 |                 # Extract outline if requested
1749 |                 if extract_outline:
1750 |                     toc = doc.get_toc()
1751 |                     if toc:
1752 |                         # Process TOC into a nested structure
1753 |                         result["outline"] = _process_toc(toc)
1754 |                 
1755 |                 # Extract font information if requested
1756 |                 if extract_fonts:
1757 |                     fonts: Set[str] = set()
1758 |                     embedded_fonts: Set[str] = set()
1759 |                     
1760 |                     for page_num in range(min(10, len(doc))):  # Analyze first 10 pages
1761 |                         page = doc[page_num]
1762 |                         page_fonts = page.get_fonts()
1763 |                         
1764 |                         for font in page_fonts:
1765 |                             fonts.add(font[3])  # Font name
1766 |                             if font[1] and font[1] != "n/a":  # file-extension entry is set only for embedded fonts
1767 |                                 embedded_fonts.add(font[3])
1768 |                     
1769 |                     result["font_info"] = {
1770 |                         "total_fonts": len(fonts),
1771 |                         "embedded_fonts": len(embedded_fonts),
1772 |                         "font_names": list(fonts)
1773 |                     }
1774 |                 
1775 |                 # Extract image information if requested
1776 |                 if extract_images:
1777 |                     image_count = 0
1778 |                     image_types: Dict[str, int] = {}
1779 |                     total_size = 0
1780 |                     
1781 |                     for page_num in range(min(5, len(doc))):  # Analyze first 5 pages
1782 |                         page = doc[page_num]
1783 |                         images = page.get_images(full=True)
1784 |                         
1785 |                         for img in images:
1786 |                             image_count += 1
1787 |                             xref = img[0]
1788 |                             img_info = doc.extract_image(xref)
1789 |                             
1790 |                             if img_info:
1791 |                                 img_type = img_info["ext"]
1792 |                                 img_size = len(img_info["image"])
1793 |                                 
1794 |                                 image_types[img_type] = image_types.get(img_type, 0) + 1
1795 |                                 total_size += img_size
1796 |                     
1797 |                     # Extrapolate total images based on sample
1798 |                     estimated_total = int(image_count * (len(doc) / max(1, min(5, len(doc)))))
1799 |                     avg_size = f"{int(total_size / max(1, image_count) / 1024)}kb" if image_count > 0 else "0kb"
1800 |                     
1801 |                     result["image_info"] = {
1802 |                         "total_images": image_count,
1803 |                         "estimated_total": estimated_total,
1804 |                         "image_types": image_types,
1805 |                         "average_size": avg_size
1806 |                     }
1807 |                 
1808 |                 # Estimate OCR needs if requested
1809 |                 if estimate_ocr_needs:
1810 |                     text_pages = 0
1811 |                     total_pages = len(doc)
1812 |                     sample_size = min(10, total_pages)
1813 |                     
1814 |                     for page_num in range(sample_size):
1815 |                         page = doc[page_num]
1816 |                         text = page.get_text()
1817 |                         if text and len(text.strip()) > 50:  # Page has meaningful text
1818 |                             text_pages += 1
1819 |                     
1820 |                     text_ratio = text_pages / sample_size if sample_size else 0.0
1821 |                     
1822 |                     if text_ratio > 0.9:
1823 |                         needs_ocr = False
1824 |                         confidence = "high"
1825 |                         reason = "PDF contains extractable text throughout"
1826 |                     elif text_ratio > 0.5:
1827 |                         needs_ocr = True
1828 |                         confidence = "medium"
1829 |                         reason = "PDF has some extractable text but may benefit from OCR for certain pages"
1830 |                     else:
1831 |                         needs_ocr = True
1832 |                         confidence = "high"
1833 |                         reason = "PDF appears to be scanned or has minimal extractable text"
1834 |                     
1835 |                     result["ocr_assessment"] = {
1836 |                         "needs_ocr": needs_ocr,
1837 |                         "confidence": confidence,
1838 |                         "reason": reason,
1839 |                         "text_coverage_ratio": text_ratio
1840 |                     }
1841 |         
1842 |         elif pdf_lib == "pdfplumber":
1843 |             # Use pdfplumber for analysis
1844 |             with pdfplumber.open(file_path) as pdf:
1845 |                 # Basic information
1846 |                 result["page_count"] = len(pdf.pages)
1847 |                 
1848 |                 # Extract metadata if requested
1849 |                 if extract_metadata:
1850 |                     metadata = pdf.metadata
1851 |                     if metadata:
1852 |                         result["metadata"] = {
1853 |                             "title": metadata.get("Title", ""),
1854 |                             "author": metadata.get("Author", ""),
1855 |                             "subject": metadata.get("Subject", ""),
1856 |                             "keywords": metadata.get("Keywords", ""),
1857 |                             "creator": metadata.get("Creator", ""),
1858 |                             "producer": metadata.get("Producer", ""),
1859 |                             "creation_date": metadata.get("CreationDate", ""),
1860 |                             "modification_date": metadata.get("ModDate", "")
1861 |                         }
1862 |                 
1863 |                 # Outline not supported in pdfplumber
1864 |                 if extract_outline:
1865 |                     result["outline"] = []
1866 |                 
1867 |                 # Font and image info not supported in pdfplumber
1868 |                 if extract_fonts:
1869 |                     result["font_info"] = {
1870 |                         "total_fonts": 0,
1871 |                         "embedded_fonts": 0,
1872 |                         "font_names": []
1873 |                     }
1874 |                 
1875 |                 if extract_images:
1876 |                     result["image_info"] = {
1877 |                         "total_images": 0,
1878 |                         "image_types": {},
1879 |                         "average_size": "0kb"
1880 |                     }
1881 |                 
1882 |                 # Estimate OCR needs if requested
1883 |                 if estimate_ocr_needs:
1884 |                     text_pages = 0
1885 |                     total_pages = len(pdf.pages)
1886 |                     sample_size = min(10, total_pages)
1887 |                     
1888 |                     for page_num in range(sample_size):
1889 |                         page = pdf.pages[page_num]
1890 |                         text = page.extract_text()
1891 |                         if text and len(text.strip()) > 50:  # Page has meaningful text
1892 |                             text_pages += 1
1893 |                     
1894 |                     text_ratio = text_pages / sample_size if sample_size else 0.0
1895 |                     
1896 |                     if text_ratio > 0.9:
1897 |                         needs_ocr = False
1898 |                         confidence = "high"
1899 |                         reason = "PDF contains extractable text throughout"
1900 |                     elif text_ratio > 0.5:
1901 |                         needs_ocr = True
1902 |                         confidence = "medium"
1903 |                         reason = "PDF has some extractable text but may benefit from OCR for certain pages"
1904 |                     else:
1905 |                         needs_ocr = True
1906 |                         confidence = "high"
1907 |                         reason = "PDF appears to be scanned or has minimal extractable text"
1908 |                     
1909 |                     result["ocr_assessment"] = {
1910 |                         "needs_ocr": needs_ocr,
1911 |                         "confidence": confidence,
1912 |                         "reason": reason,
1913 |                         "text_coverage_ratio": text_ratio
1914 |                     }
1915 |         
1916 |         # Update result
1917 |         processing_time = time.time() - start_time
1918 |         result["success"] = True
1919 |         result["processing_time"] = processing_time
1920 |         
1921 |         logger.info(f"PDF structure analysis completed in {processing_time:.2f}s")
1922 |         return result
1923 |     
1924 |     except Exception as e:
1925 |         logger.error(f"Error in analyze_pdf_structure: {str(e)}")
1926 |         logger.error(traceback.format_exc())
1927 |         raise ToolError(f"Failed to analyze PDF structure: {str(e)}") from e
1928 | 
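# --- Illustrative usage sketch ---
# A minimal example of using analyze_pdf_structure to pick an extraction method before a
# full run; the file path is a placeholder assumption.
async def _example_choose_extraction_method(file_path: str = "/path/to/document.pdf") -> str:
    analysis = await analyze_pdf_structure(file_path, estimate_ocr_needs=True)
    assessment = analysis.get("ocr_assessment", {})
    # Prefer direct extraction when the sampled pages already contain extractable text
    return "ocr" if assessment.get("needs_ocr") else "direct"
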
1929 | @with_tool_metrics
1930 | @with_retry(max_retries=2, retry_delay=1.0)
1931 | @with_error_handling
1932 | async def batch_process_documents(
1933 |     folder_path: str,
1934 |     file_pattern: str = "*.pdf",
1935 |     output_folder: Optional[str] = None,
1936 |     extraction_method: str = "hybrid",
1937 |     max_pages_per_file: int = 0,
1938 |     reformat_as_markdown: bool = True,
1939 |     suppress_headers: bool = True,
1940 |     max_concurrency: int = 3,
1941 |     skip_on_error: bool = True,
1942 |     bytes_data: Optional[Dict[str, Union[bytes, str]]] = None
1943 | ) -> Dict[str, Any]:
1944 |     """
1945 |     Processes multiple document files in a folder with OCR and LLM enhancement.
1946 |     
1947 |     This tool handles batch processing of documents (PDFs and images) in a folder,
1948 |     extracting text, correcting OCR errors, and saving the results to an output folder.
1949 |     It can also process documents provided as bytes data.
1950 |     
1951 |     Args:
1952 |         folder_path: Path to the folder containing files to process
1953 |         file_pattern: Pattern to match files (default: "*.pdf", can be "*.jpg", "*.png", etc.)
1954 |         output_folder: Path to save the output files (default: create 'processed' subfolder)
1955 |         extraction_method: Method for PDF text extraction ("direct", "ocr", "hybrid")
1956 |         max_pages_per_file: Maximum pages to process per PDF (0 = all pages)
1957 |         reformat_as_markdown: Whether to format the output as markdown (default: True)
1958 |         suppress_headers: Whether to remove headers and footers (default: True)
1959 |         max_concurrency: Maximum number of files to process in parallel (default: 3)
1960 |         skip_on_error: Whether to continue processing other files if one fails (default: True)
1961 |         bytes_data: Optional dictionary of filename to bytes data for processing data directly
1962 |     
1963 |     Returns:
1964 |         A dictionary containing:
1965 |         {
1966 |             "success": true,
1967 |             "processed_files": [
1968 |                 {
1969 |                     "file": "/path/to/document1.pdf",
1970 |                     "output_file": "/path/to/output/document1.md",
1971 |                     "pages_processed": 5,
1972 |                     "extraction_method_used": "hybrid",
1973 |                     "processing_time": 12.34,
1974 |                     "quality_score": 85  # if quality assessment is performed
1975 |                 },
1976 |                 {
1977 |                     "file": "/path/to/document2.pdf",
1978 |                     "error": "Error message",  # if processing failed
1979 |                     "status": "failed"
1980 |                 }
1981 |             ],
1982 |             "total_files": 5,
1983 |             "successful_files": 4,
1984 |             "failed_files": 1,
1985 |             "output_folder": "/path/to/output",
1986 |             "total_processing_time": 45.67  # Seconds
1987 |         }
1988 |     
1989 |     Raises:
1990 |         ToolInputError: If the folder path is invalid
1991 |         ToolError: If batch processing fails
1992 |     """
1993 |     start_time = time.time()
1994 |     
1995 |     # Validate input if processing files from a folder
1996 |     all_files = []
1997 |     
1998 |     if not bytes_data:
1999 |         # Standard file processing from a folder
2000 |         if not folder_path or not os.path.exists(folder_path) or not os.path.isdir(folder_path):
2001 |             raise ToolInputError(f"Invalid folder path: {folder_path}")
2002 |         
2003 |         # Set output folder if not provided
2004 |         if not output_folder:
2005 |             output_folder = os.path.join(folder_path, "processed")
2006 |         
2007 |         # Create output folder if it doesn't exist
2008 |         os.makedirs(output_folder, exist_ok=True)
2009 |         
2010 |         # Find files matching the pattern
2011 |         matching_files: List[Path] = sorted(Path(folder_path).glob(file_pattern))
2012 |         
2013 |         if not matching_files:
2014 |             raise ToolInputError(f"No files found in {folder_path} matching pattern {file_pattern}")
2015 |         
2016 |         all_files = [(str(f), None) for f in matching_files]  # (path, bytes_data)
2017 |     else:
2018 |         # Processing from bytes data
2019 |         if not output_folder:
2020 |             # Create a temporary output folder if not specified
2021 |             output_folder = tempfile.mkdtemp(prefix="ocr_batch_")
2022 |         else:
2023 |             os.makedirs(output_folder, exist_ok=True)
2024 |         
2025 |         # Convert bytes_data to our format
2026 |         for filename, data in bytes_data.items():
2027 |             if isinstance(data, str) and data.startswith('data:'):
2028 |                 # Handle base64 data URLs
2029 |                 try:
2030 |                     _, b64data = data.split(';base64,', 1)  # the data-URL mime prefix is not needed
2031 |                     file_bytes = base64.b64decode(b64data)
2032 |                     all_files.append((filename, file_bytes))
2033 |                 except Exception as e:
2034 |                     logger.error(f"Error decoding base64 data for {filename}: {str(e)}")
2035 |                     if not skip_on_error:
2036 |                         raise ToolError(f"Failed to decode base64 data: {str(e)}") from e
2037 |             elif isinstance(data, bytes):
2038 |                 # Already in bytes format
2039 |                 all_files.append((filename, data))
2040 |             else:
2041 |                 logger.error(f"Unsupported data format for {filename}")
2042 |                 if not skip_on_error:
2043 |                     raise ToolInputError(f"Unsupported data format for {filename}")
2044 |     
2045 |     if not all_files:
2046 |         raise ToolInputError("No files to process")
2047 |     
2048 |     # Get task type for batch processing
2049 |     task_type = _get_task_type_for_ocr(extraction_method)
2050 |     logger.info(f"Batch processing documents with task type: {task_type}")
2051 |     
2052 |     # Initialize result
2053 |     result = {
2054 |         "success": False,
2055 |         "processed_files": [],
2056 |         "total_files": len(all_files),
2057 |         "successful_files": 0,
2058 |         "failed_files": 0,
2059 |         "output_folder": output_folder,
2060 |         "total_processing_time": 0,
2061 |         "task_type": task_type
2062 |     }
2063 |     
2064 |     # Create semaphore for concurrency control
2065 |     semaphore = asyncio.Semaphore(max_concurrency)
2066 |     
2067 |     # Create partially-applied functions for better reuse and readability
2068 |     # This allows us to pre-configure the processing functions with common parameters
2069 |     extract_pdf_with_config = functools.partial(
2070 |         extract_text_from_pdf,
2071 |         extraction_method=extraction_method,
2072 |         max_pages=max_pages_per_file,
2073 |         skip_pages=0,
2074 |         reformat_as_markdown=reformat_as_markdown,
2075 |         suppress_headers=suppress_headers,
2076 |         assess_quality=True
2077 |     )
2078 |     
2079 |     extract_pdf_bytes_with_config = functools.partial(
2080 |         extract_text_from_pdf_bytes,
2081 |         extraction_method=extraction_method,
2082 |         max_pages=max_pages_per_file,
2083 |         skip_pages=0,
2084 |         reformat_as_markdown=reformat_as_markdown,
2085 |         suppress_headers=suppress_headers,
2086 |         assess_quality=True
2087 |     )
2088 |     
2089 |     process_image_with_config = functools.partial(
2090 |         process_image_ocr,
2091 |         reformat_as_markdown=reformat_as_markdown,
2092 |         assess_quality=True
2093 |     )
2094 |     
2095 |     # Define worker function for processing each file
2096 |     async def process_file(file_info: Tuple[str, Optional[bytes]]) -> Dict[str, Any]:
2097 |         file_path, file_bytes = file_info
2098 |         async with semaphore:
2099 |             logger.info(f"Processing file: {file_path}")
2100 |             file_start_time = time.time()
2101 |             
2102 |             try:
2103 |                 # Determine file type based on extension
2104 |                 is_pdf = file_path.lower().endswith('.pdf')
2105 |                 
2106 |                 # Process according to file type
2107 |                 if is_pdf:
2108 |                     # Extract base name
2109 |                     base_name = os.path.splitext(os.path.basename(file_path))[0]
2110 |                     
2111 |                     # Determine output file extension
2112 |                     output_extension = '.md' if reformat_as_markdown else '.txt'
2113 |                     
2114 |                     # Define output file path
2115 |                     output_file = os.path.join(output_folder, f"{base_name}{output_extension}")
2116 |                     
2117 |                     # Extract text based on whether we have bytes or file path
2118 |                     if file_bytes is not None:
2119 |                         # Process PDF from bytes
2120 |                         extraction_result = await extract_pdf_bytes_with_config(pdf_bytes=file_bytes)
2121 |                     else:
2122 |                         # Process PDF from file path
2123 |                         extraction_result = await extract_pdf_with_config(file_path=file_path)
2124 |                     
2125 |                     # Save the enhanced text
2126 |                     with open(output_file, "w", encoding="utf-8") as f:
2127 |                         f.write(extraction_result["text"])
2128 |                     
2129 |                     # Save the raw text for reference
2130 |                     raw_output_file = os.path.join(output_folder, f"{base_name}_raw.txt")
2131 |                     with open(raw_output_file, "w", encoding="utf-8") as f:
2132 |                         f.write(extraction_result["raw_text"])
2133 |                     
2134 |                     # Create file result
2135 |                     file_processing_time = time.time() - file_start_time
2136 |                     file_result = {
2137 |                         "file": file_path,
2138 |                         "output_file": output_file,
2139 |                         "raw_output_file": raw_output_file,
2140 |                         "pages_processed": extraction_result["pages_processed"],
2141 |                         "extraction_method_used": extraction_result["extraction_method_used"],
2142 |                         "processing_time": file_processing_time,
2143 |                         "status": "success"
2144 |                     }
2145 |                     
2146 |                     # Add quality metrics if available
2147 |                     if "quality_metrics" in extraction_result:
2148 |                         quality_metrics = extraction_result["quality_metrics"]
2149 |                         file_result["quality_score"] = quality_metrics.get("score")
2150 |                     
2151 |                     logger.info(f"Successfully processed PDF: {file_path}")
2152 |                 
2153 |                 else:
2154 |                     # Handle image file
2155 |                     base_name = os.path.splitext(os.path.basename(file_path))[0]
2156 |                     output_extension = '.md' if reformat_as_markdown else '.txt'
2157 |                     output_file = os.path.join(output_folder, f"{base_name}{output_extension}")
2158 |                     
2159 |                     # Process image with OCR based on whether we have bytes or file path
2160 |                     if file_bytes is not None:
2161 |                         # Process image from bytes
2162 |                         ocr_result = await process_image_with_config(image_data=base64.b64encode(file_bytes).decode('utf-8'))
2163 |                     else:
2164 |                         # Process image from file path
2165 |                         ocr_result = await process_image_with_config(image_path=file_path)
2166 |                     
2167 |                     # Save the enhanced text
2168 |                     with open(output_file, "w", encoding="utf-8") as f:
2169 |                         f.write(ocr_result["text"])
2170 |                     
2171 |                     # Save the raw text for reference
2172 |                     raw_output_file = os.path.join(output_folder, f"{base_name}_raw.txt")
2173 |                     with open(raw_output_file, "w", encoding="utf-8") as f:
2174 |                         f.write(ocr_result["raw_text"])
2175 |                     
2176 |                     # Create file result
2177 |                     file_processing_time = time.time() - file_start_time
2178 |                     file_result = {
2179 |                         "file": file_path,
2180 |                         "output_file": output_file,
2181 |                         "raw_output_file": raw_output_file,
2182 |                         "table_detected": ocr_result.get("table_detected", False),
2183 |                         "processing_time": file_processing_time,
2184 |                         "status": "success"
2185 |                     }
2186 |                     
2187 |                     # Add quality metrics if available
2188 |                     if "quality_metrics" in ocr_result:
2189 |                         quality_metrics = ocr_result["quality_metrics"]
2190 |                         file_result["quality_score"] = quality_metrics.get("score")
2191 |                     
2192 |                     logger.info(f"Successfully processed image: {file_path}")
2193 |                 
2194 |                 return file_result
2195 |             except Exception as e:
2196 |                 logger.error(f"Error processing {file_path}: {str(e)}")
2197 |                 return {
2198 |                     "file": file_path,
2199 |                     "error": str(e),
2200 |                     "status": "failed"
2201 |                 }
2202 |     
2203 |     try:
2204 |         # Process files in parallel
2205 |         tasks = [process_file(file_info) for file_info in all_files]
2206 |         processed_results = await asyncio.gather(*tasks)
2207 |         
2208 |         # Update result
2209 |         result["processed_files"] = processed_results
2210 |         result["successful_files"] = sum(1 for r in processed_results if r.get("status") == "success")
2211 |         result["failed_files"] = sum(1 for r in processed_results if r.get("status") == "failed")
2212 |         result["success"] = True
2213 |         
2214 |         # Calculate total processing time
2215 |         total_processing_time = time.time() - start_time
2216 |         result["total_processing_time"] = total_processing_time
2217 |         
2218 |         logger.info(f"Batch processing completed: {result['successful_files']} successful, {result['failed_files']} failed")
2219 |         return result
2220 |     
2221 |     except Exception as e:
2222 |         logger.error(f"Error in batch processing: {str(e)}")
2223 |         logger.error(traceback.format_exc())
2224 |         raise ToolError(f"Failed to batch process documents: {str(e)}") from e
2225 | 
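# --- Illustrative usage sketch ---
# A minimal example of a batch run over a folder of PDFs followed by a simple summary;
# folder locations and the concurrency value are placeholder assumptions.
async def _example_batch_process_documents() -> None:
    result = await batch_process_documents(
        folder_path="/path/to/input_pdfs",
        file_pattern="*.pdf",
        output_folder="/path/to/processed_markdown",
        extraction_method="hybrid",
        reformat_as_markdown=True,
        max_concurrency=2,
    )
    for entry in result["processed_files"]:
        if entry.get("status") == "failed":
            logger.warning(f"{entry['file']}: {entry.get('error')}")
    logger.info(f"{result['successful_files']}/{result['total_files']} files succeeded")
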
2226 | # --- Additional helper functions ---
2227 | 
2228 | def _process_toc(toc: List) -> List[Dict[str, Any]]:
2229 |     """
2230 |     Processes a PDF table of contents into a nested structure.
2231 |     
2232 |     Args:
2233 |         toc: Table of contents from PyMuPDF
2234 |         
2235 |     Returns:
2236 |         Nested outline structure
2237 |     """
2238 |     if not toc:
2239 |         return []
2240 |     
2241 |     # Convert flat list with indentation levels to nested structure
2242 |     result = []
2243 |     stack = [(-1, result)]  # (level, children_list)
2244 |     
2245 |     for item in toc:
2246 |         level, title, page = item
2247 |         
2248 |         # Find parent in stack
2249 |         while stack[-1][0] >= level:
2250 |             stack.pop()
2251 |         
2252 |         # Create new entry
2253 |         entry = {"title": title, "page": page, "children": []}
2254 |         stack[-1][1].append(entry)
2255 |         
2256 |         # Add to stack
2257 |         stack.append((level, entry["children"]))
2258 |     
2259 |     return result
2260 | 
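# Worked example of the nesting performed by _process_toc; the flat TOC entries below are
# made up, in PyMuPDF's [level, title, page] format.
def _example_process_toc() -> List[Dict[str, Any]]:
    flat_toc = [[1, "Chapter 1", 5], [2, "Section 1.1", 6], [1, "Chapter 2", 15]]
    # Expected result:
    # [{"title": "Chapter 1", "page": 5,
    #   "children": [{"title": "Section 1.1", "page": 6, "children": []}]},
    #  {"title": "Chapter 2", "page": 15, "children": []}]
    return _process_toc(flat_toc)
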
2261 | async def _format_tables_in_text(text: str) -> str:
2262 |     """
2263 |     Detects and formats potential tables in text using markdown.
2264 |     
2265 |     Args:
2266 |         text: Text to process
2267 |         
2268 |     Returns:
2269 |         Text with tables formatted in markdown
2270 |     """
2271 |     # Simple pattern to detect table-like content
2272 |     table_patterns = [
2273 |         # Multiple lines with similar column separator patterns
2274 |         r'(\n|^)(((\s*\S+\s*\|\s*\S+\s*)+\|?(\s*\n)){2,})',
2275 |         # Multiple lines with similar tab/space alignment
2276 |         r'(\n|^)((\s*\S+\s+\S+\s+\S+\s+\S+\s*\n){3,})'
2277 |     ]
2278 |     
2279 |     table_sections: List[Tuple[int, int, str]] = []
2280 |     for pattern in table_patterns:
2281 |         matches = re.finditer(pattern, text, re.MULTILINE)
2282 |         for match in matches:
2283 |             table_sections.append((match.start(), match.end(), match.group(2)))
2284 |     
2285 |     # Sort by start position
2286 |     table_sections.sort(key=lambda x: x[0])
2287 |     
2288 |     # No tables found
2289 |     if not table_sections:
2290 |         return text
2291 |     
2292 |     # Process each potential table
2293 |     result_parts = []
2294 |     last_end = 0
2295 |     
2296 |     for start, end, table_text in table_sections:
2297 |         if start < last_end:
2298 |             continue  # both patterns can flag the same span; skip overlaps to avoid duplicating text
2299 |         result_parts.append(text[last_end:start])  # text preceding the table ('' if adjacent)
2300 |         
2301 |         # Process table
2302 |         try:
2303 |             formatted_table = await _enhance_table_formatting(table_text)
2304 |             result_parts.append(formatted_table)
2305 |         except Exception as e:
2306 |             logger.warning(f"Error formatting table: {str(e)}")
2307 |             result_parts.append(table_text)
2308 |         
2309 |         last_end = end
2310 |     
2311 |     # Add remaining text
2312 |     if last_end < len(text):
2313 |         result_parts.append(text[last_end:])
2314 |     
2315 |     return ''.join(result_parts)
2316 | 
2317 | async def _enhance_table_formatting(table_text: str) -> str:
2318 |     """
2319 |     Enhances table formatting using LLM.
2320 |     
2321 |     Args:
2322 |         table_text: Potential table text
2323 |         
2324 |     Returns:
2325 |         Formatted table in markdown
2326 |     """
2327 |     prompt = f"""Format the following text as a markdown table. The text appears to contain tabular data but may not be properly formatted.
2328 | 
2329 | 1. Detect column headers and content
2330 | 2. Create a proper markdown table with headers, separator row, and content rows
2331 | 3. Preserve all information but improve readability
2332 | 4. If the input is not actually tabular data, return it unchanged
2333 | 
2334 | Here is the text to format:
2335 | 
2336 | ```
2337 | {table_text}
2338 | ```
2339 | 
2340 | Provide ONLY the formatted markdown table with no explanations or comments.
2341 | """
2342 |     
2343 |     try:
2344 |         result = await generate_completion(
2345 |             prompt=prompt,
2346 |             provider=Provider.ANTHROPIC.value,
2347 |             temperature=0.2,
2348 |             max_tokens=len(table_text) + 500
2349 |         )
2350 |         
2351 |         if not result or not result.get("text"):
2352 |             return table_text
2353 |         
2354 |         formatted_table = result["text"]
2355 |         
2356 |         # Check if it's actually formatted as a markdown table (pipes plus a separator row)
2357 |         if "|" in formatted_table and re.search(r'\|\s*:?-{3,}', formatted_table):
2358 |             return "\n" + formatted_table + "\n"
2359 |         else:
2360 |             return table_text
2361 |     except Exception as e:
2362 |         logger.warning(f"Error enhancing table format: {str(e)}")
2363 |         return table_text
2364 | 
2365 | async def _assess_text_quality(original_text: str, enhanced_text: str) -> Dict[str, Any]:
2366 |     """
2367 |     Assesses the quality of OCR enhancement using LLM.
2368 |     
2369 |     Args:
2370 |         original_text: Original OCR text
2371 |         enhanced_text: LLM-enhanced text
2372 |         
2373 |     Returns:
2374 |         Dictionary with quality assessment
2375 |     """
2376 |     # Truncate texts to reasonable lengths for assessment
2377 |     max_sample = 5000
2378 |     original_sample = original_text[:max_sample]
2379 |     enhanced_sample = enhanced_text[:max_sample]
2380 |     
2381 |     prompt = f"""Assess the quality improvement between the original OCR text and the enhanced version. Consider:
2382 | 
2383 | 1. Error correction (typos, OCR artifacts, broken words)
2384 | 2. Formatting improvements (paragraph structure, headings, lists)
2385 | 3. Readability enhancement
2386 | 4. Preservation of original content and meaning
2387 | 5. Removal of unnecessary elements (headers, footers, artifacts)
2388 | 
2389 | Original OCR text:
2390 | ```
2391 | {original_sample}
2392 | ```
2393 | 
2394 | Enhanced text:
2395 | ```
2396 | {enhanced_sample}
2397 | ```
2398 | 
2399 | Provide:
2400 | 1. A quality score from 0-100 where 100 is perfect enhancement
2401 | 2. A brief explanation of improvements and any issues
2402 | 3. Specific examples of corrections (max 3 examples)
2403 | 
2404 | Format your response as follows:
2405 | SCORE: [score]
2406 | EXPLANATION: [explanation]
2407 | EXAMPLES:
2408 | - [example 1]
2409 | - [example 2]
2410 | - [example 3]
2411 | """
2412 |     
2413 |     try:
2414 |         result = await generate_completion(
2415 |             prompt=prompt,
2416 |             provider=Provider.ANTHROPIC.value,
2417 |             temperature=0.3,
2418 |             max_tokens=1000
2419 |         )
2420 |         
2421 |         if not result or not result.get("text"):
2422 |             return {"score": None, "explanation": "Failed to assess quality"}
2423 |         
2424 |         assessment_text = result["text"]
2425 |         
2426 |         # Parse the assessment
2427 |         score_match = re.search(r'SCORE:\s*(\d+)', assessment_text)
2428 |         explanation_match = re.search(r'EXPLANATION:\s*(.*?)(?:\n\s*EXAMPLES|\Z)', assessment_text, re.DOTALL)
2429 |         examples_match = re.search(r'EXAMPLES:\s*(.*?)(?:\Z)', assessment_text, re.DOTALL)
2430 |         
2431 |         score = int(score_match.group(1)) if score_match else None
2432 |         explanation = explanation_match.group(1).strip() if explanation_match else "No explanation provided"
2433 |         
2434 |         examples = []
2435 |         if examples_match:
2436 |             examples_text = examples_match.group(1)
2437 |             examples = [ex.strip().lstrip('- ') for ex in examples_text.split('\n') if ex.strip()]
2438 |         
2439 |         return {
2440 |             "score": score,
2441 |             "explanation": explanation,
2442 |             "examples": examples
2443 |         }
2444 |     except Exception as e:
2445 |         logger.warning(f"Error assessing text quality: {str(e)}")
2446 |         return {"score": None, "explanation": f"Failed to assess quality: {str(e)}"}
```
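
The batch path above fans the per-file work out with `asyncio.gather` and relies on each worker returning a status record rather than raising, so one bad file cannot abort the whole batch. A minimal, self-contained sketch of that pattern follows; the file names, the `.bad` failure condition, and `_process_one` are invented stand-ins for illustration only.

```python
import asyncio

async def _process_one(path: str) -> dict:
    # Stand-in for the per-file worker: each task catches its own exception and
    # reports a status record instead of propagating, so gather() never fails.
    try:
        if path.endswith(".bad"):
            raise ValueError("unreadable file")
        return {"file": path, "status": "success"}
    except Exception as e:
        return {"file": path, "error": str(e), "status": "failed"}

async def main() -> None:
    files = ["scan_01.pdf", "scan_02.png", "corrupt.bad"]
    results = await asyncio.gather(*(_process_one(f) for f in files))
    successful = sum(1 for r in results if r.get("status") == "success")
    failed = sum(1 for r in results if r.get("status") == "failed")
    print(f"Batch completed: {successful} successful, {failed} failed")

asyncio.run(main())
```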
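`_process_toc` turns PyMuPDF's flat `(level, title, page)` entries into a nested outline by keeping a stack of `(level, children_list)` pairs and popping until it reaches the nearest shallower ancestor. A small worked example; the TOC data below is made up.

```python
# Flat TOC in the shape returned by doc.get_toc(): [level, title, page]
flat_toc = [
    [1, "Introduction", 1],
    [2, "Background", 2],
    [2, "Scope", 4],
    [1, "Methods", 5],
]

nested = _process_toc(flat_toc)
# nested ==
# [
#     {"title": "Introduction", "page": 1, "children": [
#         {"title": "Background", "page": 2, "children": []},
#         {"title": "Scope", "page": 4, "children": []},
#     ]},
#     {"title": "Methods", "page": 5, "children": []},
# ]
```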
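`_format_tables_in_text` is purely heuristic: the first pattern looks for two or more consecutive lines of `|`-separated cells, the second for several lines of space-aligned columns, and each matched span is swapped for the LLM-formatted version while the surrounding text is kept verbatim. A quick check of the pipe pattern against a hand-written snippet (the sample text is invented):

```python
import re

text = "Intro paragraph.\nname | value\nfoo | 1\nbar | 2\nClosing paragraph."
pipe_pattern = r'(\n|^)(((\s*\S+\s*\|\s*\S+\s*)+\|?(\s*\n)){2,})'

match = re.search(pipe_pattern, text, re.MULTILINE)
if match:
    # group(2) is the candidate table block handed to _enhance_table_formatting
    print(repr(match.group(2)))  # 'name | value\nfoo | 1\nbar | 2\n'
```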
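A note on the acceptance check in `_enhance_table_formatting`: the model's output is only trusted when it looks like a real markdown table, i.e. it contains pipes plus a separator row; anything else falls back to the original text. A tiny illustration of that kind of check; `looks_like_markdown_table` and the sample strings are hypothetical.

```python
import re

def looks_like_markdown_table(candidate: str) -> bool:
    # Pipes somewhere plus a separator row such as |---|---| or | --- | --- |
    return "|" in candidate and bool(re.search(r'\|\s*:?-{3,}', candidate))

print(looks_like_markdown_table("| Name | Qty |\n|------|-----|\n| foo | 1 |"))  # True
print(looks_like_markdown_table("| Name | Qty |\n| --- | --- |\n| foo | 1 |"))   # True
print(looks_like_markdown_table("Not tabular data at all."))                     # False
```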
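The assessment prompt pins the model to a `SCORE / EXPLANATION / EXAMPLES` layout precisely so the three regexes at the end of `_assess_text_quality` can pull the fields back out. A sketch of that parsing against a hand-written response; the response text is invented.

```python
import re

response = """SCORE: 87
EXPLANATION: Most OCR artifacts were corrected and paragraph breaks were restored.
EXAMPLES:
- "teh" corrected to "the"
- hyphenated line breaks rejoined
- running page headers removed
"""

score_match = re.search(r'SCORE:\s*(\d+)', response)
explanation_match = re.search(r'EXPLANATION:\s*(.*?)(?:\n\s*EXAMPLES|\Z)', response, re.DOTALL)
examples_match = re.search(r'EXAMPLES:\s*(.*?)(?:\Z)', response, re.DOTALL)

score = int(score_match.group(1)) if score_match else None
explanation = explanation_match.group(1).strip() if explanation_match else ""
examples = [ln.strip().lstrip('- ') for ln in examples_match.group(1).split('\n') if ln.strip()]

print(score)        # 87
print(explanation)  # Most OCR artifacts were corrected and paragraph breaks were restored.
print(examples)     # ['"teh" corrected to "the"', 'hyphenated line breaks rejoined', 'running page headers removed']
```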