This is page 27 of 45. Use http://codebase.md/dicklesworthstone/llm_gateway_mcp_server?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .cursorignore
├── .env.example
├── .envrc
├── .gitignore
├── additional_features.md
├── check_api_keys.py
├── completion_support.py
├── comprehensive_test.py
├── docker-compose.yml
├── Dockerfile
├── empirically_measured_model_speeds.json
├── error_handling.py
├── example_structured_tool.py
├── examples
│ ├── __init__.py
│ ├── advanced_agent_flows_using_unified_memory_system_demo.py
│ ├── advanced_extraction_demo.py
│ ├── advanced_unified_memory_system_demo.py
│ ├── advanced_vector_search_demo.py
│ ├── analytics_reporting_demo.py
│ ├── audio_transcription_demo.py
│ ├── basic_completion_demo.py
│ ├── cache_demo.py
│ ├── claude_integration_demo.py
│ ├── compare_synthesize_demo.py
│ ├── cost_optimization.py
│ ├── data
│ │ ├── sample_event.txt
│ │ ├── Steve_Jobs_Introducing_The_iPhone_compressed.md
│ │ └── Steve_Jobs_Introducing_The_iPhone_compressed.mp3
│ ├── docstring_refiner_demo.py
│ ├── document_conversion_and_processing_demo.py
│ ├── entity_relation_graph_demo.py
│ ├── filesystem_operations_demo.py
│ ├── grok_integration_demo.py
│ ├── local_text_tools_demo.py
│ ├── marqo_fused_search_demo.py
│ ├── measure_model_speeds.py
│ ├── meta_api_demo.py
│ ├── multi_provider_demo.py
│ ├── ollama_integration_demo.py
│ ├── prompt_templates_demo.py
│ ├── python_sandbox_demo.py
│ ├── rag_example.py
│ ├── research_workflow_demo.py
│ ├── sample
│ │ ├── article.txt
│ │ ├── backprop_paper.pdf
│ │ ├── buffett.pdf
│ │ ├── contract_link.txt
│ │ ├── legal_contract.txt
│ │ ├── medical_case.txt
│ │ ├── northwind.db
│ │ ├── research_paper.txt
│ │ ├── sample_data.json
│ │ └── text_classification_samples
│ │ ├── email_classification.txt
│ │ ├── news_samples.txt
│ │ ├── product_reviews.txt
│ │ └── support_tickets.txt
│ ├── sample_docs
│ │ └── downloaded
│ │ └── attention_is_all_you_need.pdf
│ ├── sentiment_analysis_demo.py
│ ├── simple_completion_demo.py
│ ├── single_shot_synthesis_demo.py
│ ├── smart_browser_demo.py
│ ├── sql_database_demo.py
│ ├── sse_client_demo.py
│ ├── test_code_extraction.py
│ ├── test_content_detection.py
│ ├── test_ollama.py
│ ├── text_classification_demo.py
│ ├── text_redline_demo.py
│ ├── tool_composition_examples.py
│ ├── tournament_code_demo.py
│ ├── tournament_text_demo.py
│ ├── unified_memory_system_demo.py
│ ├── vector_search_demo.py
│ ├── web_automation_instruction_packs.py
│ └── workflow_delegation_demo.py
├── LICENSE
├── list_models.py
├── marqo_index_config.json.example
├── mcp_protocol_schema_2025-03-25_version.json
├── mcp_python_lib_docs.md
├── mcp_tool_context_estimator.py
├── model_preferences.py
├── pyproject.toml
├── quick_test.py
├── README.md
├── resource_annotations.py
├── run_all_demo_scripts_and_check_for_errors.py
├── storage
│ └── smart_browser_internal
│ ├── locator_cache.db
│ ├── readability.js
│ └── storage_state.enc
├── test_client.py
├── test_connection.py
├── TEST_README.md
├── test_sse_client.py
├── test_stdio_client.py
├── tests
│ ├── __init__.py
│ ├── conftest.py
│ ├── integration
│ │ ├── __init__.py
│ │ └── test_server.py
│ ├── manual
│ │ ├── test_extraction_advanced.py
│ │ └── test_extraction.py
│ └── unit
│ ├── __init__.py
│ ├── test_cache.py
│ ├── test_providers.py
│ └── test_tools.py
├── TODO.md
├── tool_annotations.py
├── tools_list.json
├── ultimate_mcp_banner.webp
├── ultimate_mcp_logo.webp
├── ultimate_mcp_server
│ ├── __init__.py
│ ├── __main__.py
│ ├── cli
│ │ ├── __init__.py
│ │ ├── __main__.py
│ │ ├── commands.py
│ │ ├── helpers.py
│ │ └── typer_cli.py
│ ├── clients
│ │ ├── __init__.py
│ │ ├── completion_client.py
│ │ └── rag_client.py
│ ├── config
│ │ └── examples
│ │ └── filesystem_config.yaml
│ ├── config.py
│ ├── constants.py
│ ├── core
│ │ ├── __init__.py
│ │ ├── evaluation
│ │ │ ├── base.py
│ │ │ └── evaluators.py
│ │ ├── providers
│ │ │ ├── __init__.py
│ │ │ ├── anthropic.py
│ │ │ ├── base.py
│ │ │ ├── deepseek.py
│ │ │ ├── gemini.py
│ │ │ ├── grok.py
│ │ │ ├── ollama.py
│ │ │ ├── openai.py
│ │ │ └── openrouter.py
│ │ ├── server.py
│ │ ├── state_store.py
│ │ ├── tournaments
│ │ │ ├── manager.py
│ │ │ ├── tasks.py
│ │ │ └── utils.py
│ │ └── ums_api
│ │ ├── __init__.py
│ │ ├── ums_database.py
│ │ ├── ums_endpoints.py
│ │ ├── ums_models.py
│ │ └── ums_services.py
│ ├── exceptions.py
│ ├── graceful_shutdown.py
│ ├── services
│ │ ├── __init__.py
│ │ ├── analytics
│ │ │ ├── __init__.py
│ │ │ ├── metrics.py
│ │ │ └── reporting.py
│ │ ├── cache
│ │ │ ├── __init__.py
│ │ │ ├── cache_service.py
│ │ │ ├── persistence.py
│ │ │ ├── strategies.py
│ │ │ └── utils.py
│ │ ├── cache.py
│ │ ├── document.py
│ │ ├── knowledge_base
│ │ │ ├── __init__.py
│ │ │ ├── feedback.py
│ │ │ ├── manager.py
│ │ │ ├── rag_engine.py
│ │ │ ├── retriever.py
│ │ │ └── utils.py
│ │ ├── prompts
│ │ │ ├── __init__.py
│ │ │ ├── repository.py
│ │ │ └── templates.py
│ │ ├── prompts.py
│ │ └── vector
│ │ ├── __init__.py
│ │ ├── embeddings.py
│ │ └── vector_service.py
│ ├── tool_token_counter.py
│ ├── tools
│ │ ├── __init__.py
│ │ ├── audio_transcription.py
│ │ ├── base.py
│ │ ├── completion.py
│ │ ├── docstring_refiner.py
│ │ ├── document_conversion_and_processing.py
│ │ ├── enhanced-ums-lookbook.html
│ │ ├── entity_relation_graph.py
│ │ ├── excel_spreadsheet_automation.py
│ │ ├── extraction.py
│ │ ├── filesystem.py
│ │ ├── html_to_markdown.py
│ │ ├── local_text_tools.py
│ │ ├── marqo_fused_search.py
│ │ ├── meta_api_tool.py
│ │ ├── ocr_tools.py
│ │ ├── optimization.py
│ │ ├── provider.py
│ │ ├── pyodide_boot_template.html
│ │ ├── python_sandbox.py
│ │ ├── rag.py
│ │ ├── redline-compiled.css
│ │ ├── sentiment_analysis.py
│ │ ├── single_shot_synthesis.py
│ │ ├── smart_browser.py
│ │ ├── sql_databases.py
│ │ ├── text_classification.py
│ │ ├── text_redline_tools.py
│ │ ├── tournament.py
│ │ ├── ums_explorer.html
│ │ └── unified_memory_system.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── async_utils.py
│ │ ├── display.py
│ │ ├── logging
│ │ │ ├── __init__.py
│ │ │ ├── console.py
│ │ │ ├── emojis.py
│ │ │ ├── formatter.py
│ │ │ ├── logger.py
│ │ │ ├── panels.py
│ │ │ ├── progress.py
│ │ │ └── themes.py
│ │ ├── parse_yaml.py
│ │ ├── parsing.py
│ │ ├── security.py
│ │ └── text.py
│ └── working_memory_api.py
├── unified_memory_system_technical_analysis.md
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/ultimate_mcp_server/tools/ocr_tools.py:
--------------------------------------------------------------------------------
```python
1 | """OCR Tools for Ultimate MCP Server.
2 |
3 | This module provides tools for OCR (Optical Character Recognition) processing,
4 | leveraging LLMs to improve the quality of extracted text from PDFs and images.
5 |
6 | Features:
7 | - PDF to image conversion with optimized preprocessing
8 | - Multiple extraction methods (OCR, direct text extraction, hybrid approach)
9 | - Intelligent text segmentation and processing for large documents
10 | - LLM-based error correction and formatting
11 | - Table detection and formatting
12 | - Multi-language support
13 | - Quality assessment with detailed metrics
14 | - PDF structure analysis
15 | - Batch processing with concurrency control
16 | - Sophisticated caching for improved performance
17 |
18 | Example usage:
19 | ```python
20 | # Extract text from a PDF file with LLM correction
21 | result = await client.tools.extract_text_from_pdf(
22 | file_path="document.pdf",
23 | extraction_method="hybrid", # Try direct text extraction first, fall back to OCR if needed
24 | max_pages=5,
25 | skip_pages=0,
26 | reformat_as_markdown=True,
27 | suppress_headers=True
28 | )
29 |
30 | # Process an image file with custom preprocessing
31 | result = await client.tools.process_image_ocr(
32 | image_path="scan.jpg",
33 | preprocessing_options={
34 | "denoise": True,
35 | "threshold": "adaptive",
36 | "deskew": True
37 | },
38 | ocr_language="eng+fra", # Multi-language support
39 | assess_quality=True
40 | )
41 |
42 | # Enhance existing OCR text with LLM
43 | result = await client.tools.enhance_ocr_text(
44 | ocr_text="Text with OCK errors and broken lin- es",
45 | reformat_as_markdown=True,
46 | remove_headers=True
47 | )
48 |
49 | # Analyze PDF structure without full extraction
50 | info = await client.tools.analyze_pdf_structure(
51 | file_path="document.pdf",
52 | extract_metadata=True,
53 | extract_outline=True,
54 | extract_fonts=True
55 | )
56 |
57 | # Batch process multiple PDFs
58 | result = await client.tools.batch_process_documents(
59 | folder_path="/path/to/documents",
60 | file_pattern="*.pdf",
61 | output_folder="/path/to/output",
62 | max_concurrency=3
63 | )
64 | ```
65 | """
66 | import asyncio
67 | import base64
68 | import functools
69 | import hashlib
70 | import io
71 | import json
72 | import math
73 | import os
74 | import re
75 | import tempfile
76 | import time
77 | import traceback
78 | import uuid
79 | from concurrent.futures import ThreadPoolExecutor
80 | from pathlib import Path
81 | from typing import Any, Dict, List, Optional, Set, Tuple, Union
82 |
83 | # Try importing required libraries with fallbacks
84 | try:
85 | import numpy as np
86 | HAS_NUMPY = True
87 | except ImportError:
88 | HAS_NUMPY = False
89 |
90 | try:
91 | from PIL import Image, ImageEnhance, ImageFilter
92 | HAS_PIL = True
93 | except ImportError:
94 | HAS_PIL = False
95 |
96 | try:
97 | import cv2
98 | HAS_CV2 = True
99 | except ImportError:
100 | HAS_CV2 = False
101 |
102 | try:
103 | import pytesseract
104 | HAS_PYTESSERACT = True
105 | except ImportError:
106 | HAS_PYTESSERACT = False
107 |
108 | try:
109 | from pdf2image import convert_from_bytes, convert_from_path
110 | HAS_PDF2IMAGE = True
111 | except ImportError:
112 | HAS_PDF2IMAGE = False
113 |
114 | try:
115 | import pdfplumber
116 | HAS_PDFPLUMBER = True
117 | except ImportError:
118 | HAS_PDFPLUMBER = False
119 |
120 | try:
121 | import pymupdf # PyMuPDF
122 | HAS_PYMUPDF = True
123 | except ImportError:
124 | HAS_PYMUPDF = False
125 |
126 | # Import tools and helpers from ultimate
127 | from ultimate_mcp_server.constants import Provider, TaskType
128 | from ultimate_mcp_server.exceptions import ProviderError, ToolError, ToolInputError
129 | from ultimate_mcp_server.tools.base import (
130 | with_cache,
131 | with_error_handling,
132 | with_retry,
133 | with_tool_metrics,
134 | )
135 | from ultimate_mcp_server.tools.completion import generate_completion
136 | from ultimate_mcp_server.utils import get_logger
137 |
138 | logger = get_logger("ultimate_mcp_server.tools.ocr")
139 |
140 | # Cache for storing preprocessed images and extracted text
141 | OCR_CACHE = {}
142 |
143 | # Check if required dependencies are available
144 | def _check_ocr_dependencies():
145 | """Checks if OCR dependencies are available and returns a dictionary of requirements."""
146 | requirements = {
147 | "numpy": HAS_NUMPY,
148 | "PIL": HAS_PIL,
149 | "cv2": HAS_CV2,
150 | "pytesseract": HAS_PYTESSERACT,
151 | "pdf2image": HAS_PDF2IMAGE,
152 | "pdfplumber": HAS_PDFPLUMBER,
153 | "pymupdf": HAS_PYMUPDF
154 | }
155 |
156 | missing = [lib for lib, available in requirements.items() if not available]
157 |
158 | if missing:
159 | logger.warning(f"Some OCR dependencies are missing: {', '.join(missing)}")
160 | logger.warning("OCR functionality may be limited. Install required packages with:")
161 | packages = {
162 | "numpy": "numpy",
163 | "PIL": "pillow",
164 | "cv2": "opencv-python-headless",
165 | "pytesseract": "pytesseract",
166 | "pdf2image": "pdf2image",
167 | "pdfplumber": "pdfplumber",
168 | "pymupdf": "pymupdf"
169 | }
170 |
171 | pip_command = f"pip install {' '.join(packages[lib] for lib in missing)}"
172 | logger.warning(f" {pip_command}")
173 |
174 | return requirements, missing
175 |
176 | # Check dependencies early
177 | OCR_REQUIREMENTS, MISSING_REQUIREMENTS = _check_ocr_dependencies()
178 |
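# A minimal illustrative sketch: gate OCR work on the dependency check
# above before doing any heavy setup. `_require_ocr_stack` is a
# hypothetical helper, not part of the module's public API.
def _require_ocr_stack() -> None:
    """Raise early when the packages needed for OCR extraction are missing."""
    needed = {"pytesseract", "pdf2image", "PIL"}
    unavailable = sorted(needed.intersection(MISSING_REQUIREMENTS))
    if unavailable:
        raise ToolError(f"OCR unavailable; missing packages: {', '.join(unavailable)}")
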
179 | # --- Helper functions for OCR processing ---
180 |
181 | def _validate_file_path(file_path: str, expected_extension: Optional[str] = None) -> None:
182 | """
183 | Validates a file path exists and optionally has the expected extension.
184 |
185 | Args:
186 | file_path: Path to the file to validate
187 | expected_extension: Optional file extension to check (e.g., '.pdf')
188 |
189 | Raises:
190 | ToolInputError: If validation fails
191 | """
192 | if not file_path:
193 | raise ToolInputError("File path cannot be empty")
194 |
195 | file_path = os.path.expanduser(os.path.normpath(file_path))
196 |
197 | if not os.path.exists(file_path):
198 | raise ToolInputError(f"File not found: {file_path}")
199 |
200 | if not os.path.isfile(file_path):
201 | raise ToolInputError(f"Path is not a file: {file_path}")
202 |
203 | if expected_extension and not file_path.lower().endswith(expected_extension.lower()):
204 | raise ToolInputError(f"File does not have the expected extension ({expected_extension}): {file_path}")
205 |
206 | def _get_task_type_for_ocr(extraction_method: str = "hybrid") -> str:
207 | """
208 | Returns the appropriate TaskType for OCR operations based on extraction method.
209 |
210 | Args:
211 | extraction_method: The extraction method being used
212 |
213 | Returns:
214 | The TaskType value as a string
215 | """
216 | if extraction_method == "direct":
217 | return TaskType.TEXT_EXTRACTION.value
218 | elif extraction_method == "ocr":
219 | return TaskType.OCR.value
220 | else: # hybrid
221 | return TaskType.OCR.value
222 |
223 | def _handle_provider_error(e: Exception, operation: str) -> ToolError:
224 | """
225 | Handles provider-specific errors and converts them to tool errors.
226 |
227 | Args:
228 | e: The exception that was raised
229 | operation: Description of the operation that failed
230 |
231 | Returns:
232 | A ToolError with appropriate message
233 | """
234 | if isinstance(e, ProviderError):
235 | # Handle specific provider errors
236 | return ToolError(f"Provider error during {operation}: {str(e)}")
237 | else:
238 | # Handle generic errors
239 | return ToolError(f"Error during {operation}: {str(e)}")
240 |
241 | def _preprocess_image(image: Image.Image, preprocessing_options: Optional[Dict[str, Any]] = None) -> Image.Image:
242 | """
243 | Preprocesses an image for better OCR results.
244 |
245 | Args:
246 | image: PIL Image object
247 | preprocessing_options: Dictionary of preprocessing options
248 | - denoise: Whether to apply denoising (default: True)
249 | - threshold: Thresholding method ('otsu', 'adaptive', 'none') (default: 'otsu')
250 | - deskew: Whether to deskew the image (default: True)
251 | - enhance_contrast: Whether to enhance contrast (default: True)
252 | - enhance_brightness: Whether to enhance brightness (default: False)
253 | - enhance_sharpness: Whether to enhance sharpness (default: False)
254 | - apply_filters: List of filters to apply (default: [])
255 | - resize_factor: Factor to resize the image by (default: 1.0)
256 |
257 | Returns:
258 | Preprocessed PIL Image object
259 | """
260 | if not HAS_CV2 or not HAS_NUMPY or not HAS_PIL:
261 | logger.warning("Image preprocessing requires opencv-python, numpy, and pillow. Using original image.")
262 | return image
263 |
264 | # Default preprocessing options
265 | if preprocessing_options is None:
266 | preprocessing_options = {
267 | "denoise": True,
268 | "threshold": "otsu",
269 | "deskew": True,
270 | "enhance_contrast": True,
271 | "enhance_brightness": False,
272 | "enhance_sharpness": False,
273 | "apply_filters": [],
274 | "resize_factor": 1.0
275 | }
276 |
277 | # Apply PIL enhancements before OpenCV processing if enabled
278 | if HAS_PIL:
279 | # Enhance brightness if requested
280 | if preprocessing_options.get("enhance_brightness", False):
281 | enhancer = ImageEnhance.Brightness(image)
282 | # Increase brightness by 30%
283 | image = enhancer.enhance(1.3)
284 |
285 | # Enhance contrast if requested using PIL (in addition to OpenCV method)
286 | if preprocessing_options.get("enhance_contrast", True):
287 | enhancer = ImageEnhance.Contrast(image)
288 | # Increase contrast by 40%
289 | image = enhancer.enhance(1.4)
290 |
291 | # Enhance sharpness if requested
292 | if preprocessing_options.get("enhance_sharpness", False):
293 | enhancer = ImageEnhance.Sharpness(image)
294 | # Increase sharpness by 50%
295 | image = enhancer.enhance(1.5)
296 |
297 | # Apply filters if specified
298 | filters = preprocessing_options.get("apply_filters", [])
299 | for filter_name in filters:
300 | if filter_name == "unsharp_mask":
301 | image = image.filter(ImageFilter.UnsharpMask(radius=2, percent=150))
302 | elif filter_name == "detail":
303 | image = image.filter(ImageFilter.DETAIL)
304 | elif filter_name == "edge_enhance":
305 | image = image.filter(ImageFilter.EDGE_ENHANCE)
306 | elif filter_name == "smooth":
307 | image = image.filter(ImageFilter.SMOOTH)
308 |
309 | # Convert PIL Image to OpenCV format
310 | img = np.array(image)
311 | if len(img.shape) == 3 and img.shape[2] == 3:
312 | gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
313 | else:
314 | gray = img
315 |
316 | # Calculate optimal scaling based on image size and content
317 | original_height, original_width = gray.shape[:2]
318 | resize_factor = preprocessing_options.get("resize_factor", 1.0)
319 |
320 | # Adaptive scaling based on image dimensions for optimal OCR
321 | # For very small images, increase size; for very large images, reduce
322 | if resize_factor == 1.0: # Only auto-adjust if user didn't specify
323 | # Calculate the ideal size range for OCR (1500-3500 pixels on longest edge)
324 | longest_edge = max(original_width, original_height)
325 | if longest_edge < 1500:
326 | # For small images, scale up to improve OCR
327 | resize_factor = math.ceil(1500 / longest_edge * 10) / 10 # Round to nearest 0.1
328 | elif longest_edge > 3500:
329 | # For large images, scale down to improve performance
330 | resize_factor = math.floor(3500 / longest_edge * 10) / 10 # Round to nearest 0.1
331 |
332 | # Enhance contrast
333 | if preprocessing_options.get("enhance_contrast", True):
334 | gray = cv2.equalizeHist(gray)
335 |
336 | # Apply thresholding
337 | threshold_method = preprocessing_options.get("threshold", "otsu")
338 | if threshold_method == "otsu":
339 | _, img_thresholded = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
340 | elif threshold_method == "adaptive":
341 | # Calculate optimal block size based on image dimensions (odd number)
342 | block_size = math.floor(min(gray.shape) / 30)
343 | block_size = max(3, block_size)
344 | if block_size % 2 == 0:
345 | block_size += 1
346 | img_thresholded = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, block_size, 2)
347 | else:
348 | img_thresholded = gray
349 |
350 | # Denoise
351 | if preprocessing_options.get("denoise", True):
352 | # Calculate optimal denoising parameters based on image size
353 | h_param = math.ceil(10 * math.log10(min(original_width, original_height)))
354 | img_denoised = cv2.fastNlMeansDenoising(img_thresholded, None, h_param, 7, 21)
355 | else:
356 | img_denoised = img_thresholded
357 |
358 | # Deskew
359 | if preprocessing_options.get("deskew", True) and HAS_NUMPY:
360 | try:
361 | coords = np.column_stack(np.where(img_denoised > 0))
362 | angle = cv2.minAreaRect(coords)[-1]
363 |
364 | if angle < -45:
365 | angle = -(90 + angle)
366 | else:
367 | angle = -angle
368 |
369 | # Rotate to correct skew if significant skew detected
370 | if abs(angle) > 0.5:
371 | (h, w) = img_denoised.shape[:2]
372 | center = (w // 2, h // 2)
373 | M = cv2.getRotationMatrix2D(center, angle, 1.0)
374 | img_deskewed = cv2.warpAffine(img_denoised, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
375 | else:
376 | img_deskewed = img_denoised
377 | except Exception as e:
378 | logger.warning(f"Deskewing failed: {str(e)}. Using non-deskewed image.")
379 | img_deskewed = img_denoised
380 | else:
381 | img_deskewed = img_denoised
382 |
383 | # Resize if needed
384 | if resize_factor != 1.0:
385 | # Use ceiling to ensure we don't lose pixels in important small details
386 | new_w = math.ceil(original_width * resize_factor)
387 | new_h = math.ceil(original_height * resize_factor)
388 | img_resized = cv2.resize(img_deskewed, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
389 | else:
390 | img_resized = img_deskewed
391 |
392 | # Convert back to PIL Image
393 | return Image.fromarray(img_resized)
394 |
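# A usage sketch for _preprocess_image, assuming a scanned page at a
# hypothetical path ("scan.png"); omitted options keep the defaults
# documented in the docstring above.
def _example_preprocess(path: str = "scan.png") -> "Image.Image":
    image = Image.open(path)
    options = {"denoise": True, "threshold": "adaptive", "deskew": True}
    return _preprocess_image(image, options)
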
395 | def _extract_text_with_ocr(image: Image.Image, ocr_language: str = "eng", ocr_config: str = "") -> str:
396 | """
397 | Extracts text from an image using OCR.
398 |
399 | Args:
400 | image: PIL Image object
401 | ocr_language: Language(s) for OCR (default: "eng")
402 | ocr_config: Additional configuration for Tesseract
403 |
404 | Returns:
405 | Extracted text
406 | """
407 | if not HAS_PYTESSERACT:
408 | raise ToolError("pytesseract is required for OCR text extraction")
409 |
410 | try:
411 | custom_config = f"-l {ocr_language} {ocr_config}"
412 | return pytesseract.image_to_string(image, config=custom_config)
413 | except Exception as e:
414 | logger.error(f"OCR extraction failed: {str(e)}")
415 | raise ToolError(f"OCR extraction failed: {str(e)}") from e
416 |
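# A usage sketch: Tesseract's page-segmentation mode 6 ("assume a single
# uniform block of text") is passed through ocr_config, and multiple
# languages are combined with "+" exactly as on the Tesseract CLI.
def _example_ocr(image: "Image.Image") -> str:
    return _extract_text_with_ocr(image, ocr_language="eng+fra", ocr_config="--psm 6")
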
417 | def _extract_text_from_pdf_direct(file_path: str, start_page: int = 0, max_pages: int = 0) -> Tuple[List[str], bool]:
418 | """
419 | Extracts text directly from a PDF file without OCR.
420 |
421 | Args:
422 | file_path: Path to the PDF file
423 | start_page: First page to extract (0-indexed)
424 | max_pages: Maximum number of pages to extract (0 = all)
425 |
426 | Returns:
427 | Tuple of (extracted_text_list, has_text)
428 | """
429 | texts = []
430 | has_text = False
431 |
432 | if HAS_PDFPLUMBER:
433 | try:
434 | with pdfplumber.open(file_path) as pdf:
435 | total_pages = len(pdf.pages)
436 | end_page = total_pages if max_pages == 0 else min(start_page + max_pages, total_pages)
437 |
438 | for i in range(start_page, end_page):
439 | try:
440 | page = pdf.pages[i]
441 | text = page.extract_text(x_tolerance=3, y_tolerance=3)
442 | if text and text.strip():
443 | has_text = True
444 | texts.append(text or "")
445 | except Exception as e:
446 | logger.warning(f"Error extracting text from page {i+1}: {str(e)}")
447 | texts.append("")
448 | except Exception as e:
449 | logger.error(f"Error extracting text directly from PDF: {str(e)}")
450 | raise ToolError(f"Failed to extract text directly from PDF: {str(e)}") from e
451 |
452 | elif HAS_PYMUPDF:
453 | try:
454 | with pymupdf.open(file_path) as doc:
455 | total_pages = len(doc)
456 | end_page = total_pages if max_pages == 0 else min(start_page + max_pages, total_pages)
457 |
458 | for i in range(start_page, end_page):
459 | try:
460 | page = doc[i]
461 | text = page.get_text()
462 | if text and text.strip():
463 | has_text = True
464 | texts.append(text or "")
465 | except Exception as e:
466 | logger.warning(f"Error extracting text from page {i+1}: {str(e)}")
467 | texts.append("")
468 | except Exception as e:
469 | logger.error(f"Error extracting text directly from PDF: {str(e)}")
470 | raise ToolError(f"Failed to extract text directly from PDF: {str(e)}") from e
471 |
472 | else:
473 | logger.warning("No PDF text extraction library available (pdfplumber or PyMuPDF)")
474 | raise ToolError("No PDF text extraction library available. Install pdfplumber or PyMuPDF.")
475 |
476 | return texts, has_text
477 |
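# A sketch of the hybrid strategy these helpers enable: use the embedded
# text layer when one exists, otherwise rasterize and OCR each page
# ("document.pdf" is a hypothetical path).
def _example_hybrid(path: str = "document.pdf") -> List[str]:
    texts, has_text = _extract_text_from_pdf_direct(path)
    if has_text:
        return texts
    images = _convert_pdf_to_images(path, dpi=300)
    return [_extract_text_with_ocr(_preprocess_image(img)) for img in images]
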
478 | def _convert_pdf_to_images(file_path, start_page=0, max_pages=0, dpi=300):
479 | """
480 | Converts pages of a PDF file to PIL Image objects.
481 |
482 | Args:
483 | file_path: Path to the PDF file
484 | start_page: First page to convert (0-indexed)
485 | max_pages: Maximum number of pages to convert (0 = all)
486 | dpi: DPI for rendering (default: 300)
487 |
488 | Returns:
489 | List of PIL Image objects
490 | """
491 | if not HAS_PDF2IMAGE:
492 | raise ToolError("pdf2image is required for PDF to image conversion")
493 |
494 | try:
495 | # Create a temporary directory to store intermediate images
496 | # This helps with memory management for large PDFs
497 | with tempfile.TemporaryDirectory() as temp_dir:
498 | # pdf2image uses 1-based indexing
499 | first_page = start_page + 1
500 | last_page = None if max_pages == 0 else first_page + max_pages - 1
501 |
502 | # Use the temp directory for output_folder
503 | images = convert_from_path(
504 | file_path,
505 | dpi=dpi,
506 | first_page=first_page,
507 | last_page=last_page,
508 | output_folder=temp_dir
509 | )
510 |             [img.load() for img in images]  # force PIL's lazy file loads before temp_dir is deleted
511 |             return images
512 | except Exception as e:
513 | logger.error(f"PDF to image conversion failed: {str(e)}")
514 | raise ToolError(f"Failed to convert PDF to images: {str(e)}") from e
515 |
516 | def _convert_pdf_bytes_to_images(pdf_bytes, start_page=0, max_pages=0, dpi=300):
517 | """
518 | Converts pages of a PDF from bytes to PIL Image objects.
519 |
520 | Args:
521 | pdf_bytes: PDF content as bytes
522 | start_page: First page to convert (0-indexed)
523 | max_pages: Maximum number of pages to convert (0 = all)
524 | dpi: DPI for rendering (default: 300)
525 |
526 | Returns:
527 | List of PIL Image objects
528 | """
529 | if not HAS_PDF2IMAGE:
530 | raise ToolError("pdf2image is required for PDF to image conversion")
531 |
532 | try:
533 | # Create a temporary directory to store intermediate images
534 | # This helps with memory management for large PDFs
535 | with tempfile.TemporaryDirectory() as temp_dir:
536 | # pdf2image uses 1-based indexing
537 | first_page = start_page + 1
538 | last_page = None if max_pages == 0 else first_page + max_pages - 1
539 |
540 | # Use the temp directory for output_folder
541 | images = convert_from_bytes(
542 | pdf_bytes,
543 | dpi=dpi,
544 | first_page=first_page,
545 | last_page=last_page,
546 | output_folder=temp_dir
547 | )
548 |             [img.load() for img in images]  # force PIL's lazy file loads before temp_dir is deleted
549 |             return images
550 | except Exception as e:
551 | logger.error(f"PDF bytes to image conversion failed: {str(e)}")
552 | raise ToolError(f"Failed to convert PDF bytes to images: {str(e)}") from e
553 |
554 | def _generate_cache_key(data, prefix="ocr"):
555 | """Generate a cache key for the given data."""
556 | if isinstance(data, str) and os.path.exists(data):
557 | # For file paths, use mtime and size
558 | stat = os.stat(data)
559 | key_data = f"{data}:{stat.st_mtime}:{stat.st_size}"
560 | elif isinstance(data, Image.Image):
561 | # For PIL images, convert to bytes and hash
562 | img_bytes = io.BytesIO()
563 | data.save(img_bytes, format=data.format or 'PNG')
564 | key_data = img_bytes.getvalue()
565 | elif isinstance(data, dict):
566 | # For dictionaries, convert to JSON
567 | key_data = json.dumps(data, sort_keys=True)
568 | else:
569 | # For other data, use string representation
570 | key_data = str(data)
571 |
572 | # Generate hash
573 | h = hashlib.md5(key_data.encode() if isinstance(key_data, str) else key_data)
574 |
575 |     # Keep the key deterministic: adding a random component (e.g. a UUID)
576 |     # would make every generated key unique and defeat cache lookups.
577 | 
578 |     return f"{prefix}_{h.hexdigest()}"
579 |
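# A memoization sketch pairing _generate_cache_key with the module-level
# OCR_CACHE; because the key is deterministic, a repeated call for an
# unchanged file returns the cached page images.
def _example_cached_pages(path: str) -> List["Image.Image"]:
    key = _generate_cache_key(path, prefix="pdf_pages")
    if key not in OCR_CACHE:
        OCR_CACHE[key] = _convert_pdf_to_images(path)
    return OCR_CACHE[key]
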
580 | def _split_text_into_chunks(text, max_chunk_size=8000, overlap=200):
581 | """
582 | Splits text into chunks of specified maximum size with overlap.
583 |
584 | Args:
585 | text: Text to split
586 | max_chunk_size: Maximum chunk size in characters
587 | overlap: Overlap between chunks in characters
588 |
589 | Returns:
590 | List of text chunks
591 | """
592 | if not text:
593 | return []
594 |
595 | # Ensure reasonable values
596 | max_chunk_size = max(1000, min(max_chunk_size, 15000))
597 | overlap = max(50, min(overlap, max_chunk_size // 4))
598 |
599 | # Split by paragraphs first
600 | paragraphs = re.split(r'\n\s*\n', text)
601 |
602 | chunks = []
603 | current_chunk = []
604 | current_length = 0
605 |
606 | for paragraph in paragraphs:
607 | para_length = len(paragraph)
608 |
609 | if current_length + para_length <= max_chunk_size:
610 | # Paragraph fits in current chunk
611 | current_chunk.append(paragraph)
612 | current_length += para_length + 2 # +2 for the newlines
613 | else:
614 | # Paragraph doesn't fit
615 | if current_chunk:
616 | # Save current chunk
617 | chunks.append("\n\n".join(current_chunk))
618 |
619 | if para_length <= max_chunk_size:
620 | # Start new chunk with this paragraph
621 | current_chunk = [paragraph]
622 | current_length = para_length + 2
623 | else:
624 | # Paragraph too large, split into sentences
625 | sentences = re.split(r'(?<=[.!?])\s+', paragraph)
626 | current_chunk = []
627 | current_length = 0
628 |
629 | for sentence in sentences:
630 | sentence_length = len(sentence)
631 |
632 | if current_length + sentence_length <= max_chunk_size:
633 | # Sentence fits in current chunk
634 | current_chunk.append(sentence)
635 | current_length += sentence_length + 1 # +1 for the space
636 | else:
637 | # Sentence doesn't fit
638 | if current_chunk:
639 | # Save current chunk
640 | chunks.append(" ".join(current_chunk))
641 |
642 | if sentence_length <= max_chunk_size:
643 | # Start new chunk with this sentence
644 | current_chunk = [sentence]
645 | current_length = sentence_length + 1
646 | else:
647 | # Sentence too large, split by words
648 | words = sentence.split()
649 | current_chunk = []
650 | current_length = 0
651 | current_part = []
652 | part_length = 0
653 |
654 | for word in words:
655 | word_length = len(word)
656 |
657 | if part_length + word_length + 1 <= max_chunk_size:
658 | current_part.append(word)
659 | part_length += word_length + 1 # +1 for the space
660 | else:
661 | if current_part:
662 | chunks.append(" ".join(current_part))
663 | current_part = [word]
664 | part_length = word_length + 1
665 |
666 | if current_part:
667 | current_chunk = current_part
668 | current_length = part_length
669 |
670 | # Add the last chunk if it exists
671 | if current_chunk:
672 | chunks.append("\n\n".join(current_chunk) if len(current_chunk) > 1 else current_chunk[0])
673 |
674 | # Add overlap between chunks
675 | result = []
676 | prev_end = ""
677 |
678 | for i, chunk in enumerate(chunks):
679 | if i > 0 and prev_end:
680 | # Find a good overlap point (try to break at paragraph or sentence)
681 | overlap_text = prev_end
682 | if "\n\n" in overlap_text:
683 | parts = overlap_text.split("\n\n")
684 | if len(parts) > 1:
685 | overlap_text = parts[-1]
686 |
687 | # Prepend overlap to current chunk
688 | chunk = overlap_text + " " + chunk
689 |
690 | # Save end of current chunk for next iteration
691 | prev_end = chunk[-overlap:] if len(chunk) > overlap else chunk
692 |
693 | result.append(chunk)
694 |
695 | return result
696 |
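# A small usage sketch: chunk OCR output for LLM processing; consecutive
# chunks share roughly `overlap` characters at their boundaries so no
# sentence is lost at a split point.
def _example_chunking(full_text: str) -> None:
    chunks = _split_text_into_chunks(full_text, max_chunk_size=8000, overlap=200)
    for i, chunk in enumerate(chunks):
        logger.debug(f"chunk {i}: {len(chunk)} chars")
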
697 | def _detect_tables(image: Image.Image) -> List[Tuple[int, int, int, int]]:
698 | """
699 | Detects potential tables in an image.
700 |
701 | Args:
702 | image: PIL Image object
703 |
704 | Returns:
705 | List of detected table regions as (x, y, width, height) tuples
706 | """
707 | if not HAS_CV2 or not HAS_NUMPY:
708 | return []
709 |
710 | # Convert PIL Image to OpenCV format
711 | img = np.array(image)
712 | if len(img.shape) == 3 and img.shape[2] == 3:
713 | gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
714 | else:
715 | gray = img
716 |
717 | # Apply thresholding and morphological operations
718 | _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
719 |
720 | # Create a kernel for dilation
721 | kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
722 | dilated = cv2.dilate(thresh, kernel, iterations=5)
723 |
724 | # Find contours
725 | contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
726 |
727 | # Filter contours to find potential tables
728 | table_regions = []
729 | for contour in contours:
730 | x, y, w, h = cv2.boundingRect(contour)
731 |
732 | # Tables usually have a certain aspect ratio and size
733 | aspect_ratio = w / h
734 | area = w * h
735 | img_area = img.shape[0] * img.shape[1]
736 |
737 | if 0.5 <= aspect_ratio <= 3.0 and area > img_area * 0.05:
738 | table_regions.append((x, y, w, h))
739 |
740 | return table_regions
741 |
742 | def _crop_image(image: Image.Image, region: Tuple[int, int, int, int]) -> Image.Image:
743 | """
744 | Crops an image to the specified region.
745 |
746 | Args:
747 | image: PIL Image object
748 | region: Tuple of (x, y, width, height)
749 |
750 | Returns:
751 | Cropped PIL Image object
752 | """
753 | x, y, width, height = region
754 | return image.crop((x, y, x + width, y + height))
755 |
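# A sketch combining the two helpers above: crop every detected table
# region so it can be OCR'd separately from the body text.
def _example_table_crops(image: "Image.Image") -> List["Image.Image"]:
    return [_crop_image(image, region) for region in _detect_tables(image)]
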
756 | def _is_text_mostly_noise(text, noise_threshold=0.3):
757 | """Determine if extracted text is mostly noise based on character distribution."""
758 | if not text or len(text) < 10:
759 | return False
760 |
761 | # Calculate the ratio of non-alphanumeric and non-punctuation characters
762 | total_chars = len(text)
763 | valid_chars = sum(1 for c in text if c.isalnum() or c.isspace() or c in '.,;:!?"-\'()[]{}')
764 |
765 | noise_ratio = 1 - (valid_chars / total_chars)
766 | return noise_ratio > noise_threshold
767 |
768 | def _is_likely_header_or_footer(text, line_length_threshold=50):
769 | """Determine if a text line is likely a header or footer."""
770 | text = text.strip()
771 | if len(text) == 0:
772 | return False
773 |
774 | # Short lines with page numbers
775 | if len(text) < line_length_threshold and re.search(r'\b\d+\b', text):
776 | return True
777 |
778 | # Common header/footer patterns
779 | patterns = [
780 | r'^\d+$', # Just a page number
781 | r'^Page\s+\d+(\s+of\s+\d+)?$', # Page X of Y
782 | r'^[\w\s]+\s+\|\s+\d+$', # Title | Page
783 | r'^\w+\s+\d{1,2},?\s+\d{4}$', # Date format
784 | r'^Copyright', # Copyright notices
785 | r'^\w+\s+\d{1,2}(st|nd|rd|th)?,?\s+\d{4}$', # Date with ordinal
786 | r'^\d{1,2}/\d{1,2}/\d{2,4}$' # Date in MM/DD/YY format
787 | ]
788 |
789 | for pattern in patterns:
790 | if re.search(pattern, text, re.IGNORECASE):
791 | return True
792 |
793 | return False
794 |
795 | def _remove_headers_and_footers(text, max_line_length=70):
796 | """
797 | Removes headers and footers from text.
798 |
799 | Args:
800 | text: Text to process
801 | max_line_length: Maximum length for a line to be considered a header/footer
802 |
803 | Returns:
804 | Text with headers and footers removed
805 | """
806 | if not text:
807 | return text
808 |
809 | # Split text into lines
810 | lines = text.splitlines()
811 | result = []
812 |
813 | for _i, line in enumerate(lines):
814 | # Skip empty lines
815 | if not line.strip():
816 | result.append(line)
817 | continue
818 |
819 | # Check if line is likely a header or footer
820 | if len(line.strip()) <= max_line_length and _is_likely_header_or_footer(line):
821 | # Replace with empty line to maintain spacing
822 | result.append("")
823 | continue
824 |
825 | result.append(line)
826 |
827 | # Join lines back together
828 | return "\n".join(result)
829 |
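# A worked example of the header/footer heuristics above: the short
# page-number line is blanked (to preserve spacing) while body text is kept.
#
#     _remove_headers_and_footers("Page 3 of 10\nBody paragraph text.")
#     # -> "\nBody paragraph text."
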
830 | async def _process_text_chunk(chunk: str, reformat_as_markdown: bool = False, remove_headers: bool = False) -> str:
831 | """
832 | Processes a chunk of OCR text with LLM enhancement.
833 |
834 | Args:
835 | chunk: Text chunk to process
836 | reformat_as_markdown: Whether to format as markdown
837 | remove_headers: Whether to remove headers and footers
838 |
839 | Returns:
840 | Enhanced text chunk
841 | """
842 | if not chunk.strip():
843 | return ""
844 |
845 | # First apply simple rule-based fixes
846 | cleaned_text = chunk
847 |
848 | # Fix hyphenated words at line breaks
849 | cleaned_text = re.sub(r'(\w+)-\s*\n\s*(\w+)', lambda m: f"{m.group(1)}{m.group(2)}", cleaned_text)
850 |
851 | # Remove obvious noise
852 | if _is_text_mostly_noise(cleaned_text):
853 | logger.warning("Text chunk appears to be mostly noise, applying aggressive cleaning")
854 | # Replace unusual characters with spaces
855 | cleaned_text = re.sub(r'[^\w\s.,;:!?"\'\(\)\[\]\{\}-]', ' ', cleaned_text)
856 | # Normalize spaces
857 | cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
858 |
859 | # Remove headers and footers if requested
860 | if remove_headers:
861 | cleaned_text = _remove_headers_and_footers(cleaned_text)
862 |
863 | # Prepare LLM enhancement prompt
864 | if reformat_as_markdown:
865 | prompt = f"""Correct OCR errors in this text and format it as markdown. Follow these instructions:
866 |
867 | 1. Fix OCR-induced errors:
868 | - Correct words split across line breaks (e.g., "cor- rect" → "correct")
869 | - Fix typos like 'rn' misread as 'm', '0' misread as 'O', etc.
870 | - Merge split paragraphs but preserve intentional paragraph breaks
871 | - Use context and common sense to correct errors
872 |
873 | 2. Format as markdown:
874 | - Convert headings to markdown headings (# for main title, ## for subtitles, etc.)
875 | - Format lists as proper markdown lists
876 | - Use emphasis (*italic*) and strong (**bold**) where appropriate
877 | - Create tables using markdown syntax if tabular data is detected
878 | - For code or equations, use appropriate markdown formatting
879 |
880 | 3. Clean up formatting:
881 | - Remove unnecessary line breaks within paragraphs
882 | - Preserve paragraph structure
883 | - Remove duplicated text
884 | - {"Remove headers, footers, and page numbers" if remove_headers else "Preserve all content including headers/footers"}
885 |
886 | 4. Preserve the original content's meaning and information.
887 |
888 | Here is the text to correct and format:
889 |
890 | ```
891 | {cleaned_text}
892 | ```
893 |
894 | Provide ONLY the corrected markdown text with no explanations or comments.
895 | """
896 | else:
897 | prompt = f"""Correct OCR errors in this text. Follow these instructions:
898 |
899 | 1. Fix OCR-induced errors:
900 | - Correct words split across line breaks (e.g., "cor- rect" → "correct")
901 | - Fix typos like 'rn' misread as 'm', '0' misread as 'O', etc.
902 | - Merge split paragraphs but preserve intentional paragraph breaks
903 | - Use context and common sense to correct errors
904 |
905 | 2. Clean up formatting:
906 | - Remove unnecessary line breaks within paragraphs
907 | - Preserve paragraph structure
908 | - Remove duplicated text
909 | - {"Remove headers, footers, and page numbers" if remove_headers else "Preserve all content including headers/footers"}
910 |
911 | 3. Preserve the original content's meaning and information.
912 |
913 | Here is the text to correct:
914 |
915 | ```
916 | {cleaned_text}
917 | ```
918 |
919 | Provide ONLY the corrected text with no explanations or comments.
920 | """
921 |
922 | try:
923 | # Use generate_completion to process the text
924 | task_type = TaskType.TEXT_ENHANCEMENT.value
925 |
926 | result = await generate_completion(
927 | prompt=prompt,
928 | provider=Provider.ANTHROPIC.value, # Default to Anthropic for high-quality text processing
929 | temperature=0.2, # Low temperature for consistent results
930 |             max_tokens=len(cleaned_text) + 1000,  # character count serves as a generous upper bound on output tokens, with room for formatting
931 | task_type=task_type
932 | )
933 |
934 | if not result or not result.get("text"):
935 | logger.warning("LLM text enhancement returned empty result")
936 | return cleaned_text
937 |
938 | enhanced_text = result["text"]
939 |
940 | # Remove any "Here is the corrected..." prefixes that LLMs sometimes add
941 | enhanced_text = re.sub(r'^(Here is|The corrected|Here\'s)[^:]*:?\s*', '', enhanced_text, flags=re.IGNORECASE)
942 |
943 | return enhanced_text
944 | except ProviderError as e:
945 | logger.error(f"Provider error during text enhancement: {str(e)}")
946 | # Fall back to the cleaned text
947 | return cleaned_text
948 | except Exception as e:
949 | logger.error(f"Error during LLM text enhancement: {str(e)}")
950 | # Fall back to the cleaned text
951 | return cleaned_text
952 |
953 | # --- Main OCR tool functions ---
954 |
955 | @with_cache(ttl=24 * 60 * 60) # Cache for 24 hours
956 | @with_tool_metrics
957 | @with_retry(max_retries=3, retry_delay=1)
958 | @with_error_handling
959 | async def extract_text_from_pdf(
960 | file_path: str,
961 | extraction_method: str = "hybrid",
962 | max_pages: int = 0,
963 | skip_pages: int = 0,
964 | preprocessing_options: Optional[Dict[str, Any]] = None,
965 | ocr_language: str = "eng",
966 | reformat_as_markdown: bool = False,
967 | suppress_headers: bool = False,
968 | assess_quality: bool = False,
969 | dpi: int = 300
970 | ) -> Dict[str, Any]:
971 | """
972 | Extracts and enhances text from a PDF document.
973 |
974 | This tool can use multiple extraction methods: direct text extraction from the PDF,
975 | OCR-based extraction, or a hybrid approach that uses direct extraction when possible
976 | and falls back to OCR when necessary. The extracted text is then enhanced using an
977 | LLM to correct OCR errors and optionally format the output as markdown.
978 |
979 | Args:
980 | file_path: Path to the PDF file
981 | extraction_method: Method to use for text extraction:
982 | - "direct": Extract text directly from the PDF (fastest, but may fail for scanned PDFs)
983 | - "ocr": Always use OCR (slower but works for scanned PDFs)
984 | - "hybrid": Try direct extraction first, fall back to OCR if needed (default)
985 | max_pages: Maximum number of pages to process (0 = all pages)
986 |         skip_pages: Number of pages to skip from the beginning (default: 0)
987 | preprocessing_options: Dictionary of options for image preprocessing:
988 | - denoise: Whether to apply denoising (default: True)
989 | - threshold: Thresholding method ('otsu', 'adaptive', 'none') (default: 'otsu')
990 | - deskew: Whether to deskew the image (default: True)
991 | - enhance_contrast: Whether to enhance contrast (default: True)
992 | - resize_factor: Factor to resize the image (default: 1.0)
993 | ocr_language: Language(s) for OCR, e.g., "eng" or "eng+fra" (default: "eng")
994 | reformat_as_markdown: Whether to format the output as markdown (default: False)
995 | suppress_headers: Whether to remove headers, footers, and page numbers (default: False)
996 | assess_quality: Whether to assess the quality of the OCR improvement (default: False)
997 | dpi: DPI for PDF rendering when using OCR (default: 300)
998 |
999 | Returns:
1000 | A dictionary containing:
1001 | {
1002 | "success": true,
1003 | "text": "The extracted and enhanced text...",
1004 | "raw_text": "The original OCR text before enhancement...",
1005 | "pages_processed": 5,
1006 | "extraction_method_used": "hybrid",
1007 | "file_path": "/path/to/document.pdf",
1008 | "quality_metrics": { # Only if assess_quality=True
1009 | "score": 85,
1010 | "explanation": "Explanation of quality score..."
1011 | },
1012 | "processing_time": 12.34 # Seconds
1013 | }
1014 |
1015 | Raises:
1016 | ToolInputError: If the file path is invalid or the file is not a PDF
1017 | ToolError: If text extraction fails
1018 | """
1019 | start_time = time.time()
1020 |
1021 | # Validate file path
1022 | _validate_file_path(file_path, expected_extension=".pdf")
1023 |
1024 | # Check extraction method
1025 | valid_methods = ["direct", "ocr", "hybrid"]
1026 | if extraction_method not in valid_methods:
1027 | raise ToolInputError(
1028 | f"Invalid extraction method: '{extraction_method}'. Must be one of: {', '.join(valid_methods)}"
1029 | )
1030 |
1031 | # Check dependencies based on extraction method
1032 | if extraction_method in ["ocr", "hybrid"]:
1033 | if not HAS_PDF2IMAGE or not HAS_PYTESSERACT:
1034 | logger.warning(f"OCR extraction requires pdf2image and pytesseract. {extraction_method} may fail.")
1035 |
1036 | if extraction_method in ["direct", "hybrid"]:
1037 | if not HAS_PDFPLUMBER and not HAS_PYMUPDF:
1038 | logger.warning("Direct extraction requires pdfplumber or PyMuPDF.")
1039 |
1040 | # Initialize result
1041 | result = {
1042 | "success": False,
1043 | "file_path": file_path,
1044 | "pages_processed": 0,
1045 | "extraction_method_used": extraction_method
1046 | }
1047 |
1048 | method_used = extraction_method
1049 | raw_text_list = []
1050 | extracted_text_list = []
1051 | has_direct_text = False
1052 |
1053 | try:
1054 | # Step 1: Extract text
1055 | if extraction_method in ["direct", "hybrid"]:
1056 | try:
1057 | logger.info(f"Attempting direct text extraction from PDF: {file_path}")
1058 | direct_text_list, has_direct_text = _extract_text_from_pdf_direct(
1059 | file_path,
1060 | start_page=skip_pages,
1061 | max_pages=max_pages
1062 | )
1063 |
1064 | raw_text_list = direct_text_list
1065 | logger.info(f"Direct text extraction {'succeeded' if has_direct_text else 'failed'}")
1066 |
1067 | if has_direct_text and extraction_method == "direct":
1068 | # If direct extraction found text and that's the requested method, we're done
1069 | method_used = "direct"
1070 | extracted_text_list = direct_text_list
1071 | logger.info(f"Using direct extraction result with {len(extracted_text_list)} pages")
1072 |
1073 | elif has_direct_text and extraction_method == "hybrid":
1074 | # If hybrid mode and direct extraction worked, use it
1075 | method_used = "direct"
1076 | extracted_text_list = direct_text_list
1077 | logger.info(f"Using direct extraction result with {len(extracted_text_list)} pages (hybrid mode)")
1078 |
1079 | elif extraction_method == "direct" and not has_direct_text:
1080 | # If direct mode but no text found, we fail
1081 | raise ToolError("Direct text extraction failed to find text in the PDF")
1082 |
1083 | # If hybrid mode and no text found, fall back to OCR
1084 | if extraction_method == "hybrid" and not has_direct_text:
1085 | logger.info("No text found via direct extraction, falling back to OCR (hybrid mode)")
1086 | method_used = "ocr"
1087 | # Continue to OCR extraction below
1088 |
1089 | except Exception as e:
1090 | logger.error(f"Direct text extraction failed: {str(e)}")
1091 | if extraction_method == "direct":
1092 | raise ToolError(f"Direct text extraction failed: {str(e)}") from e
1093 |
1094 | logger.info("Falling back to OCR extraction")
1095 | method_used = "ocr"
1096 |
1097 | # Step 2: OCR extraction if needed
1098 | if method_used == "ocr" or extraction_method == "ocr":
1099 | method_used = "ocr"
1100 | logger.info(f"Performing OCR-based text extraction on PDF: {file_path}")
1101 |
1102 | # Convert PDF to images
1103 | images = _convert_pdf_to_images(
1104 | file_path,
1105 | start_page=skip_pages,
1106 | max_pages=max_pages,
1107 | dpi=dpi
1108 | )
1109 |
1110 | # Extract text using OCR
1111 | raw_text_list = []
1112 | with ThreadPoolExecutor() as executor:
1113 | # Preprocess images in parallel
1114 | preprocessed_images = list(executor.map(
1115 | lambda img: _preprocess_image(img, preprocessing_options),
1116 | images
1117 | ))
1118 |
1119 | # Extract text in parallel
1120 | ocr_config = ""
1121 | ocr_results = list(executor.map(
1122 | lambda img: _extract_text_with_ocr(img, ocr_language, ocr_config),
1123 | preprocessed_images
1124 | ))
1125 |
1126 | extracted_text_list = ocr_results
1127 | raw_text_list = ocr_results
1128 | logger.info(f"OCR extraction completed for {len(extracted_text_list)} pages")
1129 |
1130 | # Step 3: Process extracted text
1131 | logger.info("Processing extracted text with LLM enhancement")
1132 |
1133 | # Combine text from pages
1134 | full_raw_text = "\n\n".join(raw_text_list)
1135 |
1136 | # Split into chunks for LLM processing
1137 | chunks = _split_text_into_chunks(full_raw_text)
1138 | logger.info(f"Text split into {len(chunks)} chunks for LLM processing")
1139 |
1140 | # Process chunks in parallel
1141 | enhanced_chunks = await asyncio.gather(*[
1142 | _process_text_chunk(chunk, reformat_as_markdown, suppress_headers)
1143 | for chunk in chunks
1144 | ])
1145 |
1146 | # Combine chunks
1147 | enhanced_text = "\n\n".join(enhanced_chunks)
1148 |
1149 | # Step 4: Assess quality if requested
1150 | quality_metrics = None
1151 | if assess_quality:
1152 | logger.info("Assessing quality of text enhancement")
1153 | quality_metrics = await _assess_text_quality(full_raw_text, enhanced_text)
1154 |
1155 | # Prepare final result
1156 | processing_time = time.time() - start_time
1157 | result.update({
1158 | "success": True,
1159 | "text": enhanced_text,
1160 | "raw_text": full_raw_text,
1161 | "pages_processed": len(raw_text_list),
1162 | "extraction_method_used": method_used,
1163 | "processing_time": processing_time
1164 | })
1165 |
1166 | if quality_metrics:
1167 | result["quality_metrics"] = quality_metrics
1168 |
1169 | logger.info(f"Text extraction and enhancement completed successfully in {processing_time:.2f}s")
1170 | return result
1171 |
1172 | except Exception as e:
1173 | logger.error(f"Error in extract_text_from_pdf: {str(e)}")
1174 | logger.error(traceback.format_exc())
1175 | raise ToolError(f"Failed to extract and enhance text from PDF: {str(e)}") from e
1176 |
1177 | @with_cache(ttl=24 * 60 * 60) # Cache for 24 hours
1178 | @with_tool_metrics
1179 | @with_retry(max_retries=3, retry_delay=1)
1180 | @with_error_handling
1181 | async def extract_text_from_pdf_bytes(
1182 | pdf_bytes: bytes,
1183 | extraction_method: str = "hybrid",
1184 | max_pages: int = 0,
1185 | skip_pages: int = 0,
1186 | preprocessing_options: Optional[Dict[str, Any]] = None,
1187 | ocr_language: str = "eng",
1188 | reformat_as_markdown: bool = False,
1189 | suppress_headers: bool = False,
1190 | assess_quality: bool = False,
1191 | dpi: int = 300
1192 | ) -> Dict[str, Any]:
1193 | """
1194 | Extracts and enhances text from PDF bytes data.
1195 |
1196 | This tool works like extract_text_from_pdf but accepts PDF data as bytes instead of a file path.
1197 | It can use multiple extraction methods and enhance the extracted text using an LLM.
1198 |
1199 | Args:
1200 | pdf_bytes: PDF content as bytes
1201 | extraction_method: Method to use for text extraction:
1202 | - "direct": Extract text directly from the PDF (fastest, but may fail for scanned PDFs)
1203 | - "ocr": Always use OCR (slower but works for scanned PDFs)
1204 | - "hybrid": Try direct extraction first, fall back to OCR if needed (default)
1205 | max_pages: Maximum number of pages to process (0 = all pages)
1206 |         skip_pages: Number of pages to skip from the beginning (default: 0)
1207 | preprocessing_options: Dictionary of options for image preprocessing
1208 | ocr_language: Language(s) for OCR, e.g., "eng" or "eng+fra" (default: "eng")
1209 | reformat_as_markdown: Whether to format the output as markdown (default: False)
1210 | suppress_headers: Whether to remove headers, footers, and page numbers (default: False)
1211 | assess_quality: Whether to assess the quality of the OCR improvement (default: False)
1212 | dpi: DPI for PDF rendering when using OCR (default: 300)
1213 |
1214 | Returns:
1215 | A dictionary with the extracted and enhanced text, same format as extract_text_from_pdf
1216 |
1217 | Raises:
1218 | ToolInputError: If the PDF bytes are invalid
1219 | ToolError: If text extraction fails
1220 | """
1221 | start_time = time.time()
1222 |
1223 | # Validate input
1224 | if not pdf_bytes:
1225 | raise ToolInputError("PDF bytes cannot be empty")
1226 |
1227 | # Check extraction method
1228 | valid_methods = ["direct", "ocr", "hybrid"]
1229 | if extraction_method not in valid_methods:
1230 | raise ToolInputError(
1231 | f"Invalid extraction method: '{extraction_method}'. Must be one of: {', '.join(valid_methods)}"
1232 | )
1233 |
1234 | # Check dependencies based on extraction method
1235 | if extraction_method in ["ocr", "hybrid"]:
1236 | if not HAS_PDF2IMAGE or not HAS_PYTESSERACT:
1237 | logger.warning(f"OCR extraction requires pdf2image and pytesseract. {extraction_method} may fail.")
1238 |
1239 | if extraction_method in ["direct", "hybrid"]:
1240 | if not HAS_PDFPLUMBER and not HAS_PYMUPDF:
1241 | logger.warning("Direct extraction requires pdfplumber or PyMuPDF.")
1242 |
1243 | # Initialize result
1244 | result = {
1245 | "success": False,
1246 | "pages_processed": 0,
1247 | "extraction_method_used": extraction_method
1248 | }
1249 |
1250 | method_used = extraction_method
1251 | raw_text_list = []
1252 | extracted_text_list = []
1253 | has_direct_text = False
1254 |
1255 | try:
1256 | # Create a temporary file for processing
1257 | with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
1258 | temp_path = temp_pdf.name
1259 | temp_pdf.write(pdf_bytes)
1260 | temp_pdf.flush()
1261 |
1262 | try:
1263 | # Step 1: Extract text
1264 | if extraction_method in ["direct", "hybrid"]:
1265 | try:
1266 | logger.info("Attempting direct text extraction from PDF bytes")
1267 | direct_text_list, has_direct_text = _extract_text_from_pdf_direct(
1268 | temp_path,
1269 | start_page=skip_pages,
1270 | max_pages=max_pages
1271 | )
1272 |
1273 | raw_text_list = direct_text_list
1274 | logger.info(f"Direct text extraction {'succeeded' if has_direct_text else 'failed'}")
1275 |
1276 | if has_direct_text and extraction_method == "direct":
1277 | method_used = "direct"
1278 | extracted_text_list = direct_text_list
1279 | logger.info(f"Using direct extraction result with {len(extracted_text_list)} pages")
1280 |
1281 | elif has_direct_text and extraction_method == "hybrid":
1282 | method_used = "direct"
1283 | extracted_text_list = direct_text_list
1284 | logger.info(f"Using direct extraction result with {len(extracted_text_list)} pages (hybrid mode)")
1285 |
1286 | elif extraction_method == "direct" and not has_direct_text:
1287 | raise ToolError("Direct text extraction failed to find text in the PDF")
1288 |
1289 | if extraction_method == "hybrid" and not has_direct_text:
1290 | logger.info("No text found via direct extraction, falling back to OCR (hybrid mode)")
1291 | method_used = "ocr"
1292 |
1293 | except Exception as e:
1294 | logger.error(f"Direct text extraction failed: {str(e)}")
1295 | if extraction_method == "direct":
1296 | raise ToolError(f"Direct text extraction failed: {str(e)}") from e
1297 |
1298 | logger.info("Falling back to OCR extraction")
1299 | method_used = "ocr"
1300 |
1301 | # Step 2: OCR extraction if needed
1302 | if method_used == "ocr" or extraction_method == "ocr":
1303 | method_used = "ocr"
1304 | logger.info("Performing OCR-based text extraction on PDF bytes")
1305 |
1306 | # Convert PDF bytes to images
1307 | images = _convert_pdf_bytes_to_images(
1308 | pdf_bytes,
1309 | start_page=skip_pages,
1310 | max_pages=max_pages,
1311 | dpi=dpi
1312 | )
1313 |
1314 | # Extract text using OCR
1315 | raw_text_list = []
1316 | with ThreadPoolExecutor() as executor:
1317 | # Preprocess images in parallel
1318 | preprocessed_images = list(executor.map(
1319 | lambda img: _preprocess_image(img, preprocessing_options),
1320 | images
1321 | ))
1322 |
1323 | # Extract text in parallel
1324 | ocr_config = ""
1325 | ocr_results = list(executor.map(
1326 | lambda img: _extract_text_with_ocr(img, ocr_language, ocr_config),
1327 | preprocessed_images
1328 | ))
1329 |
1330 | extracted_text_list = ocr_results
1331 | raw_text_list = ocr_results
1332 | logger.info(f"OCR extraction completed for {len(extracted_text_list)} pages")
1333 |
1334 | # Step 3: Process extracted text
1335 | logger.info("Processing extracted text with LLM enhancement")
1336 |
1337 | # Combine text from pages
1338 | full_raw_text = "\n\n".join(raw_text_list)
1339 |
1340 | # Split into chunks for LLM processing
1341 | chunks = _split_text_into_chunks(full_raw_text)
1342 | logger.info(f"Text split into {len(chunks)} chunks for LLM processing")
1343 |
1344 | # Process chunks in parallel
1345 | enhanced_chunks = await asyncio.gather(*[
1346 | _process_text_chunk(chunk, reformat_as_markdown, suppress_headers)
1347 | for chunk in chunks
1348 | ])
1349 |
1350 | # Combine chunks
1351 | enhanced_text = "\n\n".join(enhanced_chunks)
1352 |
1353 | # Step 4: Assess quality if requested
1354 | quality_metrics = None
1355 | if assess_quality:
1356 | logger.info("Assessing quality of text enhancement")
1357 | quality_metrics = await _assess_text_quality(full_raw_text, enhanced_text)
1358 |
1359 | # Prepare final result
1360 | processing_time = time.time() - start_time
1361 | result.update({
1362 | "success": True,
1363 | "text": enhanced_text,
1364 | "raw_text": full_raw_text,
1365 | "pages_processed": len(raw_text_list),
1366 | "extraction_method_used": method_used,
1367 | "processing_time": processing_time
1368 | })
1369 |
1370 | if quality_metrics:
1371 | result["quality_metrics"] = quality_metrics
1372 |
1373 | logger.info(f"Text extraction and enhancement completed successfully in {processing_time:.2f}s")
1374 | return result
1375 |
1376 | finally:
1377 | # Clean up temporary file
1378 | try:
1379 | os.unlink(temp_path)
1380 | except Exception as e:
1381 | logger.warning(f"Failed to remove temporary file: {str(e)}")
1382 |
1383 | except Exception as e:
1384 | logger.error(f"Error in extract_text_from_pdf_bytes: {str(e)}")
1385 | logger.error(traceback.format_exc())
1386 | raise ToolError(f"Failed to extract and enhance text from PDF bytes: {str(e)}") from e
1387 |
1388 | @with_cache(ttl=24 * 60 * 60) # Cache for 24 hours
1389 | @with_tool_metrics
1390 | @with_retry(max_retries=2, retry_delay=1)
1391 | @with_error_handling
1392 | async def process_image_ocr(
1393 | image_path: Optional[str] = None,
1394 | image_data: Optional[str] = None,
1395 | preprocessing_options: Optional[Dict[str, Any]] = None,
1396 | ocr_language: str = "eng",
1397 | reformat_as_markdown: bool = False,
1398 | assess_quality: bool = False
1399 | ) -> Dict[str, Any]:
1400 | """
1401 | Processes an image with OCR and enhances the extracted text.
1402 |
1403 | This tool accepts either a path to an image file or base64-encoded image data,
1404 | performs OCR on the image, and then enhances the extracted text using an LLM.
1405 |
1406 | Args:
1407 | image_path: Path to the image file (mutually exclusive with image_data)
1408 | image_data: Base64-encoded image data (mutually exclusive with image_path)
1409 | preprocessing_options: Dictionary of options for image preprocessing:
1410 | - denoise: Whether to apply denoising (default: True)
1411 | - threshold: Thresholding method ('otsu', 'adaptive', 'none') (default: 'otsu')
1412 | - deskew: Whether to deskew the image (default: True)
1413 | - enhance_contrast: Whether to enhance contrast (default: True)
1414 | - resize_factor: Factor to resize the image (default: 1.0)
1415 | ocr_language: Language(s) for OCR, e.g., "eng" or "eng+fra" (default: "eng")
1416 | reformat_as_markdown: Whether to format the output as markdown (default: False)
1417 | assess_quality: Whether to assess the quality of the OCR improvement (default: False)
1418 |
1419 | Returns:
1420 | A dictionary containing:
1421 | {
1422 | "success": true,
1423 | "text": "The extracted and enhanced text...",
1424 | "raw_text": "The original OCR text before enhancement...",
1425 | "table_detected": false, # Whether a table was detected in the image
1426 | "quality_metrics": { # Only if assess_quality=True
1427 | "score": 85,
1428 | "explanation": "Explanation of quality score..."
1429 | },
1430 | "processing_time": 3.45 # Seconds
1431 | }
1432 |
1433 | Raises:
1434 | ToolInputError: If input is invalid
1435 | ToolError: If processing fails
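
    Example (illustrative sketch; the path and option values are hypothetical):
        result = await process_image_ocr(
            image_path="/path/to/scanned_page.png",
            preprocessing_options={"denoise": True, "threshold": "adaptive", "deskew": True},
            ocr_language="eng",
            reformat_as_markdown=True,
        )
        print(result["text"])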
1436 | """
1437 | start_time = time.time()
1438 |
1439 | # Check dependencies
1440 | if not HAS_PIL or not HAS_PYTESSERACT:
1441 | missing = []
1442 | if not HAS_PIL:
1443 | missing.append("pillow")
1444 | if not HAS_PYTESSERACT:
1445 | missing.append("pytesseract")
1446 | raise ToolError(f"Required dependencies missing: {', '.join(missing)}")
1447 |
1448 | # Validate input
1449 | if not image_path and not image_data:
1450 | raise ToolInputError("Either image_path or image_data must be provided")
1451 |
1452 | if image_path and image_data:
1453 | raise ToolInputError("Only one of image_path or image_data should be provided")
1454 |
1455 | try:
1456 | # Load image
1457 | if image_path:
1458 | _validate_file_path(image_path)
1459 | image = Image.open(image_path)
1460 | else:
1461 | # Decode base64 image data
1462 | try:
1463 | image_bytes = base64.b64decode(image_data)
1464 | image = Image.open(io.BytesIO(image_bytes))
1465 | except Exception as e:
1466 | raise ToolInputError(f"Invalid base64 image data: {str(e)}") from e
1467 |
1468 | # Preprocess image
1469 | logger.info("Preprocessing image for OCR")
1470 | preprocessed_image = _preprocess_image(image, preprocessing_options)
1471 |
1472 | # Detect tables
1473 | table_regions = _detect_tables(preprocessed_image)
1474 | table_detected = len(table_regions) > 0
1475 | logger.info(f"Table detection: {len(table_regions)} potential tables found")
1476 |
1477 | # Extract text with OCR
1478 | logger.info(f"Performing OCR with language(s): {ocr_language}")
1479 | raw_text = _extract_text_with_ocr(preprocessed_image, ocr_language)
1480 |
1481 | # Process tables separately if detected
1482 | table_texts = []
1483 | if table_detected and HAS_CV2:
1484 | logger.info("Processing detected tables separately")
1485 | for i, region in enumerate(table_regions):
1486 | try:
1487 | table_image = _crop_image(preprocessed_image, region)
1488 | # Use a different preprocessing for tables (less aggressive)
1489 | table_options = {"denoise": True, "threshold": "adaptive", "deskew": False}
1490 | processed_table_image = _preprocess_image(table_image, table_options)
1491 | table_text = _extract_text_with_ocr(processed_table_image, ocr_language)
1492 | if table_text.strip():
1493 | table_texts.append(f"Table {i+1}:\n{table_text}")
1494 | except Exception as e:
1495 | logger.warning(f"Error processing table {i+1}: {str(e)}")
1496 |
1497 | # Include table texts with the main text
1498 | if table_texts:
1499 | raw_text += "\n\n" + "\n\n".join(table_texts)
1500 |
1501 | # Process with LLM
1502 | logger.info("Processing extracted text with LLM enhancement")
1503 | enhanced_text = await _process_text_chunk(raw_text, reformat_as_markdown, suppress_headers=False)
1504 |
1505 | # Assess quality if requested
1506 | quality_metrics = None
1507 | if assess_quality:
1508 | logger.info("Assessing quality of text enhancement")
1509 | quality_metrics = await _assess_text_quality(raw_text, enhanced_text)
1510 |
1511 | # Prepare result
1512 | processing_time = time.time() - start_time
1513 | result = {
1514 | "success": True,
1515 | "text": enhanced_text,
1516 | "raw_text": raw_text,
1517 | "table_detected": table_detected,
1518 | "processing_time": processing_time
1519 | }
1520 |
1521 | if quality_metrics:
1522 | result["quality_metrics"] = quality_metrics
1523 |
1524 | logger.info(f"Image OCR processing completed in {processing_time:.2f}s")
1525 | return result
1526 |
1527 | except Exception as e:
1528 | logger.error(f"Error in process_image_ocr: {str(e)}")
1529 | logger.error(traceback.format_exc())
1530 | raise ToolError(f"Failed to process image with OCR: {str(e)}") from e
1531 |
1532 | @with_cache(ttl=24 * 60 * 60) # Cache for 24 hours
1533 | @with_tool_metrics
1534 | @with_retry(max_retries=2, retry_delay=1)
1535 | @with_error_handling
1536 | async def enhance_ocr_text(
1537 | ocr_text: str,
1538 | reformat_as_markdown: bool = False,
1539 | remove_headers: bool = False,
1540 | detect_tables: bool = True,
1541 | assess_quality: bool = False
1542 | ) -> Dict[str, Any]:
1543 | """
1544 | Enhances existing OCR text using an LLM to correct errors and improve formatting.
1545 |
1546 | This tool takes OCR text (e.g., from a different OCR engine) and uses an LLM to
1547 | correct errors, improve formatting, and optionally convert to markdown.
1548 |
1549 | Args:
1550 | ocr_text: The OCR text to enhance
1551 | reformat_as_markdown: Whether to format the output as markdown (default: False)
1552 | remove_headers: Whether to remove headers, footers, and page numbers (default: False)
1553 | detect_tables: Whether to attempt to detect and format tables (default: True)
1554 | assess_quality: Whether to assess the quality of the OCR improvement (default: False)
1555 |
1556 | Returns:
1557 | A dictionary containing:
1558 | {
1559 | "success": true,
1560 | "text": "The enhanced text...",
1561 | "raw_text": "The original OCR text...",
1562 | "quality_metrics": { # Only if assess_quality=True
1563 | "score": 85,
1564 | "explanation": "Explanation of quality score..."
1565 | },
1566 | "processing_time": 2.34 # Seconds
1567 | }
1568 |
1569 | Raises:
1570 | ToolInputError: If the OCR text is empty
1571 | ToolError: If enhancement fails
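
    Example (illustrative sketch; the input string is hypothetical):
        raw = "Tbe quick hrown fox jumps ovcr the lazy dog.\nPage 1 of 10"
        result = await enhance_ocr_text(raw, reformat_as_markdown=True, remove_headers=True)
        print(result["text"])  # Corrected text, with the page footer removed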
1572 | """
1573 | start_time = time.time()
1574 |
1575 | # Validate input
1576 | if not ocr_text or not isinstance(ocr_text, str):
1577 | raise ToolInputError("OCR text must be a non-empty string")
1578 |
1579 | try:
1580 | # Split into chunks if large
1581 | if len(ocr_text) > 10000:
1582 | logger.info(f"Splitting large OCR text ({len(ocr_text)} chars) into chunks")
1583 | chunks = _split_text_into_chunks(ocr_text)
1584 |
1585 | # Process chunks in parallel
1586 | enhanced_chunks = await asyncio.gather(*[
1587 | _process_text_chunk(chunk, reformat_as_markdown, remove_headers)
1588 | for chunk in chunks
1589 | ])
1590 |
1591 | # Combine chunks
1592 | enhanced_text = "\n\n".join(enhanced_chunks)
1593 | logger.info(f"Processed {len(chunks)} text chunks")
1594 | else:
1595 | # Process directly if small enough
1596 | enhanced_text = await _process_text_chunk(ocr_text, reformat_as_markdown, remove_headers)
1597 |
1598 | # Detect and format tables if requested
1599 | if detect_tables and reformat_as_markdown:
1600 | logger.info("Attempting table detection and formatting")
1601 | enhanced_text = await _format_tables_in_text(enhanced_text)
1602 |
1603 | # Assess quality if requested
1604 | quality_metrics = None
1605 | if assess_quality:
1606 | logger.info("Assessing quality of text enhancement")
1607 | quality_metrics = await _assess_text_quality(ocr_text, enhanced_text)
1608 |
1609 | # Prepare result
1610 | processing_time = time.time() - start_time
1611 | result = {
1612 | "success": True,
1613 | "text": enhanced_text,
1614 | "raw_text": ocr_text,
1615 | "processing_time": processing_time
1616 | }
1617 |
1618 | if quality_metrics:
1619 | result["quality_metrics"] = quality_metrics
1620 |
1621 | logger.info(f"OCR text enhancement completed in {processing_time:.2f}s")
1622 | return result
1623 |
1624 | except Exception as e:
1625 | logger.error(f"Error in enhance_ocr_text: {str(e)}")
1626 | logger.error(traceback.format_exc())
1627 | raise ToolError(f"Failed to enhance OCR text: {str(e)}") from e
1628 |
1629 | @with_tool_metrics
1630 | @with_retry(max_retries=2, retry_delay=1.0)
1631 | @with_error_handling
1632 | async def analyze_pdf_structure(
1633 | file_path: str,
1634 | extract_metadata: bool = True,
1635 | extract_outline: bool = True,
1636 | extract_fonts: bool = False,
1637 | extract_images: bool = False,
1638 | estimate_ocr_needs: bool = True
1639 | ) -> Dict[str, Any]:
1640 | """
1641 | Analyzes the structure of a PDF file without performing full text extraction.
1642 |
1643 | This tool examines a PDF file and provides information about its structure,
1644 | including metadata, outline (table of contents), fonts, embedded images,
1645 | and an assessment of whether OCR would be beneficial.
1646 |
1647 | Args:
1648 | file_path: Path to the PDF file
1649 | extract_metadata: Whether to extract document metadata (default: True)
1650 | extract_outline: Whether to extract the document outline/TOC (default: True)
1651 | extract_fonts: Whether to extract font information (default: False)
1652 | extract_images: Whether to extract information about embedded images (default: False)
1653 | estimate_ocr_needs: Whether to estimate if OCR would benefit this PDF (default: True)
1654 |
1655 | Returns:
1656 | A dictionary containing:
1657 | {
1658 | "success": true,
1659 | "file_path": "/path/to/document.pdf",
1660 | "page_count": 42,
1661 | "metadata": { # Only if extract_metadata=True
1662 | "title": "Document Title",
1663 | "author": "Author Name",
1664 | "subject": "Document Subject",
1665 | "keywords": "keyword1, keyword2",
1666 | "creator": "Creator Application",
1667 | "producer": "Producer Application",
1668 | "creation_date": "2023-01-01T12:00:00",
1669 | "modification_date": "2023-02-01T13:00:00"
1670 | },
1671 | "outline": [ # Only if extract_outline=True
1672 | {
1673 | "title": "Chapter 1",
1674 | "page": 5,
1675 | "children": [
1676 | {"title": "Section 1.1", "page": 6, "children": []}
1677 | ]
1678 | },
1679 | {"title": "Chapter 2", "page": 15, "children": []}
1680 | ],
1681 | "font_info": { # Only if extract_fonts=True
1682 | "total_fonts": 3,
1683 | "embedded_fonts": 2,
1684 | "font_names": ["Arial", "Times New Roman", "Courier"]
1685 | },
1686 | "image_info": { # Only if extract_images=True
1687 | "total_images": 12,
1688 | "image_types": {"jpeg": 8, "png": 4},
1689 | "average_size": "120kb"
1690 | },
1691 | "ocr_assessment": { # Only if estimate_ocr_needs=True
1692 | "needs_ocr": false,
1693 | "confidence": "high",
1694 | "reason": "PDF contains extractable text throughout"
1695 | },
1696 | "processing_time": 1.23 # Seconds
1697 | }
1698 |
1699 | Raises:
1700 | ToolInputError: If the file path is invalid or the file is not a PDF
1701 | ToolError: If analysis fails
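
    Example (illustrative sketch; the path is hypothetical):
        analysis = await analyze_pdf_structure("/path/to/document.pdf")
        # Use the assessment to pick an extraction strategy for extract_text_from_pdf:
        method = "ocr" if analysis["ocr_assessment"]["needs_ocr"] else "direct"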
1702 | """
1703 | start_time = time.time()
1704 |
1705 | # Validate file path
1706 | _validate_file_path(file_path, expected_extension=".pdf")
1707 |
1708 | # Check for required libraries
1709 | pdf_lib = None  # Which PDF backend to use (PyMuPDF preferred, pdfplumber as fallback)
1710 | if HAS_PYMUPDF:
1711 | pdf_lib = "pymupdf"
1712 | elif HAS_PDFPLUMBER:
1713 | pdf_lib = "pdfplumber"
1714 | 
1715 | if pdf_lib is None:
1716 | raise ToolError("PDF analysis requires PyMuPDF or pdfplumber")
1719 |
1720 | try:
1721 | result = {
1722 | "success": False,
1723 | "file_path": file_path,
1724 | "processing_time": 0
1725 | }
1726 |
1727 | if pdf_lib == "pymupdf":
1728 | # Use PyMuPDF for analysis
1729 | with pymupdf.open(file_path) as doc:
1730 | # Basic information
1731 | result["page_count"] = len(doc)
1732 |
1733 | # Extract metadata if requested
1734 | if extract_metadata:
1735 | metadata = doc.metadata
1736 | if metadata:
1737 | result["metadata"] = {
1738 | "title": metadata.get("title", ""),
1739 | "author": metadata.get("author", ""),
1740 | "subject": metadata.get("subject", ""),
1741 | "keywords": metadata.get("keywords", ""),
1742 | "creator": metadata.get("creator", ""),
1743 | "producer": metadata.get("producer", ""),
1744 | "creation_date": metadata.get("creationDate", ""),
1745 | "modification_date": metadata.get("modDate", "")
1746 | }
1747 |
1748 | # Extract outline if requested
1749 | if extract_outline:
1750 | toc = doc.get_toc()
1751 | if toc:
1752 | # Process TOC into a nested structure
1753 | result["outline"] = _process_toc(toc)
1754 |
1755 | # Extract font information if requested
1756 | if extract_fonts:
1757 | fonts: Set[str] = set()
1758 | embedded_fonts: Set[str] = set()
1759 |
1760 | for page_num in range(min(10, len(doc))): # Analyze first 10 pages
1761 | page = doc[page_num]
1762 | page_fonts = page.get_fonts()
1763 |
1764 | for font in page_fonts:
1765 | fonts.add(font[3]) # font[3] is the base font name
1766 | if font[2]: # font[2] is the font type string; a rough proxy, since get_fonts() exposes no explicit embedded flag
1767 | embedded_fonts.add(font[3])
1768 |
1769 | result["font_info"] = {
1770 | "total_fonts": len(fonts),
1771 | "embedded_fonts": len(embedded_fonts),
1772 | "font_names": list(fonts)
1773 | }
1774 |
1775 | # Extract image information if requested
1776 | if extract_images:
1777 | image_count = 0
1778 | image_types: Dict[str, int] = {}
1779 | total_size = 0
1780 |
1781 | for page_num in range(min(5, len(doc))): # Analyze first 5 pages
1782 | page = doc[page_num]
1783 | images = page.get_images(full=True)
1784 |
1785 | for img in images:
1786 | image_count += 1
1787 | xref = img[0]
1788 | img_info = doc.extract_image(xref)
1789 |
1790 | if img_info:
1791 | img_type = img_info["ext"]
1792 | img_size = len(img_info["image"])
1793 |
1794 | image_types[img_type] = image_types.get(img_type, 0) + 1
1795 | total_size += img_size
1796 |
1797 | # Extrapolate total images based on sample
1798 | estimated_total = int(image_count * (len(doc) / max(1, min(5, len(doc)))))
1799 | avg_size = f"{int(total_size / max(1, image_count) / 1024)}kb" if image_count > 0 else "0kb"
1800 |
1801 | result["image_info"] = {
1802 | "total_images": image_count,
1803 | "estimated_total": estimated_total,
1804 | "image_types": image_types,
1805 | "average_size": avg_size
1806 | }
1807 |
1808 | # Estimate OCR needs if requested
1809 | if estimate_ocr_needs:
1810 | text_pages = 0
1811 | total_pages = len(doc)
1812 | sample_size = min(10, total_pages)
1813 |
1814 | for page_num in range(sample_size):
1815 | page = doc[page_num]
1816 | text = page.get_text()
1817 | if text and len(text.strip()) > 50: # Page has meaningful text
1818 | text_pages += 1
1819 |
1820 | text_ratio = text_pages / max(1, sample_size) # Guard against zero-page documents
1821 |
1822 | if text_ratio > 0.9:
1823 | needs_ocr = False
1824 | confidence = "high"
1825 | reason = "PDF contains extractable text throughout"
1826 | elif text_ratio > 0.5:
1827 | needs_ocr = True
1828 | confidence = "medium"
1829 | reason = "PDF has some extractable text but may benefit from OCR for certain pages"
1830 | else:
1831 | needs_ocr = True
1832 | confidence = "high"
1833 | reason = "PDF appears to be scanned or has minimal extractable text"
1834 |
1835 | result["ocr_assessment"] = {
1836 | "needs_ocr": needs_ocr,
1837 | "confidence": confidence,
1838 | "reason": reason,
1839 | "text_coverage_ratio": text_ratio
1840 | }
1841 |
1842 | elif pdf_lib == "pdfplumber":
1843 | # Use pdfplumber for analysis
1844 | with pdfplumber.open(file_path) as pdf:
1845 | # Basic information
1846 | result["page_count"] = len(pdf.pages)
1847 |
1848 | # Extract metadata if requested
1849 | if extract_metadata:
1850 | metadata = pdf.metadata
1851 | if metadata:
1852 | result["metadata"] = {
1853 | "title": metadata.get("Title", ""),
1854 | "author": metadata.get("Author", ""),
1855 | "subject": metadata.get("Subject", ""),
1856 | "keywords": metadata.get("Keywords", ""),
1857 | "creator": metadata.get("Creator", ""),
1858 | "producer": metadata.get("Producer", ""),
1859 | "creation_date": metadata.get("CreationDate", ""),
1860 | "modification_date": metadata.get("ModDate", "")
1861 | }
1862 |
1863 | # Outline not supported in pdfplumber
1864 | if extract_outline:
1865 | result["outline"] = []
1866 |
1867 | # Font and image info not supported in pdfplumber
1868 | if extract_fonts:
1869 | result["font_info"] = {
1870 | "total_fonts": 0,
1871 | "embedded_fonts": 0,
1872 | "font_names": []
1873 | }
1874 |
1875 | if extract_images:
1876 | result["image_info"] = {
1877 | "total_images": 0,
1878 | "image_types": {},
1879 | "average_size": "0kb"
1880 | }
1881 |
1882 | # Estimate OCR needs if requested
1883 | if estimate_ocr_needs:
1884 | text_pages = 0
1885 | total_pages = len(pdf.pages)
1886 | sample_size = min(10, total_pages)
1887 |
1888 | for page_num in range(sample_size):
1889 | page = pdf.pages[page_num]
1890 | text = page.extract_text()
1891 | if text and len(text.strip()) > 50: # Page has meaningful text
1892 | text_pages += 1
1893 |
1894 | text_ratio = text_pages / max(1, sample_size) # Guard against zero-page documents
1895 |
1896 | if text_ratio > 0.9:
1897 | needs_ocr = False
1898 | confidence = "high"
1899 | reason = "PDF contains extractable text throughout"
1900 | elif text_ratio > 0.5:
1901 | needs_ocr = True
1902 | confidence = "medium"
1903 | reason = "PDF has some extractable text but may benefit from OCR for certain pages"
1904 | else:
1905 | needs_ocr = True
1906 | confidence = "high"
1907 | reason = "PDF appears to be scanned or has minimal extractable text"
1908 |
1909 | result["ocr_assessment"] = {
1910 | "needs_ocr": needs_ocr,
1911 | "confidence": confidence,
1912 | "reason": reason,
1913 | "text_coverage_ratio": text_ratio
1914 | }
1915 |
1916 | # Update result
1917 | processing_time = time.time() - start_time
1918 | result["success"] = True
1919 | result["processing_time"] = processing_time
1920 |
1921 | logger.info(f"PDF structure analysis completed in {processing_time:.2f}s")
1922 | return result
1923 |
1924 | except Exception as e:
1925 | logger.error(f"Error in analyze_pdf_structure: {str(e)}")
1926 | logger.error(traceback.format_exc())
1927 | raise ToolError(f"Failed to analyze PDF structure: {str(e)}") from e
1928 |
1929 | @with_tool_metrics
1930 | @with_retry(max_retries=2, retry_delay=1.0)
1931 | @with_error_handling
1932 | async def batch_process_documents(
1933 | folder_path: str,
1934 | file_pattern: str = "*.pdf",
1935 | output_folder: Optional[str] = None,
1936 | extraction_method: str = "hybrid",
1937 | max_pages_per_file: int = 0,
1938 | reformat_as_markdown: bool = True,
1939 | suppress_headers: bool = True,
1940 | max_concurrency: int = 3,
1941 | skip_on_error: bool = True,
1942 | bytes_data: Optional[Dict[str, Union[bytes, str]]] = None
1943 | ) -> Dict[str, Any]:
1944 | """
1945 | Processes multiple document files in a folder with OCR and LLM enhancement.
1946 |
1947 | This tool handles batch processing of documents (PDFs and images) in a folder,
1948 | extracting text, correcting OCR errors, and saving the results to an output folder.
1949 | It can also process documents provided as bytes data.
1950 |
1951 | Args:
1952 | folder_path: Path to the folder containing files to process
1953 | file_pattern: Pattern to match files (default: "*.pdf", can be "*.jpg", "*.png", etc.)
1954 | output_folder: Path to save the output files (default: create 'processed' subfolder)
1955 | extraction_method: Method for PDF text extraction ("direct", "ocr", "hybrid")
1956 | max_pages_per_file: Maximum pages to process per PDF (0 = all pages)
1957 | reformat_as_markdown: Whether to format the output as markdown (default: True)
1958 | suppress_headers: Whether to remove headers and footers (default: True)
1959 | max_concurrency: Maximum number of files to process in parallel (default: 3)
1960 | skip_on_error: Whether to continue processing other files if one fails (default: True)
1961 | bytes_data: Optional dictionary mapping filenames to raw bytes or base64 data URLs, for processing in-memory documents directly
1962 |
1963 | Returns:
1964 | A dictionary containing:
1965 | {
1966 | "success": true,
1967 | "processed_files": [
1968 | {
1969 | "file": "/path/to/document1.pdf",
1970 | "output_file": "/path/to/output/document1.md",
1971 | "pages_processed": 5,
1972 | "extraction_method": "hybrid",
1973 | "processing_time": 12.34,
1974 | "quality_score": 85 # if quality assessment is performed
1975 | },
1976 | {
1977 | "file": "/path/to/document2.pdf",
1978 | "error": "Error message", # if processing failed
1979 | "status": "failed"
1980 | }
1981 | ],
1982 | "total_files": 5,
1983 | "successful_files": 4,
1984 | "failed_files": 1,
1985 | "output_folder": "/path/to/output",
1986 | "total_processing_time": 45.67 # Seconds
1987 | }
1988 |
1989 | Raises:
1990 | ToolInputError: If the folder path is invalid
1991 | ToolError: If batch processing fails
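
    Example (illustrative sketch; paths are hypothetical):
        result = await batch_process_documents(
            folder_path="/data/incoming_pdfs",
            file_pattern="*.pdf",
            extraction_method="hybrid",
            max_concurrency=2,
        )
        print(f"{result['successful_files']}/{result['total_files']} files succeeded")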
1992 | """
1993 | start_time = time.time()
1994 |
1995 | # Validate input if processing files from a folder
1996 | all_files = []
1997 |
1998 | if not bytes_data:
1999 | # Standard file processing from a folder
2000 | if not folder_path or not os.path.exists(folder_path) or not os.path.isdir(folder_path):
2001 | raise ToolInputError(f"Invalid folder path: {folder_path}")
2002 |
2003 | # Set output folder if not provided
2004 | if not output_folder:
2005 | output_folder = os.path.join(folder_path, "processed")
2006 |
2007 | # Create output folder if it doesn't exist
2008 | os.makedirs(output_folder, exist_ok=True)
2009 |
2010 | # Find files matching the pattern
2011 | matching_files: List[Path] = sorted(Path(folder_path).glob(file_pattern))
2012 |
2013 | if not matching_files:
2014 | raise ToolInputError(f"No files found in {folder_path} matching pattern {file_pattern}")
2015 |
2016 | all_files = [(str(f), None) for f in matching_files] # (path, bytes_data)
2017 | else:
2018 | # Processing from bytes data
2019 | if not output_folder:
2020 | # Create a temporary output folder if not specified
2021 | output_folder = tempfile.mkdtemp(prefix="ocr_batch_")
2022 | else:
2023 | os.makedirs(output_folder, exist_ok=True)
2024 |
2025 | # Convert bytes_data to our format
2026 | for filename, data in bytes_data.items():
2027 | if isinstance(data, str) and data.startswith('data:'):
2028 | # Handle base64 data URLs
2029 | try:
2030 | _mime_prefix, b64data = data.split(';base64,', 1)  # The "data:<mime>" prefix is not needed
2031 | file_bytes = base64.b64decode(b64data)
2032 | all_files.append((filename, file_bytes))
2033 | except Exception as e:
2034 | logger.error(f"Error decoding base64 data for {filename}: {str(e)}")
2035 | if not skip_on_error:
2036 | raise ToolError(f"Failed to decode base64 data: {str(e)}") from e
2037 | elif isinstance(data, bytes):
2038 | # Already in bytes format
2039 | all_files.append((filename, data))
2040 | else:
2041 | logger.error(f"Unsupported data format for {filename}")
2042 | if not skip_on_error:
2043 | raise ToolInputError(f"Unsupported data format for {filename}")
2044 |
2045 | if not all_files:
2046 | raise ToolInputError("No files to process")
2047 |
2048 | # Get task type for batch processing
2049 | task_type = _get_task_type_for_ocr(extraction_method)
2050 | logger.info(f"Batch processing documents with task type: {task_type}")
2051 |
2052 | # Initialize result
2053 | result = {
2054 | "success": False,
2055 | "processed_files": [],
2056 | "total_files": len(all_files),
2057 | "successful_files": 0,
2058 | "failed_files": 0,
2059 | "output_folder": output_folder,
2060 | "total_processing_time": 0,
2061 | "task_type": task_type
2062 | }
2063 |
2064 | # Create semaphore for concurrency control
2065 | semaphore = asyncio.Semaphore(max_concurrency)
2066 |
2067 | # Create partially-applied functions for better reuse and readability
2068 | # This allows us to pre-configure the processing functions with common parameters
2069 | extract_pdf_with_config = functools.partial(
2070 | extract_text_from_pdf,
2071 | extraction_method=extraction_method,
2072 | max_pages=max_pages_per_file,
2073 | skip_pages=0,
2074 | reformat_as_markdown=reformat_as_markdown,
2075 | suppress_headers=suppress_headers,
2076 | assess_quality=True
2077 | )
2078 |
2079 | extract_pdf_bytes_with_config = functools.partial(
2080 | extract_text_from_pdf_bytes,
2081 | extraction_method=extraction_method,
2082 | max_pages=max_pages_per_file,
2083 | skip_pages=0,
2084 | reformat_as_markdown=reformat_as_markdown,
2085 | suppress_headers=suppress_headers,
2086 | assess_quality=True
2087 | )
2088 |
2089 | process_image_with_config = functools.partial(
2090 | process_image_ocr,
2091 | reformat_as_markdown=reformat_as_markdown,
2092 | assess_quality=True
2093 | )
2094 |
2095 | # Define worker function for processing each file
2096 | async def process_file(file_info: Tuple[str, Optional[bytes]]) -> Dict[str, Any]:
2097 | file_path, file_bytes = file_info
2098 | async with semaphore:
2099 | logger.info(f"Processing file: {file_path}")
2100 | file_start_time = time.time()
2101 |
2102 | try:
2103 | # Determine file type based on extension
2104 | is_pdf = file_path.lower().endswith('.pdf')
2105 |
2106 | # Process according to file type
2107 | if is_pdf:
2108 | # Extract base name
2109 | base_name = os.path.splitext(os.path.basename(file_path))[0]
2110 |
2111 | # Determine output file extension
2112 | output_extension = '.md' if reformat_as_markdown else '.txt'
2113 |
2114 | # Define output file path
2115 | output_file = os.path.join(output_folder, f"{base_name}{output_extension}")
2116 |
2117 | # Extract text based on whether we have bytes or file path
2118 | if file_bytes is not None:
2119 | # Process PDF from bytes
2120 | extraction_result = await extract_pdf_bytes_with_config(pdf_bytes=file_bytes)
2121 | else:
2122 | # Process PDF from file path
2123 | extraction_result = await extract_pdf_with_config(file_path=file_path)
2124 |
2125 | # Save the enhanced text
2126 | with open(output_file, "w", encoding="utf-8") as f:
2127 | f.write(extraction_result["text"])
2128 |
2129 | # Save the raw text for reference
2130 | raw_output_file = os.path.join(output_folder, f"{base_name}_raw.txt")
2131 | with open(raw_output_file, "w", encoding="utf-8") as f:
2132 | f.write(extraction_result["raw_text"])
2133 |
2134 | # Create file result
2135 | file_processing_time = time.time() - file_start_time
2136 | file_result = {
2137 | "file": file_path,
2138 | "output_file": output_file,
2139 | "raw_output_file": raw_output_file,
2140 | "pages_processed": extraction_result["pages_processed"],
2141 | "extraction_method_used": extraction_result["extraction_method_used"],
2142 | "processing_time": file_processing_time,
2143 | "status": "success"
2144 | }
2145 |
2146 | # Add quality metrics if available
2147 | if "quality_metrics" in extraction_result:
2148 | quality_metrics = extraction_result["quality_metrics"]
2149 | file_result["quality_score"] = quality_metrics.get("score")
2150 |
2151 | logger.info(f"Successfully processed PDF: {file_path}")
2152 |
2153 | else:
2154 | # Handle image file
2155 | base_name = os.path.splitext(os.path.basename(file_path))[0]
2156 | output_extension = '.md' if reformat_as_markdown else '.txt'
2157 | output_file = os.path.join(output_folder, f"{base_name}{output_extension}")
2158 |
2159 | # Process image with OCR based on whether we have bytes or file path
2160 | if file_bytes is not None:
2161 | # Process image from bytes
2162 | ocr_result = await process_image_with_config(image_data=base64.b64encode(file_bytes).decode('utf-8'))
2163 | else:
2164 | # Process image from file path
2165 | ocr_result = await process_image_with_config(image_path=file_path)
2166 |
2167 | # Save the enhanced text
2168 | with open(output_file, "w", encoding="utf-8") as f:
2169 | f.write(ocr_result["text"])
2170 |
2171 | # Save the raw text for reference
2172 | raw_output_file = os.path.join(output_folder, f"{base_name}_raw.txt")
2173 | with open(raw_output_file, "w", encoding="utf-8") as f:
2174 | f.write(ocr_result["raw_text"])
2175 |
2176 | # Create file result
2177 | file_processing_time = time.time() - file_start_time
2178 | file_result = {
2179 | "file": file_path,
2180 | "output_file": output_file,
2181 | "raw_output_file": raw_output_file,
2182 | "table_detected": ocr_result.get("table_detected", False),
2183 | "processing_time": file_processing_time,
2184 | "status": "success"
2185 | }
2186 |
2187 | # Add quality metrics if available
2188 | if "quality_metrics" in ocr_result:
2189 | quality_metrics = ocr_result["quality_metrics"]
2190 | file_result["quality_score"] = quality_metrics.get("score")
2191 |
2192 | logger.info(f"Successfully processed image: {file_path}")
2193 |
2194 | return file_result
2195 | except Exception as e:
2196 | logger.error(f"Error processing {file_path}: {str(e)}")
2197 | return {
2198 | "file": file_path,
2199 | "error": str(e),
2200 | "status": "failed"
2201 | }
2202 |
2203 | try:
2204 | # Process files in parallel
2205 | tasks = [process_file(file_info) for file_info in all_files]
2206 | processed_results = await asyncio.gather(*tasks)
2207 |
2208 | # Update result
2209 | result["processed_files"] = processed_results
2210 | result["successful_files"] = sum(1 for r in processed_results if r.get("status") == "success")
2211 | result["failed_files"] = sum(1 for r in processed_results if r.get("status") == "failed")
2212 | result["success"] = True
2213 |
2214 | # Calculate total processing time
2215 | total_processing_time = time.time() - start_time
2216 | result["total_processing_time"] = total_processing_time
2217 |
2218 | logger.info(f"Batch processing completed: {result['successful_files']} successful, {result['failed_files']} failed")
2219 | return result
2220 |
2221 | except Exception as e:
2222 | logger.error(f"Error in batch processing: {str(e)}")
2223 | logger.error(traceback.format_exc())
2224 | raise ToolError(f"Failed to batch process documents: {str(e)}") from e
2225 |
2226 | # --- Additional helper functions ---
2227 |
2228 | def _process_toc(toc: List) -> List[Dict[str, Any]]:
2229 | """
2230 | Processes a PDF table of contents into a nested structure.
2231 |
2232 | Args:
2233 | toc: Table of contents from PyMuPDF
2234 |
2235 | Returns:
2236 | Nested outline structure
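
    Example (illustrative; PyMuPDF TOC entries are [level, title, page] lists):
        _process_toc([[1, "Chapter 1", 5], [2, "Section 1.1", 6], [1, "Chapter 2", 15]])
        # -> [{"title": "Chapter 1", "page": 5,
        #      "children": [{"title": "Section 1.1", "page": 6, "children": []}]},
        #     {"title": "Chapter 2", "page": 15, "children": []}]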
2237 | """
2238 | if not toc:
2239 | return []
2240 |
2241 | # Convert flat list with indentation levels to nested structure
2242 | result = []
2243 | stack = [(-1, result)] # (level, children_list)
2244 |
2245 | for item in toc:
2246 | level, title, page = item
2247 |
2248 | # Find parent in stack
2249 | while stack[-1][0] >= level:
2250 | stack.pop()
2251 |
2252 | # Create new entry
2253 | entry = {"title": title, "page": page, "children": []}
2254 | stack[-1][1].append(entry)
2255 |
2256 | # Add to stack
2257 | stack.append((level, entry["children"]))
2258 |
2259 | return result
2260 |
2261 | async def _format_tables_in_text(text: str) -> str:
2262 | """
2263 | Detects and formats potential tables in text using markdown.
2264 |
2265 | Args:
2266 | text: Text to process
2267 |
2268 | Returns:
2269 | Text with tables formatted in markdown
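
    Example (illustrative; the input string is hypothetical):
        text = "Intro paragraph.\n\nName | Role\nAlice | Engineer\nBob | Analyst\n"
        formatted = await _format_tables_in_text(text)
        # The pipe-delimited block matches the first pattern and is sent to
        # _enhance_table_formatting; the surrounding prose passes through unchanged.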
2270 | """
2271 | # Simple pattern to detect table-like content
2272 | table_patterns = [
2273 | # Multiple lines with similar column separator patterns
2274 | r'(\n|^)(((\s*\S+\s*\|\s*\S+\s*)+\|?(\s*\n)){2,})',
2275 | # Multiple lines with similar tab/space alignment
2276 | r'(\n|^)((\s*\S+\s+\S+\s+\S+\s+\S+\s*\n){3,})'
2277 | ]
2278 |
2279 | table_sections: List[Tuple[int, int, str]] = []
2280 | for pattern in table_patterns:
2281 | matches = re.finditer(pattern, text, re.MULTILINE)
2282 | for match in matches:
2283 | table_sections.append((match.start(), match.end(), match.group(2)))
2284 |
2285 | # Sort by start position
2286 | table_sections.sort(key=lambda x: x[0])
2287 |
2288 | # No tables found
2289 | if not table_sections:
2290 | return text
2291 |
2292 | # Process each potential table
2293 | result_parts = []
2294 | last_end = 0
2295 |
2296 | for start, end, table_text in table_sections:
2297 | if start < last_end: # Skip matches that overlap an already-processed region (the two patterns can overlap)
2298 | continue
2299 | if start > last_end: # Add the prose that precedes this table
2300 | result_parts.append(text[last_end:start])
2300 |
2301 | # Process table
2302 | try:
2303 | formatted_table = await _enhance_table_formatting(table_text)
2304 | result_parts.append(formatted_table)
2305 | except Exception as e:
2306 | logger.warning(f"Error formatting table: {str(e)}")
2307 | result_parts.append(table_text)
2308 |
2309 | last_end = end
2310 |
2311 | # Add remaining text
2312 | if last_end < len(text):
2313 | result_parts.append(text[last_end:])
2314 |
2315 | return ''.join(result_parts)
2316 |
2317 | async def _enhance_table_formatting(table_text: str) -> str:
2318 | """
2319 | Enhances table formatting using LLM.
2320 |
2321 | Args:
2322 | table_text: Potential table text
2323 |
2324 | Returns:
2325 | Formatted table in markdown
2326 | """
2327 | prompt = f"""Format the following text as a markdown table. The text appears to contain tabular data but may not be properly formatted.
2328 |
2329 | 1. Detect column headers and content
2330 | 2. Create a proper markdown table with headers, separator row, and content rows
2331 | 3. Preserve all information but improve readability
2332 | 4. If the input is not actually tabular data, return it unchanged with a comment indicating it's not a table
2333 |
2334 | Here is the text to format:
2335 |
2336 | ```
2337 | {table_text}
2338 | ```
2339 |
2340 | Provide ONLY the formatted markdown table (or the unchanged text, per rule 4) with no other explanation.
2341 | """
2342 |
2343 | try:
2344 | result = await generate_completion(
2345 | prompt=prompt,
2346 | provider=Provider.ANTHROPIC.value,
2347 | temperature=0.2,
2348 | max_tokens=len(table_text) + 500 # Character count is a generous upper bound on the tokens needed
2349 | )
2350 |
2351 | if not result or not result.get("text"):
2352 | return table_text
2353 |
2354 | formatted_table = result["text"]
2355 |
2356 | # Check that the output really contains a markdown table (pipes plus a separator row such as |---| or | --- |)
2357 | if "|" in formatted_table and re.search(r'\|\s*:?-{3,}', formatted_table):
2358 | return "\n" + formatted_table + "\n"
2359 | else:
2360 | return table_text
2361 | except Exception as e:
2362 | logger.warning(f"Error enhancing table format: {str(e)}")
2363 | return table_text
2364 |
2365 | async def _assess_text_quality(original_text: str, enhanced_text: str) -> Dict[str, Any]:
2366 | """
2367 | Assesses the quality of OCR enhancement using LLM.
2368 |
2369 | Args:
2370 | original_text: Original OCR text
2371 | enhanced_text: LLM-enhanced text
2372 |
2373 | Returns:
2374 | Dictionary with quality assessment
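
    Example (illustrative; keys mirror the parsed SCORE/EXPLANATION/EXAMPLES sections):
        metrics = await _assess_text_quality(raw_ocr_text, enhanced_text)
        # -> {"score": 85, "explanation": "...", "examples": ["...", "..."]}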
2375 | """
2376 | # Truncate texts to reasonable lengths for assessment
2377 | max_sample = 5000
2378 | original_sample = original_text[:max_sample]
2379 | enhanced_sample = enhanced_text[:max_sample]
2380 |
2381 | prompt = f"""Assess the quality improvement between the original OCR text and the enhanced version. Consider:
2382 |
2383 | 1. Error correction (typos, OCR artifacts, broken words)
2384 | 2. Formatting improvements (paragraph structure, headings, lists)
2385 | 3. Readability enhancement
2386 | 4. Preservation of original content and meaning
2387 | 5. Removal of unnecessary elements (headers, footers, artifacts)
2388 |
2389 | Original OCR text:
2390 | ```
2391 | {original_sample}
2392 | ```
2393 |
2394 | Enhanced text:
2395 | ```
2396 | {enhanced_sample}
2397 | ```
2398 |
2399 | Provide:
2400 | 1. A quality score from 0-100 where 100 is perfect enhancement
2401 | 2. A brief explanation of improvements and any issues
2402 | 3. Specific examples of corrections (max 3 examples)
2403 |
2404 | Format your response as follows:
2405 | SCORE: [score]
2406 | EXPLANATION: [explanation]
2407 | EXAMPLES:
2408 | - [example 1]
2409 | - [example 2]
2410 | - [example 3]
2411 | """
2412 |
2413 | try:
2414 | result = await generate_completion(
2415 | prompt=prompt,
2416 | provider=Provider.ANTHROPIC.value,
2417 | temperature=0.3,
2418 | max_tokens=1000
2419 | )
2420 |
2421 | if not result or not result.get("text"):
2422 | return {"score": None, "explanation": "Failed to assess quality"}
2423 |
2424 | assessment_text = result["text"]
2425 |
2426 | # Parse the assessment
2427 | score_match = re.search(r'SCORE:\s*(\d+)', assessment_text)
2428 | explanation_match = re.search(r'EXPLANATION:\s*(.*?)(?:\n\s*EXAMPLES|\Z)', assessment_text, re.DOTALL)
2429 | examples_match = re.search(r'EXAMPLES:\s*(.*?)(?:\Z)', assessment_text, re.DOTALL)
2430 |
2431 | score = int(score_match.group(1)) if score_match else None
2432 | explanation = explanation_match.group(1).strip() if explanation_match else "No explanation provided"
2433 |
2434 | examples = []
2435 | if examples_match:
2436 | examples_text = examples_match.group(1)
2437 | examples = [ex.strip().lstrip('- ') for ex in examples_text.split('\n') if ex.strip()]
2438 |
2439 | return {
2440 | "score": score,
2441 | "explanation": explanation,
2442 | "examples": examples
2443 | }
2444 | except Exception as e:
2445 | logger.warning(f"Error assessing text quality: {str(e)}")
2446 | return {"score": None, "explanation": f"Failed to assess quality: {str(e)}"}
```