This is page 31 of 45. Use http://codebase.md/dicklesworthstone/llm_gateway_mcp_server?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .cursorignore
├── .env.example
├── .envrc
├── .gitignore
├── additional_features.md
├── check_api_keys.py
├── completion_support.py
├── comprehensive_test.py
├── docker-compose.yml
├── Dockerfile
├── empirically_measured_model_speeds.json
├── error_handling.py
├── example_structured_tool.py
├── examples
│ ├── __init__.py
│ ├── advanced_agent_flows_using_unified_memory_system_demo.py
│ ├── advanced_extraction_demo.py
│ ├── advanced_unified_memory_system_demo.py
│ ├── advanced_vector_search_demo.py
│ ├── analytics_reporting_demo.py
│ ├── audio_transcription_demo.py
│ ├── basic_completion_demo.py
│ ├── cache_demo.py
│ ├── claude_integration_demo.py
│ ├── compare_synthesize_demo.py
│ ├── cost_optimization.py
│ ├── data
│ │ ├── sample_event.txt
│ │ ├── Steve_Jobs_Introducing_The_iPhone_compressed.md
│ │ └── Steve_Jobs_Introducing_The_iPhone_compressed.mp3
│ ├── docstring_refiner_demo.py
│ ├── document_conversion_and_processing_demo.py
│ ├── entity_relation_graph_demo.py
│ ├── filesystem_operations_demo.py
│ ├── grok_integration_demo.py
│ ├── local_text_tools_demo.py
│ ├── marqo_fused_search_demo.py
│ ├── measure_model_speeds.py
│ ├── meta_api_demo.py
│ ├── multi_provider_demo.py
│ ├── ollama_integration_demo.py
│ ├── prompt_templates_demo.py
│ ├── python_sandbox_demo.py
│ ├── rag_example.py
│ ├── research_workflow_demo.py
│ ├── sample
│ │ ├── article.txt
│ │ ├── backprop_paper.pdf
│ │ ├── buffett.pdf
│ │ ├── contract_link.txt
│ │ ├── legal_contract.txt
│ │ ├── medical_case.txt
│ │ ├── northwind.db
│ │ ├── research_paper.txt
│ │ ├── sample_data.json
│ │ └── text_classification_samples
│ │ ├── email_classification.txt
│ │ ├── news_samples.txt
│ │ ├── product_reviews.txt
│ │ └── support_tickets.txt
│ ├── sample_docs
│ │ └── downloaded
│ │ └── attention_is_all_you_need.pdf
│ ├── sentiment_analysis_demo.py
│ ├── simple_completion_demo.py
│ ├── single_shot_synthesis_demo.py
│ ├── smart_browser_demo.py
│ ├── sql_database_demo.py
│ ├── sse_client_demo.py
│ ├── test_code_extraction.py
│ ├── test_content_detection.py
│ ├── test_ollama.py
│ ├── text_classification_demo.py
│ ├── text_redline_demo.py
│ ├── tool_composition_examples.py
│ ├── tournament_code_demo.py
│ ├── tournament_text_demo.py
│ ├── unified_memory_system_demo.py
│ ├── vector_search_demo.py
│ ├── web_automation_instruction_packs.py
│ └── workflow_delegation_demo.py
├── LICENSE
├── list_models.py
├── marqo_index_config.json.example
├── mcp_protocol_schema_2025-03-25_version.json
├── mcp_python_lib_docs.md
├── mcp_tool_context_estimator.py
├── model_preferences.py
├── pyproject.toml
├── quick_test.py
├── README.md
├── resource_annotations.py
├── run_all_demo_scripts_and_check_for_errors.py
├── storage
│ └── smart_browser_internal
│ ├── locator_cache.db
│ ├── readability.js
│ └── storage_state.enc
├── test_client.py
├── test_connection.py
├── TEST_README.md
├── test_sse_client.py
├── test_stdio_client.py
├── tests
│ ├── __init__.py
│ ├── conftest.py
│ ├── integration
│ │ ├── __init__.py
│ │ └── test_server.py
│ ├── manual
│ │ ├── test_extraction_advanced.py
│ │ └── test_extraction.py
│ └── unit
│ ├── __init__.py
│ ├── test_cache.py
│ ├── test_providers.py
│ └── test_tools.py
├── TODO.md
├── tool_annotations.py
├── tools_list.json
├── ultimate_mcp_banner.webp
├── ultimate_mcp_logo.webp
├── ultimate_mcp_server
│ ├── __init__.py
│ ├── __main__.py
│ ├── cli
│ │ ├── __init__.py
│ │ ├── __main__.py
│ │ ├── commands.py
│ │ ├── helpers.py
│ │ └── typer_cli.py
│ ├── clients
│ │ ├── __init__.py
│ │ ├── completion_client.py
│ │ └── rag_client.py
│ ├── config
│ │ └── examples
│ │ └── filesystem_config.yaml
│ ├── config.py
│ ├── constants.py
│ ├── core
│ │ ├── __init__.py
│ │ ├── evaluation
│ │ │ ├── base.py
│ │ │ └── evaluators.py
│ │ ├── providers
│ │ │ ├── __init__.py
│ │ │ ├── anthropic.py
│ │ │ ├── base.py
│ │ │ ├── deepseek.py
│ │ │ ├── gemini.py
│ │ │ ├── grok.py
│ │ │ ├── ollama.py
│ │ │ ├── openai.py
│ │ │ └── openrouter.py
│ │ ├── server.py
│ │ ├── state_store.py
│ │ ├── tournaments
│ │ │ ├── manager.py
│ │ │ ├── tasks.py
│ │ │ └── utils.py
│ │ └── ums_api
│ │ ├── __init__.py
│ │ ├── ums_database.py
│ │ ├── ums_endpoints.py
│ │ ├── ums_models.py
│ │ └── ums_services.py
│ ├── exceptions.py
│ ├── graceful_shutdown.py
│ ├── services
│ │ ├── __init__.py
│ │ ├── analytics
│ │ │ ├── __init__.py
│ │ │ ├── metrics.py
│ │ │ └── reporting.py
│ │ ├── cache
│ │ │ ├── __init__.py
│ │ │ ├── cache_service.py
│ │ │ ├── persistence.py
│ │ │ ├── strategies.py
│ │ │ └── utils.py
│ │ ├── cache.py
│ │ ├── document.py
│ │ ├── knowledge_base
│ │ │ ├── __init__.py
│ │ │ ├── feedback.py
│ │ │ ├── manager.py
│ │ │ ├── rag_engine.py
│ │ │ ├── retriever.py
│ │ │ └── utils.py
│ │ ├── prompts
│ │ │ ├── __init__.py
│ │ │ ├── repository.py
│ │ │ └── templates.py
│ │ ├── prompts.py
│ │ └── vector
│ │ ├── __init__.py
│ │ ├── embeddings.py
│ │ └── vector_service.py
│ ├── tool_token_counter.py
│ ├── tools
│ │ ├── __init__.py
│ │ ├── audio_transcription.py
│ │ ├── base.py
│ │ ├── completion.py
│ │ ├── docstring_refiner.py
│ │ ├── document_conversion_and_processing.py
│ │ ├── enhanced-ums-lookbook.html
│ │ ├── entity_relation_graph.py
│ │ ├── excel_spreadsheet_automation.py
│ │ ├── extraction.py
│ │ ├── filesystem.py
│ │ ├── html_to_markdown.py
│ │ ├── local_text_tools.py
│ │ ├── marqo_fused_search.py
│ │ ├── meta_api_tool.py
│ │ ├── ocr_tools.py
│ │ ├── optimization.py
│ │ ├── provider.py
│ │ ├── pyodide_boot_template.html
│ │ ├── python_sandbox.py
│ │ ├── rag.py
│ │ ├── redline-compiled.css
│ │ ├── sentiment_analysis.py
│ │ ├── single_shot_synthesis.py
│ │ ├── smart_browser.py
│ │ ├── sql_databases.py
│ │ ├── text_classification.py
│ │ ├── text_redline_tools.py
│ │ ├── tournament.py
│ │ ├── ums_explorer.html
│ │ └── unified_memory_system.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── async_utils.py
│ │ ├── display.py
│ │ ├── logging
│ │ │ ├── __init__.py
│ │ │ ├── console.py
│ │ │ ├── emojis.py
│ │ │ ├── formatter.py
│ │ │ ├── logger.py
│ │ │ ├── panels.py
│ │ │ ├── progress.py
│ │ │ └── themes.py
│ │ ├── parse_yaml.py
│ │ ├── parsing.py
│ │ ├── security.py
│ │ └── text.py
│ └── working_memory_api.py
├── unified_memory_system_technical_analysis.md
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/ultimate_mcp_server/tools/audio_transcription.py:
--------------------------------------------------------------------------------
```python
1 | """Advanced audio transcription and enhancement tools for Ultimate MCP Server.
2 |
3 | This module provides tools for high-quality audio transcription, pre-processing,
4 | and intelligent transcript enhancement with advanced features like speaker
5 | diarization, custom vocabulary support, and semantic structuring.
6 | """
7 | import asyncio
8 | import concurrent.futures
9 | import datetime
10 | import json
11 | import os
12 | import re
13 | import shutil
14 | import subprocess
15 | import tempfile
16 | import time
17 | from dataclasses import dataclass
18 | from enum import Enum
19 | from pathlib import Path
20 | from typing import Any, Dict, List, Optional
21 |
22 | import aiofiles
23 | import httpx
24 | from docx import Document
25 | from pydantic import BaseModel, Field
26 | from pydantic.functional_validators import field_validator
27 |
28 | from ultimate_mcp_server.constants import Provider, TaskType
29 | from ultimate_mcp_server.core.providers.base import get_provider
30 | from ultimate_mcp_server.exceptions import (
31 | ProviderError,
32 | ResourceError,
33 | ToolError,
34 | ToolInputError,
35 | )
36 | from ultimate_mcp_server.services.cache import with_cache
37 | from ultimate_mcp_server.tools.base import with_error_handling, with_retry, with_tool_metrics
38 | from ultimate_mcp_server.tools.completion import chat_completion, generate_completion
39 | from ultimate_mcp_server.utils import get_logger
40 | from ultimate_mcp_server.utils.text import count_tokens
41 |
42 | logger = get_logger("ultimate_mcp_server.tools.audio")
43 |
44 | # --- Constants and Enums ---
45 |
46 | class AudioEnhancementProfile(str, Enum):
47 | """Predefined audio enhancement profiles for different recording types."""
48 | CONFERENCE_CALL = "conference_call" # Optimized for online meetings
49 | INTERVIEW = "interview" # Optimized for interview recordings
50 | LECTURE = "lecture" # Optimized for lectures/presentations
51 | NOISY = "noisy" # Heavy noise reduction for noisy environments
52 | PHONE_CALL = "phone_call" # Optimized for telephone audio
53 | VOICEMAIL = "voicemail" # Optimized for voicemail recordings
54 | CUSTOM = "custom" # User-defined settings
55 |
56 | # Minimum expected model sizes in bytes (used as a lower bound, with some tolerance, when validating model files)
57 | WHISPER_MODEL_SIZES = {
58 | "large-v3": 2900000000, # ~2.9GB
59 | "large-v3-turbo": 1500000000, # ~1.5GB
60 | }
61 |
62 | class TranscriptionQuality(str, Enum):
63 | """Quality settings for transcription, balancing speed vs accuracy."""
64 | DRAFT = "draft" # Fastest, less accurate
65 | STANDARD = "standard" # Balanced speed/accuracy
66 | ENHANCED = "enhanced" # More accurate, slower
67 | MAXIMUM = "maximum" # Most accurate, slowest
68 |
69 |
70 | class EnhancementStyle(str, Enum):
71 | """Transcript enhancement styles for different use cases."""
72 | RAW = "raw" # No enhancement, just cleaned
73 | READABLE = "readable" # Basic readability improvements
74 | POLISHED = "polished" # Well-formatted with proper punctuation
75 | VERBATIM = "verbatim" # Preserve all speech patterns, hesitations
76 | STRUCTURED = "structured" # Add semantic structure (paragraphs, sections)
77 |
78 |
79 | class OutputFormat(str, Enum):
80 | """Available output formats for transcripts."""
81 | JSON = "json" # Full JSON with all metadata
82 | TEXT = "text" # Plain text
83 | SRT = "srt" # SubRip subtitle format
84 | VTT = "vtt" # WebVTT subtitle format
85 | DOCX = "docx" # Microsoft Word format
86 | MARKDOWN = "markdown" # Markdown with formatting
87 |
88 |
89 | # --- Schema Validation Models ---
90 |
91 | class AudioEnhancementParams(BaseModel):
92 | """Parameters for audio enhancement."""
93 | profile: AudioEnhancementProfile = Field(
94 | default=AudioEnhancementProfile.CONFERENCE_CALL,
95 | description="Predefined audio enhancement profile"
96 | )
97 | volume: float = Field(
98 | default=1.5,
99 | ge=0.1,
100 | le=5.0,
101 | description="Volume adjustment factor"
102 | )
103 | noise_reduction: int = Field(
104 | default=10,
105 | ge=0,
106 | le=30,
107 | description="Noise reduction strength (0-30)"
108 | )
109 | highpass: int = Field(
110 | default=200,
111 | ge=50,
112 | le=500,
113 | description="Highpass filter frequency in Hz"
114 | )
115 | lowpass: int = Field(
116 | default=3000,
117 | ge=1000,
118 | le=20000,
119 | description="Lowpass filter frequency in Hz"
120 | )
121 | normalize: bool = Field(
122 | default=True,
123 | description="Apply dynamic audio normalization"
124 | )
125 | compression: bool = Field(
126 | default=True,
127 | description="Apply dynamic range compression"
128 | )
129 | dereverberation: bool = Field(
130 | default=False,
131 | description="Apply dereverberation filter"
132 | )
133 | custom_filters: Optional[str] = Field(
134 | default=None,
135 | description="Additional custom FFmpeg filters"
136 | )
137 | output_channels: int = Field(
138 | default=2,
139 | ge=1,
140 | le=2,
141 | description="Number of output channels (1=mono, 2=stereo)"
142 | )
143 | output_sample_rate: int = Field(
144 | default=16000,
145 | ge=8000,
146 | le=48000,
147 | description="Output sample rate in Hz"
148 | )
149 |
150 | @field_validator('custom_filters')
151 | def validate_custom_filters(cls, v):
152 | """Validate that custom filters don't contain dangerous commands."""
153 | if v:
154 | # Check for shell escape attempts
155 | dangerous_patterns = [';', '&&', '||', '`', '$', '\\', '>', '<', '|', '*', '?', '~', '#']
156 | for pattern in dangerous_patterns:
157 | if pattern in v:
158 | raise ValueError(f"Custom filter contains disallowed character: {pattern}")
159 | return v
160 |
161 |
162 | class WhisperParams(BaseModel):
163 | """Parameters for Whisper transcription."""
164 | model: str = Field(
165 | default="large-v3-turbo",
166 | description="Whisper model name"
167 | )
168 | language: Optional[str] = Field(
169 | default=None,
170 | description="Language code (auto-detect if None)"
171 | )
172 | quality: TranscriptionQuality = Field(
173 | default=TranscriptionQuality.STANDARD,
174 | description="Transcription quality level"
175 | )
176 | beam_size: int = Field(
177 | default=5,
178 | ge=1,
179 | le=10,
180 | description="Beam size for beam search"
181 | )
182 | processors: int = Field(
183 | default=2,
184 | ge=1,
185 | le=8,
186 | description="Number of processors to use"
187 | )
188 | word_timestamps: bool = Field(
189 | default=True,
190 | description="Generate timestamps for each word"
191 | )
192 | translate: bool = Field(
193 | default=False,
194 | description="Translate non-English to English"
195 | )
196 | diarize: bool = Field(
197 | default=False,
198 | description="Attempt speaker diarization"
199 | )
200 | highlight_words: bool = Field(
201 | default=False,
202 | description="Highlight words with lower confidence"
203 | )
204 | max_context: int = Field(
205 | default=-1,
206 | ge=-1,
207 | description="Maximum number of text tokens to consider from previous history"
208 | )
209 | custom_vocab: Optional[List[str]] = Field(
210 | default=None,
211 | description="Custom vocabulary terms to improve recognition"
212 | )
213 |
214 |
215 | class TranscriptEnhancementParams(BaseModel):
216 | """Parameters for transcript enhancement."""
217 | style: EnhancementStyle = Field(
218 | default=EnhancementStyle.READABLE,
219 | description="Enhancement style"
220 | )
221 | provider: str = Field(
222 | default=Provider.ANTHROPIC.value,
223 | description="LLM provider for enhancement"
224 | )
225 | model: Optional[str] = Field(
226 | default=None,
227 | description="Specific model to use (provider default if None)"
228 | )
229 | identify_speakers: bool = Field(
230 | default=False,
231 | description="Attempt to identify and label speakers"
232 | )
233 | add_paragraphs: bool = Field(
234 | default=True,
235 | description="Add paragraph breaks at natural points"
236 | )
237 | fix_spelling: bool = Field(
238 | default=True,
239 | description="Fix spelling errors"
240 | )
241 | fix_grammar: bool = Field(
242 | default=True,
243 | description="Fix basic grammatical errors"
244 | )
245 | sections: bool = Field(
246 | default=False,
247 | description="Add section headings based on topic changes"
248 | )
249 | max_chunk_size: int = Field(
250 | default=6500,
251 | ge=1000,
252 | le=100000,
253 | description="Maximum chunk size in characters"
254 | )
255 | format_numbers: bool = Field(
256 | default=True,
257 | description="Format numbers consistently (e.g., '25' instead of 'twenty-five')"
258 | )
259 | custom_instructions: Optional[str] = Field(
260 | default=None,
261 | description="Additional custom instructions for enhancement"
262 | )
263 |
264 |
265 | class TranscriptionOptions(BaseModel):
266 | """Complete options for audio transcription."""
267 | enhance_audio: bool = Field(
268 | default=True,
269 | description="Whether to preprocess audio with FFmpeg"
270 | )
271 | enhance_transcript: bool = Field(
272 | default=True,
273 | description="Whether to enhance the transcript with LLM"
274 | )
275 | parallel_processing: bool = Field(
276 | default=True,
277 | description="Process chunks in parallel when possible"
278 | )
279 | max_workers: int = Field(
280 | default=4,
281 | ge=1,
282 | description="Maximum number of parallel workers"
283 | )
284 | output_formats: List[OutputFormat] = Field(
285 | default=[OutputFormat.JSON, OutputFormat.TEXT],
286 | description="Output formats to generate"
287 | )
288 | save_enhanced_audio: bool = Field(
289 | default=False,
290 | description="Save the enhanced audio file"
291 | )
292 | keep_artifacts: bool = Field(
293 | default=False,
294 | description="Keep temporary files and artifacts"
295 | )
296 | audio_params: AudioEnhancementParams = Field(
297 | default_factory=AudioEnhancementParams,
298 | description="Audio enhancement parameters"
299 | )
300 | whisper_params: WhisperParams = Field(
301 | default_factory=WhisperParams,
302 | description="Whisper transcription parameters"
303 | )
304 | enhancement_params: TranscriptEnhancementParams = Field(
305 | default_factory=TranscriptEnhancementParams,
306 | description="Transcript enhancement parameters"
307 | )
308 | language_detection: bool = Field(
309 | default=False, # Disable language detection by default
310 | description="Automatically detect language before transcription"
311 | )
312 |
313 |
314 | class Segment(BaseModel):
315 | """A segment of transcript with timing information."""
316 | start: float = Field(..., description="Start time in seconds")
317 | end: float = Field(..., description="End time in seconds")
318 | text: str = Field(..., description="Segment text")
319 | speaker: Optional[str] = Field(None, description="Speaker identifier")
320 | words: Optional[List[Dict[str, Any]]] = Field(None, description="Word-level data")
321 | confidence: Optional[float] = Field(None, description="Confidence score")
322 |
323 |
324 | class AudioInfo(BaseModel):
325 | """Audio file information."""
326 | duration: float = Field(..., description="Duration in seconds")
327 | channels: int = Field(..., description="Number of audio channels")
328 | sample_rate: int = Field(..., description="Sample rate in Hz")
329 | format: str = Field(..., description="Audio format")
330 | codec: Optional[str] = Field(None, description="Audio codec")
331 | bit_depth: Optional[int] = Field(None, description="Bit depth")
332 | bitrate: Optional[int] = Field(None, description="Bitrate in bits/second")
333 | size_bytes: Optional[int] = Field(None, description="File size in bytes")
334 |
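# Illustrative sketch of how callers are expected to supply options: a plain dict using the
# field names defined above. Nested dicts map onto the nested models, and string values are
# coerced into the corresponding str-based enums by Pydantic. Values below are placeholders.
#
#   options = {
#       "enhance_audio": True,
#       "output_formats": ["json", "srt"],
#       "audio_params": {"profile": "interview", "volume": 1.3},
#       "whisper_params": {"model": "large-v3-turbo", "quality": "enhanced"},
#       "enhancement_params": {"style": "structured", "identify_speakers": True},
#   }
#   parsed = TranscriptionOptions(**options)  # or use parse_options() below to also apply presets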
335 |
336 | # --- Data Classes ---
337 |
338 | @dataclass
339 | class ProcessingContext:
340 | """Context for the transcription process."""
341 | file_path: str
342 | temp_dir: str
343 | original_filename: str
344 | base_filename: str
345 | options: TranscriptionOptions
346 | enhanced_audio_path: Optional[str] = None
347 | processing_times: Dict[str, float] = None
348 | language_code: Optional[str] = None
349 |
350 | def __post_init__(self):
351 | if self.processing_times is None:
352 | self.processing_times = {}
353 |
354 |
355 | # --- Tool Functions ---
356 |
357 | @with_cache(ttl=24 * 60 * 60) # Cache results for 24 hours
358 | @with_tool_metrics
359 | @with_retry(max_retries=1, retry_delay=1.0)
360 | @with_error_handling
361 | async def transcribe_audio(
362 | file_path: str,
363 | options: Optional[Dict[str, Any]] = None,
364 | ) -> Dict[str, Any]:
365 | """Transcribes an audio file to text with advanced preprocessing and enhancement.
366 |
367 | This tool performs a multi-stage process:
368 | 1. Analyzes the audio file to determine optimal processing parameters
369 | 2. Enhances audio quality with adaptive filtering and preprocessing
370 | 3. Performs high-quality transcription with customizable settings
371 | 4. Intelligently enhances and structures the transcript for readability
372 | 5. Optionally identifies speakers and adds semantic structure
373 |
374 | Args:
375 | file_path: Path to the input audio file (.mp3, .m4a, .wav, etc.)
376 | options: Optional dictionary with transcription options including:
377 | - enhance_audio: Whether to preprocess audio (default True)
378 | - enhance_transcript: Whether to enhance transcript (default True)
379 | - parallel_processing: Process chunks in parallel (default True)
380 | - output_formats: List of output formats (default ["json", "text"])
381 | - audio_params: Audio enhancement parameters
382 | - whisper_params: Whisper transcription parameters
383 | - enhancement_params: Transcript enhancement parameters
384 |
385 | Returns:
386 | A dictionary containing:
387 | {
388 | "raw_transcript": "Original unmodified transcript from Whisper",
389 | "enhanced_transcript": "LLM-enhanced transcript with improved formatting",
390 | "segments": [
391 | {
392 | "start": 0.0,
393 | "end": 10.5,
394 | "text": "Segment text content",
395 | "speaker": "Speaker 1", # If speaker diarization is enabled
396 | "words": [...] # Word-level data if available
397 | },
398 | ...
399 | ],
400 | "metadata": {
401 | "language": "en",
402 | "duration": 120.5,
403 | "title": "Automatically detected title",
404 | "topics": ["Topic 1", "Topic 2"] # If topic extraction is enabled
405 | },
406 | "audio_info": {
407 | "duration": 120.5,
408 | "channels": 2,
409 | "sample_rate": 44100,
410 | "format": "wav",
411 | "codec": "pcm_s16le",
412 | "bit_depth": 16,
413 | "bitrate": 1411000,
414 | "size_bytes": 31000000
415 | },
416 | "processing_time": {
417 | "audio_analysis": 0.5,
418 | "audio_enhancement": 5.2,
419 | "language_detection": 1.1,
420 | "transcription": 45.3,
421 | "transcript_enhancement": 10.2,
422 | "total": 62.3
423 | },
424 | "artifacts": {
425 | "enhanced_audio": "/path/to/enhanced.wav", # If save_enhanced_audio is True
426 | "output_files": {
427 | "json": "/path/to/transcript.json",
428 | "text": "/path/to/transcript.txt",
429 | "srt": "/path/to/transcript.srt"
430 | }
431 | },
432 | "tokens": {
433 | "input": 5000,
434 | "output": 3200,
435 | "total": 8200
436 | },
437 | "cost": 0.00185,
438 | "success": true
439 | }
440 |
441 | Raises:
442 | ToolInputError: If the file path is invalid or unsupported
443 | ToolError: If transcription or enhancement fails
444 | ResourceError: If required dependencies are not available
445 | """
446 | # Start timing total processing
447 | start_time = time.time()
448 |
449 | # --- Input Validation ---
450 | try:
451 | if not file_path or not isinstance(file_path, str):
452 | raise ToolInputError("File path must be a non-empty string.")
453 |
454 | file_path = os.path.abspath(os.path.expanduser(file_path))
455 | if not os.path.exists(file_path):
456 | raise ToolInputError(f"File not found: {file_path}")
457 |
458 | if not os.access(file_path, os.R_OK):
459 | raise ToolInputError(f"File not readable: {file_path}")
460 |
461 | # Validate file is an audio file
462 | _, ext = os.path.splitext(file_path)
463 | if ext.lower() not in ['.mp3', '.wav', '.m4a', '.mp4', '.flac', '.ogg', '.aac', '.wma', '.opus']:
464 | raise ToolInputError(f"Unsupported file format: {ext}. Please provide an audio file.")
465 |
466 | # Parse and validate options
467 | parsed_options = parse_options(options or {})
468 |
469 | except Exception as e:
470 | if isinstance(e, ToolInputError):
471 | raise
472 | raise ToolInputError(f"Failed to validate input: {str(e)}") from e
473 |
474 | # --- Initialize Processing Context ---
475 | try:
476 | temp_dir = tempfile.mkdtemp(prefix="llm_audio_")
477 | original_filename = os.path.basename(file_path)
478 | base_filename = os.path.splitext(original_filename)[0]
479 |
480 | context = ProcessingContext(
481 | file_path=file_path,
482 | temp_dir=temp_dir,
483 | original_filename=original_filename,
484 | base_filename=base_filename,
485 | options=parsed_options,
486 | )
487 |
488 | logger.info(
489 | f"Starting audio transcription process for {original_filename}",
490 | emoji_key="audio",
491 | temp_dir=temp_dir
492 | )
493 | except Exception as e:
494 | raise ToolError(f"Failed to initialize processing context: {str(e)}") from e
495 |
496 | try:
497 | # --- Check Dependencies ---
498 | await check_dependencies(context)
499 |
500 | # --- Process Audio ---
501 | result = await process_audio_file(context)
502 |
503 | # --- Calculate Total Time ---
504 | total_time = time.time() - start_time
505 | context.processing_times["total"] = total_time
506 | result["processing_time"] = context.processing_times
507 |
508 | logger.success(
509 | f"Audio transcription completed in {total_time:.2f}s",
510 | emoji_key="success",
511 | file=context.original_filename,
512 | duration=result.get("audio_info", {}).get("duration", 0)
513 | )
514 |
515 | return result
516 |
517 | except Exception as e:
518 | logger.error(
519 | f"Audio transcription failed: {str(e)}",
520 | emoji_key="error",
521 | exc_info=True,
522 | file=context.original_filename
523 | )
524 | # Clean up temporary directory unless keep_artifacts is True
525 | if context.options.keep_artifacts:
526 | logger.info(f"Keeping artifacts in {temp_dir}")
527 | else:
528 | try:
529 | shutil.rmtree(temp_dir)
530 | except Exception as cleanup_err:
531 | logger.warning(f"Failed to clean up temporary directory: {cleanup_err}")
532 |
533 | if isinstance(e, (ToolError, ToolInputError, ResourceError)):
534 | raise
535 |
536 | # Return a structured error response instead of just raising an exception
537 | return {
538 | "raw_transcript": "",
539 | "enhanced_transcript": "",
540 | "segments": [],
541 | "metadata": {},
542 | "audio_info": {},
543 | "processing_time": context.processing_times if hasattr(context, "processing_times") else {},
544 | "success": False,
545 | "error": f"Audio transcription failed: {str(e)}"
546 | }
547 |
548 | finally:
549 | # Clean up temporary directory unless keep_artifacts is True
550 |         if not context.options.keep_artifacts and os.path.exists(temp_dir):
551 | try:
552 | shutil.rmtree(temp_dir)
553 | except Exception as cleanup_err:
554 | logger.warning(f"Failed to clean up temporary directory: {cleanup_err}")
555 |
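# Example call (illustrative sketch; the path and option values are placeholders):
#
#   result = await transcribe_audio(
#       "/path/to/meeting.m4a",
#       options={
#           "output_formats": ["json", "text", "srt"],
#           "whisper_params": {"model": "large-v3-turbo", "quality": "enhanced"},
#           "enhancement_params": {"style": "readable", "identify_speakers": True},
#       },
#   )
#   if result["success"]:
#       print(result["enhanced_transcript"])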
556 |
557 | # --- Main Processing Functions ---
558 |
559 | async def process_audio_file(context: ProcessingContext) -> Dict[str, Any]:
560 | """Process an audio file through the complete transcription pipeline."""
561 | # Get detailed audio information
562 | audio_analysis_start = time.time()
563 | audio_info = await get_detailed_audio_info(context.file_path)
564 | context.processing_times["audio_analysis"] = time.time() - audio_analysis_start
565 |
566 | logger.info(
567 | f"Audio analysis: {audio_info.get('duration', 0):.1f}s duration, "
568 | f"{audio_info.get('sample_rate', 0)} Hz, "
569 | f"{audio_info.get('channels', 0)} channels",
570 | emoji_key="audio"
571 | )
572 |
573 | # Update parameters based on audio analysis if needed
574 | _update_parameters_from_audio_info(context, audio_info)
575 |
576 | # --- Audio Enhancement ---
577 | enhanced_audio_path = context.file_path
578 | if context.options.enhance_audio:
579 | audio_enhance_start = time.time()
580 | logger.info("Enhancing audio quality", emoji_key="audio", profile=context.options.audio_params.profile.value)
581 |
582 | enhanced_audio_path = await enhance_audio(context, audio_info)
583 | if not enhanced_audio_path:
584 | logger.warning("Audio enhancement failed, falling back to original file", emoji_key="warning")
585 | enhanced_audio_path = context.file_path
586 |
587 | context.processing_times["audio_enhancement"] = time.time() - audio_enhance_start
588 | else:
589 | context.processing_times["audio_enhancement"] = 0
590 |
591 | context.enhanced_audio_path = enhanced_audio_path
592 |
593 | if not os.path.exists(enhanced_audio_path):
594 | logger.warning(f"Enhanced audio path does not exist: {enhanced_audio_path}", emoji_key="warning")
595 | return {
596 | "raw_transcript": "",
597 | "enhanced_transcript": "",
598 | "segments": [],
599 | "metadata": {},
600 | "audio_info": audio_info,
601 | "processing_time": context.processing_times,
602 | "success": False,
603 | "error": "Enhanced audio file not found"
604 | }
605 |
606 | # --- Skip Language Detection ---
607 | # Language detection is disabled - always use Whisper's built-in detection
608 | context.processing_times["language_detection"] = 0
609 | if context.options.whisper_params.language:
610 | logger.info(f"Using specified language: {context.options.whisper_params.language}", emoji_key="language")
611 | else:
612 | logger.info("Using Whisper's built-in language detection", emoji_key="language")
613 |
614 | # --- Transcribe Audio ---
615 | transcribe_start = time.time()
616 | model = context.options.whisper_params.model
617 | quality = context.options.whisper_params.quality.value
618 | logger.info(
619 | f"Transcribing audio with model '{model}' (quality: {quality})",
620 | emoji_key="transcribe"
621 | )
622 |
623 | try:
624 | transcript_result = await transcribe_with_whisper(context)
625 | context.processing_times["transcription"] = time.time() - transcribe_start
626 |
627 | raw_transcript = transcript_result.get("text", "")
628 | segments = transcript_result.get("segments", [])
629 |
630 | # Check if transcript is empty
631 | if not raw_transcript:
632 | logger.warning("Whisper returned an empty transcript", emoji_key="warning")
633 | else:
634 | transcript_length = len(raw_transcript)
635 | segments_count = len(segments)
636 | logger.info(
637 | f"Transcription complete: {transcript_length} characters, {segments_count} segments",
638 | emoji_key="success"
639 | )
640 |
641 | # Extract metadata if available
642 | metadata = transcript_result.get("metadata", {})
643 | if context.language_code and "language" not in metadata:
644 | metadata["language"] = context.language_code
645 |
646 | except Exception as e:
647 | logger.error(f"Transcription failed: {str(e)}", emoji_key="error", exc_info=True)
648 | context.processing_times["transcription"] = time.time() - transcribe_start
649 | return {
650 | "raw_transcript": "",
651 | "enhanced_transcript": "",
652 | "segments": [],
653 | "metadata": {"language": context.language_code} if context.language_code else {},
654 | "audio_info": audio_info,
655 | "processing_time": context.processing_times,
656 | "success": False,
657 | "error": f"Transcription failed: {str(e)}"
658 | }
659 |
660 | # --- Enhance Transcript ---
661 | enhanced_transcript = raw_transcript
662 | enhancement_cost = 0.0
663 | enhancement_tokens = {"input": 0, "output": 0, "total": 0}
664 |
665 | if context.options.enhance_transcript and raw_transcript:
666 | enhance_start = time.time()
667 | logger.info(
668 | f"Enhancing transcript with style: {context.options.enhancement_params.style.value}",
669 | emoji_key="enhance",
670 | provider=context.options.enhancement_params.provider
671 | )
672 |
673 | try:
674 | enhancement_result = await enhance_transcript(context, raw_transcript, metadata)
675 |
676 | enhanced_transcript = enhancement_result["transcript"]
677 | enhancement_cost = enhancement_result["cost"]
678 | enhancement_tokens = enhancement_result["tokens"]
679 |
680 | if "topics" in enhancement_result and enhancement_result["topics"]:
681 | metadata["topics"] = enhancement_result["topics"]
682 |
683 | if "title" in enhancement_result and enhancement_result["title"]:
684 | metadata["title"] = enhancement_result["title"]
685 |
686 | context.processing_times["transcript_enhancement"] = time.time() - enhance_start
687 |
688 | if not enhanced_transcript:
689 | logger.warning("Enhancement returned an empty transcript, falling back to raw transcript", emoji_key="warning")
690 | enhanced_transcript = raw_transcript
691 | else:
692 | enhancement_length = len(enhanced_transcript)
693 | logger.info(f"Enhancement complete: {enhancement_length} characters", emoji_key="success")
694 |
695 | except Exception as e:
696 | logger.error(f"Transcript enhancement failed: {e}", emoji_key="error", exc_info=True)
697 | context.processing_times["transcript_enhancement"] = time.time() - enhance_start
698 | # Fall back to raw transcript
699 | enhanced_transcript = raw_transcript
700 | else:
701 | if not raw_transcript:
702 | logger.warning("Skipping transcript enhancement because raw transcript is empty", emoji_key="warning")
703 | elif not context.options.enhance_transcript:
704 | logger.info("Transcript enhancement disabled by options", emoji_key="info")
705 |
706 | context.processing_times["transcript_enhancement"] = 0
707 |
708 | # --- Generate Output Files ---
709 | artifact_paths = await generate_output_files(context, raw_transcript, enhanced_transcript, segments, metadata)
710 |
711 | # --- Prepare Result ---
712 | result = {
713 | "raw_transcript": raw_transcript,
714 | "enhanced_transcript": enhanced_transcript,
715 | "segments": segments,
716 | "metadata": metadata,
717 | "audio_info": audio_info,
718 | "tokens": enhancement_tokens,
719 | "cost": enhancement_cost,
720 | "artifacts": artifact_paths,
721 | "success": bool(raw_transcript or enhanced_transcript)
722 | }
723 |
724 | return result
725 |
726 |
727 | def parse_options(options: Dict[str, Any]) -> TranscriptionOptions:
728 | """Parse and validate transcription options."""
729 | # Convert string output formats to enum values
730 | if "output_formats" in options:
731 | if isinstance(options["output_formats"], list):
732 | formats = []
733 | for fmt in options["output_formats"]:
734 | if isinstance(fmt, str):
735 | try:
736 | formats.append(OutputFormat(fmt.lower()))
737 | except ValueError:
738 | logger.warning(f"Invalid output format: {fmt}, ignoring")
739 | elif isinstance(fmt, OutputFormat):
740 | formats.append(fmt)
741 | if formats: # Only update if we have valid formats
742 | options["output_formats"] = formats
743 |
744 | # Handle nested parameter objects
745 | for key in ["audio_params", "whisper_params", "enhancement_params"]:
746 | if key in options and options[key]:
747 | # If a dictionary is provided, keep it for Pydantic
748 | if not isinstance(options[key], dict):
749 | # Convert non-dict to dict by serializing/deserializing if possible
750 | try:
751 | options[key] = json.loads(json.dumps(options[key]))
752 | except Exception:
753 | # If can't convert, remove the invalid value
754 | logger.warning(f"Invalid format for {key}, using defaults")
755 | options.pop(key)
756 |
757 | # Set audio profile parameters if a profile is specified
758 | if "audio_params" in options and "profile" in options["audio_params"]:
759 | profile = options["audio_params"]["profile"]
760 | if isinstance(profile, str):
761 | try:
762 | profile = AudioEnhancementProfile(profile.lower())
763 | # Update audio parameters based on the selected profile
764 | options["audio_params"].update(_get_audio_profile_params(profile))
765 | except ValueError:
766 | logger.warning(f"Invalid audio profile: {profile}, using default")
767 |
768 | # Set whisper quality parameters if quality is specified
769 | if "whisper_params" in options and "quality" in options["whisper_params"]:
770 | quality = options["whisper_params"]["quality"]
771 | if isinstance(quality, str):
772 | try:
773 | quality = TranscriptionQuality(quality.lower())
774 | # Update whisper parameters based on the selected quality
775 | options["whisper_params"].update(_get_whisper_quality_params(quality))
776 | except ValueError:
777 | logger.warning(f"Invalid transcription quality: {quality}, using default")
778 |
779 | # Parse with Pydantic model
780 | try:
781 | return TranscriptionOptions(**options)
782 | except Exception as e:
783 | logger.warning(f"Error parsing options: {e}, using defaults with valid values", emoji_key="warning")
784 | # Create with default options
785 | return TranscriptionOptions()
786 |
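# Illustrative sketch of the normalization performed above: string values are turned into the
# enums defined earlier, and selecting a profile/quality also merges in the matching preset.
#
#   opts = parse_options({
#       "output_formats": ["text", "vtt"],
#       "audio_params": {"profile": "noisy"},       # merges the 'noisy' preset below
#       "whisper_params": {"quality": "maximum"},   # merges beam_size=10, processors=4, ...
#   })
#   assert opts.audio_params.noise_reduction == 20  # value comes from the 'noisy' profile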
787 |
788 | def _get_audio_profile_params(profile: AudioEnhancementProfile) -> Dict[str, Any]:
789 | """Get audio enhancement parameters for a specific profile."""
790 | profiles = {
791 | AudioEnhancementProfile.CONFERENCE_CALL: {
792 | "volume": 1.5,
793 | "noise_reduction": 10,
794 | "highpass": 200,
795 | "lowpass": 3000,
796 | "compression": True,
797 | "normalize": True,
798 | "dereverberation": False
799 | },
800 | AudioEnhancementProfile.INTERVIEW: {
801 | "volume": 1.3,
802 | "noise_reduction": 8,
803 | "highpass": 150,
804 | "lowpass": 8000,
805 | "compression": True,
806 | "normalize": True,
807 | "dereverberation": False
808 | },
809 | AudioEnhancementProfile.LECTURE: {
810 | "volume": 1.4,
811 | "noise_reduction": 6,
812 | "highpass": 120,
813 | "lowpass": 8000,
814 | "compression": True,
815 | "normalize": True,
816 | "dereverberation": True
817 | },
818 | AudioEnhancementProfile.NOISY: {
819 | "volume": 1.8,
820 | "noise_reduction": 20,
821 | "highpass": 250,
822 | "lowpass": 3000,
823 | "compression": True,
824 | "normalize": True,
825 | "dereverberation": True
826 | },
827 | AudioEnhancementProfile.PHONE_CALL: {
828 | "volume": 2.0,
829 | "noise_reduction": 15,
830 | "highpass": 300,
831 | "lowpass": 3400,
832 | "compression": True,
833 | "normalize": True,
834 | "dereverberation": False
835 | },
836 | AudioEnhancementProfile.VOICEMAIL: {
837 | "volume": 2.0,
838 | "noise_reduction": 12,
839 | "highpass": 250,
840 | "lowpass": 3000,
841 | "compression": True,
842 | "normalize": True,
843 | "dereverberation": False
844 | }
845 | }
846 |
847 | return profiles.get(profile, {})
848 |
849 |
850 | def _get_whisper_quality_params(quality: TranscriptionQuality) -> Dict[str, Any]:
851 | """Get whisper parameters for a specific quality level."""
852 | quality_params = {
853 | TranscriptionQuality.DRAFT: {
854 | "beam_size": 1,
855 | "processors": 1,
856 | "word_timestamps": False,
857 | "highlight_words": False
858 | },
859 | TranscriptionQuality.STANDARD: {
860 | "beam_size": 5,
861 | "processors": 2,
862 | "word_timestamps": True,
863 | "highlight_words": False
864 | },
865 | TranscriptionQuality.ENHANCED: {
866 | "beam_size": 8,
867 | "processors": 2,
868 | "word_timestamps": True,
869 | "highlight_words": True
870 | },
871 | TranscriptionQuality.MAXIMUM: {
872 | "beam_size": 10,
873 | "processors": 4,
874 | "word_timestamps": True,
875 | "highlight_words": True
876 | }
877 | }
878 |
879 | return quality_params.get(quality, {})
880 |
881 |
882 | def _update_parameters_from_audio_info(context: ProcessingContext, audio_info: Dict[str, Any]) -> None:
883 | """Update processing parameters based on audio file analysis."""
884 | # If mono audio, adjust enhancement params
885 | audio_params = context.options.audio_params
886 | updated_params = False
887 |
888 | if audio_info.get("channels", 0) == 1:
889 |         # Set output channels to match the mono input if the caller did not explicitly set one
890 |         if "output_channels" not in context.options.audio_params.__fields_set__:
891 | # Create a copy with the updated parameter
892 | audio_params = AudioEnhancementParams(
893 | **{**audio_params.dict(), "output_channels": 1}
894 | )
895 | updated_params = True
896 |
897 | # If low-quality audio, adjust enhancement profile
898 | sample_rate = audio_info.get("sample_rate", 0)
899 | if sample_rate < 16000 and context.options.audio_params.profile == AudioEnhancementProfile.CONFERENCE_CALL:
900 | logger.info(f"Detected low sample rate ({sample_rate} Hz), adjusting enhancement profile", emoji_key="audio")
901 | # Use phone_call profile for low sample rate audio
902 | params = _get_audio_profile_params(AudioEnhancementProfile.PHONE_CALL)
903 | # Create a copy with the updated parameters
904 | audio_params = AudioEnhancementParams(
905 | **{**audio_params.dict(), **params}
906 | )
907 | updated_params = True
908 |
909 | # If params were updated, create a new options object with the updated audio_params
910 | if updated_params:
911 | context.options = TranscriptionOptions(
912 | **{**context.options.dict(), "audio_params": audio_params}
913 | )
914 |
915 | # If very short audio (<10 seconds), adjust transcription quality
916 | duration = audio_info.get("duration", 0)
917 | if duration < 10 and context.options.whisper_params.quality != TranscriptionQuality.MAXIMUM:
918 | logger.info(f"Short audio detected ({duration:.1f}s), increasing transcription quality", emoji_key="audio")
919 | # Create a new whisper_params with enhanced quality
920 | whisper_params = WhisperParams(
921 | **{**context.options.whisper_params.dict(), "quality": TranscriptionQuality.ENHANCED}
922 | )
923 | # Update options with new whisper_params
924 | context.options = TranscriptionOptions(
925 | **{**context.options.dict(), "whisper_params": whisper_params}
926 | )
927 |
928 |
929 | # --- Dependency and Audio Processing Functions ---
930 |
931 | async def download_whisper_model(model_name: str, output_path: str) -> bool:
932 | """Download a Whisper model from Hugging Face using httpx.
933 |
934 | Args:
935 | model_name: Name of the model to download (e.g. 'large-v3')
936 | output_path: Path where to save the model file
937 |
938 | Returns:
939 | True if download was successful, False otherwise
940 | """
941 | url = f"https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-{model_name}.bin"
942 | logger.info(f"Downloading Whisper model '{model_name}' from {url}", emoji_key="download")
943 |
944 | # Expected model size
945 | min_size_bytes = WHISPER_MODEL_SIZES.get(model_name, 100000000) # Default to 100MB minimum if unknown
946 | expected_size_bytes = min_size_bytes # Initialize with minimum size
947 |
948 | try:
949 | async with httpx.AsyncClient(timeout=None) as client:
950 | # Make HEAD request first to check if the URL is valid and get expected size
951 | try:
952 | head_response = await client.head(url)
953 | if head_response.status_code != 200:
954 | logger.warning(f"Model URL is not accessible: HTTP {head_response.status_code} for {url}", emoji_key="warning")
955 | return False
956 |
957 | # If Content-Length is available, use it to check expected size
958 | content_length = int(head_response.headers.get("content-length", 0))
959 | if content_length > 0:
960 | expected_size_mb = content_length / (1024 * 1024)
961 | logger.info(f"Expected model size: {expected_size_mb:.1f} MB", emoji_key="info")
962 |
963 | # Update expected size if it's larger than our preset minimum
964 | if content_length > min_size_bytes:
965 | expected_size_bytes = int(content_length * 0.95) # Allow 5% tolerance
966 | except Exception as e:
967 | logger.warning(f"Failed to validate model URL: {url} - Error: {str(e)}", emoji_key="warning")
968 | # Continue anyway, the GET might still work
969 |
970 | # Stream the response to handle large files
971 | try:
972 | async with client.stream("GET", url) as response:
973 | if response.status_code != 200:
974 | logger.warning(f"Failed to download model: HTTP {response.status_code} for {url}", emoji_key="warning")
975 | if response.status_code == 404:
976 | logger.warning(f"Model '{model_name}' not found on Hugging Face repository", emoji_key="warning")
977 | return False
978 |
979 | # Get content length if available
980 | content_length = int(response.headers.get("content-length", 0))
981 | if content_length == 0:
982 | logger.warning("Content length is zero or not provided", emoji_key="warning")
983 |
984 | downloaded = 0
985 |
986 | # Open file for writing
987 | try:
988 | with open(output_path, "wb") as f:
989 | # Display progress
990 | last_log_time = time.time()
991 | async for chunk in response.aiter_bytes(chunk_size=8192):
992 | f.write(chunk)
993 | downloaded += len(chunk)
994 |
995 | # Log progress every 5 seconds
996 | now = time.time()
997 | if now - last_log_time > 5:
998 | if content_length > 0:
999 | percent = downloaded / content_length * 100
1000 | logger.info(f"Download progress: {percent:.1f}% ({downloaded/(1024*1024):.1f} MB)", emoji_key="download")
1001 | else:
1002 | logger.info(f"Downloaded {downloaded/(1024*1024):.1f} MB", emoji_key="download")
1003 | last_log_time = now
1004 | except IOError as e:
1005 | logger.warning(f"Failed to write to file {output_path}: {str(e)}", emoji_key="warning")
1006 | return False
1007 | except httpx.RequestError as e:
1008 | logger.warning(f"HTTP request error while downloading model: {str(e)}", emoji_key="warning")
1009 | return False
1010 |
1011 | # Verify the file was downloaded
1012 | if not os.path.exists(output_path):
1013 | logger.warning(f"Downloaded file doesn't exist at {output_path}", emoji_key="warning")
1014 | return False
1015 |
1016 | actual_size = os.path.getsize(output_path)
1017 | if actual_size == 0:
1018 | logger.warning(f"Downloaded file is empty at {output_path}", emoji_key="warning")
1019 | return False
1020 |
1021 | # Verify file size meets expectations
1022 | actual_size_mb = actual_size / (1024 * 1024)
1023 | if actual_size < expected_size_bytes:
1024 | logger.warning(
1025 | f"Model file size ({actual_size_mb:.1f} MB) is smaller than expected minimum size "
1026 | f"({expected_size_bytes/(1024*1024):.1f} MB). File may be corrupted or incomplete.",
1027 | emoji_key="warning"
1028 | )
1029 | return False
1030 |
1031 | logger.info(f"Successfully downloaded model to {output_path} ({actual_size_mb:.1f} MB)", emoji_key="success")
1032 | return True
1033 |
1034 | except Exception as e:
1035 | logger.warning(f"Error downloading whisper model: {e}", emoji_key="warning", exc_info=True)
1036 | return False
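
# Example call (illustrative sketch): the destination path follows the layout expected by
# check_dependencies() below, and the model name is one of the keys in WHISPER_MODEL_SIZES.
#
#   ok = await download_whisper_model(
#       "large-v3-turbo",
#       os.path.expanduser("~/whisper.cpp/models/ggml-large-v3-turbo.bin"),
#   )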
1037 |
1038 | async def check_dependencies(context: ProcessingContext) -> bool:
1039 | """Verifies that required dependencies are installed and accessible."""
1040 | # Check ffmpeg
1041 | try:
1042 | result = await asyncio.create_subprocess_exec(
1043 | "ffmpeg", "-version",
1044 | stdout=asyncio.subprocess.PIPE,
1045 | stderr=asyncio.subprocess.PIPE
1046 | )
1047 | stdout, stderr = await result.communicate()
1048 |
1049 | if result.returncode != 0:
1050 | raise ResourceError(
1051 | "ffmpeg is not installed or not accessible. Please install it with 'apt install ffmpeg'."
1052 | )
1053 |
1054 | # Extract ffmpeg version for logging
1055 | version_match = re.search(r'ffmpeg version (\S+)', stdout.decode('utf-8', errors='ignore'))
1056 | version = version_match.group(1) if version_match else "unknown"
1057 | logger.debug(f"Found ffmpeg version {version}", emoji_key="dependency")
1058 |
1059 | except FileNotFoundError as e:
1060 | raise ResourceError(
1061 | "ffmpeg is not installed. Please install it with 'apt install ffmpeg'."
1062 | ) from e
1063 |
1064 | # Check whisper.cpp
1065 | whisper_path = os.path.expanduser("~/whisper.cpp")
1066 |
1067 | # Use user-supplied model
1068 | model = context.options.whisper_params.model
1069 | model_path = os.path.join(whisper_path, "models", f"ggml-{model}.bin")
1070 |
1071 | if not os.path.exists(whisper_path):
1072 | raise ResourceError(
1073 | f"whisper.cpp not found at {whisper_path}. Please install it first."
1074 | )
1075 |
1076 | if not os.path.exists(model_path):
1077 | # Check if models directory exists
1078 | models_dir = os.path.join(whisper_path, "models")
1079 | if not os.path.exists(models_dir):
1080 | try:
1081 | os.makedirs(models_dir)
1082 | logger.info(f"Created models directory at {models_dir}", emoji_key="info")
1083 | except Exception as e:
1084 | raise ResourceError(f"Failed to create models directory: {e}") from e
1085 |
1086 | # Try to automatically download the model using httpx
1087 | logger.info(f"Whisper model '{model}' not found at {model_path}", emoji_key="info")
1088 | logger.info(f"Attempting to download model '{model}' now...", emoji_key="download")
1089 |
1090 | # Download the model directly using httpx - first check if model exists
1091 | if model == "large-v3":
1092 | # Double check if the model actually exists
1093 | test_url = f"https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-{model}.bin"
1094 | async with httpx.AsyncClient(timeout=10.0) as client:
1095 | try:
1096 | head_response = await client.head(test_url)
1097 | if head_response.status_code != 200:
1098 | # Model might not be directly available - show a clear error
1099 | if head_response.status_code == 404:
1100 | raise ResourceError(
1101 | f"Whisper model 'large-v3' not found in the HuggingFace repository at {test_url}\n"
1102 | f"Please download it manually with one of these commands:\n"
1103 | f"1. ~/whisper.cpp/models/download-ggml-model.sh large-v3\n"
1104 | f"2. Or try a different model like 'large-v3-turbo' which is known to be available"
1105 | )
1106 | except Exception as e:
1107 | logger.warning(f"Error checking model availability: {e}", emoji_key="warning")
1108 | # Continue with download attempt anyway
1109 |
1110 | # Attempt to download the model
1111 | download_success = await download_whisper_model(model, model_path)
1112 | if not download_success:
1113 | # Modified error message with alternative suggestions
1114 | if model == "large-v3":
1115 | raise ResourceError(
1116 | f"Failed to download Whisper model '{model}'.\n"
1117 | f"You can:\n"
1118 | f"1. Try using a different model like 'large-v3-turbo' which is more reliable\n"
1119 | f"2. Or download manually with: ~/whisper.cpp/models/download-ggml-model.sh {model}"
1120 | )
1121 | else:
1122 | raise ResourceError(
1123 | f"Failed to download Whisper model '{model}'.\n"
1124 | f"Please download it manually with: ~/whisper.cpp/models/download-ggml-model.sh {model}"
1125 | )
1126 |
1127 | # Verify that the model was downloaded
1128 | if not os.path.exists(model_path):
1129 | raise ResourceError(
1130 | f"Model download completed but model file not found at {model_path}. "
1131 | f"Please check the download and try again."
1132 | )
1133 |
1134 | # Verify model file size
1135 | actual_size = os.path.getsize(model_path)
1136 | expected_min_size = WHISPER_MODEL_SIZES.get(model, 100000000) # Default to 100MB minimum if unknown
1137 |
1138 | if actual_size < expected_min_size:
1139 | actual_size_mb = actual_size / (1024 * 1024)
1140 | expected_mb = expected_min_size / (1024 * 1024)
1141 | raise ResourceError(
1142 | f"Downloaded model file at {model_path} is too small ({actual_size_mb:.1f} MB). "
1143 | f"Expected at least {expected_mb:.1f} MB. File may be corrupted or incomplete. "
1144 | f"Please download it manually with: ~/whisper.cpp/models/download-ggml-model.sh {model}"
1145 | )
1146 |
1147 | logger.info(f"Successfully downloaded Whisper model '{model}'", emoji_key="success")
1148 | else:
1149 | # Verify existing model file size
1150 | actual_size = os.path.getsize(model_path)
1151 | expected_min_size = WHISPER_MODEL_SIZES.get(model, 100000000) # Default to 100MB minimum if unknown
1152 |
1153 | file_size_mb = actual_size / (1024 * 1024)
1154 | if actual_size < expected_min_size:
1155 | expected_mb = expected_min_size / (1024 * 1024)
1156 | logger.warning(
1157 | f"Existing model at {model_path} is suspiciously small ({file_size_mb:.1f} MB). "
1158 | f"Expected at least {expected_mb:.1f} MB. Model may be corrupted.",
1159 | emoji_key="warning"
1160 | )
1161 | else:
1162 | logger.info(f"Found existing model at {model_path} ({file_size_mb:.1f} MB)", emoji_key="dependency")
1163 |
1164 |     # Check if the whisper-cli binary is available in PATH (list-form subprocess call; no shell is invoked)
1165 | try:
1166 | result = subprocess.run(["which", "whisper-cli"], capture_output=True, text=True)
1167 | if result.returncode == 0:
1168 | whisper_path_found = result.stdout.strip()
1169 | logger.debug(f"Found whisper-cli in PATH: {whisper_path_found}", emoji_key="dependency")
1170 | else:
1171 | # Check in the expected location
1172 | whisper_path = os.path.expanduser("~/whisper.cpp")
1173 | whisper_bin = os.path.join(whisper_path, "build", "bin", "whisper-cli")
1174 | if not os.path.exists(whisper_bin):
1175 | raise ResourceError(
1176 | f"whisper-cli binary not found at {whisper_bin}. "
1177 | f"Please build whisper.cpp first with: "
1178 | f"cd ~/whisper.cpp && cmake -B build && cmake --build build -j --config Release"
1179 | )
1180 | logger.debug(f"Found whisper-cli at {whisper_bin}", emoji_key="dependency")
1181 | except FileNotFoundError as e:
1182 | raise ResourceError("Command 'which' not found. Cannot check for whisper-cli installation.") from e
1183 |
1184 | logger.debug(f"Found whisper model: {model}", emoji_key="dependency")
1185 | return True
1186 |
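# For reference, the local whisper.cpp layout assumed by the checks above (paths as used in
# this module; the model file name depends on the configured model):
#
#   ~/whisper.cpp/
#   ├── models/ggml-<model>.bin    # e.g. ggml-large-v3-turbo.bin
#   └── build/bin/whisper-cli      # built with: cmake -B build && cmake --build build -j --config Release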
1187 |
1188 | async def get_detailed_audio_info(file_path: str) -> Dict[str, Any]:
1189 | """Gets detailed information about an audio file using ffprobe."""
1190 | cmd = [
1191 | "ffprobe",
1192 | "-v", "error",
1193 | "-show_entries", "format=duration,bit_rate,size",
1194 | "-select_streams", "a:0",
1195 | "-show_entries", "stream=channels,sample_rate,codec_name,bits_per_sample",
1196 | "-of", "json",
1197 | file_path
1198 | ]
1199 |
1200 | try:
1201 | process = await asyncio.create_subprocess_exec(
1202 | *cmd,
1203 | stdout=asyncio.subprocess.PIPE,
1204 | stderr=asyncio.subprocess.PIPE
1205 | )
1206 | stdout, stderr = await process.communicate()
1207 |
1208 | if process.returncode != 0:
1209 | logger.warning(f"Failed to get audio info: {stderr.decode('utf-8', errors='ignore')}", emoji_key="warning")
1210 | return {
1211 | "duration": 0,
1212 | "channels": 0,
1213 | "sample_rate": 0,
1214 | "format": os.path.splitext(file_path)[1][1:],
1215 | "codec": None,
1216 | "bit_depth": None,
1217 | "bitrate": None,
1218 | "size_bytes": os.path.getsize(file_path) if os.path.exists(file_path) else 0
1219 | }
1220 |
1221 | info = json.loads(stdout)
1222 | format_info = info.get("format", {})
1223 | stream_info = info.get("streams", [{}])[0] if info.get("streams") else {}
1224 |
1225 | # Extract information
1226 | duration = float(format_info.get("duration", 0))
1227 | channels = int(stream_info.get("channels", 0))
1228 | sample_rate = int(stream_info.get("sample_rate", 0))
1229 | codec = stream_info.get("codec_name")
1230 | bit_depth = int(stream_info.get("bits_per_sample", 0)) or None
1231 | bitrate = int(format_info.get("bit_rate", 0)) or None
1232 | size_bytes = int(format_info.get("size", 0)) or os.path.getsize(file_path)
1233 | audio_format = os.path.splitext(file_path)[1][1:]
1234 |
1235 | return {
1236 | "duration": duration,
1237 | "channels": channels,
1238 | "sample_rate": sample_rate,
1239 | "format": audio_format,
1240 | "codec": codec,
1241 | "bit_depth": bit_depth,
1242 | "bitrate": bitrate,
1243 | "size_bytes": size_bytes
1244 | }
1245 | except Exception as e:
1246 | logger.warning(f"Error getting audio info: {e}", emoji_key="warning", exc_info=True)
1247 | try:
1248 | size_bytes = os.path.getsize(file_path) if os.path.exists(file_path) else 0
1249 | except Exception:
1250 | size_bytes = 0
1251 |
1252 | return {
1253 | "duration": 0,
1254 | "channels": 0,
1255 | "sample_rate": 0,
1256 | "format": os.path.splitext(file_path)[1][1:],
1257 | "codec": None,
1258 | "bit_depth": None,
1259 | "bitrate": None,
1260 | "size_bytes": size_bytes
1261 | }
1262 |
1263 |
1264 | async def enhance_audio(context: ProcessingContext, audio_info: Dict[str, Any]) -> Optional[str]:
1265 | """Enhances audio quality using ffmpeg preprocessing."""
1266 | # Create output path in temp directory
1267 | output_path = os.path.join(context.temp_dir, f"{context.base_filename}_enhanced.wav")
1268 |
1269 |     # Apply a fixed, general-purpose enhancement filter chain (independent of the per-request AudioEnhancementParams)
1270 |     # Build the complete ffmpeg command with these standardized settings
1271 | cmd = [
1272 | "ffmpeg",
1273 | "-i", context.file_path,
1274 | "-threads", str(os.cpu_count() or 1),
1275 | "-af", "volume=1.5, highpass=f=200, lowpass=f=3000, afftdn=nr=10:nf=-20, "
1276 | "compand=attacks=0:points=-80/-80|-45/-15|-27/-9|0/-7|20/-7:gain=5, "
1277 | "dynaudnorm=f=150:g=15:p=1:m=1:s=0, "
1278 | "pan=stereo|c0=c0|c1=c0",
1279 | "-ar", "16000",
1280 | "-ac", "2",
1281 | "-c:a", "pcm_s16le",
1282 | "-y", # Overwrite output if exists
1283 | output_path
1284 | ]
1285 |
1286 | logger.debug(f"Running ffmpeg command: {' '.join(cmd)}", emoji_key="command")
1287 |
1288 | try:
1289 | process = await asyncio.create_subprocess_exec(
1290 | *cmd,
1291 | stdout=asyncio.subprocess.PIPE,
1292 | stderr=asyncio.subprocess.PIPE
1293 | )
1294 | stdout, stderr = await process.communicate()
1295 |
1296 | if process.returncode != 0:
1297 | error_msg = stderr.decode('utf-8', errors='ignore')
1298 | logger.error(f"FFmpeg error: {error_msg}", emoji_key="error")
1299 | return None
1300 |
1301 | # Verify the output file was created and has a reasonable size
1302 | if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
1303 | file_size_mb = os.path.getsize(output_path) / (1024 * 1024)
1304 | logger.info(f"Enhanced audio saved to {output_path} ({file_size_mb:.2f} MB)", emoji_key="audio")
1305 | else:
1306 | logger.warning("Enhanced audio file is suspiciously small or doesn't exist", emoji_key="warning")
1307 | return None
1308 |
1309 | # If we're keeping enhanced audio, copy it to a persistent location
1310 | if context.options.save_enhanced_audio:
1311 | original_dir = os.path.dirname(context.file_path)
1312 | persistent_path = os.path.join(original_dir, f"{context.base_filename}_enhanced.wav")
1313 | shutil.copy2(output_path, persistent_path)
1314 | logger.info(f"Saved enhanced audio to {persistent_path}", emoji_key="save")
1315 |
1316 | logger.info("Audio enhancement completed", emoji_key="audio")
1317 | return output_path
1318 | except Exception as e:
1319 | logger.error(f"Error enhancing audio: {e}", emoji_key="error", exc_info=True)
1320 | return None
1321 |
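# For reference, the invocation built above corresponds roughly to this shell command
# (illustrative input/output paths; the thread count depends on the host):
#
#   ffmpeg -i input.m4a -threads 8 -af "volume=1.5, highpass=f=200, lowpass=f=3000, afftdn=nr=10:nf=-20, compand=attacks=0:points=-80/-80|-45/-15|-27/-9|0/-7|20/-7:gain=5, dynaudnorm=f=150:g=15:p=1:m=1:s=0, pan=stereo|c0=c0|c1=c0" -ar 16000 -ac 2 -c:a pcm_s16le -y input_enhanced.wav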
1322 |
1323 | async def transcribe_with_whisper(context: ProcessingContext) -> Dict[str, Any]:
1324 | """Transcribes audio using Whisper.cpp with advanced options."""
1325 | # Create output base name in temp directory
1326 | output_base = os.path.join(context.temp_dir, context.base_filename)
1327 | output_json = f"{output_base}.json"
1328 | output_txt = f"{output_base}.txt"
1329 |
1330 | # Get whisper parameters
1331 | params = context.options.whisper_params
1332 |
1333 | # Build command with configurable parameters
1334 | whisper_bin = os.path.expanduser("~/whisper.cpp/build/bin/whisper-cli")
1335 | model_path = os.path.expanduser(f"~/whisper.cpp/models/ggml-{params.model}.bin")
1336 |
1337 | # Validate required files exist
1338 | if not os.path.exists(whisper_bin):
1339 | logger.error(f"Whisper binary not found at {whisper_bin}", emoji_key="error")
1340 | raise ToolError(f"Whisper binary not found at {whisper_bin}")
1341 |
1342 | if not os.path.exists(model_path):
1343 | logger.error(f"Whisper model not found at {model_path}", emoji_key="error")
1344 | raise ToolError(f"Whisper model not found at {model_path}")
1345 |
1346 | # Verify model file size
1347 | actual_size = os.path.getsize(model_path)
1348 | expected_min_size = WHISPER_MODEL_SIZES.get(params.model, 100000000) # Default to 100MB minimum if unknown
1349 | if actual_size < expected_min_size:
1350 | actual_size_mb = actual_size / (1024 * 1024)
1351 | expected_mb = expected_min_size / (1024 * 1024)
1352 | logger.warning(
1353 | f"Model file at {model_path} is smaller than expected ({actual_size_mb:.1f} MB, expected {expected_mb:.1f} MB)",
1354 | emoji_key="warning"
1355 | )
1356 |
1357 | # Use file_path as fallback if enhanced_audio_path is None
1358 | audio_path = context.enhanced_audio_path or context.file_path
1359 | if not os.path.exists(audio_path):
1360 | logger.error(f"Audio file not found at {audio_path}", emoji_key="error")
1361 | raise ToolError(f"Audio file not found at {audio_path}")
1362 |
1363 | cmd = [
1364 | whisper_bin,
1365 | "-m", model_path,
1366 | "-f", audio_path,
1367 | "-of", output_base,
1368 | "-oj" # Always output JSON for post-processing
1369 | ]
1370 |
1371 | # Add boolean flags
1372 | if params.word_timestamps:
1373 | cmd.append("-pc")
1374 |
1375 | if params.translate:
1376 | cmd.append("-tr")
1377 |
1378 | # Always output text for readability
1379 | cmd.append("-otxt")
1380 |
1381 | # Add numeric parameters
1382 | cmd.extend(["-t", str(os.cpu_count() if params.processors <= 0 else params.processors)])
1383 |
1384 | if params.beam_size:
1385 | cmd.extend(["-bs", str(params.beam_size)])
1386 |
1387 | # Add language parameter if specified
1388 | if params.language:
1389 | cmd.extend(["-l", params.language])
1390 |
1391 | # Add max context parameter if specified
1392 | if params.max_context > 0:
1393 | cmd.extend(["-mc", str(params.max_context)])
1394 |
1395 | # Additional optimizations
1396 | cmd.append("-fa") # Full sentence timestamps (improved segmentation)
1397 | cmd.append("-pp") # Enable post-processing
1398 |
1399 | # Add custom vocab if specified (create a vocab file)
1400 | if params.custom_vocab:
1401 | vocab_path = os.path.join(context.temp_dir, "custom_vocab.txt")
1402 | try:
1403 | async with aiofiles.open(vocab_path, 'w') as f:
1404 | await f.write("\n".join(params.custom_vocab))
1405 | cmd.extend(["-kv", vocab_path])
1406 | except Exception as e:
1407 | logger.warning(f"Failed to create custom vocab file: {e}", emoji_key="warning")
1408 |
1409 | # Add diarization if requested
1410 | if params.diarize:
1411 | cmd.append("-dm")
1412 |
1413 | cmd_str = ' '.join(cmd)
1414 | logger.debug(f"Running whisper command: {cmd_str}", emoji_key="command")
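    # Illustrative example of an assembled command (paths and values are hypothetical;
    # optional flags such as -bs, -l, -mc, -kv and -dm appear only when the
    # corresponding options are set):
    #   ~/whisper.cpp/build/bin/whisper-cli -m ~/whisper.cpp/models/ggml-large-v3.bin \
    #       -f /tmp/job/talk_enhanced.wav -of /tmp/job/talk -oj -otxt -t 8 -fa -pp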
1415 |
1416 | try:
1417 | process = await asyncio.create_subprocess_exec(
1418 | *cmd,
1419 | stdout=asyncio.subprocess.PIPE,
1420 | stderr=asyncio.subprocess.PIPE
1421 | )
1422 | stdout, stderr = await process.communicate()
1423 |
1424 | stderr_output = stderr.decode('utf-8', errors='ignore') if stderr else ""
1425 | stdout_output = stdout.decode('utf-8', errors='ignore') if stdout else ""
1426 |
1427 | # Log outputs for debugging
1428 | if stdout_output:
1429 | logger.debug(f"Whisper stdout (first 500 chars): {stdout_output[:500]}", emoji_key="info")
1430 |
1431 | if stderr_output:
1432 | if process.returncode != 0:
1433 | logger.error(f"Whisper stderr: {stderr_output}", emoji_key="error")
1434 | elif "error" in stderr_output.lower() or "warning" in stderr_output.lower():
1435 | logger.warning(f"Whisper warnings/errors: {stderr_output}", emoji_key="warning")
1436 | else:
1437 | logger.debug(f"Whisper stderr: {stderr_output}", emoji_key="info")
1438 |
1439 | if process.returncode != 0:
1440 | error_msg = stderr_output or "Unknown error"
1441 | logger.error(f"Whisper transcription error (exit code {process.returncode}): {error_msg}", emoji_key="error")
1442 | raise ToolError(f"Whisper transcription failed with exit code {process.returncode}: {error_msg}")
1443 |
1444 | # Check if output files exist
1445 | if not os.path.exists(output_json) and not os.path.exists(output_txt):
1446 | logger.error("Whisper completed successfully but no output files were created", emoji_key="error")
1447 | raise ToolError("Whisper completed successfully but no output files were created")
1448 |
1449 | # Read results from the JSON file
1450 | if os.path.exists(output_json):
1451 | json_file_size = os.path.getsize(output_json)
1452 | logger.debug(f"Reading JSON output file: {output_json} ({json_file_size} bytes)", emoji_key="info")
1453 |
1454 | if json_file_size < 10: # Suspiciously small
1455 | logger.warning(f"JSON output file is suspiciously small: {json_file_size} bytes", emoji_key="warning")
1456 |
1457 | try:
1458 | async with aiofiles.open(output_json, 'r') as f:
1459 | content = await f.read()
1460 |
1461 | try:
1462 | result = json.loads(content)
1463 |
1464 | # Fix missing fields in result if needed
1465 | if "segments" not in result:
1466 | logger.warning("No segments found in Whisper JSON output", emoji_key="warning")
1467 | result["segments"] = []
1468 |
1469 | if "text" not in result or not result.get("text"):
1470 | logger.warning("No transcript text found in Whisper JSON output", emoji_key="warning")
1471 | # Try to construct text from segments
1472 | if result.get("segments"):
1473 | reconstructed_text = " ".join([seg.get("text", "") for seg in result["segments"]])
1474 | if reconstructed_text:
1475 | logger.info("Reconstructed transcript text from segments", emoji_key="info")
1476 | result["text"] = reconstructed_text
1477 | else:
1478 | result["text"] = ""
1479 | else:
1480 | result["text"] = ""
1481 |
1482 | # Extract metadata
1483 | metadata = {
1484 | "language": context.language_code or result.get("language"),
1485 | "duration": result.get("duration", 0)
1486 | }
1487 | result["metadata"] = metadata
1488 |
1489 | except json.JSONDecodeError as e:
1490 | logger.error(f"Failed to parse Whisper JSON output: {e}", emoji_key="error")
1491 | logger.error(f"JSON content: {content[:1000]}...", emoji_key="error")
1492 | raise ToolError(f"Failed to parse Whisper output JSON: {e}") from e
1493 | except Exception as e:
1494 | logger.error(f"Failed to read Whisper JSON output file: {e}", emoji_key="error")
1495 | raise ToolError(f"Failed to read Whisper output: {e}") from e
1496 | else:
1497 | logger.warning(f"Whisper JSON output not found at expected path: {output_json}", emoji_key="warning")
1498 | # Fallback to text file
1499 | if os.path.exists(output_txt):
1500 | txt_file_size = os.path.getsize(output_txt)
1501 | logger.info(f"Falling back to text output file: {output_txt} ({txt_file_size} bytes)", emoji_key="info")
1502 |
1503 | if txt_file_size < 10: # Suspiciously small
1504 | logger.warning(f"Text output file is suspiciously small: {txt_file_size} bytes", emoji_key="warning")
1505 |
1506 | try:
1507 | async with aiofiles.open(output_txt, 'r') as f:
1508 | text = await f.read()
1509 |
1510 | # Create minimal result structure
1511 | result = {
1512 | "text": text,
1513 | "segments": [{"text": text, "start": 0, "end": 0}],
1514 | "metadata": {
1515 | "language": context.language_code,
1516 | "duration": 0
1517 | }
1518 | }
1519 |
1520 | if not text:
1521 | logger.warning("Text output file is empty", emoji_key="warning")
1522 | except Exception as e:
1523 | logger.error(f"Failed to read text output file: {e}", emoji_key="error")
1524 | raise ToolError(f"Failed to read Whisper text output: {e}") from e
1525 | else:
1526 | logger.error(f"No output files found from Whisper at {output_base}.*", emoji_key="error")
1527 | raise ToolError("No output files found from Whisper transcription")
1528 |
1529 | # Check if we actually got a transcript
1530 | if not result.get("text"):
1531 | logger.warning("Whisper returned an empty transcript", emoji_key="warning")
1532 |
1533 | # Clean up results (remove empty/duplicate segments)
1534 | cleaned_segments = clean_segments(result.get("segments", []))
1535 | result["segments"] = cleaned_segments
1536 |
1537 | # Clean up text (remove dots-only lines, etc.)
1538 | result["text"] = clean_raw_transcript(result.get("text", ""))
1539 |
1540 | return result
1541 |
1542 | except ToolError:
1543 | raise
1544 | except Exception as e:
1545 | logger.error(f"Error in Whisper transcription: {e}", emoji_key="error", exc_info=True)
1546 | raise ToolError(f"Whisper transcription failed: {str(e)}") from e
1547 |
1548 |
1549 | def clean_segments(segments: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
1550 | """Clean and normalize segment data."""
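    # Illustrative behaviour: segments whose text is only dots/whitespace are dropped, and
    # two segments with identical text whose start/end differ by less than 0.5s collapse
    # into one; repeats with clearly different timing are kept as genuine repetition.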
1551 | cleaned_segments = []
1552 | seen_texts = set()
1553 |
1554 | for segment in segments:
1555 | # Skip segments with empty or meaningless text
1556 | text = segment.get("text", "").strip()
1557 | if not text or re.match(r'^[\s.]*$', text):
1558 | continue
1559 |
1560 |         # Skip exact text duplicates whose start/end times are within 0.5s of a
1561 |         # previously kept segment; repeats with clearly different timing are kept
1562 | is_duplicate = False
1563 | if text in seen_texts:
1564 | for prev_segment in cleaned_segments:
1565 | if prev_segment.get("text") == text:
1566 | start_diff = abs(prev_segment.get("start", 0) - segment.get("start", 0))
1567 | end_diff = abs(prev_segment.get("end", 0) - segment.get("end", 0))
1568 | if start_diff < 0.5 and end_diff < 0.5:
1569 | is_duplicate = True
1570 | break
1571 |
1572 | if is_duplicate:
1573 | continue
1574 |
1575 | # Add to seen texts
1576 | seen_texts.add(text)
1577 |
1578 | # Standardize segment structure
1579 | clean_segment = {
1580 | "start": float(segment.get("start", 0)),
1581 | "end": float(segment.get("end", 0)),
1582 | "text": text
1583 | }
1584 |
1585 | # Add optional fields if available
1586 | for field in ["speaker", "words", "confidence"]:
1587 | if field in segment:
1588 | clean_segment[field] = segment[field]
1589 |
1590 | cleaned_segments.append(clean_segment)
1591 |
1592 | # Sort by start time
1593 | cleaned_segments.sort(key=lambda x: x["start"])
1594 |
1595 | return cleaned_segments
1596 |
1597 |
1598 | def clean_raw_transcript(text: str) -> str:
1599 | """Cleans raw transcript."""
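    # Illustrative behaviour: blank lines, dot-only lines and "[BLANK_AUDIO]" markers are
    # dropped, runs of whitespace collapse to single spaces, and short duplicate lines
    # (50 characters or fewer) are removed while longer repeats are kept.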
1600 | if not text:
1601 | return ""
1602 |
1603 | # Split into lines and process
1604 | lines = text.split("\n")
1605 | cleaned_lines = []
1606 | seen_lines = set()
1607 |
1608 | for line in lines:
1609 | line = line.strip()
1610 | # Skip empty lines
1611 | if not line:
1612 | continue
1613 | # Skip lines with just dots or other meaningless patterns
1614 | if re.match(r'^[\s.]*$', line) or line == '[BLANK_AUDIO]':
1615 | continue
1616 |
1617 | # Standardize multiple spaces
1618 | line = re.sub(r'\s+', ' ', line)
1619 |
1620 |         # Drop short duplicate lines (<= 50 chars); longer repeats are likely legitimate repetition
1621 | if line in seen_lines and len(line) <= 50:
1622 | continue
1623 |
1624 | seen_lines.add(line)
1625 | cleaned_lines.append(line)
1626 |
1627 | # Join but ensure there's proper spacing
1628 | return "\n".join(cleaned_lines)
1629 |
1630 |
1631 | # --- Transcript Enhancement Functions ---
1632 |
1633 | async def enhance_transcript(
1634 | context: ProcessingContext,
1635 | transcript: str,
1636 | metadata: Dict[str, Any]
1637 | ) -> Dict[str, Any]:
1638 | """Enhance a transcript with formatting, readability and semantic structuring."""
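    # Pipeline sketch: split the raw transcript into chunks -> analyze the first chunk for
    # context, topics and a title -> enhance each chunk with the LLM (in parallel or
    # sequentially, per options) -> optionally add section headings -> total up tokens/cost.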
1639 | if not transcript or transcript.strip() == "":
1640 | return {
1641 | "transcript": "",
1642 | "tokens": {"input": 0, "output": 0, "total": 0},
1643 | "cost": 0.0,
1644 | "topics": [],
1645 | "title": None
1646 | }
1647 |
1648 | # Extract key parameters
1649 | params = context.options.enhancement_params
1650 |
1651 | # Split transcript into manageable chunks
1652 | chunks = await chunk_text(transcript, params.max_chunk_size)
1653 |
1654 | if not chunks:
1655 | return {
1656 | "transcript": transcript,
1657 | "tokens": {"input": 0, "output": 0, "total": 0},
1658 | "cost": 0.0,
1659 | "topics": [],
1660 | "title": None
1661 | }
1662 |
1663 | # First analyze context to get a summary of the content
1664 | context_data = await detect_subject_matter(
1665 | chunks[0],
1666 | params.provider,
1667 | params.model,
1668 | metadata
1669 | )
1670 |
1671 | # Track topics if available
1672 | topics = context_data.get("topics", [])
1673 | title = context_data.get("title")
1674 | context_info = context_data.get("context", "")
1675 |
1676 | logger.info(f"Content analysis complete: {len(topics)} topics identified", emoji_key="analyze")
1677 |
1678 | # Process chunks concurrently if parallel processing is enabled
1679 | if context.options.parallel_processing and len(chunks) > 1:
1680 | enhanced_chunks = await process_chunks_parallel(
1681 | context,
1682 | chunks,
1683 | context_info,
1684 | params
1685 | )
1686 | else:
1687 | enhanced_chunks = await process_chunks_sequential(
1688 | context,
1689 | chunks,
1690 | context_info,
1691 | params
1692 | )
1693 |
1694 | # Calculate total metrics
1695 | total_tokens = {"input": 0, "output": 0, "total": 0}
1696 | total_cost = 0.0
1697 |
1698 | for chunk_data in enhanced_chunks:
1699 | chunk_tokens = chunk_data.get("tokens", {})
1700 | total_tokens["input"] += chunk_tokens.get("input", 0)
1701 | total_tokens["output"] += chunk_tokens.get("output", 0)
1702 | total_tokens["total"] += chunk_tokens.get("total", 0)
1703 | total_cost += chunk_data.get("cost", 0.0)
1704 |
1705 | # Join the enhanced chunks
1706 | enhanced_transcript = "\n\n".join(chunk_data["text"] for chunk_data in enhanced_chunks)
1707 |
1708 | # If sections are enabled, try to add section headings
1709 | if params.sections and topics:
1710 | enhanced_transcript = await add_section_headings(
1711 | enhanced_transcript,
1712 | topics,
1713 | params.provider,
1714 | params.model
1715 | )
1716 |
1717 | return {
1718 | "transcript": enhanced_transcript,
1719 | "tokens": total_tokens,
1720 | "cost": total_cost,
1721 | "topics": topics,
1722 | "title": title
1723 | }
1724 |
1725 |
1726 | async def chunk_text(text: str, max_chunk_size: int = 6500) -> List[str]:
1727 | """Split text into chunks with intelligent boundary detection."""
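    # Boundary preference, strongest to weakest: triple newline, double newline, sentence
    # end followed by a capital letter, any sentence end, then comma/colon/semicolon.
    # If no match lands past the halfway point, fall back to the last space (or a hard cut),
    # and chunks shorter than ~100 characters are merged into the previous chunk at the end.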
1728 | if len(text) <= max_chunk_size:
1729 | return [text]
1730 |
1731 | # Define patterns for natural breaks, prioritized
1732 | patterns = [
1733 | r'\n\s*\n\s*\n', # Triple line break (highest priority)
1734 | r'\n\s*\n', # Double line break
1735 | r'(?<=[.!?])\s+(?=[A-Z])', # Sentence boundary with capital letter following
1736 | r'(?<=[.!?])\s', # Any sentence boundary
1737 | r'(?<=[,:;])\s' # Phrase boundary (lowest priority)
1738 | ]
1739 |
1740 | chunks = []
1741 | remaining_text = text
1742 |
1743 | while remaining_text:
1744 | if len(remaining_text) <= max_chunk_size:
1745 | chunks.append(remaining_text)
1746 | break
1747 |
1748 | # Start with an initial chunk at max size
1749 | chunk_candidate = remaining_text[:max_chunk_size]
1750 | split_position = None
1751 |
1752 | # Try each pattern in order of priority
1753 | for pattern in patterns:
1754 | # Look for the last occurrence of the pattern
1755 | matches = list(re.finditer(pattern, chunk_candidate))
1756 | if matches:
1757 | # Use the last match as the split point
1758 | split_position = matches[-1].end()
1759 | break
1760 |
1761 |         # Fallback if no natural break was found, or the best break sits in the first half of the chunk
1762 | if split_position is None or split_position < max_chunk_size // 2:
1763 | # Look for the last space after a minimum chunk size
1764 | min_size = max(max_chunk_size // 2, 1000)
1765 | last_space = chunk_candidate.rfind(' ', min_size)
1766 | if last_space > min_size:
1767 | split_position = last_space
1768 | else:
1769 | # Forced split at max_chunk_size
1770 | split_position = max_chunk_size
1771 |
1772 | # Create chunk and update remaining text
1773 | chunks.append(remaining_text[:split_position].strip())
1774 | remaining_text = remaining_text[split_position:].strip()
1775 |
1776 | # Validate chunks
1777 | validated_chunks = []
1778 | for chunk in chunks:
1779 | if chunk and len(chunk) >= 100: # Minimum viable chunk size
1780 | validated_chunks.append(chunk)
1781 | elif chunk:
1782 | # If chunk is too small, combine with previous or next chunk
1783 | if validated_chunks:
1784 | validated_chunks[-1] += "\n\n" + chunk
1785 | else:
1786 | # This is the first chunk and it's too small - rare case
1787 | validated_chunks.append(chunk)
1788 |
1789 | return validated_chunks
1790 |
1791 |
1792 | async def detect_subject_matter(
1793 | text: str,
1794 | provider: str,
1795 | model: Optional[str] = None,
1796 | metadata: Optional[Dict[str, Any]] = None
1797 | ) -> Dict[str, Any]:
1798 | """Analyze transcript to determine subject matter, topics, and title."""
1799 | prompt = """Analyze this transcript excerpt for the following:
1800 |
1801 | 1. CONTEXT: The primary domain or topic being discussed (e.g., technology, business, healthcare, etc.)
1802 | 2. SPEAKERS: The likely type and number of speakers (e.g., interview, panel, lecture, etc.)
1803 | 3. TOPICS: List 2-5 specific topics covered, in order of importance
1804 | 4. TITLE: A short, descriptive title for this content (under 10 words)
1805 |
1806 | Return your analysis in JSON format ONLY:
1807 | {{
1808 | "context": "Brief description of the domain and conversation type",
1809 | "topics": ["Topic 1", "Topic 2", "Topic 3"],
1810 | "title": "Concise descriptive title"
1811 | }}
1812 |
1813 | Transcript excerpt:
1814 | {text}"""
1815 |
1816 | # Include metadata if available
1817 | if metadata and metadata.get("language"):
1818 | language = metadata.get("language")
1819 | prompt += f"\n\nMetadata: The transcript language is {language}."
1820 |
1821 | try:
1822 | result = await chat_completion(
1823 | messages=[{"role": "user", "content": prompt.format(text=text)}],
1824 | provider=provider,
1825 | model=model,
1826 | temperature=0
1827 | )
1828 |
1829 | if result.get("success") and "message" in result:
1830 | content = result["message"].get("content", "")
1831 |
1832 | # Try to parse JSON from content
1833 | try:
1834 | # Extract JSON if it's embedded in text
1835 | json_match = re.search(r'({[\s\S]*})', content)
1836 | if json_match:
1837 | analysis = json.loads(json_match.group(1))
1838 | else:
1839 | analysis = json.loads(content)
1840 |
1841 | return {
1842 | "context": analysis.get("context", ""),
1843 | "topics": analysis.get("topics", []),
1844 | "title": analysis.get("title")
1845 | }
1846 | except json.JSONDecodeError:
1847 | # Fallback: extract fields manually
1848 | context_match = re.search(r'context["\s:]+([^"}\n]+)', content, re.IGNORECASE)
1849 | title_match = re.search(r'title["\s:]+([^"}\n]+)', content, re.IGNORECASE)
1850 | topics_match = re.search(r'topics["\s:]+\[(.*?)\]', content, re.IGNORECASE | re.DOTALL)
1851 |
1852 | context = context_match.group(1).strip() if context_match else ""
1853 | title = title_match.group(1).strip() if title_match else None
1854 |
1855 | topics = []
1856 | if topics_match:
1857 | topics_text = topics_match.group(1)
1858 | topics = [t.strip().strip('"\'') for t in re.findall(r'"([^"]+)"', topics_text)]
1859 | if not topics:
1860 | topics = [t.strip() for t in topics_text.split(',') if t.strip()]
1861 |
1862 | return {
1863 | "context": context,
1864 | "topics": topics,
1865 | "title": title
1866 | }
1867 | except Exception as e:
1868 | logger.warning(f"Subject matter detection failed: {e}", emoji_key="warning")
1869 |
1870 | return {
1871 | "context": "",
1872 | "topics": [],
1873 | "title": None
1874 | }
1875 |
1876 |
1877 | async def process_chunks_parallel(
1878 | context: ProcessingContext,
1879 | chunks: List[str],
1880 | context_info: str,
1881 | params: TranscriptEnhancementParams
1882 | ) -> List[Dict[str, Any]]:
1883 | """Process transcript chunks in parallel for better performance."""
1884 | # Limit max workers to CPU count or specified max
1885 | max_workers = min(context.options.max_workers, os.cpu_count() or 4, len(chunks))
1886 |
1887 | # Create a semaphore to limit concurrency
1888 | sem = asyncio.Semaphore(max_workers)
1889 |
1890 |     # Each chunk is enhanced as a separate async task; the semaphore above caps concurrency
1892 |
1893 | async def process_chunk(i, chunk):
1894 | """Process an individual chunk."""
1895 | async with sem: # Use semaphore to limit concurrency
1896 | logger.info(f"Enhancing chunk {i+1}/{len(chunks)}", emoji_key="enhance")
1897 | try:
1898 | result = await enhance_chunk(chunk, context_info, params, i, len(chunks))
1899 | return result
1900 | except Exception as e:
1901 | logger.error(f"Error enhancing chunk {i+1}: {e}", emoji_key="error", exc_info=True)
1902 | return {"text": chunk, "tokens": {"input": 0, "output": 0, "total": 0}, "cost": 0.0}
1903 |
1904 | # Create tasks for parallel execution
1905 | tasks = [process_chunk(i, chunk) for i, chunk in enumerate(chunks)]
1906 | chunk_results = await asyncio.gather(*tasks)
1907 |
1908 | # Make sure results are in original order (they should be)
1909 | return chunk_results
1910 |
1911 |
1912 | async def process_chunks_sequential(
1913 | context: ProcessingContext,
1914 | chunks: List[str],
1915 | context_info: str,
1916 | params: TranscriptEnhancementParams
1917 | ) -> List[Dict[str, Any]]:
1918 | """Process transcript chunks sequentially to preserve context flow."""
1919 | enhanced_chunks = []
1920 | accumulated_context = context_info
1921 |
1922 | for i, chunk in enumerate(chunks):
1923 | logger.info(f"Enhancing chunk {i+1}/{len(chunks)}", emoji_key="enhance")
1924 |
1925 | # Update context with information from previous chunks
1926 | if i > 0 and enhanced_chunks:
1927 | # Add brief summary of what was covered in previous chunk
1928 | previous_text = enhanced_chunks[-1]["text"]
1929 | if len(previous_text) > 500:
1930 | accumulated_context += f"\nPrevious chunk ended with: {previous_text[-500:]}"
1931 | else:
1932 | accumulated_context += f"\nPrevious chunk: {previous_text}"
1933 |
1934 | try:
1935 | result = await enhance_chunk(chunk, accumulated_context, params, i, len(chunks))
1936 | enhanced_chunks.append(result)
1937 | except Exception as e:
1938 | logger.error(f"Error enhancing chunk {i+1}: {e}", emoji_key="error", exc_info=True)
1939 | # Use original text on error
1940 | enhanced_chunks.append({"text": chunk, "tokens": {"input": 0, "output": 0, "total": 0}, "cost": 0.0})
1941 |
1942 | return enhanced_chunks
1943 |
1944 |
1945 | async def enhance_chunk(
1946 | chunk: str,
1947 | context_info: str,
1948 | params: TranscriptEnhancementParams,
1949 | chunk_index: int,
1950 | total_chunks: int
1951 | ) -> Dict[str, Any]:
1952 | """Enhance a single transcript chunk with LLM."""
1953 | # Build the prompt based on enhancement parameters
1954 | style_instructions = _get_style_instructions(params.style)
1955 | fix_instructions = []
1956 |
1957 | if params.add_paragraphs:
1958 | fix_instructions.append("- Add paragraph breaks at natural topic transitions")
1959 |
1960 | if params.fix_spelling:
1961 | fix_instructions.append("- Fix obvious spelling errors while preserving domain-specific terms")
1962 |
1963 | if params.fix_grammar:
1964 | fix_instructions.append("- Fix basic grammatical errors without changing style or meaning")
1965 |
1966 | if params.format_numbers:
1967 | fix_instructions.append("- Format numbers consistently (e.g., '25' instead of 'twenty-five')")
1968 |
1969 | if params.identify_speakers:
1970 | fix_instructions.append("- Try to identify different speakers and label them as Speaker 1, Speaker 2, etc.")
1971 | fix_instructions.append("- Format speaker changes as 'Speaker N: text' on a new line")
1972 |
1973 | fix_section = "\n".join(fix_instructions) if fix_instructions else "None"
1974 |
1975 | # Add custom instructions if provided
1976 | custom_section = f"\nADDITIONAL INSTRUCTIONS:\n{params.custom_instructions}" if params.custom_instructions else ""
1977 |
1978 | # Mention chunk position in context
1979 | position_info = f"This is chunk {chunk_index+1} of {total_chunks}." if total_chunks > 1 else ""
1980 |
1981 | prompt = f"""You are cleaning up a raw transcript from a recorded conversation. {position_info}
1982 |
1983 | CONTENT CONTEXT: {context_info}
1984 |
1985 | ENHANCEMENT STYLE: {style_instructions}
1986 |
1987 | CLEANUP INSTRUCTIONS:
1988 | 1. Remove filler sounds: "um", "uh", "er", "ah", "hmm"
1989 | 2. Remove stutters and word repetitions: "the- the", "I- I"
1990 | 3. Remove meaningless filler phrases when used as pure filler: "you know", "like", "sort of"
1991 | 4. Fix clear transcription errors and garbled text
1992 | 5. Add proper punctuation for readability
1993 | {fix_section}
1994 |
1995 | STRICT PRESERVATION RULES:
1996 | 1. DO NOT modify, rephrase, or restructure ANY of the speaker's content
1997 | 2. DO NOT add ANY new content or explanations
1998 | 3. DO NOT make the language more formal or technical
1999 | 4. DO NOT summarize or condense anything
2000 | 5. PRESERVE ALL technical terms, numbers, and specific details exactly as spoken
2001 | 6. PRESERVE the speaker's unique speaking style and personality
2002 | {custom_section}
2003 |
2004 | Here's the transcript chunk to clean:
2005 | {chunk}
2006 |
2007 | Return ONLY the cleaned transcript text with no explanations, comments, or metadata."""
2008 |
2009 | try:
2010 | result = await chat_completion(
2011 | messages=[{"role": "user", "content": prompt}],
2012 | provider=params.provider,
2013 | model=params.model,
2014 | temperature=0.1, # Low temperature for consistent results
2015 | max_tokens=min(len(chunk) * 2, 8192) # Reasonable token limit based on input size
2016 | )
2017 |
2018 | if result.get("success") and "message" in result:
2019 | enhanced_text = result["message"].get("content", "").strip()
2020 |
2021 | # Validation: if enhanced text is much shorter, it might have been summarized
2022 | if len(enhanced_text) < len(chunk) * 0.6:
2023 | logger.warning(
2024 | f"Enhanced text suspiciously short ({len(enhanced_text)} vs {len(chunk)} chars), "
2025 | f"may have been summarized. Using original.",
2026 | emoji_key="warning"
2027 | )
2028 | enhanced_text = chunk
2029 |
2030 | return {
2031 | "text": enhanced_text,
2032 | "tokens": result.get("tokens", {"input": 0, "output": 0, "total": 0}),
2033 | "cost": result.get("cost", 0.0)
2034 | }
2035 |
2036 | return {"text": chunk, "tokens": {"input": 0, "output": 0, "total": 0}, "cost": 0.0}
2037 | except Exception as e:
2038 | logger.error(f"Chunk enhancement error: {e}", emoji_key="error")
2039 | return {"text": chunk, "tokens": {"input": 0, "output": 0, "total": 0}, "cost": 0.0}
2040 |
2041 |
2042 | def _get_style_instructions(style: EnhancementStyle) -> str:
2043 | """Get instructions for the specified enhancement style."""
2044 | styles = {
2045 | EnhancementStyle.RAW: (
2046 | "Minimal cleaning only. Preserve all speech patterns and informality. "
2047 | "Focus on removing only transcription errors and unintelligible elements."
2048 | ),
2049 | EnhancementStyle.READABLE: (
2050 | "Basic readability improvements. Light cleanup while preserving natural speech patterns. "
2051 | "Remove only clear disfluencies and maintain conversational flow."
2052 | ),
2053 | EnhancementStyle.POLISHED: (
2054 | "Well-formatted with proper punctuation and clean sentences. "
2055 | "Remove speech disfluencies but preserve the speaker's voice and style. "
2056 | "Create a professional but authentic reading experience."
2057 | ),
2058 | EnhancementStyle.VERBATIM: (
2059 | "Preserve all speech patterns, hesitations, and repetitions. "
2060 | "Format for readability but maintain every verbal quirk and pause. "
2061 | "Indicate hesitations with ellipses [...] and preserve every repeated word or phrase."
2062 | ),
2063 | EnhancementStyle.STRUCTURED: (
2064 | "Add semantic structure with clear paragraphs around topics. "
2065 | "Clean speech for maximum readability while preserving content accuracy. "
2066 | "Organize into logical sections while keeping all original information."
2067 | )
2068 | }
2069 |
2070 | return styles.get(style, styles[EnhancementStyle.READABLE])
2071 |
2072 |
2073 | async def add_section_headings(
2074 | transcript: str,
2075 | topics: List[str],
2076 | provider: str,
2077 | model: Optional[str] = None
2078 | ) -> str:
2079 | """Add section headings to the transcript based on topic changes."""
2080 | if not transcript or not topics:
2081 | return transcript
2082 |
2083 | prompt = """Add clear section headings to this transcript based on topic changes.
2084 |
2085 | TOPICS COVERED (in approximate order):
2086 | {topics}
2087 |
2088 | RULES:
2089 | 1. Insert section headings as "## [Topic]" on their own line
2090 | 2. Place headings ONLY where there is a clear topic change
2091 | 3. Use at most {max_sections} headings total
2092 | 4. NEVER add content or edit the existing text
2093 | 5. NEVER remove any original content
2094 | 6. Base headings on the given topics list, but you can adjust wording for clarity
2095 | 7. Don't duplicate headings for the same topic
2096 | 8. Keep headings short and descriptive (2-6 words each)
2097 |
2098 | TRANSCRIPT:
2099 | {text}
2100 |
2101 | Return the full transcript with section headings added."""
2102 |
2103 | # Adjust max sections based on transcript length
2104 | token_estimate = len(transcript) // 4
2105 | max_sections = min(len(topics) + 1, token_estimate // 1000 + 1)
2106 | topics_text = "\n".join([f"- {topic}" for topic in topics])
2107 |
2108 | try:
2109 | result = await chat_completion(
2110 | messages=[{
2111 | "role": "user",
2112 | "content": prompt.format(topics=topics_text, text=transcript, max_sections=max_sections)
2113 | }],
2114 | provider=provider,
2115 | model=model,
2116 | temperature=0.1,
2117 | max_tokens=min(len(transcript) * 2, 8192)
2118 | )
2119 |
2120 | if result.get("success") and "message" in result:
2121 | return result["message"].get("content", "").strip()
2122 |
2123 | return transcript
2124 | except Exception as e:
2125 | logger.warning(f"Failed to add section headings: {e}", emoji_key="warning")
2126 | return transcript
2127 |
2128 |
2129 | # --- Output File Generation ---
2130 |
2131 | async def generate_output_files(
2132 | context: ProcessingContext,
2133 | raw_transcript: str,
2134 | enhanced_transcript: str,
2135 | segments: List[Dict[str, Any]],
2136 | metadata: Dict[str, Any]
2137 | ) -> Dict[str, Any]:
2138 | """Generate output files in requested formats."""
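    # Illustrative return shape (file names here are hypothetical):
    # {
    #     "output_files": {"json": Path(".../talk_transcript_20240101_120000.json"),
    #                      "text": Path(".../talk_transcript_20240101_120000.txt")},
    #     "enhanced_audio": ".../talk_enhanced.wav"  # only when save_enhanced_audio is set
    # }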
2139 | artifact_paths = {
2140 | "output_files": {}
2141 | }
2142 |
2143 | # Save enhanced audio path if requested
2144 | if context.options.save_enhanced_audio and context.enhanced_audio_path:
2145 | original_dir = os.path.dirname(context.file_path)
2146 | persistent_path = os.path.join(original_dir, f"{context.base_filename}_enhanced.wav")
2147 |
2148 | # May have already been saved during enhancement
2149 | if not os.path.exists(persistent_path) and os.path.exists(context.enhanced_audio_path):
2150 | shutil.copy2(context.enhanced_audio_path, persistent_path)
2151 |
2152 | artifact_paths["enhanced_audio"] = persistent_path
2153 |
2154 | # Generate requested output formats
2155 | output_formats = context.options.output_formats
2156 |     # Use a Path for the output directory so the per-format output paths are easy to build
2157 |     output_dir = Path(os.path.dirname(context.file_path))
2158 |     timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
2162 |
2163 | # Generate JSON output
2164 | if OutputFormat.JSON in output_formats:
2165 | json_path = output_dir / f"{context.base_filename}_transcript_{timestamp}.json"
2166 |
2167 | # Create JSON data structure
2168 | json_data = {
2169 | "metadata": {
2170 | "filename": context.original_filename,
2171 | "processed_at": timestamp,
2172 | **metadata
2173 | },
2174 | "raw_transcript": raw_transcript,
2175 | "enhanced_transcript": enhanced_transcript,
2176 | "segments": segments
2177 | }
2178 |
2179 | # Write JSON file
2180 | async with aiofiles.open(str(json_path), 'w') as f:
2181 | await f.write(json.dumps(json_data, indent=2))
2182 |
2183 | artifact_paths["output_files"]["json"] = json_path
2184 |
2185 | # Generate TEXT output
2186 | if OutputFormat.TEXT in output_formats:
2187 | text_path = output_dir / f"{context.base_filename}_transcript_{timestamp}.txt"
2188 |
2189 | # Create plain text file
2190 | async with aiofiles.open(str(text_path), 'w') as f:
2191 | # Add metadata header if available
2192 | if metadata:
2193 | if "title" in metadata and metadata["title"]:
2194 | await f.write(f"Title: {metadata['title']}\n")
2195 | if "language" in metadata and metadata["language"]:
2196 | await f.write(f"Language: {metadata['language']}\n")
2197 | if "topics" in metadata and metadata["topics"]:
2198 | topics_str = ", ".join(metadata["topics"])
2199 | await f.write(f"Topics: {topics_str}\n")
2200 | await f.write("\n")
2201 |
2202 | # Write enhanced transcript if available, otherwise use raw transcript
2203 | await f.write(enhanced_transcript or raw_transcript)
2204 |
2205 | artifact_paths["output_files"]["text"] = text_path
2206 |
2207 | # Generate SRT output
2208 | if OutputFormat.SRT in output_formats:
2209 | srt_path = output_dir / f"{context.base_filename}_transcript_{timestamp}.srt"
2210 |
2211 | # Convert segments to SRT format
2212 | srt_content = generate_srt(segments)
2213 |
2214 | # Write SRT file
2215 | async with aiofiles.open(str(srt_path), 'w') as f:
2216 | await f.write(srt_content)
2217 |
2218 | artifact_paths["output_files"]["srt"] = srt_path
2219 |
2220 | # Generate VTT output
2221 | if OutputFormat.VTT in output_formats:
2222 | vtt_path = output_dir / f"{context.base_filename}_transcript_{timestamp}.vtt"
2223 |
2224 | # Convert segments to VTT format
2225 | vtt_content = generate_vtt(segments)
2226 |
2227 | # Write VTT file
2228 | async with aiofiles.open(str(vtt_path), 'w') as f:
2229 | await f.write(vtt_content)
2230 |
2231 | artifact_paths["output_files"]["vtt"] = vtt_path
2232 |
2233 | # Generate Markdown output
2234 | if OutputFormat.MARKDOWN in output_formats:
2235 | md_path = output_dir / f"{context.base_filename}_transcript_{timestamp}.md"
2236 |
2237 | # Create markdown content
2238 | md_content = generate_markdown(enhanced_transcript, metadata)
2239 |
2240 | # Write markdown file
2241 | async with aiofiles.open(str(md_path), 'w') as f:
2242 | await f.write(md_content)
2243 |
2244 | artifact_paths["output_files"]["markdown"] = md_path
2245 |
2246 | # Generate DOCX output (if supported)
2247 | if OutputFormat.DOCX in output_formats:
2248 | try:
2249 | docx_path = output_dir / f"{context.base_filename}_transcript_{timestamp}.docx"
2250 |
2251 | # Generate DOCX in a thread pool to avoid blocking
2252 | with concurrent.futures.ThreadPoolExecutor() as executor:
2253 | await asyncio.get_event_loop().run_in_executor(
2254 | executor,
2255 | generate_docx,
2256 | docx_path,
2257 | enhanced_transcript,
2258 | metadata
2259 | )
2260 |
2261 | artifact_paths["output_files"]["docx"] = docx_path
2262 |         except Exception as e:  # covers a missing python-docx install as well as generation errors
2263 | logger.warning(f"Failed to generate DOCX output: {e}", emoji_key="warning")
2264 |
2265 | return artifact_paths
2266 |
2267 |
2268 | def generate_srt(segments: List[Dict[str, Any]]) -> str:
2269 | """Generate SRT format from segments."""
2270 | srt_lines = []
2271 |
2272 | for i, segment in enumerate(segments):
2273 | # Convert times to SRT format (HH:MM:SS,mmm)
2274 | start_time = segment.get("start", 0)
2275 | end_time = segment.get("end", 0)
2276 |
2277 | start_str = format_srt_time(start_time)
2278 | end_str = format_srt_time(end_time)
2279 |
2280 | # Format text
2281 | text = segment.get("text", "").replace("\n", " ")
2282 |
2283 | # Add speaker if available
2284 | if "speaker" in segment and segment["speaker"]:
2285 | text = f"[{segment['speaker']}] {text}"
2286 |
2287 | # Add to SRT
2288 | srt_lines.append(f"{i+1}")
2289 | srt_lines.append(f"{start_str} --> {end_str}")
2290 | srt_lines.append(f"{text}")
2291 | srt_lines.append("") # Empty line between entries
2292 |
2293 | return "\n".join(srt_lines)
2294 |
2295 |
2296 | def format_srt_time(seconds: float) -> str:
2297 | """Format seconds as SRT time: HH:MM:SS,mmm."""
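    # Example: 3661.5 seconds -> "01:01:01,500" (format_vtt_time below is identical except
    # that WebVTT uses a dot rather than a comma before the milliseconds).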
2298 | hours = int(seconds // 3600)
2299 | minutes = int((seconds % 3600) // 60)
2300 | sec_int = int(seconds % 60)
2301 | ms = int((seconds % 1) * 1000)
2302 | return f"{hours:02d}:{minutes:02d}:{sec_int:02d},{ms:03d}"
2303 |
2304 |
2305 | def generate_vtt(segments: List[Dict[str, Any]]) -> str:
2306 | """Generate WebVTT format from segments."""
2307 | vtt_lines = ["WEBVTT", ""]
2308 |
2309 | for segment in segments:
2310 | # Convert times to VTT format (HH:MM:SS.mmm)
2311 | start_time = segment.get("start", 0)
2312 | end_time = segment.get("end", 0)
2313 |
2314 | start_str = format_vtt_time(start_time)
2315 | end_str = format_vtt_time(end_time)
2316 |
2317 | # Format text
2318 | text = segment.get("text", "").replace("\n", " ")
2319 |
2320 | # Add speaker if available
2321 | if "speaker" in segment and segment["speaker"]:
2322 | text = f"<v {segment['speaker']}>{text}</v>"
2323 |
2324 | # Add to VTT
2325 | vtt_lines.append(f"{start_str} --> {end_str}")
2326 | vtt_lines.append(f"{text}")
2327 | vtt_lines.append("") # Empty line between entries
2328 |
2329 | return "\n".join(vtt_lines)
2330 |
2331 |
2332 | def format_vtt_time(seconds: float) -> str:
2333 | """Format seconds as WebVTT time: HH:MM:SS.mmm."""
2334 | hours = int(seconds // 3600)
2335 | minutes = int((seconds % 3600) // 60)
2336 | sec_fractional = seconds % 60
2337 | return f"{hours:02d}:{minutes:02d}:{sec_fractional:06.3f}"
2338 |
2339 |
2340 | def generate_markdown(transcript: str, metadata: Dict[str, Any]) -> str:
2341 | """Generate Markdown format for the transcript."""
2342 | md_lines = []
2343 |
2344 | # Add title
2345 | if "title" in metadata and metadata["title"]:
2346 | md_lines.append(f"# {metadata['title']}")
2347 | md_lines.append("")
2348 | else:
2349 | md_lines.append("# Transcript")
2350 | md_lines.append("")
2351 |
2352 | # Add metadata section
2353 | md_lines.append("## Metadata")
2354 | md_lines.append("")
2355 |
2356 | if "language" in metadata and metadata["language"]:
2357 | md_lines.append(f"- **Language:** {metadata['language']}")
2358 |
2359 | if "duration" in metadata and metadata["duration"]:
2360 | duration_min = int(metadata["duration"] // 60)
2361 | duration_sec = int(metadata["duration"] % 60)
2362 | md_lines.append(f"- **Duration:** {duration_min} min {duration_sec} sec")
2363 |
2364 | if "topics" in metadata and metadata["topics"]:
2365 | topics_str = ", ".join(metadata["topics"])
2366 | md_lines.append(f"- **Topics:** {topics_str}")
2367 |
2368 | md_lines.append("")
2369 | md_lines.append("## Transcript")
2370 | md_lines.append("")
2371 |
2372 | # Add transcript with proper line breaks preserved
2373 | for line in transcript.split("\n"):
2374 | md_lines.append(line)
2375 |
2376 | return "\n".join(md_lines)
2377 |
2378 |
2379 | def generate_docx(docx_path: str, transcript: str, metadata: Dict[str, Any]) -> None:
2380 | """Generate DOCX format for the transcript."""
2381 | # Must be run in a ThreadPoolExecutor since python-docx is not async
2382 | doc = Document()
2383 |
2384 | # Add title
2385 |     if "title" in metadata and metadata["title"]:
2386 |         doc.add_heading(metadata["title"], 0)
2387 |     else:
2388 |         doc.add_heading("Transcript", 0)
2389 |
2390 | # Add metadata section
2391 | doc.add_heading("Metadata", 1)
2392 |
2393 | if "language" in metadata and metadata["language"]:
2394 | doc.add_paragraph(f"Language: {metadata['language']}")
2395 |
2396 | if "duration" in metadata and metadata["duration"]:
2397 | duration_min = int(metadata["duration"] // 60)
2398 | duration_sec = int(metadata["duration"] % 60)
2399 | doc.add_paragraph(f"Duration: {duration_min} min {duration_sec} sec")
2400 |
2401 | if "topics" in metadata and metadata["topics"]:
2402 | topics_str = ", ".join(metadata["topics"])
2403 | doc.add_paragraph(f"Topics: {topics_str}")
2404 |
2405 | # Add transcript
2406 | doc.add_heading("Transcript", 1)
2407 |
2408 | # Split into paragraphs and add
2409 |     for paragraph in transcript.split("\n\n"):
2410 |         if paragraph.strip():
2411 |             # Handle section headings first so they don't leave an empty paragraph behind
2412 |             if paragraph.startswith("##"):
2413 |                 parts = paragraph.split(" ", 1)
2414 |                 if len(parts) > 1:
2415 |                     doc.add_heading(parts[1], 2)
2416 |                     continue
2417 | 
2418 |             # Regular paragraph
2419 |             p = doc.add_paragraph()
2420 |             p.add_run(paragraph)
2422 |
2423 | # Save the document
2424 | doc.save(docx_path)
2425 |
2426 |
2427 | async def chat_with_transcript(
2428 | transcript: str,
2429 | query: str,
2430 | provider: str = Provider.ANTHROPIC.value,
2431 | model: Optional[str] = None,
2432 | context: Optional[str] = None
2433 | ) -> Dict[str, Any]:
2434 | """Chat with a transcript to extract information or answer questions about its content.
2435 |
2436 | Args:
2437 | transcript: The transcript text to analyze
2438 | query: The question or instruction to process regarding the transcript
2439 | provider: LLM provider to use (default: Anthropic)
2440 | model: Specific model to use (default: provider's default model)
2441 | context: Optional additional context about the audio/transcript
2442 |
2443 | Returns:
2444 | A dictionary containing the response and related metadata
2445 | """
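    # Minimal usage sketch (the transcript text and question are hypothetical):
    #   answer = await chat_with_transcript(transcript_text, "Summarize the key decisions.")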
2446 | if not transcript or not isinstance(transcript, str):
2447 | raise ToolInputError("Transcript must be a non-empty string.")
2448 |
2449 | if not query or not isinstance(query, str):
2450 | raise ToolInputError("Query must be a non-empty string.")
2451 |
2452 | # Calculate token count for logging
2453 | try:
2454 | transcript_tokens = count_tokens(transcript, model)
2455 | query_tokens = count_tokens(query, model)
2456 | logger.info(
2457 | f"Transcript: {transcript_tokens} tokens, Query: {query_tokens} tokens",
2458 | emoji_key=TaskType.CHAT.value
2459 | )
2460 | except Exception as e:
2461 | logger.warning(f"Failed to count tokens: {e}", emoji_key="warning")
2462 |
2463 | # Build the prompt
2464 | system_prompt = """You are an expert at analyzing transcripts and extracting information.
2465 | Provide concise, accurate answers based solely on the provided transcript.
2466 | If the answer is not in the transcript, say so clearly."""
2467 |
2468 | if context:
2469 | system_prompt += f"\n\nAdditional context about this transcript: {context}"
2470 |
2471 | # Get provider instance to ensure it exists and is available
2472 | try:
2473 | provider_instance = await get_provider(provider)
2474 | if model is None:
2475 | # Check if the provider has a default model or use claude-3-7-sonnet as fallback
2476 | default_models = await provider_instance.list_models()
2477 | if default_models and len(default_models) > 0:
2478 | model = default_models[0].get("id")
2479 | else:
2480 | model = "claude-3-7-sonnet-20250219" if provider == Provider.ANTHROPIC.value else None
2481 |
2482 | logger.info(f"Using model: {provider}/{model}", emoji_key="model")
2483 | except Exception as e:
2484 | raise ProviderError(
2485 | f"Failed to initialize provider '{provider}': {str(e)}",
2486 | provider=provider,
2487 | cause=e
2488 | ) from e
2489 |
2490 | # Use relative file paths for any file references
2491 | rel_transcript_path = None
2492 | if os.path.exists(transcript):
2493 | rel_transcript_path = Path(transcript).relative_to(Path.cwd()) # noqa: F841
2494 |
2495 | # Create the message with the transcript and query
2496 | user_message = f"""Here is a transcript to analyze:
2497 |
2498 | ---TRANSCRIPT BEGIN---
2499 | {transcript}
2500 | ---TRANSCRIPT END---
2501 |
2502 | {query}"""
2503 |
2504 | # Send to LLM
2505 | result = await chat_completion(
2506 | messages=[{"role": "user", "content": user_message}],
2507 | provider=provider,
2508 | model=model,
2509 | system_prompt=system_prompt,
2510 | temperature=0.1
2511 | )
2512 |
2513 | return result
2514 |
2515 |
2516 | @with_cache(ttl=24 * 60 * 60) # Cache results for 24 hours
2517 | @with_tool_metrics
2518 | @with_retry(max_retries=1, retry_delay=1.0)
2519 | @with_error_handling
2520 | async def extract_audio_transcript_key_points(
2521 | file_path_or_transcript: str,
2522 | is_file: bool = True,
2523 | provider: str = Provider.ANTHROPIC.value,
2524 | model: Optional[str] = None,
2525 | max_points: int = 10
2526 | ) -> Dict[str, Any]:
2527 | """Extracts the most important key points from an audio transcript.
2528 |
2529 | This tool can process either an audio file (which it will transcribe first)
2530 | or directly analyze an existing transcript to identify the most important
2531 | information, main topics, and key takeaways.
2532 |
2533 | Args:
2534 | file_path_or_transcript: Path to audio file or transcript text content
2535 | is_file: Whether the input is a file path (True) or transcript text (False)
2536 | provider: LLM provider to use for analysis
2537 | model: Specific model to use (provider default if None)
2538 | max_points: Maximum number of key points to extract
2539 |
2540 | Returns:
2541 | A dictionary containing:
2542 | {
2543 | "key_points": ["Point 1", "Point 2", ...],
2544 | "summary": "Brief summary of the content",
2545 | "topics": ["Topic 1", "Topic 2", ...],
2546 | "speakers": ["Speaker 1", "Speaker 2", ...] (if multiple speakers detected),
2547 | "tokens": { statistics about token usage },
2548 | "cost": estimated cost of the operation,
2549 | "processing_time": total processing time in seconds
2550 | }
2551 | """
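    # Usage sketches (the file name is hypothetical):
    #   points = await extract_audio_transcript_key_points("meeting.mp3", max_points=5)
    #   points = await extract_audio_transcript_key_points(raw_text, is_file=False)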
2552 | start_time = time.time()
2553 |
2554 | # Get transcript from file or use provided text
2555 | transcript = ""
2556 | if is_file:
2557 | try:
2558 | # Validate file path
2559 | file_path = os.path.abspath(os.path.expanduser(file_path_or_transcript))
2560 | if not os.path.exists(file_path):
2561 | raise ToolInputError(f"File not found: {file_path}")
2562 |
2563 | # Get file info for logging
2564 | file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
2565 | file_name = Path(file_path).name
2566 |
2567 | logger.info(
2568 | f"Extracting key points from audio file: {file_name} ({file_size_mb:.2f} MB)",
2569 | emoji_key="audio"
2570 | )
2571 |
2572 | # Transcribe audio
2573 | transcription_result = await transcribe_audio(file_path, {
2574 | "enhance_audio": True,
2575 | "enhance_transcript": True,
2576 | "output_formats": ["json"]
2577 | })
2578 |
2579 | transcript = transcription_result.get("enhanced_transcript", "")
2580 | if not transcript:
2581 | transcript = transcription_result.get("raw_transcript", "")
2582 |
2583 | if not transcript:
2584 | raise ToolError("Failed to generate transcript from audio")
2585 |
2586 | except Exception as e:
2587 | if isinstance(e, (ToolError, ToolInputError)):
2588 | raise
2589 | raise ToolError(f"Failed to process audio file: {str(e)}") from e
2590 | else:
2591 | # Input is already a transcript
2592 | transcript = file_path_or_transcript
2593 | if not transcript or not isinstance(transcript, str):
2594 | raise ToolInputError("Transcript text must be a non-empty string")
2595 |
2596 | # Calculate token count for the transcript
2597 | try:
2598 | token_count = count_tokens(transcript, model)
2599 | logger.info(f"Transcript token count: {token_count}", emoji_key="tokens")
2600 | except Exception as e:
2601 | logger.warning(f"Failed to count tokens: {e}")
2602 |
2603 | # Create prompt for key points extraction
2604 | prompt = f"""Extract the most important key points from this transcript.
2605 |
2606 | Identify:
2607 | 1. The {max_points} most important key points or takeaways
2608 | 2. Main topics discussed
2609 | 3. Any speakers or main entities mentioned (if identifiable)
2610 | 4. A brief summary (2-3 sentences max)
2611 |
2612 | Format your response as JSON with these fields:
2613 | {{
2614 | "key_points": ["Point 1", "Point 2", ...],
2615 | "topics": ["Topic 1", "Topic 2", ...],
2616 | "speakers": ["Speaker 1", "Speaker 2", ...],
2617 | "summary": "Brief summary here"
2618 | }}
2619 |
2620 | TRANSCRIPT:
2621 | {transcript}
2622 | """
2623 |
2624 | # Get provider instance
2625 | try:
2626 | provider_instance = await get_provider(provider) # noqa: F841
2627 | except Exception as e:
2628 | raise ProviderError(
2629 | f"Failed to initialize provider '{provider}': {str(e)}",
2630 | provider=provider,
2631 | cause=e
2632 | ) from e
2633 |
2634 | # Generate completion
2635 | try:
2636 | completion_result = await generate_completion(
2637 | prompt=prompt,
2638 | provider=provider,
2639 | model=model,
2640 | temperature=0.1,
2641 | max_tokens=1000
2642 | )
2643 |
2644 | # Parse JSON response
2645 | response_text = completion_result.get("text", "")
2646 |
2647 | # Find JSON in the response
2648 | json_match = re.search(r'({[\s\S]*})', response_text)
2649 | if json_match:
2650 | try:
2651 | extracted_data = json.loads(json_match.group(1))
2652 | except json.JSONDecodeError:
2653 | # Fallback to regex extraction
2654 | extracted_data = _extract_key_points_with_regex(response_text)
2655 | else:
2656 | # Fallback to regex extraction
2657 | extracted_data = _extract_key_points_with_regex(response_text)
2658 |
2659 | processing_time = time.time() - start_time
2660 |
2661 | # Add token and cost info
2662 | result = {
2663 | "key_points": extracted_data.get("key_points", []),
2664 | "topics": extracted_data.get("topics", []),
2665 | "speakers": extracted_data.get("speakers", []),
2666 | "summary": extracted_data.get("summary", ""),
2667 | "tokens": completion_result.get("tokens", {"input": 0, "output": 0, "total": 0}),
2668 | "cost": completion_result.get("cost", 0.0),
2669 | "processing_time": processing_time
2670 | }
2671 |
2672 | return result
2673 |
2674 | except Exception as e:
2675 | error_model = model or f"{provider}/default"
2676 | raise ProviderError(
2677 | f"Key points extraction failed for model '{error_model}': {str(e)}",
2678 | provider=provider,
2679 | model=error_model,
2680 | cause=e
2681 | ) from e
2682 |
2683 |
2684 | def _extract_key_points_with_regex(text: str) -> Dict[str, Any]:
2685 | """Extract key points data using regex when JSON parsing fails."""
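    # Example: if the model replies with plain bullets such as "- Ship v2 next week" instead
    # of JSON, the bullet/numbered-list fallback below still captures them (capped at 10).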
2686 | result = {
2687 | "key_points": [],
2688 | "topics": [],
2689 | "speakers": [],
2690 | "summary": ""
2691 | }
2692 |
2693 | # Extract key points
2694 | key_points_pattern = r'key_points"?\s*:?\s*\[\s*"([^"]+)"(?:\s*,\s*"([^"]+)")*\s*\]'
2695 | key_points_match = re.search(key_points_pattern, text, re.IGNORECASE | re.DOTALL)
2696 | if key_points_match:
2697 | point_list = re.findall(r'"([^"]+)"', key_points_match.group(0))
2698 | # Filter out empty strings
2699 | result["key_points"] = [p for p in point_list if p.strip()]
2700 | else:
2701 | # Try alternative pattern for non-JSON format
2702 | point_list = re.findall(r'(?:^|\n)(?:•|\*|-|[0-9]+\.)\s*([^\n]+?)(?:\n|$)', text)
2703 | # Filter out empty strings
2704 | result["key_points"] = [p.strip() for p in point_list if p.strip()][:10] # Limit to 10 points
2705 |
2706 | # Extract topics
2707 | topics_pattern = r'topics"?\s*:?\s*\[\s*"([^"]+)"(?:\s*,\s*"([^"]+)")*\s*\]'
2708 | topics_match = re.search(topics_pattern, text, re.IGNORECASE | re.DOTALL)
2709 | if topics_match:
2710 | topic_list = re.findall(r'"([^"]+)"', topics_match.group(0))
2711 | # Filter out empty strings
2712 | result["topics"] = [t for t in topic_list if t.strip()]
2713 |
2714 | # Extract speakers
2715 | speakers_pattern = r'speakers"?\s*:?\s*\[\s*"([^"]+)"(?:\s*,\s*"([^"]+)")*\s*\]'
2716 | speakers_match = re.search(speakers_pattern, text, re.IGNORECASE | re.DOTALL)
2717 | if speakers_match:
2718 | speaker_list = re.findall(r'"([^"]+)"', speakers_match.group(0))
2719 | # Filter out empty strings
2720 | result["speakers"] = [s for s in speaker_list if s.strip()]
2721 |
2722 | # Extract summary
2723 | summary_pattern = r'summary"?\s*:?\s*"([^"]+)"'
2724 | summary_match = re.search(summary_pattern, text, re.IGNORECASE)
2725 | if summary_match and summary_match.group(1).strip():
2726 | result["summary"] = summary_match.group(1).strip()
2727 |
2728 | return result
```