This is page 31 of 45. Use http://codebase.md/dicklesworthstone/llm_gateway_mcp_server?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .cursorignore
├── .env.example
├── .envrc
├── .gitignore
├── additional_features.md
├── check_api_keys.py
├── completion_support.py
├── comprehensive_test.py
├── docker-compose.yml
├── Dockerfile
├── empirically_measured_model_speeds.json
├── error_handling.py
├── example_structured_tool.py
├── examples
│ ├── __init__.py
│ ├── advanced_agent_flows_using_unified_memory_system_demo.py
│ ├── advanced_extraction_demo.py
│ ├── advanced_unified_memory_system_demo.py
│ ├── advanced_vector_search_demo.py
│ ├── analytics_reporting_demo.py
│ ├── audio_transcription_demo.py
│ ├── basic_completion_demo.py
│ ├── cache_demo.py
│ ├── claude_integration_demo.py
│ ├── compare_synthesize_demo.py
│ ├── cost_optimization.py
│ ├── data
│ │ ├── sample_event.txt
│ │ ├── Steve_Jobs_Introducing_The_iPhone_compressed.md
│ │ └── Steve_Jobs_Introducing_The_iPhone_compressed.mp3
│ ├── docstring_refiner_demo.py
│ ├── document_conversion_and_processing_demo.py
│ ├── entity_relation_graph_demo.py
│ ├── filesystem_operations_demo.py
│ ├── grok_integration_demo.py
│ ├── local_text_tools_demo.py
│ ├── marqo_fused_search_demo.py
│ ├── measure_model_speeds.py
│ ├── meta_api_demo.py
│ ├── multi_provider_demo.py
│ ├── ollama_integration_demo.py
│ ├── prompt_templates_demo.py
│ ├── python_sandbox_demo.py
│ ├── rag_example.py
│ ├── research_workflow_demo.py
│ ├── sample
│ │ ├── article.txt
│ │ ├── backprop_paper.pdf
│ │ ├── buffett.pdf
│ │ ├── contract_link.txt
│ │ ├── legal_contract.txt
│ │ ├── medical_case.txt
│ │ ├── northwind.db
│ │ ├── research_paper.txt
│ │ ├── sample_data.json
│ │ └── text_classification_samples
│ │ ├── email_classification.txt
│ │ ├── news_samples.txt
│ │ ├── product_reviews.txt
│ │ └── support_tickets.txt
│ ├── sample_docs
│ │ └── downloaded
│ │ └── attention_is_all_you_need.pdf
│ ├── sentiment_analysis_demo.py
│ ├── simple_completion_demo.py
│ ├── single_shot_synthesis_demo.py
│ ├── smart_browser_demo.py
│ ├── sql_database_demo.py
│ ├── sse_client_demo.py
│ ├── test_code_extraction.py
│ ├── test_content_detection.py
│ ├── test_ollama.py
│ ├── text_classification_demo.py
│ ├── text_redline_demo.py
│ ├── tool_composition_examples.py
│ ├── tournament_code_demo.py
│ ├── tournament_text_demo.py
│ ├── unified_memory_system_demo.py
│ ├── vector_search_demo.py
│ ├── web_automation_instruction_packs.py
│ └── workflow_delegation_demo.py
├── LICENSE
├── list_models.py
├── marqo_index_config.json.example
├── mcp_protocol_schema_2025-03-25_version.json
├── mcp_python_lib_docs.md
├── mcp_tool_context_estimator.py
├── model_preferences.py
├── pyproject.toml
├── quick_test.py
├── README.md
├── resource_annotations.py
├── run_all_demo_scripts_and_check_for_errors.py
├── storage
│ └── smart_browser_internal
│ ├── locator_cache.db
│ ├── readability.js
│ └── storage_state.enc
├── test_client.py
├── test_connection.py
├── TEST_README.md
├── test_sse_client.py
├── test_stdio_client.py
├── tests
│ ├── __init__.py
│ ├── conftest.py
│ ├── integration
│ │ ├── __init__.py
│ │ └── test_server.py
│ ├── manual
│ │ ├── test_extraction_advanced.py
│ │ └── test_extraction.py
│ └── unit
│ ├── __init__.py
│ ├── test_cache.py
│ ├── test_providers.py
│ └── test_tools.py
├── TODO.md
├── tool_annotations.py
├── tools_list.json
├── ultimate_mcp_banner.webp
├── ultimate_mcp_logo.webp
├── ultimate_mcp_server
│ ├── __init__.py
│ ├── __main__.py
│ ├── cli
│ │ ├── __init__.py
│ │ ├── __main__.py
│ │ ├── commands.py
│ │ ├── helpers.py
│ │ └── typer_cli.py
│ ├── clients
│ │ ├── __init__.py
│ │ ├── completion_client.py
│ │ └── rag_client.py
│ ├── config
│ │ └── examples
│ │ └── filesystem_config.yaml
│ ├── config.py
│ ├── constants.py
│ ├── core
│ │ ├── __init__.py
│ │ ├── evaluation
│ │ │ ├── base.py
│ │ │ └── evaluators.py
│ │ ├── providers
│ │ │ ├── __init__.py
│ │ │ ├── anthropic.py
│ │ │ ├── base.py
│ │ │ ├── deepseek.py
│ │ │ ├── gemini.py
│ │ │ ├── grok.py
│ │ │ ├── ollama.py
│ │ │ ├── openai.py
│ │ │ └── openrouter.py
│ │ ├── server.py
│ │ ├── state_store.py
│ │ ├── tournaments
│ │ │ ├── manager.py
│ │ │ ├── tasks.py
│ │ │ └── utils.py
│ │ └── ums_api
│ │ ├── __init__.py
│ │ ├── ums_database.py
│ │ ├── ums_endpoints.py
│ │ ├── ums_models.py
│ │ └── ums_services.py
│ ├── exceptions.py
│ ├── graceful_shutdown.py
│ ├── services
│ │ ├── __init__.py
│ │ ├── analytics
│ │ │ ├── __init__.py
│ │ │ ├── metrics.py
│ │ │ └── reporting.py
│ │ ├── cache
│ │ │ ├── __init__.py
│ │ │ ├── cache_service.py
│ │ │ ├── persistence.py
│ │ │ ├── strategies.py
│ │ │ └── utils.py
│ │ ├── cache.py
│ │ ├── document.py
│ │ ├── knowledge_base
│ │ │ ├── __init__.py
│ │ │ ├── feedback.py
│ │ │ ├── manager.py
│ │ │ ├── rag_engine.py
│ │ │ ├── retriever.py
│ │ │ └── utils.py
│ │ ├── prompts
│ │ │ ├── __init__.py
│ │ │ ├── repository.py
│ │ │ └── templates.py
│ │ ├── prompts.py
│ │ └── vector
│ │ ├── __init__.py
│ │ ├── embeddings.py
│ │ └── vector_service.py
│ ├── tool_token_counter.py
│ ├── tools
│ │ ├── __init__.py
│ │ ├── audio_transcription.py
│ │ ├── base.py
│ │ ├── completion.py
│ │ ├── docstring_refiner.py
│ │ ├── document_conversion_and_processing.py
│ │ ├── enhanced-ums-lookbook.html
│ │ ├── entity_relation_graph.py
│ │ ├── excel_spreadsheet_automation.py
│ │ ├── extraction.py
│ │ ├── filesystem.py
│ │ ├── html_to_markdown.py
│ │ ├── local_text_tools.py
│ │ ├── marqo_fused_search.py
│ │ ├── meta_api_tool.py
│ │ ├── ocr_tools.py
│ │ ├── optimization.py
│ │ ├── provider.py
│ │ ├── pyodide_boot_template.html
│ │ ├── python_sandbox.py
│ │ ├── rag.py
│ │ ├── redline-compiled.css
│ │ ├── sentiment_analysis.py
│ │ ├── single_shot_synthesis.py
│ │ ├── smart_browser.py
│ │ ├── sql_databases.py
│ │ ├── text_classification.py
│ │ ├── text_redline_tools.py
│ │ ├── tournament.py
│ │ ├── ums_explorer.html
│ │ └── unified_memory_system.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── async_utils.py
│ │ ├── display.py
│ │ ├── logging
│ │ │ ├── __init__.py
│ │ │ ├── console.py
│ │ │ ├── emojis.py
│ │ │ ├── formatter.py
│ │ │ ├── logger.py
│ │ │ ├── panels.py
│ │ │ ├── progress.py
│ │ │ └── themes.py
│ │ ├── parse_yaml.py
│ │ ├── parsing.py
│ │ ├── security.py
│ │ └── text.py
│ └── working_memory_api.py
├── unified_memory_system_technical_analysis.md
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/ultimate_mcp_server/tools/audio_transcription.py:
--------------------------------------------------------------------------------
```python
1 | """Advanced audio transcription and enhancement tools for Ultimate MCP Server.
2 |
3 | This module provides tools for high-quality audio transcription, pre-processing,
4 | and intelligent transcript enhancement with advanced features like speaker
5 | diarization, custom vocabulary support, and semantic structuring.
6 | """
7 | import asyncio
8 | import concurrent.futures
9 | import datetime
10 | import json
11 | import os
12 | import re
13 | import shutil
14 | import subprocess
15 | import tempfile
16 | import time
17 | from dataclasses import dataclass
18 | from enum import Enum
19 | from pathlib import Path
20 | from typing import Any, Dict, List, Optional
21 |
22 | import aiofiles
23 | import httpx
24 | from docx import Document
25 | from pydantic import BaseModel, Field
26 | from pydantic.functional_validators import field_validator
27 |
28 | from ultimate_mcp_server.constants import Provider, TaskType
29 | from ultimate_mcp_server.core.providers.base import get_provider
30 | from ultimate_mcp_server.exceptions import (
31 | ProviderError,
32 | ResourceError,
33 | ToolError,
34 | ToolInputError,
35 | )
36 | from ultimate_mcp_server.services.cache import with_cache
37 | from ultimate_mcp_server.tools.base import with_error_handling, with_retry, with_tool_metrics
38 | from ultimate_mcp_server.tools.completion import chat_completion, generate_completion
39 | from ultimate_mcp_server.utils import get_logger
40 | from ultimate_mcp_server.utils.text import count_tokens
41 |
42 | logger = get_logger("ultimate_mcp_server.tools.audio")
43 |
44 | # --- Constants and Enums ---
45 |
46 | class AudioEnhancementProfile(str, Enum):
47 | """Predefined audio enhancement profiles for different recording types."""
48 | CONFERENCE_CALL = "conference_call" # Optimized for online meetings
49 | INTERVIEW = "interview" # Optimized for interview recordings
50 | LECTURE = "lecture" # Optimized for lectures/presentations
51 | NOISY = "noisy" # Heavy noise reduction for noisy environments
52 | PHONE_CALL = "phone_call" # Optimized for telephone audio
53 | VOICEMAIL = "voicemail" # Optimized for voicemail recordings
54 | CUSTOM = "custom" # User-defined settings
55 |
56 | # Minimum expected model sizes in bytes (used as a lower bound, with some tolerance, when validating model files)
57 | WHISPER_MODEL_SIZES = {
58 | "large-v3": 2900000000, # ~2.9GB
59 | "large-v3-turbo": 1500000000, # ~1.5GB
60 | }
61 |
62 | class TranscriptionQuality(str, Enum):
63 | """Quality settings for transcription, balancing speed vs accuracy."""
64 | DRAFT = "draft" # Fastest, less accurate
65 | STANDARD = "standard" # Balanced speed/accuracy
66 | ENHANCED = "enhanced" # More accurate, slower
67 | MAXIMUM = "maximum" # Most accurate, slowest
68 |
69 |
70 | class EnhancementStyle(str, Enum):
71 | """Transcript enhancement styles for different use cases."""
72 | RAW = "raw" # No enhancement, just cleaned
73 | READABLE = "readable" # Basic readability improvements
74 | POLISHED = "polished" # Well-formatted with proper punctuation
75 | VERBATIM = "verbatim" # Preserve all speech patterns, hesitations
76 | STRUCTURED = "structured" # Add semantic structure (paragraphs, sections)
77 |
78 |
79 | class OutputFormat(str, Enum):
80 | """Available output formats for transcripts."""
81 | JSON = "json" # Full JSON with all metadata
82 | TEXT = "text" # Plain text
83 | SRT = "srt" # SubRip subtitle format
84 | VTT = "vtt" # WebVTT subtitle format
85 | DOCX = "docx" # Microsoft Word format
86 | MARKDOWN = "markdown" # Markdown with formatting
87 |
88 |
89 | # --- Schema Validation Models ---
90 |
91 | class AudioEnhancementParams(BaseModel):
92 | """Parameters for audio enhancement."""
93 | profile: AudioEnhancementProfile = Field(
94 | default=AudioEnhancementProfile.CONFERENCE_CALL,
95 | description="Predefined audio enhancement profile"
96 | )
97 | volume: float = Field(
98 | default=1.5,
99 | ge=0.1,
100 | le=5.0,
101 | description="Volume adjustment factor"
102 | )
103 | noise_reduction: int = Field(
104 | default=10,
105 | ge=0,
106 | le=30,
107 | description="Noise reduction strength (0-30)"
108 | )
109 | highpass: int = Field(
110 | default=200,
111 | ge=50,
112 | le=500,
113 | description="Highpass filter frequency in Hz"
114 | )
115 | lowpass: int = Field(
116 | default=3000,
117 | ge=1000,
118 | le=20000,
119 | description="Lowpass filter frequency in Hz"
120 | )
121 | normalize: bool = Field(
122 | default=True,
123 | description="Apply dynamic audio normalization"
124 | )
125 | compression: bool = Field(
126 | default=True,
127 | description="Apply dynamic range compression"
128 | )
129 | dereverberation: bool = Field(
130 | default=False,
131 | description="Apply dereverberation filter"
132 | )
133 | custom_filters: Optional[str] = Field(
134 | default=None,
135 | description="Additional custom FFmpeg filters"
136 | )
137 | output_channels: int = Field(
138 | default=2,
139 | ge=1,
140 | le=2,
141 | description="Number of output channels (1=mono, 2=stereo)"
142 | )
143 | output_sample_rate: int = Field(
144 | default=16000,
145 | ge=8000,
146 | le=48000,
147 | description="Output sample rate in Hz"
148 | )
149 |
150 | @field_validator('custom_filters')
151 | def validate_custom_filters(cls, v):
152 | """Validate that custom filters don't contain dangerous commands."""
153 | if v:
154 | # Check for shell escape attempts
155 | dangerous_patterns = [';', '&&', '||', '`', '$', '\\', '>', '<', '|', '*', '?', '~', '#']
156 | for pattern in dangerous_patterns:
157 | if pattern in v:
158 | raise ValueError(f"Custom filter contains disallowed character: {pattern}")
159 | return v
160 |
161 |
162 | class WhisperParams(BaseModel):
163 | """Parameters for Whisper transcription."""
164 | model: str = Field(
165 | default="large-v3-turbo",
166 | description="Whisper model name"
167 | )
168 | language: Optional[str] = Field(
169 | default=None,
170 | description="Language code (auto-detect if None)"
171 | )
172 | quality: TranscriptionQuality = Field(
173 | default=TranscriptionQuality.STANDARD,
174 | description="Transcription quality level"
175 | )
176 | beam_size: int = Field(
177 | default=5,
178 | ge=1,
179 | le=10,
180 | description="Beam size for beam search"
181 | )
182 | processors: int = Field(
183 | default=2,
184 | ge=1,
185 | le=8,
186 | description="Number of processors to use"
187 | )
188 | word_timestamps: bool = Field(
189 | default=True,
190 | description="Generate timestamps for each word"
191 | )
192 | translate: bool = Field(
193 | default=False,
194 | description="Translate non-English to English"
195 | )
196 | diarize: bool = Field(
197 | default=False,
198 | description="Attempt speaker diarization"
199 | )
200 | highlight_words: bool = Field(
201 | default=False,
202 | description="Highlight words with lower confidence"
203 | )
204 | max_context: int = Field(
205 | default=-1,
206 | ge=-1,
207 | description="Maximum number of text tokens to consider from previous history"
208 | )
209 | custom_vocab: Optional[List[str]] = Field(
210 | default=None,
211 | description="Custom vocabulary terms to improve recognition"
212 | )
213 |
214 |
215 | class TranscriptEnhancementParams(BaseModel):
216 | """Parameters for transcript enhancement."""
217 | style: EnhancementStyle = Field(
218 | default=EnhancementStyle.READABLE,
219 | description="Enhancement style"
220 | )
221 | provider: str = Field(
222 | default=Provider.ANTHROPIC.value,
223 | description="LLM provider for enhancement"
224 | )
225 | model: Optional[str] = Field(
226 | default=None,
227 | description="Specific model to use (provider default if None)"
228 | )
229 | identify_speakers: bool = Field(
230 | default=False,
231 | description="Attempt to identify and label speakers"
232 | )
233 | add_paragraphs: bool = Field(
234 | default=True,
235 | description="Add paragraph breaks at natural points"
236 | )
237 | fix_spelling: bool = Field(
238 | default=True,
239 | description="Fix spelling errors"
240 | )
241 | fix_grammar: bool = Field(
242 | default=True,
243 | description="Fix basic grammatical errors"
244 | )
245 | sections: bool = Field(
246 | default=False,
247 | description="Add section headings based on topic changes"
248 | )
249 | max_chunk_size: int = Field(
250 | default=6500,
251 | ge=1000,
252 | le=100000,
253 | description="Maximum chunk size in characters"
254 | )
255 | format_numbers: bool = Field(
256 | default=True,
257 | description="Format numbers consistently (e.g., '25' instead of 'twenty-five')"
258 | )
259 | custom_instructions: Optional[str] = Field(
260 | default=None,
261 | description="Additional custom instructions for enhancement"
262 | )
263 |
264 |
265 | class TranscriptionOptions(BaseModel):
266 | """Complete options for audio transcription."""
267 | enhance_audio: bool = Field(
268 | default=True,
269 | description="Whether to preprocess audio with FFmpeg"
270 | )
271 | enhance_transcript: bool = Field(
272 | default=True,
273 | description="Whether to enhance the transcript with LLM"
274 | )
275 | parallel_processing: bool = Field(
276 | default=True,
277 | description="Process chunks in parallel when possible"
278 | )
279 | max_workers: int = Field(
280 | default=4,
281 | ge=1,
282 | description="Maximum number of parallel workers"
283 | )
284 | output_formats: List[OutputFormat] = Field(
285 | default=[OutputFormat.JSON, OutputFormat.TEXT],
286 | description="Output formats to generate"
287 | )
288 | save_enhanced_audio: bool = Field(
289 | default=False,
290 | description="Save the enhanced audio file"
291 | )
292 | keep_artifacts: bool = Field(
293 | default=False,
294 | description="Keep temporary files and artifacts"
295 | )
296 | audio_params: AudioEnhancementParams = Field(
297 | default_factory=AudioEnhancementParams,
298 | description="Audio enhancement parameters"
299 | )
300 | whisper_params: WhisperParams = Field(
301 | default_factory=WhisperParams,
302 | description="Whisper transcription parameters"
303 | )
304 | enhancement_params: TranscriptEnhancementParams = Field(
305 | default_factory=TranscriptEnhancementParams,
306 | description="Transcript enhancement parameters"
307 | )
308 | language_detection: bool = Field(
309 | default=False, # Disable language detection by default
310 | description="Automatically detect language before transcription"
311 | )
312 |
313 |
314 | class Segment(BaseModel):
315 | """A segment of transcript with timing information."""
316 | start: float = Field(..., description="Start time in seconds")
317 | end: float = Field(..., description="End time in seconds")
318 | text: str = Field(..., description="Segment text")
319 | speaker: Optional[str] = Field(None, description="Speaker identifier")
320 | words: Optional[List[Dict[str, Any]]] = Field(None, description="Word-level data")
321 | confidence: Optional[float] = Field(None, description="Confidence score")
322 |
323 |
324 | class AudioInfo(BaseModel):
325 | """Audio file information."""
326 | duration: float = Field(..., description="Duration in seconds")
327 | channels: int = Field(..., description="Number of audio channels")
328 | sample_rate: int = Field(..., description="Sample rate in Hz")
329 | format: str = Field(..., description="Audio format")
330 | codec: Optional[str] = Field(None, description="Audio codec")
331 | bit_depth: Optional[int] = Field(None, description="Bit depth")
332 | bitrate: Optional[int] = Field(None, description="Bitrate in bits/second")
333 | size_bytes: Optional[int] = Field(None, description="File size in bytes")
334 |
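# Illustrative sketch of how callers are expected to supply options: a plain dict using the
# field names defined above. Nested dicts map onto the nested models, and string values are
# coerced into the corresponding str-based enums by Pydantic. Values below are placeholders.
#
#   options = {
#       "enhance_audio": True,
#       "output_formats": ["json", "srt"],
#       "audio_params": {"profile": "interview", "volume": 1.3},
#       "whisper_params": {"model": "large-v3-turbo", "quality": "enhanced"},
#       "enhancement_params": {"style": "structured", "identify_speakers": True},
#   }
#   parsed = TranscriptionOptions(**options)  # or use parse_options() below to also apply presets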
335 |
336 | # --- Data Classes ---
337 |
338 | @dataclass
339 | class ProcessingContext:
340 | """Context for the transcription process."""
341 | file_path: str
342 | temp_dir: str
343 | original_filename: str
344 | base_filename: str
345 | options: TranscriptionOptions
346 | enhanced_audio_path: Optional[str] = None
347 | processing_times: Dict[str, float] = None
348 | language_code: Optional[str] = None
349 |
350 | def __post_init__(self):
351 | if self.processing_times is None:
352 | self.processing_times = {}
353 |
354 |
355 | # --- Tool Functions ---
356 |
357 | @with_cache(ttl=24 * 60 * 60) # Cache results for 24 hours
358 | @with_tool_metrics
359 | @with_retry(max_retries=1, retry_delay=1.0)
360 | @with_error_handling
361 | async def transcribe_audio(
362 | file_path: str,
363 | options: Optional[Dict[str, Any]] = None,
364 | ) -> Dict[str, Any]:
365 | """Transcribes an audio file to text with advanced preprocessing and enhancement.
366 |
367 | This tool performs a multi-stage process:
368 | 1. Analyzes the audio file to determine optimal processing parameters
369 | 2. Enhances audio quality with adaptive filtering and preprocessing
370 | 3. Performs high-quality transcription with customizable settings
371 | 4. Intelligently enhances and structures the transcript for readability
372 | 5. Optionally identifies speakers and adds semantic structure
373 |
374 | Args:
375 | file_path: Path to the input audio file (.mp3, .m4a, .wav, etc.)
376 | options: Optional dictionary with transcription options including:
377 | - enhance_audio: Whether to preprocess audio (default True)
378 | - enhance_transcript: Whether to enhance transcript (default True)
379 | - parallel_processing: Process chunks in parallel (default True)
380 | - output_formats: List of output formats (default ["json", "text"])
381 | - audio_params: Audio enhancement parameters
382 | - whisper_params: Whisper transcription parameters
383 | - enhancement_params: Transcript enhancement parameters
384 |
385 | Returns:
386 | A dictionary containing:
387 | {
388 | "raw_transcript": "Original unmodified transcript from Whisper",
389 | "enhanced_transcript": "LLM-enhanced transcript with improved formatting",
390 | "segments": [
391 | {
392 | "start": 0.0,
393 | "end": 10.5,
394 | "text": "Segment text content",
395 | "speaker": "Speaker 1", # If speaker diarization is enabled
396 | "words": [...] # Word-level data if available
397 | },
398 | ...
399 | ],
400 | "metadata": {
401 | "language": "en",
402 | "duration": 120.5,
403 | "title": "Automatically detected title",
404 | "topics": ["Topic 1", "Topic 2"] # If topic extraction is enabled
405 | },
406 | "audio_info": {
407 | "duration": 120.5,
408 | "channels": 2,
409 | "sample_rate": 44100,
410 | "format": "wav",
411 | "codec": "pcm_s16le",
412 | "bit_depth": 16,
413 | "bitrate": 1411000,
414 | "size_bytes": 31000000
415 | },
416 | "processing_time": {
417 | "audio_analysis": 0.5,
418 | "audio_enhancement": 5.2,
419 | "language_detection": 1.1,
420 | "transcription": 45.3,
421 | "transcript_enhancement": 10.2,
422 | "total": 62.3
423 | },
424 | "artifacts": {
425 | "enhanced_audio": "/path/to/enhanced.wav", # If save_enhanced_audio is True
426 | "output_files": {
427 | "json": "/path/to/transcript.json",
428 | "text": "/path/to/transcript.txt",
429 | "srt": "/path/to/transcript.srt"
430 | }
431 | },
432 | "tokens": {
433 | "input": 5000,
434 | "output": 3200,
435 | "total": 8200
436 | },
437 | "cost": 0.00185,
438 | "success": true
439 | }
440 |
441 | Raises:
442 | ToolInputError: If the file path is invalid or unsupported
443 | ToolError: If transcription or enhancement fails
444 | ResourceError: If required dependencies are not available
445 | """
446 | # Start timing total processing
447 | start_time = time.time()
448 |
449 | # --- Input Validation ---
450 | try:
451 | if not file_path or not isinstance(file_path, str):
452 | raise ToolInputError("File path must be a non-empty string.")
453 |
454 | file_path = os.path.abspath(os.path.expanduser(file_path))
455 | if not os.path.exists(file_path):
456 | raise ToolInputError(f"File not found: {file_path}")
457 |
458 | if not os.access(file_path, os.R_OK):
459 | raise ToolInputError(f"File not readable: {file_path}")
460 |
461 | # Validate file is an audio file
462 | _, ext = os.path.splitext(file_path)
463 | if ext.lower() not in ['.mp3', '.wav', '.m4a', '.mp4', '.flac', '.ogg', '.aac', '.wma', '.opus']:
464 | raise ToolInputError(f"Unsupported file format: {ext}. Please provide an audio file.")
465 |
466 | # Parse and validate options
467 | parsed_options = parse_options(options or {})
468 |
469 | except Exception as e:
470 | if isinstance(e, ToolInputError):
471 | raise
472 | raise ToolInputError(f"Failed to validate input: {str(e)}") from e
473 |
474 | # --- Initialize Processing Context ---
475 | try:
476 | temp_dir = tempfile.mkdtemp(prefix="llm_audio_")
477 | original_filename = os.path.basename(file_path)
478 | base_filename = os.path.splitext(original_filename)[0]
479 |
480 | context = ProcessingContext(
481 | file_path=file_path,
482 | temp_dir=temp_dir,
483 | original_filename=original_filename,
484 | base_filename=base_filename,
485 | options=parsed_options,
486 | )
487 |
488 | logger.info(
489 | f"Starting audio transcription process for {original_filename}",
490 | emoji_key="audio",
491 | temp_dir=temp_dir
492 | )
493 | except Exception as e:
494 | raise ToolError(f"Failed to initialize processing context: {str(e)}") from e
495 |
496 | try:
497 | # --- Check Dependencies ---
498 | await check_dependencies(context)
499 |
500 | # --- Process Audio ---
501 | result = await process_audio_file(context)
502 |
503 | # --- Calculate Total Time ---
504 | total_time = time.time() - start_time
505 | context.processing_times["total"] = total_time
506 | result["processing_time"] = context.processing_times
507 |
508 | logger.success(
509 | f"Audio transcription completed in {total_time:.2f}s",
510 | emoji_key="success",
511 | file=context.original_filename,
512 | duration=result.get("audio_info", {}).get("duration", 0)
513 | )
514 |
515 | return result
516 |
517 | except Exception as e:
518 | logger.error(
519 | f"Audio transcription failed: {str(e)}",
520 | emoji_key="error",
521 | exc_info=True,
522 | file=context.original_filename
523 | )
524 | # Clean up temporary directory unless keep_artifacts is True
525 | if context.options.keep_artifacts:
526 | logger.info(f"Keeping artifacts in {temp_dir}")
527 | else:
528 | try:
529 | shutil.rmtree(temp_dir)
530 | except Exception as cleanup_err:
531 | logger.warning(f"Failed to clean up temporary directory: {cleanup_err}")
532 |
533 | if isinstance(e, (ToolError, ToolInputError, ResourceError)):
534 | raise
535 |
536 | # Return a structured error response instead of just raising an exception
537 | return {
538 | "raw_transcript": "",
539 | "enhanced_transcript": "",
540 | "segments": [],
541 | "metadata": {},
542 | "audio_info": {},
543 | "processing_time": context.processing_times if hasattr(context, "processing_times") else {},
544 | "success": False,
545 | "error": f"Audio transcription failed: {str(e)}"
546 | }
547 |
548 | finally:
549 | # Clean up temporary directory unless keep_artifacts is True
550 |         if not context.options.keep_artifacts and os.path.exists(temp_dir):
551 | try:
552 | shutil.rmtree(temp_dir)
553 | except Exception as cleanup_err:
554 | logger.warning(f"Failed to clean up temporary directory: {cleanup_err}")
555 |
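# Example call (illustrative sketch; the path and option values are placeholders):
#
#   result = await transcribe_audio(
#       "/path/to/meeting.m4a",
#       options={
#           "output_formats": ["json", "text", "srt"],
#           "whisper_params": {"model": "large-v3-turbo", "quality": "enhanced"},
#           "enhancement_params": {"style": "readable", "identify_speakers": True},
#       },
#   )
#   if result["success"]:
#       print(result["enhanced_transcript"])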
556 |
557 | # --- Main Processing Functions ---
558 |
559 | async def process_audio_file(context: ProcessingContext) -> Dict[str, Any]:
560 | """Process an audio file through the complete transcription pipeline."""
561 | # Get detailed audio information
562 | audio_analysis_start = time.time()
563 | audio_info = await get_detailed_audio_info(context.file_path)
564 | context.processing_times["audio_analysis"] = time.time() - audio_analysis_start
565 |
566 | logger.info(
567 | f"Audio analysis: {audio_info.get('duration', 0):.1f}s duration, "
568 | f"{audio_info.get('sample_rate', 0)} Hz, "
569 | f"{audio_info.get('channels', 0)} channels",
570 | emoji_key="audio"
571 | )
572 |
573 | # Update parameters based on audio analysis if needed
574 | _update_parameters_from_audio_info(context, audio_info)
575 |
576 | # --- Audio Enhancement ---
577 | enhanced_audio_path = context.file_path
578 | if context.options.enhance_audio:
579 | audio_enhance_start = time.time()
580 | logger.info("Enhancing audio quality", emoji_key="audio", profile=context.options.audio_params.profile.value)
581 |
582 | enhanced_audio_path = await enhance_audio(context, audio_info)
583 | if not enhanced_audio_path:
584 | logger.warning("Audio enhancement failed, falling back to original file", emoji_key="warning")
585 | enhanced_audio_path = context.file_path
586 |
587 | context.processing_times["audio_enhancement"] = time.time() - audio_enhance_start
588 | else:
589 | context.processing_times["audio_enhancement"] = 0
590 |
591 | context.enhanced_audio_path = enhanced_audio_path
592 |
593 | if not os.path.exists(enhanced_audio_path):
594 | logger.warning(f"Enhanced audio path does not exist: {enhanced_audio_path}", emoji_key="warning")
595 | return {
596 | "raw_transcript": "",
597 | "enhanced_transcript": "",
598 | "segments": [],
599 | "metadata": {},
600 | "audio_info": audio_info,
601 | "processing_time": context.processing_times,
602 | "success": False,
603 | "error": "Enhanced audio file not found"
604 | }
605 |
606 | # --- Skip Language Detection ---
607 | # Language detection is disabled - always use Whisper's built-in detection
608 | context.processing_times["language_detection"] = 0
609 | if context.options.whisper_params.language:
610 | logger.info(f"Using specified language: {context.options.whisper_params.language}", emoji_key="language")
611 | else:
612 | logger.info("Using Whisper's built-in language detection", emoji_key="language")
613 |
614 | # --- Transcribe Audio ---
615 | transcribe_start = time.time()
616 | model = context.options.whisper_params.model
617 | quality = context.options.whisper_params.quality.value
618 | logger.info(
619 | f"Transcribing audio with model '{model}' (quality: {quality})",
620 | emoji_key="transcribe"
621 | )
622 |
623 | try:
624 | transcript_result = await transcribe_with_whisper(context)
625 | context.processing_times["transcription"] = time.time() - transcribe_start
626 |
627 | raw_transcript = transcript_result.get("text", "")
628 | segments = transcript_result.get("segments", [])
629 |
630 | # Check if transcript is empty
631 | if not raw_transcript:
632 | logger.warning("Whisper returned an empty transcript", emoji_key="warning")
633 | else:
634 | transcript_length = len(raw_transcript)
635 | segments_count = len(segments)
636 | logger.info(
637 | f"Transcription complete: {transcript_length} characters, {segments_count} segments",
638 | emoji_key="success"
639 | )
640 |
641 | # Extract metadata if available
642 | metadata = transcript_result.get("metadata", {})
643 | if context.language_code and "language" not in metadata:
644 | metadata["language"] = context.language_code
645 |
646 | except Exception as e:
647 | logger.error(f"Transcription failed: {str(e)}", emoji_key="error", exc_info=True)
648 | context.processing_times["transcription"] = time.time() - transcribe_start
649 | return {
650 | "raw_transcript": "",
651 | "enhanced_transcript": "",
652 | "segments": [],
653 | "metadata": {"language": context.language_code} if context.language_code else {},
654 | "audio_info": audio_info,
655 | "processing_time": context.processing_times,
656 | "success": False,
657 | "error": f"Transcription failed: {str(e)}"
658 | }
659 |
660 | # --- Enhance Transcript ---
661 | enhanced_transcript = raw_transcript
662 | enhancement_cost = 0.0
663 | enhancement_tokens = {"input": 0, "output": 0, "total": 0}
664 |
665 | if context.options.enhance_transcript and raw_transcript:
666 | enhance_start = time.time()
667 | logger.info(
668 | f"Enhancing transcript with style: {context.options.enhancement_params.style.value}",
669 | emoji_key="enhance",
670 | provider=context.options.enhancement_params.provider
671 | )
672 |
673 | try:
674 | enhancement_result = await enhance_transcript(context, raw_transcript, metadata)
675 |
676 | enhanced_transcript = enhancement_result["transcript"]
677 | enhancement_cost = enhancement_result["cost"]
678 | enhancement_tokens = enhancement_result["tokens"]
679 |
680 | if "topics" in enhancement_result and enhancement_result["topics"]:
681 | metadata["topics"] = enhancement_result["topics"]
682 |
683 | if "title" in enhancement_result and enhancement_result["title"]:
684 | metadata["title"] = enhancement_result["title"]
685 |
686 | context.processing_times["transcript_enhancement"] = time.time() - enhance_start
687 |
688 | if not enhanced_transcript:
689 | logger.warning("Enhancement returned an empty transcript, falling back to raw transcript", emoji_key="warning")
690 | enhanced_transcript = raw_transcript
691 | else:
692 | enhancement_length = len(enhanced_transcript)
693 | logger.info(f"Enhancement complete: {enhancement_length} characters", emoji_key="success")
694 |
695 | except Exception as e:
696 | logger.error(f"Transcript enhancement failed: {e}", emoji_key="error", exc_info=True)
697 | context.processing_times["transcript_enhancement"] = time.time() - enhance_start
698 | # Fall back to raw transcript
699 | enhanced_transcript = raw_transcript
700 | else:
701 | if not raw_transcript:
702 | logger.warning("Skipping transcript enhancement because raw transcript is empty", emoji_key="warning")
703 | elif not context.options.enhance_transcript:
704 | logger.info("Transcript enhancement disabled by options", emoji_key="info")
705 |
706 | context.processing_times["transcript_enhancement"] = 0
707 |
708 | # --- Generate Output Files ---
709 | artifact_paths = await generate_output_files(context, raw_transcript, enhanced_transcript, segments, metadata)
710 |
711 | # --- Prepare Result ---
712 | result = {
713 | "raw_transcript": raw_transcript,
714 | "enhanced_transcript": enhanced_transcript,
715 | "segments": segments,
716 | "metadata": metadata,
717 | "audio_info": audio_info,
718 | "tokens": enhancement_tokens,
719 | "cost": enhancement_cost,
720 | "artifacts": artifact_paths,
721 | "success": bool(raw_transcript or enhanced_transcript)
722 | }
723 |
724 | return result
725 |
726 |
727 | def parse_options(options: Dict[str, Any]) -> TranscriptionOptions:
728 | """Parse and validate transcription options."""
729 | # Convert string output formats to enum values
730 | if "output_formats" in options:
731 | if isinstance(options["output_formats"], list):
732 | formats = []
733 | for fmt in options["output_formats"]:
734 | if isinstance(fmt, str):
735 | try:
736 | formats.append(OutputFormat(fmt.lower()))
737 | except ValueError:
738 | logger.warning(f"Invalid output format: {fmt}, ignoring")
739 | elif isinstance(fmt, OutputFormat):
740 | formats.append(fmt)
741 | if formats: # Only update if we have valid formats
742 | options["output_formats"] = formats
743 |
744 | # Handle nested parameter objects
745 | for key in ["audio_params", "whisper_params", "enhancement_params"]:
746 | if key in options and options[key]:
747 | # If a dictionary is provided, keep it for Pydantic
748 | if not isinstance(options[key], dict):
749 | # Convert non-dict to dict by serializing/deserializing if possible
750 | try:
751 | options[key] = json.loads(json.dumps(options[key]))
752 | except Exception:
753 | # If can't convert, remove the invalid value
754 | logger.warning(f"Invalid format for {key}, using defaults")
755 | options.pop(key)
756 |
757 | # Set audio profile parameters if a profile is specified
758 | if "audio_params" in options and "profile" in options["audio_params"]:
759 | profile = options["audio_params"]["profile"]
760 | if isinstance(profile, str):
761 | try:
762 | profile = AudioEnhancementProfile(profile.lower())
763 | # Update audio parameters based on the selected profile
764 | options["audio_params"].update(_get_audio_profile_params(profile))
765 | except ValueError:
766 | logger.warning(f"Invalid audio profile: {profile}, using default")
767 |
768 | # Set whisper quality parameters if quality is specified
769 | if "whisper_params" in options and "quality" in options["whisper_params"]:
770 | quality = options["whisper_params"]["quality"]
771 | if isinstance(quality, str):
772 | try:
773 | quality = TranscriptionQuality(quality.lower())
774 | # Update whisper parameters based on the selected quality
775 | options["whisper_params"].update(_get_whisper_quality_params(quality))
776 | except ValueError:
777 | logger.warning(f"Invalid transcription quality: {quality}, using default")
778 |
779 | # Parse with Pydantic model
780 | try:
781 | return TranscriptionOptions(**options)
782 | except Exception as e:
783 | logger.warning(f"Error parsing options: {e}, using defaults with valid values", emoji_key="warning")
784 | # Create with default options
785 | return TranscriptionOptions()
786 |
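# Illustrative sketch of the normalization performed above: string values are turned into the
# enums defined earlier, and selecting a profile/quality also merges in the matching preset.
#
#   opts = parse_options({
#       "output_formats": ["text", "vtt"],
#       "audio_params": {"profile": "noisy"},       # merges the 'noisy' preset below
#       "whisper_params": {"quality": "maximum"},   # merges beam_size=10, processors=4, ...
#   })
#   assert opts.audio_params.noise_reduction == 20  # value comes from the 'noisy' profile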
787 |
788 | def _get_audio_profile_params(profile: AudioEnhancementProfile) -> Dict[str, Any]:
789 | """Get audio enhancement parameters for a specific profile."""
790 | profiles = {
791 | AudioEnhancementProfile.CONFERENCE_CALL: {
792 | "volume": 1.5,
793 | "noise_reduction": 10,
794 | "highpass": 200,
795 | "lowpass": 3000,
796 | "compression": True,
797 | "normalize": True,
798 | "dereverberation": False
799 | },
800 | AudioEnhancementProfile.INTERVIEW: {
801 | "volume": 1.3,
802 | "noise_reduction": 8,
803 | "highpass": 150,
804 | "lowpass": 8000,
805 | "compression": True,
806 | "normalize": True,
807 | "dereverberation": False
808 | },
809 | AudioEnhancementProfile.LECTURE: {
810 | "volume": 1.4,
811 | "noise_reduction": 6,
812 | "highpass": 120,
813 | "lowpass": 8000,
814 | "compression": True,
815 | "normalize": True,
816 | "dereverberation": True
817 | },
818 | AudioEnhancementProfile.NOISY: {
819 | "volume": 1.8,
820 | "noise_reduction": 20,
821 | "highpass": 250,
822 | "lowpass": 3000,
823 | "compression": True,
824 | "normalize": True,
825 | "dereverberation": True
826 | },
827 | AudioEnhancementProfile.PHONE_CALL: {
828 | "volume": 2.0,
829 | "noise_reduction": 15,
830 | "highpass": 300,
831 | "lowpass": 3400,
832 | "compression": True,
833 | "normalize": True,
834 | "dereverberation": False
835 | },
836 | AudioEnhancementProfile.VOICEMAIL: {
837 | "volume": 2.0,
838 | "noise_reduction": 12,
839 | "highpass": 250,
840 | "lowpass": 3000,
841 | "compression": True,
842 | "normalize": True,
843 | "dereverberation": False
844 | }
845 | }
846 |
847 | return profiles.get(profile, {})
848 |
849 |
850 | def _get_whisper_quality_params(quality: TranscriptionQuality) -> Dict[str, Any]:
851 | """Get whisper parameters for a specific quality level."""
852 | quality_params = {
853 | TranscriptionQuality.DRAFT: {
854 | "beam_size": 1,
855 | "processors": 1,
856 | "word_timestamps": False,
857 | "highlight_words": False
858 | },
859 | TranscriptionQuality.STANDARD: {
860 | "beam_size": 5,
861 | "processors": 2,
862 | "word_timestamps": True,
863 | "highlight_words": False
864 | },
865 | TranscriptionQuality.ENHANCED: {
866 | "beam_size": 8,
867 | "processors": 2,
868 | "word_timestamps": True,
869 | "highlight_words": True
870 | },
871 | TranscriptionQuality.MAXIMUM: {
872 | "beam_size": 10,
873 | "processors": 4,
874 | "word_timestamps": True,
875 | "highlight_words": True
876 | }
877 | }
878 |
879 | return quality_params.get(quality, {})
880 |
881 |
882 | def _update_parameters_from_audio_info(context: ProcessingContext, audio_info: Dict[str, Any]) -> None:
883 | """Update processing parameters based on audio file analysis."""
884 | # If mono audio, adjust enhancement params
885 | audio_params = context.options.audio_params
886 | updated_params = False
887 |
888 | if audio_info.get("channels", 0) == 1:
889 |         # Set output channels to match the mono input if the caller did not explicitly set one
890 |         if "output_channels" not in context.options.audio_params.__fields_set__:
891 | # Create a copy with the updated parameter
892 | audio_params = AudioEnhancementParams(
893 | **{**audio_params.dict(), "output_channels": 1}
894 | )
895 | updated_params = True
896 |
897 | # If low-quality audio, adjust enhancement profile
898 | sample_rate = audio_info.get("sample_rate", 0)
899 | if sample_rate < 16000 and context.options.audio_params.profile == AudioEnhancementProfile.CONFERENCE_CALL:
900 | logger.info(f"Detected low sample rate ({sample_rate} Hz), adjusting enhancement profile", emoji_key="audio")
901 | # Use phone_call profile for low sample rate audio
902 | params = _get_audio_profile_params(AudioEnhancementProfile.PHONE_CALL)
903 | # Create a copy with the updated parameters
904 | audio_params = AudioEnhancementParams(
905 | **{**audio_params.dict(), **params}
906 | )
907 | updated_params = True
908 |
909 | # If params were updated, create a new options object with the updated audio_params
910 | if updated_params:
911 | context.options = TranscriptionOptions(
912 | **{**context.options.dict(), "audio_params": audio_params}
913 | )
914 |
915 | # If very short audio (<10 seconds), adjust transcription quality
916 | duration = audio_info.get("duration", 0)
917 | if duration < 10 and context.options.whisper_params.quality != TranscriptionQuality.MAXIMUM:
918 | logger.info(f"Short audio detected ({duration:.1f}s), increasing transcription quality", emoji_key="audio")
919 | # Create a new whisper_params with enhanced quality
920 | whisper_params = WhisperParams(
921 | **{**context.options.whisper_params.dict(), "quality": TranscriptionQuality.ENHANCED}
922 | )
923 | # Update options with new whisper_params
924 | context.options = TranscriptionOptions(
925 | **{**context.options.dict(), "whisper_params": whisper_params}
926 | )
927 |
928 |
929 | # --- Dependency and Audio Processing Functions ---
930 |
931 | async def download_whisper_model(model_name: str, output_path: str) -> bool:
932 | """Download a Whisper model from Hugging Face using httpx.
933 |
934 | Args:
935 | model_name: Name of the model to download (e.g. 'large-v3')
936 | output_path: Path where to save the model file
937 |
938 | Returns:
939 | True if download was successful, False otherwise
940 | """
941 | url = f"https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-{model_name}.bin"
942 | logger.info(f"Downloading Whisper model '{model_name}' from {url}", emoji_key="download")
943 |
944 | # Expected model size
945 | min_size_bytes = WHISPER_MODEL_SIZES.get(model_name, 100000000) # Default to 100MB minimum if unknown
946 | expected_size_bytes = min_size_bytes # Initialize with minimum size
947 |
948 | try:
949 | async with httpx.AsyncClient(timeout=None) as client:
950 | # Make HEAD request first to check if the URL is valid and get expected size
951 | try:
952 | head_response = await client.head(url)
953 | if head_response.status_code != 200:
954 | logger.warning(f"Model URL is not accessible: HTTP {head_response.status_code} for {url}", emoji_key="warning")
955 | return False
956 |
957 | # If Content-Length is available, use it to check expected size
958 | content_length = int(head_response.headers.get("content-length", 0))
959 | if content_length > 0:
960 | expected_size_mb = content_length / (1024 * 1024)
961 | logger.info(f"Expected model size: {expected_size_mb:.1f} MB", emoji_key="info")
962 |
963 | # Update expected size if it's larger than our preset minimum
964 | if content_length > min_size_bytes:
965 | expected_size_bytes = int(content_length * 0.95) # Allow 5% tolerance
966 | except Exception as e:
967 | logger.warning(f"Failed to validate model URL: {url} - Error: {str(e)}", emoji_key="warning")
968 | # Continue anyway, the GET might still work
969 |
970 | # Stream the response to handle large files
971 | try:
972 | async with client.stream("GET", url) as response:
973 | if response.status_code != 200:
974 | logger.warning(f"Failed to download model: HTTP {response.status_code} for {url}", emoji_key="warning")
975 | if response.status_code == 404:
976 | logger.warning(f"Model '{model_name}' not found on Hugging Face repository", emoji_key="warning")
977 | return False
978 |
979 | # Get content length if available
980 | content_length = int(response.headers.get("content-length", 0))
981 | if content_length == 0:
982 | logger.warning("Content length is zero or not provided", emoji_key="warning")
983 |
984 | downloaded = 0
985 |
986 | # Open file for writing
987 | try:
988 | with open(output_path, "wb") as f:
989 | # Display progress
990 | last_log_time = time.time()
991 | async for chunk in response.aiter_bytes(chunk_size=8192):
992 | f.write(chunk)
993 | downloaded += len(chunk)
994 |
995 | # Log progress every 5 seconds
996 | now = time.time()
997 | if now - last_log_time > 5:
998 | if content_length > 0:
999 | percent = downloaded / content_length * 100
1000 | logger.info(f"Download progress: {percent:.1f}% ({downloaded/(1024*1024):.1f} MB)", emoji_key="download")
1001 | else:
1002 | logger.info(f"Downloaded {downloaded/(1024*1024):.1f} MB", emoji_key="download")
1003 | last_log_time = now
1004 | except IOError as e:
1005 | logger.warning(f"Failed to write to file {output_path}: {str(e)}", emoji_key="warning")
1006 | return False
1007 | except httpx.RequestError as e:
1008 | logger.warning(f"HTTP request error while downloading model: {str(e)}", emoji_key="warning")
1009 | return False
1010 |
1011 | # Verify the file was downloaded
1012 | if not os.path.exists(output_path):
1013 | logger.warning(f"Downloaded file doesn't exist at {output_path}", emoji_key="warning")
1014 | return False
1015 |
1016 | actual_size = os.path.getsize(output_path)
1017 | if actual_size == 0:
1018 | logger.warning(f"Downloaded file is empty at {output_path}", emoji_key="warning")
1019 | return False
1020 |
1021 | # Verify file size meets expectations
1022 | actual_size_mb = actual_size / (1024 * 1024)
1023 | if actual_size < expected_size_bytes:
1024 | logger.warning(
1025 | f"Model file size ({actual_size_mb:.1f} MB) is smaller than expected minimum size "
1026 | f"({expected_size_bytes/(1024*1024):.1f} MB). File may be corrupted or incomplete.",
1027 | emoji_key="warning"
1028 | )
1029 | return False
1030 |
1031 | logger.info(f"Successfully downloaded model to {output_path} ({actual_size_mb:.1f} MB)", emoji_key="success")
1032 | return True
1033 |
1034 | except Exception as e:
1035 | logger.warning(f"Error downloading whisper model: {e}", emoji_key="warning", exc_info=True)
1036 | return False
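
# Example call (illustrative sketch): the destination path follows the layout expected by
# check_dependencies() below, and the model name is one of the keys in WHISPER_MODEL_SIZES.
#
#   ok = await download_whisper_model(
#       "large-v3-turbo",
#       os.path.expanduser("~/whisper.cpp/models/ggml-large-v3-turbo.bin"),
#   )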
1037 |
1038 | async def check_dependencies(context: ProcessingContext) -> bool:
1039 | """Verifies that required dependencies are installed and accessible."""
1040 | # Check ffmpeg
1041 | try:
1042 | result = await asyncio.create_subprocess_exec(
1043 | "ffmpeg", "-version",
1044 | stdout=asyncio.subprocess.PIPE,
1045 | stderr=asyncio.subprocess.PIPE
1046 | )
1047 | stdout, stderr = await result.communicate()
1048 |
1049 | if result.returncode != 0:
1050 | raise ResourceError(
1051 | "ffmpeg is not installed or not accessible. Please install it with 'apt install ffmpeg'."
1052 | )
1053 |
1054 | # Extract ffmpeg version for logging
1055 | version_match = re.search(r'ffmpeg version (\S+)', stdout.decode('utf-8', errors='ignore'))
1056 | version = version_match.group(1) if version_match else "unknown"
1057 | logger.debug(f"Found ffmpeg version {version}", emoji_key="dependency")
1058 |
1059 | except FileNotFoundError as e:
1060 | raise ResourceError(
1061 | "ffmpeg is not installed. Please install it with 'apt install ffmpeg'."
1062 | ) from e
1063 |
1064 | # Check whisper.cpp
1065 | whisper_path = os.path.expanduser("~/whisper.cpp")
1066 |
1067 | # Use user-supplied model
1068 | model = context.options.whisper_params.model
1069 | model_path = os.path.join(whisper_path, "models", f"ggml-{model}.bin")
1070 |
1071 | if not os.path.exists(whisper_path):
1072 | raise ResourceError(
1073 | f"whisper.cpp not found at {whisper_path}. Please install it first."
1074 | )
1075 |
1076 | if not os.path.exists(model_path):
1077 | # Check if models directory exists
1078 | models_dir = os.path.join(whisper_path, "models")
1079 | if not os.path.exists(models_dir):
1080 | try:
1081 | os.makedirs(models_dir)
1082 | logger.info(f"Created models directory at {models_dir}", emoji_key="info")
1083 | except Exception as e:
1084 | raise ResourceError(f"Failed to create models directory: {e}") from e
1085 |
1086 | # Try to automatically download the model using httpx
1087 | logger.info(f"Whisper model '{model}' not found at {model_path}", emoji_key="info")
1088 | logger.info(f"Attempting to download model '{model}' now...", emoji_key="download")
1089 |
1090 | # Download the model directly using httpx - first check if model exists
1091 | if model == "large-v3":
1092 | # Double check if the model actually exists
1093 | test_url = f"https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-{model}.bin"
1094 | async with httpx.AsyncClient(timeout=10.0) as client:
1095 | try:
1096 | head_response = await client.head(test_url)
1097 | if head_response.status_code != 200:
1098 | # Model might not be directly available - show a clear error
1099 | if head_response.status_code == 404:
1100 | raise ResourceError(
1101 | f"Whisper model 'large-v3' not found in the HuggingFace repository at {test_url}\n"
1102 | f"Please download it manually with one of these commands:\n"
1103 | f"1. ~/whisper.cpp/models/download-ggml-model.sh large-v3\n"
1104 | f"2. Or try a different model like 'large-v3-turbo' which is known to be available"
1105 | )
1106 | except Exception as e:
1107 | logger.warning(f"Error checking model availability: {e}", emoji_key="warning")
1108 | # Continue with download attempt anyway
1109 |
1110 | # Attempt to download the model
1111 | download_success = await download_whisper_model(model, model_path)
1112 | if not download_success:
1113 | # Modified error message with alternative suggestions
1114 | if model == "large-v3":
1115 | raise ResourceError(
1116 | f"Failed to download Whisper model '{model}'.\n"
1117 | f"You can:\n"
1118 | f"1. Try using a different model like 'large-v3-turbo' which is more reliable\n"
1119 | f"2. Or download manually with: ~/whisper.cpp/models/download-ggml-model.sh {model}"
1120 | )
1121 | else:
1122 | raise ResourceError(
1123 | f"Failed to download Whisper model '{model}'.\n"
1124 | f"Please download it manually with: ~/whisper.cpp/models/download-ggml-model.sh {model}"
1125 | )
1126 |
1127 | # Verify that the model was downloaded
1128 | if not os.path.exists(model_path):
1129 | raise ResourceError(
1130 | f"Model download completed but model file not found at {model_path}. "
1131 | f"Please check the download and try again."
1132 | )
1133 |
1134 | # Verify model file size
1135 | actual_size = os.path.getsize(model_path)
1136 | expected_min_size = WHISPER_MODEL_SIZES.get(model, 100000000) # Default to 100MB minimum if unknown
1137 |
1138 | if actual_size < expected_min_size:
1139 | actual_size_mb = actual_size / (1024 * 1024)
1140 | expected_mb = expected_min_size / (1024 * 1024)
1141 | raise ResourceError(
1142 | f"Downloaded model file at {model_path} is too small ({actual_size_mb:.1f} MB). "
1143 | f"Expected at least {expected_mb:.1f} MB. File may be corrupted or incomplete. "
1144 | f"Please download it manually with: ~/whisper.cpp/models/download-ggml-model.sh {model}"
1145 | )
1146 |
1147 | logger.info(f"Successfully downloaded Whisper model '{model}'", emoji_key="success")
1148 | else:
1149 | # Verify existing model file size
1150 | actual_size = os.path.getsize(model_path)
1151 | expected_min_size = WHISPER_MODEL_SIZES.get(model, 100000000) # Default to 100MB minimum if unknown
1152 |
1153 | file_size_mb = actual_size / (1024 * 1024)
1154 | if actual_size < expected_min_size:
1155 | expected_mb = expected_min_size / (1024 * 1024)
1156 | logger.warning(
1157 | f"Existing model at {model_path} is suspiciously small ({file_size_mb:.1f} MB). "
1158 | f"Expected at least {expected_mb:.1f} MB. Model may be corrupted.",
1159 | emoji_key="warning"
1160 | )
1161 | else:
1162 | logger.info(f"Found existing model at {model_path} ({file_size_mb:.1f} MB)", emoji_key="dependency")
1163 |
1164 |     # Check if the whisper-cli binary is available in PATH (list-form subprocess call; no shell is invoked)
1165 | try:
1166 | result = subprocess.run(["which", "whisper-cli"], capture_output=True, text=True)
1167 | if result.returncode == 0:
1168 | whisper_path_found = result.stdout.strip()
1169 | logger.debug(f"Found whisper-cli in PATH: {whisper_path_found}", emoji_key="dependency")
1170 | else:
1171 | # Check in the expected location
1172 | whisper_path = os.path.expanduser("~/whisper.cpp")
1173 | whisper_bin = os.path.join(whisper_path, "build", "bin", "whisper-cli")
1174 | if not os.path.exists(whisper_bin):
1175 | raise ResourceError(
1176 | f"whisper-cli binary not found at {whisper_bin}. "
1177 | f"Please build whisper.cpp first with: "
1178 | f"cd ~/whisper.cpp && cmake -B build && cmake --build build -j --config Release"
1179 | )
1180 | logger.debug(f"Found whisper-cli at {whisper_bin}", emoji_key="dependency")
1181 | except FileNotFoundError as e:
1182 | raise ResourceError("Command 'which' not found. Cannot check for whisper-cli installation.") from e
1183 |
1184 | logger.debug(f"Found whisper model: {model}", emoji_key="dependency")
1185 | return True
1186 |
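# For reference, the local whisper.cpp layout assumed by the checks above (paths as used in
# this module; the model file name depends on the configured model):
#
#   ~/whisper.cpp/
#   ├── models/ggml-<model>.bin    # e.g. ggml-large-v3-turbo.bin
#   └── build/bin/whisper-cli      # built with: cmake -B build && cmake --build build -j --config Release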
1187 |
1188 | async def get_detailed_audio_info(file_path: str) -> Dict[str, Any]:
1189 | """Gets detailed information about an audio file using ffprobe."""
1190 | cmd = [
1191 | "ffprobe",
1192 | "-v", "error",
1193 | "-show_entries", "format=duration,bit_rate,size",
1194 | "-select_streams", "a:0",
1195 | "-show_entries", "stream=channels,sample_rate,codec_name,bits_per_sample",
1196 | "-of", "json",
1197 | file_path
1198 | ]
1199 |
1200 | try:
1201 | process = await asyncio.create_subprocess_exec(
1202 | *cmd,
1203 | stdout=asyncio.subprocess.PIPE,
1204 | stderr=asyncio.subprocess.PIPE
1205 | )
1206 | stdout, stderr = await process.communicate()
1207 |
1208 | if process.returncode != 0:
1209 | logger.warning(f"Failed to get audio info: {stderr.decode('utf-8', errors='ignore')}", emoji_key="warning")
1210 | return {
1211 | "duration": 0,
1212 | "channels": 0,
1213 | "sample_rate": 0,
1214 | "format": os.path.splitext(file_path)[1][1:],
1215 | "codec": None,
1216 | "bit_depth": None,
1217 | "bitrate": None,
1218 | "size_bytes": os.path.getsize(file_path) if os.path.exists(file_path) else 0
1219 | }
1220 |
1221 | info = json.loads(stdout)
1222 | format_info = info.get("format", {})
1223 | stream_info = info.get("streams", [{}])[0] if info.get("streams") else {}
1224 |
1225 | # Extract information
1226 | duration = float(format_info.get("duration", 0))
1227 | channels = int(stream_info.get("channels", 0))
1228 | sample_rate = int(stream_info.get("sample_rate", 0))
1229 | codec = stream_info.get("codec_name")
1230 | bit_depth = int(stream_info.get("bits_per_sample", 0)) or None
1231 | bitrate = int(format_info.get("bit_rate", 0)) or None
1232 | size_bytes = int(format_info.get("size", 0)) or os.path.getsize(file_path)
1233 | audio_format = os.path.splitext(file_path)[1][1:]
1234 |
1235 | return {
1236 | "duration": duration,
1237 | "channels": channels,
1238 | "sample_rate": sample_rate,
1239 | "format": audio_format,
1240 | "codec": codec,
1241 | "bit_depth": bit_depth,
1242 | "bitrate": bitrate,
1243 | "size_bytes": size_bytes
1244 | }
1245 | except Exception as e:
1246 | logger.warning(f"Error getting audio info: {e}", emoji_key="warning", exc_info=True)
1247 | try:
1248 | size_bytes = os.path.getsize(file_path) if os.path.exists(file_path) else 0
1249 | except Exception:
1250 | size_bytes = 0
1251 |
1252 | return {
1253 | "duration": 0,
1254 | "channels": 0,
1255 | "sample_rate": 0,
1256 | "format": os.path.splitext(file_path)[1][1:],
1257 | "codec": None,
1258 | "bit_depth": None,
1259 | "bitrate": None,
1260 | "size_bytes": size_bytes
1261 | }
1262 |
1263 |
1264 | async def enhance_audio(context: ProcessingContext, audio_info: Dict[str, Any]) -> Optional[str]:
1265 | """Enhances audio quality using ffmpeg preprocessing."""
1266 | # Create output path in temp directory
1267 | output_path = os.path.join(context.temp_dir, f"{context.base_filename}_enhanced.wav")
1268 |
1269 |     # Apply a fixed, general-purpose enhancement filter chain (independent of the per-request AudioEnhancementParams)
1270 |     # Build the complete ffmpeg command with these standardized settings
1271 | cmd = [
1272 | "ffmpeg",
1273 | "-i", context.file_path,
1274 | "-threads", str(os.cpu_count() or 1),
1275 | "-af", "volume=1.5, highpass=f=200, lowpass=f=3000, afftdn=nr=10:nf=-20, "
1276 | "compand=attacks=0:points=-80/-80|-45/-15|-27/-9|0/-7|20/-7:gain=5, "
1277 | "dynaudnorm=f=150:g=15:p=1:m=1:s=0, "
1278 | "pan=stereo|c0=c0|c1=c0",
1279 | "-ar", "16000",
1280 | "-ac", "2",
1281 | "-c:a", "pcm_s16le",
1282 | "-y", # Overwrite output if exists
1283 | output_path
1284 | ]
1285 |
1286 | logger.debug(f"Running ffmpeg command: {' '.join(cmd)}", emoji_key="command")
1287 |
1288 | try:
1289 | process = await asyncio.create_subprocess_exec(
1290 | *cmd,
1291 | stdout=asyncio.subprocess.PIPE,
1292 | stderr=asyncio.subprocess.PIPE
1293 | )
1294 | stdout, stderr = await process.communicate()
1295 |
1296 | if process.returncode != 0:
1297 | error_msg = stderr.decode('utf-8', errors='ignore')
1298 | logger.error(f"FFmpeg error: {error_msg}", emoji_key="error")
1299 | return None
1300 |
1301 | # Verify the output file was created and has a reasonable size
1302 | if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
1303 | file_size_mb = os.path.getsize(output_path) / (1024 * 1024)
1304 | logger.info(f"Enhanced audio saved to {output_path} ({file_size_mb:.2f} MB)", emoji_key="audio")
1305 | else:
1306 | logger.warning("Enhanced audio file is suspiciously small or doesn't exist", emoji_key="warning")
1307 | return None
1308 |
1309 | # If we're keeping enhanced audio, copy it to a persistent location
1310 | if context.options.save_enhanced_audio:
1311 | original_dir = os.path.dirname(context.file_path)
1312 | persistent_path = os.path.join(original_dir, f"{context.base_filename}_enhanced.wav")
1313 | shutil.copy2(output_path, persistent_path)
1314 | logger.info(f"Saved enhanced audio to {persistent_path}", emoji_key="save")
1315 |
1316 | logger.info("Audio enhancement completed", emoji_key="audio")
1317 | return output_path
1318 | except Exception as e:
1319 | logger.error(f"Error enhancing audio: {e}", emoji_key="error", exc_info=True)
1320 | return None
1321 |
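# For reference, the invocation built above corresponds roughly to this shell command
# (illustrative input/output paths; the thread count depends on the host):
#
#   ffmpeg -i input.m4a -threads 8 -af "volume=1.5, highpass=f=200, lowpass=f=3000, afftdn=nr=10:nf=-20, compand=attacks=0:points=-80/-80|-45/-15|-27/-9|0/-7|20/-7:gain=5, dynaudnorm=f=150:g=15:p=1:m=1:s=0, pan=stereo|c0=c0|c1=c0" -ar 16000 -ac 2 -c:a pcm_s16le -y input_enhanced.wav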
1322 |
1323 | async def transcribe_with_whisper(context: ProcessingContext) -> Dict[str, Any]:
1324 | """Transcribes audio using Whisper.cpp with advanced options."""
1325 | # Create output base name in temp directory
1326 | output_base = os.path.join(context.temp_dir, context.base_filename)
1327 | output_json = f"{output_base}.json"
1328 | output_txt = f"{output_base}.txt"
1329 |
1330 | # Get whisper parameters
1331 | params = context.options.whisper_params
1332 |
1333 | # Build command with configurable parameters
1334 | whisper_bin = os.path.expanduser("~/whisper.cpp/build/bin/whisper-cli")
1335 | model_path = os.path.expanduser(f"~/whisper.cpp/models/ggml-{params.model}.bin")
1336 |
1337 | # Validate required files exist
1338 | if not os.path.exists(whisper_bin):
1339 | logger.error(f"Whisper binary not found at {whisper_bin}", emoji_key="error")
1340 | raise ToolError(f"Whisper binary not found at {whisper_bin}")
1341 |
1342 | if not os.path.exists(model_path):
1343 | logger.error(f"Whisper model not found at {model_path}", emoji_key="error")
1344 | raise ToolError(f"Whisper model not found at {model_path}")
1345 |
1346 | # Verify model file size
1347 | actual_size = os.path.getsize(model_path)
1348 | expected_min_size = WHISPER_MODEL_SIZES.get(params.model, 100000000) # Default to 100MB minimum if unknown
1349 | if actual_size < expected_min_size:
1350 | actual_size_mb = actual_size / (1024 * 1024)
1351 | expected_mb = expected_min_size / (1024 * 1024)
1352 | logger.warning(
1353 | f"Model file at {model_path} is smaller than expected ({actual_size_mb:.1f} MB, expected {expected_mb:.1f} MB)",
1354 | emoji_key="warning"
1355 | )
1356 |
1357 | # Use file_path as fallback if enhanced_audio_path is None
1358 | audio_path = context.enhanced_audio_path or context.file_path
1359 | if not os.path.exists(audio_path):
1360 | logger.error(f"Audio file not found at {audio_path}", emoji_key="error")
1361 | raise ToolError(f"Audio file not found at {audio_path}")
1362 |
1363 | cmd = [
1364 | whisper_bin,
1365 | "-m", model_path,
1366 | "-f", audio_path,
1367 | "-of", output_base,
1368 | "-oj" # Always output JSON for post-processing
1369 | ]
1370 |
1371 | # Add boolean flags
1372 | if params.word_timestamps:
1373 | cmd.append("-pc")
1374 |
1375 | if params.translate:
1376 | cmd.append("-tr")
1377 |
1378 | # Always output text for readability
1379 | cmd.append("-otxt")
1380 |
1381 | # Add numeric parameters
1382 | cmd.extend(["-t", str(os.cpu_count() if params.processors <= 0 else params.processors)])
1383 |
1384 | if params.beam_size:
1385 | cmd.extend(["-bs", str(params.beam_size)])
1386 |
1387 | # Add language parameter if specified
1388 | if params.language:
1389 | cmd.extend(["-l", params.language])
1390 |
1391 | # Add max context parameter if specified
1392 | if params.max_context > 0:
1393 | cmd.extend(["-mc", str(params.max_context)])
1394 |
1395 | # Additional optimizations
1396 | cmd.append("-fa") # Full sentence timestamps (improved segmentation)
1397 | cmd.append("-pp") # Enable post-processing
1398 |
1399 | # Add custom vocab if specified (create a vocab file)
1400 | if params.custom_vocab:
1401 | vocab_path = os.path.join(context.temp_dir, "custom_vocab.txt")
1402 | try:
1403 | async with aiofiles.open(vocab_path, 'w') as f:
1404 | await f.write("\n".join(params.custom_vocab))
1405 | cmd.extend(["-kv", vocab_path])
1406 | except Exception as e:
1407 | logger.warning(f"Failed to create custom vocab file: {e}", emoji_key="warning")
1408 |
1409 | # Add diarization if requested
1410 | if params.diarize:
1411 | cmd.append("-dm")
1412 |
1413 | cmd_str = ' '.join(cmd)
1414 | logger.debug(f"Running whisper command: {cmd_str}", emoji_key="command")
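    # Illustrative example of an assembled command (paths and values are hypothetical;
    # optional flags such as -bs, -l, -mc, -kv and -dm appear only when the
    # corresponding options are set):
    #   ~/whisper.cpp/build/bin/whisper-cli -m ~/whisper.cpp/models/ggml-large-v3.bin \
    #       -f /tmp/job/talk_enhanced.wav -of /tmp/job/talk -oj -otxt -t 8 -fa -pp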
1415 |
1416 | try:
1417 | process = await asyncio.create_subprocess_exec(
1418 | *cmd,
1419 | stdout=asyncio.subprocess.PIPE,
1420 | stderr=asyncio.subprocess.PIPE
1421 | )
1422 | stdout, stderr = await process.communicate()
1423 |
1424 | stderr_output = stderr.decode('utf-8', errors='ignore') if stderr else ""
1425 | stdout_output = stdout.decode('utf-8', errors='ignore') if stdout else ""
1426 |
1427 | # Log outputs for debugging
1428 | if stdout_output:
1429 | logger.debug(f"Whisper stdout (first 500 chars): {stdout_output[:500]}", emoji_key="info")
1430 |
1431 | if stderr_output:
1432 | if process.returncode != 0:
1433 | logger.error(f"Whisper stderr: {stderr_output}", emoji_key="error")
1434 | elif "error" in stderr_output.lower() or "warning" in stderr_output.lower():
1435 | logger.warning(f"Whisper warnings/errors: {stderr_output}", emoji_key="warning")
1436 | else:
1437 | logger.debug(f"Whisper stderr: {stderr_output}", emoji_key="info")
1438 |
1439 | if process.returncode != 0:
1440 | error_msg = stderr_output or "Unknown error"
1441 | logger.error(f"Whisper transcription error (exit code {process.returncode}): {error_msg}", emoji_key="error")
1442 | raise ToolError(f"Whisper transcription failed with exit code {process.returncode}: {error_msg}")
1443 |
1444 | # Check if output files exist
1445 | if not os.path.exists(output_json) and not os.path.exists(output_txt):
1446 | logger.error("Whisper completed successfully but no output files were created", emoji_key="error")
1447 | raise ToolError("Whisper completed successfully but no output files were created")
1448 |
1449 | # Read results from the JSON file
1450 | if os.path.exists(output_json):
1451 | json_file_size = os.path.getsize(output_json)
1452 | logger.debug(f"Reading JSON output file: {output_json} ({json_file_size} bytes)", emoji_key="info")
1453 |
1454 | if json_file_size < 10: # Suspiciously small
1455 | logger.warning(f"JSON output file is suspiciously small: {json_file_size} bytes", emoji_key="warning")
1456 |
1457 | try:
1458 | async with aiofiles.open(output_json, 'r') as f:
1459 | content = await f.read()
1460 |
1461 | try:
1462 | result = json.loads(content)
1463 |
1464 | # Fix missing fields in result if needed
1465 | if "segments" not in result:
1466 | logger.warning("No segments found in Whisper JSON output", emoji_key="warning")
1467 | result["segments"] = []
1468 |
1469 | if "text" not in result or not result.get("text"):
1470 | logger.warning("No transcript text found in Whisper JSON output", emoji_key="warning")
1471 | # Try to construct text from segments
1472 | if result.get("segments"):
1473 | reconstructed_text = " ".join([seg.get("text", "") for seg in result["segments"]])
1474 | if reconstructed_text:
1475 | logger.info("Reconstructed transcript text from segments", emoji_key="info")
1476 | result["text"] = reconstructed_text
1477 | else:
1478 | result["text"] = ""
1479 | else:
1480 | result["text"] = ""
1481 |
1482 | # Extract metadata
1483 | metadata = {
1484 | "language": context.language_code or result.get("language"),
1485 | "duration": result.get("duration", 0)
1486 | }
1487 | result["metadata"] = metadata
1488 |
1489 | except json.JSONDecodeError as e:
1490 | logger.error(f"Failed to parse Whisper JSON output: {e}", emoji_key="error")
1491 | logger.error(f"JSON content: {content[:1000]}...", emoji_key="error")
1492 | raise ToolError(f"Failed to parse Whisper output JSON: {e}") from e
1493 | except Exception as e:
1494 | logger.error(f"Failed to read Whisper JSON output file: {e}", emoji_key="error")
1495 | raise ToolError(f"Failed to read Whisper output: {e}") from e
1496 | else:
1497 | logger.warning(f"Whisper JSON output not found at expected path: {output_json}", emoji_key="warning")
1498 | # Fallback to text file
1499 | if os.path.exists(output_txt):
1500 | txt_file_size = os.path.getsize(output_txt)
1501 | logger.info(f"Falling back to text output file: {output_txt} ({txt_file_size} bytes)", emoji_key="info")
1502 |
1503 | if txt_file_size < 10: # Suspiciously small
1504 | logger.warning(f"Text output file is suspiciously small: {txt_file_size} bytes", emoji_key="warning")
1505 |
1506 | try:
1507 | async with aiofiles.open(output_txt, 'r') as f:
1508 | text = await f.read()
1509 |
1510 | # Create minimal result structure
1511 | result = {
1512 | "text": text,
1513 | "segments": [{"text": text, "start": 0, "end": 0}],
1514 | "metadata": {
1515 | "language": context.language_code,
1516 | "duration": 0
1517 | }
1518 | }
1519 |
1520 | if not text:
1521 | logger.warning("Text output file is empty", emoji_key="warning")
1522 | except Exception as e:
1523 | logger.error(f"Failed to read text output file: {e}", emoji_key="error")
1524 | raise ToolError(f"Failed to read Whisper text output: {e}") from e
1525 | else:
1526 | logger.error(f"No output files found from Whisper at {output_base}.*", emoji_key="error")
1527 | raise ToolError("No output files found from Whisper transcription")
1528 |
1529 | # Check if we actually got a transcript
1530 | if not result.get("text"):
1531 | logger.warning("Whisper returned an empty transcript", emoji_key="warning")
1532 |
1533 | # Clean up results (remove empty/duplicate segments)
1534 | cleaned_segments = clean_segments(result.get("segments", []))
1535 | result["segments"] = cleaned_segments
1536 |
1537 | # Clean up text (remove dots-only lines, etc.)
1538 | result["text"] = clean_raw_transcript(result.get("text", ""))
1539 |
1540 | return result
1541 |
1542 | except ToolError:
1543 | raise
1544 | except Exception as e:
1545 | logger.error(f"Error in Whisper transcription: {e}", emoji_key="error", exc_info=True)
1546 | raise ToolError(f"Whisper transcription failed: {str(e)}") from e
1547 |
1548 |
1549 | def clean_segments(segments: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
1550 | """Clean and normalize segment data."""
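    # Illustrative behaviour: segments whose text is only dots/whitespace are dropped, and
    # two segments with identical text whose start/end differ by less than 0.5s collapse
    # into one; repeats with clearly different timing are kept as genuine repetition.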
1551 | cleaned_segments = []
1552 | seen_texts = set()
1553 |
1554 | for segment in segments:
1555 | # Skip segments with empty or meaningless text
1556 | text = segment.get("text", "").strip()
1557 | if not text or re.match(r'^[\s.]*$', text):
1558 | continue
1559 |
1560 |         # Skip exact text duplicates whose start/end times are within 0.5s of a
1561 |         # previously kept segment; repeats with clearly different timing are kept
1562 | is_duplicate = False
1563 | if text in seen_texts:
1564 | for prev_segment in cleaned_segments:
1565 | if prev_segment.get("text") == text:
1566 | start_diff = abs(prev_segment.get("start", 0) - segment.get("start", 0))
1567 | end_diff = abs(prev_segment.get("end", 0) - segment.get("end", 0))
1568 | if start_diff < 0.5 and end_diff < 0.5:
1569 | is_duplicate = True
1570 | break
1571 |
1572 | if is_duplicate:
1573 | continue
1574 |
1575 | # Add to seen texts
1576 | seen_texts.add(text)
1577 |
1578 | # Standardize segment structure
1579 | clean_segment = {
1580 | "start": float(segment.get("start", 0)),
1581 | "end": float(segment.get("end", 0)),
1582 | "text": text
1583 | }
1584 |
1585 | # Add optional fields if available
1586 | for field in ["speaker", "words", "confidence"]:
1587 | if field in segment:
1588 | clean_segment[field] = segment[field]
1589 |
1590 | cleaned_segments.append(clean_segment)
1591 |
1592 | # Sort by start time
1593 | cleaned_segments.sort(key=lambda x: x["start"])
1594 |
1595 | return cleaned_segments
1596 |
1597 |
1598 | def clean_raw_transcript(text: str) -> str:
1599 | """Cleans raw transcript."""
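    # Illustrative behaviour: blank lines, dot-only lines and "[BLANK_AUDIO]" markers are
    # dropped, runs of whitespace collapse to single spaces, and short duplicate lines
    # (50 characters or fewer) are removed while longer repeats are kept.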
1600 | if not text:
1601 | return ""
1602 |
1603 | # Split into lines and process
1604 | lines = text.split("\n")
1605 | cleaned_lines = []
1606 | seen_lines = set()
1607 |
1608 | for line in lines:
1609 | line = line.strip()
1610 | # Skip empty lines
1611 | if not line:
1612 | continue
1613 | # Skip lines with just dots or other meaningless patterns
1614 | if re.match(r'^[\s.]*$', line) or line == '[BLANK_AUDIO]':
1615 | continue
1616 |
1617 | # Standardize multiple spaces
1618 | line = re.sub(r'\s+', ' ', line)
1619 |
1620 |         # Drop short duplicate lines (<= 50 chars); longer repeats are likely legitimate repetition
1621 | if line in seen_lines and len(line) <= 50:
1622 | continue
1623 |
1624 | seen_lines.add(line)
1625 | cleaned_lines.append(line)
1626 |
1627 | # Join but ensure there's proper spacing
1628 | return "\n".join(cleaned_lines)
1629 |
1630 |
1631 | # --- Transcript Enhancement Functions ---
1632 |
1633 | async def enhance_transcript(
1634 | context: ProcessingContext,
1635 | transcript: str,
1636 | metadata: Dict[str, Any]
1637 | ) -> Dict[str, Any]:
1638 | """Enhance a transcript with formatting, readability and semantic structuring."""
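    # Pipeline sketch: split the raw transcript into chunks -> analyze the first chunk for
    # context, topics and a title -> enhance each chunk with the LLM (in parallel or
    # sequentially, per options) -> optionally add section headings -> total up tokens/cost.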
1639 | if not transcript or transcript.strip() == "":
1640 | return {
1641 | "transcript": "",
1642 | "tokens": {"input": 0, "output": 0, "total": 0},
1643 | "cost": 0.0,
1644 | "topics": [],
1645 | "title": None
1646 | }
1647 |
1648 | # Extract key parameters
1649 | params = context.options.enhancement_params
1650 |
1651 | # Split transcript into manageable chunks
1652 | chunks = await chunk_text(transcript, params.max_chunk_size)
1653 |
1654 | if not chunks:
1655 | return {
1656 | "transcript": transcript,
1657 | "tokens": {"input": 0, "output": 0, "total": 0},
1658 | "cost": 0.0,
1659 | "topics": [],
1660 | "title": None
1661 | }
1662 |
1663 | # First analyze context to get a summary of the content
1664 | context_data = await detect_subject_matter(
1665 | chunks[0],
1666 | params.provider,
1667 | params.model,
1668 | metadata
1669 | )
1670 |
1671 | # Track topics if available
1672 | topics = context_data.get("topics", [])
1673 | title = context_data.get("title")
1674 | context_info = context_data.get("context", "")
1675 |
1676 | logger.info(f"Content analysis complete: {len(topics)} topics identified", emoji_key="analyze")
1677 |
1678 | # Process chunks concurrently if parallel processing is enabled
1679 | if context.options.parallel_processing and len(chunks) > 1:
1680 | enhanced_chunks = await process_chunks_parallel(
1681 | context,
1682 | chunks,
1683 | context_info,
1684 | params
1685 | )
1686 | else:
1687 | enhanced_chunks = await process_chunks_sequential(
1688 | context,
1689 | chunks,
1690 | context_info,
1691 | params
1692 | )
1693 |
1694 | # Calculate total metrics
1695 | total_tokens = {"input": 0, "output": 0, "total": 0}
1696 | total_cost = 0.0
1697 |
1698 | for chunk_data in enhanced_chunks:
1699 | chunk_tokens = chunk_data.get("tokens", {})
1700 | total_tokens["input"] += chunk_tokens.get("input", 0)
1701 | total_tokens["output"] += chunk_tokens.get("output", 0)
1702 | total_tokens["total"] += chunk_tokens.get("total", 0)
1703 | total_cost += chunk_data.get("cost", 0.0)
1704 |
1705 | # Join the enhanced chunks
1706 | enhanced_transcript = "\n\n".join(chunk_data["text"] for chunk_data in enhanced_chunks)
1707 |
1708 | # If sections are enabled, try to add section headings
1709 | if params.sections and topics:
1710 | enhanced_transcript = await add_section_headings(
1711 | enhanced_transcript,
1712 | topics,
1713 | params.provider,
1714 | params.model
1715 | )
1716 |
1717 | return {
1718 | "transcript": enhanced_transcript,
1719 | "tokens": total_tokens,
1720 | "cost": total_cost,
1721 | "topics": topics,
1722 | "title": title
1723 | }
1724 |
1725 |
1726 | async def chunk_text(text: str, max_chunk_size: int = 6500) -> List[str]:
1727 | """Split text into chunks with intelligent boundary detection."""
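    # Boundary preference, strongest to weakest: triple newline, double newline, sentence
    # end followed by a capital letter, any sentence end, then comma/colon/semicolon.
    # If no match lands past the halfway point, fall back to the last space (or a hard cut),
    # and chunks shorter than ~100 characters are merged into the previous chunk at the end.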
1728 | if len(text) <= max_chunk_size:
1729 | return [text]
1730 |
1731 | # Define patterns for natural breaks, prioritized
1732 | patterns = [
1733 | r'\n\s*\n\s*\n', # Triple line break (highest priority)
1734 | r'\n\s*\n', # Double line break
1735 | r'(?<=[.!?])\s+(?=[A-Z])', # Sentence boundary with capital letter following
1736 | r'(?<=[.!?])\s', # Any sentence boundary
1737 | r'(?<=[,:;])\s' # Phrase boundary (lowest priority)
1738 | ]
1739 |
1740 | chunks = []
1741 | remaining_text = text
1742 |
1743 | while remaining_text:
1744 | if len(remaining_text) <= max_chunk_size:
1745 | chunks.append(remaining_text)
1746 | break
1747 |
1748 | # Start with an initial chunk at max size
1749 | chunk_candidate = remaining_text[:max_chunk_size]
1750 | split_position = None
1751 |
1752 | # Try each pattern in order of priority
1753 | for pattern in patterns:
1754 | # Look for the last occurrence of the pattern
1755 | matches = list(re.finditer(pattern, chunk_candidate))
1756 | if matches:
1757 | # Use the last match as the split point
1758 | split_position = matches[-1].end()
1759 | break
1760 |
1761 |         # Fallback if no natural break was found, or the best break sits in the first half of the chunk
1762 | if split_position is None or split_position < max_chunk_size // 2:
1763 | # Look for the last space after a minimum chunk size
1764 | min_size = max(max_chunk_size // 2, 1000)
1765 | last_space = chunk_candidate.rfind(' ', min_size)
1766 | if last_space > min_size:
1767 | split_position = last_space
1768 | else:
1769 | # Forced split at max_chunk_size
1770 | split_position = max_chunk_size
1771 |
1772 | # Create chunk and update remaining text
1773 | chunks.append(remaining_text[:split_position].strip())
1774 | remaining_text = remaining_text[split_position:].strip()
1775 |
1776 | # Validate chunks
1777 | validated_chunks = []
1778 | for chunk in chunks:
1779 | if chunk and len(chunk) >= 100: # Minimum viable chunk size
1780 | validated_chunks.append(chunk)
1781 | elif chunk:
1782 | # If chunk is too small, combine with previous or next chunk
1783 | if validated_chunks:
1784 | validated_chunks[-1] += "\n\n" + chunk
1785 | else:
1786 | # This is the first chunk and it's too small - rare case
1787 | validated_chunks.append(chunk)
1788 |
1789 | return validated_chunks
1790 |
1791 |
1792 | async def detect_subject_matter(
1793 | text: str,
1794 | provider: str,
1795 | model: Optional[str] = None,
1796 | metadata: Optional[Dict[str, Any]] = None
1797 | ) -> Dict[str, Any]:
1798 | """Analyze transcript to determine subject matter, topics, and title."""
1799 | prompt = """Analyze this transcript excerpt for the following:
1800 |
1801 | 1. CONTEXT: The primary domain or topic being discussed (e.g., technology, business, healthcare, etc.)
1802 | 2. SPEAKERS: The likely type and number of speakers (e.g., interview, panel, lecture, etc.)
1803 | 3. TOPICS: List 2-5 specific topics covered, in order of importance
1804 | 4. TITLE: A short, descriptive title for this content (under 10 words)
1805 |
1806 | Return your analysis in JSON format ONLY:
1807 | {{
1808 | "context": "Brief description of the domain and conversation type",
1809 | "topics": ["Topic 1", "Topic 2", "Topic 3"],
1810 | "title": "Concise descriptive title"
1811 | }}
1812 |
1813 | Transcript excerpt:
1814 | {text}"""
1815 |
1816 | # Include metadata if available
1817 | if metadata and metadata.get("language"):
1818 | language = metadata.get("language")
1819 | prompt += f"\n\nMetadata: The transcript language is {language}."
1820 |
1821 | try:
1822 | result = await chat_completion(
1823 | messages=[{"role": "user", "content": prompt.format(text=text)}],
1824 | provider=provider,
1825 | model=model,
1826 | temperature=0
1827 | )
1828 |
1829 | if result.get("success") and "message" in result:
1830 | content = result["message"].get("content", "")
1831 |
1832 | # Try to parse JSON from content
1833 | try:
1834 | # Extract JSON if it's embedded in text
1835 | json_match = re.search(r'({[\s\S]*})', content)
1836 | if json_match:
1837 | analysis = json.loads(json_match.group(1))
1838 | else:
1839 | analysis = json.loads(content)
1840 |
1841 | return {
1842 | "context": analysis.get("context", ""),
1843 | "topics": analysis.get("topics", []),
1844 | "title": analysis.get("title")
1845 | }
1846 | except json.JSONDecodeError:
1847 | # Fallback: extract fields manually
1848 | context_match = re.search(r'context["\s:]+([^"}\n]+)', content, re.IGNORECASE)
1849 | title_match = re.search(r'title["\s:]+([^"}\n]+)', content, re.IGNORECASE)
1850 | topics_match = re.search(r'topics["\s:]+\[(.*?)\]', content, re.IGNORECASE | re.DOTALL)
1851 |
1852 | context = context_match.group(1).strip() if context_match else ""
1853 | title = title_match.group(1).strip() if title_match else None
1854 |
1855 | topics = []
1856 | if topics_match:
1857 | topics_text = topics_match.group(1)
1858 | topics = [t.strip().strip('"\'') for t in re.findall(r'"([^"]+)"', topics_text)]
1859 | if not topics:
1860 | topics = [t.strip() for t in topics_text.split(',') if t.strip()]
1861 |
1862 | return {
1863 | "context": context,
1864 | "topics": topics,
1865 | "title": title
1866 | }
1867 | except Exception as e:
1868 | logger.warning(f"Subject matter detection failed: {e}", emoji_key="warning")
1869 |
1870 | return {
1871 | "context": "",
1872 | "topics": [],
1873 | "title": None
1874 | }
1875 |
1876 |
1877 | async def process_chunks_parallel(
1878 | context: ProcessingContext,
1879 | chunks: List[str],
1880 | context_info: str,
1881 | params: TranscriptEnhancementParams
1882 | ) -> List[Dict[str, Any]]:
1883 | """Process transcript chunks in parallel for better performance."""
1884 | # Limit max workers to CPU count or specified max
1885 | max_workers = min(context.options.max_workers, os.cpu_count() or 4, len(chunks))
1886 |
1887 | # Create a semaphore to limit concurrency
1888 | sem = asyncio.Semaphore(max_workers)
1889 |
1890 |     # Each chunk is enhanced as a separate async task; the semaphore above caps concurrency
1892 |
1893 | async def process_chunk(i, chunk):
1894 | """Process an individual chunk."""
1895 | async with sem: # Use semaphore to limit concurrency
1896 | logger.info(f"Enhancing chunk {i+1}/{len(chunks)}", emoji_key="enhance")
1897 | try:
1898 | result = await enhance_chunk(chunk, context_info, params, i, len(chunks))
1899 | return result
1900 | except Exception as e:
1901 | logger.error(f"Error enhancing chunk {i+1}: {e}", emoji_key="error", exc_info=True)
1902 | return {"text": chunk, "tokens": {"input": 0, "output": 0, "total": 0}, "cost": 0.0}
1903 |
1904 | # Create tasks for parallel execution
1905 | tasks = [process_chunk(i, chunk) for i, chunk in enumerate(chunks)]
1906 | chunk_results = await asyncio.gather(*tasks)
1907 |
1908 | # Make sure results are in original order (they should be)
1909 | return chunk_results
1910 |
1911 |
1912 | async def process_chunks_sequential(
1913 | context: ProcessingContext,
1914 | chunks: List[str],
1915 | context_info: str,
1916 | params: TranscriptEnhancementParams
1917 | ) -> List[Dict[str, Any]]:
1918 | """Process transcript chunks sequentially to preserve context flow."""
1919 | enhanced_chunks = []
1920 | accumulated_context = context_info
1921 |
1922 | for i, chunk in enumerate(chunks):
1923 | logger.info(f"Enhancing chunk {i+1}/{len(chunks)}", emoji_key="enhance")
1924 |
1925 | # Update context with information from previous chunks
1926 | if i > 0 and enhanced_chunks:
1927 | # Add brief summary of what was covered in previous chunk
1928 | previous_text = enhanced_chunks[-1]["text"]
1929 | if len(previous_text) > 500:
1930 | accumulated_context += f"\nPrevious chunk ended with: {previous_text[-500:]}"
1931 | else:
1932 | accumulated_context += f"\nPrevious chunk: {previous_text}"
1933 |
1934 | try:
1935 | result = await enhance_chunk(chunk, accumulated_context, params, i, len(chunks))
1936 | enhanced_chunks.append(result)
1937 | except Exception as e:
1938 | logger.error(f"Error enhancing chunk {i+1}: {e}", emoji_key="error", exc_info=True)
1939 | # Use original text on error
1940 | enhanced_chunks.append({"text": chunk, "tokens": {"input": 0, "output": 0, "total": 0}, "cost": 0.0})
1941 |
1942 | return enhanced_chunks
1943 |
1944 |
1945 | async def enhance_chunk(
1946 | chunk: str,
1947 | context_info: str,
1948 | params: TranscriptEnhancementParams,
1949 | chunk_index: int,
1950 | total_chunks: int
1951 | ) -> Dict[str, Any]:
1952 | """Enhance a single transcript chunk with LLM."""
1953 | # Build the prompt based on enhancement parameters
1954 | style_instructions = _get_style_instructions(params.style)
1955 | fix_instructions = []
1956 |
1957 | if params.add_paragraphs:
1958 | fix_instructions.append("- Add paragraph breaks at natural topic transitions")
1959 |
1960 | if params.fix_spelling:
1961 | fix_instructions.append("- Fix obvious spelling errors while preserving domain-specific terms")
1962 |
1963 | if params.fix_grammar:
1964 | fix_instructions.append("- Fix basic grammatical errors without changing style or meaning")
1965 |
1966 | if params.format_numbers:
1967 | fix_instructions.append("- Format numbers consistently (e.g., '25' instead of 'twenty-five')")
1968 |
1969 | if params.identify_speakers:
1970 | fix_instructions.append("- Try to identify different speakers and label them as Speaker 1, Speaker 2, etc.")
1971 | fix_instructions.append("- Format speaker changes as 'Speaker N: text' on a new line")
1972 |
1973 | fix_section = "\n".join(fix_instructions) if fix_instructions else "None"
1974 |
1975 | # Add custom instructions if provided
1976 | custom_section = f"\nADDITIONAL INSTRUCTIONS:\n{params.custom_instructions}" if params.custom_instructions else ""
1977 |
1978 | # Mention chunk position in context
1979 | position_info = f"This is chunk {chunk_index+1} of {total_chunks}." if total_chunks > 1 else ""
1980 |
1981 | prompt = f"""You are cleaning up a raw transcript from a recorded conversation. {position_info}
1982 |
1983 | CONTENT CONTEXT: {context_info}
1984 |
1985 | ENHANCEMENT STYLE: {style_instructions}
1986 |
1987 | CLEANUP INSTRUCTIONS:
1988 | 1. Remove filler sounds: "um", "uh", "er", "ah", "hmm"
1989 | 2. Remove stutters and word repetitions: "the- the", "I- I"
1990 | 3. Remove meaningless filler phrases when used as pure filler: "you know", "like", "sort of"
1991 | 4. Fix clear transcription errors and garbled text
1992 | 5. Add proper punctuation for readability
1993 | {fix_section}
1994 |
1995 | STRICT PRESERVATION RULES:
1996 | 1. DO NOT modify, rephrase, or restructure ANY of the speaker's content
1997 | 2. DO NOT add ANY new content or explanations
1998 | 3. DO NOT make the language more formal or technical
1999 | 4. DO NOT summarize or condense anything
2000 | 5. PRESERVE ALL technical terms, numbers, and specific details exactly as spoken
2001 | 6. PRESERVE the speaker's unique speaking style and personality
2002 | {custom_section}
2003 |
2004 | Here's the transcript chunk to clean:
2005 | {chunk}
2006 |
2007 | Return ONLY the cleaned transcript text with no explanations, comments, or metadata."""
2008 |
2009 | try:
2010 | result = await chat_completion(
2011 | messages=[{"role": "user", "content": prompt}],
2012 | provider=params.provider,
2013 | model=params.model,
2014 | temperature=0.1, # Low temperature for consistent results
2015 | max_tokens=min(len(chunk) * 2, 8192) # Reasonable token limit based on input size
2016 | )
2017 |
2018 | if result.get("success") and "message" in result:
2019 | enhanced_text = result["message"].get("content", "").strip()
2020 |
2021 | # Validation: if enhanced text is much shorter, it might have been summarized
2022 | if len(enhanced_text) < len(chunk) * 0.6:
2023 | logger.warning(
2024 | f"Enhanced text suspiciously short ({len(enhanced_text)} vs {len(chunk)} chars), "
2025 | f"may have been summarized. Using original.",
2026 | emoji_key="warning"
2027 | )
2028 | enhanced_text = chunk
2029 |
2030 | return {
2031 | "text": enhanced_text,
2032 | "tokens": result.get("tokens", {"input": 0, "output": 0, "total": 0}),
2033 | "cost": result.get("cost", 0.0)
2034 | }
2035 |
2036 | return {"text": chunk, "tokens": {"input": 0, "output": 0, "total": 0}, "cost": 0.0}
2037 | except Exception as e:
2038 | logger.error(f"Chunk enhancement error: {e}", emoji_key="error")
2039 | return {"text": chunk, "tokens": {"input": 0, "output": 0, "total": 0}, "cost": 0.0}
2040 |
2041 |
2042 | def _get_style_instructions(style: EnhancementStyle) -> str:
2043 | """Get instructions for the specified enhancement style."""
2044 | styles = {
2045 | EnhancementStyle.RAW: (
2046 | "Minimal cleaning only. Preserve all speech patterns and informality. "
2047 | "Focus on removing only transcription errors and unintelligible elements."
2048 | ),
2049 | EnhancementStyle.READABLE: (
2050 | "Basic readability improvements. Light cleanup while preserving natural speech patterns. "
2051 | "Remove only clear disfluencies and maintain conversational flow."
2052 | ),
2053 | EnhancementStyle.POLISHED: (
2054 | "Well-formatted with proper punctuation and clean sentences. "
2055 | "Remove speech disfluencies but preserve the speaker's voice and style. "
2056 | "Create a professional but authentic reading experience."
2057 | ),
2058 | EnhancementStyle.VERBATIM: (
2059 | "Preserve all speech patterns, hesitations, and repetitions. "
2060 | "Format for readability but maintain every verbal quirk and pause. "
2061 | "Indicate hesitations with ellipses [...] and preserve every repeated word or phrase."
2062 | ),
2063 | EnhancementStyle.STRUCTURED: (
2064 | "Add semantic structure with clear paragraphs around topics. "
2065 | "Clean speech for maximum readability while preserving content accuracy. "
2066 | "Organize into logical sections while keeping all original information."
2067 | )
2068 | }
2069 |
2070 | return styles.get(style, styles[EnhancementStyle.READABLE])
2071 |
2072 |
2073 | async def add_section_headings(
2074 | transcript: str,
2075 | topics: List[str],
2076 | provider: str,
2077 | model: Optional[str] = None
2078 | ) -> str:
2079 | """Add section headings to the transcript based on topic changes."""
2080 | if not transcript or not topics:
2081 | return transcript
2082 |
2083 | prompt = """Add clear section headings to this transcript based on topic changes.
2084 |
2085 | TOPICS COVERED (in approximate order):
2086 | {topics}
2087 |
2088 | RULES:
2089 | 1. Insert section headings as "## [Topic]" on their own line
2090 | 2. Place headings ONLY where there is a clear topic change
2091 | 3. Use at most {max_sections} headings total
2092 | 4. NEVER add content or edit the existing text
2093 | 5. NEVER remove any original content
2094 | 6. Base headings on the given topics list, but you can adjust wording for clarity
2095 | 7. Don't duplicate headings for the same topic
2096 | 8. Keep headings short and descriptive (2-6 words each)
2097 |
2098 | TRANSCRIPT:
2099 | {text}
2100 |
2101 | Return the full transcript with section headings added."""
2102 |
2103 | # Adjust max sections based on transcript length
2104 | token_estimate = len(transcript) // 4
2105 | max_sections = min(len(topics) + 1, token_estimate // 1000 + 1)
2106 | topics_text = "\n".join([f"- {topic}" for topic in topics])
2107 |
2108 | try:
2109 | result = await chat_completion(
2110 | messages=[{
2111 | "role": "user",
2112 | "content": prompt.format(topics=topics_text, text=transcript, max_sections=max_sections)
2113 | }],
2114 | provider=provider,
2115 | model=model,
2116 | temperature=0.1,
2117 | max_tokens=min(len(transcript) * 2, 8192)
2118 | )
2119 |
2120 | if result.get("success") and "message" in result:
2121 | return result["message"].get("content", "").strip()
2122 |
2123 | return transcript
2124 | except Exception as e:
2125 | logger.warning(f"Failed to add section headings: {e}", emoji_key="warning")
2126 | return transcript
2127 |
2128 |
2129 | # --- Output File Generation ---
2130 |
2131 | async def generate_output_files(
2132 | context: ProcessingContext,
2133 | raw_transcript: str,
2134 | enhanced_transcript: str,
2135 | segments: List[Dict[str, Any]],
2136 | metadata: Dict[str, Any]
2137 | ) -> Dict[str, Any]:
2138 | """Generate output files in requested formats."""
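    # Illustrative return shape (file names here are hypothetical):
    # {
    #     "output_files": {"json": Path(".../talk_transcript_20240101_120000.json"),
    #                      "text": Path(".../talk_transcript_20240101_120000.txt")},
    #     "enhanced_audio": ".../talk_enhanced.wav"  # only when save_enhanced_audio is set
    # }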
2139 | artifact_paths = {
2140 | "output_files": {}
2141 | }
2142 |
2143 | # Save enhanced audio path if requested
2144 | if context.options.save_enhanced_audio and context.enhanced_audio_path:
2145 | original_dir = os.path.dirname(context.file_path)
2146 | persistent_path = os.path.join(original_dir, f"{context.base_filename}_enhanced.wav")
2147 |
2148 | # May have already been saved during enhancement
2149 | if not os.path.exists(persistent_path) and os.path.exists(context.enhanced_audio_path):
2150 | shutil.copy2(context.enhanced_audio_path, persistent_path)
2151 |
2152 | artifact_paths["enhanced_audio"] = persistent_path
2153 |
2154 | # Generate requested output formats
2155 | output_formats = context.options.output_formats
2156 |     # Use a Path for the output directory so the per-format output paths are easy to build
2157 |     output_dir = Path(os.path.dirname(context.file_path))
2158 |     timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
2162 |
2163 | # Generate JSON output
2164 | if OutputFormat.JSON in output_formats:
2165 | json_path = output_dir / f"{context.base_filename}_transcript_{timestamp}.json"
2166 |
2167 | # Create JSON data structure
2168 | json_data = {
2169 | "metadata": {
2170 | "filename": context.original_filename,
2171 | "processed_at": timestamp,
2172 | **metadata
2173 | },
2174 | "raw_transcript": raw_transcript,
2175 | "enhanced_transcript": enhanced_transcript,
2176 | "segments": segments
2177 | }
2178 |
2179 | # Write JSON file
2180 | async with aiofiles.open(str(json_path), 'w') as f:
2181 | await f.write(json.dumps(json_data, indent=2))
2182 |
2183 | artifact_paths["output_files"]["json"] = json_path
2184 |
2185 | # Generate TEXT output
2186 | if OutputFormat.TEXT in output_formats:
2187 | text_path = output_dir / f"{context.base_filename}_transcript_{timestamp}.txt"
2188 |
2189 | # Create plain text file
2190 | async with aiofiles.open(str(text_path), 'w') as f:
2191 | # Add metadata header if available
2192 | if metadata:
2193 | if "title" in metadata and metadata["title"]:
2194 | await f.write(f"Title: {metadata['title']}\n")
2195 | if "language" in metadata and metadata["language"]:
2196 | await f.write(f"Language: {metadata['language']}\n")
2197 | if "topics" in metadata and metadata["topics"]:
2198 | topics_str = ", ".join(metadata["topics"])
2199 | await f.write(f"Topics: {topics_str}\n")
2200 | await f.write("\n")
2201 |
2202 | # Write enhanced transcript if available, otherwise use raw transcript
2203 | await f.write(enhanced_transcript or raw_transcript)
2204 |
2205 | artifact_paths["output_files"]["text"] = text_path
2206 |
2207 | # Generate SRT output
2208 | if OutputFormat.SRT in output_formats:
2209 | srt_path = output_dir / f"{context.base_filename}_transcript_{timestamp}.srt"
2210 |
2211 | # Convert segments to SRT format
2212 | srt_content = generate_srt(segments)
2213 |
2214 | # Write SRT file
2215 | async with aiofiles.open(str(srt_path), 'w') as f:
2216 | await f.write(srt_content)
2217 |
2218 | artifact_paths["output_files"]["srt"] = srt_path
2219 |
2220 | # Generate VTT output
2221 | if OutputFormat.VTT in output_formats:
2222 | vtt_path = output_dir / f"{context.base_filename}_transcript_{timestamp}.vtt"
2223 |
2224 | # Convert segments to VTT format
2225 | vtt_content = generate_vtt(segments)
2226 |
2227 | # Write VTT file
2228 | async with aiofiles.open(str(vtt_path), 'w') as f:
2229 | await f.write(vtt_content)
2230 |
2231 | artifact_paths["output_files"]["vtt"] = vtt_path
2232 |
2233 | # Generate Markdown output
2234 | if OutputFormat.MARKDOWN in output_formats:
2235 | md_path = output_dir / f"{context.base_filename}_transcript_{timestamp}.md"
2236 |
2237 | # Create markdown content
2238 | md_content = generate_markdown(enhanced_transcript, metadata)
2239 |
2240 | # Write markdown file
2241 | async with aiofiles.open(str(md_path), 'w') as f:
2242 | await f.write(md_content)
2243 |
2244 | artifact_paths["output_files"]["markdown"] = md_path
2245 |
2246 | # Generate DOCX output (if supported)
2247 | if OutputFormat.DOCX in output_formats:
2248 | try:
2249 | docx_path = output_dir / f"{context.base_filename}_transcript_{timestamp}.docx"
2250 |
2251 | # Generate DOCX in a thread pool to avoid blocking
2252 | with concurrent.futures.ThreadPoolExecutor() as executor:
2253 | await asyncio.get_event_loop().run_in_executor(
2254 | executor,
2255 | generate_docx,
2256 | docx_path,
2257 | enhanced_transcript,
2258 | metadata
2259 | )
2260 |
2261 | artifact_paths["output_files"]["docx"] = docx_path
2262 |         except Exception as e:  # covers a missing python-docx install as well as generation errors
2263 | logger.warning(f"Failed to generate DOCX output: {e}", emoji_key="warning")
2264 |
2265 | return artifact_paths
2266 |
2267 |
2268 | def generate_srt(segments: List[Dict[str, Any]]) -> str:
2269 | """Generate SRT format from segments."""
2270 | srt_lines = []
2271 |
2272 | for i, segment in enumerate(segments):
2273 | # Convert times to SRT format (HH:MM:SS,mmm)
2274 | start_time = segment.get("start", 0)
2275 | end_time = segment.get("end", 0)
2276 |
2277 | start_str = format_srt_time(start_time)
2278 | end_str = format_srt_time(end_time)
2279 |
2280 | # Format text
2281 | text = segment.get("text", "").replace("\n", " ")
2282 |
2283 | # Add speaker if available
2284 | if "speaker" in segment and segment["speaker"]:
2285 | text = f"[{segment['speaker']}] {text}"
2286 |
2287 | # Add to SRT
2288 | srt_lines.append(f"{i+1}")
2289 | srt_lines.append(f"{start_str} --> {end_str}")
2290 | srt_lines.append(f"{text}")
2291 | srt_lines.append("") # Empty line between entries
2292 |
2293 | return "\n".join(srt_lines)
2294 |
2295 |
2296 | def format_srt_time(seconds: float) -> str:
2297 | """Format seconds as SRT time: HH:MM:SS,mmm."""
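    # Example: 3661.5 seconds -> "01:01:01,500" (format_vtt_time below is identical except
    # that WebVTT uses a dot rather than a comma before the milliseconds).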
2298 | hours = int(seconds // 3600)
2299 | minutes = int((seconds % 3600) // 60)
2300 | sec_int = int(seconds % 60)
2301 | ms = int((seconds % 1) * 1000)
2302 | return f"{hours:02d}:{minutes:02d}:{sec_int:02d},{ms:03d}"
2303 |
2304 |
2305 | def generate_vtt(segments: List[Dict[str, Any]]) -> str:
2306 | """Generate WebVTT format from segments."""
2307 | vtt_lines = ["WEBVTT", ""]
2308 |
2309 | for segment in segments:
2310 | # Convert times to VTT format (HH:MM:SS.mmm)
2311 | start_time = segment.get("start", 0)
2312 | end_time = segment.get("end", 0)
2313 |
2314 | start_str = format_vtt_time(start_time)
2315 | end_str = format_vtt_time(end_time)
2316 |
2317 | # Format text
2318 | text = segment.get("text", "").replace("\n", " ")
2319 |
2320 | # Add speaker if available
2321 | if "speaker" in segment and segment["speaker"]:
2322 | text = f"<v {segment['speaker']}>{text}</v>"
2323 |
2324 | # Add to VTT
2325 | vtt_lines.append(f"{start_str} --> {end_str}")
2326 | vtt_lines.append(f"{text}")
2327 | vtt_lines.append("") # Empty line between entries
2328 |
2329 | return "\n".join(vtt_lines)
2330 |
2331 |
2332 | def format_vtt_time(seconds: float) -> str:
2333 | """Format seconds as WebVTT time: HH:MM:SS.mmm."""
2334 | hours = int(seconds // 3600)
2335 | minutes = int((seconds % 3600) // 60)
2336 | sec_fractional = seconds % 60
2337 | return f"{hours:02d}:{minutes:02d}:{sec_fractional:06.3f}"
2338 |
2339 |
2340 | def generate_markdown(transcript: str, metadata: Dict[str, Any]) -> str:
2341 | """Generate Markdown format for the transcript."""
2342 | md_lines = []
2343 |
2344 | # Add title
2345 | if "title" in metadata and metadata["title"]:
2346 | md_lines.append(f"# {metadata['title']}")
2347 | md_lines.append("")
2348 | else:
2349 | md_lines.append("# Transcript")
2350 | md_lines.append("")
2351 |
2352 | # Add metadata section
2353 | md_lines.append("## Metadata")
2354 | md_lines.append("")
2355 |
2356 | if "language" in metadata and metadata["language"]:
2357 | md_lines.append(f"- **Language:** {metadata['language']}")
2358 |
2359 | if "duration" in metadata and metadata["duration"]:
2360 | duration_min = int(metadata["duration"] // 60)
2361 | duration_sec = int(metadata["duration"] % 60)
2362 | md_lines.append(f"- **Duration:** {duration_min} min {duration_sec} sec")
2363 |
2364 | if "topics" in metadata and metadata["topics"]:
2365 | topics_str = ", ".join(metadata["topics"])
2366 | md_lines.append(f"- **Topics:** {topics_str}")
2367 |
2368 | md_lines.append("")
2369 | md_lines.append("## Transcript")
2370 | md_lines.append("")
2371 |
2372 | # Add transcript with proper line breaks preserved
2373 | for line in transcript.split("\n"):
2374 | md_lines.append(line)
2375 |
2376 | return "\n".join(md_lines)
2377 |
2378 |
2379 | def generate_docx(docx_path: str, transcript: str, metadata: Dict[str, Any]) -> None:
2380 | """Generate DOCX format for the transcript."""
2381 | # Must be run in a ThreadPoolExecutor since python-docx is not async
2382 | doc = Document()
2383 |
2384 | # Add title
2385 |     if "title" in metadata and metadata["title"]:
2386 |         doc.add_heading(metadata["title"], 0)
2387 |     else:
2388 |         doc.add_heading("Transcript", 0)
2389 |
2390 | # Add metadata section
2391 | doc.add_heading("Metadata", 1)
2392 |
2393 | if "language" in metadata and metadata["language"]:
2394 | doc.add_paragraph(f"Language: {metadata['language']}")
2395 |
2396 | if "duration" in metadata and metadata["duration"]:
2397 | duration_min = int(metadata["duration"] // 60)
2398 | duration_sec = int(metadata["duration"] % 60)
2399 | doc.add_paragraph(f"Duration: {duration_min} min {duration_sec} sec")
2400 |
2401 | if "topics" in metadata and metadata["topics"]:
2402 | topics_str = ", ".join(metadata["topics"])
2403 | doc.add_paragraph(f"Topics: {topics_str}")
2404 |
2405 | # Add transcript
2406 | doc.add_heading("Transcript", 1)
2407 |
2408 | # Split into paragraphs and add
2409 |     for paragraph in transcript.split("\n\n"):
2410 |         if paragraph.strip():
2411 |             # Handle section headings first so they don't leave an empty paragraph behind
2412 |             if paragraph.startswith("##"):
2413 |                 parts = paragraph.split(" ", 1)
2414 |                 if len(parts) > 1:
2415 |                     doc.add_heading(parts[1], 2)
2416 |                     continue
2417 | 
2418 |             # Regular paragraph
2419 |             p = doc.add_paragraph()
2420 |             p.add_run(paragraph)
2422 |
2423 | # Save the document
2424 | doc.save(docx_path)
2425 |
2426 |
2427 | async def chat_with_transcript(
2428 | transcript: str,
2429 | query: str,
2430 | provider: str = Provider.ANTHROPIC.value,
2431 | model: Optional[str] = None,
2432 | context: Optional[str] = None
2433 | ) -> Dict[str, Any]:
2434 | """Chat with a transcript to extract information or answer questions about its content.
2435 |
2436 | Args:
2437 | transcript: The transcript text to analyze
2438 | query: The question or instruction to process regarding the transcript
2439 | provider: LLM provider to use (default: Anthropic)
2440 | model: Specific model to use (default: provider's default model)
2441 | context: Optional additional context about the audio/transcript
2442 |
2443 | Returns:
2444 | A dictionary containing the response and related metadata
2445 | """
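    # Minimal usage sketch (the transcript text and question are hypothetical):
    #   answer = await chat_with_transcript(transcript_text, "Summarize the key decisions.")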
2446 | if not transcript or not isinstance(transcript, str):
2447 | raise ToolInputError("Transcript must be a non-empty string.")
2448 |
2449 | if not query or not isinstance(query, str):
2450 | raise ToolInputError("Query must be a non-empty string.")
2451 |
2452 | # Calculate token count for logging
2453 | try:
2454 | transcript_tokens = count_tokens(transcript, model)
2455 | query_tokens = count_tokens(query, model)
2456 | logger.info(
2457 | f"Transcript: {transcript_tokens} tokens, Query: {query_tokens} tokens",
2458 | emoji_key=TaskType.CHAT.value
2459 | )
2460 | except Exception as e:
2461 | logger.warning(f"Failed to count tokens: {e}", emoji_key="warning")
2462 |
2463 | # Build the prompt
2464 | system_prompt = """You are an expert at analyzing transcripts and extracting information.
2465 | Provide concise, accurate answers based solely on the provided transcript.
2466 | If the answer is not in the transcript, say so clearly."""
2467 |
2468 | if context:
2469 | system_prompt += f"\n\nAdditional context about this transcript: {context}"
2470 |
2471 | # Get provider instance to ensure it exists and is available
2472 | try:
2473 | provider_instance = await get_provider(provider)
2474 | if model is None:
2475 | # Check if the provider has a default model or use claude-3-7-sonnet as fallback
2476 | default_models = await provider_instance.list_models()
2477 | if default_models and len(default_models) > 0:
2478 | model = default_models[0].get("id")
2479 | else:
2480 | model = "claude-3-7-sonnet-20250219" if provider == Provider.ANTHROPIC.value else None
2481 |
2482 | logger.info(f"Using model: {provider}/{model}", emoji_key="model")
2483 | except Exception as e:
2484 | raise ProviderError(
2485 | f"Failed to initialize provider '{provider}': {str(e)}",
2486 | provider=provider,
2487 | cause=e
2488 | ) from e
2489 |
2490 | # Use relative file paths for any file references
2491 | rel_transcript_path = None
2492 | if os.path.exists(transcript):
2493 | rel_transcript_path = Path(transcript).relative_to(Path.cwd()) # noqa: F841
2494 |
2495 | # Create the message with the transcript and query
2496 | user_message = f"""Here is a transcript to analyze:
2497 |
2498 | ---TRANSCRIPT BEGIN---
2499 | {transcript}
2500 | ---TRANSCRIPT END---
2501 |
2502 | {query}"""
2503 |
2504 | # Send to LLM
2505 | result = await chat_completion(
2506 | messages=[{"role": "user", "content": user_message}],
2507 | provider=provider,
2508 | model=model,
2509 | system_prompt=system_prompt,
2510 | temperature=0.1
2511 | )
2512 |
2513 | return result
2514 |
2515 |
2516 | @with_cache(ttl=24 * 60 * 60) # Cache results for 24 hours
2517 | @with_tool_metrics
2518 | @with_retry(max_retries=1, retry_delay=1.0)
2519 | @with_error_handling
2520 | async def extract_audio_transcript_key_points(
2521 | file_path_or_transcript: str,
2522 | is_file: bool = True,
2523 | provider: str = Provider.ANTHROPIC.value,
2524 | model: Optional[str] = None,
2525 | max_points: int = 10
2526 | ) -> Dict[str, Any]:
2527 | """Extracts the most important key points from an audio transcript.
2528 |
2529 | This tool can process either an audio file (which it will transcribe first)
2530 | or directly analyze an existing transcript to identify the most important
2531 | information, main topics, and key takeaways.
2532 |
2533 | Args:
2534 | file_path_or_transcript: Path to audio file or transcript text content
2535 | is_file: Whether the input is a file path (True) or transcript text (False)
2536 | provider: LLM provider to use for analysis
2537 | model: Specific model to use (provider default if None)
2538 | max_points: Maximum number of key points to extract
2539 |
2540 | Returns:
2541 | A dictionary containing:
2542 | {
2543 | "key_points": ["Point 1", "Point 2", ...],
2544 | "summary": "Brief summary of the content",
2545 | "topics": ["Topic 1", "Topic 2", ...],
2546 | "speakers": ["Speaker 1", "Speaker 2", ...] (if multiple speakers detected),
2547 | "tokens": { statistics about token usage },
2548 | "cost": estimated cost of the operation,
2549 | "processing_time": total processing time in seconds
2550 | }
2551 | """
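    # Usage sketches (the file name is hypothetical):
    #   points = await extract_audio_transcript_key_points("meeting.mp3", max_points=5)
    #   points = await extract_audio_transcript_key_points(raw_text, is_file=False)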
2552 | start_time = time.time()
2553 |
2554 | # Get transcript from file or use provided text
2555 | transcript = ""
2556 | if is_file:
2557 | try:
2558 | # Validate file path
2559 | file_path = os.path.abspath(os.path.expanduser(file_path_or_transcript))
2560 | if not os.path.exists(file_path):
2561 | raise ToolInputError(f"File not found: {file_path}")
2562 |
2563 | # Get file info for logging
2564 | file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
2565 | file_name = Path(file_path).name
2566 |
2567 | logger.info(
2568 | f"Extracting key points from audio file: {file_name} ({file_size_mb:.2f} MB)",
2569 | emoji_key="audio"
2570 | )
2571 |
2572 | # Transcribe audio
2573 | transcription_result = await transcribe_audio(file_path, {
2574 | "enhance_audio": True,
2575 | "enhance_transcript": True,
2576 | "output_formats": ["json"]
2577 | })
2578 |
2579 | transcript = transcription_result.get("enhanced_transcript", "")
2580 | if not transcript:
2581 | transcript = transcription_result.get("raw_transcript", "")
2582 |
2583 | if not transcript:
2584 | raise ToolError("Failed to generate transcript from audio")
2585 |
2586 | except Exception as e:
2587 | if isinstance(e, (ToolError, ToolInputError)):
2588 | raise
2589 | raise ToolError(f"Failed to process audio file: {str(e)}") from e
2590 | else:
2591 | # Input is already a transcript
2592 | transcript = file_path_or_transcript
2593 | if not transcript or not isinstance(transcript, str):
2594 | raise ToolInputError("Transcript text must be a non-empty string")
2595 |
2596 | # Calculate token count for the transcript
2597 | try:
2598 | token_count = count_tokens(transcript, model)
2599 | logger.info(f"Transcript token count: {token_count}", emoji_key="tokens")
2600 | except Exception as e:
2601 | logger.warning(f"Failed to count tokens: {e}")
2602 |
2603 | # Create prompt for key points extraction
2604 | prompt = f"""Extract the most important key points from this transcript.
2605 |
2606 | Identify:
2607 | 1. The {max_points} most important key points or takeaways
2608 | 2. Main topics discussed
2609 | 3. Any speakers or main entities mentioned (if identifiable)
2610 | 4. A brief summary (2-3 sentences max)
2611 |
2612 | Format your response as JSON with these fields:
2613 | {{
2614 | "key_points": ["Point 1", "Point 2", ...],
2615 | "topics": ["Topic 1", "Topic 2", ...],
2616 | "speakers": ["Speaker 1", "Speaker 2", ...],
2617 | "summary": "Brief summary here"
2618 | }}
2619 |
2620 | TRANSCRIPT:
2621 | {transcript}
2622 | """
2623 |
2624 | # Get provider instance
2625 | try:
2626 | provider_instance = await get_provider(provider) # noqa: F841
2627 | except Exception as e:
2628 | raise ProviderError(
2629 | f"Failed to initialize provider '{provider}': {str(e)}",
2630 | provider=provider,
2631 | cause=e
2632 | ) from e
2633 |
2634 | # Generate completion
2635 | try:
2636 | completion_result = await generate_completion(
2637 | prompt=prompt,
2638 | provider=provider,
2639 | model=model,
2640 | temperature=0.1,
2641 | max_tokens=1000
2642 | )
2643 |
2644 | # Parse JSON response
2645 | response_text = completion_result.get("text", "")
2646 |
2647 | # Find JSON in the response
2648 | json_match = re.search(r'({[\s\S]*})', response_text)
2649 | if json_match:
2650 | try:
2651 | extracted_data = json.loads(json_match.group(1))
2652 | except json.JSONDecodeError:
2653 | # Fallback to regex extraction
2654 | extracted_data = _extract_key_points_with_regex(response_text)
2655 | else:
2656 | # Fallback to regex extraction
2657 | extracted_data = _extract_key_points_with_regex(response_text)
2658 |
2659 | processing_time = time.time() - start_time
2660 |
2661 | # Add token and cost info
2662 | result = {
2663 | "key_points": extracted_data.get("key_points", []),
2664 | "topics": extracted_data.get("topics", []),
2665 | "speakers": extracted_data.get("speakers", []),
2666 | "summary": extracted_data.get("summary", ""),
2667 | "tokens": completion_result.get("tokens", {"input": 0, "output": 0, "total": 0}),
2668 | "cost": completion_result.get("cost", 0.0),
2669 | "processing_time": processing_time
2670 | }
2671 |
2672 | return result
2673 |
2674 | except Exception as e:
2675 | error_model = model or f"{provider}/default"
2676 | raise ProviderError(
2677 | f"Key points extraction failed for model '{error_model}': {str(e)}",
2678 | provider=provider,
2679 | model=error_model,
2680 | cause=e
2681 | ) from e
2682 |
2683 |
2684 | def _extract_key_points_with_regex(text: str) -> Dict[str, Any]:
2685 | """Extract key points data using regex when JSON parsing fails."""
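    # Example: if the model replies with plain bullets such as "- Ship v2 next week" instead
    # of JSON, the bullet/numbered-list fallback below still captures them (capped at 10).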
2686 | result = {
2687 | "key_points": [],
2688 | "topics": [],
2689 | "speakers": [],
2690 | "summary": ""
2691 | }
2692 |
2693 | # Extract key points
2694 | key_points_pattern = r'key_points"?\s*:?\s*\[\s*"([^"]+)"(?:\s*,\s*"([^"]+)")*\s*\]'
2695 | key_points_match = re.search(key_points_pattern, text, re.IGNORECASE | re.DOTALL)
2696 | if key_points_match:
2697 | point_list = re.findall(r'"([^"]+)"', key_points_match.group(0))
2698 | # Filter out empty strings
2699 | result["key_points"] = [p for p in point_list if p.strip()]
2700 | else:
2701 | # Try alternative pattern for non-JSON format
2702 | point_list = re.findall(r'(?:^|\n)(?:•|\*|-|[0-9]+\.)\s*([^\n]+?)(?:\n|$)', text)
2703 | # Filter out empty strings
2704 | result["key_points"] = [p.strip() for p in point_list if p.strip()][:10] # Limit to 10 points
2705 |
2706 | # Extract topics
2707 | topics_pattern = r'topics"?\s*:?\s*\[\s*"([^"]+)"(?:\s*,\s*"([^"]+)")*\s*\]'
2708 | topics_match = re.search(topics_pattern, text, re.IGNORECASE | re.DOTALL)
2709 | if topics_match:
2710 | topic_list = re.findall(r'"([^"]+)"', topics_match.group(0))
2711 | # Filter out empty strings
2712 | result["topics"] = [t for t in topic_list if t.strip()]
2713 |
2714 | # Extract speakers
2715 | speakers_pattern = r'speakers"?\s*:?\s*\[\s*"([^"]+)"(?:\s*,\s*"([^"]+)")*\s*\]'
2716 | speakers_match = re.search(speakers_pattern, text, re.IGNORECASE | re.DOTALL)
2717 | if speakers_match:
2718 | speaker_list = re.findall(r'"([^"]+)"', speakers_match.group(0))
2719 | # Filter out empty strings
2720 | result["speakers"] = [s for s in speaker_list if s.strip()]
2721 |
2722 | # Extract summary
2723 | summary_pattern = r'summary"?\s*:?\s*"([^"]+)"'
2724 | summary_match = re.search(summary_pattern, text, re.IGNORECASE)
2725 | if summary_match and summary_match.group(1).strip():
2726 | result["summary"] = summary_match.group(1).strip()
2727 |
2728 | return result
```