TalkEdit/utils/transcription.py

import whisper
from pathlib import Path
from utils.audio_processing import extract_audio
import logging
import torch
import streamlit as st

try:
    from utils.gpu_utils import configure_gpu, get_optimal_device
    GPU_UTILS_AVAILABLE = True
except ImportError:
    GPU_UTILS_AVAILABLE = False

try:
    from utils.cache import load_from_cache, save_to_cache
    CACHE_AVAILABLE = True
except ImportError:
    CACHE_AVAILABLE = False

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

WHISPER_MODEL = "base"
WHISPER_MODEL_SIZES = {
    "tiny": 75,
    "base": 140,
    "small": 460,
    "medium": 1500,
    "large": 2900,
    "large-v2": 2900,
    "large-v3": 2900,
}
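
# The sizes above are rough checkpoint footprints in MB (approximate, not an
# official table); they only feed the out-of-memory hint raised in
# _load_whisper_model below.
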
@st.cache_resource
def _load_whisper_model(model_name, device_str):
    """Load and cache a Whisper model. Cached across Streamlit reruns."""
    logger.info(f"Loading Whisper model: {model_name} on {device_str}")
    device = torch.device(device_str)
    try:
        # openai-whisper is known to misbehave on Apple's MPS backend, so fall
        # back to CPU when an MPS device is requested.
        return whisper.load_model(model_name, device=device if device.type != "mps" else "cpu")
    except (MemoryError, RuntimeError) as e:
        err_str = str(e).lower()
        if "out of memory" in err_str or "cannot allocate" in err_str or isinstance(e, MemoryError):
            size_mb = WHISPER_MODEL_SIZES.get(model_name, "unknown")
            raise MemoryError(
                f"Not enough memory to load Whisper '{model_name}' model (~{size_mb}MB). "
                f"Try a smaller model (tiny/base/small) or enable GPU acceleration."
            ) from e
        raise
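
# Note: st.cache_resource keys on the function arguments, so Streamlit keeps one
# model instance per (model_name, device_str) pair and shares it across reruns
# and sessions instead of reloading the weights each time.
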
def transcribe_audio(audio_path: Path, model=WHISPER_MODEL, use_cache=True, cache_max_age=None,
                     use_gpu=True, memory_fraction=0.8):
    """
    Transcribe audio using Whisper and return both segments and full transcript.

    Args:
        audio_path (Path): Path to the audio or video file
        model (str): Whisper model size to use (tiny, base, small, medium, large)
        use_cache (bool): Whether to use caching
        cache_max_age (float, optional): Maximum age of cache in seconds
        use_gpu (bool): Whether to use GPU acceleration if available
        memory_fraction (float): Fraction of GPU memory to use (0.0 to 1.0)

    Returns:
        tuple: (segments, transcript) where segments is a list of dicts with timing info
    """
    audio_path = Path(audio_path)
    source_path = audio_path  # key the cache on the file the caller passed in

    if use_cache and CACHE_AVAILABLE:
        cached_data = load_from_cache(source_path, model, "transcribe", cache_max_age)
        if cached_data:
            logger.info(f"Using cached transcription for {source_path}")
            return cached_data.get("segments", []), cached_data.get("transcript", "")

    # Pull the audio track out of video containers before transcribing.
    video_extensions = ['.mp4', '.avi', '.mov', '.mkv']
    if audio_path.suffix.lower() in video_extensions:
        audio_path = extract_audio(audio_path)

    device = torch.device("cpu")
    if use_gpu and GPU_UTILS_AVAILABLE:
        gpu_config = configure_gpu(model, memory_fraction)
        device = gpu_config["device"]
    logger.info(f"Using device: {device} for transcription")

    whisper_model = _load_whisper_model(model, str(device))
    logger.info(f"Transcribing audio: {audio_path}")
    result = whisper_model.transcribe(str(audio_path))

    transcript = result["text"]
    segments = result["segments"]

    if use_cache and CACHE_AVAILABLE:
        cache_data = {
            "transcript": transcript,
            "segments": segments,
        }
        # Save under the original source path so the cache lookup above finds
        # the entry on the next run, even when a video's audio was extracted.
        save_to_cache(source_path, cache_data, model, "transcribe")

    return segments, transcript
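

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: transcribe a file
    # given on the command line ("sample.mp4" is a hypothetical fallback name)
    # and print each segment with the start/end times Whisper reports.
    import sys

    target = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("sample.mp4")
    segs, text = transcribe_audio(target, model="base")
    for seg in segs:
        print(f"[{seg['start']:7.2f} - {seg['end']:7.2f}] {seg['text'].strip()}")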