feat: Add streaming Ollama support, model caching, and UI improvements

- Add streaming summarization via Ollama API (stream_summarize_with_ollama); a sketch of the streaming call follows this list

- Cache ML models with @st.cache_resource (diarization, NER, translation, Whisper)

- Add temp file cleanup for extracted audio (a minimal cleanup pattern is sketched below)

- Add system capabilities detection (FFmpeg, GPU info); see the detection sketch after this list

- Add get_video_duration utility (sketched below)

- Improve input validation with an FFmpeg availability check (covered by the detection sketch below)

- Rewrite app.py with streaming support and UI enhancements

- Clean up redundant comments and unused imports across all utils
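
The streaming summarizer itself is outside this excerpt. As a rough sketch of what stream_summarize_with_ollama might look like against Ollama's /api/generate endpoint (the function body, default model name, and prompt wording here are assumptions, not the committed code):

import json
import requests

def stream_summarize_with_ollama(text, model="llama3", host="http://localhost:11434"):
    """Yield summary tokens as Ollama produces them (sketch, not the committed code)."""
    payload = {
        "model": model,
        "prompt": f"Summarize the following transcript:\n\n{text}",
        "stream": True,
    }
    # With stream=True, Ollama returns newline-delimited JSON objects
    with requests.post(f"{host}/api/generate", json=payload, stream=True) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)
            if chunk.get("done"):
                break
            yield chunk.get("response", "")

In the Streamlit UI, a generator like this can be passed to st.write_stream to render the summary incrementally.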
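
The cleanup of extracted audio is also not shown here; one minimal pattern (helper name hypothetical) deletes the intermediate file only when it is not the user's original input:

from pathlib import Path

def cleanup_extracted_audio(audio_path: Path, original_path: Path) -> None:
    """Remove a temporary extracted-audio file, never the original upload (hypothetical helper)."""
    if Path(audio_path) != Path(original_path):
        Path(audio_path).unlink(missing_ok=True)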
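
For the capabilities detection and the FFmpeg validation check, a plausible shape (function and key names are assumptions):

import shutil
import torch

def get_system_capabilities() -> dict:
    """Report host capabilities: FFmpeg on PATH, CUDA/MPS availability (sketch)."""
    caps = {
        "ffmpeg": shutil.which("ffmpeg") is not None,
        "cuda": torch.cuda.is_available(),
        "mps": torch.backends.mps.is_available(),
    }
    if caps["cuda"]:
        caps["gpu_name"] = torch.cuda.get_device_name(0)
    return caps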
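
get_video_duration is likewise not shown; assuming it shells out to ffprobe, it could look like:

import subprocess
from pathlib import Path

def get_video_duration(video_path: Path) -> float:
    """Return media duration in seconds via ffprobe (sketch; assumes ffprobe is on PATH)."""
    result = subprocess.run(
        ["ffprobe", "-v", "error", "-show_entries", "format=duration",
         "-of", "default=noprint_wrappers=1:nokey=1", str(video_path)],
        capture_output=True, text=True, check=True,
    )
    return float(result.stdout.strip())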

commit 70c5d32413 (parent ce398ae1d4)
Author: Your Name
Date: 2026-02-18 10:26:09 -05:00
10 changed files with 998 additions and 707 deletions


@@ -1,31 +1,36 @@
 import whisper
 from pathlib import Path
 from transformers import pipeline, AutoTokenizer
 from utils.audio_processing import extract_audio
 from utils.summarization import summarize_text
 import logging
 import torch
+import streamlit as st

 # Try to import GPU utilities, but don't fail if not available
 try:
     from utils.gpu_utils import configure_gpu, get_optimal_device
     GPU_UTILS_AVAILABLE = True
 except ImportError:
     GPU_UTILS_AVAILABLE = False

 # Try to import caching utilities, but don't fail if not available
 try:
     from utils.cache import load_from_cache, save_to_cache
     CACHE_AVAILABLE = True
 except ImportError:
     CACHE_AVAILABLE = False

 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 WHISPER_MODEL = "base"

+@st.cache_resource
+def _load_whisper_model(model_name, device_str):
+    """Load and cache a Whisper model. Cached across reruns."""
+    logger.info(f"Loading Whisper model: {model_name} on {device_str}")
+    device = torch.device(device_str)
+    return whisper.load_model(model_name, device=device if device.type != "mps" else "cpu")
+
 def transcribe_audio(audio_path: Path, model=WHISPER_MODEL, use_cache=True, cache_max_age=None,
                      use_gpu=True, memory_fraction=0.8):
     """
@@ -44,38 +49,30 @@ def transcribe_audio(audio_path: Path, model=WHISPER_MODEL, use_cache=True, cache_max_age=None,
     """
     audio_path = Path(audio_path)

     # Check cache first if enabled
     if use_cache and CACHE_AVAILABLE:
         cached_data = load_from_cache(audio_path, model, "transcribe", cache_max_age)
         if cached_data:
             logger.info(f"Using cached transcription for {audio_path}")
             return cached_data.get("segments", []), cached_data.get("transcript", "")

     # Extract audio if the input is a video file (M4A is already audio)
     video_extensions = ['.mp4', '.avi', '.mov', '.mkv']
     if audio_path.suffix.lower() in video_extensions:
         audio_path = extract_audio(audio_path)

     # Configure GPU if available and requested
     device = torch.device("cpu")
     if use_gpu and GPU_UTILS_AVAILABLE:
         gpu_config = configure_gpu(model, memory_fraction)
         device = gpu_config["device"]
         logger.info(f"Using device: {device} for transcription")

-    # Load the specified Whisper model
-    logger.info(f"Loading Whisper model: {model}")
-    whisper_model = whisper.load_model(model, device=device if device.type != "mps" else "cpu")
+    whisper_model = _load_whisper_model(model, str(device))

     # Transcribe the audio
     logger.info(f"Transcribing audio: {audio_path}")
     result = whisper_model.transcribe(str(audio_path))

     # Extract the full transcript and segments
     transcript = result["text"]
     segments = result["segments"]

     # Cache the results if caching is enabled
     if use_cache and CACHE_AVAILABLE:
         cache_data = {
             "transcript": transcript,