feat: Add streaming Ollama support, model caching, and UI improvements
- Add streaming summarization via Ollama API (stream_summarize_with_ollama; sketched below)
- Cache ML models with @st.cache_resource (diarization, NER, translation, Whisper)
- Add temp file cleanup for extracted audio
- Add system capabilities detection (FFmpeg, GPU info)
- Add get_video_duration utility
- Improve validation with FFmpeg check
- Rewrite app.py with streaming support and UI enhancements
- Clean up redundant comments and unused imports across all utils
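The hunk below covers only the transcription util, so stream_summarize_with_ollama itself is not shown. As a hedged illustration, here is a minimal sketch of what a streaming call against Ollama's /api/generate endpoint can look like; the function body, default model name, host, and prompt wording are assumptions, not code from this commit:

    # Hypothetical sketch only: the real stream_summarize_with_ollama from this
    # commit is not shown in this excerpt; model, host, and prompt are assumed.
    import json
    import requests

    def stream_summarize_with_ollama(text, model="llama3", host="http://localhost:11434"):
        """Yield summary text chunks as Ollama streams them back."""
        payload = {
            "model": model,
            "prompt": f"Summarize the following transcript:\n\n{text}",
            "stream": True,  # Ollama replies with newline-delimited JSON objects
        }
        with requests.post(f"{host}/api/generate", json=payload, stream=True, timeout=300) as resp:
            resp.raise_for_status()
            for line in resp.iter_lines():
                if not line:
                    continue
                chunk = json.loads(line)
                if chunk.get("done"):  # final object carries done=true and no text
                    break
                yield chunk.get("response", "")

A generator in this shape can be handed to Streamlit's st.write_stream (available in recent Streamlit releases) so the summary renders incrementally in app.py.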
@@ -1,31 +1,36 @@
import whisper
from pathlib import Path
from transformers import pipeline, AutoTokenizer
from utils.audio_processing import extract_audio
from utils.summarization import summarize_text
import logging
import torch
import streamlit as st

# Try to import GPU utilities, but don't fail if not available
try:
    from utils.gpu_utils import configure_gpu, get_optimal_device
    GPU_UTILS_AVAILABLE = True
except ImportError:
    GPU_UTILS_AVAILABLE = False

# Try to import caching utilities, but don't fail if not available
try:
    from utils.cache import load_from_cache, save_to_cache
    CACHE_AVAILABLE = True
except ImportError:
    CACHE_AVAILABLE = False

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

WHISPER_MODEL = "base"


@st.cache_resource
def _load_whisper_model(model_name, device_str):
    """Load and cache a Whisper model. Cached across reruns."""
    logger.info(f"Loading Whisper model: {model_name} on {device_str}")
    device = torch.device(device_str)
    return whisper.load_model(model_name, device=device if device.type != "mps" else "cpu")


def transcribe_audio(audio_path: Path, model=WHISPER_MODEL, use_cache=True, cache_max_age=None,
                     use_gpu=True, memory_fraction=0.8):
    """
@@ -44,38 +49,30 @@ def transcribe_audio(audio_path: Path, model=WHISPER_MODEL, use_cache=True, cach
    """
    audio_path = Path(audio_path)

    # Check cache first if enabled
    if use_cache and CACHE_AVAILABLE:
        cached_data = load_from_cache(audio_path, model, "transcribe", cache_max_age)
        if cached_data:
            logger.info(f"Using cached transcription for {audio_path}")
            return cached_data.get("segments", []), cached_data.get("transcript", "")

    # Extract audio if the input is a video file (M4A is already audio)
    video_extensions = ['.mp4', '.avi', '.mov', '.mkv']
    if audio_path.suffix.lower() in video_extensions:
        audio_path = extract_audio(audio_path)

    # Configure GPU if available and requested
    device = torch.device("cpu")
    if use_gpu and GPU_UTILS_AVAILABLE:
        gpu_config = configure_gpu(model, memory_fraction)
        device = gpu_config["device"]
    logger.info(f"Using device: {device} for transcription")

    # Load the specified Whisper model
    logger.info(f"Loading Whisper model: {model}")
-   whisper_model = whisper.load_model(model, device=device if device.type != "mps" else "cpu")
+   whisper_model = _load_whisper_model(model, str(device))

    # Transcribe the audio
    logger.info(f"Transcribing audio: {audio_path}")
    result = whisper_model.transcribe(str(audio_path))

    # Extract the full transcript and segments
    transcript = result["text"]
    segments = result["segments"]

    # Cache the results if caching is enabled
    if use_cache and CACHE_AVAILABLE:
        cache_data = {
            "transcript": transcript,
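For reference, a caller-side sketch of the transcribe_audio API shown above. The module path is an assumption (the filename is not visible in this excerpt), and the (segments, transcript) return order is inferred from the cached-path return in the hunk:

    # Hedged usage sketch; module path assumed, return order inferred from
    # the cached-path return above.
    from pathlib import Path

    from utils.transcription import transcribe_audio  # path assumed

    segments, transcript = transcribe_audio(Path("talk.mp4"), model="base", use_gpu=True)
    print(transcript[:200])
    for seg in segments[:3]:
        # Whisper segments carry start/end timestamps in seconds plus the text
        print(f'{seg["start"]:.1f}-{seg["end"]:.1f}: {seg["text"]}')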