feat: Add streaming Ollama support, model caching, and UI improvements

- Add streaming summarization via Ollama API (stream_summarize_with_ollama; see the sketch after this list)

- Cache ML models with @st.cache_resource (diarization, NER, translation, Whisper)

- Add temp file cleanup for extracted audio

- Add system capabilities detection (FFmpeg, GPU info; see the probe sketch below)

- Add get_video_duration utility (see the ffprobe sketch below)

- Improve validation with FFmpeg check

- Rewrite app.py with streaming support and UI enhancements

- Clean up redundant comments and unused imports across all utils
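
The streaming summarizer itself is not part of the file shown below. As a rough illustration, token-by-token summarization against Ollama's /api/generate endpoint could look like the following sketch; the endpoint and its newline-delimited JSON stream are standard Ollama behavior, while the function body, prompt wording, default model name, and host are assumptions.

import json
import requests

OLLAMA_URL = "http://localhost:11434/api/generate"  # default local Ollama endpoint (assumed host)

def stream_summarize_with_ollama(transcript, model="llama3"):
    """Yield summary text incrementally as Ollama streams it back."""
    payload = {
        "model": model,  # assumed default; any pulled Ollama model works
        "prompt": f"Summarize the following transcript:\n\n{transcript}",
        "stream": True,
    }
    with requests.post(OLLAMA_URL, json=payload, stream=True, timeout=300) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)  # Ollama emits one JSON object per line
            if chunk.get("done"):
                break
            yield chunk.get("response", "")

In a Streamlit app such a generator can be handed directly to st.write_stream (Streamlit 1.31+), which renders tokens as they arrive.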
Author: Your Name
Date: 2026-02-18 10:26:09 -05:00
parent ce398ae1d4
commit 70c5d32413
10 changed files with 998 additions and 707 deletions
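
The capabilities probe and the FFmpeg validation check also live outside this file. Below is a minimal sketch under the assumption that detection means "FFmpeg on PATH plus a PyTorch GPU query"; the function name and returned dict shape are hypothetical, while shutil.which and the torch calls are standard.

import shutil
import torch

def get_system_capabilities():
    """Probe the host for FFmpeg and GPU support."""
    caps = {
        "ffmpeg": shutil.which("ffmpeg") is not None,  # also usable as the validation check
        "cuda": torch.cuda.is_available(),
        "mps": torch.backends.mps.is_available(),  # Apple Silicon backend
    }
    if caps["cuda"]:
        caps["gpu_name"] = torch.cuda.get_device_name(0)
    return caps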

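The get_video_duration utility is likewise not in this diff. One common approach shells out to ffprobe, which ships with FFmpeg (already checked for by the app); the flags below are standard ffprobe options, and the function shape is an assumption.

import subprocess

def get_video_duration(video_path):
    """Return the duration of a video in seconds, or None on failure."""
    cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        str(video_path),
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        return float(result.stdout.strip())
    except (subprocess.CalledProcessError, ValueError, FileNotFoundError):
        return None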

@@ -1,41 +1,49 @@
 """
-Translation utilities for the OBS Recording Transcriber.
+Translation utilities for the Video Transcriber.
 Provides functions for language detection and translation.
 """
 import logging
 import torch
 from pathlib import Path
-from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, M2M100ForConditionalGeneration
+from transformers import pipeline, AutoTokenizer, M2M100ForConditionalGeneration
 import whisper
 import iso639
+import streamlit as st
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 # Try to import GPU utilities, but don't fail if not available
 try:
     from utils.gpu_utils import get_optimal_device
     GPU_UTILS_AVAILABLE = True
 except ImportError:
     GPU_UTILS_AVAILABLE = False
 
 # Default models
 TRANSLATION_MODEL = "facebook/m2m100_418M"
 LANGUAGE_DETECTION_MODEL = "papluca/xlm-roberta-base-language-detection"
 
-# ISO language code mapping
+@st.cache_resource
+def _load_language_detector(model_name, device_int):
+    """Load and cache the language detection pipeline."""
+    logger.info(f"Loading language detection model: {model_name}")
+    return pipeline("text-classification", model=model_name, device=device_int)
+
+@st.cache_resource
+def _load_translation_model(model_name, device_str):
+    """Load and cache the M2M100 translation model and tokenizer."""
+    logger.info(f"Loading translation model: {model_name} on {device_str}")
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = M2M100ForConditionalGeneration.from_pretrained(model_name)
+    device = torch.device(device_str)
+    model = model.to(device)
+    return model, tokenizer
+
 def get_language_name(code):
-    """
-    Get the language name from ISO code.
-
-    Args:
-        code (str): ISO language code
-
-    Returns:
-        str: Language name or original code if not found
-    """
+    """Get the language name from ISO code."""
     try:
         return iso639.languages.get(part1=code).name
     except (KeyError, AttributeError):
@@ -57,7 +65,6 @@ def detect_language(text, model=LANGUAGE_DETECTION_MODEL, use_gpu=True):
     Returns:
         tuple: (language_code, confidence)
     """
-    # Configure device
     device = torch.device("cpu")
     if use_gpu and GPU_UTILS_AVAILABLE:
         device = get_optimal_device()
@@ -66,25 +73,43 @@ def detect_language(text, model=LANGUAGE_DETECTION_MODEL, use_gpu=True):
         device_arg = -1
 
     try:
-        # Initialize the pipeline
-        classifier = pipeline("text-classification", model=model, device=device_arg)
+        classifier = _load_language_detector(model, device_arg)
 
-        # Truncate text if too long
         max_length = 512
         if len(text) > max_length:
             text = text[:max_length]
 
-        # Detect language
         result = classifier(text)[0]
-        language_code = result["label"]
-        confidence = result["score"]
-        return language_code, confidence
+        return result["label"], result["score"]
     except Exception as e:
         logger.error(f"Error detecting language: {e}")
         return None, 0.0
 
+def _translate_text_with_model(text, source_lang, target_lang, trans_model, tokenizer, device):
+    """Translate text using a pre-loaded model and tokenizer."""
+    tokenizer.src_lang = source_lang
+    max_length = 512
+    if len(text) > max_length:
+        chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
+    else:
+        chunks = [text]
+    translated_chunks = []
+    for chunk in chunks:
+        encoded = tokenizer(chunk, return_tensors="pt").to(device)
+        generated_tokens = trans_model.generate(
+            **encoded,
+            forced_bos_token_id=tokenizer.get_lang_id(target_lang),
+            max_length=max_length
+        )
+        translated_chunk = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+        translated_chunks.append(translated_chunk)
+    return " ".join(translated_chunks)
+
 def translate_text(text, source_lang=None, target_lang="en", model=TRANSLATION_MODEL, use_gpu=True):
     """
     Translate text from source language to target language.
@@ -99,7 +124,6 @@ def translate_text(text, source_lang=None, target_lang="en", model=TRANSLATION_MODEL, use_gpu=True):
     Returns:
         str: Translated text
     """
-    # Auto-detect source language if not provided
     if source_lang is None:
         detected_lang, confidence = detect_language(text, use_gpu=use_gpu)
         if detected_lang and confidence > 0.5:
@@ -109,50 +133,17 @@ def translate_text(text, source_lang=None, target_lang="en", model=TRANSLATION_MODEL, use_gpu=True):
             logger.warning("Could not reliably detect language, defaulting to English")
             source_lang = "en"
 
-    # Skip translation if source and target are the same
     if source_lang == target_lang:
         logger.info(f"Source and target languages are the same ({source_lang}), skipping translation")
         return text
 
-    # Configure device
     device = torch.device("cpu")
     if use_gpu and GPU_UTILS_AVAILABLE:
         device = get_optimal_device()
 
     try:
-        # Load model and tokenizer
-        tokenizer = AutoTokenizer.from_pretrained(model)
-        model = M2M100ForConditionalGeneration.from_pretrained(model)
-
-        # Move model to device
-        model = model.to(device)
-
-        # Prepare for translation
-        tokenizer.src_lang = source_lang
-
-        # Split text into manageable chunks if too long
-        max_length = 512
-        if len(text) > max_length:
-            chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
-        else:
-            chunks = [text]
-
-        # Translate each chunk
-        translated_chunks = []
-        for chunk in chunks:
-            encoded = tokenizer(chunk, return_tensors="pt").to(device)
-            generated_tokens = model.generate(
-                **encoded,
-                forced_bos_token_id=tokenizer.get_lang_id(target_lang),
-                max_length=max_length
-            )
-            translated_chunk = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
-            translated_chunks.append(translated_chunk)
-
-        # Combine translated chunks
-        translated_text = " ".join(translated_chunks)
-        return translated_text
+        trans_model, tokenizer = _load_translation_model(model, str(device))
+        return _translate_text_with_model(text, source_lang, target_lang, trans_model, tokenizer, device)
     except Exception as e:
         logger.error(f"Error translating text: {e}")
         return text
@@ -160,7 +151,7 @@ def translate_text(text, source_lang=None, target_lang="en", model=TRANSLATION_MODEL, use_gpu=True):
 def translate_segments(segments, source_lang=None, target_lang="en", use_gpu=True):
     """
-    Translate transcript segments.
+    Translate transcript segments. Loads the model once and reuses it for all segments.
 
     Args:
         segments (list): List of transcript segments
@@ -174,36 +165,32 @@ def translate_segments(segments, source_lang=None, target_lang="en", use_gpu=True):
     if not segments:
         return []
 
-    # Auto-detect source language from combined text if not provided
     if source_lang is None:
         combined_text = " ".join([segment["text"] for segment in segments])
         detected_lang, _ = detect_language(combined_text, use_gpu=use_gpu)
         source_lang = detected_lang if detected_lang else "en"
 
-    # Skip translation if source and target are the same
     if source_lang == target_lang:
         return segments
 
+    device = torch.device("cpu")
+    if use_gpu and GPU_UTILS_AVAILABLE:
+        device = get_optimal_device()
+
     try:
-        # Initialize translation pipeline
-        translated_segments = []
+        trans_model, tokenizer = _load_translation_model(TRANSLATION_MODEL, str(device))
 
-        # Translate each segment
+        translated_segments = []
         for segment in segments:
-            translated_text = translate_text(
-                segment["text"],
-                source_lang=source_lang,
-                target_lang=target_lang,
-                use_gpu=use_gpu
+            translated_text = _translate_text_with_model(
+                segment["text"], source_lang, target_lang, trans_model, tokenizer, device
             )
-            # Create a new segment with translated text
             translated_segment = segment.copy()
             translated_segment["text"] = translated_text
             translated_segment["original_text"] = segment["text"]
             translated_segment["source_lang"] = source_lang
             translated_segment["target_lang"] = target_lang
             translated_segments.append(translated_segment)
 
         return translated_segments
@@ -227,39 +214,33 @@ def transcribe_and_translate(audio_path, whisper_model="base", target_lang="en",
     Returns:
         tuple: (original_segments, translated_segments, original_transcript, translated_transcript)
     """
+    from utils.transcription import _load_whisper_model
+
     audio_path = Path(audio_path)
 
-    # Configure device
     device = torch.device("cpu")
     if use_gpu and GPU_UTILS_AVAILABLE:
         device = get_optimal_device()
 
     try:
-        # Step 1: Transcribe audio with Whisper
         logger.info(f"Transcribing audio with Whisper model: {whisper_model}")
-        model = whisper.load_model(whisper_model, device=device if device.type != "mps" else "cpu")
+        model = _load_whisper_model(whisper_model, str(device))
 
-        # Use Whisper's built-in language detection if requested
         if detect_source:
-            # First, detect language with Whisper
             audio = whisper.load_audio(str(audio_path))
             audio = whisper.pad_or_trim(audio)
             mel = whisper.log_mel_spectrogram(audio).to(device if device.type != "mps" else "cpu")
             _, probs = model.detect_language(mel)
             source_lang = max(probs, key=probs.get)
             logger.info(f"Whisper detected language: {get_language_name(source_lang)} ({source_lang})")
 
-            # Transcribe with detected language
             result = model.transcribe(str(audio_path), language=source_lang)
         else:
-            # Transcribe without language specification
             result = model.transcribe(str(audio_path))
             source_lang = result.get("language", "en")
 
         original_segments = result["segments"]
         original_transcript = result["text"]
 
-        # Step 2: Translate if needed
         if source_lang != target_lang:
             logger.info(f"Translating from {source_lang} to {target_lang}")
             translated_segments = translate_segments(
@@ -268,8 +249,6 @@ def transcribe_and_translate(audio_path, whisper_model="base", target_lang="en",
                 target_lang=target_lang,
                 use_gpu=use_gpu
             )
-
-            # Create full translated transcript
             translated_transcript = " ".join([segment["text"] for segment in translated_segments])
         else:
             logger.info(f"Source and target languages are the same ({source_lang}), skipping translation")
@@ -280,4 +259,4 @@ def transcribe_and_translate(audio_path, whisper_model="base", target_lang="en",
     except Exception as e:
         logger.error(f"Error in transcribe_and_translate: {e}")
-        return None, None, None, None
\ No newline at end of file
+        return None, None, None, None