feat: Add streaming Ollama support, model caching, and UI improvements

- Add streaming summarization via Ollama API (stream_summarize_with_ollama); see the sketch after this list

- Cache ML models with @st.cache_resource (diarization, NER, translation, Whisper)

- Add temp file cleanup for extracted audio

- Add system capabilities detection (FFmpeg, GPU info); a sketch of these helpers and get_video_duration follows the list

- Add get_video_duration utility

- Improve validation with FFmpeg check

- Rewrite app.py with streaming support and UI enhancements

- Clean up redundant comments and unused imports across all utils
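
The streaming summarizer itself is not part of the file shown below, so the call pattern is sketched here for context. This is a minimal sketch of how stream_summarize_with_ollama could drive Ollama's /api/generate streaming endpoint; the endpoint URL, prompt wording, and default model name are assumptions rather than code from this commit:

import json
import requests

OLLAMA_URL = "http://localhost:11434/api/generate"  # assumed default local Ollama endpoint


def stream_summarize_with_ollama(text, model="llama3.2", timeout=300):
    """Yield partial summary text as Ollama streams responses back (illustrative sketch)."""
    payload = {
        "model": model,
        "prompt": f"Summarize the following transcript:\n\n{text}",
        "stream": True,
    }
    with requests.post(OLLAMA_URL, json=payload, stream=True, timeout=timeout) as resp:
        resp.raise_for_status()
        # Ollama streams one JSON object per line until it sends "done": true.
        for line in resp.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)
            yield chunk.get("response", "")
            if chunk.get("done"):
                break

In a Streamlit UI a generator like this can be rendered incrementally (for example with st.write_stream); the actual wiring in app.py may differ.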
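
Likewise, the capability-detection and duration helpers are only named in this message. A rough sketch of what they plausibly look like, assuming ffmpeg/ffprobe on PATH and torch for GPU detection; apart from get_video_duration, the helper names are illustrative:

import shutil
import subprocess

import torch


def ffmpeg_available():
    """Return True if the ffmpeg binary can be found on PATH (illustrative name)."""
    return shutil.which("ffmpeg") is not None


def gpu_info():
    """Return the first CUDA device name, or None when no GPU is visible (illustrative name)."""
    return torch.cuda.get_device_name(0) if torch.cuda.is_available() else None


def get_video_duration(path):
    """Return media duration in seconds by asking ffprobe for format=duration."""
    result = subprocess.run(
        [
            "ffprobe", "-v", "error",
            "-show_entries", "format=duration",
            "-of", "default=noprint_wrappers=1:nokey=1",
            path,
        ],
        capture_output=True,
        text=True,
        check=True,
    )
    return float(result.stdout.strip())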
Author: Your Name
Date:   2026-02-18 10:26:09 -05:00
Commit: 70c5d32413 (parent ce398ae1d4)
10 changed files with 998 additions and 707 deletions


@@ -1,45 +1,49 @@
 from transformers import pipeline, AutoTokenizer
 import torch
 import logging
+import streamlit as st
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 SUMMARY_MODEL = "Falconsai/text_summarization"
 
+
+@st.cache_resource
+def _load_summarizer(device_int):
+    """Load and cache the summarization pipeline."""
+    logger.info(f"Loading summarization model on device {device_int}")
+    return pipeline("summarization", model=SUMMARY_MODEL, device=device_int)
+
+
+@st.cache_resource
+def _load_summary_tokenizer():
+    """Load and cache the summarization tokenizer."""
+    return AutoTokenizer.from_pretrained(SUMMARY_MODEL)
+
+
 def chunk_text(text, max_tokens, tokenizer):
     """
-    Splits the text into a list of chunks based on token limits.
-
-    Args:
-        text (str): Text to chunk
-        max_tokens (int): Maximum tokens per chunk
-        tokenizer (AutoTokenizer): Tokenizer to use
-
-    Returns:
-        list: List of text chunks
+    Splits text into chunks by tokenizing once, then splitting by token windows.
+    Much faster than the per-word tokenization approach.
     """
-    words = text.split()
+    all_ids = tokenizer(text, return_tensors='pt', truncation=False)['input_ids'][0]
+    content_ids = all_ids[1:-1]  # strip BOS/EOS
+    usable_max = max_tokens - 2  # leave room for special tokens
+
     chunks = []
-    current_chunk = []
-    current_length = 0
-
-    for word in words:
-        hypothetical_length = current_length + len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
-        if hypothetical_length <= max_tokens:
-            current_chunk.append(word)
-            current_length = hypothetical_length
-        else:
-            chunks.append(' '.join(current_chunk))
-            current_chunk = [word]
-            current_length = len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
-
-    if current_chunk:
-        chunks.append(' '.join(current_chunk))
+    for i in range(0, len(content_ids), usable_max):
+        chunk_ids = content_ids[i : i + usable_max]
+        decoded = tokenizer.decode(chunk_ids, skip_special_tokens=True).strip()
+        if decoded:
+            chunks.append(decoded)
+
+    if not chunks:
+        chunks.append(text)
     return chunks
 
 
 def summarize_text(text, use_gpu=True, memory_fraction=0.8):
     """
     Summarize text using a Hugging Face pipeline with chunking support.
@@ -52,21 +56,17 @@ def summarize_text(text, use_gpu=True, memory_fraction=0.8):
     Returns:
         str: Summarized text
     """
-    # Determine device
-    device = -1  # Default to CPU
+    device = -1
     if use_gpu and torch.cuda.is_available():
-        device = 0  # Use first GPU
-        if torch.cuda.is_available():
-            torch.cuda.set_per_process_memory_fraction(memory_fraction)
+        device = 0
+        torch.cuda.set_per_process_memory_fraction(memory_fraction)
 
     logger.info(f"Using device {device} for summarization")
 
     try:
-        # Initialize the pipeline and tokenizer
-        summarizer = pipeline("summarization", model=SUMMARY_MODEL, device=device)
-        tokenizer = AutoTokenizer.from_pretrained(SUMMARY_MODEL)
+        summarizer = _load_summarizer(device)
+        tokenizer = _load_summary_tokenizer()
 
-        # Check if text needs to be chunked
         max_tokens = 512
         tokens = tokenizer(text, return_tensors='pt')
        num_tokens = len(tokens['input_ids'][0])
@@ -85,7 +85,6 @@ def summarize_text(text, use_gpu=True, memory_fraction=0.8):
             )
             summaries.append(summary_output[0]['summary_text'])
 
-        # If multiple chunks, summarize the combined summaries
         if len(summaries) > 1:
             logger.info("Generating final summary from chunk summaries")
             combined_text = " ".join(summaries)
@@ -106,7 +105,6 @@ def summarize_text(text, use_gpu=True, memory_fraction=0.8):
     except Exception as e:
         logger.error(f"Error during summarization: {e}")
 
-        # Fallback to CPU if GPU fails
        if device != -1:
            logger.info("Falling back to CPU")
            return summarize_text(text, use_gpu=False, memory_fraction=memory_fraction)
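
For reference, the rewritten chunk_text from the first hunk can be exercised on its own. A small usage sketch with the same Falconsai tokenizer; exact chunk counts depend on the input, and decoding then re-encoding a window can shift its length by a token or two:

from transformers import AutoTokenizer

# chunk_text is the function from the summarization module shown above.
tokenizer = AutoTokenizer.from_pretrained("Falconsai/text_summarization")

transcript = "word " * 3000  # long enough to span several 512-token windows
chunks = chunk_text(transcript, max_tokens=512, tokenizer=tokenizer)

print(len(chunks), "chunks")
# Each chunk should re-encode to roughly the 512-token window used for splitting.
print(len(tokenizer(chunks[0])["input_ids"]), "tokens in the first chunk")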