feat: Add streaming Ollama support, model caching, and UI improvements

- Add streaming summarization via Ollama API (stream_summarize_with_ollama); a consumption sketch follows below
- Cache ML models with @st.cache_resource (diarization, NER, translation, Whisper); see the caching sketch below
- Add temp file cleanup for extracted audio
- Add system capabilities detection (FFmpeg, GPU info)
- Add get_video_duration utility
- Improve validation with FFmpeg check
- Rewrite app.py with streaming support and UI enhancements
- Clean up redundant comments and unused imports across all utils
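The streaming entry point is a plain Python generator, so the app can render tokens as they arrive. A minimal sketch of how app.py might consume it (hypothetical snippet; the module name ollama_utils, the transcript source, and the placeholder-based rendering are assumptions, not code from this commit):

import streamlit as st
from ollama_utils import stream_summarize_with_ollama  # module name assumed

transcript_text = st.session_state.get("transcript", "")
placeholder = st.empty()
summary = ""
# Re-render the accumulated summary each time a new token arrives
for token in stream_summarize_with_ollama(transcript_text, model="llama3"):
    summary += token
    placeholder.markdown(summary)

On recent Streamlit versions, st.write_stream(stream_summarize_with_ollama(...)) achieves the same in a single call.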
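The model-caching change uses Streamlit's resource cache so heavyweight models load once per process instead of on every rerun. A sketch of the pattern with a hypothetical Whisper loader (the actual loader names in this commit may differ):

import streamlit as st
import whisper  # openai-whisper

@st.cache_resource
def load_whisper_model(name: str = "base"):
    # Executed once; subsequent calls return the cached model object
    return whisper.load_model(name)

model = load_whisper_model()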
@@ -1,6 +1,6 @@
 """
 Ollama integration for local AI model inference.
-Provides functions to use Ollama's API for text summarization.
+Provides functions to use Ollama's API for text summarization with streaming support.
 """

 import requests
@@ -9,21 +9,14 @@ import logging
-from pathlib import Path
 import os

-# Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 # Default Ollama API endpoint - configurable via environment variable
 OLLAMA_API_URL = os.environ.get("OLLAMA_API_URL", "http://localhost:11434/api")


 def check_ollama_available():
-    """
-    Check if Ollama service is available.
-
-    Returns:
-        bool: True if Ollama is available, False otherwise
-    """
+    """Check if Ollama service is available."""
     try:
         response = requests.get(f"{OLLAMA_API_URL}/tags", timeout=2)
         return response.status_code == 200
@@ -32,12 +25,7 @@ def check_ollama_available():


 def list_available_models():
-    """
-    List available models in Ollama.
-
-    Returns:
-        list: List of available model names
-    """
+    """List available models in Ollama."""
     try:
         response = requests.get(f"{OLLAMA_API_URL}/tags")
         if response.status_code == 200:
@@ -50,32 +38,14 @@ def list_available_models():


 def summarize_with_ollama(text, model="llama3", max_length=150):
-    """
-    Summarize text using Ollama's local API.
-
-    Args:
-        text (str): Text to summarize
-        model (str): Ollama model to use
-        max_length (int): Maximum length of the summary
-
-    Returns:
-        str: Summarized text or None if failed
-    """
+    """Summarize text using Ollama's local API (non-streaming)."""
     if not check_ollama_available():
         logger.warning("Ollama service is not available")
         return None

-    # Check if the model is available
-    available_models = list_available_models()
-    if model not in available_models:
-        logger.warning(f"Model {model} not available in Ollama. Available models: {available_models}")
-        return None
-
-    # Prepare the prompt for summarization
     prompt = f"Summarize the following text in about {max_length} words:\n\n{text}"

     try:
-        # Make the API request
         response = requests.post(
             f"{OLLAMA_API_URL}/generate",
             json={
@@ -85,7 +55,7 @@ def summarize_with_ollama(text, model="llama3", max_length=150):
             "options": {
                 "temperature": 0.3,
                 "top_p": 0.9,
-                "max_tokens": max_length * 2  # Approximate token count
+                "max_tokens": max_length * 2
             }
         }
     )
@@ -101,23 +71,55 @@ def summarize_with_ollama(text, model="llama3", max_length=150):
         return None


+def stream_summarize_with_ollama(text, model="llama3", max_length=150):
+    """
+    Summarize text using Ollama with streaming. Yields tokens as they arrive.
+
+    Yields:
+        str: Individual response tokens
+    """
+    if not check_ollama_available():
+        logger.warning("Ollama service is not available")
+        return
+
+    prompt = f"Summarize the following text in about {max_length} words:\n\n{text}"
+
+    try:
+        response = requests.post(
+            f"{OLLAMA_API_URL}/generate",
+            json={
+                "model": model,
+                "prompt": prompt,
+                "stream": True,
+                "options": {
+                    "temperature": 0.3,
+                    "top_p": 0.9,
+                    "max_tokens": max_length * 2
+                }
+            },
+            stream=True
+        )
+
+        if response.status_code == 200:
+            for line in response.iter_lines():
+                if line:
+                    data = json.loads(line)
+                    token = data.get('response', '')
+                    if token:
+                        yield token
+                    if data.get('done', False):
+                        break
+        else:
+            logger.error(f"Ollama API error: {response.status_code}")
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Error communicating with Ollama: {e}")
+
+
 def chunk_and_summarize(text, model="llama3", chunk_size=4000, max_length=150):
-    """
-    Chunk long text and summarize each chunk, then combine the summaries.
-
-    Args:
-        text (str): Text to summarize
-        model (str): Ollama model to use
-        chunk_size (int): Maximum size of each chunk in characters
-        max_length (int): Maximum length of the final summary
-
-    Returns:
-        str: Combined summary or None if failed
-    """
+    """Chunk long text and summarize each chunk, then combine."""
     if len(text) <= chunk_size:
         return summarize_with_ollama(text, model, max_length)

-    # Split text into chunks
     words = text.split()
     chunks = []
     current_chunk = []
@@ -135,7 +137,6 @@ def chunk_and_summarize(text, model="llama3", chunk_size=4000, max_length=150):
     if current_chunk:
         chunks.append(' '.join(current_chunk))

-    # Summarize each chunk
     chunk_summaries = []
     for i, chunk in enumerate(chunks):
         logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")
@@ -146,10 +147,55 @@ def chunk_and_summarize(text, model="llama3", chunk_size=4000, max_length=150):
     if not chunk_summaries:
         return None

-    # If there's only one chunk summary, return it
     if len(chunk_summaries) == 1:
         return chunk_summaries[0]

-    # Otherwise, combine the summaries and summarize again
     combined_summary = " ".join(chunk_summaries)
     return summarize_with_ollama(combined_summary, model, max_length)
+
+
+def stream_chunk_and_summarize(text, model="llama3", chunk_size=4000, max_length=150):
+    """
+    Chunk and summarize with streaming on the final summary.
+    Returns non-streaming chunk summaries, then streams the final combination.
+
+    Yields:
+        str: Tokens from the final summary
+    """
+    if len(text) <= chunk_size:
+        yield from stream_summarize_with_ollama(text, model, max_length)
+        return
+
+    words = text.split()
+    chunks = []
+    current_chunk = []
+    current_length = 0
+
+    for word in words:
+        if current_length + len(word) + 1 <= chunk_size:
+            current_chunk.append(word)
+            current_length += len(word) + 1
+        else:
+            chunks.append(' '.join(current_chunk))
+            current_chunk = [word]
+            current_length = len(word) + 1
+
+    if current_chunk:
+        chunks.append(' '.join(current_chunk))
+
+    chunk_summaries = []
+    for i, chunk in enumerate(chunks):
+        logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")
+        summary = summarize_with_ollama(chunk, model, max_length // len(chunks))
+        if summary:
+            chunk_summaries.append(summary)
+
+    if not chunk_summaries:
+        return
+
+    if len(chunk_summaries) == 1:
+        yield chunk_summaries[0]
+        return
+
+    combined_summary = " ".join(chunk_summaries)
+    yield from stream_summarize_with_ollama(combined_summary, model, max_length)
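For reference, both streaming functions are consumed identically since each is a generator; chunking only changes what happens before the final streamed pass. A quick usage sketch (assumes a local Ollama instance with the llama3 model pulled, ollama_utils as the module name, and a transcript.txt file on disk):

from ollama_utils import stream_chunk_and_summarize  # module name assumed

with open("transcript.txt") as f:
    long_transcript = f.read()

# Long inputs are summarized per chunk, then the combined summary streams token by token
for token in stream_chunk_and_summarize(long_transcript, model="llama3", chunk_size=4000):
    print(token, end="", flush=True)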