feat: Add streaming Ollama support, model caching, and UI improvements

- Add streaming summarization via Ollama API (stream_summarize_with_ollama)

- Cache ML models with @st.cache_resource (diarization, NER, translation, Whisper); a minimal sketch of this pattern follows the list

- Add temp file cleanup for extracted audio

- Add system capabilities detection (FFmpeg, GPU info)

- Add get_video_duration utility

- Improve validation with FFmpeg check

- Rewrite app.py with streaming support and UI enhancements (a consumption sketch follows the diff at the end of this page)

- Clean up redundant comments and unused imports across all utils
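
The @st.cache_resource pattern mentioned above generally looks like the minimal sketch below; the function name and model choice are illustrative assumptions, not code from this commit:

import streamlit as st
import whisper  # openai-whisper package, assumed here for illustration

@st.cache_resource
def load_whisper_model(size: str = "base"):
    # Loaded once per process; Streamlit reruns reuse the cached object
    # instead of reloading the model on every script execution.
    return whisper.load_model(size)

model = load_whisper_model()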
Author: Your Name
Date: 2026-02-18 10:26:09 -05:00
Commit: 70c5d32413 (parent: ce398ae1d4)
10 changed files with 998 additions and 707 deletions


@@ -1,6 +1,6 @@
 """
 Ollama integration for local AI model inference.
-Provides functions to use Ollama's API for text summarization.
+Provides functions to use Ollama's API for text summarization with streaming support.
 """
 import requests
@@ -9,21 +9,14 @@ import logging
 from pathlib import Path
 import os
-# Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 # Default Ollama API endpoint - configurable via environment variable
 OLLAMA_API_URL = os.environ.get("OLLAMA_API_URL", "http://localhost:11434/api")
 
 def check_ollama_available():
-    """
-    Check if Ollama service is available.
-    Returns:
-        bool: True if Ollama is available, False otherwise
-    """
+    """Check if Ollama service is available."""
     try:
         response = requests.get(f"{OLLAMA_API_URL}/tags", timeout=2)
         return response.status_code == 200
@@ -32,12 +25,7 @@ def check_ollama_available():
 def list_available_models():
-    """
-    List available models in Ollama.
-    Returns:
-        list: List of available model names
-    """
+    """List available models in Ollama."""
     try:
         response = requests.get(f"{OLLAMA_API_URL}/tags")
         if response.status_code == 200:
@@ -50,32 +38,14 @@ def list_available_models():
 def summarize_with_ollama(text, model="llama3", max_length=150):
-    """
-    Summarize text using Ollama's local API.
-    Args:
-        text (str): Text to summarize
-        model (str): Ollama model to use
-        max_length (int): Maximum length of the summary
-    Returns:
-        str: Summarized text or None if failed
-    """
+    """Summarize text using Ollama's local API (non-streaming)."""
     if not check_ollama_available():
         logger.warning("Ollama service is not available")
         return None
 
-    # Check if the model is available
     available_models = list_available_models()
     if model not in available_models:
         logger.warning(f"Model {model} not available in Ollama. Available models: {available_models}")
         return None
 
-    # Prepare the prompt for summarization
     prompt = f"Summarize the following text in about {max_length} words:\n\n{text}"
 
     try:
-        # Make the API request
         response = requests.post(
             f"{OLLAMA_API_URL}/generate",
             json={
@@ -85,7 +55,7 @@ def summarize_with_ollama(text, model="llama3", max_length=150):
                 "options": {
                     "temperature": 0.3,
                     "top_p": 0.9,
-                    "max_tokens": max_length * 2 # Approximate token count
+                    "max_tokens": max_length * 2
                 }
             }
         )
@@ -101,23 +71,55 @@ def summarize_with_ollama(text, model="llama3", max_length=150):
         return None
 
-def chunk_and_summarize(text, model="llama3", chunk_size=4000, max_length=150):
+def stream_summarize_with_ollama(text, model="llama3", max_length=150):
     """
-    Chunk long text and summarize each chunk, then combine the summaries.
+    Summarize text using Ollama with streaming. Yields tokens as they arrive.
     Args:
         text (str): Text to summarize
         model (str): Ollama model to use
-        chunk_size (int): Maximum size of each chunk in characters
-        max_length (int): Maximum length of the final summary
-    Returns:
-        str: Combined summary or None if failed
+    Yields:
+        str: Individual response tokens
     """
+    if not check_ollama_available():
+        logger.warning("Ollama service is not available")
+        return
+
+    prompt = f"Summarize the following text in about {max_length} words:\n\n{text}"
+
+    try:
+        response = requests.post(
+            f"{OLLAMA_API_URL}/generate",
+            json={
+                "model": model,
+                "prompt": prompt,
+                "stream": True,
+                "options": {
+                    "temperature": 0.3,
+                    "top_p": 0.9,
+                    "max_tokens": max_length * 2
+                }
+            },
+            stream=True
+        )
+        if response.status_code == 200:
+            for line in response.iter_lines():
+                if line:
+                    data = json.loads(line)
+                    token = data.get('response', '')
+                    if token:
+                        yield token
+                    if data.get('done', False):
+                        break
+        else:
+            logger.error(f"Ollama API error: {response.status_code}")
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Error communicating with Ollama: {e}")
+
+def chunk_and_summarize(text, model="llama3", chunk_size=4000, max_length=150):
+    """Chunk long text and summarize each chunk, then combine."""
     if len(text) <= chunk_size:
         return summarize_with_ollama(text, model, max_length)
-    # Split text into chunks
     words = text.split()
     chunks = []
     current_chunk = []
@@ -135,7 +137,6 @@ def chunk_and_summarize(text, model="llama3", chunk_size=4000, max_length=150):
     if current_chunk:
         chunks.append(' '.join(current_chunk))
 
-    # Summarize each chunk
     chunk_summaries = []
     for i, chunk in enumerate(chunks):
         logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")
@@ -146,10 +147,55 @@ def chunk_and_summarize(text, model="llama3", chunk_size=4000, max_length=150):
     if not chunk_summaries:
         return None
 
-    # If there's only one chunk summary, return it
     if len(chunk_summaries) == 1:
         return chunk_summaries[0]
 
-    # Otherwise, combine the summaries and summarize again
     combined_summary = " ".join(chunk_summaries)
-    return summarize_with_ollama(combined_summary, model, max_length)
\ No newline at end of file
+    return summarize_with_ollama(combined_summary, model, max_length)
+
+def stream_chunk_and_summarize(text, model="llama3", chunk_size=4000, max_length=150):
+    """
+    Chunk and summarize with streaming on the final summary.
+    Chunk summaries are produced without streaming; only the final combination is streamed.
+    Yields:
+        str: Tokens from the final summary
+    """
+    if len(text) <= chunk_size:
+        yield from stream_summarize_with_ollama(text, model, max_length)
+        return
+
+    words = text.split()
+    chunks = []
+    current_chunk = []
+    current_length = 0
+    for word in words:
+        if current_length + len(word) + 1 <= chunk_size:
+            current_chunk.append(word)
+            current_length += len(word) + 1
+        else:
+            chunks.append(' '.join(current_chunk))
+            current_chunk = [word]
+            current_length = len(word) + 1
+    if current_chunk:
+        chunks.append(' '.join(current_chunk))
+
+    chunk_summaries = []
+    for i, chunk in enumerate(chunks):
+        logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")
+        summary = summarize_with_ollama(chunk, model, max_length // len(chunks))
+        if summary:
+            chunk_summaries.append(summary)
+
+    if not chunk_summaries:
+        return
+
+    if len(chunk_summaries) == 1:
+        yield chunk_summaries[0]
+        return
+
+    combined_summary = " ".join(chunk_summaries)
+    yield from stream_summarize_with_ollama(combined_summary, model, max_length)
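
Usage note: Ollama's streaming endpoint emits newline-delimited JSON objects such as {"response": "...", "done": false}, which is why stream_summarize_with_ollama parses each line and yields the "response" field. Below is a minimal caller-side sketch of how the rewritten app.py might consume these generators; the module name ollama_utils and the st.write_stream wiring are assumptions, not taken from this diff:

import streamlit as st
from ollama_utils import stream_chunk_and_summarize  # module name assumed

transcript = "..."  # transcript text produced earlier in the pipeline
# st.write_stream renders tokens as they arrive and returns the full string
summary = st.write_stream(
    stream_chunk_and_summarize(transcript, model="llama3", max_length=150)
)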