feat: Add streaming Ollama support, model caching, and UI improvements

- Add streaming summarization via Ollama API (stream_summarize_with_ollama); a consumption sketch follows below
- Cache ML models with @st.cache_resource (diarization, NER, translation, Whisper); see the caching sketch below
- Add temp file cleanup for extracted audio
- Add system capabilities detection (FFmpeg, GPU info)
- Add get_video_duration utility
- Improve validation with FFmpeg check
- Rewrite app.py with streaming support and UI enhancements
- Clean up redundant comments and unused imports across all utils
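The streaming entry point is a plain Python generator, so the app can render tokens as they arrive. A minimal sketch of how app.py might consume it (hypothetical snippet; the module name ollama_utils, the transcript source, and the placeholder-based rendering are assumptions, not code from this commit):

import streamlit as st
from ollama_utils import stream_summarize_with_ollama  # module name assumed

transcript_text = st.session_state.get("transcript", "")
placeholder = st.empty()
summary = ""
# Re-render the accumulated summary each time a new token arrives
for token in stream_summarize_with_ollama(transcript_text, model="llama3"):
    summary += token
    placeholder.markdown(summary)

On recent Streamlit versions, st.write_stream(stream_summarize_with_ollama(...)) achieves the same in a single call.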
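The model-caching change uses Streamlit's resource cache so heavyweight models load once per process instead of on every rerun. A sketch of the pattern with a hypothetical Whisper loader (the actual loader names in this commit may differ):

import streamlit as st
import whisper  # openai-whisper

@st.cache_resource
def load_whisper_model(name: str = "base"):
    # Executed once; subsequent calls return the cached model object
    return whisper.load_model(name)

model = load_whisper_model()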
@@ -1,6 +1,6 @@
 """
 Ollama integration for local AI model inference.
-Provides functions to use Ollama's API for text summarization.
+Provides functions to use Ollama's API for text summarization with streaming support.
 """

 import requests
@@ -9,21 +9,14 @@ import logging
-from pathlib import Path
 import os

-# Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 # Default Ollama API endpoint - configurable via environment variable
 OLLAMA_API_URL = os.environ.get("OLLAMA_API_URL", "http://localhost:11434/api")


 def check_ollama_available():
-    """
-    Check if Ollama service is available.
-
-    Returns:
-        bool: True if Ollama is available, False otherwise
-    """
+    """Check if Ollama service is available."""
     try:
         response = requests.get(f"{OLLAMA_API_URL}/tags", timeout=2)
         return response.status_code == 200
@@ -32,12 +25,7 @@ def check_ollama_available():


 def list_available_models():
-    """
-    List available models in Ollama.
-
-    Returns:
-        list: List of available model names
-    """
+    """List available models in Ollama."""
     try:
         response = requests.get(f"{OLLAMA_API_URL}/tags")
         if response.status_code == 200:
@@ -50,32 +38,14 @@ def list_available_models():


 def summarize_with_ollama(text, model="llama3", max_length=150):
-    """
-    Summarize text using Ollama's local API.
-
-    Args:
-        text (str): Text to summarize
-        model (str): Ollama model to use
-        max_length (int): Maximum length of the summary
-
-    Returns:
-        str: Summarized text or None if failed
-    """
+    """Summarize text using Ollama's local API (non-streaming)."""
     if not check_ollama_available():
         logger.warning("Ollama service is not available")
         return None

-    # Check if the model is available
-    available_models = list_available_models()
-    if model not in available_models:
-        logger.warning(f"Model {model} not available in Ollama. Available models: {available_models}")
-        return None
-
-    # Prepare the prompt for summarization
     prompt = f"Summarize the following text in about {max_length} words:\n\n{text}"

     try:
-        # Make the API request
         response = requests.post(
             f"{OLLAMA_API_URL}/generate",
             json={
@@ -85,7 +55,7 @@ def summarize_with_ollama(text, model="llama3", max_length=150):
             "options": {
                 "temperature": 0.3,
                 "top_p": 0.9,
-                "max_tokens": max_length * 2  # Approximate token count
+                "max_tokens": max_length * 2
             }
         }
     )
@@ -101,23 +71,55 @@ def summarize_with_ollama(text, model="llama3", max_length=150):
         return None


+def stream_summarize_with_ollama(text, model="llama3", max_length=150):
+    """
+    Summarize text using Ollama with streaming. Yields tokens as they arrive.
+
+    Yields:
+        str: Individual response tokens
+    """
+    if not check_ollama_available():
+        logger.warning("Ollama service is not available")
+        return
+
+    prompt = f"Summarize the following text in about {max_length} words:\n\n{text}"
+
+    try:
+        response = requests.post(
+            f"{OLLAMA_API_URL}/generate",
+            json={
+                "model": model,
+                "prompt": prompt,
+                "stream": True,
+                "options": {
+                    "temperature": 0.3,
+                    "top_p": 0.9,
+                    "max_tokens": max_length * 2
+                }
+            },
+            stream=True
+        )
+
+        if response.status_code == 200:
+            for line in response.iter_lines():
+                if line:
+                    data = json.loads(line)
+                    token = data.get('response', '')
+                    if token:
+                        yield token
+                    if data.get('done', False):
+                        break
+        else:
+            logger.error(f"Ollama API error: {response.status_code}")
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Error communicating with Ollama: {e}")
+
+
 def chunk_and_summarize(text, model="llama3", chunk_size=4000, max_length=150):
-    """
-    Chunk long text and summarize each chunk, then combine the summaries.
-
-    Args:
-        text (str): Text to summarize
-        model (str): Ollama model to use
-        chunk_size (int): Maximum size of each chunk in characters
-        max_length (int): Maximum length of the final summary
-
-    Returns:
-        str: Combined summary or None if failed
-    """
+    """Chunk long text and summarize each chunk, then combine."""
     if len(text) <= chunk_size:
         return summarize_with_ollama(text, model, max_length)

-    # Split text into chunks
     words = text.split()
     chunks = []
     current_chunk = []
@@ -135,7 +137,6 @@ def chunk_and_summarize(text, model="llama3", chunk_size=4000, max_length=150):
     if current_chunk:
         chunks.append(' '.join(current_chunk))

-    # Summarize each chunk
     chunk_summaries = []
     for i, chunk in enumerate(chunks):
         logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")
@@ -146,10 +147,55 @@ def chunk_and_summarize(text, model="llama3", chunk_size=4000, max_length=150):
     if not chunk_summaries:
         return None

-    # If there's only one chunk summary, return it
     if len(chunk_summaries) == 1:
         return chunk_summaries[0]

-    # Otherwise, combine the summaries and summarize again
     combined_summary = " ".join(chunk_summaries)
     return summarize_with_ollama(combined_summary, model, max_length)
+
+
+def stream_chunk_and_summarize(text, model="llama3", chunk_size=4000, max_length=150):
+    """
+    Chunk and summarize with streaming on the final summary.
+    Returns non-streaming chunk summaries, then streams the final combination.
+
+    Yields:
+        str: Tokens from the final summary
+    """
+    if len(text) <= chunk_size:
+        yield from stream_summarize_with_ollama(text, model, max_length)
+        return
+
+    words = text.split()
+    chunks = []
+    current_chunk = []
+    current_length = 0
+
+    for word in words:
+        if current_length + len(word) + 1 <= chunk_size:
+            current_chunk.append(word)
+            current_length += len(word) + 1
+        else:
+            chunks.append(' '.join(current_chunk))
+            current_chunk = [word]
+            current_length = len(word) + 1
+
+    if current_chunk:
+        chunks.append(' '.join(current_chunk))
+
+    chunk_summaries = []
+    for i, chunk in enumerate(chunks):
+        logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")
+        summary = summarize_with_ollama(chunk, model, max_length // len(chunks))
+        if summary:
+            chunk_summaries.append(summary)
+
+    if not chunk_summaries:
+        return
+
+    if len(chunk_summaries) == 1:
+        yield chunk_summaries[0]
+        return
+
+    combined_summary = " ".join(chunk_summaries)
+    yield from stream_summarize_with_ollama(combined_summary, model, max_length)
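For reference, both streaming functions are consumed identically since each is a generator; chunking only changes what happens before the final streamed pass. A quick usage sketch (assumes a local Ollama instance with the llama3 model pulled, ollama_utils as the module name, and a transcript.txt file on disk):

from ollama_utils import stream_chunk_and_summarize  # module name assumed

with open("transcript.txt") as f:
    long_transcript = f.read()

# Long inputs are summarized per chunk, then the combined summary streams token by token
for token in stream_chunk_and_summarize(long_transcript, model="llama3", chunk_size=4000):
    print(token, end="", flush=True)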