feat: Add streaming Ollama support, model caching, and UI improvements
- Add streaming summarization via Ollama API (stream_summarize_with_ollama)
- Cache ML models with @st.cache_resource (diarization, NER, translation, Whisper)
- Add temp file cleanup for extracted audio
- Add system capabilities detection (FFmpeg, GPU info)
- Add get_video_duration utility
- Improve validation with FFmpeg check
- Rewrite app.py with streaming support and UI enhancements
- Clean up redundant comments and unused imports across all utils
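The streaming entry point named above, stream_summarize_with_ollama, is not part of this file's diff. As a hedged sketch only (the function name comes from the commit message; the model name, prompt wording, host, and timeout are assumptions), streaming a summary from Ollama's /api/generate endpoint could look like:

import json
import requests

def stream_summarize_with_ollama(text, model="llama3.2", host="http://localhost:11434"):
    """Yield partial summary text as Ollama streams it back. Sketch only."""
    payload = {
        "model": model,  # assumed model name
        "prompt": f"Summarize the following transcript:\n\n{text}",
        "stream": True,
    }
    # Ollama streams newline-delimited JSON objects until "done" is true.
    with requests.post(f"{host}/api/generate", json=payload, stream=True, timeout=600) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)
            if chunk.get("done"):
                break
            yield chunk.get("response", "")

A generator like this pairs naturally with st.write_stream for incremental display in the rewritten app.py.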
@@ -1,45 +1,49 @@
 from transformers import pipeline, AutoTokenizer
 import torch
 import logging
+import streamlit as st
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 SUMMARY_MODEL = "Falconsai/text_summarization"
 
 
+@st.cache_resource
+def _load_summarizer(device_int):
+    """Load and cache the summarization pipeline."""
+    logger.info(f"Loading summarization model on device {device_int}")
+    return pipeline("summarization", model=SUMMARY_MODEL, device=device_int)
+
+
+@st.cache_resource
+def _load_summary_tokenizer():
+    """Load and cache the summarization tokenizer."""
+    return AutoTokenizer.from_pretrained(SUMMARY_MODEL)
+
+
 def chunk_text(text, max_tokens, tokenizer):
     """
-    Splits the text into a list of chunks based on token limits.
-
-    Args:
-        text (str): Text to chunk
-        max_tokens (int): Maximum tokens per chunk
-        tokenizer (AutoTokenizer): Tokenizer to use
-
-    Returns:
-        list: List of text chunks
+    Splits text into chunks by tokenizing once, then splitting by token windows.
+    Much faster than the per-word tokenization approach.
     """
-    words = text.split()
+    all_ids = tokenizer(text, return_tensors='pt', truncation=False)['input_ids'][0]
+    content_ids = all_ids[1:-1]  # strip BOS/EOS
+    usable_max = max_tokens - 2  # leave room for special tokens
 
     chunks = []
-    current_chunk = []
-    current_length = 0
 
-    for word in words:
-        hypothetical_length = current_length + len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
-        if hypothetical_length <= max_tokens:
-            current_chunk.append(word)
-            current_length = hypothetical_length
-        else:
-            chunks.append(' '.join(current_chunk))
-            current_chunk = [word]
-            current_length = len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
-
-    if current_chunk:
-        chunks.append(' '.join(current_chunk))
+    for i in range(0, len(content_ids), usable_max):
+        chunk_ids = content_ids[i : i + usable_max]
+        decoded = tokenizer.decode(chunk_ids, skip_special_tokens=True).strip()
+        if decoded:
+            chunks.append(decoded)
+
+    if not chunks:
+        chunks.append(text)
 
     return chunks
 
 
 def summarize_text(text, use_gpu=True, memory_fraction=0.8):
     """
     Summarize text using a Hugging Face pipeline with chunking support.
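A note on the rewritten chunk_text above: the old loop ran the tokenizer once per word, while the new version encodes the whole text once and slices fixed token windows, so tokenizer calls drop from one per word to one per text. Illustrative arithmetic only (the token counts are hypothetical):

# Mirrors the new windowing: with max_tokens=512, usable_max is 510,
# so 1,200 content tokens split into windows of 510, 510, and 180.
ids = list(range(1200))   # stand-in for content token ids
usable_max = 512 - 2
windows = [ids[i:i + usable_max] for i in range(0, len(ids), usable_max)]
assert [len(w) for w in windows] == [510, 510, 180]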
@@ -52,21 +56,17 @@ def summarize_text(text, use_gpu=True, memory_fraction=0.8):
     Returns:
         str: Summarized text
     """
-    # Determine device
-    device = -1  # Default to CPU
+    device = -1
     if use_gpu and torch.cuda.is_available():
-        device = 0  # Use first GPU
-        if torch.cuda.is_available():
-            torch.cuda.set_per_process_memory_fraction(memory_fraction)
+        device = 0
+        torch.cuda.set_per_process_memory_fraction(memory_fraction)
 
     logger.info(f"Using device {device} for summarization")
 
     try:
-        # Initialize the pipeline and tokenizer
-        summarizer = pipeline("summarization", model=SUMMARY_MODEL, device=device)
-        tokenizer = AutoTokenizer.from_pretrained(SUMMARY_MODEL)
+        summarizer = _load_summarizer(device)
+        tokenizer = _load_summary_tokenizer()
 
-        # Check if text needs to be chunked
         max_tokens = 512
         tokens = tokenizer(text, return_tensors='pt')
         num_tokens = len(tokens['input_ids'][0])
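One behavior of st.cache_resource worth noting here (standard Streamlit behavior, not shown in this diff): the cache is keyed on the function's arguments, so the CPU fallback later in this function loads and caches its own pipeline rather than reusing the GPU one:

# Distinct arguments mean distinct cache entries under @st.cache_resource:
gpu_summarizer = _load_summarizer(0)    # first call loads onto GPU 0
cpu_summarizer = _load_summarizer(-1)   # separate, CPU-resident copy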
@@ -85,7 +85,6 @@ def summarize_text(text, use_gpu=True, memory_fraction=0.8):
         )
         summaries.append(summary_output[0]['summary_text'])
 
-        # If multiple chunks, summarize the combined summaries
         if len(summaries) > 1:
             logger.info("Generating final summary from chunk summaries")
             combined_text = " ".join(summaries)
@@ -106,7 +105,6 @@ def summarize_text(text, use_gpu=True, memory_fraction=0.8):
 
     except Exception as e:
         logger.error(f"Error during summarization: {e}")
-        # Fallback to CPU if GPU fails
         if device != -1:
             logger.info("Falling back to CPU")
             return summarize_text(text, use_gpu=False, memory_fraction=memory_fraction)
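The get_video_duration utility mentioned in the commit message also lives outside this diff; a common implementation (a sketch, assuming ffprobe from FFmpeg is on PATH, which the commit's FFmpeg validation implies) is:

import subprocess

def get_video_duration(path):
    """Return the duration of a media file in seconds via ffprobe."""
    result = subprocess.run(
        ["ffprobe", "-v", "error",
         "-show_entries", "format=duration",
         "-of", "default=noprint_wrappers=1:nokey=1", path],
        capture_output=True, text=True, check=True,
    )
    return float(result.stdout.strip())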