Fix summarization issues and improve GPU handling. Update .gitignore for venv

DataAnts-AI
2025-04-30 12:09:10 -04:00
parent 9ca396d6fa
commit ce9bb9c2e2
5 changed files with 123 additions and 114 deletions

.gitignore
View File

@@ -1,4 +1,13 @@
+# Python virtual environment
+venv/
 __pycache__/
 *.pyc
+
+# IDE files
+.vscode/
+.idea/
+
+# OS files
 .env
 .DS_Store
+Thumbs.db

requirements.txt
View File

@@ -2,7 +2,7 @@
 # Core dependencies
 streamlit==1.26.0
 moviepy==1.0.3
-openai-whisper>=20230314
+openai-whisper==20231117
 transformers>=4.21.1
 torch>=1.7.0
 torchaudio>=0.7.0
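The pin replaces an open-ended constraint (>=20230314) with the exact 20231117 release of openai-whisper. To confirm the installed version matches the pin, a minimal sketch (assuming Python 3.8+, where importlib.metadata is standard library):

import importlib.metadata

# "openai-whisper" is the PyPI distribution name.
print(importlib.metadata.version("openai-whisper"))  # expected: 20231117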

View File

@@ -68,6 +68,8 @@ def get_optimal_device():
 def set_memory_limits(memory_fraction=0.8):
+    global torch
+    import torch
     """
     Set memory limits for GPU usage.
@@ -81,9 +83,6 @@ def set_memory_limits(memory_fraction=0.8):
         return False
     try:
-        # Import only if CUDA is available
-        import torch.cuda
-
         # Set memory fraction for each device
         for i in range(torch.cuda.device_count()):
             torch.cuda.set_per_process_memory_fraction(memory_fraction, i)
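The two added lines defer the torch import into the function body and promote it to module scope, so the module can still be imported on machines without torch; the separate import torch.cuda inside the try block then becomes unnecessary. A minimal sketch of the pattern in isolation; everything outside the visible hunk (the CUDA guard, return values, and error handling) is an assumption:

import logging

logger = logging.getLogger(__name__)

def set_memory_limits(memory_fraction=0.8):
    # Deferred import; "global" rebinds the module-level name so other
    # functions can use torch after the first call. (Assumed pattern.)
    global torch
    import torch

    if not torch.cuda.is_available():
        return False
    try:
        # Cap the share of each GPU's memory this process may allocate.
        for i in range(torch.cuda.device_count()):
            torch.cuda.set_per_process_memory_fraction(memory_fraction, i)
        return True
    except RuntimeError as e:
        logger.warning(f"Could not set GPU memory limits: {e}")
        return False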

View File

@@ -1,8 +1,113 @@
-from transformers import pipeline
+from transformers import pipeline, AutoTokenizer
+import torch
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 SUMMARY_MODEL = "Falconsai/text_summarization"
 
-def summarize_text(text):
-    """Summarize text using a Hugging Face pipeline."""
-    summarizer = pipeline("summarization", model=SUMMARY_MODEL)
-    return summarizer(text, max_length=150, min_length=30, do_sample=False)[0]["summary_text"]
+def chunk_text(text, max_tokens, tokenizer):
+    """
+    Splits the text into a list of chunks based on token limits.
+
+    Args:
+        text (str): Text to chunk
+        max_tokens (int): Maximum tokens per chunk
+        tokenizer (AutoTokenizer): Tokenizer to use
+
+    Returns:
+        list: List of text chunks
+    """
+    words = text.split()
+    chunks = []
+    current_chunk = []
+    current_length = 0
+
+    for word in words:
+        hypothetical_length = current_length + len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
+        if hypothetical_length <= max_tokens:
+            current_chunk.append(word)
+            current_length = hypothetical_length
+        else:
+            chunks.append(' '.join(current_chunk))
+            current_chunk = [word]
+            current_length = len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
+
+    if current_chunk:
+        chunks.append(' '.join(current_chunk))
+
+    return chunks
+
+
+def summarize_text(text, use_gpu=True, memory_fraction=0.8):
+    """
+    Summarize text using a Hugging Face pipeline with chunking support.
+
+    Args:
+        text (str): Text to summarize
+        use_gpu (bool): Whether to use GPU if available
+        memory_fraction (float): Fraction of GPU memory to use
+
+    Returns:
+        str: Summarized text
+    """
+    # Determine device
+    device = -1  # Default to CPU
+    if use_gpu and torch.cuda.is_available():
+        device = 0  # Use first GPU
+        if torch.cuda.is_available():
+            torch.cuda.set_per_process_memory_fraction(memory_fraction)
+
+    logger.info(f"Using device {device} for summarization")
+
+    try:
+        # Initialize the pipeline and tokenizer
+        summarizer = pipeline("summarization", model=SUMMARY_MODEL, device=device)
+        tokenizer = AutoTokenizer.from_pretrained(SUMMARY_MODEL)
+
+        # Check if text needs to be chunked
+        max_tokens = 512
+        tokens = tokenizer(text, return_tensors='pt')
+        num_tokens = len(tokens['input_ids'][0])
+
+        if num_tokens > max_tokens:
+            chunks = chunk_text(text, max_tokens, tokenizer)
+            summaries = []
+            for i, chunk in enumerate(chunks):
+                logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")
+                summary_output = summarizer(
+                    "summarize: " + chunk,
+                    max_length=150,
+                    min_length=30,
+                    do_sample=False
+                )
+                summaries.append(summary_output[0]['summary_text'])
+
+            # If multiple chunks, summarize the combined summaries
+            if len(summaries) > 1:
+                logger.info("Generating final summary from chunk summaries")
+                combined_text = " ".join(summaries)
+                return summarizer(
+                    "summarize: " + combined_text,
+                    max_length=150,
+                    min_length=30,
+                    do_sample=False
+                )[0]['summary_text']
+            return summaries[0]
+        else:
+            return summarizer(
+                "summarize: " + text,
+                max_length=150,
+                min_length=30,
+                do_sample=False
+            )[0]['summary_text']
+    except Exception as e:
+        logger.error(f"Error during summarization: {e}")
+        # Fallback to CPU if GPU fails
+        if device != -1:
+            logger.info("Falling back to CPU")
+            return summarize_text(text, use_gpu=False, memory_fraction=memory_fraction)
+        raise
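For reference, a usage sketch of the reworked function; the module name summarizer_module is hypothetical, since this view does not show the file path:

# Hypothetical module name; the diff view does not show the real path.
from summarizer_module import summarize_text

with open("transcript.txt") as f:
    transcript = f.read()

# Inputs over the 512-token window are split by chunk_text, summarized
# chunk by chunk, then the chunk summaries are condensed one final time.
summary = summarize_text(transcript, use_gpu=True, memory_fraction=0.8)
print(summary)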

View File

@@ -25,7 +25,6 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 WHISPER_MODEL = "base"
-SUMMARIZATION_MODEL = "t5-base"
 
 def transcribe_audio(audio_path: Path, model=WHISPER_MODEL, use_cache=True, cache_max_age=None,
                      use_gpu=True, memory_fraction=0.8):
@@ -83,107 +82,4 @@ def transcribe_audio(audio_path: Path, model=WHISPER_MODEL, use_cache=True, cach
         }
         save_to_cache(audio_path, cache_data, model, "transcribe")
         return segments, transcript
-
-def summarize_text(text, model=SUMMARIZATION_MODEL, use_gpu=True, memory_fraction=0.8):
-    """
-    Summarize text using a pre-trained transformer model with chunking.
-
-    Args:
-        text (str): Text to summarize
-        model (str): Model to use for summarization
-        use_gpu (bool): Whether to use GPU acceleration if available
-        memory_fraction (float): Fraction of GPU memory to use (0.0 to 1.0)
-
-    Returns:
-        str: Summarized text
-    """
-    # Configure device
-    device = torch.device("cpu")
-    if use_gpu and GPU_UTILS_AVAILABLE:
-        device = get_optimal_device()
-    logger.info(f"Using device: {device} for summarization")
-
-    # Initialize the pipeline with the specified device
-    device_arg = -1 if device.type == "cpu" else 0  # -1 for CPU, 0 for GPU
-    summarization_pipeline = pipeline("summarization", model=model, device=device_arg)
-    tokenizer = AutoTokenizer.from_pretrained(model)
-
-    max_tokens = 512
-    tokens = tokenizer(text, return_tensors='pt')
-    num_tokens = len(tokens['input_ids'][0])
-
-    if num_tokens > max_tokens:
-        chunks = chunk_text(text, max_tokens, tokenizer)
-        summaries = []
-        for i, chunk in enumerate(chunks):
-            logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")
-            summary_output = summarization_pipeline(
-                "summarize: " + chunk,
-                max_length=150,
-                min_length=30,
-                do_sample=False
-            )
-            summaries.append(summary_output[0]['summary_text'])
-
-        overall_summary = " ".join(summaries)
-
-        # If the combined summary is still long, summarize it again
-        if len(summaries) > 1:
-            logger.info("Generating final summary from chunk summaries")
-            combined_text = " ".join(summaries)
-            overall_summary = summarization_pipeline(
-                "summarize: " + combined_text,
-                max_length=150,
-                min_length=30,
-                do_sample=False
-            )[0]['summary_text']
-    else:
-        overall_summary = summarization_pipeline(
-            "summarize: " + text,
-            max_length=150,
-            min_length=30,
-            do_sample=False
-        )[0]['summary_text']
-
-    return overall_summary
-
-def chunk_text(text, max_tokens, tokenizer=None):
-    """
-    Splits the text into a list of chunks based on token limits.
-
-    Args:
-        text (str): Text to chunk
-        max_tokens (int): Maximum tokens per chunk
-        tokenizer (AutoTokenizer, optional): Tokenizer to use
-
-    Returns:
-        list: List of text chunks
-    """
-    if tokenizer is None:
-        tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
-
-    words = text.split()
-    chunks = []
-    current_chunk = []
-    current_length = 0
-
-    for word in words:
-        hypothetical_length = current_length + len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
-        if hypothetical_length <= max_tokens:
-            current_chunk.append(word)
-            current_length = hypothetical_length
-        else:
-            chunks.append(' '.join(current_chunk))
-            current_chunk = [word]
-            current_length = len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
-
-    if current_chunk:
-        chunks.append(' '.join(current_chunk))
-
-    return chunks
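With the duplicated summarize_text and chunk_text deleted here, summarization now lives only in the Falconsai-backed module above and the t5-base path is gone. Call sites would import from the summarization module instead; a sketch with hypothetical module names, since the real file paths are not shown in this view:

from pathlib import Path

# Hypothetical module names; the diff does not show the real file paths.
from transcribe_module import transcribe_audio
from summarizer_module import summarize_text

segments, transcript = transcribe_audio(Path("meeting.wav"), use_gpu=True)
summary = summarize_text(transcript, use_gpu=True, memory_fraction=0.8)
print(summary)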