From ce9bb9c2e25cc10be4377d2f35fa613b44579b65 Mon Sep 17 00:00:00 2001
From: DataAnts-AI
Date: Wed, 30 Apr 2025 12:09:10 -0400
Subject: [PATCH] Fix summarization issues and improve GPU handling. Update
 .gitignore for venv

---
 .gitignore             |   9 ++++
 requirements.txt       |   2 +-
 utils/gpu_utils.py     |   5 +-
 utils/summarization.py | 115 +++++++++++++++++++++++++++++++++++++++--
 utils/transcription.py | 106 +------------------------------------
 5 files changed, 123 insertions(+), 114 deletions(-)

diff --git a/.gitignore b/.gitignore
index 0eb8597..0bb6ebd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,13 @@
+# Python virtual environment
+venv/
 __pycache__/
 *.pyc
+
+# IDE files
+.vscode/
+.idea/
+
+# OS files
 .env
 .DS_Store
+Thumbs.db
diff --git a/requirements.txt b/requirements.txt
index 9b22b85..d13e785 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@
 # Core dependencies
 streamlit==1.26.0
 moviepy==1.0.3
-openai-whisper>=20230314
+openai-whisper==20231117
 transformers>=4.21.1
 torch>=1.7.0
 torchaudio>=0.7.0
diff --git a/utils/gpu_utils.py b/utils/gpu_utils.py
index cf7b1fc..1c08ec3 100644
--- a/utils/gpu_utils.py
+++ b/utils/gpu_utils.py
@@ -68,6 +68,8 @@ def get_optimal_device():
 
 
+import torch
+
 def set_memory_limits(memory_fraction=0.8):
     """
     Set memory limits for GPU usage.
 
@@ -81,9 +83,6 @@
         return False
 
     try:
-        # Import only if CUDA is available
-        import torch.cuda
-
         # Set memory fraction for each device
         for i in range(torch.cuda.device_count()):
             torch.cuda.set_per_process_memory_fraction(memory_fraction, i)
diff --git a/utils/summarization.py b/utils/summarization.py
index 696d1c9..a5c5ddc 100644
--- a/utils/summarization.py
+++ b/utils/summarization.py
@@ -1,8 +1,113 @@
-from transformers import pipeline
+from transformers import pipeline, AutoTokenizer
+import torch
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 SUMMARY_MODEL = "Falconsai/text_summarization"
 
-def summarize_text(text):
-    """Summarize text using a Hugging Face pipeline."""
-    summarizer = pipeline("summarization", model=SUMMARY_MODEL)
-    return summarizer(text, max_length=150, min_length=30, do_sample=False)[0]["summary_text"]
+def chunk_text(text, max_tokens, tokenizer):
+    """
+    Splits the text into a list of chunks based on token limits.
+
+    Args:
+        text (str): Text to chunk
+        max_tokens (int): Maximum tokens per chunk
+        tokenizer (AutoTokenizer): Tokenizer to use
+
+    Returns:
+        list: List of text chunks
+    """
+    words = text.split()
+    chunks = []
+    current_chunk = []
+    current_length = 0
+
+    for word in words:
+        hypothetical_length = current_length + len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
+        if hypothetical_length <= max_tokens:
+            current_chunk.append(word)
+            current_length = hypothetical_length
+        else:
+            chunks.append(' '.join(current_chunk))
+            current_chunk = [word]
+            current_length = len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
+
+    if current_chunk:
+        chunks.append(' '.join(current_chunk))
+
+    return chunks
+
+def summarize_text(text, use_gpu=True, memory_fraction=0.8):
+    """
+    Summarize text using a Hugging Face pipeline with chunking support.
+
+    Args:
+        text (str): Text to summarize
+        use_gpu (bool): Whether to use GPU if available
+        memory_fraction (float): Fraction of GPU memory to use
+
+    Returns:
+        str: Summarized text
+    """
+    # Determine device
+    device = -1  # Default to CPU
+    if use_gpu and torch.cuda.is_available():
+        device = 0  # Use first GPU
+        # Limit this process to a fraction of GPU memory
+        torch.cuda.set_per_process_memory_fraction(memory_fraction)
+
+    logger.info(f"Using device {device} for summarization")
+
+    try:
+        # Initialize the pipeline and tokenizer
+        summarizer = pipeline("summarization", model=SUMMARY_MODEL, device=device)
+        tokenizer = AutoTokenizer.from_pretrained(SUMMARY_MODEL)
+
+        # Check if text needs to be chunked
+        max_tokens = 512
+        tokens = tokenizer(text, return_tensors='pt')
+        num_tokens = len(tokens['input_ids'][0])
+
+        if num_tokens > max_tokens:
+            chunks = chunk_text(text, max_tokens, tokenizer)
+            summaries = []
+
+            for i, chunk in enumerate(chunks):
+                logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")
+                summary_output = summarizer(
+                    "summarize: " + chunk,
+                    max_length=150,
+                    min_length=30,
+                    do_sample=False
+                )
+                summaries.append(summary_output[0]['summary_text'])
+
+            # If multiple chunks, summarize the combined summaries
+            if len(summaries) > 1:
+                logger.info("Generating final summary from chunk summaries")
+                combined_text = " ".join(summaries)
+                return summarizer(
+                    "summarize: " + combined_text,
+                    max_length=150,
+                    min_length=30,
+                    do_sample=False
+                )[0]['summary_text']
+            return summaries[0]
+        else:
+            return summarizer(
+                "summarize: " + text,
+                max_length=150,
+                min_length=30,
+                do_sample=False
+            )[0]['summary_text']
+
+    except Exception as e:
+        logger.error(f"Error during summarization: {e}")
+        # Fallback to CPU if GPU fails
+        if device != -1:
+            logger.info("Falling back to CPU")
+            return summarize_text(text, use_gpu=False, memory_fraction=memory_fraction)
+        raise
diff --git a/utils/transcription.py b/utils/transcription.py
index 9c2437b..86b88f6 100644
--- a/utils/transcription.py
+++ b/utils/transcription.py
@@ -25,7 +25,6 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 WHISPER_MODEL = "base"
-SUMMARIZATION_MODEL = "t5-base"
 
 
 def transcribe_audio(audio_path: Path, model=WHISPER_MODEL, use_cache=True, cache_max_age=None, use_gpu=True, memory_fraction=0.8):
@@ -83,107 +82,4 @@
         }
         save_to_cache(audio_path, cache_data, model, "transcribe")
 
-    return segments, transcript
-
-
-def summarize_text(text, model=SUMMARIZATION_MODEL, use_gpu=True, memory_fraction=0.8):
-    """
-    Summarize text using a pre-trained transformer model with chunking.
-
-    Args:
-        text (str): Text to summarize
-        model (str): Model to use for summarization
-        use_gpu (bool): Whether to use GPU acceleration if available
-        memory_fraction (float): Fraction of GPU memory to use (0.0 to 1.0)
-
-    Returns:
-        str: Summarized text
-    """
-    # Configure device
-    device = torch.device("cpu")
-    if use_gpu and GPU_UTILS_AVAILABLE:
-        device = get_optimal_device()
-    logger.info(f"Using device: {device} for summarization")
-
-    # Initialize the pipeline with the specified device
-    device_arg = -1 if device.type == "cpu" else 0  # -1 for CPU, 0 for GPU
-    summarization_pipeline = pipeline("summarization", model=model, device=device_arg)
-    tokenizer = AutoTokenizer.from_pretrained(model)
-
-    max_tokens = 512
-
-    tokens = tokenizer(text, return_tensors='pt')
-    num_tokens = len(tokens['input_ids'][0])
-
-    if num_tokens > max_tokens:
-        chunks = chunk_text(text, max_tokens, tokenizer)
-        summaries = []
-
-        for i, chunk in enumerate(chunks):
-            logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")
-            summary_output = summarization_pipeline(
-                "summarize: " + chunk,
-                max_length=150,
-                min_length=30,
-                do_sample=False
-            )
-            summaries.append(summary_output[0]['summary_text'])
-
-        overall_summary = " ".join(summaries)
-
-        # If the combined summary is still long, summarize it again
-        if len(summaries) > 1:
-            logger.info("Generating final summary from chunk summaries")
-            combined_text = " ".join(summaries)
-            overall_summary = summarization_pipeline(
-                "summarize: " + combined_text,
-                max_length=150,
-                min_length=30,
-                do_sample=False
-            )[0]['summary_text']
-    else:
-        overall_summary = summarization_pipeline(
-            "summarize: " + text,
-            max_length=150,
-            min_length=30,
-            do_sample=False
-        )[0]['summary_text']
-
-    return overall_summary
-
-
-def chunk_text(text, max_tokens, tokenizer=None):
-    """
-    Splits the text into a list of chunks based on token limits.
-
-    Args:
-        text (str): Text to chunk
-        max_tokens (int): Maximum tokens per chunk
-        tokenizer (AutoTokenizer, optional): Tokenizer to use
-
-    Returns:
-        list: List of text chunks
-    """
-    if tokenizer is None:
-        tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
-
-    words = text.split()
-
-    chunks = []
-    current_chunk = []
-    current_length = 0
-
-    for word in words:
-        hypothetical_length = current_length + len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
-        if hypothetical_length <= max_tokens:
-            current_chunk.append(word)
-            current_length = hypothetical_length
-        else:
-            chunks.append(' '.join(current_chunk))
-            current_chunk = [word]
-            current_length = len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
-
-    if current_chunk:
-        chunks.append(' '.join(current_chunk))
-
-    return chunks
\ No newline at end of file
+    return segments, transcript
\ No newline at end of file
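
Usage sketch (illustrative, not part of the patch): with this change the
summarizer moves to utils/summarization.py and picks its own device, so a
caller combines the two modules roughly as below. The call signatures come
from the diff above; the file name "meeting.wav" and the 0.5 memory
fraction are made-up example values.

    from pathlib import Path

    from utils.summarization import summarize_text
    from utils.transcription import transcribe_audio

    # Transcribe the recording, then summarize the transcript on the GPU,
    # capping this process at half of the GPU's memory (example value).
    segments, transcript = transcribe_audio(Path("meeting.wav"))
    summary = summarize_text(transcript, use_gpu=True, memory_fraction=0.5)
    print(summary)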