From ce9bb9c2e25cc10be4377d2f35fa613b44579b65 Mon Sep 17 00:00:00 2001
From: DataAnts-AI
Date: Wed, 30 Apr 2025 12:09:10 -0400
Subject: [PATCH] Fix summarization issues and improve GPU handling. Update
 .gitignore for venv

---
 .gitignore             |   9 ++++
 requirements.txt       |   2 +-
 utils/gpu_utils.py     |   5 +-
 utils/summarization.py | 115 +++++++++++++++++++++++++++++++++++++++--
 utils/transcription.py | 106 +------------------------------------
 5 files changed, 123 insertions(+), 114 deletions(-)

diff --git a/.gitignore b/.gitignore
index 0eb8597..0bb6ebd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,13 @@
+# Python virtual environment
+venv/
 __pycache__/
 *.pyc
+
+# IDE files
+.vscode/
+.idea/
+
+# OS files
 .env
 .DS_Store
+Thumbs.db
diff --git a/requirements.txt b/requirements.txt
index 9b22b85..d13e785 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@
 # Core dependencies
 streamlit==1.26.0
 moviepy==1.0.3
-openai-whisper>=20230314
+openai-whisper==20231117
 transformers>=4.21.1
 torch>=1.7.0
 torchaudio>=0.7.0
diff --git a/utils/gpu_utils.py b/utils/gpu_utils.py
index cf7b1fc..1c08ec3 100644
--- a/utils/gpu_utils.py
+++ b/utils/gpu_utils.py
@@ -68,6 +68,8 @@ def get_optimal_device():
 
 
+import torch
+
 def set_memory_limits(memory_fraction=0.8):
     """
     Set memory limits for GPU usage.
 
@@ -81,9 +83,6 @@
         return False
 
     try:
-        # Import only if CUDA is available
-        import torch.cuda
-
         # Set memory fraction for each device
         for i in range(torch.cuda.device_count()):
             torch.cuda.set_per_process_memory_fraction(memory_fraction, i)
diff --git a/utils/summarization.py b/utils/summarization.py
index 696d1c9..a5c5ddc 100644
--- a/utils/summarization.py
+++ b/utils/summarization.py
@@ -1,8 +1,113 @@
-from transformers import pipeline
+from transformers import pipeline, AutoTokenizer
+import torch
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 SUMMARY_MODEL = "Falconsai/text_summarization"
 
-def summarize_text(text):
-    """Summarize text using a Hugging Face pipeline."""
-    summarizer = pipeline("summarization", model=SUMMARY_MODEL)
-    return summarizer(text, max_length=150, min_length=30, do_sample=False)[0]["summary_text"]
+def chunk_text(text, max_tokens, tokenizer):
+    """
+    Splits the text into a list of chunks based on token limits.
+
+    Args:
+        text (str): Text to chunk
+        max_tokens (int): Maximum tokens per chunk
+        tokenizer (AutoTokenizer): Tokenizer to use
+
+    Returns:
+        list: List of text chunks
+    """
+    words = text.split()
+    chunks = []
+    current_chunk = []
+    current_length = 0
+
+    for word in words:
+        hypothetical_length = current_length + len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
+        if hypothetical_length <= max_tokens:
+            current_chunk.append(word)
+            current_length = hypothetical_length
+        else:
+            chunks.append(' '.join(current_chunk))
+            current_chunk = [word]
+            current_length = len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
+
+    if current_chunk:
+        chunks.append(' '.join(current_chunk))
+
+    return chunks
+
+def summarize_text(text, use_gpu=True, memory_fraction=0.8):
+    """
+    Summarize text using a Hugging Face pipeline with chunking support.
+
+    Args:
+        text (str): Text to summarize
+        use_gpu (bool): Whether to use GPU if available
+        memory_fraction (float): Fraction of GPU memory to use
+
+    Returns:
+        str: Summarized text
+    """
+    # Determine device
+    device = -1  # Default to CPU
+    if use_gpu and torch.cuda.is_available():
+        device = 0  # Use first GPU
+        # Limit this process to a fraction of GPU memory
+        torch.cuda.set_per_process_memory_fraction(memory_fraction)
+
+    logger.info(f"Using device {device} for summarization")
+
+    try:
+        # Initialize the pipeline and tokenizer
+        summarizer = pipeline("summarization", model=SUMMARY_MODEL, device=device)
+        tokenizer = AutoTokenizer.from_pretrained(SUMMARY_MODEL)
+
+        # Check if text needs to be chunked
+        max_tokens = 512
+        tokens = tokenizer(text, return_tensors='pt')
+        num_tokens = len(tokens['input_ids'][0])
+
+        if num_tokens > max_tokens:
+            chunks = chunk_text(text, max_tokens, tokenizer)
+            summaries = []
+
+            for i, chunk in enumerate(chunks):
+                logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")
+                summary_output = summarizer(
+                    "summarize: " + chunk,
+                    max_length=150,
+                    min_length=30,
+                    do_sample=False
+                )
+                summaries.append(summary_output[0]['summary_text'])
+
+            # If multiple chunks, summarize the combined summaries
+            if len(summaries) > 1:
+                logger.info("Generating final summary from chunk summaries")
+                combined_text = " ".join(summaries)
+                return summarizer(
+                    "summarize: " + combined_text,
+                    max_length=150,
+                    min_length=30,
+                    do_sample=False
+                )[0]['summary_text']
+            return summaries[0]
+        else:
+            return summarizer(
+                "summarize: " + text,
+                max_length=150,
+                min_length=30,
+                do_sample=False
+            )[0]['summary_text']
+
+    except Exception as e:
+        logger.error(f"Error during summarization: {e}")
+        # Fallback to CPU if GPU fails
+        if device != -1:
+            logger.info("Falling back to CPU")
+            return summarize_text(text, use_gpu=False, memory_fraction=memory_fraction)
+        raise
diff --git a/utils/transcription.py b/utils/transcription.py
index 9c2437b..86b88f6 100644
--- a/utils/transcription.py
+++ b/utils/transcription.py
@@ -25,7 +25,6 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 WHISPER_MODEL = "base"
-SUMMARIZATION_MODEL = "t5-base"
 
 
 def transcribe_audio(audio_path: Path, model=WHISPER_MODEL, use_cache=True, cache_max_age=None, use_gpu=True, memory_fraction=0.8):
@@ -83,107 +82,4 @@
         }
         save_to_cache(audio_path, cache_data, model, "transcribe")
 
-    return segments, transcript
-
-
-def summarize_text(text, model=SUMMARIZATION_MODEL, use_gpu=True, memory_fraction=0.8):
-    """
-    Summarize text using a pre-trained transformer model with chunking.
-
-    Args:
-        text (str): Text to summarize
-        model (str): Model to use for summarization
-        use_gpu (bool): Whether to use GPU acceleration if available
-        memory_fraction (float): Fraction of GPU memory to use (0.0 to 1.0)
-
-    Returns:
-        str: Summarized text
-    """
-    # Configure device
-    device = torch.device("cpu")
-    if use_gpu and GPU_UTILS_AVAILABLE:
-        device = get_optimal_device()
-    logger.info(f"Using device: {device} for summarization")
-
-    # Initialize the pipeline with the specified device
-    device_arg = -1 if device.type == "cpu" else 0  # -1 for CPU, 0 for GPU
-    summarization_pipeline = pipeline("summarization", model=model, device=device_arg)
-    tokenizer = AutoTokenizer.from_pretrained(model)
-
-    max_tokens = 512
-
-    tokens = tokenizer(text, return_tensors='pt')
-    num_tokens = len(tokens['input_ids'][0])
-
-    if num_tokens > max_tokens:
-        chunks = chunk_text(text, max_tokens, tokenizer)
-        summaries = []
-
-        for i, chunk in enumerate(chunks):
-            logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")
-            summary_output = summarization_pipeline(
-                "summarize: " + chunk,
-                max_length=150,
-                min_length=30,
-                do_sample=False
-            )
-            summaries.append(summary_output[0]['summary_text'])
-
-        overall_summary = " ".join(summaries)
-
-        # If the combined summary is still long, summarize it again
-        if len(summaries) > 1:
-            logger.info("Generating final summary from chunk summaries")
-            combined_text = " ".join(summaries)
-            overall_summary = summarization_pipeline(
-                "summarize: " + combined_text,
-                max_length=150,
-                min_length=30,
-                do_sample=False
-            )[0]['summary_text']
-    else:
-        overall_summary = summarization_pipeline(
-            "summarize: " + text,
-            max_length=150,
-            min_length=30,
-            do_sample=False
-        )[0]['summary_text']
-
-    return overall_summary
-
-
-def chunk_text(text, max_tokens, tokenizer=None):
-    """
-    Splits the text into a list of chunks based on token limits.
-
-    Args:
-        text (str): Text to chunk
-        max_tokens (int): Maximum tokens per chunk
-        tokenizer (AutoTokenizer, optional): Tokenizer to use
-
-    Returns:
-        list: List of text chunks
-    """
-    if tokenizer is None:
-        tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
-
-    words = text.split()
-
-    chunks = []
-    current_chunk = []
-    current_length = 0
-
-    for word in words:
-        hypothetical_length = current_length + len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
-        if hypothetical_length <= max_tokens:
-            current_chunk.append(word)
-            current_length = hypothetical_length
-        else:
-            chunks.append(' '.join(current_chunk))
-            current_chunk = [word]
-            current_length = len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
-
-    if current_chunk:
-        chunks.append(' '.join(current_chunk))
-
-    return chunks
\ No newline at end of file
+    return segments, transcript
\ No newline at end of file
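
Usage sketch (illustrative, not part of the patch): with this change the
summarizer moves to utils/summarization.py and picks its own device, so a
caller combines the two modules roughly as below. The call signatures come
from the diff above; the file name "meeting.wav" and the 0.5 memory
fraction are made-up example values.

    from pathlib import Path

    from utils.summarization import summarize_text
    from utils.transcription import transcribe_audio

    # Transcribe the recording, then summarize the transcript on the GPU,
    # capping this process at half of the GPU's memory (example value).
    segments, transcript = transcribe_audio(Path("meeting.wav"))
    summary = summarize_text(transcript, use_gpu=True, memory_fraction=0.5)
    print(summary)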