Fix summarization issues and improve GPU handling. Update .gitignore for venv
.gitignore (9 lines added)
@@ -1,4 +1,13 @@
+# Python virtual environment
+venv/
 __pycache__/
 *.pyc
+
+# IDE files
+.vscode/
+.idea/
+
+# OS files
 .env
 .DS_Store
+Thumbs.db

@@ -2,7 +2,7 @@
 # Core dependencies
 streamlit==1.26.0
 moviepy==1.0.3
-openai-whisper>=20230314
+openai-whisper==20231117
 transformers>=4.21.1
 torch>=1.7.0
 torchaudio>=0.7.0
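The dependency change above swaps the open-ended openai-whisper>=20230314 constraint for an exact pin. A minimal sanity check (my sketch, not part of the commit) that an environment actually received the pinned release:

    from importlib.metadata import version

    # "openai-whisper" is the PyPI distribution name for the whisper package
    assert version("openai-whisper") == "20231117", "unexpected openai-whisper version"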

@@ -68,6 +68,8 @@ def get_optimal_device():
 
 
 def set_memory_limits(memory_fraction=0.8):
+    global torch
+    import torch
     """
     Set memory limits for GPU usage.
 
@@ -81,9 +83,6 @@ def set_memory_limits(memory_fraction=0.8):
         return False
 
     try:
-        # Import only if CUDA is available
-        import torch.cuda
-
         # Set memory fraction for each device
         for i in range(torch.cuda.device_count()):
             torch.cuda.set_per_process_memory_fraction(memory_fraction, i)
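Taken together, the two hunks above move the torch import to the top of set_memory_limits and drop the now-redundant inner import torch.cuda. A minimal standalone sketch of the resulting pattern, assuming the surrounding code returns False when CUDA is absent; error handling is omitted here for brevity, while the real function wraps the loop in try/except:

    def set_memory_limits(memory_fraction=0.8):
        # Lazy import keeps the module importable when torch is optional
        import torch

        if not torch.cuda.is_available():
            return False

        # Cap the per-process memory fraction on every visible GPU
        for i in range(torch.cuda.device_count()):
            torch.cuda.set_per_process_memory_fraction(memory_fraction, i)
        return True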

@@ -1,8 +1,113 @@
-from transformers import pipeline
+from transformers import pipeline, AutoTokenizer
+import torch
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 SUMMARY_MODEL = "Falconsai/text_summarization"
 
-def summarize_text(text):
-    """Summarize text using a Hugging Face pipeline."""
-    summarizer = pipeline("summarization", model=SUMMARY_MODEL)
-    return summarizer(text, max_length=150, min_length=30, do_sample=False)[0]["summary_text"]
+def chunk_text(text, max_tokens, tokenizer):
+    """
+    Splits the text into a list of chunks based on token limits.
+
+    Args:
+        text (str): Text to chunk
+        max_tokens (int): Maximum tokens per chunk
+        tokenizer (AutoTokenizer): Tokenizer to use
+
+    Returns:
+        list: List of text chunks
+    """
+    words = text.split()
+    chunks = []
+    current_chunk = []
+    current_length = 0
+
+    for word in words:
+        hypothetical_length = current_length + len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
+        if hypothetical_length <= max_tokens:
+            current_chunk.append(word)
+            current_length = hypothetical_length
+        else:
+            chunks.append(' '.join(current_chunk))
+            current_chunk = [word]
+            current_length = len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
+
+    if current_chunk:
+        chunks.append(' '.join(current_chunk))
+
+    return chunks
+
+def summarize_text(text, use_gpu=True, memory_fraction=0.8):
+    """
+    Summarize text using a Hugging Face pipeline with chunking support.
+
+    Args:
+        text (str): Text to summarize
+        use_gpu (bool): Whether to use GPU if available
+        memory_fraction (float): Fraction of GPU memory to use
+
+    Returns:
+        str: Summarized text
+    """
+    # Determine device
+    device = -1  # Default to CPU
+    if use_gpu and torch.cuda.is_available():
+        device = 0  # Use first GPU
+        if torch.cuda.is_available():
+            torch.cuda.set_per_process_memory_fraction(memory_fraction)
+
+    logger.info(f"Using device {device} for summarization")
+
+    try:
+        # Initialize the pipeline and tokenizer
+        summarizer = pipeline("summarization", model=SUMMARY_MODEL, device=device)
+        tokenizer = AutoTokenizer.from_pretrained(SUMMARY_MODEL)
+
+        # Check if text needs to be chunked
+        max_tokens = 512
+        tokens = tokenizer(text, return_tensors='pt')
+        num_tokens = len(tokens['input_ids'][0])
+
+        if num_tokens > max_tokens:
+            chunks = chunk_text(text, max_tokens, tokenizer)
+            summaries = []
+
+            for i, chunk in enumerate(chunks):
+                logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")
+                summary_output = summarizer(
+                    "summarize: " + chunk,
+                    max_length=150,
+                    min_length=30,
+                    do_sample=False
+                )
+                summaries.append(summary_output[0]['summary_text'])
+
+            # If multiple chunks, summarize the combined summaries
+            if len(summaries) > 1:
+                logger.info("Generating final summary from chunk summaries")
+                combined_text = " ".join(summaries)
+                return summarizer(
+                    "summarize: " + combined_text,
+                    max_length=150,
+                    min_length=30,
+                    do_sample=False
+                )[0]['summary_text']
+            return summaries[0]
+        else:
+            return summarizer(
+                "summarize: " + text,
+                max_length=150,
+                min_length=30,
+                do_sample=False
+            )[0]['summary_text']
+
+    except Exception as e:
+        logger.error(f"Error during summarization: {e}")
+        # Fallback to CPU if GPU fails
+        if device != -1:
+            logger.info("Falling back to CPU")
+            return summarize_text(text, use_gpu=False, memory_fraction=memory_fraction)
+        raise
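This hunk replaces the single-call summarize_text with a chunking version: input over 512 tokens is split word by word against the tokenizer's token counts, each chunk is summarized, and multiple chunk summaries are condensed once more into the final result. A usage sketch (the module name summarizer is my assumption; the diff does not show file paths):

    from summarizer import summarize_text  # hypothetical module name

    long_transcript = " ".join(["The speaker explains the roadmap."] * 500)

    # Long input takes the chunked path; short input is summarized directly.
    summary = summarize_text(long_transcript, use_gpu=True, memory_fraction=0.8)
    print(summary)

One design note: on any exception while a GPU device is selected, the function recurses with use_gpu=False, so a CUDA out-of-memory error degrades to a slower CPU run instead of failing outright.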

@@ -25,7 +25,6 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 WHISPER_MODEL = "base"
-SUMMARIZATION_MODEL = "t5-base"
 
 def transcribe_audio(audio_path: Path, model=WHISPER_MODEL, use_cache=True, cache_max_age=None,
                      use_gpu=True, memory_fraction=0.8):
@@ -84,106 +83,3 @@ def transcribe_audio(audio_path: Path, model=WHISPER_MODEL, use_cache=True, cach
         save_to_cache(audio_path, cache_data, model, "transcribe")
 
     return segments, transcript
-
-
-def summarize_text(text, model=SUMMARIZATION_MODEL, use_gpu=True, memory_fraction=0.8):
-    """
-    Summarize text using a pre-trained transformer model with chunking.
-
-    Args:
-        text (str): Text to summarize
-        model (str): Model to use for summarization
-        use_gpu (bool): Whether to use GPU acceleration if available
-        memory_fraction (float): Fraction of GPU memory to use (0.0 to 1.0)
-
-    Returns:
-        str: Summarized text
-    """
-    # Configure device
-    device = torch.device("cpu")
-    if use_gpu and GPU_UTILS_AVAILABLE:
-        device = get_optimal_device()
-    logger.info(f"Using device: {device} for summarization")
-
-    # Initialize the pipeline with the specified device
-    device_arg = -1 if device.type == "cpu" else 0  # -1 for CPU, 0 for GPU
-    summarization_pipeline = pipeline("summarization", model=model, device=device_arg)
-    tokenizer = AutoTokenizer.from_pretrained(model)
-
-    max_tokens = 512
-
-    tokens = tokenizer(text, return_tensors='pt')
-    num_tokens = len(tokens['input_ids'][0])
-
-    if num_tokens > max_tokens:
-        chunks = chunk_text(text, max_tokens, tokenizer)
-        summaries = []
-
-        for i, chunk in enumerate(chunks):
-            logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")
-            summary_output = summarization_pipeline(
-                "summarize: " + chunk,
-                max_length=150,
-                min_length=30,
-                do_sample=False
-            )
-            summaries.append(summary_output[0]['summary_text'])
-
-        overall_summary = " ".join(summaries)
-
-        # If the combined summary is still long, summarize it again
-        if len(summaries) > 1:
-            logger.info("Generating final summary from chunk summaries")
-            combined_text = " ".join(summaries)
-            overall_summary = summarization_pipeline(
-                "summarize: " + combined_text,
-                max_length=150,
-                min_length=30,
-                do_sample=False
-            )[0]['summary_text']
-    else:
-        overall_summary = summarization_pipeline(
-            "summarize: " + text,
-            max_length=150,
-            min_length=30,
-            do_sample=False
-        )[0]['summary_text']
-
-    return overall_summary
-
-
-def chunk_text(text, max_tokens, tokenizer=None):
-    """
-    Splits the text into a list of chunks based on token limits.
-
-    Args:
-        text (str): Text to chunk
-        max_tokens (int): Maximum tokens per chunk
-        tokenizer (AutoTokenizer, optional): Tokenizer to use
-
-    Returns:
-        list: List of text chunks
-    """
-    if tokenizer is None:
-        tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
-
-    words = text.split()
-
-    chunks = []
-    current_chunk = []
-    current_length = 0
-
-    for word in words:
-        hypothetical_length = current_length + len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
-        if hypothetical_length <= max_tokens:
-            current_chunk.append(word)
-            current_length = hypothetical_length
-        else:
-            chunks.append(' '.join(current_chunk))
-            current_chunk = [word]
-            current_length = len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
-
-    if current_chunk:
-        chunks.append(' '.join(current_chunk))
-
-    return chunks
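With the removal above, summarization now lives only in the dedicated summarizer module and the transcription module keeps a single responsibility. A sketch of the post-commit call flow (both module names are assumptions, since the diff does not show file paths):

    from pathlib import Path

    from transcriber import transcribe_audio  # hypothetical module name
    from summarizer import summarize_text     # hypothetical module name

    segments, transcript = transcribe_audio(Path("talk.wav"), use_gpu=True)
    summary = summarize_text(transcript, use_gpu=True, memory_fraction=0.8)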