# TalkEdit/utils/transcription.py
import whisper
from pathlib import Path
from transformers import pipeline, AutoTokenizer
from utils.audio_processing import extract_audio
import logging
import torch
# Try to import GPU utilities, but don't fail if not available
try:
    from utils.gpu_utils import configure_gpu, get_optimal_device
    GPU_UTILS_AVAILABLE = True
except ImportError:
    GPU_UTILS_AVAILABLE = False

# Try to import caching utilities, but don't fail if not available
try:
    from utils.cache import load_from_cache, save_to_cache
    CACHE_AVAILABLE = True
except ImportError:
    CACHE_AVAILABLE = False
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
WHISPER_MODEL = "base"
SUMMARIZATION_MODEL = "t5-base"
def transcribe_audio(audio_path: Path, model=WHISPER_MODEL, use_cache=True, cache_max_age=None,
                     use_gpu=True, memory_fraction=0.8):
    """
    Transcribe audio using Whisper and return both segments and the full transcript.

    Args:
        audio_path (Path): Path to the audio or video file
        model (str): Whisper model size to use (tiny, base, small, medium, large)
        use_cache (bool): Whether to use caching
        cache_max_age (float, optional): Maximum age of cache in seconds
        use_gpu (bool): Whether to use GPU acceleration if available
        memory_fraction (float): Fraction of GPU memory to use (0.0 to 1.0)

    Returns:
        tuple: (segments, transcript) where segments is a list of dicts with timing info
    """
    audio_path = Path(audio_path)

    # Check cache first if enabled
    if use_cache and CACHE_AVAILABLE:
        cached_data = load_from_cache(audio_path, model, "transcribe", cache_max_age)
        if cached_data:
            logger.info(f"Using cached transcription for {audio_path}")
            return cached_data.get("segments", []), cached_data.get("transcript", "")

    # Extract audio if the input is a video file
    if audio_path.suffix.lower() in ['.mp4', '.avi', '.mov', '.mkv']:
        audio_path = extract_audio(audio_path)

    # Configure GPU if available and requested
    device = torch.device("cpu")
    if use_gpu and GPU_UTILS_AVAILABLE:
        gpu_config = configure_gpu(model, memory_fraction)
        device = gpu_config["device"]
    logger.info(f"Using device: {device} for transcription")

    # Load the specified Whisper model
    logger.info(f"Loading Whisper model: {model}")
    whisper_model = whisper.load_model(model, device=device if device.type != "mps" else "cpu")

    # Transcribe the audio
    logger.info(f"Transcribing audio: {audio_path}")
    result = whisper_model.transcribe(str(audio_path))

    # Extract the full transcript and segments
    transcript = result["text"]
    segments = result["segments"]

    # Cache the results if caching is enabled
    if use_cache and CACHE_AVAILABLE:
        cache_data = {
            "transcript": transcript,
            "segments": segments
        }
        save_to_cache(audio_path, cache_data, model, "transcribe")

    return segments, transcript
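
# Illustrative usage (a sketch, not part of the module itself): the return value pairs
# Whisper's timed segments with the flat transcript, e.g.
#
#     segments, transcript = transcribe_audio(Path("recordings/meeting.mp4"))
#     for seg in segments[:3]:
#         print(f"[{seg['start']:.1f}s - {seg['end']:.1f}s] {seg['text'].strip()}")
#
# "recordings/meeting.mp4" is a hypothetical example path.
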
def summarize_text(text, model=SUMMARIZATION_MODEL, use_gpu=True, memory_fraction=0.8):
    """
    Summarize text using a pre-trained transformer model with chunking.

    Args:
        text (str): Text to summarize
        model (str): Model to use for summarization
        use_gpu (bool): Whether to use GPU acceleration if available
        memory_fraction (float): Fraction of GPU memory to use (0.0 to 1.0)

    Returns:
        str: Summarized text
    """
    # Configure device
    device = torch.device("cpu")
    if use_gpu and GPU_UTILS_AVAILABLE:
        device = get_optimal_device()
    logger.info(f"Using device: {device} for summarization")

    # Initialize the pipeline with the specified device
    device_arg = -1 if device.type == "cpu" else 0  # -1 for CPU, 0 for the first GPU
    summarization_pipeline = pipeline("summarization", model=model, device=device_arg)
    tokenizer = AutoTokenizer.from_pretrained(model)
    max_tokens = 512
    tokens = tokenizer(text, return_tensors='pt')
    num_tokens = len(tokens['input_ids'][0])

    if num_tokens > max_tokens:
        chunks = chunk_text(text, max_tokens, tokenizer)
        summaries = []
        for i, chunk in enumerate(chunks):
            logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")
            summary_output = summarization_pipeline(
                "summarize: " + chunk,
                max_length=150,
                min_length=30,
                do_sample=False
            )
            summaries.append(summary_output[0]['summary_text'])
        overall_summary = " ".join(summaries)

        # If there was more than one chunk, summarize the combined chunk summaries again
        if len(summaries) > 1:
            logger.info("Generating final summary from chunk summaries")
            combined_text = " ".join(summaries)
            overall_summary = summarization_pipeline(
                "summarize: " + combined_text,
                max_length=150,
                min_length=30,
                do_sample=False
            )[0]['summary_text']
    else:
        overall_summary = summarization_pipeline(
            "summarize: " + text,
            max_length=150,
            min_length=30,
            do_sample=False
        )[0]['summary_text']
    return overall_summary
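
# Illustrative usage (sketch only): long transcripts are chunked to fit the 512-token
# window assumed above and each chunk is summarized with the T5-style "summarize: "
# prefix, e.g.
#
#     summary = summarize_text(transcript, model="t5-base", use_gpu=False)
#
# The call and its arguments here are hypothetical, shown only to clarify the flow.
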
def chunk_text(text, max_tokens, tokenizer=None):
    """
    Splits the text into a list of chunks based on token limits.

    Args:
        text (str): Text to chunk
        max_tokens (int): Maximum tokens per chunk
        tokenizer (AutoTokenizer, optional): Tokenizer to use

    Returns:
        list: List of text chunks
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        hypothetical_length = current_length + len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
        if hypothetical_length <= max_tokens:
            current_chunk.append(word)
            current_length = hypothetical_length
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
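
# Minimal end-to-end sketch (assumes a local input file "sample_talk.mp4", which is
# purely illustrative): transcribe the recording, then summarize the transcript.
if __name__ == "__main__":
    sample_path = Path("sample_talk.mp4")  # hypothetical example file
    segments, transcript = transcribe_audio(sample_path, model=WHISPER_MODEL)
    logger.info(f"Transcribed {len(segments)} segments ({len(transcript)} characters)")
    summary = summarize_text(transcript, model=SUMMARIZATION_MODEL)
    logger.info(f"Summary: {summary}")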