diff --git a/app.py b/app.py
index 4c28d7f..3e71331 100644
--- a/app.py
+++ b/app.py
@@ -113,9 +113,16 @@ def render_sidebar():
             index=["tiny", "base", "small", "medium", "large"].index(
                 st.session_state.transcription_model
             ),
-            help="Larger models are more accurate but slower.",
+            help="Larger models are more accurate but slower. "
+            "Memory: tiny ~75MB, base ~140MB, small ~460MB, medium ~1.5GB, large ~2.9GB",
             key="sb_whisper_model",
         )
+        if st.session_state.transcription_model in ("large", "large-v2", "large-v3") and not st.session_state.get("use_gpu", False):
+            st.warning(
+                "The **large** Whisper model requires ~2.9GB of memory. "
+                "Without GPU, this may crash the application. Consider using "
+                "**medium** or smaller, or enable GPU acceleration."
+            )
 
     summarization_options = (
         ["Hugging Face (Online)", "Ollama (Local)"]
@@ -407,109 +414,119 @@ def process_recording(file_path, sidebar_opts):
     results = {}
     start_time = time.time()
 
-    with st.status("Processing recording...", expanded=True) as status:
+    try:
+        with st.status("Processing recording...", expanded=True) as status:
 
-        # Step 1: Transcription
-        st.write(f"Transcribing with Whisper ({st.session_state.transcription_model} model)...")
-        t0 = time.time()
-
-        if st.session_state.use_diarization and DIARIZATION_AVAILABLE and sidebar_opts["hf_token"]:
-            num_spk = int(sidebar_opts["num_speakers"]) if sidebar_opts["num_speakers"] > 0 else None
-            segments, transcript = transcribe_with_diarization(
-                file_path,
-                whisper_model=st.session_state.transcription_model,
-                num_speakers=num_spk,
-                use_gpu=st.session_state.use_gpu,
-                hf_token=sidebar_opts["hf_token"],
-            )
-            results["diarized"] = True
-        elif st.session_state.use_translation and TRANSLATION_AVAILABLE:
-            st.write("Transcribing and translating...")
-            orig_seg, trans_seg, orig_text, trans_text = transcribe_and_translate(
-                file_path,
-                whisper_model=st.session_state.transcription_model,
-                target_lang=sidebar_opts["target_lang"],
-                use_gpu=st.session_state.use_gpu,
-            )
-            segments = trans_seg
-            transcript = trans_text
-            results["original_text"] = orig_text
-            results["original_segments"] = orig_seg
-            results["translated"] = True
-        else:
-            segments, transcript = transcribe_audio(
-                file_path,
-                model=st.session_state.transcription_model,
-                use_cache=st.session_state.use_cache,
-                use_gpu=st.session_state.use_gpu,
-                memory_fraction=st.session_state.memory_fraction,
-            )
-
-        transcription_time = time.time() - t0
-        st.write(f"Transcription complete ({transcription_time:.1f}s)")
-
-        if not transcript:
-            status.update(label="Processing failed", state="error")
-            return None
-
-        results["segments"] = segments
-        results["transcript"] = transcript
-
-        # Step 2: Keyword extraction
-        if st.session_state.use_keywords and KEYWORD_EXTRACTION_AVAILABLE:
-            st.write("Extracting keywords...")
+            # Step 1: Transcription
+            st.write(f"Transcribing with Whisper ({st.session_state.transcription_model} model)...")
             t0 = time.time()
-            kw_ts, ent_ts = extract_keywords_from_transcript(
-                transcript, segments,
-                max_keywords=sidebar_opts["max_keywords"],
-                use_gpu=st.session_state.use_gpu,
+
+            if st.session_state.use_diarization and DIARIZATION_AVAILABLE and sidebar_opts["hf_token"]:
+                num_spk = int(sidebar_opts["num_speakers"]) if sidebar_opts["num_speakers"] > 0 else None
+                segments, transcript = transcribe_with_diarization(
+                    file_path,
+                    whisper_model=st.session_state.transcription_model,
+                    num_speakers=num_spk,
+                    use_gpu=st.session_state.use_gpu,
+                    hf_token=sidebar_opts["hf_token"],
+                )
+                results["diarized"] = True
+            elif st.session_state.use_translation and TRANSLATION_AVAILABLE:
+                st.write("Transcribing and translating...")
+                orig_seg, trans_seg, orig_text, trans_text = transcribe_and_translate(
+                    file_path,
+                    whisper_model=st.session_state.transcription_model,
+                    target_lang=sidebar_opts["target_lang"],
+                    use_gpu=st.session_state.use_gpu,
+                )
+                segments = trans_seg
+                transcript = trans_text
+                results["original_text"] = orig_text
+                results["original_segments"] = orig_seg
+                results["translated"] = True
+            else:
+                segments, transcript = transcribe_audio(
+                    file_path,
+                    model=st.session_state.transcription_model,
+                    use_cache=st.session_state.use_cache,
+                    use_gpu=st.session_state.use_gpu,
+                    memory_fraction=st.session_state.memory_fraction,
+                )
+
+            transcription_time = time.time() - t0
+            st.write(f"Transcription complete ({transcription_time:.1f}s)")
+
+            if not transcript:
+                status.update(label="Processing failed", state="error")
+                return None
+
+            results["segments"] = segments
+            results["transcript"] = transcript
+
+            # Step 2: Keyword extraction
+            if st.session_state.use_keywords and KEYWORD_EXTRACTION_AVAILABLE:
+                st.write("Extracting keywords...")
+                t0 = time.time()
+                kw_ts, ent_ts = extract_keywords_from_transcript(
+                    transcript, segments,
+                    max_keywords=sidebar_opts["max_keywords"],
+                    use_gpu=st.session_state.use_gpu,
+                )
+                results["keyword_timestamps"] = kw_ts
+                results["entity_timestamps"] = ent_ts
+                results["keyword_index"] = generate_keyword_index(kw_ts, ent_ts)
+                results["interactive_transcript"] = generate_interactive_transcript(segments, kw_ts, ent_ts)
+                st.write(f"Keywords extracted ({time.time() - t0:.1f}s)")
+
+            # Step 3: Summarization
+            st.write("Generating summary...")
+            t0 = time.time()
+
+            use_ollama = (
+                OLLAMA_AVAILABLE
+                and st.session_state.summarization_method == "Ollama (Local)"
+                and sidebar_opts["ollama_model"]
             )
-            results["keyword_timestamps"] = kw_ts
-            results["entity_timestamps"] = ent_ts
-            results["keyword_index"] = generate_keyword_index(kw_ts, ent_ts)
-            results["interactive_transcript"] = generate_interactive_transcript(segments, kw_ts, ent_ts)
-            st.write(f"Keywords extracted ({time.time() - t0:.1f}s)")
-        # Step 3: Summarization
-        st.write("Generating summary...")
-        t0 = time.time()
-
-        use_ollama = (
-            OLLAMA_AVAILABLE
-            and st.session_state.summarization_method == "Ollama (Local)"
-            and sidebar_opts["ollama_model"]
-        )
-
-        if use_ollama:
-            summary = chunk_and_summarize(transcript, model=sidebar_opts["ollama_model"])
-            if not summary:
-                st.write("Ollama failed, falling back to Hugging Face...")
+            if use_ollama:
+                summary = chunk_and_summarize(transcript, model=sidebar_opts["ollama_model"])
+                if not summary:
+                    st.write("Ollama failed, falling back to Hugging Face...")
+                    summary = summarize_text(
+                        transcript,
+                        use_gpu=st.session_state.use_gpu,
+                        memory_fraction=st.session_state.memory_fraction,
+                    )
+                results["ollama_streaming"] = True
+            else:
                 summary = summarize_text(
                     transcript,
                     use_gpu=st.session_state.use_gpu,
                     memory_fraction=st.session_state.memory_fraction,
                 )
-            results["ollama_streaming"] = True
-        else:
-            summary = summarize_text(
-                transcript,
-                use_gpu=st.session_state.use_gpu,
-                memory_fraction=st.session_state.memory_fraction,
-            )
-        results["summary"] = summary
-        st.write(f"Summary generated ({time.time() - t0:.1f}s)")
+            results["summary"] = summary
+            st.write(f"Summary generated ({time.time() - t0:.1f}s)")
 
-        # Cleanup temp audio files
-        cleanup_temp_audio()
+            # Cleanup temp audio files
+            cleanup_temp_audio()
 
-        total_time = time.time() - start_time
-        results["processing_time"] = total_time
-        results["word_count"] = len(transcript.split())
+            total_time = time.time() - start_time
+            results["processing_time"] = total_time
+            results["word_count"] = len(transcript.split())
 
-        status.update(label=f"Complete in {total_time:.1f}s", state="complete")
+            status.update(label=f"Complete in {total_time:.1f}s", state="complete")
 
-    return results
+        return results
+
+    except MemoryError as e:
+        st.error(str(e))
+        logger.error(f"Out of memory: {e}")
+        return None
+    except Exception as e:
+        st.error(f"Processing error: {e}")
+        logger.error(f"Processing error: {e}", exc_info=True)
+        return None
 
 
 def render_results(results, sidebar_opts):
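Reviewer note: the stubbed sketch below is not part of the patch; it walks the new failure path end to end and runs without Streamlit or Whisper installed. `load_model` and `print` are stand-ins for `_load_whisper_model` and `st.error`.

```python
# Minimal sketch of the error path added above, with stand-ins for the real
# pieces: the loader raises MemoryError with a readable message, and
# process_recording converts it into a user-facing error plus a None return
# instead of crashing the app.

def load_model(name: str):
    # Stand-in for utils.transcription._load_whisper_model on a low-memory machine.
    raise MemoryError(f"Not enough memory to load Whisper '{name}' model (~2900MB).")

def process_recording(file_path: str):
    try:
        load_model("large")
        return {"transcript": "..."}
    except MemoryError as e:
        print(f"shown via st.error: {e}")  # app.py does st.error(str(e)) here
        return None

assert process_recording("talk.mp4") is None
```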
diff --git a/requirements.txt b/requirements.txt
index 6b28359..11c57dd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,7 +13,7 @@ humanize>=4.6.0
 # torchaudio >= 2.1.0 is REQUIRED for diarization to work properly
 
 # Transformers ecosystem
-transformers>=4.35.0
+transformers>=4.35.0,<5.0.0
 tokenizers>=0.14.0
 
 # ML dependencies - use flexible versions for compatibility
diff --git a/utils/audio_processing.py b/utils/audio_processing.py
index e9530f3..7da6527 100644
--- a/utils/audio_processing.py
+++ b/utils/audio_processing.py
@@ -19,7 +19,11 @@ def extract_audio(video_path: Path):
     audio = AudioFileClip(str(video_path))
     temp_dir = tempfile.mkdtemp(prefix="videotranscriber_")
     audio_path = Path(temp_dir) / f"{video_path.stem}_audio.wav"
-    audio.write_audiofile(str(audio_path), verbose=False, logger=None)
+    try:
+        audio.write_audiofile(str(audio_path), logger=None)
+    except TypeError:
+        # moviepy 1.x uses verbose parameter; moviepy 2.x removed it
+        audio.write_audiofile(str(audio_path), verbose=False, logger=None)
     audio.close()
     _temp_audio_files.append(str(audio_path))
     return audio_path
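The `try/except TypeError` in `extract_audio` is feature detection rather than version sniffing: it avoids parsing `moviepy.__version__` and keeps working if the signature changes again. A hypothetical call, assuming the repo's `utils` package is importable and using an illustrative file name:

```python
from pathlib import Path

from utils.audio_processing import extract_audio

# Works under both moviepy 1.x and 2.x; the WAV is written to a
# videotranscriber_* temp dir and tracked for later cleanup.
wav_path = extract_audio(Path("meeting.mp4"))
print(wav_path)
```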
results["processing_time"] = total_time - results["word_count"] = len(transcript.split()) + total_time = time.time() - start_time + results["processing_time"] = total_time + results["word_count"] = len(transcript.split()) - status.update(label=f"Complete in {total_time:.1f}s", state="complete") + status.update(label=f"Complete in {total_time:.1f}s", state="complete") - return results + return results + + except MemoryError as e: + st.error(str(e)) + logger.error(f"Out of memory: {e}") + return None + except Exception as e: + st.error(f"Processing error: {e}") + logger.error(f"Processing error: {e}", exc_info=True) + return None def render_results(results, sidebar_opts): diff --git a/requirements.txt b/requirements.txt index 6b28359..11c57dd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,7 @@ humanize>=4.6.0 # torchaudio >= 2.1.0 is REQUIRED for diarization to work properly # Transformers ecosystem -transformers>=4.35.0 +transformers>=4.35.0,<5.0.0 tokenizers>=0.14.0 # ML dependencies - use flexible versions for compatibility diff --git a/utils/audio_processing.py b/utils/audio_processing.py index e9530f3..7da6527 100644 --- a/utils/audio_processing.py +++ b/utils/audio_processing.py @@ -19,7 +19,11 @@ def extract_audio(video_path: Path): audio = AudioFileClip(str(video_path)) temp_dir = tempfile.mkdtemp(prefix="videotranscriber_") audio_path = Path(temp_dir) / f"{video_path.stem}_audio.wav" - audio.write_audiofile(str(audio_path), verbose=False, logger=None) + try: + audio.write_audiofile(str(audio_path), logger=None) + except TypeError: + # moviepy 1.x uses verbose parameter; moviepy 2.x removed it + audio.write_audiofile(str(audio_path), verbose=False, logger=None) audio.close() _temp_audio_files.append(str(audio_path)) return audio_path diff --git a/utils/transcription.py b/utils/transcription.py index 0c53b5f..82d9054 100644 --- a/utils/transcription.py +++ b/utils/transcription.py @@ -22,13 +22,33 @@ logger = logging.getLogger(__name__) WHISPER_MODEL = "base" +WHISPER_MODEL_SIZES = { + "tiny": 75, + "base": 140, + "small": 460, + "medium": 1500, + "large": 2900, + "large-v2": 2900, + "large-v3": 2900, +} + @st.cache_resource def _load_whisper_model(model_name, device_str): """Load and cache a Whisper model. Cached across reruns.""" logger.info(f"Loading Whisper model: {model_name} on {device_str}") device = torch.device(device_str) - return whisper.load_model(model_name, device=device if device.type != "mps" else "cpu") + try: + return whisper.load_model(model_name, device=device if device.type != "mps" else "cpu") + except (MemoryError, RuntimeError) as e: + err_str = str(e).lower() + if "out of memory" in err_str or "cannot allocate" in err_str or isinstance(e, MemoryError): + size_mb = WHISPER_MODEL_SIZES.get(model_name, "unknown") + raise MemoryError( + f"Not enough memory to load Whisper '{model_name}' model (~{size_mb}MB). " + f"Try a smaller model (tiny/base/small) or enable GPU acceleration." + ) from e + raise def transcribe_audio(audio_path: Path, model=WHISPER_MODEL, use_cache=True, cache_max_age=None,