fix: Resolve issues #7, #8, #9 - moviepy, transformers, Whisper OOM

Issue #7: Handle moviepy 2.x removing the `verbose` parameter from `write_audiofile`. Issue #8: Pin `transformers<5.0.0` to fix the summarization pipeline task registry. Issue #9: Add Whisper model memory warnings and out-of-memory (OOM) error handling.
2026-03-03 02:10:52 -05:00
parent 70c5d32413
commit d1e1fedcae
4 changed files with 134 additions and 93 deletions
--- a/app.py
+++ b/app.py
@ -113,9 +113,16 @@ def render_sidebar():
            index=["tiny", "base", "small", "medium", "large"].index(
                st.session_state.transcription_model
            ),
-            help="Larger models are more accurate but slower.",
+            help="Larger models are more accurate but slower. "
+                 "Memory: tiny ~75MB, base ~140MB, small ~460MB, medium ~1.5GB, large ~2.9GB",
            key="sb_whisper_model",
        )
+        if st.session_state.transcription_model in ("large", "large-v2", "large-v3") and not st.session_state.get("use_gpu", False):
+            st.warning(
+                "The **large** Whisper model requires ~2.9GB of memory. "
+                "Without GPU, this may crash the application. Consider using "
+                "**medium** or smaller, or enable GPU acceleration."
+            )

        summarization_options = (
            ["Hugging Face (Online)", "Ollama (Local)"]
@ -407,6 +414,7 @@ def process_recording(file_path, sidebar_opts):
    results = {}
    start_time = time.time()

+    try:
        with st.status("Processing recording...", expanded=True) as status:

            # Step 1: Transcription
@ -511,6 +519,15 @@ def process_recording(file_path, sidebar_opts):

        return results

+    except MemoryError as e:
+        st.error(str(e))
+        logger.error(f"Out of memory: {e}")
+        return None
+    except Exception as e:
+        st.error(f"Processing error: {e}")
+        logger.error(f"Processing error: {e}", exc_info=True)
+        return None
+

 def render_results(results, sidebar_opts):
    """Display processing results with metrics, tabs, and export options."""
--- a/requirements.txt
+++ b/requirements.txt
@ -13,7 +13,7 @@ humanize>=4.6.0
 # torchaudio >= 2.1.0 is REQUIRED for diarization to work properly

 # Transformers ecosystem
-transformers>=4.35.0
+transformers>=4.35.0,<5.0.0
 tokenizers>=0.14.0

 # ML dependencies - use flexible versions for compatibility
--- a/utils/audio_processing.py
+++ b/utils/audio_processing.py
@ -19,6 +19,10 @@ def extract_audio(video_path: Path):
        audio = AudioFileClip(str(video_path))
        temp_dir = tempfile.mkdtemp(prefix="videotranscriber_")
        audio_path = Path(temp_dir) / f"{video_path.stem}_audio.wav"
+        try:
+            audio.write_audiofile(str(audio_path), logger=None)
+        except TypeError:
+            # moviepy 1.x uses verbose parameter; moviepy 2.x removed it
            audio.write_audiofile(str(audio_path), verbose=False, logger=None)
        audio.close()
        _temp_audio_files.append(str(audio_path))
--- a/utils/transcription.py
+++ b/utils/transcription.py
@ -22,13 +22,33 @@ logger = logging.getLogger(__name__)

 WHISPER_MODEL = "base"

+WHISPER_MODEL_SIZES = {
+    "tiny": 75,
+    "base": 140,
+    "small": 460,
+    "medium": 1500,
+    "large": 2900,
+    "large-v2": 2900,
+    "large-v3": 2900,
+}
+

@st.cache_resource
 def _load_whisper_model(model_name, device_str):
    """Load and cache a Whisper model. Cached across reruns."""
    logger.info(f"Loading Whisper model: {model_name} on {device_str}")
    device = torch.device(device_str)
+    try:
        return whisper.load_model(model_name, device=device if device.type != "mps" else "cpu")
+    except (MemoryError, RuntimeError) as e:
+        err_str = str(e).lower()
+        if "out of memory" in err_str or "cannot allocate" in err_str or isinstance(e, MemoryError):
+            size_mb = WHISPER_MODEL_SIZES.get(model_name, "unknown")
+            raise MemoryError(
+                f"Not enough memory to load Whisper '{model_name}' model (~{size_mb}MB). "
+                f"Try a smaller model (tiny/base/small) or enable GPU acceleration."
+            ) from e
+        raise


 def transcribe_audio(audio_path: Path, model=WHISPER_MODEL, use_cache=True, cache_max_age=None,