Issue #7: Handle moviepy 2.x removing the verbose param from write_audiofile
Issue #8: Pin transformers<5.0.0 to fix the summarization pipeline task registry
Issue #9: Add Whisper model memory warnings and OOM error handling
This commit is contained in:
197
app.py
197
app.py
@ -113,9 +113,16 @@ def render_sidebar():
|
|||||||
index=["tiny", "base", "small", "medium", "large"].index(
|
index=["tiny", "base", "small", "medium", "large"].index(
|
||||||
st.session_state.transcription_model
|
st.session_state.transcription_model
|
||||||
),
|
),
|
||||||
help="Larger models are more accurate but slower.",
|
help="Larger models are more accurate but slower. "
|
||||||
|
"Memory: tiny ~75MB, base ~140MB, small ~460MB, medium ~1.5GB, large ~2.9GB",
|
||||||
key="sb_whisper_model",
|
key="sb_whisper_model",
|
||||||
)
|
)
|
||||||
|
if st.session_state.transcription_model in ("large", "large-v2", "large-v3") and not st.session_state.get("use_gpu", False):
|
||||||
|
st.warning(
|
||||||
|
"The **large** Whisper model requires ~2.9GB of memory. "
|
||||||
|
"Without GPU, this may crash the application. Consider using "
|
||||||
|
"**medium** or smaller, or enable GPU acceleration."
|
||||||
|
)
|
||||||
|
|
||||||
summarization_options = (
|
summarization_options = (
|
||||||
["Hugging Face (Online)", "Ollama (Local)"]
|
["Hugging Face (Online)", "Ollama (Local)"]
|
||||||
@ -407,109 +414,119 @@ def process_recording(file_path, sidebar_opts):
|
|||||||
results = {}
|
results = {}
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
with st.status("Processing recording...", expanded=True) as status:
|
try:
|
||||||
|
with st.status("Processing recording...", expanded=True) as status:
|
||||||
|
|
||||||
# Step 1: Transcription
|
# Step 1: Transcription
|
||||||
st.write(f"Transcribing with Whisper ({st.session_state.transcription_model} model)...")
|
st.write(f"Transcribing with Whisper ({st.session_state.transcription_model} model)...")
|
||||||
t0 = time.time()
|
|
||||||
|
|
||||||
if st.session_state.use_diarization and DIARIZATION_AVAILABLE and sidebar_opts["hf_token"]:
|
|
||||||
num_spk = int(sidebar_opts["num_speakers"]) if sidebar_opts["num_speakers"] > 0 else None
|
|
||||||
segments, transcript = transcribe_with_diarization(
|
|
||||||
file_path,
|
|
||||||
whisper_model=st.session_state.transcription_model,
|
|
||||||
num_speakers=num_spk,
|
|
||||||
use_gpu=st.session_state.use_gpu,
|
|
||||||
hf_token=sidebar_opts["hf_token"],
|
|
||||||
)
|
|
||||||
results["diarized"] = True
|
|
||||||
elif st.session_state.use_translation and TRANSLATION_AVAILABLE:
|
|
||||||
st.write("Transcribing and translating...")
|
|
||||||
orig_seg, trans_seg, orig_text, trans_text = transcribe_and_translate(
|
|
||||||
file_path,
|
|
||||||
whisper_model=st.session_state.transcription_model,
|
|
||||||
target_lang=sidebar_opts["target_lang"],
|
|
||||||
use_gpu=st.session_state.use_gpu,
|
|
||||||
)
|
|
||||||
segments = trans_seg
|
|
||||||
transcript = trans_text
|
|
||||||
results["original_text"] = orig_text
|
|
||||||
results["original_segments"] = orig_seg
|
|
||||||
results["translated"] = True
|
|
||||||
else:
|
|
||||||
segments, transcript = transcribe_audio(
|
|
||||||
file_path,
|
|
||||||
model=st.session_state.transcription_model,
|
|
||||||
use_cache=st.session_state.use_cache,
|
|
||||||
use_gpu=st.session_state.use_gpu,
|
|
||||||
memory_fraction=st.session_state.memory_fraction,
|
|
||||||
)
|
|
||||||
|
|
||||||
transcription_time = time.time() - t0
|
|
||||||
st.write(f"Transcription complete ({transcription_time:.1f}s)")
|
|
||||||
|
|
||||||
if not transcript:
|
|
||||||
status.update(label="Processing failed", state="error")
|
|
||||||
return None
|
|
||||||
|
|
||||||
results["segments"] = segments
|
|
||||||
results["transcript"] = transcript
|
|
||||||
|
|
||||||
# Step 2: Keyword extraction
|
|
||||||
if st.session_state.use_keywords and KEYWORD_EXTRACTION_AVAILABLE:
|
|
||||||
st.write("Extracting keywords...")
|
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
kw_ts, ent_ts = extract_keywords_from_transcript(
|
|
||||||
transcript, segments,
|
if st.session_state.use_diarization and DIARIZATION_AVAILABLE and sidebar_opts["hf_token"]:
|
||||||
max_keywords=sidebar_opts["max_keywords"],
|
num_spk = int(sidebar_opts["num_speakers"]) if sidebar_opts["num_speakers"] > 0 else None
|
||||||
use_gpu=st.session_state.use_gpu,
|
segments, transcript = transcribe_with_diarization(
|
||||||
|
file_path,
|
||||||
|
whisper_model=st.session_state.transcription_model,
|
||||||
|
num_speakers=num_spk,
|
||||||
|
use_gpu=st.session_state.use_gpu,
|
||||||
|
hf_token=sidebar_opts["hf_token"],
|
||||||
|
)
|
||||||
|
results["diarized"] = True
|
||||||
|
elif st.session_state.use_translation and TRANSLATION_AVAILABLE:
|
||||||
|
st.write("Transcribing and translating...")
|
||||||
|
orig_seg, trans_seg, orig_text, trans_text = transcribe_and_translate(
|
||||||
|
file_path,
|
||||||
|
whisper_model=st.session_state.transcription_model,
|
||||||
|
target_lang=sidebar_opts["target_lang"],
|
||||||
|
use_gpu=st.session_state.use_gpu,
|
||||||
|
)
|
||||||
|
segments = trans_seg
|
||||||
|
transcript = trans_text
|
||||||
|
results["original_text"] = orig_text
|
||||||
|
results["original_segments"] = orig_seg
|
||||||
|
results["translated"] = True
|
||||||
|
else:
|
||||||
|
segments, transcript = transcribe_audio(
|
||||||
|
file_path,
|
||||||
|
model=st.session_state.transcription_model,
|
||||||
|
use_cache=st.session_state.use_cache,
|
||||||
|
use_gpu=st.session_state.use_gpu,
|
||||||
|
memory_fraction=st.session_state.memory_fraction,
|
||||||
|
)
|
||||||
|
|
||||||
|
transcription_time = time.time() - t0
|
||||||
|
st.write(f"Transcription complete ({transcription_time:.1f}s)")
|
||||||
|
|
||||||
|
if not transcript:
|
||||||
|
status.update(label="Processing failed", state="error")
|
||||||
|
return None
|
||||||
|
|
||||||
|
results["segments"] = segments
|
||||||
|
results["transcript"] = transcript
|
||||||
|
|
||||||
|
# Step 2: Keyword extraction
|
||||||
|
if st.session_state.use_keywords and KEYWORD_EXTRACTION_AVAILABLE:
|
||||||
|
st.write("Extracting keywords...")
|
||||||
|
t0 = time.time()
|
||||||
|
kw_ts, ent_ts = extract_keywords_from_transcript(
|
||||||
|
transcript, segments,
|
||||||
|
max_keywords=sidebar_opts["max_keywords"],
|
||||||
|
use_gpu=st.session_state.use_gpu,
|
||||||
|
)
|
||||||
|
results["keyword_timestamps"] = kw_ts
|
||||||
|
results["entity_timestamps"] = ent_ts
|
||||||
|
results["keyword_index"] = generate_keyword_index(kw_ts, ent_ts)
|
||||||
|
results["interactive_transcript"] = generate_interactive_transcript(segments, kw_ts, ent_ts)
|
||||||
|
st.write(f"Keywords extracted ({time.time() - t0:.1f}s)")
|
||||||
|
|
||||||
|
# Step 3: Summarization
|
||||||
|
st.write("Generating summary...")
|
||||||
|
t0 = time.time()
|
||||||
|
|
||||||
|
use_ollama = (
|
||||||
|
OLLAMA_AVAILABLE
|
||||||
|
and st.session_state.summarization_method == "Ollama (Local)"
|
||||||
|
and sidebar_opts["ollama_model"]
|
||||||
)
|
)
|
||||||
results["keyword_timestamps"] = kw_ts
|
|
||||||
results["entity_timestamps"] = ent_ts
|
|
||||||
results["keyword_index"] = generate_keyword_index(kw_ts, ent_ts)
|
|
||||||
results["interactive_transcript"] = generate_interactive_transcript(segments, kw_ts, ent_ts)
|
|
||||||
st.write(f"Keywords extracted ({time.time() - t0:.1f}s)")
|
|
||||||
|
|
||||||
# Step 3: Summarization
|
if use_ollama:
|
||||||
st.write("Generating summary...")
|
summary = chunk_and_summarize(transcript, model=sidebar_opts["ollama_model"])
|
||||||
t0 = time.time()
|
if not summary:
|
||||||
|
st.write("Ollama failed, falling back to Hugging Face...")
|
||||||
use_ollama = (
|
summary = summarize_text(
|
||||||
OLLAMA_AVAILABLE
|
transcript,
|
||||||
and st.session_state.summarization_method == "Ollama (Local)"
|
use_gpu=st.session_state.use_gpu,
|
||||||
and sidebar_opts["ollama_model"]
|
memory_fraction=st.session_state.memory_fraction,
|
||||||
)
|
)
|
||||||
|
results["ollama_streaming"] = True
|
||||||
if use_ollama:
|
else:
|
||||||
summary = chunk_and_summarize(transcript, model=sidebar_opts["ollama_model"])
|
|
||||||
if not summary:
|
|
||||||
st.write("Ollama failed, falling back to Hugging Face...")
|
|
||||||
summary = summarize_text(
|
summary = summarize_text(
|
||||||
transcript,
|
transcript,
|
||||||
use_gpu=st.session_state.use_gpu,
|
use_gpu=st.session_state.use_gpu,
|
||||||
memory_fraction=st.session_state.memory_fraction,
|
memory_fraction=st.session_state.memory_fraction,
|
||||||
)
|
)
|
||||||
results["ollama_streaming"] = True
|
|
||||||
else:
|
|
||||||
summary = summarize_text(
|
|
||||||
transcript,
|
|
||||||
use_gpu=st.session_state.use_gpu,
|
|
||||||
memory_fraction=st.session_state.memory_fraction,
|
|
||||||
)
|
|
||||||
|
|
||||||
results["summary"] = summary
|
results["summary"] = summary
|
||||||
st.write(f"Summary generated ({time.time() - t0:.1f}s)")
|
st.write(f"Summary generated ({time.time() - t0:.1f}s)")
|
||||||
|
|
||||||
# Cleanup temp audio files
|
# Cleanup temp audio files
|
||||||
cleanup_temp_audio()
|
cleanup_temp_audio()
|
||||||
|
|
||||||
total_time = time.time() - start_time
|
total_time = time.time() - start_time
|
||||||
results["processing_time"] = total_time
|
results["processing_time"] = total_time
|
||||||
results["word_count"] = len(transcript.split())
|
results["word_count"] = len(transcript.split())
|
||||||
|
|
||||||
status.update(label=f"Complete in {total_time:.1f}s", state="complete")
|
status.update(label=f"Complete in {total_time:.1f}s", state="complete")
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
except MemoryError as e:
|
||||||
|
st.error(str(e))
|
||||||
|
logger.error(f"Out of memory: {e}")
|
||||||
|
return None
|
||||||
|
except Exception as e:
|
||||||
|
st.error(f"Processing error: {e}")
|
||||||
|
logger.error(f"Processing error: {e}", exc_info=True)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def render_results(results, sidebar_opts):
|
def render_results(results, sidebar_opts):
|
||||||
|
|||||||
@ -13,7 +13,7 @@ humanize>=4.6.0
|
|||||||
# torchaudio >= 2.1.0 is REQUIRED for diarization to work properly
|
# torchaudio >= 2.1.0 is REQUIRED for diarization to work properly
|
||||||
|
|
||||||
# Transformers ecosystem
|
# Transformers ecosystem
|
||||||
transformers>=4.35.0
|
transformers>=4.35.0,<5.0.0
|
||||||
tokenizers>=0.14.0
|
tokenizers>=0.14.0
|
||||||
|
|
||||||
# ML dependencies - use flexible versions for compatibility
|
# ML dependencies - use flexible versions for compatibility
|
||||||
|
|||||||
@ -19,7 +19,11 @@ def extract_audio(video_path: Path):
|
|||||||
audio = AudioFileClip(str(video_path))
|
audio = AudioFileClip(str(video_path))
|
||||||
temp_dir = tempfile.mkdtemp(prefix="videotranscriber_")
|
temp_dir = tempfile.mkdtemp(prefix="videotranscriber_")
|
||||||
audio_path = Path(temp_dir) / f"{video_path.stem}_audio.wav"
|
audio_path = Path(temp_dir) / f"{video_path.stem}_audio.wav"
|
||||||
audio.write_audiofile(str(audio_path), verbose=False, logger=None)
|
try:
|
||||||
|
audio.write_audiofile(str(audio_path), logger=None)
|
||||||
|
except TypeError:
|
||||||
|
# moviepy 1.x uses verbose parameter; moviepy 2.x removed it
|
||||||
|
audio.write_audiofile(str(audio_path), verbose=False, logger=None)
|
||||||
audio.close()
|
audio.close()
|
||||||
_temp_audio_files.append(str(audio_path))
|
_temp_audio_files.append(str(audio_path))
|
||||||
return audio_path
|
return audio_path
|
||||||
|
|||||||
@ -22,13 +22,33 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
WHISPER_MODEL = "base"
|
WHISPER_MODEL = "base"
|
||||||
|
|
||||||
|
WHISPER_MODEL_SIZES = {
|
||||||
|
"tiny": 75,
|
||||||
|
"base": 140,
|
||||||
|
"small": 460,
|
||||||
|
"medium": 1500,
|
||||||
|
"large": 2900,
|
||||||
|
"large-v2": 2900,
|
||||||
|
"large-v3": 2900,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@st.cache_resource
|
@st.cache_resource
|
||||||
def _load_whisper_model(model_name, device_str):
|
def _load_whisper_model(model_name, device_str):
|
||||||
"""Load and cache a Whisper model. Cached across reruns."""
|
"""Load and cache a Whisper model. Cached across reruns."""
|
||||||
logger.info(f"Loading Whisper model: {model_name} on {device_str}")
|
logger.info(f"Loading Whisper model: {model_name} on {device_str}")
|
||||||
device = torch.device(device_str)
|
device = torch.device(device_str)
|
||||||
return whisper.load_model(model_name, device=device if device.type != "mps" else "cpu")
|
try:
|
||||||
|
return whisper.load_model(model_name, device=device if device.type != "mps" else "cpu")
|
||||||
|
except (MemoryError, RuntimeError) as e:
|
||||||
|
err_str = str(e).lower()
|
||||||
|
if "out of memory" in err_str or "cannot allocate" in err_str or isinstance(e, MemoryError):
|
||||||
|
size_mb = WHISPER_MODEL_SIZES.get(model_name, "unknown")
|
||||||
|
raise MemoryError(
|
||||||
|
f"Not enough memory to load Whisper '{model_name}' model (~{size_mb}MB). "
|
||||||
|
f"Try a smaller model (tiny/base/small) or enable GPU acceleration."
|
||||||
|
) from e
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
def transcribe_audio(audio_path: Path, model=WHISPER_MODEL, use_cache=True, cache_max_age=None,
|
def transcribe_audio(audio_path: Path, model=WHISPER_MODEL, use_cache=True, cache_max_age=None,
|
||||||
|
|||||||
Reference in New Issue
Block a user