Files
TalkEdit/transcribe.py

111 lines
3.5 KiB
Python
Raw Normal View History

2026-03-26 23:39:31 -06:00
#!/usr/bin/env python3
import ctypes
import json
import os
import subprocess
import sys
import tempfile

from faster_whisper import WhisperModel
def extract_audio(input_path, output_path):
    """Extract the audio track of input_path to a 16 kHz mono WAV at output_path.

    Args:
        input_path: Source media file (video or audio); anything ffmpeg can read.
        output_path: Destination WAV path; overwritten if it exists (-y).

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits non-zero (check=True).
        FileNotFoundError: if the ffmpeg binary is not on PATH.
    """
    cmd = [
        'ffmpeg',
        # Keep ffmpeg's banner/progress chatter off stderr so it does not
        # interleave with this script's own stderr status messages; genuine
        # errors still surface at the 'error' log level.
        '-hide_banner', '-loglevel', 'error',
        '-y',                 # overwrite output without prompting
        '-i', input_path,
        '-vn',                # drop any video stream
        '-ar', '16000',       # 16 kHz sample rate
        '-ac', '1',           # mono
        '-f', 'wav', output_path,
    ]
    subprocess.run(cmd, check=True)
def main():
    """CLI entry point: transcribe a media file with faster-whisper.

    Usage: transcribe.py <audio_file> <model_name> [language]

    The JSON result (words, segments, language) is the only thing written to
    stdout; all progress/diagnostic messages go to stderr so the output can be
    piped directly into a JSON consumer.
    """
    if len(sys.argv) < 3:
        print("Usage: python transcribe.py <audio_file> <model_name> [language]", file=sys.stderr)
        sys.exit(1)

    audio_file = sys.argv[1]
    model_name = sys.argv[2]
    # Optional language hint; None lets faster-whisper auto-detect.
    language = sys.argv[3] if len(sys.argv) > 3 else None

    # Fail fast with a readable message rather than an os.path.getsize traceback.
    if not os.path.isfile(audio_file):
        print(f"Error: input file not found: {audio_file}", file=sys.stderr)
        sys.exit(1)

    # Check file size - warn for very large files.
    file_size_mb = os.path.getsize(audio_file) / (1024 * 1024)
    if file_size_mb > 100:  # Warn for files over 100MB
        print(f"Warning: Large file detected ({file_size_mb:.1f}MB). Transcription may take a long time.", file=sys.stderr)
        print("Consider splitting long audio files into smaller segments for faster processing.", file=sys.stderr)

    # Reserve a temp WAV path; delete=False so ffmpeg can write it by name
    # (the handle is closed on leaving the 'with'). Removed in the finally.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        wav_path = tmp.name
    try:
        extract_audio(audio_file, wav_path)

        # Device selection: probe for the CUDA cuBLAS runtime. If the shared
        # library loads, assume a usable GPU (float16); otherwise CPU (int8).
        try:
            ctypes.CDLL("libcublas.so.12")
            device = "cuda"
            compute_type = "float16"
        except OSError:
            device = "cpu"
            compute_type = "int8"

        try:
            model = WhisperModel(model_name, device=device, compute_type=compute_type)
        except RuntimeError as e:
            # GPU exists but lacks VRAM for this model: retry once on CPU.
            if "out of memory" in str(e).lower() and device == "cuda":
                print("CUDA OOM, falling back to CPU (int8)", file=sys.stderr)
                device = "cpu"
                compute_type = "int8"
                model = WhisperModel(model_name, device=device, compute_type=compute_type)
            else:
                raise

        print(f"Starting transcription of {wav_path} with model {model_name}", file=sys.stderr)

        segments, info = model.transcribe(
            wav_path,
            language=language,
            beam_size=5,
            word_timestamps=True,
            vad_filter=True,
            vad_parameters=dict(threshold=0.5, min_speech_duration_ms=250),
            without_timestamps=False
        )

        print(f"Transcription completed. Detected language: {info.language}", file=sys.stderr)

        # Convert to our format: a flat word list plus per-segment word lists
        # (the same dicts are shared between the two).
        words = []
        segments_list = []
        for seg_id, segment in enumerate(segments):
            seg_words = []
            # segment.words can be None when word timestamps are unavailable
            # for a segment — guard so we emit the segment with empty words.
            for word in segment.words or ():
                w = {
                    "word": word.word,
                    "start": word.start,
                    "end": word.end,
                    "confidence": word.probability
                }
                words.append(w)
                seg_words.append(w)
            segments_list.append({
                "id": seg_id,
                "start": segment.start,
                "end": segment.end,
                "text": segment.text,
                "words": seg_words
            })

        result = {
            "words": words,
            "segments": segments_list,
            "language": info.language
        }
        print(json.dumps(result))
    finally:
        # Always remove the temp WAV; NamedTemporaryFile created it, so it
        # exists even if extract_audio failed partway.
        os.unlink(wav_path)
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()