#!/usr/bin/env python3
"""Transcribe an audio/video file with faster-whisper.

The input is first converted to a 16 kHz mono WAV with ffmpeg, then
transcribed. The result (flat word list, segment list, detected language)
is printed to stdout as a single JSON document; all diagnostics go to
stderr so stdout stays machine-readable.
"""
import ctypes
import json
import os
import subprocess
import sys
import tempfile

from faster_whisper import WhisperModel

# Input files larger than this (in MB) trigger a "this may be slow" warning.
_LARGE_FILE_WARN_MB = 100


def extract_audio(input_path, output_path):
    """Extract audio from video/audio file to 16kHz mono WAV.

    Args:
        input_path: Path of the source media file handed to ffmpeg.
        output_path: Path of the WAV file to write (overwritten if present).

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    cmd = [
        'ffmpeg', '-y', '-i', input_path,
        '-vn', '-ar', '16000', '-ac', '1',
        '-f', 'wav', output_path,
    ]
    # Detach stdin so ffmpeg cannot consume input intended for this script
    # (or block on it) when the script is run as a child process.
    subprocess.run(cmd, check=True, stdin=subprocess.DEVNULL)


def _select_device():
    """Return (device, compute_type) depending on whether cuBLAS 12 loads."""
    try:
        ctypes.CDLL("libcublas.so.12")
    except OSError:
        return "cpu", "int8"
    return "cuda", "float16"


def _load_model(model_name):
    """Load the Whisper model, retrying on CPU (int8) after a CUDA OOM."""
    device, compute_type = _select_device()
    try:
        return WhisperModel(model_name, device=device, compute_type=compute_type)
    except RuntimeError as e:
        if device == "cuda" and "out of memory" in str(e).lower():
            print("CUDA OOM, falling back to CPU (int8)", file=sys.stderr)
            return WhisperModel(model_name, device="cpu", compute_type="int8")
        raise


def _build_result(segments, language):
    """Consume the segment iterable and build the JSON-serializable result."""
    words = []
    segments_list = []
    for segment in segments:
        seg_words = []
        # Guard against a segment that carries no word-level data.
        for word in segment.words or []:
            w = {
                "word": word.word,
                "start": word.start,
                "end": word.end,
                "confidence": word.probability,
            }
            words.append(w)
            seg_words.append(w)
        segments_list.append({
            "id": len(segments_list),
            "start": segment.start,
            "end": segment.end,
            "text": segment.text,
            "words": seg_words,
        })
    return {
        "words": words,
        "segments": segments_list,
        "language": language,
    }


def main():
    """Command-line entry point.

    Reads ``<audio_file> <model_name> [language]`` from ``sys.argv``, writes
    the transcription as JSON to stdout and progress/warnings to stderr.
    Exits with status 1 on a usage error or an unreadable input file.
    """
    if len(sys.argv) < 3:
        print(
            "Usage: python transcribe.py <audio_file> <model_name> [language]",
            file=sys.stderr,
        )
        sys.exit(1)

    audio_file = sys.argv[1]
    model_name = sys.argv[2]
    language = sys.argv[3] if len(sys.argv) > 3 else None

    # Check file size - warn for very large files
    try:
        file_size_mb = os.path.getsize(audio_file) / (1024 * 1024)
    except OSError as e:
        print(f"Error: cannot read input file {audio_file}: {e}", file=sys.stderr)
        sys.exit(1)
    if file_size_mb > _LARGE_FILE_WARN_MB:
        print(f"Warning: Large file detected ({file_size_mb:.1f}MB). Transcription may take a long time.", file=sys.stderr)
        print("Consider splitting long audio files into smaller segments for faster processing.", file=sys.stderr)

    # Reserve a temp path for the WAV; the file is removed in `finally` below.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        wav_path = tmp.name

    try:
        extract_audio(audio_file, wav_path)

        # Load model - use GPU if CUDA is available, else CPU with int8
        model = _load_model(model_name)

        print(f"Starting transcription of {wav_path} with model {model_name}", file=sys.stderr)
        segments, info = model.transcribe(
            wav_path,
            language=language,
            beam_size=5,
            word_timestamps=True,
            vad_filter=True,
            vad_parameters=dict(threshold=0.5, min_speech_duration_ms=250),
            without_timestamps=False,
        )

        # `segments` is lazy: decoding happens while it is iterated, so the
        # completion message is only emitted after the result has been built.
        result = _build_result(segments, info.language)
        print(f"Transcription completed. Detected language: {info.language}", file=sys.stderr)

        print(json.dumps(result))
    finally:
        os.unlink(wav_path)


if __name__ == "__main__":
    main()