#!/usr/bin/env python3
"""Transcribe an audio/video file with faster-whisper, emitting word-level JSON.

Usage: python transcribe.py <audio_file> <model_name> [language]

Output (stdout) is a single JSON object:
    {"words": [...], "segments": [...], "language": "<detected or forced lang>"}
where each word carries start/end timestamps and a confidence (probability).
"""
import ctypes
import json
import os
import subprocess
import sys
import tempfile

from faster_whisper import WhisperModel


def extract_audio(input_path, output_path):
    """Extract the audio track of *input_path* to a 16 kHz mono WAV at *output_path*.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits non-zero (check=True).
    """
    cmd = [
        'ffmpeg', '-y',
        '-i', input_path,
        '-vn',            # drop any video stream
        '-ar', '16000',   # 16 kHz sample rate — what Whisper models expect
        '-ac', '1',       # downmix to mono
        '-f', 'wav',
        output_path,
    ]
    subprocess.run(cmd, check=True)


def _pick_device():
    """Return ``(device, compute_type)`` for WhisperModel.

    Probes for CUDA by attempting to load cuBLAS 12; falls back to CPU with
    int8 quantization when the library is absent.
    """
    try:
        ctypes.CDLL("libcublas.so.12")
        return "cuda", "float16"
    except OSError:
        return "cpu", "int8"


def main():
    if len(sys.argv) < 3:
        # BUG FIX: original usage string omitted the two required arguments.
        print("Usage: python transcribe.py <audio_file> <model_name> [language]",
              file=sys.stderr)
        sys.exit(1)

    audio_file = sys.argv[1]
    model_name = sys.argv[2]
    language = sys.argv[3] if len(sys.argv) > 3 else None  # None => auto-detect

    # Create a named temp WAV; delete=False so ffmpeg can reopen the path by
    # name after this handle is closed. Cleaned up in the finally block below.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        wav_path = tmp.name

    try:
        extract_audio(audio_file, wav_path)

        device, compute_type = _pick_device()
        model = WhisperModel(model_name, device=device, compute_type=compute_type)

        segments, info = model.transcribe(
            wav_path,
            language=language,
            beam_size=5,
            word_timestamps=True,
            vad_filter=True,
            vad_parameters=dict(threshold=0.5, min_speech_duration_ms=250),
            without_timestamps=False,
        )

        # Flatten into one global word list plus a per-segment structure that
        # shares the same word dicts.
        words = []
        segments_list = []
        for segment in segments:
            seg_words = []
            # segment.words can be None when word timestamps are unavailable
            # for a segment — guard instead of crashing.
            for word in segment.words or []:
                w = {
                    "word": word.word,
                    "start": word.start,
                    "end": word.end,
                    "confidence": word.probability,
                }
                words.append(w)
                seg_words.append(w)
            segments_list.append({
                "id": len(segments_list),
                "start": segment.start,
                "end": segment.end,
                "text": segment.text,
                "words": seg_words,
            })

        result = {
            "words": words,
            "segments": segments_list,
            "language": info.language,
        }
        print(json.dumps(result))
    finally:
        # Best-effort cleanup: never let a failed unlink mask the real error.
        try:
            os.unlink(wav_path)
        except OSError:
            pass


if __name__ == "__main__":
    main()