#!/usr/bin/env python3
import json
import os
import subprocess
import sys
import tempfile

from faster_whisper import WhisperModel
def extract_audio(input_path, output_path):
    """Convert any ffmpeg-readable media file to a 16 kHz mono WAV.

    Args:
        input_path: source audio/video file.
        output_path: destination WAV path (overwritten if it exists).

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    ffmpeg_args = [
        'ffmpeg',
        '-y',            # overwrite output without prompting
        '-i', input_path,
        '-vn',           # drop any video stream
        '-ar', '16000',  # resample to 16 kHz
        '-ac', '1',      # downmix to mono
        '-f', 'wav',
        output_path,
    ]
    subprocess.run(ffmpeg_args, check=True)
def _pick_device():
    """Return ``(device, compute_type)`` for WhisperModel.

    Probes for the cuBLAS 12 runtime; if it loads we assume a usable CUDA
    GPU and pick float16, otherwise fall back to CPU with int8 quantization.
    """
    import ctypes
    try:
        ctypes.CDLL("libcublas.so.12")
        return "cuda", "float16"
    except OSError:
        return "cpu", "int8"


def _build_result(segments, info):
    """Flatten faster-whisper segments into the output dict.

    Returns a dict with a flat ``words`` list, a ``segments`` list (each
    segment carrying its own word list, sharing the same word dicts), and
    the detected ``language``.
    """
    words = []
    segments_list = []
    for segment in segments:
        seg_words = []
        # segment.words can be None in edge cases (e.g. VAD-trimmed
        # segments); treat that as an empty word list.
        for word in segment.words or []:
            w = {
                "word": word.word,
                "start": word.start,
                "end": word.end,
                "confidence": word.probability,
            }
            words.append(w)
            seg_words.append(w)
        segments_list.append({
            "id": len(segments_list),
            "start": segment.start,
            "end": segment.end,
            "text": segment.text,
            "words": seg_words,
        })
    return {
        "words": words,
        "segments": segments_list,
        "language": info.language,
    }


def main():
    """CLI entry point: transcribe a media file and print JSON to stdout.

    Usage: python transcribe.py <audio_file> <model_name> [language]

    Diagnostics and progress go to stderr; the final result (words,
    segments, detected language) is a single JSON document on stdout.
    Exits with status 1 on bad arguments or a missing input file.
    """
    if len(sys.argv) < 3:
        print("Usage: python transcribe.py <audio_file> <model_name> [language]", file=sys.stderr)
        sys.exit(1)

    audio_file = sys.argv[1]
    model_name = sys.argv[2]
    language = sys.argv[3] if len(sys.argv) > 3 else None

    # Fail early with a clear message instead of an os.path.getsize traceback.
    if not os.path.isfile(audio_file):
        print(f"Error: input file not found: {audio_file}", file=sys.stderr)
        sys.exit(1)

    # Check file size - warn for very large files
    file_size_mb = os.path.getsize(audio_file) / (1024 * 1024)
    if file_size_mb > 100:  # Warn for files over 100MB
        print(f"Warning: Large file detected ({file_size_mb:.1f}MB). Transcription may take a long time.", file=sys.stderr)
        print("Consider splitting long audio files into smaller segments for faster processing.", file=sys.stderr)

    # Extract audio to a temp WAV. delete=False so ffmpeg can write to the
    # path after the handle is closed (required on Windows); cleanup happens
    # in the finally block below.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        wav_path = tmp.name

    try:
        extract_audio(audio_file, wav_path)

        # Load model - use GPU if CUDA is available, else CPU with int8.
        device, compute_type = _pick_device()
        model = WhisperModel(model_name, device=device, compute_type=compute_type)

        # Transcribe with progress reporting.
        print(f"Starting transcription of {wav_path} with model {model_name}", file=sys.stderr)
        segments, info = model.transcribe(
            wav_path,
            language=language,
            beam_size=5,
            word_timestamps=True,
            vad_filter=True,
            vad_parameters=dict(threshold=0.5, min_speech_duration_ms=250),
            without_timestamps=False
        )
        print(f"Transcription completed. Detected language: {info.language}", file=sys.stderr)

        print(json.dumps(_build_result(segments, info)))
    finally:
        # Best-effort cleanup; don't mask an in-flight exception if the
        # temp file was never created or is already gone.
        try:
            os.unlink(wav_path)
        except OSError:
            pass
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()