added api for ai; got backend working
This commit is contained in:
91
transcribe.py
Normal file
91
transcribe.py
Normal file
@ -0,0 +1,91 @@
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import json
|
||||
import tempfile
|
||||
import subprocess
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
def extract_audio(input_path, output_path):
    """Convert a media file into a 16 kHz mono WAV using ffmpeg.

    Args:
        input_path: Path of the source audio or video file.
        output_path: Path the WAV file is written to; ``-y`` makes ffmpeg
            overwrite it if it already exists.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status
            (``check=True``).
    """
    # Input side of the command line: overwrite without prompting, read input.
    ffmpeg_args = ['ffmpeg', '-y', '-i', input_path]
    # Output side: drop video (-vn), 16000 Hz sample rate, 1 channel, WAV.
    ffmpeg_args += ['-vn', '-ar', '16000', '-ac', '1', '-f', 'wav', output_path]
    subprocess.run(ffmpeg_args, check=True)
|
||||
|
||||
def _load_model(model_name):
    """Load a WhisperModel on the GPU when possible, otherwise on the CPU.

    GPU mode uses ``float16``; CPU mode uses ``int8``. The GPU is attempted
    only when ``libcublas.so.12`` can be loaded.
    """
    import ctypes

    try:
        ctypes.CDLL("libcublas.so.12")
    except OSError:
        return WhisperModel(model_name, device="cpu", compute_type="int8")

    # A loadable cuBLAS library does not prove that a usable CUDA device is
    # present, so a failed GPU load is retried on the CPU instead of aborting.
    # NOTE(review): the exception types are assumed from CTranslate2's
    # behaviour - confirm. If the failure was not GPU related (for example an
    # unknown model name), the CPU attempt raises it again.
    try:
        return WhisperModel(model_name, device="cuda", compute_type="float16")
    except (RuntimeError, ValueError) as exc:
        print(f"GPU model load failed ({exc}); falling back to CPU", file=sys.stderr)
        return WhisperModel(model_name, device="cpu", compute_type="int8")


def _collect_segments(segments):
    """Turn transcription segments into ``(words, segments_list)`` of dicts.

    ``words`` is the flat list of every word; ``segments_list`` holds one dict
    per segment, each carrying its own ``words`` list.
    """
    words = []
    segments_list = []
    for index, segment in enumerate(segments):
        # Tolerate a segment without word-level data instead of crashing.
        seg_words = [
            {
                "word": word.word,
                "start": word.start,
                "end": word.end,
                "confidence": word.probability,
            }
            for word in (segment.words or ())
        ]
        words.extend(seg_words)
        segments_list.append({
            "id": index,
            "start": segment.start,
            "end": segment.end,
            "text": segment.text,
            "words": seg_words,
        })
    return words, segments_list


def main():
    """Transcribe a media file and print the result as JSON on stdout.

    Command line: ``transcribe.py <audio_file> <model_name> [language]``.
    The printed JSON object has the keys ``words``, ``segments`` and
    ``language``. An omitted or empty ``language`` is passed to the model as
    ``None``.

    Exits with status 1 and a usage message on stderr when fewer than two
    arguments are given.
    """
    import os

    if len(sys.argv) < 3:
        print("Usage: python transcribe.py <audio_file> <model_name> [language]", file=sys.stderr)
        sys.exit(1)

    audio_file = sys.argv[1]
    model_name = sys.argv[2]
    # Treat an empty third argument like a missing one.
    language = (sys.argv[3] or None) if len(sys.argv) > 3 else None

    # Reserve a temp path for the WAV; delete=False because ffmpeg writes to
    # it after this handle is closed, and it is removed in the finally below.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        wav_path = tmp.name

    try:
        extract_audio(audio_file, wav_path)

        model = _load_model(model_name)

        segments, info = model.transcribe(
            wav_path,
            language=language,
            beam_size=5,
            word_timestamps=True,
            vad_filter=True,
            vad_parameters={"threshold": 0.5, "min_speech_duration_ms": 250},
            without_timestamps=False,
        )

        # Consume the segments while the WAV file still exists.
        # NOTE(review): faster-whisper documents `segments` as a lazy
        # generator - keep this inside the try block.
        words, segments_list = _collect_segments(segments)

        result = {
            "words": words,
            "segments": segments_list,
            "language": info.language,
        }

        print(json.dumps(result))
    finally:
        # A missing temp file must not mask the error that brought us here.
        try:
            os.unlink(wav_path)
        except FileNotFoundError:
            pass
|
||||
|
||||
# Run the transcription only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user