from pathlib import Path

import whisper
from transformers import pipeline, AutoTokenizer

WHISPER_MODEL = "base"
SUMMARIZATION_MODEL = "t5-base"


def transcribe_audio(audio_path: Path):
    """Transcribe audio with Whisper and return the transcript along with its summary."""
    model = whisper.load_model(WHISPER_MODEL)
    result = model.transcribe(str(audio_path))
    transcript = result["text"]
    summary = summarize_text(transcript)
    return transcript, summary


def summarize_text(text):
    """Summarize text using a pre-trained T5 transformer model with chunking."""
    summarization_pipeline = pipeline("summarization", model=SUMMARIZATION_MODEL)
    tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)

    # Approximate input limit for t5-base; longer inputs are summarized chunk by chunk.
    max_tokens = 512

    tokens = tokenizer(text, return_tensors='pt')
    num_tokens = len(tokens['input_ids'][0])

    if num_tokens > max_tokens:
        # Summarize each chunk separately, then join the partial summaries.
        chunks = chunk_text(text, max_tokens)
        summaries = []
        for chunk in chunks:
            summary_output = summarization_pipeline("summarize: " + chunk, max_length=150, min_length=30, do_sample=False)
            summaries.append(summary_output[0]['summary_text'])
        overall_summary = " ".join(summaries)
    else:
        overall_summary = summarization_pipeline("summarize: " + text, max_length=150, min_length=30, do_sample=False)[0]['summary_text']

    return overall_summary


def chunk_text(text, max_tokens):
    """Split the text into a list of chunks that stay within the token limit."""
    tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
    words = text.split()

    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        # Token count for this word alone, excluding special tokens such as </s>.
        word_tokens = len(tokenizer(word, add_special_tokens=False)['input_ids'])
        hypothetical_length = current_length + word_tokens
        if hypothetical_length <= max_tokens:
            current_chunk.append(word)
            current_length = hypothetical_length
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = word_tokens

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
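

# Example usage: a minimal sketch of calling the pipeline end to end.
# "meeting_recording.mp3" is a hypothetical placeholder path, not a file
# shipped with this module.
if __name__ == "__main__":
    transcript, summary = transcribe_audio(Path("meeting_recording.mp3"))
    print("Transcript:\n", transcript)
    print("\nSummary:\n", summary)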