Updated with code

DataAnts-AI
2025-01-28 17:00:03 -05:00
parent 2b37a68c56
commit 3c44257d8a
8 changed files with 180 additions and 0 deletions

utils/audio_processing.py Normal file

@@ -0,0 +1,12 @@
from moviepy.editor import AudioFileClip
from pathlib import Path

def extract_audio(video_path: Path):
    """Extract audio from a video file."""
    try:
        audio = AudioFileClip(str(video_path))
        audio_path = video_path.parent / f"{video_path.stem}_audio.wav"
        audio.write_audiofile(str(audio_path), verbose=False, logger=None)
        return audio_path
    except Exception as e:
        raise RuntimeError(f"Audio extraction failed: {e}")
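A minimal usage sketch for the helper above, run from the repo root so utils is importable; the video path is hypothetical:

from pathlib import Path

from utils.audio_processing import extract_audio

# Hypothetical recording path, for illustration only.
wav_path = extract_audio(Path("recordings/session.mp4"))
print(wav_path)  # e.g. recordings/session_audio.wav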

utils/summarization.py Normal file

@@ -0,0 +1,8 @@
from transformers import pipeline

SUMMARY_MODEL = "Falconsai/text_summarization"

def summarize_text(text):
    """Summarize text using a Hugging Face pipeline."""
    summarizer = pipeline("summarization", model=SUMMARY_MODEL)
    return summarizer(text, max_length=150, min_length=30, do_sample=False)[0]["summary_text"]
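As a rough usage sketch (the sample text is invented), note that summarize_text rebuilds the pipeline on every call, so the model is reloaded each time:

from utils.summarization import summarize_text

# Invented sample text, repeated so there is enough to condense.
text = ("OBS Studio is free, open-source software for video recording "
        "and live streaming, widely used for screen capture. ") * 10
print(summarize_text(text))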

utils/transcription.py Normal file

@@ -0,0 +1,60 @@
import whisper
from pathlib import Path
from transformers import pipeline, AutoTokenizer

WHISPER_MODEL = "base"
SUMMARIZATION_MODEL = "t5-base"

def transcribe_audio(audio_path: Path):
    """Transcribe audio using Whisper."""
    model = whisper.load_model(WHISPER_MODEL)
    result = model.transcribe(str(audio_path))
    transcript = result["text"]
    summary = summarize_text(transcript)
    return transcript, summary

def summarize_text(text):
    """Summarize text using a pre-trained T5 transformer model with chunking."""
    summarization_pipeline = pipeline("summarization", model=SUMMARIZATION_MODEL)
    tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
    max_tokens = 512
    tokens = tokenizer(text, return_tensors='pt')
    num_tokens = len(tokens['input_ids'][0])
    if num_tokens > max_tokens:
        # Token count exceeds the model window, so summarize chunk by chunk.
        chunks = chunk_text(text, max_tokens)
        summaries = []
        for chunk in chunks:
            summary_output = summarization_pipeline("summarize: " + chunk, max_length=150, min_length=30, do_sample=False)
            summaries.append(summary_output[0]['summary_text'])
        overall_summary = " ".join(summaries)
    else:
        overall_summary = summarization_pipeline("summarize: " + text, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
    return overall_summary

def chunk_text(text, max_tokens):
    """Split the text into a list of chunks based on token limits."""
    tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        # Per-word token count, minus the special tokens the tokenizer adds on each call.
        hypothetical_length = current_length + len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
        if hypothetical_length <= max_tokens:
            current_chunk.append(word)
            current_length = hypothetical_length
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
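A sketch of the end-to-end call, assuming a WAV file already produced by extract_audio (the path is hypothetical):

from pathlib import Path

from utils.transcription import transcribe_audio

# Hypothetical audio file from the extraction step.
transcript, summary = transcribe_audio(Path("recordings/session_audio.wav"))
print(summary)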

utils/validation.py Normal file

@@ -0,0 +1,8 @@
from pathlib import Path

def validate_environment(obs_path: Path):
    """Validate environment and prerequisites."""
    errors = []
    if not obs_path.exists():
        errors.append(f"OBS directory not found: {obs_path}")
    return errors
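A short sketch of how the validator might gate startup; the OBS directory shown is an assumption, not a path the code defines:

from pathlib import Path

from utils.validation import validate_environment

# Hypothetical OBS recordings directory.
errors = validate_environment(Path.home() / "Videos" / "OBS")
if errors:
    raise SystemExit("\n".join(errors))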