Fix summarization issues and improve GPU handling. Update .gitignore for venv
.gitignore (9 lines added)
@@ -1,4 +1,13 @@
+# Python virtual environment
+venv/
 __pycache__/
 *.pyc
+
+# IDE files
+.vscode/
+.idea/
+
+# OS files
 .env
 .DS_Store
+Thumbs.db

@@ -2,7 +2,7 @@
 # Core dependencies
 streamlit==1.26.0
 moviepy==1.0.3
-openai-whisper>=20230314
+openai-whisper==20231117
 transformers>=4.21.1
 torch>=1.7.0
 torchaudio>=0.7.0
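The dependency change above swaps the open-ended openai-whisper>=20230314 constraint for an exact pin. A minimal sanity check (my sketch, not part of the commit) that an environment actually received the pinned release:

    from importlib.metadata import version

    # "openai-whisper" is the PyPI distribution name for the whisper package
    assert version("openai-whisper") == "20231117", "unexpected openai-whisper version"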

@@ -68,6 +68,8 @@ def get_optimal_device():
 
 
 def set_memory_limits(memory_fraction=0.8):
+    global torch
+    import torch
     """
     Set memory limits for GPU usage.
 
@@ -81,9 +83,6 @@ def set_memory_limits(memory_fraction=0.8):
         return False
 
     try:
-        # Import only if CUDA is available
-        import torch.cuda
-
         # Set memory fraction for each device
         for i in range(torch.cuda.device_count()):
             torch.cuda.set_per_process_memory_fraction(memory_fraction, i)
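Taken together, the two hunks above move the torch import to the top of set_memory_limits and drop the now-redundant inner import torch.cuda. A minimal standalone sketch of the resulting pattern, assuming the surrounding code returns False when CUDA is absent; error handling is omitted here for brevity, while the real function wraps the loop in try/except:

    def set_memory_limits(memory_fraction=0.8):
        # Lazy import keeps the module importable when torch is optional
        import torch

        if not torch.cuda.is_available():
            return False

        # Cap the per-process memory fraction on every visible GPU
        for i in range(torch.cuda.device_count()):
            torch.cuda.set_per_process_memory_fraction(memory_fraction, i)
        return True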

@@ -1,8 +1,113 @@
-from transformers import pipeline
+from transformers import pipeline, AutoTokenizer
+import torch
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 SUMMARY_MODEL = "Falconsai/text_summarization"
 
-def summarize_text(text):
-    """Summarize text using a Hugging Face pipeline."""
-    summarizer = pipeline("summarization", model=SUMMARY_MODEL)
-    return summarizer(text, max_length=150, min_length=30, do_sample=False)[0]["summary_text"]
+def chunk_text(text, max_tokens, tokenizer):
+    """
+    Splits the text into a list of chunks based on token limits.
+
+    Args:
+        text (str): Text to chunk
+        max_tokens (int): Maximum tokens per chunk
+        tokenizer (AutoTokenizer): Tokenizer to use
+
+    Returns:
+        list: List of text chunks
+    """
+    words = text.split()
+    chunks = []
+    current_chunk = []
+    current_length = 0
+
+    for word in words:
+        hypothetical_length = current_length + len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
+        if hypothetical_length <= max_tokens:
+            current_chunk.append(word)
+            current_length = hypothetical_length
+        else:
+            chunks.append(' '.join(current_chunk))
+            current_chunk = [word]
+            current_length = len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
+
+    if current_chunk:
+        chunks.append(' '.join(current_chunk))
+
+    return chunks
+
+def summarize_text(text, use_gpu=True, memory_fraction=0.8):
+    """
+    Summarize text using a Hugging Face pipeline with chunking support.
+
+    Args:
+        text (str): Text to summarize
+        use_gpu (bool): Whether to use GPU if available
+        memory_fraction (float): Fraction of GPU memory to use
+
+    Returns:
+        str: Summarized text
+    """
+    # Determine device
+    device = -1  # Default to CPU
+    if use_gpu and torch.cuda.is_available():
+        device = 0  # Use first GPU
+        if torch.cuda.is_available():
+            torch.cuda.set_per_process_memory_fraction(memory_fraction)
+
+    logger.info(f"Using device {device} for summarization")
+
+    try:
+        # Initialize the pipeline and tokenizer
+        summarizer = pipeline("summarization", model=SUMMARY_MODEL, device=device)
+        tokenizer = AutoTokenizer.from_pretrained(SUMMARY_MODEL)
+
+        # Check if text needs to be chunked
+        max_tokens = 512
+        tokens = tokenizer(text, return_tensors='pt')
+        num_tokens = len(tokens['input_ids'][0])
+
+        if num_tokens > max_tokens:
+            chunks = chunk_text(text, max_tokens, tokenizer)
+            summaries = []
+
+            for i, chunk in enumerate(chunks):
+                logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")
+                summary_output = summarizer(
+                    "summarize: " + chunk,
+                    max_length=150,
+                    min_length=30,
+                    do_sample=False
+                )
+                summaries.append(summary_output[0]['summary_text'])
+
+            # If multiple chunks, summarize the combined summaries
+            if len(summaries) > 1:
+                logger.info("Generating final summary from chunk summaries")
+                combined_text = " ".join(summaries)
+                return summarizer(
+                    "summarize: " + combined_text,
+                    max_length=150,
+                    min_length=30,
+                    do_sample=False
+                )[0]['summary_text']
+            return summaries[0]
+        else:
+            return summarizer(
+                "summarize: " + text,
+                max_length=150,
+                min_length=30,
+                do_sample=False
+            )[0]['summary_text']
+
+    except Exception as e:
+        logger.error(f"Error during summarization: {e}")
+        # Fallback to CPU if GPU fails
+        if device != -1:
+            logger.info("Falling back to CPU")
+            return summarize_text(text, use_gpu=False, memory_fraction=memory_fraction)
+        raise
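This hunk replaces the single-call summarize_text with a chunking version: input over 512 tokens is split word by word against the tokenizer's token counts, each chunk is summarized, and multiple chunk summaries are condensed once more into the final result. A usage sketch (the module name summarizer is my assumption; the diff does not show file paths):

    from summarizer import summarize_text  # hypothetical module name

    long_transcript = " ".join(["The speaker explains the roadmap."] * 500)

    # Long input takes the chunked path; short input is summarized directly.
    summary = summarize_text(long_transcript, use_gpu=True, memory_fraction=0.8)
    print(summary)

One design note: on any exception while a GPU device is selected, the function recurses with use_gpu=False, so a CUDA out-of-memory error degrades to a slower CPU run instead of failing outright.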

@@ -25,7 +25,6 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 WHISPER_MODEL = "base"
-SUMMARIZATION_MODEL = "t5-base"
 
 def transcribe_audio(audio_path: Path, model=WHISPER_MODEL, use_cache=True, cache_max_age=None,
                      use_gpu=True, memory_fraction=0.8):
@@ -84,106 +83,3 @@ def transcribe_audio(audio_path: Path, model=WHISPER_MODEL, use_cache=True, cach
         save_to_cache(audio_path, cache_data, model, "transcribe")
 
     return segments, transcript
-
-
-def summarize_text(text, model=SUMMARIZATION_MODEL, use_gpu=True, memory_fraction=0.8):
-    """
-    Summarize text using a pre-trained transformer model with chunking.
-
-    Args:
-        text (str): Text to summarize
-        model (str): Model to use for summarization
-        use_gpu (bool): Whether to use GPU acceleration if available
-        memory_fraction (float): Fraction of GPU memory to use (0.0 to 1.0)
-
-    Returns:
-        str: Summarized text
-    """
-    # Configure device
-    device = torch.device("cpu")
-    if use_gpu and GPU_UTILS_AVAILABLE:
-        device = get_optimal_device()
-    logger.info(f"Using device: {device} for summarization")
-
-    # Initialize the pipeline with the specified device
-    device_arg = -1 if device.type == "cpu" else 0  # -1 for CPU, 0 for GPU
-    summarization_pipeline = pipeline("summarization", model=model, device=device_arg)
-    tokenizer = AutoTokenizer.from_pretrained(model)
-
-    max_tokens = 512
-
-    tokens = tokenizer(text, return_tensors='pt')
-    num_tokens = len(tokens['input_ids'][0])
-
-    if num_tokens > max_tokens:
-        chunks = chunk_text(text, max_tokens, tokenizer)
-        summaries = []
-
-        for i, chunk in enumerate(chunks):
-            logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")
-            summary_output = summarization_pipeline(
-                "summarize: " + chunk,
-                max_length=150,
-                min_length=30,
-                do_sample=False
-            )
-            summaries.append(summary_output[0]['summary_text'])
-
-        overall_summary = " ".join(summaries)
-
-        # If the combined summary is still long, summarize it again
-        if len(summaries) > 1:
-            logger.info("Generating final summary from chunk summaries")
-            combined_text = " ".join(summaries)
-            overall_summary = summarization_pipeline(
-                "summarize: " + combined_text,
-                max_length=150,
-                min_length=30,
-                do_sample=False
-            )[0]['summary_text']
-    else:
-        overall_summary = summarization_pipeline(
-            "summarize: " + text,
-            max_length=150,
-            min_length=30,
-            do_sample=False
-        )[0]['summary_text']
-
-    return overall_summary
-
-
-def chunk_text(text, max_tokens, tokenizer=None):
-    """
-    Splits the text into a list of chunks based on token limits.
-
-    Args:
-        text (str): Text to chunk
-        max_tokens (int): Maximum tokens per chunk
-        tokenizer (AutoTokenizer, optional): Tokenizer to use
-
-    Returns:
-        list: List of text chunks
-    """
-    if tokenizer is None:
-        tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
-
-    words = text.split()
-
-    chunks = []
-    current_chunk = []
-    current_length = 0
-
-    for word in words:
-        hypothetical_length = current_length + len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
-        if hypothetical_length <= max_tokens:
-            current_chunk.append(word)
-            current_length = hypothetical_length
-        else:
-            chunks.append(' '.join(current_chunk))
-            current_chunk = [word]
-            current_length = len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
-
-    if current_chunk:
-        chunks.append(' '.join(current_chunk))
-
-    return chunks
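With the removal above, summarization now lives only in the dedicated summarizer module and the transcription module keeps a single responsibility. A sketch of the post-commit call flow (both module names are assumptions, since the diff does not show file paths):

    from pathlib import Path

    from transcriber import transcribe_audio  # hypothetical module name
    from summarizer import summarize_text     # hypothetical module name

    segments, transcript = transcribe_audio(Path("talk.wav"), use_gpu=True)
    summary = summarize_text(transcript, use_gpu=True, memory_fraction=0.8)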