Convert the stopwords in the keyword extraction logic from a set to a list for compatibility with scikit-learn; add a check for an empty keywords list in the timestamp extraction function.

This commit is contained in:
Your Name
2025-09-10 23:33:57 -04:00
parent f04853eba9
commit 81f40726e3

View File

@ -43,11 +43,11 @@ def extract_keywords_tfidf(text, max_keywords=10, ngram_range=(1, 2)):
# Preprocess text # Preprocess text
text = text.lower() text = text.lower()
# Remove common stopwords # Remove common stopwords - convert to list for scikit-learn compatibility
stopwords = {'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what', stopwords = ['a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those', 'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those',
'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for', 'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
'is', 'of', 'while', 'during', 'to', 'from', 'in', 'out', 'on', 'off', 'by'} 'is', 'of', 'while', 'during', 'to', 'from', 'in', 'out', 'on', 'off', 'by']
# Create sentences for better TF-IDF analysis # Create sentences for better TF-IDF analysis
sentences = re.split(r'[.!?]', text) sentences = re.split(r'[.!?]', text)
@ -146,6 +146,10 @@ def find_keyword_timestamps(segments, keywords):
keyword_timestamps = {} keyword_timestamps = {}
# Convert keywords to lowercase for case-insensitive matching # Convert keywords to lowercase for case-insensitive matching
# Check if keywords list is not empty before accessing keywords[0]
if not keywords:
return keyword_timestamps
if isinstance(keywords[0], tuple): if isinstance(keywords[0], tuple):
# If keywords is a list of (keyword, score) tuples # If keywords is a list of (keyword, score) tuples
keywords_lower = [k.lower() for k, _ in keywords] keywords_lower = [k.lower() for k, _ in keywords]
@ -163,6 +167,7 @@ def find_keyword_timestamps(segments, keywords):
for i, keyword in enumerate(keywords_lower): for i, keyword in enumerate(keywords_lower):
if keyword in segment_text: if keyword in segment_text:
# Get the original case of the keyword # Get the original case of the keyword
# Safe access to keywords[0] since we already checked keywords is not empty
original_keyword = keywords[i][0] if isinstance(keywords[0], tuple) else keywords[i] original_keyword = keywords[i][0] if isinstance(keywords[0], tuple) else keywords[i]
# Initialize the list if this is the first occurrence # Initialize the list if this is the first occurrence