Refactor keyword extraction to define stopwords as a list instead of a set, for compatibility with scikit-learn; add a check for an empty keywords list in the timestamp extraction function.
This commit is contained in:
@ -43,11 +43,11 @@ def extract_keywords_tfidf(text, max_keywords=10, ngram_range=(1, 2)):
|
|||||||
# Preprocess text
|
# Preprocess text
|
||||||
text = text.lower()
|
text = text.lower()
|
||||||
|
|
||||||
# Remove common stopwords
|
# Common stopwords, defined as a list (not a set) for scikit-learn compatibility
|
||||||
stopwords = {'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
|
stopwords = ['a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
|
||||||
'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those',
|
'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those',
|
||||||
'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
|
'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
|
||||||
'is', 'of', 'while', 'during', 'to', 'from', 'in', 'out', 'on', 'off', 'by'}
|
'is', 'of', 'while', 'during', 'to', 'from', 'in', 'out', 'on', 'off', 'by']
|
||||||
|
|
||||||
# Create sentences for better TF-IDF analysis
|
# Create sentences for better TF-IDF analysis
|
||||||
sentences = re.split(r'[.!?]', text)
|
sentences = re.split(r'[.!?]', text)
|
||||||
@ -146,6 +146,10 @@ def find_keyword_timestamps(segments, keywords):
|
|||||||
keyword_timestamps = {}
|
keyword_timestamps = {}
|
||||||
|
|
||||||
# Convert keywords to lowercase for case-insensitive matching
|
# Convert keywords to lowercase for case-insensitive matching
|
||||||
|
# Return early if keywords is empty, since keywords[0] is accessed below
|
||||||
|
if not keywords:
|
||||||
|
return keyword_timestamps
|
||||||
|
|
||||||
if isinstance(keywords[0], tuple):
|
if isinstance(keywords[0], tuple):
|
||||||
# If keywords is a list of (keyword, score) tuples
|
# If keywords is a list of (keyword, score) tuples
|
||||||
keywords_lower = [k.lower() for k, _ in keywords]
|
keywords_lower = [k.lower() for k, _ in keywords]
|
||||||
@ -163,6 +167,7 @@ def find_keyword_timestamps(segments, keywords):
|
|||||||
for i, keyword in enumerate(keywords_lower):
|
for i, keyword in enumerate(keywords_lower):
|
||||||
if keyword in segment_text:
|
if keyword in segment_text:
|
||||||
# Get the original case of the keyword
|
# Get the original case of the keyword
|
||||||
|
# Safe access to keywords[0] since we already checked keywords is not empty
|
||||||
original_keyword = keywords[i][0] if isinstance(keywords[0], tuple) else keywords[i]
|
original_keyword = keywords[i][0] if isinstance(keywords[0], tuple) else keywords[i]
|
||||||
|
|
||||||
# Initialize the list if this is the first occurrence
|
# Initialize the list if this is the first occurrence
|
||||||
|
|||||||
Reference in New Issue
Block a user