# Source: audiobook_creator/extract_proper_nouns.py (189 lines, 7.3 KiB, Python)
"""
extract_proper_nouns.py
Scan 'Audio Master Nem Full.txt' and extract all proper nouns into
'proper_nouns.txt', grouped by type and sorted alphabetically.
Uses spaCy for:
NER (PERSON, GPE, LOC, ORG, ) named entity recognition
POS (PROPN) catches names spaCy's NER misses
because they are not in its training vocabulary (e.g. Hagoth, Meninta)
Run:
.venv/bin/python extract_proper_nouns.py
"""
import re
from collections import defaultdict
from pathlib import Path
import spacy
2026-02-26 00:57:40 -07:00
from wordfreq import top_n_list
# ── Top 10 000 most-frequent English words ──────────────────────────
TOP_10K_ENGLISH: frozenset[str] = frozenset(top_n_list("en", 10_000))
# Words in the top-10k list that are genuine proper nouns in this text —
# keep them despite the frequency filter.

# Biblical names
_BIBLICAL_NAMES = (
    "aaron", "abel", "abraham", "adam", "cain", "eden", "egypt",
    "elijah", "ephraim", "eve", "gad", "ham", "isaac", "israel",
    "jacob", "james", "jehovah", "john", "joseph", "judah",
    "laban", "lehi", "levi", "micah", "michael", "moses", "noah",
    "peter", "pharaoh", "samuel", "sarah", "sarai", "seth", "simeon",
    "timothy", "zion",
)

# Book-specific names that happen to match English words
_BOOK_NAMES = (
    "alma", "ether", "gideon", "limhi", "mormon", "moroni", "mulek",
    "mosiah", "nephi", "satan", "sidon",
)

PROPER_NOUN_WHITELIST: frozenset[str] = frozenset(_BIBLICAL_NAMES + _BOOK_NAMES)

SOURCE = Path("Audio Master Nem Full.txt")
OUTPUT = Path("proper_nouns.txt")
# ── spaCy setup ────────────────────────────────────────────────────────────────
print("Loading spaCy model …")
# Small English pipeline — supplies both the NER (doc.ents) and the POS
# tagger (token.pos_) used in the two passes below.
nlp = spacy.load("en_core_web_sm")
# Increase max length for the large source file
# NOTE(review): raising max_length increases parser memory use — confirm
# the machine running this has headroom for a ~2M-char document.
nlp.max_length = 2_000_000
# ── NER label groups ───────────────────────────────────────────────────────────
# spaCy entity labels, grouped into the display buckets used in the NER pass.
PERSON_LABELS = set("PERSON".split())
PLACE_LABELS = set("GPE LOC FAC".split())
ORG_LABELS = set("ORG NORP".split())
OTHER_LABELS = set("EVENT WORK_OF_ART LAW PRODUCT LANGUAGE".split())
# ── Noise filters ──────────────────────────────────────────────────────────────
# Common English words that should be dropped when splitting multi-word entities.
# All entries are upper-case; is_noise() compares against word.upper().
STOP_WORDS: set[str] = {
    "A", "AN", "AND", "AS", "AT", "BE", "BUT", "BY",
    "DO", "DID", "DOTH",
    "EVEN", "FOR", "FROM",
    "HAD", "HAS", "HAVE", "HATH", "HE", "HER", "HIS", "HOW",
    "I", "IN", "IS", "IT", "ITS",
    "MAY", "ME", "MORE", "MY",
    "NAY", "NO", "NOT", "NOW",
    "OF", "OR", "OUR",
    "SHALL", "SHE", "SO", "SOME",
    "THAT", "THE", "THEE", "THEIR", "THEN", "THERE", "THESE", "THEY",
    "THIS", "THOSE", "THOU", "THUS", "THY", "TO",
    "UP", "UPON", "US",
    "WAS", "WE", "WHEN", "WHERE", "WHICH", "WHO", "WILL", "WITH",
    "YE", "YEA", "YET", "YOU", "YOUR",
    # Book-specific common words not worth flagging
    "BEHOLD", "CHAPTER", "CHRIST", "GOD", "GHOST", "HOLY", "LORD", "VERSE",
    # Generic nouns that slip through NER
    "CITY", "DAYS", "DAY", "GREAT", "LAND", "MAN", "MEN", "NEW",
    "PEOPLE", "SON", "TIME",
}
def is_noise(text: str) -> bool:
    """Return True when *text* should be excluded from the proper-noun lists.

    A word is noise when it is: a single character; an all-caps section-header
    word longer than 4 chars; a stop word; a token containing anything other
    than letters, hyphens, or apostrophes; or a top-10k-frequency English word
    that is not in PROPER_NOUN_WHITELIST.
    """
    t = text.strip()
    if len(t) <= 1:
        return True
    if t.isupper() and len(t) > 4:  # all-caps section header word
        return True
    if t.upper() in STOP_WORDS:
        return True
    if re.search(r"[^a-zA-Z\-']", t):  # contains digits, spaces, or symbols
        return True
    # Drop common English words (no hyphens) unless whitelisted as proper nouns.
    if "-" not in t and t.lower() in TOP_10K_ENGLISH and t.lower() not in PROPER_NOUN_WHITELIST:
        return True
    return False
def canonical(text: str) -> str:
    """Collapse runs of whitespace to single spaces and Title-Case the result."""
    collapsed = " ".join(text.split())
    return collapsed.title()
def split_words(phrase: str) -> list[str]:
    """Split *phrase* on whitespace; hyphenated words are kept as one token."""
    return phrase.split()
# ── Read and process ───────────────────────────────────────────────────────────
print(f"Reading '{SOURCE}'")
raw_text = SOURCE.read_text(encoding="utf-8")
print("Running spaCy pipeline (this may take a minute) …")
# Single pipeline run over the whole text; both passes below reuse this doc.
doc = nlp(raw_text)
# Buckets: keyed by display-group name → set of canonical strings
buckets: dict[str, set[str]] = defaultdict(set)
# 1. NER pass — trust spaCy's entity labels.
# Multi-word entities (e.g. "Peter James John") are split into individual
# words; hyphenated words (e.g. "Anti-Nephi-Lehi") stay as one token.
for ent in doc.ents:
    phrase = canonical(ent.text)
    for word in split_words(phrase):
        if is_noise(word):
            continue
        if ent.label_ in PERSON_LABELS:
            buckets["People & Characters"].add(word)
        elif ent.label_ in PLACE_LABELS:
            buckets["Places & Lands"].add(word)
        elif ent.label_ in ORG_LABELS:
            buckets["Groups & Nations"].add(word)
        else:
            # OTHER_LABELS and any unexpected label both fall through here.
            buckets["Other Named Things"].add(word)
# 2. PROPN pass — catch names spaCy didn't recognise as entities.
#    Only include tokens that are inside a sentence (not at position 0)
#    and are title-cased (filters out all-caps headers).
for token in doc:
    if token.pos_ != "PROPN":
        continue
    text = token.text.strip()
    if not text:
        continue  # guard: whitespace-only token would crash text[0] below
    if not text[0].isupper() or text.isupper():
        continue  # skip lowercase and all-caps
    if token.i == token.sent.start:
        continue  # skip sentence-initial (could be any word)
    word = canonical(text)
    if is_noise(word):
        continue
    # Only add if not already captured by NER in any bucket.
    already_captured = any(word in names for names in buckets.values())
    if not already_captured:
        buckets["Unclassified Proper Nouns"].add(word)
# ── Write output ───────────────────────────────────────────────────────────────
GROUP_ORDER = [
    "People & Characters",
    "Places & Lands",
    "Groups & Nations",
    "Other Named Things",
    "Unclassified Proper Nouns",
]
lines: list[str] = []
lines.append("PROPER NOUNS — Book of the Nem")
lines.append("=" * 50)
lines.append(
    "Review this list for TTS mispronunciations.\n"
    "Each entry is the form that appears in the text.\n"
)
total = 0
for group in GROUP_ORDER:
    # Case-insensitive alphabetical order within each group.
    names = sorted(buckets.get(group, set()), key=str.casefold)
    if not names:
        continue  # omit empty groups entirely
    # '─' ruler: was garbled to '' (empty string) in the pasted source,
    # which rendered the separator as a blank line.
    lines.append(f"\n{'─' * 50}")
    lines.append(f"{group.upper()} ({len(names)})")
    lines.append(f"{'─' * 50}")
    for name in names:
        lines.append(f" {name}")
    total += len(names)
lines.append(f"\n{'=' * 50}")
lines.append(f"TOTAL: {total} unique proper nouns")
OUTPUT.write_text("\n".join(lines), encoding="utf-8")
print(f"\n✓ Written '{OUTPUT}' ({total} unique proper nouns)")