# Source: audiobook_creator/extract_proper_nouns.py (189 lines, 7.3 KiB, Python)
"""
extract_proper_nouns.py
Scan 'Audio Master Nem Full.txt' and extract all proper nouns into
'proper_nouns.txt', grouped by type and sorted alphabetically.
Uses spaCy for:
NER (PERSON, GPE, LOC, ORG, ) named entity recognition
POS (PROPN) catches names spaCy's NER misses
because they are not in its training vocabulary (e.g. Hagoth, Meninta)
Run:
.venv/bin/python extract_proper_nouns.py
"""
import re
from collections import defaultdict
from pathlib import Path
import spacy
2026-02-26 00:57:40 -07:00
from wordfreq import top_n_list
# ── Top 10 000 most-frequent English words ──────────────────────────
TOP_10K_ENGLISH: frozenset[str] = frozenset(top_n_list("en", 10_000))
# Words in the top-10k list that are genuine proper nouns in this text —
# keep them despite the frequency filter.

# Biblical names
_BIBLICAL_NAMES = (
    "aaron", "abel", "abraham", "adam", "cain", "eden", "egypt",
    "elijah", "ephraim", "eve", "gad", "ham", "isaac", "israel",
    "jacob", "james", "jehovah", "john", "joseph", "judah",
    "laban", "lehi", "levi", "micah", "michael", "moses", "noah",
    "peter", "pharaoh", "samuel", "sarah", "sarai", "seth", "simeon",
    "timothy", "zion",
)

# Book-specific names that happen to match English words
_BOOK_NAMES = (
    "alma", "ether", "gideon", "limhi", "mormon", "moroni", "mulek",
    "mosiah", "nephi", "satan", "sidon",
)

PROPER_NOUN_WHITELIST: frozenset[str] = frozenset(_BIBLICAL_NAMES + _BOOK_NAMES)

SOURCE = Path("Audio Master Nem Full.txt")
OUTPUT = Path("proper_nouns.txt")
# ── spaCy setup ────────────────────────────────────────────────────────────────
print("Loading spaCy model …")
# Small English pipeline — supplies both the NER (doc.ents) and the POS
# tagger (token.pos_) used in the two passes below.
nlp = spacy.load("en_core_web_sm")
# Increase max length for the large source file
# NOTE(review): raising max_length increases parser memory use — confirm
# the machine running this has headroom for a ~2M-char document.
nlp.max_length = 2_000_000
# ── NER label groups ───────────────────────────────────────────────────────────
# spaCy entity labels, grouped into the display buckets used in the NER pass.
PERSON_LABELS = set("PERSON".split())
PLACE_LABELS = set("GPE LOC FAC".split())
ORG_LABELS = set("ORG NORP".split())
OTHER_LABELS = set("EVENT WORK_OF_ART LAW PRODUCT LANGUAGE".split())
# ── Noise filters ──────────────────────────────────────────────────────────────
# Common English words that should be dropped when splitting multi-word entities.
# All entries are upper-case; is_noise() compares against word.upper().
STOP_WORDS: set[str] = {
    "A", "AN", "AND", "AS", "AT", "BE", "BUT", "BY",
    "DO", "DID", "DOTH",
    "EVEN", "FOR", "FROM",
    "HAD", "HAS", "HAVE", "HATH", "HE", "HER", "HIS", "HOW",
    "I", "IN", "IS", "IT", "ITS",
    "MAY", "ME", "MORE", "MY",
    "NAY", "NO", "NOT", "NOW",
    "OF", "OR", "OUR",
    "SHALL", "SHE", "SO", "SOME",
    "THAT", "THE", "THEE", "THEIR", "THEN", "THERE", "THESE", "THEY",
    "THIS", "THOSE", "THOU", "THUS", "THY", "TO",
    "UP", "UPON", "US",
    "WAS", "WE", "WHEN", "WHERE", "WHICH", "WHO", "WILL", "WITH",
    "YE", "YEA", "YET", "YOU", "YOUR",
    # Book-specific common words not worth flagging
    "BEHOLD", "CHAPTER", "CHRIST", "GOD", "GHOST", "HOLY", "LORD", "VERSE",
    # Generic nouns that slip through NER
    "CITY", "DAYS", "DAY", "GREAT", "LAND", "MAN", "MEN", "NEW",
    "PEOPLE", "SON", "TIME",
}
def is_noise(text: str) -> bool:
    """Return True when *text* should be excluded from the proper-noun lists.

    A word is noise when it is: a single character; an all-caps section-header
    word longer than 4 chars; a stop word; a token containing anything other
    than letters, hyphens, or apostrophes; or a top-10k-frequency English word
    that is not in PROPER_NOUN_WHITELIST.
    """
    t = text.strip()
    if len(t) <= 1:
        return True
    if t.isupper() and len(t) > 4:  # all-caps section header word
        return True
    if t.upper() in STOP_WORDS:
        return True
    if re.search(r"[^a-zA-Z\-']", t):  # contains digits, spaces, or symbols
        return True
    # Drop common English words (no hyphens) unless whitelisted as proper nouns.
    if "-" not in t and t.lower() in TOP_10K_ENGLISH and t.lower() not in PROPER_NOUN_WHITELIST:
        return True
    return False
def canonical(text: str) -> str:
    """Collapse runs of whitespace to single spaces and Title-Case the result."""
    collapsed = " ".join(text.split())
    return collapsed.title()
def split_words(phrase: str) -> list[str]:
    """Split *phrase* on whitespace; hyphenated words are kept as one token."""
    return phrase.split()
# ── Read and process ───────────────────────────────────────────────────────────
print(f"Reading '{SOURCE}'")
raw_text = SOURCE.read_text(encoding="utf-8")
print("Running spaCy pipeline (this may take a minute) …")
# Single pipeline run over the whole text; both passes below reuse this doc.
doc = nlp(raw_text)
# Buckets: keyed by display-group name → set of canonical strings
buckets: dict[str, set[str]] = defaultdict(set)
# 1. NER pass — trust spaCy's entity labels.
# Multi-word entities (e.g. "Peter James John") are split into individual
# words; hyphenated words (e.g. "Anti-Nephi-Lehi") stay as one token.
for ent in doc.ents:
    phrase = canonical(ent.text)
    for word in split_words(phrase):
        if is_noise(word):
            continue
        if ent.label_ in PERSON_LABELS:
            buckets["People & Characters"].add(word)
        elif ent.label_ in PLACE_LABELS:
            buckets["Places & Lands"].add(word)
        elif ent.label_ in ORG_LABELS:
            buckets["Groups & Nations"].add(word)
        else:
            # OTHER_LABELS and any unexpected label both fall through here.
            buckets["Other Named Things"].add(word)
# 2. PROPN pass — catch names spaCy didn't recognise as entities.
#    Only include tokens that are inside a sentence (not at position 0)
#    and are title-cased (filters out all-caps headers).
for token in doc:
    if token.pos_ != "PROPN":
        continue
    text = token.text.strip()
    if not text:
        continue  # guard: whitespace-only token would crash text[0] below
    if not text[0].isupper() or text.isupper():
        continue  # skip lowercase and all-caps
    if token.i == token.sent.start:
        continue  # skip sentence-initial (could be any word)
    word = canonical(text)
    if is_noise(word):
        continue
    # Only add if not already captured by NER in any bucket.
    already_captured = any(word in names for names in buckets.values())
    if not already_captured:
        buckets["Unclassified Proper Nouns"].add(word)
# ── Write output ───────────────────────────────────────────────────────────────
GROUP_ORDER = [
    "People & Characters",
    "Places & Lands",
    "Groups & Nations",
    "Other Named Things",
    "Unclassified Proper Nouns",
]
lines: list[str] = []
lines.append("PROPER NOUNS — Book of the Nem")
lines.append("=" * 50)
lines.append(
    "Review this list for TTS mispronunciations.\n"
    "Each entry is the form that appears in the text.\n"
)
total = 0
for group in GROUP_ORDER:
    # Case-insensitive alphabetical order within each group.
    names = sorted(buckets.get(group, set()), key=str.casefold)
    if not names:
        continue  # omit empty groups entirely
    # '─' ruler: was garbled to '' (empty string) in the pasted source,
    # which rendered the separator as a blank line.
    lines.append(f"\n{'─' * 50}")
    lines.append(f"{group.upper()} ({len(names)})")
    lines.append(f"{'─' * 50}")
    for name in names:
        lines.append(f" {name}")
    total += len(names)
lines.append(f"\n{'=' * 50}")
lines.append(f"TOTAL: {total} unique proper nouns")
OUTPUT.write_text("\n".join(lines), encoding="utf-8")
print(f"\n✓ Written '{OUTPUT}' ({total} unique proper nouns)")