# audiobook_creator/extract_proper_nouns.py
"""
extract_proper_nouns.py
Scan 'Audio Master Nem Full.txt' and extract all proper nouns into
'proper_nouns.txt', grouped by type and sorted alphabetically.
Uses spaCy for:
NER (PERSON, GPE, LOC, ORG, …) — named entity recognition
POS (PROPN) catches names spaCy's NER misses
because they are not in its training vocabulary (e.g. Hagoth, Meninta)
Run:
.venv/bin/python extract_proper_nouns.py
"""
import re
from collections import defaultdict
from pathlib import Path
import spacy
# Input corpus and output report paths (both resolved relative to the CWD).
SOURCE = Path("Audio Master Nem Full.txt")
OUTPUT = Path("proper_nouns.txt")
# ── spaCy setup ────────────────────────────────────────────────────────────────
print("Loading spaCy model …")
# Small English pipeline: provides both the NER and the POS tagger used below.
nlp = spacy.load("en_core_web_sm")
# Increase max length for the large source file
# (spaCy's default cap is 1,000,000 chars; the corpus exceeds that).
nlp.max_length = 2_000_000
# ── NER label groups ───────────────────────────────────────────────────────────
# spaCy entity labels mapped to the display groups used in the output report.
PERSON_LABELS = {"PERSON"}
# GPE = geo-political entity, LOC = location, FAC = facility.
PLACE_LABELS = {"GPE", "LOC", "FAC"}
# NORP = nationalities / religious / political groups.
ORG_LABELS = {"ORG", "NORP"}
OTHER_LABELS = {"EVENT", "WORK_OF_ART", "LAW", "PRODUCT", "LANGUAGE"}
# ── Noise filters ──────────────────────────────────────────────────────────────
# All-caps lines are section headers, not spoken names — skip them.
# Also skip very short tokens that are likely artefacts.
SKIP_PATTERNS = re.compile(
    r"^(THE|A|AN|AND|OF|IN|TO|FOR|BY|AT|IS|WAS|BE|HE|SHE|IT|"
    r"CHAPTER|VERSE|YEA|BEHOLD|LORD|GOD|CHRIST|HOLY|GHOST)$"
)


def is_noise(text: str) -> bool:
    """Return True when *text* is unlikely to be a genuine proper noun.

    Rejects: one-character artefacts, long all-caps header words,
    common stop/scripture words (via SKIP_PATTERNS), and anything
    containing characters other than letters, hyphens, apostrophes
    and spaces.
    """
    candidate = text.strip()
    return (
        len(candidate) <= 1
        or (candidate.isupper() and len(candidate) > 4)  # all-caps header word
        or SKIP_PATTERNS.match(candidate.upper()) is not None
        or re.search(r"[^a-zA-Z\-' ]", candidate) is not None  # digits/symbols
    )
def canonical(text: str) -> str:
    """Collapse runs of whitespace to single spaces and title-case the result."""
    words = text.split()
    return " ".join(words).title()
# ── Read and process ───────────────────────────────────────────────────────────
print(f"Reading '{SOURCE}'")
# Whole-file read; nlp.max_length was raised above to accommodate it.
raw_text = SOURCE.read_text(encoding="utf-8")
print("Running spaCy pipeline (this may take a minute) …")
# One pipeline pass yields both entities (doc.ents) and per-token POS tags.
doc = nlp(raw_text)
# Buckets: keyed by display-group name → set of canonical strings
buckets: dict[str, set[str]] = defaultdict(set)

# 1. NER pass — trust spaCy's entity labels.
for ent in doc.ents:
    name = canonical(ent.text)
    if is_noise(name):
        continue
    label = ent.label_
    if label in PERSON_LABELS:
        group = "People & Characters"
    elif label in PLACE_LABELS:
        group = "Places & Lands"
    elif label in ORG_LABELS:
        group = "Groups & Nations"
    else:
        # OTHER_LABELS and any unrecognised label share the same bucket.
        group = "Other Named Things"
    buckets[group].add(name)
# 2. PROPN pass — catch names spaCy didn't recognise as entities.
#    Only include tokens that are inside a sentence (not sentence-initial,
#    where any word is capitalised) and are title-cased (filters out
#    all-caps headers).
for token in doc:
    if token.pos_ != "PROPN":
        continue
    text = token.text.strip()
    # Guard `not text` before indexing text[0]: a token whose stripped text
    # is empty would otherwise raise IndexError. Also skip all-caps words.
    if not text or not text[0].isupper() or text.isupper():
        continue
    if token.i == token.sent.start:
        continue  # sentence-initial — could be any capitalised word
    name = canonical(text)
    if is_noise(name):
        continue
    # Only add if not already captured by the NER pass in any bucket.
    already_captured = any(name in grouped for grouped in buckets.values())
    if not already_captured:
        buckets["Unclassified Proper Nouns"].add(name)
# ── Write output ───────────────────────────────────────────────────────────────
GROUP_ORDER = [
    "People & Characters",
    "Places & Lands",
    "Groups & Nations",
    "Other Named Things",
    "Unclassified Proper Nouns",
]

lines: list[str] = []
lines.append("PROPER NOUNS — Book of the Nem")
lines.append("=" * 50)
lines.append(
    "Review this list for TTS mispronunciations.\n"
    "Each entry is the form that appears in the text.\n"
)
total = 0
for group in GROUP_ORDER:
    # Case-insensitive alphabetical order within each group.
    names = sorted(buckets.get(group, set()), key=str.casefold)
    if not names:
        continue
    # BUG FIX: the original appended f"\n{'' * 50}" — an empty string
    # repeated 50 times is still empty, so the group separators were
    # invisible blank lines. Use a dashed rule to match the '=' rules.
    lines.append(f"\n{'-' * 50}")
    lines.append(f"{group.upper()} ({len(names)})")
    lines.append(f"{'-' * 50}")
    for name in names:
        lines.append(f" {name}")
    total += len(names)
lines.append(f"\n{'=' * 50}")
lines.append(f"TOTAL: {total} unique proper nouns")
OUTPUT.write_text("\n".join(lines), encoding="utf-8")
print(f"\n✓ Written '{OUTPUT}' ({total} unique proper nouns)")