Files
audiobook_creator/extract_proper_nouns.py

142 lines
5.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
extract_proper_nouns.py
───────────────────────
Scan 'Audio Master Nem Full.txt' and extract all proper nouns into
'proper_nouns.txt', grouped by type and sorted alphabetically.
Uses spaCy for:
• NER (PERSON, GPE, LOC, ORG, …) — named entity recognition
• POS (PROPN) — catches names that spaCy's NER misses because they are
  not in its training vocabulary (e.g. Hagoth, Meninta)
Run:
.venv/bin/python extract_proper_nouns.py
"""
import re
from collections import defaultdict
from pathlib import Path
import spacy
# Input manuscript and output report; both are relative paths, so the script
# must be run from the directory that contains the manuscript.
SOURCE = Path("Audio Master Nem Full.txt")
OUTPUT = Path("proper_nouns.txt")
# ── spaCy setup ────────────────────────────────────────────────────────────────
# The model is loaded at module level, i.e. as soon as this file is executed
# or imported.
print("Loading spaCy model …")
nlp = spacy.load("en_core_web_sm")
# Increase max length for the large source file
# (spaCy's per-text character limit).
nlp.max_length = 2_000_000
# ── NER label groups ───────────────────────────────────────────────────────────
# Each set lists the spaCy entity labels that are routed to one section of the
# report by the NER pass further down.
PERSON_LABELS = {"PERSON"}
PLACE_LABELS = {"GPE", "LOC", "FAC"}
ORG_LABELS = {"ORG", "NORP"}
OTHER_LABELS = {"EVENT", "WORK_OF_ART", "LAW", "PRODUCT", "LANGUAGE"}
# ── Noise filters ──────────────────────────────────────────────────────────────
# Stop list of whole words that must never be reported as proper nouns:
# common function words plus words that recur throughout this text
# (CHAPTER, VERSE, YEA, BEHOLD, …). The pattern is anchored at both ends and
# is written in upper case; is_noise() upper-cases the candidate before
# matching. The all-caps-header and short-token checks live in is_noise().
SKIP_PATTERNS = re.compile(
r"^(THE|A|AN|AND|OF|IN|TO|FOR|BY|AT|IS|WAS|BE|HE|SHE|IT|"
r"CHAPTER|VERSE|YEA|BEHOLD|LORD|GOD|CHRIST|HOLY|GHOST)$"
)
def is_noise(text: str) -> bool:
    """Return True when *text* should be kept out of the proper-noun list.

    After trimming surrounding whitespace, a candidate is rejected when it

    * is at most one character long,
    * is written entirely in capitals and is longer than four characters
      (treated as a section-header word),
    * matches the SKIP_PATTERNS stop list once upper-cased, or
    * contains any character other than ASCII letters, hyphen, apostrophe
      and space (digits, punctuation, symbols).

    The checks are evaluated in that order and stop at the first hit.
    """
    candidate = text.strip()
    return (
        len(candidate) <= 1
        or (candidate.isupper() and len(candidate) > 4)
        or SKIP_PATTERNS.match(candidate.upper()) is not None
        or re.search(r"[^a-zA-Z\-' ]", candidate) is not None
    )
def canonical(text: str) -> str:
    """Normalise whitespace and title-case *text*.

    Runs of whitespace (including newlines) collapse to a single space and
    leading/trailing whitespace is dropped. Each word is then title-cased.

    ``str.title()`` treats an apostrophe as a word boundary, so a possessive
    such as "nephi's" would come out as "Nephi'S". The trailing possessive
    "s" is therefore put back in lower case; names like "O'Shea" are left
    untouched because the letter after the apostrophe starts a longer word.
    """
    titled = " ".join(text.split()).title()
    # Only an "S" that ends a word directly after an apostrophe is a
    # possessive suffix; both straight and typographic apostrophes occur.
    return re.sub(r"(?<=[A-Za-z])(['’])S\b", r"\1s", titled)
# ── Read and process ───────────────────────────────────────────────────────────
print(f"Reading '{SOURCE}'")
raw_text = SOURCE.read_text(encoding="utf-8")

# spaCy refuses any text longer than nlp.max_length. Raise the limit when the
# manuscript has outgrown the configured value instead of crashing mid-run.
if len(raw_text) >= nlp.max_length:
    nlp.max_length = len(raw_text) + 1

print("Running spaCy pipeline (this may take a minute) …")
doc = nlp(raw_text)

# Buckets: keyed by display-group name → set of canonical strings
buckets: dict[str, set[str]] = defaultdict(set)

# Entity label → display group. Labels that are not listed here (including
# everything in OTHER_LABELS) fall through to "Other Named Things".
# NOTE(review): that fallback also receives spaCy's numeric/time labels
# (DATE, CARDINAL, …) whenever they are spelled out in letters — confirm
# that this is wanted.
_group_for_label: dict[str, str] = {}
for _labels, _group in (
    (PERSON_LABELS, "People & Characters"),
    (PLACE_LABELS, "Places & Lands"),
    (ORG_LABELS, "Groups & Nations"),
):
    _group_for_label.update(dict.fromkeys(_labels, _group))

# 1. NER pass — trust spaCy's entity labels
for ent in doc.ents:
    # Filter on the text as it appears in the source (whitespace collapsed).
    # Title-casing first would hide all-caps section headers from is_noise().
    surface = " ".join(ent.text.split())
    if is_noise(surface):
        continue
    group = _group_for_label.get(ent.label_, "Other Named Things")
    buckets[group].add(canonical(surface))

# 2. PROPN pass — catch names spaCy didn't recognise as entities
#    Only include tokens that are inside a sentence (not at position 0)
#    and are title-cased (filters out all-caps headers).
# Every name found by the NER pass, gathered once so the loop below does a
# single set lookup per token.
captured_by_ner: set[str] = set().union(*buckets.values())
for token in doc:
    if token.pos_ != "PROPN":
        continue
    text = token.text.strip()
    if not text:
        continue  # nothing left after stripping; text[0] would raise
    if not text[0].isupper() or text.isupper():
        continue  # skip lower-case starts and all-caps words
    if token.i == token.sent.start:
        continue  # skip sentence-initial (could be any word)
    name = canonical(text)
    if is_noise(name):
        continue
    # Only add if not already captured by NER
    if name not in captured_by_ner:
        buckets["Unclassified Proper Nouns"].add(name)
# ── Write output ───────────────────────────────────────────────────────────────
# Sections appear in the report in this order; empty sections are omitted.
GROUP_ORDER = [
    "People & Characters",
    "Places & Lands",
    "Groups & Nations",
    "Other Named Things",
    "Unclassified Proper Nouns",
]

# Rule drawn above and below each section heading. The previous expression
# multiplied an empty string, so the rules were written as blank lines.
# A plain ASCII hyphen keeps the report free of ambiguous Unicode.
SECTION_RULE = "-" * 50

lines: list[str] = [
    "PROPER NOUNS — Book of the Nem",
    "=" * 50,
    (
        "Review this list for TTS mispronunciations.\n"
        "Each entry is the form that appears in the text.\n"
    ),
]

# NOTE(review): a name that spaCy labels differently in different places is
# listed under each of its groups and therefore counted once per group.
total = 0
for group in GROUP_ORDER:
    names = sorted(buckets.get(group, set()), key=str.casefold)
    if not names:
        continue
    lines.append(f"\n{SECTION_RULE}")
    lines.append(f"{group.upper()} ({len(names)})")
    lines.append(SECTION_RULE)
    lines.extend(f" {name}" for name in names)
    total += len(names)

lines.append(f"\n{'=' * 50}")
lines.append(f"TOTAL: {total} unique proper nouns")
OUTPUT.write_text("\n".join(lines), encoding="utf-8")
print(f"\n✓ Written '{OUTPUT}' ({total} unique proper nouns)")