2026-02-24 14:40:31 -07:00
|
|
|
|
"""
|
|
|
|
|
|
extract_proper_nouns.py
|
|
|
|
|
|
───────────────────────
|
|
|
|
|
|
Scan 'Audio Master Nem Full.txt' and extract all proper nouns into
|
|
|
|
|
|
'proper_nouns.txt', grouped by type and sorted alphabetically.
|
|
|
|
|
|
|
|
|
|
|
|
Uses spaCy for:
|
|
|
|
|
|
• NER (PERSON, GPE, LOC, ORG, …) – named entity recognition
|
|
|
|
|
|
• POS (PROPN) – catches names spaCy's NER misses
|
|
|
|
|
|
because they are not in its training vocabulary (e.g. Hagoth, Meninta)
|
|
|
|
|
|
|
|
|
|
|
|
Run:
|
|
|
|
|
|
.venv/bin/python extract_proper_nouns.py
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
|
from collections import defaultdict
|
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
import spacy
|
2026-02-26 00:57:40 -07:00
|
|
|
|
from wordfreq import top_n_list
|
|
|
|
|
|
|
|
|
|
|
|
# ── Top 10 000 most-frequent English words ──────────────────────────
TOP_10K_ENGLISH: frozenset[str] = frozenset(top_n_list("en", 10_000))

# Names that collide with common English words but are genuine proper
# nouns in this text — exempt them from the frequency filter.
_BIBLICAL_NAMES = (
    "aaron abel abraham adam cain eden egypt "
    "elijah ephraim eve gad ham isaac israel "
    "jacob james jehovah john joseph judah "
    "laban lehi levi micah michael moses noah "
    "peter pharaoh samuel sarah sarai seth simeon "
    "timothy zion"
)
# Book-specific names that happen to match English words.
_BOOK_NAMES = (
    "alma ether gideon limhi mormon moroni mulek "
    "mosiah nephi satan sidon"
)
PROPER_NOUN_WHITELIST: frozenset[str] = frozenset(
    f"{_BIBLICAL_NAMES} {_BOOK_NAMES}".split()
)
|
2026-02-24 14:40:31 -07:00
|
|
|
|
|
|
|
|
|
|
# Input text to scan and output report path.
SOURCE = Path("Audio Master Nem Full.txt")
OUTPUT = Path("proper_nouns.txt")

# ── spaCy setup ────────────────────────────────────────────────────────────────
print("Loading spaCy model …")
nlp = spacy.load("en_core_web_sm")
# Increase max length for the large source file (default is 1 000 000 chars);
# processing is done as one document, so the cap must exceed the file size.
nlp.max_length = 2_000_000

# ── NER label groups ───────────────────────────────────────────────────────────
# spaCy entity labels mapped onto the display groups used in the output file.
PERSON_LABELS = {"PERSON"}
PLACE_LABELS = {"GPE", "LOC", "FAC"}
ORG_LABELS = {"ORG", "NORP"}
OTHER_LABELS = {"EVENT", "WORK_OF_ART", "LAW", "PRODUCT", "LANGUAGE"}
|
|
|
|
|
|
|
|
|
|
|
|
# ── Noise filters ──────────────────────────────────────────────────────────────
# Common English words that should be dropped when splitting multi-word entities.
STOP_WORDS: set[str] = set(
    (
        # Function words and archaic pronouns / verb forms
        "A AN AND AS AT BE BUT BY "
        "DO DID DOTH "
        "EVEN FOR FROM "
        "HAD HAS HAVE HATH HE HER HIS HOW "
        "I IN IS IT ITS "
        "MAY ME MORE MY "
        "NAY NO NOT NOW "
        "OF OR OUR "
        "SHALL SHE SO SOME "
        "THAT THE THEE THEIR THEN THERE THESE THEY "
        "THIS THOSE THOU THUS THY TO "
        "UP UPON US "
        "WAS WE WHEN WHERE WHICH WHO WILL WITH "
        "YE YEA YET YOU YOUR "
        # Book-specific common words not worth flagging
        "BEHOLD CHAPTER CHRIST GOD GHOST HOLY LORD VERSE "
        # Generic nouns that slip through NER
        "CITY DAYS DAY GREAT LAND MAN MEN NEW "
        "PEOPLE SON TIME"
    ).split()
)
|
|
|
|
|
|
|
2026-02-24 14:40:31 -07:00
|
|
|
|
|
|
|
|
|
|
def is_noise(text: str) -> bool:
    """Return True if *text* should be excluded from the proper-noun output."""
    word = text.strip()
    lowered = word.lower()

    # Single characters carry no signal.
    if len(word) <= 1:
        return True
    # All-caps words longer than 4 chars are section-header artifacts.
    if word.isupper() and len(word) > 4:
        return True
    if word.upper() in STOP_WORDS:
        return True
    # Reject anything containing digits, spaces, or symbols beyond - and '.
    if re.search(r"[^a-zA-Z\-']", word):
        return True
    # Frequency filter: drop common English words (no hyphens) unless
    # explicitly whitelisted as proper nouns for this text.
    return (
        "-" not in word
        and lowered in TOP_10K_ENGLISH
        and lowered not in PROPER_NOUN_WHITELIST
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def canonical(text: str) -> str:
    """Normalise whitespace and title-case *text*.

    Uses str.title() as the base transform so hyphenated names such as
    "Anti-Nephi-Lehi" keep every segment capitalised, then lowers the
    spurious capital that str.title() inserts after an apostrophe —
    without the fix, possessives like "nephi's" come out as "Nephi'S".
    """
    titled = " ".join(text.split()).title()
    # str.title() treats the apostrophe as a word boundary; undo that.
    return re.sub(
        r"(?<=[A-Za-z])'([A-Z])",
        lambda m: "'" + m.group(1).lower(),
        titled,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-02-26 00:57:40 -07:00
|
|
|
|
def split_words(phrase: str) -> list[str]:
    """Break *phrase* into whitespace-separated tokens.

    Hyphenated words (e.g. "Anti-Nephi-Lehi") remain a single token.
    """
    tokens = phrase.split()
    return tokens
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-02-24 14:40:31 -07:00
|
|
|
|
# ── Read and process ───────────────────────────────────────────────────────────
print(f"Reading '{SOURCE}' …")
raw_text = SOURCE.read_text(encoding="utf-8")

print("Running spaCy pipeline (this may take a minute) …")
doc = nlp(raw_text)

# Buckets: keyed by display-group name → set of canonical strings
buckets: dict[str, set[str]] = defaultdict(set)

# Dispatch table: spaCy entity label → display group. Labels outside every
# group fall back to "Other Named Things".
_LABEL_TO_GROUP: dict[str, str] = {
    label: group
    for labels, group in (
        (PERSON_LABELS, "People & Characters"),
        (PLACE_LABELS, "Places & Lands"),
        (ORG_LABELS, "Groups & Nations"),
        (OTHER_LABELS, "Other Named Things"),
    )
    for label in labels
}

# 1. NER pass — trust spaCy's entity labels.
# Multi-word entities (e.g. "Peter James John") are split into individual
# words; hyphenated words (e.g. "Anti-Nephi-Lehi") stay as one token.
for ent in doc.ents:
    group = _LABEL_TO_GROUP.get(ent.label_, "Other Named Things")
    for word in split_words(canonical(ent.text)):
        if not is_noise(word):
            buckets[group].add(word)

# 2. PROPN pass — catch names spaCy didn't recognise as entities.
# Keep only title-cased tokens (filters out all-caps headers) that are not
# sentence-initial (a sentence-initial capital could be any word).
for token in doc:
    if token.pos_ != "PROPN":
        continue
    raw = token.text.strip()
    if raw.isupper() or not raw[0].isupper():
        continue  # all-caps header word, or not capitalised at all
    if token.i == token.sent.start:
        continue  # sentence-initial — capitalisation carries no signal
    candidate = canonical(raw)
    if is_noise(candidate):
        continue
    # Skip anything the NER pass already filed in some group.
    if any(candidate in names for names in buckets.values()):
        continue
    buckets["Unclassified Proper Nouns"].add(candidate)
|
2026-02-24 14:40:31 -07:00
|
|
|
|
|
|
|
|
|
|
# ── Write output ───────────────────────────────────────────────────────────────
# Groups are emitted in a fixed, reader-friendly order; empty groups are skipped.
GROUP_ORDER = [
    "People & Characters",
    "Places & Lands",
    "Groups & Nations",
    "Other Named Things",
    "Unclassified Proper Nouns",
]

_RULE = "─" * 50

lines: list[str] = [
    "PROPER NOUNS — Book of the Nem",
    "=" * 50,
    "Review this list for TTS mispronunciations.\n"
    "Each entry is the form that appears in the text.\n",
]

total = 0
for group in GROUP_ORDER:
    names = sorted(buckets.get(group, set()), key=str.casefold)
    if not names:
        continue
    lines.append(f"\n{_RULE}")
    lines.append(f"{group.upper()} ({len(names)})")
    lines.append(_RULE)
    lines.extend(f"  {name}" for name in names)
    total += len(names)

lines.append(f"\n{'=' * 50}")
lines.append(f"TOTAL: {total} unique proper nouns")

OUTPUT.write_text("\n".join(lines), encoding="utf-8")
print(f"\n✓ Written '{OUTPUT}' ({total} unique proper nouns)")
|