improved proper noun parsing

This commit is contained in:
2026-02-26 00:57:40 -07:00
parent 949bd7c203
commit 6cefc3c862
4 changed files with 472 additions and 2286 deletions

View File

@ -18,6 +18,25 @@ from collections import defaultdict
from pathlib import Path
import spacy
from wordfreq import top_n_list
# ── Top 10 000 most-frequent English words ──────────────────────────
# Frequency filter: any candidate that is a common English word is assumed
# to be a false positive from NER unless explicitly whitelisted below.
TOP_10K_ENGLISH: frozenset[str] = frozenset(top_n_list("en", 10_000))
# Words in the top-10k list that are genuine proper nouns in this text —
# keep them despite the frequency filter.  All entries are lowercase
# because membership tests compare against `t.lower()`.
PROPER_NOUN_WHITELIST: frozenset[str] = frozenset({
    # Biblical names
    "aaron", "abel", "abraham", "adam", "cain", "eden", "egypt",
    "elijah", "ephraim", "eve", "gad", "ham", "isaac", "israel",
    "jacob", "james", "jehovah", "john", "joseph", "judah",
    "laban", "lehi", "levi", "micah", "michael", "moses", "noah",
    "peter", "pharaoh", "samuel", "sarah", "sarai", "seth", "simeon",
    "timothy", "zion",
    # Book-specific names that happen to match English words
    "alma", "ether", "gideon", "limhi", "mormon", "moroni", "mulek",
    "mosiah", "nephi", "satan", "sidon",
})
# Input transcript to scan and output file for the grouped proper-noun report.
SOURCE = Path("Audio Master Nem Full.txt")
OUTPUT = Path("proper_nouns.txt")
@ -35,12 +54,29 @@ ORG_LABELS = {"ORG", "NORP"}
OTHER_LABELS = {"EVENT", "WORK_OF_ART", "LAW", "PRODUCT", "LANGUAGE"}
# ── Noise filters ──────────────────────────────────────────────────────────────
# All-caps lines are section headers, not spoken names — skip them.
# Also skip very short tokens that are likely artefacts.
SKIP_PATTERNS = re.compile(
r"^(THE|A|AN|AND|OF|IN|TO|FOR|BY|AT|IS|WAS|BE|HE|SHE|IT|"
r"CHAPTER|VERSE|YEA|BEHOLD|LORD|GOD|CHRIST|HOLY|GHOST)$"
)
# Common English words that should be dropped when splitting multi-word entities.
# Entries are UPPERCASE because the membership test compares `t.upper()`
# against this set (see `is_noise`).
STOP_WORDS: set[str] = {
    "A", "AN", "AND", "AS", "AT", "BE", "BUT", "BY",
    "DO", "DID", "DOTH",
    "EVEN", "FOR", "FROM",
    "HAD", "HAS", "HAVE", "HATH", "HE", "HER", "HIS", "HOW",
    "I", "IN", "IS", "IT", "ITS",
    "MAY", "ME", "MORE", "MY",
    "NAY", "NO", "NOT", "NOW",
    "OF", "OR", "OUR",
    "SHALL", "SHE", "SO", "SOME",
    "THAT", "THE", "THEE", "THEIR", "THEN", "THERE", "THESE", "THEY",
    "THIS", "THOSE", "THOU", "THUS", "THY", "TO",
    "UP", "UPON", "US",
    "WAS", "WE", "WHEN", "WHERE", "WHICH", "WHO", "WILL", "WITH",
    "YE", "YEA", "YET", "YOU", "YOUR",
    # Book-specific common words not worth flagging
    "BEHOLD", "CHAPTER", "CHRIST", "GOD", "GHOST", "HOLY", "LORD", "VERSE",
    # Generic nouns that slip through NER
    "CITY", "DAYS", "DAY", "GREAT", "LAND", "MAN", "MEN", "NEW",
    "PEOPLE", "SON", "TIME",
}
def is_noise(text: str) -> bool:
t = text.strip()
@ -48,9 +84,12 @@ def is_noise(text: str) -> bool:
return True
if t.isupper() and len(t) > 4: # all-caps section header word
return True
if SKIP_PATTERNS.match(t.upper()):
if t.upper() in STOP_WORDS:
return True
if re.search(r"[^a-zA-Z\-' ]", t): # contains digits or symbols
if re.search(r"[^a-zA-Z\-']", t): # contains digits, spaces, or symbols
return True
# Drop common English words (no hyphens) unless whitelisted as proper nouns.
if "-" not in t and t.lower() in TOP_10K_ENGLISH and t.lower() not in PROPER_NOUN_WHITELIST:
return True
return False
@ -60,6 +99,11 @@ def canonical(text: str) -> str:
return " ".join(text.split()).title()
def split_words(phrase: str) -> list[str]:
    """Break *phrase* into individual tokens at whitespace.

    Hyphenated compounds (e.g. "Anti-Nephi-Lehi") contain no spaces and
    therefore survive as a single token; an empty or all-space phrase
    yields an empty list.
    """
    tokens = phrase.split()
    return tokens
# ── Read and process ───────────────────────────────────────────────────────────
print(f"Reading '{SOURCE}'")
raw_text = SOURCE.read_text(encoding="utf-8")
@ -71,20 +115,23 @@ doc = nlp(raw_text)
buckets: dict[str, set[str]] = defaultdict(set)
# 1. NER pass — trust spaCy's entity labels
# Multi-word entities (e.g. "Peter James John") are split into individual
# words; hyphenated words (e.g. "Anti-Nephi-Lehi") stay as one token.
for ent in doc.ents:
name = canonical(ent.text)
if is_noise(name):
continue
if ent.label_ in PERSON_LABELS:
buckets["People & Characters"].add(name)
elif ent.label_ in PLACE_LABELS:
buckets["Places & Lands"].add(name)
elif ent.label_ in ORG_LABELS:
buckets["Groups & Nations"].add(name)
elif ent.label_ in OTHER_LABELS:
buckets["Other Named Things"].add(name)
else:
buckets["Other Named Things"].add(name)
phrase = canonical(ent.text)
for word in split_words(phrase):
if is_noise(word):
continue
if ent.label_ in PERSON_LABELS:
buckets["People & Characters"].add(word)
elif ent.label_ in PLACE_LABELS:
buckets["Places & Lands"].add(word)
elif ent.label_ in ORG_LABELS:
buckets["Groups & Nations"].add(word)
elif ent.label_ in OTHER_LABELS:
buckets["Other Named Things"].add(word)
else:
buckets["Other Named Things"].add(word)
# 2. PROPN pass — catch names spaCy didn't recognise as entities
# Only include tokens that are inside a sentence (not at position 0)
@ -97,13 +144,13 @@ for token in doc:
continue # skip all-caps
if token.i == token.sent.start:
continue # skip sentence-initial (could be any word)
name = canonical(text)
if is_noise(name):
word = canonical(text)
if is_noise(word):
continue
# Only add if not already captured by NER
already_captured = any(name in s for s in buckets.values())
already_captured = any(word in s for s in buckets.values())
if not already_captured:
buckets["Unclassified Proper Nouns"].add(name)
buckets["Unclassified Proper Nouns"].add(word)
# ── Write output ───────────────────────────────────────────────────────────────
GROUP_ORDER = [