improved proper noun parsing

Clean correct_words.json: single words, filter stop words, keep two-part proper names
2026-02-26 00:57:40 -07:00 · 2026-02-25 23:50:52 -07:00
5 changed files with 781 additions and 3076 deletions
--- a/create_audiobook_nem.py
+++ b/create_audiobook_nem.py
@ -11,6 +11,7 @@ Output .wav files are written to OUTPUT_DIR (created automatically).
 """
 import re
 import time
 import numpy as np
 import soundfile as sf
 import torch
@ -51,19 +52,19 @@ BOOKS = [
    # label                       start_marker                       voice         output_wav
    ("Introduction",              "Introduction",                    "af_heart",   "00_introduction.wav"),
    ("Book of Hagoth",            "THE BOOK OF HAGOTH",              "am_fenrir",  "01_hagoth.wav"),
-    ("Shi-Tugo I",                "THE FIRST BOOK OF SHI-TUGO",      "am_eric",    "02_shi_tugo_1.wav"),
+    # ("Shi-Tugo I",                "THE FIRST BOOK OF SHI-TUGO",      "am_eric",    "02_shi_tugo_1.wav"),
-    ("Sanempet",                  "THE BOOK OF SANEMPET",            "am_liam",    "03_sanempet.wav"),
+    # ("Sanempet",                  "THE BOOK OF SANEMPET",            "am_liam",    "03_sanempet.wav"),
-    ("Oug",                       "THE BOOK OF OUG",                 "am_michael", "04_oug.wav"),
+    # ("Oug",                       "THE BOOK OF OUG",                 "am_michael", "04_oug.wav"),
-    ("Temple Writings of Oug",    "THE BOOK OF",                     "am_michael", "05_temple_writings_oug.wav"),
+    # ("Temple Writings of Oug",    "THE BOOK OF",                     "am_michael", "05_temple_writings_oug.wav"),
-    ("Sacred Temple Writings",    "THE SACRED",                      "am_michael", "06_sacred_temple_writings.wav"),
+    # ("Sacred Temple Writings",    "THE SACRED",                      "am_michael", "06_sacred_temple_writings.wav"),
-    ("Samuel the Lamanite I",     "THE FIRST BOOK",                  "am_echo",    "07_samuel_lamanite_1.wav"),
+    # ("Samuel the Lamanite I",     "THE FIRST BOOK",                  "am_echo",    "07_samuel_lamanite_1.wav"),
-    ("Samuel the Lamanite II",    "THE SECOND BOOK",                 "am_echo",    "08_samuel_lamanite_2.wav"),
+    # ("Samuel the Lamanite II",    "THE SECOND BOOK",                 "am_echo",    "08_samuel_lamanite_2.wav"),
-    ("Manti",                     "THE BOOK OF MANTI",               "am_onyx",    "09_manti.wav"),
+    # ("Manti",                     "THE BOOK OF MANTI",               "am_onyx",    "09_manti.wav"),
-    ("Pa Nat I",                  "THE FIRST BOOK OF PA NAT",        "af_nicole",  "10_pa_nat_1.wav"),
+    # ("Pa Nat I",                  "THE FIRST BOOK OF PA NAT",        "af_nicole",  "10_pa_nat_1.wav"),
-    ("Moroni I",                  "THE FIRST BOOK OF MORONI",        "am_adam",    "11_moroni_1.wav"),
+    # ("Moroni I",                  "THE FIRST BOOK OF MORONI",        "am_adam",    "11_moroni_1.wav"),
-    ("Moroni II",                 "THE SECOND BOOK OF MORONI",       "am_adam",    "12_moroni_2.wav"),
+    # ("Moroni II",                 "THE SECOND BOOK OF MORONI",       "am_adam",    "12_moroni_2.wav"),
-    ("Moroni III",                "THE THIRD BOOK OF MORONI",        "am_adam",    "13_moroni_3.wav"),
+    # ("Moroni III",                "THE THIRD BOOK OF MORONI",        "am_adam",    "13_moroni_3.wav"),
-    ("Shioni",                    "THE BOOK OF SHIONI",              "am_puck",    "14_shioni.wav"),
+    # ("Shioni",                    "THE BOOK OF SHIONI",              "am_puck",    "14_shioni.wav"),
 ]
 # ── Helpers ────────────────────────────────────────────────────────────────────
@ -118,8 +119,18 @@ def clean_text(text: str) -> str:
    return text.strip()
 def _fmt_duration(seconds: float) -> str:
    """Format seconds as 'Xm Ys' or 'Xs'."""
    if seconds >= 60:
        m, s = divmod(int(seconds), 60)
        return f"{m}m {s:02d}s"
    return f"{seconds:.0f}s"
 def generate_audio(pipeline: KPipeline, text: str, voice: str,
-                   output_path: Path) -> None:
+                   output_path: Path) -> float:
    """Generate audio and return wall-clock seconds elapsed."""
    t0 = time.monotonic()
    chunks = []
    for _, _, chunk_audio in pipeline(text, voice=voice, speed=SPEED):
        if hasattr(chunk_audio, "numpy"):
@ -131,10 +142,13 @@ def generate_audio(pipeline: KPipeline, text: str, voice: str,
    if chunks:
        audio = np.concatenate(chunks, axis=0)
        sf.write(str(output_path), audio, SAMPLE_RATE)
        elapsed = time.monotonic() - t0
        duration = len(audio) / SAMPLE_RATE
-        print(f"  ✓  Saved '{output_path.name}'  ({duration:.1f}s)")
+        print(f"  ✓  Saved '{output_path.name}'  ({duration:.1f}s audio  |  {elapsed:.1f}s wall-clock)")
    else:
        elapsed = time.monotonic() - t0
        print(f"  ✗  No audio produced for voice='{voice}'")
    return elapsed
 # ── Main ───────────────────────────────────────────────────────────────────────
@ -156,19 +170,59 @@ def main() -> None:
    print("Initialising Kokoro pipeline …")
    pipeline = KPipeline(lang_code=LANG_CODE)
    # Pre-compute char counts for all sections so we can estimate ETAs
    section_chars: dict[str, int] = {
        label: len(clean_text(sections[label]))
        for label, _, _, _ in BOOKS
        if label in sections
    }
    chars_per_sec: float | None = None   # derived from the first book that finishes
    timing_rows: list[tuple[str, int, float]] = []  # (label, chars, elapsed)
    for label, marker, voice, wav_name in BOOKS:
        if label not in sections:
            continue  # marker was not found; warning already printed
        print(f"\n[{label}]  voice={voice}  →  {wav_name}")
        text = clean_text(sections[label])
        if not text:
            print("  ⚠  Empty text — skipping")
            continue
-        out_path = OUTPUT_DIR / wav_name
+        text = clean_text(sections[label])
-        generate_audio(pipeline, text, voice, out_path)
+        if not text:
            print(f"\n[{label}]  ⚠  Empty text — skipping")
            continue
        chars = section_chars[label]
        # Print ETA once we have a calibration rate
        if chars_per_sec is not None:
            eta_sec = chars / chars_per_sec
            eta_str = _fmt_duration(eta_sec)
            print(f"\n[{label}]  voice={voice}  →  {wav_name}  (est. {eta_str})")
        else:
            print(f"\n[{label}]  voice={voice}  →  {wav_name}  (timing calibration run)")
        stem, ext = wav_name.rsplit(".", 1)
        out_path = OUTPUT_DIR / f"{stem}_{voice}.{ext}"
        elapsed = generate_audio(pipeline, text, voice, out_path)
        timing_rows.append((label, chars, elapsed))
        # Calibrate from first completed book
        if chars_per_sec is None and elapsed > 0:
            chars_per_sec = chars / elapsed
            print(f"  ⏱  Calibrated: {chars_per_sec:.0f} chars/sec")
    # ── Summary ────────────────────────────────────────────────────────────────
    print("\n" + "─" * 60)
    print(f"  {'Section':<30}  {'Chars':>7}  {'Actual':>8}  {'Est':>8}")
    print("─" * 60)
    for i, (label, chars, elapsed) in enumerate(timing_rows):
        actual_str = _fmt_duration(elapsed)
        if i == 0 or chars_per_sec is None:
            est_str = "(calibration)"
        else:
            est_str = _fmt_duration(chars / chars_per_sec)
        print(f"  {label:<30}  {chars:>7,}  {actual_str:>8}  {est_str:>8}")
    total_elapsed = sum(e for _, _, e in timing_rows)
    print("─" * 60)
    print(f"  {'TOTAL':<30}  {sum(c for _,c,_ in timing_rows):>7,}  {_fmt_duration(total_elapsed):>8}")
    print("\nDone.")
--- a/extract_proper_nouns.py
+++ b/extract_proper_nouns.py
@ -18,6 +18,25 @@ from collections import defaultdict
 from pathlib import Path
 import spacy
 from wordfreq import top_n_list
 # ── Top 10 000 most-frequent English words ──────────────────────────
 TOP_10K_ENGLISH: frozenset[str] = frozenset(top_n_list("en", 10_000))
 # Words in the top-10k list that are genuine proper nouns in this text —
 # keep them despite the frequency filter.
 PROPER_NOUN_WHITELIST: frozenset[str] = frozenset({
    # Biblical names
    "aaron", "abel", "abraham", "adam", "cain", "eden", "egypt",
    "elijah", "ephraim", "eve", "gad", "ham", "isaac", "israel",
    "jacob", "james", "jehovah", "john", "joseph", "judah",
    "laban", "lehi", "levi", "micah", "michael", "moses", "noah",
    "peter", "pharaoh", "samuel", "sarah", "sarai", "seth", "simeon",
    "timothy", "zion",
    # Book-specific names that happen to match English words
    "alma", "ether", "gideon", "limhi", "mormon", "moroni", "mulek",
    "mosiah", "nephi", "satan", "sidon",
 })
 SOURCE = Path("Audio Master Nem Full.txt")
 OUTPUT = Path("proper_nouns.txt")
@ -35,12 +54,29 @@ ORG_LABELS    = {"ORG", "NORP"}
 OTHER_LABELS  = {"EVENT", "WORK_OF_ART", "LAW", "PRODUCT", "LANGUAGE"}
 # ── Noise filters ──────────────────────────────────────────────────────────────
-# All-caps lines are section headers, not spoken names — skip them.
+# Common English words that should be dropped when splitting multi-word entities.
-# Also skip very short tokens that are likely artefacts.
+STOP_WORDS: set[str] = {
-SKIP_PATTERNS = re.compile(
+    "A", "AN", "AND", "AS", "AT", "BE", "BUT", "BY",
-    r"^(THE|A|AN|AND|OF|IN|TO|FOR|BY|AT|IS|WAS|BE|HE|SHE|IT|"
+    "DO", "DID", "DOTH",
-    r"CHAPTER|VERSE|YEA|BEHOLD|LORD|GOD|CHRIST|HOLY|GHOST)$"
+    "EVEN", "FOR", "FROM",
-)
+    "HAD", "HAS", "HAVE", "HATH", "HE", "HER", "HIS", "HOW",
    "I", "IN", "IS", "IT", "ITS",
    "MAY", "ME", "MORE", "MY",
    "NAY", "NO", "NOT", "NOW",
    "OF", "OR", "OUR",
    "SHALL", "SHE", "SO", "SOME",
    "THAT", "THE", "THEE", "THEIR", "THEN", "THERE", "THESE", "THEY",
    "THIS", "THOSE", "THOU", "THUS", "THY", "TO",
    "UP", "UPON", "US",
    "WAS", "WE", "WHEN", "WHERE", "WHICH", "WHO", "WILL", "WITH",
    "YE", "YEA", "YET", "YOU", "YOUR",
    # Book-specific common words not worth flagging
    "BEHOLD", "CHAPTER", "CHRIST", "GOD", "GHOST", "HOLY", "LORD", "VERSE",
    # Generic nouns that slip through NER
    "CITY", "DAYS", "DAY", "GREAT", "LAND", "MAN", "MEN", "NEW",
    "PEOPLE", "SON", "TIME",
 }
 def is_noise(text: str) -> bool:
    t = text.strip()
@ -48,9 +84,12 @@ def is_noise(text: str) -> bool:
        return True
    if t.isupper() and len(t) > 4:      # all-caps section header word
        return True
-    if SKIP_PATTERNS.match(t.upper()):
+    if t.upper() in STOP_WORDS:
        return True
-    if re.search(r"[^a-zA-Z\-' ]", t):  # contains digits or symbols
+    if re.search(r"[^a-zA-Z\-']", t):   # contains digits, spaces, or symbols
        return True
    # Drop common English words (no hyphens) unless whitelisted as proper nouns.
    if "-" not in t and t.lower() in TOP_10K_ENGLISH and t.lower() not in PROPER_NOUN_WHITELIST:
        return True
    return False
@ -60,6 +99,11 @@ def canonical(text: str) -> str:
    return " ".join(text.split()).title()
 def split_words(phrase: str) -> list[str]:
     """Break *phrase* into its whitespace-separated tokens.

     Hyphens are not separators, so a hyphenated name such as
     "Anti-Nephi-Lehi" comes back as a single token. An empty or
     whitespace-only phrase yields an empty list.
     """
     tokens = phrase.split()
     return tokens
 # ── Read and process ───────────────────────────────────────────────────────────
 print(f"Reading '{SOURCE}' …")
 raw_text = SOURCE.read_text(encoding="utf-8")
@ -71,20 +115,23 @@ doc = nlp(raw_text)
 buckets: dict[str, set[str]] = defaultdict(set)
 # 1. NER pass — trust spaCy's entity labels
 #    Multi-word entities (e.g. "Peter James John") are split into individual
 #    words; hyphenated words (e.g. "Anti-Nephi-Lehi") stay as one token.
 for ent in doc.ents:
-    name = canonical(ent.text)
+    phrase = canonical(ent.text)
-    if is_noise(name):
+    for word in split_words(phrase):
        if is_noise(word):
            continue
        if ent.label_ in PERSON_LABELS:
-        buckets["People & Characters"].add(name)
+            buckets["People & Characters"].add(word)
        elif ent.label_ in PLACE_LABELS:
-        buckets["Places & Lands"].add(name)
+            buckets["Places & Lands"].add(word)
        elif ent.label_ in ORG_LABELS:
-        buckets["Groups & Nations"].add(name)
+            buckets["Groups & Nations"].add(word)
        elif ent.label_ in OTHER_LABELS:
-        buckets["Other Named Things"].add(name)
+            buckets["Other Named Things"].add(word)
        else:
-        buckets["Other Named Things"].add(name)
+            buckets["Other Named Things"].add(word)
 # 2. PROPN pass — catch names spaCy didn't recognise as entities
 #    Only include tokens that are inside a sentence (not at position 0)
@ -97,13 +144,13 @@ for token in doc:
        continue                          # skip all-caps
    if token.i == token.sent.start:
        continue                          # skip sentence-initial (could be any word)
-    name = canonical(text)
+    word = canonical(text)
-    if is_noise(name):
+    if is_noise(word):
        continue
    # Only add if not already captured by NER
-    already_captured = any(name in s for s in buckets.values())
+    already_captured = any(word in s for s in buckets.values())
    if not already_captured:
-        buckets["Unclassified Proper Nouns"].add(name)
+        buckets["Unclassified Proper Nouns"].add(word)
 # ── Write output ───────────────────────────────────────────────────────────────
 GROUP_ORDER = [
--- a/output_proper_nouns/correct_words.json
+++ b/output_proper_nouns/correct_words.json
--- a/output_proper_nouns/manifest.json
+++ b/output_proper_nouns/manifest.json
--- a/proper_nouns.txt
+++ b/proper_nouns.txt
Author	SHA1	Message	Date
dillonj	6cefc3c862	improved proper noun parsing	2026-02-26 00:57:40 -07:00
dillonj	949bd7c203	Clean correct_words.json: single words, filter stop words, keep two-part proper names	2026-02-25 23:50:52 -07:00