Compare commits
f0e0adf24b...6cefc3c862 — 2 commits

| Author | SHA1 | Date |
|---|---|---|
|  | 6cefc3c862 |  |
|  | 949bd7c203 |  |
@@ -11,6 +11,7 @@ Output .wav files are written to OUTPUT_DIR (created automatically).
 """
 
 import re
+import time
 import numpy as np
 import soundfile as sf
 import torch
@@ -51,19 +52,19 @@ BOOKS = [
     # label                    start_marker                  voice         output_wav
     ("Introduction",           "Introduction",               "af_heart",   "00_introduction.wav"),
     ("Book of Hagoth",         "THE BOOK OF HAGOTH",         "am_fenrir",  "01_hagoth.wav"),
-    ("Shi-Tugo I",             "THE FIRST BOOK OF SHI-TUGO", "am_eric",    "02_shi_tugo_1.wav"),
-    ("Sanempet",               "THE BOOK OF SANEMPET",       "am_liam",    "03_sanempet.wav"),
-    ("Oug",                    "THE BOOK OF OUG",            "am_michael", "04_oug.wav"),
-    ("Temple Writings of Oug", "THE BOOK OF",                "am_michael", "05_temple_writings_oug.wav"),
-    ("Sacred Temple Writings", "THE SACRED",                 "am_michael", "06_sacred_temple_writings.wav"),
-    ("Samuel the Lamanite I",  "THE FIRST BOOK",             "am_echo",    "07_samuel_lamanite_1.wav"),
-    ("Samuel the Lamanite II", "THE SECOND BOOK",            "am_echo",    "08_samuel_lamanite_2.wav"),
-    ("Manti",                  "THE BOOK OF MANTI",          "am_onyx",    "09_manti.wav"),
-    ("Pa Nat I",               "THE FIRST BOOK OF PA NAT",   "af_nicole",  "10_pa_nat_1.wav"),
-    ("Moroni I",               "THE FIRST BOOK OF MORONI",   "am_adam",    "11_moroni_1.wav"),
-    ("Moroni II",              "THE SECOND BOOK OF MORONI",  "am_adam",    "12_moroni_2.wav"),
-    ("Moroni III",             "THE THIRD BOOK OF MORONI",   "am_adam",    "13_moroni_3.wav"),
-    ("Shioni",                 "THE BOOK OF SHIONI",         "am_puck",    "14_shioni.wav"),
+    # ("Shi-Tugo I",             "THE FIRST BOOK OF SHI-TUGO", "am_eric",    "02_shi_tugo_1.wav"),
+    # ("Sanempet",               "THE BOOK OF SANEMPET",       "am_liam",    "03_sanempet.wav"),
+    # ("Oug",                    "THE BOOK OF OUG",            "am_michael", "04_oug.wav"),
+    # ("Temple Writings of Oug", "THE BOOK OF",                "am_michael", "05_temple_writings_oug.wav"),
+    # ("Sacred Temple Writings", "THE SACRED",                 "am_michael", "06_sacred_temple_writings.wav"),
+    # ("Samuel the Lamanite I",  "THE FIRST BOOK",             "am_echo",    "07_samuel_lamanite_1.wav"),
+    # ("Samuel the Lamanite II", "THE SECOND BOOK",            "am_echo",    "08_samuel_lamanite_2.wav"),
+    # ("Manti",                  "THE BOOK OF MANTI",          "am_onyx",    "09_manti.wav"),
+    # ("Pa Nat I",               "THE FIRST BOOK OF PA NAT",   "af_nicole",  "10_pa_nat_1.wav"),
+    # ("Moroni I",               "THE FIRST BOOK OF MORONI",   "am_adam",    "11_moroni_1.wav"),
+    # ("Moroni II",              "THE SECOND BOOK OF MORONI",  "am_adam",    "12_moroni_2.wav"),
+    # ("Moroni III",             "THE THIRD BOOK OF MORONI",   "am_adam",    "13_moroni_3.wav"),
+    # ("Shioni",                 "THE BOOK OF SHIONI",         "am_puck",    "14_shioni.wav"),
 ]
 
 # ── Helpers ────────────────────────────────────────────────────────────────────
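This hunk only toggles which books are active; the `sections` dict that `main()` reads from is built elsewhere in the file and never appears in this diff. For orientation, a minimal sketch of marker-based splitting, assuming markers appear verbatim in the source text and are searched in BOOKS order (the function name and details here are hypothetical, not code from this repo):

```python
def split_sections(raw: str, books) -> dict[str, str]:
    """Hypothetical sketch: slice raw text between consecutive start markers."""
    hits: list[tuple[int, str]] = []
    cursor = 0
    for label, marker, _, _ in books:
        idx = raw.find(marker, cursor)   # search forward, so short prefixes like
        if idx == -1:                    # "THE BOOK OF" land on the next heading
            print(f"  ⚠ Marker not found for [{label}]")
            continue
        hits.append((idx, label))
        cursor = idx + len(marker)
    sections = {}
    for (start, label), (end, _) in zip(hits, hits[1:] + [(len(raw), "")]):
        sections[label] = raw[start:end]
    return sections
```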
@@ -118,8 +119,18 @@ def clean_text(text: str) -> str:
     return text.strip()
 
 
+def _fmt_duration(seconds: float) -> str:
+    """Format seconds as 'Xm Ys' or 'Xs'."""
+    if seconds >= 60:
+        m, s = divmod(int(seconds), 60)
+        return f"{m}m {s:02d}s"
+    return f"{seconds:.0f}s"
+
+
 def generate_audio(pipeline: KPipeline, text: str, voice: str,
-                   output_path: Path) -> None:
+                   output_path: Path) -> float:
+    """Generate audio and return wall-clock seconds elapsed."""
+    t0 = time.monotonic()
     chunks = []
     for _, _, chunk_audio in pipeline(text, voice=voice, speed=SPEED):
         if hasattr(chunk_audio, "numpy"):
@@ -131,10 +142,13 @@ def generate_audio(pipeline: KPipeline, text: str, voice: str,
     if chunks:
         audio = np.concatenate(chunks, axis=0)
         sf.write(str(output_path), audio, SAMPLE_RATE)
+        elapsed = time.monotonic() - t0
         duration = len(audio) / SAMPLE_RATE
-        print(f"  ✓ Saved '{output_path.name}' ({duration:.1f}s)")
+        print(f"  ✓ Saved '{output_path.name}' ({duration:.1f}s audio | {elapsed:.1f}s wall-clock)")
     else:
+        elapsed = time.monotonic() - t0
         print(f"  ✗ No audio produced for voice='{voice}'")
+    return elapsed
 
 
 # ── Main ───────────────────────────────────────────────────────────────────────
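A few spot checks on the new `_fmt_duration` helper (my own examples, not part of the diff). One quirk worth knowing: the sub-minute branch rounds with `:.0f`, so values just under a minute print as `60s` rather than rolling over to `1m 00s`:

```python
assert _fmt_duration(42) == "42s"
assert _fmt_duration(75) == "1m 15s"
assert _fmt_duration(3605) == "60m 05s"   # minutes are not folded into hours
assert _fmt_duration(59.6) == "60s"       # rounds up but stays in the seconds branch
```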
@@ -156,19 +170,59 @@ def main() -> None:
     print("Initialising Kokoro pipeline …")
     pipeline = KPipeline(lang_code=LANG_CODE)
 
+    # Pre-compute char counts for all sections so we can estimate ETAs
+    section_chars: dict[str, int] = {
+        label: len(clean_text(sections[label]))
+        for label, _, _, _ in BOOKS
+        if label in sections
+    }
+
+    chars_per_sec: float | None = None  # derived from the first book that finishes
+    timing_rows: list[tuple[str, int, float]] = []  # (label, chars, elapsed)
+
     for label, marker, voice, wav_name in BOOKS:
         if label not in sections:
             continue  # marker was not found; warning already printed
 
-        print(f"\n[{label}] voice={voice} → {wav_name}")
-        text = clean_text(sections[label])
-        if not text:
-            print("  ⚠ Empty text — skipping")
-            continue
-
-        out_path = OUTPUT_DIR / wav_name
-        generate_audio(pipeline, text, voice, out_path)
+        text = clean_text(sections[label])
+        if not text:
+            print(f"\n[{label}] ⚠ Empty text — skipping")
+            continue
+
+        chars = section_chars[label]
+
+        # Print ETA once we have a calibration rate
+        if chars_per_sec is not None:
+            eta_sec = chars / chars_per_sec
+            eta_str = _fmt_duration(eta_sec)
+            print(f"\n[{label}] voice={voice} → {wav_name} (est. {eta_str})")
+        else:
+            print(f"\n[{label}] voice={voice} → {wav_name} (timing calibration run)")
+
+        stem, ext = wav_name.rsplit(".", 1)
+        out_path = OUTPUT_DIR / f"{stem}_{voice}.{ext}"
+        elapsed = generate_audio(pipeline, text, voice, out_path)
+        timing_rows.append((label, chars, elapsed))
+
+        # Calibrate from first completed book
+        if chars_per_sec is None and elapsed > 0:
+            chars_per_sec = chars / elapsed
+            print(f"  ⏱ Calibrated: {chars_per_sec:.0f} chars/sec")
+
+    # ── Summary ────────────────────────────────────────────────────────────────
+    print("\n" + "─" * 60)
+    print(f"  {'Section':<30} {'Chars':>7} {'Actual':>8} {'Est':>8}")
+    print("─" * 60)
+    for i, (label, chars, elapsed) in enumerate(timing_rows):
+        actual_str = _fmt_duration(elapsed)
+        if i == 0 or chars_per_sec is None:
+            est_str = "(calibration)"
+        else:
+            est_str = _fmt_duration(chars / chars_per_sec)
+        print(f"  {label:<30} {chars:>7,} {actual_str:>8} {est_str:>8}")
+    total_elapsed = sum(e for _, _, e in timing_rows)
+    print("─" * 60)
+    print(f"  {'TOTAL':<30} {sum(c for _,c,_ in timing_rows):>7,} {_fmt_duration(total_elapsed):>8}")
     print("\nDone.")
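The ETA scheme is simple proportionality: the first finished book fixes a characters-per-second rate, and every later estimate is `chars / chars_per_sec`. Because the calibration book also pays the model warm-up cost, later estimates will tend to run slightly high. A worked example with made-up numbers:

```python
# Hypothetical figures for illustration only.
calib_chars, calib_elapsed = 12_000, 300.0   # first book: 12k chars in 5 minutes
chars_per_sec = calib_chars / calib_elapsed  # 40.0 chars/sec

next_book_chars = 30_000
eta_sec = next_book_chars / chars_per_sec    # 750.0 s
print(_fmt_duration(eta_sec))                # "12m 30s"
```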
@@ -18,6 +18,25 @@ from collections import defaultdict
 from pathlib import Path
 
 import spacy
+from wordfreq import top_n_list
+
+# ── Top 10 000 most-frequent English words ──────────────────────────
+TOP_10K_ENGLISH: frozenset[str] = frozenset(top_n_list("en", 10_000))
+
+# Words in the top-10k list that are genuine proper nouns in this text —
+# keep them despite the frequency filter.
+PROPER_NOUN_WHITELIST: frozenset[str] = frozenset({
+    # Biblical names
+    "aaron", "abel", "abraham", "adam", "cain", "eden", "egypt",
+    "elijah", "ephraim", "eve", "gad", "ham", "isaac", "israel",
+    "jacob", "james", "jehovah", "john", "joseph", "judah",
+    "laban", "lehi", "levi", "micah", "michael", "moses", "noah",
+    "peter", "pharaoh", "samuel", "sarah", "sarai", "seth", "simeon",
+    "timothy", "zion",
+    # Book-specific names that happen to match English words
+    "alma", "ether", "gideon", "limhi", "mormon", "moroni", "mulek",
+    "mosiah", "nephi", "satan", "sidon",
+})
 
 SOURCE = Path("Audio Master Nem Full.txt")
 OUTPUT = Path("proper_nouns.txt")
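`top_n_list` is wordfreq's real API: it returns the `n` most frequent words for a language, lowercased. The filter this enables is two-step: drop any candidate that is a common English word, unless it is explicitly whitelisted as a name. A standalone illustration (whether a given word lands in the top 10k depends on wordfreq's data):

```python
from wordfreq import top_n_list

top10k = frozenset(top_n_list("en", 10_000))
whitelist = {"adam", "alma", "ether"}   # subset of PROPER_NOUN_WHITELIST

for cand in ("Adam", "Ether", "Hagoth", "Great"):
    common = cand.lower() in top10k
    keep = (not common) or (cand.lower() in whitelist)
    print(f"{cand:8} common={common!s:5} keep={keep}")
```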
@@ -35,12 +54,29 @@ ORG_LABELS = {"ORG", "NORP"}
 OTHER_LABELS = {"EVENT", "WORK_OF_ART", "LAW", "PRODUCT", "LANGUAGE"}
 
 # ── Noise filters ──────────────────────────────────────────────────────────────
 # All-caps lines are section headers, not spoken names — skip them.
 # Also skip very short tokens that are likely artefacts.
-SKIP_PATTERNS = re.compile(
-    r"^(THE|A|AN|AND|OF|IN|TO|FOR|BY|AT|IS|WAS|BE|HE|SHE|IT|"
-    r"CHAPTER|VERSE|YEA|BEHOLD|LORD|GOD|CHRIST|HOLY|GHOST)$"
-)
+# Common English words that should be dropped when splitting multi-word entities.
+STOP_WORDS: set[str] = {
+    "A", "AN", "AND", "AS", "AT", "BE", "BUT", "BY",
+    "DO", "DID", "DOTH",
+    "EVEN", "FOR", "FROM",
+    "HAD", "HAS", "HAVE", "HATH", "HE", "HER", "HIS", "HOW",
+    "I", "IN", "IS", "IT", "ITS",
+    "MAY", "ME", "MORE", "MY",
+    "NAY", "NO", "NOT", "NOW",
+    "OF", "OR", "OUR",
+    "SHALL", "SHE", "SO", "SOME",
+    "THAT", "THE", "THEE", "THEIR", "THEN", "THERE", "THESE", "THEY",
+    "THIS", "THOSE", "THOU", "THUS", "THY", "TO",
+    "UP", "UPON", "US",
+    "WAS", "WE", "WHEN", "WHERE", "WHICH", "WHO", "WILL", "WITH",
+    "YE", "YEA", "YET", "YOU", "YOUR",
+    # Book-specific common words not worth flagging
+    "BEHOLD", "CHAPTER", "CHRIST", "GOD", "GHOST", "HOLY", "LORD", "VERSE",
+    # Generic nouns that slip through NER
+    "CITY", "DAYS", "DAY", "GREAT", "LAND", "MAN", "MEN", "NEW",
+    "PEOPLE", "SON", "TIME",
+}
 
 
 def is_noise(text: str) -> bool:
     t = text.strip()
@@ -48,9 +84,12 @@ def is_noise(text: str) -> bool:
         return True
     if t.isupper() and len(t) > 4:  # all-caps section header word
         return True
-    if SKIP_PATTERNS.match(t.upper()):
+    if t.upper() in STOP_WORDS:
         return True
-    if re.search(r"[^a-zA-Z\-' ]", t):  # contains digits or symbols
+    if re.search(r"[^a-zA-Z\-']", t):  # contains digits, spaces, or symbols
         return True
+    # Drop common English words (no hyphens) unless whitelisted as proper nouns.
+    if "-" not in t and t.lower() in TOP_10K_ENGLISH and t.lower() not in PROPER_NOUN_WHITELIST:
+        return True
     return False
 
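The regex change is subtle: the old class `[^a-zA-Z\-' ]` tolerated spaces, so multi-word phrases could pass `is_noise`, while the new class `[^a-zA-Z\-']` rejects anything containing a space, which matches the new one-word-at-a-time pipeline. A quick demonstration (my own snippet):

```python
import re

OLD = re.compile(r"[^a-zA-Z\-' ]")   # space was allowed through
NEW = re.compile(r"[^a-zA-Z\-']")    # space now counts as noise

for t in ("Pa Nat", "Anti-Nephi-Lehi", "Moroni2"):
    print(f"{t!r}: old={bool(OLD.search(t))} new={bool(NEW.search(t))}")
# 'Pa Nat':          old=False  new=True   (space now rejected)
# 'Anti-Nephi-Lehi': old=False  new=False  (hyphens still fine)
# 'Moroni2':         old=True   new=True   (digits were always noise)
```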
@@ -60,6 +99,11 @@ def canonical(text: str) -> str:
     return " ".join(text.split()).title()
 
 
+def split_words(phrase: str) -> list[str]:
+    """Split a phrase on spaces; hyphenated words are kept as one token."""
+    return phrase.split()
+
+
 # ── Read and process ───────────────────────────────────────────────────────────
 print(f"Reading '{SOURCE}' …")
 raw_text = SOURCE.read_text(encoding="utf-8")
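`split_words` is deliberately trivial (plain `str.split()` on whitespace), but it encodes the commit's key policy: multi-word entities fan out into individual names, while hyphenated compounds stay atomic:

```python
print(split_words("Peter James John"))   # ['Peter', 'James', 'John']  → three entries
print(split_words("Anti-Nephi-Lehi"))    # ['Anti-Nephi-Lehi']         → one token
```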
@@ -71,20 +115,23 @@ doc = nlp(raw_text)
 buckets: dict[str, set[str]] = defaultdict(set)
 
 # 1. NER pass — trust spaCy's entity labels
+#    Multi-word entities (e.g. "Peter James John") are split into individual
+#    words; hyphenated words (e.g. "Anti-Nephi-Lehi") stay as one token.
 for ent in doc.ents:
-    name = canonical(ent.text)
-    if is_noise(name):
+    phrase = canonical(ent.text)
+    for word in split_words(phrase):
+        if is_noise(word):
             continue
         if ent.label_ in PERSON_LABELS:
-            buckets["People & Characters"].add(name)
+            buckets["People & Characters"].add(word)
         elif ent.label_ in PLACE_LABELS:
-            buckets["Places & Lands"].add(name)
+            buckets["Places & Lands"].add(word)
         elif ent.label_ in ORG_LABELS:
-            buckets["Groups & Nations"].add(name)
+            buckets["Groups & Nations"].add(word)
         elif ent.label_ in OTHER_LABELS:
-            buckets["Other Named Things"].add(name)
+            buckets["Other Named Things"].add(word)
         else:
-            buckets["Other Named Things"].add(name)
+            buckets["Other Named Things"].add(word)
 
 # 2. PROPN pass — catch names spaCy didn't recognise as entities
 #    Only include tokens that are inside a sentence (not at position 0)
@@ -97,13 +144,13 @@ for token in doc:
         continue  # skip all-caps
     if token.i == token.sent.start:
         continue  # skip sentence-initial (could be any word)
-    name = canonical(text)
-    if is_noise(name):
+    word = canonical(text)
+    if is_noise(word):
         continue
     # Only add if not already captured by NER
-    already_captured = any(name in s for s in buckets.values())
+    already_captured = any(word in s for s in buckets.values())
     if not already_captured:
-        buckets["Unclassified Proper Nouns"].add(name)
+        buckets["Unclassified Proper Nouns"].add(word)
 
 # ── Write output ───────────────────────────────────────────────────────────────
 GROUP_ORDER = [
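The PROPN fallback pass uses only stock spaCy token attributes (`pos_`, `sent`, `i`). A self-contained miniature of the same heuristic, skipping all-caps and sentence-initial tokens; it assumes the small English model `en_core_web_sm` is installed (which pipeline this repo actually loads is not visible in these hunks):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Behold, Hagoth built ships. Shioni went forth unto the land.")

for token in doc:
    if token.pos_ != "PROPN":
        continue
    if token.text.isupper():
        continue                      # all-caps section-header words
    if token.i == token.sent.start:
        continue                      # sentence-initial could be any word
    print(token.text)                 # likely prints 'Hagoth'; 'Shioni' is
                                      # sentence-initial and gets skipped
```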
proper_nouns.txt (1345 lines) — file diff suppressed because it is too large.