Compare commits

...

2 Commits

5 changed files with 781 additions and 3076 deletions


@@ -11,6 +11,7 @@ Output .wav files are written to OUTPUT_DIR (created automatically).
 """
 import re
+import time
 import numpy as np
 import soundfile as sf
 import torch
@@ -51,19 +52,19 @@ BOOKS = [
     #  label                     start_marker                   voice         output_wav
     ("Introduction", "Introduction", "af_heart", "00_introduction.wav"),
     ("Book of Hagoth", "THE BOOK OF HAGOTH", "am_fenrir", "01_hagoth.wav"),
-    ("Shi-Tugo I", "THE FIRST BOOK OF SHI-TUGO", "am_eric", "02_shi_tugo_1.wav"),
-    ("Sanempet", "THE BOOK OF SANEMPET", "am_liam", "03_sanempet.wav"),
-    ("Oug", "THE BOOK OF OUG", "am_michael", "04_oug.wav"),
-    ("Temple Writings of Oug", "THE BOOK OF", "am_michael", "05_temple_writings_oug.wav"),
-    ("Sacred Temple Writings", "THE SACRED", "am_michael", "06_sacred_temple_writings.wav"),
-    ("Samuel the Lamanite I", "THE FIRST BOOK", "am_echo", "07_samuel_lamanite_1.wav"),
-    ("Samuel the Lamanite II", "THE SECOND BOOK", "am_echo", "08_samuel_lamanite_2.wav"),
-    ("Manti", "THE BOOK OF MANTI", "am_onyx", "09_manti.wav"),
-    ("Pa Nat I", "THE FIRST BOOK OF PA NAT", "af_nicole", "10_pa_nat_1.wav"),
-    ("Moroni I", "THE FIRST BOOK OF MORONI", "am_adam", "11_moroni_1.wav"),
-    ("Moroni II", "THE SECOND BOOK OF MORONI", "am_adam", "12_moroni_2.wav"),
-    ("Moroni III", "THE THIRD BOOK OF MORONI", "am_adam", "13_moroni_3.wav"),
-    ("Shioni", "THE BOOK OF SHIONI", "am_puck", "14_shioni.wav"),
+    # ("Shi-Tugo I", "THE FIRST BOOK OF SHI-TUGO", "am_eric", "02_shi_tugo_1.wav"),
+    # ("Sanempet", "THE BOOK OF SANEMPET", "am_liam", "03_sanempet.wav"),
+    # ("Oug", "THE BOOK OF OUG", "am_michael", "04_oug.wav"),
+    # ("Temple Writings of Oug", "THE BOOK OF", "am_michael", "05_temple_writings_oug.wav"),
+    # ("Sacred Temple Writings", "THE SACRED", "am_michael", "06_sacred_temple_writings.wav"),
+    # ("Samuel the Lamanite I", "THE FIRST BOOK", "am_echo", "07_samuel_lamanite_1.wav"),
+    # ("Samuel the Lamanite II", "THE SECOND BOOK", "am_echo", "08_samuel_lamanite_2.wav"),
+    # ("Manti", "THE BOOK OF MANTI", "am_onyx", "09_manti.wav"),
+    # ("Pa Nat I", "THE FIRST BOOK OF PA NAT", "af_nicole", "10_pa_nat_1.wav"),
+    # ("Moroni I", "THE FIRST BOOK OF MORONI", "am_adam", "11_moroni_1.wav"),
+    # ("Moroni II", "THE SECOND BOOK OF MORONI", "am_adam", "12_moroni_2.wav"),
+    # ("Moroni III", "THE THIRD BOOK OF MORONI", "am_adam", "13_moroni_3.wav"),
+    # ("Shioni", "THE BOOK OF SHIONI", "am_puck", "14_shioni.wav"),
 ]
 
 # ── Helpers ────────────────────────────────────────────────────────────────────
@@ -118,8 +119,18 @@ def clean_text(text: str) -> str:
     return text.strip()
 
+def _fmt_duration(seconds: float) -> str:
+    """Format seconds as 'Xm Ys' or 'Xs'."""
+    if seconds >= 60:
+        m, s = divmod(int(seconds), 60)
+        return f"{m}m {s:02d}s"
+    return f"{seconds:.0f}s"
+
 def generate_audio(pipeline: KPipeline, text: str, voice: str,
-                   output_path: Path) -> None:
+                   output_path: Path) -> float:
+    """Generate audio and return wall-clock seconds elapsed."""
+    t0 = time.monotonic()
     chunks = []
     for _, _, chunk_audio in pipeline(text, voice=voice, speed=SPEED):
         if hasattr(chunk_audio, "numpy"):
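A quick sanity check of what the new _fmt_duration helper produces (illustrative calls, not part of the diff):

    _fmt_duration(42.7)    # -> '43s'       (sub-minute values round to whole seconds)
    _fmt_duration(60.0)    # -> '1m 00s'
    _fmt_duration(3661.5)  # -> '61m 01s'   (hours are not broken out; minutes keep growing)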
@@ -131,10 +142,13 @@ def generate_audio(pipeline: KPipeline, text: str, voice: str,
     if chunks:
         audio = np.concatenate(chunks, axis=0)
         sf.write(str(output_path), audio, SAMPLE_RATE)
+        elapsed = time.monotonic() - t0
         duration = len(audio) / SAMPLE_RATE
-        print(f"  ✓ Saved '{output_path.name}' ({duration:.1f}s)")
+        print(f"  ✓ Saved '{output_path.name}' ({duration:.1f}s audio | {elapsed:.1f}s wall-clock)")
     else:
+        elapsed = time.monotonic() - t0
         print(f"  ✗ No audio produced for voice='{voice}'")
+    return elapsed
 
 # ── Main ───────────────────────────────────────────────────────────────────────
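Since generate_audio now returns wall-clock seconds alongside the audio duration it prints, the two numbers give a real-time factor for the synthesis. A minimal sketch, assuming the function as patched in the hunk above (the numbers are hypothetical):

    wall_seconds = generate_audio(pipeline, text, voice, out_path)
    # The new print line reports e.g. '311.4s audio | 48.2s wall-clock';
    # audio seconds divided by wall seconds gives roughly 6.5x real time there.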
@@ -156,19 +170,59 @@ def main() -> None:
     print("Initialising Kokoro pipeline …")
     pipeline = KPipeline(lang_code=LANG_CODE)
 
+    # Pre-compute char counts for all sections so we can estimate ETAs
+    section_chars: dict[str, int] = {
+        label: len(clean_text(sections[label]))
+        for label, _, _, _ in BOOKS
+        if label in sections
+    }
+    chars_per_sec: float | None = None  # derived from the first book that finishes
+    timing_rows: list[tuple[str, int, float]] = []  # (label, chars, elapsed)
+
     for label, marker, voice, wav_name in BOOKS:
         if label not in sections:
-            continue
-        print(f"\n[{label}] voice={voice} → {wav_name}")
+            continue  # marker was not found; warning already printed
+
         text = clean_text(sections[label])
         if not text:
-            print("  ⚠ Empty text — skipping")
+            print(f"\n[{label}] ⚠ Empty text — skipping")
             continue
-        out_path = OUTPUT_DIR / wav_name
-        generate_audio(pipeline, text, voice, out_path)
+
+        chars = section_chars[label]
+
+        # Print ETA once we have a calibration rate
+        if chars_per_sec is not None:
+            eta_sec = chars / chars_per_sec
+            eta_str = _fmt_duration(eta_sec)
+            print(f"\n[{label}] voice={voice} → {wav_name} (est. {eta_str})")
+        else:
+            print(f"\n[{label}] voice={voice} → {wav_name} (timing calibration run)")
+
+        stem, ext = wav_name.rsplit(".", 1)
+        out_path = OUTPUT_DIR / f"{stem}_{voice}.{ext}"
+        elapsed = generate_audio(pipeline, text, voice, out_path)
+        timing_rows.append((label, chars, elapsed))
+
+        # Calibrate from first completed book
+        if chars_per_sec is None and elapsed > 0:
+            chars_per_sec = chars / elapsed
+            print(f"  ⏱ Calibrated: {chars_per_sec:.0f} chars/sec")
+
+    # ── Summary ────────────────────────────────────────────────────────────────
+    print("\n" + "─" * 60)
+    print(f"  {'Section':<30} {'Chars':>7} {'Actual':>8} {'Est':>8}")
+    print("─" * 60)
+    for i, (label, chars, elapsed) in enumerate(timing_rows):
+        actual_str = _fmt_duration(elapsed)
+        if i == 0 or chars_per_sec is None:
+            est_str = "(calibration)"
+        else:
+            est_str = _fmt_duration(chars / chars_per_sec)
+        print(f"  {label:<30} {chars:>7,} {actual_str:>8} {est_str:>8}")
+    total_elapsed = sum(e for _, _, e in timing_rows)
+    print("─" * 60)
+    print(f"  {'TOTAL':<30} {sum(c for _, c, _ in timing_rows):>7,} {_fmt_duration(total_elapsed):>8}")
 
     print("\nDone.")
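The ETA logic assumes synthesis time scales roughly linearly with input length: the first book measures a chars-per-second rate, and every later estimate is that book's character count divided by the rate. A standalone sketch of the same arithmetic, with hypothetical numbers:

    def estimate_eta(chars: int, calib_chars: int, calib_seconds: float) -> float:
        """Assumes wall-clock time grows linearly with character count."""
        return chars / (calib_chars / calib_seconds)

    # First book: 12,000 chars in 60 s, i.e. 200 chars/sec.
    estimate_eta(30_000, 12_000, 60.0)  # -> 150.0 seconds, formatted as '2m 30s'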


@@ -18,6 +18,25 @@ from collections import defaultdict
 from pathlib import Path
 
 import spacy
+from wordfreq import top_n_list
+
+# ── Top 10 000 most-frequent English words ──────────────────────────
+TOP_10K_ENGLISH: frozenset[str] = frozenset(top_n_list("en", 10_000))
+
+# Words in the top-10k list that are genuine proper nouns in this text —
+# keep them despite the frequency filter.
+PROPER_NOUN_WHITELIST: frozenset[str] = frozenset({
+    # Biblical names
+    "aaron", "abel", "abraham", "adam", "cain", "eden", "egypt",
+    "elijah", "ephraim", "eve", "gad", "ham", "isaac", "israel",
+    "jacob", "james", "jehovah", "john", "joseph", "judah",
+    "laban", "lehi", "levi", "micah", "michael", "moses", "noah",
+    "peter", "pharaoh", "samuel", "sarah", "sarai", "seth", "simeon",
+    "timothy", "zion",
+    # Book-specific names that happen to match English words
+    "alma", "ether", "gideon", "limhi", "mormon", "moroni", "mulek",
+    "mosiah", "nephi", "satan", "sidon",
+})
+
 SOURCE = Path("Audio Master Nem Full.txt")
 OUTPUT = Path("proper_nouns.txt")
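wordfreq's top_n_list(lang, n) returns the n most frequent words for a language in lowercase, so freezing the result into a frozenset gives O(1) membership tests against lowercased candidates. For example:

    from wordfreq import top_n_list

    common = frozenset(top_n_list("en", 10_000))
    "people" in common   # True: very frequent, so normally filtered out
    "nephi" in common    # almost certainly False, so it survives the filter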
@@ -35,12 +54,29 @@ ORG_LABELS = {"ORG", "NORP"}
 OTHER_LABELS = {"EVENT", "WORK_OF_ART", "LAW", "PRODUCT", "LANGUAGE"}
 
 # ── Noise filters ──────────────────────────────────────────────────────────────
-# All-caps lines are section headers, not spoken names — skip them.
-# Also skip very short tokens that are likely artefacts.
-SKIP_PATTERNS = re.compile(
-    r"^(THE|A|AN|AND|OF|IN|TO|FOR|BY|AT|IS|WAS|BE|HE|SHE|IT|"
-    r"CHAPTER|VERSE|YEA|BEHOLD|LORD|GOD|CHRIST|HOLY|GHOST)$"
-)
+# Common English words that should be dropped when splitting multi-word entities.
+STOP_WORDS: set[str] = {
+    "A", "AN", "AND", "AS", "AT", "BE", "BUT", "BY",
+    "DO", "DID", "DOTH",
+    "EVEN", "FOR", "FROM",
+    "HAD", "HAS", "HAVE", "HATH", "HE", "HER", "HIS", "HOW",
+    "I", "IN", "IS", "IT", "ITS",
+    "MAY", "ME", "MORE", "MY",
+    "NAY", "NO", "NOT", "NOW",
+    "OF", "OR", "OUR",
+    "SHALL", "SHE", "SO", "SOME",
+    "THAT", "THE", "THEE", "THEIR", "THEN", "THERE", "THESE", "THEY",
+    "THIS", "THOSE", "THOU", "THUS", "THY", "TO",
+    "UP", "UPON", "US",
+    "WAS", "WE", "WHEN", "WHERE", "WHICH", "WHO", "WILL", "WITH",
+    "YE", "YEA", "YET", "YOU", "YOUR",
+    # Book-specific common words not worth flagging
+    "BEHOLD", "CHAPTER", "CHRIST", "GOD", "GHOST", "HOLY", "LORD", "VERSE",
+    # Generic nouns that slip through NER
+    "CITY", "DAYS", "DAY", "GREAT", "LAND", "MAN", "MEN", "NEW",
+    "PEOPLE", "SON", "TIME",
+}
 
 def is_noise(text: str) -> bool:
     t = text.strip()
@@ -48,9 +84,12 @@ def is_noise(text: str) -> bool:
         return True
     if t.isupper() and len(t) > 4:  # all-caps section header word
         return True
-    if SKIP_PATTERNS.match(t.upper()):
+    if t.upper() in STOP_WORDS:
         return True
-    if re.search(r"[^a-zA-Z\-' ]", t):  # contains digits or symbols
+    if re.search(r"[^a-zA-Z\-']", t):  # contains digits, spaces, or symbols
+        return True
+    # Drop common English words (no hyphens) unless whitelisted as proper nouns.
+    if "-" not in t and t.lower() in TOP_10K_ENGLISH and t.lower() not in PROPER_NOUN_WHITELIST:
         return True
     return False
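Expected behaviour of the revised filter, assuming the definitions above (results for the frequency check depend on wordfreq's list, hence the hedges):

    is_noise("THE")              # True  (stop word)
    is_noise("Chapter 5")        # True  (the space and the digit now trip the regex)
    is_noise("mountain")         # True, most likely (common English word, not whitelisted)
    is_noise("Moroni")           # False (whitelisted, so the frequency check is bypassed)
    is_noise("Anti-Nephi-Lehi")  # False (hyphenated tokens skip the frequency check)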
@@ -60,6 +99,11 @@ def canonical(text: str) -> str:
     return " ".join(text.split()).title()
 
+def split_words(phrase: str) -> list[str]:
+    """Split a phrase on spaces; hyphenated words are kept as one token."""
+    return phrase.split()
+
 # ── Read and process ───────────────────────────────────────────────────────────
 print(f"Reading '{SOURCE}'")
 raw_text = SOURCE.read_text(encoding="utf-8")
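split_words is just str.split(), but the contract matters for the NER pass below: whitespace separates tokens while hyphens do not.

    split_words("Peter James John")  # -> ['Peter', 'James', 'John']
    split_words("Anti-Nephi-Lehi")   # -> ['Anti-Nephi-Lehi']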
@@ -71,20 +115,23 @@ doc = nlp(raw_text)
 buckets: dict[str, set[str]] = defaultdict(set)
 
 # 1. NER pass — trust spaCy's entity labels
+#    Multi-word entities (e.g. "Peter James John") are split into individual
+#    words; hyphenated words (e.g. "Anti-Nephi-Lehi") stay as one token.
 for ent in doc.ents:
-    name = canonical(ent.text)
-    if is_noise(name):
-        continue
-    if ent.label_ in PERSON_LABELS:
-        buckets["People & Characters"].add(name)
-    elif ent.label_ in PLACE_LABELS:
-        buckets["Places & Lands"].add(name)
-    elif ent.label_ in ORG_LABELS:
-        buckets["Groups & Nations"].add(name)
-    elif ent.label_ in OTHER_LABELS:
-        buckets["Other Named Things"].add(name)
-    else:
-        buckets["Other Named Things"].add(name)
+    phrase = canonical(ent.text)
+    for word in split_words(phrase):
+        if is_noise(word):
+            continue
+        if ent.label_ in PERSON_LABELS:
+            buckets["People & Characters"].add(word)
+        elif ent.label_ in PLACE_LABELS:
+            buckets["Places & Lands"].add(word)
+        elif ent.label_ in ORG_LABELS:
+            buckets["Groups & Nations"].add(word)
+        elif ent.label_ in OTHER_LABELS:
+            buckets["Other Named Things"].add(word)
+        else:
+            buckets["Other Named Things"].add(word)
 
 # 2. PROPN pass — catch names spaCy didn't recognise as entities
 #    Only include tokens that are inside a sentence (not at position 0)
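Tracing the new loop on a hypothetical multi-word PERSON entity shows why the split matters: each word is filtered and bucketed individually.

    phrase = canonical("peter  james john")   # -> 'Peter James John'
    for word in split_words(phrase):          # 'Peter', 'James', 'John'
        if not is_noise(word):                # all three are whitelisted names
            buckets["People & Characters"].add(word)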
@@ -97,13 +144,13 @@ for token in doc:
         continue  # skip all-caps
     if token.i == token.sent.start:
         continue  # skip sentence-initial (could be any word)
-    name = canonical(text)
-    if is_noise(name):
+    word = canonical(text)
+    if is_noise(word):
         continue
     # Only add if not already captured by NER
-    already_captured = any(name in s for s in buckets.values())
+    already_captured = any(word in s for s in buckets.values())
     if not already_captured:
-        buckets["Unclassified Proper Nouns"].add(name)
+        buckets["Unclassified Proper Nouns"].add(word)
 
 # ── Write output ───────────────────────────────────────────────────────────────
 GROUP_ORDER = [
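The already_captured guard is a set-membership scan over every bucket; any() short-circuits as soon as one bucket already holds the word. A toy illustration:

    buckets = {"People & Characters": {"Moroni"}, "Places & Lands": {"Zion"}}
    any("Moroni" in s for s in buckets.values())  # True, so not re-added as Unclassified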

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large