Compare commits
f0e0adf24b...6cefc3c862 — 2 commits

| Author | SHA1 | Date |
|---|---|---|
|  | 6cefc3c862 |  |
|  | 949bd7c203 |  |
@@ -11,6 +11,7 @@ Output .wav files are written to OUTPUT_DIR (created automatically).
 """
 
 import re
+import time
 import numpy as np
 import soundfile as sf
 import torch
@@ -51,19 +52,19 @@ BOOKS = [
     # label                    start_marker                  voice         output_wav
     ("Introduction",           "Introduction",               "af_heart",   "00_introduction.wav"),
     ("Book of Hagoth",         "THE BOOK OF HAGOTH",         "am_fenrir",  "01_hagoth.wav"),
-    ("Shi-Tugo I",             "THE FIRST BOOK OF SHI-TUGO", "am_eric",    "02_shi_tugo_1.wav"),
-    ("Sanempet",               "THE BOOK OF SANEMPET",       "am_liam",    "03_sanempet.wav"),
-    ("Oug",                    "THE BOOK OF OUG",            "am_michael", "04_oug.wav"),
-    ("Temple Writings of Oug", "THE BOOK OF",                "am_michael", "05_temple_writings_oug.wav"),
-    ("Sacred Temple Writings", "THE SACRED",                 "am_michael", "06_sacred_temple_writings.wav"),
-    ("Samuel the Lamanite I",  "THE FIRST BOOK",             "am_echo",    "07_samuel_lamanite_1.wav"),
-    ("Samuel the Lamanite II", "THE SECOND BOOK",            "am_echo",    "08_samuel_lamanite_2.wav"),
-    ("Manti",                  "THE BOOK OF MANTI",          "am_onyx",    "09_manti.wav"),
-    ("Pa Nat I",               "THE FIRST BOOK OF PA NAT",   "af_nicole",  "10_pa_nat_1.wav"),
-    ("Moroni I",               "THE FIRST BOOK OF MORONI",   "am_adam",    "11_moroni_1.wav"),
-    ("Moroni II",              "THE SECOND BOOK OF MORONI",  "am_adam",    "12_moroni_2.wav"),
-    ("Moroni III",             "THE THIRD BOOK OF MORONI",   "am_adam",    "13_moroni_3.wav"),
-    ("Shioni",                 "THE BOOK OF SHIONI",         "am_puck",    "14_shioni.wav"),
+    # ("Shi-Tugo I",             "THE FIRST BOOK OF SHI-TUGO", "am_eric",    "02_shi_tugo_1.wav"),
+    # ("Sanempet",               "THE BOOK OF SANEMPET",       "am_liam",    "03_sanempet.wav"),
+    # ("Oug",                    "THE BOOK OF OUG",            "am_michael", "04_oug.wav"),
+    # ("Temple Writings of Oug", "THE BOOK OF",                "am_michael", "05_temple_writings_oug.wav"),
+    # ("Sacred Temple Writings", "THE SACRED",                 "am_michael", "06_sacred_temple_writings.wav"),
+    # ("Samuel the Lamanite I",  "THE FIRST BOOK",             "am_echo",    "07_samuel_lamanite_1.wav"),
+    # ("Samuel the Lamanite II", "THE SECOND BOOK",            "am_echo",    "08_samuel_lamanite_2.wav"),
+    # ("Manti",                  "THE BOOK OF MANTI",          "am_onyx",    "09_manti.wav"),
+    # ("Pa Nat I",               "THE FIRST BOOK OF PA NAT",   "af_nicole",  "10_pa_nat_1.wav"),
+    # ("Moroni I",               "THE FIRST BOOK OF MORONI",   "am_adam",    "11_moroni_1.wav"),
+    # ("Moroni II",              "THE SECOND BOOK OF MORONI",  "am_adam",    "12_moroni_2.wav"),
+    # ("Moroni III",             "THE THIRD BOOK OF MORONI",   "am_adam",    "13_moroni_3.wav"),
+    # ("Shioni",                 "THE BOOK OF SHIONI",         "am_puck",    "14_shioni.wav"),
 ]
 
 # ── Helpers ────────────────────────────────────────────────────────────────────
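This hunk only toggles which books are active; the `sections` dict that `main()` reads from is built elsewhere in the file and never appears in this diff. For orientation, a minimal sketch of marker-based splitting, assuming markers appear verbatim in the source text and are searched in BOOKS order (the function name and details here are hypothetical, not code from this repo):

```python
def split_sections(raw: str, books) -> dict[str, str]:
    """Hypothetical sketch: slice raw text between consecutive start markers."""
    hits: list[tuple[int, str]] = []
    cursor = 0
    for label, marker, _, _ in books:
        idx = raw.find(marker, cursor)   # search forward, so short prefixes like
        if idx == -1:                    # "THE BOOK OF" land on the next heading
            print(f"  ⚠ Marker not found for [{label}]")
            continue
        hits.append((idx, label))
        cursor = idx + len(marker)
    sections = {}
    for (start, label), (end, _) in zip(hits, hits[1:] + [(len(raw), "")]):
        sections[label] = raw[start:end]
    return sections
```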
@@ -118,8 +119,18 @@ def clean_text(text: str) -> str:
     return text.strip()
 
 
+def _fmt_duration(seconds: float) -> str:
+    """Format seconds as 'Xm Ys' or 'Xs'."""
+    if seconds >= 60:
+        m, s = divmod(int(seconds), 60)
+        return f"{m}m {s:02d}s"
+    return f"{seconds:.0f}s"
+
+
 def generate_audio(pipeline: KPipeline, text: str, voice: str,
-                   output_path: Path) -> None:
+                   output_path: Path) -> float:
+    """Generate audio and return wall-clock seconds elapsed."""
+    t0 = time.monotonic()
     chunks = []
     for _, _, chunk_audio in pipeline(text, voice=voice, speed=SPEED):
         if hasattr(chunk_audio, "numpy"):
@@ -131,10 +142,13 @@ def generate_audio(pipeline: KPipeline, text: str, voice: str,
     if chunks:
         audio = np.concatenate(chunks, axis=0)
         sf.write(str(output_path), audio, SAMPLE_RATE)
+        elapsed = time.monotonic() - t0
         duration = len(audio) / SAMPLE_RATE
-        print(f"  ✓ Saved '{output_path.name}' ({duration:.1f}s)")
+        print(f"  ✓ Saved '{output_path.name}' ({duration:.1f}s audio | {elapsed:.1f}s wall-clock)")
     else:
+        elapsed = time.monotonic() - t0
         print(f"  ✗ No audio produced for voice='{voice}'")
+    return elapsed
 
 
 # ── Main ───────────────────────────────────────────────────────────────────────
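A few spot checks on the new `_fmt_duration` helper (my own examples, not part of the diff). One quirk worth knowing: the sub-minute branch rounds with `:.0f`, so values just under a minute print as `60s` rather than rolling over to `1m 00s`:

```python
assert _fmt_duration(42) == "42s"
assert _fmt_duration(75) == "1m 15s"
assert _fmt_duration(3605) == "60m 05s"   # minutes are not folded into hours
assert _fmt_duration(59.6) == "60s"       # rounds up but stays in the seconds branch
```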
@@ -156,19 +170,59 @@ def main() -> None:
     print("Initialising Kokoro pipeline …")
     pipeline = KPipeline(lang_code=LANG_CODE)
 
+    # Pre-compute char counts for all sections so we can estimate ETAs
+    section_chars: dict[str, int] = {
+        label: len(clean_text(sections[label]))
+        for label, _, _, _ in BOOKS
+        if label in sections
+    }
+
+    chars_per_sec: float | None = None  # derived from the first book that finishes
+    timing_rows: list[tuple[str, int, float]] = []  # (label, chars, elapsed)
+
     for label, marker, voice, wav_name in BOOKS:
         if label not in sections:
             continue  # marker was not found; warning already printed
 
-        print(f"\n[{label}] voice={voice} → {wav_name}")
-        text = clean_text(sections[label])
-        if not text:
-            print("  ⚠ Empty text — skipping")
-            continue
-
-        out_path = OUTPUT_DIR / wav_name
-        generate_audio(pipeline, text, voice, out_path)
+        text = clean_text(sections[label])
+        if not text:
+            print(f"\n[{label}] ⚠ Empty text — skipping")
+            continue
+
+        chars = section_chars[label]
+
+        # Print ETA once we have a calibration rate
+        if chars_per_sec is not None:
+            eta_sec = chars / chars_per_sec
+            eta_str = _fmt_duration(eta_sec)
+            print(f"\n[{label}] voice={voice} → {wav_name} (est. {eta_str})")
+        else:
+            print(f"\n[{label}] voice={voice} → {wav_name} (timing calibration run)")
+
+        stem, ext = wav_name.rsplit(".", 1)
+        out_path = OUTPUT_DIR / f"{stem}_{voice}.{ext}"
+        elapsed = generate_audio(pipeline, text, voice, out_path)
+        timing_rows.append((label, chars, elapsed))
+
+        # Calibrate from first completed book
+        if chars_per_sec is None and elapsed > 0:
+            chars_per_sec = chars / elapsed
+            print(f"  ⏱ Calibrated: {chars_per_sec:.0f} chars/sec")
+
+    # ── Summary ────────────────────────────────────────────────────────────────
+    print("\n" + "─" * 60)
+    print(f"  {'Section':<30} {'Chars':>7} {'Actual':>8} {'Est':>8}")
+    print("─" * 60)
+    for i, (label, chars, elapsed) in enumerate(timing_rows):
+        actual_str = _fmt_duration(elapsed)
+        if i == 0 or chars_per_sec is None:
+            est_str = "(calibration)"
+        else:
+            est_str = _fmt_duration(chars / chars_per_sec)
+        print(f"  {label:<30} {chars:>7,} {actual_str:>8} {est_str:>8}")
+    total_elapsed = sum(e for _, _, e in timing_rows)
+    print("─" * 60)
+    print(f"  {'TOTAL':<30} {sum(c for _,c,_ in timing_rows):>7,} {_fmt_duration(total_elapsed):>8}")
     print("\nDone.")
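The ETA scheme is simple proportionality: the first finished book fixes a characters-per-second rate, and every later estimate is `chars / chars_per_sec`. Because the calibration book also pays the model warm-up cost, later estimates will tend to run slightly high. A worked example with made-up numbers:

```python
# Hypothetical figures for illustration only.
calib_chars, calib_elapsed = 12_000, 300.0   # first book: 12k chars in 5 minutes
chars_per_sec = calib_chars / calib_elapsed  # 40.0 chars/sec

next_book_chars = 30_000
eta_sec = next_book_chars / chars_per_sec    # 750.0 s
print(_fmt_duration(eta_sec))                # "12m 30s"
```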
@@ -18,6 +18,25 @@ from collections import defaultdict
 from pathlib import Path
 
 import spacy
+from wordfreq import top_n_list
+
+# ── Top 10 000 most-frequent English words ──────────────────────────
+TOP_10K_ENGLISH: frozenset[str] = frozenset(top_n_list("en", 10_000))
+
+# Words in the top-10k list that are genuine proper nouns in this text —
+# keep them despite the frequency filter.
+PROPER_NOUN_WHITELIST: frozenset[str] = frozenset({
+    # Biblical names
+    "aaron", "abel", "abraham", "adam", "cain", "eden", "egypt",
+    "elijah", "ephraim", "eve", "gad", "ham", "isaac", "israel",
+    "jacob", "james", "jehovah", "john", "joseph", "judah",
+    "laban", "lehi", "levi", "micah", "michael", "moses", "noah",
+    "peter", "pharaoh", "samuel", "sarah", "sarai", "seth", "simeon",
+    "timothy", "zion",
+    # Book-specific names that happen to match English words
+    "alma", "ether", "gideon", "limhi", "mormon", "moroni", "mulek",
+    "mosiah", "nephi", "satan", "sidon",
+})
 
 SOURCE = Path("Audio Master Nem Full.txt")
 OUTPUT = Path("proper_nouns.txt")
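`top_n_list` is wordfreq's real API: it returns the `n` most frequent words for a language, lowercased. The filter this enables is two-step: drop any candidate that is a common English word, unless it is explicitly whitelisted as a name. A standalone illustration (whether a given word lands in the top 10k depends on wordfreq's data):

```python
from wordfreq import top_n_list

top10k = frozenset(top_n_list("en", 10_000))
whitelist = {"adam", "alma", "ether"}   # subset of PROPER_NOUN_WHITELIST

for cand in ("Adam", "Ether", "Hagoth", "Great"):
    common = cand.lower() in top10k
    keep = (not common) or (cand.lower() in whitelist)
    print(f"{cand:8} common={common!s:5} keep={keep}")
```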
@@ -35,12 +54,29 @@ ORG_LABELS = {"ORG", "NORP"}
 OTHER_LABELS = {"EVENT", "WORK_OF_ART", "LAW", "PRODUCT", "LANGUAGE"}
 
 # ── Noise filters ──────────────────────────────────────────────────────────────
 # All-caps lines are section headers, not spoken names — skip them.
 # Also skip very short tokens that are likely artefacts.
-SKIP_PATTERNS = re.compile(
-    r"^(THE|A|AN|AND|OF|IN|TO|FOR|BY|AT|IS|WAS|BE|HE|SHE|IT|"
-    r"CHAPTER|VERSE|YEA|BEHOLD|LORD|GOD|CHRIST|HOLY|GHOST)$"
-)
+# Common English words that should be dropped when splitting multi-word entities.
+STOP_WORDS: set[str] = {
+    "A", "AN", "AND", "AS", "AT", "BE", "BUT", "BY",
+    "DO", "DID", "DOTH",
+    "EVEN", "FOR", "FROM",
+    "HAD", "HAS", "HAVE", "HATH", "HE", "HER", "HIS", "HOW",
+    "I", "IN", "IS", "IT", "ITS",
+    "MAY", "ME", "MORE", "MY",
+    "NAY", "NO", "NOT", "NOW",
+    "OF", "OR", "OUR",
+    "SHALL", "SHE", "SO", "SOME",
+    "THAT", "THE", "THEE", "THEIR", "THEN", "THERE", "THESE", "THEY",
+    "THIS", "THOSE", "THOU", "THUS", "THY", "TO",
+    "UP", "UPON", "US",
+    "WAS", "WE", "WHEN", "WHERE", "WHICH", "WHO", "WILL", "WITH",
+    "YE", "YEA", "YET", "YOU", "YOUR",
+    # Book-specific common words not worth flagging
+    "BEHOLD", "CHAPTER", "CHRIST", "GOD", "GHOST", "HOLY", "LORD", "VERSE",
+    # Generic nouns that slip through NER
+    "CITY", "DAYS", "DAY", "GREAT", "LAND", "MAN", "MEN", "NEW",
+    "PEOPLE", "SON", "TIME",
+}
 
 
 def is_noise(text: str) -> bool:
     t = text.strip()
@@ -48,9 +84,12 @@ def is_noise(text: str) -> bool:
         return True
     if t.isupper() and len(t) > 4:  # all-caps section header word
         return True
-    if SKIP_PATTERNS.match(t.upper()):
+    if t.upper() in STOP_WORDS:
         return True
-    if re.search(r"[^a-zA-Z\-' ]", t):  # contains digits or symbols
+    if re.search(r"[^a-zA-Z\-']", t):  # contains digits, spaces, or symbols
         return True
+    # Drop common English words (no hyphens) unless whitelisted as proper nouns.
+    if "-" not in t and t.lower() in TOP_10K_ENGLISH and t.lower() not in PROPER_NOUN_WHITELIST:
+        return True
     return False
 
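The regex change is subtle: the old class `[^a-zA-Z\-' ]` tolerated spaces, so multi-word phrases could pass `is_noise`, while the new class `[^a-zA-Z\-']` rejects anything containing a space, which matches the new one-word-at-a-time pipeline. A quick demonstration (my own snippet):

```python
import re

OLD = re.compile(r"[^a-zA-Z\-' ]")   # space was allowed through
NEW = re.compile(r"[^a-zA-Z\-']")    # space now counts as noise

for t in ("Pa Nat", "Anti-Nephi-Lehi", "Moroni2"):
    print(f"{t!r}: old={bool(OLD.search(t))} new={bool(NEW.search(t))}")
# 'Pa Nat':          old=False  new=True   (space now rejected)
# 'Anti-Nephi-Lehi': old=False  new=False  (hyphens still fine)
# 'Moroni2':         old=True   new=True   (digits were always noise)
```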
@@ -60,6 +99,11 @@ def canonical(text: str) -> str:
     return " ".join(text.split()).title()
 
 
+def split_words(phrase: str) -> list[str]:
+    """Split a phrase on spaces; hyphenated words are kept as one token."""
+    return phrase.split()
+
+
 # ── Read and process ───────────────────────────────────────────────────────────
 print(f"Reading '{SOURCE}' …")
 raw_text = SOURCE.read_text(encoding="utf-8")
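`split_words` is deliberately trivial (plain `str.split()` on whitespace), but it encodes the commit's key policy: multi-word entities fan out into individual names, while hyphenated compounds stay atomic:

```python
print(split_words("Peter James John"))   # ['Peter', 'James', 'John']  → three entries
print(split_words("Anti-Nephi-Lehi"))    # ['Anti-Nephi-Lehi']         → one token
```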
@@ -71,20 +115,23 @@ doc = nlp(raw_text)
 buckets: dict[str, set[str]] = defaultdict(set)
 
 # 1. NER pass — trust spaCy's entity labels
+#    Multi-word entities (e.g. "Peter James John") are split into individual
+#    words; hyphenated words (e.g. "Anti-Nephi-Lehi") stay as one token.
 for ent in doc.ents:
-    name = canonical(ent.text)
-    if is_noise(name):
+    phrase = canonical(ent.text)
+    for word in split_words(phrase):
+        if is_noise(word):
             continue
         if ent.label_ in PERSON_LABELS:
-            buckets["People & Characters"].add(name)
+            buckets["People & Characters"].add(word)
         elif ent.label_ in PLACE_LABELS:
-            buckets["Places & Lands"].add(name)
+            buckets["Places & Lands"].add(word)
         elif ent.label_ in ORG_LABELS:
-            buckets["Groups & Nations"].add(name)
+            buckets["Groups & Nations"].add(word)
         elif ent.label_ in OTHER_LABELS:
-            buckets["Other Named Things"].add(name)
+            buckets["Other Named Things"].add(word)
         else:
-            buckets["Other Named Things"].add(name)
+            buckets["Other Named Things"].add(word)
 
 # 2. PROPN pass — catch names spaCy didn't recognise as entities
 #    Only include tokens that are inside a sentence (not at position 0)
@@ -97,13 +144,13 @@ for token in doc:
         continue  # skip all-caps
     if token.i == token.sent.start:
         continue  # skip sentence-initial (could be any word)
-    name = canonical(text)
-    if is_noise(name):
+    word = canonical(text)
+    if is_noise(word):
         continue
     # Only add if not already captured by NER
-    already_captured = any(name in s for s in buckets.values())
+    already_captured = any(word in s for s in buckets.values())
     if not already_captured:
-        buckets["Unclassified Proper Nouns"].add(name)
+        buckets["Unclassified Proper Nouns"].add(word)
 
 # ── Write output ───────────────────────────────────────────────────────────────
 GROUP_ORDER = [
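The PROPN fallback pass uses only stock spaCy token attributes (`pos_`, `sent`, `i`). A self-contained miniature of the same heuristic, skipping all-caps and sentence-initial tokens; it assumes the small English model `en_core_web_sm` is installed (which pipeline this repo actually loads is not visible in these hunks):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Behold, Hagoth built ships. Shioni went forth unto the land.")

for token in doc:
    if token.pos_ != "PROPN":
        continue
    if token.text.isupper():
        continue                      # all-caps section-header words
    if token.i == token.sent.start:
        continue                      # sentence-initial could be any word
    print(token.text)                 # likely prints 'Hagoth'; 'Shioni' is
                                      # sentence-initial and gets skipped
```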
proper_nouns.txt (1345 lines) — file diff suppressed because it is too large.