Compare commits
2 Commits
f0e0adf24b
...
6cefc3c862
| Author | SHA1 | Date | |
|---|---|---|---|
| 6cefc3c862 | |||
| 949bd7c203 |
@ -11,6 +11,7 @@ Output .wav files are written to OUTPUT_DIR (created automatically).
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
import time
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import soundfile as sf
|
import soundfile as sf
|
||||||
import torch
|
import torch
|
||||||
@ -51,19 +52,19 @@ BOOKS = [
|
|||||||
# label start_marker voice output_wav
|
# label start_marker voice output_wav
|
||||||
("Introduction", "Introduction", "af_heart", "00_introduction.wav"),
|
("Introduction", "Introduction", "af_heart", "00_introduction.wav"),
|
||||||
("Book of Hagoth", "THE BOOK OF HAGOTH", "am_fenrir", "01_hagoth.wav"),
|
("Book of Hagoth", "THE BOOK OF HAGOTH", "am_fenrir", "01_hagoth.wav"),
|
||||||
("Shi-Tugo I", "THE FIRST BOOK OF SHI-TUGO", "am_eric", "02_shi_tugo_1.wav"),
|
# ("Shi-Tugo I", "THE FIRST BOOK OF SHI-TUGO", "am_eric", "02_shi_tugo_1.wav"),
|
||||||
("Sanempet", "THE BOOK OF SANEMPET", "am_liam", "03_sanempet.wav"),
|
# ("Sanempet", "THE BOOK OF SANEMPET", "am_liam", "03_sanempet.wav"),
|
||||||
("Oug", "THE BOOK OF OUG", "am_michael", "04_oug.wav"),
|
# ("Oug", "THE BOOK OF OUG", "am_michael", "04_oug.wav"),
|
||||||
("Temple Writings of Oug", "THE BOOK OF", "am_michael", "05_temple_writings_oug.wav"),
|
# ("Temple Writings of Oug", "THE BOOK OF", "am_michael", "05_temple_writings_oug.wav"),
|
||||||
("Sacred Temple Writings", "THE SACRED", "am_michael", "06_sacred_temple_writings.wav"),
|
# ("Sacred Temple Writings", "THE SACRED", "am_michael", "06_sacred_temple_writings.wav"),
|
||||||
("Samuel the Lamanite I", "THE FIRST BOOK", "am_echo", "07_samuel_lamanite_1.wav"),
|
# ("Samuel the Lamanite I", "THE FIRST BOOK", "am_echo", "07_samuel_lamanite_1.wav"),
|
||||||
("Samuel the Lamanite II", "THE SECOND BOOK", "am_echo", "08_samuel_lamanite_2.wav"),
|
# ("Samuel the Lamanite II", "THE SECOND BOOK", "am_echo", "08_samuel_lamanite_2.wav"),
|
||||||
("Manti", "THE BOOK OF MANTI", "am_onyx", "09_manti.wav"),
|
# ("Manti", "THE BOOK OF MANTI", "am_onyx", "09_manti.wav"),
|
||||||
("Pa Nat I", "THE FIRST BOOK OF PA NAT", "af_nicole", "10_pa_nat_1.wav"),
|
# ("Pa Nat I", "THE FIRST BOOK OF PA NAT", "af_nicole", "10_pa_nat_1.wav"),
|
||||||
("Moroni I", "THE FIRST BOOK OF MORONI", "am_adam", "11_moroni_1.wav"),
|
# ("Moroni I", "THE FIRST BOOK OF MORONI", "am_adam", "11_moroni_1.wav"),
|
||||||
("Moroni II", "THE SECOND BOOK OF MORONI", "am_adam", "12_moroni_2.wav"),
|
# ("Moroni II", "THE SECOND BOOK OF MORONI", "am_adam", "12_moroni_2.wav"),
|
||||||
("Moroni III", "THE THIRD BOOK OF MORONI", "am_adam", "13_moroni_3.wav"),
|
# ("Moroni III", "THE THIRD BOOK OF MORONI", "am_adam", "13_moroni_3.wav"),
|
||||||
("Shioni", "THE BOOK OF SHIONI", "am_puck", "14_shioni.wav"),
|
# ("Shioni", "THE BOOK OF SHIONI", "am_puck", "14_shioni.wav"),
|
||||||
]
|
]
|
||||||
|
|
||||||
# ── Helpers ────────────────────────────────────────────────────────────────────
|
# ── Helpers ────────────────────────────────────────────────────────────────────
|
||||||
@ -118,8 +119,18 @@ def clean_text(text: str) -> str:
|
|||||||
return text.strip()
|
return text.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _fmt_duration(seconds: float) -> str:
|
||||||
|
"""Format seconds as 'Xm Ys' or 'Xs'."""
|
||||||
|
if seconds >= 60:
|
||||||
|
m, s = divmod(int(seconds), 60)
|
||||||
|
return f"{m}m {s:02d}s"
|
||||||
|
return f"{seconds:.0f}s"
|
||||||
|
|
||||||
|
|
||||||
def generate_audio(pipeline: KPipeline, text: str, voice: str,
|
def generate_audio(pipeline: KPipeline, text: str, voice: str,
|
||||||
output_path: Path) -> None:
|
output_path: Path) -> float:
|
||||||
|
"""Generate audio and return wall-clock seconds elapsed."""
|
||||||
|
t0 = time.monotonic()
|
||||||
chunks = []
|
chunks = []
|
||||||
for _, _, chunk_audio in pipeline(text, voice=voice, speed=SPEED):
|
for _, _, chunk_audio in pipeline(text, voice=voice, speed=SPEED):
|
||||||
if hasattr(chunk_audio, "numpy"):
|
if hasattr(chunk_audio, "numpy"):
|
||||||
@ -131,10 +142,13 @@ def generate_audio(pipeline: KPipeline, text: str, voice: str,
|
|||||||
if chunks:
|
if chunks:
|
||||||
audio = np.concatenate(chunks, axis=0)
|
audio = np.concatenate(chunks, axis=0)
|
||||||
sf.write(str(output_path), audio, SAMPLE_RATE)
|
sf.write(str(output_path), audio, SAMPLE_RATE)
|
||||||
|
elapsed = time.monotonic() - t0
|
||||||
duration = len(audio) / SAMPLE_RATE
|
duration = len(audio) / SAMPLE_RATE
|
||||||
print(f" ✓ Saved '{output_path.name}' ({duration:.1f}s)")
|
print(f" ✓ Saved '{output_path.name}' ({duration:.1f}s audio | {elapsed:.1f}s wall-clock)")
|
||||||
else:
|
else:
|
||||||
|
elapsed = time.monotonic() - t0
|
||||||
print(f" ✗ No audio produced for voice='{voice}'")
|
print(f" ✗ No audio produced for voice='{voice}'")
|
||||||
|
return elapsed
|
||||||
|
|
||||||
|
|
||||||
# ── Main ───────────────────────────────────────────────────────────────────────
|
# ── Main ───────────────────────────────────────────────────────────────────────
|
||||||
@ -156,19 +170,59 @@ def main() -> None:
|
|||||||
print("Initialising Kokoro pipeline …")
|
print("Initialising Kokoro pipeline …")
|
||||||
pipeline = KPipeline(lang_code=LANG_CODE)
|
pipeline = KPipeline(lang_code=LANG_CODE)
|
||||||
|
|
||||||
|
# Pre-compute char counts for all sections so we can estimate ETAs
|
||||||
|
section_chars: dict[str, int] = {
|
||||||
|
label: len(clean_text(sections[label]))
|
||||||
|
for label, _, _, _ in BOOKS
|
||||||
|
if label in sections
|
||||||
|
}
|
||||||
|
|
||||||
|
chars_per_sec: float | None = None # derived from the first book that finishes
|
||||||
|
timing_rows: list[tuple[str, int, float]] = [] # (label, chars, elapsed)
|
||||||
|
|
||||||
for label, marker, voice, wav_name in BOOKS:
|
for label, marker, voice, wav_name in BOOKS:
|
||||||
if label not in sections:
|
if label not in sections:
|
||||||
continue # marker was not found; warning already printed
|
|
||||||
|
|
||||||
print(f"\n[{label}] voice={voice} → {wav_name}")
|
|
||||||
text = clean_text(sections[label])
|
|
||||||
if not text:
|
|
||||||
print(" ⚠ Empty text — skipping")
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
out_path = OUTPUT_DIR / wav_name
|
text = clean_text(sections[label])
|
||||||
generate_audio(pipeline, text, voice, out_path)
|
if not text:
|
||||||
|
print(f"\n[{label}] ⚠ Empty text — skipping")
|
||||||
|
continue
|
||||||
|
|
||||||
|
chars = section_chars[label]
|
||||||
|
|
||||||
|
# Print ETA once we have a calibration rate
|
||||||
|
if chars_per_sec is not None:
|
||||||
|
eta_sec = chars / chars_per_sec
|
||||||
|
eta_str = _fmt_duration(eta_sec)
|
||||||
|
print(f"\n[{label}] voice={voice} → {wav_name} (est. {eta_str})")
|
||||||
|
else:
|
||||||
|
print(f"\n[{label}] voice={voice} → {wav_name} (timing calibration run)")
|
||||||
|
|
||||||
|
stem, ext = wav_name.rsplit(".", 1)
|
||||||
|
out_path = OUTPUT_DIR / f"{stem}_{voice}.{ext}"
|
||||||
|
elapsed = generate_audio(pipeline, text, voice, out_path)
|
||||||
|
timing_rows.append((label, chars, elapsed))
|
||||||
|
|
||||||
|
# Calibrate from first completed book
|
||||||
|
if chars_per_sec is None and elapsed > 0:
|
||||||
|
chars_per_sec = chars / elapsed
|
||||||
|
print(f" ⏱ Calibrated: {chars_per_sec:.0f} chars/sec")
|
||||||
|
|
||||||
|
# ── Summary ────────────────────────────────────────────────────────────────
|
||||||
|
print("\n" + "─" * 60)
|
||||||
|
print(f" {'Section':<30} {'Chars':>7} {'Actual':>8} {'Est':>8}")
|
||||||
|
print("─" * 60)
|
||||||
|
for i, (label, chars, elapsed) in enumerate(timing_rows):
|
||||||
|
actual_str = _fmt_duration(elapsed)
|
||||||
|
if i == 0 or chars_per_sec is None:
|
||||||
|
est_str = "(calibration)"
|
||||||
|
else:
|
||||||
|
est_str = _fmt_duration(chars / chars_per_sec)
|
||||||
|
print(f" {label:<30} {chars:>7,} {actual_str:>8} {est_str:>8}")
|
||||||
|
total_elapsed = sum(e for _, _, e in timing_rows)
|
||||||
|
print("─" * 60)
|
||||||
|
print(f" {'TOTAL':<30} {sum(c for _,c,_ in timing_rows):>7,} {_fmt_duration(total_elapsed):>8}")
|
||||||
print("\nDone.")
|
print("\nDone.")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -18,6 +18,25 @@ from collections import defaultdict
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import spacy
|
import spacy
|
||||||
|
from wordfreq import top_n_list
|
||||||
|
|
||||||
|
# ── Top 10 000 most-frequent English words ──────────────────────────
|
||||||
|
TOP_10K_ENGLISH: frozenset[str] = frozenset(top_n_list("en", 10_000))
|
||||||
|
|
||||||
|
# Words in the top-10k list that are genuine proper nouns in this text —
|
||||||
|
# keep them despite the frequency filter.
|
||||||
|
PROPER_NOUN_WHITELIST: frozenset[str] = frozenset({
|
||||||
|
# Biblical names
|
||||||
|
"aaron", "abel", "abraham", "adam", "cain", "eden", "egypt",
|
||||||
|
"elijah", "ephraim", "eve", "gad", "ham", "isaac", "israel",
|
||||||
|
"jacob", "james", "jehovah", "john", "joseph", "judah",
|
||||||
|
"laban", "lehi", "levi", "micah", "michael", "moses", "noah",
|
||||||
|
"peter", "pharaoh", "samuel", "sarah", "sarai", "seth", "simeon",
|
||||||
|
"timothy", "zion",
|
||||||
|
# Book-specific names that happen to match English words
|
||||||
|
"alma", "ether", "gideon", "limhi", "mormon", "moroni", "mulek",
|
||||||
|
"mosiah", "nephi", "satan", "sidon",
|
||||||
|
})
|
||||||
|
|
||||||
SOURCE = Path("Audio Master Nem Full.txt")
|
SOURCE = Path("Audio Master Nem Full.txt")
|
||||||
OUTPUT = Path("proper_nouns.txt")
|
OUTPUT = Path("proper_nouns.txt")
|
||||||
@ -35,12 +54,29 @@ ORG_LABELS = {"ORG", "NORP"}
|
|||||||
OTHER_LABELS = {"EVENT", "WORK_OF_ART", "LAW", "PRODUCT", "LANGUAGE"}
|
OTHER_LABELS = {"EVENT", "WORK_OF_ART", "LAW", "PRODUCT", "LANGUAGE"}
|
||||||
|
|
||||||
# ── Noise filters ──────────────────────────────────────────────────────────────
|
# ── Noise filters ──────────────────────────────────────────────────────────────
|
||||||
# All-caps lines are section headers, not spoken names — skip them.
|
# Common English words that should be dropped when splitting multi-word entities.
|
||||||
# Also skip very short tokens that are likely artefacts.
|
STOP_WORDS: set[str] = {
|
||||||
SKIP_PATTERNS = re.compile(
|
"A", "AN", "AND", "AS", "AT", "BE", "BUT", "BY",
|
||||||
r"^(THE|A|AN|AND|OF|IN|TO|FOR|BY|AT|IS|WAS|BE|HE|SHE|IT|"
|
"DO", "DID", "DOTH",
|
||||||
r"CHAPTER|VERSE|YEA|BEHOLD|LORD|GOD|CHRIST|HOLY|GHOST)$"
|
"EVEN", "FOR", "FROM",
|
||||||
)
|
"HAD", "HAS", "HAVE", "HATH", "HE", "HER", "HIS", "HOW",
|
||||||
|
"I", "IN", "IS", "IT", "ITS",
|
||||||
|
"MAY", "ME", "MORE", "MY",
|
||||||
|
"NAY", "NO", "NOT", "NOW",
|
||||||
|
"OF", "OR", "OUR",
|
||||||
|
"SHALL", "SHE", "SO", "SOME",
|
||||||
|
"THAT", "THE", "THEE", "THEIR", "THEN", "THERE", "THESE", "THEY",
|
||||||
|
"THIS", "THOSE", "THOU", "THUS", "THY", "TO",
|
||||||
|
"UP", "UPON", "US",
|
||||||
|
"WAS", "WE", "WHEN", "WHERE", "WHICH", "WHO", "WILL", "WITH",
|
||||||
|
"YE", "YEA", "YET", "YOU", "YOUR",
|
||||||
|
# Book-specific common words not worth flagging
|
||||||
|
"BEHOLD", "CHAPTER", "CHRIST", "GOD", "GHOST", "HOLY", "LORD", "VERSE",
|
||||||
|
# Generic nouns that slip through NER
|
||||||
|
"CITY", "DAYS", "DAY", "GREAT", "LAND", "MAN", "MEN", "NEW",
|
||||||
|
"PEOPLE", "SON", "TIME",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def is_noise(text: str) -> bool:
|
def is_noise(text: str) -> bool:
|
||||||
t = text.strip()
|
t = text.strip()
|
||||||
@ -48,9 +84,12 @@ def is_noise(text: str) -> bool:
|
|||||||
return True
|
return True
|
||||||
if t.isupper() and len(t) > 4: # all-caps section header word
|
if t.isupper() and len(t) > 4: # all-caps section header word
|
||||||
return True
|
return True
|
||||||
if SKIP_PATTERNS.match(t.upper()):
|
if t.upper() in STOP_WORDS:
|
||||||
return True
|
return True
|
||||||
if re.search(r"[^a-zA-Z\-' ]", t): # contains digits or symbols
|
if re.search(r"[^a-zA-Z\-']", t): # contains digits, spaces, or symbols
|
||||||
|
return True
|
||||||
|
# Drop common English words (no hyphens) unless whitelisted as proper nouns.
|
||||||
|
if "-" not in t and t.lower() in TOP_10K_ENGLISH and t.lower() not in PROPER_NOUN_WHITELIST:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -60,6 +99,11 @@ def canonical(text: str) -> str:
|
|||||||
return " ".join(text.split()).title()
|
return " ".join(text.split()).title()
|
||||||
|
|
||||||
|
|
||||||
|
def split_words(phrase: str) -> list[str]:
    """Split *phrase* into tokens on runs of whitespace.

    Hyphens are not treated as separators, so a hyphenated name such as
    "Anti-Nephi-Lehi" is returned as a single token.  An empty or
    whitespace-only phrase yields an empty list.
    """
    return phrase.split()
|
||||||
|
|
||||||
|
|
||||||
# ── Read and process ───────────────────────────────────────────────────────────
|
# ── Read and process ───────────────────────────────────────────────────────────
|
||||||
print(f"Reading '{SOURCE}' …")
|
print(f"Reading '{SOURCE}' …")
|
||||||
raw_text = SOURCE.read_text(encoding="utf-8")
|
raw_text = SOURCE.read_text(encoding="utf-8")
|
||||||
@ -71,20 +115,23 @@ doc = nlp(raw_text)
|
|||||||
buckets: dict[str, set[str]] = defaultdict(set)
|
buckets: dict[str, set[str]] = defaultdict(set)
|
||||||
|
|
||||||
# 1. NER pass — trust spaCy's entity labels
|
# 1. NER pass — trust spaCy's entity labels
|
||||||
|
# Multi-word entities (e.g. "Peter James John") are split into individual
|
||||||
|
# words; hyphenated words (e.g. "Anti-Nephi-Lehi") stay as one token.
|
||||||
for ent in doc.ents:
|
for ent in doc.ents:
|
||||||
name = canonical(ent.text)
|
phrase = canonical(ent.text)
|
||||||
if is_noise(name):
|
for word in split_words(phrase):
|
||||||
|
if is_noise(word):
|
||||||
continue
|
continue
|
||||||
if ent.label_ in PERSON_LABELS:
|
if ent.label_ in PERSON_LABELS:
|
||||||
buckets["People & Characters"].add(name)
|
buckets["People & Characters"].add(word)
|
||||||
elif ent.label_ in PLACE_LABELS:
|
elif ent.label_ in PLACE_LABELS:
|
||||||
buckets["Places & Lands"].add(name)
|
buckets["Places & Lands"].add(word)
|
||||||
elif ent.label_ in ORG_LABELS:
|
elif ent.label_ in ORG_LABELS:
|
||||||
buckets["Groups & Nations"].add(name)
|
buckets["Groups & Nations"].add(word)
|
||||||
elif ent.label_ in OTHER_LABELS:
|
elif ent.label_ in OTHER_LABELS:
|
||||||
buckets["Other Named Things"].add(name)
|
buckets["Other Named Things"].add(word)
|
||||||
else:
|
else:
|
||||||
buckets["Other Named Things"].add(name)
|
buckets["Other Named Things"].add(word)
|
||||||
|
|
||||||
# 2. PROPN pass — catch names spaCy didn't recognise as entities
|
# 2. PROPN pass — catch names spaCy didn't recognise as entities
|
||||||
# Only include tokens that are inside a sentence (not at position 0)
|
# Only include tokens that are inside a sentence (not at position 0)
|
||||||
@ -97,13 +144,13 @@ for token in doc:
|
|||||||
continue # skip all-caps
|
continue # skip all-caps
|
||||||
if token.i == token.sent.start:
|
if token.i == token.sent.start:
|
||||||
continue # skip sentence-initial (could be any word)
|
continue # skip sentence-initial (could be any word)
|
||||||
name = canonical(text)
|
word = canonical(text)
|
||||||
if is_noise(name):
|
if is_noise(word):
|
||||||
continue
|
continue
|
||||||
# Only add if not already captured by NER
|
# Only add if not already captured by NER
|
||||||
already_captured = any(name in s for s in buckets.values())
|
already_captured = any(word in s for s in buckets.values())
|
||||||
if not already_captured:
|
if not already_captured:
|
||||||
buckets["Unclassified Proper Nouns"].add(name)
|
buckets["Unclassified Proper Nouns"].add(word)
|
||||||
|
|
||||||
# ── Write output ───────────────────────────────────────────────────────────────
|
# ── Write output ───────────────────────────────────────────────────────────────
|
||||||
GROUP_ORDER = [
|
GROUP_ORDER = [
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
1345
proper_nouns.txt
1345
proper_nouns.txt
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user