""" audiobook_nem.py ──────────────── Generate the Book of the Nem audiobook — one unique voice per book/section. Usage: python create_audiobook_nem.py # all enabled books python create_audiobook_nem.py --list # list available book labels python create_audiobook_nem.py Introduction python create_audiobook_nem.py "Book of Hagoth" python create_audiobook_nem.py Introduction "Book of Hagoth" To permanently skip a section, comment out its entry in BOOKS below. Output .wav files are written to OUTPUT_DIR (created automatically). """ import argparse import re import time import numpy as np import soundfile as sf import torch from pathlib import Path from kokoro import KPipeline # ── Config ───────────────────────────────────────────────────────────────────── _FIXED_FILE = Path("Audio Master Nem Full (TTS Fixed).txt") _ORIG_FILE = Path("Audio Master Nem Full.txt") SOURCE_FILE = _FIXED_FILE if _FIXED_FILE.exists() else _ORIG_FILE OUTPUT_DIR = Path("output_audiobook") SAMPLE_RATE = 24000 SPEED = 1.0 LANG_CODE = "a" # 'a' = American English # ── Available Kokoro voices (American English, lang_code='a') ────────────────── # af_bella – American female [downloaded] # af_heart – warm American female [downloaded] # af_nicole – American female [downloaded] # af_river – American female [downloaded] # af_sarah – American female [downloaded] # af_sky – American female [downloaded] # am_adam – American male (deep) [downloaded] # am_echo – American male [downloaded] # am_eric – American male [downloaded] # am_fenrir – American male [downloaded] # am_liam – American male [downloaded] # am_michael – American male (clear) [downloaded] # am_onyx – American male [downloaded] # am_puck – American male [downloaded] # am_santa – American male [downloaded] (not used) # ── Book definitions ─────────────────────────────────────────────────────────── # Format: (label, (start_line1, start_line2), voice, output_wav) # start_line1 – exact text of the FIRST line of the section header # start_line2 – prefix of the SECOND line (used together for unambiguous matching) # voice – Kokoro voice name # output_wav – filename saved inside OUTPUT_DIR # # Comment out any line to skip that section entirely. BOOKS = [ # label (start_line1, start_line2) voice output_wav ("Introduction", ("Introduction", "The Book of the Nem"), "af_heart", "00_introduction.wav"), ("Book of Hagoth", ("THE BOOK OF HAGOTH", "THE SON OF HAGMENI,"), "am_santa", "01_hagoth.wav"), ("Shi-Tugo I", ("THE FIRST BOOK OF SHI-TUGO", "FORMER WARRIOR, AMMONITE"), "am_eric", "02_shi_tugo_1.wav"), ("Sanempet", ("THE BOOK OF SANEMPET", "THE SON OF HAGMENI,"), "am_liam", "03_sanempet.wav"), ("Oug", ("THE BOOK OF OUG", "THE SON OF SANEMPET"), "am_michael", "04_oug.wav"), ("Temple Writings of Oug", ("THE BOOK OF", "THE TEMPLE WRITINGS"), "am_michael", "05_temple_writings_oug.wav"), ("Sacred Temple Writings", ("THE SACRED", "TEMPLE WRITINGS"), "am_michael", "06_sacred_temple_writings.wav"), ("Samuel the Lamanite I", ("THE FIRST BOOK", "OF SAMUEL THE LAMANITE"), "am_echo", "07_samuel_lamanite_1.wav"), ("Samuel the Lamanite II", ("THE SECOND BOOK", "OF SAMUEL THE LAMANITE"), "am_echo", "08_samuel_lamanite_2.wav"), ("Manti", ("THE BOOK OF MANTI", "THE SON OF OUG"), "am_onyx", "09_manti.wav"), ("Pa Nat I", ("THE FIRST BOOK OF PA NAT", "THE DAUGHTER OF SHIMLEI"), "af_bella", "10_pa_nat_1.wav"), ("Moroni I", ("THE FIRST BOOK OF MORONI", "THE SON OF MORMON,"), "am_adam", "11_moroni_1.wav"), ("Moroni II", ("THE SECOND BOOK OF MORONI", "THE SON OF MORMON,"), "am_adam", "12_moroni_2.wav"), ("Moroni III", ("THE THIRD BOOK OF MORONI", "THE SON OF MORMON,"), "am_adam", "13_moroni_3.wav"), ("Shioni", ("THE BOOK OF SHIONI", "THE SON OF MORONI"), "am_puck", "14_shioni.wav"), ] # ── Helpers ──────────────────────────────────────────────────────────────────── def load_and_split(source: Path, books: list) -> dict[str, str]: """ Read the source file and split it into sections keyed by label. Each section starts at its (start_line1, start_line2) marker pair and ends just before the next section's marker. Marker positions are always detected from the *original* unmodified file (_ORIG_FILE) when it exists, so that phonetic fixes applied to section headings in the TTS-fixed file can never break section detection. The line numbers are identical in both files because word-level replacements never add or remove lines. """ # Use the original (un-fixed) file for marker detection so phonetic # changes to heading lines don't break matching. marker_source = _ORIG_FILE if _ORIG_FILE.exists() else source marker_lines = marker_source.read_text(encoding="utf-8").splitlines() # The content to actually return comes from `source` (may be fixed file). content_lines = source.read_text(encoding="utf-8").splitlines() # Build a mapping: (label, line1, line2) for each book markers = [(label, m[0].strip(), m[1].strip()) for label, m, _, _ in books] # Find the line index of each marker's first occurrence (two-line match) marker_positions: list[tuple[int, int]] = [] # (line_idx, books_idx) for book_idx, (label, m1, m2) in enumerate(markers): for line_idx, line in enumerate(marker_lines[:-1]): if (line.strip().upper() == m1.upper() and marker_lines[line_idx + 1].strip().upper().startswith(m2.upper())): marker_positions.append((line_idx, book_idx)) break else: print(f" ⚠ Marker not found for '{label}': '{m1}' / '{m2}' — skipping") marker_positions.sort(key=lambda x: x[0]) sections: dict[str, str] = {} for rank, (line_idx, book_idx) in enumerate(marker_positions): label = markers[book_idx][0] if rank + 1 < len(marker_positions): end_line = marker_positions[rank + 1][0] else: end_line = len(content_lines) text = "\n".join(content_lines[line_idx:end_line]).strip() sections[label] = text return sections def clean_text(text: str) -> str: """ Strip formatting artifacts, underscores, and normalise whitespace so the TTS receives clean prose. """ # Remove lines that are pure underscores (horizontal rules) text = re.sub(r"^_{3,}\s*$", "", text, flags=re.MULTILINE) # Remove leading chapter headers that are all-caps lines # (keep them as natural spoken title for context) # Collapse excess blank lines text = re.sub(r"\n{3,}", "\n\n", text) return text.strip() def _fmt_duration(seconds: float) -> str: """Format seconds as 'Xm Ys' or 'Xs'.""" if seconds >= 60: m, s = divmod(int(seconds), 60) return f"{m}m {s:02d}s" return f"{seconds:.0f}s" def generate_audio(pipeline: KPipeline, text: str, voice: str, output_path: Path) -> float: """Generate audio and return wall-clock seconds elapsed.""" t0 = time.monotonic() chunks = [] for _, _, chunk_audio in pipeline(text, voice=voice, speed=SPEED): if hasattr(chunk_audio, "numpy"): chunk_audio = chunk_audio.cpu().numpy() chunk_audio = np.atleast_1d(chunk_audio.squeeze()) if chunk_audio.size > 0: chunks.append(chunk_audio) if chunks: audio = np.concatenate(chunks, axis=0) sf.write(str(output_path), audio, SAMPLE_RATE) elapsed = time.monotonic() - t0 duration = len(audio) / SAMPLE_RATE print(f" ✓ Saved '{output_path.name}' ({duration:.1f}s audio | {elapsed:.1f}s wall-clock)") else: elapsed = time.monotonic() - t0 print(f" ✗ No audio produced for voice='{voice}'") return elapsed # ── Main ─────────────────────────────────────────────────────────────────────── def main() -> None: # ── CLI ──────────────────────────────────────────────────────────── parser = argparse.ArgumentParser(description="Generate Nem audiobook sections.") parser.add_argument( "books", nargs="*", help="Labels of sections to generate (default: all enabled books). " "Use --list to see available labels." ) parser.add_argument( "--list", action="store_true", help="Print all enabled book labels and exit." ) parser.add_argument( "--preview", nargs="?", const=3000, type=int, metavar="CHARS", help="Generate a short preview clip per book (default: 3000 chars). " "Output filenames get a _preview suffix." ) args = parser.parse_args() enabled_labels = [label for label, _, _, _ in BOOKS] if args.list: print("Enabled books:") for label in enabled_labels: print(f" {label}") return # Filter to requested subset, preserving BOOKS order if args.books: unknown = [b for b in args.books if b not in enabled_labels] if unknown: print(f"Unknown book label(s): {', '.join(unknown)}") print(f"Run with --list to see available labels.") return run_books = [b for b in BOOKS if b[0] in args.books] else: run_books = list(BOOKS) device = "cuda" if torch.cuda.is_available() else "cpu" print(f"Device: {device}") if device == "cuda": print(f"GPU: {torch.cuda.get_device_name(0)}") OUTPUT_DIR.mkdir(exist_ok=True) print(f"\nSource: '{SOURCE_FILE}'" + (" ✓ (TTS fixed)" if SOURCE_FILE == _FIXED_FILE else " ⚠ (original — run 'Apply Fixes to Text' in the GUI to use phonetic fixes)")) # Always split using ALL books for correct section boundaries, # but only generate for run_books. sections = load_and_split(SOURCE_FILE, BOOKS) print(f" Found {len(sections)} sections ({len(run_books)} selected).\n") print("Initialising Kokoro pipeline …") pipeline = KPipeline(lang_code=LANG_CODE) # Pre-compute char counts for all sections so we can estimate ETAs section_chars: dict[str, int] = { label: len(clean_text(sections[label])) for label, _, _, _ in run_books if label in sections } # Print char count summary before starting preview_note = f" ⚡ PREVIEW MODE — capped at {args.preview:,} chars/book\n" if args.preview else "" print(f"\n{preview_note}{'─' * 52}") print(f" {'Section':<30} {'Chars':>8}") print(f"{'─' * 52}") for label, _, _, wav_name in run_books: if label in section_chars: print(f" {label:<30} {section_chars[label]:>8,}") print(f"{'─' * 52}") total_chars = sum(section_chars.values()) print(f" {'TOTAL':<30} {total_chars:>8,}") print() chars_per_sec: float | None = None # derived from the first book that finishes timing_rows: list[tuple[str, int, float]] = [] # (label, chars, elapsed) for label, _marker, voice, wav_name in run_books: if label not in sections: continue text = clean_text(sections[label]) if not text: print(f"\n[{label}] ⚠ Empty text — skipping") continue # Preview mode: truncate to requested char limit at a word boundary preview_chars = args.preview if preview_chars: if len(text) > preview_chars: cut = text.rfind(" ", 0, preview_chars) text = text[: cut if cut > 0 else preview_chars] chars = len(text) # Print ETA once we have a calibration rate if chars_per_sec is not None: eta_sec = chars / chars_per_sec eta_str = _fmt_duration(eta_sec) print(f"\n[{label}] voice={voice} → {wav_name} (est. {eta_str})") else: print(f"\n[{label}] voice={voice} → {wav_name} (timing calibration run)") stem, ext = wav_name.rsplit(".", 1) preview_tag = "_preview" if preview_chars else "" out_path = OUTPUT_DIR / f"{stem}_{voice}{preview_tag}.{ext}" elapsed = generate_audio(pipeline, text, voice, out_path) timing_rows.append((label, chars, elapsed)) # Update calibration as a cumulative average after every book total_chars_done = sum(c for _, c, _ in timing_rows) total_elapsed_done = sum(e for _, _, e in timing_rows) if total_elapsed_done > 0: chars_per_sec = total_chars_done / total_elapsed_done print(f" ⏱ Calibration: {chars_per_sec:.0f} chars/sec") # ── Summary ──────────────────────────────────────────────────────────────── print("\n" + "─" * 60) print(f" {'Section':<30} {'Chars':>7} {'Actual':>8} {'Est':>8}") print("─" * 60) for i, (label, chars, elapsed) in enumerate(timing_rows): actual_str = _fmt_duration(elapsed) # Estimate using the cumulative rate *before* this book was added prior_chars = sum(c for _, c, _ in timing_rows[:i]) prior_elapsed = sum(e for _, _, e in timing_rows[:i]) if prior_elapsed > 0: est_str = _fmt_duration(chars / (prior_chars / prior_elapsed)) else: est_str = "(first run)" print(f" {label:<30} {chars:>7,} {actual_str:>8} {est_str:>8}") total_elapsed = sum(e for _, _, e in timing_rows) print("─" * 60) print(f" {'TOTAL':<30} {sum(c for _,c,_ in timing_rows):>7,} {_fmt_duration(total_elapsed):>8}") print("\nDone.") if __name__ == "__main__": main()