# audiobook_creator/create_audiobook_nem.py

"""
create_audiobook_nem.py
───────────────────────
Generate the Book of the Nem audiobook — one unique voice per book/section.

Usage:
    python create_audiobook_nem.py                   # all enabled books
    python create_audiobook_nem.py --list            # list available book labels
    python create_audiobook_nem.py Introduction
    python create_audiobook_nem.py "Book of Hagoth"
    python create_audiobook_nem.py Introduction "Book of Hagoth"

To permanently skip a section, comment out its entry in BOOKS below.
Output .wav files are written to OUTPUT_DIR (created automatically).
"""

import argparse
import re
import time
import numpy as np
import soundfile as sf
import torch
from pathlib import Path
from kokoro import KPipeline

# ── Config ─────────────────────────────────────────────────────────────────────
# Manuscript inputs: the untouched original and the variant with TTS fixes applied.
_ORIG_FILE = Path("Audio Master Nem Full.txt")
_FIXED_FILE = Path("Audio Master Nem Full (TTS Fixed).txt")

# Prefer the TTS-fixed manuscript whenever it is present on disk.
if _FIXED_FILE.exists():
    SOURCE_FILE = _FIXED_FILE
else:
    SOURCE_FILE = _ORIG_FILE

# Synthesis settings.
LANG_CODE = "a"  # 'a' = American English
SPEED = 1.0  # passed to the pipeline as `speed`
SAMPLE_RATE = 24_000  # sample rate handed to soundfile when writing the .wav
OUTPUT_DIR = Path("output_audiobook")

# ── Available Kokoro voices (American English, lang_code='a') ──────────────────
#   af_bella   – American female             [downloaded]
#   af_heart   – warm American female        [downloaded]
#   af_nicole  – American female             [downloaded]
#   af_river   – American female             [downloaded]
#   af_sarah   – American female             [downloaded]
#   af_sky     – American female             [downloaded]
#   am_adam    – American male (deep)        [downloaded]
#   am_echo    – American male               [downloaded]
#   am_eric    – American male               [downloaded]
#   am_fenrir  – American male               [downloaded]
#   am_liam    – American male               [downloaded]
#   am_michael – American male (clear)       [downloaded]
#   am_onyx    – American male               [downloaded]
#   am_puck    – American male               [downloaded]
#   am_santa   – American male               [downloaded] (used for Book of Hagoth)

# ── Book definitions ───────────────────────────────────────────────────────────
# Format: (label, (start_line1, start_line2), voice, output_wav)
#   start_line1 – exact text of the FIRST line of the section header
#   start_line2 – prefix of the SECOND line (used together for unambiguous matching)
#   voice        – Kokoro voice name
#   output_wav   – base filename inside OUTPUT_DIR; the voice name (and
#                  "_preview" in preview mode) is appended to its stem
#
# Comment out any line to skip that section entirely.
# Marker matching (see load_and_split) is case-insensitive: start_line1 must
# equal the whole stripped header line, while start_line2 only has to be a
# prefix of the line that follows it.  The order of entries here is the order
# in which sections are generated; section boundaries themselves come from
# where the markers occur in the source file.
BOOKS = [
    # label                       (start_line1,                    start_line2)                           voice         output_wav
    ("Introduction",              ("Introduction",                 "The Book of the Nem"),                "af_heart",   "00_introduction.wav"),
    ("Book of Hagoth",            ("THE BOOK OF HAGOTH",           "THE SON OF HAGMENI,"),                 "am_santa",  "01_hagoth.wav"),
    ("Shi-Tugo I",                ("THE FIRST BOOK OF SHI-TUGO",  "FORMER WARRIOR, AMMONITE"),            "am_eric",    "02_shi_tugo_1.wav"),
    ("Sanempet",                  ("THE BOOK OF SANEMPET",        "THE SON OF HAGMENI,"),                 "am_liam",    "03_sanempet.wav"),
    ("Oug",                       ("THE BOOK OF OUG",             "THE SON OF SANEMPET"),                 "am_michael", "04_oug.wav"),
    ("Temple Writings of Oug",    ("THE BOOK OF",                 "THE TEMPLE WRITINGS"),                "am_michael", "05_temple_writings_oug.wav"),
    ("Sacred Temple Writings",    ("THE SACRED",                  "TEMPLE WRITINGS"),                     "am_michael", "06_sacred_temple_writings.wav"),
    ("Samuel the Lamanite I",     ("THE FIRST BOOK",              "OF SAMUEL THE LAMANITE"),             "am_echo",    "07_samuel_lamanite_1.wav"),
    ("Samuel the Lamanite II",    ("THE SECOND BOOK",             "OF SAMUEL THE LAMANITE"),             "am_echo",    "08_samuel_lamanite_2.wav"),
    ("Manti",                     ("THE BOOK OF MANTI",           "THE SON OF OUG"),                      "am_onyx",    "09_manti.wav"),
    ("Pa Nat I",                  ("THE FIRST BOOK OF PA NAT",    "THE DAUGHTER OF SHIMLEI"),             "af_bella",  "10_pa_nat_1.wav"),
    ("Moroni I",                  ("THE FIRST BOOK OF MORONI",    "THE SON OF MORMON,"),                  "am_adam",    "11_moroni_1.wav"),
    ("Moroni II",                 ("THE SECOND BOOK OF MORONI",   "THE SON OF MORMON,"),                  "am_adam",    "12_moroni_2.wav"),
    ("Moroni III",                ("THE THIRD BOOK OF MORONI",    "THE SON OF MORMON,"),                  "am_adam",    "13_moroni_3.wav"),
    ("Shioni",                    ("THE BOOK OF SHIONI",          "THE SON OF MORONI"),                   "am_puck",    "14_shioni.wav"),
]

# ── Helpers ────────────────────────────────────────────────────────────────────

def load_and_split(source: Path, books: list) -> dict[str, str]:
    """
    Read the source file and split it into sections keyed by label.

    Each section starts at its (start_line1, start_line2) marker pair and
    ends just before the next detected section's marker; the last section
    runs to the end of the file.

    Marker positions are detected from the *original* unmodified file
    (_ORIG_FILE) when it exists, so that phonetic fixes applied to section
    headings in the TTS-fixed file can never break section detection.  This
    relies on both files having identical line numbering (word-level
    replacements never add or remove lines).  A warning is printed when the
    two files have different line counts, because section boundaries may
    then be misaligned.

    Args:
        source: File whose text is returned in the sections.
        books: Sequence of (label, (start_line1, start_line2), voice,
            output_wav) tuples; only the label and marker pair are used.

    Returns:
        Mapping of label -> stripped section text for every book whose
        marker pair was found.  Books without a match are reported on
        stdout and omitted.
    """
    # The content actually returned always comes from `source`.
    content_lines = source.read_text(encoding="utf-8").splitlines()

    # Use the original (un-fixed) file for marker detection so phonetic
    # changes to heading lines don't break matching.
    marker_source = _ORIG_FILE if _ORIG_FILE.exists() else source
    if marker_source == source:
        # Same file: no need to read and split it a second time.
        marker_lines = content_lines
    else:
        marker_lines = marker_source.read_text(encoding="utf-8").splitlines()
        if len(marker_lines) != len(content_lines):
            print(f"  ⚠  Line count mismatch: '{marker_source}' has "
                  f"{len(marker_lines)} lines but '{source}' has "
                  f"{len(content_lines)} — section boundaries may be misaligned")

    # Normalise every line once instead of once per book, and pair each line
    # with its successor for the two-line header match.
    normalised = [line.strip().upper() for line in marker_lines]
    line_pairs = list(zip(normalised, normalised[1:]))

    # (label, line1, line2) for each book
    markers = [(label, m[0].strip(), m[1].strip()) for label, m, _, _ in books]

    # Line index of each marker's first occurrence (two-line match)
    marker_positions: list[tuple[int, int]] = []   # (line_idx, books_idx)
    for book_idx, (label, m1, m2) in enumerate(markers):
        first, second = m1.upper(), m2.upper()
        for line_idx, (line, following) in enumerate(line_pairs):
            if line == first and following.startswith(second):
                marker_positions.append((line_idx, book_idx))
                break
        else:
            print(f"  ⚠  Marker not found for '{label}': '{m1}' / '{m2}' — skipping")

    # Order sections by where they occur in the file, not by BOOKS order.
    marker_positions.sort(key=lambda x: x[0])

    # Each section ends where the next one starts; the last runs to EOF.
    ends = [pos for pos, _ in marker_positions[1:]] + [len(content_lines)]

    sections: dict[str, str] = {}
    for (start, book_idx), end in zip(marker_positions, ends):
        label = markers[book_idx][0]
        sections[label] = "\n".join(content_lines[start:end]).strip()

    return sections


def clean_text(text: str) -> str:
    """
    Prepare a section's raw text for the TTS engine.

    Removes horizontal-rule lines (three or more underscores, optionally
    followed by whitespace), squeezes runs of three or more newlines down
    to a single blank line, and trims surrounding whitespace.  All-caps
    heading lines are deliberately left in place so they are spoken as
    titles.
    """
    rule_line = re.compile(r"^_{3,}\s*$", re.MULTILINE)
    newline_run = re.compile(r"\n{3,}")

    without_rules = rule_line.sub("", text)
    collapsed = newline_run.sub("\n\n", without_rules)
    return collapsed.strip()


def _fmt_duration(seconds: float) -> str:
    """Format seconds as 'Xm Ys' or 'Xs'."""
    if seconds >= 60:
        m, s = divmod(int(seconds), 60)
        return f"{m}m {s:02d}s"
    return f"{seconds:.0f}s"


def generate_audio(pipeline: KPipeline, text: str, voice: str,
                   output_path: Path) -> float:
    """
    Synthesise *text* with *voice* and write the result to *output_path*.

    Every chunk yielded by the pipeline is converted to a NumPy array,
    squeezed to at least one dimension and, when non-empty, concatenated
    into one waveform that is saved at SAMPLE_RATE.  If no chunk yields
    any samples, nothing is written and a failure line is printed.

    Returns the wall-clock seconds elapsed, measured just after the file
    is written (or once the pipeline is exhausted when there is nothing
    to write).
    """
    started = time.monotonic()

    def to_samples(raw):
        # Objects exposing .numpy() are moved to the CPU and converted first
        # (presumably torch tensors — confirm against the Kokoro version used).
        samples = raw.cpu().numpy() if hasattr(raw, "numpy") else raw
        return np.atleast_1d(samples.squeeze())

    converted = (
        to_samples(chunk_audio)
        for _, _, chunk_audio in pipeline(text, voice=voice, speed=SPEED)
    )
    chunks = [samples for samples in converted if samples.size > 0]

    # Guard clause: nothing usable came back from the pipeline.
    if not chunks:
        elapsed = time.monotonic() - started
        print(f"  ✗  No audio produced for voice='{voice}'")
        return elapsed

    audio = np.concatenate(chunks, axis=0)
    sf.write(str(output_path), audio, SAMPLE_RATE)
    elapsed = time.monotonic() - started
    duration = len(audio) / SAMPLE_RATE
    print(f"  ✓  Saved '{output_path.name}'  ({duration:.1f}s audio  |  {elapsed:.1f}s wall-clock)")
    return elapsed


# ── Main ───────────────────────────────────────────────────────────────────────

def main() -> None:
    """
    Command-line entry point: synthesise the selected audiobook sections.

    Steps:
      1. Parse the CLI (``books``, ``--list``, ``--preview [CHARS]``).
      2. Resolve the requested labels against BOOKS; unknown labels print
         a message and return without generating anything.
      3. Split SOURCE_FILE into sections using *all* BOOKS entries so that
         section boundaries are correct even when only a subset is
         generated.
      4. Generate one .wav per selected section into OUTPUT_DIR, named
         ``<stem>_<voice>[_preview].<ext>``, printing an ETA derived from
         the cumulative chars/sec of the sections generated so far.
      5. Print a per-section timing summary.

    A non-positive ``--preview`` value is rejected through
    ``parser.error`` (argparse prints usage and exits with status 2).
    """
    # ── CLI ────────────────────────────────────────────────────────────
    parser = argparse.ArgumentParser(description="Generate Nem audiobook sections.")
    parser.add_argument(
        "books", nargs="*",
        help="Labels of sections to generate (default: all enabled books). "
             "Use --list to see available labels."
    )
    parser.add_argument(
        "--list", action="store_true",
        help="Print all enabled book labels and exit."
    )
    parser.add_argument(
        "--preview", nargs="?", const=3000, type=int, metavar="CHARS",
        help="Generate a short preview clip per book (default: 3000 chars). "
             "Output filenames get a _preview suffix."
    )
    args = parser.parse_args()

    # A limit of 0 is falsy and would silently run a *full* generation, and a
    # negative limit would truncate relative to the end of the text — reject both.
    if args.preview is not None and args.preview <= 0:
        parser.error("--preview CHARS must be a positive integer")
    preview_chars = args.preview   # None when preview mode is off

    enabled_labels = [label for label, _, _, _ in BOOKS]

    if args.list:
        print("Enabled books:")
        for label in enabled_labels:
            print(f"  {label}")
        return

    # Filter to requested subset, preserving BOOKS order
    if args.books:
        unknown = [b for b in args.books if b not in enabled_labels]
        if unknown:
            print(f"Unknown book label(s): {', '.join(unknown)}")
            print("Run with --list to see available labels.")
            return
        run_books = [b for b in BOOKS if b[0] in args.books]
    else:
        run_books = list(BOOKS)

    # Reported for information only; it is not passed on to the pipeline here.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Device: {device}")
    if device == "cuda":
        print(f"GPU:    {torch.cuda.get_device_name(0)}")

    OUTPUT_DIR.mkdir(exist_ok=True)

    print(f"\nSource: '{SOURCE_FILE}'"
          + (" ✓ (TTS fixed)" if SOURCE_FILE == _FIXED_FILE else
             " ⚠ (original — run 'Apply Fixes to Text' in the GUI to use phonetic fixes)"))
    # Always split using ALL books for correct section boundaries,
    # but only generate for run_books.
    sections = load_and_split(SOURCE_FILE, BOOKS)
    print(f"  Found {len(sections)} sections ({len(run_books)} selected).\n")

    print("Initialising Kokoro pipeline …")
    pipeline = KPipeline(lang_code=LANG_CODE)

    # Pre-compute (full, un-truncated) char counts for the selected sections
    section_chars: dict[str, int] = {
        label: len(clean_text(sections[label]))
        for label, _, _, _ in run_books
        if label in sections
    }

    # Print char count summary before starting
    preview_note = f"  ⚡ PREVIEW MODE — capped at {preview_chars:,} chars/book\n" if preview_chars else ""
    print(f"\n{preview_note}{'─' * 52}")
    print(f"  {'Section':<30}  {'Chars':>8}")
    print(f"{'─' * 52}")
    for label, *_ in run_books:
        if label in section_chars:
            print(f"  {label:<30}  {section_chars[label]:>8,}")
    print(f"{'─' * 52}")
    total_chars = sum(section_chars.values())
    print(f"  {'TOTAL':<30}  {total_chars:>8,}")
    print()

    chars_per_sec: float | None = None   # derived from the first book that finishes
    timing_rows: list[tuple[str, int, float]] = []  # (label, chars, elapsed)
    done_chars = 0        # running totals feeding the calibration rate
    done_elapsed = 0.0

    for label, _marker, voice, wav_name in run_books:
        # Sections whose marker was not found were already reported by
        # load_and_split; nothing to generate for them.
        if label not in sections:
            continue

        text = clean_text(sections[label])
        if not text:
            print(f"\n[{label}]  ⚠  Empty text — skipping")
            continue

        # Preview mode: truncate to requested char limit at a word boundary
        if preview_chars and len(text) > preview_chars:
            cut = text.rfind(" ", 0, preview_chars)
            text = text[: cut if cut > 0 else preview_chars]

        chars = len(text)

        # Build the real output path first so the progress line shows the
        # filename that is actually written (voice and preview tag included).
        stem, ext = wav_name.rsplit(".", 1)
        preview_tag = "_preview" if preview_chars else ""
        out_path = OUTPUT_DIR / f"{stem}_{voice}{preview_tag}.{ext}"

        # Print ETA once we have a calibration rate
        if chars_per_sec is not None:
            eta_str = _fmt_duration(chars / chars_per_sec)
            print(f"\n[{label}]  voice={voice}  →  {out_path.name}  (est. {eta_str})")
        else:
            print(f"\n[{label}]  voice={voice}  →  {out_path.name}  (timing calibration run)")

        elapsed = generate_audio(pipeline, text, voice, out_path)
        timing_rows.append((label, chars, elapsed))

        # Update calibration as a cumulative average after every book
        done_chars += chars
        done_elapsed += elapsed
        if done_elapsed > 0:
            chars_per_sec = done_chars / done_elapsed
            print(f"  ⏱  Calibration: {chars_per_sec:.0f} chars/sec")

    # ── Summary ────────────────────────────────────────────────────────────────
    print("\n" + "─" * 60)
    print(f"  {'Section':<30}  {'Chars':>7}  {'Actual':>8}  {'Est':>8}")
    print("─" * 60)
    prior_chars = 0
    prior_elapsed = 0.0
    for label, chars, elapsed in timing_rows:
        actual_str = _fmt_duration(elapsed)
        # Estimate using the cumulative rate *before* this book was added
        if prior_elapsed > 0:
            est_str = _fmt_duration(chars / (prior_chars / prior_elapsed))
        else:
            est_str = "(first run)"
        print(f"  {label:<30}  {chars:>7,}  {actual_str:>8}  {est_str:>8}")
        prior_chars += chars
        prior_elapsed += elapsed
    total_elapsed = sum(e for _, _, e in timing_rows)
    print("─" * 60)
    print(f"  {'TOTAL':<30}  {sum(c for _,c,_ in timing_rows):>7,}  {_fmt_duration(total_elapsed):>8}")
    print("\nDone.")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()