audiobook_creator/create_audiobook_nem.py

"""
audiobook_nem.py
────────────────
Generate the Book of the Nem audiobook — one unique voice per book/section.

Usage:
    python audiobook_nem.py

To skip a section, comment out its entry in BOOKS below.
Output .wav files are written to OUTPUT_DIR (created automatically).
"""

import re
import time
import numpy as np
import soundfile as sf
import torch
from pathlib import Path
from kokoro import KPipeline

# ── Config ─────────────────────────────────────────────────────────────────────
_FIXED_FILE   = Path("Audio Master Nem Full (TTS Fixed).txt")
_ORIG_FILE    = Path("Audio Master Nem Full.txt")
SOURCE_FILE   = _FIXED_FILE if _FIXED_FILE.exists() else _ORIG_FILE
OUTPUT_DIR    = Path("output_audiobook")
SAMPLE_RATE   = 24000
SPEED         = 1.0
LANG_CODE     = "a"   # 'a' = American English

# ── Available Kokoro voices (American English, lang_code='a') ──────────────────
#   af_heart   – warm American female      [downloaded]
#   af_nicole  – American female             [downloaded]
#   am_adam    – American male (deep)        [downloaded]
#   am_echo    – American male               [downloaded]
#   am_eric    – American male               [downloaded]
#   am_fenrir  – American male               [downloaded]
#   am_liam    – American male               [downloaded]
#   am_michael – American male (clear)       [downloaded]
#   am_onyx    – American male               [downloaded]
#   am_puck    – American male               [downloaded]
#   am_santa   – American male               [downloaded] (not used)

# ── Book definitions ───────────────────────────────────────────────────────────
# Format: (label, start_marker, voice, output_wav)
#   start_marker – exact text of the FIRST line of the section header in the source
#                  (leading/trailing whitespace is ignored when matching)
#   voice        – Kokoro voice name
#   output_wav   – filename saved inside OUTPUT_DIR
#
# Comment out any line to skip that section entirely.
BOOKS = [
    # label                       start_marker                       voice         output_wav
    ("Introduction",              "Introduction",                    "af_heart",   "00_introduction.wav"),
    ("Book of Hagoth",            "THE BOOK OF HAGOTH",              "am_fenrir",  "01_hagoth.wav"),
    # ("Shi-Tugo I",                "THE FIRST BOOK OF SHI-TUGO",      "am_eric",    "02_shi_tugo_1.wav"),
    # ("Sanempet",                  "THE BOOK OF SANEMPET",            "am_liam",    "03_sanempet.wav"),
    # ("Oug",                       "THE BOOK OF OUG",                 "am_michael", "04_oug.wav"),
    # ("Temple Writings of Oug",    "THE BOOK OF",                     "am_michael", "05_temple_writings_oug.wav"),
    # ("Sacred Temple Writings",    "THE SACRED",                      "am_michael", "06_sacred_temple_writings.wav"),
    # ("Samuel the Lamanite I",     "THE FIRST BOOK",                  "am_echo",    "07_samuel_lamanite_1.wav"),
    # ("Samuel the Lamanite II",    "THE SECOND BOOK",                 "am_echo",    "08_samuel_lamanite_2.wav"),
    # ("Manti",                     "THE BOOK OF MANTI",               "am_onyx",    "09_manti.wav"),
    # ("Pa Nat I",                  "THE FIRST BOOK OF PA NAT",        "af_nicole",  "10_pa_nat_1.wav"),
    # ("Moroni I",                  "THE FIRST BOOK OF MORONI",        "am_adam",    "11_moroni_1.wav"),
    # ("Moroni II",                 "THE SECOND BOOK OF MORONI",       "am_adam",    "12_moroni_2.wav"),
    # ("Moroni III",                "THE THIRD BOOK OF MORONI",        "am_adam",    "13_moroni_3.wav"),
    # ("Shioni",                    "THE BOOK OF SHIONI",              "am_puck",    "14_shioni.wav"),
]

# ── Helpers ────────────────────────────────────────────────────────────────────

def load_and_split(source: Path, books: list) -> dict[str, str]:
    """
    Read the source file and split it into sections keyed by label.
    Each section starts at its start_marker line and ends just before the
    next section's start_marker.
    """
    raw_lines = source.read_text(encoding="utf-8").splitlines()

    # Build a mapping: marker_text → index in BOOKS
    markers = [(label, marker.strip()) for label, marker, _, _ in books]

    # Find the line index of each marker's first occurrence
    marker_positions: list[tuple[int, int]] = []   # (line_idx, books_idx)
    for book_idx, (label, marker) in enumerate(markers):
        for line_idx, line in enumerate(raw_lines):
            if line.strip() == marker:
                marker_positions.append((line_idx, book_idx))
                break
        else:
            print(f"  ⚠  Marker not found for '{label}': '{marker}' — skipping")

    marker_positions.sort(key=lambda x: x[0])

    sections: dict[str, str] = {}
    for rank, (line_idx, book_idx) in enumerate(marker_positions):
        label = markers[book_idx][0]
        if rank + 1 < len(marker_positions):
            end_line = marker_positions[rank + 1][0]
        else:
            end_line = len(raw_lines)
        text = "\n".join(raw_lines[line_idx:end_line]).strip()
        sections[label] = text

    return sections


def clean_text(text: str) -> str:
    """
    Strip formatting artifacts, underscores, and normalise whitespace
    so the TTS receives clean prose.
    """
    # Remove lines that are pure underscores (horizontal rules)
    text = re.sub(r"^_{3,}\s*$", "", text, flags=re.MULTILINE)
    # Remove leading chapter headers that are all-caps lines
    # (keep them as natural spoken title for context)
    # Collapse excess blank lines
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def _fmt_duration(seconds: float) -> str:
    """Format seconds as 'Xm Ys' or 'Xs'."""
    if seconds >= 60:
        m, s = divmod(int(seconds), 60)
        return f"{m}m {s:02d}s"
    return f"{seconds:.0f}s"


def generate_audio(pipeline: KPipeline, text: str, voice: str,
                   output_path: Path) -> float:
    """Generate audio and return wall-clock seconds elapsed."""
    t0 = time.monotonic()
    chunks = []
    for _, _, chunk_audio in pipeline(text, voice=voice, speed=SPEED):
        if hasattr(chunk_audio, "numpy"):
            chunk_audio = chunk_audio.cpu().numpy()
        chunk_audio = np.atleast_1d(chunk_audio.squeeze())
        if chunk_audio.size > 0:
            chunks.append(chunk_audio)

    if chunks:
        audio = np.concatenate(chunks, axis=0)
        sf.write(str(output_path), audio, SAMPLE_RATE)
        elapsed = time.monotonic() - t0
        duration = len(audio) / SAMPLE_RATE
        print(f"  ✓  Saved '{output_path.name}'  ({duration:.1f}s audio  |  {elapsed:.1f}s wall-clock)")
    else:
        elapsed = time.monotonic() - t0
        print(f"  ✗  No audio produced for voice='{voice}'")
    return elapsed


# ── Main ───────────────────────────────────────────────────────────────────────

def main() -> None:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Device: {device}")
    if device == "cuda":
        print(f"GPU:    {torch.cuda.get_device_name(0)}")

    OUTPUT_DIR.mkdir(exist_ok=True)

    print(f"\nSource: '{SOURCE_FILE}'"
          + (" ✓ (TTS fixed)" if SOURCE_FILE == _FIXED_FILE else
             " ⚠ (original — run 'Apply Fixes to Text' in the GUI to use phonetic fixes)"))
    sections = load_and_split(SOURCE_FILE, BOOKS)
    print(f"  Found {len(sections)} sections.\n")

    print("Initialising Kokoro pipeline …")
    pipeline = KPipeline(lang_code=LANG_CODE)

    # Pre-compute char counts for all sections so we can estimate ETAs
    section_chars: dict[str, int] = {
        label: len(clean_text(sections[label]))
        for label, _, _, _ in BOOKS
        if label in sections
    }

    chars_per_sec: float | None = None   # derived from the first book that finishes
    timing_rows: list[tuple[str, int, float]] = []  # (label, chars, elapsed)

    for label, marker, voice, wav_name in BOOKS:
        if label not in sections:
            continue

        text = clean_text(sections[label])
        if not text:
            print(f"\n[{label}]  ⚠  Empty text — skipping")
            continue

        chars = section_chars[label]

        # Print ETA once we have a calibration rate
        if chars_per_sec is not None:
            eta_sec = chars / chars_per_sec
            eta_str = _fmt_duration(eta_sec)
            print(f"\n[{label}]  voice={voice}  →  {wav_name}  (est. {eta_str})")
        else:
            print(f"\n[{label}]  voice={voice}  →  {wav_name}  (timing calibration run)")

        stem, ext = wav_name.rsplit(".", 1)
        out_path = OUTPUT_DIR / f"{stem}_{voice}.{ext}"
        elapsed = generate_audio(pipeline, text, voice, out_path)
        timing_rows.append((label, chars, elapsed))

        # Calibrate from first completed book
        if chars_per_sec is None and elapsed > 0:
            chars_per_sec = chars / elapsed
            print(f"  ⏱  Calibrated: {chars_per_sec:.0f} chars/sec")

    # ── Summary ────────────────────────────────────────────────────────────────
    print("\n" + "─" * 60)
    print(f"  {'Section':<30}  {'Chars':>7}  {'Actual':>8}  {'Est':>8}")
    print("─" * 60)
    for i, (label, chars, elapsed) in enumerate(timing_rows):
        actual_str = _fmt_duration(elapsed)
        if i == 0 or chars_per_sec is None:
            est_str = "(calibration)"
        else:
            est_str = _fmt_duration(chars / chars_per_sec)
        print(f"  {label:<30}  {chars:>7,}  {actual_str:>8}  {est_str:>8}")
    total_elapsed = sum(e for _, _, e in timing_rows)
    print("─" * 60)
    print(f"  {'TOTAL':<30}  {sum(c for _,c,_ in timing_rows):>7,}  {_fmt_duration(total_elapsed):>8}")
    print("\nDone.")


if __name__ == "__main__":
    main()