# audiobook_creator/create_temple_voices.py

"""
create_temple_voices.py
────────────────────────
Generate the "Sacred Temple Writings" section of the Nem audiobook using one
distinct Microsoft Edge neural TTS voice per character (NOT Kokoro).

Uses the free edge-tts library which streams Microsoft Azure neural voices.
Audio is stitched into a single WAV and saved to OUTPUT_DIR.

Usage:
    python create_temple_voices.py                    # full render
    python create_temple_voices.py --preview 40       # first 40 segments only
    python create_temple_voices.py --print-segments   # inspect parsed segments
    python create_temple_voices.py --list-voices      # list available en voices

Voice assignments live in CHARACTER_VOICES below — easy to customise.
Run  --list-voices  to discover all available edge-tts voice names.
"""

import argparse
import asyncio
import re
import subprocess
import time
from collections import Counter
from pathlib import Path

import numpy as np
import soundfile as sf
import edge_tts

# ── File / output config ───────────────────────────────────────────────────────
# Both candidate paths are relative, so they resolve against the current
# working directory.  The choice between them is made once, when this module
# is imported: the "TTS Fixed" text is used if it exists, otherwise the
# original text (whose existence is not checked here).
_FIXED_FILE  = Path("Audio Master Nem Full (TTS Fixed).txt")
_ORIG_FILE   = Path("Audio Master Nem Full.txt")
SOURCE_FILE  = _FIXED_FILE if _FIXED_FILE.exists() else _ORIG_FILE

# main() creates OUTPUT_DIR if needed and writes OUTPUT_FILE inside it for a
# full render (preview renders get their own file name in main()).
OUTPUT_DIR   = Path("output_temple_voices")
OUTPUT_FILE  = "sacred_temple_writings_multivoice.wav"

SAMPLE_RATE  = 24_000   # Hz — ffmpeg resamples every segment to this; also the final WAV rate
PAUSE_SAME   = 350      # ms silence between same-speaker segments
PAUSE_CHANGE = 650      # ms silence between different-speaker segments

# ── Section boundary markers (match create_audiobook_nem.py BOOKS order) ──────
#   Sacred Temple Writings starts at "THE SACRED" / "TEMPLE WRITINGS"
#   and ends just before "THE FIRST BOOK" / "OF SAMUEL THE LAMANITE"
#
#   Each boundary is a two-line heading.  extract_section() compares against
#   stripped, upper-cased source lines: the *_L1 marker must equal the whole
#   line, and the *_L2 marker only has to be a prefix of the following line.
_SEC_START_L1 = "THE SACRED"
_SEC_START_L2 = "TEMPLE WRITINGS"
_SEC_END_L1   = "THE FIRST BOOK"
_SEC_END_L2   = "OF SAMUEL THE LAMANITE"

# ── Character → edge-tts voice ────────────────────────────────────────────────
# Run  python create_temple_voices.py --list-voices  to see the available
# English voices.
# Keys are the speaker labels looked for at the start of a source line
# ("Label: text", optionally preceded by a number).  Matching is
# case-insensitive (see _speaker_regex) and parse_segments() normalises every
# match back to the spelling used here.
# Most roles have their own voice; a few deliberately or incidentally share
# one (noted inline).
CHARACTER_VOICES: dict[str, str] = {
    # ── Celestial beings ───────────────────────────────────────────────────────
    "Narrator":               "en-US-GuyNeural",         # calm neutral narrator (same voice as FALLBACK_VOICE)
    "Elohim Heavenly Mother": "en-US-JennyNeural",       # warm, wise matriarch
    "Elohim Heavenly Father": "en-US-AndrewMultilingualNeural",  # expressive, authoritative
    "Jehovah":                "en-US-AndrewNeural",      # clear, gentle divine
    "Angel of the Lord":      "en-US-BrianNeural",       # ethereal divine messenger
    "Holy Ghost":             "en-US-EricNeural",        # quiet, inward, spiritual
    "Holy Ghost Elders":      "en-US-BrianNeural",       # measured elder council (shares a voice with "Angel of the Lord")

    # ── Dark beings ────────────────────────────────────────────────────────────
    "Lucifer":                "en-CA-LiamNeural",        # smooth, persuasive tempter
    "Satan":                  "en-US-SteffanNeural",     # cold, commanding adversary

    # ── Mortal / earth characters ──────────────────────────────────────────────
    "Michael":                "en-US-RogerNeural",        # noble warrior archangel
    "Adam":                   "en-US-ChristopherNeural",  # earnest first man
    "Eve":                    "en-US-AriaNeural",        # curious, warm first woman

    # ── Apostles ───────────────────────────────────────────────────────────────
    "Peter":                  "en-GB-RyanNeural",        # firm British apostle
    "James":                  "en-AU-WilliamMultilingualNeural",  # steady Australian voice
    "John":                   "en-IE-ConnorNeural",      # gentle Irish apostle

    # ── Other roles ────────────────────────────────────────────────────────────
    "Preacher":               "en-US-AvaNeural",         # bold emphatic preacher
    "Mob":                    "en-US-MichelleNeural",    # crowd / multitude voice
    "The Voice of the Mob":   "en-US-MichelleNeural",   # alias used in some editions (same voice as "Mob")
}

# Voice used when a speaker label isn't found in CHARACTER_VOICES, and the
# voice render() retries with when synthesis with the assigned voice raises.
FALLBACK_VOICE = "en-US-GuyNeural"

# Lines/patterns that are ceremony stage-directions → read by Narrator
# The pattern is anchored at the start of the line only, so any line that
# merely *begins* with one of these alternatives counts as a stage direction
# (case-insensitively); parse_segments() then narrates the whole line.
# NOTE(review): parse_segments() tests _SKIP_RE before this pattern, so a line
# made only of underscores is skipped and never reaches the underscore
# alternative.  The box-drawing rule ("────") does not appear to be covered by
# _SKIP_RE, so such a line would be handed to the Narrator voice as text —
# confirm that is intended.
_STAGE_NARRATOR = re.compile(
    r"^(Break for Instruction|Resume Session|All\s+arise|"
    r"CHAPTER\s*\d*|________________+|────+)",
    re.IGNORECASE,
)

# Lines to skip entirely (decorative / empty)
# Matches a line consisting solely of em/en dashes, hyphens, underscores and
# whitespace — including the empty string.
_SKIP_RE = re.compile(r"^[—\-_\s\u2014\u2013]*$")


# ── Section extraction ─────────────────────────────────────────────────────────

def extract_section(source: Path) -> str:
    """Return the text of the Sacred Temple Writings section of *source*.

    The section opens at the two-line heading ``_SEC_START_L1`` /
    ``_SEC_START_L2`` and stops just before the two-line heading
    ``_SEC_END_L1`` / ``_SEC_END_L2``.  Both lines of the opening heading are
    part of the returned text, so the section title is kept in full.  If the
    closing heading never appears, everything up to the end of the file is
    returned.

    Heading comparison ignores case and surrounding whitespace: the first
    heading line must match its marker exactly, the second only has to start
    with its marker.

    Args:
        source: Path of the UTF-8 text file to read.

    Returns:
        The section's lines joined with ``"\\n"`` (original indentation kept).

    Raises:
        RuntimeError: if the opening heading cannot be found.
    """
    lines = source.read_text(encoding="utf-8").splitlines()

    def is_heading(idx: int, first: str, second: str) -> bool:
        """True when line *idx* equals *first* and the next line starts with *second*."""
        return (
            lines[idx].strip().upper() == first
            and idx + 1 < len(lines)
            and lines[idx + 1].strip().upper().startswith(second)
        )

    in_sec = False
    out: list[str] = []

    for i, line in enumerate(lines):
        if not in_sec:
            if is_heading(i, _SEC_START_L1, _SEC_START_L2):
                in_sec = True
                # Keep the first heading line as well.  The second one is
                # picked up by the next iteration; dropping this one would
                # leave the narrated title cut in half.
                out.append(line)
            continue

        # End just before the next section's heading.
        if is_heading(i, _SEC_END_L1, _SEC_END_L2):
            break
        out.append(line)

    if not out:
        raise RuntimeError(
            f"Could not locate 'Sacred Temple Writings' in '{source}'.\n"
            "Ensure the source file has a line exactly matching "
            f"'{_SEC_START_L1}' followed by '{_SEC_START_L2}'."
        )
    return "\n".join(out)


# ── Segment parser ─────────────────────────────────────────────────────────────

def _speaker_regex(characters: list[str]) -> re.Pattern:
    """Regex matching  [optional-number]  CharacterName:  text"""
    # Sort longest-first so "Holy Ghost Elders" matches before "Holy Ghost"
    names = sorted(characters, key=len, reverse=True)
    pat = "|".join(re.escape(n) for n in names)
    return re.compile(r"^\d*\s*(" + pat + r")\s*:\s*(.*)", re.IGNORECASE)


def parse_segments(text: str) -> list[tuple[str, str]]:
    """
    Split section text into ordered (speaker, spoken_text) pairs.

    Every line is stripped and then classified, in this order:
      1. blank / decorative (_SKIP_RE)         → dropped
      2. stage direction (_STAGE_NARRATOR)     → starts a new Narrator segment
      3. formatting note about coloured text   → dropped
      4. "Name: text" for a known character    → starts a new segment for that
                                                 character (canonical spelling)
      5. anything else                         → appended to the open segment

    Lines belonging to one segment are joined with single spaces.  Text that
    appears before any speaker label is attributed to "Narrator".
    """
    label_re = _speaker_regex(list(CHARACTER_VOICES.keys()))
    # lowercase → canonical spelling, used to normalise matched labels
    canonical: dict[str, str] = {name.lower(): name for name in CHARACTER_VOICES}
    # "The words of Jehovah … are in blue." — formatting note, not spoken
    note_re = re.compile(r"are in blue|words of jehovah", re.IGNORECASE)

    result: list[tuple[str, str]] = []
    pending: list[str] = []
    active = "Narrator"

    def emit(speaker: str) -> None:
        """Close the open segment for *speaker*, if it holds any text."""
        stripped = (piece.strip() for piece in pending)
        joined = " ".join(piece for piece in stripped if piece)
        if joined:
            result.append((speaker, joined))
        pending.clear()

    for line in map(str.strip, text.splitlines()):
        if not line:
            continue
        if _SKIP_RE.match(line):
            continue

        # Stage direction → Narrator reads it as the start of a new segment
        if _STAGE_NARRATOR.match(line):
            emit(active)
            active = "Narrator"
            pending.append(line)
            continue

        if note_re.search(line):
            continue

        hit = label_re.match(line)
        if hit is None:
            # Continuation of the current speaker (or unattributed prose)
            pending.append(line)
            continue

        emit(active)
        label = hit.group(1)
        remainder = hit.group(2).strip()
        active = canonical.get(label.lower(), label)
        if remainder:
            pending.append(remainder)

    emit(active)
    return result


# ── Audio generation ───────────────────────────────────────────────────────────

async def _tts_bytes(text: str, voice: str) -> bytes:
    """Synthesise *text* with the edge-tts *voice* and return the audio bytes.

    The stream yields typed chunks; only those whose type is "audio" carry
    sound data, and their payloads are concatenated in arrival order.
    """
    stream = edge_tts.Communicate(text, voice).stream()
    payloads = [
        chunk["data"]
        async for chunk in stream
        if chunk["type"] == "audio"
    ]
    return b"".join(payloads)


def _mp3_to_numpy(mp3: bytes) -> np.ndarray:
    """Decode MP3 bytes into a mono float32 array at SAMPLE_RATE via ffmpeg.

    The MP3 is piped to ffmpeg's stdin and raw little-endian float PCM is read
    back from its stdout.  A non-zero ffmpeg exit status raises
    subprocess.CalledProcessError.
    """
    base = ["ffmpeg", "-hide_banner", "-loglevel", "error"]
    source_opts = ["-i", "pipe:0"]              # read MP3 from stdin
    target_opts = [
        "-f", "f32le",                          # raw 32-bit little-endian float PCM
        "-acodec", "pcm_f32le",
        "-ac", "1",                             # mono
        "-ar", str(SAMPLE_RATE),                # resample to target rate
        "pipe:1",                               # write PCM to stdout
    ]
    completed = subprocess.run(
        [*base, *source_opts, *target_opts],
        input=mp3,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True,
    )
    samples = np.frombuffer(completed.stdout, dtype=np.float32)
    # Copy so the returned array owns its data instead of viewing the bytes.
    return samples.copy()


def _silence(ms: int) -> np.ndarray:
    """Return *ms* milliseconds of silence as float32 zeros at SAMPLE_RATE."""
    sample_count = int(SAMPLE_RATE * ms / 1000)
    return np.zeros(sample_count, dtype=np.float32)


async def render(
    segments: list[tuple[str, str]],
    preview: int | None = None,
) -> np.ndarray:
    """Synthesise every segment in order and return one float32 waveform.

    When *preview* is not None only ``segments[:preview]`` is rendered.
    Speakers missing from CHARACTER_VOICES use FALLBACK_VOICE and are flagged
    with "⚠" in the progress line.  If synthesis with the chosen voice raises,
    the segment is retried once with FALLBACK_VOICE.  A pause is inserted
    between consecutive segments: PAUSE_SAME ms when the speaker is unchanged,
    PAUSE_CHANGE ms otherwise.  An empty float32 array is returned when there
    is nothing to render.
    """
    queue = segments if preview is None else segments[:preview]
    total = len(queue)

    chunks: list[np.ndarray] = []
    previous: str | None = None
    started = time.monotonic()

    for position, (speaker, text) in enumerate(queue, start=1):
        known = speaker in CHARACTER_VOICES
        voice = CHARACTER_VOICES[speaker] if known else FALLBACK_VOICE
        flag = " " if known else "⚠"
        print(f"  {flag}[{position:>4}/{total}]  {speaker:<28}  {voice}")

        try:
            mp3 = await _tts_bytes(text, voice)
        except Exception as exc:
            print(f"       ↳ ERROR with '{voice}': {exc}  — falling back to {FALLBACK_VOICE}")
            mp3 = await _tts_bytes(text, FALLBACK_VOICE)

        audio = _mp3_to_numpy(mp3)

        # No leading pause before the very first segment.
        if chunks:
            pause_ms = PAUSE_CHANGE if speaker != previous else PAUSE_SAME
            chunks.append(_silence(pause_ms))
        chunks.append(audio)
        previous = speaker

    elapsed = time.monotonic() - started
    print(f"\n  ✓  {total} segments in {elapsed:.0f}s")
    if not chunks:
        return np.array([], dtype=np.float32)
    return np.concatenate(chunks)


# ── Voice listing ──────────────────────────────────────────────────────────────

async def _list_voices_async() -> None:
    """Fetch the edge-tts voice list and print the English ("en-*") entries.

    Rows are ordered by locale, then by short name, and followed by a count.
    """
    catalogue = await edge_tts.list_voices()
    english = [entry for entry in catalogue if entry["Locale"].startswith("en-")]
    english.sort(key=lambda entry: (entry["Locale"], entry["ShortName"]))

    header = f"\n  {'Locale':<12}  {'Short Name':<45}  Gender"
    rule = "  " + "─" * 68
    print(header)
    print(rule)
    for entry in english:
        locale = entry["Locale"]
        short_name = entry["ShortName"]
        gender = entry["Gender"]
        print(f"  {locale:<12}  {short_name:<45}  {gender}")
    print(f"\n  {len(english)} English voices total.")


# ── CLI / main ─────────────────────────────────────────────────────────────────

def main() -> None:
    """Run the command-line interface.

    Flags:
        --list-voices     print the English edge-tts voices and exit.
        --print-segments  print the parsed (speaker, text) segments and exit.
        --preview N       render only the first N segments; N must be >= 1.

    Without an early-exit flag the section is extracted from SOURCE_FILE,
    parsed, summarised per speaker, rendered, and written as a WAV file under
    OUTPUT_DIR (OUTPUT_FILE for a full render, a preview-specific name
    otherwise).
    """
    ap = argparse.ArgumentParser(
        description="Render Sacred Temple Writings with per-character edge-tts voices."
    )
    ap.add_argument("--list-voices", action="store_true",
                    help="Print all available English edge-tts voices and exit.")
    ap.add_argument("--print-segments", action="store_true",
                    help="Print parsed (speaker, text) segments and exit.")
    ap.add_argument("--preview", type=int, metavar="N",
                    help="Render only the first N segments (quick test).")
    args = ap.parse_args()

    # render() slices with segments[:N].  Without this check N == 0 would
    # render nothing while still targeting the full output file, and a
    # negative N would silently mean "all but the last |N| segments".
    if args.preview is not None and args.preview < 1:
        ap.error("--preview N must be a positive integer")
    is_preview = args.preview is not None

    if args.list_voices:
        asyncio.run(_list_voices_async())
        return

    # ── Extract & parse ────────────────────────────────────────────────────────
    print(f"Source : {SOURCE_FILE}")
    text = extract_section(SOURCE_FILE)
    print(f"Section: {len(text):,} chars extracted\n")

    segments = parse_segments(text)

    if args.print_segments:
        print(f"Parsed {len(segments)} segments:\n")
        for i, (spkr, txt) in enumerate(segments, 1):
            snippet = txt[:90] + ("…" if len(txt) > 90 else "")
            voice = CHARACTER_VOICES.get(spkr, f"{FALLBACK_VOICE} ⚠")
            print(f"  {i:>4}.  [{spkr}]  ({voice})\n        {snippet}\n")
        return

    # ── Summary table ──────────────────────────────────────────────────────────
    counts = Counter(s for s, _ in segments)
    unrecognised = {s for s in counts if s not in CHARACTER_VOICES}

    print(f"Parsed {len(segments)} segments across {len(counts)} speakers:\n")
    print(f"  {'Speaker':<28}  {'Segs':>5}  {'Voice'}")
    print(f"  {'─'*28}  {'─'*5}  {'─'*45}")
    # Known speakers in CHARACTER_VOICES order, then any unknown ones sorted.
    for spkr, voice in CHARACTER_VOICES.items():
        if counts[spkr]:
            print(f"  {spkr:<28}  {counts[spkr]:>5}  {voice}")
    for spkr in sorted(unrecognised):
        print(f"  {spkr:<28}  {counts[spkr]:>5}  {FALLBACK_VOICE}  ⚠ unrecognised")

    total_chars = sum(len(t) for _, t in segments)
    print(f"\n  Total chars: {total_chars:,}")
    if is_preview:
        print(f"  ⚡ PREVIEW MODE — rendering first {args.preview} segments only")

    # ── GPU note ───────────────────────────────────────────────────────────────
    # edge-tts is cloud-based (Microsoft Azure neural, free) — GPU not used.
    print("\nNote: edge-tts uses Microsoft's servers (free, no API key needed).\n"
          "      Render speed depends on your internet connection.\n")

    # ── Render ─────────────────────────────────────────────────────────────────
    OUTPUT_DIR.mkdir(exist_ok=True)
    out_path = OUTPUT_DIR / (
        f"sacred_temple_writings_preview{args.preview}.wav"
        if is_preview else OUTPUT_FILE
    )

    print("Rendering segments …\n")
    audio = asyncio.run(render(segments, args.preview))

    if audio.size > 0:
        sf.write(str(out_path), audio, SAMPLE_RATE)
        dur = len(audio) / SAMPLE_RATE
        m, s = divmod(int(dur), 60)
        print(f"\n✓  Saved '{out_path}'  ({m}m {s:02d}s audio  |  {SAMPLE_RATE} Hz)")
    else:
        print("✗  No audio produced — check parsing with --print-segments")


if __name__ == "__main__":
    main()