# audiobook_creator/generate_proper_noun_audio.py

"""
generate_proper_noun_audio.py
──────────────────────────────
Read proper_nouns.txt, generate a short TTS audio clip for every entry
using am_michael, and save a JSON manifest for the GUI.

Outputs:
    proper_nouns_audio/<slug>.wav    – one wav per entry
    output_proper_nouns/manifest.json – { "Word" : "slug.wav", … }

Already-generated files are skipped, so re-runs are fast.

Run:
    .venv/bin/python generate_proper_noun_audio.py
"""

import json
import re
import sys
import numpy as np
import soundfile as sf
import torch
from pathlib import Path
from kokoro import KPipeline

# Module-level configuration. All paths are relative, so they resolve against
# the current working directory at run time.

# Input listing read by parse_entries().
PROPER_NOUNS_FILE = Path("proper_nouns.txt")
DATA_DIR          = Path("output_proper_nouns")   # JSON files — tracked in git
AUDIO_DIR         = Path("proper_nouns_audio")     # WAV files — not tracked
# Maps each entry to its WAV file name; loaded at start-up to skip finished work.
MANIFEST_FILE     = DATA_DIR / "manifest.json"
# Voice name passed to the Kokoro pipeline for every clip.
VOICE             = "am_michael"
# Sample rate (Hz) written into each WAV header by sf.write().
# NOTE(review): presumably matches Kokoro's native output rate — confirm.
SAMPLE_RATE       = 24000
# Speed value passed to the pipeline call.
SPEED             = 1.0

# ── Parse proper_nouns.txt ─────────────────────────────────────────────────────

def parse_entries(path: Path) -> list[tuple[str, str]]:
    """Extract ``(category, entry)`` pairs from the proper-nouns listing.

    Each line is classified using its whitespace-stripped text:

    * blank lines and rule lines starting with ``=`` or ``─`` are ignored;
    * a line of capitals, spaces and ``&`` followed by a parenthesised
      count, e.g. ``PEOPLE & CHARACTERS  (301)``, opens a new category
      named after the title-cased text before the last ``(``;
    * lines starting with ``TOTAL:``, ``Review this``, ``Each entry`` or
      ``PROPER NOUNS`` are boilerplate and ignored;
    * any other line whose raw text starts with two spaces is an entry of
      the current category (``Uncategorised`` until a header is seen).

    The file is read as UTF-8.
    """
    category_header = re.compile(r"^[A-Z &]+\s+\(\d+\)$")
    rule_prefixes = ("=", "─")
    boilerplate_prefixes = ("TOTAL:", "Review this", "Each entry", "PROPER NOUNS")

    category = "Uncategorised"
    pairs: list[tuple[str, str]] = []

    for raw in path.read_text(encoding="utf-8").splitlines():
        text = raw.strip()
        if not text or text.startswith(rule_prefixes):
            continue
        if category_header.match(text):
            # Header test runs before the boilerplate test on purpose: a
            # line matching both is treated as a category header.
            category = text.rsplit("(", 1)[0].strip().title()
        elif not text.startswith(boilerplate_prefixes) and raw.startswith("  "):
            # Real entries are indented by two spaces in the source file.
            pairs.append((category, text))

    return pairs


def slugify(text: str) -> str:
    """Convert an entry into a lowercase, filesystem-friendly slug.

    Example: ``'Hagoth-II foo'`` → ``'hagoth_ii_foo'``.

    Accented letters are folded to their base letter before slugging
    (``'José'`` → ``'jose'``) instead of being dropped, so the file name
    stays recognisable. Every remaining run of characters outside
    ``a-z0-9`` becomes a single ``_``, and leading/trailing underscores are
    removed. Pure-ASCII input produces the same slug as before.

    Returns an empty string when ``text`` contains no letters or digits
    that survive this process.
    """
    # Function-scope import keeps this block self-contained.
    import unicodedata

    # NFKD splits "é" into "e" + a combining accent; dropping the combining
    # marks leaves the plain base letter.
    decomposed = unicodedata.normalize("NFKD", text)
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    return re.sub(r"[^a-z0-9]+", "_", folded.lower()).strip("_")


# ── TTS generation ─────────────────────────────────────────────────────────────

def generate(pipeline: KPipeline, text: str, out_path: Path) -> bool:
    """Synthesise ``text`` and write the result to ``out_path`` as a WAV file.

    The text is passed to the pipeline exactly as given (no carrier phrase
    is added), using the module-level ``VOICE`` and ``SPEED``. All non-empty
    audio chunks the pipeline yields are concatenated and written at
    ``SAMPLE_RATE``.

    Args:
        pipeline: Callable Kokoro pipeline; each yielded item unpacks into
            three values, the third being the audio chunk.
        text: The word or phrase to speak.
        out_path: Destination WAV path.

    Returns:
        True if audio was written, False if the pipeline produced no
        samples (in which case nothing is written).
    """
    chunks = []
    for _, _, chunk in pipeline(text, voice=VOICE, speed=SPEED):
        if chunk is None:
            # A result may carry no audio; skip it rather than crash on
            # the attribute access below.
            continue
        if hasattr(chunk, "numpy"):
            # Tensor-like chunk (e.g. torch): move to CPU and convert.
            chunk = chunk.cpu().numpy()
        # atleast_1d keeps a single-sample chunk concatenable after squeeze.
        chunk = np.atleast_1d(np.asarray(chunk).squeeze())
        if chunk.size > 0:
            chunks.append(chunk)

    if not chunks:
        return False

    sf.write(str(out_path), np.concatenate(chunks), SAMPLE_RATE)
    return True


# ── Main ───────────────────────────────────────────────────────────────────────

def main() -> None:
    """Generate a WAV clip for every proper-noun entry and save the manifest.

    Steps:
      1. Report the available compute device.
      2. Create the output directories if they are missing.
      3. Parse ``PROPER_NOUNS_FILE`` and load any existing manifest.
      4. For each entry not already recorded in the manifest with its WAV
         on disk, synthesise the audio and record it.
      5. Write the manifest back to ``MANIFEST_FILE``.

    The manifest is written in a ``finally`` clause, so progress made
    before an interruption (Ctrl-C, a synthesis error) is kept and the next
    run can skip the clips already generated. It is read and written as
    UTF-8 explicitly because it is dumped with ``ensure_ascii=False`` and
    must not depend on the platform's default encoding.
    """
    # Informational only: the pipeline below is built without an explicit
    # device argument.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Device: {device}")
    if device == "cuda":
        print(f"GPU:    {torch.cuda.get_device_name(0)}")

    DATA_DIR.mkdir(exist_ok=True)
    AUDIO_DIR.mkdir(exist_ok=True)

    print(f"Parsing '{PROPER_NOUNS_FILE}' …")
    entries = parse_entries(PROPER_NOUNS_FILE)
    print(f"  {len(entries)} entries found.\n")

    # Load existing manifest so we can skip already-done words
    manifest: dict = {}
    if MANIFEST_FILE.exists():
        manifest = json.loads(MANIFEST_FILE.read_text(encoding="utf-8"))

    print("Initialising Kokoro pipeline …")
    pipeline = KPipeline(lang_code="a")

    skipped = 0
    generated = 0
    failed = 0
    total = len(entries)

    try:
        for position, (_category, entry) in enumerate(entries, start=1):
            wav_name = f"{slugify(entry)}.wav"
            wav_path = AUDIO_DIR / wav_name

            # Both conditions are required: a WAV without a manifest entry
            # may be a partial write from an interrupted run.
            if entry in manifest and wav_path.exists():
                skipped += 1
                continue

            # "\r" rewrites the same console line as a progress indicator.
            sys.stdout.write(f"\r[{position}/{total}] {entry[:55]:<55}")
            sys.stdout.flush()

            if generate(pipeline, entry, wav_path):
                manifest[entry] = wav_name
                generated += 1
            else:
                print(f"\n  ✗  Failed: {entry}")
                failed += 1

        print(f"\n\nDone.  generated={generated}  skipped={skipped}  failed={failed}")
    finally:
        # Persist whatever was completed, even if the loop was interrupted.
        MANIFEST_FILE.write_text(
            json.dumps(manifest, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        print(f"Manifest saved → '{MANIFEST_FILE}'")


if __name__ == "__main__":
    main()