Files
audiobook_creator/create_audiobook_nem.py

318 lines
15 KiB
Python
Raw Normal View History

"""
create_audiobook_nem.py
Generate the Book of the Nem audiobook, one voice per book/section
(voices are assigned in BOOKS below; several related books share a voice).

Usage:
    python create_audiobook_nem.py                      # all enabled books
    python create_audiobook_nem.py --list               # list available book labels
    python create_audiobook_nem.py Introduction
    python create_audiobook_nem.py "Book of Hagoth"
    python create_audiobook_nem.py Introduction "Book of Hagoth"
    python create_audiobook_nem.py Introduction --preview   # short preview clip

To permanently skip a section, comment out its entry in BOOKS below.
Output .wav files are written to OUTPUT_DIR (created automatically).
"""
2026-02-26 12:09:43 -07:00
import argparse
import re
2026-02-26 00:57:40 -07:00
import time
import numpy as np
import soundfile as sf
import torch
from pathlib import Path
from kokoro import KPipeline
# ── Config ─────────────────────────────────────────────────────────────────────
2026-02-25 11:37:35 -07:00
# Input text. Both paths are relative to the current working directory.
# The phonetically corrected transcript is preferred whenever it is present;
# otherwise the untouched original is used.
_ORIG_FILE = Path("Audio Master Nem Full.txt")
_FIXED_FILE = Path("Audio Master Nem Full (TTS Fixed).txt")
if _FIXED_FILE.exists():
    SOURCE_FILE = _FIXED_FILE
else:
    SOURCE_FILE = _ORIG_FILE

OUTPUT_DIR = Path("output_audiobook")  # destination directory for generated .wav files
SAMPLE_RATE = 24000  # sample rate handed to soundfile when writing each .wav
SPEED = 1.0          # `speed` argument forwarded to the Kokoro pipeline
LANG_CODE = "a"      # 'a' = American English
# ── Available Kokoro voices (American English, lang_code='a') ──────────────────
2026-03-09 23:36:50 -06:00
# af_bella American female [downloaded]
# af_heart warm American female [downloaded]
# af_nicole American female [downloaded]
2026-03-09 23:36:50 -06:00
# af_river American female [downloaded]
# af_sarah American female [downloaded]
# af_sky American female [downloaded]
# am_adam American male (deep) [downloaded]
# am_echo American male [downloaded]
# am_eric American male [downloaded]
# am_fenrir American male [downloaded]
# am_liam American male [downloaded]
# am_michael American male (clear) [downloaded]
# am_onyx American male [downloaded]
# am_puck American male [downloaded]
# am_santa American male [downloaded] (used for "Book of Hagoth")
# ── Book definitions ───────────────────────────────────────────────────────────
2026-02-26 12:09:43 -07:00
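# Format: (label, (start_line1, start_line2), voice, output_wav)
# label        name used on the command line to select the section
# start_line1  text of the FIRST line of the section header
#              (compared case-insensitively, ignoring surrounding whitespace)
# start_line2  prefix of the SECOND line (used together for unambiguous matching)
# voice        Kokoro voice name
# output_wav   base filename inside OUTPUT_DIR; the voice name (and "_preview"
#              in preview mode) is inserted before the extension when saving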
#
# Comment out any line to skip that section entirely.
# One entry per audiobook section, in reading order. Each entry is
# (label, (start_line1, start_line2), voice, output_wav).
BOOKS = [
    # label                    (start_line1, start_line2)                                  voice         output_wav
    ("Introduction",           ("Introduction", "The Book of the Nem"),                    "af_heart",   "00_introduction.wav"),
    ("Book of Hagoth",         ("THE BOOK OF HAGOTH", "THE SON OF HAGMENI,"),              "am_santa",   "01_hagoth.wav"),
    ("Shi-Tugo I",             ("THE FIRST BOOK OF SHI-TUGO", "FORMER WARRIOR, AMMONITE"), "am_eric",    "02_shi_tugo_1.wav"),
    ("Sanempet",               ("THE BOOK OF SANEMPET", "THE SON OF HAGMENI,"),            "am_liam",    "03_sanempet.wav"),
    ("Oug",                    ("THE BOOK OF OUG", "THE SON OF SANEMPET"),                 "am_michael", "04_oug.wav"),
    ("Temple Writings of Oug", ("THE BOOK OF", "THE TEMPLE WRITINGS"),                     "am_michael", "05_temple_writings_oug.wav"),
    ("Sacred Temple Writings", ("THE SACRED", "TEMPLE WRITINGS"),                          "am_michael", "06_sacred_temple_writings.wav"),
    ("Samuel the Lamanite I",  ("THE FIRST BOOK", "OF SAMUEL THE LAMANITE"),               "am_echo",    "07_samuel_lamanite_1.wav"),
    ("Samuel the Lamanite II", ("THE SECOND BOOK", "OF SAMUEL THE LAMANITE"),              "am_echo",    "08_samuel_lamanite_2.wav"),
    ("Manti",                  ("THE BOOK OF MANTI", "THE SON OF OUG"),                    "am_onyx",    "09_manti.wav"),
    ("Pa Nat I",               ("THE FIRST BOOK OF PA NAT", "THE DAUGHTER OF SHIMLEI"),    "af_bella",   "10_pa_nat_1.wav"),
    ("Moroni I",               ("THE FIRST BOOK OF MORONI", "THE SON OF MORMON,"),         "am_adam",    "11_moroni_1.wav"),
    ("Moroni II",              ("THE SECOND BOOK OF MORONI", "THE SON OF MORMON,"),        "am_adam",    "12_moroni_2.wav"),
    ("Moroni III",             ("THE THIRD BOOK OF MORONI", "THE SON OF MORMON,"),         "am_adam",    "13_moroni_3.wav"),
    ("Shioni",                 ("THE BOOK OF SHIONI", "THE SON OF MORONI"),                "am_puck",    "14_shioni.wav"),
]
# ── Helpers ────────────────────────────────────────────────────────────────────
def load_and_split(source: Path, books: list) -> dict[str, str]:
    """
    Read the source file and split it into sections keyed by label.

    Each section starts at its (start_line1, start_line2) marker pair and
    ends just before the next section's marker (or at end of file for the
    last one).

    Marker positions are detected from the *original* unmodified file
    (_ORIG_FILE) when it exists, so that phonetic fixes applied to section
    headings in the TTS-fixed file cannot break section detection.  This
    relies on both files having the same number of lines (word-level
    replacements never add or remove lines); a warning is printed when the
    counts differ, because the returned sections would then be misaligned.

    Args:
        source: File whose text is returned in the sections.
        books:  Sequence of (label, (start_line1, start_line2), voice,
                output_wav) tuples; only label and the marker pair are used.

    Returns:
        Mapping of label -> stripped section text.  Books whose marker pair
        is not found are reported on stdout and omitted from the mapping.
    """
    # The content actually returned always comes from `source`.
    content_lines = source.read_text(encoding="utf-8").splitlines()

    # Marker detection uses the original (un-fixed) file when available; when
    # it is absent the already-loaded content is reused instead of re-reading.
    if _ORIG_FILE.exists():
        marker_lines = _ORIG_FILE.read_text(encoding="utf-8").splitlines()
    else:
        marker_lines = content_lines

    if len(marker_lines) != len(content_lines):
        print(f" ⚠ Line count mismatch: '{_ORIG_FILE}' has {len(marker_lines)} lines, "
              f"'{source}' has {len(content_lines)} — section boundaries may be misaligned")

    # Normalise every line once instead of once per book.
    normalised = [line.strip().upper() for line in marker_lines]

    # (label, line1, line2) for each book
    markers = [(label, m[0].strip(), m[1].strip()) for label, m, _, _ in books]

    # First occurrence of each two-line marker: (line_idx, books_idx)
    marker_positions: list[tuple[int, int]] = []
    last_start = len(normalised) - 1  # a marker needs a following line
    for book_idx, (label, m1, m2) in enumerate(markers):
        want_first, want_second = m1.upper(), m2.upper()
        for line_idx in range(last_start):
            if (normalised[line_idx] == want_first
                    and normalised[line_idx + 1].startswith(want_second)):
                marker_positions.append((line_idx, book_idx))
                break
        else:
            print(f" ⚠ Marker not found for '{label}': '{m1}' / '{m2}' — skipping")

    # Sections are delimited in file order, regardless of the order in `books`.
    marker_positions.sort(key=lambda x: x[0])

    sections: dict[str, str] = {}
    for rank, (line_idx, book_idx) in enumerate(marker_positions):
        if rank + 1 < len(marker_positions):
            end_line = marker_positions[rank + 1][0]
        else:
            end_line = len(content_lines)
        label = markers[book_idx][0]
        sections[label] = "\n".join(content_lines[line_idx:end_line]).strip()
    return sections
def clean_text(text: str) -> str:
    """
    Prepare a section's raw text for the TTS engine.

    Lines consisting only of three or more underscores (horizontal rules)
    are blanked, runs of three or more newlines are collapsed to a single
    blank line, and leading/trailing whitespace is trimmed.  All-caps
    chapter headers are deliberately left in place so they are spoken as
    titles.
    """
    rule_line = re.compile(r"^_{3,}\s*$", flags=re.MULTILINE)
    blank_run = re.compile(r"\n{3,}")

    without_rules = rule_line.sub("", text)
    collapsed = blank_run.sub("\n\n", without_rules)
    return collapsed.strip()
2026-02-26 00:57:40 -07:00
def _fmt_duration(seconds: float) -> str:
    """Format seconds as 'Xm YYs' (one minute or more) or 'Xs' (under a minute).

    Durations of a minute or more are truncated to whole seconds; shorter
    durations are rounded to the nearest second.  A value that rounds up to
    a full minute (e.g. 59.6) is reported as '1m 00s' rather than '60s'.
    """
    if seconds >= 60:
        minutes, secs = divmod(int(seconds), 60)
        return f"{minutes}m {secs:02d}s"

    rounded = f"{seconds:.0f}"
    if rounded == "60":
        # Rounding pushed a sub-minute value over the boundary.
        return "1m 00s"
    return f"{rounded}s"
def generate_audio(pipeline: KPipeline, text: str, voice: str,
                   output_path: Path) -> float:
    """Synthesise *text* with *voice*, write it to *output_path*, and time it.

    The pipeline is called with the module-level SPEED and its results are
    iterated as 3-tuples whose last element is the audio chunk.  Non-empty
    chunks are concatenated and written with soundfile at SAMPLE_RATE.  If
    no audio is produced, nothing is written and a message is printed.

    Args:
        pipeline:    Kokoro pipeline (any callable accepting
                     ``(text, voice=..., speed=...)`` works).
        text:        Text to synthesise.
        voice:       Kokoro voice name.
        output_path: Destination .wav path.

    Returns:
        Wall-clock seconds elapsed, measured with ``time.monotonic``.
    """
    t0 = time.monotonic()

    chunks = []
    for _, _, chunk_audio in pipeline(text, voice=voice, speed=SPEED):
        if chunk_audio is None:
            # A result carrying no audio would crash on .squeeze() below.
            continue
        if hasattr(chunk_audio, "numpy"):
            # Tensor-like chunk: move to host memory and convert to ndarray.
            chunk_audio = chunk_audio.cpu().numpy()
        # squeeze() may yield a 0-d array; atleast_1d keeps it concatenable.
        chunk_audio = np.atleast_1d(chunk_audio.squeeze())
        if chunk_audio.size > 0:
            chunks.append(chunk_audio)

    if not chunks:
        elapsed = time.monotonic() - t0
        print(f" ✗ No audio produced for voice='{voice}'")
        return elapsed

    audio = np.concatenate(chunks, axis=0)
    sf.write(str(output_path), audio, SAMPLE_RATE)
    elapsed = time.monotonic() - t0
    duration = len(audio) / SAMPLE_RATE
    print(f" ✓ Saved '{output_path.name}' ({duration:.1f}s audio | {elapsed:.1f}s wall-clock)")
    return elapsed
# ── Main ───────────────────────────────────────────────────────────────────────
def main() -> None:
    """Parse the command line and generate audio for the selected sections.

    Steps, in order: parse arguments; resolve requested labels against
    BOOKS; split the source text into sections; initialise the Kokoro
    pipeline; print a per-section character table; synthesise each selected
    section (optionally truncated by --preview); print a timing summary.

    An unknown label prints a message and returns without generating
    anything.  A non-positive --preview value is rejected by the parser.
    """
    # ── CLI ────────────────────────────────────────────────────────────
    parser = argparse.ArgumentParser(description="Generate Nem audiobook sections.")
    parser.add_argument(
        "books", nargs="*",
        help="Labels of sections to generate (default: all enabled books). "
             "Use --list to see available labels."
    )
    parser.add_argument(
        "--list", action="store_true",
        help="Print all enabled book labels and exit."
    )
    parser.add_argument(
        "--preview", nargs="?", const=3000, type=int, metavar="CHARS",
        help="Generate a short preview clip per book (default: 3000 chars). "
             "Output filenames get a _preview suffix."
    )
    args = parser.parse_args()

    # Zero would silently disable preview mode and a negative value would
    # truncate from the wrong end, so reject both up front.
    if args.preview is not None and args.preview <= 0:
        parser.error("--preview CHARS must be a positive integer")
    preview_chars = args.preview

    enabled_labels = [label for label, _, _, _ in BOOKS]

    if args.list:
        print("Enabled books:")
        for label in enabled_labels:
            print(f" {label}")
        return

    # Filter to requested subset, preserving BOOKS order
    if args.books:
        unknown = [b for b in args.books if b not in enabled_labels]
        if unknown:
            print(f"Unknown book label(s): {', '.join(unknown)}")
            print("Run with --list to see available labels.")
            return
        requested = set(args.books)
        run_books = [b for b in BOOKS if b[0] in requested]
    else:
        run_books = list(BOOKS)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Device: {device}")
    if device == "cuda":
        print(f"GPU: {torch.cuda.get_device_name(0)}")

    OUTPUT_DIR.mkdir(exist_ok=True)

    print(f"\nSource: '{SOURCE_FILE}'"
          + (" ✓ (TTS fixed)" if SOURCE_FILE == _FIXED_FILE else
             " ⚠ (original — run 'Apply Fixes to Text' in the GUI to use phonetic fixes)"))

    # Always split using ALL books for correct section boundaries,
    # but only generate for run_books.
    sections = load_and_split(SOURCE_FILE, BOOKS)
    print(f" Found {len(sections)} sections ({len(run_books)} selected).\n")

    print("Initialising Kokoro pipeline …")
    pipeline = KPipeline(lang_code=LANG_CODE)

    # Full (un-truncated) char counts of the selected sections, for the table.
    section_chars: dict[str, int] = {
        label: len(clean_text(sections[label]))
        for label, _, _, _ in run_books
        if label in sections
    }

    # Print char count summary before starting
    table_rule = "─" * 52
    preview_note = (f" ⚡ PREVIEW MODE — capped at {preview_chars:,} chars/book\n"
                    if preview_chars else "")
    print(f"\n{preview_note}{table_rule}")
    print(f" {'Section':<30} {'Chars':>8}")
    print(table_rule)
    for label, _, _, _ in run_books:
        if label in section_chars:
            print(f" {label:<30} {section_chars[label]:>8,}")
    print(table_rule)
    total_chars = sum(section_chars.values())
    print(f" {'TOTAL':<30} {total_chars:>8,}")
    print()

    chars_per_sec: float | None = None  # cumulative rate over finished books
    timing_rows: list[tuple[str, int, float]] = []  # (label, chars, elapsed)
    total_chars_done = 0
    total_elapsed_done = 0.0

    for label, _marker, voice, wav_name in run_books:
        if label not in sections:
            continue

        text = clean_text(sections[label])
        if not text:
            print(f"\n[{label}] ⚠ Empty text — skipping")
            continue

        # Preview mode: truncate to requested char limit at a word boundary
        if preview_chars and len(text) > preview_chars:
            cut = text.rfind(" ", 0, preview_chars)
            text = text[: cut if cut > 0 else preview_chars]

        chars = len(text)

        # Print ETA once we have a usable (non-zero) calibration rate
        if chars_per_sec:
            eta_str = _fmt_duration(chars / chars_per_sec)
            print(f"\n[{label}] voice={voice} → {wav_name} (est. {eta_str})")
        else:
            print(f"\n[{label}] voice={voice} → {wav_name} (timing calibration run)")

        # Saved name is <stem>_<voice>[_preview].<ext> inside OUTPUT_DIR.
        stem, ext = wav_name.rsplit(".", 1)
        preview_tag = "_preview" if preview_chars else ""
        out_path = OUTPUT_DIR / f"{stem}_{voice}{preview_tag}.{ext}"

        elapsed = generate_audio(pipeline, text, voice, out_path)
        timing_rows.append((label, chars, elapsed))

        # Update calibration as a cumulative average after every book
        total_chars_done += chars
        total_elapsed_done += elapsed
        if total_elapsed_done > 0:
            chars_per_sec = total_chars_done / total_elapsed_done
            print(f" ⏱ Calibration: {chars_per_sec:.0f} chars/sec")

    # ── Summary ────────────────────────────────────────────────────────────────
    summary_rule = "─" * 60
    print("\n" + summary_rule)
    print(f" {'Section':<30} {'Chars':>7} {'Actual':>8} {'Est':>8}")
    print(summary_rule)

    # Running totals of the rows *before* the current one reproduce the rate
    # that was available when that book started.
    prior_chars = 0
    prior_elapsed = 0.0
    for label, chars, elapsed in timing_rows:
        actual_str = _fmt_duration(elapsed)
        if prior_elapsed > 0 and prior_chars > 0:
            est_str = _fmt_duration(chars / (prior_chars / prior_elapsed))
        else:
            est_str = "(first run)"
        print(f" {label:<30} {chars:>7,} {actual_str:>8} {est_str:>8}")
        prior_chars += chars
        prior_elapsed += elapsed

    print(summary_rule)
    print(f" {'TOTAL':<30} {prior_chars:>7,} {_fmt_duration(prior_elapsed):>8}")
    print("\nDone.")


if __name__ == "__main__":
    main()