prep for win 11
This commit is contained in:
352
create_temple_voices.py
Normal file
352
create_temple_voices.py
Normal file
@ -0,0 +1,352 @@
|
||||
"""
|
||||
create_temple_voices.py
────────────────────────
Generate the "Sacred Temple Writings" section of the Nem audiobook using one
distinct Microsoft Edge neural TTS voice per character (NOT Kokoro).

Uses the free edge-tts library, which streams Microsoft Azure neural voices.
Audio is stitched into a single WAV and saved to OUTPUT_DIR.

Usage:
    python create_temple_voices.py                   # full render
    python create_temple_voices.py --preview 40      # first 40 segments only
    python create_temple_voices.py --print-segments  # inspect parsed segments
    python create_temple_voices.py --list-voices     # list available English voices

Voice assignments live in CHARACTER_VOICES below — easy to customise.
Run --list-voices to discover all available edge-tts voice names.
|
||||
"""
|
||||
|
||||
import argparse
import asyncio
import re
import shutil
import subprocess
import time
from collections import Counter
from pathlib import Path

import edge_tts
import numpy as np
import soundfile as sf
|
||||
|
||||
# ── File / output config ───────────────────────────────────────────────────────
# Both transcript paths are relative, so they resolve against the process's
# current working directory rather than the location of this script.
_FIXED_FILE = Path("Audio Master Nem Full (TTS Fixed).txt")
_ORIG_FILE = Path("Audio Master Nem Full.txt")
# Decided once, at import time: prefer the "TTS Fixed" transcript when present.
SOURCE_FILE = _FIXED_FILE if _FIXED_FILE.exists() else _ORIG_FILE

OUTPUT_DIR = Path("output_temple_voices")
# File name (inside OUTPUT_DIR) used by main() for a full, non-preview render.
OUTPUT_FILE = "sacred_temple_writings_multivoice.wav"

SAMPLE_RATE = 24_000 # Hz — final WAV sample rate
PAUSE_SAME = 350 # ms silence between same-speaker segments
PAUSE_CHANGE = 650 # ms silence between different-speaker segments

# ── Section boundary markers (match create_audiobook_nem.py BOOKS order) ──────
# Sacred Temple Writings starts at "THE SACRED" / "TEMPLE WRITINGS"
# and ends just before "THE FIRST BOOK" / "OF SAMUEL THE LAMANITE"
# extract_section() strips and upper-cases each line, then requires the first
# line of a pair to equal the *_L1 marker and the next line to start with *_L2.
_SEC_START_L1 = "THE SACRED"
_SEC_START_L2 = "TEMPLE WRITINGS"
_SEC_END_L1 = "THE FIRST BOOK"
_SEC_END_L2 = "OF SAMUEL THE LAMANITE"

# ── Character → edge-tts voice ────────────────────────────────────────────────
# Run python create_temple_voices.py --list-voices to see all available voices.
# Keys must match the speaker labels as they appear in the source file.
# parse_segments() matches labels case-insensitively and maps each match back
# to the spelling used for the key here.
CHARACTER_VOICES: dict[str, str] = {
    # ── Celestial beings ───────────────────────────────────────────────────────
    "Narrator": "en-US-GuyNeural", # calm neutral narrator
    "Elohim Heavenly Mother": "en-US-JennyNeural", # warm, wise matriarch
    "Elohim Heavenly Father": "en-US-AndrewMultilingualNeural", # expressive, authoritative
    "Jehovah": "en-US-AndrewNeural", # clear, gentle divine
    "Angel of the Lord": "en-US-BrianNeural", # ethereal divine messenger
    "Holy Ghost": "en-US-EricNeural", # quiet, inward, spiritual
    "Holy Ghost Elders": "en-US-BrianNeural", # measured elder council

    # ── Dark beings ────────────────────────────────────────────────────────────
    "Lucifer": "en-CA-LiamNeural", # smooth, persuasive tempter
    "Satan": "en-US-SteffanNeural", # cold, commanding adversary

    # ── Mortal / earth characters ──────────────────────────────────────────────
    "Michael": "en-US-RogerNeural", # noble warrior archangel
    "Adam": "en-US-ChristopherNeural", # earnest first man
    "Eve": "en-US-AriaNeural", # curious, warm first woman

    # ── Apostles ───────────────────────────────────────────────────────────────
    "Peter": "en-GB-RyanNeural", # firm British apostle
    "James": "en-AU-WilliamMultilingualNeural", # steady Australian voice
    "John": "en-IE-ConnorNeural", # gentle Irish apostle

    # ── Other roles ────────────────────────────────────────────────────────────
    "Preacher": "en-US-AvaNeural", # bold emphatic preacher
    "Mob": "en-US-MichelleNeural", # crowd / multitude voice
    "The Voice of the Mob": "en-US-MichelleNeural", # alias used in some editions
}

# Voice used when a speaker label isn't found in CHARACTER_VOICES
# (render() also retries a failed synthesis with this voice).
FALLBACK_VOICE = "en-US-GuyNeural"

# Lines/patterns that are ceremony stage-directions → read by Narrator
# Anchored at the start of the line and case-insensitive; parse_segments()
# applies it with .match() after _SKIP_RE has already been checked.
_STAGE_NARRATOR = re.compile(
    r"^(Break for Instruction|Resume Session|All\s+arise|"
    r"CHAPTER\s*\d*|________________+|────+)",
    re.IGNORECASE,
)

# Lines to skip entirely (decorative / empty)
# Matches a line made up solely of em/en dashes, hyphens, underscores and
# whitespace; the pattern also matches the empty string.
_SKIP_RE = re.compile(r"^[—\-_\s\u2014\u2013]*$")
|
||||
|
||||
|
||||
# ── Section extraction ─────────────────────────────────────────────────────────
|
||||
|
||||
def extract_section(source: Path) -> str:
    """Return the text of the Sacred Temple Writings section of *source*.

    The file is read as UTF-8 and scanned for the two-line start heading
    (_SEC_START_L1 followed by a line starting with _SEC_START_L2).  Every
    line after the first heading line is collected until the two-line end
    heading (_SEC_END_L1 / _SEC_END_L2) or the end of the file is reached.
    Heading comparisons ignore surrounding whitespace and letter case.

    NOTE(review): the first start-heading line is not part of the result but
    the second one is — confirm that this asymmetry is intended.

    Raises:
        RuntimeError: If no lines were collected (start heading not found, or
            nothing follows it).
    """
    rows = source.read_text(encoding="utf-8").splitlines()
    folded = [row.strip().upper() for row in rows]

    def heading_at(pos: int, first: str, second: str) -> bool:
        # A heading is an exact first line plus a following line with the given prefix.
        return (
            folded[pos] == first
            and pos + 1 < len(rows)
            and folded[pos + 1].startswith(second)
        )

    start = next(
        (pos for pos in range(len(rows))
         if heading_at(pos, _SEC_START_L1, _SEC_START_L2)),
        None,
    )

    collected: list[str] = []
    if start is not None:
        for pos in range(start + 1, len(rows)):
            # Stop just before the next section's heading.
            if heading_at(pos, _SEC_END_L1, _SEC_END_L2):
                break
            collected.append(rows[pos])

    if collected:
        return "\n".join(collected)

    raise RuntimeError(
        f"Could not locate 'Sacred Temple Writings' in '{source}'.\n"
        "Ensure the source file has a line exactly matching "
        f"'{_SEC_START_L1}' followed by '{_SEC_START_L2}'."
    )
|
||||
|
||||
|
||||
# ── Segment parser ─────────────────────────────────────────────────────────────
|
||||
|
||||
def _speaker_regex(characters: list[str]) -> re.Pattern:
|
||||
"""Regex matching [optional-number] CharacterName: text"""
|
||||
# Sort longest-first so "Holy Ghost Elders" matches before "Holy Ghost"
|
||||
names = sorted(characters, key=len, reverse=True)
|
||||
pat = "|".join(re.escape(n) for n in names)
|
||||
return re.compile(r"^\d*\s*(" + pat + r")\s*:\s*(.*)", re.IGNORECASE)
|
||||
|
||||
|
||||
def parse_segments(text: str) -> list[tuple[str, str]]:
    """Split section text into ``(speaker, spoken_text)`` tuples, in order.

    Each non-blank line is classified as one of:
      * decorative (matches _SKIP_RE) or a formatting note — dropped;
      * a stage direction (matches _STAGE_NARRATOR) — starts a Narrator segment;
      * a ``Name: text`` line for a known character — starts that
        character's segment, with the name normalised to its
        CHARACTER_VOICES spelling;
      * anything else — appended to the segment currently being built
        (Narrator until the first speaker label is seen).

    The lines of one segment are joined with single spaces.
    """
    label_re = _speaker_regex(list(CHARACTER_VOICES.keys()))
    # lowercase → canonical spelling, used to normalise matched labels
    canonical: dict[str, str] = {name.lower(): name for name in CHARACTER_VOICES}

    result: list[tuple[str, str]] = []
    pending: list[str] = []
    speaker = "Narrator"

    def close_segment(who: str) -> None:
        # Everything placed in `pending` is already stripped and non-empty,
        # so a plain join yields the same text as filtering blanks first.
        joined = " ".join(pending)
        if joined:
            result.append((who, joined))
        pending.clear()

    for raw_line in text.splitlines():
        stripped = raw_line.strip()

        if not stripped or _SKIP_RE.match(stripped):
            continue

        # Stage direction → Narrator reads it
        if _STAGE_NARRATOR.match(stripped):
            close_segment(speaker)
            speaker = "Narrator"
            pending.append(stripped)
            continue

        # "The words of Jehovah … are in blue." — formatting note, skip
        if re.search(r"are in blue|words of jehovah", stripped, re.IGNORECASE):
            continue

        hit = label_re.match(stripped)
        if hit is None:
            # Continuation of current speaker (or unattributed narrator prose)
            pending.append(stripped)
            continue

        close_segment(speaker)
        label = hit.group(1)
        remainder = hit.group(2).strip()
        speaker = canonical.get(label.lower(), label)
        if remainder:
            pending.append(remainder)

    close_segment(speaker)
    return result
|
||||
|
||||
|
||||
# ── Audio generation ───────────────────────────────────────────────────────────
|
||||
|
||||
async def _tts_bytes(text: str, voice: str) -> bytes:
    """Synthesise *text* with edge-tts *voice* and return the streamed audio bytes.

    Only chunks whose "type" is "audio" contribute to the result; their
    "data" payloads are concatenated in arrival order.
    """
    stream = edge_tts.Communicate(text, voice).stream()
    payloads = [event["data"] async for event in stream if event["type"] == "audio"]
    return b"".join(payloads)
|
||||
|
||||
|
||||
def _mp3_to_numpy(mp3: bytes) -> np.ndarray:
|
||||
"""Decode MP3 bytes → mono float32 numpy array at SAMPLE_RATE using ffmpeg."""
|
||||
cmd = [
|
||||
"ffmpeg", "-hide_banner", "-loglevel", "error",
|
||||
"-i", "pipe:0", # read MP3 from stdin
|
||||
"-f", "f32le", # raw 32-bit little-endian float PCM
|
||||
"-acodec", "pcm_f32le",
|
||||
"-ac", "1", # mono
|
||||
"-ar", str(SAMPLE_RATE), # resample to target rate
|
||||
"pipe:1", # write PCM to stdout
|
||||
]
|
||||
result = subprocess.run(cmd, input=mp3, capture_output=True, check=True)
|
||||
return np.frombuffer(result.stdout, dtype=np.float32).copy()
|
||||
|
||||
|
||||
def _silence(ms: int) -> np.ndarray:
    """Return *ms* milliseconds of silence as zero-valued float32 samples at SAMPLE_RATE."""
    sample_count = int(SAMPLE_RATE * ms / 1000)  # truncates any fractional sample
    return np.zeros(sample_count, dtype=np.float32)
|
||||
|
||||
|
||||
async def render(
|
||||
segments: list[tuple[str, str]],
|
||||
preview: int | None = None,
|
||||
) -> np.ndarray:
|
||||
"""Generate and stitch all segment audio; return concatenated float32 array."""
|
||||
if preview is not None:
|
||||
segments = segments[:preview]
|
||||
|
||||
parts: list[np.ndarray] = []
|
||||
last_speaker: str | None = None
|
||||
t0 = time.monotonic()
|
||||
|
||||
for idx, (speaker, text) in enumerate(segments, 1):
|
||||
voice = CHARACTER_VOICES.get(speaker, FALLBACK_VOICE)
|
||||
marker = "⚠" if speaker not in CHARACTER_VOICES else " "
|
||||
print(f" {marker}[{idx:>4}/{len(segments)}] {speaker:<28} {voice}")
|
||||
|
||||
try:
|
||||
mp3 = await _tts_bytes(text, voice)
|
||||
except Exception as exc:
|
||||
print(f" ↳ ERROR with '{voice}': {exc} — falling back to {FALLBACK_VOICE}")
|
||||
mp3 = await _tts_bytes(text, FALLBACK_VOICE)
|
||||
|
||||
audio = _mp3_to_numpy(mp3)
|
||||
|
||||
if parts:
|
||||
gap = PAUSE_SAME if speaker == last_speaker else PAUSE_CHANGE
|
||||
parts.append(_silence(gap))
|
||||
parts.append(audio)
|
||||
last_speaker = speaker
|
||||
|
||||
elapsed = time.monotonic() - t0
|
||||
print(f"\n ✓ {len(segments)} segments in {elapsed:.0f}s")
|
||||
return np.concatenate(parts) if parts else np.array([], dtype=np.float32)
|
||||
|
||||
|
||||
# ── Voice listing ──────────────────────────────────────────────────────────────
|
||||
|
||||
async def _list_voices_async() -> None:
    """Print a table of the edge-tts voices whose locale starts with "en-".

    Rows are ordered by locale, then by short name, and followed by a total.
    """
    catalogue = await edge_tts.list_voices()
    english = [entry for entry in catalogue if entry["Locale"].startswith("en-")]
    english.sort(key=lambda entry: (entry["Locale"], entry["ShortName"]))

    print(f"\n {'Locale':<12} {'Short Name':<45} Gender")
    print(" " + "─" * 68)
    for entry in english:
        locale = entry["Locale"]
        short_name = entry["ShortName"]
        gender = entry["Gender"]
        print(f" {locale:<12} {short_name:<45} {gender}")
    print(f"\n {len(english)} English voices total.")
|
||||
|
||||
|
||||
# ── CLI / main ─────────────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> None:
    """Command-line entry point.

    Depending on the flags this lists voices, prints the parsed segments, or
    renders the section to a WAV file inside OUTPUT_DIR.

    Exits early with a message when:
      * ``--preview`` is given a value below 1 (a zero would otherwise be
        treated as "full render" for the banner and file name while nothing
        is rendered, and a negative value would slice from the end);
      * the source transcript does not exist;
      * ffmpeg is not on PATH (checked before any network synthesis starts).
    """
    ap = argparse.ArgumentParser(
        description="Render Sacred Temple Writings with per-character edge-tts voices."
    )
    ap.add_argument("--list-voices", action="store_true",
                    help="Print all available English edge-tts voices and exit.")
    ap.add_argument("--print-segments", action="store_true",
                    help="Print parsed (speaker, text) segments and exit.")
    ap.add_argument("--preview", type=int, metavar="N",
                    help="Render only the first N segments (quick test).")
    args = ap.parse_args()

    if args.preview is not None and args.preview < 1:
        ap.error("--preview N must be a positive integer")
    # Test against None rather than truthiness so the flag's presence decides.
    preview_on = args.preview is not None

    if args.list_voices:
        asyncio.run(_list_voices_async())
        return

    # ── Extract & parse ────────────────────────────────────────────────────────
    if not SOURCE_FILE.is_file():
        raise SystemExit(f"Source file not found: '{SOURCE_FILE}'")

    print(f"Source : {SOURCE_FILE}")
    text = extract_section(SOURCE_FILE)
    print(f"Section: {len(text):,} chars extracted\n")

    segments = parse_segments(text)

    if args.print_segments:
        print(f"Parsed {len(segments)} segments:\n")
        for i, (spkr, txt) in enumerate(segments, 1):
            snippet = txt[:90] + ("…" if len(txt) > 90 else "")
            voice = CHARACTER_VOICES.get(spkr, f"{FALLBACK_VOICE} ⚠")
            print(f" {i:>4}. [{spkr}] ({voice})\n {snippet}\n")
        return

    # ── Summary table ──────────────────────────────────────────────────────────
    counts = Counter(s for s, _ in segments)
    unrecognised = {s for s in counts if s not in CHARACTER_VOICES}

    print(f"Parsed {len(segments)} segments across {len(counts)} speakers:\n")
    print(f" {'Speaker':<28} {'Segs':>5} {'Voice'}")
    print(f" {'─'*28} {'─'*5} {'─'*45}")
    # Known speakers are listed in CHARACTER_VOICES order; unused ones are omitted.
    for spkr, voice in CHARACTER_VOICES.items():
        if counts[spkr]:
            print(f" {spkr:<28} {counts[spkr]:>5} {voice}")
    for spkr in sorted(unrecognised):
        print(f" {spkr:<28} {counts[spkr]:>5} {FALLBACK_VOICE} ⚠ unrecognised")

    total_chars = sum(len(t) for _, t in segments)
    print(f"\n Total chars: {total_chars:,}")
    if preview_on:
        print(f" ⚡ PREVIEW MODE — rendering first {args.preview} segments only")

    # ── GPU note ───────────────────────────────────────────────────────────────
    # edge-tts is cloud-based (Microsoft Azure neural, free) — GPU not used.
    print("\nNote: edge-tts uses Microsoft's servers (free, no API key needed).\n"
          " Render speed depends on your internet connection.\n")

    # ── Render ─────────────────────────────────────────────────────────────────
    # Decoding shells out to ffmpeg; fail now rather than after the first
    # segment has already been fetched over the network.
    if shutil.which("ffmpeg") is None:
        raise SystemExit(
            "ffmpeg was not found on PATH; it is required to decode the edge-tts audio."
        )

    OUTPUT_DIR.mkdir(exist_ok=True)
    out_path = OUTPUT_DIR / (
        f"sacred_temple_writings_preview{args.preview}.wav"
        if preview_on else OUTPUT_FILE
    )

    print("Rendering segments …\n")
    audio = asyncio.run(render(segments, args.preview))

    if audio.size > 0:
        sf.write(str(out_path), audio, SAMPLE_RATE)
        dur = len(audio) / SAMPLE_RATE
        m, s = divmod(int(dur), 60)
        print(f"\n✓ Saved '{out_path}' ({m}m {s:02d}s audio | {SAMPLE_RATE} Hz)")
    else:
        print("✗ No audio produced — check parsing with --print-segments")
|
||||
|
||||
|
||||
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user