Initial commit: audiobook generator, proper noun auditor GUI
This commit is contained in:
49
tts_test.py
Normal file
49
tts_test.py
Normal file
@ -0,0 +1,49 @@
|
||||
"""Smoke test: synthesize a short paragraph locally with the Kokoro TTS model.

Picks CUDA when available, generates speech for a fixed test paragraph,
and writes the concatenated audio to OUTPUT_FILE as a 24 kHz mono WAV.
"""

import numpy as np  # moved here from mid-file: PEP 8 — imports belong at the top
import soundfile as sf
import torch

from kokoro import KPipeline

# ── Device setup ──────────────────────────────────────────────────────────────
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
if device == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# ── Test paragraph ─────────────────────────────────────────────────────────────
TEXT = (
    "The world of artificial intelligence is evolving at a remarkable pace. "
    "Modern language models can now read, write, and even speak with surprising "
    "clarity and nuance. This audio was generated entirely on a local machine "
    "using the Kokoro text-to-speech model, running on an NVIDIA RTX 3060 GPU. "
    "No cloud, no API keys — just raw local compute turning words into sound."
)

# ── Build pipeline ─────────────────────────────────────────────────────────────
# lang_code: 'a' = American English, 'b' = British English
# voices: af_heart, af_bella, af_nova, am_adam, am_michael, bf_emma, bm_george …
pipeline = KPipeline(lang_code="a")

OUTPUT_FILE = "output.wav"
VOICE = "af_heart"  # warm American female voice
SPEED = 1.0  # 1.0 = normal speed
SAMPLE_RATE = 24000  # Hz — single source of truth for both sf.write and duration

# ── Generate audio ─────────────────────────────────────────────────────────────
print(f"Generating speech with voice '{VOICE}' …")

audio_chunks = []
for _, _, chunk_audio in pipeline(TEXT, voice=VOICE, speed=SPEED):
    # chunk_audio is a torch.Tensor of shape [N], dtype float32
    # (pipeline may also yield numpy directly — hasattr check covers both)
    if hasattr(chunk_audio, "numpy"):
        chunk_audio = chunk_audio.cpu().numpy()
    # squeeze() can collapse a length-1 chunk to 0-D; atleast_1d keeps
    # np.concatenate happy below
    chunk_audio = np.atleast_1d(chunk_audio.squeeze())
    if chunk_audio.size > 0:
        audio_chunks.append(chunk_audio)

if audio_chunks:
    audio = np.concatenate(audio_chunks, axis=0)
    sf.write(OUTPUT_FILE, audio, SAMPLE_RATE)
    duration = len(audio) / SAMPLE_RATE
    print(f"✓ Saved '{OUTPUT_FILE}' ({duration:.1f}s, 24 kHz)")
else:
    print("No audio generated — check input text.")
|
||||
Reference in New Issue
Block a user