Split audio into proper_nouns_audio/, track JSON files in git
This commit is contained in:
5
.gitignore
vendored
5
.gitignore
vendored
@ -9,7 +9,10 @@ __pycache__/
|
||||
*.mp3
|
||||
*.flac
|
||||
output_audiobook/
|
||||
output_proper_nouns/
|
||||
proper_nouns_audio/
|
||||
|
||||
# Generated review export (ignored); the JSON files in output_proper_nouns/ are tracked
|
||||
output_proper_nouns/remaining_review.txt
|
||||
|
||||
# Text files (except proper_nouns.txt)
|
||||
*.txt
|
||||
|
||||
@ -24,8 +24,9 @@ from pathlib import Path
|
||||
from kokoro import KPipeline
|
||||
|
||||
PROPER_NOUNS_FILE = Path("proper_nouns.txt")
|
||||
OUTPUT_DIR = Path("output_proper_nouns")
|
||||
MANIFEST_FILE = OUTPUT_DIR / "manifest.json"
|
||||
DATA_DIR = Path("output_proper_nouns") # JSON files — tracked in git
|
||||
AUDIO_DIR = Path("proper_nouns_audio") # WAV files — not tracked
|
||||
MANIFEST_FILE = DATA_DIR / "manifest.json"
|
||||
VOICE = "am_michael"
|
||||
SAMPLE_RATE = 24000
|
||||
SPEED = 1.0
|
||||
@ -96,7 +97,8 @@ def main() -> None:
|
||||
if device == "cuda":
|
||||
print(f"GPU: {torch.cuda.get_device_name(0)}")
|
||||
|
||||
OUTPUT_DIR.mkdir(exist_ok=True)
|
||||
DATA_DIR.mkdir(exist_ok=True)
|
||||
AUDIO_DIR.mkdir(exist_ok=True)
|
||||
|
||||
print(f"Parsing '{PROPER_NOUNS_FILE}' …")
|
||||
entries = parse_entries(PROPER_NOUNS_FILE)
|
||||
@ -118,7 +120,7 @@ def main() -> None:
|
||||
for i, (cat, entry) in enumerate(entries):
|
||||
slug = slugify(entry)
|
||||
wav_name = f"{slug}.wav"
|
||||
wav_path = OUTPUT_DIR / wav_name
|
||||
wav_path = AUDIO_DIR / wav_name
|
||||
|
||||
if entry in manifest and wav_path.exists():
|
||||
skipped += 1
|
||||
|
||||
@ -55,11 +55,12 @@ os.environ.setdefault("HF_HUB_OFFLINE", "1")
|
||||
import sounddevice as sd
|
||||
import soundfile as sf
|
||||
|
||||
MANIFEST_FILE = Path("output_proper_nouns/manifest.json")
|
||||
OUTPUT_DIR = Path("output_proper_nouns")
|
||||
REPLACEMENTS_DIR = OUTPUT_DIR / "replacements_cache"
|
||||
CORRECT_FILE = OUTPUT_DIR / "correct_words.json"
|
||||
FIXES_FILE = OUTPUT_DIR / "pronunciation_fixes.json"
|
||||
DATA_DIR = Path("output_proper_nouns") # JSON files — tracked in git
|
||||
AUDIO_DIR = Path("proper_nouns_audio") # WAV files — not tracked
|
||||
MANIFEST_FILE = DATA_DIR / "manifest.json"
|
||||
REPLACEMENTS_DIR = AUDIO_DIR / "replacements_cache"
|
||||
CORRECT_FILE = DATA_DIR / "correct_words.json"
|
||||
FIXES_FILE = DATA_DIR / "pronunciation_fixes.json"
|
||||
SOURCE_TEXT = Path("Audio Master Nem Full.txt")
|
||||
FIXED_TEXT_OUT = Path("Audio Master Nem Full (TTS Fixed).txt")
|
||||
|
||||
@ -417,7 +418,7 @@ class ProperNounAuditor(tk.Tk):
|
||||
wav_name = self.manifest.get(word)
|
||||
if not wav_name:
|
||||
return
|
||||
wav_path = OUTPUT_DIR / wav_name
|
||||
wav_path = AUDIO_DIR / wav_name
|
||||
if not wav_path.exists():
|
||||
messagebox.showwarning("Missing audio",
|
||||
f"No audio file for '{word}'.\n"
|
||||
@ -521,7 +522,7 @@ class ProperNounAuditor(tk.Tk):
|
||||
wav_name = self.manifest.get(word)
|
||||
if not wav_name:
|
||||
return
|
||||
wav_path = OUTPUT_DIR / wav_name
|
||||
wav_path = AUDIO_DIR / wav_name
|
||||
if wav_path.exists():
|
||||
wav_path.unlink()
|
||||
self.now_playing_var.set(f"… regen {word}")
|
||||
@ -687,7 +688,7 @@ class ProperNounAuditor(tk.Tk):
|
||||
if not words:
|
||||
messagebox.showinfo("Nothing to export", "No words left to review.")
|
||||
return
|
||||
out = OUTPUT_DIR / "remaining_review.txt"
|
||||
out = DATA_DIR / "remaining_review.txt"
|
||||
out.write_text("\n".join(words), encoding="utf-8")
|
||||
messagebox.showinfo("Exported",
|
||||
f"{len(words)} words written to:\n{out}")
|
||||
@ -720,6 +721,7 @@ class ProperNounAuditor(tk.Tk):
|
||||
def main() -> None:
|
||||
if not MANIFEST_FILE.exists():
|
||||
print(f"Manifest not found: '{MANIFEST_FILE}'")
|
||||
print("Run generate_proper_noun_audio.py first.") # noqa
|
||||
print("Run generate_proper_noun_audio.py first.")
|
||||
raise SystemExit(1)
|
||||
|
||||
|
||||
1269
output_proper_nouns/correct_words.json
Normal file
1269
output_proper_nouns/correct_words.json
Normal file
File diff suppressed because it is too large
Load Diff
1540
output_proper_nouns/manifest.json
Normal file
1540
output_proper_nouns/manifest.json
Normal file
File diff suppressed because it is too large
Load Diff
28
output_proper_nouns/pronunciation_fixes.json
Normal file
28
output_proper_nouns/pronunciation_fixes.json
Normal file
@ -0,0 +1,28 @@
|
||||
{
|
||||
"Gadianton Robbers": "Gadeeantun Robbers",
|
||||
"Gadianton": "Gadeeantun",
|
||||
"Coriantumr": "Coryantomer",
|
||||
"Laman": "Layman",
|
||||
"Lehi And Nephi": "Leehi And Nephi",
|
||||
"Lehi": "Leehi",
|
||||
"Lehi Mathonihah": "Leehi Mathonihah",
|
||||
"Lehis": "Leehis",
|
||||
"Lehies": "Leehis",
|
||||
"Liahona": "Leeahona",
|
||||
"Moroni": "Morero-ni",
|
||||
"Alma": "Al-ma",
|
||||
"Gadiantons": "Gadeeantuns",
|
||||
"Laban": "Layban",
|
||||
"Mosiah": "Moziah",
|
||||
"Mosiah The King": "Moziah The King",
|
||||
"Nehors": "Kneehores",
|
||||
"Samuel The Lamanite": "Samuel The Laymanite",
|
||||
"Tarry": "Tarery",
|
||||
"The Lamanite Twins": "The Laymanite Twins",
|
||||
"The Lamanites Of Ammon": "The Laymanites Of Ammon",
|
||||
"The Lamanites Of The Land Of Zarahemla": "The Laymanites Of The Land Of Zarahemla",
|
||||
"The Lamanites Of The Land Southward": "The Laymanites Of The Land Southward",
|
||||
"The Lamanites Of The People Of Ammon": "The Laymanites Of The People Of Ammon",
|
||||
"The Lamb'S Book Of Life": "The Lamb's Book Of Life",
|
||||
"The Land Of Nephi": "The Land Of Kneefi"
|
||||
}
|
||||
Reference in New Issue
Block a user