Files
audiobook_creator/gui_proper_noun_player.py

1367 lines
54 KiB
Python
Raw Normal View History

"""
gui_proper_noun_player.py

GUI for auditing proper noun pronunciations; supports multiple books.
Each book's data is isolated in its own subdirectory:
output_proper_nouns/<book_slug>/manifest.json
output_proper_nouns/<book_slug>/correct_words.json
output_proper_nouns/<book_slug>/pronunciation_fixes.json
proper_nouns_audio/<book_slug>/<word>.wav
proper_nouns_audio/<book_slug>/replacements_cache/<phonetic>.wav
Three columns (all persisted as JSON per book):
  Review   - words not yet audited
  Correct  - words that already pronounce fine
  Fixes    - linked list: original word -> phonetic replacement
             e.g. "Nephi" -> "Kneephi"
Hotkeys (always active):
  Space   - replay current word
  s       - stop audio
  r       - regenerate the audio for the current word
  Escape  - reset fix entry to original word, refocus review list
On the Review list:
  Up / Down      - navigate
  Click / Enter  - play word AND focus fix entry
On the fix entry (bottom bar, right of the word label):
  Start typing to overwrite the pre-filled word.
  Enter   - if text == original word: mark Correct, advance to next
            if text differs: add as Fix, advance to next
  Escape  - reset text to original word, return focus to review list
On the Correct list:
  Delete / BackSpace - move selected word back to Review
On the Fixes list:
  Delete / BackSpace - move selected fix back to Review
"Apply Fixes to Text" writes a TTS-ready copy of the source file with all
substitutions applied (case-sensitive whole-word replace).
Run:
  .venv/bin/python gui_proper_noun_player.py
"""
import json
2026-02-24 21:12:21 -07:00
import os
import re
2026-04-08 01:42:29 -06:00
import sys
import threading
2026-04-08 01:42:29 -06:00
import time
from pathlib import Path
2026-03-10 00:12:04 -06:00
from typing import NamedTuple
2026-02-24 21:12:21 -07:00
# Model is already cached locally — skip all HuggingFace Hub network calls.
# setdefault() only supplies "1" when HF_HUB_OFFLINE is not already set, so a
# value exported in the shell still takes precedence.
os.environ.setdefault("HF_HUB_OFFLINE", "1")
import sounddevice as sd
import soundfile as sf
2026-04-08 01:42:29 -06:00
from PySide6.QtWidgets import *
from PySide6.QtCore import *
from PySide6.QtGui import *
# ── Project management ──────────────────────────────────────────────────────────
class Project(NamedTuple):
    """A named project together with the source text files it is built from."""

    # Display name; projects are looked up by this value.
    name: str
    # Source files belonging to the project.
    source_paths: list[Path]
def _project_slug(name: str) -> str:
return re.sub(r"[^a-zA-Z0-9_-]", "_", name).strip("_")[:60].lower()
def load_projects() -> list[Project]:
    """Load the saved projects from ``projects.json`` in the working directory.

    Returns an empty list when the file does not exist. Each stored entry
    must provide ``"name"`` and ``"source_paths"`` keys.
    """
    store = Path("projects.json")
    if not store.exists():
        return []
    entries = json.loads(store.read_text(encoding="utf-8"))
    return [
        Project(
            source_paths=[Path(raw) for raw in entry["source_paths"]],
            name=entry["name"],
        )
        for entry in entries
    ]
def save_projects(projects: list[Project]) -> None:
    """Write *projects* to ``projects.json`` in the working directory.

    Paths are stored as strings so the file is plain JSON.
    """
    payload = []
    for project in projects:
        payload.append(
            {
                "name": project.name,
                "source_paths": [str(src) for src in project.source_paths],
            }
        )
    serialized = json.dumps(payload, indent=2)
    Path("projects.json").write_text(serialized, encoding="utf-8")
2026-03-10 00:12:04 -06:00
# Voice name passed to the Kokoro pipeline when synthesising word audio.
VOICE = "am_michael"
# Sample rate (Hz) used when writing synthesised audio to WAV files.
SAMPLE_RATE = 24000
# ── Book source ────────────────────────────────────────────────────────────────
class BookSource(NamedTuple):
    """Everything the auditor needs to know about one book."""

    # Display name shown in the UI
    label: str
    # Filesystem-safe identifier used for subdirectory names
    slug: str
    # list[Path] — one or more source .txt files
    source_paths: list
    # Where "Apply Fixes to Text" writes the TTS-ready copy
    fixed_out: Path
def _book_slug(text: str) -> str:
"""Convert a display name to a lowercase filesystem-safe slug."""
return re.sub(r"[^a-zA-Z0-9_-]", "_", text).strip("_")[:60].lower()
2026-04-08 01:42:29 -06:00
def load_books_from_projects() -> list[BookSource]:
    """Build one ``BookSource`` per saved project.

    The slug comes from the project name and the fixed-text output file is
    named ``"<project name> (TTS Fixed).txt"``.
    """
    return [
        BookSource(
            label=proj.name,
            slug=_project_slug(proj.name),
            source_paths=proj.source_paths,
            fixed_out=Path(f"{proj.name} (TTS Fixed).txt"),
        )
        for proj in load_projects()
    ]
# ── Colours ────────────────────────────────────────────────────────────────────
# Hex colour strings interpolated into the Qt style sheets built below.
BG = "#1e1e2e"  # title bar background
BG2 = "#181825"  # project bar, list widgets and fix entry background
BG3 = "#313244"  # action bar and default button background
FG = "#cdd6f4"  # primary text
FG_DIM = "#6c7086"  # secondary text (hints, counters, status)
# Accent colours used for button text/borders and highlighted labels.
GREEN = "#a6e3a1"
BLUE = "#89b4fa"
RED = "#f38ba8"
YELLOW = "#f9e2af"
MAUVE = "#cba6f7"
# ── Audio ──────────────────────────────────────────────────────────────────────
def play_async(path: Path) -> None:
    """Stop any current playback, then play the audio file at *path*.

    The file is read and handed to ``sounddevice`` on a daemon thread.
    Playback errors are printed, not raised.
    """
    sd.stop()

    def _worker():
        try:
            samples, rate = sf.read(str(path), dtype="float32")
            sd.play(samples, rate)
        except Exception as exc:
            print(f"[audio] playback error: {exc}")

    player = threading.Thread(target=_worker, daemon=True)
    player.start()
def _slug(text: str) -> str:
"""Safe filename from arbitrary text."""
return re.sub(r"[^a-zA-Z0-9_-]", "_", text).strip("_")[:80]
2026-04-08 01:42:29 -06:00
# Heading of the form "Chapter <number> - <title>" (case-insensitive);
# group 1 is the number, group 2 the title.
_CHAPTER_LINE_RE = re.compile(r"^Chapter\s+(\d+)\s*-\s*(.+)\s*$", re.IGNORECASE)
# A line consisting only of the word "Prologue" (case-insensitive).
_PROLOGUE_LINE_RE = re.compile(r"^Prologue\s*$", re.IGNORECASE)
def _chapter_slug(title: str) -> str:
text = title.lower()
text = re.sub(r"[^a-z0-9]+", "_", text)
return text.strip("_")
def _clean_tts_text(text: str) -> str:
text = re.sub(r"^[_\-\*\s]{3,}\s*$", "", text, flags=re.MULTILINE)
text = re.sub(r"\n{3,}", "\n\n", text)
return text.strip()
def _parse_chapters_from_paths(source_paths: list[Path]) -> list[dict]:
    """Split the source files into chapters.

    Supported heading formats:
    - Prologue
    - Chapter # - chapter name

    Lines before the first heading are ignored. A chapter that is still open
    when one file ends keeps collecting lines from the next file. When a
    chapter number occurs more than once only its first occurrence is kept.

    Returns a list of dicts with keys ``num``, ``title``, ``label``, ``slug``
    and ``text``, ordered by ``num`` (the prologue is number 0).
    """
    found: list[dict] = []
    open_chapter: dict | None = None

    def _close(chapter: dict | None) -> None:
        # Finalise a chapter: join its collected lines and record it.
        if chapter is not None:
            chapter["text"] = "\n".join(chapter["lines"])
            found.append(chapter)

    for src in source_paths:
        for raw_line in src.read_text(encoding="utf-8").splitlines():
            stripped = raw_line.strip()

            heading = _CHAPTER_LINE_RE.match(stripped)
            if heading:
                _close(open_chapter)
                number = int(heading.group(1))
                name = heading.group(2).strip()
                open_chapter = {
                    "num": number,
                    "title": name,
                    "label": f"Chapter {number} - {name}",
                    "slug": f"chapter_{number:02d}_{_chapter_slug(name)}",
                    "lines": [raw_line],
                }
                continue

            if _PROLOGUE_LINE_RE.match(stripped):
                _close(open_chapter)
                open_chapter = {
                    "num": 0,
                    "title": "Prologue",
                    "label": "Prologue",
                    "slug": "chapter_00_prologue",
                    "lines": [raw_line],
                }
                continue

            if open_chapter is not None:
                open_chapter["lines"].append(raw_line)

    _close(open_chapter)

    # Keep the first chapter seen for each number and drop the working list.
    unique: list[dict] = []
    seen_numbers: set[int] = set()
    for chapter in found:
        number = chapter["num"]
        if number in seen_numbers:
            continue
        seen_numbers.add(number)
        chapter.pop("lines", None)
        unique.append(chapter)

    unique.sort(key=lambda entry: entry["num"])
    return unique
def _parse_chapter_selection(raw: str, valid_numbers: set[int]) -> list[int]:
"""Parse chapter selection like: all | 1,2,5-8."""
text = (raw or "").strip().lower()
if not text or text == "all":
return sorted(valid_numbers)
out: set[int] = set()
for part in text.split(","):
token = part.strip()
if not token:
continue
if "-" in token:
a, b = token.split("-", 1)
start = int(a.strip())
end = int(b.strip())
if end < start:
start, end = end, start
for n in range(start, end + 1):
if n in valid_numbers:
out.add(n)
else:
n = int(token)
if n in valid_numbers:
out.add(n)
return sorted(out)
# Lazy KPipeline singleton — only imported+loaded on first synthesis request.
# _pipeline stays None until _get_pipeline() builds it; _pipeline_lock
# serialises that one-time construction across threads.
_pipeline = None
_pipeline_lock = threading.Lock()
def _get_pipeline():
    """Return the shared Kokoro ``KPipeline``, creating it on first use.

    ``kokoro`` is imported only when the pipeline is first built. The
    instance is created under ``_pipeline_lock`` and re-checked inside the
    lock so it is built once. User, future and "unauthenticated" warnings
    are suppressed while it is constructed.
    """
    global _pipeline
    if _pipeline is not None:
        return _pipeline
    with _pipeline_lock:
        if _pipeline is None:
            import warnings
            from kokoro import KPipeline  # type: ignore

            with warnings.catch_warnings():
                for category in (UserWarning, FutureWarning):
                    warnings.filterwarnings("ignore", category=category)
                warnings.filterwarnings("ignore", message=".*unauthenticated.*")
                _pipeline = KPipeline(lang_code="a", repo_id="hexgrad/Kokoro-82M")
    return _pipeline
2026-03-10 00:12:04 -06:00
def synth_and_play(text: str, replacements_dir: Path, on_ready=None) -> None:
    """Synthesise *text* (cached under *replacements_dir*) and play it.

    All work happens on a daemon thread, so the caller returns immediately.
    When a WAV path is available, *on_ready(path)* (if given) is called on
    that worker thread before playback starts. Errors are printed, not
    raised.
    """

    def _worker():
        try:
            wav = _synth_to_cache(text, replacements_dir)
            if not wav:
                return
            if on_ready:
                on_ready(wav)
            play_async(wav)
        except Exception as exc:
            print(f"[synth] error synthesising '{text}': {exc}")

    threading.Thread(target=_worker, daemon=True).start()
2026-03-10 00:12:04 -06:00
def _synth_to_cache(text: str, replacements_dir: Path) -> "Path | None":
    """Return the cached WAV for *text*, synthesising it if it is missing.

    The cache file is ``<replacements_dir>/<_slug(text)>.wav``; the directory
    is created when needed. Synthesis is skipped when the file already
    exists. Returns ``None`` when no file exists afterwards (the pipeline
    produced no audio).
    """
    replacements_dir.mkdir(parents=True, exist_ok=True)
    wav_file = replacements_dir / f"{_slug(text)}.wav"

    if not wav_file.exists():
        import warnings
        import numpy as np

        engine = _get_pipeline()
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UserWarning)
            pieces = [
                audio
                for _, _, audio in engine(text, voice=VOICE)
                if audio is not None
            ]
        if pieces:
            sf.write(str(wav_file), np.concatenate(pieces), SAMPLE_RATE)

    if wav_file.exists():
        return wav_file
    return None
# ── Persistence helpers ────────────────────────────────────────────────────────
def load_json(path: Path, default):
    """Return the parsed JSON content of *path*, or *default* if it is missing."""
    if not path.exists():
        return default
    raw = path.read_text(encoding="utf-8")
    return json.loads(raw)
def save_json(path: Path, obj) -> None:
    """Write *obj* to *path* as indented UTF-8 JSON.

    The parent directory is created first, so saving into a per-book data
    directory that does not exist yet no longer raises FileNotFoundError.
    Non-ASCII characters are written as-is (``ensure_ascii=False``).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(obj, ensure_ascii=False, indent=2), encoding="utf-8")
# ── Main app ───────────────────────────────────────────────────────────────────
2026-04-08 01:42:29 -06:00
class ProperNounAuditor(QMainWindow):
# tracks which word is currently loaded into the fix entry
_fix_entry_word: str = ""
2026-03-10 00:12:04 -06:00
def __init__(self, books: list[BookSource]) -> None:
    """Create the main window, build the UI and load the first project.

    *books* is stored as the initial book list. Saved projects are read
    with ``load_projects()``; when at least one exists, the first is
    selected in the project combo box and loaded.
    """
    super().__init__()
    self.setWindowTitle("Proper Noun Pronunciation Auditor")
    self.setGeometry(100, 100, 1020, 760)

    self.books = books
    self.projects = load_projects()
    self.book = None

    # Per-book state, filled in by _load_book()
    self.manifest = {}
    self.all_words = []
    self.correct = []
    self.fixes = {}

    self._build_ui()

    # Select and load the first saved project, if there is one
    if self.projects:
        self._book_var.setCurrentText(self.projects[0].name)
        self._on_project_change()

    # Hotkeys
    self._setup_shortcuts()
def _setup_shortcuts(self):
    """Register the window-level keyboard shortcuts.

    Space replays, S stops audio, R regenerates the current audio and
    Escape resets the fix entry.
    """
    bindings = (
        ("Space", self._replay),
        ("S", lambda: sd.stop()),
        ("R", self._regen_current),
        ("Escape", self._reset_fix_entry),
    )
    for keys, handler in bindings:
        # Each shortcut is created with this window as its parent.
        hotkey = QShortcut(QKeySequence(keys), self)
        hotkey.activated.connect(handler)
2026-03-10 00:12:04 -06:00
# ── Per-book path properties ─────────────────────────────────────────────────
@property
def _data_dir(self) -> Path:
return Path("output_proper_nouns") / self.book.slug
@property
def _audio_dir(self) -> Path:
return Path("proper_nouns_audio") / self.book.slug
@property
def _manifest_file(self) -> Path:
return self._data_dir / "manifest.json"
@property
def _replacements_dir(self) -> Path:
return self._audio_dir / "replacements_cache"
@property
def _correct_file(self) -> Path:
return self._data_dir / "correct_words.json"
@property
def _fixes_file(self) -> Path:
return self._data_dir / "pronunciation_fixes.json"
# ── Book loading / switching ──────────────────────────────────────────────────
def _load_book(self, book: BookSource) -> None:
    """Make *book* the active book and reload all state from its data files.

    Stops audio, reads the manifest, correct-word list and fixes from the
    book's data directory (missing files yield empty values), updates the
    status label, refreshes the three lists and clears the fix entry.
    """
    sd.stop()
    self.book = book
    self._book_var.setCurrentText(book.label)

    manifest_path = self._manifest_file
    self.manifest = load_json(manifest_path, {}) if manifest_path.exists() else {}
    self.all_words = sorted(self.manifest.keys(), key=str.casefold)
    self.correct = load_json(self._correct_file, [])
    self.fixes = load_json(self._fixes_file, {})

    word_count = len(self.manifest)
    if not word_count:
        status = "No manifest yet — click find proper nouns to create one"
    else:
        status = (
            f"{word_count} words loaded · {len(self.correct)} correct"
            f" · {len(self.fixes)} fixes"
        )
    self._book_status_var.setText(status)

    self._refresh_all()

    # Nothing is selected after a switch, so blank the fix entry state.
    self.fix_var = ""
    self._fix_entry.setText("")
    self._fix_entry_word = ""
    self.now_playing_var.setText("")
2026-03-10 00:12:04 -06:00
def _on_book_change(self, event=None) -> None:
label = self._book_var.get()
book = next((b for b in self.books if b.label == label), None)
if book:
self._load_book(book)
2026-04-08 01:42:29 -06:00
def _on_project_change(self) -> None:
    """Load the project currently selected in the combo box.

    Builds a ``BookSource`` for the matching project, makes it the only
    entry in ``self.books`` and loads it. Does nothing when no project has
    the selected name.
    """
    selected = self._book_var.currentText()
    match = next((proj for proj in self.projects if proj.name == selected), None)
    if not match:
        return
    # Create BookSource from project
    book = BookSource(
        label=match.name,
        slug=_project_slug(match.name),
        source_paths=match.source_paths,
        fixed_out=Path(f"{match.name} (TTS Fixed).txt"),
    )
    self.books = [book]
    self._load_book(book)
2026-02-26 12:09:43 -07:00
2026-04-08 01:42:29 -06:00
def _new_project(self) -> None:
    """Ask for a project name and source files, then create and select it.

    Aborts when the dialog is cancelled or the name is empty, when a project
    with that name already exists (an error box is shown), or when no files
    are chosen. Otherwise the project is appended, saved, and selected.
    """
    name, ok = QInputDialog.getText(self, "New Project", "Enter project name:")
    if not (ok and name):
        return

    # Check if exists
    if any(existing.name == name for existing in self.projects):
        QMessageBox.critical(self, "Error", "Project name already exists.")
        return

    # Select files
    files, _ = QFileDialog.getOpenFileNames(self, "Select TXT files", "", "Text files (*.txt)")
    if not files:
        return

    created = Project(name=name, source_paths=[Path(chosen) for chosen in files])
    self.projects.append(created)
    save_projects(self.projects)

    # Update combobox values
    self._book_var.clear()
    self._book_var.addItems([proj.name for proj in self.projects])
    self._book_var.setCurrentText(name)
    self._on_project_change()
def _add_files(self) -> None:
    """Add more source files to the selected project.

    Shows an information box and returns when no project is selected.
    Files already listed in the project are skipped. The project list is
    then saved and the project reloaded.
    """
    if not self._book_var.currentText():
        QMessageBox.information(self, "No project selected", "Select a project first.")
        return

    files, _ = QFileDialog.getOpenFileNames(self, "Add TXT files", "", "Text files (*.txt)")
    if not files:
        return

    selected = self._book_var.currentText()
    project = next((proj for proj in self.projects if proj.name == selected), None)
    if not project:
        return

    # Collect first, extend afterwards: membership is tested against the
    # paths the project had before this call.
    additions = []
    for chosen in files:
        candidate = Path(chosen)
        if candidate not in project.source_paths:
            additions.append(candidate)
    project.source_paths.extend(additions)

    save_projects(self.projects)
    self._on_project_change()
def closeEvent(self, event) -> None:
    """Stop any audio playback, then accept the close event."""
    sd.stop()
    event.accept()
2026-02-26 12:09:43 -07:00
# ── UI construction ────────────────────────────────────────────────────────
def _build_ui(self) -> None:
    """Create all widgets and wire their signals.

    Layout, top to bottom: project selector bar, title bar, a three-column
    body (Review / Correct / Fixes lists) and the bottom action bar.
    Widgets other methods need are stored on ``self``.
    """
    central_widget = QWidget()
    self.setCentralWidget(central_widget)
    main_layout = QVBoxLayout(central_widget)
    # ── Project selector bar ──────────────────────────────────────────────────
    book_bar = QWidget()
    book_bar.setStyleSheet(f"background-color: {BG2}; padding: 7px;")
    book_layout = QHBoxLayout(book_bar)
    book_label = QLabel("Project:")
    book_label.setStyleSheet(f"color: {FG_DIM}; font-weight: bold; font-size: 10pt;")
    book_layout.addWidget(book_label)
    # Items are added before the signal is connected, so filling the combo
    # box here does not invoke _on_project_change.
    self._book_var = QComboBox()
    self._book_var.addItems([p.name for p in self.projects])
    self._book_var.setEditable(False)
    self._book_var.setStyleSheet(f"font-size: 10pt; min-width: 300px;")
    self._book_var.currentTextChanged.connect(self._on_project_change)
    book_layout.addWidget(self._book_var)
    new_project_btn = self._create_button("New Project", self._new_project, BLUE, BG3)
    book_layout.addWidget(new_project_btn)
    add_files_btn = self._create_button("Add Files", self._add_files, GREEN, BG3)
    book_layout.addWidget(add_files_btn)
    self._extract_btn = self._create_button("find proper nouns", self._extract_and_generate, GREEN, BG3)
    book_layout.addWidget(self._extract_btn)
    apply_fixes_btn = self._create_button("⇄ Apply Fixes to Text", self._apply_fixes, YELLOW, BG3)
    book_layout.addWidget(apply_fixes_btn)
    export_remaining_btn = self._create_button("⬇ Export Remaining", self._export_remaining, BLUE, BG3)
    book_layout.addWidget(export_remaining_btn)
    voice_label = QLabel("Voice:")
    voice_label.setStyleSheet(f"color: {FG_DIM}; font-size: 9pt;")
    book_layout.addWidget(voice_label)
    # Editable so a voice name outside the preset list can be typed in.
    self._voice_combo = QComboBox()
    self._voice_combo.setEditable(True)
    self._voice_combo.addItems([
        "am_onyx",
        "am_michael",
        "af_heart",
        "af_bella",
        "af_nicole",
        "bm_george",
        "bm_lewis",
    ])
    self._voice_combo.setCurrentText("am_onyx")
    self._voice_combo.setStyleSheet("font-size: 9pt; min-width: 120px;")
    book_layout.addWidget(self._voice_combo)
    chapters_label = QLabel("Chapters:")
    chapters_label.setStyleSheet(f"color: {FG_DIM}; font-size: 9pt;")
    book_layout.addWidget(chapters_label)
    self._chapters_entry = QLineEdit("all")
    self._chapters_entry.setPlaceholderText("all or 0,1,2,5-8")
    self._chapters_entry.setStyleSheet("font-size: 9pt; min-width: 130px;")
    book_layout.addWidget(self._chapters_entry)
    self._gen_audio_btn = self._create_button("Generate Audio", self._generate_selected_chapters, MAUVE, BG3)
    book_layout.addWidget(self._gen_audio_btn)
    self._gen_audio_status = QLabel("")
    self._gen_audio_status.setStyleSheet(f"color: {FG_DIM}; font-size: 8pt;")
    book_layout.addWidget(self._gen_audio_status)
    self._book_status_var = QLabel("Select a book above")
    self._book_status_var.setStyleSheet(f"color: {FG_DIM}; font-size: 9pt;")
    book_layout.addWidget(self._book_status_var)
    book_layout.addStretch()
    main_layout.addWidget(book_bar)
    # ── Title bar ─────────────────────────────────────────────────────────
    title_bar = QWidget()
    title_bar.setStyleSheet(f"background-color: {BG}; padding: 6px;")
    title_layout = QHBoxLayout(title_bar)
    title_label = QLabel("Proper Noun Pronunciation Auditor")
    title_label.setStyleSheet(f"font-size: 15pt; font-weight: bold; color: {FG};")
    title_layout.addWidget(title_label)
    hint_label = QLabel("Space=replay r=regen s=stop Esc=reset fix Del=remove from list Enter=correct|fix")
    hint_label.setStyleSheet(f"font-size: 8pt; color: {FG_DIM};")
    title_layout.addWidget(hint_label)
    title_layout.addStretch()
    main_layout.addWidget(title_bar)
    # Three-column body
    body = QWidget()
    body_layout = QHBoxLayout(body)
    body_layout.setSpacing(8)
    # ── Column 0: Review list ──────────────────────────────────────────────
    col0 = QWidget()
    col0_layout = QVBoxLayout(col0)
    filter_row = QWidget()
    filter_layout = QHBoxLayout(filter_row)
    filter_label = QLabel("Filter:")
    filter_label.setStyleSheet(f"color: {FG}; font-size: 10pt;")
    filter_layout.addWidget(filter_label)
    self.search_var = ""
    # Every edit of the filter text rebuilds the review list.
    self._filter_entry = QLineEdit()
    self._filter_entry.setStyleSheet(f"font-size: 11pt; background-color: {BG3}; color: {FG}; border: 1px solid {BG3}; padding: 4px;")
    self._filter_entry.textChanged.connect(self._refresh_review)
    filter_layout.addWidget(self._filter_entry)
    # NOTE(review): this button's caption is an empty string here — confirm
    # no glyph was lost.
    clear_filter_btn = self._create_button("", lambda: self._filter_entry.clear(), RED, BG3)
    filter_layout.addWidget(clear_filter_btn)
    col0_layout.addWidget(filter_row)
    hdr0 = QWidget()
    hdr0_layout = QHBoxLayout(hdr0)
    review_section_label = QLabel("TO REVIEW")
    review_section_label.setStyleSheet(f"font-weight: bold; color: {FG};")
    hdr0_layout.addWidget(review_section_label)
    self.review_count_var = QLabel("")
    self.review_count_var.setStyleSheet(f"color: {FG_DIM}; font-size: 9pt;")
    hdr0_layout.addStretch()
    hdr0_layout.addWidget(self.review_count_var)
    col0_layout.addWidget(hdr0)
    self.review_lb = QListWidget()
    self.review_lb.setStyleSheet(f"background-color: {BG2}; color: {FG}; border: none;")
    self.review_lb.itemSelectionChanged.connect(self._on_review_select)
    self.review_lb.itemDoubleClicked.connect(self._on_review_select)
    # Key handling is overridden per instance rather than by subclassing.
    self.review_lb.keyPressEvent = self._review_key_press
    col0_layout.addWidget(self.review_lb)
    body_layout.addWidget(col0, 3)
    # ── Column 1: Correct list ─────────────────────────────────────────────
    col1 = QWidget()
    col1_layout = QVBoxLayout(col1)
    hdr1 = QWidget()
    hdr1_layout = QHBoxLayout(hdr1)
    correct_section_label = QLabel("✓ CORRECT [Del=remove]")
    correct_section_label.setStyleSheet(f"font-weight: bold; color: {FG};")
    hdr1_layout.addWidget(correct_section_label)
    self.correct_count_var = QLabel("")
    self.correct_count_var.setStyleSheet(f"color: {FG_DIM}; font-size: 9pt;")
    hdr1_layout.addStretch()
    hdr1_layout.addWidget(self.correct_count_var)
    col1_layout.addWidget(hdr1)
    self.correct_lb = QListWidget()
    self.correct_lb.setStyleSheet(f"background-color: {BG2}; color: {FG}; border: none;")
    self.correct_lb.itemSelectionChanged.connect(lambda: self._on_side_select(self.correct_lb))
    self.correct_lb.keyPressEvent = lambda e: self._side_key_press(e, self.correct_lb, False)
    col1_layout.addWidget(self.correct_lb)
    back_to_review_btn = self._create_button("← Back to Review [Del]", lambda: self._move_back(self.correct_lb, is_dict=False), YELLOW)
    col1_layout.addWidget(back_to_review_btn)
    body_layout.addWidget(col1, 2)
    # ── Column 2: Fixes list ───────────────────────────────────────────────
    col2 = QWidget()
    col2_layout = QVBoxLayout(col2)
    hdr2 = QWidget()
    hdr2_layout = QHBoxLayout(hdr2)
    fixes_section_label = QLabel("⇄ FIXES (original → phonetic)")
    fixes_section_label.setStyleSheet(f"font-weight: bold; color: {FG};")
    hdr2_layout.addWidget(fixes_section_label)
    self.fixes_count_var = QLabel("")
    self.fixes_count_var.setStyleSheet(f"color: {FG_DIM}; font-size: 9pt;")
    hdr2_layout.addStretch()
    hdr2_layout.addWidget(self.fixes_count_var)
    col2_layout.addWidget(hdr2)
    self.fixes_lb = QListWidget()
    self.fixes_lb.setStyleSheet(f"background-color: {BG2}; color: {FG}; border: none;")
    self.fixes_lb.itemSelectionChanged.connect(lambda: self._on_side_select(self.fixes_lb))
    self.fixes_lb.keyPressEvent = lambda e: self._side_key_press(e, self.fixes_lb, True)
    col2_layout.addWidget(self.fixes_lb)
    back_to_review_fixes_btn = self._create_button("← Back to Review [Del]", lambda: self._move_back(self.fixes_lb, is_dict=True), YELLOW)
    col2_layout.addWidget(back_to_review_fixes_btn)
    body_layout.addWidget(col2, 2)
    main_layout.addWidget(body)
    # ── Bottom action bar ──────────────────────────────────────────────────
    action_bar = QWidget()
    action_bar.setStyleSheet(f"background-color: {BG3}; padding: 8px;")
    action_layout = QHBoxLayout(action_bar)
    # NOTE(review): the icon, arrow and separator labels below are created
    # with empty text here — confirm no glyphs were lost.
    playing_icon = QLabel("")
    playing_icon.setStyleSheet(f"color: {GREEN}; font-size: 11pt;")
    action_layout.addWidget(playing_icon)
    self.now_playing_var = QLabel("")
    self.now_playing_var.setStyleSheet(f"color: {GREEN}; font-size: 11pt; font-weight: bold; min-width: 150px;")
    action_layout.addWidget(self.now_playing_var)
    arrow_label = QLabel("")
    arrow_label.setStyleSheet(f"color: {MAUVE}; font-size: 13pt; font-weight: bold;")
    action_layout.addWidget(arrow_label)
    # fix_var is a plain string set only by code; it does not follow what
    # the user types into _fix_entry.
    self.fix_var = ""
    self._fix_entry = QLineEdit()
    self._fix_entry.setStyleSheet(f"font-size: 11pt; background-color: {BG2}; color: {MAUVE}; border: 1px solid {BG2}; padding: 4px; max-width: 150px;")
    self._fix_entry.returnPressed.connect(self._enter_action)
    self._fix_entry.keyPressEvent = self._fix_entry_key_press
    action_layout.addWidget(self._fix_entry)
    hint_action = QLabel("Enter=correct (edit first for fix) Esc=reset")
    hint_action.setStyleSheet(f"color: {FG_DIM}; font-size: 8pt;")
    action_layout.addWidget(hint_action)
    separator1 = QLabel("")
    separator1.setStyleSheet(f"color: {FG_DIM};")
    action_layout.addWidget(separator1)
    stop_btn = self._create_button("■ Stop [s]", lambda: sd.stop(), RED)
    action_layout.addWidget(stop_btn)
    replay_btn = self._create_button("↺ Replay [Space]", self._replay, BLUE)
    action_layout.addWidget(replay_btn)
    regen_btn = self._create_button("↻ Regen [r]", self._regen_current, GREEN)
    action_layout.addWidget(regen_btn)
    separator2 = QLabel("")
    separator2.setStyleSheet(f"color: {FG_DIM};")
    action_layout.addWidget(separator2)
    self._pregen_btn = self._create_button("↻ Pre-gen Fix Audio", self._pregen_all_fix_audio, MAUVE, BG2)
    action_layout.addWidget(self._pregen_btn)
    self._pregen_status_var = QLabel("")
    self._pregen_status_var.setStyleSheet(f"color: {FG_DIM}; font-size: 8pt; min-width: 200px;")
    action_layout.addWidget(self._pregen_status_var)
    action_layout.addStretch()
    main_layout.addWidget(action_bar)
def _create_button(self, text: str, callback, color: str = GREEN, bg: str = BG3) -> QPushButton:
    """Return a styled push button whose ``clicked`` signal calls *callback*.

    *color* is used for the text and border, *bg* for the background.
    """
    button = QPushButton(text)
    style = f"background-color: {bg}; color: {color}; border: 1px solid {color}; padding: 4px 8px;"
    button.setStyleSheet(style)
    button.clicked.connect(callback)
    return button
# ── Refresh helpers ────────────────────────────────────────────────────────
def _review_words(self) -> list[str]:
2026-02-25 11:37:35 -07:00
excluded = set(self.correct) | set(self.fixes.keys())
2026-04-08 01:42:29 -06:00
q = self._filter_entry.text().strip().casefold()
words = [w for w in self.all_words if w not in excluded]
if q:
words = [w for w in words if q in w.casefold()]
return words
def _refresh_review(self) -> None:
words = self._review_words()
2026-04-08 01:42:29 -06:00
self.review_lb.clear()
for w in words:
2026-04-08 01:42:29 -06:00
self.review_lb.addItem(f" {w}")
self.review_count_var.setText(f"{len(words)}")
def _refresh_correct(self) -> None:
2026-04-08 01:42:29 -06:00
self.correct_lb.clear()
2026-02-25 11:37:35 -07:00
for w in self.correct: # already newest-first
2026-04-08 01:42:29 -06:00
self.correct_lb.addItem(f" {w}")
self.correct_count_var.setText(f"{len(self.correct)}")
def _refresh_fixes(self) -> None:
2026-04-08 01:42:29 -06:00
self.fixes_lb.clear()
2026-02-25 11:37:35 -07:00
for orig, rep in reversed(list(self.fixes.items())): # newest-first
2026-04-08 01:42:29 -06:00
self.fixes_lb.addItem(f" {orig}{rep}")
self.fixes_count_var.setText(f"{len(self.fixes)}")
def _refresh_all(self) -> None:
self._refresh_review()
self._refresh_correct()
self._refresh_fixes()
# ── Playback ───────────────────────────────────────────────────────────────
def _play_word(self, word: str) -> None:
2026-03-10 00:12:04 -06:00
if not self.book:
return
wav_name = self.manifest.get(word)
if not wav_name:
return
2026-03-10 00:12:04 -06:00
wav_path = self._audio_dir / wav_name
if not wav_path.exists():
2026-04-08 01:42:29 -06:00
QMessageBox.warning(self, "Missing audio",
f"No audio file for '{word}'.\n"
"Click 'find proper nouns' first.")
return
2026-04-08 01:42:29 -06:00
self.now_playing_var.setText(word)
play_async(wav_path)
# ── Selection callbacks ────────────────────────────────────────────────────
2026-04-08 01:42:29 -06:00
def _on_review_select(self) -> None:
item = self.review_lb.currentItem()
if not item:
return
2026-04-08 01:42:29 -06:00
word = item.text().strip()
self._fix_entry_word = word
2026-04-08 01:42:29 -06:00
self.fix_var = word # pre-fill fix entry with the word
self._fix_entry.setText(word)
self._fix_entry.selectAll()
self._fix_entry.setFocus()
self._play_word(word)
2026-04-08 01:42:29 -06:00
def _on_side_select(self, listbox: QListWidget) -> None:
    """Handle a selection in the Correct or Fixes list.

    The selected entry is looked up by row index in the data the list was
    built from (``self.correct``, or ``self.fixes`` newest-first) instead
    of splitting the row's display text. The former ``row.split("")``
    raised ValueError (empty separator), and text parsing would also
    mis-handle words containing the separator.

    Fixes list: the replacement goes into the fix entry and is synthesised
    and played. Correct list: the word goes into the fix entry and its
    recorded audio is played. Does nothing when no book is loaded, nothing
    is selected, or the row has no matching data entry.
    """
    if not self.book:
        return
    row = listbox.currentRow()
    if row < 0:
        return

    if listbox is self.fixes_lb:
        # Same ordering as _refresh_fixes(): newest fix first.
        entries = list(self.fixes.items())[::-1]
        if row >= len(entries):
            return
        original, replacement = entries[row]
        self._fix_entry_word = original
        self.fix_var = replacement
        self._fix_entry.setText(replacement)
        self.now_playing_var.setText(f"{replacement}")
        rdir = self._replacements_dir

        def _on_ready(_path):
            # NOTE(review): synth_and_play calls this on its worker thread,
            # so this label update happens off the GUI thread — confirm
            # that is acceptable or marshal it via a signal.
            self.now_playing_var.setText(replacement)

        synth_and_play(replacement, rdir, on_ready=_on_ready)
        return

    if row >= len(self.correct):
        return
    original = self.correct[row]
    self._fix_entry_word = original
    self.fix_var = original
    self._fix_entry.setText(original)
    self._play_word(original)
2026-04-08 01:42:29 -06:00
def _review_key_press(self, event):
    """Key handler for the Review list.

    Return/Enter acts like selecting the current item; every other key is
    passed to the stock ``QListWidget`` handler.
    """
    if event.key() in (Qt.Key_Return, Qt.Key_Enter):
        self._on_review_select()
        return
    QListWidget.keyPressEvent(self.review_lb, event)
def _fix_entry_key_press(self, event):
    """Key handler for the fix entry.

    Up/Down move the Review selection, Escape resets the entry; those keys
    are accepted. Any other key goes to the stock ``QLineEdit`` handler.
    """
    key = event.key()
    if key == Qt.Key_Up or key == Qt.Key_Down:
        self._navigate_review(-1 if key == Qt.Key_Up else 1)
    elif key == Qt.Key_Escape:
        self._reset_fix_entry()
    else:
        QLineEdit.keyPressEvent(self._fix_entry, event)
        return
    event.accept()
# ── Actions ────────────────────────────────────────────────────────────────
def _selected_review_word(self) -> str | None:
2026-04-08 01:42:29 -06:00
item = self.review_lb.currentItem()
if not item:
return None
2026-04-08 01:42:29 -06:00
return item.text().strip()
def _enter_action(self) -> None:
"""Smart Enter handler for the fix entry.
If the entry text matches the original word mark Correct.
If the entry text differs from the original add as Fix.
"""
word = self._fix_entry_word or self._selected_review_word()
if not word:
return
2026-04-08 01:42:29 -06:00
text = self.fix_var.strip()
if not text or text == word:
self._mark_correct_word(word)
else:
self._add_fix_for_word(word, text)
def _reset_fix_entry(self) -> None:
"""Escape: reset fix entry to the original word, refocus the review list."""
2026-04-08 01:42:29 -06:00
self.fix_var = self._fix_entry_word
self._fix_entry.setText(self._fix_entry_word)
self.review_lb.setFocus()
def _replay(self) -> None:
if self._fix_entry_word:
self._play_word(self._fix_entry_word)
2026-02-24 21:12:21 -07:00
def _regen_current(self) -> None:
    """Delete the cached WAV for the current word/replacement and re-synthesise.

    If the fix entry holds text different from the current word, that text
    is treated as a replacement: its cached WAV is removed and it is
    synthesised and played again. Otherwise the word's own WAV (as named in
    the manifest) is removed, re-synthesised on a daemon thread and played.

    The replacement text is read from the ``_fix_entry`` widget.
    ``self.fix_var`` is only assigned by code and does not follow what the
    user types, so reading it meant a typed replacement was never
    regenerated.

    Does nothing when no word is loaded, no book is loaded, or the manifest
    has no entry for the word.
    """
    word = self._fix_entry_word
    if not word or not self.book:
        return

    fix_text = self._fix_entry.text().strip()
    # If the fix box contains something different from the word, regen that text
    is_fix_replacement = bool(fix_text and fix_text != word)

    if is_fix_replacement:
        target = self._replacements_dir / f"{_slug(fix_text)}.wav"
        if target.exists():
            target.unlink()
        self.now_playing_var.setText(f"… regen {fix_text}")
        rdir = self._replacements_dir

        def _on_ready(_p):
            # NOTE(review): called on synth_and_play's worker thread, so
            # this label update happens off the GUI thread — confirm.
            self.now_playing_var.setText(fix_text)

        synth_and_play(fix_text, rdir, on_ready=_on_ready)
        return

    wav_name = self.manifest.get(word)
    if not wav_name:
        return
    wav_path = self._audio_dir / wav_name
    if wav_path.exists():
        wav_path.unlink()
    self.now_playing_var.setText(f"… regen {word}")

    def _regen():
        # Runs on a daemon thread; errors are printed, not raised.
        try:
            import warnings
            import numpy as np

            pipeline = _get_pipeline()
            chunks = []
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=UserWarning)
                for _, _, audio in pipeline(word, voice=VOICE):
                    if audio is not None:
                        chunks.append(audio)
            if chunks:
                sf.write(str(wav_path), np.concatenate(chunks), SAMPLE_RATE)
            self.now_playing_var.setText(word)
            play_async(wav_path)
        except Exception as exc:
            print(f"[regen] error for '{word}': {exc}")

    threading.Thread(target=_regen, daemon=True).start()
def _navigate_review(self, delta: int) -> None:
"""Move the review list selection up (delta=-1) or down (delta=+1)."""
2026-04-08 01:42:29 -06:00
count = self.review_lb.count()
if count == 0:
2026-02-24 21:12:21 -07:00
return
2026-04-08 01:42:29 -06:00
current_row = self.review_lb.currentRow()
if current_row == -1:
current_row = 0
new_row = max(0, min(count - 1, current_row + delta))
if new_row == current_row:
2026-02-24 21:12:21 -07:00
return
2026-04-08 01:42:29 -06:00
self.review_lb.setCurrentRow(new_row)
self._on_review_select()
2026-02-24 21:12:21 -07:00
def _advance_review(self, from_idx: int = 0) -> None:
    """Select the Review row at *from_idx* (clamped to the list) and activate it.

    The row is made current, scrolled into view, and then handed to
    ``_on_review_select`` -- the same sequence ``_navigate_review`` uses.
    Does nothing when the Review list is empty.

    Args:
        from_idx: Preferred row index; values outside ``0 .. count-1`` are
            clamped to the nearest valid row.
    """
    count = self.review_lb.count()
    if count == 0:
        return

    target = max(0, min(from_idx, count - 1))
    self.review_lb.setCurrentRow(target)
    self.review_lb.scrollToItem(self.review_lb.item(target))

    # The former Tk calls (yview_moveto / event_generate("<<ListboxSelect>>"))
    # do not exist on QListWidget and referenced undefined names; invoke the
    # selection handler directly instead.
    self._on_review_select()
def _mark_correct_word(self, word: str) -> None:
    """Record *word* as correctly pronounced and move on to the next word.

    The word is inserted at the front of the Correct list (if not already
    present) and the list is persisted.  The fix entry and now-playing label
    are then cleared, all lists are refreshed, and the Review selection is
    restored at the row that was current before the change.

    Args:
        word: The review word to mark as correct.
    """
    # Remember the current row before the refresh rebuilds the list.
    # QListWidget reports -1 when nothing is selected (the old Tk
    # ``curselection()`` call does not exist on QListWidget).
    current_row = self.review_lb.currentRow()
    from_idx = current_row if current_row != -1 else 0

    if word not in self.correct:
        self.correct.insert(0, word)
        save_json(self._correct_file, self.correct)

    self._fix_entry_word = ""
    self.fix_var = ""
    self._fix_entry.setText("")
    self.now_playing_var.setText("")
    self._refresh_all()
    self._advance_review(from_idx)
def _add_fix_for_word(self, word: str, replacement: str) -> None:
    """Store *replacement* as the fix for *word* and advance the Review list.

    Any existing fix for *word* is removed before the new one is stored, the
    fixes mapping is persisted, the fix entry and now-playing label are
    cleared, all lists are refreshed, and the Review selection is restored
    at the row that was current before the change.
    """
    selected = self.review_lb.currentRow()
    resume_at = 0 if selected == -1 else selected

    # Delete-then-insert so a re-fixed word is re-added as a fresh dict entry.
    if word in self.fixes:
        del self.fixes[word]
    self.fixes[word] = replacement
    save_json(self._fixes_file, self.fixes)

    self._fix_entry_word = ""
    self.fix_var = ""
    for field in (self._fix_entry, self.now_playing_var):
        field.setText("")

    self._refresh_all()
    self._advance_review(resume_at)
2026-04-08 01:42:29 -06:00
def _move_back(self, listbox: QListWidget, is_dict: bool) -> None:
    """Move the selected entry of *listbox* back to the Review list.

    Args:
        listbox: The list widget whose current item should be moved back.
        is_dict: True when *listbox* shows the Fixes mapping, in which case
            the entry is also removed from ``self.fixes``.

    The word is removed from ``self.correct`` in both cases when present.
    Each modified collection is persisted, then all lists are refreshed.
    Does nothing when *listbox* has no current item.
    """
    item = listbox.currentItem()
    if item is None:
        return

    # The original word is the text before the separator.  ``split("")``
    # raises ValueError (empty separator), so split on the arrow instead.
    # NOTE(review): assumes Fixes rows are rendered as "original → replacement";
    # confirm against the code that populates the list.
    raw = item.text().partition("→")[0].strip()

    if is_dict:
        self.fixes.pop(raw, None)
        save_json(self._fixes_file, self.fixes)

    if raw in self.correct:
        self.correct.remove(raw)
        save_json(self._correct_file, self.correct)

    self._refresh_all()
2026-03-10 00:12:04 -06:00
# ── Extract & Generate ─────────────────────────────────────────────────────────────
def _extract_and_generate(self) -> None:
    """Extract proper nouns from the selected book's source text, then
    generate a TTS audio clip for each one.

    Validation (book selected, source files present) happens in the caller's
    thread; extraction and synthesis run in a daemon background thread.
    Words already listed in the manifest whose clip file exists are skipped.
    The manifest is written once all words have been processed, after which
    ``_finish_extract`` is called.
    """
    if not self.book:
        QMessageBox.information(self, "No book selected", "Please select a book first.")
        return

    absent = [src for src in self.book.source_paths if not src.exists()]
    if absent:
        listing = "\n".join(str(src) for src in absent)
        QMessageBox.critical(self, "Source file(s) not found", "Could not find:\n" + listing)
        return

    self._extract_btn.setEnabled(False)
    self._book_status_var.setText("Loading spaCy NLP model…")
    book = self.book  # capture for the thread

    def _status(message: str) -> None:
        # Single place for status-label updates issued by the worker.
        self._book_status_var.setText(message)

    def _worker():
        try:
            _status("Running NLP extraction (may take a minute)…")
            words = _extract_nouns_from_paths(book.source_paths)
            n_extracted = len(words)
            _status(f"Extracted {n_extracted} nouns — generating audio…")

            data_dir = Path("output_proper_nouns") / book.slug
            audio_dir = Path("proper_nouns_audio") / book.slug
            for folder in (data_dir, audio_dir):
                folder.mkdir(parents=True, exist_ok=True)

            manifest_path = data_dir / "manifest.json"
            manifest: dict = load_json(manifest_path, {})
            pipeline = _get_pipeline()

            done = 0
            failed = 0
            for position, word in enumerate(sorted(words, key=str.casefold)):
                wav_name = re.sub(r"[^a-z0-9]+", "_", word.lower()).strip("_") + ".wav"
                wav_path = audio_dir / wav_name

                # Nothing to do when the clip is both registered and on disk.
                if word in manifest and wav_path.exists():
                    continue

                try:
                    import warnings
                    import numpy as np

                    with warnings.catch_warnings():
                        warnings.filterwarnings("ignore", category=UserWarning)
                        chunks = [
                            piece
                            for _, _, piece in pipeline(word, voice=VOICE)
                            if piece is not None
                        ]
                    if not chunks:
                        failed += 1
                    else:
                        sf.write(str(wav_path), np.concatenate(chunks), SAMPLE_RATE)
                        manifest[word] = wav_name
                        done += 1
                except Exception as exc:
                    print(f"[gen] failed for '{word}': {exc}")
                    failed += 1

                # Refresh the progress text on every tenth position.
                if position % 10 == 0:
                    remaining = n_extracted - position
                    _status(f"Generating audio… {remaining} remaining")

            serialized = json.dumps(manifest, ensure_ascii=False, indent=2)
            manifest_path.write_text(serialized)
            self._finish_extract(book, manifest, done, failed)
        except ImportError as exc:
            msg = (f"Missing dependency: {exc}\n\n"
                   "Install with: pip install spacy wordfreq\n"
                   "Then: python -m spacy download en_core_web_sm")
            QMessageBox.critical(self, "Missing package", msg)
            _status("Error — see popup")
            self._extract_btn.setEnabled(True)
        except Exception as exc:
            _status(f"Error: {str(exc)}")
            self._extract_btn.setEnabled(True)

    threading.Thread(target=_worker, daemon=True).start()
def _finish_extract(self, book: BookSource, manifest: dict,
                    done: int, failed: int) -> None:
    """Report the result of an extract-and-generate run and reload the book.

    Re-enables the extract button, shows a summary built from the manifest
    size and the *done* / *failed* counters, and reloads *book* only when it
    is still the selected book (compared by slug).
    """
    self._extract_btn.setEnabled(True)

    summary = f"Done — {len(manifest)} words total ({done} new, {failed} failed)"
    self._book_status_var.setText(summary)

    if not self.book:
        return
    if self.book.slug == book.slug:
        self._load_book(book)
def _pregen_all_fix_audio(self) -> None:
    """Synthesise a cached clip for every fix replacement that lacks one.

    Returns immediately when no book is selected.  Shows an information box
    when there are no fixes or when every clip already exists.  Otherwise
    the button is disabled and the missing clips are synthesised in a daemon
    background thread, which re-enables the button when it finishes.
    """
    if not self.book:
        return
    if not self.fixes:
        QMessageBox.information(self, "No fixes", "The Fixes list is empty.")
        return

    replacements = list(self.fixes.values())
    rdir = self._replacements_dir

    def _clip_exists(rep) -> bool:
        # Cached clips are stored under the slug of the replacement text.
        return (rdir / f"{_slug(rep)}.wav").exists()

    total = len(replacements)
    already = len([rep for rep in replacements if _clip_exists(rep)])
    new_count = total - already
    if new_count == 0:
        QMessageBox.information(self, "Already done",
                                f"All {total} replacement clips already exist.")
        return

    self._pregen_btn.setEnabled(False)
    self._pregen_status_var.setText(f"0 / {new_count} new ({already} cached)")

    def _worker():
        try:
            finished = 0
            for rep in replacements:
                if _clip_exists(rep):
                    continue
                _synth_to_cache(rep, rdir)
                finished += 1
                self._pregen_status_var.setText(f"{finished} / {new_count} synthesised…")
            self._pregen_status_var.setText(f"Done — {total} clips ready")
        except Exception as exc:
            print(f"[pregen] error: {exc}")
        finally:
            self._pregen_btn.setEnabled(True)

    threading.Thread(target=_worker, daemon=True).start()
2026-02-25 11:37:35 -07:00
def _export_remaining(self) -> None:
    """Write the words still awaiting review to ``remaining_review.txt``.

    The file is created in the book's data directory, one word per line,
    UTF-8 encoded.  Returns silently when no book is selected and shows an
    information box when there is nothing left to review.
    """
    if not self.book:
        return

    pending = self._review_words()
    if pending:
        destination = self._data_dir / "remaining_review.txt"
        destination.write_text("\n".join(pending), encoding="utf-8")
        QMessageBox.information(
            self, "Exported", f"{len(pending)} words written to:\n{destination}")
    else:
        QMessageBox.information(self, "Nothing to export", "No words left to review.")
2026-02-25 11:37:35 -07:00
def _apply_fixes(self) -> None:
    """Write a TTS-ready copy of the book text with every fix applied.

    All source files are read as UTF-8 and joined with blank lines.  Each
    fix is applied as a case-insensitive whole-word substitution, in the
    order the fixes are stored.  Afterwards, runs of two or more capital
    letters (optionally hyphen-joined) are converted to Title Case.  The
    result is written to ``self.book.fixed_out`` and a summary is shown.

    Returns silently when no book is selected; shows a message box and
    aborts when there are no fixes or a source file is missing.
    """
    if not self.book:
        return
    if not self.fixes:
        QMessageBox.information(self, "No fixes", "The Fixes list is empty.")
        return

    parts = []
    for p in self.book.source_paths:
        if not p.exists():
            QMessageBox.critical(self, "Source not found", f"Cannot find:\n{p}")
            return
        parts.append(p.read_text(encoding="utf-8"))
    text = "\n\n".join(parts)

    count_total = 0
    for original, replacement in self.fixes.items():
        pattern = r'\b' + re.escape(original) + r'\b'
        # Pass the replacement as a callable so it is inserted literally.
        # A plain string would be treated as a regex template, so a
        # backslash or group reference typed by the user would be
        # misinterpreted or raise re.error.
        text, n = re.subn(
            pattern,
            lambda _m, rep=replacement: rep,
            text,
            flags=re.IGNORECASE,
        )
        count_total += n

    # Normalise ALL-CAPS words (including hyphenated ones) to Title Case.
    text, n_caps = re.subn(
        r'\b[A-Z]{2,}(?:-[A-Z]{2,})*\b',
        lambda m: m.group(0).title(),
        text,
    )

    self.book.fixed_out.write_text(text, encoding="utf-8")
    QMessageBox.information(
        self, "Done",
        f"Applied {len(self.fixes)} fix rules ({count_total} replacements).\n"
        f"Converted {n_caps} ALL-CAPS words to Title Case.\n\n"
        f"Saved to:\n{self.book.fixed_out}"
    )
2026-04-08 01:42:29 -06:00
def _set_gen_audio_status(self, text: str) -> None:
    """Schedule the chapter-generation status label to show *text*.

    The update is not applied inline; it is handed to a zero-delay
    single-shot timer.
    """
    # NOTE(review): callers invoke this from a worker thread -- confirm the
    # zero-delay single shot is delivered on the GUI thread with this binding.
    def _apply() -> None:
        self._gen_audio_status.setText(text)

    QTimer.singleShot(0, _apply)
def _set_gen_audio_enabled(self, enabled: bool) -> None:
    """Schedule enabling or disabling of the generate-audio button.

    The change is not applied inline; it is handed to a zero-delay
    single-shot timer.
    """
    def _apply() -> None:
        self._gen_audio_btn.setEnabled(enabled)

    QTimer.singleShot(0, _apply)
def _generate_selected_chapters(self) -> None:
    """Generate chapter audio from source files with selected voice and chapter set.

    The voice defaults to ``"am_onyx"`` and the chapter expression to
    ``"all"`` when their inputs are blank.  One WAV file per selected chapter
    is written to ``output_audiobook/<book slug>/``.  Parsing and synthesis
    run in a daemon background thread; progress is reported through
    ``_set_gen_audio_status`` and the button is re-enabled when the thread
    finishes, whatever the outcome.
    """
    if not self.book:
        return
    missing = [p for p in self.book.source_paths if not p.exists()]
    if missing:
        QMessageBox.critical(self, "Source file(s) not found", "Could not find:\n" + "\n".join(str(p) for p in missing))
        return

    voice = self._voice_combo.currentText().strip() or "am_onyx"
    chapter_expr = self._chapters_entry.text().strip() or "all"
    out_dir = Path("output_audiobook") / self.book.slug
    out_dir.mkdir(parents=True, exist_ok=True)

    # Snapshot the source paths now: the worker must not re-read self.book,
    # which may be switched (or cleared) while synthesis is still running and
    # would otherwise send another book's chapters into this out_dir.
    source_paths = list(self.book.source_paths)

    self._gen_audio_btn.setEnabled(False)
    self._set_gen_audio_status("Parsing chapters…")

    def _run() -> None:
        try:
            import numpy as np

            chapters = _parse_chapters_from_paths(source_paths)
            if not chapters:
                self._set_gen_audio_status("No chapters found (expected 'Prologue' or 'Chapter # - chapter name').")
                return

            valid = {ch["num"] for ch in chapters}
            # Only the selection parser's ValueError means "bad selection";
            # a ValueError raised later during synthesis is reported by the
            # generic handler below instead of being mislabelled.
            try:
                selected_nums = _parse_chapter_selection(chapter_expr, valid)
            except ValueError:
                self._set_gen_audio_status("Invalid chapter selection. Use: all or 0,1,2,5-8")
                return
            if not selected_nums:
                self._set_gen_audio_status("No matching chapters selected.")
                return

            selected = [ch for ch in chapters if ch["num"] in selected_nums]
            pipeline = _get_pipeline()
            total = len(selected)
            done = 0
            for i, ch in enumerate(selected, start=1):
                text = _clean_tts_text(ch["text"])
                if not text:
                    continue
                self._set_gen_audio_status(f"Generating {i}/{total}: {ch['label']}")
                out_path = out_dir / f"{ch['slug']}.wav"
                t0 = time.monotonic()
                chunks = []
                for _, _, chunk_audio in pipeline(text, voice=voice):
                    if chunk_audio is None:
                        continue
                    # Objects exposing .numpy() are converted via .cpu().numpy().
                    if hasattr(chunk_audio, "numpy"):
                        chunk_audio = chunk_audio.cpu().numpy()
                    chunk_audio = np.atleast_1d(chunk_audio.squeeze())
                    if chunk_audio.size > 0:
                        chunks.append(chunk_audio)
                if chunks:
                    audio = np.concatenate(chunks, axis=0)
                    sf.write(str(out_path), audio, SAMPLE_RATE)
                    elapsed = int(time.monotonic() - t0)
                    done += 1
                    self._set_gen_audio_status(
                        f"Saved {done}/{total}: {out_path.name} ({elapsed}s)"
                    )
            self._set_gen_audio_status(f"Done. Generated {done}/{total} chapters to {out_dir}")
        except Exception as exc:
            self._set_gen_audio_status(f"Error: {exc}")
        finally:
            self._set_gen_audio_enabled(True)

    threading.Thread(target=_run, daemon=True).start()
2026-03-10 00:12:04 -06:00
# ── Standalone NLP extraction (lazy-imports spaCy) ─────────────────────────────────
def _extract_nouns_from_paths(source_paths: list) -> set[str]:
    """Collect unique proper-noun words from every file in *source_paths*.

    Each file is read as UTF-8 and processed with the ``en_core_web_sm``
    spaCy model in two passes: named entities with an accepted label are
    split into words, then individual PROPN tokens are added.  Candidates
    are title-cased and dropped when the noise filter rejects them.

    Raises ImportError if spaCy or wordfreq are not installed.
    """
    import spacy  # lazy — only loaded when button is clicked
    from wordfreq import top_n_list

    TOP_10K: frozenset[str] = frozenset(top_n_list("en", 10_000))
    # Common-word exemptions: names that would otherwise be filtered by TOP_10K.
    WHITELIST: frozenset[str] = frozenset({
        "aaron", "abel", "abraham", "adam", "cain", "eden", "egypt",
        "elijah", "ephraim", "eve", "gad", "ham", "isaac", "israel",
        "jacob", "james", "jehovah", "john", "joseph", "judah",
        "laban", "lehi", "levi", "micah", "michael", "moses", "noah",
        "peter", "pharaoh", "samuel", "sarah", "sarai", "seth", "simeon",
        "timothy", "zion",
        "alma", "ether", "gideon", "limhi", "mormon", "moroni", "mulek",
        "mosiah", "nephi", "satan", "sidon",
    })
    STOP_WORDS: set[str] = {
        "A", "AN", "AND", "AS", "AT", "BE", "BUT", "BY", "DO", "DID", "DOTH", "EVEN",
        "FOR", "FROM", "HAD", "HAS", "HAVE", "HATH", "HE", "HER", "HIS", "HOW", "I",
        "IN", "IS", "IT", "ITS", "MAY", "ME", "MORE", "MY", "NAY", "NO", "NOT", "NOW",
        "OF", "OR", "OUR", "SHALL", "SHE", "SO", "SOME", "THAT", "THE", "THEE",
        "THEIR", "THEN", "THERE", "THESE", "THEY", "THIS", "THOSE", "THOU", "THUS",
        "THY", "TO", "UP", "UPON", "US", "WAS", "WE", "WHEN", "WHERE", "WHICH", "WHO",
        "WILL", "WITH", "YE", "YEA", "YET", "YOU", "YOUR",
        "BEHOLD", "CHAPTER", "CHRIST", "GOD", "GHOST", "HOLY", "LORD", "VERSE",
        "CITY", "DAYS", "DAY", "GREAT", "LAND", "MAN", "MEN", "NEW", "PEOPLE", "SON", "TIME",
    }

    def _is_noise(t: str) -> bool:
        # True when the candidate should be discarded.
        t = t.strip()
        if len(t) <= 1 or (t.isupper() and len(t) > 4):
            return True
        if t.upper() in STOP_WORDS:
            return True
        if re.search(r"[^a-zA-Z\-']", t):
            return True
        lowered = t.lower()
        # Unhyphenated everyday words are noise unless explicitly whitelisted.
        return "-" not in t and lowered in TOP_10K and lowered not in WHITELIST

    def _canonical(text: str) -> str:
        # Collapse whitespace runs and title-case the result.
        return " ".join(text.split()).title()

    nlp = spacy.load("en_core_web_sm")
    nlp.max_length = 4_000_000

    PERSON = {"PERSON"}
    PLACE = {"GPE", "LOC", "FAC"}
    ORG = {"ORG", "NORP"}
    OTHER = {"EVENT", "WORK_OF_ART", "LAW", "PRODUCT", "LANGUAGE"}
    accepted_labels = PERSON | PLACE | ORG | OTHER

    found: set[str] = set()
    for path in source_paths:
        doc = nlp(path.read_text(encoding="utf-8"))

        # Pass 1: words of named entities carrying an accepted label.
        for ent in doc.ents:
            if ent.label_ in accepted_labels:
                found.update(
                    piece
                    for piece in _canonical(ent.text).split()
                    if not _is_noise(piece)
                )

        # Pass 2: stand-alone proper-noun tokens.
        for token in doc:
            if token.pos_ != "PROPN":
                continue
            t = token.text.strip()
            if not t[0].isupper() or t.isupper():
                continue
            # Skip the first token of a sentence.
            if token.i == token.sent.start:
                continue
            candidate = _canonical(t)
            if not _is_noise(candidate):
                found.add(candidate)

    return found
# ── Entry point ──────────────────────────────────────────────────────────────────
def main() -> None:
    """Load the book projects, list them on stdout, and run the auditor GUI.

    The process exits with the status returned by the Qt event loop.
    """
    books = load_books_from_projects()

    print(f"Loaded {len(books)} project(s):")
    for entry in books:
        file_count = len(entry.source_paths)
        print(f" [{entry.slug}] {entry.label} ({file_count} file(s))")

    app = QApplication(sys.argv)
    window = ProperNounAuditor(books)
    window.show()
    exit_code = app.exec()
    sys.exit(exit_code)


if __name__ == "__main__":
    main()