diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
index 5f6cda7..9dd8d78 100644
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -55,6 +55,16 @@ Use project virtualenvs where available (`.venv312`, `.venv`, or `venv`) for bac
 - Media URL handling between project load paths should remain consistent to avoid format-specific regressions (especially WAV/MP3 behavior).
 - Export pipeline changes must preserve caption modes (`none`, `sidecar`, `burn-in`) and audio enhancement behavior.
+## Recent Changes
+
+### 2026-05-04 — Word text correction, low-confidence highlighting, audio normalization
+
+- **Word text correction (#015)**: Double-click any word in the transcript editor to edit its text inline. Press Enter to commit, Escape to cancel. State is updated in both `words[]` and `segments[]` arrays (segment text recomposed from updated words). Pure frontend; no backend changes needed.
+- **Low-confidence word highlighting (#012)**: Words with `confidence < threshold` (default 0.6, configurable in the Settings panel) render with an orange dotted underline. A tooltip shows the exact confidence percentage. The threshold is persisted under the `localStorage` key `talkedit:confidenceThreshold`.
+- **Audio normalization (#018)**: New backend endpoint `POST /audio/normalize` in `backend/routers/audio.py`. Two-pass FFmpeg `loudnorm` (measure, then apply) implemented in `backend/services/audio_cleaner.py:normalize_audio()`; falls back to single-pass if measurement fails. Frontend UI in the Export panel: target selector (YouTube -14, Spotify -16, Broadcast -23, etc.) with a "Normalize" button.
+- **Store**: New `updateWordText(index, text)` action in `editorStore.ts` updates `words[]` and recomputes the owning segment's `text`.
+- **Settings panel**: New confidence threshold slider (0–1 range).
+
 ## Update Rules (Important)
 
 When a task changes architecture, app wiring, commands, API shape, project schema, or major conventions, update this file before finishing.
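As a rough, hedged illustration of the arithmetic behind the loudness targeting described in the Recent Changes entry above (not code from this PR — the function name and numbers are made up, and real `loudnorm` also factors in LRA and the measured `target_offset`, falling back to dynamic normalization when a flat gain cannot satisfy its constraints):

```python
# Back-of-the-envelope linear loudness targeting (illustration only).
def linear_gain_db(measured_i: float, target_i: float,
                   measured_tp: float, tp_ceiling: float = -1.5) -> float:
    desired = target_i - measured_i      # gain needed to reach the integrated target
    headroom = tp_ceiling - measured_tp  # gain available before hitting the true-peak ceiling
    return min(desired, headroom)

# A quiet podcast master: -23.5 LUFS integrated, -6.0 dBTP peak, YouTube target (-14 LUFS).
print(linear_gain_db(-23.5, -14.0, -6.0))  # 4.5 -> the true-peak ceiling, not loudness, limits the gain
```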
diff --git a/FEATURES.md b/FEATURES.md
index cfca8e3..be04e03 100644
--- a/FEATURES.md
+++ b/FEATURES.md
@@ -6,13 +6,13 @@ Features are grouped by priority. Check off items as they are implemented.
 
 ## 🔴 Highest Impact Next — Conversion and retention
 
-- [ ] [#015] **Word text correction** — allow editing the transcript text of a word without affecting its timing. Whisper gets homophones/proper nouns wrong constantly. Pure frontend state change; no backend needed.
+- [x] [#015] **Word text correction** — double-click any word to edit its text in place. Preserves timing and confidence. Pure frontend state change. (2026-05-04)
 
 - [ ] [#013] **Re-transcribe selection** — if Whisper gets a section wrong, let the user select a word range and re-run transcription on just that segment (optionally with a different model or language).
 
-- [ ] [#012] **Low-confidence word highlighting** — WhisperX already returns `confidence` per word. Words below a threshold (e.g. < 0.6) should be visually underlined or tinted so the user knows where to double-check.
+- [x] [#012] **Low-confidence word highlighting** — words with `confidence < 0.6` (configurable in Settings) get an orange dotted underline. Hover shows the exact confidence %. (2026-05-04)
 
-- [ ] [#018] **Audio normalization / loudness targeting** — single "Normalize" button that targets a LUFS level (-14 for YouTube, -16 for Spotify). Backend: `ffmpeg -af loudnorm`. Very high value for podcasters, ~2–3 hours of work.
+- [x] [#018] **Audio normalization / loudness targeting** — "Normalize" button in the Export panel with a LUFS target selector (-14 YouTube, -16 Spotify, -23 Broadcast). Backend: FFmpeg two-pass `loudnorm`. (2026-05-04)
 
 - [ ] [#024] **Export to transcript text / SRT only** — some users just want a clean `.txt` or `.srt` of the edited transcript without rendering video.
diff --git a/backend/routers/audio.py b/backend/routers/audio.py
index 16ad113..5cb2d16 100644
--- a/backend/routers/audio.py
+++ b/backend/routers/audio.py
@@ -11,7 +11,7 @@ from fastapi import APIRouter, HTTPException, Query, Request
 from fastapi.responses import FileResponse
 from pydantic import BaseModel
 
-from services.audio_cleaner import clean_audio, detect_silence_ranges, is_deepfilter_available
+from services.audio_cleaner import clean_audio, detect_silence_ranges, is_deepfilter_available, normalize_audio
 
 logger = logging.getLogger(__name__)
 router = APIRouter()
@@ -164,3 +164,30 @@ async def get_waveform_audio(request: Request, path: str = Query(...)):
         )
     _waveform_cache[cache_key] = str(out_wav)
     return FileResponse(str(out_wav), media_type="audio/wav")
+
+
+class NormalizeRequest(BaseModel):
+    input_path: str
+    output_path: Optional[str] = None
+    target_lufs: float = -14.0
+
+
+@router.post("/audio/normalize")
+async def normalize_audio_endpoint(req: NormalizeRequest):
+    """Normalize audio loudness to a target LUFS level using FFmpeg loudnorm."""
+    if req.target_lufs < -70 or req.target_lufs > 0:
+        raise HTTPException(status_code=400, detail="target_lufs must be between -70 and 0")
+    try:
+        output = normalize_audio(
+            req.input_path,
+            req.output_path or "",
+            target_lufs=req.target_lufs,
+        )
+        return {
+            "status": "ok",
+            "output_path": output,
+            "target_lufs": req.target_lufs,
+        }
+    except Exception as e:
+        logger.error(f"Audio normalization failed: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
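A quick way to exercise the new endpoint, as a hedged sketch (not part of the diff): the request and response shapes come from `NormalizeRequest` and the handler above; the file path is a placeholder, and the port assumes the `tauri-bridge.ts` default (`8000` after this change).

```python
# Hypothetical smoke test for POST /audio/normalize (stdlib only).
import json
import urllib.request

payload = json.dumps({
    "input_path": "/tmp/episode.wav",  # placeholder path
    "output_path": "",                 # empty -> backend derives "<stem>_normalized"
    "target_lufs": -16.0,              # Spotify target
}).encode("utf-8")

req = urllib.request.Request(
    "http://127.0.0.1:8000/audio/normalize",
    data=payload,
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    body = json.load(resp)

# Expected shape per the handler:
# {"status": "ok", "output_path": "/tmp/episode_normalized.wav", "target_lufs": -16.0}
print(body["status"], body["output_path"])
```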
diff --git a/backend/services/audio_cleaner.py b/backend/services/audio_cleaner.py
index 74c5fb4..122e0f5 100644
--- a/backend/services/audio_cleaner.py
+++ b/backend/services/audio_cleaner.py
@@ -158,3 +158,125 @@ def detect_silence_ranges(input_path: str, min_silence_ms: int, silence_db: floa
         silence_db,
     )
     return ranges
+
+
+def normalize_audio(
+    input_path: str,
+    output_path: str = "",
+    target_lufs: float = -14.0,
+) -> str:
+    """
+    Normalize audio loudness to a target LUFS level using FFmpeg's loudnorm filter.
+
+    Args:
+        input_path: Path to the input audio/video file.
+        output_path: Path for the normalized output. Auto-generated if empty.
+        target_lufs: Target integrated loudness in LUFS.
+            Common targets: -14 (YouTube), -16 (Spotify), -23 (broadcast).
+
+    Returns:
+        Path to the normalized audio file.
+    """
+    inp = Path(input_path)
+    if not output_path:
+        output_path = str(inp.with_stem(inp.stem + "_normalized"))
+
+    # Two-pass loudnorm: the first pass measures loudness, the second applies the correction.
+    # First pass: measure only (print_format=json writes the stats to stderr)
+    measure_cmd = [
+        "ffmpeg", "-y",
+        "-i", str(inp),
+        "-af", f"loudnorm=I={target_lufs}:LRA=7:TP=-1.5:print_format=json",
+        "-f", "null",
+        "-",
+    ]
+    logger.info("Running loudnorm first pass (measurement): %s", " ".join(measure_cmd))
+    measure_result = subprocess.run(measure_cmd, capture_output=True, text=True)
+
+    # Parse measured parameters from stderr (loudnorm prints its JSON there)
+    measured = _parse_loudnorm_measurement(measure_result.stderr)
+    if not measured:
+        logger.warning(
+            "loudnorm measurement failed or produced no output; "
+            "falling back to single-pass normalization"
+        )
+        # Single-pass fallback
+        cmd = [
+            "ffmpeg", "-y",
+            "-i", str(inp),
+            "-af", f"loudnorm=I={target_lufs}:LRA=7:TP=-1.5",
+            "-c:v", "copy",
+            output_path,
+        ]
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        if result.returncode != 0:
+            raise RuntimeError(f"Audio normalization failed: {result.stderr[-300:]}")
+        logger.info("Single-pass normalized audio saved to %s", output_path)
+        return output_path
+
+    # Second pass: apply normalization using the measured values
+    input_i = measured.get("input_i", target_lufs)
+    input_lra = measured.get("input_lra", 7.0)
+    input_tp = measured.get("input_tp", -1.5)
+    input_thresh = measured.get("input_thresh", -30.0)
+    offset = measured.get("target_offset", 0.0)
+
+    apply_cmd = [
+        "ffmpeg", "-y",
+        "-i", str(inp),
+        "-af",
+        (
+            f"loudnorm=I={target_lufs}:LRA=7:TP=-1.5:"
+            f"measured_I={input_i}:measured_LRA={input_lra}:"
+            f"measured_TP={input_tp}:measured_thresh={input_thresh}:"
+            f"offset={offset}:linear=true:print_format=summary"
+        ),
+        "-c:v", "copy",
+        output_path,
+    ]
+    logger.info("Running loudnorm second pass (apply): %s", " ".join(apply_cmd))
+    result = subprocess.run(apply_cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        raise RuntimeError(f"Audio normalization (apply) failed: {result.stderr[-300:]}")
+
+    logger.info(
+        "Normalized audio saved to %s (target=%s LUFS, measured_I=%s)",
+        output_path,
+        target_lufs,
+        input_i,
+    )
+    return output_path
+
+
+def _parse_loudnorm_measurement(stderr_output: str) -> dict:
+    """Parse the loudnorm JSON measurement block from FFmpeg stderr."""
+    import json
+
+    # loudnorm prints a "[Parsed_loudnorm_0 @ ...]" header followed by a JSON block
+    lines = stderr_output.split("\n")
+    json_lines = []
+    in_json = False
+    for line in lines:
+        if "Parsed_loudnorm" in line and "}" in line:
+            # Header and JSON on a single line
+            try:
+                start = line.index("{")
+                end = line.rindex("}") + 1
+                return json.loads(line[start:end])
+            except (ValueError, json.JSONDecodeError):
+                continue
+        if "{" in line and not in_json:
+            in_json = True
+        if in_json:
+            json_lines.append(line)
+        if in_json and "}" in line:
+            in_json = False
+            break
+
+    if json_lines:
+        try:
+            return json.loads("".join(json_lines))
+        except json.JSONDecodeError:
+            pass
+    return {}
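The parser above extracts the JSON block that `loudnorm` prints to stderr when `print_format=json` is set. A hedged sketch of what it returns — the stderr blob below is illustrative, not captured output, though the key names are the ones loudnorm emits; note that the values arrive as strings:

```python
# Illustrative measurement parse (assumes the module above is importable).
from services.audio_cleaner import _parse_loudnorm_measurement

sample_stderr = """\
[Parsed_loudnorm_0 @ 0x55d1c0]
{
    "input_i" : "-27.61",
    "input_tp" : "-4.47",
    "input_lra" : "18.06",
    "input_thresh" : "-39.20",
    "output_i" : "-14.47",
    "output_tp" : "-1.50",
    "output_lra" : "14.50",
    "output_thresh" : "-25.87",
    "normalization_type" : "dynamic",
    "target_offset" : "0.47"
}
"""

measured = _parse_loudnorm_measurement(sample_stderr)
assert measured["input_i"] == "-27.61"      # strings, interpolated into the second-pass filter
assert measured["target_offset"] == "0.47"  # becomes offset=... in the apply command
```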
diff --git a/frontend/src/components/ExportDialog.tsx b/frontend/src/components/ExportDialog.tsx
index 0be5392..4f5b8a6 100644
--- a/frontend/src/components/ExportDialog.tsx
+++ b/frontend/src/components/ExportDialog.tsx
@@ -1,13 +1,16 @@
 import { useState, useCallback } from 'react';
 import { useEditorStore } from '../store/editorStore';
-import { Download, Loader2, Zap, Cog, Info } from 'lucide-react';
+import { Download, Loader2, Zap, Cog, Info, Volume2 } from 'lucide-react';
 import type { ExportOptions } from '../types/project';
 
 export default function ExportDialog() {
-  const { videoPath, words, cutRanges, muteRanges, gainRanges, speedRanges,
-    globalGainDb, isExporting, exportProgress, backendUrl, setExporting, getKeepSegments } =
+  const { videoPath, words, cutRanges, muteRanges, gainRanges, speedRanges, globalGainDb,
+    isExporting, exportProgress, backendUrl, setExporting, setExportedAudioPath, getKeepSegments } =
     useEditorStore();
 
   const hasCuts = cutRanges.length > 0;
+  const [isNormalizing, setIsNormalizing] = useState(false);
+  const [normalizeTarget, setNormalizeTarget] = useState(-14);
+  const [normalizeResult, setNormalizeResult] = useState<string | null>(null);
 
   const [options, setOptions] = useState<Partial<ExportOptions>>({
     mode: 'fast',
@@ -78,6 +81,41 @@ export default function ExportDialog() {
     }
   }, [videoPath, options, backendUrl, setExporting, getKeepSegments, cutRanges, muteRanges, gainRanges, speedRanges, globalGainDb, words]);
 
+  const handleNormalize = useCallback(async () => {
+    if (!videoPath) return;
+    setIsNormalizing(true);
+    setNormalizeResult(null);
+    try {
+      const res = await fetch(`${backendUrl}/audio/normalize`, {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({
+          input_path: videoPath,
+          target_lufs: normalizeTarget,
+          output_path: '',
+        }),
+      });
+      if (!res.ok) {
+        let detail = res.statusText;
+        try {
+          const body = await res.json();
+          if (body?.detail) detail = String(body.detail);
+        } catch {
+          // Keep statusText fallback
+        }
+        throw new Error(detail);
+      }
+      const data = await res.json();
+      setExportedAudioPath(data.output_path);
+      setNormalizeResult(`Normalized to ${data.target_lufs} LUFS → ${data.output_path.split('/').pop() || 'done'}`);
+    } catch (err) {
+      console.error('Normalize error:', err);
+      setNormalizeResult(`Error: ${err instanceof Error ? err.message : 'Normalization failed'}`);
+    } finally {
+      setIsNormalizing(false);
+    }
+  }, [videoPath, backendUrl, normalizeTarget, setExportedAudioPath]);
+
   return (
     <div className="space-y-4">
       <div className="flex items-center gap-2 text-sm font-semibold text-editor-text">
         <Download className="w-4 h-4" />
         Export Video
       </div>
@@ -129,6 +167,46 @@ export default function ExportDialog() {
           ]}
         />
 
+        {/* Audio normalization */}
+        <div className="space-y-2 border border-editor-border rounded-lg p-3">
+          <div className="flex items-center gap-2 text-sm font-medium text-editor-text">
+            <Volume2 className="w-4 h-4" />
+            Audio Normalization
+          </div>
+          <p className="text-xs">
+            Normalize loudness to a target LUFS level. YouTube uses -14 LUFS,
+            Spotify uses -16 LUFS, broadcast uses -23 LUFS.
+          </p>
+          <div className="flex items-center gap-2">
+            <select
+              value={normalizeTarget}
+              onChange={(e) => setNormalizeTarget(Number(e.target.value))}
+              className="flex-1 px-2 py-1 bg-editor-bg border border-editor-border rounded-lg text-xs text-editor-text"
+            >
+              <option value={-14}>YouTube (-14 LUFS)</option>
+              <option value={-16}>Spotify (-16 LUFS)</option>
+              <option value={-23}>Broadcast (-23 LUFS)</option>
+            </select>
+            <button
+              onClick={handleNormalize}
+              disabled={isNormalizing || !videoPath}
+              className="flex items-center gap-1 px-3 py-1.5 bg-editor-accent text-white rounded-lg text-xs disabled:opacity-50"
+            >
+              {isNormalizing ? <Loader2 className="w-3 h-3 animate-spin" /> : <Volume2 className="w-3 h-3" />}
+              Normalize
+            </button>
+          </div>
+          {normalizeResult && (
+            <div className="text-xs">
+              {normalizeResult}
+            </div>
+          )}
+        </div>
+
         {/* Audio enhancement */}
diff --git a/frontend/src/components/SettingsPanel.tsx b/frontend/src/components/SettingsPanel.tsx
--- a/frontend/src/components/SettingsPanel.tsx
+++ b/frontend/src/components/SettingsPanel.tsx
@@ -57,3 +57,39 @@
 
+      {/* Confidence threshold */}
+      <div className="space-y-1">
+        <label className="text-xs font-medium text-editor-text">
+          Confidence threshold
+        </label>
+        <p className="text-xs">
+          Words with confidence below this value are highlighted with an orange dotted underline.
+          Whisper often gets homophones and proper nouns wrong at low confidence.
+        </p>
+        <div className="flex items-center gap-2">
+          <input
+            type="range"
+            min={0}
+            max={1}
+            step={0.05}
+            value={confidenceThreshold}
+            onChange={(e) => setConfidenceThreshold(Number(e.target.value))}
+            className="flex-1 h-1.5"
+          />
+          <input
+            type="number"
+            min={0}
+            max={1}
+            step={0.05}
+            value={confidenceThreshold}
+            onChange={(e) => setConfidenceThreshold(Math.max(0, Math.min(1, Number(e.target.value) || 0)))}
+            className="w-16 px-2 py-1 bg-editor-bg border border-editor-border rounded-lg text-xs text-editor-text focus:outline-none focus:border-editor-accent"
+          />
+        </div>
+        <div className="flex justify-between text-[10px]">
+          <span>Show all</span>
+          <span>{confidenceThreshold.toFixed(2)}</span>
+          <span>Strict</span>
+        </div>
+      </div>
+
 
       {/* Default provider selector */}
diff --git a/frontend/src/components/TranscriptEditor.tsx b/frontend/src/components/TranscriptEditor.tsx index 243f879..98ac465 100644 --- a/frontend/src/components/TranscriptEditor.tsx +++ b/frontend/src/components/TranscriptEditor.tsx @@ -49,6 +49,10 @@ export default function TranscriptEditor({ const [searchQuery, setSearchQuery] = useState(''); const [activeMatchIdx, setActiveMatchIdx] = useState(0); const searchInputRef = useRef(null); + const updateWordText = useEditorStore((s) => s.updateWordText); + const [editingWordIndex, setEditingWordIndex] = useState(null); + const [editText, setEditText] = useState(''); + const editInputRef = useRef(null); const selectedSet = useMemo(() => new Set(selectedWordIndices), [selectedWordIndices]); const matchIndices = useMemo(() => { @@ -224,6 +228,61 @@ export default function TranscriptEditor({ [setSelectedWordIndices], ); + const startEditing = useCallback((index: number) => { + const word = words[index]; + if (!word) return; + setEditingWordIndex(index); + setEditText(word.word); + requestAnimationFrame(() => { + editInputRef.current?.focus(); + editInputRef.current?.select(); + }); + }, [words]); + + const commitEdit = useCallback(() => { + if (editingWordIndex === null) return; + const trimmed = editText.trim(); + if (trimmed && trimmed !== words[editingWordIndex]?.word) { + updateWordText(editingWordIndex, trimmed); + } + setEditingWordIndex(null); + setEditText(''); + }, [editingWordIndex, editText, words, updateWordText]); + + const cancelEdit = useCallback(() => { + setEditingWordIndex(null); + setEditText(''); + }, []); + + const handleWordDoubleClick = useCallback((index: number) => { + if (cutMode || muteMode || gainMode || speedMode) return; + startEditing(index); + }, [cutMode, muteMode, gainMode, speedMode, startEditing]); + + // Focus edit input when it appears + useEffect(() => { + if (editingWordIndex !== null && editInputRef.current) { + editInputRef.current.focus(); + editInputRef.current.select(); + } + }, [editingWordIndex]); + + // Global key handler for edit mode + useEffect(() => { + const onKeyDown = (e: KeyboardEvent) => { + if (editingWordIndex === null) return; + if (e.key === 'Enter') { + e.preventDefault(); + commitEdit(); + } else if (e.key === 'Escape') { + e.preventDefault(); + cancelEdit(); + } + }; + window.addEventListener('keydown', onKeyDown); + return () => window.removeEventListener('keydown', onKeyDown); + }, [editingWordIndex, commitEdit, cancelEdit]); + const cutSelectedWords = useCallback(() => { if (selectedWordIndices.length === 0) return; const sorted = [...selectedWordIndices].sort((a, b) => a - b); @@ -319,15 +378,25 @@ export default function TranscriptEditor({ const isSearchMatch = matchSet.has(globalIndex); const isActiveSearchMatch = matchIndices.length > 0 && matchIndices[safeActiveMatchIdx] === globalIndex; + const isEditing = globalIndex === editingWordIndex; + + // Low-confidence highlighting + const CONFIDENCE_THRESHOLD_KEY = 'talkedit:confidenceThreshold'; + const storedThreshold = typeof window !== 'undefined' ? Number(window.localStorage.getItem(CONFIDENCE_THRESHOLD_KEY)) : 0; + const confidenceThreshold = Number.isFinite(storedThreshold) ? storedThreshold : 0.6; + const isLowConfidence = word.confidence > 0 && word.confidence < confidenceThreshold && !cutRange && !muteRange && !gainRange && !speedRange; + const confidencePct = word.confidence > 0 ? 
+
           return (
             <span
               key={globalIndex}
+              title={isLowConfidence && confidencePct !== null ? `Confidence: ${confidencePct}%` : undefined}
               onMouseDown={(e) => handleWordMouseDown(globalIndex, e)}
               onMouseEnter={() => handleWordMouseEnter(globalIndex)}
               onMouseLeave={() => setHoveredWordIndex(null)}
+              onDoubleClick={() => handleWordDoubleClick(globalIndex)}
               className={`
                 relative px-[2px] py-[1px] rounded cursor-pointer transition-colors
                 ${cutRange ? 'bg-red-500/20 text-red-100' : ''}
@@ -343,9 +412,21 @@ export default function TranscriptEditor({
                 ${isSelected && !cutRange && !muteRange && !gainRange && !speedRange ? 'bg-editor-word-selected text-white' : ''}
                 ${isActive && !isSelected && !cutRange && !muteRange && !gainRange && !speedRange ? 'bg-editor-accent/20 text-editor-accent' : ''}
                 ${isHovered && !isSelected && !isActive && !cutRange && !muteRange && !gainRange && !speedRange ? 'bg-editor-word-hover' : ''}
+                ${isLowConfidence ? 'border-b border-dashed border-orange-400/60' : ''}
               `}
             >
-              {word.word}{' '}
+              {isEditing ? (
+                <input
+                  ref={editInputRef}
+                  value={editText}
+                  onChange={(e) => setEditText(e.target.value)}
+                  onBlur={commitEdit}
+                  className="w-24 px-1 py-0 text-xs bg-editor-bg border border-editor-accent rounded text-editor-text focus:outline-none"
+                  style={{ minWidth: `${Math.max(word.word.length * 8, 48)}px` }}
+                />
+              ) : (
+                <>{word.word}{' '}</>
+              )}
               {(cutRange || muteRange || gainRange || speedRange) && isHovered && (
             );
           },
-    [segments, selectedSet, matchSet, matchIndices, safeActiveMatchIdx, activeWordIndex, hoveredWordIndex, handleWordMouseDown, handleWordMouseEnter, setHoveredWordIndex, getCutRangeForWord, getMuteRangeForWord, getGainRangeForWord, getSpeedRangeForWord, removeCutRange, removeMuteRange, removeGainRange, removeSpeedRange, zoneDragRange, cutMode, muteMode, gainMode, speedMode],
+    [segments, selectedSet, matchSet, matchIndices, safeActiveMatchIdx, activeWordIndex, hoveredWordIndex, handleWordMouseDown, handleWordMouseEnter, setHoveredWordIndex, getCutRangeForWord, getMuteRangeForWord, getGainRangeForWord, getSpeedRangeForWord, removeCutRange, removeMuteRange, removeGainRange, removeSpeedRange, zoneDragRange, cutMode, muteMode, gainMode, speedMode, editingWordIndex, editText, editInputRef, handleWordDoubleClick, commitEdit, setEditText],
   );
 
   return (
diff --git a/frontend/src/lib/tauri-bridge.ts b/frontend/src/lib/tauri-bridge.ts
index 223bb09..e89faab 100644
--- a/frontend/src/lib/tauri-bridge.ts
+++ b/frontend/src/lib/tauri-bridge.ts
@@ -11,7 +11,7 @@ import { invoke } from '@tauri-apps/api/core';
 import { open, save } from '@tauri-apps/plugin-dialog';
 import { readTextFile, writeTextFile } from '@tauri-apps/plugin-fs';
 
-const backendPort = import.meta.env.VITE_BACKEND_PORT || '8642';
+const backendPort = import.meta.env.VITE_BACKEND_PORT || '8000';
 const backendUrl = `http://127.0.0.1:${backendPort}`;
 
 const VIDEO_FILTERS = [
diff --git a/frontend/src/store/editorStore.ts b/frontend/src/store/editorStore.ts
index 36cdaee..c698963 100644
--- a/frontend/src/store/editorStore.ts
+++ b/frontend/src/store/editorStore.ts
@@ -67,6 +67,7 @@ interface EditorActions {
   setHoveredWordIndex: (index: number | null) => void;
   deleteSelectedWords: () => void;
   deleteWordRange: (startIndex: number, endIndex: number) => void;
+  updateWordText: (index: number, text: string) => void;
   addCutRange: (start: number, end: number, trimGroupId?: string) => void;
   addMuteRange: (start: number, end: number) => void;
   addGainRange: (start: number, end: number, gainDb: number) => void;
@@ -262,6 +263,33 @@ export const useEditorStore = create<EditorState & EditorActions>()(
       get().addCutRange(words[startIndex].start, words[endIndex].end);
     },
 
+    updateWordText: (index, text) => {
+      const { words, segments } = get();
+      if (index < 0 || index >= words.length) return;
+      const newWords = words.map((w, i) =>
+        i === index ? { ...w, word: text } : w
+      );
+      // Also update the corresponding segment's words and recompose its text
+      let globalIdx = 0;
+      const newSegments = segments.map((seg) => {
+        const start = globalIdx;
+        globalIdx += seg.words.length;
+        if (index >= start && index < start + seg.words.length) {
+          const localIdx = index - start;
+          const updatedSegWords = seg.words.map((w, i) =>
+            i === localIdx ? { ...w, word: text } : w
+          );
+          return {
+            ...seg,
+            words: updatedSegWords,
+            text: updatedSegWords.map((w) => w.word).join(' '),
+          };
+        }
+        return seg;
+      });
+      set({ words: newWords, segments: newSegments });
+    },
+
     addCutRange: (start, end, trimGroupId) => {
       const { cutRanges } = get();
       const newRange: CutRange = {
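For clarity, the index mapping `updateWordText` performs — finding which segment owns a global word index, patching that word, then recomposing the segment's text — re-expressed as a standalone Python sketch. This is an illustration only; the names are made up and the shipped implementation is the TypeScript above. Note that timing and confidence fields pass through untouched, which is how #015 preserves word timing.

```python
# Hypothetical re-expression of updateWordText's segment lookup (illustration only).
def update_word_text(words, segments, index, text):
    """words: flat list of word dicts; segments: list of {'words': [...], 'text': str}."""
    if not 0 <= index < len(words):
        return words, segments

    new_words = [dict(w, word=text) if i == index else w for i, w in enumerate(words)]

    new_segments = []
    global_idx = 0
    for seg in segments:
        start = global_idx
        global_idx += len(seg["words"])
        if start <= index < global_idx:
            local = index - start
            seg_words = [dict(w, word=text) if i == local else w
                         for i, w in enumerate(seg["words"])]
            # Segment text is recomposed from its words, matching the store action.
            new_segments.append({**seg, "words": seg_words,
                                 "text": " ".join(w["word"] for w in seg_words)})
        else:
            new_segments.append(seg)
    return new_words, new_segments

# Example: correcting a homophone while keeping its timing.
words = [{"word": "their", "start": 0.0, "end": 0.2},
         {"word": "going", "start": 0.2, "end": 0.5}]
segments = [{"words": words[:1], "text": "their"},
            {"words": words[1:], "text": "going"}]
new_words, new_segments = update_word_text(words, segments, 0, "they're")
print(new_segments[0]["text"], new_words[0]["start"])  # -> they're 0.0
```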