Implement #015, #012, #018 (note: #018 not yet manually tested)

This commit is contained in:
2026-05-04 16:37:25 -06:00
parent 0c7a4c94c2
commit 90b1999a57
9 changed files with 402 additions and 10 deletions

View File

@ -55,6 +55,16 @@ Use project virtualenvs where available (`.venv312`, `.venv`, or `venv`) for bac
- Media URL handling between project load paths should remain consistent to avoid format-specific regressions (especially WAV/MP3 behavior).
- Export pipeline changes must preserve caption modes (`none`, `sidecar`, `burn-in`) and audio enhancement behavior.
## Recent Changes
### 2026-05-04 — Word text correction, low-confidence highlighting, audio normalization
- **Word text correction (#015)**: Double-click any word in the transcript editor to edit its text inline. Press Enter to commit, Escape to cancel. State is updated in both `words[]` and `segments[]` arrays (segment text recomposed from updated words). Pure frontend; no backend changes needed.
- **Low-confidence word highlighting (#012)**: Words with `confidence < threshold` (default 0.6, configurable in Settings panel) render with an orange dotted underline. Tooltip shows exact confidence percentage. Threshold is persisted in `localStorage` key `talkedit:confidenceThreshold`.
- **Audio normalization (#018)**: New backend endpoint `POST /audio/normalize` in `backend/routers/audio.py`. Two-pass FFmpeg `loudnorm` (measure then apply) implemented in `backend/services/audio_cleaner.py:normalize_audio()`. Falls back to single-pass if measurement fails. Frontend UI in Export panel: target selector (YouTube -14, Spotify -16, Broadcast -23, etc.) with "Normalize" button.
- **Store**: New `updateWordText(index, text)` action in `editorStore.ts` updates both `words[]` and recomputes `segments[].text`.
- **Settings panel**: New confidence threshold slider (0–1 range).
## Update Rules (Important)
When a task changes architecture, app wiring, commands, API shape, project schema, or major conventions, update this file before finishing.

View File

@ -6,13 +6,13 @@ Features are grouped by priority. Check off items as they are implemented.
## 🔴 Highest Impact Next — Conversion and retention
- [ ] [#015] **Word text correction** — allow editing the transcript text of a word without affecting its timing. Whisper gets homophones/proper nouns wrong constantly. Pure frontend state change; no backend needed.
- [x] [#015] **Word text correction** — double-click any word to edit its text in-place. Preserves timing and confidence. Pure frontend state change. (2026-05-04)
- [ ] [#013] **Re-transcribe selection** — if Whisper gets a section wrong, let the user select a word range and re-run transcription on just that segment (optionally with a different model or language).
- [ ] [#012] **Low-confidence word highlighting** — WhisperX already returns `confidence` per word. Words below a threshold (e.g. < 0.6) should be visually underlined or tinted so the user knows where to double-check.
- [x] [#012] **Low-confidence word highlighting** — words with `confidence < 0.6` (configurable in Settings) get an orange dotted underline. Hover shows exact confidence %. (2026-05-04)
- [ ] [#018] **Audio normalization / loudness targeting** — single "Normalize" button that targets a LUFS level (-14 for YouTube, -16 for Spotify). Backend: `ffmpeg -af loudnorm`. Very high value for podcasters, ~2–3 hours of work.
- [x] [#018] **Audio normalization / loudness targeting** — "Normalize" button in Export panel with LUFS target selector (-14 YouTube, -16 Spotify, -23 Broadcast). Backend: FFmpeg two-pass `loudnorm`. (2026-05-04)
- [ ] [#024] **Export to transcript text / SRT only** — some users just want a clean `.txt` or `.srt` of the edited transcript without rendering video.

View File

@ -11,7 +11,7 @@ from fastapi import APIRouter, HTTPException, Query, Request
from fastapi.responses import FileResponse
from pydantic import BaseModel
from services.audio_cleaner import clean_audio, detect_silence_ranges, is_deepfilter_available
from services.audio_cleaner import clean_audio, detect_silence_ranges, is_deepfilter_available, normalize_audio
logger = logging.getLogger(__name__)
router = APIRouter()
@ -164,3 +164,30 @@ async def get_waveform_audio(request: Request, path: str = Query(...)):
)
_waveform_cache[cache_key] = str(out_wav)
return FileResponse(str(out_wav), media_type="audio/wav")
class NormalizeRequest(BaseModel):
    """Request body for POST /audio/normalize."""

    # Absolute path to the source audio/video file on the local machine.
    input_path: str
    # Destination path; when omitted the service derives "<stem>_normalized.<ext>".
    output_path: Optional[str] = None
    # Target integrated loudness in LUFS (-14 YouTube, -16 Spotify, -23 broadcast).
    target_lufs: float = -14.0


@router.post("/audio/normalize")
async def normalize_audio_endpoint(req: NormalizeRequest):
    """Normalize audio loudness to a target LUFS level using FFmpeg loudnorm.

    Returns:
        JSON object with ``status``, the resolved ``output_path``, and the
        requested ``target_lufs``.

    Raises:
        HTTPException: 400 for out-of-range targets, 500 when FFmpeg fails.
    """
    # Reject nonsensical loudness targets before shelling out to FFmpeg.
    if req.target_lufs < -70 or req.target_lufs > 0:
        raise HTTPException(status_code=400, detail="target_lufs must be between -70 and 0")
    try:
        output = normalize_audio(
            req.input_path,
            req.output_path or "",
            target_lufs=req.target_lufs,
        )
        return {
            "status": "ok",
            "output_path": output,
            "target_lufs": req.target_lufs,
        }
    except Exception as e:
        logger.error(f"Audio normalization failed: {e}", exc_info=True)
        # Chain the original exception so server logs keep the root cause.
        raise HTTPException(status_code=500, detail=str(e)) from e

View File

@ -158,3 +158,125 @@ def detect_silence_ranges(input_path: str, min_silence_ms: int, silence_db: floa
silence_db,
)
return ranges
def normalize_audio(
    input_path: str,
    output_path: str = "",
    target_lufs: float = -14.0,
) -> str:
    """
    Normalize audio loudness to a target LUFS level using FFmpeg's loudnorm filter.

    Strategy: two-pass loudnorm — the first pass only measures the input's
    loudness, the second applies a linear correction using the measured values.
    If the measurement pass yields no usable JSON, falls back to single-pass
    (dynamic) normalization, which is less accurate but always available.

    Args:
        input_path: Path to the input audio/video file.
        output_path: Path for the normalized output. Auto-generated as
            "<stem>_normalized<suffix>" next to the input if empty.
        target_lufs: Target integrated loudness in LUFS.
            Common targets: -14 (YouTube), -16 (Spotify), -23 (broadcast).

    Returns:
        Path to the normalized output file.

    Raises:
        RuntimeError: If the FFmpeg encode pass exits with a non-zero status.
    """
    inp = Path(input_path)
    if not output_path:
        # with_stem keeps the original suffix, so video containers stay video.
        output_path = str(inp.with_stem(inp.stem + "_normalized"))

    # First pass: measure only. "-f null -" discards output; loudnorm prints
    # its measurement JSON to stderr when print_format=json is set.
    measure_cmd = [
        "ffmpeg", "-y",
        "-i", str(inp),
        "-af", f"loudnorm=I={target_lufs}:LRA=7:TP=-1.5:print_format=json",
        "-f", "null",
        "-",
    ]
    logger.info("Running loudnorm first pass (measurement): %s", " ".join(measure_cmd))
    measure_result = subprocess.run(measure_cmd, capture_output=True, text=True)
    if measure_result.returncode != 0:
        # Don't abort yet: the fallback path below gets a chance to succeed,
        # and if FFmpeg is truly broken it will fail there with a clear error.
        logger.warning(
            "loudnorm measurement pass exited with code %s",
            measure_result.returncode,
        )

    measured = _parse_loudnorm_measurement(measure_result.stderr)
    if not measured:
        logger.warning(
            "loudnorm measurement failed or produced no output; "
            "falling back to single-pass normalization"
        )
        # Single-pass fallback: loudnorm applies a dynamic correction without
        # the measured_* hints. "-c:v copy" keeps any video stream untouched.
        cmd = [
            "ffmpeg", "-y",
            "-i", str(inp),
            "-af", f"loudnorm=I={target_lufs}:LRA=7:TP=-1.5",
            "-c:v", "copy",
            output_path,
        ]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            raise RuntimeError(f"Audio normalization failed: {result.stderr[-300:]}")
        logger.info("Single-pass normalized audio saved to %s", output_path)
        return output_path

    # Second pass: apply normalization using the measured values. Defaults
    # are only used if a key is missing from the (normally complete) JSON.
    input_i = measured.get("input_i", target_lufs)
    input_lra = measured.get("input_lra", 7.0)
    input_tp = measured.get("input_tp", -1.5)
    input_thresh = measured.get("input_thresh", -30.0)
    offset = measured.get("target_offset", 0.0)
    apply_cmd = [
        "ffmpeg", "-y",
        "-i", str(inp),
        "-af",
        (
            # linear=true preserves dynamics (one static gain change) instead
            # of loudnorm's default dynamic mode.
            f"loudnorm=I={target_lufs}:LRA=7:TP=-1.5:"
            f"measured_I={input_i}:measured_LRA={input_lra}:"
            f"measured_TP={input_tp}:measured_thresh={input_thresh}:"
            f"offset={offset}:linear=true:print_format=summary"
        ),
        "-c:v", "copy",
        output_path,
    ]
    logger.info("Running loudnorm second pass (apply): %s", " ".join(apply_cmd))
    result = subprocess.run(apply_cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"Audio normalization (apply) failed: {result.stderr[-300:]}")
    logger.info(
        "Normalized audio saved to %s (target=%s LUFS, measured_I=%s)",
        output_path,
        target_lufs,
        input_i,
    )
    return output_path
def _parse_loudnorm_measurement(stderr_output: str) -> dict:
"""Parse loudnorm JSON measurement output from FFmpeg stderr."""
import json
# loudnorm prints JSON block between "Parsed_loudnorm" lines
lines = stderr_output.split("\n")
json_lines = []
in_json = False
for line in lines:
if "Parsed_loudnorm" in line and "}" in line:
# Single-line JSON
try:
start = line.index("{")
end = line.rindex("}") + 1
return json.loads(line[start:end])
except (ValueError, json.JSONDecodeError):
continue
if "{" in line and not in_json:
in_json = True
if in_json:
json_lines.append(line)
if in_json and "}" in line:
in_json = False
break
if json_lines:
try:
return json.loads("".join(json_lines))
except json.JSONDecodeError:
pass
return {}

View File

@ -1,13 +1,16 @@
import { useState, useCallback } from 'react';
import { useEditorStore } from '../store/editorStore';
import { Download, Loader2, Zap, Cog, Info } from 'lucide-react';
import { Download, Loader2, Zap, Cog, Info, Volume2 } from 'lucide-react';
import type { ExportOptions } from '../types/project';
export default function ExportDialog() {
const { videoPath, words, cutRanges, muteRanges, gainRanges, speedRanges, globalGainDb, isExporting, exportProgress, backendUrl, setExporting, getKeepSegments } =
const { videoPath, words, cutRanges, muteRanges, gainRanges, speedRanges, globalGainDb, isExporting, exportProgress, backendUrl, setExporting, setExportedAudioPath, getKeepSegments } =
useEditorStore();
const hasCuts = cutRanges.length > 0;
const [isNormalizing, setIsNormalizing] = useState(false);
const [normalizeTarget, setNormalizeTarget] = useState(-14);
const [normalizeResult, setNormalizeResult] = useState<string | null>(null);
const [options, setOptions] = useState<Omit<ExportOptions, 'outputPath'>>({
mode: 'fast',
@ -78,6 +81,41 @@ export default function ExportDialog() {
}
}, [videoPath, options, backendUrl, setExporting, getKeepSegments, cutRanges, muteRanges, gainRanges, speedRanges, globalGainDb, words]);
// POST the loaded media to the backend normalizer with the selected LUFS
// target. Backend derives the output path (empty output_path); on success the
// normalized file is registered in the store and a summary line is shown.
const handleNormalize = useCallback(async () => {
  if (!videoPath) return;
  setIsNormalizing(true);
  setNormalizeResult(null);
  try {
    const res = await fetch(`${backendUrl}/audio/normalize`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        input_path: videoPath,
        target_lufs: normalizeTarget,
        output_path: '',
      }),
    });
    if (!res.ok) {
      // Prefer the FastAPI error body's `detail` over the HTTP status text.
      let detail = res.statusText;
      try {
        const body = await res.json();
        if (body?.detail) detail = String(body.detail);
      } catch {
        // Keep statusText fallback
      }
      throw new Error(detail);
    }
    const data = await res.json();
    setExportedAudioPath(data.output_path);
    // Split on both separators so Windows backslash paths also reduce to a
    // bare filename (previously only '/' was handled).
    const fileName = String(data.output_path).split(/[\\/]/).pop() || 'done';
    setNormalizeResult(`Normalized to ${data.target_lufs} LUFS → ${fileName}`);
  } catch (err) {
    console.error('Normalize error:', err);
    setNormalizeResult(`Error: ${err instanceof Error ? err.message : 'Normalization failed'}`);
  } finally {
    setIsNormalizing(false);
  }
}, [videoPath, backendUrl, normalizeTarget, setExportedAudioPath]);
return (
<div className="p-4 space-y-5">
<h3 className="text-sm font-semibold">Export Video</h3>
@ -129,6 +167,46 @@ export default function ExportDialog() {
]}
/>
{/* Audio normalization */}
<div className="space-y-2 border-t border-editor-border pt-3">
<h4 className="text-xs font-semibold flex items-center gap-1.5">
<Volume2 className="w-3.5 h-3.5" />
Audio Normalization
</h4>
<p className="text-[10px] text-editor-text-muted leading-relaxed">
Normalize loudness to a target LUFS level. YouTube uses <strong>-14 LUFS</strong>,
Spotify uses <strong>-16 LUFS</strong>, broadcast uses <strong>-23 LUFS</strong>.
</p>
<div className="flex items-center gap-2">
<select
value={normalizeTarget}
onChange={(e) => setNormalizeTarget(Number(e.target.value))}
className="flex-1 px-2 py-1.5 text-xs bg-editor-surface border border-editor-border rounded focus:outline-none focus:border-editor-accent"
>
<option value={-14}>YouTube (-14 LUFS)</option>
<option value={-16}>Spotify (-16 LUFS)</option>
<option value={-23}>Broadcast (-23 LUFS)</option>
<option value={-11}>Loud (-11 LUFS)</option>
<option value={-9}>Very Loud (-9 LUFS)</option>
</select>
<button
onClick={handleNormalize}
disabled={isNormalizing || !videoPath}
className="flex items-center gap-1.5 px-3 py-1.5 text-xs rounded bg-editor-accent/20 text-editor-accent hover:bg-editor-accent/30 disabled:opacity-40 transition-colors"
>
{isNormalizing ? (
<Loader2 className="w-3 h-3 animate-spin" />
) : (
<Volume2 className="w-3 h-3" />
)}
{isNormalizing ? 'Normalizing...' : 'Normalize'}
</button>
</div>
{normalizeResult && (
<p className="text-[10px] text-editor-success">{normalizeResult}</p>
)}
</div>
{/* Audio enhancement */}
<label className="flex items-center gap-2 cursor-pointer">
<input

View File

@ -7,6 +7,18 @@ import { Bot, Cloud, Brain, RefreshCw } from 'lucide-react';
export default function SettingsPanel() {
const { providers, defaultProvider, setProviderConfig, setDefaultProvider } = useAIStore();
const { backendUrl, zonePreviewPaddingSeconds, setZonePreviewPaddingSeconds } = useEditorStore();
// localStorage key shared with TranscriptEditor's low-confidence rendering.
const CONFIDENCE_THRESHOLD_KEY = 'talkedit:confidenceThreshold';
const [confidenceThreshold, setConfidenceThresholdState] = useState(() => {
  // NOTE: Number(null) is 0 (a finite number), so a missing key must be
  // handled explicitly — otherwise first-run users get threshold 0 and
  // low-confidence highlighting is silently disabled instead of defaulting
  // to 0.6.
  if (typeof window === 'undefined') return 0.6;
  const raw = window.localStorage.getItem(CONFIDENCE_THRESHOLD_KEY);
  if (raw === null || raw === '') return 0.6;
  const parsed = Number(raw);
  return Number.isFinite(parsed) ? parsed : 0.6;
});
const setConfidenceThreshold = (value: number) => {
  // Clamp to the valid 0–1 confidence range before persisting.
  const clamped = Math.max(0, Math.min(1, value));
  setConfidenceThresholdState(clamped);
  if (typeof window !== 'undefined') {
    window.localStorage.setItem(CONFIDENCE_THRESHOLD_KEY, String(clamped));
  }
};
const [ollamaModels, setOllamaModels] = useState<string[]>([]);
const [loadingModels, setLoadingModels] = useState(false);
@ -66,6 +78,40 @@ export default function SettingsPanel() {
</div>
</ProviderSection>
{/* Confidence threshold */}
<div className="space-y-2">
<label className="text-xs text-editor-text-muted font-medium">Low-Confidence Word Threshold</label>
<p className="text-[10px] text-editor-text-muted leading-relaxed">
Words with confidence below this value are highlighted with an orange dotted underline.
Whisper often gets homophones and proper nouns wrong at low confidence.
</p>
<div className="flex items-center gap-2">
<input
type="range"
min={0}
max={1}
step={0.05}
value={confidenceThreshold}
onChange={(e) => setConfidenceThreshold(Number(e.target.value))}
className="flex-1 h-1.5"
/>
<input
type="number"
min={0}
max={1}
step={0.05}
value={confidenceThreshold}
onChange={(e) => setConfidenceThreshold(Math.max(0, Math.min(1, Number(e.target.value) || 0)))}
className="w-16 px-2 py-1 bg-editor-bg border border-editor-border rounded-lg text-xs text-editor-text focus:outline-none focus:border-editor-accent"
/>
</div>
<div className="flex items-center justify-between text-[10px]">
<span className="text-editor-text-muted">Show all</span>
<span className="font-medium text-editor-text">{confidenceThreshold.toFixed(2)}</span>
<span className="text-editor-text-muted">Strict</span>
</div>
</div>
{/* Default provider selector */}
<div className="space-y-2">
<label className="text-xs text-editor-text-muted font-medium">Default AI Provider</label>

View File

@ -49,6 +49,10 @@ export default function TranscriptEditor({
const [searchQuery, setSearchQuery] = useState('');
const [activeMatchIdx, setActiveMatchIdx] = useState(0);
const searchInputRef = useRef<HTMLInputElement | null>(null);
const updateWordText = useEditorStore((s) => s.updateWordText);
const [editingWordIndex, setEditingWordIndex] = useState<number | null>(null);
const [editText, setEditText] = useState('');
const editInputRef = useRef<HTMLInputElement | null>(null);
const selectedSet = useMemo(() => new Set(selectedWordIndices), [selectedWordIndices]);
const matchIndices = useMemo(() => {
@ -224,6 +228,61 @@ export default function TranscriptEditor({
[setSelectedWordIndices],
);
// Begin inline editing for the word at `index`: remember which word is being
// edited, seed the input with its current text, and focus/select the input on
// the next frame (the input only exists after this state change re-renders).
const startEditing = useCallback((index: number) => {
  const word = words[index];
  if (!word) return;
  setEditingWordIndex(index);
  setEditText(word.word);
  requestAnimationFrame(() => {
    editInputRef.current?.focus();
    editInputRef.current?.select();
  });
}, [words]);

// Commit the pending edit: push to the store only when the trimmed text is
// non-empty and actually differs from the current word, then exit edit mode.
const commitEdit = useCallback(() => {
  if (editingWordIndex === null) return;
  const trimmed = editText.trim();
  if (trimmed && trimmed !== words[editingWordIndex]?.word) {
    updateWordText(editingWordIndex, trimmed);
  }
  setEditingWordIndex(null);
  setEditText('');
}, [editingWordIndex, editText, words, updateWordText]);

// Abandon the edit without touching the store.
const cancelEdit = useCallback(() => {
  setEditingWordIndex(null);
  setEditText('');
}, []);

// Double-click starts editing, but not while a range-marking mode
// (cut/mute/gain/speed) is active, where clicks have zone semantics instead.
const handleWordDoubleClick = useCallback((index: number) => {
  if (cutMode || muteMode || gainMode || speedMode) return;
  startEditing(index);
}, [cutMode, muteMode, gainMode, speedMode, startEditing]);

// Focus edit input when it appears
// NOTE(review): overlaps with the requestAnimationFrame in startEditing —
// presumably kept as a fallback for when the rAF fires before the input
// mounts; confirm before removing either.
useEffect(() => {
  if (editingWordIndex !== null && editInputRef.current) {
    editInputRef.current.focus();
    editInputRef.current.select();
  }
}, [editingWordIndex]);

// Global key handler for edit mode
// Window-level listener so Enter (commit) / Escape (cancel) work even if
// focus has left the inline input; no-op when nothing is being edited.
useEffect(() => {
  const onKeyDown = (e: KeyboardEvent) => {
    if (editingWordIndex === null) return;
    if (e.key === 'Enter') {
      e.preventDefault();
      commitEdit();
    } else if (e.key === 'Escape') {
      e.preventDefault();
      cancelEdit();
    }
  };
  window.addEventListener('keydown', onKeyDown);
  return () => window.removeEventListener('keydown', onKeyDown);
}, [editingWordIndex, commitEdit, cancelEdit]);
const cutSelectedWords = useCallback(() => {
if (selectedWordIndices.length === 0) return;
const sorted = [...selectedWordIndices].sort((a, b) => a - b);
@ -319,15 +378,25 @@ export default function TranscriptEditor({
const isSearchMatch = matchSet.has(globalIndex);
const isActiveSearchMatch = matchIndices.length > 0 && matchIndices[safeActiveMatchIdx] === globalIndex;
const isEditing = globalIndex === editingWordIndex;
// Low-confidence highlighting (threshold key shared with SettingsPanel).
const CONFIDENCE_THRESHOLD_KEY = 'talkedit:confidenceThreshold';
// Number(null) is 0, so a missing key must be handled explicitly — otherwise
// the threshold silently becomes 0 instead of the 0.6 default.
const storedRaw = typeof window !== 'undefined' ? window.localStorage.getItem(CONFIDENCE_THRESHOLD_KEY) : null;
const parsedThreshold = storedRaw === null || storedRaw === '' ? NaN : Number(storedRaw);
const confidenceThreshold = Number.isFinite(parsedThreshold) ? parsedThreshold : 0.6;
// Only underline words that carry a confidence value and aren't already
// rendered inside a cut/mute/gain/speed zone.
const isLowConfidence = word.confidence > 0 && word.confidence < confidenceThreshold && !cutRange && !muteRange && !gainRange && !speedRange;
const confidencePct = word.confidence > 0 ? Math.round(word.confidence * 100) : null;
return (
<span
key={globalIndex}
id={`word-${globalIndex}`}
data-word-index={globalIndex}
title={`${word.start.toFixed(2)}s — Ctrl+click to seek`}
title={`${word.start.toFixed(2)}s — confidence: ${confidencePct !== null ? confidencePct + '%' : 'N/A'}${isLowConfidence ? ' ⚠️ Low confidence' : ''} — Ctrl+click to seek, double-click to edit`}
onMouseDown={(e) => handleWordMouseDown(globalIndex, e)}
onMouseEnter={() => handleWordMouseEnter(globalIndex)}
onMouseLeave={() => setHoveredWordIndex(null)}
onDoubleClick={() => handleWordDoubleClick(globalIndex)}
className={`
relative px-[2px] py-[1px] rounded cursor-pointer transition-colors
${cutRange ? 'bg-red-500/20 text-red-100' : ''}
@ -343,9 +412,21 @@ export default function TranscriptEditor({
${isSelected && !cutRange && !muteRange && !gainRange && !speedRange ? 'bg-editor-word-selected text-white' : ''}
${isActive && !isSelected && !cutRange && !muteRange && !gainRange && !speedRange ? 'bg-editor-accent/20 text-editor-accent' : ''}
${isHovered && !isSelected && !isActive && !cutRange && !muteRange && !gainRange && !speedRange ? 'bg-editor-word-hover' : ''}
${isLowConfidence ? 'border-b border-dashed border-orange-400/60' : ''}
`}
>
{word.word}{' '}
{isEditing ? (
<input
ref={editInputRef}
value={editText}
onChange={(e) => setEditText(e.target.value)}
onBlur={commitEdit}
className="w-24 px-1 py-0 text-xs bg-editor-bg border border-editor-accent rounded text-editor-text focus:outline-none"
style={{ minWidth: `${Math.max(word.word.length * 8, 48)}px` }}
/>
) : (
<>{word.word}{' '}</>
)}
{(cutRange || muteRange || gainRange || speedRange) && isHovered && (
<button
onClick={(e) => {
@ -367,7 +448,7 @@ export default function TranscriptEditor({
</div>
);
},
[segments, selectedSet, matchSet, matchIndices, safeActiveMatchIdx, activeWordIndex, hoveredWordIndex, handleWordMouseDown, handleWordMouseEnter, setHoveredWordIndex, getCutRangeForWord, getMuteRangeForWord, getGainRangeForWord, getSpeedRangeForWord, removeCutRange, removeMuteRange, removeGainRange, removeSpeedRange, zoneDragRange, cutMode, muteMode, gainMode, speedMode],
[segments, selectedSet, matchSet, matchIndices, safeActiveMatchIdx, activeWordIndex, hoveredWordIndex, handleWordMouseDown, handleWordMouseEnter, setHoveredWordIndex, getCutRangeForWord, getMuteRangeForWord, getGainRangeForWord, getSpeedRangeForWord, removeCutRange, removeMuteRange, removeGainRange, removeSpeedRange, zoneDragRange, cutMode, muteMode, gainMode, speedMode, editingWordIndex, editText, editInputRef, handleWordDoubleClick, commitEdit, setEditText],
);
return (

View File

@ -11,7 +11,7 @@ import { invoke } from '@tauri-apps/api/core';
import { open, save } from '@tauri-apps/plugin-dialog';
import { readTextFile, writeTextFile } from '@tauri-apps/plugin-fs';
const backendPort = import.meta.env.VITE_BACKEND_PORT || '8642';
const backendPort = import.meta.env.VITE_BACKEND_PORT || '8000';
const backendUrl = `http://127.0.0.1:${backendPort}`;
const VIDEO_FILTERS = [

View File

@ -67,6 +67,7 @@ interface EditorActions {
setHoveredWordIndex: (index: number | null) => void;
deleteSelectedWords: () => void;
deleteWordRange: (startIndex: number, endIndex: number) => void;
updateWordText: (index: number, text: string) => void;
addCutRange: (start: number, end: number, trimGroupId?: string) => void;
addMuteRange: (start: number, end: number) => void;
addGainRange: (start: number, end: number, gainDb: number) => void;
@ -262,6 +263,33 @@ export const useEditorStore = create<EditorState & EditorActions>()(
get().addCutRange(words[startIndex].start, words[endIndex].end);
},
// Replace the text of the word at `index`, keeping its timing and confidence
// intact. The change is mirrored into the owning segment's words[] and that
// segment's display text is recomposed from its updated words.
updateWordText: (index, text) => {
  const { words, segments } = get();
  if (index < 0 || index >= words.length) return;

  const nextWords = words.map((w, i) => (i === index ? { ...w, word: text } : w));

  // Walk the segments while tracking each one's starting offset into the
  // flat words[] array; rebuild only the segment that owns `index`.
  let offset = 0;
  const nextSegments = segments.map((seg) => {
    const segStart = offset;
    offset += seg.words.length;
    if (index < segStart || index >= segStart + seg.words.length) {
      return seg;
    }
    const local = index - segStart;
    const segWords = seg.words.map((w, i) => (i === local ? { ...w, word: text } : w));
    return {
      ...seg,
      words: segWords,
      text: segWords.map((w) => w.word).join(' '),
    };
  });

  set({ words: nextWords, segments: nextSegments });
},
addCutRange: (start, end, trimGroupId) => {
const { cutRanges } = get();
const newRange: CutRange = {