From c7445206cc05bbe2a918d23eeec7f8e0493023ab Mon Sep 17 00:00:00 2001
From: dillonj
Date: Fri, 3 Apr 2026 10:25:48 -0600
Subject: [PATCH] added distil models

---
 FEATURES.md                    | 84 ++++++++++++++++++++++++++++++++++
 frontend/src/App.tsx           | 48 ++++++++++++++++---
 src-tauri/src/transcription.rs |  1 +
 3 files changed, 126 insertions(+), 7 deletions(-)
 create mode 100644 FEATURES.md

diff --git a/FEATURES.md b/FEATURES.md
new file mode 100644
index 0000000..891b7b5
--- /dev/null
+++ b/FEATURES.md
@@ -0,0 +1,84 @@
+# TalkEdit — Feature Roadmap
+
+Features are grouped by priority. Check off items as they are implemented.
+
+---
+
+## 🔴 High Priority — Core editing gaps
+
+- [ ] **Silence / pause trimmer** — detect and auto-remove pauses longer than X ms. One backend endpoint (`/audio/remove-silence`) + a button in the UI. Saves enormous time in podcast/interview editing.
+
+- [ ] **Volume / gain control** — per-selection or global audio gain slider. Every editor has this. Descript users constantly complain it's missing. Backend: `ffmpeg -af volume=XdB`.
+
+- [ ] **Speed adjustment** — slow down or speed up a selection or the whole clip. Backend: `ffmpeg -filter:v setpts` + `atempo`. Common use case: slightly speed up boring sections.
+
+- [ ] **Cut preview** — before committing a delete, play what the audio will sound like with that section removed (pre-listen across the edit point). Pure frontend using Web Audio API — splice the AudioBuffer and play the join.
+
+- [ ] **Timeline shows output length** — deleted regions should visually collapse (or show as narrow gaps) so the user sees the *output* duration, not just the source duration.
+
+---
+
+## 🟡 Medium Priority — Widely expected features
+
+- [ ] **Transcript search (Ctrl+F)** — find words/phrases in the transcript and highlight matches. Pure frontend. Critical for long-form content. Jump between matches with Enter.
+
+- [ ] **Mark In / Out + delete (I / O keys)** — keyboard shortcuts to mark a time range on the timeline, then delete it. Faster than click-dragging words. Store the in/out points in state, `Delete` removes them.
+
+- [ ] **Low-confidence word highlighting** — WhisperX already returns `confidence` per word. Words below a threshold (e.g. < 0.6) should be visually underlined or tinted so the user knows where to double-check.
+
+- [ ] **Re-transcribe selection** — if Whisper gets a section wrong, let the user select a word range and re-run transcription on just that segment (optionally with a different model or language).
+
+- [ ] **Word text correction** — allow editing the transcript text of a word without affecting its timing. Whisper gets homophones/proper nouns wrong constantly. Pure frontend state change; no backend needed.
+
+- [ ] **Named timeline markers** — drop named marker pins on the waveform (like Resolve markers). Store as `{ id, time, label, color }` in the project. Rendered as colored triangles on the timeline canvas.
+
+- [ ] **Chapters** — group markers into named chapter ranges. Useful for podcasts and lectures. Exportable as YouTube chapter timestamps in the description.
+
+---
+
+## 🟢 Lower Priority — Differentiating / power features
+
+- [ ] **Audio normalization / loudness targeting** — single "Normalize" button that targets a LUFS level (-14 for YouTube, -16 for Spotify). Backend: `ffmpeg -af loudnorm`. Very high value for podcasters, ~2–3 hours of work.
+
+- [ ] **Background music track** — a second audio track for background music with volume ducking. Major gap in Descript that TalkEdit could own. Backend: `ffmpeg` amix + `asendcmd` for auto-ducking.
+
+- [ ] **Video zoom / punch-in** — scale and position the video (crop, zoom, pan). Used constantly on talking-head videos for emphasis. Backend: `ffmpeg -vf crop/scale/zoompan`.
+
+- [ ] **Multi-clip / append** — load a second video and append it to the timeline.
+Even without a full multi-track timeline, "append clip" is a heavily used workflow.
+
+- [ ] **Clip thumbnail strip** — video frame thumbnails along the timeline so users can navigate visually, not only by waveform. Backend: `ffmpeg` thumbnail extraction at regular intervals.
+
+- [ ] **Batch silence removal** — full-file scan + remove all pauses above threshold in one click. Distinct from the manual trimmer above; this is a "fix the whole file" operation.
+
+- [ ] **Export to transcript text / SRT only** — some users just want a clean `.txt` or `.srt` of the edited transcript without rendering video.
+
+---
+
+## 💡 TalkEdit competitive advantages to lean into
+
+These aren't features to build — they're things to make more visible in the UI and README:
+
+- **100% offline / no account required** — CapCut requires login and sends data to servers. Descript is cloud-first. TalkEdit never leaves the machine.
+- **Local AI models** — Ollama support means no API costs and no data leaving the device.
+- **Word-level precision** — editing by deleting words (not dragging razor cuts) is faster for talking-head content than any timeline-based editor.
+- **Works on long files** — virtualized transcript + chunked waveform handles 1hr+ content that bogs down CapCut.
+
+---
+
+## ✅ Already Implemented
+
+- Word-level transcript editing (select, drag, shift-click, delete)
+- Ctrl+click word → seek timeline to that position
+- Waveform timeline with zoom (Ctrl+scroll), scroll, drag-to-scrub playhead
+- Auto-scroll waveform when playhead goes off-screen
+- AI filler word detection and removal (Ollama / OpenAI / Claude)
+- AI clip suggestions for social media
+- Noise reduction (DeepFilterNet or FFmpeg ANLMDN)
+- Export: fast stream-copy or full reencode (MP4/MOV/WebM, 720p/1080p/4K)
+- Captions: SRT, VTT, ASS burn-in with font/color/position options
+- Speaker diarization
+- Project save / load (.aive JSON format)
+- Undo / redo (100-level history via Zundo)
+- Multi-format input (MP4, MKV, MOV, AVI, WebM, M4A)
+- Keyboard shortcuts (Space, J/K/L, arrows, Ctrl+Z/Shift+Z, Ctrl+S, Ctrl+E)
+- Settings panel: AI provider config (Ollama, OpenAI, Claude)
diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx
index a018e67..c26c581 100644
--- a/frontend/src/App.tsx
+++ b/frontend/src/App.tsx
@@ -113,7 +113,24 @@ export default function App() {
         throw new Error('Transcription not available');
       }
       // Step 1: ensure model is downloaded (may take a while on first run)
-      const modelLabel = whisperModel === 'tiny' ? '~75 MB' : whisperModel === 'base' ? '~140 MB' : '~460 MB';
+      const MODEL_SIZES: Record<string, string> = {
+        'tiny': '~75 MB',
+        'tiny.en': '~75 MB',
+        'base': '~140 MB',
+        'base.en': '~140 MB',
+        'small': '~460 MB',
+        'small.en': '~460 MB',
+        'medium': '~1.5 GB',
+        'medium.en': '~1.5 GB',
+        'large': '~2.9 GB',
+        'large-v2': '~2.9 GB',
+        'large-v3': '~2.9 GB',
+        'large-v3-turbo': '~1.6 GB',
+        'distil-large-v3': '~1.5 GB',
+        'distil-medium.en': '~750 MB',
+        'distil-small.en': '~190 MB',
+      };
+      const modelLabel = MODEL_SIZES[whisperModel] ?? 'unknown size';
       setTranscribing(true, 5, `Downloading ${whisperModel} model (${modelLabel})...`);
       await window.electronAPI.ensureModel(whisperModel);
@@ -145,19 +162,36 @@ export default function App() {
               {/* Whisper model selector */}
- +
+                <div className="model-hint">
+                  For noisy/YouTube videos use large-v3 or large-v3-turbo.
+                  English-only models are ~10% faster and more accurate for English content.
+                </div>
{IS_ELECTRON ? (
diff --git a/src-tauri/src/transcription.rs b/src-tauri/src/transcription.rs index 9fece24..0c95df8 100644 --- a/src-tauri/src/transcription.rs +++ b/src-tauri/src/transcription.rs @@ -46,6 +46,7 @@ pub fn transcribe_audio( // Run Python script let output = Command::new(python_exe) .args(&args) + .env("PYTHONPATH", crate::paths::project_root().join(".venv312").join("lib").join("python3.12").join("site-packages")) .output() .map_err(|e| format!("Failed to run Python script: {}", e))?;