Compare commits

83 Commits

Author SHA1 Message Date
3093b41033 Bundle FFmpeg as Tauri sidecar, download in CI
Some checks failed
CI / rust (push) Failing after 1m21s
CI / frontend (push) Successful in 24s
CI / python (push) Failing after 6s
Validate All / validate-all (push) Failing after 4m51s
Release / linux (push) Failing after 5m32s
Release / windows (push) Has been cancelled
2026-05-07 11:23:34 -06:00
a64ae78833 Update app icons to custom waveform SVG
Some checks failed
CI / rust (push) Failing after 2m46s
CI / frontend (push) Successful in 36s
CI / python (push) Failing after 8s
Validate All / validate-all (push) Failing after 4m53s
2026-05-07 02:58:45 -06:00
b558ef8a7f Simplify release workflow: deb, rpm, msi
Some checks failed
CI / rust (push) Failing after 1m38s
CI / frontend (push) Successful in 29s
CI / python (push) Failing after 11s
Validate All / validate-all (push) Failing after 4m52s
Release / linux (push) Failing after 5m57s
Release / windows (push) Failing after 3m48s
2026-05-07 02:15:22 -06:00
f1e6c010eb Add AppImage to release bundles
Some checks failed
CI / python (push) Failing after 1m44s
Validate All / validate-all (push) Has been cancelled
Release / build (appimage, ubuntu-24.04, x86_64-unknown-linux-gnu) (push) Has been cancelled
Release / build (archlinux, ubuntu-24.04, x86_64-unknown-linux-gnu) (push) Has been cancelled
Release / build (deb, ubuntu-24.04, x86_64-unknown-linux-gnu) (push) Has been cancelled
Release / build (msi, windows-latest, x86_64-pc-windows-msvc) (push) Has been cancelled
Release / build (rpm, ubuntu-24.04, x86_64-unknown-linux-gnu) (push) Has been cancelled
CI / frontend (push) Failing after 14m32s
CI / rust (push) Failing after 14m49s
2026-05-07 01:35:42 -06:00
124f215a0a Add local LLM router and service
Some checks failed
CI / rust (push) Has been cancelled
CI / frontend (push) Has been cancelled
CI / python (push) Has been cancelled
Validate All / validate-all (push) Has been cancelled
Release / build (archlinux, ubuntu-24.04, x86_64-unknown-linux-gnu) (push) Has been cancelled
Release / build (deb, ubuntu-24.04, x86_64-unknown-linux-gnu) (push) Has been cancelled
Release / build (msi, windows-latest, x86_64-pc-windows-msvc) (push) Has been cancelled
Release / build (rpm, ubuntu-24.04, x86_64-unknown-linux-gnu) (push) Has been cancelled
2026-05-07 01:32:19 -06:00
1993aabeac Add release workflow: .deb, .rpm, .pkg.tar.zst, .msi
Some checks failed
CI / rust (push) Has been cancelled
CI / frontend (push) Has been cancelled
CI / python (push) Has been cancelled
Validate All / validate-all (push) Has been cancelled
2026-05-07 01:25:50 -06:00
573ac9c9f5 Update Ed25519 keypair for license signing 2026-05-07 00:25:25 -06:00
5d52c8aec5 AI editing now requires Business tier, remove lifetime updates 2026-05-06 23:39:50 -06:00
8bd1ad5b69 Gate AI editing features behind license (trial users no longer get AI editing) 2026-05-06 23:21:45 -06:00
850b373d42 removed home 2026-05-06 23:11:00 -06:00
2212d7b265 features 2026-05-06 16:47:54 -06:00
813877a7b4 updated features and docs 2026-05-06 16:15:38 -06:00
e4484a57f9 improve home screen 2026-05-06 16:05:04 -06:00
10437c02ca added tests 2026-05-06 14:40:58 -06:00
4004312994 more stuff to improve robustness 2026-05-06 14:25:23 -06:00
9a301fe2a2 robustness plan 2026-05-06 13:18:53 -06:00
6ac1d68887 help menu 2026-05-06 13:00:57 -06:00
acf7f2e64c more polish 2026-05-06 12:15:46 -06:00
a96e42c9f9 improved tooltips 2026-05-06 11:41:32 -06:00
fd6697b48e polishing 2026-05-06 10:53:27 -06:00
09ebcbc9ec added marketing plan 2026-05-06 02:33:18 -06:00
88cd9a21d0 plans and features 2026-05-06 02:29:10 -06:00
91217f6db0 added free trial timer at welcome screen 2026-05-06 01:43:55 -06:00
835719a907 added licensing stuff and free trial timer 2026-05-06 01:35:42 -06:00
810957747b clean up of features 2026-05-05 23:31:18 -06:00
4d4dfa7f7c implemented the lower priority features; haven't tested them 2026-05-05 20:46:55 -06:00
cde635a660 improved chapters/markers 2026-05-05 12:29:25 -06:00
21e4255325 verified hotkeys 2026-05-05 10:22:35 -06:00
1678d28db7 able to re-transcribe 2026-05-04 23:54:14 -06:00
137dc80cde features update 2026-05-04 19:01:11 -06:00
dd4ce58920 fixed dropdown bar visibility 2026-05-04 18:39:36 -06:00
5758401dda export works 2026-05-04 17:43:00 -06:00
90b1999a57 implemented 15,12,18 didn't check 18 2026-05-04 16:37:25 -06:00
0c7a4c94c2 trying to fix export issue and waveform load 2026-04-15 21:51:05 -06:00
168676a9e9 improved feature 11 and UI 2026-04-15 21:25:47 -06:00
3fa67383c4 feature 10,11 2026-04-15 20:57:43 -06:00
f121d71f5f added save as 2026-04-15 20:51:24 -06:00
af8e0cf6eb zone previews 2026-04-15 20:27:24 -06:00
4d3d8a2218 speed zones work now 2026-04-15 20:17:05 -06:00
b7a795f986 UI improvements, moved file name and moved buttons left 2026-04-15 19:54:39 -06:00
7479acd3ee forgot to add stuff 2026-04-15 18:02:25 -06:00
17874587a4 improved zone handling 2026-04-15 18:00:34 -06:00
84edddded8 removed electron 2026-04-15 17:40:27 -06:00
48d761c713 defaults to project folders; examining zones 2026-04-15 17:31:41 -06:00
024b9bd806 ai tools finished 2026-04-15 17:13:56 -06:00
d11e26cf2d improved tools for ai 2026-04-15 16:36:21 -06:00
4f90750497 volume panel; copilot instructions 2026-04-15 16:10:35 -06:00
0df967507f able to process audio with different model; new project button 2026-04-11 19:42:30 -06:00
b8ec396ebd able to modify trim zones 2026-04-11 19:13:04 -06:00
140b7a5319 fixed error 2026-04-09 01:50:19 -06:00
1d17a8f19a trying to fix bug 2026-04-09 01:36:28 -06:00
f9cd2bf579 gitignore cache 2026-04-08 21:19:52 -06:00
d80ff847d8 silence trimmer 2026-04-03 12:05:44 -06:00
8a7c94d594 delete key can remove zones 2026-04-03 11:38:58 -06:00
0237d685e5 able to drag edges of zones 2026-04-03 11:36:08 -06:00
585262c3e7 added cut and mute zones 2026-04-03 11:14:31 -06:00
d7bc6ea74d added split audio 2026-04-03 10:46:49 -06:00
f0568ed267 darker model name text 2026-04-03 10:46:26 -06:00
7c8c74d04d removed cutscript 2026-04-03 10:35:07 -06:00
addd87c45b Remove obsolete open-cutscript script 2026-04-03 10:34:04 -06:00
bb9ac53ae5 Remove CutScript submodule - can be cloned separately from https://github.com/DataAnts-AI/CutScript.git 2026-04-03 10:33:56 -06:00
c7445206cc added distil models 2026-04-03 10:25:48 -06:00
ea3f1d2b23 close sh;able to save/load projects 2026-03-30 18:36:41 -06:00
246d816f84 added close; fixed some issues 2026-03-28 15:09:56 -06:00
2ffc406b10 changed to python312 2026-03-28 12:26:45 -06:00
4a857d8cbf added api for ai; got backend working 2026-03-26 23:39:31 -06:00
164b2f87d4 got cpu based backend working; trying python/gpu solution bc faster probs 2026-03-26 00:58:57 -06:00
00ee076baa frontend changes 2026-03-25 01:41:40 -06:00
b4bcb8f3f2 i think i got step one working 2026-03-25 01:22:30 -06:00
4230ae6cb9 added features doc 2026-03-25 00:11:35 -06:00
c01db38eb3 update files 2026-03-24 23:56:08 -06:00
d134e4ab27 add CutScript submodule 2026-03-24 23:53:59 -06:00
a864b562ae initial commit 2026-03-24 23:53:59 -06:00
e5c47e31b3 Audio 2026-03-06 21:49:22 +05:30
78d34133ad Add image to README for visual enhancement
Added an image to the README to enhance visual appeal.
2026-03-03 14:45:36 -05:00
33cca5f552 Initial CutScript release - Open-source AI-powered text-based video editor
CutScript is a local-first, Descript-like video editor where you edit video by editing text.
Delete a word from the transcript and it's cut from the video.

Features:
- Word-level transcription with WhisperX
- Text-based video editing with undo/redo
- AI filler word removal (Ollama/OpenAI/Claude)
- AI clip creation for shorts
- Waveform timeline with virtualized transcript
- FFmpeg stream-copy (fast) and re-encode (4K) export
- Caption burn-in and sidecar SRT generation
- Studio Sound audio enhancement (DeepFilterNet)
- Keyboard shortcuts (J/K/L, Space, Delete, Ctrl+Z/S/E)
- Encrypted API key storage
- Project save/load (.aive files)

Architecture:
- Electron + React + Tailwind (frontend)
- FastAPI + Python (backend)
- WhisperX for transcription
- FFmpeg for video processing
- Multi-provider AI support

Performance optimizations:
- RAF-throttled time updates
- Zustand selectors for granular subscriptions
- Dual-canvas waveform rendering
- Virtualized transcript with react-virtuoso

Built on top of DataAnts-AI/VideoTranscriber, completely rewritten as a desktop application.

License: MIT
2026-03-03 06:31:04 -05:00
d1e1fedcae fix: Resolve issues #7, #8, #9 - moviepy, transformers, Whisper OOM
Issue #7: Handle moviepy 2.x removing verbose param from write_audiofile

Issue #8: Pin transformers<5.0.0 to fix summarization pipeline task registry

Issue #9: Add Whisper model memory warnings and OOM error handling
2026-03-03 02:10:52 -05:00
70c5d32413 feat: Add streaming Ollama support, model caching, and UI improvements
- Add streaming summarization via Ollama API (stream_summarize_with_ollama)

- Cache ML models with @st.cache_resource (diarization, NER, translation, Whisper)

- Add temp file cleanup for extracted audio

- Add system capabilities detection (FFmpeg, GPU info)

- Add get_video_duration utility

- Improve validation with FFmpeg check

- Rewrite app.py with streaming support and UI enhancements

- Clean up redundant comments and unused imports across all utils
2026-02-18 10:26:09 -05:00
ce398ae1d4 fix: Update moviepy import for v2.x compatibility (closes #6)
moviepy 2.x removed the moviepy.editor submodule. Import AudioFileClip directly from moviepy with a fallback for moviepy 1.x users. Also close the audio clip after writing to prevent resource leaks.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-13 00:15:07 -05:00
168bf5f573 fix: Free up disk space in GitHub Actions runner
- Add disk cleanup step to remove unused packages (~30GB freed)
- Remove arm64 platform build (reduces disk usage significantly)
- Add cleanup step between CPU and GPU builds
- Fixes 'No space left on device' error during Docker build
2026-01-05 17:34:07 -05:00
efee0b0abe fix: Resolve protobuf dependency conflict for pyannote.audio 4.x
- Update protobuf from <5.0.0 to >=5.0.0 (required by opentelemetry-proto)
- Update streamlit minimum version to >=1.30.0 (protobuf 5.x compatible)
- Update regular Dockerfile to match GPU dockerfile structure
- Install PyTorch CPU version in regular Dockerfile for consistency
2026-01-05 11:30:56 -05:00
4dd3c7600e fix: Remove torch from requirements.txt and use flexible versions
- Remove torch/torchaudio/torchvision from requirements.txt (installed separately in Docker)
- Use >= instead of == for most packages to avoid version conflicts
- Install numpy before other requirements
- Add setuptools and wheel to pip upgrade step
2026-01-05 11:25:12 -05:00
78e9df31e6 fix: Use flexible PyTorch versions and fix Docker build order
- Don't pin exact torch/torchaudio/torchvision versions (use >=2.1.0)
- Install CUDA PyTorch first in Docker before other requirements
- Upgrade pip before installations to avoid compatibility issues
- Let pip resolve latest compatible versions from cu118 index
2026-01-05 10:50:20 -05:00
179 changed files with 29973 additions and 3599 deletions

Binary file not shown.


@ -0,0 +1,16 @@
# backend_health_check
# cmd: /home/dillon/_code/TalkEdit/.venv312/bin/python3.12 -c import importlib; importlib.import_module('backend.main'); print('backend import OK')
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/usr/lib/python3.12/importlib/__init__.py", line 90, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "<frozen importlib._bootstrap>", line 1387, in _gcd_import
File "<frozen importlib._bootstrap>", line 1360, in _find_and_load
File "<frozen importlib._bootstrap>", line 1331, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 935, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 999, in exec_module
File "<frozen importlib._bootstrap>", line 488, in _call_with_frames_removed
File "/home/dillon/_code/TalkEdit/backend/main.py", line 12, in <module>
from routers import transcribe, export, ai, captions, audio
ModuleNotFoundError: No module named 'routers'


@ -0,0 +1,3 @@
# backend_python_version
# cmd: /home/dillon/_code/TalkEdit/.venv312/bin/python3.12 --version
Python 3.12.13


@ -0,0 +1,3 @@
# env_git_head
# cmd: git -C /home/dillon/_code/TalkEdit rev-parse --short HEAD
4f90750


@ -0,0 +1,10 @@
# env_git_status
# cmd: git -C /home/dillon/_code/TalkEdit status --short
M frontend/src/App.tsx
M frontend/src/components/VolumePanel.tsx
M frontend/src/components/WaveformTimeline.tsx
M frontend/src/store/editorStore.ts
?? .diagnostics/
?? AI_dev.md
?? docs/
?? scripts/


@ -0,0 +1,3 @@
# env_node_version
# cmd: node --version
v22.18.0


@ -0,0 +1,3 @@
# env_npm_version
# cmd: npm --version
10.9.3


@ -0,0 +1,3 @@
# env_uname
# cmd: uname -a
Linux cachyos-x 6.19.10-1-cachyos #1 SMP PREEMPT_DYNAMIC Wed, 25 Mar 2026 23:30:07 +0000 x86_64 GNU/Linux


@ -0,0 +1,11 @@
# frontend_build
# cmd: bash -lc cd '/home/dillon/_code/TalkEdit/frontend' && npm run -s build
vite v6.4.1 building for production...
transforming...
✓ 1606 modules transformed.
rendering chunks...
computing gzip size...
dist/index.html 1.20 kB │ gzip: 0.57 kB
dist/assets/index-gyhcOzhr.css 19.31 kB │ gzip: 4.48 kB
dist/assets/index-B5NnH24A.js 354.13 kB │ gzip: 108.13 kB
✓ built in 2.43s


@ -0,0 +1,3 @@
# frontend_lint
# cmd: bash -lc cd '/home/dillon/_code/TalkEdit/frontend' && npm run -s lint
sh: line 1: eslint: command not found


@ -0,0 +1,72 @@
# list_recent_files
# cmd: find /home/dillon/_code/TalkEdit -maxdepth 2 -type f
/home/dillon/_code/TalkEdit/.git/description
/home/dillon/_code/TalkEdit/.git/packed-refs
/home/dillon/_code/TalkEdit/.git/COMMIT_EDITMSG
/home/dillon/_code/TalkEdit/.git/FETCH_HEAD
/home/dillon/_code/TalkEdit/.git/ORIG_HEAD
/home/dillon/_code/TalkEdit/.git/REBASE_HEAD
/home/dillon/_code/TalkEdit/.git/HEAD
/home/dillon/_code/TalkEdit/.git/config
/home/dillon/_code/TalkEdit/.git/index
/home/dillon/_code/TalkEdit/backend/requirements.txt
/home/dillon/_code/TalkEdit/backend/.python-version
/home/dillon/_code/TalkEdit/backend/dev_main.py
/home/dillon/_code/TalkEdit/backend/video_editor.py
/home/dillon/_code/TalkEdit/backend/audio_cleaner.py
/home/dillon/_code/TalkEdit/backend/diarization.py
/home/dillon/_code/TalkEdit/backend/ai_provider.py
/home/dillon/_code/TalkEdit/backend/caption_generator.py
/home/dillon/_code/TalkEdit/backend/background_removal.py
/home/dillon/_code/TalkEdit/backend/main.py
/home/dillon/_code/TalkEdit/frontend/postcss.config.js
/home/dillon/_code/TalkEdit/frontend/tailwind.config.js
/home/dillon/_code/TalkEdit/frontend/tsconfig.json
/home/dillon/_code/TalkEdit/frontend/vite.config.ts
/home/dillon/_code/TalkEdit/frontend/frontend_dev.log
/home/dillon/_code/TalkEdit/frontend/index.html
/home/dillon/_code/TalkEdit/frontend/package-lock.json
/home/dillon/_code/TalkEdit/frontend/package.json
/home/dillon/_code/TalkEdit/frontend/tsconfig.tsbuildinfo
/home/dillon/_code/TalkEdit/shared/project-schema.json
/home/dillon/_code/TalkEdit/node_modules/.package-lock.json
/home/dillon/_code/TalkEdit/src-tauri/.gitignore
/home/dillon/_code/TalkEdit/src-tauri/Cargo.toml
/home/dillon/_code/TalkEdit/src-tauri/build.rs
/home/dillon/_code/TalkEdit/src-tauri/tauri_dev.log
/home/dillon/_code/TalkEdit/src-tauri/Cargo.lock
/home/dillon/_code/TalkEdit/src-tauri/tauri.conf.json
/home/dillon/_code/TalkEdit/.dockerignore
/home/dillon/_code/TalkEdit/.gitattributes
/home/dillon/_code/TalkEdit/FIX-GITHUB-ACTIONS.md
/home/dillon/_code/TalkEdit/LICENSE
/home/dillon/_code/TalkEdit/M4A-SUPPORT.md
/home/dillon/_code/TalkEdit/package-lock.json
/home/dillon/_code/TalkEdit/TECH_FEATURES.md
/home/dillon/_code/TalkEdit/FFmpeg_COMPLIANCE.md
/home/dillon/_code/TalkEdit/transcribe.py
/home/dillon/_code/TalkEdit/test_api.py
/home/dillon/_code/TalkEdit/.vscode/settings.json
/home/dillon/_code/TalkEdit/.venv312/pyvenv.cfg
/home/dillon/_code/TalkEdit/webview.log
/home/dillon/_code/TalkEdit/.gitmodules
/home/dillon/_code/TalkEdit/split_audio.sh
/home/dillon/_code/TalkEdit/venv/.gitignore
/home/dillon/_code/TalkEdit/venv/pyvenv.cfg
/home/dillon/_code/TalkEdit/.gitignore
/home/dillon/_code/TalkEdit/FEATURES.md
/home/dillon/_code/TalkEdit/README.md
/home/dillon/_code/TalkEdit/close
/home/dillon/_code/TalkEdit/electron/main.js
/home/dillon/_code/TalkEdit/electron/preload.js
/home/dillon/_code/TalkEdit/electron/python-bridge.js
/home/dillon/_code/TalkEdit/idea summary.md
/home/dillon/_code/TalkEdit/open
/home/dillon/_code/TalkEdit/package.json
/home/dillon/_code/TalkEdit/plan.md
/home/dillon/_code/TalkEdit/.github/copilot-instructions.md
/home/dillon/_code/TalkEdit/AI_dev.md
/home/dillon/_code/TalkEdit/docs/spec-template.md
/home/dillon/_code/TalkEdit/docs/ai-policy.md
/home/dillon/_code/TalkEdit/scripts/validate-all.sh
/home/dillon/_code/TalkEdit/scripts/collect-diagnostics.sh

109
.github/copilot-instructions.md vendored Normal file

@ -0,0 +1,109 @@
# TalkEdit Copilot Instructions (Living Project Context)
Purpose: give AI assistants immediate, accurate context for this repository and define what must be kept in sync when the project evolves.
## How To Use This File
- This is a workspace instruction file for VS Code Chat/Copilot.
- Treat this as the first source of truth for architecture and workflow expectations.
- If your code changes make any section outdated, update this file in the same change.
## Project Snapshot
- Name: TalkEdit
- Product: local-first, AI-powered, text-based audio/video editor.
- Primary runtime: Tauri + React frontend + Python FastAPI backend.
- Desktop only (Electron has been removed; Tauri is the exclusive desktop runtime).
## Tech Stack
- Frontend: React 19, TypeScript, Vite, Tailwind, Zustand.
- Desktop bridge: Tauri API (IPC commands via `window.electronAPI` polyfill in `frontend/src/lib/tauri-bridge.ts` for unified call-site interface).
- Backend: FastAPI + Uvicorn (`backend/main.py`) with routers in `backend/routers` and core services in `backend/services`.
- Media tooling: FFmpeg for edit/export and codec operations.
- AI tooling: WhisperX/faster-whisper for transcription; provider layer supports OpenAI/Anthropic/Ollama.
## Code Map
- `frontend/src/components`: editor UI (player, transcript, waveform, settings, export, AI panel).
- `frontend/src/store`: Zustand state (`editorStore`, `aiStore`).
- `frontend/src/hooks`: keyboard/video sync behavior.
- `backend/routers`: API surface (`/transcribe`, `/export`, `/ai/*`, `/captions`, `/audio/*`).
- `backend/services`: heavy operations (transcription, captioning, diarization, video editing, cleanup).
- `shared/project-schema.json`: saved project schema contract.
- `src-tauri`: Rust/Tauri host code and app configuration.
## Run And Build (Preferred)
- Frontend dev: `npm run dev`
- Backend dev: `npm run dev:backend`
- Tauri dev: `npm run dev:tauri`
- Tauri build: `npm run build:tauri`
Use project virtualenvs where available (`.venv312`, `.venv`, or `venv`) for backend execution.
## Working Conventions
- Keep router files thin; put heavy logic in `backend/services`.
- Preserve response compatibility for existing frontend callers unless task explicitly allows API breakage.
- Frontend uses unified `window.electronAPI` interface (Tauri-backed via tauri-bridge.ts); desktop APIs are implemented exclusively in Tauri.
- Prefer small, focused edits over broad refactors.
## Known Risk Areas
- Startup/rendering on Linux WebKit can regress when reintroducing remote fonts/CSP allowances; prefer local font assets.
- Media URL handling between project load paths should remain consistent to avoid format-specific regressions (especially WAV/MP3 behavior).
- Export pipeline changes must preserve caption modes (`none`, `sidecar`, `burn-in`) and audio enhancement behavior.
- WAV export uses the `pcm_s16le` codec and is only offered for audio-only inputs (no video stream). The format selector conditionally shows WAV based on the input file extension (codec selection sketched below).
- `<select>` dropdowns need `[color-scheme:dark]` Tailwind class on Linux WebKit or the native popup renders white-on-light-gray.
- Frontend gain ranges use camelCase (`gainDb`) but the backend expects snake_case (`gain_db`). The ExportDialog maps them before sending. Any new call sites must do the same.
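For orientation, a minimal sketch of the codec-selection rule above and of the `_get_codec_args(format_hint, has_video)` helper referenced under Recent Changes. This is not the actual `backend/services/video_editor.py` code; the non-WAV codec choices shown are assumptions.

```python
# Minimal sketch (not the real implementation). The WAV/pcm_s16le rule matches
# the note above; the other codec choices are illustrative assumptions.
def _get_codec_args(format_hint: str, has_video: bool) -> list[str]:
    """Return FFmpeg codec arguments for the requested output format."""
    fmt = (format_hint or "").lower()
    if fmt == "wav":
        if has_video:
            # WAV is audio-only; refuse rather than silently drop the video stream.
            raise ValueError("WAV export is only available for audio-only inputs")
        return ["-c:a", "pcm_s16le"]
    if fmt == "webm":
        return ["-c:v", "libvpx-vp9", "-c:a", "libopus"]  # assumption
    # MP4/MOV default: H.264 video + AAC audio (assumption)
    args = ["-c:a", "aac"]
    if has_video:
        args = ["-c:v", "libx264", *args]
    return args
```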
## Recent Changes
### 2026-05-04 — Word text correction, low-confidence highlighting, audio normalization
- **Word text correction (#015)**: Double-click any word in the transcript editor to edit its text inline. Press Enter to commit, Escape to cancel. State is updated in both `words[]` and `segments[]` arrays (segment text recomposed from updated words). Pure frontend; no backend changes needed.
- **Low-confidence word highlighting (#012)**: Words with `confidence < threshold` (default 0.6, configurable in Settings panel) render with an orange dotted underline. Tooltip shows exact confidence percentage. Threshold is persisted in `localStorage` key `talkedit:confidenceThreshold`.
- **Audio normalization (#018)**: New backend endpoint `POST /audio/normalize` in `backend/routers/audio.py`. Two-pass FFmpeg `loudnorm` (measure then apply) implemented in `backend/services/audio_cleaner.py:normalize_audio()` (sketched after this list). Falls back to single-pass if measurement fails. Frontend UI in Export panel: target selector (YouTube -14, Spotify -16, Broadcast -23, etc.) with "Normalize" button.
- **Store**: New `updateWordText(index, text)` action in `editorStore.ts` updates both `words[]` and recomputes `segments[].text`.
- **Settings panel**: New confidence threshold slider (0–1 range).
- **WAV export format**: Format selector shows "WAV (Uncompressed)" for audio-only inputs. Backend uses `pcm_s16le` codec via `_get_codec_args()` helper. Codec selection centralized in `backend/services/video_editor.py:_get_codec_args(format_hint, has_video)`.
- **Normalization moved to export**: No longer a standalone button. Integrated as `normalizeAudio` checkbox + LUFS target selector in ExportPanel. Sent as `normalize_loudness`/`normalize_target_lufs` to backend. Applied via `loudnorm` in FFmpeg audio filter chain during export.
- **Export camelCase fix**: `ExportDialog.tsx` now manually maps `gainRanges` → `gain_db` and `muteRanges` → `{start,end}` before sending to backend. Prevents Pydantic v2 field rejection.
- **color-scheme:dark**: All `<select>` elements in ExportDialog use `[color-scheme:dark]` to ensure readable native dropdown popups on Linux WebKit.
- **Re-transcribe selection (#013)**: Backend `POST /transcribe/segment` extracts audio via FFmpeg, runs Whisper, adjusts timestamps. Frontend: "Re-transcribe" button on selected words in TranscriptEditor; `replaceWordRange()` store action swaps words + rebuilds segments by speaker.
- **Transcript-only export (#024)**: "Export Transcript Only" in ExportDialog with .txt/.srt options. **Pure frontend** — generates content in-browser, writes via Tauri `writeFile`. No backend dependency. Respects word cuts.
- **Named timeline markers (#016)**: `TimelineMarker` type in `project.ts`. Store actions: `addTimelineMarker`, `updateTimelineMarker`, `removeTimelineMarker`. Colored pins on waveform canvas. MarkersPanel UI for add/edit/delete. Persisted in project.
- **Chapters (#017)**: `getChapters()` store action derives from sorted markers. "Copy as YouTube timestamps" in MarkersPanel. Zero backend.
- **Clip thumbnail strip (#022)**: `lib/thumbnails.ts` — frontend canvas capture from `<video>`. Toggle button in WaveformTimeline. Clickable frames at 10s intervals.
- **Customizable hotkeys (#041)**: `lib/keybindings.ts` with two presets (standard + left-hand). `useKeyboardShortcuts.ts` reads bindings dynamically. Settings panel includes key remapper with conflict detection and per-key reset. `?` key shows dynamic cheatsheet.
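For orientation, a minimal sketch of the two-pass `loudnorm` flow behind #018. This is not the actual `backend/services/audio_cleaner.py:normalize_audio()` implementation; it assumes `ffmpeg` is on PATH and the function/parameter names are illustrative.

```python
# Two-pass loudnorm sketch: measure first, then apply with the measured values,
# falling back to single-pass if measurement cannot be parsed.
import json
import subprocess

def normalize_audio(src: str, dst: str, target_lufs: float = -14.0) -> None:
    measure = subprocess.run(
        ["ffmpeg", "-hide_banner", "-i", src,
         "-af", f"loudnorm=I={target_lufs}:TP=-1.5:LRA=11:print_format=json",
         "-f", "null", "-"],
        capture_output=True, text=True,
    )
    try:
        # loudnorm prints a flat JSON block at the end of stderr.
        raw = measure.stderr
        stats = json.loads(raw[raw.rindex("{"): raw.rindex("}") + 1])
        audio_filter = (
            f"loudnorm=I={target_lufs}:TP=-1.5:LRA=11:"
            f"measured_I={stats['input_i']}:measured_TP={stats['input_tp']}:"
            f"measured_LRA={stats['input_lra']}:measured_thresh={stats['input_thresh']}:"
            f"offset={stats['target_offset']}:linear=true"
        )
    except (ValueError, KeyError):
        # Measurement failed: fall back to single-pass normalization.
        audio_filter = f"loudnorm=I={target_lufs}:TP=-1.5:LRA=11"
    subprocess.run(["ffmpeg", "-y", "-i", src, "-af", audio_filter, dst], check=True)
```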
## Update Rules (Important)
When a task changes architecture, app wiring, commands, API shape, project schema, or major conventions, update this file before finishing.
Always update these sections if affected:
- `Project Snapshot`
- `Tech Stack`
- `Code Map`
- `Run And Build (Preferred)`
- `Working Conventions`
- `Known Risk Areas`
- Recent changes section (if applicable)
If behavior changed significantly, add a short note under the `Recent Changes` section with:
- Date (`YYYY-MM-DD`)
- What changed
- What future edits should preserve
## Assistant Behavior For This Repo
- Validate assumptions against current files before editing.
- Prefer existing patterns in neighboring files over introducing new patterns.
- Call out uncertainty explicitly when code and docs disagree.
- If you discover stale docs, fix them as part of the same task when reasonable.

23
.github/pull_request_template.md vendored Normal file

@ -0,0 +1,23 @@
## Summary
Describe what changed and why.
## Spec Link (Required For Feature Changes)
- Spec file in `docs/specs/`: <!-- e.g. docs/specs/2026-04-15-speed-adjustment.md -->
## Acceptance Criteria Checklist
- [ ] Acceptance criteria reviewed against the linked spec
- [ ] User-visible behavior verified for this change
- [ ] Backward compatibility impact assessed
## Validation
- [ ] `./scripts/validate-all.sh` passes locally
- [ ] Added/updated tests for changed behavior
## Risk And Rollback
- Risk level: Low / Medium / High
- Rollback plan:

45
.github/workflows/ci.yml vendored Normal file

@ -0,0 +1,45 @@
name: CI
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
jobs:
  rust:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: dtolnay/rust-toolchain@stable
      - run: cargo test
        working-directory: src-tauri
      - run: cargo check --release
        working-directory: src-tauri
      - run: cargo clippy -- -D warnings
        working-directory: src-tauri
        continue-on-error: true
  frontend:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: 20
      - run: npm ci
        working-directory: frontend
      - run: npx tsc --noEmit
        working-directory: frontend
      - run: npx vitest run
        working-directory: frontend
  python:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - run: pip install pytest
      - run: python -m pytest backend/tests/ || true


@ -1,76 +0,0 @@
name: Build and Push Docker Images
on:
push:
branches: [ main, develop ]
tags: [ 'v*' ]
pull_request:
branches: [ main ]
release:
types: [published]
env:
REGISTRY: ghcr.io
jobs:
build:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to Container Registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Convert repository name to lowercase
id: lowercase-repo
run: echo "repository=$(echo ${{ github.repository }} | tr '[:upper:]' '[:lower:]')" >> $GITHUB_OUTPUT
- name: Extract metadata
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ steps.lowercase-repo.outputs.repository }}
tags: |
type=ref,event=branch
type=ref,event=pr
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
type=semver,pattern={{major}}
type=raw,value=latest,enable={{is_default_branch}}
- name: Build and push Docker image
uses: docker/build-push-action@v5
with:
context: .
platforms: linux/amd64,linux/arm64
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max
- name: Build and push GPU-enabled image
uses: docker/build-push-action@v5
with:
context: .
file: Dockerfile.gpu
platforms: linux/amd64
push: true
tags: |
${{ env.REGISTRY }}/${{ steps.lowercase-repo.outputs.repository }}:latest-gpu
${{ env.REGISTRY }}/${{ steps.lowercase-repo.outputs.repository }}:${{ github.sha }}-gpu
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max

94
.github/workflows/release.yml vendored Normal file

@ -0,0 +1,94 @@
name: Release
on:
  push:
    tags:
      - 'v*'
jobs:
  linux:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: 20
          cache: npm
          cache-dependency-path: frontend/package-lock.json
      - run: npm ci
        working-directory: frontend
      - uses: dtolnay/rust-toolchain@stable
      - run: |
          sudo apt-get update
          sudo apt-get install -y \
            libwebkit2gtk-4.1-dev \
            librsvg2-dev \
            patchelf \
            libssl-dev \
            libgtk-3-dev \
            libayatana-appindicator3-dev \
            rpm
      - name: Download FFmpeg (bundled sidecar)
        run: |
          mkdir -p src-tauri/binaries
          curl -sL "https://johnvansickle.com/ffmpeg/releases/ffmpeg-release-amd64-static.tar.xz" -o /tmp/ffmpeg.tar.xz
          tar -xf /tmp/ffmpeg.tar.xz -C /tmp
          cp /tmp/ffmpeg-*-amd64-static/ffmpeg src-tauri/binaries/ffmpeg-x86_64-unknown-linux-gnu
          cp /tmp/ffmpeg-*-amd64-static/ffprobe src-tauri/binaries/ffprobe-x86_64-unknown-linux-gnu
          chmod +x src-tauri/binaries/*
      - uses: tauri-apps/tauri-action@v0
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          tagName: ${{ github.ref_name }}
          releaseName: 'TalkEdit ${{ github.ref_name }}'
          releaseBody: 'See the assets to download and install this version.'
          releaseDraft: false
          includeUpdaterJson: true
          args: --bundles deb,rpm
  windows:
    runs-on: windows-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: 20
          cache: npm
          cache-dependency-path: frontend/package-lock.json
      - run: npm ci
        working-directory: frontend
      - uses: dtolnay/rust-toolchain@stable
      - uses: tauri-apps/tauri-action@v0
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          tagName: ${{ github.ref_name }}
          releaseName: 'TalkEdit ${{ github.ref_name }}'
          releaseBody: 'See the assets to download and install this version.'
          releaseDraft: false
          includeUpdaterJson: true
          args: --bundles msi
  # macos:
  #   runs-on: macos-latest
  #   steps:
  #     - uses: actions/checkout@v4
  #     - uses: actions/setup-node@v4
  #       with:
  #         node-version: 20
  #         cache: npm
  #         cache-dependency-path: frontend/package-lock.json
  #     - run: npm ci
  #       working-directory: frontend
  #     - uses: dtolnay/rust-toolchain@stable
  #     - uses: tauri-apps/tauri-action@v0
  #       env:
  #         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  #       with:
  #         tagName: ${{ github.ref_name }}
  #         releaseName: 'TalkEdit ${{ github.ref_name }}'
  #         releaseBody: 'See the assets to download and install this version.'
  #         releaseDraft: false
  #         includeUpdaterJson: true
  #         args: --bundles dmg

58
.github/workflows/validate-all.yml vendored Normal file

@ -0,0 +1,58 @@
name: Validate All
on:
  pull_request:
  push:
    branches:
      - main
jobs:
  validate-all:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Setup Node
        uses: actions/setup-node@v4
        with:
          node-version: '20'
          cache: npm
          cache-dependency-path: |
            frontend/package-lock.json
            package-lock.json
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - name: Enforce feature spec policy (PR only)
        if: github.event_name == 'pull_request'
        env:
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
        run: ./scripts/check-feature-spec.sh
      - name: Install frontend dependencies
        run: |
          cd frontend
          npm install
      - name: Run validate-all
        env:
          SKIP_BACKEND_IMPORT_SMOKE: '1'
        run: ./scripts/validate-all.sh
      - name: Collect diagnostics on failure
        if: failure()
        run: ./scripts/collect-diagnostics.sh
      - name: Upload diagnostics artifact
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: diagnostics
          path: .diagnostics

51
.gitignore vendored

@ -1,13 +1,60 @@
# Python virtual environment
# Dependencies
node_modules/
# Build output
frontend/dist/
# Python
venv/
.venv312/
__pycache__/
*.pyc
*.pyo
*.egg-info/
.pytest_cache/
.mypy_cache/
.coverage
htmlcov/
# IDE files
# IDE / Editor
.vscode/
.idea/
.cursor/
# Submodules (can be cloned separately if needed)
CutScript/
# OS files
.env
.env.local
.env.*.local
.DS_Store
Thumbs.db
*.swp
*.tmp
# Logs
*.log
logs/
cache/
*.aive
# Build output
frontend/dist/
dist/
build/
*.asar
target/
src-tauri/target/
# Node.js
node_modules/
npm-debug.log*
yarn-debug.log*
yarn-error.log*
# Rust
Cargo.lock
# Lock files (root only — frontend lock is committed)
/package-lock.json

1
.gitmodules vendored Normal file

@ -0,0 +1 @@

157
AI_dev_plan.md Normal file

@ -0,0 +1,157 @@
# AI Dev Plan (Must-Haves Only)
## Purpose
This is the minimum implementation needed for AI to reliably build, test, and debug TalkEdit with high confidence.
Target: reliable 80-90% autonomous implementation/debugging on scoped tasks.
## Must-Have Pillars
## 1. Single Validation Command
Required:
1. One command that runs lint, build, backend tests, and smoke checks.
2. Works locally and in CI.
Current status:
1. Implemented via scripts/validate-all.sh.
2. Enforced in CI via .github/workflows/validate-all.yml.
## 2. CI Quality Gate
Required:
1. Pull requests fail if validation fails.
2. Failures produce diagnostics artifacts.
Current status:
1. Implemented in .github/workflows/validate-all.yml.
2. Diagnostics collected by scripts/collect-diagnostics.sh on failure.
## 3. Spec Requirement for Feature Changes
Required:
1. Feature code changes must include a spec file update.
2. Spec format must be standardized.
Current status:
1. Implemented via scripts/check-feature-spec.sh.
2. Spec template exists at docs/spec-template.md.
3. Specs folder guidance exists at docs/specs/README.md.
## 4. Backend Contract Test Coverage
Required:
1. Router-level contract tests for success and error paths.
2. Tests are deterministic and mock heavy services.
Current status:
1. Implemented in backend/tests/test_router_contracts.py (see the sketch below).
2. Cache utility baseline tests implemented in backend/tests/test_cache_utils.py.
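Illustrative shape of such a contract test (a sketch only — the real suite is backend/tests/test_router_contracts.py; the app import path, request body fields, and the patched service name below are assumptions):

```python
# Illustrative contract-test shape (pytest + FastAPI TestClient). Module paths,
# the patched service hook, and the request/response fields are assumptions.
from fastapi.testclient import TestClient

from backend.main import app                      # assumption: app object name
import backend.routers.transcribe as transcribe   # assumption: import path

client = TestClient(app)

def test_transcribe_success(monkeypatch):
    # Mock the heavy Whisper service so the test is fast and deterministic.
    monkeypatch.setattr(
        transcribe, "run_transcription",           # hypothetical service hook
        lambda *args, **kwargs: {"words": [], "segments": []},
    )
    resp = client.post("/transcribe", json={"path": "/tmp/sample.wav"})
    assert resp.status_code == 200
    assert "segments" in resp.json()

def test_transcribe_missing_file_is_client_error(tmp_path):
    # Error path: a nonexistent input must stay a 4xx, never a 500.
    resp = client.post("/transcribe", json={"path": str(tmp_path / "missing.wav")})
    assert 400 <= resp.status_code < 500
```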
## 5. Error-Tolerant Router Contracts
Required:
1. Expected client errors must remain 4xx.
2. Server failures must return 5xx with useful detail.
Current status:
1. Implemented for captions/export HTTPException passthrough (pattern sketched below).
2. Covered by contract tests.
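The passthrough pattern, sketched (illustrative only — the endpoint path and the service stub are assumptions, not the actual captions/export router code):

```python
# Sketch of the 4xx-passthrough / 5xx-wrapping pattern the routers follow.
from fastapi import APIRouter, HTTPException

router = APIRouter()

def run_export(payload: dict) -> dict:
    # Stand-in for the heavy service call in backend/services (hypothetical).
    raise HTTPException(status_code=404, detail="Source file not found")

@router.post("/export")
async def export_media(payload: dict):
    try:
        return run_export(payload)
    except HTTPException:
        # Expected client errors pass through unchanged (stay 4xx).
        raise
    except Exception as exc:
        # Unexpected failures become a 5xx with a useful detail message.
        raise HTTPException(status_code=500, detail=f"export failed: {exc}") from exc
```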
## 6. Basic Autonomy Policy
Required:
1. Clear autonomous scope and escalation rules.
2. Clear restrictions for high-risk changes.
Current status:
1. Implemented in docs/ai-policy.md.
## Must-Have Remaining Work
No remaining must-have items.
Completed in this pass:
1. Added lightweight frontend tests and integrated them into scripts/validate-all.sh.
2. Added pull request template with required spec link and acceptance criteria checklist.
3. Added endpoint-level contract assertions for /file range requests and /audio/waveform cache-hit/cache-miss behavior.
4. Confirmed scripts/validate-all.sh passes end-to-end with frontend tests + expanded backend contracts.
## Out of Scope for Must-Have Baseline
Useful later, but not required for strong day-to-day autonomous implementation:
1. Full quality dashboards.
2. Advanced autonomy telemetry.
3. Complete long-term governance expansion.
4. High-autonomy optimization beyond 90% reliability target.
## Definition of Done (Must-Have Plan)
Must-have plan is complete when all are true:
1. scripts/validate-all.sh passes locally and in CI.
2. Feature PRs without spec updates are blocked.
3. Backend router contracts cover core success and error paths.
4. Frontend has at least one stable test command integrated into validation.
## 7. AI Tools Validation Strategy
Required:
1. **Per-edit validation**: After each code change (file edit, replacement, or creation), validate immediately with appropriate tools.
2. **Tool selection by change type**:
- Frontend changes: ESLint (`npm run -s lint`), then TypeScript build (`npm run build`)
- Backend changes: Syntax check via Python import, then run relevant test suite
- Type/interface changes: Full type check via build or `tsc -b`
3. **Failure handling**: If validation fails, fix immediately before proceeding to next edit.
4. **Documentation updates**: When changing architecture, always update [.github/copilot-instructions.md](.github/copilot-instructions.md) as part of the same PR.
5. **Large multi-edit operations**: Use `multi_replace_string_in_file` to batch independent edits and reduce tool call overhead.
6. **Error collection**: Use `get_errors` tool to identify issues across multiple files in one call post-change.
Current implementation:
1. Electron removal completed with post-edit lint and build validation at each phase.
2. Zone editor feature implemented with immediate lint/build validation after component creation and UI integration.
3. Validation tools: `npm run -s lint`, `npm run build`, `get_errors`, `run_in_terminal` for test scripts.
4. AI policy + diagnostics workflow are active.
Best practices established:
- Always run lint before build to catch TypeScript errors early
- Run full build after component changes to verify tree-shaking and bundling
- Use `get_errors` for multi-file error detection rather than sequential file reads
- Batch unrelated edits with `multi_replace_string_in_file` for efficiency
- Cache key decisions in session memory to avoid repeated exploration
## Current State Summary
Completed:
1. Validation and CI enforcement.
2. Diagnostics capture.
3. Spec policy and templates.
4. Backend contract test foundation (including AI endpoints).
5. Core router error-path correctness.
6. Autonomy policy baseline.
7. Frontend test command integrated into validation.
8. PR template requirement added.
9. /file and /audio/waveform contract assertions implemented.
Remaining:
1. No must-have items remaining.

305
DOCKER.md

@ -1,305 +0,0 @@
# Docker Deployment Guide for VideoTranscriber
This guide explains how to run VideoTranscriber in a Docker container while using Ollama models on your host system.
## Architecture Overview
```
┌─────────────────────────────────────────┐
│ Host System │
│ ┌─────────────────┐ ┌──────────────────│
│ │ Ollama Service │ │ Video Files │
│ │ (port 11434) │ │ Directory │
│ └─────────────────┘ └──────────────────│
│ ▲ ▲ │
│ │ │ │
│ ┌───────┼─────────────────────┼─────────│
│ │ Docker Container │ │
│ │ ┌─────▼─────────┐ │ │
│ │ │ VideoTranscriber │ │
│ │ │ - Streamlit App │ │
│ │ │ - Whisper Models │ │
│ │ │ - ML Dependencies │ │
│ │ └───────────────┘ │ │
│ └────────────────────────────┼─────────│
│ │ │
│ Mounted Volumes ─────┘ │
└─────────────────────────────────────────┘
```
## Quick Start
### Prerequisites
1. **Docker & Docker Compose** installed
2. **Ollama running on host**:
```bash
# Install Ollama (if not already installed)
curl -fsSL https://ollama.ai/install.sh | sh
# Start Ollama service
ollama serve
# Pull a model (in another terminal)
ollama pull llama3
```
### 1. Setup Environment
```bash
# Copy environment template
cp docker.env.example .env
# Edit .env file with your paths
# Key settings to update:
VIDEO_PATH=/path/to/your/videos
OUTPUT_PATH=/path/to/save/outputs
HF_TOKEN=your_huggingface_token_if_needed
```
### 2. Create Required Directories
```bash
# Create directories for mounting
mkdir -p videos outputs cache config
```
### 3. Build and Run
```bash
# Build and start the container
docker-compose up -d
# View logs
docker-compose logs -f
# Access the application
# Open browser to: http://localhost:8501
```
## Configuration Options
### Environment Variables
| Variable | Description | Default | Required |
|----------|-------------|---------|----------|
| `VIDEO_PATH` | Host directory containing video files | `./videos` | Yes |
| `OUTPUT_PATH` | Host directory for outputs | `./outputs` | Yes |
| `CACHE_PATH` | Host directory for model cache | `./cache` | No |
| `OLLAMA_API_URL` | Ollama API endpoint | `http://host.docker.internal:11434/api` | No |
| `HF_TOKEN` | HuggingFace token for advanced features | - | No |
| `CUDA_VISIBLE_DEVICES` | GPU devices to use | - | No |
### Volume Mounts
| Host Path | Container Path | Purpose |
|-----------|----------------|---------|
| `${VIDEO_PATH}` | `/app/data/videos` | Input video files |
| `${OUTPUT_PATH}` | `/app/data/outputs` | Generated transcripts/summaries |
| `${CACHE_PATH}` | `/app/data/cache` | Model and processing cache |
| `${CONFIG_PATH}` | `/app/config` | Configuration files |
## Platform-Specific Setup
### Windows (Docker Desktop)
```yaml
# In docker-compose.yml - use bridge networking
networks:
- videotranscriber-network
environment:
- OLLAMA_API_URL=http://host.docker.internal:11434/api
```
### macOS (Docker Desktop)
Same as Windows - uses `host.docker.internal` to access host services.
### Linux
Option 1 - Host Networking (Recommended):
```yaml
# In docker-compose.yml
network_mode: host
environment:
- OLLAMA_API_URL=http://localhost:11434/api
```
Option 2 - Bridge Networking:
```yaml
environment:
- OLLAMA_API_URL=http://172.17.0.1:11434/api # Docker bridge IP
```
## GPU Support
### NVIDIA GPU Setup
1. **Install NVIDIA Container Toolkit**:
```bash
# Ubuntu/Debian
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
sudo systemctl restart docker
```
2. **Enable in docker-compose.yml**:
```yaml
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
```
## Usage in Container
### Application Settings
When running in Docker, update these settings in the VideoTranscriber UI:
1. **Base Folder**: Set to `/app/data/videos`
2. **Ollama Models**: Should auto-detect from host
3. **GPU Settings**: Will use container GPU if configured
### File Access
- **Input Videos**: Place in your `${VIDEO_PATH}` directory on host
- **Outputs**: Generated files appear in `${OUTPUT_PATH}` on host
- **Cache**: Models cached in `${CACHE_PATH}` for faster subsequent runs
## Troubleshooting
### Common Issues
#### 1. Can't Connect to Ollama
**Symptoms**: "Ollama service is not available" message
**Solutions**:
- Verify Ollama is running: `curl http://localhost:11434/api/tags`
- Check firewall settings
- For Linux, try host networking mode
- Verify OLLAMA_API_URL in environment
#### 2. No Video Files Detected
**Symptoms**: "No recordings found" message
**Solutions**:
- Check VIDEO_PATH points to correct directory
- Ensure directory contains supported formats (.mp4, .avi, .mov, .mkv)
- Check file permissions
#### 3. GPU Not Detected
**Symptoms**: Processing is slow, no GPU utilization
**Solutions**:
- Install NVIDIA Container Toolkit
- Uncomment GPU section in docker-compose.yml
- Verify: `docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi`
#### 4. Permission Issues
**Symptoms**: Cannot write to output directory
**Solutions**:
```bash
# Fix permissions
sudo chown -R $(id -u):$(id -g) outputs cache config
chmod -R 755 outputs cache config
```
### Debugging
```bash
# View container logs
docker-compose logs -f videotranscriber
# Execute shell in container
docker-compose exec videotranscriber bash
# Check Ollama connectivity from container
docker-compose exec videotranscriber curl -f $OLLAMA_API_URL/tags
# Monitor resource usage
docker stats videotranscriber
```
## Advanced Configuration
### Custom Dockerfile
For specialized requirements, modify the Dockerfile:
```dockerfile
# Add custom dependencies
RUN pip install your-custom-package
# Set custom environment variables
ENV YOUR_CUSTOM_VAR=value
# Copy custom configuration
COPY custom-config.yaml /app/config/
```
### Multi-Instance Deployment
Run multiple instances for different use cases:
```bash
# Copy docker-compose.yml to docker-compose.prod.yml
# Modify ports and paths
docker-compose -f docker-compose.prod.yml up -d
```
### CI/CD Integration
```yaml
# .github/workflows/docker.yml
name: Build and Deploy
on:
push:
branches: [main]
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Build Docker image
run: docker build -t videotranscriber .
```
## Performance Optimization
### Memory Management
```yaml
# In docker-compose.yml
deploy:
resources:
limits:
memory: 8G
reservations:
memory: 4G
```
### Model Caching
- Use persistent volumes for `/app/data/cache`
- Pre-download models to reduce startup time
- Configure appropriate cache size limits
### Network Optimization
- Use host networking on Linux for better performance
- Consider running Ollama and VideoTranscriber on same machine
- Use SSD storage for cache directories


@ -1,45 +0,0 @@
FROM python:3.11-slim
# Set working directory
WORKDIR /app
# Install system dependencies
RUN apt-get update && apt-get install -y \
ffmpeg \
git \
wget \
curl \
build-essential \
&& rm -rf /var/lib/apt/lists/*
# Copy requirements first for better Docker layer caching
COPY requirements.txt .
# Install Python dependencies with pinned versions
RUN pip install --no-cache-dir -r requirements.txt
# Optional: Install CUDA-specific PyTorch if GPU support needed
# Uncomment and modify for your CUDA version:
# RUN pip install --force-reinstall torch==2.1.0+cu118 torchvision==0.16.0+cu118 torchaudio==2.1.0+cu118 --index-url https://download.pytorch.org/whl/cu118
# Copy application code
COPY . .
# Create directories for mounted volumes
RUN mkdir -p /app/data/videos /app/data/outputs /app/data/cache
# Set environment variables
ENV STREAMLIT_SERVER_PORT=8501
ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
ENV STREAMLIT_SERVER_HEADLESS=true
ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
# Expose Streamlit port
EXPOSE 8501
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
CMD curl -f http://localhost:8501/_stcore/health || exit 1
# Start the application
CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]


@ -1,54 +0,0 @@
FROM python:3.11-slim
# Set working directory
WORKDIR /app
# Install system dependencies including CUDA-related packages
RUN apt-get update && apt-get install -y \
ffmpeg \
git \
wget \
curl \
build-essential \
&& rm -rf /var/lib/apt/lists/*
# Copy requirements first for better Docker layer caching
COPY requirements.txt .
# Install CPU versions from requirements.txt first
RUN pip install --no-cache-dir -r requirements.txt
# Install CUDA-optimized PyTorch (overwrites CPU versions)
# Updated to torch 2.1.0+ for SpeechBrain 1.0 / pyannote diarization compatibility
RUN pip install --force-reinstall \
torch==2.1.0+cu118 \
torchvision==0.16.0+cu118 \
torchaudio==2.1.0+cu118 \
--index-url https://download.pytorch.org/whl/cu118
# Copy application code
COPY . .
# Create directories for mounted volumes
RUN mkdir -p /app/data/videos /app/data/outputs /app/data/cache
# Set environment variables
ENV STREAMLIT_SERVER_PORT=8501
ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
ENV STREAMLIT_SERVER_HEADLESS=true
ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
# GPU-specific environment variables
ENV CUDA_VISIBLE_DEVICES=0
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
# Expose Streamlit port
EXPOSE 8501
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
CMD curl -f http://localhost:8501/_stcore/health || exit 1
# Start the application
CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]

181
FEATURES.md Normal file

@ -0,0 +1,181 @@
# TalkEdit — Features & Roadmap
**Niche:** "Descript for long-form content" — works on hour+ files without degrading, fully offline, one-time payment.
---
## ✅ Already Implemented
### Core editing
- [x] [#001] **Cut / Mute sections** — remove or silence segments from output
- [x] [#002] **Silence / pause trimmer** — batch detect and remove silent pauses
- [x] [#006] **Volume / gain control** — per-zone and global gain adjustment
- [x] [#007] **Speed adjustment** — per-zone playback speed changes (0.25x–4x)
- [x] [#008] **Cut preview** — preview zones before export with configurable padding
- [x] [#009] **Timeline shows output length** — adjusted timeline with cut compression
- [x] [#011] **Mark In / Out** — I/O keys to set selection range on timeline
### Transcript
- [x] [#010] **Transcript search (Ctrl+F)** — find words, navigate matches
- [x] [#012] **Low-confidence word highlighting** — orange dotted underline with confidence %
- [x] [#013] **Re-transcribe selection** — re-run Whisper on a selected word range
- [x] [#015] **Word text correction** — double-click any word to edit text in-place
- [x] [#016] **Named timeline markers** — colored pins with labels, editable
- [x] [#017] **Chapters** — auto-form from markers, copy as YouTube timestamps
- [x] [#025] Word-level transcript editing (click, shift+click, drag select)
- [x] [#026] Ctrl+click word → seek video to that timestamp
- [x] [#027] Waveform timeline with zoom (Ctrl+scroll), scroll, drag-to-scrub
- [x] [#028] Auto-scroll waveform when playhead goes off-screen
### AI features
- [x] [#029] **AI filler word detection** — find and remove "um", "uh", "like" etc.
- [x] [#030] **AI clip suggestions** — find best 20-60s segments for social media
- [x] [#031] **Noise reduction** — DeepFilterNet or FFmpeg ANLMDN
- [x] [#034] **Speaker diarization** — label speakers in transcript
- [x] [#042] **Background removal** — MediaPipe segmentation, blur/color/image replacement
### Export
- [x] [#018] **Audio loudness normalization** — LUFS targets (-14 YouTube, -16 Spotify, -23 Broadcast)
- [x] [#019] **Background music** — auto-ducking via FFmpeg sidechain compress
- [x] [#020] **Video zoom / punch-in** — crop, zoom, pan during export
- [x] [#021] **Multi-clip / append** — concatenate multiple video files
- [x] [#024] **Export transcript** — plain text or SRT without video
- [x] [#032] **Export** — fast stream-copy or full re-encode (MP4/MOV/WebM/WAV, 720p–4K)
- [x] [#033] **Captions** — SRT, VTT, ASS burn-in with font/color/position options
### Project & state
- [x] [#003] **Undo / redo** — 100-level history via Zundo
- [x] [#004] **Grouped silence-trim zones** — editable batch groups
- [x] [#005] **Edit silence-trim group** settings after applying
- [x] [#022] **Clip thumbnail strip** — canvas capture from video, clickable
- [x] [#035] **Project save / load** — .aive JSON format
- [x] [#037] **Multi-format input** — MP4, MKV, MOV, AVI, WebM, M4A
- [x] [#038] **Keyboard shortcuts** — Space, J/K/L, arrows, Ctrl+Z/S/E, ?
- [x] [#039] **Settings panel** — AI provider config (Ollama, OpenAI, Claude)
- [x] [#040] **Zone creation on timeline** — draggable edits, Delete to remove
- [x] [#041] **Customizable hotkeys** — two presets, click-to-remap, conflict detection
- [x] **[M] Manage Models** — view/delete downloaded Whisper and LLM files
- [x] **[M] Keyboard cheatsheet** — `?` overlay with close button, preset indicator
- [x] **[M] Visual toolbar** — grouped buttons with section dividers
- [x] **[M] Help panel** — full feature documentation in sidebar
- [x] **[M] First-run welcome overlay** — 3-step quick-start guide
- [x] **[M] Responsive welcome screen** — animated audio bars, model picker
- [x] **[M] Error boundary** — catches React crashes, shows fallback + reload
- [x] **[M] Global error logging** — uncaught errors logged to Rust backend
- [x] **[M] Store input validation** — NaN rejection, bounds clamping, min zone duration
- [x] **[M] Runtime assertions** — dev-mode guards in critical paths
- [x] **[M] Backend health check** — polls every 30s, shows reconnecting banner
### Licensing
- [x] **[L] 7-day free trial** — no credit card required
- [x] **[L] License activation** — email confirmation step to deter key sharing
- [x] **[L] Ed25519-signed license keys** — offline verification
- [x] **[L] Trial integrity** — sentinel file prevents delete-and-reset, XOR checksum deters timestamp editing
- [x] **[L] canEdit gate** — defaults to locked, only unlocks after verified status
- [x] **[L] Expired state** — export and loading still work, editing and AI locked
### Robustness
- [x] **[R] Auto-save crash recovery** — every 60s, restore prompt on next launch
- [x] **[R] Bad project state recovery** — auto-prunes invalid zones on load
- [x] **[R] Zone/marker deletion confirmations** — prevents accidental removals
- [x] **[R] Progress bars** — export (determinate), transcription (indeterminate)
- [x] **[R] Loading spinners** — waveform, AI processing
- [x] **[R] Error states with retry** — AIPanel, WaveformTimeline
- [x] **[R] Empty states** — MarkersPanel, AIPanel, ZoneEditor
- [x] **[R] Canvas zone handles enlarged** — radius 6px, hit area increased
- [x] **[R] Search match contrast** — thicker rings, higher opacity
- [x] **[R] Split panes keyboard-accessible** — arrow keys, tabIndex, ARIA
### Testing
- [x] **95 frontend tests** — editorStore (68), licenseStore (22), aiStore (15), assert (4)
- [x] **12 Rust tests** — licensing (7), models (5)
- [x] **CI pipeline** — GitHub Actions (Rust: test+clippy, Frontend: tsc+vitest, Python: pytest)
---
## 🔴 What's Next — highest impact
- [ ] **[LLM] Bundled Qwen3 LLM** — auto-download on first AI use, no API keys needed. Replace Python `ai_provider.py` with llama.cpp Rust bindings. Two sizes: 4B (2.5GB, 8GB+ RAM) and 1.7B (1GB, 4GB+ RAM)
- [ ] **[SHORTS] Smart Shorts finder** — scan transcript for self-contained 10–90s segments, ranked by engagement. One-click export as separate clips
- [ ] **[PAYMENT] Wire checkout** — payment page at talked.it, Stripe → license key generation → delivery email
- [ ] **[BETA] Beta testers** — give 5–10 podcasters free licenses in exchange for feedback
- [ ] **[BUILD] Production builds** — `cargo tauri build` for Windows, macOS, Linux
---
## 🟡 Medium impact — AI features
- [ ] [#044] **AI Transcript Summarization** — bullet-point summary from transcript
- [ ] [#045] **AI Sentence Rephrase** — right-click word → see alternatives → replace
- [ ] [#046] **AI Smart Speed** — detect slow sections → suggest speed adjustments
- [ ] [#047] **AI Auto-Chapters** — topic detection from transcript → markers
- [ ] [#048] **AI Show Notes** — title, description, keywords, timestamps
- [ ] [#049] **AI Find Fluff** — detect rambles, off-topic chatter
- [ ] [#050] **AI Smooth Cuts** — crossfade between deleted segments
---
## 🟢 Lower impact — expansion
- [ ] **Project stitching** — load multiple .aive projects into one export
- [ ] **Batch export** — multiple projects/cuts in sequence
- [ ] **Smart chunking** — overlapping chunks for files >2hr
- [ ] [#014] Alternate transcription backend (VibeVoice-ASR-HF)
- [ ] [#051] **AI B-roll** — generate footage from text prompt
- [ ] [#052] **Smart Layouts** — auto-switch speakers in video frame
- [ ] [#053] **Per-track audio levels** — gain per speaker track
- [ ] [#054] **Intro/Outro templates** — reusable segment presets
- [ ] [#055] **Built-in free music library** — CC0 loops shipped with app
- [ ] [#056] **Stock media browser** — browse local resources/media/
- [ ] [#057] **Sample content downloader** — test video with pre-made transcript
---
## 🎬 OpenShot-inspired (long-term)
- [ ] Keyframe animations — clip position, scale, opacity over time
- [ ] Video transitions — crossfade, wipe between clips
- [ ] Title / text overlays — SVG templates, adjustable font/color
- [ ] Chroma key / greenscreen — per-clip effect
- [ ] Speed ramps — animate speed within a clip
- [ ] Frame-accurate stepping — arrow keys frame by frame
- [ ] Clip trimming on timeline — drag edges to trim
- [ ] Snapping — magnetic snap to markers and edges
---
## 💡 Competitive advantages
- **7-day free trial (no CC)** — full features, no risk
- **One-time purchase** — $39 Pro, $79 Business, no subscription
- **100% offline** — no account, no cloud, no data leaves your machine
- **Local AI** — filler detection, clip suggestions, Smart Clean work offline
- **Word-level precision** — edit video by deleting words, not razor cuts
- **Per-segment re-transcription** — fix transcription errors on just the bad part
- **Auto-ducking background music** — music lowers when speech detected, no keyframing
- **Works on long files** — virtualized transcript + chunked waveform handles 1hr+
---
## 🚫 Explicitly deferred
- Cloud sync / collaboration
- Voice cloning / TTS
- Full multi-track NLE (compositing, keyframes, nested sequences)
- Mobile app
- Subscription model
- Image/video generation models
TalkEdit's advantage is that it isn't a timeline editor — the text-is-the-timeline model makes spoken-word editing drastically faster than dragging razor cuts.
---
## 📦 Launch checklist
- [ ] Landing page at talked.it (features, screenshots, pricing, downloads)
- [ ] Demo video (3–5 min walkthrough)
- [ ] Product Hunt listing + 50 free licenses
- [ ] r/podcasting, r/VideoEditing, r/selfhosted posts
- [ ] Hacker News "Show HN"
- [ ] GitHub v1.0.0 release with Windows/macOS/Linux binaries
- [ ] Compare page: TalkEdit vs Descript

52
FFmpeg_COMPLIANCE.md Normal file

@ -0,0 +1,52 @@
# FFmpeg Compliance Checklist
Purpose: quick, practical checklist to ensure your TalkEdit distribution complies with FFmpeg licensing and packaging requirements.
1) Choose the FFmpeg build strategy
- Prefer an LGPL-only build (no GPL-only encoders) for minimal obligations.
- If you require GPL encoders (x264/x265/fdk-aac), document the decision and prepare to comply with GPL obligations.
2) Linking vs external binary
- Prefer spawning an external `ffmpeg` binary from Rust (invoking it as a child process) rather than statically linking FFmpeg into your app; a minimal sketch follows this section.
- If you link or bundle as a library, treat it as a third-party component and follow license terms strictly.
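A minimal Python sketch of the external-binary approach, assuming an `ffmpeg` is discoverable on PATH (the Rust side would do the same via `std::process::Command`); the arguments shown simply extract mono 16 kHz audio and are illustrative:

```python
# Minimal sketch: invoke ffmpeg as an external process instead of linking it.
# Assumes an ffmpeg binary is discoverable on PATH; file names are illustrative.
import shutil
import subprocess

def extract_audio(video_path: str, wav_path: str) -> None:
    ffmpeg = shutil.which("ffmpeg")
    if ffmpeg is None:
        raise RuntimeError("ffmpeg not found on PATH; install it or ship a sidecar binary")
    subprocess.run(
        [ffmpeg, "-y", "-i", video_path, "-vn", "-ac", "1", "-ar", "16000", wav_path],
        check=True,
        capture_output=True,
    )
```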
3) Bundling binary in installers
- If bundling `ffmpeg` binaries in installers, include the appropriate license files (COPYING.LGPL, COPYING.GPL) in the installer and app About/Legal.
- Include a plain-language notice in the installer/readme that explains which codecs/encoders are present and any implications.
4) Source & build-info disclosure
- For GPL components, you must provide access to the corresponding source or provide a written offer. Record the exact FFmpeg commit/configure flags used.
- Add a `third_party/ffmpeg/BUILD_INFO.txt` in the repo (or in release artifacts) containing:
- FFmpeg git commit or version
- configure flags used
- date and builder identity (automated CI username)
- link to the exact source tarball or repo snapshot
5) Make GPL components opt-in
- Default distribution: ship LGPL-only binary or no binary and invoke system `ffmpeg` when available.
- Offer an optional "codec pack" download or advanced installer that includes GPL encoders; make users explicitly accept terms before download.
6) Patent/licensing notice for codecs
- Add a short note in the README/installer explaining that certain codecs (H.264/AAC) may be patent-encumbered and that distributors may require separate licensing.
7) Platform-specific recommendations
- Linux: Prefer calling system FFmpeg (packaged by distro) or instruct users to install via package manager. If bundling, consider AppImage guidance.
- macOS: Prefer Homebrew/optional download; if bundling, include license files and sign/notarize appropriately.
- Windows: If shipping `ffmpeg.exe`, include license files and a link to the source/build info; include checksums for shipped binaries.
8) Build automation & compliance artifacts
- Add a CI step that builds or fetches the FFmpeg binary, captures `ffmpeg -buildconf`, and writes `BUILD_INFO.txt` into the release artifacts.
- Produce a LICENSES folder in each installer containing FFmpeg license text and any third-party license texts used by your chosen build.
9) User-visible legal UI
- Add an About > Legal pane listing third-party components and linked license files.
- If downloading binaries on first run, show an explicit notice with a link to license and source information and require an OK from the user.
10) Pre-release legal checklist
- Verify whether chosen build enables GPL libraries; if yes, prepare source or written offer before publishing.
- Ensure installer contains license files and links to source/build-info.
- Add a short FAQ entry about codec patents and user options.
Notes & Next Steps
- This checklist is practical guidance, not legal advice. For final release compliance, consult legal counsel experienced in open-source licensing.
- A small CI script that records `ffmpeg -buildconf` and uploads `BUILD_INFO.txt` to release assets can be added once the CI system is chosen; a starting-point sketch follows.
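A small, CI-agnostic starting point, assuming the shipped binary sits at `third_party/ffmpeg/ffmpeg`; adapt the paths per platform and wire the upload step into whichever CI runs the release job:

```python
# Sketch: capture the shipped ffmpeg's build configuration into BUILD_INFO.txt.
# FFMPEG_BIN and the output path are assumptions; adjust per platform and CI.
import subprocess
from datetime import datetime, timezone
from pathlib import Path

FFMPEG_BIN = "third_party/ffmpeg/ffmpeg"            # assumed location of the shipped binary
OUT = Path("third_party/ffmpeg/BUILD_INFO.txt")

def capture(args: list[str]) -> str:
    return subprocess.run(args, capture_output=True, text=True, check=True).stdout

OUT.parent.mkdir(parents=True, exist_ok=True)
OUT.write_text(
    f"Generated: {datetime.now(timezone.utc).isoformat()}\n\n"
    f"=== ffmpeg -version ===\n{capture([FFMPEG_BIN, '-version'])}\n"
    f"=== ffmpeg -buildconf ===\n{capture([FFMPEG_BIN, '-buildconf'])}\n"
)
```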


@ -1,105 +0,0 @@
# Gemini Insights: OBS Recording Transcriber
## Project Overview
The OBS Recording Transcriber is a Python application built with Streamlit that processes video recordings (particularly from OBS Studio) to generate transcripts and summaries using AI models. The application uses Whisper for transcription and Hugging Face Transformers for summarization.
## Key Improvement Areas
### 1. UI Enhancements
- **Implemented:**
- Responsive layout with columns for better organization
- Expanded sidebar with categorized settings
- Custom CSS for improved button styling
- Spinner for long-running operations
- Expanded transcript view by default
- **Additional Recommendations:**
- Add a dark mode toggle
- Implement progress bars for each processing step
- Add tooltips for complex options
- Create a dashboard view for batch processing results
- Add visualization of transcript segments with timestamps
### 2. Ollama Local API Integration
- **Implemented:**
- Local API integration for offline summarization
- Model selection from available Ollama models
- Chunking for long texts
- Fallback to online models when Ollama fails
- **Additional Recommendations:**
- Add temperature and other generation parameters as advanced options
- Implement streaming responses for real-time feedback
- Cache results to avoid reprocessing
- Add support for custom Ollama model creation with specific instructions
- Implement parallel processing for multiple chunks
### 3. Subtitle Export Formats
- **Implemented:**
- SRT export with proper formatting
- ASS export with basic styling
- Multi-format export options
- Automatic segment creation from plain text
- **Additional Recommendations:**
- Add customizable styling options for ASS subtitles
- Implement subtitle editing before export
- Add support for VTT format for web videos
- Implement subtitle timing adjustment
- Add batch export for multiple files
### 4. Architecture and Code Quality
- **Recommendations:**
- Implement proper error handling and logging throughout
- Add unit tests for critical components
- Create a configuration file for default settings
- Implement caching for processed files
- Add type hints throughout the codebase
- Document API endpoints for potential future web service
### 5. Performance Optimizations
- **Recommendations:**
- Implement parallel processing for batch operations
- Add GPU acceleration configuration options
- Optimize memory usage for large files
- Implement incremental processing for very long recordings
- Add compression options for exported files
### 6. Additional Features
- **Recommendations:**
- Speaker diarization (identifying different speakers)
- Language detection and translation
- Keyword extraction and timestamp linking
- Integration with video editing software
- Batch processing queue with email notifications
- Custom vocabulary for domain-specific terminology
## Implementation Roadmap
1. **Phase 1 (Completed):** Basic UI improvements, Ollama integration, and subtitle export
2. **Phase 2 (Completed):** Performance optimizations and additional export formats
- Added WebVTT export format for web videos
- Implemented GPU acceleration with automatic device selection
- Added caching system for faster processing of previously transcribed files
- Optimized memory usage with configurable memory limits
- Added compression options for exported files
- Enhanced ASS subtitle styling options
- Added progress indicators for better user feedback
3. **Phase 3 (Completed):** Advanced features like speaker diarization and translation
- Implemented speaker diarization to identify different speakers in recordings
- Added language detection and translation capabilities
- Integrated keyword extraction with timestamp linking
- Created interactive transcript with keyword highlighting
- Added named entity recognition for better content analysis
- Generated keyword index with timestamp references
- Provided speaker statistics and word count analysis
4. **Phase 4:** Integration with other tools and services
## Technical Considerations
- Ensure compatibility with different Whisper model sizes
- Handle large files efficiently to prevent memory issues
- Provide graceful degradation when optional dependencies are missing
- Maintain backward compatibility with existing workflows
- Consider containerization for easier deployment
## Conclusion
The OBS Recording Transcriber has a solid foundation but can be significantly enhanced with the suggested improvements. The focus should be on improving user experience, adding offline processing capabilities, and expanding export options to make the tool more versatile for different use cases.


@ -1,141 +0,0 @@
# Installation Guide for OBS Recording Transcriber
This guide will help you install all the necessary dependencies for the OBS Recording Transcriber application, including the advanced features from Phase 3.
## Prerequisites
Before installing the Python packages, you need to set up some prerequisites:
### 1. Python 3.8 or higher
Make sure you have Python 3.8 or higher installed. You can download it from [python.org](https://www.python.org/downloads/).
### 2. FFmpeg
FFmpeg is required for audio processing:
- **Windows**:
- Download from [gyan.dev/ffmpeg/builds](https://www.gyan.dev/ffmpeg/builds/)
- Extract the ZIP file
- Add the `bin` folder to your system PATH
- **macOS**:
```bash
brew install ffmpeg
```
- **Linux**:
```bash
sudo apt update
sudo apt install ffmpeg
```
### 3. Visual C++ Build Tools (Windows only)
Some packages like `tokenizers` require C++ build tools:
1. Download and install [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)
2. During installation, select "Desktop development with C++"
## Installation Steps
### 1. Create a Virtual Environment (Recommended)
```bash
# Create a virtual environment
python -m venv venv
# Activate the virtual environment
# Windows
venv\Scripts\activate
# macOS/Linux
source venv/bin/activate
```
### 2. Install PyTorch
For better performance, install PyTorch with CUDA support if you have an NVIDIA GPU:
```bash
# Windows/Linux with CUDA
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# macOS or CPU-only
pip install torch torchvision torchaudio
```
### 3. Install Dependencies
```bash
# Install all dependencies from requirements.txt
pip install -r requirements.txt
```
### 4. Troubleshooting Common Issues
#### Tokenizers Installation Issues
If you encounter issues with `tokenizers` installation:
1. Make sure you have Visual C++ Build Tools installed (Windows)
2. Try installing Rust: [rustup.rs](https://rustup.rs/)
3. Install tokenizers separately:
```bash
pip install tokenizers --no-binary tokenizers
```
#### PyAnnote.Audio Access
To use speaker diarization, you need a HuggingFace token with access to the pyannote models:
1. Create an account on [HuggingFace](https://huggingface.co/)
2. Generate an access token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
3. Request access to [pyannote/speaker-diarization-3.0](https://huggingface.co/pyannote/speaker-diarization-3.0)
4. Set the token in the application when prompted or as an environment variable:
```bash
# Windows
set HF_TOKEN=your_token_here
# macOS/Linux
export HF_TOKEN=your_token_here
```
#### Memory Issues with Large Files
If you encounter memory issues with large files:
1. Use a smaller Whisper model (e.g., "base" instead of "large")
2. Reduce the GPU memory fraction in the application settings
3. Increase your system's swap space/virtual memory
## Running the Application
After installation, run the application with:
```bash
streamlit run app.py
```
## Optional: Ollama Setup for Local Summarization
To use Ollama for local summarization:
1. Install Ollama from [ollama.ai](https://ollama.ai/)
2. Pull a model:
```bash
ollama pull llama3
```
3. Uncomment the Ollama line in requirements.txt and install:
```bash
pip install ollama
```
## Verifying Installation
To verify that all components are working correctly:
1. Run the application
2. Check that GPU acceleration is available (if applicable)
3. Test a small video file with basic transcription
4. Gradually enable advanced features like diarization and translation
If you encounter any issues, check the application logs for specific error messages.


@ -1,6 +1,6 @@
MIT License
Copyright (c) 2025 DataAnts-AI
Copyright (c) 2026 DataAnts AI
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal


@ -1,63 +0,0 @@
# 🚨 Quick Fix for PyTorch Compatibility Error
If you're seeing the `torch.compiler.disable` error, here's how to fix it:
## Immediate Fix
```bash
# Stop the current container
docker-compose down
# Remove the old image to force rebuild with fixed versions
docker rmi $(docker images | grep videotranscriber | awk '{print $3}')
# Rebuild with fixed dependencies
docker-compose up -d --build
```
## Better Solution: Use Prebuilt Images
⚠️ **Note**: GitHub Actions had a naming issue that's now fixed. See [FIX-GITHUB-ACTIONS.md](FIX-GITHUB-ACTIONS.md) for details.
Once prebuilt images are available, use them instead:
```bash
# Check if images are ready
docker pull ghcr.io/dataants-ai/videotranscriber:latest
# If successful, stop current container and use prebuilt image
docker-compose down
docker-compose -f docker-compose.prebuilt.yml up -d
```
## What Was Fixed
1. **Version Pinning**: Updated `requirements.txt` with compatible versions:
- `torch==2.0.1` (was `>=1.7.0`)
- `pytorch-lightning==2.0.6` (compatible with torch 2.0.1)
- `pyannote.audio==3.1.1` (updated to compatible version)
2. **Build Process**: Removed duplicate PyTorch installation that could cause conflicts
3. **Prebuilt Images**: Created GitHub Actions to build reliable, tested images
## Verification
After fixing, you should see the Streamlit app load without errors at `http://localhost:8501`
## If Still Having Issues
1. **Clear Docker cache**:
```bash
docker system prune -a
```
2. **Check logs**:
```bash
docker-compose logs -f
```
3. **Manual rebuild**:
```bash
docker build --no-cache -t videotranscriber .
```

308
README.md

@ -1,198 +1,176 @@
# Video Transcriber
# TalkEdit
## Project Overview
The Video Recording Transcriber is a Python application built with Streamlit that processes video and audio recordings to generate transcripts and summaries using AI models. The application uses Whisper for transcription and Hugging Face Transformers for summarization.
**Edit video by editing text.** An offline, local-first desktop video editor where deleting a word from the transcript cuts it from the video.
**Supported Formats**: MP4, AVI, MOV, MKV (video) and M4A (audio)
<img width="1034" height="661" alt="TalkEdit screenshot" src="https://github.com/user-attachments/assets/b1ed9505-792e-42ca-bb73-85458d0f02a5" />
---
![SuiteQL_query_UI-1-Thumbnail](https://github.com/user-attachments/assets/72aaf238-6615-4739-b77f-c4eb9ff96996)
## Features
Demo here
- **Text-based editing** — delete, reorder, or correct words in the transcript to edit the underlying video. No razor tool, no timeline slicing.
- **Word-level transcription** — Whisper.cpp with per-word timestamps and confidence scores. Low-confidence words get a visual warning.
- **Four zone types** — Cut, Mute, Sound Gain, and Speed Adjust. Create zones on the waveform timeline and drag edges to refine.
- **Waveform timeline** — zoomable, scrollable waveform with playhead scrubbing, zone visualization, markers, chapters, and thumbnail strips.
- **AI-powered editing**
- Filler word detection and removal
- Smart Clean: one-click filler removal + silence trim + noise reduction + loudness normalization
- Clip suggestions for social media shorts
- Sentence rephrase with AI alternatives
- Supports **Ollama** (local), **OpenAI**, and **Claude** backends
- **Background music** — import a second audio track with auto-ducking via sidechain compression (a minimal FFmpeg sketch follows this list).
- **Export** — fast stream-copy or full re-encode to MP4, MOV, WebM, or WAV. Resolution up to 4K.
- **Captions** — generate SRT, VTT, or burn-in ASS subtitles with configurable font, color, and position.
- **Speaker diarization** — identify and label multiple speakers.
- **Audio tools** — noise reduction (DeepFilterNet), loudness normalization (LUFS targeting), background removal (MediaPipe), batch silence removal, video zoom/punch-in.
- **Project save/load** — `.aive` JSON format preserves all edits, zones, markers, and AI config.
- **Customizable hotkeys** — two presets (Standard / Left-hand) with per-key remapping and conflict detection.
- **100% offline, no account required** — everything runs on your machine. No telemetry, no cloud dependency.
- **7-day free trial** with one-time license key purchase. No subscription.
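For the background-music feature above, a minimal sketch of the auto-ducking idea using FFmpeg's `sidechaincompress` filter: the music track is compressed whenever the voice track has energy, then the untouched voice is mixed back in. Threshold, ratio, attack, and release values are examples, not the app's actual defaults.

```python
# Illustrative only: auto-duck the music (input 0) whenever the voice (input 1)
# has energy, then mix the untouched voice back in.
import subprocess

def duck_music(voice_path: str, music_path: str, out_path: str) -> None:
    filter_complex = (
        "[1:a]asplit=2[sc][mix];"  # voice: sidechain key + mix copy
        "[0:a][sc]sidechaincompress=threshold=0.03:ratio=8:attack=20:release=300[ducked];"
        "[ducked][mix]amix=inputs=2:duration=first[out]"
    )
    subprocess.run(
        ["ffmpeg", "-y", "-i", music_path, "-i", voice_path,
         "-filter_complex", filter_complex, "-map", "[out]", out_path],
        check=True,
    )
```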
https://github.com/user-attachments/assets/990e63fc-232e-46a0-afdf-ca8836d46a13
---
## Tech Stack
## Installation
| Layer | Technology |
|-------|------------|
| Desktop shell | **Tauri 2.0** (Rust) |
| Frontend | **React** + **TypeScript** + **Tailwind CSS** |
| State management | **Zustand** with Zundo undo/redo |
| Transcription | **Whisper.cpp** (word-level timestamps) |
| AI / LLM | **Ollama**, **OpenAI**, **Claude** (pluggable backends) |
| Media processing | **FFmpeg** |
| Python services | **FastAPI** (spawned as a child process) |
### 🐳 Docker Installation (Recommended)
---
**Benefits**: Isolated environment, no dependency conflicts, easy deployment
## Quick Start
### Prerequisites
- **Node.js** 18+
- **Python** 3.10+
- **FFmpeg** (in PATH)
- **Rust** toolchain (for Tauri)
- **Ollama** (optional, for local AI features)
### Install
#### Option A: Prebuilt Images (Fastest & Most Reliable)
```bash
# 1. Clone repository for config files
git clone https://github.com/DataAnts-AI/VideoTranscriber.git
cd VideoTranscriber
# Root and frontend dependencies
npm install
cd frontend && npm install && cd ..
# 2. Setup environment
cp docker.env.example .env
# Edit .env with your video directory paths
# 3. Ensure Ollama is running on host
ollama serve # In separate terminal
ollama pull llama3
# 4. Start with prebuilt image
docker-compose -f docker-compose.prebuilt.yml up -d
# 5. Access application
# Open browser to: http://localhost:8501
# Backend dependencies
cd backend && pip install -r requirements.txt && cd ..
```
#### Option B: Build from Source (Development)
### Run (Development)
```bash
# Use the local build approach
docker-compose up -d
# Start everything: backend + frontend + Tauri
npm run dev:tauri
```
See [DOCKER.md](DOCKER.md) for complete Docker setup guide.
Or run components separately:
### Easy Installation (Recommended)
```bash
# Terminal 1: Python backend
npm run dev:backend
#### Windows
1. Download or clone the repository
2. Run `install.bat` by double-clicking it
3. Follow the on-screen instructions
#### Linux/macOS
1. Download or clone the repository
2. Open a terminal in the project directory
3. Make the install script executable: `chmod +x install.sh`
4. Run the script: `./install.sh`
5. Follow the on-screen instructions
### Manual Installation
1. Clone the repo.
```
git clone https://github.com/DataAnts-AI/VideoTranscriber.git
cd VideoTranscriber
# Terminal 2: Frontend + Tauri
cd frontend && cargo tauri dev
```
2. Install dependencies:
```
pip install -r requirements.txt
### Build
```bash
npm run build:tauri
```
Notes:
- Ensure that the versions align with the features you use and your system compatibility.
- torch version should match the capabilities of your hardware (e.g., CUDA support for GPUs).
- For advanced features like speaker diarization, you'll need a HuggingFace token.
- See `INSTALLATION.md` for detailed instructions and troubleshooting.
---
## Project Structure
3. Run the application:
```
streamlit run app.py
talkedit/
├── src-tauri/ # Tauri 2.0 Rust runtime
│ ├── Cargo.toml
│ └── src/
│ ├── main.rs # App entry, backend spawner
│ ├── lib.rs # Command handlers (IPC bridge)
│ ├── transcription.rs # Whisper.cpp integration
│ ├── video_editor.rs # FFmpeg-based editing
│ ├── caption_generator.rs
│ ├── diarization.rs
│ ├── ai_provider.rs # Ollama / OpenAI / Claude
│ ├── audio_cleaner.rs
│ ├── background_removal.rs
│ ├── licensing.rs # Trial + key activation
│ ├── models.rs # Shared data types
│ └── paths.rs
├── frontend/ # React + Vite + Tailwind
│ └── src/
│ ├── components/ # UI components
│ │ ├── TranscriptEditor.tsx
│ │ ├── WaveformTimeline.tsx
│ │ ├── VideoPlayer.tsx
│ │ ├── AIPanel.tsx
│ │ ├── ExportDialog.tsx
│ │ ├── SettingsPanel.tsx
│ │ ├── BackgroundMusicPanel.tsx
│ │ ├── MarkersPanel.tsx
│ │ ├── ZoneEditor.tsx
│ │ ├── SilenceTrimmerPanel.tsx
│ │ ├── AppendClipPanel.tsx
│ │ ├── LicenseDialog.tsx
│ │ └── DevPanel.tsx
│ ├── store/ # Zustand state (editorStore, aiStore, settingsStore)
│ ├── hooks/ # Custom React hooks
│ ├── lib/ # Utilities and Tauri bridge
│ └── types/ # TypeScript interfaces
├── backend/ # FastAPI Python services
│ ├── main.py
│ ├── routers/ # API endpoints
│ │ ├── transcribe.py
│ │ ├── ai.py
│ │ ├── audio.py
│ │ ├── captions.py
│ │ └── export.py
│ ├── services/ # Core logic
│ ├── video_editor.py
│ ├── caption_generator.py
│ ├── ai_provider.py
│ ├── diarization.py
│ ├── audio_cleaner.py
│ ├── background_removal.py
│ └── license_server.py
├── shared/ # Schema definitions (project format)
├── models/ # Whisper model storage
└── docs/ # Documentation
```
## Usage
1. Set your base folder where video/audio recordings are stored
2. Select a recording from the dropdown (supports MP4, AVI, MOV, MKV, M4A)
3. Choose transcription and summarization models
4. Configure performance settings (GPU acceleration, caching)
5. Select export formats and compression options
6. Click "Process Recording" to start
---
## Advanced Features
- **Speaker Diarization**: Identify and label different speakers in your recordings
- **Translation**: Automatically detect language and translate to multiple languages
- **Keyword Extraction**: Extract important keywords with timestamp links
- **Interactive Transcript**: Navigate through the transcript with keyword highlighting
- **GPU Acceleration**: Utilize your GPU for faster processing
- **Caching**: Save processing time by caching results
## Keyboard Shortcuts
| Key | Action |
|-----|--------|
| Space | Play / Pause |
| J / K / L | Reverse / Pause / Forward |
| I / O | Mark In / Mark Out |
| ← / → | Seek ±5 seconds |
| Delete | Delete selected words or zones |
| Ctrl+Z | Undo |
| Ctrl+Shift+Z | Redo |
| Ctrl+S | Save project |
| Ctrl+E | Export |
| Ctrl+F | Search transcript |
| Ctrl+Scroll | Zoom waveform |
| ? | Shortcut cheatsheet |
---
## Key Improvement Areas
## License
### 1. UI Enhancements
- **Implemented:**
- Responsive layout with columns for better organization
- Expanded sidebar with categorized settings
- Custom CSS for improved button styling
- Spinner for long-running operations
- Expanded transcript view by default
- **Additional Recommendations:**
- Add a dark mode toggle
- Implement progress bars for each processing step
- Add tooltips for complex options
- Create a dashboard view for batch processing results
- Add visualization of transcript segments with timestamps
### 2. Ollama Local API Integration
- **Implemented:**
- Local API integration for offline summarization
- Model selection from available Ollama models
- Chunking for long texts
- Fallback to online models when Ollama fails
- **Additional Recommendations:**
- Add temperature and other generation parameters as advanced options
- Implement streaming responses for real-time feedback
- Cache results to avoid reprocessing
- Add support for custom Ollama model creation with specific instructions
- Implement parallel processing for multiple chunks
### 3. Subtitle Export Formats
- **Implemented:**
- SRT export with proper formatting
- ASS export with basic styling
- Multi-format export options
- Automatic segment creation from plain text
- **Additional Recommendations:**
- Add customizable styling options for ASS subtitles
- Implement subtitle editing before export
- Add support for VTT format for web videos
- Implement subtitle timing adjustment
- Add batch export for multiple files
### 4. Architecture and Code Quality
- **Recommendations:**
- Implement proper error handling and logging throughout
- Add unit tests for critical components
- Create a configuration file for default settings
- Implement caching for processed files
- Add type hints throughout the codebase
- Document API endpoints for potential future web service
### 5. Performance Optimizations
- **Recommendations:**
- Implement parallel processing for batch operations
- Add GPU acceleration configuration options
- Optimize memory usage for large files
- Implement incremental processing for very long recordings
- Add compression options for exported files
### 6. Additional Features
- **Recommendations:**
- Speaker diarization (identifying different speakers)
- Language detection and translation
- Keyword extraction and timestamp linking
- Integration with video editing software
- Batch processing queue with email notifications
- Custom vocabulary for domain-specific terminology
## Implementation Roadmap
1. **Phase 1 (Completed):** Basic UI improvements, Ollama integration, and subtitle export
2. **Phase 2 (Completed):** Performance optimizations and additional export formats
- Added WebVTT export format for web videos
- Implemented GPU acceleration with automatic device selection
- Added caching system for faster processing of previously transcribed files
- Optimized memory usage with configurable memory limits
- Added compression options for exported files
- Enhanced ASS subtitle styling options
- Added progress indicators for better user feedback
3. **Phase 3 (Completed):** Advanced features like speaker diarization and translation
- Implemented speaker diarization to identify different speakers in recordings
- Added language detection and translation capabilities
- Integrated keyword extraction with timestamp linking
- Created interactive transcript with keyword highlighting
- Added named entity recognition for better content analysis
- Generated keyword index with timestamp references
- Provided speaker statistics and word count analysis
4. **Phase 4:** Integration with other tools and services (In progress)
Reach out to support@dataants.org if you need assistance with any AI solutions - we offer support for n8n workflows, local RAG chatbots, and ERP and Financial reporting.
Source code is MIT — see [LICENSE](LICENSE) for details. The distributed binary includes a 7-day free trial; continued use requires a one-time license key purchase.

83
TECH_FEATURES.md Normal file

@ -0,0 +1,83 @@
# TalkEdit — Tech Stack, Tools, and Features
This document summarizes the chosen technology, tooling, the full feature set, recommended additions, and items on the back burner.
## Overview
- Goal: Offline, local text-based audio/video editor (Descript-style) focused on spoken-word creators (podcasters, YouTubers). Fast, privacy-first, single-file installer.
## Tech Stack
- Frontend: React 19 + Vite + TypeScript + Tailwind CSS + Zustand (with zundo undo/redo) + Virtuoso (virtualized transcript)
- Backend: Tauri 2.0 (Rust) for file I/O, licensing crypto (Ed25519), model management, error logging
- Transcription: Python faster-whisper with WhisperX for word-level alignment. Models downloaded on demand (see the word-timestamp sketch after this list).
- Audio/Video Processing: FFmpeg invoked from Rust via Python scripts (video_editor.py, audio_cleaner.py, caption_generator.py)
- AI: Ollama, OpenAI, Claude through Python ai_provider.py. Bundled Qwen3 LLM planned.
- State: Zustand (in-frontend store) + zundo middleware for undo/redo history
- Packaging: Tauri `tauri build` for cross-platform installers
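A minimal sketch of the word-timestamp step with `faster-whisper` (the WhisperX alignment pass would then refine these timings); model size, device, and file name are illustrative:

```python
# Minimal sketch of word-level transcription with faster-whisper.
from faster_whisper import WhisperModel

model = WhisperModel("base", device="auto", compute_type="int8")
segments, info = model.transcribe("episode.wav", word_timestamps=True)

words = []
for segment in segments:
    for w in segment.words:
        # each word carries start/end timestamps and a confidence score
        words.append({"text": w.word, "start": w.start, "end": w.end, "prob": w.probability})

print(f"detected language: {info.language}, {len(words)} words")
```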
## Developer Tools
- Rust toolchain (cargo, rustc)
- Node.js + npm for frontend
- Python 3.11+ (faster-whisper, WhisperX, AI providers)
- FFmpeg binaries (platform-specific; bundled or downloaded at install)
- Build/test: Tauri CLI, Vite dev server
- Testing: Vitest (frontend), cargo test (Rust), pytest (Python)
- CI: GitHub Actions (Rust clippy/test, Frontend tsc/vitest, Python pytest)
## Implemented Features
- [x] 1. Media import via file dialog (audio/video auto audio-extract)
- [x] 2. One-click local transcription with model selector (tiny/base → larger models) and model-size chooser
- [x] 3. Scrollable, Google-Doc-style transcript editor (Virtuoso virtualized)
- Click word → seek video/audio
- Select words → cut corresponding media segment (smart 150-250ms fades)
- [x] 4. Smart Cleanup (see the filler-detection sketch after this list)
- Filler word removal (configurable list per-project)
- Silence trimming
- [x] 5. Audio Polish chain (FFmpeg): normalize, compression, noise reduction
- [x] 6. Preview with synced playback, undo/redo (zundo), project save/load
- [x] 7. Export MP4/audio with SRT/VTT/ASS captions (speaker-labeled)
- [x] 8. Speaker diarization
- [x] 9. Custom filler lists per-project
- [x] 10. Background music with auto-ducking
- [x] 11. Append clips (concatenation)
- [x] 12. Settings: AI provider config (Ollama, OpenAI, Claude)
- [x] 13. Keyboard shortcuts with custom remapping
- [x] 14. Help panel + cheatsheet
- [x] 15. 7-day licensing with Ed25519-signed license keys
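To illustrate item 4 (Smart Cleanup), a toy sketch of filler-word detection against the word-level timestamps, producing the time ranges to cut; the filler list and padding are examples, the real list is configurable per project:

```python
# Toy sketch of filler-word detection against word-level timestamps: returns the
# time ranges to cut from the media.
FILLERS = {"um", "uh", "uhm", "erm"}

def filler_cut_ranges(words, pad=0.02):
    """words: [{'text': str, 'start': float, 'end': float}, ...] in transcript order."""
    ranges = []
    for w in words:
        token = w["text"].strip().lower().strip(".,!?")
        if token in FILLERS:
            ranges.append((max(0.0, w["start"] - pad), w["end"] + pad))
    return ranges

# e.g. [(3.12, 3.44), (17.80, 18.02)] -> fed into the FFmpeg cut/export step
```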
## Recommended Additions (near-term, high ROI)
- [ ] Local GPU/CPU detection & recommended model/settings UI
- [ ] Per-project incremental transcription: re-run only edited segments
- [ ] "Preview cleaning" dry-run that highlights candidate removals before applying
- [ ] Export size/time estimator and suggested export presets
- [ ] Accessibility export presets (podcast vs YouTube presets)
- [ ] Bundled Qwen3 LLM for offline AI features
## Remove / Defer (Back Burner)
These broaden scope or add legal/privacy surface — defer for now.
- Voice cloning / TTS: DEFER
- Multi-track, full timeline NLE features: DEFER
- Real-time collaboration / cloud sync: DEFER
- Built-in cloud processing by default: DEFER (make optional add-on later)
## Risks & Mitigations
- Large model sizes: don't bundle large models; download on-demand and document storage location.
- Timestamp accuracy: WhisperX word-level alignment + manual per-segment re-run available.
- FFmpeg packaging/licensing: ship platform-specific binaries or use Tauri bundling guidance; document license compliance.
## Prioritized Quick Wins
1. Per-project incremental transcription
2. "Preview cleaning" dry-run UI
3. Export presets (podcast vs YouTube)
## Next Steps for Implementation
- Bundle Qwen3 LLM for offline AI processing.
- Implement incremental transcription to speed up re-editing workflows.
- Add export presets and size estimation.
- Improve GPU/CPU detection and model recommendations.
---
Generated to capture tech, tools, implemented features, and the recommended add/remove/defer list.

544
app.py

@ -1,544 +0,0 @@
import streamlit as st
from utils.audio_processing import extract_audio
from utils.transcription import transcribe_audio
from utils.summarization import summarize_text
from utils.validation import validate_environment
from utils.export import export_transcript
from pathlib import Path
import os
import logging
import humanize
from datetime import timedelta
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Try to import Ollama integration, but don't fail if it's not available
try:
from utils.ollama_integration import check_ollama_available, list_available_models, chunk_and_summarize
OLLAMA_AVAILABLE = check_ollama_available()
except ImportError:
OLLAMA_AVAILABLE = False
# Try to import GPU utilities, but don't fail if not available
try:
from utils.gpu_utils import get_gpu_info, configure_gpu
GPU_UTILS_AVAILABLE = True
except ImportError:
GPU_UTILS_AVAILABLE = False
# Try to import caching utilities, but don't fail if not available
try:
from utils.cache import get_cache_size, clear_cache
CACHE_AVAILABLE = True
except ImportError:
CACHE_AVAILABLE = False
# Try to import diarization utilities, but don't fail if not available
try:
from utils.diarization import transcribe_with_diarization
DIARIZATION_AVAILABLE = True
except ImportError:
DIARIZATION_AVAILABLE = False
# Try to import translation utilities, but don't fail if not available
try:
from utils.translation import transcribe_and_translate, get_language_name
TRANSLATION_AVAILABLE = True
except ImportError:
TRANSLATION_AVAILABLE = False
# Try to import keyword extraction utilities, but don't fail if not available
try:
from utils.keyword_extraction import extract_keywords_from_transcript, generate_keyword_index, generate_interactive_transcript
KEYWORD_EXTRACTION_AVAILABLE = True
except ImportError:
KEYWORD_EXTRACTION_AVAILABLE = False
def main():
# Set page configuration
st.set_page_config(
page_title="OBS Recording Transcriber",
page_icon="🎥",
layout="wide",
initial_sidebar_state="expanded"
)
# Custom CSS for better UI
st.markdown("""
<style>
.main .block-container {
padding-top: 2rem;
padding-bottom: 2rem;
}
.stButton>button {
width: 100%;
}
.stDownloadButton>button {
width: 100%;
}
.stProgress > div > div > div {
background-color: #4CAF50;
}
.speaker {
font-weight: bold;
color: #1E88E5;
}
.timestamp {
color: #757575;
font-size: 0.9em;
margin-right: 8px;
}
.keyword {
background-color: #FFF9C4;
padding: 0 2px;
border-radius: 3px;
}
.interactive-transcript p {
margin-bottom: 8px;
}
</style>
""", unsafe_allow_html=True)
st.title("🎥 OBS Recording Transcriber")
st.caption("Process your OBS recordings with AI transcription and summarization")
# Sidebar configuration
st.sidebar.header("Settings")
# Allow the user to select a base folder
base_folder = st.sidebar.text_input(
"Enter the base folder path:",
value=str(Path.home())
)
base_path = Path(base_folder)
# Model selection
st.sidebar.subheader("Model Settings")
# Transcription model selection
transcription_model = st.sidebar.selectbox(
"Transcription Model",
["tiny", "base", "small", "medium", "large"],
index=1,
help="Select the Whisper model size. Larger models are more accurate but slower."
)
# Summarization model selection
summarization_options = ["Hugging Face (Online)", "Ollama (Local)"] if OLLAMA_AVAILABLE else ["Hugging Face (Online)"]
summarization_method = st.sidebar.selectbox(
"Summarization Method",
summarization_options,
index=0,
help="Select the summarization method. Ollama runs locally but requires installation."
)
# If Ollama is selected, show model selection
ollama_model = None
if OLLAMA_AVAILABLE and summarization_method == "Ollama (Local)":
available_models = list_available_models()
if available_models:
ollama_model = st.sidebar.selectbox(
"Ollama Model",
available_models,
index=available_models.index("llama3") if "llama3" in available_models else 0,
help="Select the Ollama model to use for summarization."
)
else:
st.sidebar.warning("No Ollama models found. Please install models using 'ollama pull model_name'.")
# Advanced features
st.sidebar.subheader("Advanced Features")
# Speaker diarization
use_diarization = st.sidebar.checkbox(
"Speaker Diarization",
value=False,
disabled=not DIARIZATION_AVAILABLE,
help="Identify different speakers in the recording."
)
# Show HF token input if diarization is enabled
hf_token = None
if use_diarization and DIARIZATION_AVAILABLE:
hf_token = st.sidebar.text_input(
"HuggingFace Token",
type="password",
help="Required for speaker diarization. Get your token at huggingface.co/settings/tokens"
)
num_speakers = st.sidebar.number_input(
"Number of Speakers",
min_value=1,
max_value=10,
value=2,
help="Specify the number of speakers if known, or leave at default for auto-detection."
)
# Translation
use_translation = st.sidebar.checkbox(
"Translation",
value=False,
disabled=not TRANSLATION_AVAILABLE,
help="Translate the transcript to another language."
)
# Target language selection if translation is enabled
target_lang = None
if use_translation and TRANSLATION_AVAILABLE:
target_lang = st.sidebar.selectbox(
"Target Language",
["en", "es", "fr", "de", "it", "pt", "nl", "ru", "zh", "ja", "ko", "ar"],
format_func=lambda x: f"{get_language_name(x)} ({x})",
help="Select the language to translate to."
)
# Keyword extraction
use_keywords = st.sidebar.checkbox(
"Keyword Extraction",
value=False,
disabled=not KEYWORD_EXTRACTION_AVAILABLE,
help="Extract keywords and link them to timestamps."
)
if use_keywords and KEYWORD_EXTRACTION_AVAILABLE:
max_keywords = st.sidebar.slider(
"Max Keywords",
min_value=5,
max_value=30,
value=15,
help="Maximum number of keywords to extract."
)
# Performance settings
st.sidebar.subheader("Performance Settings")
# GPU acceleration
use_gpu = st.sidebar.checkbox(
"Use GPU Acceleration",
value=True if GPU_UTILS_AVAILABLE else False,
disabled=not GPU_UTILS_AVAILABLE,
help="Use GPU for faster processing if available."
)
# Show GPU info if available
if GPU_UTILS_AVAILABLE and use_gpu:
gpu_info = get_gpu_info()
if gpu_info["cuda_available"]:
gpu_devices = [f"{d['name']} ({humanize.naturalsize(d['total_memory'])})" for d in gpu_info["cuda_devices"]]
st.sidebar.info(f"GPU(s) available: {', '.join(gpu_devices)}")
elif gpu_info["mps_available"]:
st.sidebar.info("Apple Silicon GPU (MPS) available")
else:
st.sidebar.warning("No GPU detected. Using CPU.")
# Memory usage
memory_fraction = st.sidebar.slider(
"GPU Memory Usage",
min_value=0.1,
max_value=1.0,
value=0.8,
step=0.1,
disabled=not (GPU_UTILS_AVAILABLE and use_gpu),
help="Fraction of GPU memory to use. Lower if you encounter out-of-memory errors."
)
# Caching options
use_cache = st.sidebar.checkbox(
"Use Caching",
value=True if CACHE_AVAILABLE else False,
disabled=not CACHE_AVAILABLE,
help="Cache transcription results to avoid reprocessing the same files."
)
# Cache management
if CACHE_AVAILABLE and use_cache:
cache_size, cache_files = get_cache_size()
if cache_size > 0:
st.sidebar.info(f"Cache: {humanize.naturalsize(cache_size)} ({cache_files} files)")
if st.sidebar.button("Clear Cache"):
cleared = clear_cache()
st.sidebar.success(f"Cleared {cleared} cache files")
# Export options
st.sidebar.subheader("Export Options")
export_format = st.sidebar.multiselect(
"Export Formats",
["TXT", "SRT", "VTT", "ASS"],
default=["TXT"],
help="Select the formats to export the transcript."
)
# Compression options
compress_exports = st.sidebar.checkbox(
"Compress Exports",
value=False,
help="Compress exported files to save space."
)
if compress_exports:
compression_type = st.sidebar.radio(
"Compression Format",
["gzip", "zip"],
index=0,
help="Select the compression format for exported files."
)
else:
compression_type = None
# ASS subtitle styling
if "ASS" in export_format:
st.sidebar.subheader("ASS Subtitle Styling")
show_style_options = st.sidebar.checkbox("Customize ASS Style", value=False)
if show_style_options:
ass_style = {}
ass_style["fontname"] = st.sidebar.selectbox(
"Font",
["Arial", "Helvetica", "Times New Roman", "Courier New", "Comic Sans MS"],
index=0
)
ass_style["fontsize"] = st.sidebar.slider("Font Size", 12, 72, 48)
ass_style["alignment"] = st.sidebar.selectbox(
"Alignment",
["2 (Bottom Center)", "1 (Bottom Left)", "3 (Bottom Right)", "8 (Top Center)"],
index=0
).split()[0] # Extract just the number
ass_style["bold"] = "-1" if st.sidebar.checkbox("Bold", value=True) else "0"
ass_style["italic"] = "-1" if st.sidebar.checkbox("Italic", value=False) else "0"
else:
ass_style = None
# Validate environment
env_errors = validate_environment(base_path)
if env_errors:
st.error("## Environment Issues")
for error in env_errors:
st.markdown(f"- {error}")
return
# File selection - support multiple video and audio formats
supported_extensions = ["*.mp4", "*.avi", "*.mov", "*.mkv", "*.m4a"]
recordings = []
for extension in supported_extensions:
recordings.extend(base_path.glob(extension))
if not recordings:
st.warning(f"📂 No recordings found in the folder: {base_folder}!")
st.info("💡 Supported formats: MP4, AVI, MOV, MKV, M4A")
return
selected_file = st.selectbox("Choose a recording", recordings)
# Process button with spinner
if st.button("🚀 Start Processing"):
# Create a progress bar
progress_bar = st.progress(0)
status_text = st.empty()
try:
# Update progress
status_text.text("Extracting audio...")
progress_bar.progress(10)
# Process based on selected features
if use_diarization and DIARIZATION_AVAILABLE and hf_token:
# Transcribe with speaker diarization
status_text.text("Transcribing with speaker diarization...")
num_speakers_arg = int(num_speakers) if num_speakers > 0 else None
diarized_segments, diarized_transcript = transcribe_with_diarization(
selected_file,
whisper_model=transcription_model,
num_speakers=num_speakers_arg,
use_gpu=use_gpu,
hf_token=hf_token
)
segments = diarized_segments
transcript = diarized_transcript
elif use_translation and TRANSLATION_AVAILABLE:
# Transcribe and translate
status_text.text("Transcribing and translating...")
original_segments, translated_segments, original_transcript, translated_transcript = transcribe_and_translate(
selected_file,
whisper_model=transcription_model,
target_lang=target_lang,
use_gpu=use_gpu
)
segments = translated_segments
transcript = translated_transcript
# Store original for display
original_text = original_transcript
else:
# Standard transcription
status_text.text("Transcribing audio...")
segments, transcript = transcribe_audio(
selected_file,
model=transcription_model,
use_cache=use_cache,
use_gpu=use_gpu,
memory_fraction=memory_fraction
)
progress_bar.progress(50)
if transcript:
# Extract keywords if requested
keyword_timestamps = None
entity_timestamps = None
if use_keywords and KEYWORD_EXTRACTION_AVAILABLE:
status_text.text("Extracting keywords...")
keyword_timestamps, entity_timestamps = extract_keywords_from_transcript(
transcript,
segments,
max_keywords=max_keywords,
use_gpu=use_gpu
)
# Generate keyword index
keyword_index = generate_keyword_index(keyword_timestamps, entity_timestamps)
# Generate interactive transcript
interactive_transcript = generate_interactive_transcript(
segments,
keyword_timestamps,
entity_timestamps
)
# Generate summary based on selected method
status_text.text("Generating summary...")
if OLLAMA_AVAILABLE and summarization_method == "Ollama (Local)" and ollama_model:
summary = chunk_and_summarize(transcript, model=ollama_model)
if not summary:
st.warning("Ollama summarization failed. Falling back to Hugging Face.")
summary = summarize_text(
transcript,
use_gpu=use_gpu,
memory_fraction=memory_fraction
)
else:
summary = summarize_text(
transcript,
use_gpu=use_gpu,
memory_fraction=memory_fraction
)
progress_bar.progress(80)
status_text.text("Preparing results...")
# Display results in tabs
tab1, tab2, tab3 = st.tabs(["Summary", "Transcript", "Advanced"])
with tab1:
st.subheader("🖍 Summary")
st.write(summary)
# If translation was used, show original language
if use_translation and TRANSLATION_AVAILABLE and 'original_text' in locals():
with st.expander("Original Language Summary"):
original_summary = summarize_text(
original_text,
use_gpu=use_gpu,
memory_fraction=memory_fraction
)
st.write(original_summary)
with tab2:
st.subheader("📜 Full Transcript")
# Show interactive transcript if keywords were extracted
if use_keywords and KEYWORD_EXTRACTION_AVAILABLE and 'interactive_transcript' in locals():
st.markdown(interactive_transcript, unsafe_allow_html=True)
else:
st.text(transcript)
# If translation was used, show original language
if use_translation and TRANSLATION_AVAILABLE and 'original_text' in locals():
with st.expander("Original Language Transcript"):
st.text(original_text)
with tab3:
# Show keyword index if available
if use_keywords and KEYWORD_EXTRACTION_AVAILABLE and 'keyword_index' in locals():
st.subheader("🔑 Keyword Index")
st.markdown(keyword_index)
# Show speaker information if available
if use_diarization and DIARIZATION_AVAILABLE:
st.subheader("🎙️ Speaker Information")
speakers = set(segment.get('speaker', 'UNKNOWN') for segment in segments)
st.write(f"Detected {len(speakers)} speakers: {', '.join(speakers)}")
# Count words per speaker
speaker_words = {}
for segment in segments:
speaker = segment.get('speaker', 'UNKNOWN')
words = len(segment['text'].split())
if speaker in speaker_words:
speaker_words[speaker] += words
else:
speaker_words[speaker] = words
# Display speaker statistics
st.write("### Speaker Statistics")
for speaker, words in speaker_words.items():
st.write(f"- **{speaker}**: {words} words")
# Export options
st.subheader("💾 Export Options")
export_cols = st.columns(len(export_format))
output_base = Path(selected_file).stem
for i, format_type in enumerate(export_format):
with export_cols[i]:
if format_type == "TXT":
st.download_button(
label=f"Download {format_type}",
data=transcript,
file_name=f"{output_base}_transcript.txt",
mime="text/plain"
)
elif format_type in ["SRT", "VTT", "ASS"]:
# Export to subtitle format
output_path = export_transcript(
transcript,
output_base,
format_type.lower(),
segments=segments,
compress=compress_exports,
compression_type=compression_type,
style=ass_style if format_type == "ASS" and ass_style else None
)
# Read the exported file for download
with open(output_path, 'rb') as f:
subtitle_content = f.read()
# Determine file extension
file_ext = f".{format_type.lower()}"
if compress_exports:
file_ext += ".gz" if compression_type == "gzip" else ".zip"
st.download_button(
label=f"Download {format_type}",
data=subtitle_content,
file_name=f"{output_base}{file_ext}",
mime="application/octet-stream"
)
# Clean up the temporary file
os.remove(output_path)
# Complete progress
progress_bar.progress(100)
status_text.text("Processing complete!")
else:
st.error("❌ Failed to process recording")
except Exception as e:
st.error(f"An error occurred: {e}")
st.write(e) # This will show the traceback in the Streamlit app
if __name__ == "__main__":
main()

1
backend/.python-version Normal file

@ -0,0 +1 @@
3.11.15

54
backend/ai_provider.py Normal file

@ -0,0 +1,54 @@
#!/usr/bin/env python3
"""
AI provider interface for Ollama, OpenAI, and Claude.
"""
import json
import sys
from pathlib import Path
# Add backend to path
sys.path.insert(0, str(Path(__file__).parent))
from services.ai_provider import AIProvider
def main():
if len(sys.argv) < 2:
print("Usage: python ai_provider.py <command> [args...]", file=sys.stderr)
sys.exit(1)
command = sys.argv[1]
try:
if command == "complete":
if len(sys.argv) < 4:
print("Usage: python ai_provider.py complete <prompt> <provider> [model] [api_key] [base_url] [system_prompt] [temperature]", file=sys.stderr)
sys.exit(1)
prompt = sys.argv[2]
provider = sys.argv[3]
model = sys.argv[4] if len(sys.argv) > 4 else None
api_key = sys.argv[5] if len(sys.argv) > 5 else None
base_url = sys.argv[6] if len(sys.argv) > 6 else None
system_prompt = sys.argv[7] if len(sys.argv) > 7 else None
temperature = float(sys.argv[8]) if len(sys.argv) > 8 else 0.3
result = AIProvider.complete(prompt, provider, model, api_key, base_url, system_prompt, temperature)
print(json.dumps({"response": result}))
elif command == "list_ollama_models":
base_url = sys.argv[2] if len(sys.argv) > 2 else "http://localhost:11434"
result = AIProvider.list_ollama_models(base_url)
print(json.dumps({"models": result}))
else:
print(f"Unknown command: {command}", file=sys.stderr)
sys.exit(1)
except Exception as e:
print(json.dumps({"error": str(e)}), file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()

47
backend/audio_cleaner.py Normal file

@ -0,0 +1,47 @@
#!/usr/bin/env python3
"""
Audio cleaning operations using DeepFilterNet or FFmpeg fallback.
"""
import json
import sys
from pathlib import Path
# Add backend to path
sys.path.insert(0, str(Path(__file__).parent))
from services.audio_cleaner import clean_audio, is_deepfilter_available
def main():
if len(sys.argv) < 2:
print("Usage: python audio_cleaner.py <command> [args...]", file=sys.stderr)
sys.exit(1)
command = sys.argv[1]
try:
if command == "clean_audio":
if len(sys.argv) != 4:
print("Usage: python audio_cleaner.py clean_audio <input_path> <output_path>", file=sys.stderr)
sys.exit(1)
input_path = sys.argv[2]
output_path = sys.argv[3]
result = clean_audio(input_path, output_path)
print(json.dumps({"output_path": result}))
elif command == "is_deepfilter_available":
result = is_deepfilter_available()
print(json.dumps({"available": result}))
else:
print(f"Unknown command: {command}", file=sys.stderr)
sys.exit(1)
except Exception as e:
print(json.dumps({"error": str(e)}), file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()

backend/background_removal.py Normal file

@ -0,0 +1,50 @@
#!/usr/bin/env python3
"""
Background removal operations (placeholder for Phase 5).
"""
import json
import sys
from pathlib import Path
# Add backend to path
sys.path.insert(0, str(Path(__file__).parent))
from services.background_removal import is_available, remove_background_on_export
def main():
if len(sys.argv) < 2:
print("Usage: python background_removal.py <command> [args...]", file=sys.stderr)
sys.exit(1)
command = sys.argv[1]
try:
if command == "is_available":
result = is_available()
print(json.dumps({"available": result}))
elif command == "remove_background_on_export":
if len(sys.argv) != 6:
print("Usage: python background_removal.py remove_background_on_export <input_path> <output_path> <replacement> <replacement_value>", file=sys.stderr)
sys.exit(1)
input_path = sys.argv[2]
output_path = sys.argv[3]
replacement = sys.argv[4]
replacement_value = sys.argv[5]
result = remove_background_on_export(input_path, output_path, replacement, replacement_value)
print(json.dumps({"output_path": result}))
else:
print(f"Unknown command: {command}", file=sys.stderr)
sys.exit(1)
except Exception as e:
print(json.dumps({"error": str(e)}), file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()

backend/caption_generator.py Normal file

@ -0,0 +1,78 @@
#!/usr/bin/env python3
"""
Generate caption files from word-level timestamps.
"""
import json
import sys
from pathlib import Path
# Add backend to path
sys.path.insert(0, str(Path(__file__).parent))
from services.caption_generator import generate_srt, generate_vtt, generate_ass, save_captions
def main():
if len(sys.argv) < 2:
print("Usage: python caption_generator.py <command> [args...]", file=sys.stderr)
sys.exit(1)
command = sys.argv[1]
try:
if command == "generate_srt":
if len(sys.argv) < 4:
print("Usage: python caption_generator.py generate_srt <words_json> [deleted_indices_json] [words_per_line]", file=sys.stderr)
sys.exit(1)
words = json.loads(sys.argv[2])
deleted_indices = set(json.loads(sys.argv[3])) if len(sys.argv) > 3 and sys.argv[3] != "null" else None
words_per_line = int(sys.argv[4]) if len(sys.argv) > 4 else 8
result = generate_srt(words, deleted_indices, words_per_line)
print(json.dumps({"content": result}))
elif command == "generate_vtt":
if len(sys.argv) < 4:
print("Usage: python caption_generator.py generate_vtt <words_json> [deleted_indices_json] [words_per_line]", file=sys.stderr)
sys.exit(1)
words = json.loads(sys.argv[2])
deleted_indices = set(json.loads(sys.argv[3])) if len(sys.argv) > 3 and sys.argv[3] != "null" else None
words_per_line = int(sys.argv[4]) if len(sys.argv) > 4 else 8
result = generate_vtt(words, deleted_indices, words_per_line)
print(json.dumps({"content": result}))
elif command == "generate_ass":
if len(sys.argv) < 4:
print("Usage: python caption_generator.py generate_ass <words_json> [deleted_indices_json] [words_per_line] [style_json]", file=sys.stderr)
sys.exit(1)
words = json.loads(sys.argv[2])
deleted_indices = set(json.loads(sys.argv[3])) if len(sys.argv) > 3 and sys.argv[3] != "null" else None
words_per_line = int(sys.argv[4]) if len(sys.argv) > 4 else 8
style = json.loads(sys.argv[5]) if len(sys.argv) > 5 and sys.argv[5] != "null" else None
result = generate_ass(words, deleted_indices, words_per_line, style)
print(json.dumps({"content": result}))
elif command == "save_captions":
if len(sys.argv) != 4:
print("Usage: python caption_generator.py save_captions <content> <output_path>", file=sys.stderr)
sys.exit(1)
content = sys.argv[2]
output_path = sys.argv[3]
result = save_captions(content, output_path)
print(json.dumps({"output_path": result}))
else:
print(f"Unknown command: {command}", file=sys.stderr)
sys.exit(1)
except Exception as e:
print(json.dumps({"error": str(e)}), file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()

98
backend/dev_main.py Normal file

@ -0,0 +1,98 @@
"""Lightweight development backend for UI work.
This avoids importing heavy ML dependencies so the UI can run during frontend
development without installing large Python packages (torch/whisperx/etc.).
Use this when you only need the health/file streaming endpoints.
"""
from fastapi import FastAPI, Request, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from pathlib import Path
from routers import audio
app = FastAPI(title="TalkEdit Dev Backend", version="0.0.1")
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
expose_headers=["Content-Range", "Accept-Ranges", "Content-Length"],
)
MIME_MAP = {
".mp4": "video/mp4",
".mkv": "video/x-matroska",
".mov": "video/quicktime",
".avi": "video/x-msvideo",
".webm": "video/webm",
".m4a": "audio/mp4",
".wav": "audio/wav",
".mp3": "audio/mpeg",
".flac": "audio/flac",
}
app.include_router(audio.router)
@app.get("/health")
async def health():
return {"status": "ok"}
@app.get("/file")
async def serve_local_file(request: Request, path: str):
file_path = Path(path)
if not file_path.is_file():
raise HTTPException(status_code=404, detail=f"File not found: {path}")
file_size = file_path.stat().st_size
content_type = MIME_MAP.get(file_path.suffix.lower(), "application/octet-stream")
range_header = request.headers.get("range")
if range_header:
range_spec = range_header.replace("bytes=", "")
start_str, end_str = range_spec.split("-")
start = int(start_str) if start_str else 0
end = int(end_str) if end_str else file_size - 1
end = min(end, file_size - 1)
content_length = end - start + 1
def iter_range():
with open(file_path, "rb") as f:
f.seek(start)
remaining = content_length
while remaining > 0:
chunk = f.read(min(65536, remaining))
if not chunk:
break
remaining -= len(chunk)
yield chunk
return StreamingResponse(
iter_range(),
status_code=206,
media_type=content_type,
headers={
"Content-Range": f"bytes {start}-{end}/{file_size}",
"Accept-Ranges": "bytes",
"Content-Length": str(content_length),
},
)
def iter_file():
with open(file_path, "rb") as f:
while chunk := f.read(65536):
yield chunk
return StreamingResponse(
iter_file(),
media_type=content_type,
headers={
"Accept-Ranges": "bytes",
"Content-Length": str(file_size),
},
)

47
backend/diarization.py Normal file

@ -0,0 +1,47 @@
#!/usr/bin/env python3
"""
Speaker diarization using pyannote.audio.
"""
import json
import sys
from pathlib import Path
# Add backend to path
sys.path.insert(0, str(Path(__file__).parent))
from services.diarization import diarize_and_label
def main():
if len(sys.argv) < 2:
print("Usage: python diarization.py <command> [args...]", file=sys.stderr)
sys.exit(1)
command = sys.argv[1]
try:
if command == "diarize_and_label":
if len(sys.argv) < 4:
print("Usage: python diarization.py diarize_and_label <transcription_result_json> <audio_path> [hf_token] [num_speakers] [use_gpu]", file=sys.stderr)
sys.exit(1)
transcription_result = json.loads(sys.argv[2])
audio_path = sys.argv[3]
hf_token = sys.argv[4] if len(sys.argv) > 4 else None
num_speakers = int(sys.argv[5]) if len(sys.argv) > 5 and sys.argv[5] != "null" else None
use_gpu = sys.argv[6].lower() == "true" if len(sys.argv) > 6 else True
result = diarize_and_label(transcription_result, audio_path, hf_token, num_speakers, use_gpu)
print(json.dumps(result))
else:
print(f"Unknown command: {command}", file=sys.stderr)
sys.exit(1)
except Exception as e:
print(json.dumps({"error": str(e)}), file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()

223
backend/license_server.py Normal file

@ -0,0 +1,223 @@
#!/usr/bin/env python3
"""
TalkEdit License Server — Stripe webhook + license key generator.
Usage (development):
python backend/license_server.py
Then create a test license:
python backend/license_server.py generate --email test@example.com --tier pro
This is a minimal server. In production, deploy as a Cloudflare Worker,
Vercel function, or a small VPS behind nginx.
"""
import base64
import json
import os
import time
import hmac
import hashlib
from nacl.bindings import (
crypto_sign_seed_keypair,
crypto_sign,
crypto_sign_BYTES,
)
# === CONFIGURATION ===
# The Ed25519 private key (base64-encoded). Keep this secret!
# Generate with: python3 -c "import os,base64; print(base64.b64encode(os.urandom(32)).decode())"
LICENSE_PRIVATE_KEY_B64 = "ONTdT2Hn367fMlovqulz7WYQPQru7uFa/GaSfjhnR9x7Qoe7uBPNwIFeW4p7A0g05Qj14rvaQ4Mm1u/LzgeEsA=="
# Stripe webhook secret (set this in production)
STRIPE_WEBHOOK_SECRET = os.environ.get("STRIPE_WEBHOOK_SECRET", "")
# === TIER DEFINITIONS ===
TIERS = {
"pro": {
"price_id": "price_pro_monthly", # Replace with your Stripe price ID
"features": ["bundled_deps", "auto_updates", "priority_support"],
"max_activations": 1,
"duration_days": 365,
},
"business": {
"price_id": "price_business_monthly",
"features": ["bundled_deps", "auto_updates", "priority_support",
"white_label", "audit_logging", "bulk_deployment"],
"max_activations": 10,
"duration_days": 365,
},
}
def generate_license_key(
customer_email: str,
tier: str = "pro",
license_id: str = None,
duration_days: int = None,
features: list = None,
max_activations: int = None,
) -> str:
"""Generate a signed TalkEdit license key.
Returns a string like: talkedit_v1_<base64(payload)>.<base64(signature)>
"""
if license_id is None:
license_id = f"lic_{int(time.time())}_{os.urandom(4).hex()}"
tier_config = TIERS.get(tier, TIERS["pro"])
if duration_days is None:
duration_days = tier_config["duration_days"]
if features is None:
features = tier_config["features"]
if max_activations is None:
max_activations = tier_config["max_activations"]
now = int(time.time())
payload = {
"license_id": license_id,
"customer_email": customer_email,
"tier": tier,
"features": features,
"issued_at": now,
"expires_at": now + duration_days * 86400,
"max_activations": max_activations,
}
payload_bytes = json.dumps(payload, separators=(",", ":")).encode("utf-8")
# Sign with Ed25519
seed = base64.b64decode(LICENSE_PRIVATE_KEY_B64)
if len(seed) == 64:
seed = seed[:32] # First 32 bytes are the actual seed
pk, sk = crypto_sign_seed_keypair(seed)
signed = crypto_sign(payload_bytes, sk)
signature = signed[:crypto_sign_BYTES]
payload_b64 = base64.b64encode(payload_bytes).decode().rstrip("=")
sig_b64 = base64.b64encode(signature).decode().rstrip("=")
return f"talkedit_v1_{payload_b64}.{sig_b64}"
def verify_stripe_webhook(payload: bytes, sig_header: str) -> dict:
"""Verify Stripe webhook signature and return the event."""
if not STRIPE_WEBHOOK_SECRET:
raise ValueError("STRIPE_WEBHOOK_SECRET not configured")
# Stripe sends signature in the `stripe-signature` header
# Format: t=timestamp,v1=signature
parts = {}
for item in sig_header.split(","):
key, _, value = item.partition("=")
parts[key.strip()] = value.strip()
timestamp = parts.get("t", "")
expected_sig = parts.get("v1", "")
# Compute signature
signed_payload = f"{timestamp}.{payload.decode()}".encode()
computed_sig = hmac.new(
STRIPE_WEBHOOK_SECRET.encode(),
signed_payload,
hashlib.sha256,
).hexdigest()
if not hmac.compare_digest(computed_sig, expected_sig):
raise ValueError("Invalid webhook signature")
return json.loads(payload)
# === CLI ===
def main():
import sys
if len(sys.argv) > 1 and sys.argv[1] == "generate":
# CLI mode: generate a test license key
import argparse
parser = argparse.ArgumentParser(description="Generate TalkEdit license key")
parser.add_argument("--email", default="test@example.com")
parser.add_argument("--tier", default="pro", choices=["pro", "business"])
parser.add_argument("--days", type=int, default=None)
args = parser.parse_args(sys.argv[2:])
key = generate_license_key(
customer_email=args.email,
tier=args.tier,
duration_days=args.days,
)
print()
print("=== TALKEDIT LICENSE KEY ===")
print(key)
print()
print("Paste this into the TalkEdit app to activate.")
return
# Server mode
from http.server import HTTPServer, BaseHTTPRequestHandler
import urllib.parse
class LicenseHandler(BaseHTTPRequestHandler):
def do_POST(self):
path = urllib.parse.urlparse(self.path).path
if path == "/webhook/stripe":
content_length = int(self.headers.get("Content-Length", 0))
body = self.rfile.read(content_length)
sig_header = self.headers.get("Stripe-Signature", "")
try:
event = verify_stripe_webhook(body, sig_header)
event_type = event.get("type", "")
if event_type == "checkout.session.completed":
session = event["data"]["object"]
email = session.get("customer_email", session.get("customer_details", {}).get("email", "unknown"))
tier = "pro" # Map from session["metadata"]["tier"] or line items
license_key = generate_license_key(
customer_email=email,
tier=tier,
)
# In production: email the license key to the customer
print(f"License generated for {email}: {license_key[:40]}...")
self.send_response(200)
self.send_header("Content-Type", "application/json")
self.end_headers()
self.wfile.write(json.dumps({"status": "ok"}).encode())
else:
self.send_response(200)
self.end_headers()
except Exception as e:
print(f"Webhook error: {e}")
self.send_response(400)
self.end_headers()
self.wfile.write(str(e).encode())
else:
self.send_response(404)
self.end_headers()
def log_message(self, format, *args):
print(f"[license-server] {args}")
port = int(os.environ.get("PORT", 8643))
server = HTTPServer(("0.0.0.0", port), LicenseHandler)
print(f"License server listening on http://0.0.0.0:{port}")
print(f" POST /webhook/stripe - Stripe webhook")
print()
print("To generate a test license:")
print(f" python {__file__} generate --email you@example.com --tier pro")
server.serve_forever()
if __name__ == "__main__":
main()

158
backend/main.py Normal file
View File

@ -0,0 +1,158 @@
import logging
import os
import stat
import sys
from contextlib import asynccontextmanager
from pathlib import Path
from fastapi import FastAPI, Query, Request, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from routers import transcribe, export, ai, captions, audio
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Dev log file — frontend forwards console.error/warn here so the agent can read it
DEV_LOG_PATH = Path(__file__).parent.parent / "webview.log"
@asynccontextmanager
async def lifespan(app: FastAPI):
logger.info("AI Video Editor backend starting up")
yield
logger.info("AI Video Editor backend shutting down")
app = FastAPI(
title="AI Video Editor Backend",
version="0.1.0",
lifespan=lifespan,
)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
expose_headers=["Content-Range", "Accept-Ranges", "Content-Length"],
)
app.include_router(transcribe.router)
app.include_router(export.router)
app.include_router(ai.router)
app.include_router(captions.router)
app.include_router(audio.router)
MIME_MAP = {
".mp4": "video/mp4",
".mkv": "video/x-matroska",
".mov": "video/quicktime",
".avi": "video/x-msvideo",
".webm": "video/webm",
".m4a": "audio/mp4",
".wav": "audio/wav",
".mp3": "audio/mpeg",
".flac": "audio/flac",
}
@app.get("/file")
async def serve_local_file(request: Request, path: str = Query(...)):
"""Stream a local file with HTTP Range support (required for video seeking)."""
file_path = Path(path)
if not file_path.is_file():
logger.warning(f"[serve_file] File not found: {path}")
raise HTTPException(status_code=404, detail=f"File not found: {path}")
file_size = file_path.stat().st_size
content_type = MIME_MAP.get(file_path.suffix.lower(), "application/octet-stream")
range_header = request.headers.get("range")
logger.info(
f"[serve_file] {file_path.name} | size={file_size} | "
f"type={content_type} | range={range_header or 'none'}"
)
if content_type == "application/octet-stream":
logger.warning(
f"[serve_file] Unknown MIME type for extension '{file_path.suffix}'"
f"browser may fail to decode audio/video for '{file_path.name}'"
)
if file_size == 0:
logger.error(f"[serve_file] File is empty: {path}")
raise HTTPException(status_code=422, detail=f"File is empty: {path}")
if range_header:
try:
range_spec = range_header.replace("bytes=", "")
range_start_str, range_end_str = range_spec.split("-")
range_start = int(range_start_str) if range_start_str else 0
range_end = int(range_end_str) if range_end_str else file_size - 1
range_end = min(range_end, file_size - 1)
except (ValueError, TypeError) as e:
logger.error(f"[serve_file] Malformed Range header '{range_header}': {e}")
raise HTTPException(status_code=416, detail=f"Invalid Range header: {range_header}")
content_length = range_end - range_start + 1
def iter_range():
with open(file_path, "rb") as f:
f.seek(range_start)
remaining = content_length
while remaining > 0:
chunk = f.read(min(65536, remaining))
if not chunk:
break
remaining -= len(chunk)
yield chunk
return StreamingResponse(
iter_range(),
status_code=206,
media_type=content_type,
headers={
"Content-Range": f"bytes {range_start}-{range_end}/{file_size}",
"Accept-Ranges": "bytes",
"Content-Length": str(content_length),
},
)
def iter_file():
with open(file_path, "rb") as f:
while chunk := f.read(65536):
yield chunk
return StreamingResponse(
iter_file(),
media_type=content_type,
headers={
"Accept-Ranges": "bytes",
"Content-Length": str(file_size),
},
)
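# --- Illustrative sketch (not part of this file): exercising the Range support. ---
# The host/port below are assumptions for the example; adjust to wherever this
# FastAPI app is actually served.
#
# import requests
#
# resp = requests.get(
#     "http://127.0.0.1:8000/file",
#     params={"path": "/path/to/video.mp4"},
#     headers={"Range": "bytes=0-1023"},  # request only the first 1 KiB
# )
# assert resp.status_code == 206           # partial content
# print(resp.headers["Content-Range"])     # e.g. "bytes 0-1023/104857600"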
@app.get("/health")
async def health():
return {"status": "ok"}
import datetime
@app.post("/dev/log")
async def dev_log(request: Request):
data = await request.json()
level = data.get("level", "log")
msg = str(data.get("message", ""))
args = [str(a) for a in data.get("args", [])]
ts = datetime.datetime.now().strftime("%H:%M:%S.%f")[:-3]
line = f"[{ts}] [{level.upper():5}] {msg}"
if args:
line += " " + " ".join(args)
line += "\n"
with open(DEV_LOG_PATH, "a") as f:
f.write(line)
return {"ok": True}

164
backend/requirements.txt Normal file
View File

@ -0,0 +1,164 @@
aiohappyeyeballs==2.6.1
aiohttp==3.13.4
aiosignal==1.4.0
alembic==1.18.4
annotated-doc==0.0.4
annotated-types==0.7.0
anthropic==0.86.0
antlr4-python3-runtime==4.9.3
anyio==4.13.0
appdirs==1.4.4
asteroid-filterbanks==0.4.0
attrs==26.1.0
av==17.0.0
certifi==2026.2.25
cffi==2.0.0
charset-normalizer==3.4.6
click==8.3.1
colorlog==6.10.1
contourpy==1.3.3
ctranslate2==4.7.1
cuda-bindings==12.9.4
cuda-pathfinder==1.2.2
cuda-toolkit==12.6.3
cycler==0.12.1
Cython==0.29.37
decorator==5.2.1
DeepFilterLib==0.5.6
DeepFilterNet==0.5.6
distro==1.9.0
docstring_parser==0.17.0
einops==0.8.2
fastapi==0.135.2
faster-whisper==1.2.1
ffmpeg-python==0.2.0
filelock==3.25.2
flatbuffers==25.12.19
fonttools==4.62.1
frozenlist==1.8.0
fsspec==2026.2.0
future==1.0.0
googleapis-common-protos==1.73.1
greenlet==3.3.2
grpcio==1.78.0
h11==0.16.0
hf-xet==1.4.2
httpcore==1.0.9
httptools==0.7.1
httpx==0.28.1
huggingface_hub==0.36.2
HyperPyYAML==1.2.3
idna==3.11
ImageIO==2.37.3
imageio-ffmpeg==0.6.0
importlib_metadata==8.7.1
Jinja2==3.1.6
jiter==0.13.0
joblib==1.5.3
julius==0.2.7
kiwisolver==1.5.0
lightning==2.6.1
lightning-utilities==0.15.3
loguru==0.7.3
Mako==1.3.10
markdown-it-py==4.0.0
MarkupSafe==3.0.3
matplotlib==3.10.8
maturin==1.12.6
mdurl==0.1.2
moviepy==2.2.1
mpmath==1.3.0
multidict==6.7.1
networkx==3.6.1
nltk==3.9.4
numpy==2.4.3
nvidia-cublas-cu12==12.8.4.1
nvidia-cuda-cupti-cu12==12.8.90
nvidia-cuda-nvrtc-cu12==12.8.93
nvidia-cuda-runtime-cu12==12.8.90
nvidia-cudnn-cu12==9.10.2.21
nvidia-cufft-cu12==11.3.3.83
nvidia-cufile-cu12==1.13.1.3
nvidia-curand-cu12==10.3.9.90
nvidia-cusolver-cu12==11.7.3.90
nvidia-cusparse-cu12==12.5.8.93
nvidia-cusparselt-cu12==0.7.1
nvidia-nccl-cu12==2.27.3
nvidia-nvjitlink-cu12==12.8.93
nvidia-nvshmem-cu12==3.4.5
nvidia-nvtx-cu12==12.8.90
omegaconf==2.3.0
onnxruntime==1.24.4
openai==2.30.0
opentelemetry-api==1.40.0
opentelemetry-exporter-otlp==1.40.0
opentelemetry-exporter-otlp-proto-common==1.40.0
opentelemetry-exporter-otlp-proto-grpc==1.40.0
opentelemetry-exporter-otlp-proto-http==1.40.0
opentelemetry-proto==1.40.0
opentelemetry-sdk==1.40.0
opentelemetry-semantic-conventions==0.61b0
optuna==4.8.0
packaging==23.2
pandas==3.0.1
pillow==11.3.0
primePy==1.3
proglog==0.1.12
propcache==0.4.1
protobuf==6.33.6
pyannote-audio==4.0.4
pyannote-core==6.0.1
pyannote-database==6.1.1
pyannote-metrics==4.0.0
pyannote-pipeline==4.0.0
pyannoteai-sdk==0.4.0
pycparser==3.0
pydantic==2.12.5
pydantic_core==2.41.5
Pygments==2.19.2
pyparsing==3.3.2
python-dateutil==2.9.0.post0
python-dotenv==1.2.2
python-multipart==0.0.22
pytorch-lightning==2.6.1
pytorch-metric-learning==2.9.0
PyYAML==6.0.3
regex==2026.2.28
requests==2.33.0
rich==14.3.3
ruamel.yaml==0.18.17
ruamel.yaml.clib==0.2.15
safetensors==0.7.0
scikit-learn==1.8.0
scipy==1.17.1
setuptools==70.2.0
shellingham==1.5.4
six==1.17.0
sniffio==1.3.1
sortedcontainers==2.4.0
soundfile==0.13.1
SQLAlchemy==2.0.48
starlette==1.0.0
sympy==1.14.0
threadpoolctl==3.6.0
tokenizers==0.22.2
torch==2.8.0
torch-audiomentations==0.12.0
torch_pitch_shift==1.2.5
torchaudio==2.8.0
torchmetrics==1.9.0
tqdm==4.67.3
transformers==4.57.6
triton==3.4.0
typer==0.24.1
typing-inspection==0.4.2
typing_extensions==4.15.0
urllib3==2.6.3
uvicorn==0.42.0
uvloop==0.22.1
watchfiles==1.1.1
websockets==16.0
wheel==0.46.3
whisperx==3.8.4
yarl==1.23.0
zipp==3.23.0

83
backend/routers/ai.py Normal file
View File

@ -0,0 +1,83 @@
"""AI feature endpoints: filler word detection, clip creation, Ollama model listing."""
import logging
from typing import List, Optional
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from services.ai_provider import AIProvider, detect_filler_words, create_clip_suggestion
logger = logging.getLogger(__name__)
router = APIRouter()
class WordInfo(BaseModel):
index: int
word: str
start: Optional[float] = None
end: Optional[float] = None
class FillerRequest(BaseModel):
transcript: str
words: List[WordInfo]
provider: str = "ollama"
model: Optional[str] = None
api_key: Optional[str] = None
base_url: Optional[str] = None
custom_filler_words: Optional[str] = None
class ClipRequest(BaseModel):
transcript: str
words: List[WordInfo]
provider: str = "ollama"
model: Optional[str] = None
api_key: Optional[str] = None
base_url: Optional[str] = None
target_duration: int = 60
@router.post("/ai/filler-removal")
async def filler_removal(req: FillerRequest):
try:
words_dicts = [w.model_dump() for w in req.words]
result = detect_filler_words(
transcript=req.transcript,
words=words_dicts,
provider=req.provider,
model=req.model,
api_key=req.api_key,
base_url=req.base_url,
custom_filler_words=req.custom_filler_words,
)
return result
except Exception as e:
logger.error(f"Filler detection failed: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
@router.post("/ai/create-clip")
async def create_clip(req: ClipRequest):
try:
words_dicts = [w.model_dump() for w in req.words]
result = create_clip_suggestion(
transcript=req.transcript,
words=words_dicts,
target_duration=req.target_duration,
provider=req.provider,
model=req.model,
api_key=req.api_key,
base_url=req.base_url,
)
return result
except Exception as e:
logger.error(f"Clip creation failed: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
@router.get("/ai/ollama-models")
async def ollama_models(base_url: str = "http://localhost:11434"):
models = AIProvider.list_ollama_models(base_url)
return {"models": models}

193
backend/routers/audio.py Normal file
View File

@ -0,0 +1,193 @@
"""Audio processing endpoint (noise reduction / Studio Sound)."""
import hashlib
import logging
import subprocess
import tempfile
from pathlib import Path
from typing import Optional
from fastapi import APIRouter, HTTPException, Query, Request
from fastapi.responses import FileResponse
from pydantic import BaseModel
from services.audio_cleaner import clean_audio, detect_silence_ranges, is_deepfilter_available, normalize_audio
logger = logging.getLogger(__name__)
router = APIRouter()
# Simple in-process cache: video path → extracted WAV path
_waveform_cache: dict[str, str] = {}
class AudioCleanRequest(BaseModel):
input_path: str
output_path: Optional[str] = None
class SilenceDetectRequest(BaseModel):
input_path: str
min_silence_ms: int = 500
silence_db: float = -35.0
@router.post("/audio/clean")
async def clean_audio_endpoint(req: AudioCleanRequest):
try:
output = clean_audio(req.input_path, req.output_path or "")
return {
"status": "ok",
"output_path": output,
"engine": "deepfilternet" if is_deepfilter_available() else "ffmpeg_anlmdn",
}
except Exception as e:
logger.error(f"Audio cleaning failed: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
@router.get("/audio/capabilities")
async def audio_capabilities():
return {
"deepfilternet_available": is_deepfilter_available(),
}
@router.post("/audio/detect-silence")
async def detect_silence_endpoint(req: SilenceDetectRequest):
try:
ranges = detect_silence_ranges(
req.input_path,
req.min_silence_ms,
req.silence_db,
)
return {
"status": "ok",
"ranges": ranges,
"count": len(ranges),
}
except Exception as e:
logger.error(f"Silence detection failed: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
@router.get("/audio/waveform")
async def get_waveform_audio(request: Request, path: str = Query(...)):
"""
Extract audio from any video/audio file and return it as a WAV.
The WAV is cached on disk for subsequent requests.
Uses FFmpeg directly so it works with MKV, MOV, AVI, MP4, etc.
"""
req_id = hashlib.md5(f"{path}:{request.url}".encode()).hexdigest()[:10]
file_path = Path(path)
logger.info(
"[waveform:%s] request raw_url=%s raw_query=%s decoded_path=%r path_len=%s",
req_id,
str(request.url),
request.url.query,
path,
len(path),
)
try:
resolved_path = file_path.expanduser().resolve(strict=False)
except Exception:
resolved_path = file_path
logger.info(
"[waveform:%s] normalized path=%s exists=%s is_file=%s",
req_id,
resolved_path,
file_path.exists(),
file_path.is_file(),
)
if not file_path.is_file():
logger.warning("[waveform:%s] file_not_found path=%r", req_id, path)
raise HTTPException(status_code=404, detail=f"File not found: {path}")
# Cache key based on path + mtime so stale cache is auto-invalidated
mtime = file_path.stat().st_mtime
cache_key = hashlib.md5(f"{path}:{mtime}".encode()).hexdigest()
logger.info("[waveform:%s] cache_key=%s mtime=%s", req_id, cache_key, mtime)
if cache_key in _waveform_cache:
cached = Path(_waveform_cache[cache_key])
if cached.exists():
logger.info("[waveform:%s] cache_hit cached=%s", req_id, cached)
return FileResponse(str(cached), media_type="audio/wav")
else:
del _waveform_cache[cache_key]
logger.info("[waveform:%s] cache_miss extracting file=%s", req_id, file_path)
tmp_dir = tempfile.mkdtemp(prefix="talkedit_waveform_")
out_wav = Path(tmp_dir) / f"{cache_key}.wav"
# Downsample to mono 8000 Hz — enough for waveform drawing and much smaller payloads
cmd = [
"ffmpeg", "-y",
"-i", str(file_path),
"-vn", # drop video
"-ac", "1", # mono
"-ar", "8000", # 8 kHz sample rate
"-acodec", "pcm_s16le", # 16-bit PCM WAV
str(out_wav),
]
logger.info("[waveform:%s] ffmpeg_cmd=%s", req_id, " ".join(cmd))
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
logger.error(
"[waveform:%s] ffmpeg_failed returncode=%s stderr_tail=%s",
req_id,
result.returncode,
result.stderr[-2000:],
)
raise HTTPException(
status_code=500,
detail=f"Failed to extract audio: {result.stderr[-300:]}"
)
if not out_wav.exists() or out_wav.stat().st_size == 0:
logger.error(
"[waveform:%s] empty_output out_wav=%s exists=%s size=%s",
req_id,
out_wav,
out_wav.exists(),
out_wav.stat().st_size if out_wav.exists() else -1,
)
raise HTTPException(status_code=500, detail="Audio extraction produced empty file")
logger.info(
"[waveform:%s] extracted_bytes=%s out_wav=%s",
req_id,
out_wav.stat().st_size,
out_wav,
)
_waveform_cache[cache_key] = str(out_wav)
return FileResponse(str(out_wav), media_type="audio/wav")
class NormalizeRequest(BaseModel):
input_path: str
output_path: Optional[str] = None
target_lufs: float = -14.0
@router.post("/audio/normalize")
async def normalize_audio_endpoint(req: NormalizeRequest):
"""Normalize audio loudness to a target LUFS level using FFmpeg loudnorm."""
if req.target_lufs < -70 or req.target_lufs > 0:
raise HTTPException(status_code=400, detail="target_lufs must be between -70 and 0")
try:
output = normalize_audio(
req.input_path,
req.output_path or "",
target_lufs=req.target_lufs,
)
return {
"status": "ok",
"output_path": output,
"target_lufs": req.target_lufs,
}
except Exception as e:
logger.error(f"Audio normalization failed: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))

View File

@ -0,0 +1,67 @@
"""Caption generation endpoint."""
import logging
from typing import List, Optional
from fastapi import APIRouter, HTTPException
from fastapi.responses import PlainTextResponse
from pydantic import BaseModel
from services.caption_generator import generate_srt, generate_vtt, generate_ass, save_captions
logger = logging.getLogger(__name__)
router = APIRouter()
class CaptionWord(BaseModel):
word: str
start: float
end: float
confidence: float = 0.0
class CaptionStyle(BaseModel):
fontName: str = "Arial"
fontSize: int = 48
fontColor: str = "&H00FFFFFF"
backgroundColor: str = "&H80000000"
position: str = "bottom"
bold: bool = True
class CaptionRequest(BaseModel):
words: List[CaptionWord]
deleted_indices: List[int] = []
format: str = "srt"
words_per_line: int = 8
style: Optional[CaptionStyle] = None
output_path: Optional[str] = None
@router.post("/captions")
async def generate_captions(req: CaptionRequest):
try:
words_dicts = [w.model_dump() for w in req.words]
deleted_set = set(req.deleted_indices)
if req.format == "srt":
content = generate_srt(words_dicts, deleted_set, req.words_per_line)
elif req.format == "vtt":
content = generate_vtt(words_dicts, deleted_set, req.words_per_line)
elif req.format == "ass":
style_dict = req.style.model_dump() if req.style else None
content = generate_ass(words_dicts, deleted_set, req.words_per_line, style_dict)
else:
raise HTTPException(status_code=400, detail=f"Unknown format: {req.format}")
if req.output_path:
saved = save_captions(content, req.output_path)
return {"status": "ok", "output_path": saved}
return PlainTextResponse(content, media_type="text/plain")
except HTTPException:
raise
except Exception as e:
logger.error(f"Caption generation failed: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))

357
backend/routers/export.py Normal file
View File

@ -0,0 +1,357 @@
"""Export endpoint for video cutting and rendering."""
import logging
import tempfile
import os
from typing import List, Optional
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from services.video_editor import export_stream_copy, export_reencode, export_reencode_with_subs, mix_background_music, concat_clips
from services.audio_cleaner import clean_audio
from services.caption_generator import generate_srt, generate_ass, save_captions
from services.background_removal import remove_background_on_export as remove_bg
logger = logging.getLogger(__name__)
router = APIRouter()
class SegmentModel(BaseModel):
start: float
end: float
class GainRangeModel(SegmentModel):
gain_db: float
class SpeedRangeModel(SegmentModel):
speed: float
class ExportWordModel(BaseModel):
word: str
start: float
end: float
confidence: float = 0.0
class ZoomConfigModel(BaseModel):
enabled: bool = False
zoomFactor: float = 1.0
panX: float = 0.0
panY: float = 0.0
class BackgroundMusicModel(BaseModel):
path: str
volumeDb: float = 0.0
duckingEnabled: bool = False
duckingDb: float = 6.0
duckingAttackMs: float = 10.0
duckingReleaseMs: float = 200.0
class ExportRequest(BaseModel):
input_path: str
output_path: str
keep_segments: List[SegmentModel]
mute_ranges: Optional[List[SegmentModel]] = None
gain_ranges: Optional[List[GainRangeModel]] = None
speed_ranges: Optional[List[SpeedRangeModel]] = None
global_gain_db: float = 0.0
mode: str = "fast"
resolution: str = "1080p"
format: str = "mp4"
enhanceAudio: bool = False
normalize_loudness: bool = False
normalize_target_lufs: float = -14.0
captions: str = "none"
words: Optional[List[ExportWordModel]] = None
deleted_indices: Optional[List[int]] = None
zoom: Optional[ZoomConfigModel] = None
additional_clips: Optional[List[str]] = None
background_music: Optional[BackgroundMusicModel] = None
remove_background: bool = False
background_replacement: str = "blur"
background_replacement_value: str = ""
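# --- Illustrative sketch (not part of this file): a minimal /export payload. ---
# Only the required fields are shown; every other field above has a default.
#
# minimal_export_request = {
#     "input_path": "/path/to/source.mp4",
#     "output_path": "/path/to/edited.mp4",
#     "keep_segments": [
#         {"start": 0.0, "end": 5.0},
#         {"start": 10.0, "end": 15.0},
#     ],
# }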
class TranscriptExportRequest(BaseModel):
words: List[ExportWordModel]
deleted_indices: Optional[List[int]] = None
output_path: str
format: str = "txt" # "txt" or "srt"
def _map_ranges_to_output_timeline(
ranges: List[dict],
keep_segments: List[dict],
) -> List[dict]:
"""Map source-time ranges to output timeline after cuts are applied."""
if not ranges or not keep_segments:
return []
mapped: List[dict] = []
output_cursor = 0.0
for keep in keep_segments:
keep_start = float(keep["start"])
keep_end = float(keep["end"])
keep_len = max(0.0, keep_end - keep_start)
if keep_len <= 0:
continue
for src_range in ranges:
overlap_start = max(keep_start, float(src_range["start"]))
overlap_end = min(keep_end, float(src_range["end"]))
if overlap_end <= overlap_start:
continue
mapped_range = {
"start": output_cursor + (overlap_start - keep_start),
"end": output_cursor + (overlap_end - keep_start),
}
if "gain_db" in src_range:
mapped_range["gain_db"] = float(src_range["gain_db"])
if "speed" in src_range:
mapped_range["speed"] = float(src_range["speed"])
mapped.append(mapped_range)
output_cursor += keep_len
return mapped
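# Worked example (comment only): with keep_segments [(0, 5), (10, 15)] the output
# timeline is 0-10 s. A source gain range (12, 14) overlaps only the second kept
# segment, so it maps to output (5 + (12 - 10), 5 + (14 - 10)) = (7.0, 9.0).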
def _mux_audio(video_path: str, audio_path: str, output_path: str) -> str:
"""Replace video's audio track with cleaned audio using FFmpeg."""
import subprocess
cmd = [
"ffmpeg", "-y",
"-i", video_path,
"-i", audio_path,
"-c:v", "copy",
"-map", "0:v:0",
"-map", "1:a:0",
"-shortest",
output_path,
]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise RuntimeError(f"Audio mux failed: {result.stderr[-300:]}")
return output_path
@router.post("/export")
async def export_video(req: ExportRequest):
try:
segments = [{"start": s.start, "end": s.end} for s in req.keep_segments]
mute_segments = [{"start": s.start, "end": s.end} for s in req.mute_ranges] if req.mute_ranges else None
gain_segments = [{"start": s.start, "end": s.end, "gain_db": s.gain_db} for s in req.gain_ranges] if req.gain_ranges else None
speed_segments = [{"start": s.start, "end": s.end, "speed": s.speed} for s in req.speed_ranges] if req.speed_ranges else None
if not segments and not mute_segments:
raise HTTPException(status_code=400, detail="No segments to export")
# Convert zoom config to dict
zoom_dict = None
if req.zoom and req.zoom.enabled:
zoom_dict = {
"enabled": True,
"zoomFactor": req.zoom.zoomFactor,
"panX": req.zoom.panX,
"panY": req.zoom.panY,
}
# Handle additional clips: pre-concat before main editing
working_input = req.input_path
has_additional = bool(req.additional_clips)
if has_additional:
try:
concat_output = req.output_path + ".concat.mp4"
concat_clips(req.input_path, req.additional_clips, concat_output)
working_input = concat_output
logger.info("Pre-concatenated %d additional clips into %s", len(req.additional_clips), concat_output)
except Exception as e:
logger.warning(f"Clip concatenation failed (non-fatal): {e}")
# Fall back to main input only
mapped_gain_segments = _map_ranges_to_output_timeline(gain_segments or [], segments)
has_gain = abs(float(req.global_gain_db)) > 1e-6 or bool(gain_segments)
has_speed = bool(speed_segments)
if has_speed and (mute_segments or has_gain):
raise HTTPException(
status_code=400,
detail="Speed zones currently cannot be combined with mute/gain filters in one export",
)
use_stream_copy = req.mode == "fast" and len(segments) == 1 and not mute_segments and not has_gain and not has_speed and not zoom_dict and not has_additional
needs_reencode_for_subs = req.captions == "burn-in"
# Burn-in captions or audio filters require re-encode
if needs_reencode_for_subs or mute_segments or has_gain or has_speed:
use_stream_copy = False
words_dicts = [w.model_dump() for w in req.words] if req.words else []
deleted_set = set(req.deleted_indices or [])
# Generate ASS file for burn-in
ass_path = None
if req.captions == "burn-in" and words_dicts:
ass_content = generate_ass(words_dicts, deleted_set)
tmp = tempfile.NamedTemporaryFile(suffix=".ass", delete=False, mode="w", encoding="utf-8")
tmp.write(ass_content)
tmp.close()
ass_path = tmp.name
try:
if use_stream_copy:
output = export_stream_copy(working_input, req.output_path, segments)
elif ass_path:
output = export_reencode_with_subs(
working_input,
req.output_path,
segments,
ass_path,
resolution=req.resolution,
format_hint=req.format,
mute_ranges=mute_segments,
gain_ranges=mapped_gain_segments,
speed_ranges=speed_segments,
global_gain_db=req.global_gain_db,
normalize_loudness=req.normalize_loudness,
normalize_target_lufs=req.normalize_target_lufs,
zoom_config=zoom_dict,
)
else:
output = export_reencode(
working_input,
req.output_path,
segments,
resolution=req.resolution,
format_hint=req.format,
mute_ranges=mute_segments,
gain_ranges=mapped_gain_segments,
speed_ranges=speed_segments,
global_gain_db=req.global_gain_db,
normalize_loudness=req.normalize_loudness,
normalize_target_lufs=req.normalize_target_lufs,
zoom_config=zoom_dict,
)
finally:
if ass_path and os.path.exists(ass_path):
os.unlink(ass_path)
# Audio enhancement: clean, then mux back into the exported video
if req.enhanceAudio:
try:
tmp_dir = tempfile.mkdtemp(prefix="cutscript_audio_")
cleaned_audio = os.path.join(tmp_dir, "cleaned.wav")
clean_audio(output, cleaned_audio)
muxed_path = output + ".muxed.mp4"
_mux_audio(output, cleaned_audio, muxed_path)
os.replace(muxed_path, output)
logger.info(f"Audio enhanced and muxed into {output}")
try:
os.remove(cleaned_audio)
os.rmdir(tmp_dir)
except OSError:
pass
except Exception as e:
logger.warning(f"Audio enhancement failed (non-fatal): {e}")
# Background removal (post-process)
if req.remove_background:
try:
bg_output = output + ".nobg.mp4"
remove_bg(output, bg_output, req.background_replacement, req.background_replacement_value)
os.replace(bg_output, output)
logger.info("Background removed from %s", output)
except Exception as e:
logger.warning(f"Background removal failed (non-fatal): {e}")
# Background music mixing (post-process)
if req.background_music:
try:
music_output = output + ".music.mp4"
mix_background_music(
output,
req.background_music.path,
music_output,
volume_db=req.background_music.volumeDb,
ducking_enabled=req.background_music.duckingEnabled,
ducking_db=req.background_music.duckingDb,
ducking_attack_ms=req.background_music.duckingAttackMs,
ducking_release_ms=req.background_music.duckingReleaseMs,
)
os.replace(music_output, output)
logger.info("Background music mixed into %s", output)
except Exception as e:
logger.warning(f"Background music mixing failed (non-fatal): {e}")
# Sidecar SRT: generate and save alongside video
srt_path = None
if req.captions == "sidecar" and words_dicts:
srt_content = generate_srt(words_dicts, deleted_set)
srt_path = req.output_path.rsplit(".", 1)[0] + ".srt"
save_captions(srt_content, srt_path)
logger.info(f"Sidecar SRT saved to {srt_path}")
# Cleanup pre-concat temp file
if has_additional and working_input != req.input_path and os.path.exists(working_input):
try:
os.remove(working_input)
except OSError:
pass
result = {"status": "ok", "output_path": output}
if srt_path:
result["srt_path"] = srt_path
return result
except HTTPException:
raise
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except RuntimeError as e:
logger.error(f"Export failed: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
except Exception as e:
logger.error(f"Export error: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
@router.post("/export/transcript")
async def export_transcript(req: TranscriptExportRequest):
"""Export transcript as plain text or SRT without rendering video."""
try:
from services.caption_generator import generate_srt
deleted_set = set(req.deleted_indices or [])
word_dicts = [w.model_dump() for w in req.words]
if req.format == "srt":
content = generate_srt(word_dicts, deleted_set)
else:
# Plain text: join non-deleted words
active_words = []
for i, w in enumerate(word_dicts):
if i not in deleted_set:
active_words.append(w["word"])
content = " ".join(active_words)
os.makedirs(os.path.dirname(req.output_path) or ".", exist_ok=True)
with open(req.output_path, "w", encoding="utf-8") as f:
f.write(content)
logger.info("Transcript exported to %s (format=%s)", req.output_path, req.format)
return {"status": "ok", "output_path": req.output_path}
except Exception as e:
logger.error(f"Transcript export failed: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))

View File

@ -0,0 +1,54 @@
"""Local LLM endpoints for bundled Qwen3 inference."""
import logging
from typing import Optional
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from services.local_llm import get_status, download_model, complete
logger = logging.getLogger(__name__)
router = APIRouter()
class CompleteRequest(BaseModel):
prompt: str
model_id: str = "qwen3-1.7b"
system_prompt: Optional[str] = None
temperature: float = 0.3
max_tokens: int = 2048
@router.get("/local-llm/status")
async def llm_status():
try:
return get_status()
except Exception as e:
logger.error(f"Local LLM status failed: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
@router.post("/local-llm/download")
async def llm_download(model_id: str = "qwen3-1.7b"):
try:
return download_model(model_id)
except Exception as e:
logger.error(f"Local LLM download failed: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
@router.post("/local-llm/complete")
async def llm_complete(req: CompleteRequest):
try:
result = complete(
prompt=req.prompt,
model_id=req.model_id,
system_prompt=req.system_prompt,
temperature=req.temperature,
max_tokens=req.max_tokens,
)
return {"response": result}
except Exception as e:
logger.error(f"Local LLM completion failed: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))

View File

@ -0,0 +1,149 @@
"""Transcription endpoint using WhisperX."""
import logging
from typing import Optional
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from services.transcription import transcribe_audio
from services.diarization import diarize_and_label
logger = logging.getLogger(__name__)
router = APIRouter()
class TranscribeRequest(BaseModel):
file_path: str
model: str = "base"
language: Optional[str] = None
use_gpu: bool = True
use_cache: bool = True
diarize: bool = False
hf_token: Optional[str] = None
num_speakers: Optional[int] = None
@router.post("/transcribe")
async def transcribe(req: TranscribeRequest):
try:
result = transcribe_audio(
file_path=req.file_path,
model_name=req.model,
use_gpu=req.use_gpu,
use_cache=req.use_cache,
language=req.language,
)
if req.diarize and req.hf_token:
result = diarize_and_label(
transcription_result=result,
audio_path=req.file_path,
hf_token=req.hf_token,
num_speakers=req.num_speakers,
use_gpu=req.use_gpu,
)
return result
except FileNotFoundError:
raise HTTPException(status_code=404, detail=f"File not found: {req.file_path}")
except Exception as e:
logger.error(f"Transcription failed: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
class ReTranscribeSegmentRequest(BaseModel):
file_path: str
start: float
end: float
model: str = "base"
language: Optional[str] = None
@router.post("/transcribe/segment")
async def transcribe_segment(req: ReTranscribeSegmentRequest):
"""
Re-transcribe a specific segment of audio.
Extracts the segment with FFmpeg, transcribes it, and returns words
with timestamps adjusted to the original file timeline.
"""
import subprocess
import tempfile
import os
try:
# Extract the segment to a temp file
tmp_dir = tempfile.mkdtemp(prefix="talkedit_segment_")
segment_path = os.path.join(tmp_dir, "segment.wav")
cmd = [
"ffmpeg", "-y",
"-i", req.file_path,
"-ss", str(req.start),
"-to", str(req.end),
"-vn",
"-acodec", "pcm_s16le",
"-ar", "16000",
"-ac", "1",
segment_path,
]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise RuntimeError(f"Segment extraction failed: {result.stderr[-300:]}")
# Transcribe the segment — try GPU first, fall back to CPU
try:
segment_result = transcribe_audio(
file_path=segment_path,
model_name=req.model,
use_gpu=True,
use_cache=False,
language=req.language,
)
except Exception as gpu_err:
logger.warning(f"GPU transcription failed (%s), falling back to CPU", gpu_err)
segment_result = transcribe_audio(
file_path=segment_path,
model_name=req.model,
use_gpu=False,
use_cache=False,
language=req.language,
)
# Adjust timestamps to be relative to the original file
offset = req.start
adjusted_words = []
for w in segment_result.get("words", []):
w["start"] = round(w["start"] + offset, 3)
w["end"] = round(w["end"] + offset, 3)
adjusted_words.append(w)
adjusted_segments = []
for seg in segment_result.get("segments", []):
seg["start"] = round(seg["start"] + offset, 3)
seg["end"] = round(seg["end"] + offset, 3)
# Also adjust words within each segment
for w in seg.get("words", []):
w["start"] = round(w["start"] + offset, 3)
w["end"] = round(w["end"] + offset, 3)
adjusted_segments.append(seg)
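# (Comment only) Example: a word detected at 1.20-1.45 s inside a segment that
# was extracted with start=10.0 comes back as 11.20-11.45 s on the original
# file's timeline.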
# Cleanup
try:
os.remove(segment_path)
os.rmdir(tmp_dir)
except OSError:
pass
return {
"words": adjusted_words,
"segments": adjusted_segments,
"language": segment_result.get("language", "en"),
}
except FileNotFoundError:
raise HTTPException(status_code=404, detail=f"File not found: {req.file_path}")
except Exception as e:
logger.error(f"Segment transcription failed: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))

View File

@ -0,0 +1,211 @@
"""
Unified AI provider interface for Ollama, OpenAI, and Claude.
"""
import json
import logging
from typing import Optional, List
import requests
logger = logging.getLogger(__name__)
class AIProvider:
"""Routes completion requests to the configured provider."""
@staticmethod
def complete(
prompt: str,
provider: str = "ollama",
model: Optional[str] = None,
api_key: Optional[str] = None,
base_url: Optional[str] = None,
system_prompt: Optional[str] = None,
temperature: float = 0.3,
) -> str:
if provider == "ollama":
return _ollama_complete(prompt, model or "llama3", base_url or "http://localhost:11434", system_prompt, temperature)
elif provider == "openai":
return _openai_complete(prompt, model or "gpt-4o", api_key or "", system_prompt, temperature)
elif provider == "claude":
return _claude_complete(prompt, model or "claude-sonnet-4-20250514", api_key or "", system_prompt, temperature)
else:
raise ValueError(f"Unknown provider: {provider}")
@staticmethod
def list_ollama_models(base_url: str = "http://localhost:11434") -> List[str]:
try:
resp = requests.get(f"{base_url}/api/tags", timeout=3)
if resp.status_code == 200:
return [m["name"] for m in resp.json().get("models", [])]
except Exception:
pass
return []
def _ollama_complete(prompt: str, model: str, base_url: str, system_prompt: Optional[str], temperature: float) -> str:
body = {
"model": model,
"prompt": prompt,
"stream": False,
"options": {"temperature": temperature},
}
if system_prompt:
body["system"] = system_prompt
try:
resp = requests.post(f"{base_url}/api/generate", json=body, timeout=120)
resp.raise_for_status()
return resp.json().get("response", "").strip()
except Exception as e:
logger.error(f"Ollama error: {e}")
raise
def _openai_complete(prompt: str, model: str, api_key: str, system_prompt: Optional[str], temperature: float) -> str:
try:
from openai import OpenAI
client = OpenAI(api_key=api_key)
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": prompt})
response = client.chat.completions.create(
model=model,
messages=messages,
temperature=temperature,
)
return response.choices[0].message.content.strip()
except Exception as e:
logger.error(f"OpenAI error: {e}")
raise
def _claude_complete(prompt: str, model: str, api_key: str, system_prompt: Optional[str], temperature: float) -> str:
try:
import anthropic
client = anthropic.Anthropic(api_key=api_key)
kwargs = {
"model": model,
"max_tokens": 4096,
"temperature": temperature,
"messages": [{"role": "user", "content": prompt}],
}
if system_prompt:
kwargs["system"] = system_prompt
response = client.messages.create(**kwargs)
return response.content[0].text.strip()
except Exception as e:
logger.error(f"Claude error: {e}")
raise
def detect_filler_words(
transcript: str,
words: List[dict],
provider: str = "ollama",
model: Optional[str] = None,
api_key: Optional[str] = None,
base_url: Optional[str] = None,
custom_filler_words: Optional[str] = None,
) -> dict:
"""
Use an LLM to identify filler words in the transcript.
Returns {"wordIndices": [...], "fillerWords": [{"index": N, "word": "...", "reason": "..."}]}
"""
word_list = "\n".join(f"{w['index']}: {w['word']}" for w in words)
custom_line = ""
if custom_filler_words and custom_filler_words.strip():
custom_line = f"\n\nAdditionally, flag these user-specified filler words/phrases: {custom_filler_words.strip()}"
prompt = f"""Analyze this transcript for filler words and verbal hesitations.
Filler words include: um, uh, uh huh, hmm, like (when used as filler), you know, so (when starting sentences unnecessarily), basically, actually, literally, right, I mean, kind of, sort of, well (when used as filler).
Also flag repeated words that indicate stammering (e.g., "I I I" or "the the").{custom_line}
Here are the words with their indices:
{word_list}
Return ONLY a valid JSON object with this exact structure:
{{"wordIndices": [list of integer indices to remove], "fillerWords": [{{"index": integer, "word": "the word", "reason": "brief reason"}}]}}
Be conservative -- only flag clear filler words, not words that are part of meaningful sentences."""
system = "You are a precise text analysis tool. Return only valid JSON, no explanation."
result_text = AIProvider.complete(
prompt=prompt,
provider=provider,
model=model,
api_key=api_key,
base_url=base_url,
system_prompt=system,
temperature=0.1,
)
try:
start = result_text.find("{")
end = result_text.rfind("}") + 1
if start >= 0 and end > start:
return json.loads(result_text[start:end])
except json.JSONDecodeError:
logger.error(f"Failed to parse AI response as JSON: {result_text[:200]}")
return {"wordIndices": [], "fillerWords": []}
def create_clip_suggestion(
transcript: str,
words: List[dict],
target_duration: int = 60,
provider: str = "ollama",
model: Optional[str] = None,
api_key: Optional[str] = None,
base_url: Optional[str] = None,
) -> dict:
"""
Use an LLM to find the best clip segments in a transcript.
"""
word_list = "\n".join(
f"{w['index']}: \"{w['word']}\" ({w.get('start', 0):.1f}s - {w.get('end', 0):.1f}s)"
for w in words
)
prompt = f"""Analyze this transcript and find the most engaging {target_duration}-second segment(s) that would work well as a YouTube Short or social media clip.
Look for: compelling stories, surprising facts, emotional moments, clear explanations, humor, or quotable statements.
Words with indices and timestamps:
{word_list}
Return ONLY a valid JSON object:
{{"clips": [{{"title": "short catchy title", "startWordIndex": integer, "endWordIndex": integer, "startTime": float, "endTime": float, "reason": "why this segment is engaging"}}]}}
Suggest 1-3 clips, each approximately {target_duration} seconds long."""
system = "You are a viral content expert. Return only valid JSON, no explanation."
result_text = AIProvider.complete(
prompt=prompt,
provider=provider,
model=model,
api_key=api_key,
base_url=base_url,
system_prompt=system,
temperature=0.5,
)
try:
start = result_text.find("{")
end = result_text.rfind("}") + 1
if start >= 0 and end > start:
return json.loads(result_text[start:end])
except json.JSONDecodeError:
logger.error(f"Failed to parse clip suggestions: {result_text[:200]}")
return {"clips": []}

View File

@ -0,0 +1,282 @@
"""
Audio noise reduction using DeepFilterNet.
Falls back to a basic FFmpeg noise filter if DeepFilterNet is not installed.
"""
import logging
import re
import subprocess
import tempfile
import warnings
from pathlib import Path
logger = logging.getLogger(__name__)
DEEPFILTER_AVAILABLE = None
enhance = None
init_df = None
load_audio = None
save_audio = None
_df_model = None
_df_state = None
def _ensure_deepfilter_loaded() -> bool:
global DEEPFILTER_AVAILABLE, enhance, init_df, load_audio, save_audio
if DEEPFILTER_AVAILABLE is not None:
return DEEPFILTER_AVAILABLE
try:
# DeepFilterNet currently triggers a third-party torchaudio deprecation warning
# on import in some environments; suppress only this known warning.
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
message=r".*torchaudio\._backend\.common\.AudioMetaData has been moved.*",
category=UserWarning,
)
from df.enhance import enhance as _enhance, init_df as _init_df, load_audio as _load_audio, save_audio as _save_audio
enhance = _enhance
init_df = _init_df
load_audio = _load_audio
save_audio = _save_audio
DEEPFILTER_AVAILABLE = True
except ImportError:
DEEPFILTER_AVAILABLE = False
return DEEPFILTER_AVAILABLE
def _init_deepfilter():
global _df_model, _df_state
if not _ensure_deepfilter_loaded():
raise RuntimeError("DeepFilterNet is not available")
if _df_model is None:
logger.info("Initializing DeepFilterNet model")
_df_model, _df_state, _ = init_df()
return _df_model, _df_state
def clean_audio(
input_path: str,
output_path: str = "",
) -> str:
"""
Apply noise reduction to an audio file.
If DeepFilterNet is available, uses it for high-quality results.
Otherwise falls back to FFmpeg's anlmdn filter.
Returns: path to the cleaned audio file.
"""
input_path = Path(input_path)
if not output_path:
output_path = str(input_path.with_stem(input_path.stem + "_clean"))
if is_deepfilter_available():
return _clean_with_deepfilter(str(input_path), output_path)
else:
return _clean_with_ffmpeg(str(input_path), output_path)
def _clean_with_deepfilter(input_path: str, output_path: str) -> str:
model, state = _init_deepfilter()
audio, info = load_audio(input_path, sr=state.sr())
enhanced = enhance(model, state, audio)
save_audio(output_path, enhanced, sr=state.sr())
logger.info(f"DeepFilterNet cleaned audio saved to {output_path}")
return output_path
def _clean_with_ffmpeg(input_path: str, output_path: str) -> str:
"""Fallback: basic noise reduction using FFmpeg's anlmdn filter."""
cmd = [
"ffmpeg", "-y",
"-i", input_path,
"-af", "anlmdn=s=7:p=0.002:r=0.002:m=15",
output_path,
]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise RuntimeError(f"FFmpeg audio cleaning failed: {result.stderr[-300:]}")
logger.info(f"FFmpeg cleaned audio saved to {output_path}")
return output_path
def is_deepfilter_available() -> bool:
return _ensure_deepfilter_loaded()
def detect_silence_ranges(input_path: str, min_silence_ms: int, silence_db: float):
"""Detect silence ranges using ffmpeg silencedetect.
Returns a list of dicts: {start, end, duration} in seconds.
"""
min_silence_seconds = max(0.05, float(min_silence_ms) / 1000.0)
noise_threshold = float(silence_db)
cmd = [
"ffmpeg",
"-hide_banner",
"-i",
input_path,
"-af",
f"silencedetect=noise={noise_threshold}dB:d={min_silence_seconds}",
"-f",
"null",
"-",
]
result = subprocess.run(cmd, capture_output=True, text=True)
# silencedetect prints to stderr even on success.
output = result.stderr or ""
start_pat = re.compile(r"silence_start:\s*([0-9.]+)")
end_pat = re.compile(r"silence_end:\s*([0-9.]+)\s*\|\s*silence_duration:\s*([0-9.]+)")
starts = [float(m.group(1)) for m in start_pat.finditer(output)]
ends = [(float(m.group(1)), float(m.group(2))) for m in end_pat.finditer(output)]
ranges = []
pair_count = min(len(starts), len(ends))
for i in range(pair_count):
start = max(0.0, starts[i])
end, duration = ends[i]
if end > start and duration >= min_silence_seconds:
ranges.append({
"start": round(start, 3),
"end": round(end, 3),
"duration": round(duration, 3),
})
logger.info(
"Detected %s silence ranges in %s (min=%sms, threshold=%sdB)",
len(ranges),
input_path,
min_silence_ms,
silence_db,
)
return ranges
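# Example of the stderr lines the regexes above match (typical ffmpeg output):
#   [silencedetect @ 0x5565...] silence_start: 3.25
#   [silencedetect @ 0x5565...] silence_end: 4.81 | silence_duration: 1.56
# With min_silence_ms=500 this yields {"start": 3.25, "end": 4.81, "duration": 1.56}.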
def normalize_audio(
input_path: str,
output_path: str = "",
target_lufs: float = -14.0,
) -> str:
"""
Normalize audio loudness to a target LUFS level using FFmpeg's loudnorm filter.
Args:
input_path: Path to the input audio/video file.
output_path: Path for the normalized output. Auto-generated if empty.
target_lufs: Target integrated loudness in LUFS.
Common targets: -14 (YouTube), -16 (Spotify), -23 (broadcast).
Returns: path to the normalized audio file.
"""
import os as _os
inp = Path(input_path)
if not output_path:
output_path = str(inp.with_stem(inp.stem + "_normalized"))
# Two-pass loudnorm: first pass measures loudness, second pass applies correction.
# First pass: measure only (print_format=json)
measure_cmd = [
"ffmpeg", "-y",
"-i", str(inp),
"-af", f"loudnorm=I={target_lufs}:LRA=7:TP=-1.5:print_format=json",
"-f", "null",
"-",
]
logger.info("Running loudnorm first pass (measurement): %s", " ".join(measure_cmd))
measure_result = subprocess.run(measure_cmd, capture_output=True, text=True)
# Parse measured parameters from stderr (loudnorm outputs JSON to stderr)
measured = _parse_loudnorm_measurement(measure_result.stderr)
if not measured:
logger.warning(
"loudnorm measurement failed or produced no output; "
"falling back to single-pass normalization"
)
# Single-pass fallback
cmd = [
"ffmpeg", "-y",
"-i", str(inp),
"-af", f"loudnorm=I={target_lufs}:LRA=7:TP=-1.5",
"-c:v", "copy",
output_path,
]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise RuntimeError(f"Audio normalization failed: {result.stderr[-300:]}")
logger.info("Single-pass normalized audio saved to %s", output_path)
return output_path
# Second pass: apply normalization using measured values
input_i = measured.get("input_i", target_lufs)
input_lra = measured.get("input_lra", 7.0)
input_tp = measured.get("input_tp", -1.5)
input_thresh = measured.get("input_thresh", -30.0)
offset = measured.get("target_offset", 0.0)
apply_cmd = [
"ffmpeg", "-y",
"-i", str(inp),
"-af",
(
f"loudnorm=I={target_lufs}:LRA=7:TP=-1.5:"
f"measured_I={input_i}:measured_LRA={input_lra}:"
f"measured_TP={input_tp}:measured_thresh={input_thresh}:"
f"offset={offset}:linear=true:print_format=summary"
),
"-c:v", "copy",
output_path,
]
logger.info("Running loudnorm second pass (apply): %s", " ".join(apply_cmd))
result = subprocess.run(apply_cmd, capture_output=True, text=True)
if result.returncode != 0:
raise RuntimeError(f"Audio normalization (apply) failed: {result.stderr[-300:]}")
logger.info(
"Normalized audio saved to %s (target=%s LUFS, measured_I=%s)",
output_path,
target_lufs,
input_i,
)
return output_path
def _parse_loudnorm_measurement(stderr_output: str) -> dict:
"""Parse loudnorm JSON measurement output from FFmpeg stderr."""
import json
# loudnorm prints JSON block between "Parsed_loudnorm" lines
lines = stderr_output.split("\n")
json_lines = []
in_json = False
for line in lines:
if "Parsed_loudnorm" in line and "}" in line:
# Single-line JSON
try:
start = line.index("{")
end = line.rindex("}") + 1
return json.loads(line[start:end])
except (ValueError, json.JSONDecodeError):
continue
if "{" in line and not in_json:
in_json = True
if in_json:
json_lines.append(line)
if in_json and "}" in line:
in_json = False
break
if json_lines:
try:
return json.loads("".join(json_lines))
except json.JSONDecodeError:
pass
return {}
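# Example of the stderr block this parses (loudnorm with print_format=json emits
# the numbers as JSON strings; that is fine here because the values are only
# re-interpolated into the second-pass filter string):
#   [Parsed_loudnorm_0 @ 0x55f0...]
#   {
#       "input_i" : "-23.62",
#       "input_tp" : "-6.47",
#       "input_lra" : "7.10",
#       "input_thresh" : "-34.13",
#       "target_offset" : "0.32",
#       ...
#   }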

View File

@ -0,0 +1,232 @@
"""
AI background removal using MediaPipe for person segmentation.
Applied during export as a post-processing step — no real-time preview.
"""
import logging
import subprocess
import tempfile
import os
from pathlib import Path
logger = logging.getLogger(__name__)
MEDIAPIPE_AVAILABLE = False
try:
import mediapipe as mp
MEDIAPIPE_AVAILABLE = True
except ImportError:
pass
def is_available() -> bool:
return MEDIAPIPE_AVAILABLE
def remove_background_on_export(
input_path: str,
output_path: str,
replacement: str = "blur",
replacement_value: str = "",
) -> str:
"""
Process video frame-by-frame using FFmpeg chromakey fallback,
or MediaPipe-based segmentation if available.
Args:
input_path: source video
output_path: destination
replacement: 'blur', 'color', or 'image'
replacement_value: hex color or image path (for color/image modes)
Returns:
output_path
"""
input_path = str(Path(input_path).resolve())
output_path = str(Path(output_path).resolve())
if MEDIAPIPE_AVAILABLE:
return _remove_with_mediapipe(input_path, output_path, replacement, replacement_value)
else:
return _remove_with_ffmpeg_portrait(input_path, output_path, replacement, replacement_value)
def _remove_with_mediapipe(
input_path: str,
output_path: str,
replacement: str = "blur",
replacement_value: str = "",
) -> str:
"""Use MediaPipe Selfie Segmentation + FFmpeg for background removal.
Extracts frames, applies segmentation, composites replacement background.
"""
try:
import cv2
import numpy as np
import mediapipe as mp
mp_selfie_segmentation = mp.solutions.selfie_segmentation
# Determine background color/image
if replacement == "color":
color_hex = replacement_value or "#00FF00"
color_hex = color_hex.lstrip("#")
bg_color = tuple(int(color_hex[i:i+2], 16) for i in (0, 2, 4))
bg_color = bg_color[::-1] # RGB -> BGR
elif replacement == "image":
bg_image = cv2.imread(replacement_value) if replacement_value else None
if bg_image is None:
bg_color = (0, 255, 0)
bg_image = None
else:
# Blur background (default)
bg_color = None
# Open video
cap = cv2.VideoCapture(input_path)
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
# Temp directory for processed frames
temp_dir = tempfile.mkdtemp(prefix="aive_bgrem_")
frame_dir = os.path.join(temp_dir, "frames")
os.makedirs(frame_dir, exist_ok=True)
with mp_selfie_segmentation.SelfieSegmentation(model_selection=0) as segmenter:
frame_idx = 0
while cap.isOpened():
ret, frame = cap.read()
if not ret:
break
# Convert to RGB for MediaPipe
rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
result = segmenter.process(rgb)
mask = result.segmentation_mask
# Threshold the mask
condition = mask > 0.5
if replacement == "blur":
# Apply strong blur to background
blurred = cv2.GaussianBlur(frame, (99, 99), 0)
output_frame = np.where(condition[..., None], frame, blurred)
elif replacement == "color":
bg = np.full(frame.shape, bg_color, dtype=np.uint8)
output_frame = np.where(condition[..., None], frame, bg)
elif replacement == "image" and bg_image is not None:
bg_resized = cv2.resize(bg_image, (width, height))
output_frame = np.where(condition[..., None], frame, bg_resized)
else:
output_frame = frame
out_path = os.path.join(frame_dir, f"frame_{frame_idx:06d}.png")
cv2.imwrite(out_path, output_frame)
frame_idx += 1
if frame_idx % 100 == 0:
logger.info("Background removal: %d/%d frames", frame_idx, total_frames)
cap.release()
# Encode frames back to video using FFmpeg
import subprocess as _sp
ffmpeg = "ffmpeg"
cmd = [
ffmpeg, "-y",
"-framerate", str(fps),
"-i", os.path.join(frame_dir, "frame_%06d.png"),
"-i", input_path,
"-map", "0:v:0",
"-map", "1:a:0?",
"-c:v", "libx264", "-preset", "medium", "-crf", "18",
"-c:a", "aac", "-b:a", "192k",
"-shortest",
"-pix_fmt", "yuv420p",
output_path,
]
result = _sp.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise RuntimeError(f"FFmpeg frame encode failed: {result.stderr[-500:]}")
# Cleanup
for f in os.listdir(frame_dir):
try:
os.remove(os.path.join(frame_dir, f))
except OSError:
pass
try:
os.rmdir(frame_dir)
os.rmdir(temp_dir)
except OSError:
pass
logger.info("MediaPipe background removal completed -> %s", output_path)
return output_path
except ImportError:
logger.warning("mediapipe/cv2 not available, falling back to FFmpeg portrait mode")
return _remove_with_ffmpeg_portrait(input_path, output_path, replacement, replacement_value)
except Exception as e:
raise RuntimeError(f"MediaPipe background removal failed: {e}")
def _remove_with_ffmpeg_portrait(
input_path: str,
output_path: str,
replacement: str = "blur",
replacement_value: str = "",
) -> str:
"""Fallback: basic FFmpeg-only background blur.
Uses a strong gaussian blur as a crude background replacement.
For proper person segmentation (color/image replacement), install:
pip install mediapipe opencv-python
"""
ffmpeg = "ffmpeg"
if replacement == "blur":
filter_complex = "gblur=sigma=30"
elif replacement == "color":
color = replacement_value or "00FF00"
filter_complex = (
f"split[fg][bg];"
f"[bg]colorkey=0x{color}:0.3:0.1[bg_key];"
f"[fg][bg_key]overlay"
)
elif replacement == "image" and replacement_value:
escaped = replacement_value.replace("\\", "/").replace(":", "\\:")
filter_complex = (
f"movie='{escaped}':loop=0,scale=iw:ih[bg];"
f"[0:v][bg]overlay=0:0:shortest=1"
)
else:
filter_complex = "null"
if filter_complex == "null":
cmd = [ffmpeg, "-y", "-i", input_path, "-c", "copy", output_path]
else:
cmd = [
ffmpeg, "-y",
"-i", input_path,
"-vf", filter_complex,
"-c:v", "libx264", "-preset", "medium", "-crf", "18",
"-c:a", "aac", "-b:a", "192k",
"-movflags", "+faststart",
output_path,
]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise RuntimeError(f"FFmpeg background removal failed: {result.stderr[-500:]}")
logger.warning(
"FFmpeg fallback background removal used (no MediaPipe). "
"Install 'mediapipe' and 'opencv-python' for proper person segmentation."
)
return output_path


@ -0,0 +1,148 @@
"""
Generate caption files (SRT, VTT, ASS) from word-level timestamps.
"""
import logging
from pathlib import Path
from typing import List, Optional
logger = logging.getLogger(__name__)
def _format_srt_time(seconds: float) -> str:
h = int(seconds // 3600)
m = int((seconds % 3600) // 60)
s = int(seconds % 60)
ms = int((seconds % 1) * 1000)
return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
def _format_vtt_time(seconds: float) -> str:
h = int(seconds // 3600)
m = int((seconds % 3600) // 60)
s = int(seconds % 60)
ms = int((seconds % 1) * 1000)
return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"
def _format_ass_time(seconds: float) -> str:
h = int(seconds // 3600)
m = int((seconds % 3600) // 60)
s = int(seconds % 60)
cs = int((seconds % 1) * 100)
return f"{h}:{m:02d}:{s:02d}.{cs:02d}"
def generate_srt(
words: List[dict],
deleted_indices: Optional[set] = None,
words_per_line: int = 8,
) -> str:
"""Generate SRT caption content from word-level timestamps."""
deleted_indices = deleted_indices or set()
active_words = [(i, w) for i, w in enumerate(words) if i not in deleted_indices]
lines = []
counter = 1
for chunk_start in range(0, len(active_words), words_per_line):
chunk = active_words[chunk_start:chunk_start + words_per_line]
if not chunk:
continue
start_time = chunk[0][1]["start"]
end_time = chunk[-1][1]["end"]
text = " ".join(w["word"] for _, w in chunk)
lines.append(str(counter))
lines.append(f"{_format_srt_time(start_time)} --> {_format_srt_time(end_time)}")
lines.append(text)
lines.append("")
counter += 1
return "\n".join(lines)
def generate_vtt(
words: List[dict],
deleted_indices: Optional[set] = None,
words_per_line: int = 8,
) -> str:
"""Generate WebVTT caption content."""
deleted_indices = deleted_indices or set()
active_words = [(i, w) for i, w in enumerate(words) if i not in deleted_indices]
lines = ["WEBVTT", ""]
for chunk_start in range(0, len(active_words), words_per_line):
chunk = active_words[chunk_start:chunk_start + words_per_line]
if not chunk:
continue
start_time = chunk[0][1]["start"]
end_time = chunk[-1][1]["end"]
text = " ".join(w["word"] for _, w in chunk)
lines.append(f"{_format_vtt_time(start_time)} --> {_format_vtt_time(end_time)}")
lines.append(text)
lines.append("")
return "\n".join(lines)
def generate_ass(
words: List[dict],
deleted_indices: Optional[set] = None,
words_per_line: int = 8,
style: Optional[dict] = None,
) -> str:
"""Generate ASS subtitle content with styling."""
deleted_indices = deleted_indices or set()
active_words = [(i, w) for i, w in enumerate(words) if i not in deleted_indices]
s = style or {}
font = s.get("fontName", "Arial")
size = s.get("fontSize", 48)
color = s.get("fontColor", "&H00FFFFFF")
bold = "-1" if s.get("bold", True) else "0"
alignment = 2
header = f"""[Script Info]
Title: AI Video Editor Captions
ScriptType: v4.00+
PlayResX: 1920
PlayResY: 1080
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,{font},{size},{color},&H000000FF,&H00000000,&H80000000,{bold},0,0,0,100,100,0,0,1,2,1,{alignment},20,20,40,1
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""
events = []
for chunk_start in range(0, len(active_words), words_per_line):
chunk = active_words[chunk_start:chunk_start + words_per_line]
if not chunk:
continue
start_time = chunk[0][1]["start"]
end_time = chunk[-1][1]["end"]
text = " ".join(w["word"] for _, w in chunk)
events.append(
f"Dialogue: 0,{_format_ass_time(start_time)},{_format_ass_time(end_time)},Default,,0,0,0,,{text}"
)
return header + "\n".join(events) + "\n"
def save_captions(
content: str,
output_path: str,
) -> str:
"""Write caption content to a file."""
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(content, encoding="utf-8")
logger.info(f"Saved captions to {output_path}")
return str(output_path)
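# A minimal usage sketch, assuming word dicts shaped like the transcription
# service output ({"word", "start", "end"}); the output path is hypothetical.
if __name__ == "__main__":
    demo_words = [
        {"word": "Hello", "start": 0.0, "end": 0.4},
        {"word": "world,", "start": 0.4, "end": 0.9},
        {"word": "this", "start": 1.0, "end": 1.2},
        {"word": "is", "start": 1.2, "end": 1.3},
        {"word": "a", "start": 1.3, "end": 1.4},
        {"word": "caption", "start": 1.4, "end": 1.8},
        {"word": "demo.", "start": 1.8, "end": 2.2},
    ]
    print(generate_srt(demo_words, words_per_line=4))
    print(generate_vtt(demo_words, deleted_indices={1}))
    save_captions(generate_ass(demo_words), "/tmp/demo_captions.ass")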


@ -0,0 +1,98 @@
"""
Speaker diarization service using pyannote.audio.
Refactored from the original repo -- removed Streamlit dependency.
"""
import logging
import os
from pathlib import Path
from typing import Optional
import torch
from utils.gpu_utils import get_optimal_device
logger = logging.getLogger(__name__)
_pipeline_cache = {}
def _get_pipeline(hf_token: str, device: torch.device):
cache_key = str(device)
if cache_key in _pipeline_cache:
return _pipeline_cache[cache_key]
try:
from pyannote.audio import Pipeline
pipeline = Pipeline.from_pretrained(
"pyannote/speaker-diarization-3.0",
use_auth_token=hf_token,
)
if device.type == "cuda":
pipeline = pipeline.to(device)
_pipeline_cache[cache_key] = pipeline
return pipeline
except Exception as e:
logger.error(f"Failed to load diarization pipeline: {e}")
return None
def diarize_and_label(
transcription_result: dict,
audio_path: str,
hf_token: Optional[str] = None,
num_speakers: Optional[int] = None,
use_gpu: bool = True,
) -> dict:
"""
Apply speaker diarization to an existing transcription result.
Adds 'speaker' field to each word and segment.
Returns the mutated transcription_result with speaker labels.
"""
hf_token = hf_token or os.environ.get("HF_TOKEN")
if not hf_token:
logger.warning("No HuggingFace token provided; skipping diarization")
return transcription_result
device = get_optimal_device() if use_gpu else torch.device("cpu")
pipeline = _get_pipeline(hf_token, device)
if pipeline is None:
return transcription_result
audio_path = Path(audio_path)
logger.info(f"Running diarization on {audio_path}")
try:
diarization = pipeline(str(audio_path), num_speakers=num_speakers)
except Exception as e:
logger.error(f"Diarization failed: {e}")
return transcription_result
speaker_map = []
for turn, _, speaker in diarization.itertracks(yield_label=True):
speaker_map.append((turn.start, turn.end, speaker))
def _find_speaker(start: float, end: float) -> str:
best_overlap = 0
best_speaker = "UNKNOWN"
for s_start, s_end, speaker in speaker_map:
overlap_start = max(start, s_start)
overlap_end = min(end, s_end)
overlap = max(0, overlap_end - overlap_start)
if overlap > best_overlap:
best_overlap = overlap
best_speaker = speaker
return best_speaker
for word in transcription_result.get("words", []):
word["speaker"] = _find_speaker(word["start"], word["end"])
for segment in transcription_result.get("segments", []):
segment["speaker"] = _find_speaker(segment["start"], segment["end"])
for w in segment.get("words", []):
w["speaker"] = _find_speaker(w["start"], w["end"])
return transcription_result
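# A minimal usage sketch, assuming a transcription result shaped like the
# WhisperX service output and an HF token in the environment; the audio path
# is hypothetical. Without a token this is a no-op that returns the input.
if __name__ == "__main__":
    sample = {
        "words": [{"word": "hi", "start": 0.2, "end": 0.5}],
        "segments": [
            {"start": 0.0, "end": 1.0, "words": [{"word": "hi", "start": 0.2, "end": 0.5}]}
        ],
    }
    labeled = diarize_and_label(sample, "/tmp/audio.wav", num_speakers=1)
    print(labeled["words"][0].get("speaker", "no speaker label"))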


@ -0,0 +1,125 @@
"""
Local LLM inference using llama.cpp via llama-cpp-python.
Handles model download from HuggingFace and text completion.
"""
import json
import logging
import os
import subprocess
import sys
from pathlib import Path
from typing import Optional
logger = logging.getLogger(__name__)
LOCAL_MODELS_DIR = Path.home() / ".cache" / "talkedit" / "models"
QWEN_MODELS = {
"qwen3-1.7b": {
"repo": "Qwen/Qwen3-1.7B-Instruct-GGUF",
"file": "qwen3-1.7b-instruct-q4_k_m.gguf",
"size_gb": 1.0,
},
"qwen3-4b": {
"repo": "Qwen/Qwen3-4B-Instruct-GGUF",
"file": "qwen3-4b-instruct-q4_k_m.gguf",
"size_gb": 2.5,
},
}
def _ensure_llama_cpp() -> bool:
try:
from llama_cpp import Llama
return True
except ImportError:
return False
def _model_path(model_id: str) -> Path:
info = QWEN_MODELS.get(model_id)
if not info:
raise ValueError(f"Unknown model: {model_id}")
return LOCAL_MODELS_DIR / model_id / info["file"]
def get_status() -> dict:
"""Check status of local LLM setup."""
llama_available = _ensure_llama_cpp()
models = {}
for model_id in QWEN_MODELS:
path = _model_path(model_id)
models[model_id] = {
"downloaded": path.exists(),
"size_bytes": path.stat().st_size if path.exists() else 0,
"total_gb": QWEN_MODELS[model_id]["size_gb"],
}
return {
"llama_cpp_available": llama_available,
"models": models,
"models_dir": str(LOCAL_MODELS_DIR),
}
def download_model(model_id: str) -> dict:
"""Download a Qwen3 GGUF model from HuggingFace."""
info = QWEN_MODELS.get(model_id)
if not info:
raise ValueError(f"Unknown model: {model_id}")
model_dir = LOCAL_MODELS_DIR / model_id
model_dir.mkdir(parents=True, exist_ok=True)
output_path = model_dir / info["file"]
if output_path.exists():
return {"status": "already_downloaded", "path": str(output_path)}
logger.info(f"Downloading {info['repo']}/{info['file']} ({info['size_gb']} GB)...")
subprocess.run([
sys.executable, "-m", "huggingface_hub", "download",
info["repo"], info["file"],
"--local-dir", str(model_dir),
"--local-dir-use-symlinks", "False",
], check=True)
if not output_path.exists():
raise RuntimeError(f"Download failed: {output_path} not found")
return {"status": "downloaded", "path": str(output_path)}
def complete(
prompt: str,
model_id: str = "qwen3-1.7b",
system_prompt: Optional[str] = None,
temperature: float = 0.3,
max_tokens: int = 2048,
) -> str:
"""Run inference using a local Qwen3 model."""
model_path = _model_path(model_id)
if not model_path.exists():
raise RuntimeError(f"Model not downloaded: {model_id}")
from llama_cpp import Llama
llm = Llama(
model_path=str(model_path),
n_ctx=4096,
n_threads=4,
n_gpu_layers=-1,
verbose=False,
)
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": prompt})
response = llm.create_chat_completion(
messages=messages,
temperature=temperature,
max_tokens=max_tokens,
)
return response["choices"][0]["message"]["content"].strip()
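# A minimal usage sketch, assuming the qwen3-1.7b GGUF has already been
# fetched with download_model(); the prompts are illustrative only.
if __name__ == "__main__":
    status = get_status()
    print(json.dumps(status, indent=2))
    if status["llama_cpp_available"] and status["models"]["qwen3-1.7b"]["downloaded"]:
        print(complete(
            "List three common filler words.",
            model_id="qwen3-1.7b",
            system_prompt="You are a concise editing assistant.",
        ))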


@ -0,0 +1,205 @@
"""
WhisperX-based transcription service with word-level alignment.
Falls back to standard Whisper if WhisperX is not available.
"""
import logging
from pathlib import Path
from typing import Optional
import torch
from utils.gpu_utils import get_optimal_device, configure_gpu
from utils.audio_processing import extract_audio
from utils.cache import load_from_cache, save_to_cache
logger = logging.getLogger(__name__)
_model_cache: dict = {}
try:
import whisperx
WHISPERX_AVAILABLE = True
except ImportError:
WHISPERX_AVAILABLE = False
import whisper
import os
HF_TOKEN = os.environ.get("HF_TOKEN")
def _get_device(use_gpu: bool = True) -> torch.device:
if use_gpu:
return get_optimal_device()
return torch.device("cpu")
def _load_model(model_name: str, device: torch.device):
cache_key = f"{model_name}_{device}"
if cache_key in _model_cache:
return _model_cache[cache_key]
logger.info(f"Loading model: {model_name} on {device}")
if WHISPERX_AVAILABLE:
compute_type = "float16" if device.type == "cuda" else "int8"
model = whisperx.load_model(
model_name,
device=device.type, # use "cuda" not "cuda:0" — some WhisperX versions don't support device ordinal
compute_type=compute_type,
)
else:
model = whisper.load_model(model_name, device=str(device))
_model_cache[cache_key] = model
return model
def transcribe_audio(
file_path: str,
model_name: str = "base",
use_gpu: bool = True,
use_cache: bool = True,
language: Optional[str] = None,
) -> dict:
"""
Transcribe audio/video file and return word-level timestamps.
Returns:
dict with keys: words, segments, language
"""
file_path = Path(file_path)
if use_cache:
cached = load_from_cache(file_path, model_name, "transcribe_wx")
if cached:
logger.info("Using cached transcription")
return cached
video_extensions = {".mp4", ".avi", ".mov", ".mkv", ".webm"}
if file_path.suffix.lower() in video_extensions:
audio_path = extract_audio(file_path)
else:
audio_path = file_path
device = _get_device(use_gpu)
model = _load_model(model_name, device)
logger.info(f"Transcribing: {file_path}")
if WHISPERX_AVAILABLE:
result = _transcribe_whisperx(model, str(audio_path), device, language)
else:
result = _transcribe_standard(model, str(audio_path), language)
if use_cache:
save_to_cache(file_path, result, model_name, "transcribe_wx")
return result
def _transcribe_whisperx(model, audio_path: str, device: torch.device, language: Optional[str]) -> dict:
audio = whisperx.load_audio(audio_path)
transcribe_opts = {}
if language:
transcribe_opts["language"] = language
result = model.transcribe(audio, batch_size=16, **transcribe_opts)
detected_language = result.get("language", "en")
align_model, align_metadata = whisperx.load_align_model(
language_code=detected_language,
device=device.type,
)
aligned = whisperx.align(
result["segments"],
align_model,
align_metadata,
audio,
str(device),
return_char_alignments=False,
)
words = []
for seg in aligned.get("segments", []):
for w in seg.get("words", []):
words.append({
"word": w.get("word", ""),
"start": round(w.get("start", 0), 3),
"end": round(w.get("end", 0), 3),
"confidence": round(w.get("score", 0), 3),
})
segments = []
for i, seg in enumerate(aligned.get("segments", [])):
seg_words = []
for w in seg.get("words", []):
seg_words.append({
"word": w.get("word", ""),
"start": round(w.get("start", 0), 3),
"end": round(w.get("end", 0), 3),
"confidence": round(w.get("score", 0), 3),
})
segments.append({
"id": i,
"start": round(seg.get("start", 0), 3),
"end": round(seg.get("end", 0), 3),
"text": seg.get("text", "").strip(),
"words": seg_words,
})
return {
"words": words,
"segments": segments,
"language": detected_language,
}
def _transcribe_standard(model, audio_path: str, language: Optional[str]) -> dict:
"""Fallback: standard Whisper (segment-level only, synthesized word timestamps)."""
opts = {}
if language:
opts["language"] = language
result = model.transcribe(audio_path, **opts)
detected_language = result.get("language", "en")
words = []
segments = []
for i, seg in enumerate(result.get("segments", [])):
text = seg.get("text", "").strip()
seg_start = seg.get("start", 0)
seg_end = seg.get("end", 0)
seg_words_text = text.split()
duration = seg_end - seg_start
seg_words = []
for j, w_text in enumerate(seg_words_text):
w_start = seg_start + (j / max(len(seg_words_text), 1)) * duration
w_end = seg_start + ((j + 1) / max(len(seg_words_text), 1)) * duration
word_obj = {
"word": w_text,
"start": round(w_start, 3),
"end": round(w_end, 3),
"confidence": 0.5,
}
words.append(word_obj)
seg_words.append(word_obj)
segments.append({
"id": i,
"start": round(seg_start, 3),
"end": round(seg_end, 3),
"text": text,
"words": seg_words,
})
return {
"words": words,
"segments": segments,
"language": detected_language,
}
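# A minimal usage sketch; the media path is hypothetical. The returned dict has
# the same shape whether WhisperX or the plain-Whisper fallback produced it.
if __name__ == "__main__":
    result = transcribe_audio("/tmp/interview.mp4", model_name="base", use_gpu=False)
    print(result["language"], len(result["words"]), "words")
    for w in result["words"][:5]:
        print(f'{w["start"]:7.2f} - {w["end"]:7.2f}  {w["word"]}')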


@ -0,0 +1,824 @@
"""
FFmpeg-based video cutting engine.
Uses stream copy for fast, lossless cuts and falls back to re-encode when needed.
"""
import logging
import subprocess
import tempfile
import os
from pathlib import Path
from typing import List
logger = logging.getLogger(__name__)
def _get_codec_args(format_hint: str, has_video: bool = True) -> list:
"""Return FFmpeg codec arguments for the given format."""
if format_hint == "wav":
return ["-c:a", "pcm_s16le"]
if format_hint == "webm":
if has_video:
return ["-c:v", "libvpx-vp9", "-crf", "30", "-b:v", "0", "-c:a", "libopus"]
return ["-c:a", "libopus", "-b:a", "160k"]
# Default: MP4
if has_video:
return ["-c:v", "libx264", "-preset", "medium", "-crf", "18", "-c:a", "aac", "-b:a", "192k"]
return ["-c:a", "aac", "-b:a", "192k"]
def _input_has_video_stream(ffmpeg_cmd: str, input_path: str) -> bool:
"""Return True if the input contains at least one video stream."""
ffprobe = ffmpeg_cmd.replace("ffmpeg", "ffprobe")
cmd = [
ffprobe,
"-v", "error",
"-select_streams", "v:0",
"-show_entries", "stream=index",
"-of", "csv=p=0",
str(input_path),
]
try:
result = subprocess.run(cmd, capture_output=True, text=True)
return result.returncode == 0 and bool(result.stdout.strip())
except Exception:
return False
def _input_has_audio_stream(ffmpeg_cmd: str, input_path: str) -> bool:
"""Return True if the input contains at least one audio stream."""
ffprobe = ffmpeg_cmd.replace("ffmpeg", "ffprobe")
cmd = [
ffprobe,
"-v", "error",
"-select_streams", "a:0",
"-show_entries", "stream=index",
"-of", "csv=p=0",
str(input_path),
]
try:
result = subprocess.run(cmd, capture_output=True, text=True)
return result.returncode == 0 and bool(result.stdout.strip())
except Exception:
return False
def _clamp_speed(speed: float) -> float:
return max(0.25, min(4.0, float(speed)))
def _build_atempo_chain(speed: float) -> str:
"""Build an FFmpeg atempo chain since each atempo node only supports 0.5..2.0."""
s = _clamp_speed(speed)
filters = []
while s > 2.0:
filters.append("atempo=2.0")
s /= 2.0
while s < 0.5:
filters.append("atempo=0.5")
s /= 0.5
filters.append(f"atempo={s:.6f}")
return ",".join(filters)
def _split_keep_segments_by_speed(
keep_segments: List[dict],
speed_ranges: List[dict] = None,
) -> List[dict]:
"""Split keep segments by speed ranges, attaching speed multiplier per piece."""
if not keep_segments:
return []
normalized_ranges = []
for r in speed_ranges or []:
start = float(r.get("start", 0.0))
end = float(r.get("end", 0.0))
if end <= start:
continue
normalized_ranges.append({
"start": start,
"end": end,
"speed": _clamp_speed(float(r.get("speed", 1.0))),
})
normalized_ranges.sort(key=lambda x: x["start"])
result = []
for keep in keep_segments:
k_start = float(keep["start"])
k_end = float(keep["end"])
if k_end <= k_start:
continue
cuts = {k_start, k_end}
for sr in normalized_ranges:
overlap_start = max(k_start, sr["start"])
overlap_end = min(k_end, sr["end"])
if overlap_end > overlap_start:
cuts.add(overlap_start)
cuts.add(overlap_end)
points = sorted(cuts)
for i in range(len(points) - 1):
seg_start = points[i]
seg_end = points[i + 1]
if seg_end - seg_start < 1e-6:
continue
speed = 1.0
for sr in normalized_ranges:
if seg_start >= sr["start"] and seg_end <= sr["end"]:
speed = sr["speed"]
break
result.append({"start": seg_start, "end": seg_end, "speed": speed})
return result
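# Worked example (illustrative): with keep_segments=[{"start": 0, "end": 10}]
# and speed_ranges=[{"start": 4, "end": 6, "speed": 2.0}], the keep segment is
# split into three pieces (0-4 at 1.0x, 4-6 at 2.0x, 6-10 at 1.0x), which the
# export paths below trim/atempo individually before concatenating.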
def _build_zoom_filter(zoom_config: dict = None) -> str:
"""Build FFmpeg video filter snippet for zoom/punch-in effect.
zoom_config: {enabled, zoomFactor, panX, panY}
Returns empty string if disabled. Should be prepended to the video filter chain.
"""
if not zoom_config or not zoom_config.get("enabled"):
return ""
factor = float(zoom_config.get("zoomFactor", 1.0))
if abs(factor - 1.0) < 0.01:
return ""
pan_x = float(zoom_config.get("panX", 0.0))
pan_y = float(zoom_config.get("panY", 0.0))
return f"crop=iw/{factor}:ih/{factor}:((iw-iw/{factor})/2)+({pan_x}*(iw-iw/{factor})/2):((ih-ih/{factor})/2)+({pan_y}*(ih-ih/{factor})/2),scale=iw:ih"
def mix_background_music(
video_path: str,
music_path: str,
output_path: str,
volume_db: float = 0.0,
ducking_enabled: bool = False,
ducking_db: float = 6.0,
ducking_attack_ms: float = 10.0,
ducking_release_ms: float = 200.0,
) -> str:
"""Mix background music into a video with optional ducking.
Uses FFmpeg amix + sidechaincompress. If the input has no audio,
the music track becomes the sole audio track. Output is written to output_path.
"""
ffmpeg = _find_ffmpeg()
escaped_music = music_path.replace("\\", "/").replace(":", "\\:")
has_audio_result = _input_has_audio_stream(ffmpeg, video_path)
if not has_audio_result:
cmd = [
ffmpeg, "-y",
"-i", video_path,
"-i", music_path,
"-map", "0:v",
"-map", "1:a",
"-c:v", "copy",
"-c:a", "aac", "-b:a", "192k",
"-shortest",
"-movflags", "+faststart",
output_path,
]
elif ducking_enabled:
music_source = f"amovie='{escaped_music}',volume={volume_db}dB[music]"
filter_complex = (
f"[0:a]asplit[main][sidechain];"
f"{music_source};"
f"[main][music]amix=inputs=2:duration=first:dropout_transition=2[mixed];"
f"[mixed][sidechain]sidechaincompress="
f"threshold=-30dB:ratio=20:attack={ducking_attack_ms / 1000}:"
f"release={ducking_release_ms / 1000}:makeup=1:level_sc={ducking_db}[outa]"
)
cmd = [
ffmpeg, "-y",
"-i", video_path,
"-filter_complex", filter_complex,
"-map", "0:v",
"-map", "[outa]",
"-c:v", "copy",
"-c:a", "aac", "-b:a", "192k",
"-shortest",
output_path,
]
else:
music_source = f"amovie='{escaped_music}',volume={volume_db}dB[music]"
filter_complex = (
f"{music_source};"
f"[0:a][music]amix=inputs=2:duration=first:dropout_transition=2[outa]"
)
cmd = [
ffmpeg, "-y",
"-i", video_path,
"-filter_complex", filter_complex,
"-map", "0:v",
"-map", "[outa]",
"-c:v", "copy",
"-c:a", "aac", "-b:a", "192k",
"-shortest",
output_path,
]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise RuntimeError(f"Background music mix failed: {result.stderr[-500:]}")
return output_path
def concat_clips(
main_path: str,
append_paths: list,
output_path: str,
) -> str:
"""Concatenate multiple video clips using FFmpeg concat demuxer.
The main_path is kept as-is. append_paths are appended after it.
"""
if not append_paths:
raise ValueError("No clips to concatenate")
ffmpeg = _find_ffmpeg()
resolved_main = str(Path(main_path).resolve())
# If output_path collides with an input, write to temp first
all_inputs = [resolved_main] + [str(Path(p).resolve()) for p in append_paths]
needs_rename = str(Path(output_path).resolve()) in all_inputs
final_output = output_path
if needs_rename:
final_output = output_path + ".concat_tmp.mp4"
temp_dir = tempfile.mkdtemp(prefix="aive_concat_")
try:
concat_file = os.path.join(temp_dir, "concat.txt")
with open(concat_file, "w") as f:
for path in all_inputs:
f.write(f"file '{path}'\n")
cmd = [
ffmpeg, "-y",
"-f", "concat",
"-safe", "0",
"-i", concat_file,
"-c", "copy",
"-movflags", "+faststart",
final_output,
]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise RuntimeError(f"Clip concat failed: {result.stderr[-500:]}")
if needs_rename:
os.replace(final_output, output_path)
return output_path
finally:
for f in os.listdir(temp_dir):
try:
os.remove(os.path.join(temp_dir, f))
except OSError:
pass
try:
os.rmdir(temp_dir)
except OSError:
pass
def _find_ffmpeg() -> str:
"""Locate ffmpeg binary."""
for cmd in ["ffmpeg", "ffmpeg.exe"]:
try:
subprocess.run([cmd, "-version"], capture_output=True, check=True)
return cmd
except (FileNotFoundError, subprocess.CalledProcessError):
continue
raise RuntimeError("FFmpeg not found. Install it or add it to PATH.")
def export_stream_copy(
input_path: str,
output_path: str,
keep_segments: List[dict],
mute_ranges: List[dict] = None,
) -> str:
"""
Export video using FFmpeg concat demuxer with stream copy.
~100x faster than re-encoding. No quality loss.
Falls back to re-encoding if mute_ranges are provided.
Args:
input_path: source video file
output_path: destination file
keep_segments: list of {"start": float, "end": float} to keep
mute_ranges: list of {"start": float, "end": float} to mute (optional)
Returns:
output_path on success
"""
if mute_ranges:
# Mute ranges require audio filtering, so fall back to re-encode
return export_reencode(input_path, output_path, keep_segments, "1080p", "mp4", mute_ranges)
ffmpeg = _find_ffmpeg()
if not _input_has_video_stream(ffmpeg, input_path):
# Audio-only inputs cannot use TS segment stream-copy concat reliably.
return export_reencode(input_path, output_path, keep_segments)
input_path = str(Path(input_path).resolve())
output_path = str(Path(output_path).resolve())
if not keep_segments:
raise ValueError("No segments to export")
temp_dir = tempfile.mkdtemp(prefix="aive_export_")
try:
segment_files = []
for i, seg in enumerate(keep_segments):
seg_file = os.path.join(temp_dir, f"seg_{i:04d}.ts")
cmd = [
ffmpeg, "-y",
"-ss", str(seg["start"]),
"-to", str(seg["end"]),
"-i", input_path,
"-c", "copy",
"-avoid_negative_ts", "make_zero",
"-f", "mpegts",
seg_file,
]
logger.info(f"Extracting segment {i}: {seg['start']:.2f}s - {seg['end']:.2f}s")
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
logger.warning(f"Stream copy segment {i} failed, will try re-encode: {result.stderr[-200:]}")
return export_reencode(input_path, output_path, keep_segments)
segment_files.append(seg_file)
concat_str = "|".join(segment_files)
cmd = [
ffmpeg, "-y",
"-i", f"concat:{concat_str}",
"-c", "copy",
"-movflags", "+faststart",
output_path,
]
logger.info(f"Concatenating {len(segment_files)} segments -> {output_path}")
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
logger.warning(f"Concat failed, falling back to re-encode: {result.stderr[-200:]}")
return export_reencode(input_path, output_path, keep_segments)
return output_path
finally:
for f in os.listdir(temp_dir):
try:
os.remove(os.path.join(temp_dir, f))
except OSError:
pass
try:
os.rmdir(temp_dir)
except OSError:
pass
def _apply_zoom_post(input_path: str, output_path: str, zoom_config: dict) -> str:
"""Re-encode video applying zoom/punch-in crop+scale as a post-process step."""
ffmpeg = _find_ffmpeg()
zoom_filter = _build_zoom_filter(zoom_config)
if not zoom_filter:
return input_path
cmd = [
ffmpeg, "-y",
"-i", input_path,
"-filter_complex", f"[0:v]{zoom_filter}[v]",
"-map", "[v]",
"-map", "0:a?",
"-c:a", "copy",
"-movflags", "+faststart",
output_path,
]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise RuntimeError(f"Zoom post-process failed: {result.stderr[-500:]}")
return output_path
def export_reencode(
input_path: str,
output_path: str,
keep_segments: List[dict],
resolution: str = "1080p",
format_hint: str = "mp4",
mute_ranges: List[dict] = None,
gain_ranges: List[dict] = None,
speed_ranges: List[dict] = None,
global_gain_db: float = 0.0,
normalize_loudness: bool = False,
normalize_target_lufs: float = -14.0,
zoom_config: dict = None,
) -> str:
"""
Export video with full re-encode. Slower but supports resolution changes,
format conversion, and avoids stream-copy edge cases.
If mute_ranges are provided, applies audio muting instead of cutting.
"""
ffmpeg = _find_ffmpeg()
input_path = str(Path(input_path).resolve())
output_path = str(Path(output_path).resolve())
scale_map = {
"720p": "scale=-2:720",
"1080p": "scale=-2:1080",
"4k": "scale=-2:2160",
}
def build_audio_filter() -> str:
filters = []
if abs(float(global_gain_db)) > 1e-6:
filters.append(f"volume={float(global_gain_db)}dB")
for gain_range in gain_ranges or []:
start = gain_range['start']
end = gain_range['end']
gain_db = gain_range.get('gain_db', 0.0)
filters.append(f"volume={float(gain_db)}dB:enable='between(t,{start},{end})'")
for mute_range in mute_ranges or []:
start = mute_range['start']
end = mute_range['end']
filters.append(f"volume=0:enable='between(t,{start},{end})'")
if normalize_loudness:
filters.append(f"loudnorm=I={normalize_target_lufs}:LRA=7:TP=-1.5")
return ",".join(filters) if filters else "anull"
has_audio_filters = bool(mute_ranges) or bool(gain_ranges) or abs(float(global_gain_db)) > 1e-6
has_video = _input_has_video_stream(ffmpeg, input_path)
speed_segments = _split_keep_segments_by_speed(keep_segments, speed_ranges)
has_speed = any(abs(seg.get("speed", 1.0) - 1.0) > 1e-6 for seg in speed_segments)
if not has_video:
if not keep_segments:
raise ValueError("No segments to export")
segments_for_concat = speed_segments if speed_segments else _split_keep_segments_by_speed(keep_segments, None)
if not segments_for_concat:
raise ValueError("No segments to export")
filter_parts = []
for i, seg in enumerate(segments_for_concat):
speed = _clamp_speed(seg.get("speed", 1.0))
a_chain = f"atrim=start={seg['start']}:end={seg['end']},asetpts=PTS-STARTPTS"
if abs(speed - 1.0) > 1e-6:
a_chain += f",{_build_atempo_chain(speed)}"
filter_parts.append(f"[0:a]{a_chain}[a{i}];")
n = len(segments_for_concat)
concat_inputs = "".join(f"[a{i}]" for i in range(n))
filter_parts.append(f"{concat_inputs}concat=n={n}:v=0:a=1[outa_raw]")
audio_filter = build_audio_filter()
if audio_filter != "anull":
filter_parts.append(f";[outa_raw]{audio_filter}[outa]")
audio_map = "[outa]"
else:
audio_map = "[outa_raw]"
filter_complex = "".join(filter_parts)
codec_args = _get_codec_args(format_hint, has_video=False)
cmd = [
ffmpeg, "-y",
"-i", input_path,
"-filter_complex", filter_complex,
"-map", audio_map,
*codec_args,
output_path,
]
logger.info(
"Re-encoding audio-only input (%s segments, speed-adjusted=%s) -> %s",
n,
has_speed,
output_path,
)
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise RuntimeError(f"FFmpeg audio-only export failed: {result.stderr[-500:]}")
return output_path
# Handle filtered full-timeline audio case (mute/gain/global gain) when no speed warping is needed
if has_audio_filters and not has_speed:
audio_filter = build_audio_filter()
# Video filter - just scaling if needed. Always label the output [v] (null
# passthrough when no scaling applies) so the -map below has a valid target.
scale = scale_map.get(resolution, "")
video_filter = scale if scale else "null"
video_map = "[v]"
filter_complex = f"[0:a]{audio_filter}[a];[0:v]{video_filter}{video_map}"
codec_args = _get_codec_args(format_hint, has_video)
cmd = [
ffmpeg, "-y",
"-i", input_path,
"-filter_complex", filter_complex,
"-map", video_map,
"-map", "[a]",
*codec_args,
"-movflags", "+faststart",
output_path,
]
logger.info(
"Re-encoding with audio filters (mute=%s gain=%s global=%s) -> %s (%s)",
len(mute_ranges or []),
len(gain_ranges or []),
global_gain_db,
output_path,
resolution,
)
else:
# Cutting logic with optional per-segment speed changes
if not keep_segments:
raise ValueError("No segments to export")
segments_for_concat = speed_segments if speed_segments else _split_keep_segments_by_speed(keep_segments, None)
if not segments_for_concat:
raise ValueError("No segments to export")
filter_parts = []
for i, seg in enumerate(segments_for_concat):
speed = _clamp_speed(seg.get("speed", 1.0))
v_chain = f"trim=start={seg['start']}:end={seg['end']},setpts=PTS-STARTPTS"
a_chain = f"atrim=start={seg['start']}:end={seg['end']},asetpts=PTS-STARTPTS"
if abs(speed - 1.0) > 1e-6:
v_chain += f",setpts=PTS/{speed:.6f}"
a_chain += f",{_build_atempo_chain(speed)}"
filter_parts.append(f"[0:v]{v_chain}[v{i}];[0:a]{a_chain}[a{i}];")
n = len(segments_for_concat)
concat_inputs = "".join(f"[v{i}][a{i}]" for i in range(n))
filter_parts.append(f"{concat_inputs}concat=n={n}:v=1:a=1[outv][outa]")
filter_complex = "".join(filter_parts)
# Add loudnorm to the cutting path audio chain if enabled
audio_map_label = "[outa]"
if normalize_loudness:
filter_complex += f";{audio_map_label}loudnorm=I={normalize_target_lufs}:LRA=7:TP=-1.5[outa_norm]"
audio_map_label = "[outa_norm]"
scale = scale_map.get(resolution, "")
if scale:
filter_complex += f";[outv]{scale}[outv_scaled]"
video_map = "[outv_scaled]"
else:
video_map = "[outv]"
codec_args = _get_codec_args(format_hint, has_video)
cmd = [
ffmpeg, "-y",
"-i", input_path,
"-filter_complex", filter_complex,
"-map", video_map,
"-map", audio_map_label,
*codec_args,
"-movflags", "+faststart",
output_path,
]
logger.info(
"Re-encoding %s segments (speed-adjusted=%s, normalize=%s) -> %s (%s)",
n,
has_speed,
normalize_loudness,
output_path,
resolution,
)
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise RuntimeError(f"FFmpeg re-encode failed: {result.stderr[-500:]}")
# Apply zoom post-processing if configured
if zoom_config and zoom_config.get("enabled") and has_video:
zoomed_path = output_path + ".zoomed.mp4"
_apply_zoom_post(output_path, zoomed_path, zoom_config)
os.replace(zoomed_path, output_path)
logger.info("Zoom/punch-in applied to %s (factor=%s)", output_path, zoom_config.get("zoomFactor", 1.0))
return output_path
def export_reencode_with_subs(
input_path: str,
output_path: str,
keep_segments: List[dict],
subtitle_path: str,
resolution: str = "1080p",
format_hint: str = "mp4",
mute_ranges: List[dict] = None,
gain_ranges: List[dict] = None,
speed_ranges: List[dict] = None,
global_gain_db: float = 0.0,
normalize_loudness: bool = False,
normalize_target_lufs: float = -14.0,
zoom_config: dict = None,
) -> str:
"""
Export video with re-encode and burn-in subtitles (ASS format).
Applies trim+concat first, then overlays the subtitle file.
If mute_ranges are provided, applies audio muting instead of cutting.
"""
ffmpeg = _find_ffmpeg()
if not _input_has_video_stream(ffmpeg, input_path):
raise ValueError("Burn-in captions require a video track")
input_path = str(Path(input_path).resolve())
output_path = str(Path(output_path).resolve())
subtitle_path = str(Path(subtitle_path).resolve())
scale_map = {
"720p": "scale=-2:720",
"1080p": "scale=-2:1080",
"4k": "scale=-2:2160",
}
def build_audio_filter() -> str:
filters = []
if abs(float(global_gain_db)) > 1e-6:
filters.append(f"volume={float(global_gain_db)}dB")
for gain_range in gain_ranges or []:
start = gain_range['start']
end = gain_range['end']
gain_db = gain_range.get('gain_db', 0.0)
filters.append(f"volume={float(gain_db)}dB:enable='between(t,{start},{end})'")
for mute_range in mute_ranges or []:
start = mute_range['start']
end = mute_range['end']
filters.append(f"volume=0:enable='between(t,{start},{end})'")
if normalize_loudness:
filters.append(f"loudnorm=I={normalize_target_lufs}:LRA=7:TP=-1.5")
return ",".join(filters) if filters else "anull"
has_audio_filters = bool(mute_ranges) or bool(gain_ranges) or abs(float(global_gain_db)) > 1e-6
speed_segments = _split_keep_segments_by_speed(keep_segments, speed_ranges)
has_speed = any(abs(seg.get("speed", 1.0) - 1.0) > 1e-6 for seg in speed_segments)
# Handle filtered full-timeline audio case (mute/gain/global gain) when no speed warping is needed
if has_audio_filters and not has_speed:
audio_filter = build_audio_filter()
# Video filter with subtitles
escaped_sub = subtitle_path.replace("\\", "/").replace(":", "\\:")
scale = scale_map.get(resolution, "")
if scale:
video_filter = f"{scale},ass='{escaped_sub}'"
else:
video_filter = f"ass='{escaped_sub}'"
filter_complex = f"[0:a]{audio_filter}[a];[0:v]{video_filter}[v]"
codec_args = _get_codec_args(format_hint, has_video=True)
cmd = [
ffmpeg, "-y",
"-i", input_path,
"-filter_complex", filter_complex,
"-map", "[v]",
"-map", "[a]",
*codec_args,
"-movflags", "+faststart",
output_path,
]
logger.info(
"Re-encoding with subtitles and audio filters (mute=%s gain=%s global=%s) -> %s (%s)",
len(mute_ranges or []),
len(gain_ranges or []),
global_gain_db,
output_path,
resolution,
)
else:
# Cutting logic with subtitles and optional speed changes
if not keep_segments:
raise ValueError("No segments to export")
segments_for_concat = speed_segments if speed_segments else _split_keep_segments_by_speed(keep_segments, None)
if not segments_for_concat:
raise ValueError("No segments to export")
filter_parts = []
for i, seg in enumerate(segments_for_concat):
speed = _clamp_speed(seg.get("speed", 1.0))
v_chain = f"trim=start={seg['start']}:end={seg['end']},setpts=PTS-STARTPTS"
a_chain = f"atrim=start={seg['start']}:end={seg['end']},asetpts=PTS-STARTPTS"
if abs(speed - 1.0) > 1e-6:
v_chain += f",setpts=PTS/{speed:.6f}"
a_chain += f",{_build_atempo_chain(speed)}"
filter_parts.append(f"[0:v]{v_chain}[v{i}];[0:a]{a_chain}[a{i}];")
n = len(segments_for_concat)
concat_inputs = "".join(f"[v{i}][a{i}]" for i in range(n))
filter_parts.append(f"{concat_inputs}concat=n={n}:v=1:a=1[outv][outa]")
filter_complex = "".join(filter_parts)
# Escape path for FFmpeg subtitle filter (Windows backslashes need escaping)
escaped_sub = subtitle_path.replace("\\", "/").replace(":", "\\:")
scale = scale_map.get(resolution, "")
if scale:
filter_complex += f";[outv]{scale},ass='{escaped_sub}'[outv_final]"
else:
filter_complex += f";[outv]ass='{escaped_sub}'[outv_final]"
video_map = "[outv_final]"
codec_args = _get_codec_args(format_hint, has_video=True)
cmd = [
ffmpeg, "-y",
"-i", input_path,
"-filter_complex", filter_complex,
"-map", video_map,
"-map", "[outa]",
*codec_args,
"-movflags", "+faststart",
output_path,
]
logger.info(
"Re-encoding %s segments with subtitles (speed-adjusted=%s) -> %s (%s)",
n,
has_speed,
output_path,
resolution,
)
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise RuntimeError(f"FFmpeg re-encode with subs failed: {result.stderr[-500:]}")
# Apply zoom post-processing if configured
if zoom_config and zoom_config.get("enabled"):
zoomed_path = output_path + ".zoomed.mp4"
_apply_zoom_post(output_path, zoomed_path, zoom_config)
os.replace(zoomed_path, output_path)
logger.info("Zoom/punch-in applied to %s (factor=%s)", output_path, zoom_config.get("zoomFactor", 1.0))
return output_path
def get_video_info(input_path: str) -> dict:
"""Get basic video metadata using ffprobe."""
ffmpeg = _find_ffmpeg()
ffprobe = ffmpeg.replace("ffmpeg", "ffprobe")
cmd = [
ffprobe, "-v", "quiet",
"-print_format", "json",
"-show_format", "-show_streams",
str(input_path),
]
try:
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
import json
data = json.loads(result.stdout)
fmt = data.get("format", {})
video_stream = next((s for s in data.get("streams", []) if s.get("codec_type") == "video"), {})
# Parse frame rates like "30000/1001" safely instead of using eval()
rate = video_stream.get("r_frame_rate", "0/1")
try:
num, den = (rate.split("/") + ["1"])[:2]
fps = float(num) / float(den) if float(den) else 0.0
except (ValueError, ZeroDivisionError):
fps = 0.0
return {
"duration": float(fmt.get("duration", 0)),
"size": int(fmt.get("size", 0)),
"format": fmt.get("format_name", ""),
"width": int(video_stream.get("width", 0)),
"height": int(video_stream.get("height", 0)),
"codec": video_stream.get("codec_name", ""),
"fps": fps,
}
except Exception as e:
logger.error(f"Failed to get video info: {e}")
return {}


@ -0,0 +1,57 @@
import tempfile
import time
import unittest
from pathlib import Path
from backend.utils import cache as cache_utils
class CacheUtilsTests(unittest.TestCase):
def setUp(self) -> None:
self._tmp_dir = tempfile.TemporaryDirectory()
self._old_cache_dir = cache_utils.CACHE_DIR
cache_utils.CACHE_DIR = Path(self._tmp_dir.name) / "cache"
self._work_dir = Path(self._tmp_dir.name) / "work"
self._work_dir.mkdir(parents=True, exist_ok=True)
self._src_file = self._work_dir / "sample.txt"
self._src_file.write_text("hello", encoding="utf-8")
def tearDown(self) -> None:
cache_utils.CACHE_DIR = self._old_cache_dir
self._tmp_dir.cleanup()
def test_get_file_hash_returns_none_for_missing_file(self) -> None:
missing = self._work_dir / "missing.txt"
self.assertIsNone(cache_utils.get_file_hash(missing))
def test_save_and_load_round_trip(self) -> None:
payload = {"value": 123, "ok": True}
saved = cache_utils.save_to_cache(self._src_file, payload, model="m1", operation="transcribe")
self.assertTrue(saved)
loaded = cache_utils.load_from_cache(self._src_file, model="m1", operation="transcribe")
self.assertEqual(payload, loaded)
def test_load_from_cache_respects_max_age(self) -> None:
payload = {"value": 999}
self.assertTrue(cache_utils.save_to_cache(self._src_file, payload, operation="transcribe"))
time.sleep(0.02)
expired = cache_utils.load_from_cache(self._src_file, operation="transcribe", max_age=0.001)
self.assertIsNone(expired)
def test_clear_cache_deletes_files(self) -> None:
self.assertTrue(cache_utils.save_to_cache(self._src_file, {"a": 1}, operation="transcribe"))
self.assertTrue(cache_utils.save_to_cache(self._src_file, {"a": 2}, operation="summarize"))
deleted_count = cache_utils.clear_cache()
self.assertGreaterEqual(deleted_count, 1)
size_bytes, file_count = cache_utils.get_cache_size()
self.assertEqual(size_bytes, 0)
self.assertEqual(file_count, 0)
if __name__ == "__main__":
unittest.main()


@ -0,0 +1,451 @@
import unittest
from unittest.mock import patch
from pathlib import Path
from tempfile import TemporaryDirectory
import os
from types import SimpleNamespace
from fastapi.testclient import TestClient
from backend.main import app
from routers import audio as audio_router
class RouterContractTests(unittest.TestCase):
@classmethod
def setUpClass(cls) -> None:
cls.client = TestClient(app)
def setUp(self) -> None:
audio_router._waveform_cache.clear()
def test_health_endpoint(self) -> None:
res = self.client.get("/health")
self.assertEqual(res.status_code, 200)
self.assertEqual(res.json(), {"status": "ok"})
def test_file_endpoint_full_content(self) -> None:
with TemporaryDirectory() as tmp:
file_path = Path(tmp) / "sample.wav"
file_path.write_bytes(b"abcdefghij")
res = self.client.get("/file", params={"path": str(file_path)})
self.assertEqual(res.status_code, 200)
self.assertEqual(res.content, b"abcdefghij")
self.assertEqual(res.headers.get("accept-ranges"), "bytes")
def test_file_endpoint_range_request(self) -> None:
with TemporaryDirectory() as tmp:
file_path = Path(tmp) / "sample.wav"
file_path.write_bytes(b"abcdefghij")
res = self.client.get(
"/file",
params={"path": str(file_path)},
headers={"Range": "bytes=2-5"},
)
self.assertEqual(res.status_code, 206)
self.assertEqual(res.content, b"cdef")
self.assertEqual(res.headers.get("content-range"), "bytes 2-5/10")
def test_file_endpoint_missing_file(self) -> None:
res = self.client.get("/file", params={"path": "/tmp/does-not-exist.wav"})
self.assertEqual(res.status_code, 404)
self.assertIn("File not found", res.json()["detail"])
@patch("routers.audio.subprocess.run")
def test_audio_waveform_cache_miss_then_hit(self, mock_subprocess_run) -> None:
with TemporaryDirectory() as tmp:
media_file = Path(tmp) / "input.mp4"
media_file.write_bytes(b"fake-media")
def fake_ffmpeg(cmd, capture_output, text):
out_path = Path(cmd[-1])
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_bytes(b"fake-wav")
return SimpleNamespace(returncode=0, stderr="")
mock_subprocess_run.side_effect = fake_ffmpeg
res1 = self.client.get("/audio/waveform", params={"path": str(media_file)})
self.assertEqual(res1.status_code, 200)
self.assertTrue(res1.headers.get("content-type", "").startswith("audio/wav"))
res2 = self.client.get("/audio/waveform", params={"path": str(media_file)})
self.assertEqual(res2.status_code, 200)
self.assertTrue(res2.headers.get("content-type", "").startswith("audio/wav"))
self.assertEqual(mock_subprocess_run.call_count, 1)
@patch("routers.audio.subprocess.run")
def test_audio_waveform_ffmpeg_failure_returns_500(self, mock_subprocess_run) -> None:
with TemporaryDirectory() as tmp:
media_file = Path(tmp) / "input.mp4"
media_file.write_bytes(b"fake-media")
mock_subprocess_run.return_value = SimpleNamespace(returncode=1, stderr="ffmpeg failed")
res = self.client.get("/audio/waveform", params={"path": str(media_file)})
self.assertEqual(res.status_code, 500)
self.assertIn("Failed to extract audio", res.json()["detail"])
@patch("routers.ai.detect_filler_words")
def test_ai_filler_removal_contract(self, mock_detect_filler_words) -> None:
mock_detect_filler_words.return_value = {
"wordIndices": [2, 5],
"fillerWords": [
{"index": 2, "word": "um", "reason": "filler"},
{"index": 5, "word": "uh", "reason": "filler"},
],
}
payload = {
"transcript": "Hello um world uh",
"words": [
{"index": 0, "word": "Hello"},
{"index": 1, "word": "um"},
{"index": 2, "word": "world"},
],
"provider": "ollama",
"model": "llama3",
}
res = self.client.post("/ai/filler-removal", json=payload)
self.assertEqual(res.status_code, 200)
self.assertIn("wordIndices", res.json())
mock_detect_filler_words.assert_called_once()
@patch("routers.ai.detect_filler_words")
def test_ai_filler_removal_error_returns_500(self, mock_detect_filler_words) -> None:
mock_detect_filler_words.side_effect = RuntimeError("ai-filler-fail")
payload = {
"transcript": "Hello world",
"words": [{"index": 0, "word": "Hello"}],
"provider": "ollama",
}
res = self.client.post("/ai/filler-removal", json=payload)
self.assertEqual(res.status_code, 500)
self.assertEqual(res.json()["detail"], "ai-filler-fail")
@patch("routers.ai.create_clip_suggestion")
def test_ai_create_clip_contract(self, mock_create_clip_suggestion) -> None:
mock_create_clip_suggestion.return_value = {
"title": "Best Moment",
"startWordIndex": 10,
"endWordIndex": 40,
"startTime": 12.3,
"endTime": 48.8,
"reason": "Strong hook",
}
payload = {
"transcript": "Long transcript...",
"words": [{"index": 0, "word": "hello"}],
"provider": "ollama",
"target_duration": 45,
}
res = self.client.post("/ai/create-clip", json=payload)
self.assertEqual(res.status_code, 200)
self.assertEqual(res.json()["title"], "Best Moment")
mock_create_clip_suggestion.assert_called_once()
@patch("routers.ai.create_clip_suggestion")
def test_ai_create_clip_error_returns_500(self, mock_create_clip_suggestion) -> None:
mock_create_clip_suggestion.side_effect = RuntimeError("ai-clip-fail")
payload = {
"transcript": "Hello world",
"words": [{"index": 0, "word": "hello"}],
"provider": "ollama",
}
res = self.client.post("/ai/create-clip", json=payload)
self.assertEqual(res.status_code, 500)
self.assertEqual(res.json()["detail"], "ai-clip-fail")
@patch("routers.ai.AIProvider.list_ollama_models")
def test_ai_ollama_models_contract(self, mock_list_ollama_models) -> None:
mock_list_ollama_models.return_value = ["llama3", "qwen2.5"]
res = self.client.get("/ai/ollama-models?base_url=http://localhost:11434")
self.assertEqual(res.status_code, 200)
self.assertEqual(res.json(), {"models": ["llama3", "qwen2.5"]})
mock_list_ollama_models.assert_called_once_with("http://localhost:11434")
@patch("routers.ai.AIProvider.list_ollama_models")
def test_ai_ollama_models_unhandled_error_returns_500(self, mock_list_ollama_models) -> None:
mock_list_ollama_models.side_effect = RuntimeError("ollama-unreachable")
local_client = TestClient(app, raise_server_exceptions=False)
res = local_client.get("/ai/ollama-models")
self.assertEqual(res.status_code, 500)
@patch("routers.transcribe.transcribe_audio")
def test_transcribe_success(self, mock_transcribe) -> None:
mock_transcribe.return_value = {"words": [], "segments": [], "language": "en"}
payload = {
"file_path": "/tmp/input.wav",
"model": "base",
"use_gpu": False,
"use_cache": True,
}
res = self.client.post("/transcribe", json=payload)
self.assertEqual(res.status_code, 200)
self.assertEqual(res.json(), {"words": [], "segments": [], "language": "en"})
mock_transcribe.assert_called_once()
@patch("routers.transcribe.diarize_and_label")
@patch("routers.transcribe.transcribe_audio")
def test_transcribe_with_diarization(self, mock_transcribe, mock_diarize) -> None:
mock_transcribe.return_value = {"words": [{"word": "hi", "start": 0.0, "end": 0.2}], "segments": []}
mock_diarize.return_value = {"words": [{"word": "hi", "start": 0.0, "end": 0.2, "speaker": "SPEAKER_00"}], "segments": []}
payload = {
"file_path": "/tmp/input.wav",
"model": "base",
"diarize": True,
"hf_token": "hf_xxx",
"num_speakers": 2,
}
res = self.client.post("/transcribe", json=payload)
self.assertEqual(res.status_code, 200)
self.assertIn("words", res.json())
mock_transcribe.assert_called_once()
mock_diarize.assert_called_once()
@patch("routers.transcribe.transcribe_audio")
def test_transcribe_file_not_found_returns_404(self, mock_transcribe) -> None:
mock_transcribe.side_effect = FileNotFoundError("missing")
payload = {
"file_path": "/tmp/missing.wav",
"model": "base",
}
res = self.client.post("/transcribe", json=payload)
self.assertEqual(res.status_code, 404)
self.assertIn("File not found", res.json()["detail"])
@patch("routers.transcribe.transcribe_audio")
def test_transcribe_runtime_failure_returns_500(self, mock_transcribe) -> None:
mock_transcribe.side_effect = RuntimeError("boom")
payload = {
"file_path": "/tmp/in.wav",
"model": "base",
}
res = self.client.post("/transcribe", json=payload)
self.assertEqual(res.status_code, 500)
self.assertEqual(res.json()["detail"], "boom")
@patch("routers.captions.generate_srt")
def test_captions_plain_response(self, mock_generate_srt) -> None:
mock_generate_srt.return_value = "1\n00:00:00,000 --> 00:00:01,000\nHello\n"
payload = {
"words": [{"word": "Hello", "start": 0.0, "end": 1.0}],
"format": "srt",
}
res = self.client.post("/captions", json=payload)
self.assertEqual(res.status_code, 200)
self.assertIn("Hello", res.text)
mock_generate_srt.assert_called_once()
@patch("routers.captions.save_captions")
@patch("routers.captions.generate_srt")
def test_captions_save_output_path(self, mock_generate_srt, mock_save) -> None:
mock_generate_srt.return_value = "1\n00:00:00,000 --> 00:00:01,000\nHello\n"
mock_save.return_value = "/tmp/out.srt"
payload = {
"words": [{"word": "Hello", "start": 0.0, "end": 1.0}],
"format": "srt",
"output_path": "/tmp/out.srt",
}
res = self.client.post("/captions", json=payload)
self.assertEqual(res.status_code, 200)
self.assertEqual(res.json(), {"status": "ok", "output_path": "/tmp/out.srt"})
mock_save.assert_called_once()
def test_captions_unknown_format_returns_400(self) -> None:
payload = {
"words": [{"word": "Hello", "start": 0.0, "end": 1.0}],
"format": "txt",
}
res = self.client.post("/captions", json=payload)
self.assertEqual(res.status_code, 400)
self.assertIn("Unknown format", res.json()["detail"])
@patch("routers.captions.generate_srt")
def test_captions_internal_error_returns_500(self, mock_generate_srt) -> None:
mock_generate_srt.side_effect = RuntimeError("caption-fail")
payload = {
"words": [{"word": "Hello", "start": 0.0, "end": 1.0}],
"format": "srt",
}
res = self.client.post("/captions", json=payload)
self.assertEqual(res.status_code, 500)
self.assertEqual(res.json()["detail"], "caption-fail")
@patch("routers.audio.is_deepfilter_available")
@patch("routers.audio.clean_audio")
def test_audio_clean_contract(self, mock_clean_audio, mock_is_deepfilter_available) -> None:
mock_clean_audio.return_value = "/tmp/cleaned.wav"
mock_is_deepfilter_available.return_value = True
payload = {
"input_path": "/tmp/in.wav",
"output_path": "/tmp/cleaned.wav",
}
res = self.client.post("/audio/clean", json=payload)
self.assertEqual(res.status_code, 200)
body = res.json()
self.assertEqual(body["status"], "ok")
self.assertEqual(body["output_path"], "/tmp/cleaned.wav")
self.assertEqual(body["engine"], "deepfilternet")
@patch("routers.audio.clean_audio")
def test_audio_clean_error_returns_500(self, mock_clean_audio) -> None:
mock_clean_audio.side_effect = RuntimeError("clean-fail")
payload = {
"input_path": "/tmp/in.wav",
"output_path": "/tmp/cleaned.wav",
}
res = self.client.post("/audio/clean", json=payload)
self.assertEqual(res.status_code, 500)
self.assertEqual(res.json()["detail"], "clean-fail")
@patch("routers.audio.detect_silence_ranges")
def test_audio_detect_silence_contract(self, mock_detect_silence_ranges) -> None:
mock_detect_silence_ranges.return_value = [{"start": 1.2, "end": 2.1, "duration": 0.9}]
payload = {
"input_path": "/tmp/in.wav",
"min_silence_ms": 500,
"silence_db": -35.0,
}
res = self.client.post("/audio/detect-silence", json=payload)
self.assertEqual(res.status_code, 200)
body = res.json()
self.assertEqual(body["status"], "ok")
self.assertEqual(body["count"], 1)
self.assertEqual(len(body["ranges"]), 1)
@patch("routers.audio.detect_silence_ranges")
def test_audio_detect_silence_error_returns_500(self, mock_detect_silence_ranges) -> None:
mock_detect_silence_ranges.side_effect = RuntimeError("silence-fail")
payload = {
"input_path": "/tmp/in.wav",
"min_silence_ms": 500,
"silence_db": -35.0,
}
res = self.client.post("/audio/detect-silence", json=payload)
self.assertEqual(res.status_code, 500)
self.assertEqual(res.json()["detail"], "silence-fail")
@patch("routers.audio.is_deepfilter_available")
def test_audio_capabilities_contract(self, mock_is_deepfilter_available) -> None:
mock_is_deepfilter_available.return_value = False
res = self.client.get("/audio/capabilities")
self.assertEqual(res.status_code, 200)
self.assertEqual(res.json(), {"deepfilternet_available": False})
@patch("routers.export.export_stream_copy")
def test_export_fast_contract(self, mock_export_stream_copy) -> None:
mock_export_stream_copy.return_value = "/tmp/out.mp4"
payload = {
"input_path": "/tmp/in.mp4",
"output_path": "/tmp/out.mp4",
"keep_segments": [{"start": 0.0, "end": 2.0}],
"mode": "fast",
"captions": "none",
}
res = self.client.post("/export", json=payload)
self.assertEqual(res.status_code, 200)
self.assertEqual(res.json(), {"status": "ok", "output_path": "/tmp/out.mp4"})
mock_export_stream_copy.assert_called_once()
@patch("routers.export.save_captions")
@patch("routers.export.generate_srt")
@patch("routers.export.export_stream_copy")
def test_export_sidecar_caption_contract(self, mock_export_stream_copy, mock_generate_srt, mock_save_captions) -> None:
mock_export_stream_copy.return_value = "/tmp/out.mp4"
mock_generate_srt.return_value = "1\n00:00:00,000 --> 00:00:01,000\nHello\n"
payload = {
"input_path": "/tmp/in.mp4",
"output_path": "/tmp/out.mp4",
"keep_segments": [{"start": 0.0, "end": 2.0}],
"mode": "fast",
"captions": "sidecar",
"words": [{"word": "Hello", "start": 0.0, "end": 1.0}],
"deleted_indices": [],
}
res = self.client.post("/export", json=payload)
self.assertEqual(res.status_code, 200)
body = res.json()
self.assertEqual(body["status"], "ok")
self.assertEqual(body["output_path"], "/tmp/out.mp4")
self.assertEqual(body["srt_path"], "/tmp/out.srt")
mock_save_captions.assert_called_once()
def test_export_missing_segments_returns_400(self) -> None:
payload = {
"input_path": "/tmp/in.mp4",
"output_path": "/tmp/out.mp4",
"keep_segments": [],
"mode": "fast",
"captions": "none",
}
res = self.client.post("/export", json=payload)
self.assertEqual(res.status_code, 400)
self.assertIn("No segments to export", res.json()["detail"])
@patch("routers.export.export_stream_copy")
def test_export_runtime_error_returns_500(self, mock_export_stream_copy) -> None:
mock_export_stream_copy.side_effect = RuntimeError("export-fail")
payload = {
"input_path": "/tmp/in.mp4",
"output_path": "/tmp/out.mp4",
"keep_segments": [{"start": 0.0, "end": 2.0}],
"mode": "fast",
"captions": "none",
}
res = self.client.post("/export", json=payload)
self.assertEqual(res.status_code, 500)
self.assertEqual(res.json()["detail"], "export-fail")
if __name__ == "__main__":
unittest.main()


@ -0,0 +1,74 @@
from pathlib import Path
import tempfile
import os
import logging
try:
from moviepy import AudioFileClip
except ImportError:
from moviepy.editor import AudioFileClip
logger = logging.getLogger(__name__)
_temp_audio_files = []
def extract_audio(video_path: Path):
"""Extract audio from a video file into a temp directory for automatic cleanup."""
logger.info(f"[extract_audio] Extracting audio from: {video_path}")
try:
audio = AudioFileClip(str(video_path))
if audio.duration is None or audio.duration == 0:
logger.error(f"[extract_audio] File has no audio track or zero duration: {video_path}")
raise RuntimeError(f"File has no audio track: {video_path}")
logger.info(f"[extract_audio] Duration: {audio.duration:.2f}s, fps: {audio.fps}")
temp_dir = tempfile.mkdtemp(prefix="videotranscriber_")
audio_path = Path(temp_dir) / f"{video_path.stem}_audio.wav"
try:
audio.write_audiofile(str(audio_path), logger=None)
except TypeError:
# moviepy 1.x uses verbose parameter; moviepy 2.x removed it
audio.write_audiofile(str(audio_path), verbose=False, logger=None)
audio.close()
if not audio_path.exists() or audio_path.stat().st_size == 0:
logger.error(f"[extract_audio] Output WAV is empty or missing: {audio_path}")
raise RuntimeError(f"Audio extraction produced empty file: {audio_path}")
logger.info(f"[extract_audio] Extracted to: {audio_path} ({audio_path.stat().st_size} bytes)")
_temp_audio_files.append(str(audio_path))
return audio_path
except RuntimeError:
raise
except Exception as e:
logger.error(f"[extract_audio] Failed for '{video_path}': {e}", exc_info=True)
raise RuntimeError(f"Audio extraction failed: {e}")
def cleanup_temp_audio():
"""Remove all temporary audio files created during processing."""
cleaned = 0
for fpath in _temp_audio_files:
try:
if os.path.exists(fpath):
os.remove(fpath)
parent = os.path.dirname(fpath)
if os.path.isdir(parent) and not os.listdir(parent):
os.rmdir(parent)
cleaned += 1
except Exception as e:
logger.warning(f"Could not remove temp file {fpath}: {e}")
_temp_audio_files.clear()
return cleaned
def get_video_duration(video_path: Path):
"""Get duration of a video/audio file in seconds."""
try:
clip = AudioFileClip(str(video_path))
duration = clip.duration
clip.close()
if duration is None or duration == 0:
logger.warning(f"[get_video_duration] Zero or null duration for: {video_path}")
return duration
except Exception as e:
logger.error(f"[get_video_duration] Failed for '{video_path}': {e}", exc_info=True)
return None


@ -1,12 +1,9 @@
"""
GPU utilities for the OBS Recording Transcriber.
GPU utilities for the Video Transcriber.
Provides functions to detect and configure GPU acceleration.
"""
import logging
import os
import platform
import subprocess
import torch
# Configure logging
@ -68,8 +65,6 @@ def get_optimal_device():
def set_memory_limits(memory_fraction=0.8):
global torch
import torch
"""
Set memory limits for GPU usage.

backend/video_editor.py Normal file

@ -0,0 +1,77 @@
#!/usr/bin/env python3
"""
Video editing operations using FFmpeg.
"""
import json
import sys
from pathlib import Path
# Add backend to path
sys.path.insert(0, str(Path(__file__).parent))
from services.video_editor import export_stream_copy, export_reencode, export_reencode_with_subs, get_video_info
def main():
if len(sys.argv) < 2:
print("Usage: python video_editor.py <command> [args...]", file=sys.stderr)
sys.exit(1)
command = sys.argv[1]
try:
if command == "export_stream_copy":
if len(sys.argv) != 5:
print("Usage: python video_editor.py export_stream_copy <input_path> <output_path> <keep_segments_json>", file=sys.stderr)
sys.exit(1)
input_path = sys.argv[2]
output_path = sys.argv[3]
keep_segments = json.loads(sys.argv[4])
result = export_stream_copy(input_path, output_path, keep_segments)
print(json.dumps({"output_path": result}))
elif command == "export_reencode":
if len(sys.argv) != 7:
print("Usage: python video_editor.py export_reencode <input_path> <output_path> <keep_segments_json> <resolution> <format_hint>", file=sys.stderr)
sys.exit(1)
input_path = sys.argv[2]
output_path = sys.argv[3]
keep_segments = json.loads(sys.argv[4])
resolution = sys.argv[5]
format_hint = sys.argv[6]
result = export_reencode(input_path, output_path, keep_segments, resolution, format_hint)
print(json.dumps({"output_path": result}))
elif command == "export_reencode_with_subs":
if len(sys.argv) != 8:
print("Usage: python video_editor.py export_reencode_with_subs <input_path> <output_path> <keep_segments_json> <subtitle_path> <resolution> <format_hint>", file=sys.stderr)
sys.exit(1)
input_path = sys.argv[2]
output_path = sys.argv[3]
keep_segments = json.loads(sys.argv[4])
subtitle_path = sys.argv[5]
resolution = sys.argv[6]
format_hint = sys.argv[7]
result = export_reencode_with_subs(input_path, output_path, keep_segments, subtitle_path, resolution, format_hint)
print(json.dumps({"output_path": result}))
elif command == "get_video_info":
if len(sys.argv) != 3:
print("Usage: python video_editor.py get_video_info <input_path>", file=sys.stderr)
sys.exit(1)
input_path = sys.argv[2]
result = get_video_info(input_path)
print(json.dumps(result))
else:
print(f"Unknown command: {command}", file=sys.stderr)
sys.exit(1)
except Exception as e:
print(json.dumps({"error": str(e)}), file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()

86
close Executable file
View File

@ -0,0 +1,86 @@
#!/bin/bash
# Close TalkEdit processes (Tauri dev and Python backend)
KILLED_ANY=0
kill_pids() {
local label=$1
shift
local pids=("$@")
[[ ${#pids[@]} -eq 0 ]] && return
echo "Stopping $label (PID(s): ${pids[*]})..."
kill -TERM "${pids[@]}" 2>/dev/null || true
sleep 0.7
local survivors=()
local pid
for pid in "${pids[@]}"; do
if kill -0 "$pid" 2>/dev/null; then
survivors+=("$pid")
fi
done
if [[ ${#survivors[@]} -gt 0 ]]; then
echo "Force killing stubborn $label PID(s): ${survivors[*]}"
kill -KILL "${survivors[@]}" 2>/dev/null || true
fi
KILLED_ANY=1
}
kill_tree() {
local pid=$1
local children
children=$(pgrep -P "$pid" 2>/dev/null || true)
if [[ -n "$children" ]]; then
local child
for child in $children; do
kill_tree "$child"
done
fi
kill_pids "process tree" "$pid"
}
kill_port() {
local port=$1
local name=$2
local pids
pids=$(lsof -ti tcp:"$port" 2>/dev/null)
if [[ -n "$pids" ]]; then
# Kill any children first so watcher subprocesses do not survive.
local pid
for pid in $pids; do
kill_tree "$pid"
done
kill_pids "$name listener on port $port" $pids
fi
}
kill_pattern() {
local pattern=$1
local label=$2
local pids
pids=$(pgrep -f "$pattern" 2>/dev/null)
if [[ -n "$pids" ]]; then
kill_pids "$label" $pids
fi
}
# --- TalkEdit (Tauri, port 8000) ---
kill_port 8000 "TalkEdit"
kill_port 5173 "TalkEdit frontend"
kill_pattern "tauri.*TalkEdit\|TalkEdit.*tauri\|cargo.*tauri dev\|/TalkEdit/target/debug" "TalkEdit (Tauri dev)"
# Vite dev server for TalkEdit (fallback when not bound to 5173 yet)
kill_pattern "[/ ]vite([[:space:]]|$)\|[/ ]rsbuild([[:space:]]|$)" "TalkEdit frontend dev server"
# --- Orphaned uvicorn workers ---
kill_pattern "uvicorn.*main:app.*--port 8000" "leftover uvicorn workers (TalkEdit)"
kill_pattern "uvicorn.*main:app.*--port 8642" "leftover uvicorn workers"
if [[ $KILLED_ANY -eq 0 ]]; then
echo "Nothing to close — no TalkEdit processes found."
else
echo "Done."
fi

View File

@ -1,70 +0,0 @@
version: '3.8'
services:
videotranscriber:
# Use prebuilt image from GitHub Container Registry
image: ghcr.io/dataants-ai/videotranscriber:latest
container_name: videotranscriber
ports:
- "8501:8501"
volumes:
# Mount your video files directory (change the left path to your actual videos folder)
- "${VIDEO_PATH:-./videos}:/app/data/videos"
# Mount output directory for transcripts and summaries
- "${OUTPUT_PATH:-./outputs}:/app/data/outputs"
# Mount cache directory for model caching (optional, improves performance)
- "${CACHE_PATH:-./cache}:/app/data/cache"
# Mount a config directory if needed
- "${CONFIG_PATH:-./config}:/app/config"
environment:
# Ollama configuration for host access
- OLLAMA_API_URL=${OLLAMA_API_URL:-http://host.docker.internal:11434/api}
# Optional: HuggingFace token for advanced features
- HF_TOKEN=${HF_TOKEN:-}
# GPU configuration
- CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}
# Cache settings
- TRANSFORMERS_CACHE=/app/data/cache/transformers
- WHISPER_CACHE=/app/data/cache/whisper
restart: unless-stopped
# Use bridge networking for Windows/Mac with host.docker.internal
networks:
- videotranscriber-network
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8501/_stcore/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 60s
# Alternative GPU-enabled service (uncomment to use)
# videotranscriber-gpu:
# image: ghcr.io/dataants-ai/videotranscriber:latest-gpu
# container_name: videotranscriber-gpu
# ports:
# - "8501:8501"
# volumes:
# - "${VIDEO_PATH:-./videos}:/app/data/videos"
# - "${OUTPUT_PATH:-./outputs}:/app/data/outputs"
# - "${CACHE_PATH:-./cache}:/app/data/cache"
# - "${CONFIG_PATH:-./config}:/app/config"
# environment:
# - OLLAMA_API_URL=${OLLAMA_API_URL:-http://host.docker.internal:11434/api}
# - HF_TOKEN=${HF_TOKEN:-}
# - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
# - TRANSFORMERS_CACHE=/app/data/cache/transformers
# - WHISPER_CACHE=/app/data/cache/whisper
# deploy:
# resources:
# reservations:
# devices:
# - driver: nvidia
# count: 1
# capabilities: [gpu]
# restart: unless-stopped
# networks:
# - videotranscriber-network
networks:
videotranscriber-network:
driver: bridge

View File

@ -1,51 +0,0 @@
version: '3.8'
services:
videotranscriber:
build: .
container_name: videotranscriber
ports:
- "8501:8501"
volumes:
# Mount your video files directory (change the left path to your actual videos folder)
- "${VIDEO_PATH:-./videos}:/app/data/videos"
# Mount output directory for transcripts and summaries
- "${OUTPUT_PATH:-./outputs}:/app/data/outputs"
# Mount cache directory for model caching (optional, improves performance)
- "${CACHE_PATH:-./cache}:/app/data/cache"
# Mount a config directory if needed
- "${CONFIG_PATH:-./config}:/app/config"
environment:
# Ollama configuration for host access
- OLLAMA_API_URL=${OLLAMA_API_URL:-http://host.docker.internal:11434/api}
# Optional: HuggingFace token for advanced features
- HF_TOKEN=${HF_TOKEN:-}
# GPU configuration
- CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}
# Cache settings
- TRANSFORMERS_CACHE=/app/data/cache/transformers
- WHISPER_CACHE=/app/data/cache/whisper
# For GPU access (uncomment if you have NVIDIA GPU and nvidia-docker)
# deploy:
# resources:
# reservations:
# devices:
# - driver: nvidia
# count: 1
# capabilities: [gpu]
restart: unless-stopped
# For Linux hosts, you might prefer host networking for better Ollama access
# network_mode: host # Uncomment for Linux hosts
# Use bridge networking for Windows/Mac with host.docker.internal
networks:
- videotranscriber-network
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8501/_stcore/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 60s
networks:
videotranscriber-network:
driver: bridge

View File

@ -1,63 +0,0 @@
# VideoTranscriber Docker Configuration
# Copy this file to .env and modify the values as needed
# =============================================================================
# DOCKER VOLUME PATHS (Host Directories)
# =============================================================================
# Path to your video files directory on the host
# This directory will be mounted into the container at /app/data/videos
VIDEO_PATH=./videos
# Path where outputs (transcripts, summaries) will be saved on the host
# This directory will be mounted into the container at /app/data/outputs
OUTPUT_PATH=./outputs
# Path for caching ML models and processed files (improves performance)
# This directory will be mounted into the container at /app/data/cache
CACHE_PATH=./cache
# Optional: Configuration directory for custom settings
CONFIG_PATH=./config
# =============================================================================
# OLLAMA CONFIGURATION
# =============================================================================
# Ollama API URL - how the container accesses your host Ollama service
# For Windows/Mac with Docker Desktop: use host.docker.internal
# For Linux: use host networking or the actual host IP
OLLAMA_API_URL=http://host.docker.internal:11434/api
# =============================================================================
# ML MODEL CONFIGURATION
# =============================================================================
# HuggingFace token for advanced features (speaker diarization, etc.)
# Get your token at: https://huggingface.co/settings/tokens
# Leave empty if not using advanced features
HF_TOKEN=
# GPU Configuration
# Specify which GPU devices to use (leave empty for all available)
# Examples: "0" for first GPU, "0,1" for first two GPUs
CUDA_VISIBLE_DEVICES=
# =============================================================================
# DOCKER-SPECIFIC SETTINGS
# =============================================================================
# Container name (change if you want to run multiple instances)
CONTAINER_NAME=videotranscriber
# Port mapping (host:container)
HOST_PORT=8501
# =============================================================================
# EXAMPLE USAGE
# =============================================================================
# 1. Copy this file: cp docker.env.example .env
# 2. Edit the paths to match your system
# 3. Make sure Ollama is running on your host: ollama serve
# 4. Start the container: docker-compose up -d
# 5. Access the app at: http://localhost:8501

73
docs/ai-policy.md Normal file
View File

@ -0,0 +1,73 @@
# AI Execution Policy
Purpose: define what autonomous AI can do in this repository without explicit human approval.
## Default Mode
- AI may implement and debug within approved scope.
- AI must run validation commands after code changes.
- AI must stop and escalate when blocked by policy or ambiguity.
## Allowed Autonomous Actions
1. Edit frontend, backend, shared schema, docs, and scripts.
2. Add/modify tests related to the task.
3. Run non-destructive validation commands.
4. Update project docs and Copilot instructions when behavior changes.
## Restricted Actions (Require Approval)
1. Security/privacy-sensitive logic changes.
2. Data migrations or destructive file operations.
3. Credential handling changes or secrets management changes.
4. Breaking API/schema changes.
5. Build/release signing, packaging, and deployment automation changes.
## Prohibited Actions
1. Destructive git commands (`git reset --hard`, force pushing protected branches).
2. Deleting user project/media data.
3. Bypassing required checks in CI.
## Required Validation Workflow
For each autonomous task:
1. Implement smallest safe change set.
2. Run lint/type/test/build checks for impacted scope.
3. Inspect errors and fix with bounded retries.
4. Re-run checks until green or escalated.
5. Produce concise summary with risks and assumptions.
## Escalation Triggers
AI must ask a human when:
1. Requirements are ambiguous and affect user-visible behavior.
2. Multiple product choices are plausible with no clear preference.
3. Potential legal, security, or compliance impact exists.
4. CI remains failing after 3 repair attempts in the same area.
5. A requested operation conflicts with this policy.
## Required Artifacts In AI PR/Change Summary
1. What changed.
2. Why it changed.
3. Validation commands and outcome.
4. Residual risks.
5. Follow-up tasks.
## Risk Levels
- Low: docs, styling, isolated refactors, non-critical bugfixes.
- Medium: feature additions with contract-stable behavior.
- High: API/schema/security/export pipeline/transcription pipeline changes.
High-risk changes require explicit human review before merge.
## TalkEdit-Specific Rules
1. Preserve compatibility for desktop bridge contracts unless explicitly approved.
2. Keep routers thin and business logic in backend services.
3. Export/transcription pipeline changes must include regression tests.
4. Linux WebKit startup behavior and media URL consistency are mandatory regression targets.

View File

@ -0,0 +1,44 @@
# Gitea Runner — Windows Laptop
Self-hosted runner registered as `windows-laptop` with label `windows-latest`.
## Setup
```powershell
# Download
Invoke-WebRequest -Uri "https://gitea.com/gitea/runner/releases/download/v1.0.1/gitea-runner-1.0.1-windows-amd64.exe" -OutFile "$env:USERPROFILE\gitea-runner-windows-amd64.exe"
# Register (Admin PowerShell)
.\gitea-runner-windows-amd64.exe register --instance http://143.244.157.110:3000 --token NS5LXzLzNOvPKD9Id4SrLQ09bReHOrn6T2c4EyGM --name windows-laptop --labels windows-latest --no-interactive
# Start (foreground)
.\gitea-runner-windows-amd64.exe daemon
# Install as Windows service (auto-starts on boot)
.\gitea-runner-windows-amd64.exe service install
```
## Logs
### Workflow job logs (step output)
Stored on the Gitea server (not locally). Download from:
`http://143.244.157.110:3000/<owner>/<repo>/actions/runs/<run_id>`
Click a job, then the **Download log** button at the top-right.
### Runner daemon logs (runner itself)
| Mode | Log location |
|---|---|
| Foreground (`daemon`) | PowerShell console stdout |
| Windows service (`service install`) | `%ProgramData%\gitea-runner\log\` or Windows Event Viewer → Windows Logs → Application |
## Diagnostics
If a CI job fails, download the full log from the Gitea Actions UI (as above), then search for the first error:
- **Rust**: look for `error[E...]`, `error: could not compile`, or `cargo test` failures
- **Python**: look for `FAILED`, `AssertionError`, or `ModuleNotFoundError`
The runner's own logs (`daemon` mode) will show which job it picked up, container lifecycle, and any infrastructure issues (disk full, Docker unavailable, etc.).
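A small helper sketch for finding that first error automatically; the log filename is hypothetical and the markers simply mirror the list above:
```python
import re
from pathlib import Path

# Markers mirroring the guidance above; extend as needed.
MARKERS = [
    r"error\[E\d+\]", r"error: could not compile",            # Rust
    r"\bFAILED\b", r"AssertionError", r"ModuleNotFoundError",  # Python
]
PATTERN = re.compile("|".join(MARKERS))

def first_error(log_path: str) -> str | None:
    """Return the first log line matching a known error marker, if any."""
    for line in Path(log_path).read_text(errors="replace").splitlines():
        if PATTERN.search(line):
            return line
    return None

print(first_error("job-log.txt"))  # "job-log.txt" is a placeholder for the downloaded log
```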

View File

@ -0,0 +1,113 @@
# Error Codes Runbook
Purpose: provide consistent, AI-readable error categories for faster autonomous debugging.
## Format
Use codes in this format: `<SUBSYSTEM>-<CATEGORY>-<ID>`
Examples:
- `BE-EXPORT-001`
- `FE-WAVEFORM-002`
- `HOST-BRIDGE-003`
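A quick validation sketch (illustrative only; the subsystem prefixes listed are just the ones used in this runbook):
```python
import re

# Matches codes like BE-EXPORT-002 per the <SUBSYSTEM>-<CATEGORY>-<ID> format.
# The subsystem set here mirrors this runbook and is not exhaustive.
ERROR_CODE_RE = re.compile(r"(BE|FE|HOST)-[A-Z]+-\d{3}")

def is_valid_error_code(code: str) -> bool:
    return bool(ERROR_CODE_RE.fullmatch(code))

assert is_valid_error_code("BE-EXPORT-001")
assert not is_valid_error_code("export-fail")
```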
## Backend (FastAPI / Services)
### Export
- `BE-EXPORT-001`: Export request validation failed.
- Symptoms: HTTP 400, missing/invalid ranges.
- Likely causes: malformed payload, empty segments.
- First checks: request body shape, keep/mute/gain ranges.
- `BE-EXPORT-002`: FFmpeg command failed.
- Symptoms: HTTP 500, stderr contains filter/codec error.
- Likely causes: invalid filter chain, unsupported codec/container.
- First checks: generated FFmpeg args, source media codec, target format.
- `BE-EXPORT-003`: Caption burn-in/subtitle generation failed.
- Symptoms: burn-in export fails while plain export works.
- Likely causes: ASS generation issue, subtitle path/temp file cleanup race.
- First checks: ASS file generation, temp file lifecycle.
### Transcription
- `BE-TRANSCRIBE-001`: Model unavailable or download failure.
- Symptoms: transcription never starts or exits early.
- Likely causes: missing model, network/cache issue.
- First checks: model cache path, ensure-model logs.
- `BE-TRANSCRIBE-002`: Inference pipeline runtime failure.
- Symptoms: mid-run crash, partial output.
- Likely causes: CUDA/CPU mismatch, unsupported media, resource exhaustion.
- First checks: environment, GPU availability, media decoding logs.
### Audio / Waveform
- `BE-AUDIO-001`: Waveform endpoint failed.
- Symptoms: waveform panel shows unavailable/error.
- Likely causes: decode error, invalid file path, unsupported media input.
- First checks: `audio/waveform` response body, file existence, FFmpeg decode path.
## Frontend (React)
### Timeline / Zones
- `FE-TIMELINE-001`: Zone interaction state inconsistency.
- Symptoms: cannot drag/select/delete zones predictably.
- Likely causes: stale selection/editing state, hidden/selected mismatch.
- First checks: zone mode flags, selectedZone state transitions.
- `FE-TIMELINE-002`: Visibility filter mismatch.
- Symptoms: hidden zones still interactive or selected.
- Likely causes: hit-testing ignores visibility flags.
- First checks: hit-test filters and selected-zone reset logic.
### Media UI
- `FE-WAVEFORM-001`: Waveform fetch failed.
- Symptoms: warning banner with URL/error.
- Likely causes: backend unavailable, bad path encoding, CORS/proxy issue.
- First checks: backend health endpoint, waveform URL, network tab logs.
- `FE-PROJECT-001`: Project load mismatch.
- Symptoms: loaded media/transcript differs from saved data.
- Likely causes: schema drift, fallback URL mismatch.
- First checks: project schema fields, loadVideo/loadProject URL parity.
## Host / Bridge (Tauri)
- `HOST-BRIDGE-001`: Desktop API bridge unavailable.
- Symptoms: open/save/transcribe actions no-op or throw.
- Likely causes: bridge init error, host command mismatch.
- First checks: bridge initialization, command names, runtime environment.
- `HOST-WEBKIT-001`: Linux WebKit startup/render regression.
- Symptoms: noisy startup errors, UI load issues.
- Likely causes: CSP/font regressions, unsupported protocol calls.
- First checks: CSP config, remote font usage, bridge fallback behavior.
## Logging Guidance
When raising errors, include:
1. Error code.
2. Human message.
3. Correlation/request id.
4. Relevant paths/ids (sanitized).
5. Suggested first-check hints.
Example structured payload:
```json
{
"code": "BE-EXPORT-002",
"message": "FFmpeg export failed",
"requestId": "exp_20260415_001",
"context": {
"format": "mp4",
"mode": "reencode"
}
}
```
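A minimal sketch of how a backend service could surface this payload, assuming FastAPI's `HTTPException`; the helper name and request id are illustrative, not existing code:
```python
from fastapi import HTTPException

def raise_export_failure(request_id: str, fmt: str, mode: str) -> None:
    """Raise BE-EXPORT-002 with the structured payload shown above."""
    raise HTTPException(
        status_code=500,
        detail={
            "code": "BE-EXPORT-002",
            "message": "FFmpeg export failed",
            "requestId": request_id,
            "context": {"format": fmt, "mode": mode},
        },
    )
```
Routers that catch service-level exceptions can translate them into this shape so logs and the frontend see the same code.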

113
docs/spec-template.md Normal file
View File

@ -0,0 +1,113 @@
# Feature Spec Template
Use this template for every net-new feature and major behavior change.
## Metadata
- Spec ID: SPEC-YYYYMMDD-<short-name>
- Owner:
- Date:
- Status: draft | approved | in-progress | done
- Related issue/PR:
## Problem Statement
Describe the user problem in 2-5 sentences.
## User Story
As a <user type>, I want <capability>, so that <outcome>.
## Scope
### In Scope
1.
2.
3.
### Out of Scope
1.
2.
## Functional Requirements
1.
2.
3.
## Acceptance Criteria
1. Given <state>, when <action>, then <result>.
2. Given <state>, when <action>, then <result>.
3. Failure handling is deterministic and user-visible.
## UX Notes
- Entry points (toolbar/panel/command):
- Empty/loading/error states:
- Keyboard shortcuts / accessibility expectations:
## API And Data Contracts
- Endpoints impacted:
- Request/response changes:
- Backward compatibility plan:
- Project schema impact (`shared/project-schema.json`):
## Architecture Impact
- Frontend files/components likely affected:
- Backend routers/services likely affected:
- Tauri/bridge changes required:
## Risks
1.
2.
## Test Plan
### Unit Tests
1.
2.
### Integration Tests
1.
2.
### E2E / Smoke Tests
1.
2.
### Regression Tests
List known regressions this spec must prevent.
## Observability
- New logs/error codes:
- Metrics/traces needed:
- Diagnostics artifacts expected on failure:
## Rollout Plan
1. Development and internal validation.
2. Staged rollout or feature flag (if applicable).
3. Rollback path.
## Open Questions
1.
2.
## Definition Of Done
1. Acceptance criteria pass.
2. Tests added and green.
3. Docs/instructions updated.
4. Risks and assumptions recorded in PR summary.

18
docs/specs/README.md Normal file
View File

@ -0,0 +1,18 @@
# Feature Specs
Place one feature spec document in this folder for each feature or major behavior change.
Use [docs/spec-template.md](../spec-template.md) as the canonical template.
Recommended naming format:
- `YYYY-MM-DD-short-feature-name.md`
Examples:
- `2026-04-15-gain-zones-and-visibility-filters.md`
- `2026-04-16-speed-adjustment.md`
CI policy:
- Pull requests that change app code are expected to include at least one changed spec file in this folder.

26
frontend/eslint.config.js Normal file
View File

@ -0,0 +1,26 @@
import js from '@eslint/js';
import globals from 'globals';
import reactHooks from 'eslint-plugin-react-hooks';
import reactRefresh from 'eslint-plugin-react-refresh';
import tseslint from 'typescript-eslint';
export default tseslint.config(
{ ignores: ['dist', 'node_modules'] },
{
extends: [js.configs.recommended, ...tseslint.configs.recommended],
files: ['**/*.{ts,tsx}'],
languageOptions: {
ecmaVersion: 2020,
globals: globals.browser,
},
plugins: {
'react-hooks': reactHooks,
'react-refresh': reactRefresh,
},
rules: {
...reactHooks.configs.recommended.rules,
'react-refresh/only-export-components': ['warn', { allowConstantExport: true }],
'@typescript-eslint/no-explicit-any': 'off',
},
},
);

16
frontend/index.html Normal file
View File

@ -0,0 +1,16 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta http-equiv="Content-Security-Policy" content="default-src 'self'; script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline' https://fonts.googleapis.com; font-src 'self' data: https://fonts.gstatic.com; connect-src 'self' ipc: http://ipc.localhost http://localhost:* http://127.0.0.1:* ws://localhost:* ws://127.0.0.1:*; media-src 'self' file: blob: http://localhost:* http://127.0.0.1:*; img-src 'self' data: blob:;" />
<link rel="preconnect" href="https://fonts.googleapis.com" />
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet" />
<title>TalkEdit</title>
</head>
<body class="bg-editor-bg text-editor-text antialiased">
<div id="root"></div>
<script type="module" src="/src/main.tsx"></script>
</body>
</html>

5383
frontend/package-lock.json generated Normal file

File diff suppressed because it is too large

44
frontend/package.json Normal file
View File

@ -0,0 +1,44 @@
{
"name": "talkedit-frontend",
"private": true,
"version": "0.1.0",
"type": "module",
"scripts": {
"dev": "vite",
"build": "tsc -b && vite build",
"lint": "eslint .",
"test": "vitest run",
"preview": "vite preview"
},
"dependencies": {
"@tauri-apps/api": "^2",
"@tauri-apps/plugin-dialog": "^2",
"@tauri-apps/plugin-fs": "^2",
"lucide-react": "^0.468.0",
"react": "^19.0.0",
"react-dom": "^19.0.0",
"react-virtuoso": "^4.18.3",
"wavesurfer.js": "^7.8.0",
"zundo": "^2.3.0",
"zustand": "^5.0.0"
},
"devDependencies": {
"@eslint/js": "^9.39.4",
"@tauri-apps/cli": "^2",
"@types/react": "^19.0.0",
"@types/react-dom": "^19.0.0",
"@vitejs/plugin-react": "^4.3.0",
"autoprefixer": "^10.4.20",
"eslint": "^9.39.4",
"eslint-plugin-react-hooks": "^7.0.1",
"eslint-plugin-react-refresh": "^0.5.2",
"globals": "^17.5.0",
"jsdom": "^29.1.1",
"postcss": "^8.4.49",
"tailwindcss": "^3.4.0",
"typescript": "^5.7.0",
"typescript-eslint": "^8.58.2",
"vite": "^6.0.0",
"vitest": "^4.1.4"
}
}

View File

@ -0,0 +1,6 @@
export default {
plugins: {
tailwindcss: {},
autoprefixer: {},
},
};

1085
frontend/src/App.tsx Normal file

File diff suppressed because it is too large

View File

@ -0,0 +1,474 @@
import { useCallback, useState } from 'react';
import { useEditorStore } from '../store/editorStore';
import { useAIStore } from '../store/aiStore';
import { useLicenseStore } from '../store/licenseStore';
import { Sparkles, Scissors, Film, Loader2, Check, X, Play, Download, RotateCcw, RefreshCw, Lock } from 'lucide-react';
import type { ClipSuggestion } from '../types/project';
interface AIPanelProps {
onReprocess: () => void;
whisperModel: string;
setWhisperModel: (model: string) => void;
}
export default function AIPanel({ onReprocess, whisperModel, setWhisperModel }: AIPanelProps) {
const { words, videoPath, backendUrl, deleteWordRange, setCurrentTime } = useEditorStore();
const canUseAI = useLicenseStore((s) => s.canUseAI);
const setShowLicenseDialog = useLicenseStore((s) => s.setShowDialog);
const {
defaultProvider,
providers,
customFillerWords,
fillerResult,
clipSuggestions,
isProcessing,
processingMessage,
setCustomFillerWords,
setFillerResult,
setClipSuggestions,
setProcessing,
} = useAIStore();
const [activeTab, setActiveTab] = useState<'filler' | 'clips' | 'reprocess'>('filler');
const [error, setError] = useState<string | null>(null);
const detectFillers = useCallback(async () => {
if (words.length === 0) return;
setError(null);
setProcessing(true, 'Detecting filler words...');
try {
const config = providers[defaultProvider];
const transcript = words.map((w) => w.word).join(' ');
const res = await fetch(`${backendUrl}/ai/filler-removal`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
transcript,
words: words.map((w, i) => ({ index: i, word: w.word })),
provider: defaultProvider,
model: config.model,
api_key: config.apiKey || undefined,
base_url: config.baseUrl || undefined,
custom_filler_words: customFillerWords || undefined,
}),
});
if (!res.ok) {
const errData = await res.json().catch(() => ({}));
throw new Error(errData.error || `Filler detection failed (${res.status})`);
}
const data = await res.json();
setFillerResult(data);
} catch (err) {
console.error(err);
setError(err instanceof Error ? err.message : 'Filler detection failed');
} finally {
setProcessing(false);
}
}, [words, backendUrl, defaultProvider, providers, customFillerWords, setProcessing, setFillerResult]);
const createClips = useCallback(async () => {
if (words.length === 0) return;
setError(null);
setProcessing(true, 'Finding best clip segments...');
try {
const config = providers[defaultProvider];
const transcript = words.map((w) => w.word).join(' ');
const res = await fetch(`${backendUrl}/ai/create-clip`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
transcript,
words: words.map((w, i) => ({
index: i,
word: w.word,
start: w.start,
end: w.end,
})),
provider: defaultProvider,
model: config.model,
api_key: config.apiKey || undefined,
base_url: config.baseUrl || undefined,
target_duration: 60,
}),
});
if (!res.ok) {
const errData = await res.json().catch(() => ({}));
throw new Error(errData.error || `Clip creation failed (${res.status})`);
}
const data = await res.json();
setClipSuggestions(data.clips || []);
} catch (err) {
console.error(err);
setError(err instanceof Error ? err.message : 'Clip creation failed');
} finally {
setProcessing(false);
}
}, [words, backendUrl, defaultProvider, providers, setProcessing, setClipSuggestions]);
const applyFillerDeletions = useCallback(() => {
if (!fillerResult) return;
const sorted = [...fillerResult.fillerWords].sort((a, b) => b.index - a.index);
for (const fw of sorted) {
deleteWordRange(fw.index, fw.index);
}
setFillerResult(null);
}, [fillerResult, deleteWordRange, setFillerResult]);
const handlePreviewClip = useCallback(
(clip: ClipSuggestion) => {
setCurrentTime(clip.startTime);
const video = document.querySelector('video');
if (video) {
video.currentTime = clip.startTime;
video.play();
}
},
[setCurrentTime],
);
const [exportingClipIndex, setExportingClipIndex] = useState<number | null>(null);
const handleExportClip = useCallback(
async (clip: ClipSuggestion, index: number) => {
if (!videoPath) return;
setExportingClipIndex(index);
try {
const safeName = clip.title.replace(/[^a-zA-Z0-9_-]/g, '_').substring(0, 40);
const dirSep = videoPath.lastIndexOf('\\') >= 0 ? '\\' : '/';
const dir = videoPath.substring(0, videoPath.lastIndexOf(dirSep));
const outputPath = `${dir}${dirSep}${safeName}_clip.mp4`;
const res = await fetch(`${backendUrl}/export`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
input_path: videoPath,
output_path: outputPath,
keep_segments: [{ start: clip.startTime, end: clip.endTime }],
mode: 'fast',
format: 'mp4',
}),
});
if (!res.ok) throw new Error('Export failed');
const data = await res.json();
alert(`Clip exported to: ${data.output_path}`);
} catch (err) {
console.error(err);
alert('Failed to export clip. Check console for details.');
} finally {
setExportingClipIndex(null);
}
},
[videoPath, backendUrl],
);
return (
<div className="flex flex-col h-full">
<div className="flex border-b border-editor-border shrink-0">
<TabButton
active={activeTab === 'filler'}
onClick={() => setActiveTab('filler')}
icon={<Scissors className="w-3.5 h-3.5" />}
label="Filler Words"
title="Detect and remove filler words from transcript"
/>
<TabButton
active={activeTab === 'clips'}
onClick={() => setActiveTab('clips')}
icon={<Film className="w-3.5 h-3.5" />}
label="Create Clips"
title="Find the best segments for social media clips"
/>
<TabButton
active={activeTab === 'reprocess'}
onClick={() => setActiveTab('reprocess')}
icon={<RefreshCw className="w-3.5 h-3.5" />}
label="Reprocess"
title="Re-run transcription with a different Whisper model"
/>
</div>
<div className="flex-1 overflow-y-auto p-4">
{activeTab === 'filler' && (
<div className="space-y-4">
{!canUseAI ? (
<div className="text-center py-8 px-4">
<Lock className="w-8 h-8 text-editor-text-muted mx-auto mb-3" />
<p className="text-sm font-medium mb-1">AI editing requires Business</p>
<p className="text-xs text-editor-text-muted mb-4">
Upgrade to Business to unlock filler word removal, clip suggestions, and more.
</p>
<button
onClick={() => setShowLicenseDialog(true)}
className="px-4 py-2 bg-editor-accent hover:bg-editor-accent-hover text-white rounded-lg text-sm font-medium transition-colors"
>
Upgrade Now
</button>
</div>
) : (
<>
<p className="text-xs text-editor-text-muted">
Use AI to detect and remove filler words like "um", "uh", "like", "you know" from
your transcript.
</p>
<div className="space-y-1.5">
<label className="text-[11px] text-editor-text-muted font-medium">
Custom filler words (comma-separated)
</label>
<input
type="text"
value={customFillerWords}
onChange={(e) => setCustomFillerWords(e.target.value)}
placeholder="e.g. okay, alright, anyway"
className="w-full px-2.5 py-1.5 text-xs bg-editor-surface border border-editor-border rounded focus:border-editor-accent focus:outline-none"
/>
</div>
<button
onClick={detectFillers}
disabled={isProcessing || words.length === 0}
title="Scan the entire transcript for filler words (um, uh, like, you know) and mark for removal"
className="w-full flex items-center justify-center gap-2 px-4 py-2.5 bg-editor-accent hover:bg-editor-accent-hover disabled:opacity-40 rounded-lg text-sm font-medium transition-colors"
>
{isProcessing ? (
<>
<Loader2 className="w-4 h-4 animate-spin" />
{processingMessage}
</>
) : (
<>
<Sparkles className="w-4 h-4" />
Detect Filler Words
</>
)}
</button>
{error && (
<div className="bg-red-500/10 border border-red-500/40 rounded text-xs text-red-300 p-2 flex items-center justify-between">
<span>{error}</span>
<button
onClick={detectFillers}
className="flex items-center gap-1 px-2 py-1 text-xs bg-red-500/20 hover:bg-red-500/30 rounded transition-colors shrink-0 ml-2"
>
<RotateCcw className="w-3 h-3" /> Retry
</button>
</div>
)}
{fillerResult && fillerResult.fillerWords.length > 0 && (
<div className="space-y-3">
<div className="flex items-center justify-between">
<span className="text-xs font-medium">
Found {fillerResult.fillerWords.length} filler words
</span>
<div className="flex gap-1">
<button
onClick={applyFillerDeletions}
title="Create cut ranges for all detected filler words at once"
className="flex items-center gap-1 px-2 py-1 text-xs bg-editor-success/20 text-editor-success rounded hover:bg-editor-success/30"
>
<Check className="w-3 h-3" /> Apply All
</button>
<button
onClick={() => { setFillerResult(null); setError(null); }}
title="Clear detected filler word results without applying"
className="flex items-center gap-1 px-2 py-1 text-xs bg-editor-border text-editor-text-muted rounded hover:bg-editor-surface"
>
<X className="w-3 h-3" /> Dismiss
</button>
</div>
</div>
<div className="space-y-1 max-h-64 overflow-y-auto">
{fillerResult.fillerWords.map((fw) => (
<div
key={fw.index}
className="flex items-center justify-between px-2 py-1.5 bg-editor-word-filler rounded text-xs"
>
<span>
<strong>"{fw.word}"</strong>
<span className="text-editor-text-muted ml-1"> {fw.reason}</span>
</span>
</div>
))}
</div>
</div>
)}
{fillerResult && fillerResult.fillerWords.length === 0 && (
<p className="text-xs text-editor-success">No filler words detected.</p>
)}
</>
)}
</div>
)}
{activeTab === 'clips' && (
<div className="space-y-4">
{!canUseAI ? (
<div className="text-center py-8 px-4">
<Lock className="w-8 h-8 text-editor-text-muted mx-auto mb-3" />
<p className="text-sm font-medium mb-1">AI clip suggestions require Business</p>
<p className="text-xs text-editor-text-muted mb-4">
Upgrade to Business to find the best segments for social media clips.
</p>
<button
onClick={() => setShowLicenseDialog(true)}
className="px-4 py-2 bg-editor-accent hover:bg-editor-accent-hover text-white rounded-lg text-sm font-medium transition-colors"
>
Upgrade Now
</button>
</div>
) : (
<>
<p className="text-xs text-editor-text-muted">
AI analyzes your transcript and suggests the most engaging segments for a
YouTube Short or social media clip.
</p>
<button
onClick={createClips}
disabled={isProcessing || words.length === 0}
title="Analyze transcript to find the most engaging 20-60 second segments for social media"
className="w-full flex items-center justify-center gap-2 px-4 py-2.5 bg-editor-accent hover:bg-editor-accent-hover disabled:opacity-40 rounded-lg text-sm font-medium transition-colors"
>
{isProcessing ? (
<>
<Loader2 className="w-4 h-4 animate-spin" />
{processingMessage}
</>
) : (
<>
<Film className="w-4 h-4" />
Find Best Clips
</>
)}
</button>
{error && (
<div className="bg-red-500/10 border border-red-500/40 rounded text-xs text-red-300 p-2 flex items-center justify-between">
<span>{error}</span>
<button
onClick={createClips}
className="flex items-center gap-1 px-2 py-1 text-xs bg-red-500/20 hover:bg-red-500/30 rounded transition-colors shrink-0 ml-2"
>
<RotateCcw className="w-3 h-3" /> Retry
</button>
</div>
)}
{clipSuggestions.length > 0 && (
<div className="space-y-3">
{clipSuggestions.map((clip, i) => (
<div key={i} className="p-3 bg-editor-surface rounded-lg space-y-2">
<div className="flex items-center justify-between">
<span className="text-xs font-semibold">{clip.title}</span>
<span className="text-[10px] text-editor-text-muted">
{Math.round(clip.endTime - clip.startTime)}s
</span>
</div>
<p className="text-[11px] text-editor-text-muted">{clip.reason}</p>
<div className="flex gap-2">
<button
onClick={() => handlePreviewClip(clip)}
title="Seek to this clip's position and play a preview"
className="flex-1 flex items-center justify-center gap-1 px-2 py-1.5 text-xs bg-editor-accent/20 text-editor-accent rounded hover:bg-editor-accent/30 transition-colors"
>
<Play className="w-3 h-3" /> Preview
</button>
<button
onClick={() => handleExportClip(clip, i)}
disabled={exportingClipIndex === i}
title="Export just this segment as a standalone video file"
className="flex-1 flex items-center justify-center gap-1 px-2 py-1.5 text-xs bg-editor-success/20 text-editor-success rounded hover:bg-editor-success/30 disabled:opacity-40 transition-colors"
>
{exportingClipIndex === i ? (
<Loader2 className="w-3 h-3 animate-spin" />
) : (
<Download className="w-3 h-3" />
)}
Export
</button>
</div>
</div>
))}
</div>
)}
</>
)}
</div>
)}
{activeTab === 'reprocess' && (
<div className="space-y-4">
<p className="text-xs text-editor-text-muted">
Re-running transcription with a different model replaces the current transcript entirely.
</p>
<div className="space-y-1.5">
<label className="text-[11px] text-editor-text-muted font-medium">
Whisper Model
</label>
<select
value={whisperModel}
onChange={(e) => setWhisperModel(e.target.value)}
className="w-full px-2.5 py-1.5 text-xs bg-editor-surface border border-editor-border rounded focus:border-editor-accent focus:outline-none"
>
<optgroup label="Multilingual (any language)">
<option value="tiny">tiny ~75 MB · fastest, low accuracy</option>
<option value="base">base ~140 MB · fast, decent accuracy</option>
<option value="small">small ~460 MB · good balance</option>
<option value="medium">medium ~1.5 GB · better accuracy</option>
<option value="large-v2">large-v2 ~2.9 GB · high accuracy</option>
<option value="large-v3">large-v3 ~2.9 GB · best overall </option>
<option value="large-v3-turbo">large-v3-turbo ~1.6 GB · fast + accurate </option>
<option value="distil-large-v3">distil-large-v3 ~1.5 GB · fast, near large-v3 quality</option>
</optgroup>
<optgroup label="English-only (faster &amp; more accurate for English)">
<option value="tiny.en">tiny.en ~75 MB · fastest English</option>
<option value="base.en">base.en ~140 MB · fast English</option>
<option value="small.en">small.en ~460 MB · good English</option>
<option value="medium.en">medium.en ~1.5 GB · great English</option>
<option value="distil-small.en">distil-small.en ~190 MB · fast English </option>
<option value="distil-medium.en">distil-medium.en ~750 MB · best fast English </option>
</optgroup>
</select>
</div>
<button
onClick={onReprocess}
disabled={isProcessing || words.length === 0}
title="Re-run transcription with the selected model — this will replace all current words"
className="w-full flex items-center justify-center gap-2 px-4 py-2.5 bg-editor-accent hover:bg-editor-accent-hover disabled:opacity-40 rounded-lg text-sm font-medium transition-colors"
>
<RefreshCw className="w-4 h-4" />
Reprocess Transcript
</button>
</div>
)}
</div>
</div>
);
}
function TabButton({
active,
onClick,
icon,
label,
title,
}: {
active: boolean;
onClick: () => void;
icon: React.ReactNode;
label: string;
title?: string;
}) {
return (
<button
onClick={onClick}
title={title}
className={`flex-1 flex items-center justify-center gap-1.5 px-3 py-2.5 text-xs font-medium transition-colors border-b-2 ${
active
? 'border-editor-accent text-editor-accent'
: 'border-transparent text-editor-text-muted hover:text-editor-text'
}`}
>
{icon}
{label}
</button>
);
}

View File

@ -0,0 +1,84 @@
import { useEditorStore } from '../store/editorStore';
import { Video, Plus, Trash2, ChevronUp, ChevronDown } from 'lucide-react';
export default function AppendClipPanel() {
const { additionalClips, addAdditionalClip, removeAdditionalClip, reorderAdditionalClip, videoPath } = useEditorStore();
const handleAddClip = async () => {
const path = await window.electronAPI?.openFile({
filters: [
{ name: 'Video Files', extensions: ['mp4', 'mkv', 'mov', 'avi', 'webm'] },
{ name: 'All Files', extensions: ['*'] },
],
});
if (path) {
addAdditionalClip(path);
}
};
return (
<div className="p-4 space-y-3">
<h3 className="text-sm font-semibold flex items-center gap-1.5">
<Video className="w-4 h-4" />
Append Clips
</h3>
<p className="text-[10px] text-editor-text-muted leading-relaxed">
Load additional video clips to append after the main video. Clips are concatenated in order during export.
</p>
{additionalClips.length === 0 ? (
<div className="text-[11px] text-editor-text-muted text-center py-3">
No additional clips loaded
</div>
) : (
<div className="space-y-1 max-h-60 overflow-y-auto">
{additionalClips.map((clip, idx) => (
<div
key={clip.id}
className="flex items-center gap-2 p-2 rounded bg-editor-surface border border-editor-border text-xs"
>
<Video className="w-3 h-3 text-editor-accent shrink-0" />
<span className="flex-1 truncate text-editor-text">{clip.label}</span>
<span className="text-[10px] text-editor-text-muted shrink-0">#{idx + 1}</span>
<div className="flex items-center gap-0.5 shrink-0">
<button
onClick={() => reorderAdditionalClip(clip.id, -1)}
disabled={idx === 0}
className="p-0.5 rounded hover:bg-editor-bg disabled:opacity-30 text-editor-text-muted hover:text-editor-text"
title="Move up"
>
<ChevronUp className="w-3 h-3" />
</button>
<button
onClick={() => reorderAdditionalClip(clip.id, 1)}
disabled={idx === additionalClips.length - 1}
className="p-0.5 rounded hover:bg-editor-bg disabled:opacity-30 text-editor-text-muted hover:text-editor-text"
title="Move down"
>
<ChevronDown className="w-3 h-3" />
</button>
</div>
<button
onClick={() => removeAdditionalClip(clip.id)}
className="p-0.5 rounded hover:bg-red-500/20 text-red-400"
title="Remove clip"
>
<Trash2 className="w-3 h-3" />
</button>
</div>
))}
</div>
)}
<button
onClick={handleAddClip}
disabled={!videoPath}
className="w-full flex items-center justify-center gap-2 px-3 py-2 rounded-lg border-2 border-dashed border-editor-border text-xs text-editor-text-muted hover:text-editor-text hover:border-editor-text-muted disabled:opacity-40 transition-colors"
title="Select a video or audio file to append during export"
>
<Plus className="w-3.5 h-3.5" />
Add Clip
</button>
</div>
);
}

View File

@ -0,0 +1,150 @@
import { useEditorStore } from '../store/editorStore';
import { Music, Trash2, Volume2, Disc3 } from 'lucide-react';
export default function BackgroundMusicPanel() {
const { backgroundMusic, setBackgroundMusic, updateBackgroundMusic } = useEditorStore();
const handleLoadMusic = async () => {
const path = await window.electronAPI?.openFile({
filters: [
{ name: 'Audio Files', extensions: ['mp3', 'wav', 'm4a', 'flac', 'ogg', 'aac', 'wma'] },
{ name: 'All Files', extensions: ['*'] },
],
});
if (path) {
setBackgroundMusic({
path,
volumeDb: -10,
duckingEnabled: true,
duckingDb: 6,
duckingAttackMs: 10,
duckingReleaseMs: 200,
});
}
};
const handleRemoveMusic = () => {
setBackgroundMusic(null);
};
return (
<div className="p-4 space-y-4">
<h3 className="text-sm font-semibold flex items-center gap-1.5">
<Music className="w-4 h-4" />
Background Music
</h3>
{!backgroundMusic ? (
<button
onClick={handleLoadMusic}
className="w-full flex items-center justify-center gap-2 px-4 py-3 rounded-lg border-2 border-dashed border-editor-border text-xs text-editor-text-muted hover:text-editor-text hover:border-editor-text-muted transition-colors"
title="Select an audio file to use as background music"
>
<Disc3 className="w-4 h-4" />
Load Music File
</button>
) : (
<div className="space-y-3">
<div className="flex items-center gap-2 p-2 rounded bg-editor-surface border border-editor-border">
<Music className="w-4 h-4 text-editor-accent shrink-0" />
<span className="flex-1 text-xs truncate">
{backgroundMusic.path.split(/[/\\]/).pop()}
</span>
<button
onClick={handleRemoveMusic}
className="p-1 rounded hover:bg-red-500/20 text-red-400 transition-colors"
title="Remove music"
>
<Trash2 className="w-3 h-3" />
</button>
</div>
<div className="space-y-2">
<div className="flex items-center gap-2">
<Volume2 className="w-3 h-3 text-editor-text-muted shrink-0" />
<span className="text-[10px] text-editor-text-muted w-16">Volume:</span>
<input
type="range"
min={-30}
max={12}
step={1}
value={backgroundMusic.volumeDb}
onChange={(e) => updateBackgroundMusic({ volumeDb: Number(e.target.value) })}
className="flex-1 h-1.5"
title="Background music volume relative to main audio — positive boosts, negative reduces"
/>
<span className="text-xs text-editor-text w-10 text-right">{backgroundMusic.volumeDb} dB</span>
</div>
</div>
<label className="flex items-center gap-2 cursor-pointer">
<input
type="checkbox"
checked={backgroundMusic.duckingEnabled}
onChange={(e) => updateBackgroundMusic({ duckingEnabled: e.target.checked })}
className="w-4 h-4 rounded bg-editor-surface border-editor-border accent-editor-accent"
title="Automatically lower music volume when speech is detected"
/>
<div>
<span className="text-xs font-medium">Auto-ducking</span>
<p className="text-[10px] text-editor-text-muted">
Lower music volume when speech is detected
</p>
</div>
</label>
{backgroundMusic.duckingEnabled && (
<div className="pl-6 space-y-2">
<div className="flex items-center gap-2">
<span className="text-[10px] text-editor-text-muted w-20">Duck amount:</span>
<input
type="range"
min={1}
max={20}
step={1}
value={backgroundMusic.duckingDb}
onChange={(e) => updateBackgroundMusic({ duckingDb: Number(e.target.value) })}
className="flex-1 h-1.5"
title="How much to reduce music volume during speech (1-20 dB)"
/>
<span className="text-xs text-editor-text w-10 text-right">{backgroundMusic.duckingDb} dB</span>
</div>
<div className="flex items-center gap-2">
<span className="text-[10px] text-editor-text-muted w-20">Attack:</span>
<input
type="range"
min={1}
max={100}
step={1}
value={backgroundMusic.duckingAttackMs}
onChange={(e) => updateBackgroundMusic({ duckingAttackMs: Number(e.target.value) })}
className="flex-1 h-1.5"
title="How quickly the ducking effect engages when speech starts"
/>
<span className="text-xs text-editor-text w-10 text-right">{backgroundMusic.duckingAttackMs}ms</span>
</div>
<div className="flex items-center gap-2">
<span className="text-[10px] text-editor-text-muted w-20">Release:</span>
<input
type="range"
min={10}
max={1000}
step={10}
value={backgroundMusic.duckingReleaseMs}
onChange={(e) => updateBackgroundMusic({ duckingReleaseMs: Number(e.target.value) })}
className="flex-1 h-1.5"
title="How quickly the ducking effect fades when speech ends"
/>
<span className="text-xs text-editor-text w-10 text-right">{backgroundMusic.duckingReleaseMs}ms</span>
</div>
</div>
)}
<p className="text-[10px] text-editor-text-muted leading-relaxed">
The music will be mixed during export. Enable auto-ducking to lower music volume whenever speech is active.
</p>
</div>
)}
</div>
);
}

View File

@ -0,0 +1,160 @@
import { useState, useCallback } from 'react';
import { useEditorStore } from '../store/editorStore';
import { Terminal, ChevronDown, ChevronUp, Play, Wifi, AlertTriangle } from 'lucide-react';
export default function DevPanel() {
const [open, setOpen] = useState(false);
const [pathInput, setPathInput] = useState('');
const [testResult, setTestResult] = useState<string | null>(null);
const [testing, setTesting] = useState(false);
const [showResetConfirm, setShowResetConfirm] = useState(false);
const { backendUrl, videoPath, loadVideo } = useEditorStore();
const handleLoad = useCallback(() => {
const p = pathInput.trim();
if (p) loadVideo(p);
}, [pathInput, loadVideo]);
const testEndpoint = useCallback(async (endpoint: string) => {
setTesting(true);
setTestResult(null);
try {
const url = `${backendUrl}${endpoint}`;
const res = await fetch(url);
const text = res.headers.get('content-type')?.includes('json')
? JSON.stringify(await res.json(), null, 2)
: `${res.status} ${res.statusText} (${res.headers.get('content-type') ?? 'no type'})`;
setTestResult(`${url}\n${text}`);
} catch (e) {
setTestResult(`${e}`);
} finally {
setTesting(false);
}
}, [backendUrl]);
const testWaveform = useCallback(async () => {
const p = pathInput.trim() || videoPath;
if (!p) { setTestResult('No path to test'); return; }
setTesting(true);
setTestResult(null);
try {
const url = `${backendUrl}/audio/waveform?path=${encodeURIComponent(p)}`;
const res = await fetch(url);
if (res.ok) {
const buf = await res.arrayBuffer();
setTestResult(`✓ Waveform OK — ${buf.byteLength} bytes\n${url}`);
} else {
const body = await res.text().catch(() => '');
setTestResult(`✗ HTTP ${res.status}\n${body}`);
}
} catch (e) {
setTestResult(`${e}`);
} finally {
setTesting(false);
}
}, [backendUrl, pathInput, videoPath]);
return (
<div className="fixed bottom-0 right-0 z-50 w-96 font-mono text-[11px]">
{/* Header */}
<button
onClick={() => setOpen(o => !o)}
className="w-full flex items-center justify-between px-3 py-1.5 bg-[#0d0f1a] border-t border-l border-[#2a2d3e] text-[#6b7280] hover:text-white"
>
<span className="flex items-center gap-1.5">
<Terminal className="w-3 h-3" />
DevPanel
<span className="ml-2 text-[#4a4f6a]">{backendUrl}</span>
</span>
{open ? <ChevronDown className="w-3 h-3" /> : <ChevronUp className="w-3 h-3" />}
</button>
{open && (
<div className="bg-[#0d0f1a] border-t border-l border-[#2a2d3e] p-3 space-y-3">
{/* State */}
<div className="space-y-0.5 text-[#4a4f6a]">
<div>backendUrl: <span className="text-[#6366f1]">{backendUrl}</span></div>
<div className="truncate">videoPath: <span className="text-[#6366f1]">{videoPath ?? 'null'}</span></div>
</div>
{/* Load file by path */}
<div className="space-y-1">
<div className="text-[#6b7280] uppercase tracking-wider text-[9px]">Load file</div>
<div className="flex gap-1">
<input
type="text"
value={pathInput}
onChange={e => setPathInput(e.target.value)}
onKeyDown={e => e.key === 'Enter' && handleLoad()}
placeholder={videoPath ?? '/path/to/file.wav'}
className="flex-1 bg-[#13141f] border border-[#2a2d3e] rounded px-2 py-1 text-white placeholder-[#2a2d3e] focus:outline-none focus:border-[#6366f1]"
/>
<button
onClick={handleLoad}
disabled={!pathInput.trim()}
className="px-2 py-1 bg-[#6366f1] hover:bg-[#4f52d4] disabled:opacity-30 rounded text-white"
>
<Play className="w-3 h-3" />
</button>
</div>
</div>
{/* Quick tests */}
<div className="space-y-1">
<div className="text-[#6b7280] uppercase tracking-wider text-[9px]">Test endpoints</div>
<div className="flex flex-wrap gap-1">
<button onClick={() => testEndpoint('/health')} className="px-2 py-0.5 bg-[#1e2030] hover:bg-[#2a2d3e] rounded text-[#6b7280] hover:text-white flex items-center gap-1">
<Wifi className="w-2.5 h-2.5" />/health
</button>
<button onClick={() => testEndpoint('/audio/capabilities')} className="px-2 py-0.5 bg-[#1e2030] hover:bg-[#2a2d3e] rounded text-[#6b7280] hover:text-white">
/audio/capabilities
</button>
<button onClick={testWaveform} disabled={testing} className="px-2 py-0.5 bg-[#1e2030] hover:bg-[#2a2d3e] disabled:opacity-40 rounded text-[#6b7280] hover:text-white">
/audio/waveform
</button>
</div>
</div>
{/* Result */}
{testResult && (
<pre className="bg-[#13141f] border border-[#2a2d3e] rounded p-2 text-[10px] text-[#9ca3af] whitespace-pre-wrap break-all max-h-32 overflow-y-auto">
{testResult}
</pre>
)}
{/* Danger Zone */}
<div className="space-y-1">
<div className="text-[#ef4444] uppercase tracking-wider text-[9px]">Danger Zone</div>
{!showResetConfirm ? (
<button
onClick={() => setShowResetConfirm(true)}
className="w-full px-2 py-1.5 rounded border border-red-500/40 text-red-400 hover:bg-red-500/10 text-xs flex items-center justify-center gap-1.5"
>
<AlertTriangle className="w-3 h-3" />
Reset Editor State
</button>
) : (
<div className="bg-[#1e1020] border border-red-500/40 rounded p-2 space-y-1.5">
<p className="text-[#fca5a5] text-[10px]">This will clear all editor data and reload the page. Unsaved changes will be lost.</p>
<div className="flex gap-1">
<button
onClick={() => setShowResetConfirm(false)}
className="flex-1 px-2 py-1 rounded text-[10px] text-[#6b7280] hover:text-white hover:bg-[#2a2d3e]"
>
Cancel
</button>
<button
onClick={() => { useEditorStore.getState().reset(); window.location.reload(); }}
className="flex-1 px-2 py-1 rounded text-[10px] border border-red-500/40 text-red-400 hover:bg-red-500/10"
>
Confirm Reset
</button>
</div>
</div>
)}
</div>
</div>
)}
</div>
);
}

View File

@ -0,0 +1,90 @@
import { Component, type ReactNode } from 'react';
interface Props {
children: ReactNode;
}
interface State {
hasError: boolean;
error: Error | null;
}
export default class ErrorBoundary extends Component<Props, State> {
constructor(props: Props) {
super(props);
this.state = { hasError: false, error: null };
}
static getDerivedStateFromError(error: Error): State {
return { hasError: true, error };
}
componentDidCatch(error: Error, info: React.ErrorInfo) {
console.error('ErrorBoundary caught:', error, info.componentStack);
try {
window.electronAPI?.logError?.(error.message, error.stack || '', info.componentStack || '');
} catch {}
}
handleReload = () => {
window.location.reload();
};
handleReset = () => {
try {
localStorage.clear();
sessionStorage.clear();
} catch {}
window.location.reload();
};
render() {
if (this.state.hasError) {
return (
<div className="h-screen flex flex-col items-center justify-center gap-6 bg-editor-bg px-6">
<div className="flex flex-col items-center gap-3 max-w-md text-center">
<div className="w-12 h-12 rounded-full bg-red-500/20 flex items-center justify-center">
<svg className="w-6 h-6 text-red-400" fill="none" viewBox="0 0 24 24" stroke="currentColor">
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M12 9v2m0 4h.01m-6.938 4h13.856c1.54 0 2.502-1.667 1.732-2.5L13.732 4c-.77-.833-1.964-.833-2.732 0L4.082 16.5c-.77.833.192 2.5 1.732 2.5z" />
</svg>
</div>
<h2 className="text-lg font-semibold text-editor-text">Something went wrong</h2>
<p className="text-xs text-editor-text-muted leading-relaxed">
An unexpected error occurred. Your work may still be recoverable.
</p>
</div>
{this.state.error && (
<details className="max-w-md w-full">
<summary className="text-xs text-editor-text-muted cursor-pointer hover:text-editor-text">
Error details
</summary>
<pre className="mt-2 p-3 rounded bg-editor-surface border border-editor-border text-[10px] text-red-300 overflow-auto max-h-32 whitespace-pre-wrap">
{this.state.error.message}
{'\n'}
{this.state.error.stack}
</pre>
</details>
)}
<div className="flex flex-col items-center gap-2">
<button
onClick={this.handleReload}
className="px-4 py-2 bg-editor-accent hover:bg-editor-accent-hover rounded-lg text-sm font-medium transition-colors"
>
Reload App
</button>
<button
onClick={this.handleReset}
className="text-xs text-editor-text-muted hover:text-editor-text underline transition-colors"
>
Reset & Clear All Data
</button>
</div>
</div>
);
}
return this.props.children;
}
}

View File

@ -0,0 +1,623 @@
import { useState, useCallback } from 'react';
import { useEditorStore } from '../store/editorStore';
import { Download, Loader2, Zap, Cog, Info, Volume2, FileText, ZoomIn, Video, Music } from 'lucide-react';
import type { ExportOptions } from '../types/project';
import { assert } from '../lib/assert';
export default function ExportDialog() {
const { videoPath, words, cutRanges, muteRanges, gainRanges, speedRanges, globalGainDb, isExporting, exportProgress, backendUrl, setExporting, getKeepSegments, additionalClips, backgroundMusic } =
useEditorStore();
const hasCuts = cutRanges.length > 0;
// Compute set of deleted word indices from cutRanges
const getDeletedSet = useCallback(() => {
const deletedSet = new Set<number>();
for (const range of cutRanges) {
for (let i = 0; i < words.length; i++) {
if (words[i].start >= range.start && words[i].end <= range.end) {
deletedSet.add(i);
}
}
}
return deletedSet;
}, [cutRanges, words]);
// Detect if input is audio-only by its extension
const audioExtensions = new Set(['.wav', '.mp3', '.flac', '.m4a', '.ogg', '.aac', '.wma']);
const inputExt = videoPath ? '.' + videoPath.split('.').pop()?.toLowerCase() : '';
const isAudioOnly = videoPath ? audioExtensions.has(inputExt) : false;
const [options, setOptions] = useState<Omit<ExportOptions, 'outputPath'> & { normalizeAudio: boolean; normalizeTarget: number }>({
mode: isAudioOnly ? 'reencode' : 'fast',
resolution: '1080p',
format: isAudioOnly ? 'wav' : 'mp4',
enhanceAudio: false,
captions: 'none',
normalizeAudio: false,
normalizeTarget: -14,
zoom: { enabled: false, zoomFactor: 1.25, panX: 0, panY: 0 },
removeBackground: false,
backgroundReplacement: 'blur',
backgroundReplacementValue: '',
});
const [exportError, setExportError] = useState<string | null>(null);
const [transcriptFormat, setTranscriptFormat] = useState<'txt' | 'srt'>('txt');
const [isTranscribingTranscript, setIsTranscribingTranscript] = useState(false);
const handleTranscriptExport = useCallback(async () => {
if (!videoPath || words.length === 0) return;
const defaultExt = transcriptFormat === 'srt' ? 'srt' : 'txt';
const outputPath = await window.electronAPI?.saveFile({
defaultPath: videoPath.replace(/\.[^.]+$/, `_transcript.${defaultExt}`),
filters: transcriptFormat === 'srt'
? [{ name: 'SRT Subtitles', extensions: ['srt'] }]
: [{ name: 'Text File', extensions: ['txt'] }],
});
if (!outputPath) return;
setIsTranscribingTranscript(true);
try {
// Compute deleted word set
const deletedSet = getDeletedSet();
// Generate content entirely on the frontend — no backend needed
let content: string;
if (transcriptFormat === 'srt') {
const lines: string[] = [];
let counter = 1;
const activeWords: Array<[number, typeof words[0]]> = [];
for (let i = 0; i < words.length; i++) {
if (!deletedSet.has(i)) activeWords.push([i, words[i]]);
}
const wordsPerLine = 8;
for (let ci = 0; ci < activeWords.length; ci += wordsPerLine) {
const chunk = activeWords.slice(ci, ci + wordsPerLine);
if (chunk.length === 0) continue;
const startTime = chunk[0][1].start;
const endTime = chunk[chunk.length - 1][1].end;
const fmt = (s: number) => {
const h = Math.floor(s / 3600);
const m = Math.floor((s % 3600) / 60);
const sec = Math.floor(s % 60);
const ms = Math.floor((s % 1) * 1000);
return `${String(h).padStart(2, '0')}:${String(m).padStart(2, '0')}:${String(sec).padStart(2, '0')},${String(ms).padStart(3, '0')}`;
};
lines.push(String(counter));
lines.push(`${fmt(startTime)} --> ${fmt(endTime)}`);
lines.push(chunk.map(([, w]) => w.word).join(' '));
lines.push('');
counter++;
}
content = lines.join('\n');
} else {
// Plain text
const activeWords: string[] = [];
for (let i = 0; i < words.length; i++) {
if (!deletedSet.has(i)) activeWords.push(words[i].word);
}
content = activeWords.join(' ');
}
// Write directly via Tauri — instant, no backend round-trip
await window.electronAPI?.writeFile(outputPath, content);
} catch (err) {
console.error('Transcript export error:', err);
setExportError(err instanceof Error ? err.message : 'Transcript export failed');
} finally {
setIsTranscribingTranscript(false);
}
}, [videoPath, words, getDeletedSet, transcriptFormat]);
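// Build the save-dialog file filter list for the currently selected output format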
const getExportFilters = useCallback(() => {
const ext = options.format;
const nameMap: Record<string, string> = {
mp4: 'MP4',
mov: 'MOV',
webm: 'WebM',
wav: 'WAV Audio',
};
return [{ name: nameMap[ext] || 'File', extensions: [ext] }];
}, [options.format]);
const handleExport = useCallback(async () => {
if (!videoPath) return;
const defaultExt = options.format;
const outputPath = await window.electronAPI?.saveFile({
defaultPath: videoPath.replace(/\.[^.]+$/, `_edited.${defaultExt}`),
filters: getExportFilters(),
});
if (!outputPath) return;
setExporting(true, 0);
setExportError(null);
try {
const keepSegments = getKeepSegments();
assert(words.length > 0, 'handleExport: words is empty before building keep segments');
const deletedSet = getDeletedSet();
// Map frontend camelCase gain/speed fields to backend snake_case
const backendGainRanges = gainRanges.map((r) => ({
start: r.start,
end: r.end,
gain_db: r.gainDb,
}));
const backendSpeedRanges = speedRanges.map((r) => ({
start: r.start,
end: r.end,
speed: r.speed,
}));
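// Request payload for the backend /export endpoint; optional range fields are omitted when empty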
const body: Record<string, any> = {
input_path: videoPath,
output_path: outputPath,
keep_segments: keepSegments,
mute_ranges: muteRanges.length > 0 ? muteRanges.map((r) => ({ start: r.start, end: r.end })) : undefined,
gain_ranges: backendGainRanges.length > 0 ? backendGainRanges : undefined,
speed_ranges: backendSpeedRanges.length > 0 ? backendSpeedRanges : undefined,
global_gain_db: globalGainDb,
words: options.captions !== 'none' ? words : undefined,
deleted_indices: options.captions !== 'none' ? [...deletedSet] : undefined,
mode: options.mode,
resolution: options.resolution,
format: options.format,
enhanceAudio: options.enhanceAudio,
normalize_loudness: options.normalizeAudio,
normalize_target_lufs: options.normalizeTarget,
captions: options.captions,
};
// Zoom
if (options.zoom?.enabled) {
body.zoom = options.zoom;
}
// Additional clips
if (additionalClips.length > 0) {
body.additional_clips = additionalClips.map((c) => c.path);
}
// Background music
if (backgroundMusic) {
body.background_music = backgroundMusic;
}
// Background removal
if (options.removeBackground) {
body.remove_background = true;
body.background_replacement = options.backgroundReplacement || 'blur';
body.background_replacement_value = options.backgroundReplacementValue || '';
}
const res = await fetch(`${backendUrl}/export`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(body),
});
if (!res.ok) {
let detail = res.statusText;
try {
const errBody = await res.json();
if (errBody?.detail) detail = String(errBody.detail);
} catch {
// Keep statusText fallback when response body is not JSON.
}
throw new Error(`Export failed: ${detail}`);
}
setExporting(false, 100);
} catch (err) {
console.error('Export error:', err);
setExportError(err instanceof Error ? err.message : 'Export failed');
setExporting(false);
}
}, [videoPath, options, backendUrl, setExporting, getKeepSegments, getDeletedSet, muteRanges, gainRanges, speedRanges, globalGainDb, words, getExportFilters, additionalClips, backgroundMusic]);
return (
<div className="p-4 space-y-5">
<h3 className="text-sm font-semibold">Export Video</h3>
{/* Mode */}
<fieldset className="space-y-2">
<legend className="text-xs text-editor-text-muted font-medium">Export Mode</legend>
<div className="grid grid-cols-2 gap-2">
<ModeCard
active={options.mode === 'fast'}
onClick={() => setOptions((o) => ({ ...o, mode: 'fast' }))}
icon={<Zap className="w-4 h-4" />}
title="Fast"
desc="Stream copy, seconds"
tooltip="Stream copy — fast, no quality loss, but does not apply cuts or effects"
/>
<ModeCard
active={options.mode === 'reencode'}
onClick={() => setOptions((o) => ({ ...o, mode: 'reencode' }))}
icon={<Cog className="w-4 h-4" />}
title="Re-encode"
desc="Custom quality, slower"
tooltip="Full re-encode — applies cuts, gain, speed, zoom, captions, and effects"
/>
</div>
</fieldset>
{/* Resolution (only for re-encode) */}
{options.mode === 'reencode' && (
<SelectField
label="Resolution"
value={options.resolution}
onChange={(v) => setOptions((o) => ({ ...o, resolution: v as ExportOptions['resolution'] }))}
options={[
{ value: '720p', label: '720p (HD)' },
{ value: '1080p', label: '1080p (Full HD)' },
{ value: '4k', label: '4K (Ultra HD)' },
]}
title="Output video resolution — higher resolution = larger file"
/>
)}
{/* Format */}
<SelectField
label="Format"
value={options.format}
onChange={(v) => setOptions((o) => ({ ...o, format: v as ExportOptions['format'] }))}
options={[
{ value: 'mp4', label: 'MP4 (H.264)' },
{ value: 'mov', label: 'MOV (QuickTime)' },
{ value: 'webm', label: 'WebM (VP9)' },
...(isAudioOnly ? [{ value: 'wav' as const, label: 'WAV (Uncompressed)' }] : []),
]}
title="Output container format — MP4 is most compatible"
/>
{/* Video zoom / punch-in */}
<div className="space-y-2 pt-1 border-t border-editor-border">
<label className="flex items-center gap-2 cursor-pointer">
<input
type="checkbox"
checked={options.zoom?.enabled || false}
onChange={(e) => setOptions((o) => ({ ...o, zoom: { ...o.zoom!, enabled: e.target.checked } }))}
className="w-4 h-4 rounded bg-editor-surface border-editor-border accent-editor-accent"
title="Crop and reposition the video frame — useful for removing black bars or reframing"
/>
<div>
<span className="text-xs font-medium flex items-center gap-1">
<ZoomIn className="w-3 h-3" />
Video zoom / punch-in
</span>
<p className="text-[10px] text-editor-text-muted">
Crop and zoom into the center of the video. Requires re-encode.
</p>
</div>
</label>
{options.zoom?.enabled && (
<div className="pl-6 space-y-2">
<div className="flex items-center gap-2">
<span className="text-[10px] text-editor-text-muted w-16">Zoom:</span>
<input
type="range"
min={1}
max={3}
step={0.05}
value={options.zoom?.zoomFactor || 1}
onChange={(e) => setOptions((o) => ({ ...o, zoom: { ...o.zoom!, zoomFactor: Number(e.target.value) } }))}
className="flex-1 h-1.5"
title="Magnification level — 1.0x is original, higher values zoom in"
/>
<span className="text-xs text-editor-text w-10 text-right">{options.zoom?.zoomFactor?.toFixed(2)}x</span>
</div>
<div className="flex items-center gap-2">
<span className="text-[10px] text-editor-text-muted w-16">Pan X:</span>
<input
type="range"
min={-1}
max={1}
step={0.05}
value={options.zoom?.panX || 0}
onChange={(e) => setOptions((o) => ({ ...o, zoom: { ...o.zoom!, panX: Number(e.target.value) } }))}
className="flex-1 h-1.5"
title="Horizontal position of the crop window — negative moves left, positive moves right"
/>
<span className="text-xs text-editor-text w-10 text-right">{((options.zoom?.panX || 0) * 100).toFixed(0)}%</span>
</div>
<div className="flex items-center gap-2">
<span className="text-[10px] text-editor-text-muted w-16">Pan Y:</span>
<input
type="range"
min={-1}
max={1}
step={0.05}
value={options.zoom?.panY || 0}
onChange={(e) => setOptions((o) => ({ ...o, zoom: { ...o.zoom!, panY: Number(e.target.value) } }))}
className="flex-1 h-1.5"
title="Vertical position of the crop window — negative moves up, positive moves down"
/>
<span className="text-xs text-editor-text w-10 text-right">{((options.zoom?.panY || 0) * 100).toFixed(0)}%</span>
</div>
</div>
)}
</div>
{/* Background removal */}
{!isAudioOnly && (
<div className="space-y-2 pt-1 border-t border-editor-border">
<label className="flex items-center gap-2 cursor-pointer">
<input
type="checkbox"
checked={options.removeBackground || false}
onChange={(e) => setOptions((o) => ({ ...o, removeBackground: e.target.checked }))}
className="w-4 h-4 rounded bg-editor-surface border-editor-border accent-editor-accent"
title="Remove or replace the background behind the speaker"
/>
<div>
<span className="text-xs font-medium flex items-center gap-1">
<Video className="w-3 h-3" />
Remove background
</span>
<p className="text-[10px] text-editor-text-muted">
Replace or blur the background. Uses MediaPipe if available.
</p>
</div>
</label>
{options.removeBackground && (
<div className="pl-6 space-y-2">
<SelectField
label="Background replacement"
value={options.backgroundReplacement || 'blur'}
onChange={(v) => setOptions((o) => ({ ...o, backgroundReplacement: v as 'blur' | 'color' | 'image' }))}
options={[
{ value: 'blur', label: 'Blur background' },
{ value: 'color', label: 'Solid color' },
{ value: 'image', label: 'Custom image' },
]}
/>
{options.backgroundReplacement === 'color' && (
<input
type="text"
value={options.backgroundReplacementValue || '#00FF00'}
onChange={(e) => setOptions((o) => ({ ...o, backgroundReplacementValue: e.target.value }))}
placeholder="#00FF00"
className="w-full px-2 py-1.5 text-xs bg-editor-surface border border-editor-border rounded focus:outline-none focus:border-editor-accent [color-scheme:dark]"
/>
)}
{options.backgroundReplacement === 'image' && (
<p className="text-[10px] text-editor-text-muted">Place a background image file path above.</p>
)}
</div>
)}
</div>
)}
{/* Background music track info */}
{backgroundMusic && (
<div className="pt-1 border-t border-editor-border">
<div className="flex items-center gap-1.5 text-xs text-editor-accent">
<Music className="w-3 h-3" />
Background music: {backgroundMusic.path.split(/[/\\]/).pop()}
</div>
</div>
)}
{/* Append clips info */}
{additionalClips.length > 0 && (
<div className="pt-1 border-t border-editor-border">
<div className="flex items-center gap-1.5 text-xs text-editor-accent">
<Video className="w-3 h-3" />
{additionalClips.length} additional clip{additionalClips.length > 1 ? 's' : ''} appended
</div>
</div>
)}
{/* Audio normalization — integrated into export */}
<div className="space-y-2 pt-1 border-t border-editor-border">
<label className="flex items-center gap-2 cursor-pointer">
<input
type="checkbox"
checked={options.normalizeAudio}
onChange={(e) => setOptions((o) => ({ ...o, normalizeAudio: e.target.checked }))}
className="w-4 h-4 rounded bg-editor-surface border-editor-border accent-editor-accent"
title="Normalize audio to a consistent loudness target"
/>
<div>
<span className="text-xs font-medium">Normalize loudness</span>
<p className="text-[10px] text-editor-text-muted">
Apply LUFS normalization during export. Requires re-encode.
</p>
</div>
</label>
{options.normalizeAudio && (
<div className="flex items-center gap-2 pl-6">
<Volume2 className="w-3 h-3 text-editor-text-muted shrink-0" />
<select
value={options.normalizeTarget}
onChange={(e) => setOptions((o) => ({ ...o, normalizeTarget: Number(e.target.value) }))}
className="flex-1 px-2 py-1.5 text-xs bg-editor-surface border border-editor-border rounded focus:outline-none focus:border-editor-accent [color-scheme:dark]"
title="Loudness target — YouTube (-14), Spotify (-16), Broadcast (-23)"
>
<option value={-14}>YouTube (-14 LUFS)</option>
<option value={-16}>Spotify (-16 LUFS)</option>
<option value={-23}>Broadcast (-23 LUFS)</option>
<option value={-11}>Loud (-11 LUFS)</option>
<option value={-9}>Very Loud (-9 LUFS)</option>
</select>
</div>
)}
</div>
{/* Audio enhancement */}
<label className="flex items-center gap-2 cursor-pointer">
<input
type="checkbox"
checked={options.enhanceAudio}
onChange={(e) => setOptions((o) => ({ ...o, enhanceAudio: e.target.checked }))}
className="w-4 h-4 rounded bg-editor-surface border-editor-border accent-editor-accent"
title="Apply noise reduction and speech enhancement"
/>
<span className="text-xs">Enhance audio (Studio Sound)</span>
</label>
{/* Captions */}
<SelectField
label="Captions"
value={options.captions}
onChange={(v) => setOptions((o) => ({ ...o, captions: v as ExportOptions['captions'] }))}
options={[
{ value: 'none', label: 'No captions' },
{ value: 'burn-in', label: 'Burn-in (permanent)' },
{ value: 'sidecar', label: 'Sidecar SRT file' },
]}
title="Burn captions into video, export as separate SRT/VTT file, or none"
/>
{/* Transcript-only export */}
<div className="space-y-2 pt-1 border-t border-editor-border">
<h4 className="text-xs font-semibold flex items-center gap-1.5">
<FileText className="w-3.5 h-3.5" />
Export Transcript Only
</h4>
<p className="text-[10px] text-editor-text-muted leading-relaxed">
Export the edited transcript as plain text or SRT without rendering video.
</p>
<div className="flex items-center gap-2">
<select
value={transcriptFormat}
onChange={(e) => setTranscriptFormat(e.target.value as 'txt' | 'srt')}
className="flex-1 px-2 py-1.5 text-xs bg-editor-surface border border-editor-border rounded focus:outline-none focus:border-editor-accent [color-scheme:dark]"
>
<option value="txt">Plain Text (.txt)</option>
<option value="srt">Subtitles (.srt)</option>
</select>
<button
onClick={handleTranscriptExport}
disabled={isTranscribingTranscript || words.length === 0}
className="flex items-center gap-1.5 px-3 py-1.5 text-xs rounded bg-editor-accent/20 text-editor-accent hover:bg-editor-accent/30 disabled:opacity-40 transition-colors"
title="Export just the transcript text or subtitles without the video"
>
{isTranscribingTranscript ? (
<Loader2 className="w-3 h-3 animate-spin" />
) : (
<FileText className="w-3 h-3" />
)}
Export
</button>
</div>
</div>
{/* Export video button */}
<button
onClick={handleExport}
disabled={isExporting || !videoPath}
className="w-full flex items-center justify-center gap-2 px-4 py-3 bg-editor-accent hover:bg-editor-accent-hover disabled:opacity-40 rounded-lg text-sm font-semibold transition-colors"
title="Start export with current settings"
>
<Download className="w-4 h-4" />
Export Video
</button>
{/* Export progress */}
{isExporting && (
<div className="space-y-2">
<div className="flex items-center gap-2">
<Loader2 className="w-4 h-4 animate-spin text-editor-accent" />
<span className="text-xs font-medium">Exporting...</span>
<span className="text-xs text-editor-text-muted">{Math.round(exportProgress)}%</span>
</div>
<div className="w-full h-2 bg-editor-border rounded-full overflow-hidden">
<div
className="h-full bg-editor-accent rounded-full transition-all duration-300"
style={{ width: `${exportProgress}%` }}
/>
</div>
<p className="text-xs text-editor-text-muted">Export in progress...</p>
</div>
)}
{exportError && (
<div className="rounded border border-red-500/40 bg-red-500/10 px-3 py-2 text-xs text-red-300">
{exportError}
</div>
)}
{options.mode === 'fast' && !hasCuts && (
<p className="text-[10px] text-editor-text-muted text-center">
Fast mode uses stream copy &mdash; no quality loss, exports in seconds.
</p>
)}
{options.mode === 'fast' && hasCuts && (
<div className="flex items-start gap-1.5 p-2 bg-editor-accent/10 rounded text-[10px] text-editor-accent">
<Info className="w-3.5 h-3.5 shrink-0 mt-0.5" />
<span>
Word-level cuts require re-encoding for frame-accurate output. Export will
automatically use re-encode mode. This takes longer but ensures your cuts are precise.
</span>
</div>
)}
</div>
);
}
function ModeCard({
active,
onClick,
icon,
title,
desc,
tooltip,
}: {
active: boolean;
onClick: () => void;
icon: React.ReactNode;
title: string;
desc: string;
tooltip?: string;
}) {
return (
<button
onClick={onClick}
title={tooltip}
className={`flex flex-col items-center gap-1 p-3 rounded-lg border-2 transition-colors ${
active
? 'border-editor-accent bg-editor-accent/10'
: 'border-editor-border hover:border-editor-text-muted'
}`}
>
{icon}
<span className="text-xs font-medium">{title}</span>
<span className="text-[10px] text-editor-text-muted">{desc}</span>
</button>
);
}
function SelectField({
label,
value,
onChange,
options,
title,
}: {
label: string;
value: string;
onChange: (value: string) => void;
options: Array<{ value: string; label: string }>;
title?: string;
}) {
return (
<div className="space-y-1">
<label className="text-xs text-editor-text-muted font-medium">{label}</label>
<select
title={title}
value={value}
onChange={(e) => onChange(e.target.value)}
className="w-full px-3 py-2 bg-editor-surface border border-editor-border rounded-lg text-xs text-editor-text focus:outline-none focus:border-editor-accent [color-scheme:dark]"
>
{options.map((opt) => (
<option key={opt.value} value={opt.value}>
{opt.label}
</option>
))}
</select>
</div>
);
}

View File

@ -0,0 +1,156 @@
import { HelpCircle, Scissors, VolumeX, SlidersHorizontal, Gauge, Film, Search, FileText, Download, Music, MapPin, ListVideo, Sparkles, Keyboard } from 'lucide-react';
export default function HelpContent() {
return (
<div className="p-4 space-y-5 overflow-y-auto">
<h3 className="text-sm font-semibold flex items-center gap-1.5">
<HelpCircle className="w-4 h-4" />
Help &amp; Reference
</h3>
<Section title="Getting Started" icon={<Film className="w-3.5 h-3.5" />}>
<Step num={1}>Open a video file: click <strong>File &gt; Open File</strong> or press <kbd>Ctrl+O</kbd></Step>
<Step num={2}>Wait for transcription: Whisper processes your audio and creates a word-level transcript</Step>
<Step num={3}>Edit by selecting words: choose <strong>Cut</strong>, <strong>Mute</strong>, <strong>Sound Gain</strong>, or <strong>Speed Adjust</strong> from the toolbar</Step>
<Step num={4}>Use AI tools: detect filler words, find clips, re-transcribe with a different model</Step>
<Step num={5}>Export: apply all edits and save your final video</Step>
<Step>Press <kbd>?</kbd> anytime to see all keyboard shortcuts</Step>
</Section>
<Section title="Cut / Mute / Sound Gain / Speed Adjust" icon={<Scissors className="w-3.5 h-3.5" />}>
<P>These are time-range edits applied during export. You create them in three ways:</P>
<Bullet>Select words in the transcript; the toolbar buttons create a zone from the selected word range</Bullet>
<Bullet>Use <strong>Mark In</strong> (<kbd>I</kbd>) and <strong>Mark Out</strong> (<kbd>O</kbd>) on the timeline, then click the toolbar button</Bullet>
<Bullet>Click a toolbar button to enter <strong>zone mode</strong>, then drag on the waveform timeline to draw a zone</Bullet>
<P className="mt-2">
<strong>Cut</strong> removes the segment from the output entirely<br />
<strong>Mute</strong> silences the audio but keeps the video<br />
<strong>Sound Gain</strong> adjusts volume (positive = louder, negative = quieter)<br />
<strong>Speed Adjust</strong> changes playback speed (1.0x = normal, 2.0x = double)
</P>
<P>View and manage all zones in the <strong>Edit Zones</strong> panel. Click a zone on the waveform to select it; drag edges to resize, drag the body to move.</P>
</Section>
<Section title="Waveform Timeline" icon={<Film className="w-3.5 h-3.5" />}>
<Bullet>Click to seek, drag to scrub through the video</Bullet>
<Bullet>Enter zone mode from the toolbar, then drag on the waveform to create a zone</Bullet>
<Bullet>Click an existing zone to select it; drag edges to resize, drag the body to move</Bullet>
<Bullet><kbd>Delete</kbd> or <kbd>Backspace</kbd> removes the selected zone (with confirmation)</Bullet>
<Bullet><kbd>Ctrl+Scroll</kbd> to zoom in/out, scroll to pan horizontally</Bullet>
<Bullet>Toggle individual zone types on/off with the colored buttons above the waveform</Bullet>
<Bullet>"Show adjusted timeline" compresses cut regions to preview the output</Bullet>
</Section>
<Section title="Transcript Editing" icon={<FileText className="w-3.5 h-3.5" />}>
<Bullet>Click a word to select it, <kbd>Shift+Click</kbd> to extend the selection</Bullet>
<Bullet><kbd>Ctrl+Click</kbd> any word to seek the video to that exact timestamp</Bullet>
<Bullet>Double-click any word to edit its text directly</Bullet>
<Bullet>Words with low confidence get an orange dotted underline; adjust the threshold in Settings</Bullet>
<Bullet><kbd>Ctrl+F</kbd> to search the transcript; navigate matches with <kbd>Enter</kbd> / <kbd>Shift+Enter</kbd></Bullet>
<Bullet>Select a word range and click <strong>Re-transcribe</strong> to re-run Whisper on just that segment</Bullet>
</Section>
<Section title="Chapter Marks" icon={<MapPin className="w-3.5 h-3.5" />}>
<Bullet>Add markers at the current playhead position with a label and color</Bullet>
<Bullet>Use <kbd>I</kbd> / <kbd>O</kbd> keys to set mark in/out points on the timeline</Bullet>
<Bullet>Markers auto-sort into chapters; click <strong>Copy as YouTube timestamps</strong> to get chapter text</Bullet>
</Section>
<Section title="AI Tools" icon={<Sparkles className="w-3.5 h-3.5" />}>
<P><strong>Filler Words</strong> detects "um", "uh", "like", "you know" and similar words. Add custom fillers (e.g. "okay", "alright"). <strong>Apply All</strong> creates cut ranges for every detection at once.</P>
<P><strong>Create Clips</strong> analyzes your transcript to find the best 20-60 second segments for TikTok, YouTube Shorts, or Instagram Reels.</P>
<P><strong>Reprocess</strong> re-runs transcription with a different Whisper model. Larger models are more accurate but slower. English-only models are faster for English content.</P>
<P>AI features work with the bundled local model (no setup needed), or via Ollama/OpenAI/Claude; configure in Settings.</P>
</Section>
<Section title="Export" icon={<Download className="w-3.5 h-3.5" />}>
<Bullet><strong>Fast mode</strong> (stream copy): instant, no quality loss but doesn't apply cuts or effects</Bullet>
<Bullet><strong>Re-encode mode</strong>: applies all edits — cuts, gain, speed, zoom, captions, background music</Bullet>
<Bullet>Captions can be burned into the video or exported as separate SRT/VTT files</Bullet>
<Bullet>Loudness normalization targets: YouTube (-14 LUFS), Spotify (-16), Broadcast (-23)</Bullet>
<Bullet>Audio enhancement: noise reduction and speech clarity</Bullet>
<Bullet>Export Transcript Only — get SRT or plain text without the video</Bullet>
</Section>
<Section title="Background Music + Add Clips" icon={<Music className="w-3.5 h-3.5" />}>
<Bullet><strong>Bkg. Music</strong> — add a music track with auto-ducking: the music automatically lowers when someone speaks. Adjust volume, duck amount, attack, and release times.</Bullet>
<Bullet><strong>Add Clips</strong> — load additional video files to concatenate during export. Drag to reorder.</Bullet>
<Bullet>Both are applied during re-encode export only</Bullet>
</Section>
<Section title="Keyboard Shortcuts" icon={<Keyboard className="w-3.5 h-3.5" />}>
<P>Press <kbd>?</kbd> anytime to see the full cheatsheet overlay. Remap any shortcut in Settings.</P>
<div className="grid grid-cols-2 gap-1 mt-2">
<Shortcut keys="Space" desc="Play / Pause" />
<Shortcut keys="J K L" desc="Slow / Pause / Speed" />
<Shortcut keys="← →" desc="Skip 5s back / forward" />
<Shortcut keys="I / O" desc="Mark In / Out points" />
<Shortcut keys="Delete" desc="Cut selected / marked range" />
<Shortcut keys="Ctrl+Z" desc="Undo" />
<Shortcut keys="Ctrl+Shift+Z" desc="Redo" />
<Shortcut keys="Ctrl+S" desc="Save project" />
<Shortcut keys="Ctrl+E" desc="Export" />
<Shortcut keys="Ctrl+F" desc="Find in transcript" />
<Shortcut keys="?" desc="Toggle cheatsheet" />
</div>
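{/* Dispatch a synthetic "?" keydown so the app's global shortcut handling opens the full cheatsheet */}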
<button
onClick={() => window.dispatchEvent(new KeyboardEvent('keydown', { key: '?' }))}
className="text-editor-accent hover:underline text-xs mt-2"
>
View full keyboard shortcut reference
</button>
</Section>
<div className="text-[10px] text-editor-text-muted leading-relaxed border-t border-editor-border pt-4">
TalkEdit is 100% offline. No account required. No data leaves your machine. No subscription — buy once, own forever.
</div>
</div>
);
}
function Section({ title, icon, children }: { title: string; icon: React.ReactNode; children: React.ReactNode }) {
return (
<div className="space-y-2 p-3 bg-editor-surface rounded-lg">
<h4 className="text-xs font-semibold flex items-center gap-1.5 text-editor-text">
{icon}
{title}
</h4>
<div className="space-y-1.5">
{children}
</div>
</div>
);
}
function P({ children, className = '' }: { children: React.ReactNode; className?: string }) {
return <p className={`text-xs text-editor-text-muted leading-relaxed ${className}`}>{children}</p>;
}
function Bullet({ children }: { children: React.ReactNode }) {
return (
<div className="flex items-start gap-1.5">
<span className="text-editor-accent mt-1.5 w-1 h-1 rounded-full bg-editor-accent shrink-0" />
<span className="text-xs text-editor-text-muted leading-relaxed">{children}</span>
</div>
);
}
function Step({ num, children }: { num?: number; children: React.ReactNode }) {
return (
<div className="flex items-start gap-2">
<span className="w-5 h-5 rounded-full bg-editor-accent/20 text-editor-accent text-[10px] font-semibold flex items-center justify-center shrink-0 mt-0.5">
{num}
</span>
<span className="text-xs text-editor-text-muted leading-relaxed">{children}</span>
</div>
);
}
function Shortcut({ keys, desc }: { keys: string; desc: string }) {
return (
<div className="flex items-center gap-2 text-xs">
<kbd className="px-1.5 py-0.5 text-[10px] font-mono bg-editor-bg border border-editor-border rounded text-editor-text min-w-[72px] text-center">{keys}</kbd>
<span className="text-editor-text-muted">{desc}</span>
</div>
);
}

View File

@ -0,0 +1,296 @@
import { useState } from 'react';
import { useLicenseStore } from '../store/licenseStore';
import { Key, Check, X, Loader2, Shield, Clock, AlertTriangle } from 'lucide-react';
export default function LicenseDialog() {
const { status, showDialog, setShowDialog, activateLicense } = useLicenseStore();
const [key, setKey] = useState('');
const [error, setError] = useState<string | null>(null);
const [activating, setActivating] = useState(false);
const [confirmedEmail, setConfirmedEmail] = useState<string | null>(null);
const [verifying, setVerifying] = useState(false);
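// Two-step activation: verify the key first to fetch the registered email, then activate only after the user confirms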
const handleActivate = async () => {
if (!key.trim()) return;
setError(null);
// If we already verified and the user confirmed, complete activation
if (confirmedEmail) {
setActivating(true);
const ok = await activateLicense(key.trim());
if (!ok) {
setError('Invalid license key. Check it was entered correctly.');
}
setActivating(false);
return;
}
// Step 1: Verify the key (don't cache yet) to get the email
setVerifying(true);
try {
const payload = await window.electronAPI?.verifyLicense(key.trim());
if (payload?.customer_email) {
setConfirmedEmail(payload.customer_email);
} else {
setError('Invalid license key. Check it was entered correctly.');
}
} catch {
setError('Invalid license key. Check it was entered correctly.');
}
setVerifying(false);
};
const handleDeny = () => {
setConfirmedEmail(null);
setKey('');
setError(null);
};
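// License expiry timestamps are unix seconds; render them as a short locale date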
const formatDate = (ts: number) => {
const d = new Date(ts * 1000);
return d.toLocaleDateString('en-US', { year: 'numeric', month: 'short', day: 'numeric' });
};
if (!status) return null;
if (status.tag === 'Licensed') {
return (
<div className="fixed bottom-4 right-4 z-50">
<div className="flex items-center gap-2 px-3 py-2 rounded-lg bg-editor-surface border border-editor-border shadow-lg text-xs">
<Shield className="w-3.5 h-3.5 text-editor-success" />
<span className="text-editor-text-muted">
{status.license.tier === 'business' ? 'Business' : 'Pro'} {status.license.customer_email}
</span>
<span className="text-editor-text-muted/50">
expires {formatDate(status.license.expires_at)}
</span>
</div>
</div>
);
}
if (status.tag === 'Trial') {
return (
<>
<div className="fixed bottom-4 right-4 z-50">
<button
onClick={() => setShowDialog(true)}
className="flex items-center gap-2 px-3 py-2 rounded-lg bg-editor-surface border border-editor-border shadow-lg text-xs hover:bg-editor-bg transition-colors"
>
<Clock className="w-3.5 h-3.5 text-editor-accent" />
<span className="text-editor-text-muted">
Trial {status.days_remaining} day{status.days_remaining !== 1 ? 's' : ''} left
</span>
</button>
</div>
{showDialog && (
<LicenseActivateDialog
onClose={() => { setShowDialog(false); handleDeny(); }}
onActivate={handleActivate}
onDeny={handleDeny}
keyValue={key}
setKeyValue={setKey}
error={error}
activating={activating}
verifying={verifying}
confirmedEmail={confirmedEmail}
trialEnding={status.days_remaining <= 3}
/>
)}
</>
);
}
// Expired — show banner + activation dialog (both dismissible)
return (
<>
<ExpiredBanner onActivate={() => setShowDialog(true)} />
{showDialog && (
<LicenseActivateDialog
onClose={() => { setShowDialog(false); handleDeny(); }}
onActivate={handleActivate}
onDeny={handleDeny}
keyValue={key}
setKeyValue={setKey}
error={error}
activating={activating}
verifying={verifying}
confirmedEmail={confirmedEmail}
expired
/>
)}
</>
);
}
/** Persistent top banner shown when trial expired — still allows export and loading */
function ExpiredBanner({ onActivate }: { onActivate: () => void }) {
return (
<div className="h-9 flex items-center justify-center gap-3 px-4 bg-red-500/15 border-b border-red-500/30 shrink-0">
<AlertTriangle className="w-3.5 h-3.5 text-red-400 shrink-0" />
<span className="text-xs text-red-300">
Trial expired. Export and project loading still work.&nbsp;
<button onClick={onActivate} className="underline font-medium hover:text-red-200">
Activate license
</button>
&nbsp;to restore editing.
</span>
</div>
);
}
function LicenseActivateDialog({
onClose,
onActivate,
onDeny,
keyValue,
setKeyValue,
error,
activating,
verifying,
confirmedEmail,
trialEnding,
expired,
}: {
onClose: () => void;
onActivate: () => void;
onDeny: () => void;
keyValue: string;
setKeyValue: (v: string) => void;
error: string | null;
activating: boolean;
verifying: boolean;
confirmedEmail: string | null;
trialEnding?: boolean;
expired?: boolean;
}) {
const isProcessing = activating || verifying;
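// Confirmation view: the key verified successfully, so show the registered email before completing activation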
if (confirmedEmail) {
return (
<div className="fixed inset-0 z-[80] flex items-center justify-center bg-black/60 px-4">
<div
className="w-full max-w-md rounded-xl border border-editor-border bg-editor-bg p-6 space-y-4"
onClick={(e) => e.stopPropagation()}
>
<div className="flex items-center gap-2">
<Shield className="w-5 h-5 text-editor-accent" />
<h3 className="text-sm font-semibold">Confirm License</h3>
</div>
<div className="p-3 rounded-lg bg-editor-surface border border-editor-border space-y-1">
<p className="text-xs text-editor-text-muted">
This license key is registered to:
</p>
<p className="text-sm font-medium text-editor-text">{confirmedEmail}</p>
</div>
<p className="text-xs text-editor-text-muted leading-relaxed">
License keys are tied to your email. Sharing this key may result in deactivation.
</p>
<div className="flex items-center justify-end gap-2 pt-1">
<button
onClick={onDeny}
className="px-3 py-1.5 rounded-md text-xs text-editor-text-muted hover:text-editor-text hover:bg-editor-surface"
>
Cancel
</button>
<button
onClick={onActivate}
disabled={activating}
className="px-4 py-2 bg-editor-accent hover:bg-editor-accent-hover disabled:opacity-40 rounded-lg text-sm font-medium transition-colors flex items-center gap-2"
>
{activating ? (
<Loader2 className="w-4 h-4 animate-spin" />
) : (
<Check className="w-4 h-4" />
)}
Activate
</button>
</div>
</div>
</div>
);
}
return (
<div className="fixed inset-0 z-[80] flex items-center justify-center bg-black/60 px-4">
<div
className="w-full max-w-md rounded-xl border border-editor-border bg-editor-bg p-6 space-y-4"
onClick={(e) => e.stopPropagation()}
>
<div className="flex items-center justify-between">
<div className="flex items-center gap-2">
<Key className="w-5 h-5 text-editor-accent" />
<h3 className="text-sm font-semibold">
{expired ? 'Trial Expired' : 'Activate TalkEdit'}
</h3>
</div>
<button
onClick={onClose}
className="p-1 rounded hover:bg-editor-surface text-editor-text-muted"
title="Close dialog"
>
<X className="w-4 h-4" />
</button>
</div>
{expired && (
<div className="text-xs text-editor-text-muted leading-relaxed space-y-1">
<p className="font-medium text-red-300">Your 30-day trial has ended.</p>
<p>
You can still <strong>export videos</strong> and <strong>load projects</strong>.
Enter a license key to restore editing, AI tools, and all other features.
</p>
</div>
)}
{trialEnding && !expired && (
<div className="flex items-start gap-2 p-3 rounded-lg bg-amber-500/10 border border-amber-500/30">
<AlertTriangle className="w-4 h-4 text-amber-400 shrink-0 mt-0.5" />
<p className="text-xs text-amber-300">Your trial ends soon. Activate now to keep using all features.</p>
</div>
)}
{!expired && !trialEnding && (
<p className="text-xs text-editor-text-muted leading-relaxed">
Enter your license key to activate TalkEdit Pro or Business.
You received this key by email after purchase.
</p>
)}
<div className="space-y-1.5">
<label className="text-xs text-editor-text-muted font-medium">License Key</label>
<textarea
value={keyValue}
onChange={(e) => { setKeyValue(e.target.value); }}
placeholder="talkedit_v1_..."
rows={3}
className="w-full px-3 py-2 text-xs font-mono bg-editor-surface border border-editor-border rounded-lg text-editor-text placeholder:text-editor-text-muted/50 focus:outline-none focus:border-editor-accent resize-none"
/>
{error && <p className="text-xs text-red-400">{error}</p>}
</div>
<button
onClick={onActivate}
disabled={isProcessing || !keyValue.trim()}
className="w-full flex items-center justify-center gap-2 px-4 py-2.5 bg-editor-accent hover:bg-editor-accent-hover disabled:opacity-40 rounded-lg text-sm font-medium transition-colors"
>
{isProcessing ? (
<Loader2 className="w-4 h-4 animate-spin" />
) : (
<Key className="w-4 h-4" />
)}
{verifying ? 'Verifying...' : 'Verify Key'}
</button>
<p className="text-[10px] text-editor-text-muted text-center">
No license? <a href="#" className="text-editor-accent hover:underline">Purchase at talked.it</a>
</p>
</div>
</div>
);
}

View File

@ -0,0 +1,171 @@
import { useState } from 'react';
import { useEditorStore } from '../store/editorStore';
import { MapPin, Trash2, PencilLine, Check, X, Copy } from 'lucide-react';
const COLOR_NAMES: Record<string, string> = {
'#6366f1': 'Indigo',
'#ef4444': 'Red',
'#22c55e': 'Green',
'#f59e0b': 'Amber',
'#3b82f6': 'Blue',
'#ec4899': 'Pink',
'#8b5cf6': 'Purple',
'#14b8a6': 'Teal',
};
const COLORS = ['#6366f1', '#ef4444', '#22c55e', '#f59e0b', '#3b82f6', '#ec4899', '#8b5cf6', '#14b8a6'];
export default function MarkersPanel() {
const { timelineMarkers, addTimelineMarker, updateTimelineMarker, removeTimelineMarker, getChapters } =
useEditorStore();
const currentTime = useEditorStore((s) => s.currentTime);
const [editingId, setEditingId] = useState<string | null>(null);
const [editLabel, setEditLabel] = useState('');
const [newLabel, setNewLabel] = useState('');
const [newColor, setNewColor] = useState(COLORS[0]);
const [showChapters, setShowChapters] = useState(false);
const chapters = getChapters();
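// Add a marker at the current playhead time; the label is optional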
const addAtCurrentTime = () => {
addTimelineMarker(currentTime, newLabel || undefined, newColor);
setNewLabel('');
};
const startEdit = (id: string, label: string) => {
setEditingId(id);
setEditLabel(label);
};
const commitEdit = (id: string) => {
if (editLabel.trim()) {
updateTimelineMarker(id, { label: editLabel.trim() });
}
setEditingId(null);
};
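// Copy chapters to the clipboard as "MM:SS Label" lines (hours prefix added only when non-zero) for YouTube descriptions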
const exportChapters = () => {
const lines = chapters.map((ch) => {
const h = Math.floor(ch.startTime / 3600);
const m = Math.floor((ch.startTime % 3600) / 60);
const s = Math.floor(ch.startTime % 60);
const timeStr = `${h > 0 ? `${h}:` : ''}${String(m).padStart(2, '0')}:${String(s).padStart(2, '0')}`;
return `${timeStr} ${ch.label}`;
});
const text = lines.join('\n');
navigator.clipboard.writeText(text).catch(() => {});
};
return (
<div className="p-4 space-y-4">
<div className="space-y-1">
<h3 className="text-sm font-semibold flex items-center gap-1.5">
<MapPin className="w-4 h-4" />
Timeline Markers
</h3>
<p className="text-xs text-editor-text-muted">
Drop markers at key points. Markers become YouTube-compatible chapters.
</p>
</div>
{/* Add marker at current time */}
<div className="space-y-2">
<div className="flex items-center gap-2">
<input
value={newLabel}
onChange={(e) => setNewLabel(e.target.value)}
placeholder={`${currentTime.toFixed(2)}s`}
className="flex-1 px-2 py-1.5 text-xs bg-editor-surface border border-editor-border rounded focus:outline-none focus:border-editor-accent"
/>
<div className="flex gap-0.5">
{COLORS.map((c) => (
<button
key={c}
onClick={() => setNewColor(c)}
className={`w-4 h-4 rounded-full border ${newColor === c ? 'border-white ring-1 ring-white' : 'border-transparent'}`}
style={{ backgroundColor: c }}
title={COLOR_NAMES[c]}
/>
))}
</div>
</div>
<button
onClick={addAtCurrentTime}
className="w-full flex items-center justify-center gap-1 px-2 py-1.5 text-xs bg-editor-accent/20 text-editor-accent hover:bg-editor-accent/30 rounded"
title="Add a marker at the current playhead position"
>
<MapPin className="w-3 h-3" />
Add
</button>
</div>
{/* Marker list */}
{timelineMarkers.length > 0 ? (
<div className="space-y-1 max-h-60 overflow-y-auto">
{timelineMarkers.map((m) => (
<div
key={m.id}
className="flex items-center gap-2 px-2 py-1.5 rounded bg-editor-surface border border-editor-border text-xs"
>
<div className="w-2.5 h-2.5 rounded-full shrink-0" style={{ backgroundColor: m.color }} />
<span className="text-[10px] text-editor-text-muted w-14 shrink-0">{m.time.toFixed(2)}s</span>
{editingId === m.id ? (
<>
<input
value={editLabel}
onChange={(e) => setEditLabel(e.target.value)}
autoFocus
className="flex-1 px-1.5 py-0.5 text-xs bg-editor-bg border border-editor-border rounded focus:outline-none focus:border-editor-accent"
/>
<button onClick={() => commitEdit(m.id)} className="p-0.5 text-editor-success"><Check className="w-3 h-3" /></button>
<button onClick={() => setEditingId(null)} className="p-0.5 text-editor-text-muted"><X className="w-3 h-3" /></button>
</>
) : (
<>
<span className="flex-1 truncate">{m.label}</span>
<button onClick={() => startEdit(m.id, m.label)} className="p-0.5 hover:text-editor-accent" title="Edit marker label and color"><PencilLine className="w-3 h-3" /></button>
<button onClick={() => { if (window.confirm("Delete this marker?")) removeTimelineMarker(m.id); }} className="p-0.5 hover:text-editor-danger" title="Delete this marker"><Trash2 className="w-3 h-3" /></button>
</>
)}
</div>
))}
</div>
) : (
<div className="p-4 rounded border border-dashed border-editor-border text-center">
<p className="text-xs text-editor-text-muted">
No markers yet. Press I and O on the timeline to set mark in/out points, then add a marker here.
</p>
</div>
)}
{/* Chapters */}
{chapters.length > 0 && (
<div className="space-y-2 pt-1 border-t border-editor-border">
<button
onClick={() => setShowChapters(!showChapters)}
className="flex items-center gap-1 text-xs font-medium text-editor-text-muted hover:text-editor-text"
>
{showChapters ? '▼' : '▶'} Chapters ({chapters.length})
</button>
{showChapters && (
<div className="space-y-1">
{chapters.map((ch) => (
<div key={ch.markerId} className="flex items-center gap-2 text-[10px] text-editor-text-muted">
<span className="font-mono">{ch.label}</span>
</div>
))}
<button
onClick={exportChapters}
className="flex items-center gap-1 text-[10px] text-editor-accent hover:underline"
title="Copy chapter timestamps to clipboard in YouTube format"
>
<Copy className="w-2.5 h-2.5" />
Copy as YouTube timestamps
</button>
</div>
)}
</div>
)}
</div>
);
}

View File

@ -0,0 +1,469 @@
import { useAIStore } from '../store/aiStore';
import { useState, useEffect, useCallback } from 'react';
import type { AIProvider, KeyBinding, HotkeyPreset, ModelInfo } from '../types/project'; // ModelInfo (used below for downloaded-model entries) is assumed to be exported from the shared project types
import { useEditorStore } from '../store/editorStore';
import { Bot, Cloud, Brain, RefreshCw, Keyboard, Trash2, HardDrive } from 'lucide-react';
import { loadBindings, saveBindings, applyPreset as applyKeyPreset, DEFAULT_PRESETS, detectConflicts as detectKeyConflicts } from '../lib/keybindings';
export default function SettingsPanel() {
const { providers, defaultProvider, setProviderConfig, setDefaultProvider } = useAIStore();
const { backendUrl, zonePreviewPaddingSeconds, setZonePreviewPaddingSeconds } = useEditorStore();
const CONFIDENCE_THRESHOLD_KEY = 'talkedit:confidenceThreshold';
const [confidenceThreshold, setConfidenceThresholdState] = useState(() => {
// Fall back to 0.6 when nothing valid is stored (Number(null) would otherwise coerce to 0)
const raw = typeof window !== 'undefined' ? window.localStorage.getItem(CONFIDENCE_THRESHOLD_KEY) : null;
const stored = raw === null || raw === '' ? NaN : Number(raw);
return Number.isFinite(stored) ? stored : 0.6;
});
const setConfidenceThreshold = (value: number) => {
const clamped = Math.max(0, Math.min(1, value));
setConfidenceThresholdState(clamped);
if (typeof window !== 'undefined') {
window.localStorage.setItem(CONFIDENCE_THRESHOLD_KEY, String(clamped));
}
};
// Keyboard shortcuts state
const [bindings, setBindings] = useState<KeyBinding[]>(() => {
try { return loadBindings(); } catch { return DEFAULT_PRESETS['standard']; }
});
const [editingKey, setEditingKey] = useState<string | null>(null);
const [editKeyValue, setEditKeyValue] = useState('');
const conflicts = detectKeyConflicts(bindings);
const persistBindings = (newB: KeyBinding[]) => {
saveBindings(newB);
setBindings(newB);
};
const applyPresetAction = (preset: HotkeyPreset) => {
persistBindings(applyKeyPreset(preset));
};
const startKeyEdit = (idx: number) => {
setEditingKey(bindings[idx].id);
setEditKeyValue(bindings[idx].keys);
};
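// Turn a live keydown into a combo string like "Ctrl+Shift+K" and persist the new binding immediately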
const handleKeyCapture = (e: React.KeyboardEvent, idx: number) => {
e.preventDefault();
const parts: string[] = [];
if (e.ctrlKey || e.metaKey) parts.push('Ctrl');
if (e.shiftKey) parts.push('Shift');
if (e.altKey) parts.push('Alt');
const key = e.key === ' ' ? 'Space' : e.key.length === 1 ? e.key.toUpperCase() : e.key;
if (!['Control', 'Shift', 'Alt', 'Meta'].includes(key)) parts.push(key);
if (parts.length === 0) return;
const combo = parts.join('+');
const newBindings = bindings.map((b, i) => (i === idx ? { ...b, keys: combo } : b));
setEditKeyValue(combo);
setEditingKey(null);
persistBindings(newBindings);
};
const handleReset = (idx: number) => {
const standard = DEFAULT_PRESETS['standard'];
const existing = standard.find((b: KeyBinding) => b.id === bindings[idx].id);
if (!existing) return;
persistBindings(bindings.map((b, i) => (i === idx ? { ...existing } : b)));
};
const [models, setModels] = useState<ModelInfo[]>([]);
const [loadingModels, setLoadingModels] = useState(false);
const [deleting, setDeleting] = useState<string | null>(null);
const fetchModels = useCallback(async () => {
setLoadingModels(true);
try {
const list = await window.electronAPI.listModels();
setModels(list);
} catch {
setModels([]);
} finally {
setLoadingModels(false);
}
}, []);
useEffect(() => {
fetchModels();
}, [fetchModels]);
const handleDeleteModel = useCallback(async (model: ModelInfo) => {
if (deleting) return;
setDeleting(model.path);
try {
await window.electronAPI.deleteModel(model.path);
setModels((prev) => prev.filter((m) => m.path !== model.path));
} catch {
// Model deletion failed silently
} finally {
setDeleting(null);
}
}, [deleting]);
const formatBytes = (bytes: number) => {
if (bytes < 1024) return `${bytes} B`;
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
return `${(bytes / (1024 * 1024 * 1024)).toFixed(1)} GB`;
};
const [ollamaModels, setOllamaModels] = useState<string[]>([]);
const [loadingOllamaModels, setLoadingOllamaModels] = useState(false);
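// Ask the backend which Ollama models are installed so the model field can render as a dropdown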
const fetchOllamaModels = useCallback(async () => {
setLoadingOllamaModels(true);
try {
const res = await fetch(`${backendUrl}/ai/ollama-models`);
if (res.ok) {
const data = await res.json();
setOllamaModels(data.models || []);
}
} catch {
setOllamaModels([]);
} finally {
setLoadingOllamaModels(false);
}
}, [backendUrl]);
useEffect(() => {
fetchOllamaModels();
}, [fetchOllamaModels]);
const providerIcons: Record<AIProvider, React.ReactNode> = {
ollama: <Bot className="w-4 h-4" />,
openai: <Cloud className="w-4 h-4" />,
claude: <Brain className="w-4 h-4" />,
};
return (
<div className="p-4 space-y-6">
<h3 className="text-sm font-semibold">Settings</h3>
<ProviderSection title="Playback" icon={<RefreshCw className="w-4 h-4" />}>
<div className="space-y-1">
<label className="text-xs text-editor-text-muted">Zone preview padding (seconds before and after)</label>
<div className="flex items-center gap-2">
<input
type="range"
min={0}
max={10}
step={0.25}
value={zonePreviewPaddingSeconds}
onChange={(e) => setZonePreviewPaddingSeconds(Number(e.target.value) || 0)}
className="flex-1 h-1.5"
title="Extra time in seconds to show before and after each zone during preview"
/>
<input
type="number"
min={0}
max={10}
step={0.25}
value={zonePreviewPaddingSeconds}
onChange={(e) => setZonePreviewPaddingSeconds(Number(e.target.value) || 0)}
className="w-16 px-2 py-1 bg-editor-bg border border-editor-border rounded-lg text-xs text-editor-text focus:outline-none focus:border-editor-accent"
title="Extra time in seconds to show before and after each zone during preview"
/>
<span className="text-xs text-editor-text-muted w-6">s</span>
</div>
</div>
</ProviderSection>
{/* Confidence threshold */}
<div className="space-y-2">
<label className="text-xs text-editor-text-muted font-medium">Low-Confidence Word Threshold</label>
<p className="text-[10px] text-editor-text-muted leading-relaxed">
Words with confidence below this value are highlighted with an orange dotted underline.
Whisper often gets homophones and proper nouns wrong at low confidence.
</p>
<div className="flex items-center gap-2">
<input
type="range"
min={0}
max={1}
step={0.05}
value={confidenceThreshold}
onChange={(e) => setConfidenceThreshold(Number(e.target.value))}
className="flex-1 h-1.5"
title="Words below this confidence get an orange underline — lower values show fewer warnings"
/>
<input
type="number"
min={0}
max={1}
step={0.05}
value={confidenceThreshold}
onChange={(e) => setConfidenceThreshold(Math.max(0, Math.min(1, Number(e.target.value) || 0)))}
className="w-16 px-2 py-1 bg-editor-bg border border-editor-border rounded-lg text-xs text-editor-text focus:outline-none focus:border-editor-accent"
title="Words below this confidence get an orange underline — lower values show fewer warnings"
/>
</div>
<div className="flex items-center justify-between text-[10px]">
<span className="text-editor-text-muted">Show all</span>
<span className="font-medium text-editor-text">{confidenceThreshold.toFixed(2)}</span>
<span className="text-editor-text-muted">Strict</span>
</div>
</div>
{/* Keyboard shortcuts */}
<div className="space-y-2 pt-1 border-t border-editor-border">
<h4 className="text-xs font-semibold flex items-center gap-1.5">
<Keyboard className="w-3.5 h-3.5" />
Keyboard Shortcuts
</h4>
<div className="flex items-center gap-2">
<button
onClick={() => applyPresetAction('standard')}
className="flex-1 px-2 py-1.5 text-xs rounded bg-editor-accent/20 text-editor-accent hover:bg-editor-accent/30"
title="Reset all shortcuts to the Standard preset"
>
Standard Preset
</button>
<button
onClick={() => applyPresetAction('left-hand')}
className="flex-1 px-2 py-1.5 text-xs rounded bg-editor-accent/20 text-editor-accent hover:bg-editor-accent/30"
title="Reset all shortcuts to the Left-Hand preset"
>
Left-Hand Preset
</button>
</div>
{conflicts.length > 0 && (
<div className="px-2 py-1 rounded border border-red-500/40 bg-red-500/10 text-[10px] text-red-300">
{conflicts.join('; ')}
</div>
)}
<div className="max-h-52 overflow-y-auto space-y-1 pr-1">
{bindings.map((b, i) => (
<div key={b.id} className="flex items-center gap-2 text-[11px]">
<span className="flex-1 truncate text-editor-text-muted">{b.label}</span>
<input
value={editingKey === b.id ? editKeyValue : b.keys}
onFocus={() => startKeyEdit(i)}
onChange={(e) => {
setEditingKey(b.id);
setEditKeyValue(e.target.value);
}}
onKeyDown={(e) => handleKeyCapture(e, i)}
className="w-28 px-2 py-1 text-[10px] font-mono bg-editor-bg border border-editor-border rounded text-center focus:outline-none focus:border-editor-accent"
placeholder="Type shortcut"
title="Click then press the desired key combination"
/>
<button
onClick={() => handleReset(i)}
className="text-[10px] text-editor-text-muted hover:text-editor-text px-1"
title="Reset this shortcut to default"
>
Reset
</button>
</div>
))}
</div>
<p className="text-[10px] text-editor-text-muted">
Press <kbd>?</kbd> anytime to view shortcuts. Changes apply immediately.
</p>
</div>
{/* Default provider selector */}
<div className="space-y-2">
<label className="text-xs text-editor-text-muted font-medium">Default AI Provider</label>
<div className="grid grid-cols-3 gap-1.5">
{(['ollama', 'openai', 'claude'] as AIProvider[]).map((p) => (
<button
key={p}
onClick={() => setDefaultProvider(p)}
title={`Use ${p.charAt(0).toUpperCase() + p.slice(1)} for AI features — ${
p === 'ollama' ? 'Use a local Ollama instance' :
p === 'openai' ? "Use OpenAI's API (requires API key)" :
"Use Anthropic's Claude API (requires API key)"
}`}
className={`flex flex-col items-center gap-1 p-2 rounded-lg border transition-colors text-[10px] ${
defaultProvider === p
? 'border-editor-accent bg-editor-accent/10 text-editor-accent'
: 'border-editor-border text-editor-text-muted hover:text-editor-text'
}`}
>
{providerIcons[p]}
{p.charAt(0).toUpperCase() + p.slice(1)}
</button>
))}
</div>
</div>
{/* Manage downloaded models */}
<div className="space-y-2 pt-1 border-t border-editor-border">
<h4 className="text-xs font-semibold flex items-center gap-1.5">
<HardDrive className="w-3.5 h-3.5" />
Manage Models
</h4>
<p className="text-[10px] text-editor-text-muted leading-relaxed">
Downloaded Whisper transcription models and bundled LLM files.
</p>
{models.length === 0 ? (
<p className="text-xs text-editor-text-muted">No downloaded models found.</p>
) : (
<div className="space-y-1.5">
{models.map((m) => (
<div key={m.path} className="flex items-center gap-2 p-2 rounded bg-editor-bg border border-editor-border">
<div className="flex-1 min-w-0">
<p className="text-xs text-editor-text truncate">{m.name}</p>
<p className="text-[10px] text-editor-text-muted">
{formatBytes(m.size_bytes)} &middot; {m.kind === 'whisper' ? 'Whisper' : 'LLM'}
</p>
</div>
<button
onClick={() => handleDeleteModel(m)}
disabled={deleting === m.path}
className="p-1.5 rounded text-editor-text-muted hover:text-red-400 hover:bg-red-500/10 transition-colors disabled:opacity-40"
title="Delete model"
>
<Trash2 className="w-3.5 h-3.5" />
</button>
</div>
))}
</div>
)}
<button
onClick={fetchModels}
disabled={loadingModels}
className="text-[10px] text-editor-accent hover:underline flex items-center gap-0.5"
title="Refresh list of downloaded models"
>
<RefreshCw className={`w-2.5 h-2.5 ${loadingModels ? 'animate-spin' : ''}`} />
Refresh
</button>
</div>
<h4 className="text-xs font-semibold uppercase tracking-wide text-editor-text-muted">AI Settings</h4>
{/* Ollama settings */}
<ProviderSection title="Ollama (Local)" icon={providerIcons.ollama}>
<InputField
label="Base URL"
value={providers.ollama.baseUrl || ''}
onChange={(v) => setProviderConfig('ollama', { baseUrl: v })}
placeholder="http://localhost:11434"
title="URL of your Ollama instance — http://localhost:11434 by default"
/>
<div className="space-y-1">
<div className="flex items-center justify-between">
<label className="text-xs text-editor-text-muted">Model</label>
<button
onClick={fetchOllamaModels}
disabled={loadingOllamaModels}
className="text-[10px] text-editor-accent hover:underline flex items-center gap-0.5"
title="Refresh available Ollama models"
>
<RefreshCw className={`w-2.5 h-2.5 ${loadingOllamaModels ? 'animate-spin' : ''}`} />
Refresh
</button>
</div>
{ollamaModels.length > 0 ? (
<select
value={providers.ollama.model}
onChange={(e) => setProviderConfig('ollama', { model: e.target.value })}
className="w-full px-3 py-2 bg-editor-surface border border-editor-border rounded-lg text-xs text-white focus:outline-none focus:border-editor-accent"
title="Which Ollama model to use for AI features"
>
{ollamaModels.map((m) => (
<option key={m} value={m}>{m}</option>
))}
</select>
) : (
<InputField
label=""
value={providers.ollama.model}
onChange={(v) => setProviderConfig('ollama', { model: v })}
placeholder="llama3"
title="Which Ollama model to use for AI features"
/>
)}
</div>
</ProviderSection>
{/* OpenAI settings */}
<ProviderSection title="OpenAI" icon={providerIcons.openai}>
<InputField
label="API Key"
value={providers.openai.apiKey || ''}
onChange={(v) => setProviderConfig('openai', { apiKey: v })}
placeholder="sk-..."
type="password"
title="Your OpenAI API key — stored encrypted on your machine"
/>
<InputField
label="Model"
value={providers.openai.model}
onChange={(v) => setProviderConfig('openai', { model: v })}
placeholder="gpt-4o"
title="OpenAI model to use (e.g. gpt-4o, gpt-4o-mini)"
/>
</ProviderSection>
{/* Claude settings */}
<ProviderSection title="Claude (Anthropic)" icon={providerIcons.claude}>
<InputField
label="API Key"
value={providers.claude.apiKey || ''}
onChange={(v) => setProviderConfig('claude', { apiKey: v })}
placeholder="sk-ant-..."
type="password"
title="Your Anthropic Claude API key — stored encrypted on your machine"
/>
<InputField
label="Model"
value={providers.claude.model}
onChange={(v) => setProviderConfig('claude', { model: v })}
placeholder="claude-sonnet-4-20250514"
title="Claude model to use (e.g. claude-sonnet-4-20250514)"
/>
</ProviderSection>
</div>
);
}
function ProviderSection({
title,
icon,
children,
}: {
title: string;
icon: React.ReactNode;
children: React.ReactNode;
}) {
return (
<div className="space-y-3 p-3 bg-editor-surface rounded-lg">
<div className="flex items-center gap-2 text-xs font-medium">
{icon}
{title}
</div>
<div className="space-y-2">{children}</div>
</div>
);
}
function InputField({
label,
value,
onChange,
placeholder,
type = 'text',
title,
}: {
label: string;
value: string;
onChange: (value: string) => void;
placeholder: string;
type?: string;
title?: string;
}) {
return (
<div className="space-y-1">
{label && <label className="text-xs text-editor-text-muted">{label}</label>}
<input
type={type}
value={value}
onChange={(e) => onChange(e.target.value)}
placeholder={placeholder}
title={title}
className="w-full px-3 py-2 bg-editor-bg border border-editor-border rounded-lg text-xs text-editor-text placeholder:text-editor-text-muted/50 focus:outline-none focus:border-editor-accent"
/>
</div>
);
}

View File

@ -0,0 +1,295 @@
import { useState } from 'react';
import { useEditorStore } from '../store/editorStore';
import { Loader2, Scissors, Trash2, RotateCcw, PencilLine, Layers } from 'lucide-react';
import type { SilenceDetectionRange, SilenceTrimSettings } from '../types/project';
export default function SilenceTrimmerPanel() {
const {
videoPath,
backendUrl,
silenceTrimGroups,
cutRanges,
applySilenceTrimGroup,
removeSilenceTrimGroup,
} = useEditorStore();
const [minSilenceMs, setMinSilenceMs] = useState(500);
const [silenceDb, setSilenceDb] = useState(-35);
const [preBufferMs, setPreBufferMs] = useState(80);
const [postBufferMs, setPostBufferMs] = useState(120);
const [isDetecting, setIsDetecting] = useState(false);
const [ranges, setRanges] = useState<SilenceDetectionRange[]>([]);
const [selectedGroupId, setSelectedGroupId] = useState<string | null>(null);
const [status, setStatus] = useState<string | null>(null);
const selectedGroup = selectedGroupId
? silenceTrimGroups.find((group) => group.id === selectedGroupId) ?? null
: null;
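// Snapshot the current detection settings; they are saved with each trim group so a pass can be reloaded and reapplied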
const buildSettings = (): SilenceTrimSettings => ({
minSilenceMs,
silenceDb,
preBufferMs,
postBufferMs,
});
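// Ask the backend to detect silent ranges in the loaded file; results stay local until applied as a cut group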
const detectSilence = async () => {
if (!videoPath) return;
setIsDetecting(true);
setRanges([]);
try {
const res = await fetch(`${backendUrl}/audio/detect-silence`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
input_path: videoPath,
min_silence_ms: minSilenceMs,
silence_db: silenceDb,
}),
});
if (!res.ok) {
let detail = `HTTP ${res.status} ${res.statusText}`;
try {
const err = await res.json();
if (err?.detail) detail += ` - ${String(err.detail)}`;
} catch {
// ignore JSON parse errors for non-JSON error responses
}
if (res.status === 404) {
detail += ' (endpoint missing: restart backend to load /audio/detect-silence)';
}
throw new Error(detail);
}
const data = await res.json();
setRanges(data.ranges || []);
setStatus(`Detected ${(data.ranges || []).length} pause ranges.`);
} catch (err) {
console.error(err);
const message = err instanceof Error ? err.message : 'Unknown error';
alert(`Silence detection failed: ${message}`);
} finally {
setIsDetecting(false);
}
};
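// Each detection pass is stored as a trim group, so all of its cut ranges can be reapplied or removed together.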
const applyAsNewGroup = () => {
if (ranges.length === 0) return;
const result = applySilenceTrimGroup({
sourceRanges: ranges,
settings: buildSettings(),
});
setSelectedGroupId(result.groupId);
setStatus(`Applied ${result.appliedCount} cut ranges as ${result.groupId}. Undo will revert this pass in one step.`);
};
const loadGroupForEditing = (groupId: string) => {
const group = silenceTrimGroups.find((entry) => entry.id === groupId);
if (!group) return;
setSelectedGroupId(groupId);
setRanges(group.sourceRanges);
setMinSilenceMs(group.settings.minSilenceMs);
setSilenceDb(group.settings.silenceDb);
setPreBufferMs(group.settings.preBufferMs);
setPostBufferMs(group.settings.postBufferMs);
setStatus(`Loaded ${group.id} for editing. Adjust settings and reapply.`);
};
const reapplySelectedGroup = () => {
if (!selectedGroupId || ranges.length === 0) return;
const result = applySilenceTrimGroup({
groupId: selectedGroupId,
sourceRanges: ranges,
settings: buildSettings(),
});
setStatus(`Reapplied ${result.groupId} with ${result.appliedCount} cut ranges.`);
};
const removeGroup = (groupId: string) => {
removeSilenceTrimGroup(groupId);
if (selectedGroupId === groupId) {
setSelectedGroupId(null);
}
setStatus(`Removed all cut ranges from ${groupId}.`);
};
return (
<div className="p-4 space-y-4">
<div className="space-y-1">
<h3 className="text-sm font-semibold">Silence / Pause Trimmer</h3>
<p className="text-xs text-editor-text-muted">
Detect pauses and convert them into cut ranges.
</p>
</div>
<div className="space-y-3">
<div className="space-y-1.5">
<label className="text-[11px] text-editor-text-muted font-medium">
Minimum pause length (ms)
</label>
<input
type="number"
min={100}
step={50}
value={minSilenceMs}
onChange={(e) => setMinSilenceMs(Number(e.target.value) || 500)}
className="w-full px-2.5 py-1.5 text-xs bg-editor-surface border border-editor-border rounded focus:border-editor-accent focus:outline-none"
title="Minimum duration of silence to detect in milliseconds"
/>
</div>
<div className="space-y-1.5">
<label className="text-[11px] text-editor-text-muted font-medium">
Silence threshold (dB)
</label>
<input
type="number"
min={-80}
max={0}
step={1}
value={silenceDb}
onChange={(e) => setSilenceDb(Number(e.target.value) || -35)}
className="w-full px-2.5 py-1.5 text-xs bg-editor-surface border border-editor-border rounded focus:border-editor-accent focus:outline-none"
title="Volume threshold in dB — lower values detect quieter sounds as silence"
/>
</div>
<div className="grid grid-cols-2 gap-2">
<div className="space-y-1.5">
<label className="text-[11px] text-editor-text-muted font-medium">
Buffer before (ms, +shrink / -expand)
</label>
<input
type="number"
min={-5000}
max={5000}
step={10}
value={preBufferMs}
onChange={(e) => setPreBufferMs(Number(e.target.value) || 0)}
className="w-full px-2.5 py-1.5 text-xs bg-editor-surface border border-editor-border rounded focus:border-editor-accent focus:outline-none"
title="Extra time to add before each detected silence"
/>
</div>
<div className="space-y-1.5">
<label className="text-[11px] text-editor-text-muted font-medium">
Buffer after (ms, +shrink / -expand)
</label>
<input
type="number"
min={-5000}
max={5000}
step={10}
value={postBufferMs}
onChange={(e) => setPostBufferMs(Number(e.target.value) || 0)}
className="w-full px-2.5 py-1.5 text-xs bg-editor-surface border border-editor-border rounded focus:border-editor-accent focus:outline-none"
title="Extra time to add after each detected silence"
/>
</div>
</div>
<button
onClick={detectSilence}
disabled={isDetecting || !videoPath}
className="w-full flex items-center justify-center gap-2 px-4 py-2.5 bg-editor-accent hover:bg-editor-accent-hover disabled:opacity-40 rounded-lg text-sm font-medium transition-colors"
title="Scan the entire audio track for silent pauses"
>
{isDetecting ? (
<>
<Loader2 className="w-4 h-4 animate-spin" />
Detecting pauses...
</>
) : (
'Detect Pauses'
)}
</button>
</div>
{status && (
<div className="text-[11px] text-editor-text-muted bg-editor-surface border border-editor-border rounded px-2.5 py-2">
{status}
</div>
)}
{ranges.length > 0 && (
<div className="space-y-2">
<div className="flex items-center justify-between">
<span className="text-xs font-medium">Detected {ranges.length} pause ranges</span>
<div className="flex items-center gap-1">
{selectedGroup && (
<button
onClick={reapplySelectedGroup}
className="flex items-center gap-1 px-2 py-1 text-xs bg-editor-warning/20 text-editor-warning rounded hover:bg-editor-warning/30"
title="Re-apply this silence trim group with current settings"
>
<RotateCcw className="w-3 h-3" />
Reapply Group
</button>
)}
<button
onClick={applyAsNewGroup}
className="flex items-center gap-1 px-2 py-1 text-xs bg-editor-accent/20 text-editor-accent rounded hover:bg-editor-accent/30"
title="Create a new silence trim group from detected pauses"
>
<Scissors className="w-3 h-3" />
Apply As New Group
</button>
</div>
</div>
<div className="max-h-56 overflow-y-auto space-y-1 pr-1">
{ranges.slice(0, 50).map((r, i) => (
<div key={`${r.start}-${r.end}-${i}`} className="px-2 py-1.5 rounded bg-editor-surface border border-editor-border text-xs">
{r.start.toFixed(2)}s - {r.end.toFixed(2)}s ({r.duration.toFixed(2)}s)
</div>
))}
</div>
</div>
)}
{silenceTrimGroups.length > 0 && (
<div className="space-y-2 pt-1">
<div className="text-xs font-medium flex items-center gap-1">
<Layers className="w-3 h-3" />
Silence Trim Groups
</div>
<div className="max-h-48 overflow-y-auto space-y-1 pr-1">
{silenceTrimGroups.map((group) => {
const groupCutCount = cutRanges.filter((range) => range.trimGroupId === group.id).length;
const isActive = selectedGroupId === group.id;
return (
<div
key={group.id}
className={`rounded border px-2 py-1.5 text-xs ${isActive ? 'border-editor-accent bg-editor-accent/10' : 'border-editor-border bg-editor-surface'}`}
>
<div className="flex items-center justify-between gap-2">
<div className="min-w-0">
<div className="font-medium truncate">{group.id}</div>
<div className="text-[10px] text-editor-text-muted">
{groupCutCount} cuts · {group.sourceRanges.length} source pauses
</div>
</div>
<div className="flex items-center gap-1 shrink-0">
<button
onClick={() => loadGroupForEditing(group.id)}
className="px-1.5 py-1 rounded hover:bg-editor-accent/20 text-editor-accent"
title="Edit and reapply this group"
>
<PencilLine className="w-3 h-3" />
</button>
<button
onClick={() => removeGroup(group.id)}
className="px-1.5 py-1 rounded hover:bg-editor-danger/20 text-editor-danger"
title="Delete all cuts from this group"
>
<Trash2 className="w-3 h-3" />
</button>
</div>
</div>
</div>
);
})}
</div>
</div>
)}
</div>
);
}

View File

@@ -0,0 +1,631 @@
import { useCallback, useRef, useEffect, useMemo, useState } from 'react';
import { useEditorStore } from '../store/editorStore';
import { useLicenseStore } from '../store/licenseStore';
import { Virtuoso } from 'react-virtuoso';
import { Scissors, VolumeX, SlidersHorizontal, Gauge, RotateCcw, Search, ChevronUp, ChevronDown, X, RefreshCw } from 'lucide-react';
import { assert } from '../lib/assert';
interface TranscriptEditorProps {
cutMode: boolean;
muteMode: boolean;
gainMode: boolean;
gainModeDb: number;
speedMode: boolean;
speedModeValue: number;
}
export default function TranscriptEditor({
cutMode,
muteMode,
gainMode,
gainModeDb,
speedMode,
speedModeValue,
}: TranscriptEditorProps) {
const words = useEditorStore((s) => s.words);
const segments = useEditorStore((s) => s.segments);
const cutRanges = useEditorStore((s) => s.cutRanges);
const muteRanges = useEditorStore((s) => s.muteRanges);
const gainRanges = useEditorStore((s) => s.gainRanges);
const speedRanges = useEditorStore((s) => s.speedRanges);
const selectedWordIndices = useEditorStore((s) => s.selectedWordIndices);
const hoveredWordIndex = useEditorStore((s) => s.hoveredWordIndex);
const setSelectedWordIndices = useEditorStore((s) => s.setSelectedWordIndices);
const setHoveredWordIndex = useEditorStore((s) => s.setHoveredWordIndex);
const videoPath = useEditorStore((s) => s.videoPath);
const backendUrl = useEditorStore((s) => s.backendUrl);
const replaceWordRange = useEditorStore((s) => s.replaceWordRange);
const removeCutRange = useEditorStore((s) => s.removeCutRange);
const removeMuteRange = useEditorStore((s) => s.removeMuteRange);
const removeGainRange = useEditorStore((s) => s.removeGainRange);
const removeSpeedRange = useEditorStore((s) => s.removeSpeedRange);
const addCutRange = useEditorStore((s) => s.addCutRange);
const addMuteRange = useEditorStore((s) => s.addMuteRange);
const addGainRange = useEditorStore((s) => s.addGainRange);
const addSpeedRange = useEditorStore((s) => s.addSpeedRange);
const getWordAtTime = useEditorStore((s) => s.getWordAtTime);
const canEdit = useLicenseStore((s) => s.canEdit);
const selectionStart = useRef<number | null>(null);
const wasDragging = useRef(false);
const virtuosoRef = useRef<any>(null);
const zoneDragStart = useRef<number | null>(null);
const [zoneDragRange, setZoneDragRange] = useState<{ start: number; end: number } | null>(null);
const [searchOpen, setSearchOpen] = useState(false);
const [searchQuery, setSearchQuery] = useState('');
const [activeMatchIdx, setActiveMatchIdx] = useState(0);
const searchInputRef = useRef<HTMLInputElement | null>(null);
const updateWordText = useEditorStore((s) => s.updateWordText);
const [editingWordIndex, setEditingWordIndex] = useState<number | null>(null);
const [editText, setEditText] = useState('');
const editInputRef = useRef<HTMLInputElement | null>(null);
const selectedSet = useMemo(() => new Set(selectedWordIndices), [selectedWordIndices]);
const matchIndices = useMemo(() => {
const q = searchQuery.trim().toLowerCase();
if (!q) return [] as number[];
const matches: number[] = [];
for (let i = 0; i < words.length; i++) {
if (words[i].word.toLowerCase().includes(q)) matches.push(i);
}
return matches;
}, [searchQuery, words]);
const matchSet = useMemo(() => new Set(matchIndices), [matchIndices]);
const safeActiveMatchIdx = matchIndices.length === 0
? 0
: Math.min(activeMatchIdx, matchIndices.length - 1);
const jumpToMatch = useCallback((idx: number) => {
if (matchIndices.length === 0) return;
const nextIdx = ((idx % matchIndices.length) + matchIndices.length) % matchIndices.length;
setActiveMatchIdx(nextIdx);
const wordIndex = matchIndices[nextIdx];
const el = document.getElementById(`word-${wordIndex}`);
if (el) {
el.scrollIntoView({ behavior: 'smooth', block: 'center', inline: 'nearest' });
}
}, [matchIndices]);
useEffect(() => {
const onKeyDown = (e: KeyboardEvent) => {
const target = e.target as HTMLElement | null;
const isInInput = !!target && (target.tagName === 'INPUT' || target.tagName === 'TEXTAREA' || target.tagName === 'SELECT');
if ((e.ctrlKey || e.metaKey) && e.key.toLowerCase() === 'f') {
e.preventDefault();
setSearchOpen(true);
requestAnimationFrame(() => searchInputRef.current?.focus());
return;
}
if (!searchOpen) return;
if (e.key === 'Escape') {
e.preventDefault();
setSearchOpen(false);
return;
}
if (e.key === 'Enter' && !isInInput) {
e.preventDefault();
jumpToMatch(safeActiveMatchIdx + (e.shiftKey ? -1 : 1));
return;
}
if (e.key === 'Enter' && isInInput && target === searchInputRef.current) {
e.preventDefault();
jumpToMatch(safeActiveMatchIdx + (e.shiftKey ? -1 : 1));
}
};
window.addEventListener('keydown', onKeyDown);
return () => window.removeEventListener('keydown', onKeyDown);
}, [jumpToMatch, searchOpen, safeActiveMatchIdx]);
const [activeWordIndex, setActiveWordIndex] = useState(-1);
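// Poll the <video> element every 250ms and map the playback time to the word under the playhead.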
useEffect(() => {
if (words.length === 0) return;
const interval = setInterval(() => {
const video = document.querySelector('video') as HTMLVideoElement | null;
if (!video) return;
const idx = getWordAtTime(video.currentTime);
setActiveWordIndex((prev) => (prev === idx ? prev : idx));
}, 250);
return () => clearInterval(interval);
}, [words, getWordAtTime]);
// Auto-scroll to active segment via Virtuoso
useEffect(() => {
if (activeWordIndex < 0 || segments.length === 0) return;
const segIdx = segments.findIndex((seg) => {
const start = seg.globalStartIndex ?? 0;
return activeWordIndex >= start && activeWordIndex < start + seg.words.length;
});
if (segIdx >= 0 && virtuosoRef.current) {
virtuosoRef.current.scrollIntoView({ index: segIdx, behavior: 'smooth', align: 'center' });
}
}, [activeWordIndex, segments]);
const handleWordMouseDown = useCallback(
(index: number, e: React.MouseEvent) => {
e.preventDefault();
// Ctrl+click → seek video to this word's start time
if (e.ctrlKey) {
const word = words[index];
if (word) {
const video = document.querySelector('video') as HTMLVideoElement | null;
if (video) video.currentTime = word.start;
}
return;
}
if (cutMode || muteMode || gainMode || speedMode) {
zoneDragStart.current = index;
setZoneDragRange({ start: index, end: index });
selectionStart.current = null;
return;
}
wasDragging.current = false;
if (e.shiftKey && selectedWordIndices.length > 0) {
const first = selectedWordIndices[0];
const start = Math.min(first, index);
const end = Math.max(first, index);
const indices = [];
for (let i = start; i <= end; i++) indices.push(i);
setSelectedWordIndices(indices);
} else {
selectionStart.current = index;
setSelectedWordIndices([index]);
}
},
[words, selectedWordIndices, setSelectedWordIndices, cutMode, muteMode, gainMode, speedMode],
);
const handleWordMouseEnter = useCallback(
(index: number) => {
setHoveredWordIndex(index);
if (zoneDragStart.current !== null) {
setZoneDragRange({
start: Math.min(zoneDragStart.current, index),
end: Math.max(zoneDragStart.current, index),
});
return;
}
if (selectionStart.current !== null) {
wasDragging.current = true;
const start = Math.min(selectionStart.current, index);
const end = Math.max(selectionStart.current, index);
const indices = [];
for (let i = start; i <= end; i++) indices.push(i);
setSelectedWordIndices(indices);
}
},
[setHoveredWordIndex, setSelectedWordIndices],
);
const handleMouseUp = useCallback(() => {
if (zoneDragStart.current !== null && zoneDragRange) {
assert(zoneDragRange.start >= 0 && zoneDragRange.start < words.length, 'handleMouseUp: zoneDragRange.start out of bounds');
assert(zoneDragRange.end >= 0 && zoneDragRange.end < words.length, 'handleMouseUp: zoneDragRange.end out of bounds');
const startWord = words[zoneDragRange.start];
const endWord = words[zoneDragRange.end];
if (startWord && endWord && canEdit) {
if (cutMode) addCutRange(startWord.start, endWord.end);
if (muteMode) addMuteRange(startWord.start, endWord.end);
if (gainMode) addGainRange(startWord.start, endWord.end, gainModeDb);
if (speedMode) addSpeedRange(startWord.start, endWord.end, speedModeValue);
}
}
zoneDragStart.current = null;
setZoneDragRange(null);
selectionStart.current = null;
}, [zoneDragRange, words, cutMode, muteMode, gainMode, gainModeDb, speedMode, speedModeValue, addCutRange, addMuteRange, addGainRange, addSpeedRange, canEdit]);
const handleClickOutside = useCallback(
(e: React.MouseEvent) => {
if (wasDragging.current) {
wasDragging.current = false;
return;
}
if ((e.target as HTMLElement).dataset.wordIndex === undefined) {
setSelectedWordIndices([]);
}
},
[setSelectedWordIndices],
);
const startEditing = useCallback((index: number) => {
const word = words[index];
if (!word) return;
setEditingWordIndex(index);
setEditText(word.word);
requestAnimationFrame(() => {
editInputRef.current?.focus();
editInputRef.current?.select();
});
}, [words]);
const commitEdit = useCallback(() => {
if (editingWordIndex === null) return;
const trimmed = editText.trim();
if (trimmed && trimmed !== words[editingWordIndex]?.word) {
updateWordText(editingWordIndex, trimmed);
}
setEditingWordIndex(null);
setEditText('');
}, [editingWordIndex, editText, words, updateWordText]);
const cancelEdit = useCallback(() => {
setEditingWordIndex(null);
setEditText('');
}, []);
const [isReTranscribing, setIsReTranscribing] = useState(false);
const reTranscribeGuard = useRef(false);
const handleReTranscribe = useCallback(async () => {
if (!videoPath || selectedWordIndices.length === 0 || reTranscribeGuard.current) return;
reTranscribeGuard.current = true;
setIsReTranscribing(true);
// Snapshot indices and word timings before the async gap
const sorted = [...selectedWordIndices].sort((a, b) => a - b);
assert(sorted[0] >= 0 && sorted[sorted.length - 1] < words.length, 'handleReTranscribe: sorted indices out of bounds');
const startWord = words[sorted[0]];
const endWord = words[sorted[sorted.length - 1]];
if (!startWord || !endWord) {
reTranscribeGuard.current = false;
setIsReTranscribing(false);
return;
}
try {
const res = await fetch(`${backendUrl}/transcribe/segment`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
file_path: videoPath,
start: startWord.start,
end: endWord.end,
}),
});
if (!res.ok) {
let detail = res.statusText;
try { const body = await res.json(); if (body?.detail) detail = String(body.detail); } catch { /* keep statusText fallback */ }
throw new Error(`Re-transcribe failed: ${detail}`);
}
const data = await res.json();
replaceWordRange(sorted[0], sorted[sorted.length - 1], data.words);
} catch (err) {
console.error('Re-transcribe error:', err);
alert(err instanceof Error ? err.message : 'Re-transcribe failed');
} finally {
reTranscribeGuard.current = false;
setIsReTranscribing(false);
}
}, [videoPath, selectedWordIndices, words, backendUrl, replaceWordRange]);
const handleWordDoubleClick = useCallback((index: number) => {
if (cutMode || muteMode || gainMode || speedMode) return;
if (!canEdit) return;
startEditing(index);
}, [cutMode, muteMode, gainMode, speedMode, startEditing, canEdit]);
// Focus edit input when it appears
useEffect(() => {
if (editingWordIndex !== null && editInputRef.current) {
editInputRef.current.focus();
editInputRef.current.select();
}
}, [editingWordIndex]);
// Global key handler for edit mode
useEffect(() => {
const onKeyDown = (e: KeyboardEvent) => {
if (editingWordIndex === null) return;
if (e.key === 'Enter') {
e.preventDefault();
commitEdit();
} else if (e.key === 'Escape') {
e.preventDefault();
cancelEdit();
}
};
window.addEventListener('keydown', onKeyDown);
return () => window.removeEventListener('keydown', onKeyDown);
}, [editingWordIndex, commitEdit, cancelEdit]);
const cutSelectedWords = useCallback(() => {
if (selectedWordIndices.length === 0) return;
const sorted = [...selectedWordIndices].sort((a, b) => a - b);
assert(sorted[0] >= 0 && sorted[0] < words.length, 'cutSelectedWords: sorted[0] out of bounds');
assert(sorted[sorted.length - 1] >= 0 && sorted[sorted.length - 1] < words.length, 'cutSelectedWords: sorted[last] out of bounds');
const startTime = words[sorted[0]].start;
const endTime = words[sorted[sorted.length - 1]].end;
addCutRange(startTime, endTime);
}, [selectedWordIndices, words, addCutRange]);
const muteSelectedWords = useCallback(() => {
if (selectedWordIndices.length === 0) return;
const sorted = [...selectedWordIndices].sort((a, b) => a - b);
const startTime = words[sorted[0]].start;
const endTime = words[sorted[sorted.length - 1]].end;
addMuteRange(startTime, endTime);
}, [selectedWordIndices, words, addMuteRange]);
const gainSelectedWords = useCallback(() => {
if (selectedWordIndices.length === 0) return;
const sorted = [...selectedWordIndices].sort((a, b) => a - b);
const startTime = words[sorted[0]].start;
const endTime = words[sorted[sorted.length - 1]].end;
addGainRange(startTime, endTime, gainModeDb);
}, [selectedWordIndices, words, addGainRange, gainModeDb]);
const speedSelectedWords = useCallback(() => {
if (selectedWordIndices.length === 0) return;
const sorted = [...selectedWordIndices].sort((a, b) => a - b);
const startTime = words[sorted[0]].start;
const endTime = words[sorted[sorted.length - 1]].end;
addSpeedRange(startTime, endTime, speedModeValue);
}, [selectedWordIndices, words, addSpeedRange, speedModeValue]);
const getCutRangeForWord = useCallback(
(wordIndex: number) => {
const word = words[wordIndex];
if (!word) return null;
return cutRanges.find((r) => word.start >= r.start && word.end <= r.end);
},
[words, cutRanges],
);
const getMuteRangeForWord = useCallback(
(wordIndex: number) => {
const word = words[wordIndex];
if (!word) return null;
return muteRanges.find((r) => word.start >= r.start && word.end <= r.end);
},
[words, muteRanges],
);
const getGainRangeForWord = useCallback(
(wordIndex: number) => {
const word = words[wordIndex];
if (!word) return null;
return gainRanges.find((r) => word.start >= r.start && word.end <= r.end);
},
[words, gainRanges],
);
const getSpeedRangeForWord = useCallback(
(wordIndex: number) => {
const word = words[wordIndex];
if (!word) return null;
return speedRanges.find((r) => word.start >= r.start && word.end <= r.end);
},
[words, speedRanges],
);
const renderSegment = useCallback(
(index: number) => {
const segment = segments[index];
if (!segment) return null;
return (
<div className="mb-3 px-4">
{segment.speaker && (
<div className="text-xs text-editor-accent font-medium mb-1">
{segment.speaker}
</div>
)}
<p className="text-sm leading-relaxed flex flex-wrap">
{segment.words.map((word, localIndex) => {
const globalIndex = (segment.globalStartIndex ?? 0) + localIndex;
const isSelected = selectedSet.has(globalIndex);
const isActive = globalIndex === activeWordIndex;
const isHovered = globalIndex === hoveredWordIndex;
const isZoneDragSelected = zoneDragRange
? globalIndex >= zoneDragRange.start && globalIndex <= zoneDragRange.end
: false;
const cutRange = getCutRangeForWord(globalIndex);
const muteRange = getMuteRangeForWord(globalIndex);
const gainRange = getGainRangeForWord(globalIndex);
const speedRange = getSpeedRangeForWord(globalIndex);
const isSearchMatch = matchSet.has(globalIndex);
const isActiveSearchMatch = matchIndices.length > 0 && matchIndices[safeActiveMatchIdx] === globalIndex;
const isEditing = globalIndex === editingWordIndex;
// Low-confidence highlighting
const CONFIDENCE_THRESHOLD_KEY = 'talkedit:confidenceThreshold';
const storedValue = typeof window !== 'undefined' ? window.localStorage.getItem(CONFIDENCE_THRESHOLD_KEY) : null;
const storedThreshold = storedValue !== null ? Number(storedValue) : NaN;
const confidenceThreshold = Number.isFinite(storedThreshold) ? storedThreshold : 0.6;
const isLowConfidence = word.confidence > 0 && word.confidence < confidenceThreshold && !cutRange && !muteRange && !gainRange && !speedRange;
const confidencePct = word.confidence > 0 ? Math.round(word.confidence * 100) : null;
return (
<span
key={globalIndex}
id={`word-${globalIndex}`}
data-word-index={globalIndex}
title={`${word.start.toFixed(2)}s — confidence: ${confidencePct !== null ? confidencePct + '%' : 'N/A'}${isLowConfidence ? ' ⚠️ Low confidence' : ''} — Ctrl+click to seek, double-click to edit`}
onMouseDown={(e) => handleWordMouseDown(globalIndex, e)}
onMouseEnter={() => handleWordMouseEnter(globalIndex)}
onMouseLeave={() => setHoveredWordIndex(null)}
onDoubleClick={() => handleWordDoubleClick(globalIndex)}
className={`
relative px-[2px] py-[1px] rounded cursor-pointer transition-colors
${cutRange ? 'bg-red-500/20 text-red-100' : ''}
${muteRange ? 'bg-blue-500/20 text-blue-100' : ''}
${gainRange ? 'bg-amber-500/20 text-amber-100' : ''}
${speedRange ? 'bg-emerald-500/20 text-emerald-100' : ''}
${isZoneDragSelected && cutMode ? 'bg-red-500/30 ring-1 ring-red-400/60' : ''}
${isZoneDragSelected && muteMode ? 'bg-blue-500/30 ring-1 ring-blue-400/60' : ''}
${isZoneDragSelected && gainMode ? 'bg-amber-500/30 ring-1 ring-amber-400/60' : ''}
${isZoneDragSelected && speedMode ? 'bg-emerald-500/30 ring-1 ring-emerald-400/60' : ''}
${isSearchMatch && !cutRange && !muteRange && !gainRange && !speedRange ? 'bg-editor-accent/15 ring-2 ring-editor-accent/50' : ''}
${isActiveSearchMatch && !cutRange && !muteRange && !gainRange && !speedRange ? 'bg-editor-accent/35 ring-2 ring-editor-accent text-white font-medium' : ''}
${isSelected && !cutRange && !muteRange && !gainRange && !speedRange ? 'bg-editor-word-selected text-white' : ''}
${isActive && !isSelected && !cutRange && !muteRange && !gainRange && !speedRange ? 'bg-editor-accent/20 text-editor-accent' : ''}
${isHovered && !isSelected && !isActive && !cutRange && !muteRange && !gainRange && !speedRange ? 'bg-editor-word-hover' : ''}
${isLowConfidence ? 'border-b border-dashed border-orange-400/60' : ''}
`}
>
{isEditing ? (
<input
ref={editInputRef}
value={editText}
onChange={(e) => setEditText(e.target.value)}
onBlur={commitEdit}
className="w-24 px-1 py-0 text-xs bg-editor-bg border border-editor-accent rounded text-editor-text focus:outline-none"
style={{ minWidth: `${Math.max(word.word.length * 8, 48)}px` }}
/>
) : (
<>{word.word}{' '}</>
)}
{(cutRange || muteRange || gainRange || speedRange) && isHovered && (
<button
onClick={(e) => {
e.stopPropagation();
if (cutRange) removeCutRange(cutRange.id);
if (muteRange) removeMuteRange(muteRange.id);
if (gainRange) removeGainRange(gainRange.id);
if (speedRange) removeSpeedRange(speedRange.id);
}}
className="absolute -top-5 left-1/2 -translate-x-1/2 flex items-center gap-0.5 px-1.5 py-0.5 bg-editor-surface border border-editor-border rounded text-[10px] text-editor-success whitespace-nowrap z-10"
>
<RotateCcw className="w-2.5 h-2.5" /> Restore
</button>
)}
</span>
);
})}
</p>
</div>
);
},
[segments, selectedSet, matchSet, matchIndices, safeActiveMatchIdx, activeWordIndex, hoveredWordIndex, handleWordMouseDown, handleWordMouseEnter, setHoveredWordIndex, getCutRangeForWord, getMuteRangeForWord, getGainRangeForWord, getSpeedRangeForWord, removeCutRange, removeMuteRange, removeGainRange, removeSpeedRange, zoneDragRange, cutMode, muteMode, gainMode, speedMode, editingWordIndex, editText, editInputRef, handleWordDoubleClick, commitEdit, setEditText],
);
return (
<div className="flex-1 flex flex-col min-h-0">
<div className="flex items-center justify-between gap-2 px-4 py-2 border-b border-editor-border shrink-0">
<div className="flex items-center gap-1.5">
<button
onClick={() => {
setSearchOpen(true);
requestAnimationFrame(() => searchInputRef.current?.focus());
}}
className="flex items-center gap-1 px-2 py-1 text-xs text-editor-text-muted hover:text-editor-text hover:bg-editor-surface rounded"
title="Find (Ctrl+F)"
>
<Search className="w-3 h-3" />
Find
</button>
{searchOpen && (
<div className="flex items-center gap-1.5 px-2 py-1 rounded border border-editor-border bg-editor-surface">
<input
ref={searchInputRef}
value={searchQuery}
onChange={(e) => {
setSearchQuery(e.target.value);
setActiveMatchIdx(0);
}}
placeholder="Search transcript"
className="w-40 bg-transparent text-xs text-editor-text focus:outline-none"
/>
<span className="text-[10px] text-editor-text-muted min-w-[52px] text-right">
{matchIndices.length === 0 ? '0/0' : `${safeActiveMatchIdx + 1}/${matchIndices.length}`}
</span>
<button
onClick={() => jumpToMatch(safeActiveMatchIdx - 1)}
className="p-0.5 rounded hover:bg-editor-bg text-editor-text-muted hover:text-editor-text"
title="Previous match (Shift+Enter)"
>
<ChevronUp className="w-3 h-3" />
</button>
<button
onClick={() => jumpToMatch(safeActiveMatchIdx + 1)}
className="p-0.5 rounded hover:bg-editor-bg text-editor-text-muted hover:text-editor-text"
title="Next match (Enter)"
>
<ChevronDown className="w-3 h-3" />
</button>
<button
onClick={() => setSearchOpen(false)}
className="p-0.5 rounded hover:bg-editor-bg text-editor-text-muted hover:text-editor-text"
title="Close search (Esc)"
>
<X className="w-3 h-3" />
</button>
</div>
)}
</div>
{selectedWordIndices.length > 0 && (
<div className="flex items-center gap-1">
<button
onClick={cutSelectedWords}
disabled={!canEdit}
className="flex items-center gap-1 px-2 py-1 text-xs bg-red-500/20 text-red-300 rounded hover:bg-red-500/30 transition-colors disabled:opacity-40"
title="Remove this word range from the output"
>
<Scissors className="w-3 h-3" />
Cut
</button>
<button
onClick={muteSelectedWords}
disabled={!canEdit}
className="flex items-center gap-1 px-2 py-1 text-xs bg-blue-500/20 text-blue-300 rounded hover:bg-blue-500/30 transition-colors disabled:opacity-40"
title="Silence audio for this word range"
>
<VolumeX className="w-3 h-3" />
Mute
</button>
<button
onClick={gainSelectedWords}
disabled={!canEdit}
className="flex items-center gap-1 px-2 py-1 text-xs bg-amber-500/20 text-amber-300 rounded hover:bg-amber-500/30 transition-colors disabled:opacity-40"
title="Adjust volume for this word range — positive boosts, negative reduces"
>
<SlidersHorizontal className="w-3 h-3" />
Gain ({gainModeDb > 0 ? '+' : ''}{gainModeDb.toFixed(1)} dB)
</button>
<button
onClick={speedSelectedWords}
disabled={!canEdit}
className="flex items-center gap-1 px-2 py-1 text-xs bg-emerald-500/20 text-emerald-300 rounded hover:bg-emerald-500/30 transition-colors disabled:opacity-40"
title="Change playback speed for this word range — lower is slower, higher is faster"
>
<Gauge className="w-3 h-3" />
Speed {speedModeValue.toFixed(2)}x
</button>
<button
onClick={handleReTranscribe}
disabled={isReTranscribing || !canEdit}
className="flex items-center gap-1 px-2 py-1 text-xs bg-purple-500/20 text-purple-300 rounded hover:bg-purple-500/30 disabled:opacity-40 transition-colors"
title="Re-run Whisper transcription on this segment"
>
<RefreshCw className={`w-3 h-3 ${isReTranscribing ? 'animate-spin' : ''}`} />
{isReTranscribing ? 'Re-transcribing...' : 'Re-transcribe'}
</button>
</div>
)}
</div>
<div
className="flex-1 min-h-0 select-none"
onMouseUp={handleMouseUp}
onClick={handleClickOutside}
>
<Virtuoso
ref={virtuosoRef}
totalCount={segments.length}
itemContent={renderSegment}
overscan={200}
className="h-full"
style={{ height: '100%' }}
/>
</div>
</div>
);
}

View File

@@ -0,0 +1,133 @@
import { useRef, useCallback, useState, useEffect } from 'react';
import { useEditorStore } from '../store/editorStore';
import { useVideoSync } from '../hooks/useVideoSync';
import { Play, Pause, SkipBack, SkipForward, Volume2 } from 'lucide-react';
export default function VideoPlayer() {
const videoRef = useRef<HTMLVideoElement>(null);
const videoUrl = useEditorStore((s) => s.videoUrl);
const isPlaying = useEditorStore((s) => s.isPlaying);
const duration = useEditorStore((s) => s.duration);
const { seekTo, togglePlay } = useVideoSync(videoRef);
const [displayTime, setDisplayTime] = useState(0);
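// Track playback time with requestAnimationFrame so the progress bar updates every frame.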
useEffect(() => {
const video = videoRef.current;
if (!video) return;
let raf = 0;
const tick = () => {
setDisplayTime(video.currentTime);
raf = requestAnimationFrame(tick);
};
raf = requestAnimationFrame(tick);
return () => cancelAnimationFrame(raf);
}, [videoUrl]);
const formatTime = (seconds: number) => {
const m = Math.floor(seconds / 60);
const s = Math.floor(seconds % 60);
return `${m}:${s.toString().padStart(2, '0')}`;
};
const handleProgressClick = useCallback(
(e: React.MouseEvent<HTMLDivElement>) => {
const rect = e.currentTarget.getBoundingClientRect();
const ratio = (e.clientX - rect.left) / rect.width;
seekTo(ratio * duration);
},
[seekTo, duration],
);
const skip = useCallback(
(delta: number) => {
const video = videoRef.current;
if (!video) return;
seekTo(Math.max(0, Math.min(duration, video.currentTime + delta)));
},
[seekTo, duration],
);
if (!videoUrl) {
return (
<div className="w-full h-full flex items-center justify-center text-editor-text-muted text-sm">
No video loaded
</div>
);
}
return (
<div className="w-full h-full flex flex-col">
<div className="flex-1 flex items-center justify-center bg-black rounded-lg overflow-hidden min-h-0">
<video
ref={videoRef}
src={videoUrl}
className="max-w-full max-h-full object-contain"
playsInline
onClick={togglePlay}
/>
</div>
<div className="pt-2 space-y-1.5 shrink-0">
<div
className="h-1.5 bg-editor-border rounded-full cursor-pointer group"
onClick={handleProgressClick}
>
<div
className="h-full bg-editor-accent rounded-full relative transition-all group-hover:h-2"
style={{ width: duration > 0 ? `${(displayTime / duration) * 100}%` : '0%' }}
>
<div className="absolute right-0 top-1/2 -translate-y-1/2 w-3 h-3 bg-white rounded-full opacity-0 group-hover:opacity-100 transition-opacity" />
</div>
</div>
<div className="flex items-center justify-between">
<div className="flex items-center gap-1">
<ControlButton onClick={() => skip(-5)} title="Back 5s">
<SkipBack className="w-4 h-4" />
</ControlButton>
<ControlButton onClick={togglePlay} title={isPlaying ? 'Pause' : 'Play'} primary>
{isPlaying ? <Pause className="w-5 h-5" /> : <Play className="w-5 h-5 ml-0.5" />}
</ControlButton>
<ControlButton onClick={() => skip(5)} title="Forward 5s">
<SkipForward className="w-4 h-4" />
</ControlButton>
</div>
<div className="flex items-center gap-3 text-xs text-editor-text-muted">
<Volume2 className="w-3.5 h-3.5" />
<span className="font-mono">
{formatTime(displayTime)} / {formatTime(duration)}
</span>
</div>
</div>
</div>
</div>
);
}
function ControlButton({
children,
onClick,
title,
primary,
}: {
children: React.ReactNode;
onClick: () => void;
title: string;
primary?: boolean;
}) {
return (
<button
onClick={onClick}
title={title}
className={`p-1.5 rounded-md transition-colors ${
primary
? 'bg-editor-accent/20 text-editor-accent hover:bg-editor-accent/30'
: 'text-editor-text-muted hover:text-editor-text hover:bg-editor-surface'
}`}
>
{children}
</button>
);
}

File diff suppressed because it is too large.

View File

@@ -0,0 +1,459 @@
import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
import { useEditorStore } from '../store/editorStore';
import { Trash2, Scissors, Volume2, SlidersHorizontal, Gauge, Play } from 'lucide-react';
function formatTimelineLikeTime(secs: number): string {
const m = Math.floor(secs / 60);
const s = secs % 60;
if (m > 0) return `${m}:${String(Math.floor(s)).padStart(2, '0')}.${Math.floor((s % 1) * 10)}`;
return `${s.toFixed(1)}s`;
}
export default function ZoneEditor() {
const [viewMode, setViewMode] = useState<'all' | 'cut' | 'mute' | 'gain' | 'speed'>('all');
const [focusedZone, setFocusedZone] = useState<{ type: 'cut' | 'mute' | 'gain' | 'speed'; id: string } | null>(null);
const previewFrameRef = useRef<number | null>(null);
const {
cutRanges,
muteRanges,
gainRanges,
speedRanges,
duration,
setCurrentTime,
zonePreviewPaddingSeconds,
setZonePreviewPaddingSeconds,
globalGainDb,
setGlobalGainDb,
removeCutRange,
removeMuteRange,
removeGainRange,
removeSpeedRange,
updateGainRange,
updateSpeedRange,
} = useEditorStore();
const stopPreviewLoop = useCallback(() => {
if (previewFrameRef.current !== null) {
cancelAnimationFrame(previewFrameRef.current);
previewFrameRef.current = null;
}
}, []);
useEffect(() => stopPreviewLoop, [stopPreviewLoop]);
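// Seek just before the zone, play through it, and pause shortly after it using a requestAnimationFrame watcher.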
const previewZone = useCallback((start: number, end: number) => {
const video = document.querySelector('video');
if (!(video instanceof HTMLVideoElement)) return;
stopPreviewLoop();
const previewStart = Math.max(0, start - zonePreviewPaddingSeconds);
const maxDuration = Number.isFinite(duration) && duration > 0 ? duration : video.duration;
const previewEnd = Math.min(maxDuration || end + zonePreviewPaddingSeconds, end + zonePreviewPaddingSeconds);
video.currentTime = previewStart;
setCurrentTime(previewStart);
const tick = () => {
if (video.paused || video.ended) {
previewFrameRef.current = null;
return;
}
if (video.currentTime >= previewEnd) {
video.pause();
video.currentTime = previewEnd;
setCurrentTime(previewEnd);
previewFrameRef.current = null;
return;
}
previewFrameRef.current = requestAnimationFrame(tick);
};
void video.play();
previewFrameRef.current = requestAnimationFrame(tick);
}, [duration, setCurrentTime, stopPreviewLoop, zonePreviewPaddingSeconds]);
const renderPreviewButton = (start: number, end: number, accentClass: string) => (
<button
onClick={(e) => {
e.stopPropagation();
previewZone(start, end);
}}
className={`p-1 rounded opacity-0 group-hover:opacity-100 transition-opacity ${accentClass}`}
title={`Play ${zonePreviewPaddingSeconds.toFixed(2)}s before and after zone`}
>
<Play className="w-3.5 h-3.5" />
</button>
);
const totalZones = cutRanges.length + muteRanges.length + gainRanges.length + speedRanges.length;
const getZoneTypeColor = (type: 'cut' | 'mute' | 'gain' | 'speed') => {
switch (type) {
case 'cut':
return 'border-red-500/40 bg-red-500/5';
case 'mute':
return 'border-blue-500/40 bg-blue-500/20';
case 'gain':
return 'border-amber-500/40 bg-amber-500/5';
case 'speed':
return 'border-emerald-500/40 bg-emerald-500/5';
}
};
const activeFocusedZone = useMemo(() => {
if (!focusedZone) return null;
const exists = focusedZone.type === 'cut'
? cutRanges.some((range) => range.id === focusedZone.id)
: focusedZone.type === 'mute'
? muteRanges.some((range) => range.id === focusedZone.id)
: focusedZone.type === 'gain'
? gainRanges.some((range) => range.id === focusedZone.id)
: speedRanges.some((range) => range.id === focusedZone.id);
return exists ? focusedZone : null;
}, [cutRanges, focusedZone, gainRanges, muteRanges, speedRanges]);
const isZoneFocused = useCallback(
(type: 'cut' | 'mute' | 'gain' | 'speed', id: string) => activeFocusedZone?.type === type && activeFocusedZone.id === id,
[activeFocusedZone],
);
const removeZone = useCallback((type: 'cut' | 'mute' | 'gain' | 'speed', id: string) => {
if (!window.confirm("Delete this zone?")) return;
if (type === 'cut') removeCutRange(id);
else if (type === 'mute') removeMuteRange(id);
else if (type === 'gain') removeGainRange(id);
else removeSpeedRange(id);
setFocusedZone((current) => (current?.type === type && current.id === id ? null : current));
}, [removeCutRange, removeGainRange, removeMuteRange, removeSpeedRange]);
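// Keyboard handling for the focused zone: Delete/Backspace removes it, Escape clears focus; registered in the capture phase so it runs before other shortcut handlers, and events from form fields are ignored.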
useEffect(() => {
const handleKeyDown = (e: KeyboardEvent) => {
const target = e.target as HTMLElement | null;
if (target && (target.tagName === 'INPUT' || target.tagName === 'TEXTAREA' || target.tagName === 'SELECT')) {
return;
}
if (e.key === 'Escape') {
setFocusedZone(null);
return;
}
if ((e.key === 'Delete' || e.key === 'Backspace') && activeFocusedZone) {
e.preventDefault();
removeZone(activeFocusedZone.type, activeFocusedZone.id);
}
};
window.addEventListener('keydown', handleKeyDown, { capture: true });
return () => window.removeEventListener('keydown', handleKeyDown, { capture: true });
}, [activeFocusedZone, removeZone]);
return (
<div className="p-4 space-y-4">
<div className="space-y-2">
<div className="space-y-1">
<div className="flex items-start justify-between gap-3">
<div>
<h3 className="text-sm font-semibold flex items-center gap-2">
Zone Editor
</h3>
<p className="text-xs text-editor-text-muted">
Manage all timeline zones ({totalZones} total)
</p>
</div>
<div className="min-w-[160px] rounded border border-editor-border bg-editor-surface px-2 py-1.5">
<div className="flex items-center justify-between gap-2">
<span className="text-[10px] uppercase tracking-wide text-editor-text-muted">Preview</span>
<span className="text-[10px] text-editor-text-muted">before/after</span>
</div>
<div className="mt-1 flex items-center gap-1.5">
<input
type="number"
min={0}
max={10}
step={0.25}
value={zonePreviewPaddingSeconds}
onChange={(e) => setZonePreviewPaddingSeconds(Number(e.target.value) || 0)}
className="w-16 px-2 py-1 bg-editor-bg border border-editor-border rounded text-xs text-editor-text focus:outline-none focus:border-editor-accent"
title="Preview time before and after each zone"
/>
<span className="text-xs text-editor-text-muted">sec</span>
</div>
</div>
</div>
</div>
{/* View Mode Toggle */}
<div className="flex items-center gap-1 rounded bg-editor-surface border border-editor-border p-1">
<button
onClick={() => setViewMode('all')}
className={`px-2 py-1 text-xs rounded transition-colors ${
viewMode === 'all'
? 'bg-editor-accent text-white'
: 'text-editor-text-muted hover:text-editor-text'
}`}
title="Show all zones"
>
All
</button>
<button
onClick={() => setViewMode('cut')}
className={`px-2 py-1 text-xs rounded transition-colors ${
viewMode === 'cut'
? 'bg-red-500/30 text-red-500'
: 'text-editor-text-muted hover:text-editor-text'
}`}
title="Show only Cut zones"
>
Cut
</button>
<button
onClick={() => setViewMode('mute')}
className={`px-2 py-1 text-xs rounded transition-colors ${
viewMode === 'mute'
? 'bg-blue-500/20 text-blue-400'
: 'text-editor-text-muted hover:text-editor-text'
}`}
title="Show only Mute zones"
>
Mute
</button>
<button
onClick={() => setViewMode('gain')}
className={`px-2 py-1 text-xs rounded transition-colors ${
viewMode === 'gain'
? 'bg-amber-500/30 text-amber-500'
: 'text-editor-text-muted hover:text-editor-text'
}`}
title="Show only Gain zones"
>
Gain
</button>
<button
onClick={() => setViewMode('speed')}
className={`px-2 py-1 text-xs rounded transition-colors ${
viewMode === 'speed'
? 'bg-emerald-500/30 text-emerald-500'
: 'text-editor-text-muted hover:text-editor-text'
}`}
title="Show only Speed zones"
>
Speed
</button>
</div>
</div>
{totalZones === 0 ? (
<div className="p-4 rounded-lg border border-dashed border-editor-border text-center">
<p className="text-xs text-editor-text-muted">
No zones yet. Create zones from the toolbar or by highlighting words.
</p>
</div>
) : (
<div className="space-y-3">
{/* Cut Zones */}
{(viewMode === 'all' || viewMode === 'cut') && cutRanges.length > 0 && (
<div className="space-y-2">
<div className="text-xs font-semibold text-red-500/80 flex items-center gap-2">
<Scissors className="w-3.5 h-3.5" />
Cut Zones ({cutRanges.length})
</div>
<div className="space-y-1">
{cutRanges.map((range) => (
<div
key={range.id}
onClick={() => setFocusedZone({ type: 'cut', id: range.id })}
className={`px-2 py-1.5 rounded-lg border text-xs flex items-center gap-2 group cursor-pointer transition-colors ${getZoneTypeColor('cut')} ${isZoneFocused('cut', range.id) ? 'ring-1 ring-red-400 border-red-400/80 bg-red-500/12' : ''}`}
>
<div className="flex-1 min-w-0">
<div className="font-medium truncate">
{formatTimelineLikeTime(range.start)} - {formatTimelineLikeTime(range.end)}
</div>
</div>
{renderPreviewButton(range.start, range.end, 'hover:bg-red-500/20 text-red-500/70 hover:text-red-500')}
<button
onClick={(e) => {
e.stopPropagation();
removeZone('cut', range.id);
}}
className="p-1 rounded hover:bg-red-500/20 text-red-500/70 hover:text-red-500 opacity-0 group-hover:opacity-100 transition-opacity"
title="Delete cut zone"
>
<Trash2 className="w-3.5 h-3.5" />
</button>
</div>
))}
</div>
</div>
)}
{/* Mute Zones */}
{(viewMode === 'all' || viewMode === 'mute') && muteRanges.length > 0 && (
<div className="space-y-2">
<div className="text-xs font-semibold text-blue-400 flex items-center gap-2">
<Volume2 className="w-3.5 h-3.5" />
Mute Zones ({muteRanges.length})
</div>
<div className="space-y-1">
{muteRanges.map((range) => (
<div
key={range.id}
onClick={() => setFocusedZone({ type: 'mute', id: range.id })}
className={`px-2 py-1.5 rounded-lg border text-xs flex items-center gap-2 group cursor-pointer transition-colors ${getZoneTypeColor('mute')} ${isZoneFocused('mute', range.id) ? 'ring-1 ring-blue-400 border-blue-400/80 bg-blue-500/20' : ''}`}
>
<div className="flex-1 min-w-0">
<div className="font-medium truncate">
{formatTimelineLikeTime(range.start)} - {formatTimelineLikeTime(range.end)}
</div>
</div>
{renderPreviewButton(range.start, range.end, 'hover:bg-blue-500/20 text-blue-400 hover:text-blue-400')}
<button
onClick={(e) => {
e.stopPropagation();
removeZone('mute', range.id);
}}
className="p-1 rounded hover:bg-blue-500/20 text-blue-400 hover:text-blue-400 opacity-0 group-hover:opacity-100 transition-opacity"
title="Delete mute zone"
>
<Trash2 className="w-3.5 h-3.5" />
</button>
</div>
))}
</div>
</div>
)}
{/* Sound Gain */}
{(viewMode === 'all' || viewMode === 'gain') && gainRanges.length > 0 && (
<div className="space-y-2">
<div className="text-xs font-semibold text-amber-500/80 flex items-center gap-2">
<SlidersHorizontal className="w-3.5 h-3.5" />
Sound Gain ({gainRanges.length})
</div>
{/* Global Gain Slider */}
<div className="px-2 py-2 rounded border border-amber-500/20 bg-amber-500/5 space-y-2">
<label className="text-xs text-editor-text-muted font-medium">Global Gain</label>
<div className="flex items-center gap-2">
<input
type="range"
min={-24}
max={24}
step={0.5}
value={globalGainDb}
onChange={(e) => setGlobalGainDb(Number(e.target.value))}
className="flex-1 h-1.5"
/>
<input
type="number"
min={-24}
max={24}
step={0.5}
value={globalGainDb}
onChange={(e) => setGlobalGainDb(Math.max(-24, Math.min(24, Number(e.target.value) || 0)))}
className="w-14 px-1.5 py-0.5 text-xs bg-editor-surface border border-editor-border rounded focus:border-editor-accent focus:outline-none"
title="Volume adjustment in decibels — +6 dB doubles volume, -6 dB halves it"
/>
<span className="text-xs text-amber-500/80 font-medium w-6 text-right">dB</span>
</div>
</div>
<div className="space-y-1">
{gainRanges.map((range) => (
<div
key={range.id}
onClick={() => setFocusedZone({ type: 'gain', id: range.id })}
className={`px-2 py-1.5 rounded-lg border text-xs flex items-center gap-2 group cursor-pointer transition-colors ${getZoneTypeColor('gain')} ${isZoneFocused('gain', range.id) ? 'ring-1 ring-amber-400 border-amber-400/80 bg-amber-500/12' : ''}`}
>
<div className="flex-1 min-w-0">
<div className="font-medium truncate">
{formatTimelineLikeTime(range.start)} - {formatTimelineLikeTime(range.end)}
</div>
<div className="text-editor-text-muted text-[10px]">
{range.gainDb > 0 ? '+' : ''}{range.gainDb.toFixed(1)} dB
</div>
</div>
<input
type="number"
min={-24}
max={24}
step={0.5}
value={range.gainDb}
onClick={(e) => e.stopPropagation()}
onChange={(e) => updateGainRange(range.id, Number(e.target.value) || 0)}
className="w-16 px-1.5 py-0.5 text-xs bg-editor-surface border border-editor-border rounded focus:border-editor-accent focus:outline-none"
title="Volume adjustment in decibels — +6 dB doubles volume, -6 dB halves it"
/>
{renderPreviewButton(range.start, range.end, 'hover:bg-amber-500/20 text-amber-500/70 hover:text-amber-500')}
<button
onClick={(e) => {
e.stopPropagation();
removeZone('gain', range.id);
}}
className="p-1 rounded hover:bg-amber-500/20 text-amber-500/70 hover:text-amber-500 opacity-0 group-hover:opacity-100 transition-opacity"
title="Delete gain zone"
>
<Trash2 className="w-3.5 h-3.5" />
</button>
</div>
))}
</div>
</div>
)}
{/* Speed Adjust */}
{(viewMode === 'all' || viewMode === 'speed') && speedRanges.length > 0 && (
<div className="space-y-2">
<div className="text-xs font-semibold text-emerald-500/80 flex items-center gap-2">
<Gauge className="w-3.5 h-3.5" />
Speed Adjust ({speedRanges.length})
</div>
<div className="space-y-1">
{speedRanges.map((range) => (
<div
key={range.id}
onClick={() => setFocusedZone({ type: 'speed', id: range.id })}
className={`px-2 py-1.5 rounded-lg border text-xs flex items-center gap-2 group cursor-pointer transition-colors ${getZoneTypeColor('speed')} ${isZoneFocused('speed', range.id) ? 'ring-1 ring-emerald-400 border-emerald-400/80 bg-emerald-500/12' : ''}`}
>
<div className="flex-1 min-w-0">
<div className="font-medium truncate">
{formatTimelineLikeTime(range.start)} - {formatTimelineLikeTime(range.end)}
</div>
<div className="text-editor-text-muted text-[10px]">
{range.speed.toFixed(2)}x
</div>
</div>
<input
type="number"
min={0.25}
max={4}
step={0.05}
value={range.speed}
onClick={(e) => e.stopPropagation()}
onChange={(e) => updateSpeedRange(range.id, Number(e.target.value) || 1)}
className="w-16 px-1.5 py-0.5 text-xs bg-editor-surface border border-editor-border rounded focus:border-editor-accent focus:outline-none"
title="Playback speed multiplier — 1.0x is normal, 2.0x is twice as fast"
/>
{renderPreviewButton(range.start, range.end, 'hover:bg-emerald-500/20 text-emerald-500/70 hover:text-emerald-500')}
<button
onClick={(e) => {
e.stopPropagation();
removeZone('speed', range.id);
}}
className="p-1 rounded hover:bg-emerald-500/20 text-emerald-500/70 hover:text-emerald-500 opacity-0 group-hover:opacity-100 transition-opacity"
title="Delete speed zone"
>
<Trash2 className="w-3.5 h-3.5" />
</button>
</div>
))}
</div>
</div>
)}
</div>
)}
</div>
);
}

View File

@@ -0,0 +1,212 @@
import { useEffect, useRef } from 'react';
import { useEditorStore } from '../store/editorStore';
import { loadBindings, DEFAULT_PRESETS } from '../lib/keybindings';
import type { KeyBinding } from '../types/project';
export function useKeyboardShortcuts() {
const addCutRange = useEditorStore((s) => s.addCutRange);
const markInTime = useEditorStore((s) => s.markInTime);
const markOutTime = useEditorStore((s) => s.markOutTime);
const setMarkInTime = useEditorStore((s) => s.setMarkInTime);
const setMarkOutTime = useEditorStore((s) => s.setMarkOutTime);
const clearMarkRange = useEditorStore((s) => s.clearMarkRange);
const selectedWordIndices = useEditorStore((s) => s.selectedWordIndices);
const words = useEditorStore((s) => s.words);
const playbackRateRef = useRef(1);
// Read bindings fresh from localStorage on every call to avoid stale closures
const getBindings = (): KeyBinding[] => {
try { return loadBindings(); } catch { return []; }
};
useEffect(() => {
const getVideo = (): HTMLVideoElement | null => document.querySelector('video');
const handler = (e: KeyboardEvent) => {
const target = e.target as HTMLElement;
if (target.tagName === 'INPUT' || target.tagName === 'TEXTAREA' || target.tagName === 'SELECT') return;
const video = getVideo();
// Build a key string from the event for matching
const parts: string[] = [];
if (e.ctrlKey || e.metaKey) parts.push('Ctrl');
if (e.shiftKey && !['Shift'].includes(e.key)) parts.push('Shift');
if (e.altKey) parts.push('Alt');
const keyStr = e.key === ' ' ? 'Space' : e.key.length === 1 ? e.key.toUpperCase() : e.key;
parts.push(keyStr);
const combo = parts.join('+');
// Look up binding — fresh read every keystroke so Settings changes take effect immediately
const currentBindings = getBindings();
const binding = currentBindings.find((b) => b.keys === combo);
if (!binding) return; // Unbound key — ignore
e.preventDefault();
switch (binding.id) {
case 'undo':
useEditorStore.temporal.getState().undo();
return;
case 'redo':
useEditorStore.temporal.getState().redo();
return;
case 'cut': {
if (selectedWordIndices.length > 0) {
const sorted = [...selectedWordIndices].sort((a, b) => a - b);
addCutRange(words[sorted[0]].start, words[sorted[sorted.length - 1]].end);
return;
}
if (markInTime !== null && markOutTime !== null) {
const start = Math.min(markInTime, markOutTime);
const end = Math.max(markInTime, markOutTime);
if (end - start >= 0.01) addCutRange(start, end);
clearMarkRange();
}
return;
}
case 'play-pause':
if (video) { if (video.paused) video.play(); else video.pause(); }
return;
case 'slow-down': {
if (video) {
playbackRateRef.current = Math.max(-2, playbackRateRef.current - 0.5);
if (playbackRateRef.current < 0) video.currentTime = Math.max(0, video.currentTime - 2);
else { video.playbackRate = playbackRateRef.current; if (video.paused) video.play(); }
}
return;
}
case 'pause':
if (video) { video.pause(); playbackRateRef.current = 1; }
return;
case 'speed-up': {
if (video) {
playbackRateRef.current = Math.min(4, playbackRateRef.current + 0.5);
video.playbackRate = Math.max(0.25, playbackRateRef.current);
if (video.paused) video.play();
}
return;
}
case 'rewind':
if (video) video.currentTime = Math.max(0, video.currentTime - 5);
return;
case 'forward':
if (video) video.currentTime = Math.min(video.duration, video.currentTime + 5);
return;
case 'mark-in':
if (video) setMarkInTime(video.currentTime);
return;
case 'mark-out':
if (video) setMarkOutTime(video.currentTime);
return;
case 'save': {
const saveBtn = document.querySelector('[title="Save"]') as HTMLButtonElement | null;
if (saveBtn) saveBtn.click();
else saveProject();
return;
}
case 'export': {
const exportBtn = document.querySelector('[title="Export"]') as HTMLButtonElement | null;
if (exportBtn) exportBtn.click();
return;
}
case 'search': {
const findBtn = document.querySelector('[title="Find (Ctrl+F)"]') as HTMLButtonElement | null;
if (findBtn) findBtn.click();
return;
}
case 'help':
toggleCheatsheet(currentBindings);
return;
default:
break;
}
};
window.addEventListener('keydown', handler);
return () => window.removeEventListener('keydown', handler);
}, [addCutRange, markInTime, markOutTime, setMarkInTime, setMarkOutTime, clearMarkRange, selectedWordIndices, words]);
}
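// Fallback used by the 'save' shortcut when no toolbar Save button is found: serialize the project and write it via the desktop bridge, or download it as a file in the browser.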
async function saveProject() {
const state = useEditorStore.getState();
if (!state.videoPath || state.words.length === 0) return;
try {
const projectData = state.saveProject();
let outputPath = state.projectFilePath;
if (!outputPath) {
outputPath = await window.electronAPI?.saveFile({
defaultPath: state.videoPath.replace(/\.[^.]+$/, '.aive'),
filters: [{ name: 'TalkEdit Project', extensions: ['aive'] }],
});
}
if (!outputPath) return;
const resolvedPath = outputPath.endsWith('.aive') ? outputPath : `${outputPath}.aive`;
if (window.electronAPI?.writeFile) {
await window.electronAPI.writeFile(resolvedPath, JSON.stringify(projectData, null, 2));
useEditorStore.getState().setProjectFilePath(resolvedPath);
} else {
const blob = new Blob([JSON.stringify(projectData, null, 2)], { type: 'application/json' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = resolvedPath.split(/[\\/]/).pop() || 'project.aive';
a.click();
URL.revokeObjectURL(url);
useEditorStore.getState().setProjectFilePath(resolvedPath);
}
} catch (err) {
console.error('Failed to save project:', err);
}
}
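// Render the shortcut cheatsheet as a plain DOM overlay outside React; it closes on backdrop click, the close button, or Escape.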
function toggleCheatsheet(bindings: KeyBinding[]) {
const existing = document.getElementById('keyboard-cheatsheet');
if (existing) {
existing.remove();
return;
}
const overlay = document.createElement('div');
overlay.id = 'keyboard-cheatsheet';
overlay.style.cssText =
'position:fixed;inset:0;z-index:9999;display:flex;align-items:center;justify-content:center;background:rgba(0,0,0,0.7);';
overlay.onclick = () => {
overlay.remove();
};
const presetName = JSON.stringify(bindings) === JSON.stringify(DEFAULT_PRESETS['left-hand']) ? 'Left-Hand Preset' : 'Standard Preset';
const rows = bindings
.map(
(b) =>
`<tr><td style="padding:6px 16px 6px 0;font-family:monospace;color:#818cf8;font-weight:600;white-space:nowrap">${b.keys}</td><td style="padding:6px 0;color:#e2e8f0">${b.label}</td><td style="padding:6px 0 6px 12px;font-size:10px;color:#94a3b8">${b.category}</td></tr>`,
)
.join('');
overlay.innerHTML = `<div style="background:#1a1d27;border:1px solid #2a2d3a;border-radius:12px;padding:24px 32px;max-width:450px;position:relative;" onclick="event.stopPropagation()">
<div style="font-size:11px;color:#94a3b8;margin-bottom:12px">Active preset: <span style="color:#818cf8;font-weight:500">${presetName}</span></div>
<h3 style="margin:0 0 16px;font-size:14px;font-weight:600;color:#e2e8f0">Keyboard Shortcuts</h3>
<table style="font-size:13px">${rows}</table>
<p style="margin:16px 0 0;font-size:11px;color:#94a3b8;text-align:center">Customize in Settings &bull; Press ? to close</p>
<button id="cheatsheet-close" style="position:absolute;top:12px;right:16px;background:none;border:none;color:#94a3b8;font-size:18px;cursor:pointer;line-height:1;padding:4px;">&times;</button>
</div>`;
document.body.appendChild(overlay);
const closeBtn = overlay.querySelector('#cheatsheet-close') as HTMLButtonElement;
if (closeBtn) closeBtn.onclick = () => overlay.remove();
const escHandler = (e: KeyboardEvent) => {
if (e.key === 'Escape') {
overlay.remove();
document.removeEventListener('keydown', escHandler);
}
};
document.addEventListener('keydown', escHandler);
}

View File

@@ -0,0 +1,164 @@
import { useCallback, useRef, useEffect } from 'react';
import { useEditorStore } from '../store/editorStore';
export function useVideoSync(videoRef: React.RefObject<HTMLVideoElement | null>) {
const rafRef = useRef<number>(0);
const {
setCurrentTime,
setDuration,
setIsPlaying,
cutRanges,
muteRanges,
speedRanges,
} = useEditorStore();
const getPlaybackRateAtTime = useCallback(
(time: number) => {
for (const range of speedRanges) {
if (time >= range.start && time < range.end) {
return range.speed;
}
}
return 1;
},
[speedRanges],
);
const applyVideoEffects = useCallback(
(video: HTMLVideoElement) => {
let t = video.currentTime;
const allSkipRanges = [...cutRanges];
let skipCount = 0;
const maxSkips = 10;
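// Jump past any cut range containing the current time; repeat so chained or overlapping ranges are skipped too, capped at maxSkips to avoid an infinite loop.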
while (skipCount < maxSkips) {
let shouldSkip = false;
for (const range of allSkipRanges) {
if (t >= range.start && t < range.end) {
t = range.end;
shouldSkip = true;
skipCount++;
break;
}
}
if (!shouldSkip) break;
}
if (skipCount > 0 && video.currentTime !== t) {
video.currentTime = t;
}
let shouldMute = false;
for (const range of muteRanges) {
if (t >= range.start && t < range.end) {
shouldMute = true;
break;
}
}
video.muted = shouldMute;
const playbackRate = getPlaybackRateAtTime(t);
if (video.playbackRate !== playbackRate) {
video.playbackRate = playbackRate;
}
setCurrentTime(t);
return t;
},
[cutRanges, muteRanges, getPlaybackRateAtTime, setCurrentTime],
);
const seekTo = useCallback(
(time: number) => {
if (videoRef.current) {
let targetTime = time;
// If seeking into cut or deleted ranges, skip to the end (handle overlapping/chained ranges)
const allSkipRanges = [...cutRanges];
let skipCount = 0;
const maxSkips = 10; // Prevent infinite loops
while (skipCount < maxSkips) {
let shouldSkip = false;
for (const range of allSkipRanges) {
if (targetTime >= range.start && targetTime < range.end) {
targetTime = range.end;
shouldSkip = true;
skipCount++;
break;
}
}
if (!shouldSkip) break;
}
videoRef.current.currentTime = targetTime;
videoRef.current.playbackRate = getPlaybackRateAtTime(targetTime);
setCurrentTime(targetTime);
}
},
[videoRef, cutRanges, getPlaybackRateAtTime, setCurrentTime],
);
const togglePlay = useCallback(() => {
if (!videoRef.current) return;
if (videoRef.current.paused) {
      // play() returns a promise; ignore rejections from rapid play/pause or autoplay policies
      videoRef.current.play().catch(() => {});
} else {
videoRef.current.pause();
}
}, [videoRef]);
useEffect(() => {
const video = videoRef.current;
if (!video) return;
const updateWhilePlaying = () => {
applyVideoEffects(video);
if (!video.paused && !video.ended) {
rafRef.current = requestAnimationFrame(updateWhilePlaying);
}
};
const onTimeUpdate = () => {
cancelAnimationFrame(rafRef.current);
rafRef.current = requestAnimationFrame(() => {
applyVideoEffects(video);
});
};
const onPlay = () => {
setIsPlaying(true);
cancelAnimationFrame(rafRef.current);
rafRef.current = requestAnimationFrame(updateWhilePlaying);
};
const onPause = () => {
setIsPlaying(false);
cancelAnimationFrame(rafRef.current);
applyVideoEffects(video);
};
const onLoadedMetadata = () => {
setDuration(video.duration);
applyVideoEffects(video);
};
const onSeeked = () => applyVideoEffects(video);
video.addEventListener('timeupdate', onTimeUpdate);
video.addEventListener('play', onPlay);
video.addEventListener('pause', onPause);
video.addEventListener('loadedmetadata', onLoadedMetadata);
video.addEventListener('seeked', onSeeked);
return () => {
video.removeEventListener('timeupdate', onTimeUpdate);
video.removeEventListener('play', onPlay);
video.removeEventListener('pause', onPause);
video.removeEventListener('loadedmetadata', onLoadedMetadata);
video.removeEventListener('seeked', onSeeked);
cancelAnimationFrame(rafRef.current);
video.playbackRate = 1;
};
}, [videoRef, applyVideoEffects, setIsPlaying, setDuration]);
return { seekTo, togglePlay };
}
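
A minimal consumption sketch for the hook follows; the component name, import path, and src prop are illustrative assumptions and do not come from this diff.

// Hypothetical player component; only the ref wiring and the returned helpers matter here.
import { useRef } from 'react';
import { useVideoSync } from '../hooks/useVideoSync'; // path assumed; the hook's file name is not shown above

export function PreviewPlayer({ src }: { src: string }) {
  const videoRef = useRef<HTMLVideoElement | null>(null);
  const { seekTo, togglePlay } = useVideoSync(videoRef);
  return (
    <div>
      {/* Cut, mute, and speed ranges are applied automatically from the editor store */}
      <video ref={videoRef} src={src} onClick={togglePlay} />
      <button onClick={() => seekTo(0)}>Restart</button>
    </div>
  );
}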

frontend/src/index.css Normal file

@ -0,0 +1,58 @@
@tailwind base;
@tailwind components;
@tailwind utilities;
@keyframes waveBar {
0% { transform: scaleY(0.3); opacity: 0.5; }
50% { transform: scaleY(1); opacity: 1; }
100% { transform: scaleY(0.3); opacity: 0.5; }
}
@keyframes audioBounce {
0% { height: 12px; }
50% { height: var(--bar-peak); }
100% { height: 12px; }
}
.wave-bar {
animation: waveBar 0.9s ease-in-out infinite;
transform-origin: bottom;
}
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: 'Inter', system-ui, -apple-system, sans-serif;
overflow: hidden;
user-select: none;
}
::-webkit-scrollbar {
width: 6px;
height: 6px;
}
::-webkit-scrollbar-track {
background: transparent;
}
::-webkit-scrollbar-thumb {
background: #2a2d3a;
border-radius: 3px;
}
::-webkit-scrollbar-thumb:hover {
background: #3a3d4a;
}
video::-webkit-media-controls {
display: none !important;
}
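
Note that the audioBounce keyframes expect a --bar-peak custom property to be supplied per element (for example as an inline style); only the wave-bar class is actually wired up in this stylesheet. A minimal sketch of consuming the wave-bar animation from a component follows; the bar count, color, and stagger delays are illustrative assumptions.

// Hypothetical loading indicator built on the .wave-bar animation defined above.
export function WaveLoader() {
  return (
    <div style={{ display: 'flex', alignItems: 'flex-end', gap: 4, height: 24 }}>
      {[0, 1, 2, 3, 4].map((i) => (
        <span
          key={i}
          className="wave-bar"
          style={{
            width: 3,
            height: 24,
            background: '#818cf8',
            animationDelay: `${i * 0.12}s`, // stagger so the bars do not pulse in unison
          }}
        />
      ))}
    </div>
  );
}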


@ -0,0 +1,26 @@
import { describe, expect, test } from 'vitest';
import { assert } from './assert';
describe('assert', () => {
test('does not throw for true condition', () => {
expect(() => assert(true, 'should not throw')).not.toThrow();
});
test('throws in dev mode for false condition', () => {
expect(() => assert(false, 'should throw')).toThrow('Assertion failed: should throw');
});
  test('includes message in error', () => {
    // Assert through toThrow so the test fails if assert does not throw at all
    expect(() => assert(false, 'custom message here')).toThrow('custom message here');
  });
test('does not throw for truthy values', () => {
expect(() => assert(1 === 1, 'math works')).not.toThrow();
expect(() => assert('hello' === 'hello', 'strings work')).not.toThrow();
});
});
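
The assert helper itself is not shown in this part of the diff; the sketch below is only a plausible shape inferred from these tests, not the committed implementation (in particular, how "dev mode" is detected is not visible here).

// Hypothetical ./assert, consistent with the expectations exercised above.
export function assert(condition: unknown, message: string): asserts condition {
  // The real helper may additionally gate on a dev-mode flag, per the test name.
  if (!condition) {
    throw new Error(`Assertion failed: ${message}`);
  }
}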

Some files were not shown because too many files have changed in this diff.