Compare commits
84 Commits
c5cc7c2969
...
b000_bug1
| Author | SHA1 | Date | |
|---|---|---|---|
| e5c1c766b6 | |||
| 3093b41033 | |||
| a64ae78833 | |||
| b558ef8a7f | |||
| f1e6c010eb | |||
| 124f215a0a | |||
| 1993aabeac | |||
| 573ac9c9f5 | |||
| 5d52c8aec5 | |||
| 8bd1ad5b69 | |||
| 850b373d42 | |||
| 2212d7b265 | |||
| 813877a7b4 | |||
| e4484a57f9 | |||
| 10437c02ca | |||
| 4004312994 | |||
| 9a301fe2a2 | |||
| 6ac1d68887 | |||
| acf7f2e64c | |||
| a96e42c9f9 | |||
| fd6697b48e | |||
| 09ebcbc9ec | |||
| 88cd9a21d0 | |||
| 91217f6db0 | |||
| 835719a907 | |||
| 810957747b | |||
| 4d4dfa7f7c | |||
| cde635a660 | |||
| 21e4255325 | |||
| 1678d28db7 | |||
| 137dc80cde | |||
| dd4ce58920 | |||
| 5758401dda | |||
| 90b1999a57 | |||
| 0c7a4c94c2 | |||
| 168676a9e9 | |||
| 3fa67383c4 | |||
| f121d71f5f | |||
| af8e0cf6eb | |||
| 4d3d8a2218 | |||
| b7a795f986 | |||
| 7479acd3ee | |||
| 17874587a4 | |||
| 84edddded8 | |||
| 48d761c713 | |||
| 024b9bd806 | |||
| d11e26cf2d | |||
| 4f90750497 | |||
| 0df967507f | |||
| b8ec396ebd | |||
| 140b7a5319 | |||
| 1d17a8f19a | |||
| f9cd2bf579 | |||
| d80ff847d8 | |||
| 8a7c94d594 | |||
| 0237d685e5 | |||
| 585262c3e7 | |||
| d7bc6ea74d | |||
| f0568ed267 | |||
| 7c8c74d04d | |||
| addd87c45b | |||
| bb9ac53ae5 | |||
| c7445206cc | |||
| ea3f1d2b23 | |||
| 246d816f84 | |||
| 2ffc406b10 | |||
| 4a857d8cbf | |||
| 164b2f87d4 | |||
| 00ee076baa | |||
| b4bcb8f3f2 | |||
| 4230ae6cb9 | |||
| c01db38eb3 | |||
| d134e4ab27 | |||
| a864b562ae | |||
| e5c47e31b3 | |||
| 78d34133ad | |||
| 33cca5f552 | |||
| d1e1fedcae | |||
| 70c5d32413 | |||
| ce398ae1d4 | |||
| 168bf5f573 | |||
| efee0b0abe | |||
| 4dd3c7600e | |||
| 78e9df31e6 |
BIN
.diagnostics/diag_20260415_163239.tar.gz
Normal file
BIN
.diagnostics/diag_20260415_163239.tar.gz
Normal file
Binary file not shown.
16
.diagnostics/diag_20260415_163239/backend_health_check.txt
Normal file
16
.diagnostics/diag_20260415_163239/backend_health_check.txt
Normal file
@ -0,0 +1,16 @@
|
||||
# backend_health_check
|
||||
# cmd: /home/dillon/_code/TalkEdit/.venv312/bin/python3.12 -c import importlib; importlib.import_module('backend.main'); print('backend import OK')
|
||||
Traceback (most recent call last):
|
||||
File "<string>", line 1, in <module>
|
||||
File "/usr/lib/python3.12/importlib/__init__.py", line 90, in import_module
|
||||
return _bootstrap._gcd_import(name[level:], package, level)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
File "<frozen importlib._bootstrap>", line 1387, in _gcd_import
|
||||
File "<frozen importlib._bootstrap>", line 1360, in _find_and_load
|
||||
File "<frozen importlib._bootstrap>", line 1331, in _find_and_load_unlocked
|
||||
File "<frozen importlib._bootstrap>", line 935, in _load_unlocked
|
||||
File "<frozen importlib._bootstrap_external>", line 999, in exec_module
|
||||
File "<frozen importlib._bootstrap>", line 488, in _call_with_frames_removed
|
||||
File "/home/dillon/_code/TalkEdit/backend/main.py", line 12, in <module>
|
||||
from routers import transcribe, export, ai, captions, audio
|
||||
ModuleNotFoundError: No module named 'routers'
|
||||
@ -0,0 +1,3 @@
|
||||
# backend_python_version
|
||||
# cmd: /home/dillon/_code/TalkEdit/.venv312/bin/python3.12 --version
|
||||
Python 3.12.13
|
||||
3
.diagnostics/diag_20260415_163239/env_git_head.txt
Normal file
3
.diagnostics/diag_20260415_163239/env_git_head.txt
Normal file
@ -0,0 +1,3 @@
|
||||
# env_git_head
|
||||
# cmd: git -C /home/dillon/_code/TalkEdit rev-parse --short HEAD
|
||||
4f90750
|
||||
10
.diagnostics/diag_20260415_163239/env_git_status.txt
Normal file
10
.diagnostics/diag_20260415_163239/env_git_status.txt
Normal file
@ -0,0 +1,10 @@
|
||||
# env_git_status
|
||||
# cmd: git -C /home/dillon/_code/TalkEdit status --short
|
||||
M frontend/src/App.tsx
|
||||
M frontend/src/components/VolumePanel.tsx
|
||||
M frontend/src/components/WaveformTimeline.tsx
|
||||
M frontend/src/store/editorStore.ts
|
||||
?? .diagnostics/
|
||||
?? AI_dev.md
|
||||
?? docs/
|
||||
?? scripts/
|
||||
3
.diagnostics/diag_20260415_163239/env_node_version.txt
Normal file
3
.diagnostics/diag_20260415_163239/env_node_version.txt
Normal file
@ -0,0 +1,3 @@
|
||||
# env_node_version
|
||||
# cmd: node --version
|
||||
v22.18.0
|
||||
3
.diagnostics/diag_20260415_163239/env_npm_version.txt
Normal file
3
.diagnostics/diag_20260415_163239/env_npm_version.txt
Normal file
@ -0,0 +1,3 @@
|
||||
# env_npm_version
|
||||
# cmd: npm --version
|
||||
10.9.3
|
||||
3
.diagnostics/diag_20260415_163239/env_uname.txt
Normal file
3
.diagnostics/diag_20260415_163239/env_uname.txt
Normal file
@ -0,0 +1,3 @@
|
||||
# env_uname
|
||||
# cmd: uname -a
|
||||
Linux cachyos-x 6.19.10-1-cachyos #1 SMP PREEMPT_DYNAMIC Wed, 25 Mar 2026 23:30:07 +0000 x86_64 GNU/Linux
|
||||
11
.diagnostics/diag_20260415_163239/frontend_build.txt
Normal file
11
.diagnostics/diag_20260415_163239/frontend_build.txt
Normal file
@ -0,0 +1,11 @@
|
||||
# frontend_build
|
||||
# cmd: bash -lc cd '/home/dillon/_code/TalkEdit/frontend' && npm run -s build
|
||||
vite v6.4.1 building for production...
|
||||
transforming...
|
||||
✓ 1606 modules transformed.
|
||||
rendering chunks...
|
||||
computing gzip size...
|
||||
dist/index.html 1.20 kB │ gzip: 0.57 kB
|
||||
dist/assets/index-gyhcOzhr.css 19.31 kB │ gzip: 4.48 kB
|
||||
dist/assets/index-B5NnH24A.js 354.13 kB │ gzip: 108.13 kB
|
||||
✓ built in 2.43s
|
||||
3
.diagnostics/diag_20260415_163239/frontend_lint.txt
Normal file
3
.diagnostics/diag_20260415_163239/frontend_lint.txt
Normal file
@ -0,0 +1,3 @@
|
||||
# frontend_lint
|
||||
# cmd: bash -lc cd '/home/dillon/_code/TalkEdit/frontend' && npm run -s lint
|
||||
sh: line 1: eslint: command not found
|
||||
72
.diagnostics/diag_20260415_163239/list_recent_files.txt
Normal file
72
.diagnostics/diag_20260415_163239/list_recent_files.txt
Normal file
@ -0,0 +1,72 @@
|
||||
# list_recent_files
|
||||
# cmd: find /home/dillon/_code/TalkEdit -maxdepth 2 -type f
|
||||
/home/dillon/_code/TalkEdit/.git/description
|
||||
/home/dillon/_code/TalkEdit/.git/packed-refs
|
||||
/home/dillon/_code/TalkEdit/.git/COMMIT_EDITMSG
|
||||
/home/dillon/_code/TalkEdit/.git/FETCH_HEAD
|
||||
/home/dillon/_code/TalkEdit/.git/ORIG_HEAD
|
||||
/home/dillon/_code/TalkEdit/.git/REBASE_HEAD
|
||||
/home/dillon/_code/TalkEdit/.git/HEAD
|
||||
/home/dillon/_code/TalkEdit/.git/config
|
||||
/home/dillon/_code/TalkEdit/.git/index
|
||||
/home/dillon/_code/TalkEdit/backend/requirements.txt
|
||||
/home/dillon/_code/TalkEdit/backend/.python-version
|
||||
/home/dillon/_code/TalkEdit/backend/dev_main.py
|
||||
/home/dillon/_code/TalkEdit/backend/video_editor.py
|
||||
/home/dillon/_code/TalkEdit/backend/audio_cleaner.py
|
||||
/home/dillon/_code/TalkEdit/backend/diarization.py
|
||||
/home/dillon/_code/TalkEdit/backend/ai_provider.py
|
||||
/home/dillon/_code/TalkEdit/backend/caption_generator.py
|
||||
/home/dillon/_code/TalkEdit/backend/background_removal.py
|
||||
/home/dillon/_code/TalkEdit/backend/main.py
|
||||
/home/dillon/_code/TalkEdit/frontend/postcss.config.js
|
||||
/home/dillon/_code/TalkEdit/frontend/tailwind.config.js
|
||||
/home/dillon/_code/TalkEdit/frontend/tsconfig.json
|
||||
/home/dillon/_code/TalkEdit/frontend/vite.config.ts
|
||||
/home/dillon/_code/TalkEdit/frontend/frontend_dev.log
|
||||
/home/dillon/_code/TalkEdit/frontend/index.html
|
||||
/home/dillon/_code/TalkEdit/frontend/package-lock.json
|
||||
/home/dillon/_code/TalkEdit/frontend/package.json
|
||||
/home/dillon/_code/TalkEdit/frontend/tsconfig.tsbuildinfo
|
||||
/home/dillon/_code/TalkEdit/shared/project-schema.json
|
||||
/home/dillon/_code/TalkEdit/node_modules/.package-lock.json
|
||||
/home/dillon/_code/TalkEdit/src-tauri/.gitignore
|
||||
/home/dillon/_code/TalkEdit/src-tauri/Cargo.toml
|
||||
/home/dillon/_code/TalkEdit/src-tauri/build.rs
|
||||
/home/dillon/_code/TalkEdit/src-tauri/tauri_dev.log
|
||||
/home/dillon/_code/TalkEdit/src-tauri/Cargo.lock
|
||||
/home/dillon/_code/TalkEdit/src-tauri/tauri.conf.json
|
||||
/home/dillon/_code/TalkEdit/.dockerignore
|
||||
/home/dillon/_code/TalkEdit/.gitattributes
|
||||
/home/dillon/_code/TalkEdit/FIX-GITHUB-ACTIONS.md
|
||||
/home/dillon/_code/TalkEdit/LICENSE
|
||||
/home/dillon/_code/TalkEdit/M4A-SUPPORT.md
|
||||
/home/dillon/_code/TalkEdit/package-lock.json
|
||||
/home/dillon/_code/TalkEdit/TECH_FEATURES.md
|
||||
/home/dillon/_code/TalkEdit/FFmpeg_COMPLIANCE.md
|
||||
/home/dillon/_code/TalkEdit/transcribe.py
|
||||
/home/dillon/_code/TalkEdit/test_api.py
|
||||
/home/dillon/_code/TalkEdit/.vscode/settings.json
|
||||
/home/dillon/_code/TalkEdit/.venv312/pyvenv.cfg
|
||||
/home/dillon/_code/TalkEdit/webview.log
|
||||
/home/dillon/_code/TalkEdit/.gitmodules
|
||||
/home/dillon/_code/TalkEdit/split_audio.sh
|
||||
/home/dillon/_code/TalkEdit/venv/.gitignore
|
||||
/home/dillon/_code/TalkEdit/venv/pyvenv.cfg
|
||||
/home/dillon/_code/TalkEdit/.gitignore
|
||||
/home/dillon/_code/TalkEdit/FEATURES.md
|
||||
/home/dillon/_code/TalkEdit/README.md
|
||||
/home/dillon/_code/TalkEdit/close
|
||||
/home/dillon/_code/TalkEdit/electron/main.js
|
||||
/home/dillon/_code/TalkEdit/electron/preload.js
|
||||
/home/dillon/_code/TalkEdit/electron/python-bridge.js
|
||||
/home/dillon/_code/TalkEdit/idea summary.md
|
||||
/home/dillon/_code/TalkEdit/open
|
||||
/home/dillon/_code/TalkEdit/package.json
|
||||
/home/dillon/_code/TalkEdit/plan.md
|
||||
/home/dillon/_code/TalkEdit/.github/copilot-instructions.md
|
||||
/home/dillon/_code/TalkEdit/AI_dev.md
|
||||
/home/dillon/_code/TalkEdit/docs/spec-template.md
|
||||
/home/dillon/_code/TalkEdit/docs/ai-policy.md
|
||||
/home/dillon/_code/TalkEdit/scripts/validate-all.sh
|
||||
/home/dillon/_code/TalkEdit/scripts/collect-diagnostics.sh
|
||||
109
.github/copilot-instructions.md
vendored
Normal file
109
.github/copilot-instructions.md
vendored
Normal file
@ -0,0 +1,109 @@
|
||||
# TalkEdit Copilot Instructions (Living Project Context)
|
||||
|
||||
Purpose: give AI assistants immediate, accurate context for this repository and define what must be kept in sync when the project evolves.
|
||||
|
||||
## How To Use This File
|
||||
|
||||
- This is a workspace instruction file for VS Code Chat/Copilot.
|
||||
- Treat this as the first source of truth for architecture and workflow expectations.
|
||||
- If your code changes make any section outdated, update this file in the same change.
|
||||
|
||||
## Project Snapshot
|
||||
|
||||
- Name: TalkEdit
|
||||
- Product: local-first, AI-powered, text-based audio/video editor.
|
||||
- Primary runtime: Tauri + React frontend + Python FastAPI backend.
|
||||
- Desktop only (Electron has been removed; Tauri is the exclusive desktop runtime).
|
||||
|
||||
## Tech Stack
|
||||
|
||||
- Frontend: React 19, TypeScript, Vite, Tailwind, Zustand.
|
||||
- Desktop bridge: Tauri API (IPC commands via `window.electronAPI` polyfill in `frontend/src/lib/tauri-bridge.ts` for unified call-site interface).
|
||||
- Backend: FastAPI + Uvicorn (`backend/main.py`) with routers in `backend/routers` and core services in `backend/services`.
|
||||
- Media tooling: FFmpeg for edit/export and codec operations.
|
||||
- AI tooling: WhisperX/faster-whisper for transcription; provider layer supports OpenAI/Anthropic/Ollama.
|
||||
|
||||
## Code Map
|
||||
|
||||
- `frontend/src/components`: editor UI (player, transcript, waveform, settings, export, AI panel).
|
||||
- `frontend/src/store`: Zustand state (`editorStore`, `aiStore`).
|
||||
- `frontend/src/hooks`: keyboard/video sync behavior.
|
||||
- `backend/routers`: API surface (`/transcribe`, `/export`, `/ai/*`, `/captions`, `/audio/*`).
|
||||
- `backend/services`: heavy operations (transcription, captioning, diarization, video editing, cleanup).
|
||||
- `shared/project-schema.json`: saved project schema contract.
|
||||
- `src-tauri`: Rust/Tauri host code and app configuration.
|
||||
|
||||
## Run And Build (Preferred)
|
||||
|
||||
- Frontend dev: `npm run dev`
|
||||
- Backend dev: `npm run dev:backend`
|
||||
- Tauri dev: `npm run dev:tauri`
|
||||
- Tauri build: `npm run build:tauri`
|
||||
|
||||
Use project virtualenvs where available (`.venv312`, `.venv`, or `venv`) for backend execution.
|
||||
|
||||
## Working Conventions
|
||||
|
||||
- Keep router files thin; put heavy logic in `backend/services`.
|
||||
- Preserve response compatibility for existing frontend callers unless task explicitly allows API breakage.
|
||||
- Frontend uses unified `window.electronAPI` interface (Tauri-backed via tauri-bridge.ts); desktop APIs are implemented exclusively in Tauri.
|
||||
- Prefer small, focused edits over broad refactors.
|
||||
|
||||
## Known Risk Areas
|
||||
|
||||
- Startup/rendering on Linux WebKit can regress when reintroducing remote fonts/CSP allowances; prefer local font assets.
|
||||
- Media URL handling between project load paths should remain consistent to avoid format-specific regressions (especially WAV/MP3 behavior).
|
||||
- Export pipeline changes must preserve caption modes (`none`, `sidecar`, `burn-in`) and audio enhancement behavior.
|
||||
- WAV export uses `pcm_s16le` codec — only available for audio-only inputs (no video stream). Format selector conditionally shows WAV based on input file extension.
|
||||
- `<select>` dropdowns need `[color-scheme:dark]` Tailwind class on Linux WebKit or the native popup renders white-on-light-gray.
|
||||
- Frontend gain ranges use camelCase (`gainDb`) but the backend expects snake_case (`gain_db`). The ExportDialog maps them before sending. Any new call sites must do the same.
|
||||
|
||||
## Recent Changes
|
||||
|
||||
### 2026-05-04 — Word text correction, low-confidence highlighting, audio normalization
|
||||
|
||||
- **Word text correction (#015)**: Double-click any word in the transcript editor to edit its text inline. Press Enter to commit, Escape to cancel. State is updated in both `words[]` and `segments[]` arrays (segment text recomposed from updated words). Pure frontend; no backend changes needed.
|
||||
- **Low-confidence word highlighting (#012)**: Words with `confidence < threshold` (default 0.6, configurable in Settings panel) render with an orange dotted underline. Tooltip shows exact confidence percentage. Threshold is persisted in `localStorage` key `talkedit:confidenceThreshold`.
|
||||
- **Audio normalization (#018)**: New backend endpoint `POST /audio/normalize` in `backend/routers/audio.py`. Two-pass FFmpeg `loudnorm` (measure then apply) implemented in `backend/services/audio_cleaner.py:normalize_audio()`. Falls back to single-pass if measurement fails. Frontend UI in Export panel: target selector (YouTube -14, Spotify -16, Broadcast -23, etc.) with "Normalize" button.
|
||||
- **Store**: New `updateWordText(index, text)` action in `editorStore.ts` updates both `words[]` and recomputes `segments[].text`.
|
||||
- **Settings panel**: New confidence threshold slider (0–1 range).
|
||||
- **WAV export format**: Format selector shows "WAV (Uncompressed)" for audio-only inputs. Backend uses `pcm_s16le` codec via `_get_codec_args()` helper. Codec selection centralized in `backend/services/video_editor.py:_get_codec_args(format_hint, has_video)`.
|
||||
- **Normalization moved to export**: No longer a standalone button. Integrated as `normalizeAudio` checkbox + LUFS target selector in ExportPanel. Sent as `normalize_loudness`/`normalize_target_lufs` to backend. Applied via `loudnorm` in FFmpeg audio filter chain during export.
|
||||
- **Export camelCase fix**: `ExportDialog.tsx` now manually maps `gainRanges`→`gain_db` and `muteRanges`→`{start,end}` before sending to backend. Prevents Pydantic v2 field rejection.
|
||||
- **color-scheme:dark**: All `<select>` elements in ExportDialog use `[color-scheme:dark]` to ensure readable native dropdown popups on Linux WebKit.
|
||||
- **Re-transcribe selection (#013)**: Backend `POST /transcribe/segment` extracts audio via FFmpeg, runs Whisper, adjusts timestamps. Frontend: "Re-transcribe" button on selected words in TranscriptEditor; `replaceWordRange()` store action swaps words + rebuilds segments by speaker.
|
||||
- **Transcript-only export (#024)**: "Export Transcript Only" in ExportDialog with .txt/.srt options. **Pure frontend** — generates content in-browser, writes via Tauri `writeFile`. No backend dependency. Respects word cuts.
|
||||
- **Named timeline markers (#016)**: `TimelineMarker` type in `project.ts`. Store actions: `addTimelineMarker`, `updateTimelineMarker`, `removeTimelineMarker`. Colored pins on waveform canvas. MarkersPanel UI for add/edit/delete. Persisted in project.
|
||||
- **Chapters (#017)**: `getChapters()` store action derives from sorted markers. "Copy as YouTube timestamps" in MarkersPanel. Zero backend.
|
||||
- **Clip thumbnail strip (#022)**: `lib/thumbnails.ts` — frontend canvas capture from `<video>`. Toggle button in WaveformTimeline. Clickable frames at 10s intervals.
|
||||
- **Customizable hotkeys (#041)**: `lib/keybindings.ts` with two presets (standard + left-hand). `useKeyboardShortcuts.ts` reads bindings dynamically. Settings panel includes key remapper with conflict detection and per-key reset. `?` key shows dynamic cheatsheet.
|
||||
|
||||
## Update Rules (Important)
|
||||
|
||||
When a task changes architecture, app wiring, commands, API shape, project schema, or major conventions, update this file before finishing.
|
||||
|
||||
Always update these sections if affected:
|
||||
|
||||
- `Project Snapshot`
|
||||
- `Tech Stack`
|
||||
- `Code Map`
|
||||
- `Run And Build (Preferred)`
|
||||
- `Working Conventions`
|
||||
- `Known Risk Areas`
|
||||
- Recent changes section (if applicable)
|
||||
- `Code Map`
|
||||
- `Run And Build (Preferred)`
|
||||
- `Known Risk Areas`
|
||||
|
||||
If behavior changed significantly, add a short note under a new `Recent Changes` section with:
|
||||
|
||||
- Date (`YYYY-MM-DD`)
|
||||
- What changed
|
||||
- What future edits should preserve
|
||||
|
||||
## Assistant Behavior For This Repo
|
||||
|
||||
- Validate assumptions against current files before editing.
|
||||
- Prefer existing patterns in neighboring files over introducing new patterns.
|
||||
- Call out uncertainty explicitly when code and docs disagree.
|
||||
- If you discover stale docs, fix them as part of the same task when reasonable.
|
||||
23
.github/pull_request_template.md
vendored
Normal file
23
.github/pull_request_template.md
vendored
Normal file
@ -0,0 +1,23 @@
|
||||
## Summary
|
||||
|
||||
Describe what changed and why.
|
||||
|
||||
## Spec Link (Required For Feature Changes)
|
||||
|
||||
- Spec file in `docs/specs/`: <!-- e.g. docs/specs/2026-04-15-speed-adjustment.md -->
|
||||
|
||||
## Acceptance Criteria Checklist
|
||||
|
||||
- [ ] Acceptance criteria reviewed against the linked spec
|
||||
- [ ] User-visible behavior verified for this change
|
||||
- [ ] Backward compatibility impact assessed
|
||||
|
||||
## Validation
|
||||
|
||||
- [ ] `./scripts/validate-all.sh` passes locally
|
||||
- [ ] Added/updated tests for changed behavior
|
||||
|
||||
## Risk And Rollback
|
||||
|
||||
- Risk level: Low / Medium / High
|
||||
- Rollback plan:
|
||||
54
.github/workflows/ci.yml
vendored
Normal file
54
.github/workflows/ci.yml
vendored
Normal file
@ -0,0 +1,54 @@
|
||||
name: CI
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
pull_request:
|
||||
branches: [main]
|
||||
|
||||
jobs:
|
||||
rust:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: dtolnay/rust-toolchain@stable
|
||||
- run: |
|
||||
apt-get update
|
||||
apt-get install -y \
|
||||
libgtk-3-dev \
|
||||
libwebkit2gtk-4.1-dev \
|
||||
libayatana-appindicator3-dev \
|
||||
librsvg2-dev \
|
||||
libssl-dev \
|
||||
patchelf
|
||||
- run: cargo test
|
||||
working-directory: src-tauri
|
||||
- run: cargo check --release
|
||||
working-directory: src-tauri
|
||||
- run: cargo clippy -- -D warnings
|
||||
working-directory: src-tauri
|
||||
continue-on-error: true
|
||||
|
||||
frontend:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 20
|
||||
- run: npm ci
|
||||
working-directory: frontend
|
||||
- run: npx tsc --noEmit
|
||||
working-directory: frontend
|
||||
- run: npx vitest run
|
||||
working-directory: frontend
|
||||
|
||||
python:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- run: |
|
||||
apt-get update
|
||||
apt-get install -y python3 python3-pip
|
||||
- run: pip3 install pytest
|
||||
- run: python3 -m pytest backend/tests/ || true
|
||||
76
.github/workflows/docker-build.yml
vendored
76
.github/workflows/docker-build.yml
vendored
@ -1,76 +0,0 @@
|
||||
name: Build and Push Docker Images
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main, develop ]
|
||||
tags: [ 'v*' ]
|
||||
pull_request:
|
||||
branches: [ main ]
|
||||
release:
|
||||
types: [published]
|
||||
|
||||
env:
|
||||
REGISTRY: ghcr.io
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to Container Registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.REGISTRY }}
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Convert repository name to lowercase
|
||||
id: lowercase-repo
|
||||
run: echo "repository=$(echo ${{ github.repository }} | tr '[:upper:]' '[:lower:]')" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Extract metadata
|
||||
id: meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.REGISTRY }}/${{ steps.lowercase-repo.outputs.repository }}
|
||||
tags: |
|
||||
type=ref,event=branch
|
||||
type=ref,event=pr
|
||||
type=semver,pattern={{version}}
|
||||
type=semver,pattern={{major}}.{{minor}}
|
||||
type=semver,pattern={{major}}
|
||||
type=raw,value=latest,enable={{is_default_branch}}
|
||||
|
||||
- name: Build and push Docker image
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
platforms: linux/amd64,linux/arm64
|
||||
push: true
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
|
||||
- name: Build and push GPU-enabled image
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
file: Dockerfile.gpu
|
||||
platforms: linux/amd64
|
||||
push: true
|
||||
tags: |
|
||||
${{ env.REGISTRY }}/${{ steps.lowercase-repo.outputs.repository }}:latest-gpu
|
||||
${{ env.REGISTRY }}/${{ steps.lowercase-repo.outputs.repository }}:${{ github.sha }}-gpu
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
94
.github/workflows/release.yml
vendored
Normal file
94
.github/workflows/release.yml
vendored
Normal file
@ -0,0 +1,94 @@
|
||||
name: Release
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- 'v*'
|
||||
|
||||
jobs:
|
||||
linux:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 20
|
||||
cache: npm
|
||||
cache-dependency-path: frontend/package-lock.json
|
||||
- run: npm ci
|
||||
working-directory: frontend
|
||||
- uses: dtolnay/rust-toolchain@stable
|
||||
- run: |
|
||||
apt-get update
|
||||
apt-get install -y \
|
||||
libwebkit2gtk-4.1-dev \
|
||||
librsvg2-dev \
|
||||
patchelf \
|
||||
libssl-dev \
|
||||
libgtk-3-dev \
|
||||
libayatana-appindicator3-dev \
|
||||
rpm
|
||||
- name: Download FFmpeg (bundled sidecar)
|
||||
run: |
|
||||
mkdir -p src-tauri/binaries
|
||||
curl -sL "https://johnvansickle.com/ffmpeg/releases/ffmpeg-release-amd64-static.tar.xz" -o /tmp/ffmpeg.tar.xz
|
||||
tar -xf /tmp/ffmpeg.tar.xz -C /tmp
|
||||
cp /tmp/ffmpeg-*-amd64-static/ffmpeg src-tauri/binaries/ffmpeg-x86_64-unknown-linux-gnu
|
||||
cp /tmp/ffmpeg-*-amd64-static/ffprobe src-tauri/binaries/ffprobe-x86_64-unknown-linux-gnu
|
||||
chmod +x src-tauri/binaries/*
|
||||
- uses: tauri-apps/tauri-action@v0
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
with:
|
||||
tagName: ${{ github.ref_name }}
|
||||
releaseName: 'TalkEdit ${{ github.ref_name }}'
|
||||
releaseBody: 'See the assets to download and install this version.'
|
||||
releaseDraft: false
|
||||
includeUpdaterJson: true
|
||||
args: --bundles deb,rpm
|
||||
|
||||
windows:
|
||||
runs-on: windows-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 20
|
||||
cache: npm
|
||||
cache-dependency-path: frontend/package-lock.json
|
||||
- run: npm ci
|
||||
working-directory: frontend
|
||||
- uses: dtolnay/rust-toolchain@stable
|
||||
- uses: tauri-apps/tauri-action@v0
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
with:
|
||||
tagName: ${{ github.ref_name }}
|
||||
releaseName: 'TalkEdit ${{ github.ref_name }}'
|
||||
releaseBody: 'See the assets to download and install this version.'
|
||||
releaseDraft: false
|
||||
includeUpdaterJson: true
|
||||
args: --bundles msi
|
||||
|
||||
# macos:
|
||||
# runs-on: macos-latest
|
||||
# steps:
|
||||
# - uses: actions/checkout@v4
|
||||
# - uses: actions/setup-node@v4
|
||||
# with:
|
||||
# node-version: 20
|
||||
# cache: npm
|
||||
# cache-dependency-path: frontend/package-lock.json
|
||||
# - run: npm ci
|
||||
# working-directory: frontend
|
||||
# - uses: dtolnay/rust-toolchain@stable
|
||||
# - uses: tauri-apps/tauri-action@v0
|
||||
# env:
|
||||
# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
# with:
|
||||
# tagName: ${{ github.ref_name }}
|
||||
# releaseName: 'TalkEdit ${{ github.ref_name }}'
|
||||
# releaseBody: 'See the assets to download and install this version.'
|
||||
# releaseDraft: false
|
||||
# includeUpdaterJson: true
|
||||
# args: --bundles dmg
|
||||
58
.github/workflows/validate-all.yml
vendored
Normal file
58
.github/workflows/validate-all.yml
vendored
Normal file
@ -0,0 +1,58 @@
|
||||
name: Validate All
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
jobs:
|
||||
validate-all:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Setup Node
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: '20'
|
||||
cache: npm
|
||||
cache-dependency-path: |
|
||||
frontend/package-lock.json
|
||||
package-lock.json
|
||||
|
||||
- name: Setup Python
|
||||
run: |
|
||||
apt-get update
|
||||
apt-get install -y python3 python3-pip
|
||||
|
||||
- name: Enforce feature spec policy (PR only)
|
||||
if: github.event_name == 'pull_request'
|
||||
env:
|
||||
BASE_SHA: ${{ github.event.pull_request.base.sha }}
|
||||
run: ./scripts/check-feature-spec.sh
|
||||
|
||||
- name: Install frontend dependencies
|
||||
run: |
|
||||
cd frontend
|
||||
npm install
|
||||
|
||||
- name: Run validate-all
|
||||
env:
|
||||
SKIP_BACKEND_IMPORT_SMOKE: '1'
|
||||
run: ./scripts/validate-all.sh
|
||||
|
||||
- name: Collect diagnostics on failure
|
||||
if: failure()
|
||||
run: ./scripts/collect-diagnostics.sh
|
||||
|
||||
- name: Upload diagnostics artifact
|
||||
if: failure()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: diagnostics
|
||||
path: .diagnostics
|
||||
51
.gitignore
vendored
51
.gitignore
vendored
@ -1,13 +1,60 @@
|
||||
# Python virtual environment
|
||||
# Dependencies
|
||||
node_modules/
|
||||
|
||||
# Build output
|
||||
frontend/dist/
|
||||
|
||||
# Python
|
||||
venv/
|
||||
.venv312/
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.egg-info/
|
||||
.pytest_cache/
|
||||
.mypy_cache/
|
||||
.coverage
|
||||
htmlcov/
|
||||
|
||||
# IDE files
|
||||
# IDE / Editor
|
||||
.vscode/
|
||||
.idea/
|
||||
.cursor/
|
||||
|
||||
# Submodules (can be cloned separately if needed)
|
||||
CutScript/
|
||||
|
||||
# OS files
|
||||
.env
|
||||
.env.local
|
||||
.env.*.local
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
*.swp
|
||||
*.tmp
|
||||
|
||||
# Logs
|
||||
*.log
|
||||
logs/
|
||||
cache/
|
||||
*.aive
|
||||
|
||||
# Build output
|
||||
frontend/dist/
|
||||
dist/
|
||||
build/
|
||||
*.asar
|
||||
target/
|
||||
src-tauri/target/
|
||||
|
||||
# Node.js
|
||||
node_modules/
|
||||
npm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
|
||||
# Rust
|
||||
Cargo.lock
|
||||
|
||||
# Lock files (root only — frontend lock is committed)
|
||||
/package-lock.json
|
||||
|
||||
1
.gitmodules
vendored
Normal file
1
.gitmodules
vendored
Normal file
@ -0,0 +1 @@
|
||||
|
||||
157
AI_dev_plan.md
Normal file
157
AI_dev_plan.md
Normal file
@ -0,0 +1,157 @@
|
||||
# AI Dev Plan (Must-Haves Only)
|
||||
|
||||
## Purpose
|
||||
|
||||
This is the minimum implementation needed for AI to reliably build, test, and debug TalkEdit with high confidence.
|
||||
|
||||
Target: reliable 80-90% autonomous implementation/debugging on scoped tasks.
|
||||
|
||||
## Must-Have Pillars
|
||||
|
||||
## 1. Single Validation Command
|
||||
|
||||
Required:
|
||||
|
||||
1. One command that runs lint, build, backend tests, and smoke checks.
|
||||
2. Works locally and in CI.
|
||||
|
||||
Current status:
|
||||
|
||||
1. Implemented via scripts/validate-all.sh.
|
||||
2. Enforced in CI via .github/workflows/validate-all.yml.
|
||||
|
||||
## 2. CI Quality Gate
|
||||
|
||||
Required:
|
||||
|
||||
1. Pull requests fail if validation fails.
|
||||
2. Failures produce diagnostics artifacts.
|
||||
|
||||
Current status:
|
||||
|
||||
1. Implemented in .github/workflows/validate-all.yml.
|
||||
2. Diagnostics collected by scripts/collect-diagnostics.sh on failure.
|
||||
|
||||
## 3. Spec Requirement for Feature Changes
|
||||
|
||||
Required:
|
||||
|
||||
1. Feature code changes must include a spec file update.
|
||||
2. Spec format must be standardized.
|
||||
|
||||
Current status:
|
||||
|
||||
1. Implemented via scripts/check-feature-spec.sh.
|
||||
2. Spec template exists at docs/spec-template.md.
|
||||
3. Specs folder guidance exists at docs/specs/README.md.
|
||||
|
||||
## 4. Backend Contract Test Coverage
|
||||
|
||||
Required:
|
||||
|
||||
1. Router-level contract tests for success and error paths.
|
||||
2. Tests are deterministic and mock heavy services.
|
||||
|
||||
Current status:
|
||||
|
||||
1. Implemented in backend/tests/test_router_contracts.py.
|
||||
2. Cache utility baseline tests implemented in backend/tests/test_cache_utils.py.
|
||||
|
||||
## 5. Error-Tolerant Router Contracts
|
||||
|
||||
Required:
|
||||
|
||||
1. Expected client errors must remain 4xx.
|
||||
2. Server failures must return 5xx with useful detail.
|
||||
|
||||
Current status:
|
||||
|
||||
1. Implemented for captions/export HTTPException passthrough.
|
||||
2. Covered by contract tests.
|
||||
|
||||
## 6. Basic Autonomy Policy
|
||||
|
||||
Required:
|
||||
|
||||
1. Clear autonomous scope and escalation rules.
|
||||
2. Clear restrictions for high-risk changes.
|
||||
|
||||
Current status:
|
||||
|
||||
1. Implemented in docs/ai-policy.md.
|
||||
|
||||
## Must-Have Remaining Work
|
||||
|
||||
No remaining must-have items.
|
||||
|
||||
Completed in this pass:
|
||||
|
||||
1. Added lightweight frontend tests and integrated them into scripts/validate-all.sh.
|
||||
2. Added pull request template with required spec link and acceptance criteria checklist.
|
||||
3. Added endpoint-level contract assertions for /file range requests and /audio/waveform cache-hit/cache-miss behavior.
|
||||
4. Confirmed scripts/validate-all.sh passes end-to-end with frontend tests + expanded backend contracts.
|
||||
|
||||
## Out of Scope for Must-Have Baseline
|
||||
|
||||
Useful later, but not required for strong day-to-day autonomous implementation:
|
||||
|
||||
1. Full quality dashboards.
|
||||
2. Advanced autonomy telemetry.
|
||||
3. Complete long-term governance expansion.
|
||||
4. High-autonomy optimization beyond 90% reliability target.
|
||||
|
||||
## Definition of Done (Must-Have Plan)
|
||||
|
||||
Must-have plan is complete when all are true:
|
||||
|
||||
1. scripts/validate-all.sh passes locally and in CI.
|
||||
2. Feature PRs without spec updates are blocked.
|
||||
3. Backend router contracts cover core success and error paths.
|
||||
4. Frontend has at least one stable test command integrated into validation.
|
||||
|
||||
## 7. AI Tools Validation Strategy
|
||||
|
||||
Required:
|
||||
|
||||
1. **Per-edit validation**: After each code change (file edit, replacement, or creation), validate immediately with appropriate tools.
|
||||
2. **Tool selection by change type**:
|
||||
- Frontend changes: ESLint (`npm run -s lint`), then TypeScript build (`npm run build`)
|
||||
- Backend changes: Syntax check via Python import, then run relevant test suite
|
||||
- Type/interface changes: Full type check via build or `tsc -b`
|
||||
3. **Failure handling**: If validation fails, fix immediately before proceeding to next edit.
|
||||
4. **Documentation updates**: When changing architecture, always update [.github/copilot-instructions.md](.github/copilot-instructions.md) as part of the same PR.
|
||||
5. **Large multi-edit operations**: Use `multi_replace_string_in_file` to batch independent edits and reduce tool call overhead.
|
||||
6. **Error collection**: Use `get_errors` tool to identify issues across multiple files in one call post-change.
|
||||
|
||||
Current implementation:
|
||||
|
||||
1. Electron removal completed with post-edit lint and build validation at each phase.
|
||||
2. Zone editor feature implemented with immediate lint/build validation after component creation and UI integration.
|
||||
3. Validation tools: `npm run -s lint`, `npm run build`, `get_errors`, `run_in_terminal` for test scripts.
|
||||
|
||||
Best practices established:
|
||||
|
||||
- Always run lint before build to catch TypeScript errors early
|
||||
- Run full build after component changes to verify tree-shaking and bundling
|
||||
- Use `get_errors` for multi-file error detection rather than sequential file reads
|
||||
- Batch unrelated edits with `multi_replace_string_in_file` for efficiency
|
||||
- Cache key decisions in session memory to avoid repeated exploration
|
||||
5. AI policy + diagnostics workflow are active.
|
||||
|
||||
## Current State Summary
|
||||
|
||||
Completed:
|
||||
|
||||
1. Validation and CI enforcement.
|
||||
2. Diagnostics capture.
|
||||
3. Spec policy and templates.
|
||||
4. Backend contract test foundation (including AI endpoints).
|
||||
5. Core router error-path correctness.
|
||||
6. Autonomy policy baseline.
|
||||
7. Frontend test command integrated into validation.
|
||||
8. PR template requirement added.
|
||||
9. /file and /audio/waveform contract assertions implemented.
|
||||
|
||||
Remaining:
|
||||
|
||||
1. No must-have items remaining.
|
||||
305
DOCKER.md
305
DOCKER.md
@@ -1,305 +0,0 @@
|
||||
# Docker Deployment Guide for VideoTranscriber
|
||||
|
||||
This guide explains how to run VideoTranscriber in a Docker container while using Ollama models on your host system.
|
||||
|
||||
## Architecture Overview
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────┐
|
||||
│ Host System │
|
||||
│ ┌─────────────────┐ ┌──────────────────│
|
||||
│ │ Ollama Service │ │ Video Files │
|
||||
│ │ (port 11434) │ │ Directory │
|
||||
│ └─────────────────┘ └──────────────────│
|
||||
│ ▲ ▲ │
|
||||
│ │ │ │
|
||||
│ ┌───────┼─────────────────────┼─────────│
|
||||
│ │ Docker Container │ │
|
||||
│ │ ┌─────▼─────────┐ │ │
|
||||
│ │ │ VideoTranscriber │ │
|
||||
│ │ │ - Streamlit App │ │
|
||||
│ │ │ - Whisper Models │ │
|
||||
│ │ │ - ML Dependencies │ │
|
||||
│ │ └───────────────┘ │ │
|
||||
│ └────────────────────────────┼─────────│
|
||||
│ │ │
|
||||
│ Mounted Volumes ─────┘ │
|
||||
└─────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Prerequisites
|
||||
|
||||
1. **Docker & Docker Compose** installed
|
||||
2. **Ollama running on host**:
|
||||
```bash
|
||||
# Install Ollama (if not already installed)
|
||||
curl -fsSL https://ollama.ai/install.sh | sh
|
||||
|
||||
# Start Ollama service
|
||||
ollama serve
|
||||
|
||||
# Pull a model (in another terminal)
|
||||
ollama pull llama3
|
||||
```
|
||||
|
||||
### 1. Setup Environment
|
||||
|
||||
```bash
|
||||
# Copy environment template
|
||||
cp docker.env.example .env
|
||||
|
||||
# Edit .env file with your paths
|
||||
# Key settings to update:
|
||||
VIDEO_PATH=/path/to/your/videos
|
||||
OUTPUT_PATH=/path/to/save/outputs
|
||||
HF_TOKEN=your_huggingface_token_if_needed
|
||||
```
|
||||
|
||||
### 2. Create Required Directories
|
||||
|
||||
```bash
|
||||
# Create directories for mounting
|
||||
mkdir -p videos outputs cache config
|
||||
```
|
||||
|
||||
### 3. Build and Run
|
||||
|
||||
```bash
|
||||
# Build and start the container
|
||||
docker-compose up -d
|
||||
|
||||
# View logs
|
||||
docker-compose logs -f
|
||||
|
||||
# Access the application
|
||||
# Open browser to: http://localhost:8501
|
||||
```
|
||||
|
||||
## Configuration Options
|
||||
|
||||
### Environment Variables
|
||||
|
||||
| Variable | Description | Default | Required |
|
||||
|----------|-------------|---------|----------|
|
||||
| `VIDEO_PATH` | Host directory containing video files | `./videos` | Yes |
|
||||
| `OUTPUT_PATH` | Host directory for outputs | `./outputs` | Yes |
|
||||
| `CACHE_PATH` | Host directory for model cache | `./cache` | No |
|
||||
| `OLLAMA_API_URL` | Ollama API endpoint | `http://host.docker.internal:11434/api` | No |
|
||||
| `HF_TOKEN` | HuggingFace token for advanced features | - | No |
|
||||
| `CUDA_VISIBLE_DEVICES` | GPU devices to use | - | No |
|
||||
|
||||
### Volume Mounts
|
||||
|
||||
| Host Path | Container Path | Purpose |
|
||||
|-----------|----------------|---------|
|
||||
| `${VIDEO_PATH}` | `/app/data/videos` | Input video files |
|
||||
| `${OUTPUT_PATH}` | `/app/data/outputs` | Generated transcripts/summaries |
|
||||
| `${CACHE_PATH}` | `/app/data/cache` | Model and processing cache |
|
||||
| `${CONFIG_PATH}` | `/app/config` | Configuration files |
|
||||
|
||||
## Platform-Specific Setup
|
||||
|
||||
### Windows (Docker Desktop)
|
||||
|
||||
```yaml
|
||||
# In docker-compose.yml - use bridge networking
|
||||
networks:
|
||||
- videotranscriber-network
|
||||
|
||||
environment:
|
||||
- OLLAMA_API_URL=http://host.docker.internal:11434/api
|
||||
```
|
||||
|
||||
### macOS (Docker Desktop)
|
||||
|
||||
Same as Windows - uses `host.docker.internal` to access host services.
|
||||
|
||||
### Linux
|
||||
|
||||
Option 1 - Host Networking (Recommended):
|
||||
```yaml
|
||||
# In docker-compose.yml
|
||||
network_mode: host
|
||||
|
||||
environment:
|
||||
- OLLAMA_API_URL=http://localhost:11434/api
|
||||
```
|
||||
|
||||
Option 2 - Bridge Networking:
|
||||
```yaml
|
||||
environment:
|
||||
- OLLAMA_API_URL=http://172.17.0.1:11434/api # Docker bridge IP
|
||||
```
|
||||
|
||||
## GPU Support
|
||||
|
||||
### NVIDIA GPU Setup
|
||||
|
||||
1. **Install NVIDIA Container Toolkit**:
|
||||
```bash
|
||||
# Ubuntu/Debian
|
||||
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
|
||||
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
|
||||
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
|
||||
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
|
||||
sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
|
||||
sudo systemctl restart docker
|
||||
```
|
||||
|
||||
2. **Enable in docker-compose.yml**:
|
||||
```yaml
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: 1
|
||||
capabilities: [gpu]
|
||||
```
|
||||
|
||||
## Usage in Container
|
||||
|
||||
### Application Settings
|
||||
|
||||
When running in Docker, update these settings in the VideoTranscriber UI:
|
||||
|
||||
1. **Base Folder**: Set to `/app/data/videos`
|
||||
2. **Ollama Models**: Should auto-detect from host
|
||||
3. **GPU Settings**: Will use container GPU if configured
|
||||
|
||||
### File Access
|
||||
|
||||
- **Input Videos**: Place in your `${VIDEO_PATH}` directory on host
|
||||
- **Outputs**: Generated files appear in `${OUTPUT_PATH}` on host
|
||||
- **Cache**: Models cached in `${CACHE_PATH}` for faster subsequent runs
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
#### 1. Can't Connect to Ollama
|
||||
|
||||
**Symptoms**: "Ollama service is not available" message
|
||||
|
||||
**Solutions**:
|
||||
- Verify Ollama is running: `curl http://localhost:11434/api/tags`
|
||||
- Check firewall settings
|
||||
- For Linux, try host networking mode
|
||||
- Verify OLLAMA_API_URL in environment
|
||||
|
||||
#### 2. No Video Files Detected
|
||||
|
||||
**Symptoms**: "No recordings found" message
|
||||
|
||||
**Solutions**:
|
||||
- Check VIDEO_PATH points to correct directory
|
||||
- Ensure directory contains supported formats (.mp4, .avi, .mov, .mkv)
|
||||
- Check file permissions
|
||||
|
||||
#### 3. GPU Not Detected
|
||||
|
||||
**Symptoms**: Processing is slow, no GPU utilization
|
||||
|
||||
**Solutions**:
|
||||
- Install NVIDIA Container Toolkit
|
||||
- Uncomment GPU section in docker-compose.yml
|
||||
- Verify: `docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi`
|
||||
|
||||
#### 4. Permission Issues
|
||||
|
||||
**Symptoms**: Cannot write to output directory
|
||||
|
||||
**Solutions**:
|
||||
```bash
|
||||
# Fix permissions
|
||||
sudo chown -R $(id -u):$(id -g) outputs cache config
|
||||
chmod -R 755 outputs cache config
|
||||
```
|
||||
|
||||
### Debugging
|
||||
|
||||
```bash
|
||||
# View container logs
|
||||
docker-compose logs -f videotranscriber
|
||||
|
||||
# Execute shell in container
|
||||
docker-compose exec videotranscriber bash
|
||||
|
||||
# Check Ollama connectivity from container
|
||||
docker-compose exec videotranscriber curl -f $OLLAMA_API_URL/tags
|
||||
|
||||
# Monitor resource usage
|
||||
docker stats videotranscriber
|
||||
```
|
||||
|
||||
## Advanced Configuration
|
||||
|
||||
### Custom Dockerfile
|
||||
|
||||
For specialized requirements, modify the Dockerfile:
|
||||
|
||||
```dockerfile
|
||||
# Add custom dependencies
|
||||
RUN pip install your-custom-package
|
||||
|
||||
# Set custom environment variables
|
||||
ENV YOUR_CUSTOM_VAR=value
|
||||
|
||||
# Copy custom configuration
|
||||
COPY custom-config.yaml /app/config/
|
||||
```
|
||||
|
||||
### Multi-Instance Deployment
|
||||
|
||||
Run multiple instances for different use cases:
|
||||
|
||||
```bash
|
||||
# Copy docker-compose.yml to docker-compose.prod.yml
|
||||
# Modify ports and paths
|
||||
docker-compose -f docker-compose.prod.yml up -d
|
||||
```
|
||||
|
||||
### CI/CD Integration
|
||||
|
||||
```yaml
|
||||
# .github/workflows/docker.yml
|
||||
name: Build and Deploy
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Build Docker image
|
||||
run: docker build -t videotranscriber .
|
||||
```
|
||||
|
||||
## Performance Optimization
|
||||
|
||||
### Memory Management
|
||||
|
||||
```yaml
|
||||
# In docker-compose.yml
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
memory: 8G
|
||||
reservations:
|
||||
memory: 4G
|
||||
```
|
||||
|
||||
### Model Caching
|
||||
|
||||
- Use persistent volumes for `/app/data/cache`
|
||||
- Pre-download models to reduce startup time
|
||||
- Configure appropriate cache size limits
|
||||
|
||||
### Network Optimization
|
||||
|
||||
- Use host networking on Linux for better performance
|
||||
- Consider running Ollama and VideoTranscriber on same machine
|
||||
- Use SSD storage for cache directories
|
||||
45
Dockerfile
45
Dockerfile
@@ -1,45 +0,0 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
# Set working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Install system dependencies
|
||||
RUN apt-get update && apt-get install -y \
|
||||
ffmpeg \
|
||||
git \
|
||||
wget \
|
||||
curl \
|
||||
build-essential \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Copy requirements first for better Docker layer caching
|
||||
COPY requirements.txt .
|
||||
|
||||
# Install Python dependencies with pinned versions
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Optional: Install CUDA-specific PyTorch if GPU support needed
|
||||
# Uncomment and modify for your CUDA version:
|
||||
# RUN pip install --force-reinstall torch==2.1.0+cu118 torchvision==0.16.0+cu118 torchaudio==2.1.0+cu118 --index-url https://download.pytorch.org/whl/cu118
|
||||
|
||||
# Copy application code
|
||||
COPY . .
|
||||
|
||||
# Create directories for mounted volumes
|
||||
RUN mkdir -p /app/data/videos /app/data/outputs /app/data/cache
|
||||
|
||||
# Set environment variables
|
||||
ENV STREAMLIT_SERVER_PORT=8501
|
||||
ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
|
||||
ENV STREAMLIT_SERVER_HEADLESS=true
|
||||
ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
|
||||
|
||||
# Expose Streamlit port
|
||||
EXPOSE 8501
|
||||
|
||||
# Health check
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
|
||||
CMD curl -f http://localhost:8501/_stcore/health || exit 1
|
||||
|
||||
# Start the application
|
||||
CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
||||
@@ -1,54 +0,0 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
# Set working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Install system dependencies including CUDA-related packages
|
||||
RUN apt-get update && apt-get install -y \
|
||||
ffmpeg \
|
||||
git \
|
||||
wget \
|
||||
curl \
|
||||
build-essential \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Copy requirements first for better Docker layer caching
|
||||
COPY requirements.txt .
|
||||
|
||||
# Install CPU versions from requirements.txt first
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Install CUDA-optimized PyTorch (overwrites CPU versions)
|
||||
# Updated to torch 2.1.0+ for SpeechBrain 1.0 / pyannote diarization compatibility
|
||||
RUN pip install --force-reinstall \
|
||||
torch==2.1.0+cu118 \
|
||||
torchvision==0.16.0+cu118 \
|
||||
torchaudio==2.1.0+cu118 \
|
||||
--index-url https://download.pytorch.org/whl/cu118
|
||||
|
||||
# Copy application code
|
||||
COPY . .
|
||||
|
||||
# Create directories for mounted volumes
|
||||
RUN mkdir -p /app/data/videos /app/data/outputs /app/data/cache
|
||||
|
||||
# Set environment variables
|
||||
ENV STREAMLIT_SERVER_PORT=8501
|
||||
ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
|
||||
ENV STREAMLIT_SERVER_HEADLESS=true
|
||||
ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
|
||||
|
||||
# GPU-specific environment variables
|
||||
ENV CUDA_VISIBLE_DEVICES=0
|
||||
ENV NVIDIA_VISIBLE_DEVICES=all
|
||||
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
|
||||
|
||||
# Expose Streamlit port
|
||||
EXPOSE 8501
|
||||
|
||||
# Health check
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
|
||||
CMD curl -f http://localhost:8501/_stcore/health || exit 1
|
||||
|
||||
# Start the application
|
||||
CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
||||
181
FEATURES.md
Normal file
181
FEATURES.md
Normal file
@@ -0,0 +1,181 @@
|
||||
# TalkEdit — Features & Roadmap
|
||||
|
||||
**Niche:** "Descript for long-form content" — works on hour+ files without degrading, fully offline, one-time payment.
|
||||
|
||||
---
|
||||
|
||||
## ✅ Already Implemented
|
||||
|
||||
### Core editing
|
||||
- [x] [#001] **Cut / Mute sections** — remove or silence segments from output
|
||||
- [x] [#002] **Silence / pause trimmer** — batch detect and remove silent pauses
|
||||
- [x] [#006] **Volume / gain control** — per-zone and global gain adjustment
|
||||
- [x] [#007] **Speed adjustment** — per-zone playback speed changes (0.25x–4x)
|
||||
- [x] [#008] **Cut preview** — preview zones before export with configurable padding
|
||||
- [x] [#009] **Timeline shows output length** — adjusted timeline with cut compression
|
||||
- [x] [#011] **Mark In / Out** — I/O keys to set selection range on timeline
|
||||
|
||||
### Transcript
|
||||
- [x] [#010] **Transcript search (Ctrl+F)** — find words, navigate matches
|
||||
- [x] [#012] **Low-confidence word highlighting** — orange dotted underline with confidence %
|
||||
- [x] [#013] **Re-transcribe selection** — re-run Whisper on a selected word range
|
||||
- [x] [#015] **Word text correction** — double-click any word to edit text in-place
|
||||
- [x] [#016] **Named timeline markers** — colored pins with labels, editable
|
||||
- [x] [#017] **Chapters** — auto-form from markers, copy as YouTube timestamps
|
||||
- [x] [#025] Word-level transcript editing (click, shift+click, drag select)
|
||||
- [x] [#026] Ctrl+click word → seek video to that timestamp
|
||||
- [x] [#027] Waveform timeline with zoom (Ctrl+scroll), scroll, drag-to-scrub
|
||||
- [x] [#028] Auto-scroll waveform when playhead goes off-screen
|
||||
|
||||
### AI features
|
||||
- [x] [#029] **AI filler word detection** — find and remove "um", "uh", "like" etc.
|
||||
- [x] [#030] **AI clip suggestions** — find best 20-60s segments for social media
|
||||
- [x] [#031] **Noise reduction** — DeepFilterNet or FFmpeg ANLMDN
|
||||
- [x] [#034] **Speaker diarization** — label speakers in transcript
|
||||
- [x] [#042] **Background removal** — MediaPipe segmentation, blur/color/image replacement
|
||||
|
||||
### Export
|
||||
- [x] [#018] **Audio loudness normalization** — LUFS targets (-14 YouTube, -16 Spotify, -23 Broadcast)
|
||||
- [x] [#019] **Background music** — auto-ducking via FFmpeg sidechain compress
|
||||
- [x] [#020] **Video zoom / punch-in** — crop, zoom, pan during export
|
||||
- [x] [#021] **Multi-clip / append** — concatenate multiple video files
|
||||
- [x] [#024] **Export transcript** — plain text or SRT without video
|
||||
- [x] [#032] **Export** — fast stream-copy or full re-encode (MP4/MOV/WebM/WAV, 720p–4K)
|
||||
- [x] [#033] **Captions** — SRT, VTT, ASS burn-in with font/color/position options
|
||||
|
||||
### Project & state
|
||||
- [x] [#003] **Undo / redo** — 100-level history via Zundo
|
||||
- [x] [#004] **Grouped silence-trim zones** — editable batch groups
|
||||
- [x] [#005] **Edit silence-trim group** settings after applying
|
||||
- [x] [#022] **Clip thumbnail strip** — canvas capture from video, clickable
|
||||
- [x] [#035] **Project save / load** — .aive JSON format
|
||||
- [x] [#037] **Multi-format input** — MP4, MKV, MOV, AVI, WebM, M4A
|
||||
- [x] [#038] **Keyboard shortcuts** — Space, J/K/L, arrows, Ctrl+Z/S/E, ?
|
||||
- [x] [#039] **Settings panel** — AI provider config (Ollama, OpenAI, Claude)
|
||||
- [x] [#040] **Zone creation on timeline** — draggable edits, Delete to remove
|
||||
- [x] [#041] **Customizable hotkeys** — two presets, click-to-remap, conflict detection
|
||||
- [x] **[M] Manage Models** — view/delete downloaded Whisper and LLM files
|
||||
- [x] **[M] Keyboard cheatsheet** — `?` overlay with close button, preset indicator
|
||||
- [x] **[M] Visual toolbar** — grouped buttons with section dividers
|
||||
- [x] **[M] Help panel** — full feature documentation in sidebar
|
||||
- [x] **[M] First-run welcome overlay** — 3-step quick-start guide
|
||||
- [x] **[M] Responsive welcome screen** — animated audio bars, model picker
|
||||
- [x] **[M] Error boundary** — catches React crashes, shows fallback + reload
|
||||
- [x] **[M] Global error logging** — uncaught errors logged to Rust backend
|
||||
- [x] **[M] Store input validation** — NaN rejection, bounds clamping, min zone duration
|
||||
- [x] **[M] Runtime assertions** — dev-mode guards in critical paths
|
||||
- [x] **[M] Backend health check** — polls every 30s, shows reconnecting banner
|
||||
|
||||
### Licensing
|
||||
- [x] **[L] 7-day free trial** — no credit card required
|
||||
- [x] **[L] License activation** — email confirmation step to deter key sharing
|
||||
- [x] **[L] Ed25519-signed license keys** — offline verification
|
||||
- [x] **[L] Trial integrity** — sentinel file prevents delete-and-reset, XOR checksum deters timestamp editing
|
||||
- [x] **[L] canEdit gate** — defaults to locked, only unlocks after verified status
|
||||
- [x] **[L] Expired state** — export and loading still work, editing and AI locked
|
||||
|
||||
### Robustness
|
||||
- [x] **[R] Auto-save crash recovery** — every 60s, restore prompt on next launch
|
||||
- [x] **[R] Bad project state recovery** — auto-prunes invalid zones on load
|
||||
- [x] **[R] Zone/marker deletion confirmations** — prevents accidental removals
|
||||
- [x] **[R] Progress bars** — export (determinate), transcription (indeterminate)
|
||||
- [x] **[R] Loading spinners** — waveform, AI processing
|
||||
- [x] **[R] Error states with retry** — AIPanel, WaveformTimeline
|
||||
- [x] **[R] Empty states** — MarkersPanel, AIPanel, ZoneEditor
|
||||
- [x] **[R] Canvas zone handles enlarged** — radius 6px, hit area increased
|
||||
- [x] **[R] Search match contrast** — thicker rings, higher opacity
|
||||
- [x] **[R] Split panes keyboard-accessible** — arrow keys, tabIndex, ARIA
|
||||
|
||||
### Testing
|
||||
- [x] **95 frontend tests** — editorStore (68), licenseStore (22), aiStore (15), assert (4)
|
||||
- [x] **12 Rust tests** — licensing (7), models (5)
|
||||
- [x] **CI pipeline** — GitHub Actions (Rust: test+clippy, Frontend: tsc+vitest, Python: pytest)
|
||||
|
||||
---
|
||||
|
||||
## 🔴 What's Next — highest impact
|
||||
|
||||
- [ ] **[LLM] Bundled Qwen3 LLM** — auto-download on first AI use, no API keys needed. Replace Python `ai_provider.py` with llama.cpp Rust bindings. Two sizes: 4B (2.5GB, 8GB+ RAM) and 1.7B (1GB, 4GB+ RAM)
|
||||
- [ ] **[SHORTS] Smart Shorts finder** — scan transcript for self-contained 10–90s segments, ranked by engagement. One-click export as separate clips
|
||||
- [ ] **[PAYMENT] Wire checkout** — payment page at talked.it, Stripe → license key generation → delivery email
|
||||
- [ ] **[BETA] Beta testers** — give 5–10 podcasters free licenses in exchange for feedback
|
||||
- [ ] **[BUILD] Production builds** — `cargo tauri build` for Windows, macOS, Linux
|
||||
|
||||
---
|
||||
|
||||
## 🟡 Medium impact — AI features
|
||||
|
||||
- [ ] [#044] **AI Transcript Summarization** — bullet-point summary from transcript
|
||||
- [ ] [#045] **AI Sentence Rephrase** — right-click word → see alternatives → replace
|
||||
- [ ] [#046] **AI Smart Speed** — detect slow sections → suggest speed adjustments
|
||||
- [ ] [#047] **AI Auto-Chapters** — topic detection from transcript → markers
|
||||
- [ ] [#048] **AI Show Notes** — title, description, keywords, timestamps
|
||||
- [ ] [#049] **AI Find Fluff** — detect rambles, off-topic chatter
|
||||
- [ ] [#050] **AI Smooth Cuts** — crossfade between deleted segments
|
||||
|
||||
---
|
||||
|
||||
## 🟢 Lower impact — expansion
|
||||
|
||||
- [ ] **Project stitching** — load multiple .aive projects into one export
|
||||
- [ ] **Batch export** — multiple projects/cuts in sequence
|
||||
- [ ] **Smart chunking** — overlapping chunks for files >2hr
|
||||
- [ ] [#014] Alternate transcription backend (VibeVoice-ASR-HF)
|
||||
- [ ] [#051] **AI B-roll** — generate footage from text prompt
|
||||
- [ ] [#052] **Smart Layouts** — auto-switch speakers in video frame
|
||||
- [ ] [#053] **Per-track audio levels** — gain per speaker track
|
||||
- [ ] [#054] **Intro/Outro templates** — reusable segment presets
|
||||
- [ ] [#055] **Built-in free music library** — CC0 loops shipped with app
|
||||
- [ ] [#056] **Stock media browser** — browse local resources/media/
|
||||
- [ ] [#057] **Sample content downloader** — test video with pre-made transcript
|
||||
|
||||
---
|
||||
|
||||
## 🎬 OpenShot-inspired (long-term)
|
||||
|
||||
- [ ] Keyframe animations — clip position, scale, opacity over time
|
||||
- [ ] Video transitions — crossfade, wipe between clips
|
||||
- [ ] Title / text overlays — SVG templates, adjustable font/color
|
||||
- [ ] Chroma key / greenscreen — per-clip effect
|
||||
- [ ] Speed ramps — animate speed within a clip
|
||||
- [ ] Frame-accurate stepping — arrow keys frame by frame
|
||||
- [ ] Clip trimming on timeline — drag edges to trim
|
||||
- [ ] Snapping — magnetic snap to markers and edges
|
||||
|
||||
---
|
||||
|
||||
## 💡 Competitive advantages
|
||||
|
||||
- **7-day free trial (no CC)** — full features, no risk
|
||||
- **One-time purchase** — $39 Pro, $79 Business, no subscription
|
||||
- **100% offline** — no account, no cloud, no data leaves your machine
|
||||
- **Local AI** — filler detection, clip suggestions, Smart Clean work offline
|
||||
- **Word-level precision** — edit video by deleting words, not razor cuts
|
||||
- **Per-segment re-transcription** — fix transcription errors on just the bad part
|
||||
- **Auto-ducking background music** — music lowers when speech detected, no keyframing
|
||||
- **Works on long files** — virtualized transcript + chunked waveform handles 1hr+
|
||||
|
||||
---
|
||||
|
||||
## 🚫 Explicitly deferred
|
||||
|
||||
- Cloud sync / collaboration
|
||||
- Voice cloning / TTS
|
||||
- Full multi-track NLE (compositing, keyframes, nested sequences)
|
||||
- Mobile app
|
||||
- Subscription model
|
||||
- Image/video generation models
|
||||
|
||||
TalkEdit's advantage is that it isn't a timeline editor — the text-is-the-timeline model makes spoken-word editing drastically faster than dragging razor cuts.
|
||||
|
||||
---
|
||||
|
||||
## 📦 Launch checklist
|
||||
|
||||
- [ ] Landing page at talked.it (features, screenshots, pricing, downloads)
|
||||
- [ ] Demo video (3–5 min walkthrough)
|
||||
- [ ] Product Hunt listing + 50 free licenses
|
||||
- [ ] r/podcasting, r/VideoEditing, r/selfhosted posts
|
||||
- [ ] Hacker News "Show HN"
|
||||
- [ ] GitHub v1.0.0 release with Windows/macOS/Linux binaries
|
||||
- [ ] Compare page: TalkEdit vs Descript
|
||||
52
FFmpeg_COMPLIANCE.md
Normal file
52
FFmpeg_COMPLIANCE.md
Normal file
@@ -0,0 +1,52 @@
|
||||
# FFmpeg Compliance Checklist
|
||||
|
||||
Purpose: quick, practical checklist to ensure your TalkEdit distribution complies with FFmpeg licensing and packaging requirements.
|
||||
|
||||
1) Choose the FFmpeg build strategy
|
||||
- Prefer an LGPL-only build (no GPL-only encoders) for minimal obligations.
|
||||
- If you require GPL encoders (x264/x265/fdk-aac), document the decision and prepare to comply with GPL obligations.
|
||||
|
||||
2) Linking vs external binary
|
||||
- Prefer spawning an external `ffmpeg` binary from Rust (invoke process) rather than statically linking FFmpeg into your app.
|
||||
- If you link or bundle as a library, treat it as a third-party component and follow license terms strictly.
|
||||
|
||||
3) Bundling binary in installers
|
||||
- If bundling `ffmpeg` binaries in installers, include the appropriate license files (COPYING.LGPL, COPYING.GPL) in the installer and app About/Legal.
|
||||
- Include a plain-language notice in the installer/readme that explains which codecs/encoders are present and any implications.
|
||||
|
||||
4) Source & build-info disclosure
|
||||
- For GPL components, you must provide access to the corresponding source or provide a written offer. Record the exact FFmpeg commit/configure flags used.
|
||||
- Add a `third_party/ffmpeg/BUILD_INFO.txt` in the repo (or in release artifacts) containing:
|
||||
- FFmpeg git commit or version
|
||||
- configure flags used
|
||||
- date and builder identity (automated CI username)
|
||||
- link to the exact source tarball or repo snapshot
|
||||
|
||||
5) Make GPL components opt-in
|
||||
- Default distribution: ship LGPL-only binary or no binary and invoke system `ffmpeg` when available.
|
||||
- Offer an optional "codec pack" download or advanced installer that includes GPL encoders; make users explicitly accept terms before download.
|
||||
|
||||
6) Patent/licensing notice for codecs
|
||||
- Add a short note in the README/installer explaining that certain codecs (H.264/AAC) may be patent-encumbered and that distributors may require separate licensing.
|
||||
|
||||
7) Platform-specific recommendations
|
||||
- Linux: Prefer calling system FFmpeg (packaged by distro) or instruct users to install via package manager. If bundling, consider AppImage guidance.
|
||||
- macOS: Prefer Homebrew/optional download; if bundling, include license files and sign/notarize appropriately.
|
||||
- Windows: If shipping `ffmpeg.exe`, include license files and a link to the source/build info; include checksums for shipped binaries.
|
||||
|
||||
8) Build automation & compliance artifacts
|
||||
- Add a CI step that builds or fetches the FFmpeg binary, captures `ffmpeg -buildconf`, and writes `BUILD_INFO.txt` into the release artifacts.
|
||||
- Produce a LICENSES folder in each installer containing FFmpeg license text and any third-party license texts used by your chosen build.
|
||||
|
||||
9) User-visible legal UI
|
||||
- Add an About > Legal pane listing third-party components and linked license files.
|
||||
- If downloading binaries on first run, show an explicit notice with a link to license and source information and require an OK from the user.
|
||||
|
||||
10) Pre-release legal checklist
|
||||
- Verify whether chosen build enables GPL libraries; if yes, prepare source or written offer before publishing.
|
||||
- Ensure installer contains license files and links to source/build-info.
|
||||
- Add a short FAQ entry about codec patents and user options.
|
||||
|
||||
Notes & Next Steps
|
||||
- This checklist is practical guidance, not legal advice. For final release compliance, consult legal counsel experienced in open-source licensing.
|
||||
- I can add a small CI script snippet that records `ffmpeg -buildconf` and uploads `BUILD_INFO.txt` to release assets — tell me which CI you use and I'll draft it.
|
||||
@@ -1,105 +0,0 @@
|
||||
# Gemini Insights: OBS Recording Transcriber
|
||||
|
||||
## Project Overview
|
||||
The OBS Recording Transcriber is a Python application built with Streamlit that processes video recordings (particularly from OBS Studio) to generate transcripts and summaries using AI models. The application uses Whisper for transcription and Hugging Face Transformers for summarization.
|
||||
|
||||
## Key Improvement Areas
|
||||
|
||||
### 1. UI Enhancements
|
||||
- **Implemented:**
|
||||
- Responsive layout with columns for better organization
|
||||
- Expanded sidebar with categorized settings
|
||||
- Custom CSS for improved button styling
|
||||
- Spinner for long-running operations
|
||||
- Expanded transcript view by default
|
||||
|
||||
- **Additional Recommendations:**
|
||||
- Add a dark mode toggle
|
||||
- Implement progress bars for each processing step
|
||||
- Add tooltips for complex options
|
||||
- Create a dashboard view for batch processing results
|
||||
- Add visualization of transcript segments with timestamps
|
||||
|
||||
### 2. Ollama Local API Integration
|
||||
- **Implemented:**
|
||||
- Local API integration for offline summarization
|
||||
- Model selection from available Ollama models
|
||||
- Chunking for long texts
|
||||
- Fallback to online models when Ollama fails
|
||||
|
||||
- **Additional Recommendations:**
|
||||
- Add temperature and other generation parameters as advanced options
|
||||
- Implement streaming responses for real-time feedback
|
||||
- Cache results to avoid reprocessing
|
||||
- Add support for custom Ollama model creation with specific instructions
|
||||
- Implement parallel processing for multiple chunks
|
||||
|
||||
### 3. Subtitle Export Formats
|
||||
- **Implemented:**
|
||||
- SRT export with proper formatting
|
||||
- ASS export with basic styling
|
||||
- Multi-format export options
|
||||
- Automatic segment creation from plain text
|
||||
|
||||
- **Additional Recommendations:**
|
||||
- Add customizable styling options for ASS subtitles
|
||||
- Implement subtitle editing before export
|
||||
- Add support for VTT format for web videos
|
||||
- Implement subtitle timing adjustment
|
||||
- Add batch export for multiple files
|
||||
|
||||
### 4. Architecture and Code Quality
|
||||
- **Recommendations:**
|
||||
- Implement proper error handling and logging throughout
|
||||
- Add unit tests for critical components
|
||||
- Create a configuration file for default settings
|
||||
- Implement caching for processed files
|
||||
- Add type hints throughout the codebase
|
||||
- Document API endpoints for potential future web service
|
||||
|
||||
### 5. Performance Optimizations
|
||||
- **Recommendations:**
|
||||
- Implement parallel processing for batch operations
|
||||
- Add GPU acceleration configuration options
|
||||
- Optimize memory usage for large files
|
||||
- Implement incremental processing for very long recordings
|
||||
- Add compression options for exported files
|
||||
|
||||
### 6. Additional Features
|
||||
- **Recommendations:**
|
||||
- Speaker diarization (identifying different speakers)
|
||||
- Language detection and translation
|
||||
- Keyword extraction and timestamp linking
|
||||
- Integration with video editing software
|
||||
- Batch processing queue with email notifications
|
||||
- Custom vocabulary for domain-specific terminology
|
||||
|
||||
## Implementation Roadmap
|
||||
1. **Phase 1 (Completed):** Basic UI improvements, Ollama integration, and subtitle export
|
||||
2. **Phase 2 (Completed):** Performance optimizations and additional export formats
|
||||
- Added WebVTT export format for web videos
|
||||
- Implemented GPU acceleration with automatic device selection
|
||||
- Added caching system for faster processing of previously transcribed files
|
||||
- Optimized memory usage with configurable memory limits
|
||||
- Added compression options for exported files
|
||||
- Enhanced ASS subtitle styling options
|
||||
- Added progress indicators for better user feedback
|
||||
3. **Phase 3 (Completed):** Advanced features like speaker diarization and translation
|
||||
- Implemented speaker diarization to identify different speakers in recordings
|
||||
- Added language detection and translation capabilities
|
||||
- Integrated keyword extraction with timestamp linking
|
||||
- Created interactive transcript with keyword highlighting
|
||||
- Added named entity recognition for better content analysis
|
||||
- Generated keyword index with timestamp references
|
||||
- Provided speaker statistics and word count analysis
|
||||
4. **Phase 4:** Integration with other tools and services
|
||||
|
||||
## Technical Considerations
|
||||
- Ensure compatibility with different Whisper model sizes
|
||||
- Handle large files efficiently to prevent memory issues
|
||||
- Provide graceful degradation when optional dependencies are missing
|
||||
- Maintain backward compatibility with existing workflows
|
||||
- Consider containerization for easier deployment
|
||||
|
||||
## Conclusion
|
||||
The OBS Recording Transcriber has a solid foundation but can be significantly enhanced with the suggested improvements. The focus should be on improving user experience, adding offline processing capabilities, and expanding export options to make the tool more versatile for different use cases.
|
||||
141
INSTALLATION.md
141
INSTALLATION.md
@ -1,141 +0,0 @@
|
||||
# Installation Guide for OBS Recording Transcriber
|
||||
|
||||
This guide will help you install all the necessary dependencies for the OBS Recording Transcriber application, including the advanced features from Phase 3.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Before installing the Python packages, you need to set up some prerequisites:
|
||||
|
||||
### 1. Python 3.8 or higher
|
||||
|
||||
Make sure you have Python 3.8 or higher installed. You can download it from [python.org](https://www.python.org/downloads/).
|
||||
|
||||
### 2. FFmpeg
|
||||
|
||||
FFmpeg is required for audio processing:
|
||||
|
||||
- **Windows**:
|
||||
- Download from [gyan.dev/ffmpeg/builds](https://www.gyan.dev/ffmpeg/builds/)
|
||||
- Extract the ZIP file
|
||||
- Add the `bin` folder to your system PATH
|
||||
|
||||
- **macOS**:
|
||||
```bash
|
||||
brew install ffmpeg
|
||||
```
|
||||
|
||||
- **Linux**:
|
||||
```bash
|
||||
sudo apt update
|
||||
sudo apt install ffmpeg
|
||||
```
|
||||
|
||||
### 3. Visual C++ Build Tools (Windows only)
|
||||
|
||||
Some packages like `tokenizers` require C++ build tools:
|
||||
|
||||
1. Download and install [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)
|
||||
2. During installation, select "Desktop development with C++"
|
||||
|
||||
## Installation Steps
|
||||
|
||||
### 1. Create a Virtual Environment (Recommended)
|
||||
|
||||
```bash
|
||||
# Create a virtual environment
|
||||
python -m venv venv
|
||||
|
||||
# Activate the virtual environment
|
||||
# Windows
|
||||
venv\Scripts\activate
|
||||
# macOS/Linux
|
||||
source venv/bin/activate
|
||||
```
|
||||
|
||||
### 2. Install PyTorch
|
||||
|
||||
For better performance, install PyTorch with CUDA support if you have an NVIDIA GPU:
|
||||
|
||||
```bash
|
||||
# Windows/Linux with CUDA
|
||||
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
||||
|
||||
# macOS or CPU-only
|
||||
pip install torch torchvision torchaudio
|
||||
```
|
||||
|
||||
### 3. Install Dependencies
|
||||
|
||||
```bash
|
||||
# Install all dependencies from requirements.txt
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
### 4. Troubleshooting Common Issues
|
||||
|
||||
#### Tokenizers Installation Issues
|
||||
|
||||
If you encounter issues with `tokenizers` installation:
|
||||
|
||||
1. Make sure you have Visual C++ Build Tools installed (Windows)
|
||||
2. Try installing Rust: [rustup.rs](https://rustup.rs/)
|
||||
3. Install tokenizers separately:
|
||||
```bash
|
||||
pip install tokenizers --no-binary tokenizers
|
||||
```
|
||||
|
||||
#### PyAnnote.Audio Access
|
||||
|
||||
To use speaker diarization, you need a HuggingFace token with access to the pyannote models:
|
||||
|
||||
1. Create an account on [HuggingFace](https://huggingface.co/)
|
||||
2. Generate an access token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
|
||||
3. Request access to [pyannote/speaker-diarization-3.0](https://huggingface.co/pyannote/speaker-diarization-3.0)
|
||||
4. Set the token in the application when prompted or as an environment variable:
|
||||
```bash
|
||||
# Windows
|
||||
set HF_TOKEN=your_token_here
|
||||
# macOS/Linux
|
||||
export HF_TOKEN=your_token_here
|
||||
```
|
||||
|
||||
#### Memory Issues with Large Files
|
||||
|
||||
If you encounter memory issues with large files:
|
||||
|
||||
1. Use a smaller Whisper model (e.g., "base" instead of "large")
|
||||
2. Reduce the GPU memory fraction in the application settings
|
||||
3. Increase your system's swap space/virtual memory
|
||||
|
||||
## Running the Application
|
||||
|
||||
After installation, run the application with:
|
||||
|
||||
```bash
|
||||
streamlit run app.py
|
||||
```
|
||||
|
||||
## Optional: Ollama Setup for Local Summarization
|
||||
|
||||
To use Ollama for local summarization:
|
||||
|
||||
1. Install Ollama from [ollama.ai](https://ollama.ai/)
|
||||
2. Pull a model:
|
||||
```bash
|
||||
ollama pull llama3
|
||||
```
|
||||
3. Uncomment the Ollama line in requirements.txt and install:
|
||||
```bash
|
||||
pip install ollama
|
||||
```
|
||||
|
||||
## Verifying Installation
|
||||
|
||||
To verify that all components are working correctly:
|
||||
|
||||
1. Run the application
|
||||
2. Check that GPU acceleration is available (if applicable)
|
||||
3. Test a small video file with basic transcription
|
||||
4. Gradually enable advanced features like diarization and translation
|
||||
|
||||
If you encounter any issues, check the application logs for specific error messages.
|
||||
2
LICENSE
2
LICENSE
@ -1,6 +1,6 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2025 DataAnts-AI
|
||||
Copyright (c) 2026 DataAnts AI
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
63
QUICK-FIX.md
63
QUICK-FIX.md
@ -1,63 +0,0 @@
|
||||
# 🚨 Quick Fix for PyTorch Compatibility Error
|
||||
|
||||
If you're seeing the `torch.compiler.disable` error, here's how to fix it:
|
||||
|
||||
## Immediate Fix
|
||||
|
||||
```bash
|
||||
# Stop the current container
|
||||
docker-compose down
|
||||
|
||||
# Remove the old image to force rebuild with fixed versions
|
||||
docker rmi $(docker images | grep videotranscriber | awk '{print $3}')
|
||||
|
||||
# Rebuild with fixed dependencies
|
||||
docker-compose up -d --build
|
||||
```
|
||||
|
||||
## Better Solution: Use Prebuilt Images
|
||||
|
||||
⚠️ **Note**: GitHub Actions had a naming issue that's now fixed. See [FIX-GITHUB-ACTIONS.md](FIX-GITHUB-ACTIONS.md) for details.
|
||||
|
||||
Once prebuilt images are available, use them instead:
|
||||
|
||||
```bash
|
||||
# Check if images are ready
|
||||
docker pull ghcr.io/dataants-ai/videotranscriber:latest
|
||||
|
||||
# If successful, stop current container and use prebuilt image
|
||||
docker-compose down
|
||||
docker-compose -f docker-compose.prebuilt.yml up -d
|
||||
```
|
||||
|
||||
## What Was Fixed
|
||||
|
||||
1. **Version Pinning**: Updated `requirements.txt` with compatible versions:
|
||||
- `torch==2.0.1` (was `>=1.7.0`)
|
||||
- `pytorch-lightning==2.0.6` (compatible with torch 2.0.1)
|
||||
- `pyannote.audio==3.1.1` (updated to compatible version)
|
||||
|
||||
2. **Build Process**: Removed duplicate PyTorch installation that could cause conflicts
|
||||
|
||||
3. **Prebuilt Images**: Created GitHub Actions to build reliable, tested images
|
||||
|
||||
## Verification
|
||||
|
||||
After fixing, you should see the Streamlit app load without errors at `http://localhost:8501`
|
||||
|
||||
## If Still Having Issues
|
||||
|
||||
1. **Clear Docker cache**:
|
||||
```bash
|
||||
docker system prune -a
|
||||
```
|
||||
|
||||
2. **Check logs**:
|
||||
```bash
|
||||
docker-compose logs -f
|
||||
```
|
||||
|
||||
3. **Manual rebuild**:
|
||||
```bash
|
||||
docker build --no-cache -t videotranscriber .
|
||||
```
|
||||
308
README.md
308
README.md
@ -1,198 +1,176 @@
|
||||
# Video Transcriber
|
||||
# TalkEdit
|
||||
|
||||
## Project Overview
|
||||
The Video Recording Transcriber is a Python application built with Streamlit that processes video and audio recordings to generate transcripts and summaries using AI models. The application uses Whisper for transcription and Hugging Face Transformers for summarization.
|
||||
**Edit video by editing text.** An offline, local-first desktop video editor where deleting a word from the transcript cuts it from the video.
|
||||
|
||||
**Supported Formats**: MP4, AVI, MOV, MKV (video) and M4A (audio)
|
||||
<img width="1034" height="661" alt="TalkEdit screenshot" src="https://github.com/user-attachments/assets/b1ed9505-792e-42ca-bb73-85458d0f02a5" />
|
||||
|
||||
---
|
||||
|
||||

|
||||
## Features
|
||||
|
||||
Demo here
|
||||
- **Text-based editing** — delete, reorder, or correct words in the transcript to edit the underlying video. No razor tool, no timeline slicing.
|
||||
- **Word-level transcription** — Whisper.cpp with per-word timestamps and confidence scores. Low-confidence words get a visual warning.
|
||||
- **Four zone types** — Cut, Mute, Sound Gain, and Speed Adjust. Create zones on the waveform timeline and drag edges to refine.
|
||||
- **Waveform timeline** — zoomable, scrollable waveform with playhead scrubbing, zone visualization, markers, chapters, and thumbnail strips.
|
||||
- **AI-powered editing**
|
||||
- Filler word detection and removal
|
||||
- Smart Clean: one-click filler removal + silence trim + noise reduction + loudness normalization
|
||||
- Clip suggestions for social media shorts
|
||||
- Sentence rephrase with AI alternatives
|
||||
- Supports **Ollama** (local), **OpenAI**, and **Claude** backends
|
||||
- **Background music** — import a second audio track with auto-ducking via sidechain compression.
|
||||
- **Export** — fast stream-copy or full re-encode to MP4, MOV, WebM, or WAV. Resolution up to 4K.
|
||||
- **Captions** — generate SRT, VTT, or burn-in ASS subtitles with configurable font, color, and position.
|
||||
- **Speaker diarization** — identify and label multiple speakers.
|
||||
- **Audio tools** — noise reduction (DeepFilterNet), loudness normalization (LUFS targeting), background removal (MediaPipe), batch silence removal, video zoom/punch-in.
|
||||
- **Project save/load** — `.aive` JSON format preserves all edits, zones, markers, and AI config.
|
||||
- **Customizable hotkeys** — two presets (Standard / Left-hand) with per-key remapping and conflict detection.
|
||||
- **100% offline, no account required** — everything runs on your machine. No telemetry, no cloud dependency.
|
||||
- **7-day free trial** with one-time license key purchase. No subscription.
|
||||
|
||||
https://github.com/user-attachments/assets/990e63fc-232e-46a0-afdf-ca8836d46a13
|
||||
---
|
||||
|
||||
## Tech Stack
|
||||
|
||||
## Installation
|
||||
| Layer | Technology |
|
||||
|-------|------------|
|
||||
| Desktop shell | **Tauri 2.0** (Rust) |
|
||||
| Frontend | **React** + **TypeScript** + **Tailwind CSS** |
|
||||
| State management | **Zustand** with Zundo undo/redo |
|
||||
| Transcription | **Whisper.cpp** (word-level timestamps) |
|
||||
| AI / LLM | **Ollama**, **OpenAI**, **Claude** (plugable backends) |
|
||||
| Media processing | **FFmpeg** |
|
||||
| Python services | **FastAPI** (spawned as a child process) |
|
||||
|
||||
### 🐳 Docker Installation (Recommended)
|
||||
---
|
||||
|
||||
**Benefits**: Isolated environment, no dependency conflicts, easy deployment
|
||||
## Quick Start
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- **Node.js** 18+
|
||||
- **Python** 3.10+
|
||||
- **FFmpeg** (in PATH)
|
||||
- **Rust** toolchain (for Tauri)
|
||||
- **Ollama** (optional, for local AI features)
|
||||
|
||||
### Install
|
||||
|
||||
#### Option A: Prebuilt Images (Fastest & Most Reliable)
|
||||
```bash
|
||||
# 1. Clone repository for config files
|
||||
git clone https://github.com/DataAnts-AI/VideoTranscriber.git
|
||||
cd VideoTranscriber
|
||||
# Root and frontend dependencies
|
||||
npm install
|
||||
cd frontend && npm install && cd ..
|
||||
|
||||
# 2. Setup environment
|
||||
cp docker.env.example .env
|
||||
# Edit .env with your video directory paths
|
||||
|
||||
# 3. Ensure Ollama is running on host
|
||||
ollama serve # In separate terminal
|
||||
ollama pull llama3
|
||||
|
||||
# 4. Start with prebuilt image
|
||||
docker-compose -f docker-compose.prebuilt.yml up -d
|
||||
|
||||
# 5. Access application
|
||||
# Open browser to: http://localhost:8501
|
||||
# Backend dependencies
|
||||
cd backend && pip install -r requirements.txt && cd ..
|
||||
```
|
||||
|
||||
#### Option B: Build from Source (Development)
|
||||
### Run (Development)
|
||||
|
||||
```bash
|
||||
# Use the local build approach
|
||||
docker-compose up -d
|
||||
# Start everything: backend + frontend + Tauri
|
||||
npm run dev:tauri
|
||||
```
|
||||
|
||||
See [DOCKER.md](DOCKER.md) for complete Docker setup guide.
|
||||
Or run components separately:
|
||||
|
||||
### Easy Installation (Recommended)
|
||||
```bash
|
||||
# Terminal 1: Python backend
|
||||
npm run dev:backend
|
||||
|
||||
#### Windows
|
||||
1. Download or clone the repository
|
||||
2. Run `install.bat` by double-clicking it
|
||||
3. Follow the on-screen instructions
|
||||
|
||||
#### Linux/macOS
|
||||
1. Download or clone the repository
|
||||
2. Open a terminal in the project directory
|
||||
3. Make the install script executable: `chmod +x install.sh`
|
||||
4. Run the script: `./install.sh`
|
||||
5. Follow the on-screen instructions
|
||||
|
||||
### Manual Installation
|
||||
1. Clone the repo.
|
||||
```
|
||||
git clone https://github.com/DataAnts-AI/VideoTranscriber.git
|
||||
cd VideoTranscriber
|
||||
# Terminal 2: Frontend + Tauri
|
||||
cd frontend && cargo tauri dev
|
||||
```
|
||||
|
||||
2. Install dependencies:
|
||||
```
|
||||
pip install -r requirements.txt
|
||||
### Build
|
||||
|
||||
```bash
|
||||
npm run build:tauri
|
||||
```
|
||||
|
||||
Notes:
|
||||
- Ensure that the versions align with the features you use and your system compatibility.
|
||||
- torch version should match the capabilities of your hardware (e.g., CUDA support for GPUs).
|
||||
- For advanced features like speaker diarization, you'll need a HuggingFace token.
|
||||
- See `INSTALLATION.md` for detailed instructions and troubleshooting.
|
||||
---
|
||||
|
||||
## Project Structure
|
||||
|
||||
3. Run the application:
|
||||
```
|
||||
streamlit run app.py
|
||||
talkedit/
|
||||
├── src-tauri/ # Tauri 2.0 Rust runtime
|
||||
│ ├── Cargo.toml
|
||||
│ └── src/
|
||||
│ ├── main.rs # App entry, backend spawner
|
||||
│ ├── lib.rs # Command handlers (IPC bridge)
|
||||
│ ├── transcription.rs # Whisper.cpp integration
|
||||
│ ├── video_editor.rs # FFmpeg-based editing
|
||||
│ ├── caption_generator.rs
|
||||
│ ├── diarization.rs
|
||||
│ ├── ai_provider.rs # Ollama / OpenAI / Claude
|
||||
│ ├── audio_cleaner.rs
|
||||
│ ├── background_removal.rs
|
||||
│ ├── licensing.rs # Trial + key activation
|
||||
│ ├── models.rs # Shared data types
|
||||
│ └── paths.rs
|
||||
├── frontend/ # React + Vite + Tailwind
|
||||
│ └── src/
|
||||
│ ├── components/ # UI components
|
||||
│ │ ├── TranscriptEditor.tsx
|
||||
│ │ ├── WaveformTimeline.tsx
|
||||
│ │ ├── VideoPlayer.tsx
|
||||
│ │ ├── AIPanel.tsx
|
||||
│ │ ├── ExportDialog.tsx
|
||||
│ │ ├── SettingsPanel.tsx
|
||||
│ │ ├── BackgroundMusicPanel.tsx
|
||||
│ │ ├── MarkersPanel.tsx
|
||||
│ │ ├── ZoneEditor.tsx
|
||||
│ │ ├── SilenceTrimmerPanel.tsx
|
||||
│ │ ├── AppendClipPanel.tsx
|
||||
│ │ ├── LicenseDialog.tsx
|
||||
│ │ └── DevPanel.tsx
|
||||
│ ├── store/ # Zustand state (editorStore, aiStore, settingsStore)
|
||||
│ ├── hooks/ # Custom React hooks
|
||||
│ ├── lib/ # Utilities and Tauri bridge
|
||||
│ └── types/ # TypeScript interfaces
|
||||
├── backend/ # FastAPI Python services
|
||||
│ ├── main.py
|
||||
│ ├── routers/ # API endpoints
|
||||
│ │ ├── transcribe.py
|
||||
│ │ ├── ai.py
|
||||
│ │ ├── audio.py
|
||||
│ │ ├── captions.py
|
||||
│ │ └── export.py
|
||||
│ ├── services/ # Core logic
|
||||
│ ├── video_editor.py
|
||||
│ ├── caption_generator.py
|
||||
│ ├── ai_provider.py
|
||||
│ ├── diarization.py
|
||||
│ ├── audio_cleaner.py
|
||||
│ ├── background_removal.py
|
||||
│ └── license_server.py
|
||||
├── shared/ # Schema definitions (project format)
|
||||
├── models/ # Whisper model storage
|
||||
└── docs/ # Documentation
|
||||
```
|
||||
|
||||
## Usage
|
||||
1. Set your base folder where video/audio recordings are stored
|
||||
2. Select a recording from the dropdown (supports MP4, AVI, MOV, MKV, M4A)
|
||||
3. Choose transcription and summarization models
|
||||
4. Configure performance settings (GPU acceleration, caching)
|
||||
5. Select export formats and compression options
|
||||
6. Click "Process Recording" to start
|
||||
---
|
||||
|
||||
## Advanced Features
|
||||
- **Speaker Diarization**: Identify and label different speakers in your recordings
|
||||
- **Translation**: Automatically detect language and translate to multiple languages
|
||||
- **Keyword Extraction**: Extract important keywords with timestamp links
|
||||
- **Interactive Transcript**: Navigate through the transcript with keyword highlighting
|
||||
- **GPU Acceleration**: Utilize your GPU for faster processing
|
||||
- **Caching**: Save processing time by caching results
|
||||
## Keyboard Shortcuts
|
||||
|
||||
| Key | Action |
|
||||
|-----|--------|
|
||||
| Space | Play / Pause |
|
||||
| J / K / L | Reverse / Pause / Forward |
|
||||
| I / O | Mark In / Mark Out |
|
||||
| ← / → | Seek ±5 seconds |
|
||||
| Delete | Delete selected words or zones |
|
||||
| Ctrl+Z | Undo |
|
||||
| Ctrl+Shift+Z | Redo |
|
||||
| Ctrl+S | Save project |
|
||||
| Ctrl+E | Export |
|
||||
| Ctrl+F | Search transcript |
|
||||
| Ctrl+Scroll | Zoom waveform |
|
||||
| ? | Shortcut cheatsheet |
|
||||
|
||||
---
|
||||
|
||||
## Key Improvement Areas
|
||||
## License
|
||||
|
||||
### 1. UI Enhancements
|
||||
- **Implemented:**
|
||||
- Responsive layout with columns for better organization
|
||||
- Expanded sidebar with categorized settings
|
||||
- Custom CSS for improved button styling
|
||||
- Spinner for long-running operations
|
||||
- Expanded transcript view by default
|
||||
|
||||
- **Additional Recommendations:**
|
||||
- Add a dark mode toggle
|
||||
- Implement progress bars for each processing step
|
||||
- Add tooltips for complex options
|
||||
- Create a dashboard view for batch processing results
|
||||
- Add visualization of transcript segments with timestamps
|
||||
|
||||
### 2. Ollama Local API Integration
|
||||
- **Implemented:**
|
||||
- Local API integration for offline summarization
|
||||
- Model selection from available Ollama models
|
||||
- Chunking for long texts
|
||||
- Fallback to online models when Ollama fails
|
||||
|
||||
- **Additional Recommendations:**
|
||||
- Add temperature and other generation parameters as advanced options
|
||||
- Implement streaming responses for real-time feedback
|
||||
- Cache results to avoid reprocessing
|
||||
- Add support for custom Ollama model creation with specific instructions
|
||||
- Implement parallel processing for multiple chunks
|
||||
|
||||
### 3. Subtitle Export Formats
|
||||
- **Implemented:**
|
||||
- SRT export with proper formatting
|
||||
- ASS export with basic styling
|
||||
- Multi-format export options
|
||||
- Automatic segment creation from plain text
|
||||
|
||||
- **Additional Recommendations:**
|
||||
- Add customizable styling options for ASS subtitles
|
||||
- Implement subtitle editing before export
|
||||
- Add support for VTT format for web videos
|
||||
- Implement subtitle timing adjustment
|
||||
- Add batch export for multiple files
|
||||
|
||||
### 4. Architecture and Code Quality
|
||||
- **Recommendations:**
|
||||
- Implement proper error handling and logging throughout
|
||||
- Add unit tests for critical components
|
||||
- Create a configuration file for default settings
|
||||
- Implement caching for processed files
|
||||
- Add type hints throughout the codebase
|
||||
- Document API endpoints for potential future web service
|
||||
|
||||
### 5. Performance Optimizations
|
||||
- **Recommendations:**
|
||||
- Implement parallel processing for batch operations
|
||||
- Add GPU acceleration configuration options
|
||||
- Optimize memory usage for large files
|
||||
- Implement incremental processing for very long recordings
|
||||
- Add compression options for exported files
|
||||
|
||||
### 6. Additional Features
|
||||
- **Recommendations:**
|
||||
- Speaker diarization (identifying different speakers)
|
||||
- Language detection and translation
|
||||
- Keyword extraction and timestamp linking
|
||||
- Integration with video editing software
|
||||
- Batch processing queue with email notifications
|
||||
- Custom vocabulary for domain-specific terminology
|
||||
|
||||
## Implementation Roadmap
|
||||
1. **Phase 1 (Completed):** Basic UI improvements, Ollama integration, and subtitle export
|
||||
2. **Phase 2 (Completed):** Performance optimizations and additional export formats
|
||||
- Added WebVTT export format for web videos
|
||||
- Implemented GPU acceleration with automatic device selection
|
||||
- Added caching system for faster processing of previously transcribed files
|
||||
- Optimized memory usage with configurable memory limits
|
||||
- Added compression options for exported files
|
||||
- Enhanced ASS subtitle styling options
|
||||
- Added progress indicators for better user feedback
|
||||
3. **Phase 3 (Completed):** Advanced features like speaker diarization and translation
|
||||
- Implemented speaker diarization to identify different speakers in recordings
|
||||
- Added language detection and translation capabilities
|
||||
- Integrated keyword extraction with timestamp linking
|
||||
- Created interactive transcript with keyword highlighting
|
||||
- Added named entity recognition for better content analysis
|
||||
- Generated keyword index with timestamp references
|
||||
- Provided speaker statistics and word count analysis
|
||||
4. **Phase 4:** Integration with other tools and services (In progress)
|
||||
|
||||
|
||||
Reach out to support@dataants.org if you need assistance with any AI solutions - we offer support for n8n workflows, local RAG chatbots, and ERP and Financial reporting.
|
||||
Source code is MIT — see [LICENSE](LICENSE) for details. The distributed binary includes a 7-day free trial requiring a one-time license key purchase for continued use.
|
||||
|
||||
83
TECH_FEATURES.md
Normal file
83
TECH_FEATURES.md
Normal file
@ -0,0 +1,83 @@
|
||||
# TalkEdit — Tech Stack, Tools, and Features
|
||||
|
||||
This document summarizes the chosen technology, tooling, the full feature set, recommended additions, and items on the back burner.
|
||||
|
||||
## Overview
|
||||
- Goal: Offline, local text-based audio/video editor (Descript-style) focused on spoken-word creators (podcasters, YouTubers). Fast, privacy-first, single-file installer.
|
||||
|
||||
## Tech Stack
|
||||
- Frontend: React 19 + Vite + TypeScript + Tailwind CSS + Zustand (with zundo undo/redo) + Virtuoso (virtualized transcript)
|
||||
- Backend: Tauri 2.0 (Rust) for file I/O, licensing crypto (Ed25519), model management, error logging
|
||||
- Transcription: Python faster-whisper with WhisperX for word-level alignment. Models downloaded on demand.
|
||||
- Audio/Video Processing: FFmpeg invoked from Rust via Python scripts (video_editor.py, audio_cleaner.py, caption_generator.py)
|
||||
- AI: Ollama, OpenAI, Claude through Python ai_provider.py. Bundled Qwen3 LLM planned.
|
||||
- State: Zustand (in-frontend store) + zundo middleware for undo/redo history
|
||||
- Packaging: Tauri `tauri build` for cross-platform installers
|
||||
|
||||
## Developer Tools
|
||||
- Rust toolchain (cargo, rustc)
|
||||
- Node.js + npm for frontend
|
||||
- Python 3.11+ (faster-whisper, WhisperX, AI providers)
|
||||
- FFmpeg binaries (platform-specific; bundled or downloaded at install)
|
||||
- Build/test: Tauri CLI, Vite dev server
|
||||
- Testing: Vitest (frontend), cargo test (Rust), pytest (Python)
|
||||
- CI: GitHub Actions (Rust clippy/test, Frontend tsc/vitest, Python pytest)
|
||||
|
||||
## Implemented Features
|
||||
|
||||
- [x] 1. Media import via file dialog (audio/video auto audio-extract)
|
||||
- [x] 2. One-click local transcription with model selector (tiny/base → larger models) and model-size chooser
|
||||
- [x] 3. Scrollable, Google-Doc-style transcript editor (Virtuoso virtualized)
|
||||
- Click word → seek video/audio
|
||||
- Select words → cut corresponding media segment (smart 150–250ms fades)
|
||||
- [x] 4. Smart Cleanup
|
||||
- Filler word removal (configurable list per-project)
|
||||
- Silence trimming
|
||||
- [x] 5. Audio Polish chain (FFmpeg): normalize, compression, noise reduction
|
||||
- [x] 6. Preview with synced playback, undo/redo (zundo), project save/load
|
||||
- [x] 7. Export MP4/audio with SRT/VTT/ASS captions (speaker-labeled)
|
||||
- [x] 8. Speaker diarization
|
||||
- [x] 9. Custom filler lists per-project
|
||||
- [x] 10. Background music with auto-ducking
|
||||
- [x] 11. Append clips (concatenation)
|
||||
- [x] 12. Settings: AI provider config (Ollama, OpenAI, Claude)
|
||||
- [x] 13. Keyboard shortcuts with custom remapping
|
||||
- [x] 14. Help panel + cheatsheet
|
||||
- [x] 15. 7-day licensing with Ed25519-signed license keys
|
||||
|
||||
## Recommended Additions (near-term, high ROI)
|
||||
|
||||
- [ ] Local GPU/CPU detection & recommended model/settings UI
|
||||
- [ ] Per-project incremental transcription: re-run only edited segments
|
||||
- [ ] "Preview cleaning" dry-run that highlights candidate removals before applying
|
||||
- [ ] Export size/time estimator and suggested export presets
|
||||
- [ ] Accessibility export presets (podcast vs YouTube presets)
|
||||
- [ ] Bundled Qwen3 LLM for offline AI features
|
||||
|
||||
## Remove / Defer (Back Burner)
|
||||
These broaden scope or add legal/privacy surface — defer for now.
|
||||
|
||||
- Voice cloning / TTS: DEFER
|
||||
- Multi-track, full timeline NLE features: DEFER
|
||||
- Real-time collaboration / cloud sync: DEFER
|
||||
- Built-in cloud processing by default: DEFER (make optional add-on later)
|
||||
|
||||
## Risks & Mitigations
|
||||
- Large model sizes: don't bundle large models; download on-demand and document storage location.
|
||||
- Timestamp accuracy: WhisperX word-level alignment + manual per-segment re-run available.
|
||||
- FFmpeg packaging/licensing: ship platform-specific binaries or use Tauri bundling guidance; document license compliance.
|
||||
|
||||
## Prioritized Quick Wins
|
||||
1. Per-project incremental transcription
|
||||
2. "Preview cleaning" dry-run UI
|
||||
3. Export presets (podcast vs YouTube)
|
||||
|
||||
## Next Steps for Implementation
|
||||
- Bundle Qwen3 LLM for offline AI processing.
|
||||
- Implement incremental transcription to speed up re-editing workflows.
|
||||
- Add export presets and size estimation.
|
||||
- Improve GPU/CPU detection and model recommendations.
|
||||
|
||||
---
|
||||
|
||||
Generated to capture tech, tools, implemented features, and the recommended add/remove/defer list.
|
||||
544
app.py
544
app.py
@ -1,544 +0,0 @@
|
||||
import streamlit as st
|
||||
from utils.audio_processing import extract_audio
|
||||
from utils.transcription import transcribe_audio
|
||||
from utils.summarization import summarize_text
|
||||
from utils.validation import validate_environment
|
||||
from utils.export import export_transcript
|
||||
from pathlib import Path
|
||||
import os
|
||||
import logging
|
||||
import humanize
|
||||
from datetime import timedelta
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Try to import Ollama integration, but don't fail if it's not available
|
||||
try:
|
||||
from utils.ollama_integration import check_ollama_available, list_available_models, chunk_and_summarize
|
||||
OLLAMA_AVAILABLE = check_ollama_available()
|
||||
except ImportError:
|
||||
OLLAMA_AVAILABLE = False
|
||||
|
||||
# Try to import GPU utilities, but don't fail if not available
|
||||
try:
|
||||
from utils.gpu_utils import get_gpu_info, configure_gpu
|
||||
GPU_UTILS_AVAILABLE = True
|
||||
except ImportError:
|
||||
GPU_UTILS_AVAILABLE = False
|
||||
|
||||
# Try to import caching utilities, but don't fail if not available
|
||||
try:
|
||||
from utils.cache import get_cache_size, clear_cache
|
||||
CACHE_AVAILABLE = True
|
||||
except ImportError:
|
||||
CACHE_AVAILABLE = False
|
||||
|
||||
# Try to import diarization utilities, but don't fail if not available
|
||||
try:
|
||||
from utils.diarization import transcribe_with_diarization
|
||||
DIARIZATION_AVAILABLE = True
|
||||
except ImportError:
|
||||
DIARIZATION_AVAILABLE = False
|
||||
|
||||
# Try to import translation utilities, but don't fail if not available
|
||||
try:
|
||||
from utils.translation import transcribe_and_translate, get_language_name
|
||||
TRANSLATION_AVAILABLE = True
|
||||
except ImportError:
|
||||
TRANSLATION_AVAILABLE = False
|
||||
|
||||
# Try to import keyword extraction utilities, but don't fail if not available
|
||||
try:
|
||||
from utils.keyword_extraction import extract_keywords_from_transcript, generate_keyword_index, generate_interactive_transcript
|
||||
KEYWORD_EXTRACTION_AVAILABLE = True
|
||||
except ImportError:
|
||||
KEYWORD_EXTRACTION_AVAILABLE = False
|
||||
|
||||
def main():
    """Streamlit entry point for the OBS Recording Transcriber.

    Builds the sidebar configuration UI (model choice, advanced features,
    performance/caching and export options), lets the user pick a recording
    from the chosen folder, then runs transcription plus optional
    diarization / translation / keyword extraction and summarization, and
    renders the results with download buttons.
    """
    # Set page configuration
    st.set_page_config(
        page_title="OBS Recording Transcriber",
        page_icon="🎥",
        layout="wide",
        initial_sidebar_state="expanded"
    )

    # Custom CSS for better UI
    st.markdown("""
    <style>
    .main .block-container {
        padding-top: 2rem;
        padding-bottom: 2rem;
    }
    .stButton>button {
        width: 100%;
    }
    .stDownloadButton>button {
        width: 100%;
    }
    .stProgress > div > div > div {
        background-color: #4CAF50;
    }
    .speaker {
        font-weight: bold;
        color: #1E88E5;
    }
    .timestamp {
        color: #757575;
        font-size: 0.9em;
        margin-right: 8px;
    }
    .keyword {
        background-color: #FFF9C4;
        padding: 0 2px;
        border-radius: 3px;
    }
    .interactive-transcript p {
        margin-bottom: 8px;
    }
    </style>
    """, unsafe_allow_html=True)

    st.title("🎥 OBS Recording Transcriber")
    st.caption("Process your OBS recordings with AI transcription and summarization")

    # Sidebar configuration
    st.sidebar.header("Settings")

    # Allow the user to select a base folder
    base_folder = st.sidebar.text_input(
        "Enter the base folder path:",
        value=str(Path.home())
    )

    base_path = Path(base_folder)

    # Model selection
    st.sidebar.subheader("Model Settings")

    # Transcription model selection
    transcription_model = st.sidebar.selectbox(
        "Transcription Model",
        ["tiny", "base", "small", "medium", "large"],
        index=1,
        help="Select the Whisper model size. Larger models are more accurate but slower."
    )

    # Summarization model selection
    summarization_options = ["Hugging Face (Online)", "Ollama (Local)"] if OLLAMA_AVAILABLE else ["Hugging Face (Online)"]
    summarization_method = st.sidebar.selectbox(
        "Summarization Method",
        summarization_options,
        index=0,
        help="Select the summarization method. Ollama runs locally but requires installation."
    )

    # If Ollama is selected, show model selection
    ollama_model = None
    if OLLAMA_AVAILABLE and summarization_method == "Ollama (Local)":
        available_models = list_available_models()
        if available_models:
            ollama_model = st.sidebar.selectbox(
                "Ollama Model",
                available_models,
                # BUG FIX: was `0 if "llama3" in available_models else 0`, which
                # always selected index 0; preselect llama3 when it is installed.
                index=available_models.index("llama3") if "llama3" in available_models else 0,
                help="Select the Ollama model to use for summarization."
            )
        else:
            st.sidebar.warning("No Ollama models found. Please install models using 'ollama pull model_name'.")

    # Advanced features
    st.sidebar.subheader("Advanced Features")

    # Speaker diarization (checkbox disabled when the dependency is missing)
    use_diarization = st.sidebar.checkbox(
        "Speaker Diarization",
        value=False,
        disabled=not DIARIZATION_AVAILABLE,
        help="Identify different speakers in the recording."
    )

    # Show HF token input if diarization is enabled
    hf_token = None
    if use_diarization and DIARIZATION_AVAILABLE:
        hf_token = st.sidebar.text_input(
            "HuggingFace Token",
            type="password",
            help="Required for speaker diarization. Get your token at huggingface.co/settings/tokens"
        )

        num_speakers = st.sidebar.number_input(
            "Number of Speakers",
            min_value=1,
            max_value=10,
            value=2,
            help="Specify the number of speakers if known, or leave at default for auto-detection."
        )

    # Translation
    use_translation = st.sidebar.checkbox(
        "Translation",
        value=False,
        disabled=not TRANSLATION_AVAILABLE,
        help="Translate the transcript to another language."
    )

    # Target language selection if translation is enabled
    target_lang = None
    if use_translation and TRANSLATION_AVAILABLE:
        target_lang = st.sidebar.selectbox(
            "Target Language",
            ["en", "es", "fr", "de", "it", "pt", "nl", "ru", "zh", "ja", "ko", "ar"],
            format_func=lambda x: f"{get_language_name(x)} ({x})",
            help="Select the language to translate to."
        )

    # Keyword extraction
    use_keywords = st.sidebar.checkbox(
        "Keyword Extraction",
        value=False,
        disabled=not KEYWORD_EXTRACTION_AVAILABLE,
        help="Extract keywords and link them to timestamps."
    )

    if use_keywords and KEYWORD_EXTRACTION_AVAILABLE:
        max_keywords = st.sidebar.slider(
            "Max Keywords",
            min_value=5,
            max_value=30,
            value=15,
            help="Maximum number of keywords to extract."
        )

    # Performance settings
    st.sidebar.subheader("Performance Settings")

    # GPU acceleration
    use_gpu = st.sidebar.checkbox(
        "Use GPU Acceleration",
        value=True if GPU_UTILS_AVAILABLE else False,
        disabled=not GPU_UTILS_AVAILABLE,
        help="Use GPU for faster processing if available."
    )

    # Show GPU info if available
    if GPU_UTILS_AVAILABLE and use_gpu:
        gpu_info = get_gpu_info()
        if gpu_info["cuda_available"]:
            gpu_devices = [f"{d['name']} ({humanize.naturalsize(d['total_memory'])})" for d in gpu_info["cuda_devices"]]
            st.sidebar.info(f"GPU(s) available: {', '.join(gpu_devices)}")
        elif gpu_info["mps_available"]:
            st.sidebar.info("Apple Silicon GPU (MPS) available")
        else:
            st.sidebar.warning("No GPU detected. Using CPU.")

    # Memory usage
    memory_fraction = st.sidebar.slider(
        "GPU Memory Usage",
        min_value=0.1,
        max_value=1.0,
        value=0.8,
        step=0.1,
        disabled=not (GPU_UTILS_AVAILABLE and use_gpu),
        help="Fraction of GPU memory to use. Lower if you encounter out-of-memory errors."
    )

    # Caching options
    use_cache = st.sidebar.checkbox(
        "Use Caching",
        value=True if CACHE_AVAILABLE else False,
        disabled=not CACHE_AVAILABLE,
        help="Cache transcription results to avoid reprocessing the same files."
    )

    # Cache management
    if CACHE_AVAILABLE and use_cache:
        cache_size, cache_files = get_cache_size()
        if cache_size > 0:
            st.sidebar.info(f"Cache: {humanize.naturalsize(cache_size)} ({cache_files} files)")
            if st.sidebar.button("Clear Cache"):
                cleared = clear_cache()
                st.sidebar.success(f"Cleared {cleared} cache files")

    # Export options
    st.sidebar.subheader("Export Options")
    export_format = st.sidebar.multiselect(
        "Export Formats",
        ["TXT", "SRT", "VTT", "ASS"],
        default=["TXT"],
        help="Select the formats to export the transcript."
    )

    # Compression options
    compress_exports = st.sidebar.checkbox(
        "Compress Exports",
        value=False,
        help="Compress exported files to save space."
    )

    if compress_exports:
        compression_type = st.sidebar.radio(
            "Compression Format",
            ["gzip", "zip"],
            index=0,
            help="Select the compression format for exported files."
        )
    else:
        compression_type = None

    # ASS subtitle styling. Defensive default: ass_style must be defined even
    # when "ASS" is not among the selected export formats.
    ass_style = None
    if "ASS" in export_format:
        st.sidebar.subheader("ASS Subtitle Styling")
        show_style_options = st.sidebar.checkbox("Customize ASS Style", value=False)

        if show_style_options:
            ass_style = {}
            ass_style["fontname"] = st.sidebar.selectbox(
                "Font",
                ["Arial", "Helvetica", "Times New Roman", "Courier New", "Comic Sans MS"],
                index=0
            )
            ass_style["fontsize"] = st.sidebar.slider("Font Size", 12, 72, 48)
            ass_style["alignment"] = st.sidebar.selectbox(
                "Alignment",
                ["2 (Bottom Center)", "1 (Bottom Left)", "3 (Bottom Right)", "8 (Top Center)"],
                index=0
            ).split()[0]  # Extract just the number
            ass_style["bold"] = "-1" if st.sidebar.checkbox("Bold", value=True) else "0"
            ass_style["italic"] = "-1" if st.sidebar.checkbox("Italic", value=False) else "0"

    # Validate environment
    env_errors = validate_environment(base_path)
    if env_errors:
        st.error("## Environment Issues")
        for error in env_errors:
            st.markdown(f"- {error}")
        return

    # File selection - support multiple video and audio formats
    supported_extensions = ["*.mp4", "*.avi", "*.mov", "*.mkv", "*.m4a"]
    recordings = []
    for extension in supported_extensions:
        recordings.extend(base_path.glob(extension))

    if not recordings:
        st.warning(f"📂 No recordings found in the folder: {base_folder}!")
        st.info("💡 Supported formats: MP4, AVI, MOV, MKV, M4A")
        return

    selected_file = st.selectbox("Choose a recording", recordings)

    # Process button with spinner
    if st.button("🚀 Start Processing"):
        # Create a progress bar
        progress_bar = st.progress(0)
        status_text = st.empty()

        try:
            # Update progress
            status_text.text("Extracting audio...")
            progress_bar.progress(10)

            # Process based on selected features
            if use_diarization and DIARIZATION_AVAILABLE and hf_token:
                # Transcribe with speaker diarization
                status_text.text("Transcribing with speaker diarization...")
                num_speakers_arg = int(num_speakers) if num_speakers > 0 else None
                diarized_segments, diarized_transcript = transcribe_with_diarization(
                    selected_file,
                    whisper_model=transcription_model,
                    num_speakers=num_speakers_arg,
                    use_gpu=use_gpu,
                    hf_token=hf_token
                )
                segments = diarized_segments
                transcript = diarized_transcript
            elif use_translation and TRANSLATION_AVAILABLE:
                # Transcribe and translate
                status_text.text("Transcribing and translating...")
                original_segments, translated_segments, original_transcript, translated_transcript = transcribe_and_translate(
                    selected_file,
                    whisper_model=transcription_model,
                    target_lang=target_lang,
                    use_gpu=use_gpu
                )
                segments = translated_segments
                transcript = translated_transcript
                # Store original for display
                original_text = original_transcript
            else:
                # Standard transcription
                status_text.text("Transcribing audio...")
                segments, transcript = transcribe_audio(
                    selected_file,
                    model=transcription_model,
                    use_cache=use_cache,
                    use_gpu=use_gpu,
                    memory_fraction=memory_fraction
                )

            progress_bar.progress(50)

            if transcript:
                # Extract keywords if requested
                keyword_timestamps = None
                entity_timestamps = None
                if use_keywords and KEYWORD_EXTRACTION_AVAILABLE:
                    status_text.text("Extracting keywords...")
                    keyword_timestamps, entity_timestamps = extract_keywords_from_transcript(
                        transcript,
                        segments,
                        max_keywords=max_keywords,
                        use_gpu=use_gpu
                    )

                    # Generate keyword index
                    keyword_index = generate_keyword_index(keyword_timestamps, entity_timestamps)

                    # Generate interactive transcript
                    interactive_transcript = generate_interactive_transcript(
                        segments,
                        keyword_timestamps,
                        entity_timestamps
                    )

                # Generate summary based on selected method
                status_text.text("Generating summary...")
                if OLLAMA_AVAILABLE and summarization_method == "Ollama (Local)" and ollama_model:
                    summary = chunk_and_summarize(transcript, model=ollama_model)
                    if not summary:
                        st.warning("Ollama summarization failed. Falling back to Hugging Face.")
                        summary = summarize_text(
                            transcript,
                            use_gpu=use_gpu,
                            memory_fraction=memory_fraction
                        )
                else:
                    summary = summarize_text(
                        transcript,
                        use_gpu=use_gpu,
                        memory_fraction=memory_fraction
                    )

                progress_bar.progress(80)
                status_text.text("Preparing results...")

                # Display results in tabs
                tab1, tab2, tab3 = st.tabs(["Summary", "Transcript", "Advanced"])

                with tab1:
                    st.subheader("🖍 Summary")
                    st.write(summary)

                    # If translation was used, show original language
                    if use_translation and TRANSLATION_AVAILABLE and 'original_text' in locals():
                        with st.expander("Original Language Summary"):
                            original_summary = summarize_text(
                                original_text,
                                use_gpu=use_gpu,
                                memory_fraction=memory_fraction
                            )
                            st.write(original_summary)

                with tab2:
                    st.subheader("📜 Full Transcript")

                    # Show interactive transcript if keywords were extracted
                    if use_keywords and KEYWORD_EXTRACTION_AVAILABLE and 'interactive_transcript' in locals():
                        st.markdown(interactive_transcript, unsafe_allow_html=True)
                    else:
                        st.text(transcript)

                    # If translation was used, show original language
                    if use_translation and TRANSLATION_AVAILABLE and 'original_text' in locals():
                        with st.expander("Original Language Transcript"):
                            st.text(original_text)

                with tab3:
                    # Show keyword index if available
                    if use_keywords and KEYWORD_EXTRACTION_AVAILABLE and 'keyword_index' in locals():
                        st.subheader("🔑 Keyword Index")
                        st.markdown(keyword_index)

                    # Show speaker information if available
                    if use_diarization and DIARIZATION_AVAILABLE:
                        st.subheader("🎙️ Speaker Information")
                        speakers = set(segment.get('speaker', 'UNKNOWN') for segment in segments)
                        st.write(f"Detected {len(speakers)} speakers: {', '.join(speakers)}")

                        # Count words per speaker
                        speaker_words = {}
                        for segment in segments:
                            speaker = segment.get('speaker', 'UNKNOWN')
                            words = len(segment['text'].split())
                            if speaker in speaker_words:
                                speaker_words[speaker] += words
                            else:
                                speaker_words[speaker] = words

                        # Display speaker statistics
                        st.write("### Speaker Statistics")
                        for speaker, words in speaker_words.items():
                            st.write(f"- **{speaker}**: {words} words")

                # Export options. ROBUSTNESS FIX: skip the section entirely when
                # no format is selected — st.columns(0) would raise otherwise.
                if export_format:
                    st.subheader("💾 Export Options")
                    export_cols = st.columns(len(export_format))

                    output_base = Path(selected_file).stem

                    for i, format_type in enumerate(export_format):
                        with export_cols[i]:
                            if format_type == "TXT":
                                st.download_button(
                                    label=f"Download {format_type}",
                                    data=transcript,
                                    file_name=f"{output_base}_transcript.txt",
                                    mime="text/plain"
                                )
                            elif format_type in ["SRT", "VTT", "ASS"]:
                                # Export to subtitle format
                                output_path = export_transcript(
                                    transcript,
                                    output_base,
                                    format_type.lower(),
                                    segments=segments,
                                    compress=compress_exports,
                                    compression_type=compression_type,
                                    style=ass_style if format_type == "ASS" and ass_style else None
                                )

                                # Read the exported file for download
                                with open(output_path, 'rb') as f:
                                    subtitle_content = f.read()

                                # Determine file extension
                                file_ext = f".{format_type.lower()}"
                                if compress_exports:
                                    file_ext += ".gz" if compression_type == "gzip" else ".zip"

                                st.download_button(
                                    label=f"Download {format_type}",
                                    data=subtitle_content,
                                    file_name=f"{output_base}{file_ext}",
                                    mime="application/octet-stream"
                                )

                                # Clean up the temporary file (its bytes were
                                # already read into subtitle_content above)
                                os.remove(output_path)

                # Complete progress
                progress_bar.progress(100)
                status_text.text("Processing complete!")
            else:
                st.error("❌ Failed to process recording")
        except Exception as e:
            st.error(f"An error occurred: {e}")
            # BUG FIX: st.write(e) only printed the message; st.exception renders
            # the full traceback in the Streamlit app as the comment intended.
            st.exception(e)
||||
1
backend/.python-version
Normal file
1
backend/.python-version
Normal file
@ -0,0 +1 @@
|
||||
3.11.15
|
||||
54
backend/ai_provider.py
Normal file
54
backend/ai_provider.py
Normal file
@ -0,0 +1,54 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
AI provider interface for Ollama, OpenAI, and Claude.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add backend to path
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from services.ai_provider import AIProvider
|
||||
|
||||
|
||||
def main():
    """CLI dispatcher: parse sys.argv, run the requested AI command, emit JSON."""
    argv = sys.argv
    if len(argv) < 2:
        print("Usage: python ai_provider.py <command> [args...]", file=sys.stderr)
        sys.exit(1)

    command = argv[1]

    # Positional-argument helper: argv[idx] when present, else a default.
    def opt(idx, default=None):
        return argv[idx] if len(argv) > idx else default

    try:
        if command == "complete":
            if len(argv) < 4:
                print("Usage: python ai_provider.py complete <prompt> <provider> [model] [api_key] [base_url] [system_prompt] [temperature]", file=sys.stderr)
                sys.exit(1)
            # Two required positionals, then optionals (None when omitted).
            answer = AIProvider.complete(
                argv[2],             # prompt
                argv[3],             # provider
                opt(4),              # model
                opt(5),              # api_key
                opt(6),              # base_url
                opt(7),              # system_prompt
                float(opt(8, 0.3)),  # temperature
            )
            print(json.dumps({"response": answer}))

        elif command == "list_ollama_models":
            models = AIProvider.list_ollama_models(opt(2, "http://localhost:11434"))
            print(json.dumps({"models": models}))

        else:
            print(f"Unknown command: {command}", file=sys.stderr)
            sys.exit(1)

    except Exception as exc:
        # Surface any failure as machine-readable JSON on stderr.
        print(json.dumps({"error": str(exc)}), file=sys.stderr)
        sys.exit(1)
|
||||
47
backend/audio_cleaner.py
Normal file
47
backend/audio_cleaner.py
Normal file
@ -0,0 +1,47 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Audio cleaning operations using DeepFilterNet or FFmpeg fallback.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add backend to path
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from services.audio_cleaner import clean_audio, is_deepfilter_available
|
||||
|
||||
|
||||
def main():
    """CLI entry point: dispatch audio-cleaning commands and print JSON results."""
    if len(sys.argv) < 2:
        print("Usage: python audio_cleaner.py <command> [args...]", file=sys.stderr)
        sys.exit(1)

    command = sys.argv[1]

    try:
        if command == "clean_audio":
            # Requires exactly: clean_audio <input_path> <output_path>
            if len(sys.argv) != 4:
                print("Usage: python audio_cleaner.py clean_audio <input_path> <output_path>", file=sys.stderr)
                sys.exit(1)
            cleaned_path = clean_audio(sys.argv[2], sys.argv[3])
            print(json.dumps({"output_path": cleaned_path}))

        elif command == "is_deepfilter_available":
            print(json.dumps({"available": is_deepfilter_available()}))

        else:
            print(f"Unknown command: {command}", file=sys.stderr)
            sys.exit(1)

    except Exception as exc:
        # Report failures as JSON on stderr for the calling process.
        print(json.dumps({"error": str(exc)}), file=sys.stderr)
        sys.exit(1)
|
||||
50
backend/background_removal.py
Normal file
50
backend/background_removal.py
Normal file
@ -0,0 +1,50 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Background removal operations (placeholder for Phase 5).
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add backend to path
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from services.background_removal import is_available, remove_background_on_export
|
||||
|
||||
|
||||
def main():
    """CLI wrapper around the background-removal service; prints JSON results."""
    if len(sys.argv) < 2:
        print("Usage: python background_removal.py <command> [args...]", file=sys.stderr)
        sys.exit(1)

    command = sys.argv[1]

    try:
        if command == "is_available":
            print(json.dumps({"available": is_available()}))

        elif command == "remove_background_on_export":
            # Requires exactly four positional arguments after the command.
            if len(sys.argv) != 6:
                print("Usage: python background_removal.py remove_background_on_export <input_path> <output_path> <replacement> <replacement_value>", file=sys.stderr)
                sys.exit(1)
            exported = remove_background_on_export(
                sys.argv[2],  # input_path
                sys.argv[3],  # output_path
                sys.argv[4],  # replacement
                sys.argv[5],  # replacement_value
            )
            print(json.dumps({"output_path": exported}))

        else:
            print(f"Unknown command: {command}", file=sys.stderr)
            sys.exit(1)

    except Exception as exc:
        # Report failures as JSON on stderr for the calling process.
        print(json.dumps({"error": str(exc)}), file=sys.stderr)
        sys.exit(1)
|
||||
78
backend/caption_generator.py
Normal file
78
backend/caption_generator.py
Normal file
@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Generate caption files from word-level timestamps.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add backend to path
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from services.caption_generator import generate_srt, generate_vtt, generate_ass, save_captions
|
||||
|
||||
|
||||
def main():
    # CLI dispatcher for caption generation: each command reads its arguments
    # from sys.argv, calls the matching service function, and prints a JSON
    # object on stdout; errors are printed as JSON on stderr with exit code 1.
    if len(sys.argv) < 2:
        print("Usage: python caption_generator.py <command> [args...]", file=sys.stderr)
        sys.exit(1)

    command = sys.argv[1]

    try:
        if command == "generate_srt":
            # NOTE(review): the usage string marks <deleted_indices_json> as
            # optional, but this check requires at least 4 argv entries (i.e.
            # the third argument must be present, possibly as "null") — confirm
            # which contract callers rely on.
            if len(sys.argv) < 4:
                print("Usage: python caption_generator.py generate_srt <words_json> [deleted_indices_json] [words_per_line]", file=sys.stderr)
                sys.exit(1)
            words = json.loads(sys.argv[2])
            # "null" is the sentinel for "no deleted words".
            deleted_indices = set(json.loads(sys.argv[3])) if len(sys.argv) > 3 and sys.argv[3] != "null" else None
            words_per_line = int(sys.argv[4]) if len(sys.argv) > 4 else 8

            result = generate_srt(words, deleted_indices, words_per_line)
            print(json.dumps({"content": result}))

        elif command == "generate_vtt":
            if len(sys.argv) < 4:
                print("Usage: python caption_generator.py generate_vtt <words_json> [deleted_indices_json] [words_per_line]", file=sys.stderr)
                sys.exit(1)
            words = json.loads(sys.argv[2])
            deleted_indices = set(json.loads(sys.argv[3])) if len(sys.argv) > 3 and sys.argv[3] != "null" else None
            words_per_line = int(sys.argv[4]) if len(sys.argv) > 4 else 8

            result = generate_vtt(words, deleted_indices, words_per_line)
            print(json.dumps({"content": result}))

        elif command == "generate_ass":
            if len(sys.argv) < 4:
                print("Usage: python caption_generator.py generate_ass <words_json> [deleted_indices_json] [words_per_line] [style_json]", file=sys.stderr)
                sys.exit(1)
            words = json.loads(sys.argv[2])
            deleted_indices = set(json.loads(sys.argv[3])) if len(sys.argv) > 3 and sys.argv[3] != "null" else None
            words_per_line = int(sys.argv[4]) if len(sys.argv) > 4 else 8
            # Optional style overrides; "null" means use the default ASS style.
            style = json.loads(sys.argv[5]) if len(sys.argv) > 5 and sys.argv[5] != "null" else None

            result = generate_ass(words, deleted_indices, words_per_line, style)
            print(json.dumps({"content": result}))

        elif command == "save_captions":
            # Requires exactly: save_captions <content> <output_path>
            if len(sys.argv) != 4:
                print("Usage: python caption_generator.py save_captions <content> <output_path>", file=sys.stderr)
                sys.exit(1)
            content = sys.argv[2]
            output_path = sys.argv[3]

            result = save_captions(content, output_path)
            print(json.dumps({"output_path": result}))

        else:
            print(f"Unknown command: {command}", file=sys.stderr)
            sys.exit(1)

    except Exception as e:
        print(json.dumps({"error": str(e)}), file=sys.stderr)
        sys.exit(1)
|
||||
98
backend/dev_main.py
Normal file
98
backend/dev_main.py
Normal file
@ -0,0 +1,98 @@
|
||||
"""Lightweight development backend for UI work.
|
||||
|
||||
This avoids importing heavy ML dependencies so the UI can run during frontend
|
||||
development without installing large Python packages (torch/whisperx/etc.).
|
||||
Use this when you only need the health/file streaming endpoints.
|
||||
"""
|
||||
from fastapi import FastAPI, Request, HTTPException
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import StreamingResponse
|
||||
from pathlib import Path
|
||||
|
||||
from routers import audio
|
||||
|
||||
app = FastAPI(title="TalkEdit Dev Backend", version="0.0.1")
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
expose_headers=["Content-Range", "Accept-Ranges", "Content-Length"],
|
||||
)
|
||||
|
||||
|
||||
MIME_MAP = {
|
||||
".mp4": "video/mp4",
|
||||
".mkv": "video/x-matroska",
|
||||
".mov": "video/quicktime",
|
||||
".avi": "video/x-msvideo",
|
||||
".webm": "video/webm",
|
||||
".m4a": "audio/mp4",
|
||||
".wav": "audio/wav",
|
||||
".mp3": "audio/mpeg",
|
||||
".flac": "audio/flac",
|
||||
}
|
||||
|
||||
|
||||
app.include_router(audio.router)
|
||||
|
||||
@app.get("/health")
|
||||
async def health():
|
||||
return {"status": "ok"}
|
||||
|
||||
|
||||
@app.get("/file")
|
||||
async def serve_local_file(request: Request, path: str):
|
||||
file_path = Path(path)
|
||||
if not file_path.is_file():
|
||||
raise HTTPException(status_code=404, detail=f"File not found: {path}")
|
||||
|
||||
file_size = file_path.stat().st_size
|
||||
content_type = MIME_MAP.get(file_path.suffix.lower(), "application/octet-stream")
|
||||
|
||||
range_header = request.headers.get("range")
|
||||
if range_header:
|
||||
range_spec = range_header.replace("bytes=", "")
|
||||
start_str, end_str = range_spec.split("-")
|
||||
start = int(start_str) if start_str else 0
|
||||
end = int(end_str) if end_str else file_size - 1
|
||||
end = min(end, file_size - 1)
|
||||
content_length = end - start + 1
|
||||
|
||||
def iter_range():
|
||||
with open(file_path, "rb") as f:
|
||||
f.seek(start)
|
||||
remaining = content_length
|
||||
while remaining > 0:
|
||||
chunk = f.read(min(65536, remaining))
|
||||
if not chunk:
|
||||
break
|
||||
remaining -= len(chunk)
|
||||
yield chunk
|
||||
|
||||
return StreamingResponse(
|
||||
iter_range(),
|
||||
status_code=206,
|
||||
media_type=content_type,
|
||||
headers={
|
||||
"Content-Range": f"bytes {start}-{end}/{file_size}",
|
||||
"Accept-Ranges": "bytes",
|
||||
"Content-Length": str(content_length),
|
||||
},
|
||||
)
|
||||
|
||||
def iter_file():
|
||||
with open(file_path, "rb") as f:
|
||||
while chunk := f.read(65536):
|
||||
yield chunk
|
||||
|
||||
return StreamingResponse(
|
||||
iter_file(),
|
||||
media_type=content_type,
|
||||
headers={
|
||||
"Accept-Ranges": "bytes",
|
||||
"Content-Length": str(file_size),
|
||||
},
|
||||
)
|
||||
47
backend/diarization.py
Normal file
47
backend/diarization.py
Normal file
@ -0,0 +1,47 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Speaker diarization using pyannote.audio.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add backend to path
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from services.diarization import diarize_and_label
|
||||
|
||||
|
||||
def main():
    """CLI wrapper for speaker diarization; prints the labelled result as JSON."""
    if len(sys.argv) < 2:
        print("Usage: python diarization.py <command> [args...]", file=sys.stderr)
        sys.exit(1)

    command = sys.argv[1]

    try:
        if command != "diarize_and_label":
            print(f"Unknown command: {command}", file=sys.stderr)
            sys.exit(1)

        if len(sys.argv) < 4:
            print("Usage: python diarization.py diarize_and_label <transcription_result_json> <audio_path> [hf_token] [num_speakers] [use_gpu]", file=sys.stderr)
            sys.exit(1)

        argv = sys.argv
        transcription = json.loads(argv[2])
        audio = argv[3]
        # Optional trailing arguments; "null" is the sentinel for num_speakers.
        token = argv[4] if len(argv) > 4 else None
        speakers = int(argv[5]) if len(argv) > 5 and argv[5] != "null" else None
        gpu = argv[6].lower() == "true" if len(argv) > 6 else True

        labelled = diarize_and_label(transcription, audio, token, speakers, gpu)
        print(json.dumps(labelled))

    except Exception as exc:
        # Report failures as JSON on stderr for the calling process.
        print(json.dumps({"error": str(exc)}), file=sys.stderr)
        sys.exit(1)
|
||||
223
backend/license_server.py
Normal file
223
backend/license_server.py
Normal file
@ -0,0 +1,223 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
TalkEdit License Server — Stripe webhook + license key generator.
|
||||
|
||||
Usage (development):
|
||||
python backend/license_server.py
|
||||
|
||||
Then create a test license:
|
||||
python backend/license_server.py generate --email test@example.com --tier pro
|
||||
|
||||
This is a minimal server. In production, deploy as a Cloudflare Worker,
|
||||
Vercel function, or a small VPS behind nginx.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import hmac
|
||||
import hashlib
|
||||
|
||||
from nacl.bindings import (
|
||||
crypto_sign_seed_keypair,
|
||||
crypto_sign,
|
||||
crypto_sign_BYTES,
|
||||
)
|
||||
|
||||
# === CONFIGURATION ===

# The Ed25519 private key (base64-encoded). Keep this secret!
# Generate with: python3 -c "import os,base64; print(base64.b64encode(os.urandom(32)).decode())"
# SECURITY NOTE(review): a signing key is hard-coded here and therefore lives
# in source control. Before shipping, rotate this key and load it from an
# environment variable or secret store instead.
LICENSE_PRIVATE_KEY_B64 = "ONTdT2Hn367fMlovqulz7WYQPQru7uFa/GaSfjhnR9x7Qoe7uBPNwIFeW4p7A0g05Qj14rvaQ4Mm1u/LzgeEsA=="

# Stripe webhook secret (set this in production); empty string when unset.
STRIPE_WEBHOOK_SECRET = os.environ.get("STRIPE_WEBHOOK_SECRET", "")

# === TIER DEFINITIONS ===

# Tier name -> entitlements baked into generated license payloads
# (see generate_license_key, which reads duration/features/activations here).
TIERS = {
    "pro": {
        "price_id": "price_pro_monthly",  # Replace with your Stripe price ID
        "features": ["bundled_deps", "auto_updates", "priority_support"],
        "max_activations": 1,
        "duration_days": 365,
    },
    "business": {
        "price_id": "price_business_monthly",
        "features": ["bundled_deps", "auto_updates", "priority_support",
                     "white_label", "audit_logging", "bulk_deployment"],
        "max_activations": 10,
        "duration_days": 365,
    },
}
|
||||
|
||||
|
||||
def generate_license_key(
    customer_email: str,
    tier: str = "pro",
    license_id: str = None,
    duration_days: int = None,
    features: list = None,
    max_activations: int = None,
) -> str:
    """Generate a signed TalkEdit license key.

    Returns a string like: talkedit_v1_<base64(payload)>.<base64(signature)>
    (both parts base64 with '=' padding stripped). Unset arguments fall
    back to the tier's defaults; unknown tiers fall back to "pro".
    """
    cfg = TIERS.get(tier, TIERS["pro"])

    if license_id is None:
        license_id = f"lic_{int(time.time())}_{os.urandom(4).hex()}"
    if duration_days is None:
        duration_days = cfg["duration_days"]
    if features is None:
        features = cfg["features"]
    if max_activations is None:
        max_activations = cfg["max_activations"]

    issued = int(time.time())
    # Key order matters: the payload is serialized and signed as-is.
    payload = {
        "license_id": license_id,
        "customer_email": customer_email,
        "tier": tier,
        "features": features,
        "issued_at": issued,
        "expires_at": issued + duration_days * 86400,
        "max_activations": max_activations,
    }
    payload_bytes = json.dumps(payload, separators=(",", ":")).encode("utf-8")

    # Ed25519: rebuild the keypair from the stored seed, then detach the
    # signature (crypto_sign returns signature || message).
    seed = base64.b64decode(LICENSE_PRIVATE_KEY_B64)
    if len(seed) == 64:
        seed = seed[:32]  # combined secret-key format: the seed is the first half
    _pk, sk = crypto_sign_seed_keypair(seed)
    signature = crypto_sign(payload_bytes, sk)[:crypto_sign_BYTES]

    def b64(raw: bytes) -> str:
        return base64.b64encode(raw).decode().rstrip("=")

    return f"talkedit_v1_{b64(payload_bytes)}.{b64(signature)}"
|
||||
|
||||
|
||||
def verify_stripe_webhook(payload: bytes, sig_header: str) -> dict:
    """Verify a Stripe webhook signature and return the decoded event.

    The `stripe-signature` header has the form ``t=<ts>,v1=<sig>[,v1=<sig>,...]``.
    During webhook-secret rotation Stripe includes multiple ``v1`` entries, any
    one of which may be valid — so ALL of them are checked (the previous code
    kept only the last one in a dict, rejecting valid webhooks mid-rotation).

    Raises ValueError when the secret is unconfigured or no signature matches.
    """
    if not STRIPE_WEBHOOK_SECRET:
        raise ValueError("STRIPE_WEBHOOK_SECRET not configured")

    timestamp = ""
    candidate_sigs = []
    for item in sig_header.split(","):
        key, _, value = item.partition("=")
        key = key.strip()
        value = value.strip()
        if key == "t":
            timestamp = value
        elif key == "v1":
            candidate_sigs.append(value)

    # Stripe signs "<timestamp>.<raw body>" with HMAC-SHA256.
    signed_payload = f"{timestamp}.{payload.decode()}".encode()
    computed_sig = hmac.new(
        STRIPE_WEBHOOK_SECRET.encode(),
        signed_payload,
        hashlib.sha256,
    ).hexdigest()

    # compare_digest for constant-time comparison against each candidate.
    if not any(hmac.compare_digest(computed_sig, sig) for sig in candidate_sigs):
        raise ValueError("Invalid webhook signature")

    # NOTE(review): no timestamp-tolerance check — a replayed webhook with a
    # valid signature is accepted; consider rejecting stale timestamps.
    return json.loads(payload)
|
||||
|
||||
|
||||
# === CLI ===
|
||||
|
||||
def main():
    # Entry point with two modes:
    #   `generate` subcommand — mint a license key locally and print it;
    #   no subcommand          — run the Stripe-webhook HTTP server.
    import sys

    if len(sys.argv) > 1 and sys.argv[1] == "generate":
        # CLI mode: generate a test license key
        import argparse
        parser = argparse.ArgumentParser(description="Generate TalkEdit license key")
        parser.add_argument("--email", default="test@example.com")
        parser.add_argument("--tier", default="pro", choices=["pro", "business"])
        parser.add_argument("--days", type=int, default=None)
        # Skip argv[0] and the "generate" token itself.
        args = parser.parse_args(sys.argv[2:])

        key = generate_license_key(
            customer_email=args.email,
            tier=args.tier,
            duration_days=args.days,
        )
        print()
        print("=== TALKEDIT LICENSE KEY ===")
        print(key)
        print()
        print("Paste this into the TalkEdit app to activate.")
        return

    # Server mode
    from http.server import HTTPServer, BaseHTTPRequestHandler
    import urllib.parse

    class LicenseHandler(BaseHTTPRequestHandler):
        def do_POST(self):
            # Only POST /webhook/stripe is handled; everything else is 404.
            path = urllib.parse.urlparse(self.path).path

            if path == "/webhook/stripe":
                content_length = int(self.headers.get("Content-Length", 0))
                body = self.rfile.read(content_length)
                sig_header = self.headers.get("Stripe-Signature", "")

                try:
                    # Raises on bad signature, turning into the 400 below.
                    event = verify_stripe_webhook(body, sig_header)
                    event_type = event.get("type", "")

                    if event_type == "checkout.session.completed":
                        session = event["data"]["object"]
                        # Prefer the top-level email; fall back to customer_details.
                        email = session.get("customer_email", session.get("customer_details", {}).get("email", "unknown"))
                        tier = "pro"  # Map from session["metadata"]["tier"] or line items

                        license_key = generate_license_key(
                            customer_email=email,
                            tier=tier,
                        )

                        # In production: email the license key to the customer
                        print(f"License generated for {email}: {license_key[:40]}...")

                        self.send_response(200)
                        self.send_header("Content-Type", "application/json")
                        self.end_headers()
                        self.wfile.write(json.dumps({"status": "ok"}).encode())
                    else:
                        # Acknowledge unhandled event types so Stripe stops retrying.
                        self.send_response(200)
                        self.end_headers()

                except Exception as e:
                    print(f"Webhook error: {e}")
                    self.send_response(400)
                    self.end_headers()
                    self.wfile.write(str(e).encode())

            else:
                self.send_response(404)
                self.end_headers()

        def log_message(self, format, *args):
            # Replace BaseHTTPRequestHandler's default stderr access log.
            print(f"[license-server] {args}")

    port = int(os.environ.get("PORT", 8643))
    server = HTTPServer(("0.0.0.0", port), LicenseHandler)
    print(f"License server listening on http://0.0.0.0:{port}")
    print(f"  POST /webhook/stripe  - Stripe webhook")
    print()
    print("To generate a test license:")
    print(f"  python {__file__} generate --email you@example.com --tier pro")
    server.serve_forever()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
158
backend/main.py
Normal file
158
backend/main.py
Normal file
@ -0,0 +1,158 @@
|
||||
import logging
|
||||
import os
|
||||
import stat
|
||||
import sys
|
||||
from contextlib import asynccontextmanager
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import FastAPI, Query, Request, HTTPException
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
from routers import transcribe, export, ai, captions, audio
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Dev log file — frontend forwards console.error/warn here so the agent can read it
|
||||
DEV_LOG_PATH = Path(__file__).parent.parent / "webview.log"
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    # FastAPI lifespan hook: code before `yield` runs once at startup,
    # code after it runs once at shutdown.
    logger.info("AI Video Editor backend starting up")
    yield
    logger.info("AI Video Editor backend shutting down")
|
||||
|
||||
|
||||
app = FastAPI(
|
||||
title="AI Video Editor Backend",
|
||||
version="0.1.0",
|
||||
lifespan=lifespan,
|
||||
)
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
expose_headers=["Content-Range", "Accept-Ranges", "Content-Length"],
|
||||
)
|
||||
|
||||
app.include_router(transcribe.router)
|
||||
app.include_router(export.router)
|
||||
app.include_router(ai.router)
|
||||
app.include_router(captions.router)
|
||||
app.include_router(audio.router)
|
||||
|
||||
|
||||
MIME_MAP = {
|
||||
".mp4": "video/mp4",
|
||||
".mkv": "video/x-matroska",
|
||||
".mov": "video/quicktime",
|
||||
".avi": "video/x-msvideo",
|
||||
".webm": "video/webm",
|
||||
".m4a": "audio/mp4",
|
||||
".wav": "audio/wav",
|
||||
".mp3": "audio/mpeg",
|
||||
".flac": "audio/flac",
|
||||
}
|
||||
|
||||
|
||||
@app.get("/file")
async def serve_local_file(request: Request, path: str = Query(...)):
    """Stream a local file with HTTP Range support (required for video seeking).

    Responses:
      200 — whole file (no Range header), streamed in 64 KiB chunks;
      206 — the requested byte window with Content-Range;
      404 — file missing; 422 — file empty;
      416 — malformed or unsatisfiable Range header.
    """
    file_path = Path(path)
    if not file_path.is_file():
        logger.warning(f"[serve_file] File not found: {path}")
        raise HTTPException(status_code=404, detail=f"File not found: {path}")

    file_size = file_path.stat().st_size
    content_type = MIME_MAP.get(file_path.suffix.lower(), "application/octet-stream")
    range_header = request.headers.get("range")

    logger.info(
        f"[serve_file] {file_path.name} | size={file_size} | "
        f"type={content_type} | range={range_header or 'none'}"
    )

    if content_type == "application/octet-stream":
        logger.warning(
            f"[serve_file] Unknown MIME type for extension '{file_path.suffix}' — "
            f"browser may fail to decode audio/video for '{file_path.name}'"
        )

    if file_size == 0:
        logger.error(f"[serve_file] File is empty: {path}")
        raise HTTPException(status_code=422, detail=f"File is empty: {path}")
    if range_header:
        try:
            range_spec = range_header.replace("bytes=", "")
            range_start_str, range_end_str = range_spec.split("-")
            range_start = int(range_start_str) if range_start_str else 0
            range_end = int(range_end_str) if range_end_str else file_size - 1
            range_end = min(range_end, file_size - 1)
        except (ValueError, TypeError) as e:
            logger.error(f"[serve_file] Malformed Range header '{range_header}': {e}")
            raise HTTPException(status_code=416, detail=f"Invalid Range header: {range_header}")
        # BUGFIX: a start beyond EOF (clamped end < start) previously produced a
        # negative Content-Length; per RFC 7233 that range is unsatisfiable.
        if range_start < 0 or range_start > range_end:
            logger.error(f"[serve_file] Unsatisfiable Range '{range_header}' for size={file_size}")
            raise HTTPException(status_code=416, detail=f"Unsatisfiable Range: {range_header}")
        content_length = range_end - range_start + 1

        def iter_range():
            # Stream only the requested window, bounding memory to one chunk.
            with open(file_path, "rb") as f:
                f.seek(range_start)
                remaining = content_length
                while remaining > 0:
                    chunk = f.read(min(65536, remaining))
                    if not chunk:
                        break
                    remaining -= len(chunk)
                    yield chunk

        return StreamingResponse(
            iter_range(),
            status_code=206,
            media_type=content_type,
            headers={
                "Content-Range": f"bytes {range_start}-{range_end}/{file_size}",
                "Accept-Ranges": "bytes",
                "Content-Length": str(content_length),
            },
        )

    def iter_file():
        # No Range header: stream the whole file.
        with open(file_path, "rb") as f:
            while chunk := f.read(65536):
                yield chunk

    return StreamingResponse(
        iter_file(),
        media_type=content_type,
        headers={
            "Accept-Ranges": "bytes",
            "Content-Length": str(file_size),
        },
    )
|
||||
|
||||
|
||||
@app.get("/health")
async def health():
    # Trivial liveness probe.
    return {"status": "ok"}
|
||||
|
||||
|
||||
import datetime
|
||||
|
||||
@app.post("/dev/log")
async def dev_log(request: Request):
    """Append a forwarded frontend console message to the shared dev log file.

    Expects JSON like {"level": str, "message": str, "args": [...]}; writes one
    timestamped line to DEV_LOG_PATH so the agent can read browser console
    output. Always returns {"ok": True}.
    """
    data = await request.json()
    # Coerce to str defensively — the payload comes from JS and may not be a string.
    level = str(data.get("level", "log"))
    msg = str(data.get("message", ""))
    args = [str(a) for a in data.get("args", [])]
    # Millisecond-precision wall-clock timestamp.
    ts = datetime.datetime.now().strftime("%H:%M:%S.%f")[:-3]
    line = f"[{ts}] [{level.upper():5}] {msg}"
    if args:
        line += " " + " ".join(args)
    line += "\n"
    # BUGFIX: pin the encoding — the platform default may not be UTF-8 and
    # console messages routinely contain non-ASCII characters.
    with open(DEV_LOG_PATH, "a", encoding="utf-8") as f:
        f.write(line)
    return {"ok": True}
|
||||
164
backend/requirements.txt
Normal file
164
backend/requirements.txt
Normal file
@ -0,0 +1,164 @@
|
||||
aiohappyeyeballs==2.6.1
|
||||
aiohttp==3.13.4
|
||||
aiosignal==1.4.0
|
||||
alembic==1.18.4
|
||||
annotated-doc==0.0.4
|
||||
annotated-types==0.7.0
|
||||
anthropic==0.86.0
|
||||
antlr4-python3-runtime==4.9.3
|
||||
anyio==4.13.0
|
||||
appdirs==1.4.4
|
||||
asteroid-filterbanks==0.4.0
|
||||
attrs==26.1.0
|
||||
av==17.0.0
|
||||
certifi==2026.2.25
|
||||
cffi==2.0.0
|
||||
charset-normalizer==3.4.6
|
||||
click==8.3.1
|
||||
colorlog==6.10.1
|
||||
contourpy==1.3.3
|
||||
ctranslate2==4.7.1
|
||||
cuda-bindings==12.9.4
|
||||
cuda-pathfinder==1.2.2
|
||||
cuda-toolkit==12.6.3
|
||||
cycler==0.12.1
|
||||
Cython==0.29.37
|
||||
decorator==5.2.1
|
||||
DeepFilterLib==0.5.6
|
||||
DeepFilterNet==0.5.6
|
||||
distro==1.9.0
|
||||
docstring_parser==0.17.0
|
||||
einops==0.8.2
|
||||
fastapi==0.135.2
|
||||
faster-whisper==1.2.1
|
||||
ffmpeg-python==0.2.0
|
||||
filelock==3.25.2
|
||||
flatbuffers==25.12.19
|
||||
fonttools==4.62.1
|
||||
frozenlist==1.8.0
|
||||
fsspec==2026.2.0
|
||||
future==1.0.0
|
||||
googleapis-common-protos==1.73.1
|
||||
greenlet==3.3.2
|
||||
grpcio==1.78.0
|
||||
h11==0.16.0
|
||||
hf-xet==1.4.2
|
||||
httpcore==1.0.9
|
||||
httptools==0.7.1
|
||||
httpx==0.28.1
|
||||
huggingface_hub==0.36.2
|
||||
HyperPyYAML==1.2.3
|
||||
idna==3.11
|
||||
ImageIO==2.37.3
|
||||
imageio-ffmpeg==0.6.0
|
||||
importlib_metadata==8.7.1
|
||||
Jinja2==3.1.6
|
||||
jiter==0.13.0
|
||||
joblib==1.5.3
|
||||
julius==0.2.7
|
||||
kiwisolver==1.5.0
|
||||
lightning==2.6.1
|
||||
lightning-utilities==0.15.3
|
||||
loguru==0.7.3
|
||||
Mako==1.3.10
|
||||
markdown-it-py==4.0.0
|
||||
MarkupSafe==3.0.3
|
||||
matplotlib==3.10.8
|
||||
maturin==1.12.6
|
||||
mdurl==0.1.2
|
||||
moviepy==2.2.1
|
||||
mpmath==1.3.0
|
||||
multidict==6.7.1
|
||||
networkx==3.6.1
|
||||
nltk==3.9.4
|
||||
numpy==2.4.3
|
||||
nvidia-cublas-cu12==12.8.4.1
|
||||
nvidia-cuda-cupti-cu12==12.8.90
|
||||
nvidia-cuda-nvrtc-cu12==12.8.93
|
||||
nvidia-cuda-runtime-cu12==12.8.90
|
||||
nvidia-cudnn-cu12==9.10.2.21
|
||||
nvidia-cufft-cu12==11.3.3.83
|
||||
nvidia-cufile-cu12==1.13.1.3
|
||||
nvidia-curand-cu12==10.3.9.90
|
||||
nvidia-cusolver-cu12==11.7.3.90
|
||||
nvidia-cusparse-cu12==12.5.8.93
|
||||
nvidia-cusparselt-cu12==0.7.1
|
||||
nvidia-nccl-cu12==2.27.3
|
||||
nvidia-nvjitlink-cu12==12.8.93
|
||||
nvidia-nvshmem-cu12==3.4.5
|
||||
nvidia-nvtx-cu12==12.8.90
|
||||
omegaconf==2.3.0
|
||||
onnxruntime==1.24.4
|
||||
openai==2.30.0
|
||||
opentelemetry-api==1.40.0
|
||||
opentelemetry-exporter-otlp==1.40.0
|
||||
opentelemetry-exporter-otlp-proto-common==1.40.0
|
||||
opentelemetry-exporter-otlp-proto-grpc==1.40.0
|
||||
opentelemetry-exporter-otlp-proto-http==1.40.0
|
||||
opentelemetry-proto==1.40.0
|
||||
opentelemetry-sdk==1.40.0
|
||||
opentelemetry-semantic-conventions==0.61b0
|
||||
optuna==4.8.0
|
||||
packaging==23.2
|
||||
pandas==3.0.1
|
||||
pillow==11.3.0
|
||||
primePy==1.3
|
||||
proglog==0.1.12
|
||||
propcache==0.4.1
|
||||
protobuf==6.33.6
|
||||
pyannote-audio==4.0.4
|
||||
pyannote-core==6.0.1
|
||||
pyannote-database==6.1.1
|
||||
pyannote-metrics==4.0.0
|
||||
pyannote-pipeline==4.0.0
|
||||
pyannoteai-sdk==0.4.0
|
||||
pycparser==3.0
|
||||
pydantic==2.12.5
|
||||
pydantic_core==2.41.5
|
||||
Pygments==2.19.2
|
||||
pyparsing==3.3.2
|
||||
python-dateutil==2.9.0.post0
|
||||
python-dotenv==1.2.2
|
||||
python-multipart==0.0.22
|
||||
pytorch-lightning==2.6.1
|
||||
pytorch-metric-learning==2.9.0
|
||||
PyYAML==6.0.3
|
||||
regex==2026.2.28
|
||||
requests==2.33.0
|
||||
rich==14.3.3
|
||||
ruamel.yaml==0.18.17
|
||||
ruamel.yaml.clib==0.2.15
|
||||
safetensors==0.7.0
|
||||
scikit-learn==1.8.0
|
||||
scipy==1.17.1
|
||||
setuptools==70.2.0
|
||||
shellingham==1.5.4
|
||||
six==1.17.0
|
||||
sniffio==1.3.1
|
||||
sortedcontainers==2.4.0
|
||||
soundfile==0.13.1
|
||||
SQLAlchemy==2.0.48
|
||||
starlette==1.0.0
|
||||
sympy==1.14.0
|
||||
threadpoolctl==3.6.0
|
||||
tokenizers==0.22.2
|
||||
torch==2.8.0
|
||||
torch-audiomentations==0.12.0
|
||||
torch_pitch_shift==1.2.5
|
||||
torchaudio==2.8.0
|
||||
torchmetrics==1.9.0
|
||||
tqdm==4.67.3
|
||||
transformers==4.57.6
|
||||
triton==3.4.0
|
||||
typer==0.24.1
|
||||
typing-inspection==0.4.2
|
||||
typing_extensions==4.15.0
|
||||
urllib3==2.6.3
|
||||
uvicorn==0.42.0
|
||||
uvloop==0.22.1
|
||||
watchfiles==1.1.1
|
||||
websockets==16.0
|
||||
wheel==0.46.3
|
||||
whisperx==3.8.4
|
||||
yarl==1.23.0
|
||||
zipp==3.23.0
|
||||
0
backend/routers/__init__.py
Normal file
0
backend/routers/__init__.py
Normal file
83
backend/routers/ai.py
Normal file
83
backend/routers/ai.py
Normal file
@ -0,0 +1,83 @@
|
||||
"""AI feature endpoints: filler word detection, clip creation, Ollama model listing."""
|
||||
|
||||
import logging
|
||||
from typing import List, Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from pydantic import BaseModel
|
||||
|
||||
from services.ai_provider import AIProvider, detect_filler_words, create_clip_suggestion
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
class WordInfo(BaseModel):
    # One transcript word. Timings are optional — presumably some providers
    # return text without word-level timestamps (TODO confirm in callers).
    index: int
    word: str
    start: Optional[float] = None
    end: Optional[float] = None


class FillerRequest(BaseModel):
    # Request body for POST /ai/filler-removal.
    transcript: str
    words: List[WordInfo]
    provider: str = "ollama"        # AI backend selector passed to services.ai_provider
    model: Optional[str] = None     # provider-specific model identifier
    api_key: Optional[str] = None   # for hosted providers; unused by local ones
    base_url: Optional[str] = None  # endpoint override
    custom_filler_words: Optional[str] = None  # user-supplied extra filler words


class ClipRequest(BaseModel):
    # Request body for POST /ai/create-clip.
    transcript: str
    words: List[WordInfo]
    provider: str = "ollama"
    model: Optional[str] = None
    api_key: Optional[str] = None
    base_url: Optional[str] = None
    target_duration: int = 60  # desired clip length (presumably seconds — confirm in ai_provider)
|
||||
|
||||
|
||||
@router.post("/ai/filler-removal")
async def filler_removal(req: FillerRequest):
    """Run filler-word detection over the transcript via the configured provider.

    Returns the detector's result unchanged; any failure becomes a 500 with the
    error message in the detail.
    """
    try:
        payload = [word.model_dump() for word in req.words]
        return detect_filler_words(
            transcript=req.transcript,
            words=payload,
            provider=req.provider,
            model=req.model,
            api_key=req.api_key,
            base_url=req.base_url,
            custom_filler_words=req.custom_filler_words,
        )
    except Exception as e:
        logger.error(f"Filler detection failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/ai/create-clip")
async def create_clip(req: ClipRequest):
    """Ask the configured AI provider for a clip suggestion from the transcript.

    Returns the suggestion unchanged; any failure becomes a 500 with the error
    message in the detail.
    """
    try:
        payload = [word.model_dump() for word in req.words]
        return create_clip_suggestion(
            transcript=req.transcript,
            words=payload,
            target_duration=req.target_duration,
            provider=req.provider,
            model=req.model,
            api_key=req.api_key,
            base_url=req.base_url,
        )
    except Exception as e:
        logger.error(f"Clip creation failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/ai/ollama-models")
async def ollama_models(base_url: str = "http://localhost:11434"):
    """List the models reported by an Ollama instance at ``base_url``."""
    return {"models": AIProvider.list_ollama_models(base_url)}
|
||||
193
backend/routers/audio.py
Normal file
193
backend/routers/audio.py
Normal file
@ -0,0 +1,193 @@
|
||||
"""Audio processing endpoint (noise reduction / Studio Sound)."""
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Query, Request
|
||||
from fastapi.responses import FileResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
from services.audio_cleaner import clean_audio, detect_silence_ranges, is_deepfilter_available, normalize_audio
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter()
|
||||
|
||||
# Simple in-process cache: video path → extracted WAV path
|
||||
_waveform_cache: dict[str, str] = {}
|
||||
|
||||
|
||||
class AudioCleanRequest(BaseModel):
    # Request body for POST /audio/clean.
    input_path: str
    output_path: Optional[str] = None  # None/empty → the service chooses a path


class SilenceDetectRequest(BaseModel):
    # Request body for POST /audio/detect-silence.
    input_path: str
    min_silence_ms: int = 500    # ignore silences shorter than this (milliseconds)
    silence_db: float = -35.0    # threshold below which audio counts as silence (dB)
|
||||
|
||||
|
||||
@router.post("/audio/clean")
async def clean_audio_endpoint(req: AudioCleanRequest):
    """Denoise the given file and report which engine was used.

    Any failure is logged and surfaced as a 500 with the error message.
    """
    try:
        result_path = clean_audio(req.input_path, req.output_path or "")
        engine = "deepfilternet" if is_deepfilter_available() else "ffmpeg_anlmdn"
        return {
            "status": "ok",
            "output_path": result_path,
            "engine": engine,
        }
    except Exception as e:
        logger.error(f"Audio cleaning failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/audio/capabilities")
async def audio_capabilities():
    # Reports whether the DeepFilterNet denoiser can be used on this machine.
    return {
        "deepfilternet_available": is_deepfilter_available(),
    }
|
||||
|
||||
|
||||
@router.post("/audio/detect-silence")
async def detect_silence_endpoint(req: SilenceDetectRequest):
    """Locate silent stretches in the input file.

    Delegates to services.audio_cleaner.detect_silence_ranges; failures become
    a 500 with the error message in the detail.
    """
    try:
        found = detect_silence_ranges(
            req.input_path,
            req.min_silence_ms,
            req.silence_db,
        )
    except Exception as e:
        logger.error(f"Silence detection failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
    return {
        "status": "ok",
        "ranges": found,
        "count": len(found),
    }
|
||||
|
||||
|
||||
@router.get("/audio/waveform")
async def get_waveform_audio(request: Request, path: str = Query(...)):
    """
    Extract audio from any video/audio file and return it as a WAV.
    The WAV is cached on disk for subsequent requests.
    Uses FFmpeg directly so it works with MKV, MOV, AVI, MP4, etc.
    """
    # Short per-request id so interleaved log lines can be correlated.
    req_id = hashlib.md5(f"{path}:{request.url}".encode()).hexdigest()[:10]
    file_path = Path(path)
    logger.info(
        "[waveform:%s] request raw_url=%s raw_query=%s decoded_path=%r path_len=%s",
        req_id,
        str(request.url),
        request.url.query,
        path,
        len(path),
    )

    # Best-effort normalization used only for the log line below; the
    # original `file_path` is what actually gets opened.
    try:
        resolved_path = file_path.expanduser().resolve(strict=False)
    except Exception:
        resolved_path = file_path

    logger.info(
        "[waveform:%s] normalized path=%s exists=%s is_file=%s",
        req_id,
        resolved_path,
        file_path.exists(),
        file_path.is_file(),
    )

    if not file_path.is_file():
        logger.warning("[waveform:%s] file_not_found path=%r", req_id, path)
        raise HTTPException(status_code=404, detail=f"File not found: {path}")

    # Cache key based on path + mtime so stale cache is auto-invalidated
    mtime = file_path.stat().st_mtime
    cache_key = hashlib.md5(f"{path}:{mtime}".encode()).hexdigest()
    logger.info("[waveform:%s] cache_key=%s mtime=%s", req_id, cache_key, mtime)

    if cache_key in _waveform_cache:
        cached = Path(_waveform_cache[cache_key])
        if cached.exists():
            logger.info("[waveform:%s] cache_hit cached=%s", req_id, cached)
            return FileResponse(str(cached), media_type="audio/wav")
        else:
            # Cached WAV was deleted out from under us; drop the entry and re-extract.
            del _waveform_cache[cache_key]

    logger.info("[waveform:%s] cache_miss extracting file=%s", req_id, file_path)
    # NOTE(review): every cache miss creates a temp dir that is never removed,
    # and _waveform_cache grows unboundedly for the process lifetime.
    tmp_dir = tempfile.mkdtemp(prefix="talkedit_waveform_")
    out_wav = Path(tmp_dir) / f"{cache_key}.wav"

    # Downsample to mono 8000 Hz — enough for waveform drawing and much smaller payloads
    cmd = [
        "ffmpeg", "-y",
        "-i", str(file_path),
        "-vn",  # drop video
        "-ac", "1",  # mono
        "-ar", "8000",  # 8 kHz sample rate
        "-acodec", "pcm_s16le",  # 16-bit PCM WAV
        str(out_wav),
    ]
    logger.info("[waveform:%s] ffmpeg_cmd=%s", req_id, " ".join(cmd))
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        logger.error(
            "[waveform:%s] ffmpeg_failed returncode=%s stderr_tail=%s",
            req_id,
            result.returncode,
            result.stderr[-2000:],
        )
        raise HTTPException(
            status_code=500,
            detail=f"Failed to extract audio: {result.stderr[-300:]}"
        )

    if not out_wav.exists() or out_wav.stat().st_size == 0:
        logger.error(
            "[waveform:%s] empty_output out_wav=%s exists=%s size=%s",
            req_id,
            out_wav,
            out_wav.exists(),
            out_wav.stat().st_size if out_wav.exists() else -1,
        )
        raise HTTPException(status_code=500, detail="Audio extraction produced empty file")

    logger.info(
        "[waveform:%s] extracted_bytes=%s out_wav=%s",
        req_id,
        out_wav.stat().st_size,
        out_wav,
    )
    _waveform_cache[cache_key] = str(out_wav)
    return FileResponse(str(out_wav), media_type="audio/wav")
|
||||
|
||||
|
||||
class NormalizeRequest(BaseModel):
    # Request body for POST /audio/normalize.
    input_path: str
    output_path: Optional[str] = None  # None/empty → the service chooses a path
    target_lufs: float = -14.0         # integrated loudness target (LUFS)
||||
|
||||
|
||||
@router.post("/audio/normalize")
async def normalize_audio_endpoint(req: NormalizeRequest):
    """Normalize audio loudness to a target LUFS level using FFmpeg loudnorm."""
    # Reject nonsensical targets up front (400) before touching the file.
    if not (-70 <= req.target_lufs <= 0):
        raise HTTPException(status_code=400, detail="target_lufs must be between -70 and 0")
    try:
        result_path = normalize_audio(
            req.input_path,
            req.output_path or "",
            target_lufs=req.target_lufs,
        )
    except Exception as e:
        logger.error(f"Audio normalization failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
    return {
        "status": "ok",
        "output_path": result_path,
        "target_lufs": req.target_lufs,
    }
|
||||
67
backend/routers/captions.py
Normal file
67
backend/routers/captions.py
Normal file
@ -0,0 +1,67 @@
|
||||
"""Caption generation endpoint."""
|
||||
|
||||
import logging
|
||||
from typing import List, Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from fastapi.responses import PlainTextResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
from services.caption_generator import generate_srt, generate_vtt, generate_ass, save_captions
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
class CaptionWord(BaseModel):
    # A single transcribed word with its timing window.
    word: str
    start: float
    end: float
    confidence: float = 0.0


class CaptionStyle(BaseModel):
    # Styling applied when rendering ASS subtitles; colors use the
    # ASS &HAABBGGRR hex notation.
    fontName: str = "Arial"
    fontSize: int = 48
    fontColor: str = "&H00FFFFFF"
    backgroundColor: str = "&H80000000"
    position: str = "bottom"
    bold: bool = True


class CaptionRequest(BaseModel):
    # Request body for POST /captions.
    words: List[CaptionWord]
    deleted_indices: List[int] = []  # word indices to exclude (pydantic copies mutable defaults per instance)
    format: str = "srt"              # "srt", "vtt", or "ass"
    words_per_line: int = 8
    style: Optional[CaptionStyle] = None  # only consulted for the "ass" format
    output_path: Optional[str] = None     # when set, write to disk instead of returning the text
||||
|
||||
|
||||
@router.post("/captions")
async def generate_captions(req: CaptionRequest):
    """Render captions (srt/vtt/ass) from word timings.

    When `output_path` is set the text is saved to disk and the path returned;
    otherwise the caption text is returned directly. Unknown formats → 400,
    generation failures → 500.
    """
    try:
        words_dicts = [w.model_dump() for w in req.words]
        deleted_set = set(req.deleted_indices)

        # Dispatch table keyed by output format.
        builders = {
            "srt": lambda: generate_srt(words_dicts, deleted_set, req.words_per_line),
            "vtt": lambda: generate_vtt(words_dicts, deleted_set, req.words_per_line),
            "ass": lambda: generate_ass(
                words_dicts,
                deleted_set,
                req.words_per_line,
                req.style.model_dump() if req.style else None,
            ),
        }
        builder = builders.get(req.format)
        if builder is None:
            raise HTTPException(status_code=400, detail=f"Unknown format: {req.format}")
        content = builder()

        if req.output_path:
            saved = save_captions(content, req.output_path)
            return {"status": "ok", "output_path": saved}

        return PlainTextResponse(content, media_type="text/plain")

    except HTTPException:
        raise  # re-raise our own 400 untouched
    except Exception as e:
        logger.error(f"Caption generation failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
|
||||
357
backend/routers/export.py
Normal file
357
backend/routers/export.py
Normal file
@ -0,0 +1,357 @@
|
||||
"""Export endpoint for video cutting and rendering."""
|
||||
|
||||
import logging
|
||||
import tempfile
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from pydantic import BaseModel
|
||||
|
||||
from services.video_editor import export_stream_copy, export_reencode, export_reencode_with_subs, mix_background_music, concat_clips
|
||||
from services.audio_cleaner import clean_audio
|
||||
from services.caption_generator import generate_srt, generate_ass, save_captions
|
||||
from services.background_removal import remove_background_on_export as remove_bg
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
class SegmentModel(BaseModel):
    # A [start, end] time span in source-video seconds.
    start: float
    end: float


class GainRangeModel(SegmentModel):
    # Segment over which a gain adjustment (dB) is applied.
    gain_db: float


class SpeedRangeModel(SegmentModel):
    # Segment played back at a non-1.0 speed factor.
    speed: float


class ExportWordModel(BaseModel):
    # Transcript word with timings; used for caption/transcript export.
    word: str
    start: float
    end: float
    confidence: float = 0.0


class ZoomConfigModel(BaseModel):
    # Static zoom/pan configuration for the output.
    enabled: bool = False
    zoomFactor: float = 1.0
    panX: float = 0.0
    panY: float = 0.0


class BackgroundMusicModel(BaseModel):
    # Background-music track mixed under the main audio.
    path: str
    volumeDb: float = 0.0
    duckingEnabled: bool = False   # presumably lowers music under speech — confirm in video_editor
    duckingDb: float = 6.0
    duckingAttackMs: float = 10.0
    duckingReleaseMs: float = 200.0


class ExportRequest(BaseModel):
    # Full export job description for POST /export.
    input_path: str
    output_path: str
    keep_segments: List[SegmentModel]  # source spans to keep, in output order
    mute_ranges: Optional[List[SegmentModel]] = None
    gain_ranges: Optional[List[GainRangeModel]] = None
    speed_ranges: Optional[List[SpeedRangeModel]] = None
    global_gain_db: float = 0.0
    mode: str = "fast"  # NOTE(review): presumably "fast" = stream copy vs re-encode — confirm in video_editor
    resolution: str = "1080p"
    format: str = "mp4"
    enhanceAudio: bool = False
    normalize_loudness: bool = False
    normalize_target_lufs: float = -14.0
    captions: str = "none"
    words: Optional[List[ExportWordModel]] = None
    deleted_indices: Optional[List[int]] = None
    zoom: Optional[ZoomConfigModel] = None
    additional_clips: Optional[List[str]] = None  # extra clips pre-concatenated with the main input
    background_music: Optional[BackgroundMusicModel] = None
    remove_background: bool = False
    background_replacement: str = "blur"
    background_replacement_value: str = ""


class TranscriptExportRequest(BaseModel):
    # Transcript-only export (no video processing).
    words: List[ExportWordModel]
    deleted_indices: Optional[List[int]] = None
    output_path: str
    format: str = "txt"  # "txt" or "srt"
|
||||
|
||||
|
||||
def _map_ranges_to_output_timeline(
|
||||
ranges: List[dict],
|
||||
keep_segments: List[dict],
|
||||
) -> List[dict]:
|
||||
"""Map source-time ranges to output timeline after cuts are applied."""
|
||||
if not ranges or not keep_segments:
|
||||
return []
|
||||
|
||||
mapped: List[dict] = []
|
||||
output_cursor = 0.0
|
||||
for keep in keep_segments:
|
||||
keep_start = float(keep["start"])
|
||||
keep_end = float(keep["end"])
|
||||
keep_len = max(0.0, keep_end - keep_start)
|
||||
if keep_len <= 0:
|
||||
continue
|
||||
|
||||
for src_range in ranges:
|
||||
overlap_start = max(keep_start, float(src_range["start"]))
|
||||
overlap_end = min(keep_end, float(src_range["end"]))
|
||||
if overlap_end <= overlap_start:
|
||||
continue
|
||||
|
||||
mapped_range = {
|
||||
"start": output_cursor + (overlap_start - keep_start),
|
||||
"end": output_cursor + (overlap_end - keep_start),
|
||||
}
|
||||
if "gain_db" in src_range:
|
||||
mapped_range["gain_db"] = float(src_range["gain_db"])
|
||||
if "speed" in src_range:
|
||||
mapped_range["speed"] = float(src_range["speed"])
|
||||
mapped.append(mapped_range)
|
||||
|
||||
output_cursor += keep_len
|
||||
|
||||
return mapped
|
||||
|
||||
|
||||
def _mux_audio(video_path: str, audio_path: str, output_path: str) -> str:
    """Replace the audio track of ``video_path`` with ``audio_path``.

    The video bitstream is copied untouched; only the audio is remapped.
    Returns ``output_path`` on success; raises RuntimeError with the tail of
    FFmpeg's stderr on failure.
    """
    import subprocess
    proc = subprocess.run(
        [
            "ffmpeg", "-y",
            "-i", video_path,
            "-i", audio_path,
            "-c:v", "copy",    # no re-encode of the video stream
            "-map", "0:v:0",   # video from the first input
            "-map", "1:a:0",   # audio from the second input
            "-shortest",       # stop at the shorter of the two streams
            output_path,
        ],
        capture_output=True,
        text=True,
    )
    if proc.returncode != 0:
        raise RuntimeError(f"Audio mux failed: {proc.stderr[-300:]}")
    return output_path
|
||||
|
||||
|
||||
@router.post("/export")
async def export_video(req: ExportRequest):
    """Render the edited video: cut to keep_segments, apply audio filters,
    captions, zoom, then run optional post-processing (audio enhancement,
    background removal, music mixing) and write sidecar captions.

    Returns {"status": "ok", "output_path": ..., ["srt_path": ...]}.
    Raises HTTPException 400 for invalid requests, 500 for render failures.
    """
    try:
        # Convert pydantic range models into plain dicts for the encoders.
        segments = [{"start": s.start, "end": s.end} for s in req.keep_segments]
        mute_segments = [{"start": s.start, "end": s.end} for s in req.mute_ranges] if req.mute_ranges else None
        gain_segments = [{"start": s.start, "end": s.end, "gain_db": s.gain_db} for s in req.gain_ranges] if req.gain_ranges else None
        speed_segments = [{"start": s.start, "end": s.end, "speed": s.speed} for s in req.speed_ranges] if req.speed_ranges else None

        if not segments and not mute_segments:
            raise HTTPException(status_code=400, detail="No segments to export")

        # Convert zoom config to dict
        zoom_dict = None
        if req.zoom and req.zoom.enabled:
            zoom_dict = {
                "enabled": True,
                "zoomFactor": req.zoom.zoomFactor,
                "panX": req.zoom.panX,
                "panY": req.zoom.panY,
            }

        # Handle additional clips: pre-concat before main editing
        working_input = req.input_path
        has_additional = bool(req.additional_clips)
        if has_additional:
            try:
                concat_output = req.output_path + ".concat.mp4"
                concat_clips(req.input_path, req.additional_clips, concat_output)
                working_input = concat_output
                logger.info("Pre-concatenated %d additional clips into %s", len(req.additional_clips), concat_output)
            except Exception as e:
                logger.warning(f"Clip concatenation failed (non-fatal): {e}")
                # Fall back to main input only

        # Gain ranges are expressed in source time but applied after cuts,
        # so remap them onto the output timeline.
        # NOTE(review): mute_segments and speed_segments are passed to the
        # encoders in SOURCE time (unmapped) — confirm the encoders apply
        # those filters before cutting, otherwise they need the same mapping.
        mapped_gain_segments = _map_ranges_to_output_timeline(gain_segments or [], segments)

        # Tolerance comparison: treat near-zero global gain as "no gain".
        has_gain = abs(float(req.global_gain_db)) > 1e-6 or bool(gain_segments)
        has_speed = bool(speed_segments)

        if has_speed and (mute_segments or has_gain):
            raise HTTPException(
                status_code=400,
                detail="Speed zones currently cannot be combined with mute/gain filters in one export",
            )

        # Stream copy is only possible for a single untouched segment with
        # no filters, no zoom, and no pre-concatenated clips.
        use_stream_copy = req.mode == "fast" and len(segments) == 1 and not mute_segments and not has_gain and not has_speed and not zoom_dict and not has_additional
        needs_reencode_for_subs = req.captions == "burn-in"

        # Burn-in captions or audio filters require re-encode
        if needs_reencode_for_subs or mute_segments or has_gain or has_speed:
            use_stream_copy = False

        words_dicts = [w.model_dump() for w in req.words] if req.words else []
        deleted_set = set(req.deleted_indices or [])

        # Generate ASS file for burn-in
        ass_path = None
        if req.captions == "burn-in" and words_dicts:
            ass_content = generate_ass(words_dicts, deleted_set)
            # delete=False: FFmpeg must be able to open the file by path;
            # the finally block below removes it.
            tmp = tempfile.NamedTemporaryFile(suffix=".ass", delete=False, mode="w", encoding="utf-8")
            tmp.write(ass_content)
            tmp.close()
            ass_path = tmp.name

        try:
            if use_stream_copy:
                output = export_stream_copy(working_input, req.output_path, segments)
            elif ass_path:
                output = export_reencode_with_subs(
                    working_input,
                    req.output_path,
                    segments,
                    ass_path,
                    resolution=req.resolution,
                    format_hint=req.format,
                    mute_ranges=mute_segments,
                    gain_ranges=mapped_gain_segments,
                    speed_ranges=speed_segments,
                    global_gain_db=req.global_gain_db,
                    normalize_loudness=req.normalize_loudness,
                    normalize_target_lufs=req.normalize_target_lufs,
                    zoom_config=zoom_dict,
                )
            else:
                output = export_reencode(
                    working_input,
                    req.output_path,
                    segments,
                    resolution=req.resolution,
                    format_hint=req.format,
                    mute_ranges=mute_segments,
                    gain_ranges=mapped_gain_segments,
                    speed_ranges=speed_segments,
                    global_gain_db=req.global_gain_db,
                    normalize_loudness=req.normalize_loudness,
                    normalize_target_lufs=req.normalize_target_lufs,
                    zoom_config=zoom_dict,
                )
        finally:
            # Always remove the temporary subtitle file, even on failure.
            if ass_path and os.path.exists(ass_path):
                os.unlink(ass_path)

        # Audio enhancement: clean, then mux back into the exported video
        if req.enhanceAudio:
            try:
                tmp_dir = tempfile.mkdtemp(prefix="cutscript_audio_")
                cleaned_audio = os.path.join(tmp_dir, "cleaned.wav")
                clean_audio(output, cleaned_audio)

                muxed_path = output + ".muxed.mp4"
                _mux_audio(output, cleaned_audio, muxed_path)

                # Atomically swap the muxed result over the export.
                os.replace(muxed_path, output)
                logger.info(f"Audio enhanced and muxed into {output}")

                try:
                    os.remove(cleaned_audio)
                    os.rmdir(tmp_dir)
                except OSError:
                    pass
            except Exception as e:
                # Best-effort: a failed enhancement keeps the original audio.
                logger.warning(f"Audio enhancement failed (non-fatal): {e}")

        # Background removal (post-process)
        if req.remove_background:
            try:
                bg_output = output + ".nobg.mp4"
                remove_bg(output, bg_output, req.background_replacement, req.background_replacement_value)
                os.replace(bg_output, output)
                logger.info("Background removed from %s", output)
            except Exception as e:
                logger.warning(f"Background removal failed (non-fatal): {e}")

        # Background music mixing (post-process)
        if req.background_music:
            try:
                music_output = output + ".music.mp4"
                mix_background_music(
                    output,
                    req.background_music.path,
                    music_output,
                    volume_db=req.background_music.volumeDb,
                    ducking_enabled=req.background_music.duckingEnabled,
                    ducking_db=req.background_music.duckingDb,
                    ducking_attack_ms=req.background_music.duckingAttackMs,
                    ducking_release_ms=req.background_music.duckingReleaseMs,
                )
                os.replace(music_output, output)
                logger.info("Background music mixed into %s", output)
            except Exception as e:
                logger.warning(f"Background music mixing failed (non-fatal): {e}")

        # Sidecar SRT: generate and save alongside video
        srt_path = None
        if req.captions == "sidecar" and words_dicts:
            srt_content = generate_srt(words_dicts, deleted_set)
            srt_path = req.output_path.rsplit(".", 1)[0] + ".srt"
            save_captions(srt_content, srt_path)
            logger.info(f"Sidecar SRT saved to {srt_path}")

        # Cleanup pre-concat temp file
        if has_additional and working_input != req.input_path and os.path.exists(working_input):
            try:
                os.remove(working_input)
            except OSError:
                pass

        result = {"status": "ok", "output_path": output}
        if srt_path:
            result["srt_path"] = srt_path
        return result

    except HTTPException:
        # Re-raise deliberate HTTP errors untouched.
        raise
    except ValueError as e:
        # Validation-type failures map to 400.
        raise HTTPException(status_code=400, detail=str(e))
    except RuntimeError as e:
        # Encoder/FFmpeg failures map to 500.
        logger.error(f"Export failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
    except Exception as e:
        logger.error(f"Export error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/export/transcript")
async def export_transcript(req: TranscriptExportRequest):
    """Export transcript as plain text or SRT without rendering video."""
    try:
        from services.caption_generator import generate_srt

        deleted_set = set(req.deleted_indices or [])
        word_dicts = [w.model_dump() for w in req.words]

        if req.format == "srt":
            content = generate_srt(word_dicts, deleted_set)
        else:
            # Plain text: join the words that were not deleted in the editor.
            kept = [
                w["word"]
                for i, w in enumerate(word_dicts)
                if i not in deleted_set
            ]
            content = " ".join(kept)

        # Make sure the destination directory exists before writing.
        target_dir = os.path.dirname(req.output_path) or "."
        os.makedirs(target_dir, exist_ok=True)
        with open(req.output_path, "w", encoding="utf-8") as fh:
            fh.write(content)

        logger.info("Transcript exported to %s (format=%s)", req.output_path, req.format)
        return {"status": "ok", "output_path": req.output_path}

    except Exception as e:
        logger.error(f"Transcript export failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
|
||||
54
backend/routers/local_llm.py
Normal file
54
backend/routers/local_llm.py
Normal file
@ -0,0 +1,54 @@
|
||||
"""Local LLM endpoints for bundled Qwen3 inference."""
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from pydantic import BaseModel
|
||||
|
||||
from services.local_llm import get_status, download_model, complete
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
class CompleteRequest(BaseModel):
    # Request body for POST /local-llm/complete.
    prompt: str  # user prompt passed to the model verbatim
    model_id: str = "qwen3-1.7b"  # identifier of the bundled model
    system_prompt: Optional[str] = None  # optional system instruction
    temperature: float = 0.3  # sampling temperature
    max_tokens: int = 2048  # generation length cap
|
||||
|
||||
|
||||
@router.get("/local-llm/status")
async def llm_status():
    """Report the bundled local LLM's status (availability, downloads)."""
    try:
        status = get_status()
    except Exception as e:
        logger.error(f"Local LLM status failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
    return status
|
||||
|
||||
|
||||
@router.post("/local-llm/download")
async def llm_download(model_id: str = "qwen3-1.7b"):
    """Trigger a download of the requested bundled model."""
    try:
        return download_model(model_id)
    except Exception as err:
        logger.error(f"Local LLM download failed: {err}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(err))
|
||||
|
||||
|
||||
@router.post("/local-llm/complete")
async def llm_complete(req: CompleteRequest):
    """Run one completion against the bundled local LLM."""
    try:
        text = complete(
            prompt=req.prompt,
            model_id=req.model_id,
            system_prompt=req.system_prompt,
            temperature=req.temperature,
            max_tokens=req.max_tokens,
        )
    except Exception as err:
        logger.error(f"Local LLM completion failed: {err}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(err))
    return {"response": text}
|
||||
149
backend/routers/transcribe.py
Normal file
149
backend/routers/transcribe.py
Normal file
@ -0,0 +1,149 @@
|
||||
"""Transcription endpoint using WhisperX."""
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from pydantic import BaseModel
|
||||
|
||||
from services.transcription import transcribe_audio
|
||||
from services.diarization import diarize_and_label
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
class TranscribeRequest(BaseModel):
    # Request body for POST /transcribe.
    file_path: str  # path to the media file on disk
    model: str = "base"  # WhisperX model size
    language: Optional[str] = None  # language code; None = auto-detect
    use_gpu: bool = True  # prefer GPU when available
    use_cache: bool = True  # reuse cached transcription results
    diarize: bool = False  # run speaker diarization after transcription
    hf_token: Optional[str] = None  # HuggingFace token (required for diarization)
    num_speakers: Optional[int] = None  # speaker-count hint; None = auto
|
||||
|
||||
|
||||
@router.post("/transcribe")
async def transcribe(req: TranscribeRequest):
    """Transcribe a media file with WhisperX, optionally labeling speakers."""
    try:
        transcription = transcribe_audio(
            file_path=req.file_path,
            model_name=req.model,
            use_gpu=req.use_gpu,
            use_cache=req.use_cache,
            language=req.language,
        )

        # Diarization also needs a HuggingFace token; skip quietly otherwise.
        if req.diarize and req.hf_token:
            transcription = diarize_and_label(
                transcription_result=transcription,
                audio_path=req.file_path,
                hf_token=req.hf_token,
                num_speakers=req.num_speakers,
                use_gpu=req.use_gpu,
            )

        return transcription

    except FileNotFoundError:
        raise HTTPException(status_code=404, detail=f"File not found: {req.file_path}")
    except Exception as e:
        logger.error(f"Transcription failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
class ReTranscribeSegmentRequest(BaseModel):
    # Request body for POST /transcribe/segment.
    file_path: str  # path to the original media file
    start: float  # segment start time in seconds (source timeline)
    end: float  # segment end time in seconds (source timeline)
    model: str = "base"  # WhisperX model size
    language: Optional[str] = None  # language code; None = auto-detect
|
||||
|
||||
|
||||
@router.post("/transcribe/segment")
async def transcribe_segment(req: ReTranscribeSegmentRequest):
    """
    Re-transcribe a specific segment of audio.

    Extracts the segment with FFmpeg, transcribes it (GPU first, CPU
    fallback), and returns words/segments with timestamps shifted back
    onto the original file's timeline.

    Raises HTTPException 404 when the source file is missing, 500 on any
    other failure.
    """
    import subprocess
    import tempfile
    import os

    try:
        # Extract the segment to a temp file (16 kHz mono WAV).
        tmp_dir = tempfile.mkdtemp(prefix="talkedit_segment_")
        segment_path = os.path.join(tmp_dir, "segment.wav")

        try:
            cmd = [
                "ffmpeg", "-y",
                "-i", req.file_path,
                "-ss", str(req.start),
                "-to", str(req.end),
                "-vn",
                "-acodec", "pcm_s16le",
                "-ar", "16000",
                "-ac", "1",
                segment_path,
            ]
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode != 0:
                raise RuntimeError(f"Segment extraction failed: {result.stderr[-300:]}")

            # Transcribe the segment — try GPU first, fall back to CPU
            try:
                segment_result = transcribe_audio(
                    file_path=segment_path,
                    model_name=req.model,
                    use_gpu=True,
                    use_cache=False,
                    language=req.language,
                )
            except Exception as gpu_err:
                # Fix: lazy %-formatting; the original used an f-string
                # together with a %-style argument, leaving a literal "%s"
                # placeholder in the message template.
                logger.warning("GPU transcription failed (%s), falling back to CPU", gpu_err)
                segment_result = transcribe_audio(
                    file_path=segment_path,
                    model_name=req.model,
                    use_gpu=False,
                    use_cache=False,
                    language=req.language,
                )

            # Adjust timestamps to be relative to the original file
            offset = req.start
            adjusted_words = []
            for w in segment_result.get("words", []):
                w["start"] = round(w["start"] + offset, 3)
                w["end"] = round(w["end"] + offset, 3)
                adjusted_words.append(w)

            adjusted_segments = []
            for seg in segment_result.get("segments", []):
                seg["start"] = round(seg["start"] + offset, 3)
                seg["end"] = round(seg["end"] + offset, 3)
                # Also adjust words within each segment
                for w in seg.get("words", []):
                    w["start"] = round(w["start"] + offset, 3)
                    w["end"] = round(w["end"] + offset, 3)
                adjusted_segments.append(seg)

            return {
                "words": adjusted_words,
                "segments": adjusted_segments,
                "language": segment_result.get("language", "en"),
            }
        finally:
            # Fix: cleanup now runs even when extraction/transcription fails,
            # so failed requests no longer leak temp directories.
            try:
                if os.path.exists(segment_path):
                    os.remove(segment_path)
                os.rmdir(tmp_dir)
            except OSError:
                pass

    except FileNotFoundError:
        raise HTTPException(status_code=404, detail=f"File not found: {req.file_path}")
    except Exception as e:
        logger.error(f"Segment transcription failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
|
||||
0
backend/services/__init__.py
Normal file
0
backend/services/__init__.py
Normal file
211
backend/services/ai_provider.py
Normal file
211
backend/services/ai_provider.py
Normal file
@ -0,0 +1,211 @@
|
||||
"""
|
||||
Unified AI provider interface for Ollama, OpenAI, and Claude.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import Optional, List
|
||||
|
||||
import requests
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AIProvider:
    """Routes completion requests to the configured provider."""

    @staticmethod
    def complete(
        prompt: str,
        provider: str = "ollama",
        model: Optional[str] = None,
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
        system_prompt: Optional[str] = None,
        temperature: float = 0.3,
    ) -> str:
        """Dispatch one completion to the selected backend.

        Falls back to each provider's default model / endpoint when
        `model`, `api_key`, or `base_url` are omitted. Raises ValueError
        for an unrecognized provider name.
        """
        # Lazily-evaluated dispatch table: only the chosen backend runs.
        backends = {
            "ollama": lambda: _ollama_complete(prompt, model or "llama3", base_url or "http://localhost:11434", system_prompt, temperature),
            "openai": lambda: _openai_complete(prompt, model or "gpt-4o", api_key or "", system_prompt, temperature),
            "claude": lambda: _claude_complete(prompt, model or "claude-sonnet-4-20250514", api_key or "", system_prompt, temperature),
        }
        handler = backends.get(provider)
        if handler is None:
            raise ValueError(f"Unknown provider: {provider}")
        return handler()

    @staticmethod
    def list_ollama_models(base_url: str = "http://localhost:11434") -> List[str]:
        """Return locally installed Ollama model names; [] when unreachable."""
        try:
            resp = requests.get(f"{base_url}/api/tags", timeout=3)
            if resp.status_code == 200:
                return [entry["name"] for entry in resp.json().get("models", [])]
        except Exception:
            # Ollama not running / network error: treat as "no models".
            pass
        return []
|
||||
|
||||
|
||||
def _ollama_complete(prompt: str, model: str, base_url: str, system_prompt: Optional[str], temperature: float) -> str:
    """POST a non-streaming generate request to an Ollama server; return its text."""
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": temperature},
    }
    if system_prompt:
        payload["system"] = system_prompt

    try:
        response = requests.post(f"{base_url}/api/generate", json=payload, timeout=120)
        response.raise_for_status()
        # Missing "response" key degrades to an empty string.
        return response.json().get("response", "").strip()
    except Exception as e:
        logger.error(f"Ollama error: {e}")
        raise
|
||||
|
||||
|
||||
def _openai_complete(prompt: str, model: str, api_key: str, system_prompt: Optional[str], temperature: float) -> str:
    """Run a chat completion against the OpenAI API and return the reply text."""
    try:
        from openai import OpenAI

        client = OpenAI(api_key=api_key)
        # The system message (if any) precedes the user prompt.
        chat = [{"role": "system", "content": system_prompt}] if system_prompt else []
        chat.append({"role": "user", "content": prompt})

        completion = client.chat.completions.create(
            model=model,
            messages=chat,
            temperature=temperature,
        )
        return completion.choices[0].message.content.strip()
    except Exception as e:
        logger.error(f"OpenAI error: {e}")
        raise
|
||||
|
||||
|
||||
def _claude_complete(prompt: str, model: str, api_key: str, system_prompt: Optional[str], temperature: float) -> str:
    """Run a message completion against the Anthropic API and return the reply text."""
    try:
        import anthropic

        client = anthropic.Anthropic(api_key=api_key)
        request_args = {
            "model": model,
            "max_tokens": 4096,
            "temperature": temperature,
            "messages": [{"role": "user", "content": prompt}],
        }
        # Anthropic takes the system prompt as a top-level field.
        if system_prompt:
            request_args["system"] = system_prompt

        reply = client.messages.create(**request_args)
        return reply.content[0].text.strip()
    except Exception as e:
        logger.error(f"Claude error: {e}")
        raise
|
||||
|
||||
|
||||
def detect_filler_words(
    transcript: str,
    words: List[dict],
    provider: str = "ollama",
    model: Optional[str] = None,
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
    custom_filler_words: Optional[str] = None,
) -> dict:
    """
    Use an LLM to identify filler words in the transcript.
    Returns {"wordIndices": [...], "fillerWords": [{"index": N, "word": "...", "reason": "..."}]}

    Falls back to empty lists when the model's reply cannot be parsed as
    JSON. NOTE(review): the `transcript` parameter is currently unused —
    the prompt is built entirely from `words`; confirm whether it can be
    dropped or should be included in the prompt.
    """
    # One "index: word" line per word so the model can reference indices.
    word_list = "\n".join(f"{w['index']}: {w['word']}" for w in words)

    custom_line = ""
    if custom_filler_words and custom_filler_words.strip():
        custom_line = f"\n\nAdditionally, flag these user-specified filler words/phrases: {custom_filler_words.strip()}"

    prompt = f"""Analyze this transcript for filler words and verbal hesitations.

Filler words include: um, uh, uh huh, hmm, like (when used as filler), you know, so (when starting sentences unnecessarily), basically, actually, literally, right, I mean, kind of, sort of, well (when used as filler).

Also flag repeated words that indicate stammering (e.g., "I I I" or "the the").{custom_line}

Here are the words with their indices:
{word_list}

Return ONLY a valid JSON object with this exact structure:
{{"wordIndices": [list of integer indices to remove], "fillerWords": [{{"index": integer, "word": "the word", "reason": "brief reason"}}]}}

Be conservative -- only flag clear filler words, not words that are part of meaningful sentences."""

    system = "You are a precise text analysis tool. Return only valid JSON, no explanation."

    # Low temperature: we want deterministic extraction, not creativity.
    result_text = AIProvider.complete(
        prompt=prompt,
        provider=provider,
        model=model,
        api_key=api_key,
        base_url=base_url,
        system_prompt=system,
        temperature=0.1,
    )

    try:
        # Extract the outermost {...} span in case the model added prose.
        start = result_text.find("{")
        end = result_text.rfind("}") + 1
        if start >= 0 and end > start:
            return json.loads(result_text[start:end])
    except json.JSONDecodeError:
        logger.error(f"Failed to parse AI response as JSON: {result_text[:200]}")

    # Safe default: flag nothing rather than guessing.
    return {"wordIndices": [], "fillerWords": []}
|
||||
|
||||
|
||||
def create_clip_suggestion(
    transcript: str,
    words: List[dict],
    target_duration: int = 60,
    provider: str = "ollama",
    model: Optional[str] = None,
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
) -> dict:
    """
    Use an LLM to find the best clip segments in a transcript.

    Returns {"clips": [{"title", "startWordIndex", "endWordIndex",
    "startTime", "endTime", "reason"}, ...]}; {"clips": []} when the
    model's reply cannot be parsed as JSON.

    NOTE(review): the `transcript` parameter is currently unused — the
    prompt is built entirely from `words`; confirm whether it is needed.
    """
    # One line per word: index, quoted text, and its time span.
    word_list = "\n".join(
        f"{w['index']}: \"{w['word']}\" ({w.get('start', 0):.1f}s - {w.get('end', 0):.1f}s)"
        for w in words
    )

    prompt = f"""Analyze this transcript and find the most engaging {target_duration}-second segment(s) that would work well as a YouTube Short or social media clip.

Look for: compelling stories, surprising facts, emotional moments, clear explanations, humor, or quotable statements.

Words with indices and timestamps:
{word_list}

Return ONLY a valid JSON object:
{{"clips": [{{"title": "short catchy title", "startWordIndex": integer, "endWordIndex": integer, "startTime": float, "endTime": float, "reason": "why this segment is engaging"}}]}}

Suggest 1-3 clips, each approximately {target_duration} seconds long."""

    system = "You are a viral content expert. Return only valid JSON, no explanation."

    # Moderate temperature: some creativity is desirable for clip picking.
    result_text = AIProvider.complete(
        prompt=prompt,
        provider=provider,
        model=model,
        api_key=api_key,
        base_url=base_url,
        system_prompt=system,
        temperature=0.5,
    )

    try:
        # Extract the outermost {...} span in case the model added prose.
        start = result_text.find("{")
        end = result_text.rfind("}") + 1
        if start >= 0 and end > start:
            return json.loads(result_text[start:end])
    except json.JSONDecodeError:
        logger.error(f"Failed to parse clip suggestions: {result_text[:200]}")

    # Safe default: no suggestions rather than malformed ones.
    return {"clips": []}
|
||||
282
backend/services/audio_cleaner.py
Normal file
282
backend/services/audio_cleaner.py
Normal file
@ -0,0 +1,282 @@
|
||||
"""
|
||||
Audio noise reduction using DeepFilterNet.
|
||||
Falls back to a basic FFmpeg noise filter if DeepFilterNet is not installed.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DEEPFILTER_AVAILABLE = None
|
||||
enhance = None
|
||||
init_df = None
|
||||
load_audio = None
|
||||
save_audio = None
|
||||
|
||||
|
||||
_df_model = None
|
||||
_df_state = None
|
||||
|
||||
|
||||
def _ensure_deepfilter_loaded() -> bool:
    """Attempt the DeepFilterNet import once and cache the outcome.

    On success the df.enhance entry points are published into the module
    globals (enhance, init_df, load_audio, save_audio). Returns whether
    DeepFilterNet is importable.
    """
    global DEEPFILTER_AVAILABLE, enhance, init_df, load_audio, save_audio
    # A previous call already resolved availability (either way).
    if DEEPFILTER_AVAILABLE is not None:
        return DEEPFILTER_AVAILABLE

    try:
        # DeepFilterNet currently triggers a third-party torchaudio deprecation
        # warning on import in some environments; suppress only that warning.
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message=r".*torchaudio\._backend\.common\.AudioMetaData has been moved.*",
                category=UserWarning,
            )
            from df.enhance import enhance as _enhance, init_df as _init_df, load_audio as _load_audio, save_audio as _save_audio
    except ImportError:
        DEEPFILTER_AVAILABLE = False
        return DEEPFILTER_AVAILABLE

    enhance, init_df, load_audio, save_audio = _enhance, _init_df, _load_audio, _save_audio
    DEEPFILTER_AVAILABLE = True
    return DEEPFILTER_AVAILABLE
|
||||
|
||||
|
||||
def _init_deepfilter():
    """Lazily initialize and cache the DeepFilterNet (model, state) pair.

    Raises RuntimeError when DeepFilterNet is not importable. The model is
    built at most once; later calls return the cached pair.
    """
    global _df_model, _df_state
    if not _ensure_deepfilter_loaded():
        raise RuntimeError("DeepFilterNet is not available")
    if _df_model is None:
        logger.info("Initializing DeepFilterNet model")
        # init_df returns (model, state, extra); the third value is unused.
        _df_model, _df_state, _ = init_df()
    return _df_model, _df_state
|
||||
|
||||
|
||||
def clean_audio(
    input_path: str,
    output_path: str = "",
) -> str:
    """
    Apply noise reduction to an audio file.

    Uses DeepFilterNet for high-quality results when it is installed,
    otherwise falls back to FFmpeg's anlmdn filter.

    Returns: path to the cleaned audio file.
    """
    src = Path(input_path)
    if not output_path:
        # Default destination: sibling file with "_clean" appended to the stem.
        output_path = str(src.with_stem(src.stem + "_clean"))

    cleaner = _clean_with_deepfilter if is_deepfilter_available() else _clean_with_ffmpeg
    return cleaner(str(src), output_path)
|
||||
|
||||
|
||||
def _clean_with_deepfilter(input_path: str, output_path: str) -> str:
    """Denoise with DeepFilterNet: load at the model's sample rate, enhance, save."""
    df_model, df_state = _init_deepfilter()
    samples, info = load_audio(input_path, sr=df_state.sr())
    denoised = enhance(df_model, df_state, samples)
    save_audio(output_path, denoised, sr=df_state.sr())
    logger.info(f"DeepFilterNet cleaned audio saved to {output_path}")
    return output_path
|
||||
|
||||
|
||||
def _clean_with_ffmpeg(input_path: str, output_path: str) -> str:
    """Fallback: basic noise reduction using FFmpeg's anlmdn filter."""
    args = [
        "ffmpeg", "-y",
        "-i", input_path,
        "-af", "anlmdn=s=7:p=0.002:r=0.002:m=15",
        output_path,
    ]
    proc = subprocess.run(args, capture_output=True, text=True)
    if proc.returncode != 0:
        # Surface only the tail of stderr to keep the error readable.
        raise RuntimeError(f"FFmpeg audio cleaning failed: {proc.stderr[-300:]}")
    logger.info(f"FFmpeg cleaned audio saved to {output_path}")
    return output_path
|
||||
|
||||
|
||||
def is_deepfilter_available() -> bool:
    # Public wrapper so callers don't touch the lazy-import machinery directly.
    return _ensure_deepfilter_loaded()
|
||||
|
||||
|
||||
def detect_silence_ranges(input_path: str, min_silence_ms: int, silence_db: float):
    """Detect silence ranges using ffmpeg silencedetect.

    Returns a list of dicts: {start, end, duration} in seconds.
    """
    # Floor the detection window at 50 ms so silencedetect gets a sane value.
    min_silence_seconds = max(0.05, float(min_silence_ms) / 1000.0)
    noise_threshold = float(silence_db)

    cmd = [
        "ffmpeg",
        "-hide_banner",
        "-i",
        input_path,
        "-af",
        f"silencedetect=noise={noise_threshold}dB:d={min_silence_seconds}",
        "-f",
        "null",
        "-",
    ]
    proc = subprocess.run(cmd, capture_output=True, text=True)

    # silencedetect prints to stderr even on success.
    report = proc.stderr or ""
    starts = [
        float(m.group(1))
        for m in re.finditer(r"silence_start:\s*([0-9.]+)", report)
    ]
    ends = [
        (float(m.group(1)), float(m.group(2)))
        for m in re.finditer(r"silence_end:\s*([0-9.]+)\s*\|\s*silence_duration:\s*([0-9.]+)", report)
    ]

    ranges = []
    # Pair starts with ends positionally; a trailing unmatched start is dropped.
    for begin, (finish, length) in zip(starts, ends):
        begin = max(0.0, begin)
        if finish > begin and length >= min_silence_seconds:
            ranges.append({
                "start": round(begin, 3),
                "end": round(finish, 3),
                "duration": round(length, 3),
            })

    logger.info(
        "Detected %s silence ranges in %s (min=%sms, threshold=%sdB)",
        len(ranges),
        input_path,
        min_silence_ms,
        silence_db,
    )
    return ranges
|
||||
|
||||
|
||||
def normalize_audio(
    input_path: str,
    output_path: str = "",
    target_lufs: float = -14.0,
) -> str:
    """
    Normalize audio loudness to a target LUFS level using FFmpeg's loudnorm filter.

    Args:
        input_path: Path to the input audio/video file.
        output_path: Path for the normalized output. Auto-generated if empty.
        target_lufs: Target integrated loudness in LUFS.
            Common targets: -14 (YouTube), -16 (Spotify), -23 (broadcast).

    Returns: path to the normalized audio file.

    Raises:
        RuntimeError: when the FFmpeg normalization pass fails.
    """
    # Fix: removed an unused `import os as _os` that the original carried.
    inp = Path(input_path)
    if not output_path:
        output_path = str(inp.with_stem(inp.stem + "_normalized"))

    # Two-pass loudnorm: first pass measures loudness, second pass applies correction.
    # First pass: measure only (print_format=json)
    measure_cmd = [
        "ffmpeg", "-y",
        "-i", str(inp),
        "-af", f"loudnorm=I={target_lufs}:LRA=7:TP=-1.5:print_format=json",
        "-f", "null",
        "-",
    ]
    logger.info("Running loudnorm first pass (measurement): %s", " ".join(measure_cmd))
    measure_result = subprocess.run(measure_cmd, capture_output=True, text=True)

    # Parse measured parameters from stderr (loudnorm outputs JSON to stderr)
    measured = _parse_loudnorm_measurement(measure_result.stderr)
    if not measured:
        logger.warning(
            "loudnorm measurement failed or produced no output; "
            "falling back to single-pass normalization"
        )
        # Single-pass fallback: less accurate, but always applicable.
        cmd = [
            "ffmpeg", "-y",
            "-i", str(inp),
            "-af", f"loudnorm=I={target_lufs}:LRA=7:TP=-1.5",
            "-c:v", "copy",
            output_path,
        ]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            raise RuntimeError(f"Audio normalization failed: {result.stderr[-300:]}")
        logger.info("Single-pass normalized audio saved to %s", output_path)
        return output_path

    # Second pass: apply normalization using measured values.
    # Defaults guard against a partially-populated measurement block.
    input_i = measured.get("input_i", target_lufs)
    input_lra = measured.get("input_lra", 7.0)
    input_tp = measured.get("input_tp", -1.5)
    input_thresh = measured.get("input_thresh", -30.0)
    offset = measured.get("target_offset", 0.0)

    apply_cmd = [
        "ffmpeg", "-y",
        "-i", str(inp),
        "-af",
        (
            f"loudnorm=I={target_lufs}:LRA=7:TP=-1.5:"
            f"measured_I={input_i}:measured_LRA={input_lra}:"
            f"measured_TP={input_tp}:measured_thresh={input_thresh}:"
            f"offset={offset}:linear=true:print_format=summary"
        ),
        "-c:v", "copy",
        output_path,
    ]
    logger.info("Running loudnorm second pass (apply): %s", " ".join(apply_cmd))
    result = subprocess.run(apply_cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"Audio normalization (apply) failed: {result.stderr[-300:]}")

    logger.info(
        "Normalized audio saved to %s (target=%s LUFS, measured_I=%s)",
        output_path,
        target_lufs,
        input_i,
    )
    return output_path
|
||||
|
||||
|
||||
def _parse_loudnorm_measurement(stderr_output: str) -> dict:
|
||||
"""Parse loudnorm JSON measurement output from FFmpeg stderr."""
|
||||
import json
|
||||
|
||||
# loudnorm prints JSON block between "Parsed_loudnorm" lines
|
||||
lines = stderr_output.split("\n")
|
||||
json_lines = []
|
||||
in_json = False
|
||||
for line in lines:
|
||||
if "Parsed_loudnorm" in line and "}" in line:
|
||||
# Single-line JSON
|
||||
try:
|
||||
start = line.index("{")
|
||||
end = line.rindex("}") + 1
|
||||
return json.loads(line[start:end])
|
||||
except (ValueError, json.JSONDecodeError):
|
||||
continue
|
||||
if "{" in line and not in_json:
|
||||
in_json = True
|
||||
if in_json:
|
||||
json_lines.append(line)
|
||||
if in_json and "}" in line:
|
||||
in_json = False
|
||||
break
|
||||
|
||||
if json_lines:
|
||||
try:
|
||||
return json.loads("".join(json_lines))
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
return {}
|
||||
232
backend/services/background_removal.py
Normal file
232
backend/services/background_removal.py
Normal file
@ -0,0 +1,232 @@
|
||||
"""
|
||||
AI background removal using MediaPipe for person segmentation.
|
||||
Applied during export as a post-processing step — no real-time preview.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import subprocess
|
||||
import tempfile
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
MEDIAPIPE_AVAILABLE = False
|
||||
|
||||
try:
|
||||
import mediapipe as mp
|
||||
MEDIAPIPE_AVAILABLE = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
def is_available() -> bool:
    """Report whether MediaPipe-based person segmentation can be used."""
    return bool(MEDIAPIPE_AVAILABLE)
|
||||
|
||||
|
||||
def remove_background_on_export(
    input_path: str,
    output_path: str,
    replacement: str = "blur",
    replacement_value: str = "",
) -> str:
    """
    Apply background removal to a video as an export post-processing step.

    Dispatches to MediaPipe-based segmentation when available, otherwise to
    an FFmpeg-only fallback.

    Args:
        input_path: source video
        output_path: destination
        replacement: 'blur', 'color', or 'image'
        replacement_value: hex color or image path (for color/image modes)

    Returns:
        output_path
    """
    src = str(Path(input_path).resolve())
    dst = str(Path(output_path).resolve())

    handler = _remove_with_mediapipe if MEDIAPIPE_AVAILABLE else _remove_with_ffmpeg_portrait
    return handler(src, dst, replacement, replacement_value)
|
||||
|
||||
|
||||
def _remove_with_mediapipe(
    input_path: str,
    output_path: str,
    replacement: str = "blur",
    replacement_value: str = "",
) -> str:
    """Use MediaPipe Selfie Segmentation + FFmpeg for background removal.

    Extracts frames, applies person segmentation, composites the chosen
    replacement background, then re-encodes with the original audio muxed in.

    Args:
        input_path: source video file.
        output_path: destination video file.
        replacement: 'blur' (default), 'color', or 'image'.
        replacement_value: hex color (e.g. "#00FF00") or background image path.

    Returns:
        output_path on success.

    Raises:
        RuntimeError: if segmentation or the final FFmpeg encode fails.
    """
    import shutil

    try:
        import cv2
        import numpy as np
        import mediapipe as mp
    except ImportError:
        logger.warning("mediapipe/cv2 not available, falling back to FFmpeg portrait mode")
        return _remove_with_ffmpeg_portrait(input_path, output_path, replacement, replacement_value)

    cap = None
    temp_dir = None
    try:
        mp_selfie_segmentation = mp.solutions.selfie_segmentation

        # Resolve the replacement background up front.
        bg_color = None
        bg_image = None
        if replacement == "color":
            color_hex = (replacement_value or "#00FF00").lstrip("#")
            bg_color = tuple(int(color_hex[i:i + 2], 16) for i in (0, 2, 4))[::-1]  # RGB -> BGR
        elif replacement == "image":
            # A missing/unreadable image leaves bg_image as None; frames then
            # pass through unchanged (matches prior behavior).
            bg_image = cv2.imread(replacement_value) if replacement_value else None

        cap = cv2.VideoCapture(input_path)
        # Guard against 0/None fps from broken container metadata.
        fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Temp directory for processed frames; removed in the finally block.
        temp_dir = tempfile.mkdtemp(prefix="aive_bgrem_")
        frame_dir = os.path.join(temp_dir, "frames")
        os.makedirs(frame_dir, exist_ok=True)

        with mp_selfie_segmentation.SelfieSegmentation(model_selection=0) as segmenter:
            frame_idx = 0
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                # MediaPipe expects RGB input; OpenCV delivers BGR.
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                mask = segmenter.process(rgb).segmentation_mask
                condition = mask > 0.5  # True = person, False = background

                if replacement == "blur":
                    blurred = cv2.GaussianBlur(frame, (99, 99), 0)
                    output_frame = np.where(condition[..., None], frame, blurred)
                elif replacement == "color":
                    bg = np.full(frame.shape, bg_color, dtype=np.uint8)
                    output_frame = np.where(condition[..., None], frame, bg)
                elif replacement == "image" and bg_image is not None:
                    bg_resized = cv2.resize(bg_image, (width, height))
                    output_frame = np.where(condition[..., None], frame, bg_resized)
                else:
                    output_frame = frame

                cv2.imwrite(os.path.join(frame_dir, f"frame_{frame_idx:06d}.png"), output_frame)
                frame_idx += 1
                if frame_idx % 100 == 0:
                    logger.info("Background removal: %d/%d frames", frame_idx, total_frames)

        cap.release()
        cap = None

        # Encode processed frames back to video, muxing the original audio (if any).
        cmd = [
            "ffmpeg", "-y",
            "-framerate", str(fps),
            "-i", os.path.join(frame_dir, "frame_%06d.png"),
            "-i", input_path,
            "-map", "0:v:0",
            "-map", "1:a:0?",
            "-c:v", "libx264", "-preset", "medium", "-crf", "18",
            "-c:a", "aac", "-b:a", "192k",
            "-shortest",
            "-pix_fmt", "yuv420p",
            output_path,
        ]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            raise RuntimeError(f"FFmpeg frame encode failed: {result.stderr[-500:]}")

        logger.info("MediaPipe background removal completed -> %s", output_path)
        return output_path
    except RuntimeError:
        # Already descriptive; avoid double-wrapping the message.
        raise
    except Exception as e:
        raise RuntimeError(f"MediaPipe background removal failed: {e}")
    finally:
        # Always release the capture and remove temporary frames, even on failure.
        if cap is not None:
            cap.release()
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
|
||||
|
||||
|
||||
def _remove_with_ffmpeg_portrait(
    input_path: str,
    output_path: str,
    replacement: str = "blur",
    replacement_value: str = "",
) -> str:
    """Fallback: basic FFmpeg-only background blur.

    Uses a strong gaussian blur as a crude background replacement.
    For proper person segmentation (color/image replacement), install:
        pip install mediapipe opencv-python

    Args:
        input_path: source video path.
        output_path: destination video path.
        replacement: 'blur', 'color', or 'image'.
        replacement_value: hex color (color mode) or image path (image mode).

    Returns:
        output_path.

    Raises:
        RuntimeError: if the FFmpeg invocation exits non-zero.
    """
    ffmpeg = "ffmpeg"

    if replacement == "blur":
        # No segmentation here: the entire frame is blurred, not just the background.
        filter_complex = "gblur=sigma=30"
    elif replacement == "color":
        color = replacement_value or "00FF00"
        # NOTE(review): this keys `color` out of a copy of the frame and overlays
        # it back onto the original — visually near a no-op unless the footage
        # already contains that color. Confirm the intended effect.
        filter_complex = (
            f"split[fg][bg];"
            f"[bg]colorkey=0x{color}:0.3:0.1[bg_key];"
            f"[fg][bg_key]overlay"
        )
    elif replacement == "image" and replacement_value:
        # Escape backslashes and drive colons for FFmpeg's movie filter syntax.
        escaped = replacement_value.replace("\\", "/").replace(":", "\\:")
        # NOTE(review): the overlay covers the whole frame with the image (no
        # person mask) — presumably a placeholder until MediaPipe is installed.
        filter_complex = (
            f"movie='{escaped}':loop=0,scale=iw:ih[bg];"
            f"[0:v][bg]overlay=0:0:shortest=1"
        )
    else:
        filter_complex = "null"

    if filter_complex == "null":
        # Nothing to do: stream-copy the input untouched.
        cmd = [ffmpeg, "-y", "-i", input_path, "-c", "copy", output_path]
    else:
        cmd = [
            ffmpeg, "-y",
            "-i", input_path,
            "-vf", filter_complex,
            "-c:v", "libx264", "-preset", "medium", "-crf", "18",
            "-c:a", "aac", "-b:a", "192k",
            "-movflags", "+faststart",
            output_path,
        ]

    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"FFmpeg background removal failed: {result.stderr[-500:]}")

    logger.warning(
        "FFmpeg fallback background removal used (no MediaPipe). "
        "Install 'mediapipe' and 'opencv-python' for proper person segmentation."
    )
    return output_path
|
||||
148
backend/services/caption_generator.py
Normal file
148
backend/services/caption_generator.py
Normal file
@ -0,0 +1,148 @@
|
||||
"""
|
||||
Generate caption files (SRT, VTT, ASS) from word-level timestamps.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _format_srt_time(seconds: float) -> str:
|
||||
h = int(seconds // 3600)
|
||||
m = int((seconds % 3600) // 60)
|
||||
s = int(seconds % 60)
|
||||
ms = int((seconds % 1) * 1000)
|
||||
return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
|
||||
|
||||
|
||||
def _format_vtt_time(seconds: float) -> str:
|
||||
h = int(seconds // 3600)
|
||||
m = int((seconds % 3600) // 60)
|
||||
s = int(seconds % 60)
|
||||
ms = int((seconds % 1) * 1000)
|
||||
return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"
|
||||
|
||||
|
||||
def _format_ass_time(seconds: float) -> str:
|
||||
h = int(seconds // 3600)
|
||||
m = int((seconds % 3600) // 60)
|
||||
s = int(seconds % 60)
|
||||
cs = int((seconds % 1) * 100)
|
||||
return f"{h}:{m:02d}:{s:02d}.{cs:02d}"
|
||||
|
||||
|
||||
def generate_srt(
    words: List[dict],
    deleted_indices: Optional[set] = None,
    words_per_line: int = 8,
) -> str:
    """Generate SRT caption content from word-level timestamps.

    Words whose index appears in ``deleted_indices`` are excluded; the
    remainder is grouped into numbered cues of ``words_per_line`` words.
    """
    removed = deleted_indices or set()
    kept = [w for i, w in enumerate(words) if i not in removed]

    out: List[str] = []
    cue_number = 0
    for offset in range(0, len(kept), words_per_line):
        group = kept[offset:offset + words_per_line]
        if not group:
            continue
        cue_number += 1
        window = f"{_format_srt_time(group[0]['start'])} --> {_format_srt_time(group[-1]['end'])}"
        out.extend([str(cue_number), window, " ".join(w["word"] for w in group), ""])

    return "\n".join(out)
|
||||
|
||||
|
||||
def generate_vtt(
    words: List[dict],
    deleted_indices: Optional[set] = None,
    words_per_line: int = 8,
) -> str:
    """Generate WebVTT caption content from word-level timestamps.

    Same grouping rules as generate_srt, but with the WEBVTT header and
    no cue numbers.
    """
    removed = deleted_indices or set()
    kept = [w for i, w in enumerate(words) if i not in removed]

    out: List[str] = ["WEBVTT", ""]
    for offset in range(0, len(kept), words_per_line):
        group = kept[offset:offset + words_per_line]
        if not group:
            continue
        out.append(f"{_format_vtt_time(group[0]['start'])} --> {_format_vtt_time(group[-1]['end'])}")
        out.append(" ".join(w["word"] for w in group))
        out.append("")

    return "\n".join(out)
|
||||
|
||||
|
||||
def generate_ass(
    words: List[dict],
    deleted_indices: Optional[set] = None,
    words_per_line: int = 8,
    style: Optional[dict] = None,
) -> str:
    """Generate ASS subtitle content with styling.

    The optional ``style`` dict may provide fontName, fontSize, fontColor
    (ASS &H color string) and bold; sensible defaults are used otherwise.
    """
    removed = deleted_indices or set()
    kept = [w for i, w in enumerate(words) if i not in removed]

    opts = style or {}
    font = opts.get("fontName", "Arial")
    size = opts.get("fontSize", 48)
    color = opts.get("fontColor", "&H00FFFFFF")
    bold = "-1" if opts.get("bold", True) else "0"
    alignment = 2  # bottom-center

    header = f"""[Script Info]
Title: AI Video Editor Captions
ScriptType: v4.00+
PlayResX: 1920
PlayResY: 1080

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,{font},{size},{color},&H000000FF,&H00000000,&H80000000,{bold},0,0,0,100,100,0,0,1,2,1,{alignment},20,20,40,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""

    dialogue: List[str] = []
    for offset in range(0, len(kept), words_per_line):
        group = kept[offset:offset + words_per_line]
        if not group:
            continue
        begin = _format_ass_time(group[0]["start"])
        finish = _format_ass_time(group[-1]["end"])
        caption = " ".join(w["word"] for w in group)
        dialogue.append(
            f"Dialogue: 0,{begin},{finish},Default,,0,0,0,,{caption}"
        )

    return header + "\n".join(dialogue) + "\n"
|
||||
|
||||
|
||||
def save_captions(
    content: str,
    output_path: str,
) -> str:
    """Write caption content to a file (UTF-8), creating parent directories
    as needed, and return the path written."""
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(content, encoding="utf-8")
    logger.info(f"Saved captions to {target}")
    return str(target)
|
||||
98
backend/services/diarization.py
Normal file
98
backend/services/diarization.py
Normal file
@ -0,0 +1,98 @@
|
||||
"""
|
||||
Speaker diarization service using pyannote.audio.
|
||||
Refactored from the original repo -- removed Streamlit dependency.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
from utils.gpu_utils import get_optimal_device
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_pipeline_cache = {}
|
||||
|
||||
|
||||
def _get_pipeline(hf_token: str, device: torch.device):
    """Load (and memoize, per device) the pyannote diarization pipeline.

    Returns None when pyannote is unavailable or loading fails.
    """
    key = str(device)
    cached = _pipeline_cache.get(key)
    if cached is not None:
        return cached

    try:
        from pyannote.audio import Pipeline

        pipe = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.0",
            use_auth_token=hf_token,
        )
        # Only move explicitly for CUDA; CPU is the default placement.
        if device.type == "cuda":
            pipe = pipe.to(device)

        _pipeline_cache[key] = pipe
        return pipe
    except Exception as e:
        logger.error(f"Failed to load diarization pipeline: {e}")
        return None
|
||||
|
||||
|
||||
def diarize_and_label(
    transcription_result: dict,
    audio_path: str,
    hf_token: Optional[str] = None,
    num_speakers: Optional[int] = None,
    use_gpu: bool = True,
) -> dict:
    """
    Apply speaker diarization to an existing transcription result.

    Adds a 'speaker' field (in place) to each word and segment and returns
    the mutated transcription_result. On any failure — missing token,
    pipeline unavailable, diarization error — the input is returned unchanged.
    """
    token = hf_token or os.environ.get("HF_TOKEN")
    if not token:
        logger.warning("No HuggingFace token provided; skipping diarization")
        return transcription_result

    device = get_optimal_device() if use_gpu else torch.device("cpu")
    pipeline = _get_pipeline(token, device)
    if pipeline is None:
        return transcription_result

    audio_path = Path(audio_path)
    logger.info(f"Running diarization on {audio_path}")

    try:
        diarization = pipeline(str(audio_path), num_speakers=num_speakers)
    except Exception as e:
        logger.error(f"Diarization failed: {e}")
        return transcription_result

    # Flatten the diarization output into (start, end, label) turns.
    turns = [
        (turn.start, turn.end, label)
        for turn, _, label in diarization.itertracks(yield_label=True)
    ]

    def _best_speaker(start: float, end: float) -> str:
        # Pick the speaker turn with the largest temporal overlap.
        winner = "UNKNOWN"
        best = 0
        for t_start, t_end, label in turns:
            overlap = max(0, min(end, t_end) - max(start, t_start))
            if overlap > best:
                best = overlap
                winner = label
        return winner

    for word in transcription_result.get("words", []):
        word["speaker"] = _best_speaker(word["start"], word["end"])

    for segment in transcription_result.get("segments", []):
        segment["speaker"] = _best_speaker(segment["start"], segment["end"])
        for w in segment.get("words", []):
            w["speaker"] = _best_speaker(w["start"], w["end"])

    return transcription_result
|
||||
125
backend/services/local_llm.py
Normal file
125
backend/services/local_llm.py
Normal file
@ -0,0 +1,125 @@
|
||||
"""
|
||||
Local LLM inference using llama.cpp via llama-cpp-python.
|
||||
Handles model download from HuggingFace and text completion.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
LOCAL_MODELS_DIR = Path.home() / ".cache" / "talkedit" / "models"
|
||||
QWEN_MODELS = {
|
||||
"qwen3-1.7b": {
|
||||
"repo": "Qwen/Qwen3-1.7B-Instruct-GGUF",
|
||||
"file": "qwen3-1.7b-instruct-q4_k_m.gguf",
|
||||
"size_gb": 1.0,
|
||||
},
|
||||
"qwen3-4b": {
|
||||
"repo": "Qwen/Qwen3-4B-Instruct-GGUF",
|
||||
"file": "qwen3-4b-instruct-q4_k_m.gguf",
|
||||
"size_gb": 2.5,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _ensure_llama_cpp() -> bool:
|
||||
try:
|
||||
from llama_cpp import Llama
|
||||
return True
|
||||
except ImportError:
|
||||
return False
|
||||
|
||||
|
||||
def _model_path(model_id: str) -> Path:
    """Resolve the on-disk GGUF file path for a known model id.

    Raises:
        ValueError: if model_id is not a key of QWEN_MODELS.
    """
    info = QWEN_MODELS.get(model_id)
    if info is None:
        raise ValueError(f"Unknown model: {model_id}")
    return LOCAL_MODELS_DIR / model_id / info["file"]
|
||||
|
||||
|
||||
def get_status() -> dict:
    """Report local LLM setup status: llama.cpp availability and per-model
    download state (whether each GGUF is present and its size on disk)."""
    models = {}
    for model_id, info in QWEN_MODELS.items():
        path = _model_path(model_id)
        downloaded = path.exists()
        models[model_id] = {
            "downloaded": downloaded,
            "size_bytes": path.stat().st_size if downloaded else 0,
            "total_gb": info["size_gb"],
        }

    return {
        "llama_cpp_available": _ensure_llama_cpp(),
        "models": models,
        "models_dir": str(LOCAL_MODELS_DIR),
    }
|
||||
|
||||
|
||||
def download_model(model_id: str) -> dict:
    """Download a Qwen3 GGUF model from HuggingFace.

    Args:
        model_id: key into QWEN_MODELS (e.g. 'qwen3-1.7b').

    Returns:
        dict with 'status' ('already_downloaded' | 'downloaded') and 'path'.

    Raises:
        ValueError: for an unknown model_id.
        subprocess.CalledProcessError: if the downloader exits non-zero.
        RuntimeError: if the expected file is missing after download.
    """
    info = QWEN_MODELS.get(model_id)
    if not info:
        raise ValueError(f"Unknown model: {model_id}")

    model_dir = LOCAL_MODELS_DIR / model_id
    model_dir.mkdir(parents=True, exist_ok=True)
    output_path = model_dir / info["file"]

    # Idempotent: skip the network entirely if the file is already present.
    if output_path.exists():
        return {"status": "already_downloaded", "path": str(output_path)}

    logger.info(f"Downloading {info['repo']}/{info['file']} ({info['size_gb']} GB)...")
    # NOTE(review): `python -m huggingface_hub download` assumes the installed
    # huggingface_hub exposes a module-level CLI, and --local-dir-use-symlinks
    # is deprecated in newer releases — verify against the pinned version.
    subprocess.run([
        sys.executable, "-m", "huggingface_hub", "download",
        info["repo"], info["file"],
        "--local-dir", str(model_dir),
        "--local-dir-use-symlinks", "False",
    ], check=True)

    if not output_path.exists():
        raise RuntimeError(f"Download failed: {output_path} not found")

    return {"status": "downloaded", "path": str(output_path)}
|
||||
|
||||
|
||||
def complete(
    prompt: str,
    model_id: str = "qwen3-1.7b",
    system_prompt: Optional[str] = None,
    temperature: float = 0.3,
    max_tokens: int = 2048,
) -> str:
    """Run chat completion against a local Qwen3 GGUF model via llama.cpp.

    Raises:
        RuntimeError: if the requested model has not been downloaded yet.
    """
    gguf_path = _model_path(model_id)
    if not gguf_path.exists():
        raise RuntimeError(f"Model not downloaded: {model_id}")

    from llama_cpp import Llama

    # The model is loaded fresh on every call; n_gpu_layers=-1 offloads all
    # layers when llama.cpp was built with GPU support.
    llm = Llama(
        model_path=str(gguf_path),
        n_ctx=4096,
        n_threads=4,
        n_gpu_layers=-1,
        verbose=False,
    )

    chat = [{"role": "system", "content": system_prompt}] if system_prompt else []
    chat.append({"role": "user", "content": prompt})

    reply = llm.create_chat_completion(
        messages=chat,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return reply["choices"][0]["message"]["content"].strip()
|
||||
205
backend/services/transcription.py
Normal file
205
backend/services/transcription.py
Normal file
@ -0,0 +1,205 @@
|
||||
"""
|
||||
WhisperX-based transcription service with word-level alignment.
|
||||
Falls back to standard Whisper if WhisperX is not available.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
from utils.gpu_utils import get_optimal_device, configure_gpu
|
||||
from utils.audio_processing import extract_audio
|
||||
from utils.cache import load_from_cache, save_to_cache
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_model_cache: dict = {}
|
||||
|
||||
try:
|
||||
import whisperx
|
||||
WHISPERX_AVAILABLE = True
|
||||
except ImportError:
|
||||
WHISPERX_AVAILABLE = False
|
||||
import whisper
|
||||
|
||||
try:
|
||||
HF_TOKEN = None
|
||||
import os
|
||||
HF_TOKEN = os.environ.get("HF_TOKEN")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def _get_device(use_gpu: bool = True) -> torch.device:
|
||||
if use_gpu:
|
||||
return get_optimal_device()
|
||||
return torch.device("cpu")
|
||||
|
||||
|
||||
def _load_model(model_name: str, device: torch.device):
    """Load (and memoize, per model+device) a WhisperX or Whisper model."""
    key = f"{model_name}_{device}"
    if key in _model_cache:
        return _model_cache[key]

    logger.info(f"Loading model: {model_name} on {device}")
    if WHISPERX_AVAILABLE:
        # float16 needs CUDA; int8 keeps CPU inference memory-friendly.
        model = whisperx.load_model(
            model_name,
            device=device.type,  # use "cuda" not "cuda:0" — some WhisperX versions don't support device ordinal
            compute_type="float16" if device.type == "cuda" else "int8",
        )
    else:
        model = whisper.load_model(model_name, device=str(device))

    _model_cache[key] = model
    return model
|
||||
|
||||
|
||||
def transcribe_audio(
    file_path: str,
    model_name: str = "base",
    use_gpu: bool = True,
    use_cache: bool = True,
    language: Optional[str] = None,
) -> dict:
    """
    Transcribe an audio/video file and return word-level timestamps.

    Video inputs have their audio track extracted first; results are cached
    per (file, model) when use_cache is True.

    Returns:
        dict with keys: words, segments, language
    """
    file_path = Path(file_path)

    if use_cache:
        cached = load_from_cache(file_path, model_name, "transcribe_wx")
        if cached:
            logger.info("Using cached transcription")
            return cached

    # Pull the audio track out of common video containers before transcribing.
    if file_path.suffix.lower() in {".mp4", ".avi", ".mov", ".mkv", ".webm"}:
        audio_path = extract_audio(file_path)
    else:
        audio_path = file_path

    device = _get_device(use_gpu)
    model = _load_model(model_name, device)
    logger.info(f"Transcribing: {file_path}")

    if WHISPERX_AVAILABLE:
        result = _transcribe_whisperx(model, str(audio_path), device, language)
    else:
        result = _transcribe_standard(model, str(audio_path), language)

    if use_cache:
        save_to_cache(file_path, result, model_name, "transcribe_wx")

    return result
|
||||
|
||||
|
||||
def _transcribe_whisperx(model, audio_path: str, device: torch.device, language: Optional[str]) -> dict:
    """Transcribe with WhisperX, then force-align for word-level timestamps.

    Args:
        model: a loaded WhisperX model.
        audio_path: path to a decoded audio file.
        device: torch device used for the alignment model.
        language: optional language override; autodetected otherwise.

    Returns:
        dict with 'words' (flat list), 'segments' (with nested words) and
        'language'. The flat list shares the same word dicts as the segments.
    """

    def _word_entry(w: dict) -> dict:
        # Normalize a WhisperX word record to the app's schema.
        return {
            "word": w.get("word", ""),
            "start": round(w.get("start", 0), 3),
            "end": round(w.get("end", 0), 3),
            "confidence": round(w.get("score", 0), 3),
        }

    audio = whisperx.load_audio(audio_path)
    transcribe_opts = {"language": language} if language else {}

    result = model.transcribe(audio, batch_size=16, **transcribe_opts)
    detected_language = result.get("language", "en")

    align_model, align_metadata = whisperx.load_align_model(
        language_code=detected_language,
        device=device.type,
    )
    # NOTE(review): align receives str(device) (possibly "cuda:0") while
    # load_align_model gets device.type ("cuda") — confirm both forms are
    # accepted by the pinned WhisperX version.
    aligned = whisperx.align(
        result["segments"],
        align_model,
        align_metadata,
        audio,
        str(device),
        return_char_alignments=False,
    )

    words = []
    segments = []
    for i, seg in enumerate(aligned.get("segments", [])):
        seg_words = [_word_entry(w) for w in seg.get("words", [])]
        words.extend(seg_words)
        segments.append({
            "id": i,
            "start": round(seg.get("start", 0), 3),
            "end": round(seg.get("end", 0), 3),
            "text": seg.get("text", "").strip(),
            "words": seg_words,
        })

    return {
        "words": words,
        "segments": segments,
        "language": detected_language,
    }
|
||||
|
||||
|
||||
def _transcribe_standard(model, audio_path: str, language: Optional[str]) -> dict:
|
||||
"""Fallback: standard Whisper (segment-level only, synthesized word timestamps)."""
|
||||
opts = {}
|
||||
if language:
|
||||
opts["language"] = language
|
||||
|
||||
result = model.transcribe(audio_path, **opts)
|
||||
detected_language = result.get("language", "en")
|
||||
|
||||
words = []
|
||||
segments = []
|
||||
|
||||
for i, seg in enumerate(result.get("segments", [])):
|
||||
text = seg.get("text", "").strip()
|
||||
seg_start = seg.get("start", 0)
|
||||
seg_end = seg.get("end", 0)
|
||||
seg_words_text = text.split()
|
||||
duration = seg_end - seg_start
|
||||
|
||||
seg_words = []
|
||||
for j, w_text in enumerate(seg_words_text):
|
||||
w_start = seg_start + (j / max(len(seg_words_text), 1)) * duration
|
||||
w_end = seg_start + ((j + 1) / max(len(seg_words_text), 1)) * duration
|
||||
word_obj = {
|
||||
"word": w_text,
|
||||
"start": round(w_start, 3),
|
||||
"end": round(w_end, 3),
|
||||
"confidence": 0.5,
|
||||
}
|
||||
words.append(word_obj)
|
||||
seg_words.append(word_obj)
|
||||
|
||||
segments.append({
|
||||
"id": i,
|
||||
"start": round(seg_start, 3),
|
||||
"end": round(seg_end, 3),
|
||||
"text": text,
|
||||
"words": seg_words,
|
||||
})
|
||||
|
||||
return {
|
||||
"words": words,
|
||||
"segments": segments,
|
||||
"language": detected_language,
|
||||
}
|
||||
824
backend/services/video_editor.py
Normal file
824
backend/services/video_editor.py
Normal file
@ -0,0 +1,824 @@
|
||||
"""
|
||||
FFmpeg-based video cutting engine.
|
||||
Uses stream copy for fast, lossless cuts and falls back to re-encode when needed.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import subprocess
|
||||
import tempfile
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _get_codec_args(format_hint: str, has_video: bool = True) -> list:
|
||||
"""Return FFmpeg codec arguments for the given format."""
|
||||
if format_hint == "wav":
|
||||
return ["-c:a", "pcm_s16le"]
|
||||
if format_hint == "webm":
|
||||
if has_video:
|
||||
return ["-c:v", "libvpx-vp9", "-crf", "30", "-b:v", "0", "-c:a", "libopus"]
|
||||
return ["-c:a", "libopus", "-b:a", "160k"]
|
||||
# Default: MP4
|
||||
if has_video:
|
||||
return ["-c:v", "libx264", "-preset", "medium", "-crf", "18", "-c:a", "aac", "-b:a", "192k"]
|
||||
return ["-c:a", "aac", "-b:a", "192k"]
|
||||
|
||||
|
||||
def _input_has_video_stream(ffmpeg_cmd: str, input_path: str) -> bool:
|
||||
"""Return True if the input contains at least one video stream."""
|
||||
ffprobe = ffmpeg_cmd.replace("ffmpeg", "ffprobe")
|
||||
cmd = [
|
||||
ffprobe,
|
||||
"-v", "error",
|
||||
"-select_streams", "v:0",
|
||||
"-show_entries", "stream=index",
|
||||
"-of", "csv=p=0",
|
||||
str(input_path),
|
||||
]
|
||||
try:
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
return result.returncode == 0 and bool(result.stdout.strip())
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def _input_has_audio_stream(ffmpeg_cmd: str, input_path: str) -> bool:
|
||||
"""Return True if the input contains at least one audio stream."""
|
||||
ffprobe = ffmpeg_cmd.replace("ffmpeg", "ffprobe")
|
||||
cmd = [
|
||||
ffprobe,
|
||||
"-v", "error",
|
||||
"-select_streams", "a:0",
|
||||
"-show_entries", "stream=index",
|
||||
"-of", "csv=p=0",
|
||||
str(input_path),
|
||||
]
|
||||
try:
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
return result.returncode == 0 and bool(result.stdout.strip())
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def _clamp_speed(speed: float) -> float:
|
||||
return max(0.25, min(4.0, float(speed)))
|
||||
|
||||
|
||||
def _build_atempo_chain(speed: float) -> str:
    """Build an FFmpeg atempo filter chain for an arbitrary speed multiplier.

    Each atempo node only accepts factors in [0.5, 2.0], so larger/smaller
    speeds are decomposed into a product of supported factors; the residual
    factor is always emitted last with 6-decimal precision.
    """
    remaining = _clamp_speed(speed)
    parts = []
    while remaining > 2.0:
        parts.append("atempo=2.0")
        remaining /= 2.0
    while remaining < 0.5:
        parts.append("atempo=0.5")
        remaining /= 0.5
    parts.append(f"atempo={remaining:.6f}")
    return ",".join(parts)
|
||||
|
||||
|
||||
def _split_keep_segments_by_speed(
    keep_segments: List[dict],
    speed_ranges: List[dict] = None,
) -> List[dict]:
    """Split keep segments at speed-range boundaries.

    Each resulting piece carries the speed multiplier of the range that
    fully contains it (1.0 where no range applies). Zero/negative-length
    ranges and segments are dropped.
    """
    if not keep_segments:
        return []

    # Sanitize and order the speed ranges.
    ranges = []
    for r in speed_ranges or []:
        r_start = float(r.get("start", 0.0))
        r_end = float(r.get("end", 0.0))
        if r_end > r_start:
            ranges.append({
                "start": r_start,
                "end": r_end,
                "speed": _clamp_speed(float(r.get("speed", 1.0))),
            })
    ranges.sort(key=lambda x: x["start"])

    pieces = []
    for keep in keep_segments:
        k_start = float(keep["start"])
        k_end = float(keep["end"])
        if k_end <= k_start:
            continue

        # Collect every range boundary that falls inside this keep segment.
        boundaries = {k_start, k_end}
        for sr in ranges:
            lo = max(k_start, sr["start"])
            hi = min(k_end, sr["end"])
            if hi > lo:
                boundaries.update((lo, hi))

        ordered = sorted(boundaries)
        for lo, hi in zip(ordered, ordered[1:]):
            # Drop degenerate slivers produced by touching boundaries.
            if hi - lo < 1e-6:
                continue
            piece_speed = next(
                (sr["speed"] for sr in ranges if lo >= sr["start"] and hi <= sr["end"]),
                1.0,
            )
            pieces.append({"start": lo, "end": hi, "speed": piece_speed})

    return pieces
|
||||
|
||||
|
||||
def _build_zoom_filter(zoom_config: dict = None) -> str:
|
||||
"""Build FFmpeg video filter snippet for zoom/punch-in effect.
|
||||
|
||||
zoom_config: {enabled, zoomFactor, panX, panY}
|
||||
Returns empty string if disabled. Should be prepended to the video filter chain.
|
||||
"""
|
||||
if not zoom_config or not zoom_config.get("enabled"):
|
||||
return ""
|
||||
factor = float(zoom_config.get("zoomFactor", 1.0))
|
||||
if abs(factor - 1.0) < 0.01:
|
||||
return ""
|
||||
pan_x = float(zoom_config.get("panX", 0.0))
|
||||
pan_y = float(zoom_config.get("panY", 0.0))
|
||||
return f"crop=iw/{factor}:ih/{factor}:((iw-iw/{factor})/2)+({pan_x}*(iw-iw/{factor})/2):((ih-ih/{factor})/2)+({pan_y}*(ih-ih/{factor})/2),scale=iw:ih"
|
||||
|
||||
|
||||
def mix_background_music(
    video_path: str,
    music_path: str,
    output_path: str,
    volume_db: float = 0.0,
    ducking_enabled: bool = False,
    ducking_db: float = 6.0,
    ducking_attack_ms: float = 10.0,
    ducking_release_ms: float = 200.0,
) -> str:
    """Mix background music into a video with optional ducking.

    Uses FFmpeg amix + sidechaincompress. If the input has no audio,
    the music track becomes the sole audio track. Output is written to output_path.

    Args:
        video_path: source video file.
        music_path: audio file mixed in as background music.
        output_path: destination file (overwritten via -y).
        volume_db: gain in dB applied to the music before mixing.
        ducking_enabled: when True, route the mix through sidechaincompress.
        ducking_db: forwarded as sidechaincompress's level_sc option.
        ducking_attack_ms / ducking_release_ms: compressor timing, in ms
            (converted to seconds for the filter).

    Returns:
        output_path on success.

    Raises:
        RuntimeError: if the FFmpeg invocation exits non-zero.
    """
    ffmpeg = _find_ffmpeg()
    # Forward slashes plus escaped colons so the path survives as an amovie
    # filter argument (unescaped ':' would delimit filter options).
    escaped_music = music_path.replace("\\", "/").replace(":", "\\:")
    has_audio_result = _input_has_audio_stream(ffmpeg, video_path)

    if not has_audio_result:
        # Video has no audio track: map the music as the sole audio stream.
        cmd = [
            ffmpeg, "-y",
            "-i", video_path,
            "-i", music_path,
            "-map", "0:v",
            "-map", "1:a",
            "-c:v", "copy",
            "-c:a", "aac", "-b:a", "192k",
            "-shortest",
            "-movflags", "+faststart",
            output_path,
        ]
    elif ducking_enabled:
        # Music is decoded inside the filtergraph via amovie with gain applied.
        music_source = f"amovie='{escaped_music}',volume={volume_db}dB[music]"
        # NOTE(review): this graph mixes voice+music first and then compresses
        # the *whole mix* keyed on a copy of the original audio, so the voice is
        # ducked together with the music; classic ducking compresses only the
        # music before mixing. Also, level_sc is a sidechain *input gain*, not a
        # duck depth in dB — confirm both behaviors are intended.
        filter_complex = (
            f"[0:a]asplit[main][sidechain];"
            f"{music_source};"
            f"[main][music]amix=inputs=2:duration=first:dropout_transition=2[mixed];"
            f"[mixed][sidechain]sidechaincompress="
            f"threshold=-30dB:ratio=20:attack={ducking_attack_ms / 1000}:"
            f"release={ducking_release_ms / 1000}:makeup=1:level_sc={ducking_db}[outa]"
        )
        cmd = [
            ffmpeg, "-y",
            "-i", video_path,
            "-filter_complex", filter_complex,
            "-map", "0:v",
            "-map", "[outa]",
            "-c:v", "copy",
            "-c:a", "aac", "-b:a", "192k",
            "-shortest",
            output_path,
        ]
    else:
        # Plain mix: overlay music onto existing audio; video is stream-copied.
        music_source = f"amovie='{escaped_music}',volume={volume_db}dB[music]"
        filter_complex = (
            f"{music_source};"
            f"[0:a][music]amix=inputs=2:duration=first:dropout_transition=2[outa]"
        )
        cmd = [
            ffmpeg, "-y",
            "-i", video_path,
            "-filter_complex", filter_complex,
            "-map", "0:v",
            "-map", "[outa]",
            "-c:v", "copy",
            "-c:a", "aac", "-b:a", "192k",
            "-shortest",
            output_path,
        ]

    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        # Only the tail of stderr is surfaced to keep the error readable.
        raise RuntimeError(f"Background music mix failed: {result.stderr[-500:]}")

    return output_path
|
||||
|
||||
|
||||
def concat_clips(
    main_path: str,
    append_paths: list,
    output_path: str,
) -> str:
    """Join clips end-to-end with the FFmpeg concat demuxer.

    main_path comes first, followed by each entry of append_paths in order.
    Streams are copied (no re-encode). Returns output_path on success.

    Raises:
        ValueError: when append_paths is empty.
        RuntimeError: when the FFmpeg invocation fails.
    """
    if not append_paths:
        raise ValueError("No clips to concatenate")

    ffmpeg = _find_ffmpeg()

    # Resolve every input up front; list order defines the final clip order.
    all_inputs = [str(Path(main_path).resolve())]
    all_inputs.extend(str(Path(p).resolve()) for p in append_paths)

    # When the destination collides with one of the inputs, write to a sibling
    # temp name first and swap it into place afterwards.
    needs_rename = str(Path(output_path).resolve()) in all_inputs
    final_output = output_path + ".concat_tmp.mp4" if needs_rename else output_path

    temp_dir = tempfile.mkdtemp(prefix="aive_concat_")
    try:
        # The concat demuxer consumes a manifest of "file '<path>'" lines.
        concat_file = os.path.join(temp_dir, "concat.txt")
        with open(concat_file, "w") as manifest:
            manifest.writelines(f"file '{entry}'\n" for entry in all_inputs)

        command = [
            ffmpeg, "-y",
            "-f", "concat",
            "-safe", "0",
            "-i", concat_file,
            "-c", "copy",
            "-movflags", "+faststart",
            final_output,
        ]

        completed = subprocess.run(command, capture_output=True, text=True)
        if completed.returncode != 0:
            raise RuntimeError(f"Clip concat failed: {completed.stderr[-500:]}")

        if needs_rename:
            # Atomic swap over the colliding input path.
            os.replace(final_output, output_path)

        return output_path
    finally:
        # Best-effort cleanup of the manifest and its temp directory.
        for leftover in os.listdir(temp_dir):
            try:
                os.remove(os.path.join(temp_dir, leftover))
            except OSError:
                pass
        try:
            os.rmdir(temp_dir)
        except OSError:
            pass
|
||||
|
||||
|
||||
def _find_ffmpeg() -> str:
|
||||
"""Locate ffmpeg binary."""
|
||||
for cmd in ["ffmpeg", "ffmpeg.exe"]:
|
||||
try:
|
||||
subprocess.run([cmd, "-version"], capture_output=True, check=True)
|
||||
return cmd
|
||||
except (FileNotFoundError, subprocess.CalledProcessError):
|
||||
continue
|
||||
raise RuntimeError("FFmpeg not found. Install it or add it to PATH.")
|
||||
|
||||
|
||||
def export_stream_copy(
    input_path: str,
    output_path: str,
    keep_segments: List[dict],
    mute_ranges: List[dict] = None,
) -> str:
    """
    Export video using FFmpeg concat demuxer with stream copy.
    ~100x faster than re-encoding. No quality loss.
    Falls back to re-encoding if mute_ranges are provided.

    Args:
        input_path: source video file
        output_path: destination file
        keep_segments: list of {"start": float, "end": float} to keep
        mute_ranges: list of {"start": float, "end": float} to mute (optional)

    Returns:
        output_path on success

    Raises:
        ValueError: if keep_segments is empty (when the fast path is taken).
    """
    if mute_ranges:
        # Mute ranges require audio filtering, so fall back to re-encode
        return export_reencode(input_path, output_path, keep_segments, "1080p", "mp4", mute_ranges)
    ffmpeg = _find_ffmpeg()
    if not _input_has_video_stream(ffmpeg, input_path):
        # Audio-only inputs cannot use TS segment stream-copy concat reliably.
        return export_reencode(input_path, output_path, keep_segments)

    input_path = str(Path(input_path).resolve())
    output_path = str(Path(output_path).resolve())

    if not keep_segments:
        raise ValueError("No segments to export")

    temp_dir = tempfile.mkdtemp(prefix="aive_export_")

    try:
        segment_files = []
        for i, seg in enumerate(keep_segments):
            # Cut each kept range into an intermediate MPEG-TS file; TS streams
            # can be joined byte-wise via the concat: protocol used below.
            seg_file = os.path.join(temp_dir, f"seg_{i:04d}.ts")
            cmd = [
                ffmpeg, "-y",
                "-ss", str(seg["start"]),
                "-to", str(seg["end"]),
                "-i", input_path,
                "-c", "copy",
                "-avoid_negative_ts", "make_zero",
                "-f", "mpegts",
                seg_file,
            ]
            logger.info(f"Extracting segment {i}: {seg['start']:.2f}s - {seg['end']:.2f}s")
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode != 0:
                # Any failed cut aborts the fast path; re-encode handles it all.
                logger.warning(f"Stream copy segment {i} failed, will try re-encode: {result.stderr[-200:]}")
                return export_reencode(input_path, output_path, keep_segments)
            segment_files.append(seg_file)

        # Join the TS pieces losslessly with the concat protocol.
        concat_str = "|".join(segment_files)
        cmd = [
            ffmpeg, "-y",
            "-i", f"concat:{concat_str}",
            "-c", "copy",
            "-movflags", "+faststart",
            output_path,
        ]
        logger.info(f"Concatenating {len(segment_files)} segments -> {output_path}")
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            logger.warning(f"Concat failed, falling back to re-encode: {result.stderr[-200:]}")
            return export_reencode(input_path, output_path, keep_segments)

        return output_path

    finally:
        # Best-effort cleanup of intermediate TS files and the temp directory.
        for f in os.listdir(temp_dir):
            try:
                os.remove(os.path.join(temp_dir, f))
            except OSError:
                pass
        try:
            os.rmdir(temp_dir)
        except OSError:
            pass
|
||||
|
||||
|
||||
def _apply_zoom_post(input_path: str, output_path: str, zoom_config: dict) -> str:
    """Apply the zoom/punch-in crop+scale to a finished file in a second pass.

    Returns input_path unchanged when the config yields no filter; otherwise
    re-encodes the video stream (audio is stream-copied, if present) into
    output_path and returns output_path.

    Raises:
        RuntimeError: when the FFmpeg invocation fails.
    """
    ffmpeg = _find_ffmpeg()
    zoom_filter = _build_zoom_filter(zoom_config)
    if not zoom_filter:
        # Disabled or ~1.0 zoom factor: nothing to re-encode.
        return input_path

    command = [
        ffmpeg, "-y",
        "-i", input_path,
        "-filter_complex", f"[0:v]{zoom_filter}[v]",
        "-map", "[v]",
        "-map", "0:a?",
        "-c:a", "copy",
        "-movflags", "+faststart",
        output_path,
    ]
    completed = subprocess.run(command, capture_output=True, text=True)
    if completed.returncode != 0:
        raise RuntimeError(f"Zoom post-process failed: {completed.stderr[-500:]}")
    return output_path
|
||||
|
||||
|
||||
def export_reencode(
    input_path: str,
    output_path: str,
    keep_segments: List[dict],
    resolution: str = "1080p",
    format_hint: str = "mp4",
    mute_ranges: List[dict] = None,
    gain_ranges: List[dict] = None,
    speed_ranges: List[dict] = None,
    global_gain_db: float = 0.0,
    normalize_loudness: bool = False,
    normalize_target_lufs: float = -14.0,
    zoom_config: dict = None,
) -> str:
    """
    Export video with full re-encode. Slower but supports resolution changes,
    format conversion, and avoids stream-copy edge cases.
    If mute_ranges are provided, applies audio muting instead of cutting.

    Args:
        input_path: source media file.
        output_path: destination file (overwritten via -y).
        keep_segments: list of {"start", "end"} ranges to keep.
        resolution: "720p"/"1080p"/"4k" selects an output scale; any other
            value keeps the source resolution.
        format_hint: container/codec hint forwarded to _get_codec_args.
        mute_ranges: {"start", "end"} ranges whose audio is silenced.
        gain_ranges: {"start", "end", "gain_db"} ranges with per-range gain.
        speed_ranges: {"start", "end", "speed"} ranges for speed warping.
        global_gain_db: gain in dB applied to the whole audio track.
        normalize_loudness: when True, append a loudnorm pass.
        normalize_target_lufs: integrated loudness target for loudnorm.
        zoom_config: optional zoom/punch-in config applied as a post step.

    Returns:
        output_path on success.

    Raises:
        ValueError: if there are no segments to export.
        RuntimeError: if any FFmpeg invocation fails.
    """
    ffmpeg = _find_ffmpeg()
    input_path = str(Path(input_path).resolve())
    output_path = str(Path(output_path).resolve())

    scale_map = {
        "720p": "scale=-2:720",
        "1080p": "scale=-2:1080",
        "4k": "scale=-2:2160",
    }

    def build_audio_filter() -> str:
        # Compose the timeline-wide audio chain: global gain, per-range gains,
        # per-range mutes, then optional loudness normalization.
        filters = []
        if abs(float(global_gain_db)) > 1e-6:
            filters.append(f"volume={float(global_gain_db)}dB")

        for gain_range in gain_ranges or []:
            start = gain_range['start']
            end = gain_range['end']
            gain_db = gain_range.get('gain_db', 0.0)
            filters.append(f"volume={float(gain_db)}dB:enable='between(t,{start},{end})'")

        for mute_range in mute_ranges or []:
            start = mute_range['start']
            end = mute_range['end']
            filters.append(f"volume=0:enable='between(t,{start},{end})'")

        if normalize_loudness:
            filters.append(f"loudnorm=I={normalize_target_lufs}:LRA=7:TP=-1.5")

        return ",".join(filters) if filters else "anull"

    has_audio_filters = bool(mute_ranges) or bool(gain_ranges) or abs(float(global_gain_db)) > 1e-6
    has_video = _input_has_video_stream(ffmpeg, input_path)

    speed_segments = _split_keep_segments_by_speed(keep_segments, speed_ranges)
    has_speed = any(abs(seg.get("speed", 1.0) - 1.0) > 1e-6 for seg in speed_segments)

    if not has_video:
        # Audio-only path: trim (and optionally speed-warp) each segment,
        # concat them, then run the timeline audio filter over the result.
        if not keep_segments:
            raise ValueError("No segments to export")

        segments_for_concat = speed_segments if speed_segments else _split_keep_segments_by_speed(keep_segments, None)
        if not segments_for_concat:
            raise ValueError("No segments to export")

        filter_parts = []
        for i, seg in enumerate(segments_for_concat):
            speed = _clamp_speed(seg.get("speed", 1.0))
            a_chain = f"atrim=start={seg['start']}:end={seg['end']},asetpts=PTS-STARTPTS"
            if abs(speed - 1.0) > 1e-6:
                a_chain += f",{_build_atempo_chain(speed)}"
            filter_parts.append(f"[0:a]{a_chain}[a{i}];")

        n = len(segments_for_concat)
        concat_inputs = "".join(f"[a{i}]" for i in range(n))
        filter_parts.append(f"{concat_inputs}concat=n={n}:v=0:a=1[outa_raw]")

        audio_filter = build_audio_filter()
        if audio_filter != "anull":
            filter_parts.append(f";[outa_raw]{audio_filter}[outa]")
            audio_map = "[outa]"
        else:
            audio_map = "[outa_raw]"

        filter_complex = "".join(filter_parts)

        codec_args = _get_codec_args(format_hint, has_video=False)

        cmd = [
            ffmpeg, "-y",
            "-i", input_path,
            "-filter_complex", filter_complex,
            "-map", audio_map,
            *codec_args,
            output_path,
        ]

        logger.info(
            "Re-encoding audio-only input (%s segments, speed-adjusted=%s) -> %s",
            n,
            has_speed,
            output_path,
        )

        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            raise RuntimeError(f"FFmpeg audio-only export failed: {result.stderr[-500:]}")

        return output_path

    # Handle filtered full-timeline audio case (mute/gain/global gain) when no speed warping is needed
    if has_audio_filters and not has_speed:
        audio_filter = build_audio_filter()

        # Video filter - just scaling if needed. The filtergraph video output is
        # always labelled [v]: the previous version emitted "[0:v]null0:v" (an
        # unbracketed output label) when the resolution had no scale_map entry,
        # which is a malformed filtergraph FFmpeg rejects.
        scale = scale_map.get(resolution, "")
        video_filter = scale if scale else "null"
        video_map = "[v]"

        filter_complex = f"[0:a]{audio_filter}[a];[0:v]{video_filter}[v]"

        codec_args = _get_codec_args(format_hint, has_video)

        cmd = [
            ffmpeg, "-y",
            "-i", input_path,
            "-filter_complex", filter_complex,
            "-map", video_map,
            "-map", "[a]",
            *codec_args,
            "-movflags", "+faststart",
            output_path,
        ]

        logger.info(
            "Re-encoding with audio filters (mute=%s gain=%s global=%s) -> %s (%s)",
            len(mute_ranges or []),
            len(gain_ranges or []),
            global_gain_db,
            output_path,
            resolution,
        )
    else:
        # Cutting logic with optional per-segment speed changes
        if not keep_segments:
            raise ValueError("No segments to export")

        segments_for_concat = speed_segments if speed_segments else _split_keep_segments_by_speed(keep_segments, None)
        if not segments_for_concat:
            raise ValueError("No segments to export")

        filter_parts = []
        for i, seg in enumerate(segments_for_concat):
            speed = _clamp_speed(seg.get("speed", 1.0))
            v_chain = f"trim=start={seg['start']}:end={seg['end']},setpts=PTS-STARTPTS"
            a_chain = f"atrim=start={seg['start']}:end={seg['end']},asetpts=PTS-STARTPTS"
            if abs(speed - 1.0) > 1e-6:
                # Speed-warp: video via setpts scaling, audio via atempo chain.
                v_chain += f",setpts=PTS/{speed:.6f}"
                a_chain += f",{_build_atempo_chain(speed)}"
            filter_parts.append(f"[0:v]{v_chain}[v{i}];[0:a]{a_chain}[a{i}];")

        n = len(segments_for_concat)
        concat_inputs = "".join(f"[v{i}][a{i}]" for i in range(n))
        filter_parts.append(f"{concat_inputs}concat=n={n}:v=1:a=1[outv][outa]")

        filter_complex = "".join(filter_parts)

        # Add loudnorm to the cutting path audio chain if enabled
        audio_map_label = "[outa]"
        if normalize_loudness:
            filter_complex += f";{audio_map_label}loudnorm=I={normalize_target_lufs}:LRA=7:TP=-1.5[outa_norm]"
            audio_map_label = "[outa_norm]"

        scale = scale_map.get(resolution, "")
        if scale:
            filter_complex += f";[outv]{scale}[outv_scaled]"
            video_map = "[outv_scaled]"
        else:
            video_map = "[outv]"

        codec_args = _get_codec_args(format_hint, has_video)

        cmd = [
            ffmpeg, "-y",
            "-i", input_path,
            "-filter_complex", filter_complex,
            "-map", video_map,
            "-map", audio_map_label,
            *codec_args,
            "-movflags", "+faststart",
            output_path,
        ]

        logger.info(
            "Re-encoding %s segments (speed-adjusted=%s, normalize=%s) -> %s (%s)",
            n,
            has_speed,
            normalize_loudness,
            output_path,
            resolution,
        )

    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"FFmpeg re-encode failed: {result.stderr[-500:]}")

    # Apply zoom post-processing if configured
    if zoom_config and zoom_config.get("enabled") and has_video:
        zoomed_path = output_path + ".zoomed.mp4"
        _apply_zoom_post(output_path, zoomed_path, zoom_config)
        os.replace(zoomed_path, output_path)
        logger.info("Zoom/punch-in applied to %s (factor=%s)", output_path, zoom_config.get("zoomFactor", 1.0))

    return output_path
|
||||
|
||||
|
||||
def export_reencode_with_subs(
    input_path: str,
    output_path: str,
    keep_segments: List[dict],
    subtitle_path: str,
    resolution: str = "1080p",
    format_hint: str = "mp4",
    mute_ranges: List[dict] = None,
    gain_ranges: List[dict] = None,
    speed_ranges: List[dict] = None,
    global_gain_db: float = 0.0,
    normalize_loudness: bool = False,
    normalize_target_lufs: float = -14.0,
    zoom_config: dict = None,
) -> str:
    """
    Export video with re-encode and burn-in subtitles (ASS format).
    Applies trim+concat first, then overlays the subtitle file.
    If mute_ranges are provided, applies audio muting instead of cutting.

    Args:
        input_path: source video file (must contain a video stream).
        output_path: destination file (overwritten via -y).
        keep_segments: list of {"start", "end"} ranges to keep.
        subtitle_path: ASS subtitle file burned into the video.
        resolution: "720p"/"1080p"/"4k" selects an output scale; any other
            value keeps the source resolution.
        format_hint: container/codec hint forwarded to _get_codec_args.
        mute_ranges / gain_ranges / speed_ranges: per-range audio mute, gain
            and speed adjustments (same shapes as export_reencode).
        global_gain_db: gain in dB applied to the whole audio track.
        normalize_loudness / normalize_target_lufs: optional loudnorm pass.
        zoom_config: optional zoom/punch-in config applied as a post step.

    Returns:
        output_path on success.

    Raises:
        ValueError: if the input has no video track or no segments remain.
        RuntimeError: if the FFmpeg invocation fails.
    """
    ffmpeg = _find_ffmpeg()
    if not _input_has_video_stream(ffmpeg, input_path):
        raise ValueError("Burn-in captions require a video track")

    input_path = str(Path(input_path).resolve())
    output_path = str(Path(output_path).resolve())
    subtitle_path = str(Path(subtitle_path).resolve())

    scale_map = {
        "720p": "scale=-2:720",
        "1080p": "scale=-2:1080",
        "4k": "scale=-2:2160",
    }

    def build_audio_filter() -> str:
        # Compose the timeline-wide audio chain: global gain, per-range gains,
        # per-range mutes, then optional loudness normalization.
        filters = []
        if abs(float(global_gain_db)) > 1e-6:
            filters.append(f"volume={float(global_gain_db)}dB")

        for gain_range in gain_ranges or []:
            start = gain_range['start']
            end = gain_range['end']
            gain_db = gain_range.get('gain_db', 0.0)
            filters.append(f"volume={float(gain_db)}dB:enable='between(t,{start},{end})'")

        for mute_range in mute_ranges or []:
            start = mute_range['start']
            end = mute_range['end']
            filters.append(f"volume=0:enable='between(t,{start},{end})'")

        if normalize_loudness:
            filters.append(f"loudnorm=I={normalize_target_lufs}:LRA=7:TP=-1.5")

        return ",".join(filters) if filters else "anull"

    has_audio_filters = bool(mute_ranges) or bool(gain_ranges) or abs(float(global_gain_db)) > 1e-6

    speed_segments = _split_keep_segments_by_speed(keep_segments, speed_ranges)
    has_speed = any(abs(seg.get("speed", 1.0) - 1.0) > 1e-6 for seg in speed_segments)

    # Handle filtered full-timeline audio case (mute/gain/global gain) when no speed warping is needed
    if has_audio_filters and not has_speed:
        audio_filter = build_audio_filter()

        # Video filter with subtitles
        escaped_sub = subtitle_path.replace("\\", "/").replace(":", "\\:")
        scale = scale_map.get(resolution, "")
        if scale:
            video_filter = f"{scale},ass='{escaped_sub}'"
        else:
            video_filter = f"ass='{escaped_sub}'"

        filter_complex = f"[0:a]{audio_filter}[a];[0:v]{video_filter}[v]"

        codec_args = _get_codec_args(format_hint, has_video=True)

        cmd = [
            ffmpeg, "-y",
            "-i", input_path,
            "-filter_complex", filter_complex,
            "-map", "[v]",
            "-map", "[a]",
            *codec_args,
            "-movflags", "+faststart",
            output_path,
        ]

        logger.info(
            "Re-encoding with subtitles and audio filters (mute=%s gain=%s global=%s) -> %s (%s)",
            len(mute_ranges or []),
            len(gain_ranges or []),
            global_gain_db,
            output_path,
            resolution,
        )
    else:
        # Cutting logic with subtitles and optional speed changes
        if not keep_segments:
            raise ValueError("No segments to export")

        segments_for_concat = speed_segments if speed_segments else _split_keep_segments_by_speed(keep_segments, None)
        if not segments_for_concat:
            raise ValueError("No segments to export")

        filter_parts = []
        for i, seg in enumerate(segments_for_concat):
            speed = _clamp_speed(seg.get("speed", 1.0))
            v_chain = f"trim=start={seg['start']}:end={seg['end']},setpts=PTS-STARTPTS"
            a_chain = f"atrim=start={seg['start']}:end={seg['end']},asetpts=PTS-STARTPTS"
            if abs(speed - 1.0) > 1e-6:
                # Speed-warp: video via setpts scaling, audio via atempo chain.
                v_chain += f",setpts=PTS/{speed:.6f}"
                a_chain += f",{_build_atempo_chain(speed)}"
            filter_parts.append(f"[0:v]{v_chain}[v{i}];[0:a]{a_chain}[a{i}];")

        n = len(segments_for_concat)
        concat_inputs = "".join(f"[v{i}][a{i}]" for i in range(n))
        filter_parts.append(f"{concat_inputs}concat=n={n}:v=1:a=1[outv][outa]")

        filter_complex = "".join(filter_parts)

        # Escape path for FFmpeg subtitle filter (Windows backslashes need escaping)
        escaped_sub = subtitle_path.replace("\\", "/").replace(":", "\\:")

        scale = scale_map.get(resolution, "")
        if scale:
            filter_complex += f";[outv]{scale},ass='{escaped_sub}'[outv_final]"
        else:
            filter_complex += f";[outv]ass='{escaped_sub}'[outv_final]"
        video_map = "[outv_final]"

        codec_args = _get_codec_args(format_hint, has_video=True)

        cmd = [
            ffmpeg, "-y",
            "-i", input_path,
            "-filter_complex", filter_complex,
            "-map", video_map,
            "-map", "[outa]",
            *codec_args,
            "-movflags", "+faststart",
            output_path,
        ]

        logger.info(
            "Re-encoding %s segments with subtitles (speed-adjusted=%s) -> %s (%s)",
            n,
            has_speed,
            output_path,
            resolution,
        )

    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"FFmpeg re-encode with subs failed: {result.stderr[-500:]}")

    # Apply zoom post-processing if configured
    if zoom_config and zoom_config.get("enabled"):
        zoomed_path = output_path + ".zoomed.mp4"
        _apply_zoom_post(output_path, zoomed_path, zoom_config)
        # Swap the zoomed result over the original output atomically.
        os.replace(zoomed_path, output_path)
        logger.info("Zoom/punch-in applied to %s (factor=%s)", output_path, zoom_config.get("zoomFactor", 1.0))

    return output_path
|
||||
|
||||
|
||||
def get_video_info(input_path: str) -> dict:
    """Get basic video metadata using ffprobe.

    Returns a dict with duration, size, format, width, height, codec and fps.
    Any probing or parsing failure is logged and yields an empty dict.
    """
    ffmpeg = _find_ffmpeg()
    # Assumes ffprobe is installed next to ffmpeg under the conventional name.
    ffprobe = ffmpeg.replace("ffmpeg", "ffprobe")

    cmd = [
        ffprobe, "-v", "quiet",
        "-print_format", "json",
        "-show_format", "-show_streams",
        str(input_path),
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        import json
        data = json.loads(result.stdout)
        fmt = data.get("format", {})
        video_stream = next((s for s in data.get("streams", []) if s.get("codec_type") == "video"), {})

        # Parse the "num/den" frame rate arithmetically. The previous version
        # ran eval() on ffprobe output, which executes arbitrary expressions
        # from untrusted data and raised ZeroDivisionError on "0/0" rates.
        fps = 0.0
        rate = video_stream.get("r_frame_rate", "")
        if "/" in rate:
            num_str, _, den_str = rate.partition("/")
            try:
                den = float(den_str)
                fps = float(num_str) / den if den else 0.0
            except ValueError:
                fps = 0.0

        return {
            "duration": float(fmt.get("duration", 0)),
            "size": int(fmt.get("size", 0)),
            "format": fmt.get("format_name", ""),
            "width": int(video_stream.get("width", 0)),
            "height": int(video_stream.get("height", 0)),
            "codec": video_stream.get("codec_name", ""),
            "fps": fps,
        }
    except Exception as e:
        logger.error(f"Failed to get video info: {e}")
        return {}
|
||||
0
backend/tests/__init__.py
Normal file
0
backend/tests/__init__.py
Normal file
57
backend/tests/test_cache_utils.py
Normal file
57
backend/tests/test_cache_utils.py
Normal file
@ -0,0 +1,57 @@
|
||||
import tempfile
|
||||
import time
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from backend.utils import cache as cache_utils
|
||||
|
||||
|
||||
class CacheUtilsTests(unittest.TestCase):
    """Unit tests for the backend.utils.cache save/load/clear helpers."""

    def setUp(self) -> None:
        # Redirect the module-level cache directory into a throwaway temp dir
        # so the tests never touch the real cache on disk.
        self._tmp_dir = tempfile.TemporaryDirectory()
        self._old_cache_dir = cache_utils.CACHE_DIR
        cache_utils.CACHE_DIR = Path(self._tmp_dir.name) / "cache"

        # Small source file whose contents key the cache entries.
        self._work_dir = Path(self._tmp_dir.name) / "work"
        self._work_dir.mkdir(parents=True, exist_ok=True)
        self._src_file = self._work_dir / "sample.txt"
        self._src_file.write_text("hello", encoding="utf-8")

    def tearDown(self) -> None:
        # Restore the original cache dir before the temp dir is removed.
        cache_utils.CACHE_DIR = self._old_cache_dir
        self._tmp_dir.cleanup()

    def test_get_file_hash_returns_none_for_missing_file(self) -> None:
        # Hashing a nonexistent path must return None, not raise.
        missing = self._work_dir / "missing.txt"
        self.assertIsNone(cache_utils.get_file_hash(missing))

    def test_save_and_load_round_trip(self) -> None:
        # A payload saved under a (file, model, operation) key must load back equal.
        payload = {"value": 123, "ok": True}
        saved = cache_utils.save_to_cache(self._src_file, payload, model="m1", operation="transcribe")
        self.assertTrue(saved)

        loaded = cache_utils.load_from_cache(self._src_file, model="m1", operation="transcribe")
        self.assertEqual(payload, loaded)

    def test_load_from_cache_respects_max_age(self) -> None:
        # Entries older than max_age seconds must be treated as absent.
        payload = {"value": 999}
        self.assertTrue(cache_utils.save_to_cache(self._src_file, payload, operation="transcribe"))

        time.sleep(0.02)
        expired = cache_utils.load_from_cache(self._src_file, operation="transcribe", max_age=0.001)
        self.assertIsNone(expired)

    def test_clear_cache_deletes_files(self) -> None:
        # After clear_cache the reported size and file count must both be zero.
        self.assertTrue(cache_utils.save_to_cache(self._src_file, {"a": 1}, operation="transcribe"))
        self.assertTrue(cache_utils.save_to_cache(self._src_file, {"a": 2}, operation="summarize"))

        deleted_count = cache_utils.clear_cache()
        self.assertGreaterEqual(deleted_count, 1)

        size_bytes, file_count = cache_utils.get_cache_size()
        self.assertEqual(size_bytes, 0)
        self.assertEqual(file_count, 0)
|
||||
|
||||
|
||||
# Allow running this test module directly with `python <file>`.
if __name__ == "__main__":
    unittest.main()
|
||||
451
backend/tests/test_router_contracts.py
Normal file
451
backend/tests/test_router_contracts.py
Normal file
@ -0,0 +1,451 @@
|
||||
import unittest
|
||||
from unittest.mock import patch
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
import os
|
||||
from types import SimpleNamespace
|
||||
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from backend.main import app
|
||||
from routers import audio as audio_router
|
||||
|
||||
|
||||
class RouterContractTests(unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(cls) -> None:
|
||||
cls.client = TestClient(app)
|
||||
|
||||
def setUp(self) -> None:
|
||||
audio_router._waveform_cache.clear()
|
||||
|
||||
def test_health_endpoint(self) -> None:
|
||||
res = self.client.get("/health")
|
||||
self.assertEqual(res.status_code, 200)
|
||||
self.assertEqual(res.json(), {"status": "ok"})
|
||||
|
||||
def test_file_endpoint_full_content(self) -> None:
|
||||
with TemporaryDirectory() as tmp:
|
||||
file_path = Path(tmp) / "sample.wav"
|
||||
file_path.write_bytes(b"abcdefghij")
|
||||
|
||||
res = self.client.get("/file", params={"path": str(file_path)})
|
||||
|
||||
self.assertEqual(res.status_code, 200)
|
||||
self.assertEqual(res.content, b"abcdefghij")
|
||||
self.assertEqual(res.headers.get("accept-ranges"), "bytes")
|
||||
|
||||
def test_file_endpoint_range_request(self) -> None:
|
||||
with TemporaryDirectory() as tmp:
|
||||
file_path = Path(tmp) / "sample.wav"
|
||||
file_path.write_bytes(b"abcdefghij")
|
||||
|
||||
res = self.client.get(
|
||||
"/file",
|
||||
params={"path": str(file_path)},
|
||||
headers={"Range": "bytes=2-5"},
|
||||
)
|
||||
|
||||
self.assertEqual(res.status_code, 206)
|
||||
self.assertEqual(res.content, b"cdef")
|
||||
self.assertEqual(res.headers.get("content-range"), "bytes 2-5/10")
|
||||
|
||||
def test_file_endpoint_missing_file(self) -> None:
|
||||
res = self.client.get("/file", params={"path": "/tmp/does-not-exist.wav"})
|
||||
|
||||
self.assertEqual(res.status_code, 404)
|
||||
self.assertIn("File not found", res.json()["detail"])
|
||||
|
||||
@patch("routers.audio.subprocess.run")
|
||||
def test_audio_waveform_cache_miss_then_hit(self, mock_subprocess_run) -> None:
|
||||
with TemporaryDirectory() as tmp:
|
||||
media_file = Path(tmp) / "input.mp4"
|
||||
media_file.write_bytes(b"fake-media")
|
||||
|
||||
def fake_ffmpeg(cmd, capture_output, text):
|
||||
out_path = Path(cmd[-1])
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
out_path.write_bytes(b"fake-wav")
|
||||
return SimpleNamespace(returncode=0, stderr="")
|
||||
|
||||
mock_subprocess_run.side_effect = fake_ffmpeg
|
||||
|
||||
res1 = self.client.get("/audio/waveform", params={"path": str(media_file)})
|
||||
self.assertEqual(res1.status_code, 200)
|
||||
self.assertTrue(res1.headers.get("content-type", "").startswith("audio/wav"))
|
||||
|
||||
res2 = self.client.get("/audio/waveform", params={"path": str(media_file)})
|
||||
self.assertEqual(res2.status_code, 200)
|
||||
self.assertTrue(res2.headers.get("content-type", "").startswith("audio/wav"))
|
||||
|
||||
self.assertEqual(mock_subprocess_run.call_count, 1)
|
||||
|
||||
@patch("routers.audio.subprocess.run")
|
||||
def test_audio_waveform_ffmpeg_failure_returns_500(self, mock_subprocess_run) -> None:
|
||||
with TemporaryDirectory() as tmp:
|
||||
media_file = Path(tmp) / "input.mp4"
|
||||
media_file.write_bytes(b"fake-media")
|
||||
|
||||
mock_subprocess_run.return_value = SimpleNamespace(returncode=1, stderr="ffmpeg failed")
|
||||
|
||||
res = self.client.get("/audio/waveform", params={"path": str(media_file)})
|
||||
|
||||
self.assertEqual(res.status_code, 500)
|
||||
self.assertIn("Failed to extract audio", res.json()["detail"])
|
||||
|
||||
@patch("routers.ai.detect_filler_words")
|
||||
def test_ai_filler_removal_contract(self, mock_detect_filler_words) -> None:
|
||||
mock_detect_filler_words.return_value = {
|
||||
"wordIndices": [2, 5],
|
||||
"fillerWords": [
|
||||
{"index": 2, "word": "um", "reason": "filler"},
|
||||
{"index": 5, "word": "uh", "reason": "filler"},
|
||||
],
|
||||
}
|
||||
|
||||
payload = {
|
||||
"transcript": "Hello um world uh",
|
||||
"words": [
|
||||
{"index": 0, "word": "Hello"},
|
||||
{"index": 1, "word": "um"},
|
||||
{"index": 2, "word": "world"},
|
||||
],
|
||||
"provider": "ollama",
|
||||
"model": "llama3",
|
||||
}
|
||||
res = self.client.post("/ai/filler-removal", json=payload)
|
||||
|
||||
self.assertEqual(res.status_code, 200)
|
||||
self.assertIn("wordIndices", res.json())
|
||||
mock_detect_filler_words.assert_called_once()
|
||||
|
||||
@patch("routers.ai.detect_filler_words")
|
||||
def test_ai_filler_removal_error_returns_500(self, mock_detect_filler_words) -> None:
|
||||
mock_detect_filler_words.side_effect = RuntimeError("ai-filler-fail")
|
||||
|
||||
payload = {
|
||||
"transcript": "Hello world",
|
||||
"words": [{"index": 0, "word": "Hello"}],
|
||||
"provider": "ollama",
|
||||
}
|
||||
res = self.client.post("/ai/filler-removal", json=payload)
|
||||
|
||||
self.assertEqual(res.status_code, 500)
|
||||
self.assertEqual(res.json()["detail"], "ai-filler-fail")
|
||||
|
||||
@patch("routers.ai.create_clip_suggestion")
def test_ai_create_clip_contract(self, suggest_mock) -> None:
    """POST /ai/create-clip returns the suggestion and delegates exactly once."""
    suggest_mock.return_value = {
        "title": "Best Moment",
        "startWordIndex": 10,
        "endWordIndex": 40,
        "startTime": 12.3,
        "endTime": 48.8,
        "reason": "Strong hook",
    }

    response = self.client.post(
        "/ai/create-clip",
        json={
            "transcript": "Long transcript...",
            "words": [{"index": 0, "word": "hello"}],
            "provider": "ollama",
            "target_duration": 45,
        },
    )

    self.assertEqual(response.status_code, 200)
    self.assertEqual(response.json()["title"], "Best Moment")
    suggest_mock.assert_called_once()
|
||||
|
||||
@patch("routers.ai.create_clip_suggestion")
def test_ai_create_clip_error_returns_500(self, suggest_mock) -> None:
    """A clip-suggestion failure surfaces as HTTP 500 with the message as detail."""
    suggest_mock.side_effect = RuntimeError("ai-clip-fail")

    response = self.client.post(
        "/ai/create-clip",
        json={
            "transcript": "Hello world",
            "words": [{"index": 0, "word": "hello"}],
            "provider": "ollama",
        },
    )

    self.assertEqual(response.status_code, 500)
    self.assertEqual(response.json()["detail"], "ai-clip-fail")
|
||||
|
||||
@patch("routers.ai.AIProvider.list_ollama_models")
def test_ai_ollama_models_contract(self, list_models_mock) -> None:
    """GET /ai/ollama-models proxies the provider's model list for the given base_url."""
    list_models_mock.return_value = ["llama3", "qwen2.5"]

    response = self.client.get("/ai/ollama-models?base_url=http://localhost:11434")

    self.assertEqual(response.status_code, 200)
    self.assertEqual(response.json(), {"models": ["llama3", "qwen2.5"]})
    list_models_mock.assert_called_once_with("http://localhost:11434")
|
||||
|
||||
@patch("routers.ai.AIProvider.list_ollama_models")
def test_ai_ollama_models_unhandled_error_returns_500(self, list_models_mock) -> None:
    """An unhandled provider exception is returned to the caller as a plain 500."""
    list_models_mock.side_effect = RuntimeError("ollama-unreachable")

    # raise_server_exceptions=False makes the client report the 500 instead of
    # re-raising the server-side exception into the test process.
    tolerant_client = TestClient(app, raise_server_exceptions=False)
    response = tolerant_client.get("/ai/ollama-models")

    self.assertEqual(response.status_code, 500)
|
||||
|
||||
@patch("routers.transcribe.transcribe_audio")
def test_transcribe_success(self, transcribe_mock) -> None:
    """POST /transcribe returns the transcription result untouched."""
    transcribe_mock.return_value = {"words": [], "segments": [], "language": "en"}

    response = self.client.post(
        "/transcribe",
        json={
            "file_path": "/tmp/input.wav",
            "model": "base",
            "use_gpu": False,
            "use_cache": True,
        },
    )

    self.assertEqual(response.status_code, 200)
    self.assertEqual(response.json(), {"words": [], "segments": [], "language": "en"})
    transcribe_mock.assert_called_once()
|
||||
|
||||
@patch("routers.transcribe.diarize_and_label")
@patch("routers.transcribe.transcribe_audio")
def test_transcribe_with_diarization(self, transcribe_mock, diarize_mock) -> None:
    """When diarize=True the transcription result is passed through diarization."""
    transcribe_mock.return_value = {
        "words": [{"word": "hi", "start": 0.0, "end": 0.2}],
        "segments": [],
    }
    diarize_mock.return_value = {
        "words": [{"word": "hi", "start": 0.0, "end": 0.2, "speaker": "SPEAKER_00"}],
        "segments": [],
    }

    response = self.client.post(
        "/transcribe",
        json={
            "file_path": "/tmp/input.wav",
            "model": "base",
            "diarize": True,
            "hf_token": "hf_xxx",
            "num_speakers": 2,
        },
    )

    self.assertEqual(response.status_code, 200)
    self.assertIn("words", response.json())
    transcribe_mock.assert_called_once()
    diarize_mock.assert_called_once()
|
||||
|
||||
@patch("routers.transcribe.transcribe_audio")
def test_transcribe_file_not_found_returns_404(self, transcribe_mock) -> None:
    """A FileNotFoundError from the transcriber maps to HTTP 404."""
    transcribe_mock.side_effect = FileNotFoundError("missing")

    response = self.client.post(
        "/transcribe",
        json={"file_path": "/tmp/missing.wav", "model": "base"},
    )

    self.assertEqual(response.status_code, 404)
    self.assertIn("File not found", response.json()["detail"])
|
||||
|
||||
@patch("routers.transcribe.transcribe_audio")
def test_transcribe_runtime_failure_returns_500(self, transcribe_mock) -> None:
    """A RuntimeError from the transcriber maps to HTTP 500 with its message."""
    transcribe_mock.side_effect = RuntimeError("boom")

    response = self.client.post(
        "/transcribe",
        json={"file_path": "/tmp/in.wav", "model": "base"},
    )

    self.assertEqual(response.status_code, 500)
    self.assertEqual(response.json()["detail"], "boom")
|
||||
|
||||
@patch("routers.captions.generate_srt")
def test_captions_plain_response(self, srt_mock) -> None:
    """Without output_path the generated SRT text is returned in the response body."""
    srt_mock.return_value = "1\n00:00:00,000 --> 00:00:01,000\nHello\n"

    response = self.client.post(
        "/captions",
        json={
            "words": [{"word": "Hello", "start": 0.0, "end": 1.0}],
            "format": "srt",
        },
    )

    self.assertEqual(response.status_code, 200)
    self.assertIn("Hello", response.text)
    srt_mock.assert_called_once()
|
||||
|
||||
@patch("routers.captions.save_captions")
@patch("routers.captions.generate_srt")
def test_captions_save_output_path(self, srt_mock, save_mock) -> None:
    """With output_path the captions are saved and the saved path is echoed back."""
    srt_mock.return_value = "1\n00:00:00,000 --> 00:00:01,000\nHello\n"
    save_mock.return_value = "/tmp/out.srt"

    response = self.client.post(
        "/captions",
        json={
            "words": [{"word": "Hello", "start": 0.0, "end": 1.0}],
            "format": "srt",
            "output_path": "/tmp/out.srt",
        },
    )

    self.assertEqual(response.status_code, 200)
    self.assertEqual(response.json(), {"status": "ok", "output_path": "/tmp/out.srt"})
    save_mock.assert_called_once()
|
||||
|
||||
def test_captions_unknown_format_returns_400(self) -> None:
    """An unsupported caption format is rejected with HTTP 400."""
    response = self.client.post(
        "/captions",
        json={
            "words": [{"word": "Hello", "start": 0.0, "end": 1.0}],
            "format": "txt",
        },
    )

    self.assertEqual(response.status_code, 400)
    self.assertIn("Unknown format", response.json()["detail"])
|
||||
|
||||
@patch("routers.captions.generate_srt")
def test_captions_internal_error_returns_500(self, srt_mock) -> None:
    """A generator failure surfaces as HTTP 500 with the error message as detail."""
    srt_mock.side_effect = RuntimeError("caption-fail")

    response = self.client.post(
        "/captions",
        json={
            "words": [{"word": "Hello", "start": 0.0, "end": 1.0}],
            "format": "srt",
        },
    )

    self.assertEqual(response.status_code, 500)
    self.assertEqual(response.json()["detail"], "caption-fail")
|
||||
|
||||
@patch("routers.audio.is_deepfilter_available")
@patch("routers.audio.clean_audio")
def test_audio_clean_contract(self, clean_mock, deepfilter_mock) -> None:
    """POST /audio/clean reports the output path and the engine that was used."""
    clean_mock.return_value = "/tmp/cleaned.wav"
    deepfilter_mock.return_value = True

    response = self.client.post(
        "/audio/clean",
        json={"input_path": "/tmp/in.wav", "output_path": "/tmp/cleaned.wav"},
    )

    self.assertEqual(response.status_code, 200)
    result = response.json()
    self.assertEqual(result["status"], "ok")
    self.assertEqual(result["output_path"], "/tmp/cleaned.wav")
    self.assertEqual(result["engine"], "deepfilternet")
|
||||
|
||||
@patch("routers.audio.clean_audio")
def test_audio_clean_error_returns_500(self, clean_mock) -> None:
    """A cleaning failure surfaces as HTTP 500 with the error message as detail."""
    clean_mock.side_effect = RuntimeError("clean-fail")

    response = self.client.post(
        "/audio/clean",
        json={"input_path": "/tmp/in.wav", "output_path": "/tmp/cleaned.wav"},
    )

    self.assertEqual(response.status_code, 500)
    self.assertEqual(response.json()["detail"], "clean-fail")
|
||||
|
||||
@patch("routers.audio.detect_silence_ranges")
def test_audio_detect_silence_contract(self, silence_mock) -> None:
    """POST /audio/detect-silence returns the detected ranges plus a count."""
    silence_mock.return_value = [{"start": 1.2, "end": 2.1, "duration": 0.9}]

    response = self.client.post(
        "/audio/detect-silence",
        json={
            "input_path": "/tmp/in.wav",
            "min_silence_ms": 500,
            "silence_db": -35.0,
        },
    )

    self.assertEqual(response.status_code, 200)
    result = response.json()
    self.assertEqual(result["status"], "ok")
    self.assertEqual(result["count"], 1)
    self.assertEqual(len(result["ranges"]), 1)
|
||||
|
||||
@patch("routers.audio.detect_silence_ranges")
def test_audio_detect_silence_error_returns_500(self, silence_mock) -> None:
    """A detection failure surfaces as HTTP 500 with the error message as detail."""
    silence_mock.side_effect = RuntimeError("silence-fail")

    response = self.client.post(
        "/audio/detect-silence",
        json={
            "input_path": "/tmp/in.wav",
            "min_silence_ms": 500,
            "silence_db": -35.0,
        },
    )

    self.assertEqual(response.status_code, 500)
    self.assertEqual(response.json()["detail"], "silence-fail")
|
||||
|
||||
@patch("routers.audio.is_deepfilter_available")
def test_audio_capabilities_contract(self, deepfilter_mock) -> None:
    """GET /audio/capabilities mirrors the DeepFilterNet availability flag."""
    deepfilter_mock.return_value = False

    response = self.client.get("/audio/capabilities")

    self.assertEqual(response.status_code, 200)
    self.assertEqual(response.json(), {"deepfilternet_available": False})
|
||||
|
||||
@patch("routers.export.export_stream_copy")
def test_export_fast_contract(self, stream_copy_mock) -> None:
    """Fast export with no captions stream-copies once and echoes the output path."""
    stream_copy_mock.return_value = "/tmp/out.mp4"

    response = self.client.post(
        "/export",
        json={
            "input_path": "/tmp/in.mp4",
            "output_path": "/tmp/out.mp4",
            "keep_segments": [{"start": 0.0, "end": 2.0}],
            "mode": "fast",
            "captions": "none",
        },
    )

    self.assertEqual(response.status_code, 200)
    self.assertEqual(response.json(), {"status": "ok", "output_path": "/tmp/out.mp4"})
    stream_copy_mock.assert_called_once()
|
||||
|
||||
@patch("routers.export.save_captions")
@patch("routers.export.generate_srt")
@patch("routers.export.export_stream_copy")
def test_export_sidecar_caption_contract(self, stream_copy_mock, srt_mock, save_mock) -> None:
    """Sidecar captions are generated, saved next to the output, and reported."""
    stream_copy_mock.return_value = "/tmp/out.mp4"
    srt_mock.return_value = "1\n00:00:00,000 --> 00:00:01,000\nHello\n"

    response = self.client.post(
        "/export",
        json={
            "input_path": "/tmp/in.mp4",
            "output_path": "/tmp/out.mp4",
            "keep_segments": [{"start": 0.0, "end": 2.0}],
            "mode": "fast",
            "captions": "sidecar",
            "words": [{"word": "Hello", "start": 0.0, "end": 1.0}],
            "deleted_indices": [],
        },
    )

    self.assertEqual(response.status_code, 200)
    result = response.json()
    self.assertEqual(result["status"], "ok")
    self.assertEqual(result["output_path"], "/tmp/out.mp4")
    self.assertEqual(result["srt_path"], "/tmp/out.srt")
    save_mock.assert_called_once()
|
||||
|
||||
def test_export_missing_segments_returns_400(self) -> None:
    """An export request with an empty keep_segments list is rejected with 400."""
    response = self.client.post(
        "/export",
        json={
            "input_path": "/tmp/in.mp4",
            "output_path": "/tmp/out.mp4",
            "keep_segments": [],
            "mode": "fast",
            "captions": "none",
        },
    )

    self.assertEqual(response.status_code, 400)
    self.assertIn("No segments to export", response.json()["detail"])
|
||||
|
||||
@patch("routers.export.export_stream_copy")
def test_export_runtime_error_returns_500(self, stream_copy_mock) -> None:
    """An export failure surfaces as HTTP 500 with the error message as detail."""
    stream_copy_mock.side_effect = RuntimeError("export-fail")

    response = self.client.post(
        "/export",
        json={
            "input_path": "/tmp/in.mp4",
            "output_path": "/tmp/out.mp4",
            "keep_segments": [{"start": 0.0, "end": 2.0}],
            "mode": "fast",
            "captions": "none",
        },
    )

    self.assertEqual(response.status_code, 500)
    self.assertEqual(response.json()["detail"], "export-fail")
|
||||
|
||||
|
||||
# Allow running this test module directly with `python <file>`.
if __name__ == "__main__":
    unittest.main()
|
||||
0
backend/utils/__init__.py
Normal file
0
backend/utils/__init__.py
Normal file
74
backend/utils/audio_processing.py
Normal file
74
backend/utils/audio_processing.py
Normal file
@ -0,0 +1,74 @@
|
||||
from pathlib import Path
|
||||
import tempfile
|
||||
import os
|
||||
import logging
|
||||
|
||||
try:
|
||||
from moviepy import AudioFileClip
|
||||
except ImportError:
|
||||
from moviepy.editor import AudioFileClip
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_temp_audio_files = []
|
||||
|
||||
|
||||
def extract_audio(video_path: Path):
    """Extract the audio track of ``video_path`` into a fresh temp directory.

    The extracted WAV path is recorded in ``_temp_audio_files`` so that
    ``cleanup_temp_audio()`` can remove it later.

    Args:
        video_path: Path of the source video (or audio) file.

    Returns:
        Path of the extracted ``*_audio.wav`` file.

    Raises:
        RuntimeError: if the file has no audio track, extraction produces an
            empty file, or moviepy fails for any other reason.
    """
    logger.info(f"[extract_audio] Extracting audio from: {video_path}")
    audio = None
    try:
        audio = AudioFileClip(str(video_path))
        if audio.duration is None or audio.duration == 0:
            logger.error(f"[extract_audio] File has no audio track or zero duration: {video_path}")
            raise RuntimeError(f"File has no audio track: {video_path}")
        logger.info(f"[extract_audio] Duration: {audio.duration:.2f}s, fps: {audio.fps}")
        temp_dir = tempfile.mkdtemp(prefix="videotranscriber_")
        audio_path = Path(temp_dir) / f"{video_path.stem}_audio.wav"
        try:
            audio.write_audiofile(str(audio_path), logger=None)
        except TypeError:
            # moviepy 1.x uses the verbose parameter; moviepy 2.x removed it
            audio.write_audiofile(str(audio_path), verbose=False, logger=None)
        if not audio_path.exists() or audio_path.stat().st_size == 0:
            logger.error(f"[extract_audio] Output WAV is empty or missing: {audio_path}")
            raise RuntimeError(f"Audio extraction produced empty file: {audio_path}")
        logger.info(f"[extract_audio] Extracted to: {audio_path} ({audio_path.stat().st_size} bytes)")
        _temp_audio_files.append(str(audio_path))
        return audio_path
    except RuntimeError:
        raise
    except Exception as e:
        logger.error(f"[extract_audio] Failed for '{video_path}': {e}", exc_info=True)
        raise RuntimeError(f"Audio extraction failed: {e}")
    finally:
        # Bug fix: the original only called audio.close() on the success path,
        # leaking the clip handle whenever any step above raised. Always close.
        if audio is not None:
            try:
                audio.close()
            except Exception:
                # Closing is best-effort cleanup; never mask the real error.
                pass
|
||||
|
||||
|
||||
def cleanup_temp_audio():
    """Delete every temp audio file recorded in ``_temp_audio_files``.

    A file's parent directory is also removed once it becomes empty.
    The registry is cleared afterwards regardless of individual failures.

    Returns:
        Number of files successfully removed.
    """
    removed_count = 0
    for path_str in _temp_audio_files:
        try:
            if os.path.exists(path_str):
                os.remove(path_str)
                parent_dir = os.path.dirname(path_str)
                if os.path.isdir(parent_dir) and not os.listdir(parent_dir):
                    os.rmdir(parent_dir)
                removed_count += 1
        except Exception as exc:
            # Best-effort cleanup: log and continue with the remaining files.
            logger.warning(f"Could not remove temp file {path_str}: {exc}")
    _temp_audio_files.clear()
    return removed_count
|
||||
|
||||
|
||||
def get_video_duration(video_path: Path):
    """Return the duration of a video/audio file in seconds, or None on failure.

    A zero or None duration is logged as a warning but still returned as-is.
    """
    try:
        audio_clip = AudioFileClip(str(video_path))
        duration = audio_clip.duration
        audio_clip.close()
        if duration is None or duration == 0:
            logger.warning(f"[get_video_duration] Zero or null duration for: {video_path}")
        return duration
    except Exception as exc:
        logger.error(f"[get_video_duration] Failed for '{video_path}': {exc}", exc_info=True)
        return None
|
||||
@ -1,12 +1,9 @@
|
||||
"""
|
||||
GPU utilities for the OBS Recording Transcriber.
|
||||
GPU utilities for the Video Transcriber.
|
||||
Provides functions to detect and configure GPU acceleration.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import platform
|
||||
import subprocess
|
||||
import torch
|
||||
|
||||
# Configure logging
|
||||
@ -68,8 +65,6 @@ def get_optimal_device():
|
||||
|
||||
|
||||
def set_memory_limits(memory_fraction=0.8):
|
||||
global torch
|
||||
import torch
|
||||
"""
|
||||
Set memory limits for GPU usage.
|
||||
|
||||
77
backend/video_editor.py
Normal file
77
backend/video_editor.py
Normal file
@ -0,0 +1,77 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Video editing operations using FFmpeg.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add backend to path
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from services.video_editor import export_stream_copy, export_reencode, export_reencode_with_subs, get_video_info
|
||||
|
||||
|
||||
def main():
    """CLI entry point: run one FFmpeg-backed editing command from argv.

    On success the command's result is printed as JSON on stdout; on any
    failure a JSON error object is written to stderr and the process exits
    with status 1.
    """
    if len(sys.argv) < 2:
        print("Usage: python video_editor.py <command> [args...]", file=sys.stderr)
        sys.exit(1)

    command = sys.argv[1]

    try:
        if command == "export_stream_copy":
            if len(sys.argv) != 5:
                print("Usage: python video_editor.py export_stream_copy <input_path> <output_path> <keep_segments_json>", file=sys.stderr)
                sys.exit(1)
            src, dst, segments_json = sys.argv[2:5]
            result_path = export_stream_copy(src, dst, json.loads(segments_json))
            print(json.dumps({"output_path": result_path}))

        elif command == "export_reencode":
            if len(sys.argv) != 7:
                print("Usage: python video_editor.py export_reencode <input_path> <output_path> <keep_segments_json> <resolution> <format_hint>", file=sys.stderr)
                sys.exit(1)
            src, dst, segments_json, resolution, format_hint = sys.argv[2:7]
            result_path = export_reencode(src, dst, json.loads(segments_json), resolution, format_hint)
            print(json.dumps({"output_path": result_path}))

        elif command == "export_reencode_with_subs":
            if len(sys.argv) != 8:
                print("Usage: python video_editor.py export_reencode_with_subs <input_path> <output_path> <keep_segments_json> <subtitle_path> <resolution> <format_hint>", file=sys.stderr)
                sys.exit(1)
            src, dst, segments_json, subtitle_path, resolution, format_hint = sys.argv[2:8]
            result_path = export_reencode_with_subs(src, dst, json.loads(segments_json), subtitle_path, resolution, format_hint)
            print(json.dumps({"output_path": result_path}))

        elif command == "get_video_info":
            if len(sys.argv) != 3:
                print("Usage: python video_editor.py get_video_info <input_path>", file=sys.stderr)
                sys.exit(1)
            print(json.dumps(get_video_info(sys.argv[2])))

        else:
            print(f"Unknown command: {command}", file=sys.stderr)
            sys.exit(1)

    except Exception as e:
        # Report any service-layer failure as machine-readable JSON on stderr.
        print(json.dumps({"error": str(e)}), file=sys.stderr)
        sys.exit(1)
|
||||
86
close
Executable file
86
close
Executable file
@ -0,0 +1,86 @@
|
||||
#!/bin/bash
|
||||
# Close TalkEdit processes (Tauri dev and Python backend)
|
||||
|
||||
KILLED_ANY=0
|
||||
|
||||
# kill_pids <label> <pid>... — politely TERM the given PIDs, wait briefly,
# then force-KILL any that are still alive. Sets KILLED_ANY=1 when invoked
# with at least one PID.
kill_pids() {
    local label=$1
    shift
    local pids=("$@")

    [[ ${#pids[@]} -eq 0 ]] && return

    echo "Stopping $label (PID(s): ${pids[*]})..."
    kill -TERM "${pids[@]}" 2>/dev/null || true
    sleep 0.7

    # Collect anything that ignored SIGTERM.
    local survivors=()
    local pid
    for pid in "${pids[@]}"; do
        if kill -0 "$pid" 2>/dev/null; then
            survivors+=("$pid")
        fi
    done

    if [[ ${#survivors[@]} -gt 0 ]]; then
        echo "Force killing stubborn $label PID(s): ${survivors[*]}"
        kill -KILL "${survivors[@]}" 2>/dev/null || true
    fi

    KILLED_ANY=1
}
|
||||
|
||||
# kill_tree <pid> — depth-first kill of a process and all of its descendants
# (children are terminated before their parent).
kill_tree() {
    local pid=$1
    local child
    for child in $(pgrep -P "$pid" 2>/dev/null || true); do
        kill_tree "$child"
    done
    kill_pids "process tree" "$pid"
}
|
||||
|
||||
# kill_port <port> <name> — terminate whatever is listening on TCP <port>;
# <name> is only used for log output.
kill_port() {
    local port=$1
    local name=$2
    local pids
    pids=$(lsof -ti tcp:"$port" 2>/dev/null)
    [[ -z "$pids" ]] && return

    # Take down each listener's children first so watcher subprocesses
    # do not survive their parent.
    local pid
    for pid in $pids; do
        kill_tree "$pid"
    done
    # NOTE(review): kill_tree above already terminates the listeners; this
    # second pass re-attempts them and is preserved as-is for safety.
    kill_pids "$name listener on port $port" $pids
}
|
||||
|
||||
# kill_pattern <pattern> <label> — kill every process whose full command line
# matches <pattern> (pgrep -f semantics); <label> is used for log output.
kill_pattern() {
    local pattern=$1
    local label=$2
    local matches
    matches=$(pgrep -f "$pattern" 2>/dev/null)
    [[ -z "$matches" ]] && return
    kill_pids "$label" $matches
}
|
||||
|
||||
# --- TalkEdit (Tauri, port 8000) ---
# Backend and frontend listeners first, then pattern-based fallbacks for
# processes that have not bound their port yet.
kill_port 8000 "TalkEdit"
kill_port 5173 "TalkEdit frontend"
kill_pattern "tauri.*TalkEdit\|TalkEdit.*tauri\|cargo.*tauri dev\|/TalkEdit/target/debug" "TalkEdit (Tauri dev)"
# Vite dev server for TalkEdit (fallback when not bound to 5173 yet)
kill_pattern "[/ ]vite([[:space:]]|$)\|[/ ]rsbuild([[:space:]]|$)" "TalkEdit frontend dev server"

# --- Orphaned uvicorn workers ---
kill_pattern "uvicorn.*main:app.*--port 8000" "leftover uvicorn workers (TalkEdit)"
kill_pattern "uvicorn.*main:app.*--port 8642" "leftover uvicorn workers"

# Final status message for the user.
if [[ $KILLED_ANY -eq 0 ]]; then
    echo "Nothing to close — no TalkEdit processes found."
else
    echo "Done."
fi
|
||||
@ -1,70 +0,0 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
videotranscriber:
|
||||
# Use prebuilt image from GitHub Container Registry
|
||||
image: ghcr.io/dataants-ai/videotranscriber:latest
|
||||
container_name: videotranscriber
|
||||
ports:
|
||||
- "8501:8501"
|
||||
volumes:
|
||||
# Mount your video files directory (change the left path to your actual videos folder)
|
||||
- "${VIDEO_PATH:-./videos}:/app/data/videos"
|
||||
# Mount output directory for transcripts and summaries
|
||||
- "${OUTPUT_PATH:-./outputs}:/app/data/outputs"
|
||||
# Mount cache directory for model caching (optional, improves performance)
|
||||
- "${CACHE_PATH:-./cache}:/app/data/cache"
|
||||
# Mount a config directory if needed
|
||||
- "${CONFIG_PATH:-./config}:/app/config"
|
||||
environment:
|
||||
# Ollama configuration for host access
|
||||
- OLLAMA_API_URL=${OLLAMA_API_URL:-http://host.docker.internal:11434/api}
|
||||
# Optional: HuggingFace token for advanced features
|
||||
- HF_TOKEN=${HF_TOKEN:-}
|
||||
# GPU configuration
|
||||
- CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}
|
||||
# Cache settings
|
||||
- TRANSFORMERS_CACHE=/app/data/cache/transformers
|
||||
- WHISPER_CACHE=/app/data/cache/whisper
|
||||
restart: unless-stopped
|
||||
# Use bridge networking for Windows/Mac with host.docker.internal
|
||||
networks:
|
||||
- videotranscriber-network
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:8501/_stcore/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 60s
|
||||
|
||||
# Alternative GPU-enabled service (uncomment to use)
|
||||
# videotranscriber-gpu:
|
||||
# image: ghcr.io/dataants-ai/videotranscriber:latest-gpu
|
||||
# container_name: videotranscriber-gpu
|
||||
# ports:
|
||||
# - "8501:8501"
|
||||
# volumes:
|
||||
# - "${VIDEO_PATH:-./videos}:/app/data/videos"
|
||||
# - "${OUTPUT_PATH:-./outputs}:/app/data/outputs"
|
||||
# - "${CACHE_PATH:-./cache}:/app/data/cache"
|
||||
# - "${CONFIG_PATH:-./config}:/app/config"
|
||||
# environment:
|
||||
# - OLLAMA_API_URL=${OLLAMA_API_URL:-http://host.docker.internal:11434/api}
|
||||
# - HF_TOKEN=${HF_TOKEN:-}
|
||||
# - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
|
||||
# - TRANSFORMERS_CACHE=/app/data/cache/transformers
|
||||
# - WHISPER_CACHE=/app/data/cache/whisper
|
||||
# deploy:
|
||||
# resources:
|
||||
# reservations:
|
||||
# devices:
|
||||
# - driver: nvidia
|
||||
# count: 1
|
||||
# capabilities: [gpu]
|
||||
# restart: unless-stopped
|
||||
# networks:
|
||||
# - videotranscriber-network
|
||||
|
||||
networks:
|
||||
videotranscriber-network:
|
||||
driver: bridge
|
||||
@ -1,51 +0,0 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
videotranscriber:
|
||||
build: .
|
||||
container_name: videotranscriber
|
||||
ports:
|
||||
- "8501:8501"
|
||||
volumes:
|
||||
# Mount your video files directory (change the left path to your actual videos folder)
|
||||
- "${VIDEO_PATH:-./videos}:/app/data/videos"
|
||||
# Mount output directory for transcripts and summaries
|
||||
- "${OUTPUT_PATH:-./outputs}:/app/data/outputs"
|
||||
# Mount cache directory for model caching (optional, improves performance)
|
||||
- "${CACHE_PATH:-./cache}:/app/data/cache"
|
||||
# Mount a config directory if needed
|
||||
- "${CONFIG_PATH:-./config}:/app/config"
|
||||
environment:
|
||||
# Ollama configuration for host access
|
||||
- OLLAMA_API_URL=${OLLAMA_API_URL:-http://host.docker.internal:11434/api}
|
||||
# Optional: HuggingFace token for advanced features
|
||||
- HF_TOKEN=${HF_TOKEN:-}
|
||||
# GPU configuration
|
||||
- CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}
|
||||
# Cache settings
|
||||
- TRANSFORMERS_CACHE=/app/data/cache/transformers
|
||||
- WHISPER_CACHE=/app/data/cache/whisper
|
||||
# For GPU access (uncomment if you have NVIDIA GPU and nvidia-docker)
|
||||
# deploy:
|
||||
# resources:
|
||||
# reservations:
|
||||
# devices:
|
||||
# - driver: nvidia
|
||||
# count: 1
|
||||
# capabilities: [gpu]
|
||||
restart: unless-stopped
|
||||
# For Linux hosts, you might prefer host networking for better Ollama access
|
||||
# network_mode: host # Uncomment for Linux hosts
|
||||
# Use bridge networking for Windows/Mac with host.docker.internal
|
||||
networks:
|
||||
- videotranscriber-network
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:8501/_stcore/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 60s
|
||||
|
||||
networks:
|
||||
videotranscriber-network:
|
||||
driver: bridge
|
||||
@ -1,63 +0,0 @@
|
||||
# VideoTranscriber Docker Configuration
|
||||
# Copy this file to .env and modify the values as needed
|
||||
|
||||
# =============================================================================
|
||||
# DOCKER VOLUME PATHS (Host Directories)
|
||||
# =============================================================================
|
||||
|
||||
# Path to your video files directory on the host
|
||||
# This directory will be mounted into the container at /app/data/videos
|
||||
VIDEO_PATH=./videos
|
||||
|
||||
# Path where outputs (transcripts, summaries) will be saved on the host
|
||||
# This directory will be mounted into the container at /app/data/outputs
|
||||
OUTPUT_PATH=./outputs
|
||||
|
||||
# Path for caching ML models and processed files (improves performance)
|
||||
# This directory will be mounted into the container at /app/data/cache
|
||||
CACHE_PATH=./cache
|
||||
|
||||
# Optional: Configuration directory for custom settings
|
||||
CONFIG_PATH=./config
|
||||
|
||||
# =============================================================================
|
||||
# OLLAMA CONFIGURATION
|
||||
# =============================================================================
|
||||
|
||||
# Ollama API URL - how the container accesses your host Ollama service
|
||||
# For Windows/Mac with Docker Desktop: use host.docker.internal
|
||||
# For Linux: use host networking or the actual host IP
|
||||
OLLAMA_API_URL=http://host.docker.internal:11434/api
|
||||
|
||||
# =============================================================================
|
||||
# ML MODEL CONFIGURATION
|
||||
# =============================================================================
|
||||
|
||||
# HuggingFace token for advanced features (speaker diarization, etc.)
|
||||
# Get your token at: https://huggingface.co/settings/tokens
|
||||
# Leave empty if not using advanced features
|
||||
HF_TOKEN=
|
||||
|
||||
# GPU Configuration
|
||||
# Specify which GPU devices to use (leave empty for all available)
|
||||
# Examples: "0" for first GPU, "0,1" for first two GPUs
|
||||
CUDA_VISIBLE_DEVICES=
|
||||
|
||||
# =============================================================================
|
||||
# DOCKER-SPECIFIC SETTINGS
|
||||
# =============================================================================
|
||||
|
||||
# Container name (change if you want to run multiple instances)
|
||||
CONTAINER_NAME=videotranscriber
|
||||
|
||||
# Port mapping (host:container)
|
||||
HOST_PORT=8501
|
||||
|
||||
# =============================================================================
|
||||
# EXAMPLE USAGE
|
||||
# =============================================================================
|
||||
# 1. Copy this file: cp docker.env.example .env
|
||||
# 2. Edit the paths to match your system
|
||||
# 3. Make sure Ollama is running on your host: ollama serve
|
||||
# 4. Start the container: docker-compose up -d
|
||||
# 5. Access the app at: http://localhost:8501
|
||||
73
docs/ai-policy.md
Normal file
73
docs/ai-policy.md
Normal file
@ -0,0 +1,73 @@
|
||||
# AI Execution Policy
|
||||
|
||||
Purpose: define what autonomous AI can do in this repository without explicit human approval.
|
||||
|
||||
## Default Mode
|
||||
|
||||
- AI may implement and debug within approved scope.
|
||||
- AI must run validation commands after code changes.
|
||||
- AI must stop and escalate when blocked by policy or ambiguity.
|
||||
|
||||
## Allowed Autonomous Actions
|
||||
|
||||
1. Edit frontend, backend, shared schema, docs, and scripts.
|
||||
2. Add/modify tests related to the task.
|
||||
3. Run non-destructive validation commands.
|
||||
4. Update project docs and Copilot instructions when behavior changes.
|
||||
|
||||
## Restricted Actions (Require Approval)
|
||||
|
||||
1. Security/privacy-sensitive logic changes.
|
||||
2. Data migrations or destructive file operations.
|
||||
3. Credential handling changes or secrets management changes.
|
||||
4. Breaking API/schema changes.
|
||||
5. Build/release signing, packaging, and deployment automation changes.
|
||||
|
||||
## Prohibited Actions
|
||||
|
||||
1. Destructive git commands (`git reset --hard`, force pushing protected branches).
|
||||
2. Deleting user project/media data.
|
||||
3. Bypassing required checks in CI.
|
||||
|
||||
## Required Validation Workflow
|
||||
|
||||
For each autonomous task:
|
||||
|
||||
1. Implement smallest safe change set.
|
||||
2. Run lint/type/test/build checks for impacted scope.
|
||||
3. Inspect errors and fix with bounded retries.
|
||||
4. Re-run checks until green or escalated.
|
||||
5. Produce concise summary with risks and assumptions.
|
||||
|
||||
## Escalation Triggers
|
||||
|
||||
AI must ask a human when:
|
||||
|
||||
1. Requirements are ambiguous and affect user-visible behavior.
|
||||
2. Multiple product choices are plausible with no clear preference.
|
||||
3. Potential legal, security, or compliance impact exists.
|
||||
4. CI remains failing after 3 repair attempts in the same area.
|
||||
5. A requested operation conflicts with this policy.
|
||||
|
||||
## Required Artifacts In AI PR/Change Summary
|
||||
|
||||
1. What changed.
|
||||
2. Why it changed.
|
||||
3. Validation commands and outcome.
|
||||
4. Residual risks.
|
||||
5. Follow-up tasks.
|
||||
|
||||
## Risk Levels
|
||||
|
||||
- Low: docs, styling, isolated refactors, non-critical bugfixes.
|
||||
- Medium: feature additions with contract-stable behavior.
|
||||
- High: API/schema/security/export pipeline/transcription pipeline changes.
|
||||
|
||||
High-risk changes require explicit human review before merge.
|
||||
|
||||
## TalkEdit-Specific Rules
|
||||
|
||||
1. Preserve compatibility for desktop bridge contracts unless explicitly approved.
|
||||
2. Keep routers thin and business logic in backend services.
|
||||
3. Export/transcription pipeline changes must include regression tests.
|
||||
4. Linux WebKit startup behavior and media URL consistency are mandatory regression targets.
|
||||
59
docs/gitea-runner-setup.md
Normal file
59
docs/gitea-runner-setup.md
Normal file
@ -0,0 +1,59 @@
|
||||
# Gitea Runner Setup
|
||||
|
||||
Two self-hosted runners for the Gitea instance at `http://143.244.157.110:3000`.
|
||||
|
||||
## Linux Runner (`talkedit-builder`)
|
||||
|
||||
- **Host**: this machine (CachyOS)
|
||||
- **Binary**: `~/.local/bin/gitea-runner`
|
||||
- **Config**: `~/.runner`
|
||||
- **Labels**: `ubuntu-latest` → `docker://node:22-bookworm`, `ubuntu-24.04` → `docker://node:22-bookworm`
|
||||
- **Executor**: Docker
|
||||
- **Service**: user systemd unit `talkedit-runner.service`
|
||||
|
||||
### View runner daemon logs
|
||||
|
||||
```bash
|
||||
journalctl --user -u talkedit-runner --no-pager -n 100
|
||||
# or follow live:
|
||||
journalctl --user -u talkedit-runner -f
|
||||
```
|
||||
|
||||
### Restart
|
||||
|
||||
```bash
|
||||
systemctl --user restart talkedit-runner
|
||||
```
|
||||
|
||||
### Workflow job logs (step output)
|
||||
|
||||
Not stored locally — streamed to the Gitea server. Download from the Actions UI:
|
||||
`http://143.244.157.110:3000/dillon_stuff/TalkEdit/actions/runs/<run_id>`
|
||||
|
||||
---
|
||||
|
||||
## Windows Runner (`windows-laptop`)
|
||||
|
||||
- **Host**: Windows laptop
|
||||
- **Binary**: `%USERPROFILE%\gitea-runner-windows-amd64.exe`
|
||||
- **Labels**: `windows-latest`
|
||||
- **Executor**: host (runs directly, no Docker)
|
||||
|
||||
### Runner daemon logs
|
||||
|
||||
| Mode | Logs |
|
||||
|---|---|
|
||||
| Foreground (`daemon`) | PowerShell console stdout |
|
||||
| Windows service | `%ProgramData%\gitea-runner\log\` or Event Viewer |
|
||||
|
||||
### Workflow job logs
|
||||
|
||||
Same as Linux — download from Gitea Actions UI.
|
||||
|
||||
---
|
||||
|
||||
## Why Rust/Python jobs fail
|
||||
|
||||
The Linux runner maps `ubuntu-latest` to `docker://node:22-bookworm` which only has Node.js. The Rust and Python steps try to install toolchains (`dtolnay/rust-toolchain`, `actions/setup-python`) inside the Docker container, but something is failing early.
|
||||
|
||||
**To debug**: download the full log from Gitea Actions UI and look for the first `error` or `FAILED` line after the initial setup steps — the snippets pasted so far only show the tail (cleanup/post steps).
|
||||
44
docs/gitea-runner-windows.md
Normal file
44
docs/gitea-runner-windows.md
Normal file
@ -0,0 +1,44 @@
|
||||
# Gitea Runner — Windows Laptop
|
||||
|
||||
Self-hosted runner registered as `windows-laptop` with label `windows-latest`.
|
||||
|
||||
## Setup
|
||||
|
||||
```powershell
|
||||
# Download
|
||||
Invoke-WebRequest -Uri "https://gitea.com/gitea/runner/releases/download/v1.0.1/gitea-runner-1.0.1-windows-amd64.exe" -OutFile "$env:USERPROFILE\gitea-runner-windows-amd64.exe"
|
||||
|
||||
# Register (Admin PowerShell)
|
||||
.\gitea-runner-windows-amd64.exe register --instance http://143.244.157.110:3000 --token NS5LXzLzNOvPKD9Id4SrLQ09bReHOrn6T2c4EyGM --name windows-laptop --labels windows-latest --no-interactive
|
||||
|
||||
# Start (foreground)
|
||||
.\gitea-runner-windows-amd64.exe daemon
|
||||
|
||||
# Install as Windows service (auto-starts on boot)
|
||||
.\gitea-runner-windows-amd64.exe service install
|
||||
```
|
||||
|
||||
## Logs
|
||||
|
||||
### Workflow job logs (step output)
|
||||
|
||||
Stored on the Gitea server (not locally). Download from:
|
||||
`http://143.244.157.110:3000/<owner>/<repo>/actions/runs/<run_id>`
|
||||
|
||||
Click a job, then the **Download log** button at the top-right.
|
||||
|
||||
### Runner daemon logs (runner itself)
|
||||
|
||||
| Mode | Log location |
|
||||
|---|---|
|
||||
| Foreground (`daemon`) | PowerShell console stdout |
|
||||
| Windows service (`service install`) | `%ProgramData%\gitea-runner\log\` or Windows Event Viewer → Windows Logs → Application |
|
||||
|
||||
## Diagnostics
|
||||
|
||||
If a CI job fails, download the full log from the Gitea Actions UI (as above), then search for the first error:
|
||||
|
||||
- **Rust**: look for `error[E...]`, `error: could not compile`, or `cargo test` failures
|
||||
- **Python**: look for `FAILED`, `AssertionError`, or `ModuleNotFoundError`
|
||||
|
||||
The runner's own logs (`daemon` mode) will show which job it picked up, container lifecycle, and any infrastructure issues (disk full, Docker unavailable, etc.).
|
||||
113
docs/runbooks/error-codes.md
Normal file
113
docs/runbooks/error-codes.md
Normal file
@ -0,0 +1,113 @@
|
||||
# Error Codes Runbook
|
||||
|
||||
Purpose: provide consistent, AI-readable error categories for faster autonomous debugging.
|
||||
|
||||
## Format
|
||||
|
||||
Use codes in this format: `<SUBSYSTEM>-<CATEGORY>-<ID>`
|
||||
|
||||
Examples:
|
||||
|
||||
- `BE-EXPORT-001`
|
||||
- `FE-WAVEFORM-002`
|
||||
- `HOST-BRIDGE-003`
|
||||
|
||||
## Backend (FastAPI / Services)
|
||||
|
||||
### Export
|
||||
|
||||
- `BE-EXPORT-001`: Export request validation failed.
|
||||
- Symptoms: HTTP 400, missing/invalid ranges.
|
||||
- Likely causes: malformed payload, empty segments.
|
||||
- First checks: request body shape, keep/mute/gain ranges.
|
||||
|
||||
- `BE-EXPORT-002`: FFmpeg command failed.
|
||||
- Symptoms: HTTP 500, stderr contains filter/codec error.
|
||||
- Likely causes: invalid filter chain, unsupported codec/container.
|
||||
- First checks: generated FFmpeg args, source media codec, target format.
|
||||
|
||||
- `BE-EXPORT-003`: Caption burn-in/subtitle generation failed.
|
||||
- Symptoms: burn-in export fails while plain export works.
|
||||
- Likely causes: ASS generation issue, subtitle path/temp file cleanup race.
|
||||
- First checks: ASS file generation, temp file lifecycle.
|
||||
|
||||
### Transcription
|
||||
|
||||
- `BE-TRANSCRIBE-001`: Model unavailable or download failure.
|
||||
- Symptoms: transcription never starts or exits early.
|
||||
- Likely causes: missing model, network/cache issue.
|
||||
- First checks: model cache path, ensure-model logs.
|
||||
|
||||
- `BE-TRANSCRIBE-002`: Inference pipeline runtime failure.
|
||||
- Symptoms: mid-run crash, partial output.
|
||||
- Likely causes: CUDA/CPU mismatch, unsupported media, resource exhaustion.
|
||||
- First checks: environment, GPU availability, media decoding logs.
|
||||
|
||||
### Audio / Waveform
|
||||
|
||||
- `BE-AUDIO-001`: Waveform endpoint failed.
|
||||
- Symptoms: waveform panel shows unavailable/error.
|
||||
- Likely causes: decode error, invalid file path, unsupported media input.
|
||||
- First checks: `audio/waveform` response body, file existence, FFmpeg decode path.
|
||||
|
||||
## Frontend (React)
|
||||
|
||||
### Timeline / Zones
|
||||
|
||||
- `FE-TIMELINE-001`: Zone interaction state inconsistency.
|
||||
- Symptoms: cannot drag/select/delete zones predictably.
|
||||
- Likely causes: stale selection/editing state, hidden/selected mismatch.
|
||||
- First checks: zone mode flags, selectedZone state transitions.
|
||||
|
||||
- `FE-TIMELINE-002`: Visibility filter mismatch.
|
||||
- Symptoms: hidden zones still interactive or selected.
|
||||
- Likely causes: hit-testing ignores visibility flags.
|
||||
- First checks: hit-test filters and selected-zone reset logic.
|
||||
|
||||
### Media UI
|
||||
|
||||
- `FE-WAVEFORM-001`: Waveform fetch failed.
|
||||
- Symptoms: warning banner with URL/error.
|
||||
- Likely causes: backend unavailable, bad path encoding, CORS/proxy issue.
|
||||
- First checks: backend health endpoint, waveform URL, network tab logs.
|
||||
|
||||
- `FE-PROJECT-001`: Project load mismatch.
|
||||
- Symptoms: loaded media/transcript differs from saved data.
|
||||
- Likely causes: schema drift, fallback URL mismatch.
|
||||
- First checks: project schema fields, loadVideo/loadProject URL parity.
|
||||
|
||||
## Host / Bridge (Tauri)
|
||||
|
||||
- `HOST-BRIDGE-001`: Desktop API bridge unavailable.
|
||||
- Symptoms: open/save/transcribe actions no-op or throw.
|
||||
- Likely causes: bridge init error, host command mismatch.
|
||||
- First checks: bridge initialization, command names, runtime environment.
|
||||
|
||||
- `HOST-WEBKIT-001`: Linux WebKit startup/render regression.
|
||||
- Symptoms: noisy startup errors, UI load issues.
|
||||
- Likely causes: CSP/font regressions, unsupported protocol calls.
|
||||
- First checks: CSP config, remote font usage, bridge fallback behavior.
|
||||
|
||||
## Logging Guidance
|
||||
|
||||
When raising errors, include:
|
||||
|
||||
1. Error code.
|
||||
2. Human message.
|
||||
3. Correlation/request id.
|
||||
4. Relevant paths/ids (sanitized).
|
||||
5. Suggested first-check hints.
|
||||
|
||||
Example structured payload:
|
||||
|
||||
```json
|
||||
{
|
||||
"code": "BE-EXPORT-002",
|
||||
"message": "FFmpeg export failed",
|
||||
"requestId": "exp_20260415_001",
|
||||
"context": {
|
||||
"format": "mp4",
|
||||
"mode": "reencode"
|
||||
}
|
||||
}
|
||||
```
|
||||
113
docs/spec-template.md
Normal file
113
docs/spec-template.md
Normal file
@ -0,0 +1,113 @@
|
||||
# Feature Spec Template
|
||||
|
||||
Use this template for every net-new feature and major behavior change.
|
||||
|
||||
## Metadata
|
||||
|
||||
- Spec ID: SPEC-YYYYMMDD-<short-name>
|
||||
- Owner:
|
||||
- Date:
|
||||
- Status: draft | approved | in-progress | done
|
||||
- Related issue/PR:
|
||||
|
||||
## Problem Statement
|
||||
|
||||
Describe the user problem in 2-5 sentences.
|
||||
|
||||
## User Story
|
||||
|
||||
As a <user type>, I want <capability>, so that <outcome>.
|
||||
|
||||
## Scope
|
||||
|
||||
### In Scope
|
||||
|
||||
1.
|
||||
2.
|
||||
3.
|
||||
|
||||
### Out of Scope
|
||||
|
||||
1.
|
||||
2.
|
||||
|
||||
## Functional Requirements
|
||||
|
||||
1.
|
||||
2.
|
||||
3.
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
1. Given <state>, when <action>, then <result>.
|
||||
2. Given <state>, when <action>, then <result>.
|
||||
3. Failure handling is deterministic and user-visible.
|
||||
|
||||
## UX Notes
|
||||
|
||||
- Entry points (toolbar/panel/command):
|
||||
- Empty/loading/error states:
|
||||
- Keyboard shortcuts / accessibility expectations:
|
||||
|
||||
## API And Data Contracts
|
||||
|
||||
- Endpoints impacted:
|
||||
- Request/response changes:
|
||||
- Backward compatibility plan:
|
||||
- Project schema impact (`shared/project-schema.json`):
|
||||
|
||||
## Architecture Impact
|
||||
|
||||
- Frontend files/components likely affected:
|
||||
- Backend routers/services likely affected:
|
||||
- Tauri/bridge changes required:
|
||||
|
||||
## Risks
|
||||
|
||||
1.
|
||||
2.
|
||||
|
||||
## Test Plan
|
||||
|
||||
### Unit Tests
|
||||
|
||||
1.
|
||||
2.
|
||||
|
||||
### Integration Tests
|
||||
|
||||
1.
|
||||
2.
|
||||
|
||||
### E2E / Smoke Tests
|
||||
|
||||
1.
|
||||
2.
|
||||
|
||||
### Regression Tests
|
||||
|
||||
List known regressions this spec must prevent.
|
||||
|
||||
## Observability
|
||||
|
||||
- New logs/error codes:
|
||||
- Metrics/traces needed:
|
||||
- Diagnostics artifacts expected on failure:
|
||||
|
||||
## Rollout Plan
|
||||
|
||||
1. Development and internal validation.
|
||||
2. Staged rollout or feature flag (if applicable).
|
||||
3. Rollback path.
|
||||
|
||||
## Open Questions
|
||||
|
||||
1.
|
||||
2.
|
||||
|
||||
## Definition Of Done
|
||||
|
||||
1. Acceptance criteria pass.
|
||||
2. Tests added and green.
|
||||
3. Docs/instructions updated.
|
||||
4. Risks and assumptions recorded in PR summary.
|
||||
18
docs/specs/README.md
Normal file
18
docs/specs/README.md
Normal file
@ -0,0 +1,18 @@
|
||||
# Feature Specs
|
||||
|
||||
Place one feature spec document in this folder for each feature or major behavior change.
|
||||
|
||||
Use [docs/spec-template.md](../spec-template.md) as the canonical template.
|
||||
|
||||
Recommended naming format:
|
||||
|
||||
- `YYYY-MM-DD-short-feature-name.md`
|
||||
|
||||
Examples:
|
||||
|
||||
- `2026-04-15-gain-zones-and-visibility-filters.md`
|
||||
- `2026-04-16-speed-adjustment.md`
|
||||
|
||||
CI policy:
|
||||
|
||||
- Pull requests that change app code are expected to include at least one changed spec file in this folder.
|
||||
26
frontend/eslint.config.js
Normal file
26
frontend/eslint.config.js
Normal file
@ -0,0 +1,26 @@
|
||||
import js from '@eslint/js';
|
||||
import globals from 'globals';
|
||||
import reactHooks from 'eslint-plugin-react-hooks';
|
||||
import reactRefresh from 'eslint-plugin-react-refresh';
|
||||
import tseslint from 'typescript-eslint';
|
||||
|
||||
export default tseslint.config(
|
||||
{ ignores: ['dist', 'node_modules'] },
|
||||
{
|
||||
extends: [js.configs.recommended, ...tseslint.configs.recommended],
|
||||
files: ['**/*.{ts,tsx}'],
|
||||
languageOptions: {
|
||||
ecmaVersion: 2020,
|
||||
globals: globals.browser,
|
||||
},
|
||||
plugins: {
|
||||
'react-hooks': reactHooks,
|
||||
'react-refresh': reactRefresh,
|
||||
},
|
||||
rules: {
|
||||
...reactHooks.configs.recommended.rules,
|
||||
'react-refresh/only-export-components': ['warn', { allowConstantExport: true }],
|
||||
'@typescript-eslint/no-explicit-any': 'off',
|
||||
},
|
||||
},
|
||||
);
|
||||
16
frontend/index.html
Normal file
16
frontend/index.html
Normal file
@ -0,0 +1,16 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<meta http-equiv="Content-Security-Policy" content="default-src 'self'; script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline' https://fonts.googleapis.com; font-src 'self' data: https://fonts.gstatic.com; connect-src 'self' ipc: http://ipc.localhost http://localhost:* http://127.0.0.1:* ws://localhost:* ws://127.0.0.1:*; media-src 'self' file: blob: http://localhost:* http://127.0.0.1:*; img-src 'self' data: blob:;" />
|
||||
<link rel="preconnect" href="https://fonts.googleapis.com" />
|
||||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
|
||||
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet" />
|
||||
<title>TalkEdit</title>
|
||||
</head>
|
||||
<body class="bg-editor-bg text-editor-text antialiased">
|
||||
<div id="root"></div>
|
||||
<script type="module" src="/src/main.tsx"></script>
|
||||
</body>
|
||||
</html>
|
||||
5383
frontend/package-lock.json
generated
Normal file
5383
frontend/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
44
frontend/package.json
Normal file
44
frontend/package.json
Normal file
@ -0,0 +1,44 @@
|
||||
{
|
||||
"name": "talkedit-frontend",
|
||||
"private": true,
|
||||
"version": "0.1.0",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"dev": "vite",
|
||||
"build": "tsc -b && vite build",
|
||||
"lint": "eslint .",
|
||||
"test": "vitest run",
|
||||
"preview": "vite preview"
|
||||
},
|
||||
"dependencies": {
|
||||
"@tauri-apps/api": "^2",
|
||||
"@tauri-apps/plugin-dialog": "^2",
|
||||
"@tauri-apps/plugin-fs": "^2",
|
||||
"lucide-react": "^0.468.0",
|
||||
"react": "^19.0.0",
|
||||
"react-dom": "^19.0.0",
|
||||
"react-virtuoso": "^4.18.3",
|
||||
"wavesurfer.js": "^7.8.0",
|
||||
"zundo": "^2.3.0",
|
||||
"zustand": "^5.0.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@eslint/js": "^9.39.4",
|
||||
"@tauri-apps/cli": "^2",
|
||||
"@types/react": "^19.0.0",
|
||||
"@types/react-dom": "^19.0.0",
|
||||
"@vitejs/plugin-react": "^4.3.0",
|
||||
"autoprefixer": "^10.4.20",
|
||||
"eslint": "^9.39.4",
|
||||
"eslint-plugin-react-hooks": "^7.0.1",
|
||||
"eslint-plugin-react-refresh": "^0.5.2",
|
||||
"globals": "^17.5.0",
|
||||
"jsdom": "^29.1.1",
|
||||
"postcss": "^8.4.49",
|
||||
"tailwindcss": "^3.4.0",
|
||||
"typescript": "^5.7.0",
|
||||
"typescript-eslint": "^8.58.2",
|
||||
"vite": "^6.0.0",
|
||||
"vitest": "^4.1.4"
|
||||
}
|
||||
}
|
||||
6
frontend/postcss.config.js
Normal file
6
frontend/postcss.config.js
Normal file
@ -0,0 +1,6 @@
|
||||
export default {
|
||||
plugins: {
|
||||
tailwindcss: {},
|
||||
autoprefixer: {},
|
||||
},
|
||||
};
|
||||
1085
frontend/src/App.tsx
Normal file
1085
frontend/src/App.tsx
Normal file
File diff suppressed because it is too large
Load Diff
474
frontend/src/components/AIPanel.tsx
Normal file
474
frontend/src/components/AIPanel.tsx
Normal file
@ -0,0 +1,474 @@
|
||||
import { useCallback, useState } from 'react';
|
||||
import { useEditorStore } from '../store/editorStore';
|
||||
import { useAIStore } from '../store/aiStore';
|
||||
import { useLicenseStore } from '../store/licenseStore';
|
||||
import { Sparkles, Scissors, Film, Loader2, Check, X, Play, Download, RotateCcw, RefreshCw, Lock } from 'lucide-react';
|
||||
import type { ClipSuggestion } from '../types/project';
|
||||
|
||||
interface AIPanelProps {
|
||||
onReprocess: () => void;
|
||||
whisperModel: string;
|
||||
setWhisperModel: (model: string) => void;
|
||||
}
|
||||
|
||||
export default function AIPanel({ onReprocess, whisperModel, setWhisperModel }: AIPanelProps) {
|
||||
const { words, videoPath, backendUrl, deleteWordRange, setCurrentTime } = useEditorStore();
|
||||
const canUseAI = useLicenseStore((s) => s.canUseAI);
|
||||
const setShowLicenseDialog = useLicenseStore((s) => s.setShowDialog);
|
||||
const {
|
||||
defaultProvider,
|
||||
providers,
|
||||
customFillerWords,
|
||||
fillerResult,
|
||||
clipSuggestions,
|
||||
isProcessing,
|
||||
processingMessage,
|
||||
setCustomFillerWords,
|
||||
setFillerResult,
|
||||
setClipSuggestions,
|
||||
setProcessing,
|
||||
} = useAIStore();
|
||||
|
||||
const [activeTab, setActiveTab] = useState<'filler' | 'clips' | 'reprocess'>('filler');
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
|
||||
const detectFillers = useCallback(async () => {
|
||||
if (words.length === 0) return;
|
||||
setError(null);
|
||||
setProcessing(true, 'Detecting filler words...');
|
||||
try {
|
||||
const config = providers[defaultProvider];
|
||||
const transcript = words.map((w) => w.word).join(' ');
|
||||
const res = await fetch(`${backendUrl}/ai/filler-removal`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
transcript,
|
||||
words: words.map((w, i) => ({ index: i, word: w.word })),
|
||||
provider: defaultProvider,
|
||||
model: config.model,
|
||||
api_key: config.apiKey || undefined,
|
||||
base_url: config.baseUrl || undefined,
|
||||
custom_filler_words: customFillerWords || undefined,
|
||||
}),
|
||||
});
|
||||
if (!res.ok) {
|
||||
const errData = await res.json().catch(() => ({}));
|
||||
throw new Error(errData.error || `Filler detection failed (${res.status})`);
|
||||
}
|
||||
const data = await res.json();
|
||||
setFillerResult(data);
|
||||
} catch (err) {
|
||||
console.error(err);
|
||||
setError(err instanceof Error ? err.message : 'Filler detection failed');
|
||||
} finally {
|
||||
setProcessing(false);
|
||||
}
|
||||
}, [words, backendUrl, defaultProvider, providers, customFillerWords, setProcessing, setFillerResult]);
|
||||
|
||||
const createClips = useCallback(async () => {
|
||||
if (words.length === 0) return;
|
||||
setError(null);
|
||||
setProcessing(true, 'Finding best clip segments...');
|
||||
try {
|
||||
const config = providers[defaultProvider];
|
||||
const transcript = words.map((w) => w.word).join(' ');
|
||||
const res = await fetch(`${backendUrl}/ai/create-clip`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
transcript,
|
||||
words: words.map((w, i) => ({
|
||||
index: i,
|
||||
word: w.word,
|
||||
start: w.start,
|
||||
end: w.end,
|
||||
})),
|
||||
provider: defaultProvider,
|
||||
model: config.model,
|
||||
api_key: config.apiKey || undefined,
|
||||
base_url: config.baseUrl || undefined,
|
||||
target_duration: 60,
|
||||
}),
|
||||
});
|
||||
if (!res.ok) {
|
||||
const errData = await res.json().catch(() => ({}));
|
||||
throw new Error(errData.error || `Clip creation failed (${res.status})`);
|
||||
}
|
||||
const data = await res.json();
|
||||
setClipSuggestions(data.clips || []);
|
||||
} catch (err) {
|
||||
console.error(err);
|
||||
setError(err instanceof Error ? err.message : 'Clip creation failed');
|
||||
} finally {
|
||||
setProcessing(false);
|
||||
}
|
||||
}, [words, backendUrl, defaultProvider, providers, setProcessing, setClipSuggestions]);
|
||||
|
||||
const applyFillerDeletions = useCallback(() => {
|
||||
if (!fillerResult) return;
|
||||
const sorted = [...fillerResult.fillerWords].sort((a, b) => b.index - a.index);
|
||||
for (const fw of sorted) {
|
||||
deleteWordRange(fw.index, fw.index);
|
||||
}
|
||||
setFillerResult(null);
|
||||
}, [fillerResult, deleteWordRange, setFillerResult]);
|
||||
|
||||
const handlePreviewClip = useCallback(
|
||||
(clip: ClipSuggestion) => {
|
||||
setCurrentTime(clip.startTime);
|
||||
const video = document.querySelector('video');
|
||||
if (video) {
|
||||
video.currentTime = clip.startTime;
|
||||
video.play();
|
||||
}
|
||||
},
|
||||
[setCurrentTime],
|
||||
);
|
||||
|
||||
const [exportingClipIndex, setExportingClipIndex] = useState<number | null>(null);
|
||||
|
||||
const handleExportClip = useCallback(
|
||||
async (clip: ClipSuggestion, index: number) => {
|
||||
if (!videoPath) return;
|
||||
setExportingClipIndex(index);
|
||||
try {
|
||||
const safeName = clip.title.replace(/[^a-zA-Z0-9_-]/g, '_').substring(0, 40);
|
||||
const dirSep = videoPath.lastIndexOf('\\') >= 0 ? '\\' : '/';
|
||||
const dir = videoPath.substring(0, videoPath.lastIndexOf(dirSep));
|
||||
const outputPath = `${dir}${dirSep}${safeName}_clip.mp4`;
|
||||
|
||||
const res = await fetch(`${backendUrl}/export`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
input_path: videoPath,
|
||||
output_path: outputPath,
|
||||
keep_segments: [{ start: clip.startTime, end: clip.endTime }],
|
||||
mode: 'fast',
|
||||
format: 'mp4',
|
||||
}),
|
||||
});
|
||||
if (!res.ok) throw new Error('Export failed');
|
||||
const data = await res.json();
|
||||
alert(`Clip exported to: ${data.output_path}`);
|
||||
} catch (err) {
|
||||
console.error(err);
|
||||
alert('Failed to export clip. Check console for details.');
|
||||
} finally {
|
||||
setExportingClipIndex(null);
|
||||
}
|
||||
},
|
||||
[videoPath, backendUrl],
|
||||
);
|
||||
|
||||
return (
|
||||
<div className="flex flex-col h-full">
|
||||
<div className="flex border-b border-editor-border shrink-0">
|
||||
<TabButton
|
||||
active={activeTab === 'filler'}
|
||||
onClick={() => setActiveTab('filler')}
|
||||
icon={<Scissors className="w-3.5 h-3.5" />}
|
||||
label="Filler Words"
|
||||
title="Detect and remove filler words from transcript"
|
||||
/>
|
||||
<TabButton
|
||||
active={activeTab === 'clips'}
|
||||
onClick={() => setActiveTab('clips')}
|
||||
icon={<Film className="w-3.5 h-3.5" />}
|
||||
label="Create Clips"
|
||||
title="Find the best segments for social media clips"
|
||||
/>
|
||||
<TabButton
|
||||
active={activeTab === 'reprocess'}
|
||||
onClick={() => setActiveTab('reprocess')}
|
||||
icon={<RefreshCw className="w-3.5 h-3.5" />}
|
||||
label="Reprocess"
|
||||
title="Re-run transcription with a different Whisper model"
|
||||
/>
|
||||
</div>
|
||||
|
||||
<div className="flex-1 overflow-y-auto p-4">
|
||||
{activeTab === 'filler' && (
|
||||
<div className="space-y-4">
|
||||
{!canUseAI ? (
|
||||
<div className="text-center py-8 px-4">
|
||||
<Lock className="w-8 h-8 text-editor-text-muted mx-auto mb-3" />
|
||||
<p className="text-sm font-medium mb-1">AI editing requires Business</p>
|
||||
<p className="text-xs text-editor-text-muted mb-4">
|
||||
Upgrade to Business to unlock filler word removal, clip suggestions, and more.
|
||||
</p>
|
||||
<button
|
||||
onClick={() => setShowLicenseDialog(true)}
|
||||
className="px-4 py-2 bg-editor-accent hover:bg-editor-accent-hover text-white rounded-lg text-sm font-medium transition-colors"
|
||||
>
|
||||
Upgrade Now
|
||||
</button>
|
||||
</div>
|
||||
) : (
|
||||
<>
|
||||
<p className="text-xs text-editor-text-muted">
|
||||
Use AI to detect and remove filler words like "um", "uh", "like", "you know" from
|
||||
your transcript.
|
||||
</p>
|
||||
<div className="space-y-1.5">
|
||||
<label className="text-[11px] text-editor-text-muted font-medium">
|
||||
Custom filler words (comma-separated)
|
||||
</label>
|
||||
<input
|
||||
type="text"
|
||||
value={customFillerWords}
|
||||
onChange={(e) => setCustomFillerWords(e.target.value)}
|
||||
placeholder="e.g. okay, alright, anyway"
|
||||
className="w-full px-2.5 py-1.5 text-xs bg-editor-surface border border-editor-border rounded focus:border-editor-accent focus:outline-none"
|
||||
/>
|
||||
</div>
|
||||
<button
|
||||
onClick={detectFillers}
|
||||
disabled={isProcessing || words.length === 0}
|
||||
title="Scan the entire transcript for filler words (um, uh, like, you know) and mark for removal"
|
||||
className="w-full flex items-center justify-center gap-2 px-4 py-2.5 bg-editor-accent hover:bg-editor-accent-hover disabled:opacity-40 rounded-lg text-sm font-medium transition-colors"
|
||||
>
|
||||
{isProcessing ? (
|
||||
<>
|
||||
<Loader2 className="w-4 h-4 animate-spin" />
|
||||
{processingMessage}
|
||||
</>
|
||||
) : (
|
||||
<>
|
||||
<Sparkles className="w-4 h-4" />
|
||||
Detect Filler Words
|
||||
</>
|
||||
)}
|
||||
</button>
|
||||
|
||||
{error && (
|
||||
<div className="bg-red-500/10 border border-red-500/40 rounded text-xs text-red-300 p-2 flex items-center justify-between">
|
||||
<span>{error}</span>
|
||||
<button
|
||||
onClick={detectFillers}
|
||||
className="flex items-center gap-1 px-2 py-1 text-xs bg-red-500/20 hover:bg-red-500/30 rounded transition-colors shrink-0 ml-2"
|
||||
>
|
||||
<RotateCcw className="w-3 h-3" /> Retry
|
||||
</button>
|
||||
</div>
|
||||
)}
|
||||
{fillerResult && fillerResult.fillerWords.length > 0 && (
|
||||
<div className="space-y-3">
|
||||
<div className="flex items-center justify-between">
|
||||
<span className="text-xs font-medium">
|
||||
Found {fillerResult.fillerWords.length} filler words
|
||||
</span>
|
||||
<div className="flex gap-1">
|
||||
<button
|
||||
onClick={applyFillerDeletions}
|
||||
title="Create cut ranges for all detected filler words at once"
|
||||
className="flex items-center gap-1 px-2 py-1 text-xs bg-editor-success/20 text-editor-success rounded hover:bg-editor-success/30"
|
||||
>
|
||||
<Check className="w-3 h-3" /> Apply All
|
||||
</button>
|
||||
<button
|
||||
onClick={() => { setFillerResult(null); setError(null); }}
|
||||
title="Clear detected filler word results without applying"
|
||||
className="flex items-center gap-1 px-2 py-1 text-xs bg-editor-border text-editor-text-muted rounded hover:bg-editor-surface"
|
||||
>
|
||||
<X className="w-3 h-3" /> Dismiss
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
<div className="space-y-1 max-h-64 overflow-y-auto">
|
||||
{fillerResult.fillerWords.map((fw) => (
|
||||
<div
|
||||
key={fw.index}
|
||||
className="flex items-center justify-between px-2 py-1.5 bg-editor-word-filler rounded text-xs"
|
||||
>
|
||||
<span>
|
||||
<strong>"{fw.word}"</strong>
|
||||
<span className="text-editor-text-muted ml-1">— {fw.reason}</span>
|
||||
</span>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{fillerResult && fillerResult.fillerWords.length === 0 && (
|
||||
<p className="text-xs text-editor-success">No filler words detected.</p>
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{activeTab === 'clips' && (
|
||||
<div className="space-y-4">
|
||||
{!canUseAI ? (
|
||||
<div className="text-center py-8 px-4">
|
||||
<Lock className="w-8 h-8 text-editor-text-muted mx-auto mb-3" />
|
||||
<p className="text-sm font-medium mb-1">AI clip suggestions require Business</p>
|
||||
<p className="text-xs text-editor-text-muted mb-4">
|
||||
Upgrade to Business to find the best segments for social media clips.
|
||||
</p>
|
||||
<button
|
||||
onClick={() => setShowLicenseDialog(true)}
|
||||
className="px-4 py-2 bg-editor-accent hover:bg-editor-accent-hover text-white rounded-lg text-sm font-medium transition-colors"
|
||||
>
|
||||
Upgrade Now
|
||||
</button>
|
||||
</div>
|
||||
) : (
|
||||
<>
|
||||
<p className="text-xs text-editor-text-muted">
|
||||
AI analyzes your transcript and suggests the most engaging segments for a
|
||||
YouTube Short or social media clip.
|
||||
</p>
|
||||
<button
|
||||
onClick={createClips}
|
||||
disabled={isProcessing || words.length === 0}
|
||||
title="Analyze transcript to find the most engaging 20-60 second segments for social media"
|
||||
className="w-full flex items-center justify-center gap-2 px-4 py-2.5 bg-editor-accent hover:bg-editor-accent-hover disabled:opacity-40 rounded-lg text-sm font-medium transition-colors"
|
||||
>
|
||||
{isProcessing ? (
|
||||
<>
|
||||
<Loader2 className="w-4 h-4 animate-spin" />
|
||||
{processingMessage}
|
||||
</>
|
||||
) : (
|
||||
<>
|
||||
<Film className="w-4 h-4" />
|
||||
Find Best Clips
|
||||
</>
|
||||
)}
|
||||
</button>
|
||||
|
||||
{error && (
|
||||
<div className="bg-red-500/10 border border-red-500/40 rounded text-xs text-red-300 p-2 flex items-center justify-between">
|
||||
<span>{error}</span>
|
||||
<button
|
||||
onClick={createClips}
|
||||
className="flex items-center gap-1 px-2 py-1 text-xs bg-red-500/20 hover:bg-red-500/30 rounded transition-colors shrink-0 ml-2"
|
||||
>
|
||||
<RotateCcw className="w-3 h-3" /> Retry
|
||||
</button>
|
||||
</div>
|
||||
)}
|
||||
{clipSuggestions.length > 0 && (
|
||||
<div className="space-y-3">
|
||||
{clipSuggestions.map((clip, i) => (
|
||||
<div key={i} className="p-3 bg-editor-surface rounded-lg space-y-2">
|
||||
<div className="flex items-center justify-between">
|
||||
<span className="text-xs font-semibold">{clip.title}</span>
|
||||
<span className="text-[10px] text-editor-text-muted">
|
||||
{Math.round(clip.endTime - clip.startTime)}s
|
||||
</span>
|
||||
</div>
|
||||
<p className="text-[11px] text-editor-text-muted">{clip.reason}</p>
|
||||
<div className="flex gap-2">
|
||||
<button
|
||||
onClick={() => handlePreviewClip(clip)}
|
||||
title="Seek to this clip's position and play a preview"
|
||||
className="flex-1 flex items-center justify-center gap-1 px-2 py-1.5 text-xs bg-editor-accent/20 text-editor-accent rounded hover:bg-editor-accent/30 transition-colors"
|
||||
>
|
||||
<Play className="w-3 h-3" /> Preview
|
||||
</button>
|
||||
<button
|
||||
onClick={() => handleExportClip(clip, i)}
|
||||
disabled={exportingClipIndex === i}
|
||||
title="Export just this segment as a standalone video file"
|
||||
className="flex-1 flex items-center justify-center gap-1 px-2 py-1.5 text-xs bg-editor-success/20 text-editor-success rounded hover:bg-editor-success/30 disabled:opacity-40 transition-colors"
|
||||
>
|
||||
{exportingClipIndex === i ? (
|
||||
<Loader2 className="w-3 h-3 animate-spin" />
|
||||
) : (
|
||||
<Download className="w-3 h-3" />
|
||||
)}
|
||||
Export
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{activeTab === 'reprocess' && (
|
||||
<div className="space-y-4">
|
||||
<p className="text-xs text-editor-text-muted">
|
||||
Re-run transcription with a different model — replaces the current transcript entirely.
|
||||
</p>
|
||||
<div className="space-y-1.5">
|
||||
<label className="text-[11px] text-editor-text-muted font-medium">
|
||||
Whisper Model
|
||||
</label>
|
||||
<select
|
||||
value={whisperModel}
|
||||
onChange={(e) => setWhisperModel(e.target.value)}
|
||||
className="w-full px-2.5 py-1.5 text-xs bg-editor-surface border border-editor-border rounded focus:border-editor-accent focus:outline-none"
|
||||
>
|
||||
<optgroup label="Multilingual (any language)">
|
||||
<option value="tiny">tiny — ~75 MB · fastest, low accuracy</option>
|
||||
<option value="base">base — ~140 MB · fast, decent accuracy</option>
|
||||
<option value="small">small — ~460 MB · good balance</option>
|
||||
<option value="medium">medium — ~1.5 GB · better accuracy</option>
|
||||
<option value="large-v2">large-v2 — ~2.9 GB · high accuracy</option>
|
||||
<option value="large-v3">large-v3 — ~2.9 GB · best overall ★</option>
|
||||
<option value="large-v3-turbo">large-v3-turbo — ~1.6 GB · fast + accurate ★</option>
|
||||
<option value="distil-large-v3">distil-large-v3 — ~1.5 GB · fast, near large-v3 quality</option>
|
||||
</optgroup>
|
||||
<optgroup label="English-only (faster & more accurate for English)">
|
||||
<option value="tiny.en">tiny.en — ~75 MB · fastest English</option>
|
||||
<option value="base.en">base.en — ~140 MB · fast English</option>
|
||||
<option value="small.en">small.en — ~460 MB · good English</option>
|
||||
<option value="medium.en">medium.en — ~1.5 GB · great English</option>
|
||||
<option value="distil-small.en">distil-small.en — ~190 MB · fast English ★</option>
|
||||
<option value="distil-medium.en">distil-medium.en — ~750 MB · best fast English ★</option>
|
||||
</optgroup>
|
||||
</select>
|
||||
</div>
|
||||
<button
|
||||
onClick={onReprocess}
|
||||
disabled={isProcessing || words.length === 0}
|
||||
title="Re-run transcription with the selected model — this will replace all current words"
|
||||
className="w-full flex items-center justify-center gap-2 px-4 py-2.5 bg-editor-accent hover:bg-editor-accent-hover disabled:opacity-40 rounded-lg text-sm font-medium transition-colors"
|
||||
>
|
||||
<RefreshCw className="w-4 h-4" />
|
||||
Reprocess Transcript
|
||||
</button>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function TabButton({
|
||||
active,
|
||||
onClick,
|
||||
icon,
|
||||
label,
|
||||
title,
|
||||
}: {
|
||||
active: boolean;
|
||||
onClick: () => void;
|
||||
icon: React.ReactNode;
|
||||
label: string;
|
||||
title?: string;
|
||||
}) {
|
||||
return (
|
||||
<button
|
||||
onClick={onClick}
|
||||
title={title}
|
||||
className={`flex-1 flex items-center justify-center gap-1.5 px-3 py-2.5 text-xs font-medium transition-colors border-b-2 ${
|
||||
active
|
||||
? 'border-editor-accent text-editor-accent'
|
||||
: 'border-transparent text-editor-text-muted hover:text-editor-text'
|
||||
}`}
|
||||
>
|
||||
{icon}
|
||||
{label}
|
||||
</button>
|
||||
);
|
||||
}
|
||||
84
frontend/src/components/AppendClipPanel.tsx
Normal file
84
frontend/src/components/AppendClipPanel.tsx
Normal file
@ -0,0 +1,84 @@
|
||||
import { useEditorStore } from '../store/editorStore';
|
||||
import { Video, Plus, Trash2, ChevronUp, ChevronDown } from 'lucide-react';
|
||||
|
||||
export default function AppendClipPanel() {
|
||||
const { additionalClips, addAdditionalClip, removeAdditionalClip, reorderAdditionalClip, videoPath } = useEditorStore();
|
||||
|
||||
const handleAddClip = async () => {
|
||||
const path = await window.electronAPI?.openFile({
|
||||
filters: [
|
||||
{ name: 'Video Files', extensions: ['mp4', 'mkv', 'mov', 'avi', 'webm'] },
|
||||
{ name: 'All Files', extensions: ['*'] },
|
||||
],
|
||||
});
|
||||
if (path) {
|
||||
addAdditionalClip(path);
|
||||
}
|
||||
};
|
||||
|
||||
return (
|
||||
<div className="p-4 space-y-3">
|
||||
<h3 className="text-sm font-semibold flex items-center gap-1.5">
|
||||
<Video className="w-4 h-4" />
|
||||
Append Clips
|
||||
</h3>
|
||||
<p className="text-[10px] text-editor-text-muted leading-relaxed">
|
||||
Load additional video clips to append after the main video. Clips are concatenated in order during export.
|
||||
</p>
|
||||
|
||||
{additionalClips.length === 0 ? (
|
||||
<div className="text-[11px] text-editor-text-muted text-center py-3">
|
||||
No additional clips loaded
|
||||
</div>
|
||||
) : (
|
||||
<div className="space-y-1 max-h-60 overflow-y-auto">
|
||||
{additionalClips.map((clip, idx) => (
|
||||
<div
|
||||
key={clip.id}
|
||||
className="flex items-center gap-2 p-2 rounded bg-editor-surface border border-editor-border text-xs"
|
||||
>
|
||||
<Video className="w-3 h-3 text-editor-accent shrink-0" />
|
||||
<span className="flex-1 truncate text-editor-text">{clip.label}</span>
|
||||
<span className="text-[10px] text-editor-text-muted shrink-0">#{idx + 1}</span>
|
||||
<div className="flex items-center gap-0.5 shrink-0">
|
||||
<button
|
||||
onClick={() => reorderAdditionalClip(clip.id, -1)}
|
||||
disabled={idx === 0}
|
||||
className="p-0.5 rounded hover:bg-editor-bg disabled:opacity-30 text-editor-text-muted hover:text-editor-text"
|
||||
title="Move up"
|
||||
>
|
||||
<ChevronUp className="w-3 h-3" />
|
||||
</button>
|
||||
<button
|
||||
onClick={() => reorderAdditionalClip(clip.id, 1)}
|
||||
disabled={idx === additionalClips.length - 1}
|
||||
className="p-0.5 rounded hover:bg-editor-bg disabled:opacity-30 text-editor-text-muted hover:text-editor-text"
|
||||
title="Move down"
|
||||
>
|
||||
<ChevronDown className="w-3 h-3" />
|
||||
</button>
|
||||
</div>
|
||||
<button
|
||||
onClick={() => removeAdditionalClip(clip.id)}
|
||||
className="p-0.5 rounded hover:bg-red-500/20 text-red-400"
|
||||
title="Remove clip"
|
||||
>
|
||||
<Trash2 className="w-3 h-3" />
|
||||
</button>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
|
||||
<button
|
||||
onClick={handleAddClip}
|
||||
disabled={!videoPath}
|
||||
className="w-full flex items-center justify-center gap-2 px-3 py-2 rounded-lg border-2 border-dashed border-editor-border text-xs text-editor-text-muted hover:text-editor-text hover:border-editor-text-muted disabled:opacity-40 transition-colors"
|
||||
title="Select a video or audio file to append during export"
|
||||
>
|
||||
<Plus className="w-3.5 h-3.5" />
|
||||
Add Clip
|
||||
</button>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
150
frontend/src/components/BackgroundMusicPanel.tsx
Normal file
150
frontend/src/components/BackgroundMusicPanel.tsx
Normal file
@ -0,0 +1,150 @@
|
||||
import { useEditorStore } from '../store/editorStore';
|
||||
import { Music, Trash2, Volume2, Disc3 } from 'lucide-react';
|
||||
|
||||
export default function BackgroundMusicPanel() {
|
||||
const { backgroundMusic, setBackgroundMusic, updateBackgroundMusic } = useEditorStore();
|
||||
|
||||
const handleLoadMusic = async () => {
|
||||
const path = await window.electronAPI?.openFile({
|
||||
filters: [
|
||||
{ name: 'Audio Files', extensions: ['mp3', 'wav', 'm4a', 'flac', 'ogg', 'aac', 'wma'] },
|
||||
{ name: 'All Files', extensions: ['*'] },
|
||||
],
|
||||
});
|
||||
if (path) {
|
||||
setBackgroundMusic({
|
||||
path,
|
||||
volumeDb: -10,
|
||||
duckingEnabled: true,
|
||||
duckingDb: 6,
|
||||
duckingAttackMs: 10,
|
||||
duckingReleaseMs: 200,
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
const handleRemoveMusic = () => {
|
||||
setBackgroundMusic(null);
|
||||
};
|
||||
|
||||
return (
|
||||
<div className="p-4 space-y-4">
|
||||
<h3 className="text-sm font-semibold flex items-center gap-1.5">
|
||||
<Music className="w-4 h-4" />
|
||||
Background Music
|
||||
</h3>
|
||||
|
||||
{!backgroundMusic ? (
|
||||
<button
|
||||
onClick={handleLoadMusic}
|
||||
className="w-full flex items-center justify-center gap-2 px-4 py-3 rounded-lg border-2 border-dashed border-editor-border text-xs text-editor-text-muted hover:text-editor-text hover:border-editor-text-muted transition-colors"
|
||||
title="Select an audio file to use as background music"
|
||||
>
|
||||
<Disc3 className="w-4 h-4" />
|
||||
Load Music File
|
||||
</button>
|
||||
) : (
|
||||
<div className="space-y-3">
|
||||
<div className="flex items-center gap-2 p-2 rounded bg-editor-surface border border-editor-border">
|
||||
<Music className="w-4 h-4 text-editor-accent shrink-0" />
|
||||
<span className="flex-1 text-xs truncate">
|
||||
{backgroundMusic.path.split(/[/\\]/).pop()}
|
||||
</span>
|
||||
<button
|
||||
onClick={handleRemoveMusic}
|
||||
className="p-1 rounded hover:bg-red-500/20 text-red-400 transition-colors"
|
||||
title="Remove music"
|
||||
>
|
||||
<Trash2 className="w-3 h-3" />
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<div className="space-y-2">
|
||||
<div className="flex items-center gap-2">
|
||||
<Volume2 className="w-3 h-3 text-editor-text-muted shrink-0" />
|
||||
<span className="text-[10px] text-editor-text-muted w-16">Volume:</span>
|
||||
<input
|
||||
type="range"
|
||||
min={-30}
|
||||
max={12}
|
||||
step={1}
|
||||
value={backgroundMusic.volumeDb}
|
||||
onChange={(e) => updateBackgroundMusic({ volumeDb: Number(e.target.value) })}
|
||||
className="flex-1 h-1.5"
|
||||
title="Background music volume relative to main audio — positive boosts, negative reduces"
|
||||
/>
|
||||
<span className="text-xs text-editor-text w-10 text-right">{backgroundMusic.volumeDb} dB</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<label className="flex items-center gap-2 cursor-pointer">
|
||||
<input
|
||||
type="checkbox"
|
||||
checked={backgroundMusic.duckingEnabled}
|
||||
onChange={(e) => updateBackgroundMusic({ duckingEnabled: e.target.checked })}
|
||||
className="w-4 h-4 rounded bg-editor-surface border-editor-border accent-editor-accent"
|
||||
title="Automatically lower music volume when speech is detected"
|
||||
/>
|
||||
<div>
|
||||
<span className="text-xs font-medium">Auto-ducking</span>
|
||||
<p className="text-[10px] text-editor-text-muted">
|
||||
Lower music volume when speech is detected
|
||||
</p>
|
||||
</div>
|
||||
</label>
|
||||
|
||||
{backgroundMusic.duckingEnabled && (
|
||||
<div className="pl-6 space-y-2">
|
||||
<div className="flex items-center gap-2">
|
||||
<span className="text-[10px] text-editor-text-muted w-20">Duck amount:</span>
|
||||
<input
|
||||
type="range"
|
||||
min={1}
|
||||
max={20}
|
||||
step={1}
|
||||
value={backgroundMusic.duckingDb}
|
||||
onChange={(e) => updateBackgroundMusic({ duckingDb: Number(e.target.value) })}
|
||||
className="flex-1 h-1.5"
|
||||
title="How much to reduce music volume during speech (1-20 dB)"
|
||||
/>
|
||||
<span className="text-xs text-editor-text w-10 text-right">{backgroundMusic.duckingDb} dB</span>
|
||||
</div>
|
||||
<div className="flex items-center gap-2">
|
||||
<span className="text-[10px] text-editor-text-muted w-20">Attack:</span>
|
||||
<input
|
||||
type="range"
|
||||
min={1}
|
||||
max={100}
|
||||
step={1}
|
||||
value={backgroundMusic.duckingAttackMs}
|
||||
onChange={(e) => updateBackgroundMusic({ duckingAttackMs: Number(e.target.value) })}
|
||||
className="flex-1 h-1.5"
|
||||
title="How quickly the ducking effect engages when speech starts"
|
||||
/>
|
||||
<span className="text-xs text-editor-text w-10 text-right">{backgroundMusic.duckingAttackMs}ms</span>
|
||||
</div>
|
||||
<div className="flex items-center gap-2">
|
||||
<span className="text-[10px] text-editor-text-muted w-20">Release:</span>
|
||||
<input
|
||||
type="range"
|
||||
min={10}
|
||||
max={1000}
|
||||
step={10}
|
||||
value={backgroundMusic.duckingReleaseMs}
|
||||
onChange={(e) => updateBackgroundMusic({ duckingReleaseMs: Number(e.target.value) })}
|
||||
className="flex-1 h-1.5"
|
||||
title="How quickly the ducking effect fades when speech ends"
|
||||
/>
|
||||
<span className="text-xs text-editor-text w-10 text-right">{backgroundMusic.duckingReleaseMs}ms</span>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
<p className="text-[10px] text-editor-text-muted leading-relaxed">
|
||||
The music will be mixed during export. Enable auto-ducking to lower music volume whenever speech is active.
|
||||
</p>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
160
frontend/src/components/DevPanel.tsx
Normal file
160
frontend/src/components/DevPanel.tsx
Normal file
@ -0,0 +1,160 @@
|
||||
import { useState, useCallback } from 'react';
|
||||
import { useEditorStore } from '../store/editorStore';
|
||||
import { Terminal, ChevronDown, ChevronUp, Play, Wifi, AlertTriangle } from 'lucide-react';
|
||||
|
||||
export default function DevPanel() {
|
||||
const [open, setOpen] = useState(false);
|
||||
const [pathInput, setPathInput] = useState('');
|
||||
const [testResult, setTestResult] = useState<string | null>(null);
|
||||
const [testing, setTesting] = useState(false);
|
||||
const [showResetConfirm, setShowResetConfirm] = useState(false);
|
||||
|
||||
const { backendUrl, videoPath, loadVideo } = useEditorStore();
|
||||
|
||||
const handleLoad = useCallback(() => {
|
||||
const p = pathInput.trim();
|
||||
if (p) loadVideo(p);
|
||||
}, [pathInput, loadVideo]);
|
||||
|
||||
const testEndpoint = useCallback(async (endpoint: string) => {
|
||||
setTesting(true);
|
||||
setTestResult(null);
|
||||
try {
|
||||
const url = `${backendUrl}${endpoint}`;
|
||||
const res = await fetch(url);
|
||||
const text = res.headers.get('content-type')?.includes('json')
|
||||
? JSON.stringify(await res.json(), null, 2)
|
||||
: `${res.status} ${res.statusText} (${res.headers.get('content-type') ?? 'no type'})`;
|
||||
setTestResult(`✓ ${url}\n${text}`);
|
||||
} catch (e) {
|
||||
setTestResult(`✗ ${e}`);
|
||||
} finally {
|
||||
setTesting(false);
|
||||
}
|
||||
}, [backendUrl]);
|
||||
|
||||
const testWaveform = useCallback(async () => {
|
||||
const p = pathInput.trim() || videoPath;
|
||||
if (!p) { setTestResult('No path to test'); return; }
|
||||
setTesting(true);
|
||||
setTestResult(null);
|
||||
try {
|
||||
const url = `${backendUrl}/audio/waveform?path=${encodeURIComponent(p)}`;
|
||||
const res = await fetch(url);
|
||||
if (res.ok) {
|
||||
const buf = await res.arrayBuffer();
|
||||
setTestResult(`✓ Waveform OK — ${buf.byteLength} bytes\n${url}`);
|
||||
} else {
|
||||
const body = await res.text().catch(() => '');
|
||||
setTestResult(`✗ HTTP ${res.status}\n${body}`);
|
||||
}
|
||||
} catch (e) {
|
||||
setTestResult(`✗ ${e}`);
|
||||
} finally {
|
||||
setTesting(false);
|
||||
}
|
||||
}, [backendUrl, pathInput, videoPath]);
|
||||
|
||||
return (
|
||||
<div className="fixed bottom-0 right-0 z-50 w-96 font-mono text-[11px]">
|
||||
{/* Header */}
|
||||
<button
|
||||
onClick={() => setOpen(o => !o)}
|
||||
className="w-full flex items-center justify-between px-3 py-1.5 bg-[#0d0f1a] border-t border-l border-[#2a2d3e] text-[#6b7280] hover:text-white"
|
||||
>
|
||||
<span className="flex items-center gap-1.5">
|
||||
<Terminal className="w-3 h-3" />
|
||||
DevPanel
|
||||
<span className="ml-2 text-[#4a4f6a]">{backendUrl}</span>
|
||||
</span>
|
||||
{open ? <ChevronDown className="w-3 h-3" /> : <ChevronUp className="w-3 h-3" />}
|
||||
</button>
|
||||
|
||||
{open && (
|
||||
<div className="bg-[#0d0f1a] border-t border-l border-[#2a2d3e] p-3 space-y-3">
|
||||
{/* State */}
|
||||
<div className="space-y-0.5 text-[#4a4f6a]">
|
||||
<div>backendUrl: <span className="text-[#6366f1]">{backendUrl}</span></div>
|
||||
<div className="truncate">videoPath: <span className="text-[#6366f1]">{videoPath ?? 'null'}</span></div>
|
||||
</div>
|
||||
|
||||
{/* Load file by path */}
|
||||
<div className="space-y-1">
|
||||
<div className="text-[#6b7280] uppercase tracking-wider text-[9px]">Load file</div>
|
||||
<div className="flex gap-1">
|
||||
<input
|
||||
type="text"
|
||||
value={pathInput}
|
||||
onChange={e => setPathInput(e.target.value)}
|
||||
onKeyDown={e => e.key === 'Enter' && handleLoad()}
|
||||
placeholder={videoPath ?? '/path/to/file.wav'}
|
||||
className="flex-1 bg-[#13141f] border border-[#2a2d3e] rounded px-2 py-1 text-white placeholder-[#2a2d3e] focus:outline-none focus:border-[#6366f1]"
|
||||
/>
|
||||
<button
|
||||
onClick={handleLoad}
|
||||
disabled={!pathInput.trim()}
|
||||
className="px-2 py-1 bg-[#6366f1] hover:bg-[#4f52d4] disabled:opacity-30 rounded text-white"
|
||||
>
|
||||
<Play className="w-3 h-3" />
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Quick tests */}
|
||||
<div className="space-y-1">
|
||||
<div className="text-[#6b7280] uppercase tracking-wider text-[9px]">Test endpoints</div>
|
||||
<div className="flex flex-wrap gap-1">
|
||||
<button onClick={() => testEndpoint('/health')} className="px-2 py-0.5 bg-[#1e2030] hover:bg-[#2a2d3e] rounded text-[#6b7280] hover:text-white flex items-center gap-1">
|
||||
<Wifi className="w-2.5 h-2.5" />/health
|
||||
</button>
|
||||
<button onClick={() => testEndpoint('/audio/capabilities')} className="px-2 py-0.5 bg-[#1e2030] hover:bg-[#2a2d3e] rounded text-[#6b7280] hover:text-white">
|
||||
/audio/capabilities
|
||||
</button>
|
||||
<button onClick={testWaveform} disabled={testing} className="px-2 py-0.5 bg-[#1e2030] hover:bg-[#2a2d3e] disabled:opacity-40 rounded text-[#6b7280] hover:text-white">
|
||||
/audio/waveform
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Result */}
|
||||
{testResult && (
|
||||
<pre className="bg-[#13141f] border border-[#2a2d3e] rounded p-2 text-[10px] text-[#9ca3af] whitespace-pre-wrap break-all max-h-32 overflow-y-auto">
|
||||
{testResult}
|
||||
</pre>
|
||||
)}
|
||||
{/* Danger Zone */}
|
||||
<div className="space-y-1">
|
||||
<div className="text-[#ef4444] uppercase tracking-wider text-[9px]">Danger Zone</div>
|
||||
{!showResetConfirm ? (
|
||||
<button
|
||||
onClick={() => setShowResetConfirm(true)}
|
||||
className="w-full px-2 py-1.5 rounded border border-red-500/40 text-red-400 hover:bg-red-500/10 text-xs flex items-center justify-center gap-1.5"
|
||||
>
|
||||
<AlertTriangle className="w-3 h-3" />
|
||||
Reset Editor State
|
||||
</button>
|
||||
) : (
|
||||
<div className="bg-[#1e1020] border border-red-500/40 rounded p-2 space-y-1.5">
|
||||
<p className="text-[#fca5a5] text-[10px]">This will clear all editor data and reload the page. Unsaved changes will be lost.</p>
|
||||
<div className="flex gap-1">
|
||||
<button
|
||||
onClick={() => setShowResetConfirm(false)}
|
||||
className="flex-1 px-2 py-1 rounded text-[10px] text-[#6b7280] hover:text-white hover:bg-[#2a2d3e]"
|
||||
>
|
||||
Cancel
|
||||
</button>
|
||||
<button
|
||||
onClick={() => { useEditorStore.getState().reset(); window.location.reload(); }}
|
||||
className="flex-1 px-2 py-1 rounded text-[10px] border border-red-500/40 text-red-400 hover:bg-red-500/10"
|
||||
>
|
||||
Confirm Reset
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
90
frontend/src/components/ErrorBoundary.tsx
Normal file
90
frontend/src/components/ErrorBoundary.tsx
Normal file
@ -0,0 +1,90 @@
|
||||
import { Component, type ReactNode } from 'react';
|
||||
|
||||
interface Props {
|
||||
children: ReactNode;
|
||||
}
|
||||
|
||||
interface State {
|
||||
hasError: boolean;
|
||||
error: Error | null;
|
||||
}
|
||||
|
||||
export default class ErrorBoundary extends Component<Props, State> {
|
||||
constructor(props: Props) {
|
||||
super(props);
|
||||
this.state = { hasError: false, error: null };
|
||||
}
|
||||
|
||||
static getDerivedStateFromError(error: Error): State {
|
||||
return { hasError: true, error };
|
||||
}
|
||||
|
||||
componentDidCatch(error: Error, info: React.ErrorInfo) {
|
||||
console.error('ErrorBoundary caught:', error, info.componentStack);
|
||||
try {
|
||||
window.electronAPI?.logError?.(error.message, error.stack || '', info.componentStack || '');
|
||||
} catch {}
|
||||
}
|
||||
|
||||
handleReload = () => {
|
||||
window.location.reload();
|
||||
};
|
||||
|
||||
handleReset = () => {
|
||||
try {
|
||||
localStorage.clear();
|
||||
sessionStorage.clear();
|
||||
} catch {}
|
||||
window.location.reload();
|
||||
};
|
||||
|
||||
render() {
|
||||
if (this.state.hasError) {
|
||||
return (
|
||||
<div className="h-screen flex flex-col items-center justify-center gap-6 bg-editor-bg px-6">
|
||||
<div className="flex flex-col items-center gap-3 max-w-md text-center">
|
||||
<div className="w-12 h-12 rounded-full bg-red-500/20 flex items-center justify-center">
|
||||
<svg className="w-6 h-6 text-red-400" fill="none" viewBox="0 0 24 24" stroke="currentColor">
|
||||
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M12 9v2m0 4h.01m-6.938 4h13.856c1.54 0 2.502-1.667 1.732-2.5L13.732 4c-.77-.833-1.964-.833-2.732 0L4.082 16.5c-.77.833.192 2.5 1.732 2.5z" />
|
||||
</svg>
|
||||
</div>
|
||||
<h2 className="text-lg font-semibold text-editor-text">Something went wrong</h2>
|
||||
<p className="text-xs text-editor-text-muted leading-relaxed">
|
||||
An unexpected error occurred. Your work may still be recoverable.
|
||||
</p>
|
||||
</div>
|
||||
|
||||
{this.state.error && (
|
||||
<details className="max-w-md w-full">
|
||||
<summary className="text-xs text-editor-text-muted cursor-pointer hover:text-editor-text">
|
||||
Error details
|
||||
</summary>
|
||||
<pre className="mt-2 p-3 rounded bg-editor-surface border border-editor-border text-[10px] text-red-300 overflow-auto max-h-32 whitespace-pre-wrap">
|
||||
{this.state.error.message}
|
||||
{'\n'}
|
||||
{this.state.error.stack}
|
||||
</pre>
|
||||
</details>
|
||||
)}
|
||||
|
||||
<div className="flex flex-col items-center gap-2">
|
||||
<button
|
||||
onClick={this.handleReload}
|
||||
className="px-4 py-2 bg-editor-accent hover:bg-editor-accent-hover rounded-lg text-sm font-medium transition-colors"
|
||||
>
|
||||
Reload App
|
||||
</button>
|
||||
<button
|
||||
onClick={this.handleReset}
|
||||
className="text-xs text-editor-text-muted hover:text-editor-text underline transition-colors"
|
||||
>
|
||||
Reset & Clear All Data
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
return this.props.children;
|
||||
}
|
||||
}
|
||||
623
frontend/src/components/ExportDialog.tsx
Normal file
623
frontend/src/components/ExportDialog.tsx
Normal file
@ -0,0 +1,623 @@
|
||||
import { useState, useCallback } from 'react';
|
||||
import { useEditorStore } from '../store/editorStore';
|
||||
import { Download, Loader2, Zap, Cog, Info, Volume2, FileText, ZoomIn, Video, Music } from 'lucide-react';
|
||||
import type { ExportOptions } from '../types/project';
|
||||
import { assert } from '../lib/assert';
|
||||
|
||||
export default function ExportDialog() {
|
||||
const { videoPath, words, cutRanges, muteRanges, gainRanges, speedRanges, globalGainDb, isExporting, exportProgress, backendUrl, setExporting, getKeepSegments, additionalClips, backgroundMusic } =
|
||||
useEditorStore();
|
||||
|
||||
const hasCuts = cutRanges.length > 0;
|
||||
|
||||
// Compute set of deleted word indices from cutRanges
|
||||
const getDeletedSet = useCallback(() => {
|
||||
const deletedSet = new Set<number>();
|
||||
for (const range of cutRanges) {
|
||||
for (let i = 0; i < words.length; i++) {
|
||||
if (words[i].start >= range.start && words[i].end <= range.end) {
|
||||
deletedSet.add(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
return deletedSet;
|
||||
}, [cutRanges, words]);
|
||||
|
||||
// Detect if input is audio-only by its extension
|
||||
const audioExtensions = new Set(['.wav', '.mp3', '.flac', '.m4a', '.ogg', '.aac', '.wma']);
|
||||
const inputExt = videoPath ? '.' + videoPath.split('.').pop()?.toLowerCase() : '';
|
||||
const isAudioOnly = videoPath ? audioExtensions.has(inputExt) : false;
|
||||
|
||||
const [options, setOptions] = useState<Omit<ExportOptions, 'outputPath'> & { normalizeAudio: boolean; normalizeTarget: number }>({
|
||||
mode: isAudioOnly ? 'reencode' : 'fast',
|
||||
resolution: '1080p',
|
||||
format: isAudioOnly ? 'wav' : 'mp4',
|
||||
enhanceAudio: false,
|
||||
captions: 'none',
|
||||
normalizeAudio: false,
|
||||
normalizeTarget: -14,
|
||||
zoom: { enabled: false, zoomFactor: 1.25, panX: 0, panY: 0 },
|
||||
removeBackground: false,
|
||||
backgroundReplacement: 'blur',
|
||||
backgroundReplacementValue: '',
|
||||
});
|
||||
const [exportError, setExportError] = useState<string | null>(null);
|
||||
const [transcriptFormat, setTranscriptFormat] = useState<'txt' | 'srt'>('txt');
|
||||
const [isTranscribingTranscript, setIsTranscribingTranscript] = useState(false);
|
||||
|
||||
const handleTranscriptExport = useCallback(async () => {
|
||||
if (!videoPath || words.length === 0) return;
|
||||
|
||||
const defaultExt = transcriptFormat === 'srt' ? 'srt' : 'txt';
|
||||
const outputPath = await window.electronAPI?.saveFile({
|
||||
defaultPath: videoPath.replace(/\.[^.]+$/, `_transcript.${defaultExt}`),
|
||||
filters: transcriptFormat === 'srt'
|
||||
? [{ name: 'SRT Subtitles', extensions: ['srt'] }]
|
||||
: [{ name: 'Text File', extensions: ['txt'] }],
|
||||
});
|
||||
if (!outputPath) return;
|
||||
|
||||
setIsTranscribingTranscript(true);
|
||||
try {
|
||||
// Compute deleted word set
|
||||
const deletedSet = getDeletedSet();
|
||||
|
||||
// Generate content entirely on the frontend — no backend needed
|
||||
let content: string;
|
||||
if (transcriptFormat === 'srt') {
|
||||
const lines: string[] = [];
|
||||
let counter = 1;
|
||||
const activeWords: Array<[number, typeof words[0]]> = [];
|
||||
for (let i = 0; i < words.length; i++) {
|
||||
if (!deletedSet.has(i)) activeWords.push([i, words[i]]);
|
||||
}
|
||||
const wordsPerLine = 8;
|
||||
for (let ci = 0; ci < activeWords.length; ci += wordsPerLine) {
|
||||
const chunk = activeWords.slice(ci, ci + wordsPerLine);
|
||||
if (chunk.length === 0) continue;
|
||||
const startTime = chunk[0][1].start;
|
||||
const endTime = chunk[chunk.length - 1][1].end;
|
||||
|
||||
const fmt = (s: number) => {
|
||||
const h = Math.floor(s / 3600);
|
||||
const m = Math.floor((s % 3600) / 60);
|
||||
const sec = Math.floor(s % 60);
|
||||
const ms = Math.floor((s % 1) * 1000);
|
||||
return `${String(h).padStart(2, '0')}:${String(m).padStart(2, '0')}:${String(sec).padStart(2, '0')},${String(ms).padStart(3, '0')}`;
|
||||
};
|
||||
|
||||
lines.push(String(counter));
|
||||
lines.push(`${fmt(startTime)} --> ${fmt(endTime)}`);
|
||||
lines.push(chunk.map(([, w]) => w.word).join(' '));
|
||||
lines.push('');
|
||||
counter++;
|
||||
}
|
||||
content = lines.join('\n');
|
||||
} else {
|
||||
// Plain text
|
||||
const activeWords: string[] = [];
|
||||
for (let i = 0; i < words.length; i++) {
|
||||
if (!deletedSet.has(i)) activeWords.push(words[i].word);
|
||||
}
|
||||
content = activeWords.join(' ');
|
||||
}
|
||||
|
||||
// Write directly via Tauri — instant, no backend round-trip
|
||||
await window.electronAPI?.writeFile(outputPath, content);
|
||||
} catch (err) {
|
||||
console.error('Transcript export error:', err);
|
||||
setExportError(err instanceof Error ? err.message : 'Transcript export failed');
|
||||
} finally {
|
||||
setIsTranscribingTranscript(false);
|
||||
}
|
||||
}, [videoPath, words, getDeletedSet, transcriptFormat]);
|
||||
|
||||
const HANDLE_EXPORT_filters = useCallback(() => {
|
||||
const ext = options.format;
|
||||
const nameMap: Record<string, string> = {
|
||||
mp4: 'MP4',
|
||||
mov: 'MOV',
|
||||
webm: 'WebM',
|
||||
wav: 'WAV Audio',
|
||||
};
|
||||
return [{ name: nameMap[ext] || 'File', extensions: [ext] }];
|
||||
}, [options.format]);
|
||||
|
||||
const handleExport = useCallback(async () => {
|
||||
if (!videoPath) return;
|
||||
|
||||
const defaultExt = options.format === 'wav' ? 'wav' : 'mp4';
|
||||
const outputPath = await window.electronAPI?.saveFile({
|
||||
defaultPath: videoPath.replace(/\.[^.]+$/, `_edited.${defaultExt}`),
|
||||
filters: HANDLE_EXPORT_filters(),
|
||||
});
|
||||
if (!outputPath) return;
|
||||
|
||||
setExporting(true, 0);
|
||||
setExportError(null);
|
||||
try {
|
||||
const keepSegments = getKeepSegments();
|
||||
assert(words.length > 0, 'handleExport: words is empty before building keep segments');
|
||||
const deletedSet = getDeletedSet();
|
||||
|
||||
// Map frontend camelCase gain/speed fields to backend snake_case
|
||||
const backendGainRanges = gainRanges.map((r) => ({
|
||||
start: r.start,
|
||||
end: r.end,
|
||||
gain_db: r.gainDb,
|
||||
}));
|
||||
const backendSpeedRanges = speedRanges.map((r) => ({
|
||||
start: r.start,
|
||||
end: r.end,
|
||||
speed: r.speed,
|
||||
}));
|
||||
|
||||
const body: Record<string, any> = {
|
||||
input_path: videoPath,
|
||||
output_path: outputPath,
|
||||
keep_segments: keepSegments,
|
||||
mute_ranges: muteRanges.length > 0 ? muteRanges.map((r) => ({ start: r.start, end: r.end })) : undefined,
|
||||
gain_ranges: backendGainRanges.length > 0 ? backendGainRanges : undefined,
|
||||
speed_ranges: backendSpeedRanges.length > 0 ? backendSpeedRanges : undefined,
|
||||
global_gain_db: globalGainDb,
|
||||
words: options.captions !== 'none' ? words : undefined,
|
||||
deleted_indices: options.captions !== 'none' ? [...deletedSet] : undefined,
|
||||
mode: options.mode,
|
||||
resolution: options.resolution,
|
||||
format: options.format,
|
||||
enhanceAudio: options.enhanceAudio,
|
||||
normalize_loudness: options.normalizeAudio,
|
||||
normalize_target_lufs: options.normalizeTarget,
|
||||
captions: options.captions,
|
||||
};
|
||||
|
||||
// Zoom
|
||||
if (options.zoom?.enabled) {
|
||||
body.zoom = options.zoom;
|
||||
}
|
||||
|
||||
// Additional clips
|
||||
if (additionalClips.length > 0) {
|
||||
body.additional_clips = additionalClips.map((c) => c.path);
|
||||
}
|
||||
|
||||
// Background music
|
||||
if (backgroundMusic) {
|
||||
body.background_music = backgroundMusic;
|
||||
}
|
||||
|
||||
// Background removal
|
||||
if (options.removeBackground) {
|
||||
body.remove_background = true;
|
||||
body.background_replacement = options.backgroundReplacement || 'blur';
|
||||
body.background_replacement_value = options.backgroundReplacementValue || '';
|
||||
}
|
||||
|
||||
const res = await fetch(`${backendUrl}/export`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify(body),
|
||||
});
|
||||
if (!res.ok) {
|
||||
let detail = res.statusText;
|
||||
try {
|
||||
const body = await res.json();
|
||||
if (body?.detail) detail = String(body.detail);
|
||||
} catch {
|
||||
// Keep statusText fallback when response body is not JSON.
|
||||
}
|
||||
throw new Error(`Export failed: ${detail}`);
|
||||
}
|
||||
setExporting(false, 100);
|
||||
} catch (err) {
|
||||
console.error('Export error:', err);
|
||||
setExportError(err instanceof Error ? err.message : 'Export failed');
|
||||
setExporting(false);
|
||||
}
|
||||
}, [videoPath, options, backendUrl, setExporting, getKeepSegments, getDeletedSet, muteRanges, gainRanges, speedRanges, globalGainDb, words, HANDLE_EXPORT_filters, additionalClips, backgroundMusic]);
|
||||
|
||||
return (
|
||||
<div className="p-4 space-y-5">
|
||||
<h3 className="text-sm font-semibold">Export Video</h3>
|
||||
|
||||
{/* Mode */}
|
||||
<fieldset className="space-y-2">
|
||||
<legend className="text-xs text-editor-text-muted font-medium">Export Mode</legend>
|
||||
<div className="grid grid-cols-2 gap-2">
|
||||
<ModeCard
|
||||
active={options.mode === 'fast'}
|
||||
onClick={() => setOptions((o) => ({ ...o, mode: 'fast' }))}
|
||||
icon={<Zap className="w-4 h-4" />}
|
||||
title="Fast"
|
||||
desc="Stream copy, seconds"
|
||||
tooltip="Stream copy — fast, no quality loss, but does not apply cuts or effects"
|
||||
/>
|
||||
<ModeCard
|
||||
active={options.mode === 'reencode'}
|
||||
onClick={() => setOptions((o) => ({ ...o, mode: 'reencode' }))}
|
||||
icon={<Cog className="w-4 h-4" />}
|
||||
title="Re-encode"
|
||||
desc="Custom quality, slower"
|
||||
tooltip="Full re-encode — applies cuts, gain, speed, zoom, captions, and effects"
|
||||
/>
|
||||
</div>
|
||||
</fieldset>
|
||||
|
||||
{/* Resolution (only for re-encode) */}
|
||||
{options.mode === 'reencode' && (
|
||||
<SelectField
|
||||
label="Resolution"
|
||||
value={options.resolution}
|
||||
onChange={(v) => setOptions((o) => ({ ...o, resolution: v as ExportOptions['resolution'] }))}
|
||||
options={[
|
||||
{ value: '720p', label: '720p (HD)' },
|
||||
{ value: '1080p', label: '1080p (Full HD)' },
|
||||
{ value: '4k', label: '4K (Ultra HD)' },
|
||||
]}
|
||||
title="Output video resolution — higher resolution = larger file"
|
||||
/>
|
||||
)}
|
||||
|
||||
{/* Format */}
|
||||
<SelectField
|
||||
label="Format"
|
||||
value={options.format}
|
||||
onChange={(v) => setOptions((o) => ({ ...o, format: v as ExportOptions['format'] }))}
|
||||
options={[
|
||||
{ value: 'mp4', label: 'MP4 (H.264)' },
|
||||
{ value: 'mov', label: 'MOV (QuickTime)' },
|
||||
{ value: 'webm', label: 'WebM (VP9)' },
|
||||
...(isAudioOnly ? [{ value: 'wav' as const, label: 'WAV (Uncompressed)' }] : []),
|
||||
]}
|
||||
title="Output container format — MP4 is most compatible"
|
||||
/>
|
||||
|
||||
{/* Video zoom / punch-in */}
|
||||
<div className="space-y-2 pt-1 border-t border-editor-border">
|
||||
<label className="flex items-center gap-2 cursor-pointer">
|
||||
<input
|
||||
type="checkbox"
|
||||
checked={options.zoom?.enabled || false}
|
||||
onChange={(e) => setOptions((o) => ({ ...o, zoom: { ...o.zoom!, enabled: e.target.checked } }))}
|
||||
className="w-4 h-4 rounded bg-editor-surface border-editor-border accent-editor-accent"
|
||||
title="Crop and reposition the video frame — useful for removing black bars or reframing"
|
||||
/>
|
||||
<div>
|
||||
<span className="text-xs font-medium flex items-center gap-1">
|
||||
<ZoomIn className="w-3 h-3" />
|
||||
Video zoom / punch-in
|
||||
</span>
|
||||
<p className="text-[10px] text-editor-text-muted">
|
||||
Crop and zoom into the center of the video. Requires re-encode.
|
||||
</p>
|
||||
</div>
|
||||
</label>
|
||||
{options.zoom?.enabled && (
|
||||
<div className="pl-6 space-y-2">
|
||||
<div className="flex items-center gap-2">
|
||||
<span className="text-[10px] text-editor-text-muted w-16">Zoom:</span>
|
||||
<input
|
||||
type="range"
|
||||
min={1}
|
||||
max={3}
|
||||
step={0.05}
|
||||
value={options.zoom?.zoomFactor || 1}
|
||||
onChange={(e) => setOptions((o) => ({ ...o, zoom: { ...o.zoom!, zoomFactor: Number(e.target.value) } }))}
|
||||
className="flex-1 h-1.5"
|
||||
title="Magnification level — 1.0x is original, higher values zoom in"
|
||||
/>
|
||||
<span className="text-xs text-editor-text w-10 text-right">{options.zoom?.zoomFactor?.toFixed(2)}x</span>
|
||||
</div>
|
||||
<div className="flex items-center gap-2">
|
||||
<span className="text-[10px] text-editor-text-muted w-16">Pan X:</span>
|
||||
<input
|
||||
type="range"
|
||||
min={-1}
|
||||
max={1}
|
||||
step={0.05}
|
||||
value={options.zoom?.panX || 0}
|
||||
onChange={(e) => setOptions((o) => ({ ...o, zoom: { ...o.zoom!, panX: Number(e.target.value) } }))}
|
||||
className="flex-1 h-1.5"
|
||||
title="Horizontal position of the crop window — negative moves left, positive moves right"
|
||||
/>
|
||||
<span className="text-xs text-editor-text w-10 text-right">{((options.zoom?.panX || 0) * 100).toFixed(0)}%</span>
|
||||
</div>
|
||||
<div className="flex items-center gap-2">
|
||||
<span className="text-[10px] text-editor-text-muted w-16">Pan Y:</span>
|
||||
<input
|
||||
type="range"
|
||||
min={-1}
|
||||
max={1}
|
||||
step={0.05}
|
||||
value={options.zoom?.panY || 0}
|
||||
onChange={(e) => setOptions((o) => ({ ...o, zoom: { ...o.zoom!, panY: Number(e.target.value) } }))}
|
||||
className="flex-1 h-1.5"
|
||||
title="Vertical position of the crop window — negative moves up, positive moves down"
|
||||
/>
|
||||
<span className="text-xs text-editor-text w-10 text-right">{((options.zoom?.panY || 0) * 100).toFixed(0)}%</span>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
|
||||
{/* Background removal */}
|
||||
{!isAudioOnly && (
|
||||
<div className="space-y-2 pt-1 border-t border-editor-border">
|
||||
<label className="flex items-center gap-2 cursor-pointer">
|
||||
<input
|
||||
type="checkbox"
|
||||
checked={options.removeBackground || false}
|
||||
onChange={(e) => setOptions((o) => ({ ...o, removeBackground: e.target.checked }))}
|
||||
className="w-4 h-4 rounded bg-editor-surface border-editor-border accent-editor-accent"
|
||||
title="Remove or replace the background behind the speaker"
|
||||
/>
|
||||
<div>
|
||||
<span className="text-xs font-medium flex items-center gap-1">
|
||||
<Video className="w-3 h-3" />
|
||||
Remove background
|
||||
</span>
|
||||
<p className="text-[10px] text-editor-text-muted">
|
||||
Replace or blur the background. Uses MediaPipe if available.
|
||||
</p>
|
||||
</div>
|
||||
</label>
|
||||
{options.removeBackground && (
|
||||
<div className="pl-6 space-y-2">
|
||||
<SelectField
|
||||
label="Background replacement"
|
||||
value={options.backgroundReplacement || 'blur'}
|
||||
onChange={(v) => setOptions((o) => ({ ...o, backgroundReplacement: v as 'blur' | 'color' | 'image' }))}
|
||||
options={[
|
||||
{ value: 'blur', label: 'Blur background' },
|
||||
{ value: 'color', label: 'Solid color' },
|
||||
{ value: 'image', label: 'Custom image' },
|
||||
]}
|
||||
/>
|
||||
{options.backgroundReplacement === 'color' && (
|
||||
<input
|
||||
type="text"
|
||||
value={options.backgroundReplacementValue || '#00FF00'}
|
||||
onChange={(e) => setOptions((o) => ({ ...o, backgroundReplacementValue: e.target.value }))}
|
||||
placeholder="#00FF00"
|
||||
className="w-full px-2 py-1.5 text-xs bg-editor-surface border border-editor-border rounded focus:outline-none focus:border-editor-accent [color-scheme:dark]"
|
||||
/>
|
||||
)}
|
||||
{options.backgroundReplacement === 'image' && (
|
||||
<p className="text-[10px] text-editor-text-muted">Place a background image file path above.</p>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Background music track info */}
|
||||
{backgroundMusic && (
|
||||
<div className="pt-1 border-t border-editor-border">
|
||||
<div className="flex items-center gap-1.5 text-xs text-editor-accent">
|
||||
<Music className="w-3 h-3" />
|
||||
Background music: {backgroundMusic.path.split(/[/\\]/).pop()}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Append clips info */}
|
||||
{additionalClips.length > 0 && (
|
||||
<div className="pt-1 border-t border-editor-border">
|
||||
<div className="flex items-center gap-1.5 text-xs text-editor-accent">
|
||||
<Video className="w-3 h-3" />
|
||||
{additionalClips.length} additional clip{additionalClips.length > 1 ? 's' : ''} appended
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Audio normalization — integrated into export */}
|
||||
<div className="space-y-2 pt-1 border-t border-editor-border">
|
||||
<label className="flex items-center gap-2 cursor-pointer">
|
||||
<input
|
||||
type="checkbox"
|
||||
checked={options.normalizeAudio}
|
||||
onChange={(e) => setOptions((o) => ({ ...o, normalizeAudio: e.target.checked }))}
|
||||
className="w-4 h-4 rounded bg-editor-surface border-editor-border accent-editor-accent"
|
||||
title="Normalize audio to a consistent loudness target"
|
||||
/>
|
||||
<div>
|
||||
<span className="text-xs font-medium">Normalize loudness</span>
|
||||
<p className="text-[10px] text-editor-text-muted">
|
||||
Apply LUFS normalization during export. Requires re-encode.
|
||||
</p>
|
||||
</div>
|
||||
</label>
|
||||
{options.normalizeAudio && (
|
||||
<div className="flex items-center gap-2 pl-6">
|
||||
<Volume2 className="w-3 h-3 text-editor-text-muted shrink-0" />
|
||||
<select
|
||||
value={options.normalizeTarget}
|
||||
onChange={(e) => setOptions((o) => ({ ...o, normalizeTarget: Number(e.target.value) }))}
|
||||
className="flex-1 px-2 py-1.5 text-xs bg-editor-surface border border-editor-border rounded focus:outline-none focus:border-editor-accent [color-scheme:dark]"
|
||||
title="Loudness target — YouTube (-14), Spotify (-16), Broadcast (-23)"
|
||||
>
|
||||
<option value={-14}>YouTube (-14 LUFS)</option>
|
||||
<option value={-16}>Spotify (-16 LUFS)</option>
|
||||
<option value={-23}>Broadcast (-23 LUFS)</option>
|
||||
<option value={-11}>Loud (-11 LUFS)</option>
|
||||
<option value={-9}>Very Loud (-9 LUFS)</option>
|
||||
</select>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
|
||||
{/* Audio enhancement */}
|
||||
<label className="flex items-center gap-2 cursor-pointer">
|
||||
<input
|
||||
type="checkbox"
|
||||
checked={options.enhanceAudio}
|
||||
onChange={(e) => setOptions((o) => ({ ...o, enhanceAudio: e.target.checked }))}
|
||||
className="w-4 h-4 rounded bg-editor-surface border-editor-border accent-editor-accent"
|
||||
title="Apply noise reduction and speech enhancement"
|
||||
/>
|
||||
<span className="text-xs">Enhance audio (Studio Sound)</span>
|
||||
</label>
|
||||
|
||||
{/* Captions */}
|
||||
<SelectField
|
||||
label="Captions"
|
||||
value={options.captions}
|
||||
onChange={(v) => setOptions((o) => ({ ...o, captions: v as ExportOptions['captions'] }))}
|
||||
options={[
|
||||
{ value: 'none', label: 'No captions' },
|
||||
{ value: 'burn-in', label: 'Burn-in (permanent)' },
|
||||
{ value: 'sidecar', label: 'Sidecar SRT file' },
|
||||
]}
|
||||
title="Burn captions into video, export as separate SRT/VTT file, or none"
|
||||
/>
|
||||
|
||||
{/* Transcript-only export */}
|
||||
<div className="space-y-2 pt-1 border-t border-editor-border">
|
||||
<h4 className="text-xs font-semibold flex items-center gap-1.5">
|
||||
<FileText className="w-3.5 h-3.5" />
|
||||
Export Transcript Only
|
||||
</h4>
|
||||
<p className="text-[10px] text-editor-text-muted leading-relaxed">
|
||||
Export the edited transcript as plain text or SRT without rendering video.
|
||||
</p>
|
||||
<div className="flex items-center gap-2">
|
||||
<select
|
||||
value={transcriptFormat}
|
||||
onChange={(e) => setTranscriptFormat(e.target.value as 'txt' | 'srt')}
|
||||
className="flex-1 px-2 py-1.5 text-xs bg-editor-surface border border-editor-border rounded focus:outline-none focus:border-editor-accent [color-scheme:dark]"
|
||||
>
|
||||
<option value="txt">Plain Text (.txt)</option>
|
||||
<option value="srt">Subtitles (.srt)</option>
|
||||
</select>
|
||||
<button
|
||||
onClick={handleTranscriptExport}
|
||||
disabled={isTranscribingTranscript || words.length === 0}
|
||||
className="flex items-center gap-1.5 px-3 py-1.5 text-xs rounded bg-editor-accent/20 text-editor-accent hover:bg-editor-accent/30 disabled:opacity-40 transition-colors"
|
||||
title="Export just the transcript text or subtitles without the video"
|
||||
>
|
||||
{isTranscribingTranscript ? (
|
||||
<Loader2 className="w-3 h-3 animate-spin" />
|
||||
) : (
|
||||
<FileText className="w-3 h-3" />
|
||||
)}
|
||||
Export
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Export video button */}
|
||||
<button
|
||||
onClick={handleExport}
|
||||
disabled={isExporting || !videoPath}
|
||||
className="w-full flex items-center justify-center gap-2 px-4 py-3 bg-editor-accent hover:bg-editor-accent-hover disabled:opacity-40 rounded-lg text-sm font-semibold transition-colors"
|
||||
title="Start export with current settings"
|
||||
>
|
||||
<Download className="w-4 h-4" />
|
||||
Export Video
|
||||
</button>
|
||||
|
||||
{/* Export progress */}
|
||||
{isExporting && (
|
||||
<div className="space-y-2">
|
||||
<div className="flex items-center gap-2">
|
||||
<Loader2 className="w-4 h-4 animate-spin text-editor-accent" />
|
||||
<span className="text-xs font-medium">Exporting...</span>
|
||||
<span className="text-xs text-editor-text-muted">{Math.round(exportProgress)}%</span>
|
||||
</div>
|
||||
<div className="w-full h-2 bg-editor-border rounded-full overflow-hidden">
|
||||
<div
|
||||
className="h-full bg-editor-accent rounded-full transition-all duration-300"
|
||||
style={{ width: `${exportProgress}%` }}
|
||||
/>
|
||||
</div>
|
||||
<p className="text-xs text-editor-text-muted">Export in progress...</p>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{exportError && (
|
||||
<div className="rounded border border-red-500/40 bg-red-500/10 px-3 py-2 text-xs text-red-300">
|
||||
{exportError}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{options.mode === 'fast' && !hasCuts && (
|
||||
<p className="text-[10px] text-editor-text-muted text-center">
|
||||
Fast mode uses stream copy — no quality loss, exports in seconds.
|
||||
</p>
|
||||
)}
|
||||
{options.mode === 'fast' && hasCuts && (
|
||||
<div className="flex items-start gap-1.5 p-2 bg-editor-accent/10 rounded text-[10px] text-editor-accent">
|
||||
<Info className="w-3.5 h-3.5 shrink-0 mt-0.5" />
|
||||
<span>
|
||||
Word-level cuts require re-encoding for frame-accurate output. Export will
|
||||
automatically use re-encode mode. This takes longer but ensures your cuts are precise.
|
||||
</span>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Selectable card for the export-mode picker (Fast vs. Re-encode).
 * Highlights with the accent border when `active`; `tooltip` becomes the
 * native title attribute.
 */
function ModeCard({
  active,
  onClick,
  icon,
  title,
  desc,
  tooltip,
}: {
  active: boolean;
  onClick: () => void;
  icon: React.ReactNode;
  title: string;
  desc: string;
  tooltip?: string;
}) {
  // Resolve the state-dependent border styling up front for readability.
  const stateClasses = active
    ? 'border-editor-accent bg-editor-accent/10'
    : 'border-editor-border hover:border-editor-text-muted';

  return (
    <button
      onClick={onClick}
      title={tooltip}
      className={`flex flex-col items-center gap-1 p-3 rounded-lg border-2 transition-colors ${stateClasses}`}
    >
      {icon}
      <span className="text-xs font-medium">{title}</span>
      <span className="text-[10px] text-editor-text-muted">{desc}</span>
    </button>
  );
}
|
||||
|
||||
/**
 * Labeled dropdown in the editor panel style. Controlled component:
 * `value` is the current selection, `onChange` receives the raw option value.
 */
function SelectField({
  label,
  value,
  onChange,
  options,
  title,
}: {
  label: string;
  value: string;
  onChange: (value: string) => void;
  options: Array<{ value: string; label: string }>;
  title?: string;
}) {
  // Forward the native change event's value to the caller's handler.
  const handleSelect = (e: React.ChangeEvent<HTMLSelectElement>) => {
    onChange(e.target.value);
  };

  return (
    <div className="space-y-1">
      <label className="text-xs text-editor-text-muted font-medium">{label}</label>
      <select
        title={title}
        value={value}
        onChange={handleSelect}
        className="w-full px-3 py-2 bg-editor-surface border border-editor-border rounded-lg text-xs text-editor-text focus:outline-none focus:border-editor-accent [color-scheme:dark]"
      >
        {options.map(({ value: optValue, label: optLabel }) => (
          <option key={optValue} value={optValue}>
            {optLabel}
          </option>
        ))}
      </select>
    </div>
  );
}
|
||||
156
frontend/src/components/HelpContent.tsx
Normal file
156
frontend/src/components/HelpContent.tsx
Normal file
@ -0,0 +1,156 @@
|
||||
import { HelpCircle, Scissors, VolumeX, SlidersHorizontal, Gauge, Film, Search, FileText, Download, Music, MapPin, ListVideo, Sparkles, Keyboard } from 'lucide-react';
|
||||
|
||||
/**
 * Static in-app help & reference panel.
 *
 * Pure presentational component — no props, no state. Renders a scrollable
 * column of `Section` cards covering getting started, editing tools, the
 * waveform timeline, transcript editing, AI tools, export, and shortcuts.
 */
export default function HelpContent() {
  return (
    <div className="p-4 space-y-5 overflow-y-auto">
      <h3 className="text-sm font-semibold flex items-center gap-1.5">
        <HelpCircle className="w-4 h-4" />
        Help & Reference
      </h3>

      <Section title="Getting Started" icon={<Film className="w-3.5 h-3.5" />}>
        <Step num={1}>Open a video file — click <strong>File > Open File</strong> or press <kbd>Ctrl+O</kbd></Step>
        <Step num={2}>Wait for transcription — Whisper processes your audio and creates a word-level transcript</Step>
        <Step num={3}>Edit by selecting words — choose <strong>Cut</strong>, <strong>Mute</strong>, <strong>Sound Gain</strong>, or <strong>Speed Adjust</strong> from the toolbar</Step>
        <Step num={4}>Use AI tools — detect filler words, find clips, re-transcribe with a different model</Step>
        <Step num={5}>Export — apply all edits and save your final video</Step>
        <Step>Press <kbd>?</kbd> anytime to see all keyboard shortcuts</Step>
      </Section>

      <Section title="Cut / Mute / Sound Gain / Speed Adjust" icon={<Scissors className="w-3.5 h-3.5" />}>
        <P>These are time-range edits applied during export. You create them in three ways:</P>
        <Bullet>Select words in the transcript — the toolbar buttons create a zone from the selected word range</Bullet>
        <Bullet>Use <strong>Mark In</strong> (<kbd>I</kbd>) and <strong>Mark Out</strong> (<kbd>O</kbd>) on the timeline, then clicking the toolbar button</Bullet>
        <Bullet>Click a toolbar button to enter <strong>zone mode</strong>, then drag on the waveform timeline to draw a zone</Bullet>
        <P className="mt-2">
          <strong>Cut</strong> — removes the segment from the output entirely<br />
          <strong>Mute</strong> — silences the audio but keeps the video<br />
          <strong>Sound Gain</strong> — adjusts volume (positive = louder, negative = quieter)<br />
          <strong>Speed Adjust</strong> — changes playback speed (1.0x = normal, 2.0x = double)
        </P>
        <P>View and manage all zones in the <strong>Edit Zones</strong> panel. Click a zone on the waveform to select it — drag edges to resize, drag the body to move.</P>
      </Section>

      <Section title="Waveform Timeline" icon={<Film className="w-3.5 h-3.5" />}>
        <Bullet>Click to seek, drag to scrub through the video</Bullet>
        <Bullet>Enter zone mode from the toolbar, then drag on the waveform to create a zone</Bullet>
        <Bullet>Click an existing zone to select it — drag edges to resize, drag body to move</Bullet>
        <Bullet><kbd>Delete</kbd> or <kbd>Backspace</kbd> removes the selected zone (with confirmation)</Bullet>
        <Bullet><kbd>Ctrl+Scroll</kbd> to zoom in/out, scroll to pan horizontally</Bullet>
        <Bullet>Toggle individual zone types on/off with the colored buttons above the waveform</Bullet>
        <Bullet>"Show adjusted timeline" compresses cut regions to preview the output</Bullet>
      </Section>

      <Section title="Transcript Editing" icon={<FileText className="w-3.5 h-3.5" />}>
        <Bullet>Click a word to select it, <kbd>Shift+Click</kbd> to extend the selection</Bullet>
        <Bullet><kbd>Ctrl+Click</kbd> any word to seek the video to that exact timestamp</Bullet>
        <Bullet>Double-click any word to edit its text directly</Bullet>
        <Bullet>Words with low confidence get an orange dotted underline — adjust the threshold in Settings</Bullet>
        <Bullet><kbd>Ctrl+F</kbd> to search the transcript — navigate matches with <kbd>Enter</kbd> / <kbd>Shift+Enter</kbd></Bullet>
        <Bullet>Select a word range and click <strong>Re-transcribe</strong> to re-run Whisper on just that segment</Bullet>
      </Section>

      <Section title="Chapter Marks" icon={<MapPin className="w-3.5 h-3.5" />}>
        <Bullet>Add markers at the current playhead position with a label and color</Bullet>
        <Bullet>Use <kbd>I</kbd> / <kbd>O</kbd> keys to set mark in/out points on the timeline</Bullet>
        <Bullet>Markers auto-sort as chapters — click <strong>Copy as YouTube timestamps</strong> to get chapter text</Bullet>
      </Section>

      <Section title="AI Tools" icon={<Sparkles className="w-3.5 h-3.5" />}>
        <P><strong>Filler Words</strong> — detects "um", "uh", "like", "you know" and similar words. Add custom fillers (e.g. "okay", "alright"). <strong>Apply All</strong> creates cut ranges for every detection at once.</P>
        <P><strong>Create Clips</strong> — analyzes your transcript to find the best 20-60 second segments for TikTok, YouTube Shorts, or Instagram Reels.</P>
        <P><strong>Reprocess</strong> — re-run transcription with a different Whisper model. Larger models are more accurate but slower. English-only models are faster for English content.</P>
        <P>AI features work with the bundled local model (no setup needed), or via Ollama/OpenAI/Claude — configure in Settings.</P>
      </Section>

      <Section title="Export" icon={<Download className="w-3.5 h-3.5" />}>
        <Bullet><strong>Fast mode</strong> (stream copy): instant, no quality loss — but doesn't apply cuts or effects</Bullet>
        <Bullet><strong>Re-encode mode</strong>: applies all edits — cuts, gain, speed, zoom, captions, background music</Bullet>
        <Bullet>Captions can be burned into the video or exported as separate SRT/VTT files</Bullet>
        <Bullet>Loudness normalization targets: YouTube (-14 LUFS), Spotify (-16), Broadcast (-23)</Bullet>
        <Bullet>Audio enhancement: noise reduction and speech clarity</Bullet>
        <Bullet>Export Transcript Only — get SRT or plain text without the video</Bullet>
      </Section>

      <Section title="Background Music + Add Clips" icon={<Music className="w-3.5 h-3.5" />}>
        <Bullet><strong>Bkg. Music</strong> — add a music track with auto-ducking: the music automatically lowers when someone speaks. Adjust volume, duck amount, attack, and release times.</Bullet>
        <Bullet><strong>Add Clips</strong> — load additional video files to concatenate during export. Drag to reorder.</Bullet>
        <Bullet>Both are applied during re-encode export only</Bullet>
      </Section>

      <Section title="Keyboard Shortcuts" icon={<Keyboard className="w-3.5 h-3.5" />}>
        <P>Press <kbd>?</kbd> anytime to see the full cheatsheet overlay. Remap any shortcut in Settings.</P>
        <div className="grid grid-cols-2 gap-1 mt-2">
          <Shortcut keys="Space" desc="Play / Pause" />
          <Shortcut keys="J K L" desc="Slow / Pause / Speed" />
          <Shortcut keys="← →" desc="Skip 5s back / forward" />
          <Shortcut keys="I / O" desc="Mark In / Out points" />
          <Shortcut keys="Delete" desc="Cut selected / marked range" />
          <Shortcut keys="Ctrl+Z" desc="Undo" />
          <Shortcut keys="Ctrl+Shift+Z" desc="Redo" />
          <Shortcut keys="Ctrl+S" desc="Save project" />
          <Shortcut keys="Ctrl+E" desc="Export" />
          <Shortcut keys="Ctrl+F" desc="Find in transcript" />
          <Shortcut keys="?" desc="Toggle cheatsheet" />
        </div>
        {/* Opens the full cheatsheet by synthesizing a '?' keydown; relies on a
            global keydown listener elsewhere in the app handling that key. */}
        <button
          onClick={() => window.dispatchEvent(new KeyboardEvent('keydown', { key: '?' }))}
          className="text-editor-accent hover:underline text-xs mt-2"
        >
          View full keyboard shortcut reference
        </button>
      </Section>

      {/* Footer blurb — marketing/offline statement. */}
      <div className="text-[10px] text-editor-text-muted leading-relaxed border-t border-editor-border pt-4">
        TalkEdit is 100% offline. No account required. No data leaves your machine. No subscription — buy once, own forever.
      </div>
    </div>
  );
}
|
||||
|
||||
/**
 * Card-style grouping used by the help panel: icon + heading above a
 * vertically spaced body.
 */
function Section({ title, icon, children }: { title: string; icon: React.ReactNode; children: React.ReactNode }) {
  return (
    <div className="space-y-2 p-3 bg-editor-surface rounded-lg">
      <h4 className="text-xs font-semibold flex items-center gap-1.5 text-editor-text">
        {icon}
        {title}
      </h4>
      <div className="space-y-1.5">{children}</div>
    </div>
  );
}
|
||||
|
||||
/** Muted body paragraph; extra utility classes may be appended via `className`. */
function P({ children, className = '' }: { children: React.ReactNode; className?: string }) {
  const classes = `text-xs text-editor-text-muted leading-relaxed ${className}`;
  return <p className={classes}>{children}</p>;
}
|
||||
|
||||
/** Bullet list row: small accent dot followed by muted text. */
function Bullet({ children }: { children: React.ReactNode }) {
  // The dot marker is purely decorative (empty span styled as a circle).
  const marker = (
    <span className="text-editor-accent mt-1.5 w-1 h-1 rounded-full bg-editor-accent shrink-0" />
  );
  return (
    <div className="flex items-start gap-1.5">
      {marker}
      <span className="text-xs text-editor-text-muted leading-relaxed">{children}</span>
    </div>
  );
}
|
||||
|
||||
/**
 * Numbered step row for the getting-started list. `num` is optional — when
 * omitted the badge circle renders empty (used for un-numbered notes).
 */
function Step({ num, children }: { num?: number; children: React.ReactNode }) {
  const badge = (
    <span className="w-5 h-5 rounded-full bg-editor-accent/20 text-editor-accent text-[10px] font-semibold flex items-center justify-center shrink-0 mt-0.5">
      {num}
    </span>
  );
  return (
    <div className="flex items-start gap-2">
      {badge}
      <span className="text-xs text-editor-text-muted leading-relaxed">{children}</span>
    </div>
  );
}
|
||||
|
||||
/** Single keyboard-shortcut row: fixed-width key chip plus description. */
function Shortcut({ keys, desc }: { keys: string; desc: string }) {
  const kbdClasses =
    'px-1.5 py-0.5 text-[10px] font-mono bg-editor-bg border border-editor-border rounded text-editor-text min-w-[72px] text-center';
  return (
    <div className="flex items-center gap-2 text-xs">
      <kbd className={kbdClasses}>{keys}</kbd>
      <span className="text-editor-text-muted">{desc}</span>
    </div>
  );
}
|
||||
296
frontend/src/components/LicenseDialog.tsx
Normal file
296
frontend/src/components/LicenseDialog.tsx
Normal file
@ -0,0 +1,296 @@
|
||||
import { useState } from 'react';
|
||||
import { useLicenseStore } from '../store/licenseStore';
|
||||
import { Key, Check, X, Loader2, Shield, Clock, AlertTriangle } from 'lucide-react';
|
||||
|
||||
/**
 * License status indicator + activation flow.
 *
 * Renders one of three states based on `status.tag` from the license store:
 *  - 'Licensed': a passive badge showing tier, email, and expiry.
 *  - 'Trial':    a clickable badge with days remaining; opens the activation
 *                dialog.
 *  - otherwise (expired): a persistent banner plus the activation dialog.
 *
 * Activation is a two-step flow: first the key is verified (via the Electron
 * bridge) to fetch the registered email, then the user confirms that email
 * before `activateLicense` is called.
 */
export default function LicenseDialog() {
  const { status, showDialog, setShowDialog, activateLicense } = useLicenseStore();
  const [key, setKey] = useState('');
  const [error, setError] = useState<string | null>(null);
  // True while the confirmed key is being activated (step 2).
  const [activating, setActivating] = useState(false);
  // Email returned by verification; non-null means we're awaiting user confirmation.
  const [confirmedEmail, setConfirmedEmail] = useState<string | null>(null);
  // True while the key is being verified (step 1).
  const [verifying, setVerifying] = useState(false);

  // Two-phase handler: verify key → show email → on second call, activate.
  const handleActivate = async () => {
    if (!key.trim()) return;
    setError(null);

    // If we already verified and the user confirmed, complete activation
    if (confirmedEmail) {
      setActivating(true);
      const ok = await activateLicense(key.trim());
      if (!ok) {
        setError('Invalid license key. Check it was entered correctly.');
      }
      setActivating(false);
      return;
    }

    // Step 1: Verify the key (don't cache yet) to get the email
    setVerifying(true);
    try {
      const payload = await window.electronAPI?.verifyLicense(key.trim());
      if (payload?.customer_email) {
        setConfirmedEmail(payload.customer_email);
      } else {
        setError('Invalid license key. Check it was entered correctly.');
      }
    } catch {
      // Verification failure (IPC error or rejected key) maps to the same message.
      setError('Invalid license key. Check it was entered correctly.');
    }
    setVerifying(false);
  };

  // Resets the dialog back to the key-entry state (cancel on the confirm step).
  const handleDeny = () => {
    setConfirmedEmail(null);
    setKey('');
    setError(null);
  };

  // Formats a Unix timestamp (seconds — note the *1000) as e.g. "Apr 15, 2026".
  const formatDate = (ts: number) => {
    const d = new Date(ts * 1000);
    return d.toLocaleDateString('en-US', { year: 'numeric', month: 'short', day: 'numeric' });
  };

  // License state not loaded yet — render nothing.
  if (!status) return null;

  if (status.tag === 'Licensed') {
    // Passive corner badge; no interaction needed once licensed.
    return (
      <div className="fixed bottom-4 right-4 z-50">
        <div className="flex items-center gap-2 px-3 py-2 rounded-lg bg-editor-surface border border-editor-border shadow-lg text-xs">
          <Shield className="w-3.5 h-3.5 text-editor-success" />
          <span className="text-editor-text-muted">
            {status.license.tier === 'business' ? 'Business' : 'Pro'} — {status.license.customer_email}
          </span>
          <span className="text-editor-text-muted/50">
            expires {formatDate(status.license.expires_at)}
          </span>
        </div>
      </div>
    );
  }

  if (status.tag === 'Trial') {
    // Trial badge opens the activation dialog on click.
    return (
      <>
        <div className="fixed bottom-4 right-4 z-50">
          <button
            onClick={() => setShowDialog(true)}
            className="flex items-center gap-2 px-3 py-2 rounded-lg bg-editor-surface border border-editor-border shadow-lg text-xs hover:bg-editor-bg transition-colors"
          >
            <Clock className="w-3.5 h-3.5 text-editor-accent" />
            <span className="text-editor-text-muted">
              Trial — {status.days_remaining} day{status.days_remaining !== 1 ? 's' : ''} left
            </span>
          </button>
        </div>

        {showDialog && (
          <LicenseActivateDialog
            onClose={() => { setShowDialog(false); handleDeny(); }}
            onActivate={handleActivate}
            onDeny={handleDeny}
            keyValue={key}
            setKeyValue={setKey}
            error={error}
            activating={activating}
            verifying={verifying}
            confirmedEmail={confirmedEmail}
            trialEnding={status.days_remaining <= 3}
          />
        )}
      </>
    );
  }

  // Expired — show banner + activation dialog (both dismissible)
  return (
    <>
      <ExpiredBanner onActivate={() => setShowDialog(true)} />

      {showDialog && (
        <LicenseActivateDialog
          onClose={() => { setShowDialog(false); handleDeny(); }}
          onActivate={handleActivate}
          onDeny={handleDeny}
          keyValue={key}
          setKeyValue={setKey}
          error={error}
          activating={activating}
          verifying={verifying}
          confirmedEmail={confirmedEmail}
          expired
        />
      )}
    </>
  );
}
|
||||
|
||||
/**
 * Persistent top banner shown when trial expired — still allows export and
 * loading. The "Activate license" button opens the activation dialog.
 *
 * Fix: JSX strips whitespace that spans a newline between text and an inline
 * element, so without explicit {' '} separators the banner rendered as
 * "…still work.Activate licenseto restore editing." with the button text glued
 * to the surrounding sentence.
 */
function ExpiredBanner({ onActivate }: { onActivate: () => void }) {
  return (
    <div className="h-9 flex items-center justify-center gap-3 px-4 bg-red-500/15 border-b border-red-500/30 shrink-0">
      <AlertTriangle className="w-3.5 h-3.5 text-red-400 shrink-0" />
      <span className="text-xs text-red-300">
        Trial expired — export and project loading still work.{' '}
        <button onClick={onActivate} className="underline font-medium hover:text-red-200">
          Activate license
        </button>{' '}
        to restore editing.
      </span>
    </div>
  );
}
|
||||
|
||||
/**
 * Modal dialog for entering and confirming a license key.
 *
 * Renders one of two screens, driven by `confirmedEmail`:
 *  1. `confirmedEmail` is null — key entry: a textarea plus a "Verify Key"
 *     button (disabled while processing or when the key is blank). The copy
 *     above the field varies with `expired` / `trialEnding`.
 *  2. `confirmedEmail` is set — confirmation: shows the email the key is
 *     registered to, with Cancel (`onDeny`) and Activate (`onActivate`).
 *
 * @param onClose        - Closes the dialog (X button on the entry screen).
 * @param onActivate     - Verifies the key / finalizes activation.
 * @param onDeny         - Rejects the confirmed email (Cancel on screen 2).
 * @param keyValue       - Controlled value of the license-key textarea.
 * @param setKeyValue    - Setter for `keyValue`.
 * @param error          - Error message displayed under the textarea, if any.
 * @param activating     - True while the activation request is in flight.
 * @param verifying      - True while key verification is in flight.
 * @param confirmedEmail - Email the key resolved to; selects the confirm screen.
 * @param trialEnding    - Optional: show the "trial ends soon" warning copy.
 * @param expired        - Optional: show the "trial expired" copy and title.
 */
function LicenseActivateDialog({
  onClose,
  onActivate,
  onDeny,
  keyValue,
  setKeyValue,
  error,
  activating,
  verifying,
  confirmedEmail,
  trialEnding,
  expired,
}: {
  onClose: () => void;
  onActivate: () => void;
  onDeny: () => void;
  keyValue: string;
  setKeyValue: (v: string) => void;
  error: string | null;
  activating: boolean;
  verifying: boolean;
  confirmedEmail: string | null;
  trialEnding?: boolean;
  expired?: boolean;
}) {
  // Either in-flight operation disables the submit button.
  const isProcessing = activating || verifying;

  // Screen 2: key resolved to an email — ask the user to confirm ownership.
  if (confirmedEmail) {
    return (
      <div className="fixed inset-0 z-[80] flex items-center justify-center bg-black/60 px-4">
        {/* stopPropagation keeps clicks inside the card from reaching the backdrop */}
        <div
          className="w-full max-w-md rounded-xl border border-editor-border bg-editor-bg p-6 space-y-4"
          onClick={(e) => e.stopPropagation()}
        >
          <div className="flex items-center gap-2">
            <Shield className="w-5 h-5 text-editor-accent" />
            <h3 className="text-sm font-semibold">Confirm License</h3>
          </div>

          <div className="p-3 rounded-lg bg-editor-surface border border-editor-border space-y-1">
            <p className="text-xs text-editor-text-muted">
              This license key is registered to:
            </p>
            <p className="text-sm font-medium text-editor-text">{confirmedEmail}</p>
          </div>

          <p className="text-xs text-editor-text-muted leading-relaxed">
            License keys are tied to your email. Sharing this key may result in deactivation.
          </p>

          <div className="flex items-center justify-end gap-2 pt-1">
            <button
              onClick={onDeny}
              className="px-3 py-1.5 rounded-md text-xs text-editor-text-muted hover:text-editor-text hover:bg-editor-surface"
            >
              Cancel
            </button>
            <button
              onClick={onActivate}
              disabled={activating}
              className="px-4 py-2 bg-editor-accent hover:bg-editor-accent-hover disabled:opacity-40 rounded-lg text-sm font-medium transition-colors flex items-center gap-2"
            >
              {activating ? (
                <Loader2 className="w-4 h-4 animate-spin" />
              ) : (
                <Check className="w-4 h-4" />
              )}
              Activate
            </button>
          </div>
        </div>
      </div>
    );
  }

  // Screen 1: license-key entry.
  return (
    <div className="fixed inset-0 z-[80] flex items-center justify-center bg-black/60 px-4">
      <div
        className="w-full max-w-md rounded-xl border border-editor-border bg-editor-bg p-6 space-y-4"
        onClick={(e) => e.stopPropagation()}
      >
        <div className="flex items-center justify-between">
          <div className="flex items-center gap-2">
            <Key className="w-5 h-5 text-editor-accent" />
            <h3 className="text-sm font-semibold">
              {expired ? 'Trial Expired' : 'Activate TalkEdit'}
            </h3>
          </div>
          <button
            onClick={onClose}
            className="p-1 rounded hover:bg-editor-surface text-editor-text-muted"
            title="Close dialog"
          >
            <X className="w-4 h-4" />
          </button>
        </div>

        {/* Exactly one of the three explanatory blocks below renders. */}
        {expired && (
          <div className="text-xs text-editor-text-muted leading-relaxed space-y-1">
            <p className="font-medium text-red-300">Your 30-day trial has ended.</p>
            <p>
              You can still <strong>export videos</strong> and <strong>load projects</strong>.
              Enter a license key to restore editing, AI tools, and all other features.
            </p>
          </div>
        )}

        {trialEnding && !expired && (
          <div className="flex items-start gap-2 p-3 rounded-lg bg-amber-500/10 border border-amber-500/30">
            <AlertTriangle className="w-4 h-4 text-amber-400 shrink-0 mt-0.5" />
            <p className="text-xs text-amber-300">Your trial ends soon. Activate now to keep using all features.</p>
          </div>
        )}

        {!expired && !trialEnding && (
          <p className="text-xs text-editor-text-muted leading-relaxed">
            Enter your license key to activate TalkEdit Pro or Business.
            You received this key by email after purchase.
          </p>
        )}

        <div className="space-y-1.5">
          <label className="text-xs text-editor-text-muted font-medium">License Key</label>
          <textarea
            value={keyValue}
            onChange={(e) => { setKeyValue(e.target.value); }}
            placeholder="talkedit_v1_..."
            rows={3}
            className="w-full px-3 py-2 text-xs font-mono bg-editor-surface border border-editor-border rounded-lg text-editor-text placeholder:text-editor-text-muted/50 focus:outline-none focus:border-editor-accent resize-none"
          />
          {error && <p className="text-xs text-red-400">{error}</p>}
        </div>

        {/* Disabled while a request is in flight or the key field is blank. */}
        <button
          onClick={onActivate}
          disabled={isProcessing || !keyValue.trim()}
          className="w-full flex items-center justify-center gap-2 px-4 py-2.5 bg-editor-accent hover:bg-editor-accent-hover disabled:opacity-40 rounded-lg text-sm font-medium transition-colors"
        >
          {isProcessing ? (
            <Loader2 className="w-4 h-4 animate-spin" />
          ) : (
            <Key className="w-4 h-4" />
          )}
          {verifying ? 'Verifying...' : 'Verify Key'}
        </button>

        <p className="text-[10px] text-editor-text-muted text-center">
          No license? <a href="#" className="text-editor-accent hover:underline">Purchase at talked.it</a>
        </p>
      </div>
    </div>
  );
}
|
||||
171
frontend/src/components/MarkersPanel.tsx
Normal file
171
frontend/src/components/MarkersPanel.tsx
Normal file
@ -0,0 +1,171 @@
|
||||
import { useState } from 'react';
|
||||
import { useEditorStore } from '../store/editorStore';
|
||||
import { MapPin, Trash2, PencilLine, Check, X, Copy } from 'lucide-react';
|
||||
|
||||
const COLOR_NAMES: Record<string, string> = {
|
||||
'#6366f1': 'Indigo',
|
||||
'#ef4444': 'Red',
|
||||
'#22c55e': 'Green',
|
||||
'#f59e0b': 'Amber',
|
||||
'#3b82f6': 'Blue',
|
||||
'#ec4899': 'Pink',
|
||||
'#8b5cf6': 'Purple',
|
||||
'#14b8a6': 'Teal',
|
||||
};
|
||||
|
||||
const COLORS = ['#6366f1', '#ef4444', '#22c55e', '#f59e0b', '#3b82f6', '#ec4899', '#8b5cf6', '#14b8a6'];
|
||||
|
||||
/**
 * Sidebar panel for creating and managing timeline markers.
 *
 * Markers are labeled, colored points on the timeline. `getChapters()`
 * derives chapter entries from them, which can be copied to the clipboard
 * as YouTube-style "MM:SS label" timestamps.
 */
export default function MarkersPanel() {
  const { timelineMarkers, addTimelineMarker, updateTimelineMarker, removeTimelineMarker, getChapters } =
    useEditorStore();
  // Subscribed separately so the placeholder updates as the playhead moves.
  const currentTime = useEditorStore((s) => s.currentTime);
  const [editingId, setEditingId] = useState<string | null>(null); // marker currently in inline-edit mode
  const [editLabel, setEditLabel] = useState('');
  const [newLabel, setNewLabel] = useState('');
  const [newColor, setNewColor] = useState(COLORS[0]);
  const [showChapters, setShowChapters] = useState(false);

  const chapters = getChapters();

  // Create a marker at the playhead; an empty label is passed as undefined so
  // the store can choose its own default label. The input is cleared after.
  const addAtCurrentTime = () => {
    addTimelineMarker(currentTime, newLabel || undefined, newColor);
    setNewLabel('');
  };

  // Enter inline-edit mode for one marker, seeding the edit field.
  const startEdit = (id: string, label: string) => {
    setEditingId(id);
    setEditLabel(label);
  };

  // Save the edited label (blank edits are discarded) and leave edit mode.
  const commitEdit = (id: string) => {
    if (editLabel.trim()) {
      updateTimelineMarker(id, { label: editLabel.trim() });
    }
    setEditingId(null);
  };

  // Copy all chapters as "H:MM:SS label" / "MM:SS label" lines (YouTube
  // chapter format). Clipboard failures are deliberately ignored (best-effort).
  const exportChapters = () => {
    const lines = chapters.map((ch) => {
      const h = Math.floor(ch.startTime / 3600);
      const m = Math.floor((ch.startTime % 3600) / 60);
      const s = Math.floor(ch.startTime % 60);
      const timeStr = `${h > 0 ? `${h}:` : ''}${String(m).padStart(2, '0')}:${String(s).padStart(2, '0')}`;
      return `${timeStr} ${ch.label}`;
    });
    const text = lines.join('\n');
    navigator.clipboard.writeText(text).catch(() => {});
  };

  return (
    <div className="p-4 space-y-4">
      <div className="space-y-1">
        <h3 className="text-sm font-semibold flex items-center gap-1.5">
          <MapPin className="w-4 h-4" />
          Timeline Markers
        </h3>
        <p className="text-xs text-editor-text-muted">
          Drop markers at key points. Markers become YouTube-compatible chapters.
        </p>
      </div>

      {/* Add marker at current time */}
      <div className="space-y-2">
        <div className="flex items-center gap-2">
          <input
            value={newLabel}
            onChange={(e) => setNewLabel(e.target.value)}
            placeholder={`${currentTime.toFixed(2)}s`}
            className="flex-1 px-2 py-1.5 text-xs bg-editor-surface border border-editor-border rounded focus:outline-none focus:border-editor-accent"
          />
          {/* Color swatch picker for the next marker */}
          <div className="flex gap-0.5">
            {COLORS.map((c) => (
              <button
                key={c}
                onClick={() => setNewColor(c)}
                className={`w-4 h-4 rounded-full border ${newColor === c ? 'border-white ring-1 ring-white' : 'border-transparent'}`}
                style={{ backgroundColor: c }}
                title={COLOR_NAMES[c]}
              />
            ))}
          </div>
        </div>
        <button
          onClick={addAtCurrentTime}
          className="w-full flex items-center justify-center gap-1 px-2 py-1.5 text-xs bg-editor-accent/20 text-editor-accent hover:bg-editor-accent/30 rounded"
          title="Add a marker at the current playhead position"
        >
          <MapPin className="w-3 h-3" />
          Add
        </button>
      </div>

      {/* Marker list */}
      {timelineMarkers.length > 0 ? (
        <div className="space-y-1 max-h-60 overflow-y-auto">
          {timelineMarkers.map((m) => (
            <div
              key={m.id}
              className="flex items-center gap-2 px-2 py-1.5 rounded bg-editor-surface border border-editor-border text-xs"
            >
              <div className="w-2.5 h-2.5 rounded-full shrink-0" style={{ backgroundColor: m.color }} />
              <span className="text-[10px] text-editor-text-muted w-14 shrink-0">{m.time.toFixed(2)}s</span>
              {editingId === m.id ? (
                <>
                  {/* Inline edit mode: text input with commit/cancel buttons */}
                  <input
                    value={editLabel}
                    onChange={(e) => setEditLabel(e.target.value)}
                    autoFocus
                    className="flex-1 px-1.5 py-0.5 text-xs bg-editor-bg border border-editor-border rounded focus:outline-none focus:border-editor-accent"
                  />
                  <button onClick={() => commitEdit(m.id)} className="p-0.5 text-editor-success"><Check className="w-3 h-3" /></button>
                  <button onClick={() => setEditingId(null)} className="p-0.5 text-editor-text-muted"><X className="w-3 h-3" /></button>
                </>
              ) : (
                <>
                  {/* Display mode: label with edit and (confirmed) delete actions */}
                  <span className="flex-1 truncate">{m.label}</span>
                  <button onClick={() => startEdit(m.id, m.label)} className="p-0.5 hover:text-editor-accent" title="Edit marker label and color"><PencilLine className="w-3 h-3" /></button>
                  <button onClick={() => { if (window.confirm("Delete this marker?")) removeTimelineMarker(m.id); }} className="p-0.5 hover:text-editor-danger" title="Delete this marker"><Trash2 className="w-3 h-3" /></button>
                </>
              )}
            </div>
          ))}
        </div>
      ) : (
        <div className="p-4 rounded border border-dashed border-editor-border text-center">
          <p className="text-xs text-editor-text-muted">
            No markers yet. Press I and O on the timeline to set mark in/out points, then add a marker here.
          </p>
        </div>
      )}

      {/* Chapters */}
      {chapters.length > 0 && (
        <div className="space-y-2 pt-1 border-t border-editor-border">
          <button
            onClick={() => setShowChapters(!showChapters)}
            className="flex items-center gap-1 text-xs font-medium text-editor-text-muted hover:text-editor-text"
          >
            {showChapters ? '▼' : '▶'} Chapters ({chapters.length})
          </button>
          {showChapters && (
            <div className="space-y-1">
              {chapters.map((ch) => (
                <div key={ch.markerId} className="flex items-center gap-2 text-[10px] text-editor-text-muted">
                  <span className="font-mono">{ch.label}</span>
                </div>
              ))}
              <button
                onClick={exportChapters}
                className="flex items-center gap-1 text-[10px] text-editor-accent hover:underline"
                title="Copy chapter timestamps to clipboard in YouTube format"
              >
                <Copy className="w-2.5 h-2.5" />
                Copy as YouTube timestamps
              </button>
            </div>
          )}
        </div>
      )}
    </div>
  );
}
|
||||
469
frontend/src/components/SettingsPanel.tsx
Normal file
469
frontend/src/components/SettingsPanel.tsx
Normal file
@ -0,0 +1,469 @@
|
||||
import { useAIStore } from '../store/aiStore';
|
||||
import { useState, useEffect, useCallback } from 'react';
|
||||
import type { AIProvider, KeyBinding, HotkeyPreset } from '../types/project';
|
||||
import { useEditorStore } from '../store/editorStore';
|
||||
import { Bot, Cloud, Brain, RefreshCw, Keyboard, Trash2, HardDrive } from 'lucide-react';
|
||||
import { loadBindings, saveBindings, applyPreset as applyKeyPreset, DEFAULT_PRESETS, detectConflicts as detectKeyConflicts } from '../lib/keybindings';
|
||||
|
||||
export default function SettingsPanel() {
|
||||
const { providers, defaultProvider, setProviderConfig, setDefaultProvider } = useAIStore();
|
||||
const { backendUrl, zonePreviewPaddingSeconds, setZonePreviewPaddingSeconds } = useEditorStore();
|
||||
const CONFIDENCE_THRESHOLD_KEY = 'talkedit:confidenceThreshold';
|
||||
const [confidenceThreshold, setConfidenceThresholdState] = useState(() => {
|
||||
const stored = typeof window !== 'undefined' ? Number(window.localStorage.getItem(CONFIDENCE_THRESHOLD_KEY)) : 0;
|
||||
return Number.isFinite(stored) ? stored : 0.6;
|
||||
});
|
||||
const setConfidenceThreshold = (value: number) => {
|
||||
const clamped = Math.max(0, Math.min(1, value));
|
||||
setConfidenceThresholdState(clamped);
|
||||
if (typeof window !== 'undefined') {
|
||||
window.localStorage.setItem(CONFIDENCE_THRESHOLD_KEY, String(clamped));
|
||||
}
|
||||
};
|
||||
// Keyboard shortcuts state
|
||||
const [bindings, setBindings] = useState<KeyBinding[]>(() => {
|
||||
try { return loadBindings(); } catch { return DEFAULT_PRESETS['standard']; }
|
||||
});
|
||||
const [editingKey, setEditingKey] = useState<string | null>(null);
|
||||
const [editKeyValue, setEditKeyValue] = useState('');
|
||||
const conflicts = detectKeyConflicts(bindings);
|
||||
|
||||
const persistBindings = (newB: KeyBinding[]) => {
|
||||
saveBindings(newB);
|
||||
setBindings(newB);
|
||||
};
|
||||
|
||||
const applyPresetAction = (preset: HotkeyPreset) => {
|
||||
persistBindings(applyKeyPreset(preset));
|
||||
};
|
||||
|
||||
const startKeyEdit = (idx: number) => {
|
||||
setEditingKey(bindings[idx].id);
|
||||
setEditKeyValue(bindings[idx].keys);
|
||||
};
|
||||
|
||||
const handleKeyCapture = (e: React.KeyboardEvent, idx: number) => {
|
||||
e.preventDefault();
|
||||
const parts: string[] = [];
|
||||
if (e.ctrlKey || e.metaKey) parts.push('Ctrl');
|
||||
if (e.shiftKey) parts.push('Shift');
|
||||
if (e.altKey) parts.push('Alt');
|
||||
const key = e.key === ' ' ? 'Space' : e.key.length === 1 ? e.key.toUpperCase() : e.key;
|
||||
if (!['Control', 'Shift', 'Alt', 'Meta'].includes(key)) parts.push(key);
|
||||
if (parts.length === 0) return;
|
||||
const combo = parts.join('+');
|
||||
const newBindings = bindings.map((b, i) => (i === idx ? { ...b, keys: combo } : b));
|
||||
setEditKeyValue(combo);
|
||||
setEditingKey(null);
|
||||
persistBindings(newBindings);
|
||||
};
|
||||
|
||||
const handleReset = (idx: number) => {
|
||||
const standard = DEFAULT_PRESETS['standard'];
|
||||
const existing = standard.find((b: KeyBinding) => b.id === bindings[idx].id);
|
||||
if (!existing) return;
|
||||
persistBindings(bindings.map((b, i) => (i === idx ? { ...existing } : b)));
|
||||
};
|
||||
|
||||
const [models, setModels] = useState<ModelInfo[]>([]);
|
||||
const [loadingModels, setLoadingModels] = useState(false);
|
||||
const [deleting, setDeleting] = useState<string | null>(null);
|
||||
|
||||
const fetchModels = useCallback(async () => {
|
||||
setLoadingModels(true);
|
||||
try {
|
||||
const list = await window.electronAPI.listModels();
|
||||
setModels(list);
|
||||
} catch {
|
||||
setModels([]);
|
||||
} finally {
|
||||
setLoadingModels(false);
|
||||
}
|
||||
}, []);
|
||||
|
||||
useEffect(() => {
|
||||
fetchModels();
|
||||
}, [fetchModels]);
|
||||
|
||||
const handleDeleteModel = useCallback(async (model: ModelInfo) => {
|
||||
if (deleting) return;
|
||||
setDeleting(model.path);
|
||||
try {
|
||||
await window.electronAPI.deleteModel(model.path);
|
||||
setModels((prev) => prev.filter((m) => m.path !== model.path));
|
||||
} catch {
|
||||
// Model deletion failed silently
|
||||
} finally {
|
||||
setDeleting(null);
|
||||
}
|
||||
}, [deleting]);
|
||||
|
||||
const formatBytes = (bytes: number) => {
|
||||
if (bytes < 1024) return `${bytes} B`;
|
||||
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
|
||||
if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
|
||||
return `${(bytes / (1024 * 1024 * 1024)).toFixed(1)} GB`;
|
||||
};
|
||||
|
||||
const [ollamaModels, setOllamaModels] = useState<string[]>([]);
|
||||
const [loadingOllamaModels, setLoadingOllamaModels] = useState(false);
|
||||
|
||||
const fetchOllamaModels = useCallback(async () => {
|
||||
setLoadingOllamaModels(true);
|
||||
try {
|
||||
const res = await fetch(`${backendUrl}/ai/ollama-models`);
|
||||
if (res.ok) {
|
||||
const data = await res.json();
|
||||
setOllamaModels(data.models || []);
|
||||
}
|
||||
} catch {
|
||||
setOllamaModels([]);
|
||||
} finally {
|
||||
setLoadingOllamaModels(false);
|
||||
}
|
||||
}, [backendUrl]);
|
||||
|
||||
useEffect(() => {
|
||||
fetchOllamaModels();
|
||||
}, [fetchOllamaModels]);
|
||||
|
||||
const providerIcons: Record<AIProvider, React.ReactNode> = {
|
||||
ollama: <Bot className="w-4 h-4" />,
|
||||
openai: <Cloud className="w-4 h-4" />,
|
||||
claude: <Brain className="w-4 h-4" />,
|
||||
};
|
||||
|
||||
return (
|
||||
<div className="p-4 space-y-6">
|
||||
<h3 className="text-sm font-semibold">Settings</h3>
|
||||
|
||||
<ProviderSection title="Playback" icon={<RefreshCw className="w-4 h-4" />}>
|
||||
<div className="space-y-1">
|
||||
<label className="text-xs text-editor-text-muted">Zone preview padding (seconds before and after)</label>
|
||||
<div className="flex items-center gap-2">
|
||||
<input
|
||||
type="range"
|
||||
min={0}
|
||||
max={10}
|
||||
step={0.25}
|
||||
value={zonePreviewPaddingSeconds}
|
||||
onChange={(e) => setZonePreviewPaddingSeconds(Number(e.target.value) || 0)}
|
||||
className="flex-1 h-1.5"
|
||||
title="Extra time in seconds to show before and after each zone during preview"
|
||||
/>
|
||||
<input
|
||||
type="number"
|
||||
min={0}
|
||||
max={10}
|
||||
step={0.25}
|
||||
value={zonePreviewPaddingSeconds}
|
||||
onChange={(e) => setZonePreviewPaddingSeconds(Number(e.target.value) || 0)}
|
||||
className="w-16 px-2 py-1 bg-editor-bg border border-editor-border rounded-lg text-xs text-editor-text focus:outline-none focus:border-editor-accent"
|
||||
title="Extra time in seconds to show before and after each zone during preview"
|
||||
/>
|
||||
<span className="text-xs text-editor-text-muted w-6">s</span>
|
||||
</div>
|
||||
</div>
|
||||
</ProviderSection>
|
||||
|
||||
{/* Confidence threshold */}
|
||||
<div className="space-y-2">
|
||||
<label className="text-xs text-editor-text-muted font-medium">Low-Confidence Word Threshold</label>
|
||||
<p className="text-[10px] text-editor-text-muted leading-relaxed">
|
||||
Words with confidence below this value are highlighted with an orange dotted underline.
|
||||
Whisper often gets homophones and proper nouns wrong at low confidence.
|
||||
</p>
|
||||
<div className="flex items-center gap-2">
|
||||
<input
|
||||
type="range"
|
||||
min={0}
|
||||
max={1}
|
||||
step={0.05}
|
||||
value={confidenceThreshold}
|
||||
onChange={(e) => setConfidenceThreshold(Number(e.target.value))}
|
||||
className="flex-1 h-1.5"
|
||||
title="Words below this confidence get an orange underline — lower values show fewer warnings"
|
||||
/>
|
||||
<input
|
||||
type="number"
|
||||
min={0}
|
||||
max={1}
|
||||
step={0.05}
|
||||
value={confidenceThreshold}
|
||||
onChange={(e) => setConfidenceThreshold(Math.max(0, Math.min(1, Number(e.target.value) || 0)))}
|
||||
className="w-16 px-2 py-1 bg-editor-bg border border-editor-border rounded-lg text-xs text-editor-text focus:outline-none focus:border-editor-accent"
|
||||
title="Words below this confidence get an orange underline — lower values show fewer warnings"
|
||||
/>
|
||||
</div>
|
||||
<div className="flex items-center justify-between text-[10px]">
|
||||
<span className="text-editor-text-muted">Show all</span>
|
||||
<span className="font-medium text-editor-text">{confidenceThreshold.toFixed(2)}</span>
|
||||
<span className="text-editor-text-muted">Strict</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Keyboard shortcuts */}
|
||||
<div className="space-y-2 pt-1 border-t border-editor-border">
|
||||
<h4 className="text-xs font-semibold flex items-center gap-1.5">
|
||||
<Keyboard className="w-3.5 h-3.5" />
|
||||
Keyboard Shortcuts
|
||||
</h4>
|
||||
<div className="flex items-center gap-2">
|
||||
<button
|
||||
onClick={() => applyPresetAction('standard')}
|
||||
className="flex-1 px-2 py-1.5 text-xs rounded bg-editor-accent/20 text-editor-accent hover:bg-editor-accent/30"
|
||||
title="Reset all shortcuts to the Standard preset"
|
||||
>
|
||||
Standard Preset
|
||||
</button>
|
||||
<button
|
||||
onClick={() => applyPresetAction('left-hand')}
|
||||
className="flex-1 px-2 py-1.5 text-xs rounded bg-editor-accent/20 text-editor-accent hover:bg-editor-accent/30"
|
||||
title="Reset all shortcuts to the Left-Hand preset"
|
||||
>
|
||||
Left-Hand Preset
|
||||
</button>
|
||||
</div>
|
||||
{conflicts.length > 0 && (
|
||||
<div className="px-2 py-1 rounded border border-red-500/40 bg-red-500/10 text-[10px] text-red-300">
|
||||
⚠️ {conflicts.join('; ')}
|
||||
</div>
|
||||
)}
|
||||
<div className="max-h-52 overflow-y-auto space-y-1 pr-1">
|
||||
{bindings.map((b, i) => (
|
||||
<div key={b.id} className="flex items-center gap-2 text-[11px]">
|
||||
<span className="flex-1 truncate text-editor-text-muted">{b.label}</span>
|
||||
<input
|
||||
value={editingKey === b.id ? editKeyValue : b.keys}
|
||||
onFocus={() => startKeyEdit(i)}
|
||||
onChange={(e) => {
|
||||
setEditingKey(b.id);
|
||||
setEditKeyValue(e.target.value);
|
||||
}}
|
||||
onKeyDown={(e) => handleKeyCapture(e, i)}
|
||||
className="w-28 px-2 py-1 text-[10px] font-mono bg-editor-bg border border-editor-border rounded text-center focus:outline-none focus:border-editor-accent"
|
||||
placeholder="Type shortcut"
|
||||
title="Click then press the desired key combination"
|
||||
/>
|
||||
<button
|
||||
onClick={() => handleReset(i)}
|
||||
className="text-[10px] text-editor-text-muted hover:text-editor-text px-1"
|
||||
title="Reset this shortcut to default"
|
||||
>
|
||||
↺
|
||||
</button>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
<p className="text-[10px] text-editor-text-muted">
|
||||
Press <kbd>?</kbd> anytime to view shortcuts. Changes apply immediately.
|
||||
</p>
|
||||
</div>
|
||||
|
||||
{/* Default provider selector */}
|
||||
<div className="space-y-2">
|
||||
<label className="text-xs text-editor-text-muted font-medium">Default AI Provider</label>
|
||||
<div className="grid grid-cols-3 gap-1.5">
|
||||
{(['ollama', 'openai', 'claude'] as AIProvider[]).map((p) => (
|
||||
<button
|
||||
key={p}
|
||||
onClick={() => setDefaultProvider(p)}
|
||||
title={`Use ${p.charAt(0).toUpperCase() + p.slice(1)} for AI features — ${
|
||||
p === 'ollama' ? 'Use a local Ollama instance' :
|
||||
p === 'openai' ? "Use OpenAI's API (requires API key)" :
|
||||
"Use Anthropic's Claude API (requires API key)"
|
||||
}`}
|
||||
className={`flex flex-col items-center gap-1 p-2 rounded-lg border transition-colors text-[10px] ${
|
||||
defaultProvider === p
|
||||
? 'border-editor-accent bg-editor-accent/10 text-editor-accent'
|
||||
: 'border-editor-border text-editor-text-muted hover:text-editor-text'
|
||||
}`}
|
||||
>
|
||||
{providerIcons[p]}
|
||||
{p.charAt(0).toUpperCase() + p.slice(1)}
|
||||
</button>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Manage downloaded models */}
|
||||
<div className="space-y-2 pt-1 border-t border-editor-border">
|
||||
<h4 className="text-xs font-semibold flex items-center gap-1.5">
|
||||
<HardDrive className="w-3.5 h-3.5" />
|
||||
Manage Models
|
||||
</h4>
|
||||
<p className="text-[10px] text-editor-text-muted leading-relaxed">
|
||||
Downloaded Whisper transcription models and bundled LLM files.
|
||||
</p>
|
||||
{models.length === 0 ? (
|
||||
<p className="text-xs text-editor-text-muted">No downloaded models found.</p>
|
||||
) : (
|
||||
<div className="space-y-1.5">
|
||||
{models.map((m) => (
|
||||
<div key={m.path} className="flex items-center gap-2 p-2 rounded bg-editor-bg border border-editor-border">
|
||||
<div className="flex-1 min-w-0">
|
||||
<p className="text-xs text-editor-text truncate">{m.name}</p>
|
||||
<p className="text-[10px] text-editor-text-muted">
|
||||
{formatBytes(m.size_bytes)} · {m.kind === 'whisper' ? 'Whisper' : 'LLM'}
|
||||
</p>
|
||||
</div>
|
||||
<button
|
||||
onClick={() => handleDeleteModel(m)}
|
||||
disabled={deleting === m.path}
|
||||
className="p-1.5 rounded text-editor-text-muted hover:text-red-400 hover:bg-red-500/10 transition-colors disabled:opacity-40"
|
||||
title="Delete model"
|
||||
>
|
||||
<Trash2 className="w-3.5 h-3.5" />
|
||||
</button>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
<button
|
||||
onClick={fetchModels}
|
||||
disabled={loadingModels}
|
||||
className="text-[10px] text-editor-accent hover:underline flex items-center gap-0.5"
|
||||
title="Refresh list of downloaded models"
|
||||
>
|
||||
<RefreshCw className={`w-2.5 h-2.5 ${loadingModels ? 'animate-spin' : ''}`} />
|
||||
Refresh
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<h4 className="text-xs font-semibold uppercase tracking-wide text-editor-text-muted">AI Settings</h4>
|
||||
|
||||
{/* Ollama settings */}
|
||||
<ProviderSection title="Ollama (Local)" icon={providerIcons.ollama}>
|
||||
<InputField
|
||||
label="Base URL"
|
||||
value={providers.ollama.baseUrl || ''}
|
||||
onChange={(v) => setProviderConfig('ollama', { baseUrl: v })}
|
||||
placeholder="http://localhost:11434"
|
||||
title="URL of your Ollama instance — http://localhost:11434 by default"
|
||||
/>
|
||||
<div className="space-y-1">
|
||||
<div className="flex items-center justify-between">
|
||||
<label className="text-xs text-editor-text-muted">Model</label>
|
||||
<button
|
||||
onClick={fetchOllamaModels}
|
||||
disabled={loadingOllamaModels}
|
||||
className="text-[10px] text-editor-accent hover:underline flex items-center gap-0.5"
|
||||
title="Refresh available Ollama models"
|
||||
>
|
||||
<RefreshCw className={`w-2.5 h-2.5 ${loadingOllamaModels ? 'animate-spin' : ''}`} />
|
||||
Refresh
|
||||
</button>
|
||||
</div>
|
||||
{ollamaModels.length > 0 ? (
|
||||
<select
|
||||
value={providers.ollama.model}
|
||||
onChange={(e) => setProviderConfig('ollama', { model: e.target.value })}
|
||||
className="w-full px-3 py-2 bg-editor-surface border border-editor-border rounded-lg text-xs text-white focus:outline-none focus:border-editor-accent"
|
||||
title="Which Ollama model to use for AI features"
|
||||
>
|
||||
{ollamaModels.map((m) => (
|
||||
<option key={m} value={m}>{m}</option>
|
||||
))}
|
||||
</select>
|
||||
) : (
|
||||
<InputField
|
||||
label=""
|
||||
value={providers.ollama.model}
|
||||
onChange={(v) => setProviderConfig('ollama', { model: v })}
|
||||
placeholder="llama3"
|
||||
title="Which Ollama model to use for AI features"
|
||||
/>
|
||||
)}
|
||||
</div>
|
||||
</ProviderSection>
|
||||
|
||||
{/* OpenAI settings */}
|
||||
<ProviderSection title="OpenAI" icon={providerIcons.openai}>
|
||||
<InputField
|
||||
label="API Key"
|
||||
value={providers.openai.apiKey || ''}
|
||||
onChange={(v) => setProviderConfig('openai', { apiKey: v })}
|
||||
placeholder="sk-..."
|
||||
type="password"
|
||||
title="Your OpenAI API key — stored encrypted on your machine"
|
||||
/>
|
||||
<InputField
|
||||
label="Model"
|
||||
value={providers.openai.model}
|
||||
onChange={(v) => setProviderConfig('openai', { model: v })}
|
||||
placeholder="gpt-4o"
|
||||
title="OpenAI model to use (e.g. gpt-4o, gpt-4o-mini)"
|
||||
/>
|
||||
</ProviderSection>
|
||||
|
||||
{/* Claude settings */}
|
||||
<ProviderSection title="Claude (Anthropic)" icon={providerIcons.claude}>
|
||||
<InputField
|
||||
label="API Key"
|
||||
value={providers.claude.apiKey || ''}
|
||||
onChange={(v) => setProviderConfig('claude', { apiKey: v })}
|
||||
placeholder="sk-ant-..."
|
||||
type="password"
|
||||
title="Your Anthropic Claude API key — stored encrypted on your machine"
|
||||
/>
|
||||
<InputField
|
||||
label="Model"
|
||||
value={providers.claude.model}
|
||||
onChange={(v) => setProviderConfig('claude', { model: v })}
|
||||
placeholder="claude-sonnet-4-20250514"
|
||||
title="Claude model to use (e.g. claude-sonnet-4-20250514)"
|
||||
/>
|
||||
</ProviderSection>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Card-style settings section: an icon + title header above its children,
 * rendered on the standard surface background.
 */
function ProviderSection(props: {
  title: string;
  icon: React.ReactNode;
  children: React.ReactNode;
}) {
  const { title, icon, children } = props;

  const header = (
    <div className="flex items-center gap-2 text-xs font-medium">
      {icon}
      {title}
    </div>
  );

  return (
    <div className="space-y-3 p-3 bg-editor-surface rounded-lg">
      {header}
      <div className="space-y-2">{children}</div>
    </div>
  );
}
|
||||
|
||||
function InputField({
|
||||
label,
|
||||
value,
|
||||
onChange,
|
||||
placeholder,
|
||||
type = 'text',
|
||||
title,
|
||||
}: {
|
||||
label: string;
|
||||
value: string;
|
||||
onChange: (value: string) => void;
|
||||
placeholder: string;
|
||||
type?: string;
|
||||
title?: string;
|
||||
}) {
|
||||
return (
|
||||
<div className="space-y-1">
|
||||
{label && <label className="text-xs text-editor-text-muted">{label}</label>}
|
||||
<input
|
||||
type={type}
|
||||
value={value}
|
||||
onChange={(e) => onChange(e.target.value)}
|
||||
placeholder={placeholder}
|
||||
title={title}
|
||||
className="w-full px-3 py-2 bg-editor-bg border border-editor-border rounded-lg text-xs text-editor-text placeholder:text-editor-text-muted/50 focus:outline-none focus:border-editor-accent"
|
||||
/>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
295
frontend/src/components/SilenceTrimmerPanel.tsx
Normal file
295
frontend/src/components/SilenceTrimmerPanel.tsx
Normal file
@ -0,0 +1,295 @@
|
||||
import { useState } from 'react';
|
||||
import { useEditorStore } from '../store/editorStore';
|
||||
import { Loader2, Scissors, Trash2, RotateCcw, PencilLine, Layers } from 'lucide-react';
|
||||
import type { SilenceDetectionRange, SilenceTrimSettings } from '../types/project';
|
||||
|
||||
export default function SilenceTrimmerPanel() {
|
||||
const {
|
||||
videoPath,
|
||||
backendUrl,
|
||||
silenceTrimGroups,
|
||||
cutRanges,
|
||||
applySilenceTrimGroup,
|
||||
removeSilenceTrimGroup,
|
||||
} = useEditorStore();
|
||||
const [minSilenceMs, setMinSilenceMs] = useState(500);
|
||||
const [silenceDb, setSilenceDb] = useState(-35);
|
||||
const [preBufferMs, setPreBufferMs] = useState(80);
|
||||
const [postBufferMs, setPostBufferMs] = useState(120);
|
||||
const [isDetecting, setIsDetecting] = useState(false);
|
||||
const [ranges, setRanges] = useState<SilenceDetectionRange[]>([]);
|
||||
const [selectedGroupId, setSelectedGroupId] = useState<string | null>(null);
|
||||
const [status, setStatus] = useState<string | null>(null);
|
||||
|
||||
const selectedGroup = selectedGroupId
|
||||
? silenceTrimGroups.find((group) => group.id === selectedGroupId) ?? null
|
||||
: null;
|
||||
|
||||
const buildSettings = (): SilenceTrimSettings => ({
|
||||
minSilenceMs,
|
||||
silenceDb,
|
||||
preBufferMs,
|
||||
postBufferMs,
|
||||
});
|
||||
|
||||
const detectSilence = async () => {
|
||||
if (!videoPath) return;
|
||||
setIsDetecting(true);
|
||||
setRanges([]);
|
||||
try {
|
||||
const res = await fetch(`${backendUrl}/audio/detect-silence`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
input_path: videoPath,
|
||||
min_silence_ms: minSilenceMs,
|
||||
silence_db: silenceDb,
|
||||
}),
|
||||
});
|
||||
|
||||
if (!res.ok) {
|
||||
let detail = `HTTP ${res.status} ${res.statusText}`;
|
||||
try {
|
||||
const err = await res.json();
|
||||
if (err?.detail) detail += ` - ${String(err.detail)}`;
|
||||
} catch {
|
||||
// ignore JSON parse errors for non-JSON error responses
|
||||
}
|
||||
if (res.status === 404) {
|
||||
detail += ' (endpoint missing: restart backend to load /audio/detect-silence)';
|
||||
}
|
||||
throw new Error(detail);
|
||||
}
|
||||
|
||||
const data = await res.json();
|
||||
setRanges(data.ranges || []);
|
||||
setStatus(`Detected ${(data.ranges || []).length} pause ranges.`);
|
||||
} catch (err) {
|
||||
console.error(err);
|
||||
const message = err instanceof Error ? err.message : 'Unknown error';
|
||||
alert(`Silence detection failed: ${message}`);
|
||||
} finally {
|
||||
setIsDetecting(false);
|
||||
}
|
||||
};
|
||||
|
||||
const applyAsNewGroup = () => {
|
||||
if (ranges.length === 0) return;
|
||||
const result = applySilenceTrimGroup({
|
||||
sourceRanges: ranges,
|
||||
settings: buildSettings(),
|
||||
});
|
||||
setSelectedGroupId(result.groupId);
|
||||
setStatus(`Applied ${result.appliedCount} cut ranges as ${result.groupId}. Undo will revert this pass in one step.`);
|
||||
};
|
||||
|
||||
const loadGroupForEditing = (groupId: string) => {
|
||||
const group = silenceTrimGroups.find((entry) => entry.id === groupId);
|
||||
if (!group) return;
|
||||
setSelectedGroupId(groupId);
|
||||
setRanges(group.sourceRanges);
|
||||
setMinSilenceMs(group.settings.minSilenceMs);
|
||||
setSilenceDb(group.settings.silenceDb);
|
||||
setPreBufferMs(group.settings.preBufferMs);
|
||||
setPostBufferMs(group.settings.postBufferMs);
|
||||
setStatus(`Loaded ${group.id} for editing. Adjust settings and reapply.`);
|
||||
};
|
||||
|
||||
const reapplySelectedGroup = () => {
|
||||
if (!selectedGroupId || ranges.length === 0) return;
|
||||
const result = applySilenceTrimGroup({
|
||||
groupId: selectedGroupId,
|
||||
sourceRanges: ranges,
|
||||
settings: buildSettings(),
|
||||
});
|
||||
setStatus(`Reapplied ${result.groupId} with ${result.appliedCount} cut ranges.`);
|
||||
};
|
||||
|
||||
const removeGroup = (groupId: string) => {
|
||||
removeSilenceTrimGroup(groupId);
|
||||
if (selectedGroupId === groupId) {
|
||||
setSelectedGroupId(null);
|
||||
}
|
||||
setStatus(`Removed all cut ranges from ${groupId}.`);
|
||||
};
|
||||
|
||||
return (
|
||||
<div className="p-4 space-y-4">
|
||||
<div className="space-y-1">
|
||||
<h3 className="text-sm font-semibold">Silence / Pause Trimmer</h3>
|
||||
<p className="text-xs text-editor-text-muted">
|
||||
Detect pauses and convert them into cut ranges.
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div className="space-y-3">
|
||||
<div className="space-y-1.5">
|
||||
<label className="text-[11px] text-editor-text-muted font-medium">
|
||||
Minimum pause length (ms)
|
||||
</label>
|
||||
<input
|
||||
type="number"
|
||||
min={100}
|
||||
step={50}
|
||||
value={minSilenceMs}
|
||||
onChange={(e) => setMinSilenceMs(Number(e.target.value) || 500)}
|
||||
className="w-full px-2.5 py-1.5 text-xs bg-editor-surface border border-editor-border rounded focus:border-editor-accent focus:outline-none"
|
||||
title="Minimum duration of silence to detect in milliseconds"
|
||||
/>
|
||||
</div>
|
||||
|
||||
<div className="space-y-1.5">
|
||||
<label className="text-[11px] text-editor-text-muted font-medium">
|
||||
Silence threshold (dB)
|
||||
</label>
|
||||
<input
|
||||
type="number"
|
||||
min={-80}
|
||||
max={0}
|
||||
step={1}
|
||||
value={silenceDb}
|
||||
onChange={(e) => setSilenceDb(Number(e.target.value) || -35)}
|
||||
className="w-full px-2.5 py-1.5 text-xs bg-editor-surface border border-editor-border rounded focus:border-editor-accent focus:outline-none"
|
||||
title="Volume threshold in dB — lower values detect quieter sounds as silence"
|
||||
/>
|
||||
</div>
|
||||
|
||||
<div className="grid grid-cols-2 gap-2">
|
||||
<div className="space-y-1.5">
|
||||
<label className="text-[11px] text-editor-text-muted font-medium">
|
||||
Buffer before (ms, +shrink / -expand)
|
||||
</label>
|
||||
<input
|
||||
type="number"
|
||||
min={-5000}
|
||||
max={5000}
|
||||
step={10}
|
||||
value={preBufferMs}
|
||||
onChange={(e) => setPreBufferMs(Number(e.target.value) || 0)}
|
||||
className="w-full px-2.5 py-1.5 text-xs bg-editor-surface border border-editor-border rounded focus:border-editor-accent focus:outline-none"
|
||||
title="Extra time to add before each detected silence"
|
||||
/>
|
||||
</div>
|
||||
<div className="space-y-1.5">
|
||||
<label className="text-[11px] text-editor-text-muted font-medium">
|
||||
Buffer after (ms, +shrink / -expand)
|
||||
</label>
|
||||
<input
|
||||
type="number"
|
||||
min={-5000}
|
||||
max={5000}
|
||||
step={10}
|
||||
value={postBufferMs}
|
||||
onChange={(e) => setPostBufferMs(Number(e.target.value) || 0)}
|
||||
className="w-full px-2.5 py-1.5 text-xs bg-editor-surface border border-editor-border rounded focus:border-editor-accent focus:outline-none"
|
||||
title="Extra time to add after each detected silence"
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<button
|
||||
onClick={detectSilence}
|
||||
disabled={isDetecting || !videoPath}
|
||||
className="w-full flex items-center justify-center gap-2 px-4 py-2.5 bg-editor-accent hover:bg-editor-accent-hover disabled:opacity-40 rounded-lg text-sm font-medium transition-colors"
|
||||
title="Scan the entire audio track for silent pauses"
|
||||
>
|
||||
{isDetecting ? (
|
||||
<>
|
||||
<Loader2 className="w-4 h-4 animate-spin" />
|
||||
Detecting pauses...
|
||||
</>
|
||||
) : (
|
||||
'Detect Pauses'
|
||||
)}
|
||||
</button>
|
||||
</div>
|
||||
|
||||
{status && (
|
||||
<div className="text-[11px] text-editor-text-muted bg-editor-surface border border-editor-border rounded px-2.5 py-2">
|
||||
{status}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{ranges.length > 0 && (
|
||||
<div className="space-y-2">
|
||||
<div className="flex items-center justify-between">
|
||||
<span className="text-xs font-medium">Detected {ranges.length} pause ranges</span>
|
||||
<div className="flex items-center gap-1">
|
||||
{selectedGroup && (
|
||||
<button
|
||||
onClick={reapplySelectedGroup}
|
||||
className="flex items-center gap-1 px-2 py-1 text-xs bg-editor-warning/20 text-editor-warning rounded hover:bg-editor-warning/30"
|
||||
title="Re-apply this silence trim group with current settings"
|
||||
>
|
||||
<RotateCcw className="w-3 h-3" />
|
||||
Reapply Group
|
||||
</button>
|
||||
)}
|
||||
<button
|
||||
onClick={applyAsNewGroup}
|
||||
className="flex items-center gap-1 px-2 py-1 text-xs bg-editor-accent/20 text-editor-accent rounded hover:bg-editor-accent/30"
|
||||
title="Create a new silence trim group from detected pauses"
|
||||
>
|
||||
<Scissors className="w-3 h-3" />
|
||||
Apply As New Group
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
<div className="max-h-56 overflow-y-auto space-y-1 pr-1">
|
||||
{ranges.slice(0, 50).map((r, i) => (
|
||||
<div key={`${r.start}-${r.end}-${i}`} className="px-2 py-1.5 rounded bg-editor-surface border border-editor-border text-xs">
|
||||
{r.start.toFixed(2)}s - {r.end.toFixed(2)}s ({r.duration.toFixed(2)}s)
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{silenceTrimGroups.length > 0 && (
|
||||
<div className="space-y-2 pt-1">
|
||||
<div className="text-xs font-medium flex items-center gap-1">
|
||||
<Layers className="w-3 h-3" />
|
||||
Silence Trim Groups
|
||||
</div>
|
||||
<div className="max-h-48 overflow-y-auto space-y-1 pr-1">
|
||||
{silenceTrimGroups.map((group) => {
|
||||
const groupCutCount = cutRanges.filter((range) => range.trimGroupId === group.id).length;
|
||||
const isActive = selectedGroupId === group.id;
|
||||
return (
|
||||
<div
|
||||
key={group.id}
|
||||
className={`rounded border px-2 py-1.5 text-xs ${isActive ? 'border-editor-accent bg-editor-accent/10' : 'border-editor-border bg-editor-surface'}`}
|
||||
>
|
||||
<div className="flex items-center justify-between gap-2">
|
||||
<div className="min-w-0">
|
||||
<div className="font-medium truncate">{group.id}</div>
|
||||
<div className="text-[10px] text-editor-text-muted">
|
||||
{groupCutCount} cuts · {group.sourceRanges.length} source pauses
|
||||
</div>
|
||||
</div>
|
||||
<div className="flex items-center gap-1 shrink-0">
|
||||
<button
|
||||
onClick={() => loadGroupForEditing(group.id)}
|
||||
className="px-1.5 py-1 rounded hover:bg-editor-accent/20 text-editor-accent"
|
||||
title="Edit and reapply this group"
|
||||
>
|
||||
<PencilLine className="w-3 h-3" />
|
||||
</button>
|
||||
<button
|
||||
onClick={() => removeGroup(group.id)}
|
||||
className="px-1.5 py-1 rounded hover:bg-editor-danger/20 text-editor-danger"
|
||||
title="Delete all cuts from this group"
|
||||
>
|
||||
<Trash2 className="w-3 h-3" />
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
631
frontend/src/components/TranscriptEditor.tsx
Normal file
631
frontend/src/components/TranscriptEditor.tsx
Normal file
@ -0,0 +1,631 @@
|
||||
import { useCallback, useRef, useEffect, useMemo, useState } from 'react';
|
||||
import { useEditorStore } from '../store/editorStore';
|
||||
import { useLicenseStore } from '../store/licenseStore';
|
||||
import { Virtuoso } from 'react-virtuoso';
|
||||
import { Scissors, VolumeX, SlidersHorizontal, Gauge, RotateCcw, Search, ChevronUp, ChevronDown, X, RefreshCw } from 'lucide-react';
|
||||
import { assert } from '../lib/assert';
|
||||
|
||||
interface TranscriptEditorProps {
|
||||
cutMode: boolean;
|
||||
muteMode: boolean;
|
||||
gainMode: boolean;
|
||||
gainModeDb: number;
|
||||
speedMode: boolean;
|
||||
speedModeValue: number;
|
||||
}
|
||||
|
||||
export default function TranscriptEditor({
|
||||
cutMode,
|
||||
muteMode,
|
||||
gainMode,
|
||||
gainModeDb,
|
||||
speedMode,
|
||||
speedModeValue,
|
||||
}: TranscriptEditorProps) {
|
||||
const words = useEditorStore((s) => s.words);
|
||||
const segments = useEditorStore((s) => s.segments);
|
||||
const cutRanges = useEditorStore((s) => s.cutRanges);
|
||||
const muteRanges = useEditorStore((s) => s.muteRanges);
|
||||
const gainRanges = useEditorStore((s) => s.gainRanges);
|
||||
const speedRanges = useEditorStore((s) => s.speedRanges);
|
||||
const selectedWordIndices = useEditorStore((s) => s.selectedWordIndices);
|
||||
const hoveredWordIndex = useEditorStore((s) => s.hoveredWordIndex);
|
||||
const setSelectedWordIndices = useEditorStore((s) => s.setSelectedWordIndices);
|
||||
const setHoveredWordIndex = useEditorStore((s) => s.setHoveredWordIndex);
|
||||
const videoPath = useEditorStore((s) => s.videoPath);
|
||||
const backendUrl = useEditorStore((s) => s.backendUrl);
|
||||
const replaceWordRange = useEditorStore((s) => s.replaceWordRange);
|
||||
const removeCutRange = useEditorStore((s) => s.removeCutRange);
|
||||
const removeMuteRange = useEditorStore((s) => s.removeMuteRange);
|
||||
const removeGainRange = useEditorStore((s) => s.removeGainRange);
|
||||
const removeSpeedRange = useEditorStore((s) => s.removeSpeedRange);
|
||||
const addCutRange = useEditorStore((s) => s.addCutRange);
|
||||
const addMuteRange = useEditorStore((s) => s.addMuteRange);
|
||||
const addGainRange = useEditorStore((s) => s.addGainRange);
|
||||
const addSpeedRange = useEditorStore((s) => s.addSpeedRange);
|
||||
const getWordAtTime = useEditorStore((s) => s.getWordAtTime);
|
||||
const canEdit = useLicenseStore((s) => s.canEdit);
|
||||
|
||||
const selectionStart = useRef<number | null>(null);
|
||||
const wasDragging = useRef(false);
|
||||
const virtuosoRef = useRef<any>(null);
|
||||
const zoneDragStart = useRef<number | null>(null);
|
||||
const [zoneDragRange, setZoneDragRange] = useState<{ start: number; end: number } | null>(null);
|
||||
const [searchOpen, setSearchOpen] = useState(false);
|
||||
const [searchQuery, setSearchQuery] = useState('');
|
||||
const [activeMatchIdx, setActiveMatchIdx] = useState(0);
|
||||
const searchInputRef = useRef<HTMLInputElement | null>(null);
|
||||
const updateWordText = useEditorStore((s) => s.updateWordText);
|
||||
const [editingWordIndex, setEditingWordIndex] = useState<number | null>(null);
|
||||
const [editText, setEditText] = useState('');
|
||||
const editInputRef = useRef<HTMLInputElement | null>(null);
|
||||
|
||||
const selectedSet = useMemo(() => new Set(selectedWordIndices), [selectedWordIndices]);
|
||||
const matchIndices = useMemo(() => {
|
||||
const q = searchQuery.trim().toLowerCase();
|
||||
if (!q) return [] as number[];
|
||||
const matches: number[] = [];
|
||||
for (let i = 0; i < words.length; i++) {
|
||||
if (words[i].word.toLowerCase().includes(q)) matches.push(i);
|
||||
}
|
||||
return matches;
|
||||
}, [searchQuery, words]);
|
||||
const matchSet = useMemo(() => new Set(matchIndices), [matchIndices]);
|
||||
const safeActiveMatchIdx = matchIndices.length === 0
|
||||
? 0
|
||||
: Math.min(activeMatchIdx, matchIndices.length - 1);
|
||||
|
||||
const jumpToMatch = useCallback((idx: number) => {
|
||||
if (matchIndices.length === 0) return;
|
||||
const nextIdx = ((idx % matchIndices.length) + matchIndices.length) % matchIndices.length;
|
||||
setActiveMatchIdx(nextIdx);
|
||||
const wordIndex = matchIndices[nextIdx];
|
||||
const el = document.getElementById(`word-${wordIndex}`);
|
||||
if (el) {
|
||||
el.scrollIntoView({ behavior: 'smooth', block: 'center', inline: 'nearest' });
|
||||
}
|
||||
}, [matchIndices]);
|
||||
|
||||
useEffect(() => {
|
||||
const onKeyDown = (e: KeyboardEvent) => {
|
||||
const target = e.target as HTMLElement | null;
|
||||
const isInInput = !!target && (target.tagName === 'INPUT' || target.tagName === 'TEXTAREA' || target.tagName === 'SELECT');
|
||||
|
||||
if ((e.ctrlKey || e.metaKey) && e.key.toLowerCase() === 'f') {
|
||||
e.preventDefault();
|
||||
setSearchOpen(true);
|
||||
requestAnimationFrame(() => searchInputRef.current?.focus());
|
||||
return;
|
||||
}
|
||||
|
||||
if (!searchOpen) return;
|
||||
|
||||
if (e.key === 'Escape') {
|
||||
e.preventDefault();
|
||||
setSearchOpen(false);
|
||||
return;
|
||||
}
|
||||
|
||||
if (e.key === 'Enter' && !isInInput) {
|
||||
e.preventDefault();
|
||||
jumpToMatch(safeActiveMatchIdx + (e.shiftKey ? -1 : 1));
|
||||
return;
|
||||
}
|
||||
|
||||
if (e.key === 'Enter' && isInInput && target === searchInputRef.current) {
|
||||
e.preventDefault();
|
||||
jumpToMatch(safeActiveMatchIdx + (e.shiftKey ? -1 : 1));
|
||||
}
|
||||
};
|
||||
|
||||
window.addEventListener('keydown', onKeyDown);
|
||||
return () => window.removeEventListener('keydown', onKeyDown);
|
||||
}, [jumpToMatch, searchOpen, safeActiveMatchIdx]);
|
||||
|
||||
const [activeWordIndex, setActiveWordIndex] = useState(-1);
|
||||
|
||||
useEffect(() => {
|
||||
if (words.length === 0) return;
|
||||
const interval = setInterval(() => {
|
||||
const video = document.querySelector('video') as HTMLVideoElement | null;
|
||||
if (!video) return;
|
||||
const idx = getWordAtTime(video.currentTime);
|
||||
setActiveWordIndex((prev) => (prev === idx ? prev : idx));
|
||||
}, 250);
|
||||
return () => clearInterval(interval);
|
||||
}, [words, getWordAtTime]);
|
||||
|
||||
// Auto-scroll to active segment via Virtuoso
|
||||
useEffect(() => {
|
||||
if (activeWordIndex < 0 || segments.length === 0) return;
|
||||
const segIdx = segments.findIndex((seg) => {
|
||||
const start = seg.globalStartIndex ?? 0;
|
||||
return activeWordIndex >= start && activeWordIndex < start + seg.words.length;
|
||||
});
|
||||
if (segIdx >= 0 && virtuosoRef.current) {
|
||||
virtuosoRef.current.scrollIntoView({ index: segIdx, behavior: 'smooth', align: 'center' });
|
||||
}
|
||||
}, [activeWordIndex, segments]);
|
||||
|
||||
const handleWordMouseDown = useCallback(
|
||||
(index: number, e: React.MouseEvent) => {
|
||||
e.preventDefault();
|
||||
// Ctrl+click → seek video to this word's start time
|
||||
if (e.ctrlKey) {
|
||||
const word = words[index];
|
||||
if (word) {
|
||||
const video = document.querySelector('video') as HTMLVideoElement | null;
|
||||
if (video) video.currentTime = word.start;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (cutMode || muteMode || gainMode || speedMode) {
|
||||
zoneDragStart.current = index;
|
||||
setZoneDragRange({ start: index, end: index });
|
||||
selectionStart.current = null;
|
||||
return;
|
||||
}
|
||||
|
||||
wasDragging.current = false;
|
||||
if (e.shiftKey && selectedWordIndices.length > 0) {
|
||||
const first = selectedWordIndices[0];
|
||||
const start = Math.min(first, index);
|
||||
const end = Math.max(first, index);
|
||||
const indices = [];
|
||||
for (let i = start; i <= end; i++) indices.push(i);
|
||||
setSelectedWordIndices(indices);
|
||||
} else {
|
||||
selectionStart.current = index;
|
||||
setSelectedWordIndices([index]);
|
||||
}
|
||||
},
|
||||
[words, selectedWordIndices, setSelectedWordIndices, cutMode, muteMode, gainMode, speedMode],
|
||||
);
|
||||
|
||||
const handleWordMouseEnter = useCallback(
|
||||
(index: number) => {
|
||||
setHoveredWordIndex(index);
|
||||
if (zoneDragStart.current !== null) {
|
||||
setZoneDragRange({
|
||||
start: Math.min(zoneDragStart.current, index),
|
||||
end: Math.max(zoneDragStart.current, index),
|
||||
});
|
||||
return;
|
||||
}
|
||||
if (selectionStart.current !== null) {
|
||||
wasDragging.current = true;
|
||||
const start = Math.min(selectionStart.current, index);
|
||||
const end = Math.max(selectionStart.current, index);
|
||||
const indices = [];
|
||||
for (let i = start; i <= end; i++) indices.push(i);
|
||||
setSelectedWordIndices(indices);
|
||||
}
|
||||
},
|
||||
[setHoveredWordIndex, setSelectedWordIndices],
|
||||
);
|
||||
|
||||
const handleMouseUp = useCallback(() => {
|
||||
if (zoneDragStart.current !== null && zoneDragRange) {
|
||||
assert(zoneDragRange.start >= 0 && zoneDragRange.start < words.length, 'handleMouseUp: zoneDragRange.start out of bounds');
|
||||
assert(zoneDragRange.end >= 0 && zoneDragRange.end < words.length, 'handleMouseUp: zoneDragRange.end out of bounds');
|
||||
const startWord = words[zoneDragRange.start];
|
||||
const endWord = words[zoneDragRange.end];
|
||||
if (startWord && endWord && canEdit) {
|
||||
if (cutMode) addCutRange(startWord.start, endWord.end);
|
||||
if (muteMode) addMuteRange(startWord.start, endWord.end);
|
||||
if (gainMode) addGainRange(startWord.start, endWord.end, gainModeDb);
|
||||
if (speedMode) addSpeedRange(startWord.start, endWord.end, speedModeValue);
|
||||
}
|
||||
}
|
||||
zoneDragStart.current = null;
|
||||
setZoneDragRange(null);
|
||||
selectionStart.current = null;
|
||||
}, [zoneDragRange, words, cutMode, muteMode, gainMode, gainModeDb, speedMode, speedModeValue, addCutRange, addMuteRange, addGainRange, addSpeedRange, canEdit]);
|
||||
|
||||
const handleClickOutside = useCallback(
|
||||
(e: React.MouseEvent) => {
|
||||
if (wasDragging.current) {
|
||||
wasDragging.current = false;
|
||||
return;
|
||||
}
|
||||
if ((e.target as HTMLElement).dataset.wordIndex === undefined) {
|
||||
setSelectedWordIndices([]);
|
||||
}
|
||||
},
|
||||
[setSelectedWordIndices],
|
||||
);
|
||||
|
||||
const startEditing = useCallback((index: number) => {
|
||||
const word = words[index];
|
||||
if (!word) return;
|
||||
setEditingWordIndex(index);
|
||||
setEditText(word.word);
|
||||
requestAnimationFrame(() => {
|
||||
editInputRef.current?.focus();
|
||||
editInputRef.current?.select();
|
||||
});
|
||||
}, [words]);
|
||||
|
||||
const commitEdit = useCallback(() => {
|
||||
if (editingWordIndex === null) return;
|
||||
const trimmed = editText.trim();
|
||||
if (trimmed && trimmed !== words[editingWordIndex]?.word) {
|
||||
updateWordText(editingWordIndex, trimmed);
|
||||
}
|
||||
setEditingWordIndex(null);
|
||||
setEditText('');
|
||||
}, [editingWordIndex, editText, words, updateWordText]);
|
||||
|
||||
const cancelEdit = useCallback(() => {
|
||||
setEditingWordIndex(null);
|
||||
setEditText('');
|
||||
}, []);
|
||||
|
||||
const [isReTranscribing, setIsReTranscribing] = useState(false);
|
||||
const reTranscribeGuard = useRef(false);
|
||||
|
||||
const handleReTranscribe = useCallback(async () => {
|
||||
if (!videoPath || selectedWordIndices.length === 0 || reTranscribeGuard.current) return;
|
||||
reTranscribeGuard.current = true;
|
||||
setIsReTranscribing(true);
|
||||
|
||||
// Snapshot indices and word timings before the async gap
|
||||
const sorted = [...selectedWordIndices].sort((a, b) => a - b);
|
||||
assert(sorted[0] >= 0 && sorted[sorted.length - 1] < words.length, 'handleReTranscribe: sorted indices out of bounds');
|
||||
const startWord = words[sorted[0]];
|
||||
const endWord = words[sorted[sorted.length - 1]];
|
||||
if (!startWord || !endWord) {
|
||||
reTranscribeGuard.current = false;
|
||||
setIsReTranscribing(false);
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const res = await fetch(`${backendUrl}/transcribe/segment`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
file_path: videoPath,
|
||||
start: startWord.start,
|
||||
end: endWord.end,
|
||||
}),
|
||||
});
|
||||
if (!res.ok) {
|
||||
let detail = res.statusText;
|
||||
try { const body = await res.json(); if (body?.detail) detail = String(body.detail); } catch { /* keep statusText fallback */ }
|
||||
throw new Error(`Re-transcribe failed: ${detail}`);
|
||||
}
|
||||
const data = await res.json();
|
||||
replaceWordRange(sorted[0], sorted[sorted.length - 1], data.words);
|
||||
} catch (err) {
|
||||
console.error('Re-transcribe error:', err);
|
||||
alert(err instanceof Error ? err.message : 'Re-transcribe failed');
|
||||
} finally {
|
||||
reTranscribeGuard.current = false;
|
||||
setIsReTranscribing(false);
|
||||
}
|
||||
}, [videoPath, selectedWordIndices, words, backendUrl, replaceWordRange]);
|
||||
|
||||
const handleWordDoubleClick = useCallback((index: number) => {
|
||||
if (cutMode || muteMode || gainMode || speedMode) return;
|
||||
if (!canEdit) return;
|
||||
startEditing(index);
|
||||
}, [cutMode, muteMode, gainMode, speedMode, startEditing, canEdit]);
|
||||
|
||||
// Focus edit input when it appears
|
||||
useEffect(() => {
|
||||
if (editingWordIndex !== null && editInputRef.current) {
|
||||
editInputRef.current.focus();
|
||||
editInputRef.current.select();
|
||||
}
|
||||
}, [editingWordIndex]);
|
||||
|
||||
// Global key handler for edit mode
|
||||
useEffect(() => {
|
||||
const onKeyDown = (e: KeyboardEvent) => {
|
||||
if (editingWordIndex === null) return;
|
||||
if (e.key === 'Enter') {
|
||||
e.preventDefault();
|
||||
commitEdit();
|
||||
} else if (e.key === 'Escape') {
|
||||
e.preventDefault();
|
||||
cancelEdit();
|
||||
}
|
||||
};
|
||||
window.addEventListener('keydown', onKeyDown);
|
||||
return () => window.removeEventListener('keydown', onKeyDown);
|
||||
}, [editingWordIndex, commitEdit, cancelEdit]);
|
||||
|
||||
const cutSelectedWords = useCallback(() => {
|
||||
if (selectedWordIndices.length === 0) return;
|
||||
const sorted = [...selectedWordIndices].sort((a, b) => a - b);
|
||||
assert(sorted[0] >= 0 && sorted[0] < words.length, 'cutSelectedWords: sorted[0] out of bounds');
|
||||
assert(sorted[sorted.length - 1] >= 0 && sorted[sorted.length - 1] < words.length, 'cutSelectedWords: sorted[last] out of bounds');
|
||||
const startTime = words[sorted[0]].start;
|
||||
const endTime = words[sorted[sorted.length - 1]].end;
|
||||
addCutRange(startTime, endTime);
|
||||
}, [selectedWordIndices, words, addCutRange]);
|
||||
|
||||
const muteSelectedWords = useCallback(() => {
|
||||
if (selectedWordIndices.length === 0) return;
|
||||
const sorted = [...selectedWordIndices].sort((a, b) => a - b);
|
||||
const startTime = words[sorted[0]].start;
|
||||
const endTime = words[sorted[sorted.length - 1]].end;
|
||||
addMuteRange(startTime, endTime);
|
||||
}, [selectedWordIndices, words, addMuteRange]);
|
||||
|
||||
const gainSelectedWords = useCallback(() => {
|
||||
if (selectedWordIndices.length === 0) return;
|
||||
const sorted = [...selectedWordIndices].sort((a, b) => a - b);
|
||||
const startTime = words[sorted[0]].start;
|
||||
const endTime = words[sorted[sorted.length - 1]].end;
|
||||
addGainRange(startTime, endTime, gainModeDb);
|
||||
}, [selectedWordIndices, words, addGainRange, gainModeDb]);
|
||||
|
||||
const speedSelectedWords = useCallback(() => {
|
||||
if (selectedWordIndices.length === 0) return;
|
||||
const sorted = [...selectedWordIndices].sort((a, b) => a - b);
|
||||
const startTime = words[sorted[0]].start;
|
||||
const endTime = words[sorted[sorted.length - 1]].end;
|
||||
addSpeedRange(startTime, endTime, speedModeValue);
|
||||
}, [selectedWordIndices, words, addSpeedRange, speedModeValue]);
|
||||
|
||||
const getCutRangeForWord = useCallback(
|
||||
(wordIndex: number) => {
|
||||
const word = words[wordIndex];
|
||||
if (!word) return null;
|
||||
return cutRanges.find((r) => word.start >= r.start && word.end <= r.end);
|
||||
},
|
||||
[words, cutRanges],
|
||||
);
|
||||
|
||||
const getMuteRangeForWord = useCallback(
|
||||
(wordIndex: number) => {
|
||||
const word = words[wordIndex];
|
||||
if (!word) return null;
|
||||
return muteRanges.find((r) => word.start >= r.start && word.end <= r.end);
|
||||
},
|
||||
[words, muteRanges],
|
||||
);
|
||||
|
||||
const getGainRangeForWord = useCallback(
|
||||
(wordIndex: number) => {
|
||||
const word = words[wordIndex];
|
||||
if (!word) return null;
|
||||
return gainRanges.find((r) => word.start >= r.start && word.end <= r.end);
|
||||
},
|
||||
[words, gainRanges],
|
||||
);
|
||||
|
||||
const getSpeedRangeForWord = useCallback(
|
||||
(wordIndex: number) => {
|
||||
const word = words[wordIndex];
|
||||
if (!word) return null;
|
||||
return speedRanges.find((r) => word.start >= r.start && word.end <= r.end);
|
||||
},
|
||||
[words, speedRanges],
|
||||
);
|
||||
|
||||
const renderSegment = useCallback(
|
||||
(index: number) => {
|
||||
const segment = segments[index];
|
||||
if (!segment) return null;
|
||||
return (
|
||||
<div className="mb-3 px-4">
|
||||
{segment.speaker && (
|
||||
<div className="text-xs text-editor-accent font-medium mb-1">
|
||||
{segment.speaker}
|
||||
</div>
|
||||
)}
|
||||
<p className="text-sm leading-relaxed flex flex-wrap">
|
||||
{segment.words.map((word, localIndex) => {
|
||||
const globalIndex = (segment.globalStartIndex ?? 0) + localIndex;
|
||||
const isSelected = selectedSet.has(globalIndex);
|
||||
const isActive = globalIndex === activeWordIndex;
|
||||
const isHovered = globalIndex === hoveredWordIndex;
|
||||
const isZoneDragSelected = zoneDragRange
|
||||
? globalIndex >= zoneDragRange.start && globalIndex <= zoneDragRange.end
|
||||
: false;
|
||||
const cutRange = getCutRangeForWord(globalIndex);
|
||||
const muteRange = getMuteRangeForWord(globalIndex);
|
||||
const gainRange = getGainRangeForWord(globalIndex);
|
||||
const speedRange = getSpeedRangeForWord(globalIndex);
|
||||
const isSearchMatch = matchSet.has(globalIndex);
|
||||
const isActiveSearchMatch = matchIndices.length > 0 && matchIndices[safeActiveMatchIdx] === globalIndex;
|
||||
|
||||
const isEditing = globalIndex === editingWordIndex;
|
||||
|
||||
// Low-confidence highlighting
|
||||
const CONFIDENCE_THRESHOLD_KEY = 'talkedit:confidenceThreshold';
|
||||
const storedThreshold = typeof window !== 'undefined' ? Number(window.localStorage.getItem(CONFIDENCE_THRESHOLD_KEY)) : 0;
|
||||
const confidenceThreshold = Number.isFinite(storedThreshold) ? storedThreshold : 0.6;
|
||||
const isLowConfidence = word.confidence > 0 && word.confidence < confidenceThreshold && !cutRange && !muteRange && !gainRange && !speedRange;
|
||||
const confidencePct = word.confidence > 0 ? Math.round(word.confidence * 100) : null;
|
||||
|
||||
return (
|
||||
<span
|
||||
key={globalIndex}
|
||||
id={`word-${globalIndex}`}
|
||||
data-word-index={globalIndex}
|
||||
title={`${word.start.toFixed(2)}s — confidence: ${confidencePct !== null ? confidencePct + '%' : 'N/A'}${isLowConfidence ? ' ⚠️ Low confidence' : ''} — Ctrl+click to seek, double-click to edit`}
|
||||
onMouseDown={(e) => handleWordMouseDown(globalIndex, e)}
|
||||
onMouseEnter={() => handleWordMouseEnter(globalIndex)}
|
||||
onMouseLeave={() => setHoveredWordIndex(null)}
|
||||
onDoubleClick={() => handleWordDoubleClick(globalIndex)}
|
||||
className={`
|
||||
relative px-[2px] py-[1px] rounded cursor-pointer transition-colors
|
||||
${cutRange ? 'bg-red-500/20 text-red-100' : ''}
|
||||
${muteRange ? 'bg-blue-500/20 text-blue-100' : ''}
|
||||
${gainRange ? 'bg-amber-500/20 text-amber-100' : ''}
|
||||
${speedRange ? 'bg-emerald-500/20 text-emerald-100' : ''}
|
||||
${isZoneDragSelected && cutMode ? 'bg-red-500/30 ring-1 ring-red-400/60' : ''}
|
||||
${isZoneDragSelected && muteMode ? 'bg-blue-500/30 ring-1 ring-blue-400/60' : ''}
|
||||
${isZoneDragSelected && gainMode ? 'bg-amber-500/30 ring-1 ring-amber-400/60' : ''}
|
||||
${isZoneDragSelected && speedMode ? 'bg-emerald-500/30 ring-1 ring-emerald-400/60' : ''}
|
||||
${isSearchMatch && !cutRange && !muteRange && !gainRange && !speedRange ? 'bg-editor-accent/15 ring-2 ring-editor-accent/50' : ''}
|
||||
${isActiveSearchMatch && !cutRange && !muteRange && !gainRange && !speedRange ? 'bg-editor-accent/35 ring-2 ring-editor-accent text-white font-medium' : ''}
|
||||
${isSelected && !cutRange && !muteRange && !gainRange && !speedRange ? 'bg-editor-word-selected text-white' : ''}
|
||||
${isActive && !isSelected && !cutRange && !muteRange && !gainRange && !speedRange ? 'bg-editor-accent/20 text-editor-accent' : ''}
|
||||
${isHovered && !isSelected && !isActive && !cutRange && !muteRange && !gainRange && !speedRange ? 'bg-editor-word-hover' : ''}
|
||||
${isLowConfidence ? 'border-b border-dashed border-orange-400/60' : ''}
|
||||
`}
|
||||
>
|
||||
{isEditing ? (
|
||||
<input
|
||||
ref={editInputRef}
|
||||
value={editText}
|
||||
onChange={(e) => setEditText(e.target.value)}
|
||||
onBlur={commitEdit}
|
||||
className="w-24 px-1 py-0 text-xs bg-editor-bg border border-editor-accent rounded text-editor-text focus:outline-none"
|
||||
style={{ minWidth: `${Math.max(word.word.length * 8, 48)}px` }}
|
||||
/>
|
||||
) : (
|
||||
<>{word.word}{' '}</>
|
||||
)}
|
||||
{(cutRange || muteRange || gainRange || speedRange) && isHovered && (
|
||||
<button
|
||||
onClick={(e) => {
|
||||
e.stopPropagation();
|
||||
if (cutRange) removeCutRange(cutRange.id);
|
||||
if (muteRange) removeMuteRange(muteRange.id);
|
||||
if (gainRange) removeGainRange(gainRange.id);
|
||||
if (speedRange) removeSpeedRange(speedRange.id);
|
||||
}}
|
||||
className="absolute -top-5 left-1/2 -translate-x-1/2 flex items-center gap-0.5 px-1.5 py-0.5 bg-editor-surface border border-editor-border rounded text-[10px] text-editor-success whitespace-nowrap z-10"
|
||||
>
|
||||
<RotateCcw className="w-2.5 h-2.5" /> Restore
|
||||
</button>
|
||||
)}
|
||||
</span>
|
||||
);
|
||||
})}
|
||||
</p>
|
||||
</div>
|
||||
);
|
||||
},
|
||||
[segments, selectedSet, matchSet, matchIndices, safeActiveMatchIdx, activeWordIndex, hoveredWordIndex, handleWordMouseDown, handleWordMouseEnter, setHoveredWordIndex, getCutRangeForWord, getMuteRangeForWord, getGainRangeForWord, getSpeedRangeForWord, removeCutRange, removeMuteRange, removeGainRange, removeSpeedRange, zoneDragRange, cutMode, muteMode, gainMode, speedMode, editingWordIndex, editText, editInputRef, handleWordDoubleClick, commitEdit, setEditText],
|
||||
);
|
||||
|
||||
return (
|
||||
<div className="flex-1 flex flex-col min-h-0">
|
||||
<div className="flex items-center justify-between gap-2 px-4 py-2 border-b border-editor-border shrink-0">
|
||||
<div className="flex items-center gap-1.5">
|
||||
<button
|
||||
onClick={() => {
|
||||
setSearchOpen(true);
|
||||
requestAnimationFrame(() => searchInputRef.current?.focus());
|
||||
}}
|
||||
className="flex items-center gap-1 px-2 py-1 text-xs text-editor-text-muted hover:text-editor-text hover:bg-editor-surface rounded"
|
||||
title="Find (Ctrl+F)"
|
||||
>
|
||||
<Search className="w-3 h-3" />
|
||||
Find
|
||||
</button>
|
||||
{searchOpen && (
|
||||
<div className="flex items-center gap-1.5 px-2 py-1 rounded border border-editor-border bg-editor-surface">
|
||||
<input
|
||||
ref={searchInputRef}
|
||||
value={searchQuery}
|
||||
onChange={(e) => {
|
||||
setSearchQuery(e.target.value);
|
||||
setActiveMatchIdx(0);
|
||||
}}
|
||||
placeholder="Search transcript"
|
||||
className="w-40 bg-transparent text-xs text-editor-text focus:outline-none"
|
||||
/>
|
||||
<span className="text-[10px] text-editor-text-muted min-w-[52px] text-right">
|
||||
{matchIndices.length === 0 ? '0/0' : `${safeActiveMatchIdx + 1}/${matchIndices.length}`}
|
||||
</span>
|
||||
<button
|
||||
onClick={() => jumpToMatch(safeActiveMatchIdx - 1)}
|
||||
className="p-0.5 rounded hover:bg-editor-bg text-editor-text-muted hover:text-editor-text"
|
||||
title="Previous match (Shift+Enter)"
|
||||
>
|
||||
<ChevronUp className="w-3 h-3" />
|
||||
</button>
|
||||
<button
|
||||
onClick={() => jumpToMatch(safeActiveMatchIdx + 1)}
|
||||
className="p-0.5 rounded hover:bg-editor-bg text-editor-text-muted hover:text-editor-text"
|
||||
title="Next match (Enter)"
|
||||
>
|
||||
<ChevronDown className="w-3 h-3" />
|
||||
</button>
|
||||
<button
|
||||
onClick={() => setSearchOpen(false)}
|
||||
className="p-0.5 rounded hover:bg-editor-bg text-editor-text-muted hover:text-editor-text"
|
||||
title="Close search (Esc)"
|
||||
>
|
||||
<X className="w-3 h-3" />
|
||||
</button>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
{selectedWordIndices.length > 0 && (
|
||||
<div className="flex items-center gap-1">
|
||||
<button
|
||||
onClick={cutSelectedWords}
|
||||
disabled={!canEdit}
|
||||
className="flex items-center gap-1 px-2 py-1 text-xs bg-red-500/20 text-red-300 rounded hover:bg-red-500/30 transition-colors disabled:opacity-40"
|
||||
title="Remove this word range from the output"
|
||||
>
|
||||
<Scissors className="w-3 h-3" />
|
||||
Cut
|
||||
</button>
|
||||
<button
|
||||
onClick={muteSelectedWords}
|
||||
disabled={!canEdit}
|
||||
className="flex items-center gap-1 px-2 py-1 text-xs bg-blue-500/20 text-blue-300 rounded hover:bg-blue-500/30 transition-colors disabled:opacity-40"
|
||||
title="Silence audio for this word range"
|
||||
>
|
||||
<VolumeX className="w-3 h-3" />
|
||||
Mute
|
||||
</button>
|
||||
<button
|
||||
onClick={gainSelectedWords}
|
||||
disabled={!canEdit}
|
||||
className="flex items-center gap-1 px-2 py-1 text-xs bg-amber-500/20 text-amber-300 rounded hover:bg-amber-500/30 transition-colors disabled:opacity-40"
|
||||
title="Adjust volume for this word range — positive boosts, negative reduces"
|
||||
>
|
||||
<SlidersHorizontal className="w-3 h-3" />
|
||||
Gain ({gainModeDb > 0 ? '+' : ''}{gainModeDb.toFixed(1)} dB)
|
||||
</button>
|
||||
<button
|
||||
onClick={speedSelectedWords}
|
||||
disabled={!canEdit}
|
||||
className="flex items-center gap-1 px-2 py-1 text-xs bg-emerald-500/20 text-emerald-300 rounded hover:bg-emerald-500/30 transition-colors disabled:opacity-40"
|
||||
title="Change playback speed for this word range — lower is slower, higher is faster"
|
||||
>
|
||||
<Gauge className="w-3 h-3" />
|
||||
Speed {speedModeValue.toFixed(2)}x
|
||||
</button>
|
||||
<button
|
||||
onClick={handleReTranscribe}
|
||||
disabled={isReTranscribing || !canEdit}
|
||||
className="flex items-center gap-1 px-2 py-1 text-xs bg-purple-500/20 text-purple-300 rounded hover:bg-purple-500/30 disabled:opacity-40 transition-colors"
|
||||
title="Re-run Whisper transcription on this segment"
|
||||
>
|
||||
<RefreshCw className={`w-3 h-3 ${isReTranscribing ? 'animate-spin' : ''}`} />
|
||||
{isReTranscribing ? 'Re-transcribing...' : 'Re-transcribe'}
|
||||
</button>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
|
||||
<div
|
||||
className="flex-1 min-h-0 select-none"
|
||||
onMouseUp={handleMouseUp}
|
||||
onClick={handleClickOutside}
|
||||
>
|
||||
<Virtuoso
|
||||
ref={virtuosoRef}
|
||||
totalCount={segments.length}
|
||||
itemContent={renderSegment}
|
||||
overscan={200}
|
||||
className="h-full"
|
||||
style={{ height: '100%' }}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
133
frontend/src/components/VideoPlayer.tsx
Normal file
133
frontend/src/components/VideoPlayer.tsx
Normal file
@ -0,0 +1,133 @@
|
||||
import { useRef, useCallback, useState, useEffect } from 'react';
|
||||
import { useEditorStore } from '../store/editorStore';
|
||||
import { useVideoSync } from '../hooks/useVideoSync';
|
||||
import { Play, Pause, SkipBack, SkipForward, Volume2 } from 'lucide-react';
|
||||
|
||||
export default function VideoPlayer() {
|
||||
const videoRef = useRef<HTMLVideoElement>(null);
|
||||
const videoUrl = useEditorStore((s) => s.videoUrl);
|
||||
const isPlaying = useEditorStore((s) => s.isPlaying);
|
||||
const duration = useEditorStore((s) => s.duration);
|
||||
const { seekTo, togglePlay } = useVideoSync(videoRef);
|
||||
|
||||
const [displayTime, setDisplayTime] = useState(0);
|
||||
|
||||
useEffect(() => {
|
||||
const video = videoRef.current;
|
||||
if (!video) return;
|
||||
let raf = 0;
|
||||
const tick = () => {
|
||||
setDisplayTime(video.currentTime);
|
||||
raf = requestAnimationFrame(tick);
|
||||
};
|
||||
raf = requestAnimationFrame(tick);
|
||||
return () => cancelAnimationFrame(raf);
|
||||
}, [videoUrl]);
|
||||
|
||||
const formatTime = (seconds: number) => {
|
||||
const m = Math.floor(seconds / 60);
|
||||
const s = Math.floor(seconds % 60);
|
||||
return `${m}:${s.toString().padStart(2, '0')}`;
|
||||
};
|
||||
|
||||
const handleProgressClick = useCallback(
|
||||
(e: React.MouseEvent<HTMLDivElement>) => {
|
||||
const rect = e.currentTarget.getBoundingClientRect();
|
||||
const ratio = (e.clientX - rect.left) / rect.width;
|
||||
seekTo(ratio * duration);
|
||||
},
|
||||
[seekTo, duration],
|
||||
);
|
||||
|
||||
const skip = useCallback(
|
||||
(delta: number) => {
|
||||
const video = videoRef.current;
|
||||
if (!video) return;
|
||||
seekTo(Math.max(0, Math.min(duration, video.currentTime + delta)));
|
||||
},
|
||||
[seekTo, duration],
|
||||
);
|
||||
|
||||
if (!videoUrl) {
|
||||
return (
|
||||
<div className="w-full h-full flex items-center justify-center text-editor-text-muted text-sm">
|
||||
No video loaded
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
return (
|
||||
<div className="w-full h-full flex flex-col">
|
||||
<div className="flex-1 flex items-center justify-center bg-black rounded-lg overflow-hidden min-h-0">
|
||||
<video
|
||||
ref={videoRef}
|
||||
src={videoUrl}
|
||||
className="max-w-full max-h-full object-contain"
|
||||
playsInline
|
||||
onClick={togglePlay}
|
||||
/>
|
||||
</div>
|
||||
|
||||
<div className="pt-2 space-y-1.5 shrink-0">
|
||||
<div
|
||||
className="h-1.5 bg-editor-border rounded-full cursor-pointer group"
|
||||
onClick={handleProgressClick}
|
||||
>
|
||||
<div
|
||||
className="h-full bg-editor-accent rounded-full relative transition-all group-hover:h-2"
|
||||
style={{ width: duration > 0 ? `${(displayTime / duration) * 100}%` : '0%' }}
|
||||
>
|
||||
<div className="absolute right-0 top-1/2 -translate-y-1/2 w-3 h-3 bg-white rounded-full opacity-0 group-hover:opacity-100 transition-opacity" />
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="flex items-center justify-between">
|
||||
<div className="flex items-center gap-1">
|
||||
<ControlButton onClick={() => skip(-5)} title="Back 5s">
|
||||
<SkipBack className="w-4 h-4" />
|
||||
</ControlButton>
|
||||
<ControlButton onClick={togglePlay} title={isPlaying ? 'Pause' : 'Play'} primary>
|
||||
{isPlaying ? <Pause className="w-5 h-5" /> : <Play className="w-5 h-5 ml-0.5" />}
|
||||
</ControlButton>
|
||||
<ControlButton onClick={() => skip(5)} title="Forward 5s">
|
||||
<SkipForward className="w-4 h-4" />
|
||||
</ControlButton>
|
||||
</div>
|
||||
|
||||
<div className="flex items-center gap-3 text-xs text-editor-text-muted">
|
||||
<Volume2 className="w-3.5 h-3.5" />
|
||||
<span className="font-mono">
|
||||
{formatTime(displayTime)} / {formatTime(duration)}
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function ControlButton({
|
||||
children,
|
||||
onClick,
|
||||
title,
|
||||
primary,
|
||||
}: {
|
||||
children: React.ReactNode;
|
||||
onClick: () => void;
|
||||
title: string;
|
||||
primary?: boolean;
|
||||
}) {
|
||||
return (
|
||||
<button
|
||||
onClick={onClick}
|
||||
title={title}
|
||||
className={`p-1.5 rounded-md transition-colors ${
|
||||
primary
|
||||
? 'bg-editor-accent/20 text-editor-accent hover:bg-editor-accent/30'
|
||||
: 'text-editor-text-muted hover:text-editor-text hover:bg-editor-surface'
|
||||
}`}
|
||||
>
|
||||
{children}
|
||||
</button>
|
||||
);
|
||||
}
|
||||
1385
frontend/src/components/WaveformTimeline.tsx
Normal file
1385
frontend/src/components/WaveformTimeline.tsx
Normal file
File diff suppressed because it is too large
Load Diff
459
frontend/src/components/ZoneEditor.tsx
Normal file
459
frontend/src/components/ZoneEditor.tsx
Normal file
@ -0,0 +1,459 @@
|
||||
import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
|
||||
import { useEditorStore } from '../store/editorStore';
|
||||
import { Trash2, Scissors, Volume2, SlidersHorizontal, Gauge, Play } from 'lucide-react';
|
||||
|
||||
function formatTimelineLikeTime(secs: number): string {
|
||||
const m = Math.floor(secs / 60);
|
||||
const s = secs % 60;
|
||||
if (m > 0) return `${m}:${String(Math.floor(s)).padStart(2, '0')}.${Math.floor((s % 1) * 10)}`;
|
||||
return `${s.toFixed(1)}s`;
|
||||
}
|
||||
|
||||
export default function ZoneEditor() {
|
||||
const [viewMode, setViewMode] = useState<'all' | 'cut' | 'mute' | 'gain' | 'speed'>('all');
|
||||
const [focusedZone, setFocusedZone] = useState<{ type: 'cut' | 'mute' | 'gain' | 'speed'; id: string } | null>(null);
|
||||
const previewFrameRef = useRef<number | null>(null);
|
||||
|
||||
const {
|
||||
cutRanges,
|
||||
muteRanges,
|
||||
gainRanges,
|
||||
speedRanges,
|
||||
duration,
|
||||
setCurrentTime,
|
||||
zonePreviewPaddingSeconds,
|
||||
setZonePreviewPaddingSeconds,
|
||||
globalGainDb,
|
||||
setGlobalGainDb,
|
||||
removeCutRange,
|
||||
removeMuteRange,
|
||||
removeGainRange,
|
||||
removeSpeedRange,
|
||||
updateGainRange,
|
||||
updateSpeedRange,
|
||||
} = useEditorStore();
|
||||
|
||||
const stopPreviewLoop = useCallback(() => {
|
||||
if (previewFrameRef.current !== null) {
|
||||
cancelAnimationFrame(previewFrameRef.current);
|
||||
previewFrameRef.current = null;
|
||||
}
|
||||
}, []);
|
||||
|
||||
useEffect(() => stopPreviewLoop, [stopPreviewLoop]);
|
||||
|
||||
const previewZone = useCallback((start: number, end: number) => {
|
||||
const video = document.querySelector('video');
|
||||
if (!(video instanceof HTMLVideoElement)) return;
|
||||
|
||||
stopPreviewLoop();
|
||||
|
||||
const previewStart = Math.max(0, start - zonePreviewPaddingSeconds);
|
||||
const maxDuration = Number.isFinite(duration) && duration > 0 ? duration : video.duration;
|
||||
const previewEnd = Math.min(maxDuration || end + zonePreviewPaddingSeconds, end + zonePreviewPaddingSeconds);
|
||||
|
||||
video.currentTime = previewStart;
|
||||
setCurrentTime(previewStart);
|
||||
|
||||
const tick = () => {
|
||||
if (video.paused || video.ended) {
|
||||
previewFrameRef.current = null;
|
||||
return;
|
||||
}
|
||||
if (video.currentTime >= previewEnd) {
|
||||
video.pause();
|
||||
video.currentTime = previewEnd;
|
||||
setCurrentTime(previewEnd);
|
||||
previewFrameRef.current = null;
|
||||
return;
|
||||
}
|
||||
previewFrameRef.current = requestAnimationFrame(tick);
|
||||
};
|
||||
|
||||
void video.play();
|
||||
previewFrameRef.current = requestAnimationFrame(tick);
|
||||
}, [duration, setCurrentTime, stopPreviewLoop, zonePreviewPaddingSeconds]);
|
||||
|
||||
const renderPreviewButton = (start: number, end: number, accentClass: string) => (
|
||||
<button
|
||||
onClick={(e) => {
|
||||
e.stopPropagation();
|
||||
previewZone(start, end);
|
||||
}}
|
||||
className={`p-1 rounded opacity-0 group-hover:opacity-100 transition-opacity ${accentClass}`}
|
||||
title={`Play ${zonePreviewPaddingSeconds.toFixed(2)}s before and after zone`}
|
||||
>
|
||||
<Play className="w-3.5 h-3.5" />
|
||||
</button>
|
||||
);
|
||||
|
||||
const totalZones = cutRanges.length + muteRanges.length + gainRanges.length + speedRanges.length;
|
||||
|
||||
const getZoneTypeColor = (type: 'cut' | 'mute' | 'gain' | 'speed') => {
|
||||
switch (type) {
|
||||
case 'cut':
|
||||
return 'border-red-500/40 bg-red-500/5';
|
||||
case 'mute':
|
||||
return 'border-blue-500/40 bg-blue-500/20';
|
||||
case 'gain':
|
||||
return 'border-amber-500/40 bg-amber-500/5';
|
||||
case 'speed':
|
||||
return 'border-emerald-500/40 bg-emerald-500/5';
|
||||
}
|
||||
};
|
||||
|
||||
const activeFocusedZone = useMemo(() => {
|
||||
if (!focusedZone) return null;
|
||||
const exists = focusedZone.type === 'cut'
|
||||
? cutRanges.some((range) => range.id === focusedZone.id)
|
||||
: focusedZone.type === 'mute'
|
||||
? muteRanges.some((range) => range.id === focusedZone.id)
|
||||
: focusedZone.type === 'gain'
|
||||
? gainRanges.some((range) => range.id === focusedZone.id)
|
||||
: speedRanges.some((range) => range.id === focusedZone.id);
|
||||
return exists ? focusedZone : null;
|
||||
}, [cutRanges, focusedZone, gainRanges, muteRanges, speedRanges]);
|
||||
|
||||
const isZoneFocused = useCallback(
|
||||
(type: 'cut' | 'mute' | 'gain' | 'speed', id: string) => activeFocusedZone?.type === type && activeFocusedZone.id === id,
|
||||
[activeFocusedZone],
|
||||
);
|
||||
|
||||
const removeZone = useCallback((type: 'cut' | 'mute' | 'gain' | 'speed', id: string) => {
|
||||
if (!window.confirm("Delete this zone?")) return;
|
||||
if (type === 'cut') removeCutRange(id);
|
||||
else if (type === 'mute') removeMuteRange(id);
|
||||
else if (type === 'gain') removeGainRange(id);
|
||||
else removeSpeedRange(id);
|
||||
setFocusedZone((current) => (current?.type === type && current.id === id ? null : current));
|
||||
}, [removeCutRange, removeGainRange, removeMuteRange, removeSpeedRange]);
|
||||
|
||||
useEffect(() => {
|
||||
const handleKeyDown = (e: KeyboardEvent) => {
|
||||
const target = e.target as HTMLElement | null;
|
||||
if (target && (target.tagName === 'INPUT' || target.tagName === 'TEXTAREA' || target.tagName === 'SELECT')) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (e.key === 'Escape') {
|
||||
setFocusedZone(null);
|
||||
return;
|
||||
}
|
||||
|
||||
if ((e.key === 'Delete' || e.key === 'Backspace') && activeFocusedZone) {
|
||||
e.preventDefault();
|
||||
removeZone(activeFocusedZone.type, activeFocusedZone.id);
|
||||
}
|
||||
};
|
||||
|
||||
window.addEventListener('keydown', handleKeyDown, { capture: true });
|
||||
return () => window.removeEventListener('keydown', handleKeyDown, { capture: true });
|
||||
}, [activeFocusedZone, removeZone]);
|
||||
|
||||
return (
|
||||
<div className="p-4 space-y-4">
|
||||
<div className="space-y-2">
|
||||
<div className="space-y-1">
|
||||
<div className="flex items-start justify-between gap-3">
|
||||
<div>
|
||||
<h3 className="text-sm font-semibold flex items-center gap-2">
|
||||
Zone Editor
|
||||
</h3>
|
||||
<p className="text-xs text-editor-text-muted">
|
||||
Manage all timeline zones ({totalZones} total)
|
||||
</p>
|
||||
</div>
|
||||
<div className="min-w-[160px] rounded border border-editor-border bg-editor-surface px-2 py-1.5">
|
||||
<div className="flex items-center justify-between gap-2">
|
||||
<span className="text-[10px] uppercase tracking-wide text-editor-text-muted">Preview</span>
|
||||
<span className="text-[10px] text-editor-text-muted">before/after</span>
|
||||
</div>
|
||||
<div className="mt-1 flex items-center gap-1.5">
|
||||
<input
|
||||
type="number"
|
||||
min={0}
|
||||
max={10}
|
||||
step={0.25}
|
||||
value={zonePreviewPaddingSeconds}
|
||||
onChange={(e) => setZonePreviewPaddingSeconds(Number(e.target.value) || 0)}
|
||||
className="w-16 px-2 py-1 bg-editor-bg border border-editor-border rounded text-xs text-editor-text focus:outline-none focus:border-editor-accent"
|
||||
title="Preview time before and after each zone"
|
||||
/>
|
||||
<span className="text-xs text-editor-text-muted">sec</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* View Mode Toggle */}
|
||||
<div className="flex items-center gap-1 rounded bg-editor-surface border border-editor-border p-1">
|
||||
<button
|
||||
onClick={() => setViewMode('all')}
|
||||
className={`px-2 py-1 text-xs rounded transition-colors ${
|
||||
viewMode === 'all'
|
||||
? 'bg-editor-accent text-white'
|
||||
: 'text-editor-text-muted hover:text-editor-text'
|
||||
}`}
|
||||
title="Show all zones"
|
||||
>
|
||||
All
|
||||
</button>
|
||||
<button
|
||||
onClick={() => setViewMode('cut')}
|
||||
className={`px-2 py-1 text-xs rounded transition-colors ${
|
||||
viewMode === 'cut'
|
||||
? 'bg-red-500/30 text-red-500'
|
||||
: 'text-editor-text-muted hover:text-editor-text'
|
||||
}`}
|
||||
title="Show only Cut zones"
|
||||
>
|
||||
Cut
|
||||
</button>
|
||||
<button
|
||||
onClick={() => setViewMode('mute')}
|
||||
className={`px-2 py-1 text-xs rounded transition-colors ${
|
||||
viewMode === 'mute'
|
||||
? 'bg-blue-500/20 text-blue-400'
|
||||
: 'text-editor-text-muted hover:text-editor-text'
|
||||
}`}
|
||||
title="Show only Mute zones"
|
||||
>
|
||||
Mute
|
||||
</button>
|
||||
<button
|
||||
onClick={() => setViewMode('gain')}
|
||||
className={`px-2 py-1 text-xs rounded transition-colors ${
|
||||
viewMode === 'gain'
|
||||
? 'bg-amber-500/30 text-amber-500'
|
||||
: 'text-editor-text-muted hover:text-editor-text'
|
||||
}`}
|
||||
title="Show only Gain zones"
|
||||
>
|
||||
Gain
|
||||
</button>
|
||||
<button
|
||||
onClick={() => setViewMode('speed')}
|
||||
className={`px-2 py-1 text-xs rounded transition-colors ${
|
||||
viewMode === 'speed'
|
||||
? 'bg-emerald-500/30 text-emerald-500'
|
||||
: 'text-editor-text-muted hover:text-editor-text'
|
||||
}`}
|
||||
title="Show only Speed zones"
|
||||
>
|
||||
Speed
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{totalZones === 0 ? (
|
||||
<div className="p-4 rounded-lg border border-dashed border-editor-border text-center">
|
||||
<p className="text-xs text-editor-text-muted">
|
||||
No zones yet. Create zones from the toolbar or by highlighting words.
|
||||
</p>
|
||||
</div>
|
||||
) : (
|
||||
<div className="space-y-3">
|
||||
{/* Cut Zones */}
|
||||
{(viewMode === 'all' || viewMode === 'cut') && cutRanges.length > 0 && (
|
||||
<div className="space-y-2">
|
||||
<div className="text-xs font-semibold text-red-500/80 flex items-center gap-2">
|
||||
<Scissors className="w-3.5 h-3.5" />
|
||||
Cut Zones ({cutRanges.length})
|
||||
</div>
|
||||
<div className="space-y-1">
|
||||
{cutRanges.map((range) => (
|
||||
<div
|
||||
key={range.id}
|
||||
onClick={() => setFocusedZone({ type: 'cut', id: range.id })}
|
||||
className={`px-2 py-1.5 rounded-lg border text-xs flex items-center gap-2 group cursor-pointer transition-colors ${getZoneTypeColor('cut')} ${isZoneFocused('cut', range.id) ? 'ring-1 ring-red-400 border-red-400/80 bg-red-500/12' : ''}`}
|
||||
>
|
||||
<div className="flex-1 min-w-0">
|
||||
<div className="font-medium truncate">
|
||||
{formatTimelineLikeTime(range.start)} - {formatTimelineLikeTime(range.end)}
|
||||
</div>
|
||||
</div>
|
||||
{renderPreviewButton(range.start, range.end, 'hover:bg-red-500/20 text-red-500/70 hover:text-red-500')}
|
||||
<button
|
||||
onClick={(e) => {
|
||||
e.stopPropagation();
|
||||
removeZone('cut', range.id);
|
||||
}}
|
||||
className="p-1 rounded hover:bg-red-500/20 text-red-500/70 hover:text-red-500 opacity-0 group-hover:opacity-100 transition-opacity"
|
||||
title="Delete cut zone"
|
||||
>
|
||||
<Trash2 className="w-3.5 h-3.5" />
|
||||
</button>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Mute Zones */}
|
||||
{(viewMode === 'all' || viewMode === 'mute') && muteRanges.length > 0 && (
|
||||
<div className="space-y-2">
|
||||
<div className="text-xs font-semibold text-blue-400 flex items-center gap-2">
|
||||
<Volume2 className="w-3.5 h-3.5" />
|
||||
Mute Zones ({muteRanges.length})
|
||||
</div>
|
||||
<div className="space-y-1">
|
||||
{muteRanges.map((range) => (
|
||||
<div
|
||||
key={range.id}
|
||||
onClick={() => setFocusedZone({ type: 'mute', id: range.id })}
|
||||
className={`px-2 py-1.5 rounded-lg border text-xs flex items-center gap-2 group cursor-pointer transition-colors ${getZoneTypeColor('mute')} ${isZoneFocused('mute', range.id) ? 'ring-1 ring-blue-400 border-blue-400/80 bg-blue-500/20' : ''}`}
|
||||
>
|
||||
<div className="flex-1 min-w-0">
|
||||
<div className="font-medium truncate">
|
||||
{formatTimelineLikeTime(range.start)} - {formatTimelineLikeTime(range.end)}
|
||||
</div>
|
||||
</div>
|
||||
{renderPreviewButton(range.start, range.end, 'hover:bg-blue-500/20 text-blue-400 hover:text-blue-400')}
|
||||
<button
|
||||
onClick={(e) => {
|
||||
e.stopPropagation();
|
||||
removeZone('mute', range.id);
|
||||
}}
|
||||
className="p-1 rounded hover:bg-blue-500/20 text-blue-400 hover:text-blue-400 opacity-0 group-hover:opacity-100 transition-opacity"
|
||||
title="Delete mute zone"
|
||||
>
|
||||
<Trash2 className="w-3.5 h-3.5" />
|
||||
</button>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Sound Gain */}
|
||||
{(viewMode === 'all' || viewMode === 'gain') && gainRanges.length > 0 && (
|
||||
<div className="space-y-2">
|
||||
<div className="text-xs font-semibold text-amber-500/80 flex items-center gap-2">
|
||||
<SlidersHorizontal className="w-3.5 h-3.5" />
|
||||
Sound Gain ({gainRanges.length})
|
||||
</div>
|
||||
|
||||
{/* Global Gain Slider */}
|
||||
<div className="px-2 py-2 rounded border border-amber-500/20 bg-amber-500/5 space-y-2">
|
||||
<label className="text-xs text-editor-text-muted font-medium">Global Gain</label>
|
||||
<div className="flex items-center gap-2">
|
||||
<input
|
||||
type="range"
|
||||
min={-24}
|
||||
max={24}
|
||||
step={0.5}
|
||||
value={globalGainDb}
|
||||
onChange={(e) => setGlobalGainDb(Number(e.target.value))}
|
||||
className="flex-1 h-1.5"
|
||||
/>
|
||||
<input
|
||||
type="number"
|
||||
min={-24}
|
||||
max={24}
|
||||
step={0.5}
|
||||
value={globalGainDb}
|
||||
onChange={(e) => setGlobalGainDb(Math.max(-24, Math.min(24, Number(e.target.value) || 0)))}
|
||||
className="w-14 px-1.5 py-0.5 text-xs bg-editor-surface border border-editor-border rounded focus:border-editor-accent focus:outline-none"
|
||||
title="Volume adjustment in decibels — +6 dB doubles volume, -6 dB halves it"
|
||||
/>
|
||||
<span className="text-xs text-amber-500/80 font-medium w-6 text-right">dB</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="space-y-1">
|
||||
{gainRanges.map((range) => (
|
||||
<div
|
||||
key={range.id}
|
||||
onClick={() => setFocusedZone({ type: 'gain', id: range.id })}
|
||||
className={`px-2 py-1.5 rounded-lg border text-xs flex items-center gap-2 group cursor-pointer transition-colors ${getZoneTypeColor('gain')} ${isZoneFocused('gain', range.id) ? 'ring-1 ring-amber-400 border-amber-400/80 bg-amber-500/12' : ''}`}
|
||||
>
|
||||
<div className="flex-1 min-w-0">
|
||||
<div className="font-medium truncate">
|
||||
{formatTimelineLikeTime(range.start)} - {formatTimelineLikeTime(range.end)}
|
||||
</div>
|
||||
<div className="text-editor-text-muted text-[10px]">
|
||||
{range.gainDb > 0 ? '+' : ''}{range.gainDb.toFixed(1)} dB
|
||||
</div>
|
||||
</div>
|
||||
<input
|
||||
type="number"
|
||||
min={-24}
|
||||
max={24}
|
||||
step={0.5}
|
||||
value={range.gainDb}
|
||||
onClick={(e) => e.stopPropagation()}
|
||||
onChange={(e) => updateGainRange(range.id, Number(e.target.value) || 0)}
|
||||
className="w-16 px-1.5 py-0.5 text-xs bg-editor-surface border border-editor-border rounded focus:border-editor-accent focus:outline-none"
|
||||
title="Volume adjustment in decibels — +6 dB doubles volume, -6 dB halves it"
|
||||
/>
|
||||
{renderPreviewButton(range.start, range.end, 'hover:bg-amber-500/20 text-amber-500/70 hover:text-amber-500')}
|
||||
<button
|
||||
onClick={(e) => {
|
||||
e.stopPropagation();
|
||||
removeZone('gain', range.id);
|
||||
}}
|
||||
className="p-1 rounded hover:bg-amber-500/20 text-amber-500/70 hover:text-amber-500 opacity-0 group-hover:opacity-100 transition-opacity"
|
||||
title="Delete gain zone"
|
||||
>
|
||||
<Trash2 className="w-3.5 h-3.5" />
|
||||
</button>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Speed Adjust */}
|
||||
{(viewMode === 'all' || viewMode === 'speed') && speedRanges.length > 0 && (
|
||||
<div className="space-y-2">
|
||||
<div className="text-xs font-semibold text-emerald-500/80 flex items-center gap-2">
|
||||
<Gauge className="w-3.5 h-3.5" />
|
||||
Speed Adjust ({speedRanges.length})
|
||||
</div>
|
||||
<div className="space-y-1">
|
||||
{speedRanges.map((range) => (
|
||||
<div
|
||||
key={range.id}
|
||||
onClick={() => setFocusedZone({ type: 'speed', id: range.id })}
|
||||
className={`px-2 py-1.5 rounded-lg border text-xs flex items-center gap-2 group cursor-pointer transition-colors ${getZoneTypeColor('speed')} ${isZoneFocused('speed', range.id) ? 'ring-1 ring-emerald-400 border-emerald-400/80 bg-emerald-500/12' : ''}`}
|
||||
>
|
||||
<div className="flex-1 min-w-0">
|
||||
<div className="font-medium truncate">
|
||||
{formatTimelineLikeTime(range.start)} - {formatTimelineLikeTime(range.end)}
|
||||
</div>
|
||||
<div className="text-editor-text-muted text-[10px]">
|
||||
{range.speed.toFixed(2)}x
|
||||
</div>
|
||||
</div>
|
||||
<input
|
||||
type="number"
|
||||
min={0.25}
|
||||
max={4}
|
||||
step={0.05}
|
||||
value={range.speed}
|
||||
onClick={(e) => e.stopPropagation()}
|
||||
onChange={(e) => updateSpeedRange(range.id, Number(e.target.value) || 1)}
|
||||
className="w-16 px-1.5 py-0.5 text-xs bg-editor-surface border border-editor-border rounded focus:border-editor-accent focus:outline-none"
|
||||
title="Playback speed multiplier — 1.0x is normal, 2.0x is twice as fast"
|
||||
/>
|
||||
{renderPreviewButton(range.start, range.end, 'hover:bg-emerald-500/20 text-emerald-500/70 hover:text-emerald-500')}
|
||||
<button
|
||||
onClick={(e) => {
|
||||
e.stopPropagation();
|
||||
removeZone('speed', range.id);
|
||||
}}
|
||||
className="p-1 rounded hover:bg-emerald-500/20 text-emerald-500/70 hover:text-emerald-500 opacity-0 group-hover:opacity-100 transition-opacity"
|
||||
title="Delete speed zone"
|
||||
>
|
||||
<Trash2 className="w-3.5 h-3.5" />
|
||||
</button>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
212
frontend/src/hooks/useKeyboardShortcuts.ts
Normal file
212
frontend/src/hooks/useKeyboardShortcuts.ts
Normal file
@ -0,0 +1,212 @@
|
||||
import { useEffect, useRef } from 'react';
|
||||
import { useEditorStore } from '../store/editorStore';
|
||||
import { loadBindings, DEFAULT_PRESETS } from '../lib/keybindings';
|
||||
import type { KeyBinding } from '../types/project';
|
||||
|
||||
/**
 * Global keyboard-shortcut handler for the editor.
 *
 * Installs a single window-level `keydown` listener that translates key
 * events into editor actions (undo/redo, cut, transport control, mark
 * in/out, save/export/search, help). Bindings are re-read from
 * localStorage on every keystroke so edits made in Settings apply
 * immediately without remounting this hook.
 *
 * Returns nothing; the hook works entirely through side effects on the
 * store, the DOM, and the first <video> element on the page.
 */
export function useKeyboardShortcuts() {
  const addCutRange = useEditorStore((s) => s.addCutRange);
  const markInTime = useEditorStore((s) => s.markInTime);
  const markOutTime = useEditorStore((s) => s.markOutTime);
  const setMarkInTime = useEditorStore((s) => s.setMarkInTime);
  const setMarkOutTime = useEditorStore((s) => s.setMarkOutTime);
  const clearMarkRange = useEditorStore((s) => s.clearMarkRange);
  const selectedWordIndices = useEditorStore((s) => s.selectedWordIndices);
  const words = useEditorStore((s) => s.words);
  // Current J/K/L-style shuttle rate; lives in a ref (not state) so
  // repeated keystrokes mutate it without re-rendering or re-binding.
  const playbackRateRef = useRef(1);

  // Read bindings fresh from localStorage on every call to avoid stale closures
  const getBindings = (): KeyBinding[] => {
    try { return loadBindings(); } catch { return []; }
  };

  useEffect(() => {
    // The editor renders a single <video>; query it lazily per keystroke
    // so the hook keeps working if the element is remounted.
    const getVideo = (): HTMLVideoElement | null => document.querySelector('video');

    const handler = (e: KeyboardEvent) => {
      // Ignore shortcuts while the user is typing in a form control.
      const target = e.target as HTMLElement;
      if (target.tagName === 'INPUT' || target.tagName === 'TEXTAREA' || target.tagName === 'SELECT') return;

      const video = getVideo();

      // Build a key string from the event for matching
      const parts: string[] = [];
      if (e.ctrlKey || e.metaKey) parts.push('Ctrl');
      if (e.shiftKey && !['Shift'].includes(e.key)) parts.push('Shift');
      if (e.altKey) parts.push('Alt');
      // Single characters are upper-cased so 'z' and 'Z' both match a
      // binding stored as 'Z'; named keys (ArrowLeft, F1, ...) pass through.
      const keyStr = e.key === ' ' ? 'Space' : e.key.length === 1 ? e.key.toUpperCase() : e.key;
      parts.push(keyStr);
      const combo = parts.join('+');

      // Look up binding — fresh read every keystroke so Settings changes take effect immediately
      const currentBindings = getBindings();
      const binding = currentBindings.find((b) => b.keys === combo);
      if (!binding) return; // Unbound key — ignore

      // Only suppress the browser default once we know the key is bound.
      e.preventDefault();

      switch (binding.id) {
        case 'undo':
          useEditorStore.temporal.getState().undo();
          return;
        case 'redo':
          useEditorStore.temporal.getState().redo();
          return;
        case 'cut': {
          // Word selection takes priority: cut spans from the first to the
          // last selected word. Otherwise fall back to the mark in/out range.
          if (selectedWordIndices.length > 0) {
            const sorted = [...selectedWordIndices].sort((a, b) => a - b);
            addCutRange(words[sorted[0]].start, words[sorted[sorted.length - 1]].end);
            return;
          }
          if (markInTime !== null && markOutTime !== null) {
            // min/max lets the user set the marks in either order.
            const start = Math.min(markInTime, markOutTime);
            const end = Math.max(markInTime, markOutTime);
            // Skip degenerate (<10 ms) ranges, but always clear the marks.
            if (end - start >= 0.01) addCutRange(start, end);
            clearMarkRange();
          }
          return;
        }
        case 'play-pause':
          if (video) { if (video.paused) video.play(); else video.pause(); }
          return;
        case 'slow-down': {
          if (video) {
            // Shuttle rate steps down by 0.5, floored at -2. Negative rates
            // are emulated by jumping currentTime backwards, since
            // HTMLMediaElement.playbackRate cannot be negative.
            playbackRateRef.current = Math.max(-2, playbackRateRef.current - 0.5);
            if (playbackRateRef.current < 0) video.currentTime = Math.max(0, video.currentTime - 2);
            else { video.playbackRate = playbackRateRef.current; if (video.paused) video.play(); }
          }
          return;
        }
        case 'pause':
          // Pause also resets the shuttle rate to normal speed.
          if (video) { video.pause(); playbackRateRef.current = 1; }
          return;
        case 'speed-up': {
          if (video) {
            playbackRateRef.current = Math.min(4, playbackRateRef.current + 0.5);
            // Clamp to 0.25 so a shuttle value of 0 or below never produces
            // an invalid/stalled playbackRate.
            video.playbackRate = Math.max(0.25, playbackRateRef.current);
            if (video.paused) video.play();
          }
          return;
        }
        case 'rewind':
          if (video) video.currentTime = Math.max(0, video.currentTime - 5);
          return;
        case 'forward':
          if (video) video.currentTime = Math.min(video.duration, video.currentTime + 5);
          return;
        case 'mark-in':
          if (video) setMarkInTime(video.currentTime);
          return;
        case 'mark-out':
          if (video) setMarkOutTime(video.currentTime);
          return;
        case 'save': {
          // Prefer clicking the visible Save button (keeps UI state in sync);
          // fall back to the programmatic save below.
          const saveBtn = document.querySelector('[title="Save"]') as HTMLButtonElement | null;
          if (saveBtn) saveBtn.click();
          else saveProject();
          return;
        }
        case 'export': {
          const exportBtn = document.querySelector('[title="Export"]') as HTMLButtonElement;
          if (exportBtn) exportBtn.click();
          return;
        }
        case 'search': {
          const findBtn = document.querySelector('[title="Find (Ctrl+F)"]') as HTMLButtonElement;
          if (findBtn) findBtn.click();
          return;
        }
        case 'help':
          toggleCheatsheet(currentBindings);
          return;
        default:
          break;
      }
    };

    window.addEventListener('keydown', handler);
    return () => window.removeEventListener('keydown', handler);
  }, [addCutRange, markInTime, markOutTime, setMarkInTime, setMarkOutTime, clearMarkRange, selectedWordIndices, words]);
}
|
||||
|
||||
async function saveProject() {
|
||||
const state = useEditorStore.getState();
|
||||
if (!state.videoPath || state.words.length === 0) return;
|
||||
|
||||
try {
|
||||
const projectData = state.saveProject();
|
||||
let outputPath = state.projectFilePath;
|
||||
|
||||
if (!outputPath) {
|
||||
outputPath = await window.electronAPI?.saveFile({
|
||||
defaultPath: state.videoPath.replace(/\.[^.]+$/, '.aive'),
|
||||
filters: [{ name: 'TalkEdit Project', extensions: ['aive'] }],
|
||||
});
|
||||
}
|
||||
|
||||
if (!outputPath) return;
|
||||
|
||||
const resolvedPath = outputPath.endsWith('.aive') ? outputPath : `${outputPath}.aive`;
|
||||
|
||||
if (window.electronAPI?.writeFile) {
|
||||
await window.electronAPI.writeFile(resolvedPath, JSON.stringify(projectData, null, 2));
|
||||
useEditorStore.getState().setProjectFilePath(resolvedPath);
|
||||
} else {
|
||||
const blob = new Blob([JSON.stringify(projectData, null, 2)], { type: 'application/json' });
|
||||
const url = URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = resolvedPath.split(/[\\/]/).pop() || 'project.aive';
|
||||
a.click();
|
||||
URL.revokeObjectURL(url);
|
||||
useEditorStore.getState().setProjectFilePath(resolvedPath);
|
||||
}
|
||||
} catch (err) {
|
||||
console.error('Failed to save project:', err);
|
||||
}
|
||||
}
|
||||
|
||||
function toggleCheatsheet(bindings: KeyBinding[]) {
|
||||
const existing = document.getElementById('keyboard-cheatsheet');
|
||||
if (existing) {
|
||||
existing.remove();
|
||||
return;
|
||||
}
|
||||
|
||||
const overlay = document.createElement('div');
|
||||
overlay.id = 'keyboard-cheatsheet';
|
||||
overlay.style.cssText =
|
||||
'position:fixed;inset:0;z-index:9999;display:flex;align-items:center;justify-content:center;background:rgba(0,0,0,0.7);';
|
||||
overlay.onclick = () => {
|
||||
overlay.remove();
|
||||
};
|
||||
|
||||
const presetName = JSON.stringify(bindings) === JSON.stringify(DEFAULT_PRESETS['left-hand']) ? 'Left-Hand Preset' : 'Standard Preset';
|
||||
|
||||
const rows = bindings
|
||||
.map(
|
||||
(b) =>
|
||||
`<tr><td style="padding:6px 16px 6px 0;font-family:monospace;color:#818cf8;font-weight:600;white-space:nowrap">${b.keys}</td><td style="padding:6px 0;color:#e2e8f0">${b.label}</td><td style="padding:6px 0 6px 12px;font-size:10px;color:#94a3b8">${b.category}</td></tr>`,
|
||||
)
|
||||
.join('');
|
||||
|
||||
overlay.innerHTML = `<div style="background:#1a1d27;border:1px solid #2a2d3a;border-radius:12px;padding:24px 32px;max-width:450px;position:relative;" onclick="event.stopPropagation()">
|
||||
<div style="font-size:11px;color:#94a3b8;margin-bottom:12px">Active preset: <span style="color:#818cf8;font-weight:500">${presetName}</span></div>
|
||||
<h3 style="margin:0 0 16px;font-size:14px;font-weight:600;color:#e2e8f0">Keyboard Shortcuts</h3>
|
||||
<table style="font-size:13px">${rows}</table>
|
||||
<p style="margin:16px 0 0;font-size:11px;color:#94a3b8;text-align:center">Customize in Settings • Press ? to close</p>
|
||||
<button id="cheatsheet-close" style="position:absolute;top:12px;right:16px;background:none;border:none;color:#94a3b8;font-size:18px;cursor:pointer;line-height:1;padding:4px;">×</button>
|
||||
</div>`;
|
||||
|
||||
document.body.appendChild(overlay);
|
||||
|
||||
const closeBtn = overlay.querySelector('#cheatsheet-close') as HTMLButtonElement;
|
||||
if (closeBtn) closeBtn.onclick = () => overlay.remove();
|
||||
|
||||
const escHandler = (e: KeyboardEvent) => {
|
||||
if (e.key === 'Escape') {
|
||||
overlay.remove();
|
||||
document.removeEventListener('keydown', escHandler);
|
||||
}
|
||||
};
|
||||
document.addEventListener('keydown', escHandler);
|
||||
}
|
||||
164
frontend/src/hooks/useVideoSync.ts
Normal file
164
frontend/src/hooks/useVideoSync.ts
Normal file
@ -0,0 +1,164 @@
|
||||
import { useCallback, useRef, useEffect } from 'react';
|
||||
import { useEditorStore } from '../store/editorStore';
|
||||
|
||||
/**
 * Keeps the <video> element in sync with the editor's cut / mute / speed
 * ranges.
 *
 * On every frame while playing (and once after timeupdate/seek/pause/load
 * events), `applyVideoEffects` skips playback over cut ranges, mutes the
 * element inside mute ranges, applies the per-range playback speed, and
 * mirrors the resulting time into the store.
 *
 * @param videoRef Ref to the video element to control; may be null before mount.
 * @returns `{ seekTo, togglePlay }` — cut-aware seek and play/pause toggle.
 */
export function useVideoSync(videoRef: React.RefObject<HTMLVideoElement | null>) {
  // Handle of the pending requestAnimationFrame, so it can be cancelled.
  const rafRef = useRef<number>(0);
  const {
    setCurrentTime,
    setDuration,
    setIsPlaying,
    cutRanges,
    muteRanges,
    speedRanges,
  } = useEditorStore();

  // Playback speed at a given timeline position: the first speed range
  // containing `time` wins (half-open interval [start, end)); default 1.
  const getPlaybackRateAtTime = useCallback(
    (time: number) => {
      for (const range of speedRanges) {
        if (time >= range.start && time < range.end) {
          return range.speed;
        }
      }
      return 1;
    },
    [speedRanges],
  );

  // Apply cut-skipping, muting, and speed to the element for its current
  // position, then publish the (possibly adjusted) time to the store.
  const applyVideoEffects = useCallback(
    (video: HTMLVideoElement) => {
      let t = video.currentTime;

      const allSkipRanges = [...cutRanges];
      let skipCount = 0;
      // Cap chained skips so overlapping/adjacent cut ranges can't loop forever.
      const maxSkips = 10;

      // Repeatedly jump to the end of whichever cut range contains t,
      // handling chains of back-to-back ranges.
      while (skipCount < maxSkips) {
        let shouldSkip = false;
        for (const range of allSkipRanges) {
          if (t >= range.start && t < range.end) {
            t = range.end;
            shouldSkip = true;
            skipCount++;
            break;
          }
        }
        if (!shouldSkip) break;
      }

      // Only touch currentTime when a skip actually moved it (seeking is costly).
      if (skipCount > 0 && video.currentTime !== t) {
        video.currentTime = t;
      }

      // Mute iff the adjusted position falls inside any mute range.
      let shouldMute = false;
      for (const range of muteRanges) {
        if (t >= range.start && t < range.end) {
          shouldMute = true;
          break;
        }
      }
      video.muted = shouldMute;

      // Avoid redundant playbackRate writes; some engines treat every
      // assignment as a state change.
      const playbackRate = getPlaybackRateAtTime(t);
      if (video.playbackRate !== playbackRate) {
        video.playbackRate = playbackRate;
      }

      setCurrentTime(t);
      return t;
    },
    [cutRanges, muteRanges, getPlaybackRateAtTime, setCurrentTime],
  );

  // Seek that lands outside cut ranges: a target inside a cut is pushed to
  // the range's end (chained, like applyVideoEffects).
  const seekTo = useCallback(
    (time: number) => {
      if (videoRef.current) {
        let targetTime = time;

        // If seeking into cut or deleted ranges, skip to the end (handle overlapping/chained ranges)
        const allSkipRanges = [...cutRanges];
        let skipCount = 0;
        const maxSkips = 10; // Prevent infinite loops

        while (skipCount < maxSkips) {
          let shouldSkip = false;
          for (const range of allSkipRanges) {
            if (targetTime >= range.start && targetTime < range.end) {
              targetTime = range.end;
              shouldSkip = true;
              skipCount++;
              break;
            }
          }
          if (!shouldSkip) break;
        }

        videoRef.current.currentTime = targetTime;
        videoRef.current.playbackRate = getPlaybackRateAtTime(targetTime);
        setCurrentTime(targetTime);
      }
    },
    [videoRef, cutRanges, getPlaybackRateAtTime, setCurrentTime],
  );

  // Simple play/pause toggle on the referenced element.
  const togglePlay = useCallback(() => {
    if (!videoRef.current) return;
    if (videoRef.current.paused) {
      videoRef.current.play();
    } else {
      videoRef.current.pause();
    }
  }, [videoRef]);

  // Wire the element's media events to the effect pipeline and the store.
  useEffect(() => {
    const video = videoRef.current;
    if (!video) return;

    // rAF loop that runs only while the video is actively playing.
    const updateWhilePlaying = () => {
      applyVideoEffects(video);
      if (!video.paused && !video.ended) {
        rafRef.current = requestAnimationFrame(updateWhilePlaying);
      }
    };

    // Coalesce timeupdate bursts into a single rAF-scheduled application.
    const onTimeUpdate = () => {
      cancelAnimationFrame(rafRef.current);
      rafRef.current = requestAnimationFrame(() => {
        applyVideoEffects(video);
      });
    };

    const onPlay = () => {
      setIsPlaying(true);
      cancelAnimationFrame(rafRef.current);
      rafRef.current = requestAnimationFrame(updateWhilePlaying);
    };
    const onPause = () => {
      setIsPlaying(false);
      cancelAnimationFrame(rafRef.current);
      // One final application so the paused frame reflects current ranges.
      applyVideoEffects(video);
    };
    const onLoadedMetadata = () => {
      setDuration(video.duration);
      applyVideoEffects(video);
    };
    const onSeeked = () => applyVideoEffects(video);

    video.addEventListener('timeupdate', onTimeUpdate);
    video.addEventListener('play', onPlay);
    video.addEventListener('pause', onPause);
    video.addEventListener('loadedmetadata', onLoadedMetadata);
    video.addEventListener('seeked', onSeeked);

    // Teardown: detach listeners, stop the rAF loop, restore normal speed.
    return () => {
      video.removeEventListener('timeupdate', onTimeUpdate);
      video.removeEventListener('play', onPlay);
      video.removeEventListener('pause', onPause);
      video.removeEventListener('loadedmetadata', onLoadedMetadata);
      video.removeEventListener('seeked', onSeeked);
      cancelAnimationFrame(rafRef.current);
      video.playbackRate = 1;
    };
  }, [videoRef, applyVideoEffects, setIsPlaying, setDuration]);

  return { seekTo, togglePlay };
}
|
||||
58
frontend/src/index.css
Normal file
58
frontend/src/index.css
Normal file
@ -0,0 +1,58 @@
|
||||
/* Global stylesheet: Tailwind layers, shared animations, base resets,
   scrollbar styling, and native video-control suppression. */

@tailwind base;
@tailwind components;
@tailwind utilities;

/* Pulsing waveform bar: scales vertically from its base. */
@keyframes waveBar {
  0% { transform: scaleY(0.3); opacity: 0.5; }
  50% { transform: scaleY(1); opacity: 1; }
  100% { transform: scaleY(0.3); opacity: 0.5; }
}

/* Audio-level bounce; peak height supplied per-bar via --bar-peak. */
@keyframes audioBounce {
  0% { height: 12px; }
  50% { height: var(--bar-peak); }
  100% { height: 12px; }
}

.wave-bar {
  animation: waveBar 0.9s ease-in-out infinite;
  transform-origin: bottom;
}

/* Minimal reset. */
* {
  margin: 0;
  padding: 0;
  box-sizing: border-box;
}

/* App chrome: no page scroll, no text selection (desktop-app feel). */
body {
  font-family: 'Inter', system-ui, -apple-system, sans-serif;
  overflow: hidden;
  user-select: none;
}

/* Slim custom scrollbars (WebKit/Chromium only). */
::-webkit-scrollbar {
  width: 6px;
  height: 6px;
}

::-webkit-scrollbar-track {
  background: transparent;
}

::-webkit-scrollbar-thumb {
  background: #2a2d3a;
  border-radius: 3px;
}

::-webkit-scrollbar-thumb:hover {
  background: #3a3d4a;
}

/* Hide native media controls; the app renders its own transport UI. */
video::-webkit-media-controls {
  display: none !important;
}
|
||||
|
||||
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user