Initial CutScript release - Open-source AI-powered text-based video editor

CutScript is a local-first, Descript-like video editor where you edit video by editing text. Delete a word from the transcript and it's cut from the video.

Features:
- Word-level transcription with WhisperX
- Text-based video editing with undo/redo
- AI filler word removal (Ollama/OpenAI/Claude)
- AI clip creation for shorts
- Waveform timeline with virtualized transcript
- FFmpeg stream-copy (fast) and re-encode (4K) export
- Caption burn-in and sidecar SRT generation
- Studio Sound audio enhancement (DeepFilterNet)
- Keyboard shortcuts (J/K/L, Space, Delete, Ctrl+Z/S/E)
- Encrypted API key storage
- Project save/load (.aive files)

Architecture:
- Electron + React + Tailwind (frontend)
- FastAPI + Python (backend)
- WhisperX for transcription
- FFmpeg for video processing
- Multi-provider AI support

Performance optimizations:
- RAF-throttled time updates
- Zustand selectors for granular subscriptions
- Dual-canvas waveform rendering
- Virtualized transcript with react-virtuoso

Built on top of DataAnts-AI/VideoTranscriber, completely rewritten as a desktop application.

License: MIT
2026-03-03 06:31:04 -05:00
parent d1e1fedcae
commit 33cca5f552
73 changed files with 7463 additions and 3906 deletions
--- a/backend/routers/__init__.py
+++ b/backend/routers/__init__.py
--- a/backend/routers/ai.py
+++ b/backend/routers/ai.py
@ -0,0 +1,83 @@
+"""AI feature endpoints: filler word detection, clip creation, Ollama model listing."""
+
+import logging
+from typing import List, Optional
+
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel
+
+from services.ai_provider import AIProvider, detect_filler_words, create_clip_suggestion
+
+logger = logging.getLogger(__name__)
+router = APIRouter()
+
+
+class WordInfo(BaseModel):
+    """One transcript word as submitted to the AI endpoints."""
+    # Caller-supplied index of the word (presumably its position in the
+    # transcript word list; confirm against the frontend).
+    index: int
+    # The word text.
+    word: str
+    # Optional timing bounds; both default to None when the caller omits them
+    # (presumably seconds; confirm).
+    start: Optional[float] = None
+    end: Optional[float] = None
+
+
+class FillerRequest(BaseModel):
+    """Request body for POST /ai/filler-removal."""
+    # Full transcript text and its per-word breakdown.
+    transcript: str
+    words: List[WordInfo]
+    # AI provider selection; defaults to "ollama". The remaining provider
+    # settings are optional and forwarded as-is to detect_filler_words.
+    provider: str = "ollama"
+    model: Optional[str] = None
+    api_key: Optional[str] = None
+    base_url: Optional[str] = None
+    # Optional extra filler words supplied as a single string
+    # (separator format is not visible here; confirm in services.ai_provider).
+    custom_filler_words: Optional[str] = None
+
+
+class ClipRequest(BaseModel):
+    """Request body for POST /ai/create-clip."""
+    # Full transcript text and its per-word breakdown.
+    transcript: str
+    words: List[WordInfo]
+    # AI provider selection; defaults to "ollama". The remaining provider
+    # settings are optional and forwarded as-is to create_clip_suggestion.
+    provider: str = "ollama"
+    model: Optional[str] = None
+    api_key: Optional[str] = None
+    base_url: Optional[str] = None
+    # Desired clip length, default 60 (presumably seconds; confirm).
+    target_duration: int = 60
+
+
+@router.post("/ai/filler-removal")
+async def filler_removal(req: FillerRequest):
+    try:
+        words_dicts = [w.model_dump() for w in req.words]
+        result = detect_filler_words(
+            transcript=req.transcript,
+            words=words_dicts,
+            provider=req.provider,
+            model=req.model,
+            api_key=req.api_key,
+            base_url=req.base_url,
+            custom_filler_words=req.custom_filler_words,
+        )
+        return result
+    except Exception as e:
+        logger.error(f"Filler detection failed: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/ai/create-clip")
+async def create_clip(req: ClipRequest):
+    try:
+        words_dicts = [w.model_dump() for w in req.words]
+        result = create_clip_suggestion(
+            transcript=req.transcript,
+            words=words_dicts,
+            target_duration=req.target_duration,
+            provider=req.provider,
+            model=req.model,
+            api_key=req.api_key,
+            base_url=req.base_url,
+        )
+        return result
+    except Exception as e:
+        logger.error(f"Clip creation failed: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/ai/ollama-models")
+async def ollama_models(base_url: str = "http://localhost:11434"):
+    models = AIProvider.list_ollama_models(base_url)
+    return {"models": models}
--- a/backend/routers/audio.py
+++ b/backend/routers/audio.py
@ -0,0 +1,38 @@
+"""Audio processing endpoint (noise reduction / Studio Sound)."""
+
+import logging
+from typing import Optional
+
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel
+
+from services.audio_cleaner import clean_audio, is_deepfilter_available
+
+logger = logging.getLogger(__name__)
+router = APIRouter()
+
+
+class AudioCleanRequest(BaseModel):
+    """Request body for POST /audio/clean."""
+    # Path of the file to clean.
+    input_path: str
+    # Optional destination; when omitted the handler passes "" to clean_audio.
+    output_path: Optional[str] = None
+
+
+@router.post("/audio/clean")
+async def clean_audio_endpoint(req: AudioCleanRequest):
+    try:
+        output = clean_audio(req.input_path, req.output_path or "")
+        return {
+            "status": "ok",
+            "output_path": output,
+            "engine": "deepfilternet" if is_deepfilter_available() else "ffmpeg_anlmdn",
+        }
+    except Exception as e:
+        logger.error(f"Audio cleaning failed: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/audio/capabilities")
+async def audio_capabilities():
+    return {
+        "deepfilternet_available": is_deepfilter_available(),
+    }
--- a/backend/routers/captions.py
+++ b/backend/routers/captions.py
@ -0,0 +1,65 @@
+"""Caption generation endpoint."""
+
+import logging
+from typing import List, Optional
+
+from fastapi import APIRouter, HTTPException
+from fastapi.responses import PlainTextResponse
+from pydantic import BaseModel
+
+from services.caption_generator import generate_srt, generate_vtt, generate_ass, save_captions
+
+logger = logging.getLogger(__name__)
+router = APIRouter()
+
+
+class CaptionWord(BaseModel):
+    """One timed word used as caption input."""
+    word: str
+    # Required timing bounds of the word (presumably seconds; confirm).
+    start: float
+    end: float
+    # Defaults to 0.0 when the caller does not supply a confidence value.
+    confidence: float = 0.0
+
+
+class CaptionStyle(BaseModel):
+    """Styling options for captions; the handler only forwards them to generate_ass."""
+    fontName: str = "Arial"
+    fontSize: int = 48
+    # NOTE(review): the defaults look like ASS-style &HAABBGGRR colour strings;
+    # confirm against services.caption_generator.
+    fontColor: str = "&H00FFFFFF"
+    backgroundColor: str = "&H80000000"
+    # Defaults to "bottom"; accepted values are not visible in this file.
+    position: str = "bottom"
+    bold: bool = True
+
+
+class CaptionRequest(BaseModel):
+    """Request body for POST /captions."""
+    words: List[CaptionWord]
+    # Word indices to exclude; the handler converts this list to a set.
+    deleted_indices: List[int] = []
+    # The handler recognises "srt", "vtt" and "ass"; anything else is rejected.
+    format: str = "srt"
+    words_per_line: int = 8
+    # Only consulted when format is "ass".
+    style: Optional[CaptionStyle] = None
+    # When set, the captions are saved there and the saved path is returned;
+    # otherwise the caption text itself is returned.
+    output_path: Optional[str] = None
+
+
+@router.post("/captions")
+async def generate_captions(req: CaptionRequest):
+    try:
+        words_dicts = [w.model_dump() for w in req.words]
+        deleted_set = set(req.deleted_indices)
+
+        if req.format == "srt":
+            content = generate_srt(words_dicts, deleted_set, req.words_per_line)
+        elif req.format == "vtt":
+            content = generate_vtt(words_dicts, deleted_set, req.words_per_line)
+        elif req.format == "ass":
+            style_dict = req.style.model_dump() if req.style else None
+            content = generate_ass(words_dicts, deleted_set, req.words_per_line, style_dict)
+        else:
+            raise HTTPException(status_code=400, detail=f"Unknown format: {req.format}")
+
+        if req.output_path:
+            saved = save_captions(content, req.output_path)
+            return {"status": "ok", "output_path": saved}
+
+        return PlainTextResponse(content, media_type="text/plain")
+
+    except Exception as e:
+        logger.error(f"Caption generation failed: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
--- a/backend/routers/export.py
+++ b/backend/routers/export.py
@ -0,0 +1,156 @@
+"""Export endpoint for video cutting and rendering."""
+
+import logging
+import tempfile
+import os
+from typing import List, Optional
+
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel
+
+from services.video_editor import export_stream_copy, export_reencode, export_reencode_with_subs
+from services.audio_cleaner import clean_audio
+from services.caption_generator import generate_srt, generate_ass, save_captions
+
+logger = logging.getLogger(__name__)
+router = APIRouter()
+
+
+class SegmentModel(BaseModel):
+    """A span of the source video to keep in the export."""
+    # Segment bounds (presumably seconds into the input; confirm).
+    start: float
+    end: float
+
+
+class ExportWordModel(BaseModel):
+    """One timed word, used to build captions during export."""
+    word: str
+    # Required timing bounds of the word (presumably seconds; confirm).
+    start: float
+    end: float
+    # Defaults to 0.0 when the caller does not supply a confidence value.
+    confidence: float = 0.0
+
+
+class ExportRequest(BaseModel):
+    """Request body for POST /export."""
+    input_path: str
+    output_path: str
+    # Spans of the input to keep; the handler rejects an empty list.
+    keep_segments: List[SegmentModel]
+    # "fast" selects stream copy, but only for a single segment without
+    # burn-in captions; every other case is re-encoded.
+    mode: str = "fast"
+    # Forwarded to the re-encode functions as resolution / format_hint.
+    resolution: str = "1080p"
+    format: str = "mp4"
+    # When true, the exported file's audio is cleaned and muxed back in.
+    enhanceAudio: bool = False
+    # The handler acts on "burn-in" and "sidecar"; other values produce no captions.
+    captions: str = "none"
+    # Caption inputs; only used when captions is "burn-in" or "sidecar".
+    words: Optional[List[ExportWordModel]] = None
+    deleted_indices: Optional[List[int]] = None
+
+
+def _mux_audio(video_path: str, audio_path: str, output_path: str) -> str:
+    """Replace video's audio track with cleaned audio using FFmpeg."""
+    import subprocess
+    cmd = [
+        "ffmpeg", "-y",
+        "-i", video_path,
+        "-i", audio_path,
+        "-c:v", "copy",
+        "-map", "0:v:0",
+        "-map", "1:a:0",
+        "-shortest",
+        output_path,
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        raise RuntimeError(f"Audio mux failed: {result.stderr[-300:]}")
+    return output_path
+
+
+@router.post("/export")
+async def export_video(req: ExportRequest):
+    try:
+        segments = [{"start": s.start, "end": s.end} for s in req.keep_segments]
+
+        if not segments:
+            raise HTTPException(status_code=400, detail="No segments to export")
+
+        use_stream_copy = req.mode == "fast" and len(segments) == 1
+        needs_reencode_for_subs = req.captions == "burn-in"
+
+        # Burn-in captions require re-encode
+        if needs_reencode_for_subs:
+            use_stream_copy = False
+
+        words_dicts = [w.model_dump() for w in req.words] if req.words else []
+        deleted_set = set(req.deleted_indices or [])
+
+        # Generate ASS file for burn-in
+        ass_path = None
+        if req.captions == "burn-in" and words_dicts:
+            ass_content = generate_ass(words_dicts, deleted_set)
+            tmp = tempfile.NamedTemporaryFile(suffix=".ass", delete=False, mode="w", encoding="utf-8")
+            tmp.write(ass_content)
+            tmp.close()
+            ass_path = tmp.name
+
+        try:
+            if use_stream_copy:
+                output = export_stream_copy(req.input_path, req.output_path, segments)
+            elif ass_path:
+                output = export_reencode_with_subs(
+                    req.input_path,
+                    req.output_path,
+                    segments,
+                    ass_path,
+                    resolution=req.resolution,
+                    format_hint=req.format,
+                )
+            else:
+                output = export_reencode(
+                    req.input_path,
+                    req.output_path,
+                    segments,
+                    resolution=req.resolution,
+                    format_hint=req.format,
+                )
+        finally:
+            if ass_path and os.path.exists(ass_path):
+                os.unlink(ass_path)
+
+        # Audio enhancement: clean, then mux back into the exported video
+        if req.enhanceAudio:
+            try:
+                tmp_dir = tempfile.mkdtemp(prefix="cutscript_audio_")
+                cleaned_audio = os.path.join(tmp_dir, "cleaned.wav")
+                clean_audio(output, cleaned_audio)
+
+                muxed_path = output + ".muxed.mp4"
+                _mux_audio(output, cleaned_audio, muxed_path)
+
+                os.replace(muxed_path, output)
+                logger.info(f"Audio enhanced and muxed into {output}")
+
+                # Cleanup
+                try:
+                    os.remove(cleaned_audio)
+                    os.rmdir(tmp_dir)
+                except OSError:
+                    pass
+            except Exception as e:
+                logger.warning(f"Audio enhancement failed (non-fatal): {e}")
+
+        # Sidecar SRT: generate and save alongside video
+        srt_path = None
+        if req.captions == "sidecar" and words_dicts:
+            srt_content = generate_srt(words_dicts, deleted_set)
+            srt_path = req.output_path.rsplit(".", 1)[0] + ".srt"
+            save_captions(srt_content, srt_path)
+            logger.info(f"Sidecar SRT saved to {srt_path}")
+
+        result = {"status": "ok", "output_path": output}
+        if srt_path:
+            result["srt_path"] = srt_path
+        return result
+
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+    except RuntimeError as e:
+        logger.error(f"Export failed: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+    except Exception as e:
+        logger.error(f"Export error: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
--- a/backend/routers/transcribe.py
+++ b/backend/routers/transcribe.py
@ -0,0 +1,53 @@
+"""Transcription endpoint using WhisperX."""
+
+import logging
+from typing import Optional
+
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel
+
+from services.transcription import transcribe_audio
+from services.diarization import diarize_and_label
+
+logger = logging.getLogger(__name__)
+router = APIRouter()
+
+
+class TranscribeRequest(BaseModel):
+    """Request body for POST /transcribe."""
+    # Path of the media file; also used as the audio path for diarization.
+    file_path: str
+    # Passed to transcribe_audio as model_name; defaults to "base".
+    model: str = "base"
+    # Optional language, forwarded as-is (None when omitted).
+    language: Optional[str] = None
+    use_gpu: bool = True
+    use_cache: bool = True
+    # Diarization only runs when diarize is true AND hf_token is non-empty.
+    diarize: bool = False
+    hf_token: Optional[str] = None
+    # Optional speaker count forwarded to diarize_and_label.
+    num_speakers: Optional[int] = None
+
+
+@router.post("/transcribe")
+async def transcribe(req: TranscribeRequest):
+    try:
+        result = transcribe_audio(
+            file_path=req.file_path,
+            model_name=req.model,
+            use_gpu=req.use_gpu,
+            use_cache=req.use_cache,
+            language=req.language,
+        )
+
+        if req.diarize and req.hf_token:
+            result = diarize_and_label(
+                transcription_result=result,
+                audio_path=req.file_path,
+                hf_token=req.hf_token,
+                num_speakers=req.num_speakers,
+                use_gpu=req.use_gpu,
+            )
+
+        return result
+
+    except FileNotFoundError:
+        raise HTTPException(status_code=404, detail=f"File not found: {req.file_path}")
+    except Exception as e:
+        logger.error(f"Transcription failed: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))