TalkEdit/backend/routers/audio.py

"""Audio processing endpoint (noise reduction / Studio Sound)."""

import asyncio
import hashlib
import logging
import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import Optional

from fastapi import APIRouter, HTTPException, Query
from fastapi.responses import FileResponse
from pydantic import BaseModel

from services.audio_cleaner import clean_audio, detect_silence_ranges, is_deepfilter_available

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Router on which the endpoints in this module register themselves.
router = APIRouter()

# Simple in-process cache: md5 of "<source path>:<mtime>" → extracted WAV path.
# An entry is only dropped when its WAV file no longer exists on disk.
_waveform_cache: dict[str, str] = {}


class AudioCleanRequest(BaseModel):
    # Request body for POST /audio/clean.

    # Path of the file to clean; handed to clean_audio() unchanged.
    input_path: str
    # Optional destination path. When omitted (or empty) the endpoint passes
    # "" to clean_audio() instead.
    output_path: Optional[str] = None


class SilenceDetectRequest(BaseModel):
    # Request body for POST /audio/detect-silence.

    # Path of the file to analyse; handed to detect_silence_ranges() unchanged.
    input_path: str
    # Second argument to detect_silence_ranges(); presumably the minimum length
    # (in milliseconds) a quiet stretch must have to count — confirm in
    # services.audio_cleaner.
    min_silence_ms: int = 500
    # Third argument to detect_silence_ranges(); presumably the loudness
    # threshold in dB below which audio counts as silence — confirm in
    # services.audio_cleaner.
    silence_db: float = -35.0


@router.post("/audio/clean")
async def clean_audio_endpoint(req: AudioCleanRequest):
    """Clean the audio at ``req.input_path`` and report where the result went.

    Args:
        req: Input path plus an optional output path. When no output path is
            given, an empty string is passed to ``clean_audio``.

    Returns:
        A dict with ``status`` ("ok"), ``output_path`` (whatever
        ``clean_audio`` returned) and ``engine``. The engine label is derived
        from ``is_deepfilter_available()``, not reported by ``clean_audio``
        itself.

    Raises:
        HTTPException: 500 with the error text if anything in the call fails.
    """
    try:
        # clean_audio() is synchronous; calling it directly from this async
        # handler would stall the event loop (and every other request) for the
        # whole duration of the job, so it runs in a worker thread instead.
        output = await asyncio.to_thread(
            clean_audio, req.input_path, req.output_path or ""
        )
        return {
            "status": "ok",
            "output_path": output,
            "engine": "deepfilternet" if is_deepfilter_available() else "ffmpeg_anlmdn",
        }
    except Exception as e:
        logger.error(f"Audio cleaning failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e


@router.get("/audio/capabilities")
async def audio_capabilities():
    """Report whether the DeepFilterNet engine is available to this backend."""
    deepfilter_ready = is_deepfilter_available()
    return {"deepfilternet_available": deepfilter_ready}


@router.post("/audio/detect-silence")
async def detect_silence_endpoint(req: SilenceDetectRequest):
    """Run silence detection on ``req.input_path`` and return the ranges found.

    Args:
        req: Input path, ``min_silence_ms`` and ``silence_db``; all three are
            forwarded positionally to ``detect_silence_ranges``.

    Returns:
        A dict with ``status`` ("ok"), ``ranges`` (the value returned by
        ``detect_silence_ranges``) and ``count`` (its length).

    Raises:
        HTTPException: 500 with the error text if detection fails.
    """
    try:
        # detect_silence_ranges() is synchronous; run it in a worker thread so
        # the event loop stays free to serve other requests while it works.
        ranges = await asyncio.to_thread(
            detect_silence_ranges,
            req.input_path,
            req.min_silence_ms,
            req.silence_db,
        )
        return {
            "status": "ok",
            "ranges": ranges,
            "count": len(ranges),
        }
    except Exception as e:
        logger.error(f"Silence detection failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e


def _extract_waveform_wav(source: Path, destination: Path) -> subprocess.CompletedProcess:
    """Run FFmpeg synchronously to write a small WAV of *source* to *destination*.

    This call blocks until FFmpeg exits, so invoke it from a worker thread when
    running inside the event loop. Raises ``OSError`` if the ``ffmpeg``
    executable cannot be started (e.g. it is not on PATH).
    """
    # Downsample to mono 22050 Hz — enough for waveform drawing, small file
    cmd = [
        "ffmpeg", "-y",
        "-i", str(source),
        "-vn",                    # drop video
        "-ac", "1",               # mono
        "-ar", "22050",           # 22 kHz sample rate
        "-acodec", "pcm_s16le",   # 16-bit PCM WAV
        str(destination),
    ]
    return subprocess.run(cmd, capture_output=True, text=True)


@router.get("/audio/waveform")
async def get_waveform_audio(path: str = Query(...)):
    """
    Extract audio from any video/audio file and return it as a WAV.
    The WAV is cached on disk for subsequent requests.
    Uses FFmpeg directly so it works with MKV, MOV, AVI, MP4, etc.

    Args:
        path: Filesystem path of the source media file.

    Returns:
        A FileResponse (audio/wav) serving the extracted or cached WAV.

    Raises:
        HTTPException: 404 if *path* is not an existing file; 500 if FFmpeg
            cannot be started, exits non-zero, or produces an empty file.
    """
    file_path = Path(path)
    if not file_path.is_file():
        logger.warning(f"[waveform] File not found: {path}")
        raise HTTPException(status_code=404, detail=f"File not found: {path}")

    # Cache key based on path + mtime so stale cache is auto-invalidated
    mtime = file_path.stat().st_mtime
    cache_key = hashlib.md5(f"{path}:{mtime}".encode()).hexdigest()

    cached_path = _waveform_cache.get(cache_key)
    if cached_path is not None:
        if Path(cached_path).exists():
            logger.info(f"[waveform] Cache hit for {file_path.name}")
            return FileResponse(cached_path, media_type="audio/wav")
        # The cached WAV has disappeared from disk; forget it and re-extract.
        _waveform_cache.pop(cache_key, None)

    logger.info(f"[waveform] Extracting audio from: {file_path.name}")
    tmp_dir = tempfile.mkdtemp(prefix="talkedit_waveform_")
    out_wav = Path(tmp_dir) / f"{cache_key}.wav"

    # mkdtemp() never cleans up after itself, so unless the WAV ends up in the
    # cache the directory is removed on the way out (every failure path).
    keep_output = False
    try:
        try:
            # FFmpeg can run for a long time on big files; keep it off the
            # event loop so other requests are still served meanwhile.
            result = await asyncio.to_thread(_extract_waveform_wav, file_path, out_wav)
        except OSError as e:
            logger.error(f"[waveform] Could not run FFmpeg for {file_path.name}: {e}")
            raise HTTPException(
                status_code=500,
                detail=f"Failed to extract audio: {e}"
            ) from e

        if result.returncode != 0:
            logger.error(f"[waveform] FFmpeg failed for {file_path.name}: {result.stderr[-500:]}")
            raise HTTPException(
                status_code=500,
                detail=f"Failed to extract audio: {result.stderr[-300:]}"
            )

        wav_size = out_wav.stat().st_size if out_wav.exists() else 0
        if wav_size == 0:
            logger.error(f"[waveform] FFmpeg produced empty WAV for {file_path.name}")
            raise HTTPException(status_code=500, detail="Audio extraction produced empty file")

        logger.info(f"[waveform] Extracted {wav_size} bytes for {file_path.name}")

        # Another request for the same file may have finished while FFmpeg was
        # running in the worker thread; if so, serve its WAV and discard ours.
        winner = _waveform_cache.get(cache_key)
        if winner is not None and Path(winner).exists():
            return FileResponse(winner, media_type="audio/wav")

        _waveform_cache[cache_key] = str(out_wav)
        keep_output = True
        return FileResponse(str(out_wav), media_type="audio/wav")
    finally:
        if not keep_output:
            shutil.rmtree(tmp_dir, ignore_errors=True)
Initial CutScript release - Open-source AI-powered text-based video editor CutScript is a local-first, Descript-like video editor where you edit video by editing text. Delete a word from the transcript and it's cut from the video. Features: - Word-level transcription with WhisperX - Text-based video editing with undo/redo - AI filler word removal (Ollama/OpenAI/Claude) - AI clip creation for shorts - Waveform timeline with virtualized transcript - FFmpeg stream-copy (fast) and re-encode (4K) export - Caption burn-in and sidecar SRT generation - Studio Sound audio enhancement (DeepFilterNet) - Keyboard shortcuts (J/K/L, Space, Delete, Ctrl+Z/S/E) - Encrypted API key storage - Project save/load (.aive files) Architecture: - Electron + React + Tailwind (frontend) - FastAPI + Python (backend) - WhisperX for transcription - FFmpeg for video processing - Multi-provider AI support Performance optimizations: - RAF-throttled time updates - Zustand selectors for granular subscriptions - Dual-canvas waveform rendering - Virtualized transcript with react-virtuoso Built on top of DataAnts-AI/VideoTranscriber, completely rewritten as a desktop application. License: MIT 2026-03-03 06:31:04 -05:00			`"""Audio processing endpoint (noise reduction / Studio Sound)."""`

changed to python312 2026-03-28 12:26:45 -06:00			`import hashlib`
Initial CutScript release - Open-source AI-powered text-based video editor CutScript is a local-first, Descript-like video editor where you edit video by editing text. Delete a word from the transcript and it's cut from the video. Features: - Word-level transcription with WhisperX - Text-based video editing with undo/redo - AI filler word removal (Ollama/OpenAI/Claude) - AI clip creation for shorts - Waveform timeline with virtualized transcript - FFmpeg stream-copy (fast) and re-encode (4K) export - Caption burn-in and sidecar SRT generation - Studio Sound audio enhancement (DeepFilterNet) - Keyboard shortcuts (J/K/L, Space, Delete, Ctrl+Z/S/E) - Encrypted API key storage - Project save/load (.aive files) Architecture: - Electron + React + Tailwind (frontend) - FastAPI + Python (backend) - WhisperX for transcription - FFmpeg for video processing - Multi-provider AI support Performance optimizations: - RAF-throttled time updates - Zustand selectors for granular subscriptions - Dual-canvas waveform rendering - Virtualized transcript with react-virtuoso Built on top of DataAnts-AI/VideoTranscriber, completely rewritten as a desktop application. License: MIT 2026-03-03 06:31:04 -05:00			`import logging`
changed to python312 2026-03-28 12:26:45 -06:00			`import subprocess`
			`import tempfile`
			`from pathlib import Path`
Initial CutScript release - Open-source AI-powered text-based video editor CutScript is a local-first, Descript-like video editor where you edit video by editing text. Delete a word from the transcript and it's cut from the video. Features: - Word-level transcription with WhisperX - Text-based video editing with undo/redo - AI filler word removal (Ollama/OpenAI/Claude) - AI clip creation for shorts - Waveform timeline with virtualized transcript - FFmpeg stream-copy (fast) and re-encode (4K) export - Caption burn-in and sidecar SRT generation - Studio Sound audio enhancement (DeepFilterNet) - Keyboard shortcuts (J/K/L, Space, Delete, Ctrl+Z/S/E) - Encrypted API key storage - Project save/load (.aive files) Architecture: - Electron + React + Tailwind (frontend) - FastAPI + Python (backend) - WhisperX for transcription - FFmpeg for video processing - Multi-provider AI support Performance optimizations: - RAF-throttled time updates - Zustand selectors for granular subscriptions - Dual-canvas waveform rendering - Virtualized transcript with react-virtuoso Built on top of DataAnts-AI/VideoTranscriber, completely rewritten as a desktop application. License: MIT 2026-03-03 06:31:04 -05:00			`from typing import Optional`

changed to python312 2026-03-28 12:26:45 -06:00			`from fastapi import APIRouter, HTTPException, Query`
			`from fastapi.responses import FileResponse`
Initial CutScript release - Open-source AI-powered text-based video editor CutScript is a local-first, Descript-like video editor where you edit video by editing text. Delete a word from the transcript and it's cut from the video. Features: - Word-level transcription with WhisperX - Text-based video editing with undo/redo - AI filler word removal (Ollama/OpenAI/Claude) - AI clip creation for shorts - Waveform timeline with virtualized transcript - FFmpeg stream-copy (fast) and re-encode (4K) export - Caption burn-in and sidecar SRT generation - Studio Sound audio enhancement (DeepFilterNet) - Keyboard shortcuts (J/K/L, Space, Delete, Ctrl+Z/S/E) - Encrypted API key storage - Project save/load (.aive files) Architecture: - Electron + React + Tailwind (frontend) - FastAPI + Python (backend) - WhisperX for transcription - FFmpeg for video processing - Multi-provider AI support Performance optimizations: - RAF-throttled time updates - Zustand selectors for granular subscriptions - Dual-canvas waveform rendering - Virtualized transcript with react-virtuoso Built on top of DataAnts-AI/VideoTranscriber, completely rewritten as a desktop application. License: MIT 2026-03-03 06:31:04 -05:00			`from pydantic import BaseModel`

silence trimmer 2026-04-03 12:05:44 -06:00			`from services.audio_cleaner import clean_audio, detect_silence_ranges, is_deepfilter_available`
Initial CutScript release - Open-source AI-powered text-based video editor CutScript is a local-first, Descript-like video editor where you edit video by editing text. Delete a word from the transcript and it's cut from the video. Features: - Word-level transcription with WhisperX - Text-based video editing with undo/redo - AI filler word removal (Ollama/OpenAI/Claude) - AI clip creation for shorts - Waveform timeline with virtualized transcript - FFmpeg stream-copy (fast) and re-encode (4K) export - Caption burn-in and sidecar SRT generation - Studio Sound audio enhancement (DeepFilterNet) - Keyboard shortcuts (J/K/L, Space, Delete, Ctrl+Z/S/E) - Encrypted API key storage - Project save/load (.aive files) Architecture: - Electron + React + Tailwind (frontend) - FastAPI + Python (backend) - WhisperX for transcription - FFmpeg for video processing - Multi-provider AI support Performance optimizations: - RAF-throttled time updates - Zustand selectors for granular subscriptions - Dual-canvas waveform rendering - Virtualized transcript with react-virtuoso Built on top of DataAnts-AI/VideoTranscriber, completely rewritten as a desktop application. License: MIT 2026-03-03 06:31:04 -05:00
			`logger = logging.getLogger(__name__)`
			`router = APIRouter()`

changed to python312 2026-03-28 12:26:45 -06:00			`# Simple in-process cache: video path → extracted WAV path`
			`_waveform_cache: dict[str, str] = {}`

Initial CutScript release - Open-source AI-powered text-based video editor CutScript is a local-first, Descript-like video editor where you edit video by editing text. Delete a word from the transcript and it's cut from the video. Features: - Word-level transcription with WhisperX - Text-based video editing with undo/redo - AI filler word removal (Ollama/OpenAI/Claude) - AI clip creation for shorts - Waveform timeline with virtualized transcript - FFmpeg stream-copy (fast) and re-encode (4K) export - Caption burn-in and sidecar SRT generation - Studio Sound audio enhancement (DeepFilterNet) - Keyboard shortcuts (J/K/L, Space, Delete, Ctrl+Z/S/E) - Encrypted API key storage - Project save/load (.aive files) Architecture: - Electron + React + Tailwind (frontend) - FastAPI + Python (backend) - WhisperX for transcription - FFmpeg for video processing - Multi-provider AI support Performance optimizations: - RAF-throttled time updates - Zustand selectors for granular subscriptions - Dual-canvas waveform rendering - Virtualized transcript with react-virtuoso Built on top of DataAnts-AI/VideoTranscriber, completely rewritten as a desktop application. License: MIT 2026-03-03 06:31:04 -05:00
			`class AudioCleanRequest(BaseModel):`
			`input_path: str`
			`output_path: Optional[str] = None`


silence trimmer 2026-04-03 12:05:44 -06:00			`class SilenceDetectRequest(BaseModel):`
			`input_path: str`
			`min_silence_ms: int = 500`
			`silence_db: float = -35.0`


Initial CutScript release - Open-source AI-powered text-based video editor CutScript is a local-first, Descript-like video editor where you edit video by editing text. Delete a word from the transcript and it's cut from the video. Features: - Word-level transcription with WhisperX - Text-based video editing with undo/redo - AI filler word removal (Ollama/OpenAI/Claude) - AI clip creation for shorts - Waveform timeline with virtualized transcript - FFmpeg stream-copy (fast) and re-encode (4K) export - Caption burn-in and sidecar SRT generation - Studio Sound audio enhancement (DeepFilterNet) - Keyboard shortcuts (J/K/L, Space, Delete, Ctrl+Z/S/E) - Encrypted API key storage - Project save/load (.aive files) Architecture: - Electron + React + Tailwind (frontend) - FastAPI + Python (backend) - WhisperX for transcription - FFmpeg for video processing - Multi-provider AI support Performance optimizations: - RAF-throttled time updates - Zustand selectors for granular subscriptions - Dual-canvas waveform rendering - Virtualized transcript with react-virtuoso Built on top of DataAnts-AI/VideoTranscriber, completely rewritten as a desktop application. License: MIT 2026-03-03 06:31:04 -05:00			`@router.post("/audio/clean")`
			`async def clean_audio_endpoint(req: AudioCleanRequest):`
			`try:`
			`output = clean_audio(req.input_path, req.output_path or "")`
			`return {`
			`"status": "ok",`
			`"output_path": output,`
			`"engine": "deepfilternet" if is_deepfilter_available() else "ffmpeg_anlmdn",`
			`}`
			`except Exception as e:`
			`logger.error(f"Audio cleaning failed: {e}", exc_info=True)`
			`raise HTTPException(status_code=500, detail=str(e))`


			`@router.get("/audio/capabilities")`
			`async def audio_capabilities():`
			`return {`
			`"deepfilternet_available": is_deepfilter_available(),`
			`}`
changed to python312 2026-03-28 12:26:45 -06:00

silence trimmer 2026-04-03 12:05:44 -06:00			`@router.post("/audio/detect-silence")`
			`async def detect_silence_endpoint(req: SilenceDetectRequest):`
			`try:`
			`ranges = detect_silence_ranges(`
			`req.input_path,`
			`req.min_silence_ms,`
			`req.silence_db,`
			`)`
			`return {`
			`"status": "ok",`
			`"ranges": ranges,`
			`"count": len(ranges),`
			`}`
			`except Exception as e:`
			`logger.error(f"Silence detection failed: {e}", exc_info=True)`
			`raise HTTPException(status_code=500, detail=str(e))`


changed to python312 2026-03-28 12:26:45 -06:00			`@router.get("/audio/waveform")`
			`async def get_waveform_audio(path: str = Query(...)):`
			`"""`
			`Extract audio from any video/audio file and return it as a WAV.`
			`The WAV is cached on disk for subsequent requests.`
			`Uses FFmpeg directly so it works with MKV, MOV, AVI, MP4, etc.`
			`"""`
			`file_path = Path(path)`
			`if not file_path.is_file():`
			`logger.warning(f"[waveform] File not found: {path}")`
			`raise HTTPException(status_code=404, detail=f"File not found: {path}")`

			`# Cache key based on path + mtime so stale cache is auto-invalidated`
			`mtime = file_path.stat().st_mtime`
			`cache_key = hashlib.md5(f"{path}:{mtime}".encode()).hexdigest()`

			`if cache_key in _waveform_cache:`
			`cached = Path(_waveform_cache[cache_key])`
			`if cached.exists():`
			`logger.info(f"[waveform] Cache hit for {file_path.name}")`
			`return FileResponse(str(cached), media_type="audio/wav")`
			`else:`
			`del _waveform_cache[cache_key]`

			`logger.info(f"[waveform] Extracting audio from: {file_path.name}")`
			`tmp_dir = tempfile.mkdtemp(prefix="talkedit_waveform_")`
			`out_wav = Path(tmp_dir) / f"{cache_key}.wav"`

			`# Downsample to mono 22050 Hz — enough for waveform drawing, small file`
			`cmd = [`
			`"ffmpeg", "-y",`
			`"-i", str(file_path),`
			`"-vn", # drop video`
			`"-ac", "1", # mono`
			`"-ar", "22050", # 22 kHz sample rate`
			`"-acodec", "pcm_s16le", # 16-bit PCM WAV`
			`str(out_wav),`
			`]`
			`result = subprocess.run(cmd, capture_output=True, text=True)`
			`if result.returncode != 0:`
			`logger.error(f"[waveform] FFmpeg failed for {file_path.name}: {result.stderr[-500:]}")`
			`raise HTTPException(`
			`status_code=500,`
			`detail=f"Failed to extract audio: {result.stderr[-300:]}"`
			`)`

			`if not out_wav.exists() or out_wav.stat().st_size == 0:`
			`logger.error(f"[waveform] FFmpeg produced empty WAV for {file_path.name}")`
			`raise HTTPException(status_code=500, detail="Audio extraction produced empty file")`

			`logger.info(f"[waveform] Extracted {out_wav.stat().st_size} bytes for {file_path.name}")`
			`_waveform_cache[cache_key] = str(out_wav)`
			`return FileResponse(str(out_wav), media_type="audio/wav")`