implemented the lower priority features; haven't tested them

2026-05-05 20:46:55 -06:00
parent cde635a660
commit 4d4dfa7f7c
12 changed files with 957 additions and 60 deletions
--- a/backend/routers/export.py
+++ b/backend/routers/export.py
@ -8,9 +8,10 @@ from typing import List, Optional
 from fastapi import APIRouter, HTTPException
 from pydantic import BaseModel

-from services.video_editor import export_stream_copy, export_reencode, export_reencode_with_subs
+from services.video_editor import export_stream_copy, export_reencode, export_reencode_with_subs, mix_background_music, concat_clips
 from services.audio_cleaner import clean_audio
 from services.caption_generator import generate_srt, generate_ass, save_captions
+from services.background_removal import remove_background_on_export as remove_bg

 logger = logging.getLogger(__name__)
 router = APIRouter()
@ -36,6 +37,22 @@ class ExportWordModel(BaseModel):
    confidence: float = 0.0


+class ZoomConfigModel(BaseModel):
+    enabled: bool = False
+    zoomFactor: float = 1.0
+    panX: float = 0.0
+    panY: float = 0.0
+
+
+class BackgroundMusicModel(BaseModel):
+    path: str
+    volumeDb: float = 0.0
+    duckingEnabled: bool = False
+    duckingDb: float = 6.0
+    duckingAttackMs: float = 10.0
+    duckingReleaseMs: float = 200.0
+
+
 class ExportRequest(BaseModel):
    input_path: str
    output_path: str
@ -53,6 +70,12 @@ class ExportRequest(BaseModel):
    captions: str = "none"
    words: Optional[List[ExportWordModel]] = None
    deleted_indices: Optional[List[int]] = None
+    zoom: Optional[ZoomConfigModel] = None
+    additional_clips: Optional[List[str]] = None
+    background_music: Optional[BackgroundMusicModel] = None
+    remove_background: bool = False
+    background_replacement: str = "blur"
+    background_replacement_value: str = ""


 class TranscriptExportRequest(BaseModel):
@ -130,6 +153,29 @@ async def export_video(req: ExportRequest):
        if not segments and not mute_segments:
            raise HTTPException(status_code=400, detail="No segments to export")

+        # Convert zoom config to dict
+        zoom_dict = None
+        if req.zoom and req.zoom.enabled:
+            zoom_dict = {
+                "enabled": True,
+                "zoomFactor": req.zoom.zoomFactor,
+                "panX": req.zoom.panX,
+                "panY": req.zoom.panY,
+            }
+
+        # Handle additional clips: pre-concat before main editing
+        working_input = req.input_path
+        has_additional = bool(req.additional_clips)
+        if has_additional:
+            try:
+                concat_output = req.output_path + ".concat.mp4"
+                concat_clips(req.input_path, req.additional_clips, concat_output)
+                working_input = concat_output
+                logger.info("Pre-concatenated %d additional clips into %s", len(req.additional_clips), concat_output)
+            except Exception as e:
+                logger.warning(f"Clip concatenation failed (non-fatal): {e}")
+                # Fall back to main input only
+
        mapped_gain_segments = _map_ranges_to_output_timeline(gain_segments or [], segments)

        has_gain = abs(float(req.global_gain_db)) > 1e-6 or bool(gain_segments)
@ -141,7 +187,7 @@ async def export_video(req: ExportRequest):
                detail="Speed zones currently cannot be combined with mute/gain filters in one export",
            )

-        use_stream_copy = req.mode == "fast" and len(segments) == 1 and not mute_segments and not has_gain and not has_speed
+        use_stream_copy = req.mode == "fast" and len(segments) == 1 and not mute_segments and not has_gain and not has_speed and not zoom_dict and not has_additional
        needs_reencode_for_subs = req.captions == "burn-in"

        # Burn-in captions or audio filters require re-encode
@ -162,10 +208,10 @@ async def export_video(req: ExportRequest):

        try:
            if use_stream_copy:
-                output = export_stream_copy(req.input_path, req.output_path, segments)
+                output = export_stream_copy(working_input, req.output_path, segments)
            elif ass_path:
                output = export_reencode_with_subs(
-                    req.input_path,
+                    working_input,
                    req.output_path,
                    segments,
                    ass_path,
@ -177,10 +223,11 @@ async def export_video(req: ExportRequest):
                    global_gain_db=req.global_gain_db,
                    normalize_loudness=req.normalize_loudness,
                    normalize_target_lufs=req.normalize_target_lufs,
+                    zoom_config=zoom_dict,
                )
            else:
                output = export_reencode(
-                    req.input_path,
+                    working_input,
                    req.output_path,
                    segments,
                    resolution=req.resolution,
@ -191,6 +238,7 @@ async def export_video(req: ExportRequest):
                    global_gain_db=req.global_gain_db,
                    normalize_loudness=req.normalize_loudness,
                    normalize_target_lufs=req.normalize_target_lufs,
+                    zoom_config=zoom_dict,
                )
        finally:
            if ass_path and os.path.exists(ass_path):
@ -209,7 +257,6 @@ async def export_video(req: ExportRequest):
                os.replace(muxed_path, output)
                logger.info(f"Audio enhanced and muxed into {output}")

-                # Cleanup
                try:
                    os.remove(cleaned_audio)
                    os.rmdir(tmp_dir)
@ -218,6 +265,35 @@ async def export_video(req: ExportRequest):
            except Exception as e:
                logger.warning(f"Audio enhancement failed (non-fatal): {e}")

+        # Background removal (post-process)
+        if req.remove_background:
+            try:
+                bg_output = output + ".nobg.mp4"
+                remove_bg(output, bg_output, req.background_replacement, req.background_replacement_value)
+                os.replace(bg_output, output)
+                logger.info("Background removed from %s", output)
+            except Exception as e:
+                logger.warning(f"Background removal failed (non-fatal): {e}")
+
+        # Background music mixing (post-process)
+        if req.background_music:
+            try:
+                music_output = output + ".music.mp4"
+                mix_background_music(
+                    output,
+                    req.background_music.path,
+                    music_output,
+                    volume_db=req.background_music.volumeDb,
+                    ducking_enabled=req.background_music.duckingEnabled,
+                    ducking_db=req.background_music.duckingDb,
+                    ducking_attack_ms=req.background_music.duckingAttackMs,
+                    ducking_release_ms=req.background_music.duckingReleaseMs,
+                )
+                os.replace(music_output, output)
+                logger.info("Background music mixed into %s", output)
+            except Exception as e:
+                logger.warning(f"Background music mixing failed (non-fatal): {e}")
+
        # Sidecar SRT: generate and save alongside video
        srt_path = None
        if req.captions == "sidecar" and words_dicts:
@ -226,6 +302,13 @@ async def export_video(req: ExportRequest):
            save_captions(srt_content, srt_path)
            logger.info(f"Sidecar SRT saved to {srt_path}")

+        # Cleanup pre-concat temp file
+        if has_additional and working_input != req.input_path and os.path.exists(working_input):
+            try:
+                os.remove(working_input)
+            except OSError:
+                pass
+
        result = {"status": "ok", "output_path": output}
        if srt_path:
            result["srt_path"] = srt_path
--- a/backend/services/background_removal.py
+++ b/backend/services/background_removal.py
@ -1,18 +1,17 @@
 """
-AI background removal (Phase 5 - future).
-Uses MediaPipe or Robust Video Matting for person segmentation.
-Export-only -- no real-time preview.
+AI background removal using MediaPipe for person segmentation.
+Applied during export as a post-processing step — no real-time preview.
 """

 import logging
+import subprocess
+import tempfile
+import os
+from pathlib import Path

 logger = logging.getLogger(__name__)

-# Placeholder for Phase 5 implementation
-# Will use mediapipe or rvm for segmentation at export time
-
 MEDIAPIPE_AVAILABLE = False
-RVM_AVAILABLE = False

 try:
    import mediapipe as mp
@ -20,14 +19,9 @@ try:
 except ImportError:
    pass

-try:
-    pass  # rvm import would go here
-except ImportError:
-    pass
-

 def is_available() -> bool:
-    return MEDIAPIPE_AVAILABLE or RVM_AVAILABLE
+    return MEDIAPIPE_AVAILABLE


 def remove_background_on_export(
@ -37,23 +31,189 @@ def remove_background_on_export(
    replacement_value: str = "",
 ) -> str:
    """
-    Process video frame-by-frame to remove/replace background.
-    Only runs during export (not real-time).
+    Process video frame-by-frame using FFmpeg chromakey fallback,
+    or MediaPipe-based segmentation if available.

    Args:
        input_path: source video
        output_path: destination
-        replacement: 'blur', 'color', 'image', or 'video'
-        replacement_value: hex color, image path, or video path
+        replacement: 'blur', 'color', or 'image'
+        replacement_value: hex color or image path (for color/image modes)

    Returns:
        output_path
    """
-    if not is_available():
-        raise RuntimeError(
-            "Background removal requires mediapipe or robust-video-matting. "
-            "Install with: pip install mediapipe"
-        )
+    input_path = str(Path(input_path).resolve())
+    output_path = str(Path(output_path).resolve())

-    # Phase 5 implementation will go here
-    raise NotImplementedError("Background removal is planned for Phase 5")
+    if MEDIAPIPE_AVAILABLE:
+        return _remove_with_mediapipe(input_path, output_path, replacement, replacement_value)
+    else:
+        return _remove_with_ffmpeg_portrait(input_path, output_path, replacement, replacement_value)
+
+
+def _remove_with_mediapipe(
+    input_path: str,
+    output_path: str,
+    replacement: str = "blur",
+    replacement_value: str = "",
+) -> str:
+    """Use MediaPipe Selfie Segmentation + FFmpeg for background removal.
+
+    Extracts frames, applies segmentation, composites replacement background.
+    """
+    try:
+        import cv2
+        import numpy as np
+        import mediapipe as mp
+
+        mp_selfie_segmentation = mp.solutions.selfie_segmentation
+
+        # Determine background color/image
+        if replacement == "color":
+            color_hex = replacement_value or "#00FF00"
+            color_hex = color_hex.lstrip("#")
+            bg_color = tuple(int(color_hex[i:i+2], 16) for i in (0, 2, 4))
+            bg_color = bg_color[::-1]  # RGB -> BGR
+        elif replacement == "image":
+            bg_image = cv2.imread(replacement_value) if replacement_value else None
+            if bg_image is None:
+                bg_color = (0, 255, 0)
+                bg_image = None
+        else:
+            # Blur background (default)
+            bg_color = None
+
+        # Open video
+        cap = cv2.VideoCapture(input_path)
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+
+        # Temp directory for processed frames
+        temp_dir = tempfile.mkdtemp(prefix="aive_bgrem_")
+        frame_dir = os.path.join(temp_dir, "frames")
+        os.makedirs(frame_dir, exist_ok=True)
+
+        with mp_selfie_segmentation.SelfieSegmentation(model_selection=0) as segmenter:
+            frame_idx = 0
+            while cap.isOpened():
+                ret, frame = cap.read()
+                if not ret:
+                    break
+
+                # Convert to RGB for MediaPipe
+                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                result = segmenter.process(rgb)
+                mask = result.segmentation_mask
+
+                # Threshold the mask
+                condition = mask > 0.5
+
+                if replacement == "blur":
+                    # Apply strong blur to background
+                    blurred = cv2.GaussianBlur(frame, (99, 99), 0)
+                    output_frame = np.where(condition[..., None], frame, blurred)
+                elif replacement == "color":
+                    bg = np.full(frame.shape, bg_color, dtype=np.uint8)
+                    output_frame = np.where(condition[..., None], frame, bg)
+                elif replacement == "image" and bg_image is not None:
+                    bg_resized = cv2.resize(bg_image, (width, height))
+                    output_frame = np.where(condition[..., None], frame, bg_resized)
+                else:
+                    output_frame = frame
+
+                out_path = os.path.join(frame_dir, f"frame_{frame_idx:06d}.png")
+                cv2.imwrite(out_path, output_frame)
+                frame_idx += 1
+
+                if frame_idx % 100 == 0:
+                    logger.info("Background removal: %d/%d frames", frame_idx, total_frames)
+
+        cap.release()
+
+        # Encode frames back to video using FFmpeg
+        import subprocess as _sp
+        ffmpeg = "ffmpeg"
+        cmd = [
+            ffmpeg, "-y",
+            "-framerate", str(fps),
+            "-i", os.path.join(frame_dir, "frame_%06d.png"),
+            "-i", input_path,
+            "-map", "0:v:0",
+            "-map", "1:a:0?",
+            "-c:v", "libx264", "-preset", "medium", "-crf", "18",
+            "-c:a", "aac", "-b:a", "192k",
+            "-shortest",
+            "-pix_fmt", "yuv420p",
+            output_path,
+        ]
+        result = _sp.run(cmd, capture_output=True, text=True)
+        if result.returncode != 0:
+            raise RuntimeError(f"FFmpeg frame encode failed: {result.stderr[-500:]}")
+
+        # Cleanup
+        for f in os.listdir(frame_dir):
+            try:
+                os.remove(os.path.join(frame_dir, f))
+            except OSError:
+                pass
+        try:
+            os.rmdir(frame_dir)
+            os.rmdir(temp_dir)
+        except OSError:
+            pass
+
+        logger.info("MediaPipe background removal completed -> %s", output_path)
+        return output_path
+
+    except ImportError:
+        logger.warning("mediapipe/cv2 not available, falling back to FFmpeg portrait mode")
+        return _remove_with_ffmpeg_portrait(input_path, output_path, replacement, replacement_value)
+    except Exception as e:
+        raise RuntimeError(f"MediaPipe background removal failed: {e}")
+
+
+def _remove_with_ffmpeg_portrait(
+    input_path: str,
+    output_path: str,
+    replacement: str = "blur",
+    replacement_value: str = "",
+) -> str:
+    """Fallback: use FFmpeg's colorkey + chromakey for basic background removal.
+
+    This is a crude approximation. For best results, install mediapipe + opencv-python.
+    """
+    ffmpeg = "ffmpeg"
+
+    # Use a simple chromakey-based approach with a neutral background
+    # This won't work well for most real videos but provides a fallback
+    if replacement == "color":
+        color = replacement_value or "00FF00"
+        filter_complex = f"colorkey=0x{color}:0.3:0.1,chromakey=0x{color}:0.3:0.1"
+    elif replacement == "blur":
+        filter_complex = "gblur=sigma=20:enable='gt(scene,0.01)'"
+    else:
+        filter_complex = "null"
+
+    if filter_complex == "null":
+        # No-op, copy input to output
+        cmd = [ffmpeg, "-y", "-i", input_path, "-c", "copy", output_path]
+    else:
+        cmd = [
+            ffmpeg, "-y",
+            "-i", input_path,
+            "-vf", filter_complex,
+            "-c:v", "libx264", "-preset", "medium", "-crf", "18",
+            "-c:a", "aac", "-b:a", "192k",
+            "-movflags", "+faststart",
+            output_path,
+        ]
+
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        raise RuntimeError(f"FFmpeg background removal failed: {result.stderr[-500:]}")
+
+    logger.info("FFmpeg portait background removal completed -> %s", output_path)
+    return output_path
--- a/backend/services/video_editor.py
+++ b/backend/services/video_editor.py
@ -117,6 +117,129 @@ def _split_keep_segments_by_speed(
    return result


+def _build_zoom_filter(zoom_config: dict = None) -> str:
+    """Build FFmpeg video filter snippet for zoom/punch-in effect.
+    
+    zoom_config: {enabled, zoomFactor, panX, panY}
+    Returns empty string if disabled. Should be prepended to the video filter chain.
+    """
+    if not zoom_config or not zoom_config.get("enabled"):
+        return ""
+    factor = float(zoom_config.get("zoomFactor", 1.0))
+    if abs(factor - 1.0) < 0.01:
+        return ""
+    pan_x = float(zoom_config.get("panX", 0.0))
+    pan_y = float(zoom_config.get("panY", 0.0))
+    return f"crop=iw/{factor}:ih/{factor}:((iw-iw/{factor})/2)+({pan_x}*(iw-iw/{factor})/2):((ih-ih/{factor})/2)+({pan_y}*(ih-ih/{factor})/2),scale=iw:ih"
+
+
+def mix_background_music(
+    video_path: str,
+    music_path: str,
+    output_path: str,
+    volume_db: float = 0.0,
+    ducking_enabled: bool = False,
+    ducking_db: float = 6.0,
+    ducking_attack_ms: float = 10.0,
+    ducking_release_ms: float = 200.0,
+) -> str:
+    """Mix background music into a video with optional ducking.
+    
+    Uses FFmpeg amix + sidechaincompress. Output is written to output_path.
+    """
+    ffmpeg = _find_ffmpeg()
+    escaped_music = music_path.replace("\\", "/").replace(":", "\\:")
+    
+    # Build the filter graph
+    if ducking_enabled:
+        filter_complex = (
+            f"[0:a]asplit[main][sidechain];"
+            f"movie='{escaped_music}':loop=0,volume={volume_db}dB[music];"
+            f"[main][music]amix=inputs=2:duration=first:dropout_transition=2[mixed];"
+            f"[mixed][sidechain]sidechaincompress="
+            f"threshold=-30dB:ratio=100:attack={ducking_attack_ms}ms:"
+            f"release={ducking_release_ms}ms:makeup=1:level_sc={ducking_db}[outa]"
+        )
+    else:
+        filter_complex = (
+            f"movie='{escaped_music}':loop=0,volume={volume_db}dB[music];"
+            f"[0:a][music]amix=inputs=2:duration=first:dropout_transition=2[outa]"
+        )
+    
+    cmd = [
+        ffmpeg, "-y",
+        "-i", video_path,
+        "-filter_complex", filter_complex,
+        "-map", "0:v",
+        "-map", "[outa]",
+        "-c:v", "copy",
+        "-c:a", "aac", "-b:a", "192k",
+        "-shortest",
+        output_path,
+    ]
+    
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        raise RuntimeError(f"Background music mix failed: {result.stderr[-500:]}")
+    
+    return output_path
+
+
+def concat_clips(
+    main_path: str,
+    append_paths: list,
+    output_path: str,
+) -> str:
+    """Concatenate multiple video clips using FFmpeg concat demuxer.
+    
+    The main_path is kept as-is. append_paths are appended after it.
+    """
+    if not append_paths:
+        raise ValueError("No clips to concatenate")
+    
+    ffmpeg = _find_ffmpeg()
+    import tempfile
+    import os
+    
+    temp_dir = tempfile.mkdtemp(prefix="aive_concat_")
+    try:
+        segment_files = [main_path]
+        segment_files.extend(append_paths)
+        
+        # Create concat file list
+        concat_file = os.path.join(temp_dir, "concat.txt")
+        with open(concat_file, "w") as f:
+            for path in segment_files:
+                resolved = os.path.abspath(path)
+                f.write(f"file '{resolved}'\n")
+        
+        cmd = [
+            ffmpeg, "-y",
+            "-f", "concat",
+            "-safe", "0",
+            "-i", concat_file,
+            "-c", "copy",
+            "-movflags", "+faststart",
+            output_path,
+        ]
+        
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        if result.returncode != 0:
+            raise RuntimeError(f"Clip concat failed: {result.stderr[-500:]}")
+        
+        return output_path
+    finally:
+        for f in os.listdir(temp_dir):
+            try:
+                os.remove(os.path.join(temp_dir, f))
+            except OSError:
+                pass
+        try:
+            os.rmdir(temp_dir)
+        except OSError:
+            pass
+
+
 def _find_ffmpeg() -> str:
    """Locate ffmpeg binary."""
    for cmd in ["ffmpeg", "ffmpeg.exe"]:
@ -213,6 +336,29 @@ def export_stream_copy(
            pass


+def _apply_zoom_post(input_path: str, output_path: str, zoom_config: dict) -> str:
+    """Re-encode video applying zoom/punch-in crop+scale as a post-process step."""
+    ffmpeg = _find_ffmpeg()
+    zoom_filter = _build_zoom_filter(zoom_config)
+    if not zoom_filter:
+        return input_path
+    
+    cmd = [
+        ffmpeg, "-y",
+        "-i", input_path,
+        "-filter_complex", f"[0:v]{zoom_filter}[v]",
+        "-map", "[v]",
+        "-map", "0:a?",
+        "-c:a", "copy",
+        "-movflags", "+faststart",
+        output_path,
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        raise RuntimeError(f"Zoom post-process failed: {result.stderr[-500:]}")
+    return output_path
+
+
 def export_reencode(
    input_path: str,
    output_path: str,
@ -225,6 +371,7 @@ def export_reencode(
    global_gain_db: float = 0.0,
    normalize_loudness: bool = False,
    normalize_target_lufs: float = -14.0,
+    zoom_config: dict = None,
 ) -> str:
    """
    Export video with full re-encode. Slower but supports resolution changes,
@ -421,6 +568,15 @@ def export_reencode(
    if result.returncode != 0:
        raise RuntimeError(f"FFmpeg re-encode failed: {result.stderr[-500:]}")

+    # Apply zoom post-processing if configured
+    if zoom_config and zoom_config.get("enabled") and has_video:
+        import tempfile as _tf
+        import os as _os
+        zoomed_path = output_path + ".zoomed.mp4"
+        _apply_zoom_post(output_path, zoomed_path, zoom_config)
+        _os.replace(zoomed_path, output_path)
+        logger.info("Zoom/punch-in applied to %s (factor=%s)", output_path, zoom_config.get("zoomFactor", 1.0))
+
    return output_path


@ -437,6 +593,7 @@ def export_reencode_with_subs(
    global_gain_db: float = 0.0,
    normalize_loudness: bool = False,
    normalize_target_lufs: float = -14.0,
+    zoom_config: dict = None,
 ) -> str:
    """
    Export video with re-encode and burn-in subtitles (ASS format).
@ -578,6 +735,15 @@ def export_reencode_with_subs(
    if result.returncode != 0:
        raise RuntimeError(f"FFmpeg re-encode with subs failed: {result.stderr[-500:]}")

+    # Apply zoom post-processing if configured
+    if zoom_config and zoom_config.get("enabled"):
+        import tempfile as _tf
+        import os as _os
+        zoomed_path = output_path + ".zoomed.mp4"
+        _apply_zoom_post(output_path, zoomed_path, zoom_config)
+        _os.replace(zoomed_path, output_path)
+        logger.info("Zoom/punch-in applied to %s (factor=%s)", output_path, zoom_config.get("zoomFactor", 1.0))
+
    return output_path