Compare commits: c5cc7c2969 ... e5c47e31b3 (10 commits)

| SHA1 |
|---|
| e5c47e31b3 |
| 78d34133ad |
| 33cca5f552 |
| d1e1fedcae |
| 70c5d32413 |
| ce398ae1d4 |
| 168bf5f573 |
| efee0b0abe |
| 4dd3c7600e |
| 78e9df31e6 |
.github/workflows/docker-build.yml (vendored, 76 changed lines)
@@ -1,76 +0,0 @@
```yaml
name: Build and Push Docker Images

on:
  push:
    branches: [ main, develop ]
    tags: [ 'v*' ]
  pull_request:
    branches: [ main ]
  release:
    types: [published]

env:
  REGISTRY: ghcr.io

jobs:
  build:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Container Registry
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Convert repository name to lowercase
        id: lowercase-repo
        run: echo "repository=$(echo ${{ github.repository }} | tr '[:upper:]' '[:lower:]')" >> $GITHUB_OUTPUT

      - name: Extract metadata
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ steps.lowercase-repo.outputs.repository }}
          tags: |
            type=ref,event=branch
            type=ref,event=pr
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
            type=semver,pattern={{major}}
            type=raw,value=latest,enable={{is_default_branch}}

      - name: Build and push Docker image
        uses: docker/build-push-action@v5
        with:
          context: .
          platforms: linux/amd64,linux/arm64
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max

      - name: Build and push GPU-enabled image
        uses: docker/build-push-action@v5
        with:
          context: .
          file: Dockerfile.gpu
          platforms: linux/amd64
          push: true
          tags: |
            ${{ env.REGISTRY }}/${{ steps.lowercase-repo.outputs.repository }}:latest-gpu
            ${{ env.REGISTRY }}/${{ steps.lowercase-repo.outputs.repository }}:${{ github.sha }}-gpu
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
```
.gitignore (vendored, 24 changed lines)
@@ -1,13 +1,33 @@
```
# Python virtual environment
# Dependencies
node_modules/

# Build output
frontend/dist/

# Python
venv/
__pycache__/
*.pyc
*.pyo
*.egg-info/

# IDE files
# IDE / Editor
.vscode/
.idea/
.cursor/

# OS files
.env
.DS_Store
Thumbs.db

# Logs
*.log

# Lock files (root only — frontend lock is committed)
/package-lock.json

# Electron build output
dist/
build/
*.asar
```
DOCKER.md (305 changed lines)
@@ -1,305 +0,0 @@
# Docker Deployment Guide for VideoTranscriber

This guide explains how to run VideoTranscriber in a Docker container while using Ollama models on your host system.

## Architecture Overview

```
┌──────────────────────────────────────────┐
│                Host System               │
│  ┌─────────────────┐  ┌───────────────┐  │
│  │ Ollama Service  │  │  Video Files  │  │
│  │ (port 11434)    │  │  Directory    │  │
│  └─────────────────┘  └───────────────┘  │
│          ▲                    ▲           │
│          │                    │           │
│  ┌───────┼────────────────────┼────────┐  │
│  │       │  Docker Container  │        │  │
│  │  ┌────▼───────────────┐    │        │  │
│  │  │ VideoTranscriber   │    │        │  │
│  │  │ - Streamlit App    │    │        │  │
│  │  │ - Whisper Models   │    │        │  │
│  │  │ - ML Dependencies  │    │        │  │
│  │  └────────────────────┘    │        │  │
│  └────────────────────────────┼────────┘  │
│                               │           │
│         Mounted Volumes ──────┘           │
└──────────────────────────────────────────┘
```

## Quick Start

### Prerequisites

1. **Docker & Docker Compose** installed
2. **Ollama running on host**:
   ```bash
   # Install Ollama (if not already installed)
   curl -fsSL https://ollama.ai/install.sh | sh

   # Start Ollama service
   ollama serve

   # Pull a model (in another terminal)
   ollama pull llama3
   ```

### 1. Setup Environment

```bash
# Copy environment template
cp docker.env.example .env

# Edit .env file with your paths
# Key settings to update:
VIDEO_PATH=/path/to/your/videos
OUTPUT_PATH=/path/to/save/outputs
HF_TOKEN=your_huggingface_token_if_needed
```

### 2. Create Required Directories

```bash
# Create directories for mounting
mkdir -p videos outputs cache config
```

### 3. Build and Run

```bash
# Build and start the container
docker-compose up -d

# View logs
docker-compose logs -f

# Access the application
# Open browser to: http://localhost:8501
```

## Configuration Options

### Environment Variables

| Variable | Description | Default | Required |
|----------|-------------|---------|----------|
| `VIDEO_PATH` | Host directory containing video files | `./videos` | Yes |
| `OUTPUT_PATH` | Host directory for outputs | `./outputs` | Yes |
| `CACHE_PATH` | Host directory for model cache | `./cache` | No |
| `OLLAMA_API_URL` | Ollama API endpoint | `http://host.docker.internal:11434/api` | No |
| `HF_TOKEN` | HuggingFace token for advanced features | - | No |
| `CUDA_VISIBLE_DEVICES` | GPU devices to use | - | No |
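Most of these variables are consumed by docker-compose itself, but the ones passed into the container can also be read from application code. A minimal, illustrative Python sketch (the helper name and defaults mirror the table above and are assumptions, not the app's actual code):

```python
import os

# Defaults mirror the table above; the running app may resolve these differently.
OLLAMA_API_URL = os.getenv("OLLAMA_API_URL", "http://host.docker.internal:11434/api")
HF_TOKEN = os.getenv("HF_TOKEN")                      # optional, only needed for diarization
CUDA_VISIBLE_DEVICES = os.getenv("CUDA_VISIBLE_DEVICES")  # e.g. "0"; unset means CPU only


def describe_runtime() -> dict:
    """Collect the container-side settings for a quick sanity check."""
    return {
        "ollama_api_url": OLLAMA_API_URL,
        "hf_token_set": HF_TOKEN is not None,
        "gpu_devices": CUDA_VISIBLE_DEVICES or "none",
    }


if __name__ == "__main__":
    print(describe_runtime())
```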
### Volume Mounts

| Host Path | Container Path | Purpose |
|-----------|----------------|---------|
| `${VIDEO_PATH}` | `/app/data/videos` | Input video files |
| `${OUTPUT_PATH}` | `/app/data/outputs` | Generated transcripts/summaries |
| `${CACHE_PATH}` | `/app/data/cache` | Model and processing cache |
| `${CONFIG_PATH}` | `/app/config` | Configuration files |

## Platform-Specific Setup

### Windows (Docker Desktop)

```yaml
# In docker-compose.yml - use bridge networking
networks:
  - videotranscriber-network

environment:
  - OLLAMA_API_URL=http://host.docker.internal:11434/api
```

### macOS (Docker Desktop)

Same as Windows - uses `host.docker.internal` to access host services.

### Linux

Option 1 - Host Networking (Recommended):
```yaml
# In docker-compose.yml
network_mode: host

environment:
  - OLLAMA_API_URL=http://localhost:11434/api
```

Option 2 - Bridge Networking:
```yaml
environment:
  - OLLAMA_API_URL=http://172.17.0.1:11434/api  # Docker bridge IP
```

## GPU Support

### NVIDIA GPU Setup

1. **Install NVIDIA Container Toolkit**:
   ```bash
   # Ubuntu/Debian
   curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
   curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
     sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
     sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
   sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
   sudo systemctl restart docker
   ```

2. **Enable in docker-compose.yml**:
   ```yaml
   deploy:
     resources:
       reservations:
         devices:
           - driver: nvidia
             count: 1
             capabilities: [gpu]
   ```

## Usage in Container

### Application Settings

When running in Docker, update these settings in the VideoTranscriber UI:

1. **Base Folder**: Set to `/app/data/videos`
2. **Ollama Models**: Should auto-detect from host
3. **GPU Settings**: Will use container GPU if configured

### File Access

- **Input Videos**: Place in your `${VIDEO_PATH}` directory on host
- **Outputs**: Generated files appear in `${OUTPUT_PATH}` on host
- **Cache**: Models cached in `${CACHE_PATH}` for faster subsequent runs

## Troubleshooting

### Common Issues

#### 1. Can't Connect to Ollama

**Symptoms**: "Ollama service is not available" message

**Solutions**:
- Verify Ollama is running: `curl http://localhost:11434/api/tags` (or use the Python probe sketched after this list)
- Check firewall settings
- For Linux, try host networking mode
- Verify OLLAMA_API_URL in environment
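For a programmatic version of the same check from inside the container, a small Python probe works as well; it only assumes that `OLLAMA_API_URL` ends with `/api` (as in the defaults above) and that the `requests` package is installed:

```python
import os
import requests

api_url = os.getenv("OLLAMA_API_URL", "http://host.docker.internal:11434/api")

try:
    # /api/tags lists the models the host Ollama instance has pulled.
    resp = requests.get(f"{api_url}/tags", timeout=5)
    resp.raise_for_status()
    models = [m["name"] for m in resp.json().get("models", [])]
    print(f"Ollama reachable at {api_url}, models: {models or 'none pulled yet'}")
except requests.RequestException as exc:
    print(f"Cannot reach Ollama at {api_url}: {exc}")
```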
#### 2. No Video Files Detected

**Symptoms**: "No recordings found" message

**Solutions**:
- Check VIDEO_PATH points to correct directory
- Ensure directory contains supported formats (.mp4, .avi, .mov, .mkv)
- Check file permissions

#### 3. GPU Not Detected

**Symptoms**: Processing is slow, no GPU utilization

**Solutions**:
- Install NVIDIA Container Toolkit
- Uncomment GPU section in docker-compose.yml
- Verify: `docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi`

#### 4. Permission Issues

**Symptoms**: Cannot write to output directory

**Solutions**:
```bash
# Fix permissions
sudo chown -R $(id -u):$(id -g) outputs cache config
chmod -R 755 outputs cache config
```

### Debugging

```bash
# View container logs
docker-compose logs -f videotranscriber

# Execute shell in container
docker-compose exec videotranscriber bash

# Check Ollama connectivity from container
docker-compose exec videotranscriber curl -f $OLLAMA_API_URL/tags

# Monitor resource usage
docker stats videotranscriber
```

## Advanced Configuration

### Custom Dockerfile

For specialized requirements, modify the Dockerfile:

```dockerfile
# Add custom dependencies
RUN pip install your-custom-package

# Set custom environment variables
ENV YOUR_CUSTOM_VAR=value

# Copy custom configuration
COPY custom-config.yaml /app/config/
```

### Multi-Instance Deployment

Run multiple instances for different use cases:

```bash
# Copy docker-compose.yml to docker-compose.prod.yml
# Modify ports and paths
docker-compose -f docker-compose.prod.yml up -d
```

### CI/CD Integration

```yaml
# .github/workflows/docker.yml
name: Build and Deploy
on:
  push:
    branches: [main]
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Build Docker image
        run: docker build -t videotranscriber .
```

## Performance Optimization

### Memory Management

```yaml
# In docker-compose.yml
deploy:
  resources:
    limits:
      memory: 8G
    reservations:
      memory: 4G
```

### Model Caching

- Use persistent volumes for `/app/data/cache`
- Pre-download models to reduce startup time
- Configure appropriate cache size limits

### Network Optimization

- Use host networking on Linux for better performance
- Consider running Ollama and VideoTranscriber on same machine
- Use SSD storage for cache directories
Dockerfile (45 changed lines)
@@ -1,45 +0,0 @@
```dockerfile
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    ffmpeg \
    git \
    wget \
    curl \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better Docker layer caching
COPY requirements.txt .

# Install Python dependencies with pinned versions
RUN pip install --no-cache-dir -r requirements.txt

# Optional: Install CUDA-specific PyTorch if GPU support needed
# Uncomment and modify for your CUDA version:
# RUN pip install --force-reinstall torch==2.1.0+cu118 torchvision==0.16.0+cu118 torchaudio==2.1.0+cu118 --index-url https://download.pytorch.org/whl/cu118

# Copy application code
COPY . .

# Create directories for mounted volumes
RUN mkdir -p /app/data/videos /app/data/outputs /app/data/cache

# Set environment variables
ENV STREAMLIT_SERVER_PORT=8501
ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
ENV STREAMLIT_SERVER_HEADLESS=true
ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false

# Expose Streamlit port
EXPOSE 8501

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:8501/_stcore/health || exit 1

# Start the application
CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
```
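The `HEALTHCHECK` above polls Streamlit's built-in health endpoint with curl. The same check can be run from the host in Python, assuming the container publishes port 8501; this helper is illustrative and not part of the repository:

```python
import requests


def streamlit_healthy(base_url: str = "http://localhost:8501") -> bool:
    """Mirror the Dockerfile HEALTHCHECK: GET /_stcore/health should return HTTP 200."""
    try:
        resp = requests.get(f"{base_url}/_stcore/health", timeout=10)
        return resp.status_code == 200
    except requests.RequestException:
        return False


if __name__ == "__main__":
    print("healthy" if streamlit_healthy() else "unhealthy")
```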
Dockerfile.gpu (54 changed lines)
@@ -1,54 +0,0 @@
```dockerfile
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies including CUDA-related packages
RUN apt-get update && apt-get install -y \
    ffmpeg \
    git \
    wget \
    curl \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better Docker layer caching
COPY requirements.txt .

# Install CPU versions from requirements.txt first
RUN pip install --no-cache-dir -r requirements.txt

# Install CUDA-optimized PyTorch (overwrites CPU versions)
# Updated to torch 2.1.0+ for SpeechBrain 1.0 / pyannote diarization compatibility
RUN pip install --force-reinstall \
    torch==2.1.0+cu118 \
    torchvision==0.16.0+cu118 \
    torchaudio==2.1.0+cu118 \
    --index-url https://download.pytorch.org/whl/cu118

# Copy application code
COPY . .

# Create directories for mounted volumes
RUN mkdir -p /app/data/videos /app/data/outputs /app/data/cache

# Set environment variables
ENV STREAMLIT_SERVER_PORT=8501
ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
ENV STREAMLIT_SERVER_HEADLESS=true
ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false

# GPU-specific environment variables
ENV CUDA_VISIBLE_DEVICES=0
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility

# Expose Streamlit port
EXPOSE 8501

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:8501/_stcore/health || exit 1

# Start the application
CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
```
@@ -1,105 +0,0 @@
# Gemini Insights: OBS Recording Transcriber

## Project Overview
The OBS Recording Transcriber is a Python application built with Streamlit that processes video recordings (particularly from OBS Studio) to generate transcripts and summaries using AI models. The application uses Whisper for transcription and Hugging Face Transformers for summarization.

## Key Improvement Areas

### 1. UI Enhancements
- **Implemented:**
  - Responsive layout with columns for better organization
  - Expanded sidebar with categorized settings
  - Custom CSS for improved button styling
  - Spinner for long-running operations
  - Expanded transcript view by default

- **Additional Recommendations:**
  - Add a dark mode toggle
  - Implement progress bars for each processing step
  - Add tooltips for complex options
  - Create a dashboard view for batch processing results
  - Add visualization of transcript segments with timestamps

### 2. Ollama Local API Integration
- **Implemented:**
  - Local API integration for offline summarization
  - Model selection from available Ollama models
  - Chunking for long texts (see the sketch after this list)
  - Fallback to online models when Ollama fails

- **Additional Recommendations:**
  - Add temperature and other generation parameters as advanced options
  - Implement streaming responses for real-time feedback
  - Cache results to avoid reprocessing
  - Add support for custom Ollama model creation with specific instructions
  - Implement parallel processing for multiple chunks
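The chunk-then-summarize flow described above roughly corresponds to the app's `chunk_and_summarize` helper. A simplified sketch of the idea against Ollama's REST API (`/api/generate`); the prompt wording, chunk size, and error handling are assumptions, not the repository's actual implementation:

```python
import requests

OLLAMA_API = "http://localhost:11434/api"


def summarize_chunk(text: str, model: str = "llama3") -> str:
    """Ask a local Ollama model for a short summary of one chunk of transcript."""
    resp = requests.post(
        f"{OLLAMA_API}/generate",
        json={"model": model, "prompt": f"Summarize concisely:\n\n{text}", "stream": False},
        timeout=120,
    )
    resp.raise_for_status()
    return resp.json()["response"].strip()


def chunk_and_summarize(transcript: str, model: str = "llama3", chunk_chars: int = 4000) -> str:
    """Split a long transcript into chunks, summarize each, then summarize the summaries."""
    chunks = [transcript[i:i + chunk_chars] for i in range(0, len(transcript), chunk_chars)]
    partial = [summarize_chunk(c, model) for c in chunks]
    return partial[0] if len(partial) == 1 else summarize_chunk("\n".join(partial), model)
```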
### 3. Subtitle Export Formats
- **Implemented:**
  - SRT export with proper formatting (see the sketch after this list)
  - ASS export with basic styling
  - Multi-format export options
  - Automatic segment creation from plain text

- **Additional Recommendations:**
  - Add customizable styling options for ASS subtitles
  - Implement subtitle editing before export
  - Add support for VTT format for web videos
  - Implement subtitle timing adjustment
  - Add batch export for multiple files
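For reference, the core of "SRT export with proper formatting" is just timestamp arithmetic over Whisper-style segments. A minimal sketch, assuming each segment is a dict with `start`/`end` in seconds and a `text` key, as Whisper returns them:

```python
def srt_timestamp(seconds: float) -> str:
    """Format seconds as HH:MM:SS,mmm (SRT uses a comma before milliseconds)."""
    ms = int(round(seconds * 1000))
    h, ms = divmod(ms, 3_600_000)
    m, ms = divmod(ms, 60_000)
    s, ms = divmod(ms, 1_000)
    return f"{h:02}:{m:02}:{s:02},{ms:03}"


def segments_to_srt(segments: list[dict]) -> str:
    """Render [{'start': 0.0, 'end': 2.4, 'text': 'Hello'}, ...] as an SRT document."""
    blocks = []
    for i, seg in enumerate(segments, start=1):
        blocks.append(
            f"{i}\n{srt_timestamp(seg['start'])} --> {srt_timestamp(seg['end'])}\n{seg['text'].strip()}\n"
        )
    return "\n".join(blocks)
```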
### 4. Architecture and Code Quality
- **Recommendations:**
  - Implement proper error handling and logging throughout
  - Add unit tests for critical components
  - Create a configuration file for default settings
  - Implement caching for processed files
  - Add type hints throughout the codebase
  - Document API endpoints for potential future web service

### 5. Performance Optimizations
- **Recommendations:**
  - Implement parallel processing for batch operations
  - Add GPU acceleration configuration options
  - Optimize memory usage for large files
  - Implement incremental processing for very long recordings
  - Add compression options for exported files

### 6. Additional Features
- **Recommendations:**
  - Speaker diarization (identifying different speakers)
  - Language detection and translation
  - Keyword extraction and timestamp linking
  - Integration with video editing software
  - Batch processing queue with email notifications
  - Custom vocabulary for domain-specific terminology

## Implementation Roadmap
1. **Phase 1 (Completed):** Basic UI improvements, Ollama integration, and subtitle export
2. **Phase 2 (Completed):** Performance optimizations and additional export formats
   - Added WebVTT export format for web videos
   - Implemented GPU acceleration with automatic device selection
   - Added caching system for faster processing of previously transcribed files
   - Optimized memory usage with configurable memory limits
   - Added compression options for exported files
   - Enhanced ASS subtitle styling options
   - Added progress indicators for better user feedback
3. **Phase 3 (Completed):** Advanced features like speaker diarization and translation
   - Implemented speaker diarization to identify different speakers in recordings
   - Added language detection and translation capabilities
   - Integrated keyword extraction with timestamp linking
   - Created interactive transcript with keyword highlighting
   - Added named entity recognition for better content analysis
   - Generated keyword index with timestamp references
   - Provided speaker statistics and word count analysis
4. **Phase 4:** Integration with other tools and services

## Technical Considerations
- Ensure compatibility with different Whisper model sizes
- Handle large files efficiently to prevent memory issues
- Provide graceful degradation when optional dependencies are missing
- Maintain backward compatibility with existing workflows
- Consider containerization for easier deployment

## Conclusion
The OBS Recording Transcriber has a solid foundation but can be significantly enhanced with the suggested improvements. The focus should be on improving user experience, adding offline processing capabilities, and expanding export options to make the tool more versatile for different use cases.
INSTALLATION.md (141 changed lines)
@@ -1,141 +0,0 @@
# Installation Guide for OBS Recording Transcriber

This guide will help you install all the necessary dependencies for the OBS Recording Transcriber application, including the advanced features from Phase 3.

## Prerequisites

Before installing the Python packages, you need to set up some prerequisites:

### 1. Python 3.8 or higher

Make sure you have Python 3.8 or higher installed. You can download it from [python.org](https://www.python.org/downloads/).

### 2. FFmpeg

FFmpeg is required for audio processing:

- **Windows**:
  - Download from [gyan.dev/ffmpeg/builds](https://www.gyan.dev/ffmpeg/builds/)
  - Extract the ZIP file
  - Add the `bin` folder to your system PATH

- **macOS**:
  ```bash
  brew install ffmpeg
  ```

- **Linux**:
  ```bash
  sudo apt update
  sudo apt install ffmpeg
  ```

### 3. Visual C++ Build Tools (Windows only)

Some packages like `tokenizers` require C++ build tools:

1. Download and install [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)
2. During installation, select "Desktop development with C++"

## Installation Steps

### 1. Create a Virtual Environment (Recommended)

```bash
# Create a virtual environment
python -m venv venv

# Activate the virtual environment
# Windows
venv\Scripts\activate
# macOS/Linux
source venv/bin/activate
```

### 2. Install PyTorch

For better performance, install PyTorch with CUDA support if you have an NVIDIA GPU:

```bash
# Windows/Linux with CUDA
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# macOS or CPU-only
pip install torch torchvision torchaudio
```
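After installing PyTorch, it is worth confirming that the build you got actually matches your hardware. A quick check that covers both CUDA and Apple Silicon builds:

```python
import torch

print("PyTorch version:", torch.__version__)
print("CUDA available: ", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA device:    ", torch.cuda.get_device_name(0))

# Apple Silicon (Metal) backend, present on recent macOS builds of PyTorch
has_mps = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()
print("MPS available:  ", has_mps)
```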
### 3. Install Dependencies

```bash
# Install all dependencies from requirements.txt
pip install -r requirements.txt
```

### 4. Troubleshooting Common Issues

#### Tokenizers Installation Issues

If you encounter issues with `tokenizers` installation:

1. Make sure you have Visual C++ Build Tools installed (Windows)
2. Try installing Rust: [rustup.rs](https://rustup.rs/)
3. Install tokenizers separately:
   ```bash
   pip install tokenizers --no-binary tokenizers
   ```

#### PyAnnote.Audio Access

To use speaker diarization, you need a HuggingFace token with access to the pyannote models:

1. Create an account on [HuggingFace](https://huggingface.co/)
2. Generate an access token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
3. Request access to [pyannote/speaker-diarization-3.0](https://huggingface.co/pyannote/speaker-diarization-3.0)
4. Set the token in the application when prompted or as an environment variable:
   ```bash
   # Windows
   set HF_TOKEN=your_token_here
   # macOS/Linux
   export HF_TOKEN=your_token_here
   ```
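Once the token is set and model access has been granted, loading the diarization pipeline looks roughly like this. Treat it as a sketch: the exact model revision and keyword arguments may differ from what the application uses internally.

```python
import os
from pyannote.audio import Pipeline

# Token taken from the environment variable set above
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.0",
    use_auth_token=os.environ["HF_TOKEN"],
)

diarization = pipeline("recording.wav")  # any audio file decodable to WAV
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"{turn.start:.1f}s - {turn.end:.1f}s: {speaker}")
```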
#### Memory Issues with Large Files

If you encounter memory issues with large files:

1. Use a smaller Whisper model (e.g., "base" instead of "large")
2. Reduce the GPU memory fraction in the application settings
3. Increase your system's swap space/virtual memory

## Running the Application

After installation, run the application with:

```bash
streamlit run app.py
```

## Optional: Ollama Setup for Local Summarization

To use Ollama for local summarization:

1. Install Ollama from [ollama.ai](https://ollama.ai/)
2. Pull a model:
   ```bash
   ollama pull llama3
   ```
3. Uncomment the Ollama line in requirements.txt and install:
   ```bash
   pip install ollama
   ```

## Verifying Installation

To verify that all components are working correctly:

1. Run the application
2. Check that GPU acceleration is available (if applicable)
3. Test a small video file with basic transcription
4. Gradually enable advanced features like diarization and translation

If you encounter any issues, check the application logs for specific error messages.
LICENSE (2 changed lines)
@@ -1,6 +1,6 @@
  MIT License

- Copyright (c) 2025 DataAnts-AI
+ Copyright (c) 2026 DataAnts AI

  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
QUICK-FIX.md (63 changed lines)
@@ -1,63 +0,0 @@
# 🚨 Quick Fix for PyTorch Compatibility Error

If you're seeing the `torch.compiler.disable` error, here's how to fix it:

## Immediate Fix

```bash
# Stop the current container
docker-compose down

# Remove the old image to force rebuild with fixed versions
docker rmi $(docker images | grep videotranscriber | awk '{print $3}')

# Rebuild with fixed dependencies
docker-compose up -d --build
```

## Better Solution: Use Prebuilt Images

⚠️ **Note**: GitHub Actions had a naming issue that's now fixed. See [FIX-GITHUB-ACTIONS.md](FIX-GITHUB-ACTIONS.md) for details.

Once prebuilt images are available, use them instead:

```bash
# Check if images are ready
docker pull ghcr.io/dataants-ai/videotranscriber:latest

# If successful, stop current container and use prebuilt image
docker-compose down
docker-compose -f docker-compose.prebuilt.yml up -d
```

## What Was Fixed

1. **Version Pinning**: Updated `requirements.txt` with compatible versions:
   - `torch==2.0.1` (was `>=1.7.0`)
   - `pytorch-lightning==2.0.6` (compatible with torch 2.0.1)
   - `pyannote.audio==3.1.1` (updated to compatible version)

2. **Build Process**: Removed duplicate PyTorch installation that could cause conflicts

3. **Prebuilt Images**: Created GitHub Actions to build reliable, tested images

## Verification

After fixing, you should see the Streamlit app load without errors at `http://localhost:8501`. A quicker command-line check is sketched below.
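If you would rather not wait for the UI, the versions can be probed directly inside the container; this is an illustrative diagnostic, not part of the repository:

```python
import torch

print("torch", torch.__version__)  # should report the pinned 2.0.1
# The original failure came from code expecting torch.compiler.disable, an attribute
# that only exists in newer PyTorch releases; with compatible pins that lookup is
# never reached, so the line below is informational only.
print("has torch.compiler:", hasattr(torch, "compiler"))
print("CUDA available:", torch.cuda.is_available())
```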
## If Still Having Issues

1. **Clear Docker cache**:
   ```bash
   docker system prune -a
   ```

2. **Check logs**:
   ```bash
   docker-compose logs -f
   ```

3. **Manual rebuild**:
   ```bash
   docker build --no-cache -t videotranscriber .
   ```
README.md (265 changed lines)
@@ -1,198 +1,133 @@
# Video Transcriber
# CutScript

## Project Overview
The Video Recording Transcriber is a Python application built with Streamlit that processes video and audio recordings to generate transcripts and summaries using AI models. The application uses Whisper for transcription and Hugging Face Transformers for summarization.
An open-source, local-first, Descript-like text-based audio and video editor powered by AI. Edit audio/video by editing text — delete a word from the transcript and it's cut from the audio/video.

**Supported Formats**: MP4, AVI, MOV, MKV (video) and M4A (audio)
<img width="1034" height="661" alt="image" src="https://github.com/user-attachments/assets/b1ed9505-792e-42ca-bb73-85458d0f02a5" />


## Architecture

Demo here
- **Electron + React** desktop app with Tailwind CSS
- **FastAPI** Python backend (spawned as child process)
- **WhisperX** for word-level transcription with alignment
- **FFmpeg** for video processing (stream-copy and re-encode; a word-to-cut-segment sketch follows this list)
- **Ollama / OpenAI / Claude** for AI features (filler removal, clip creation)
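To make the editing model concrete: WhisperX gives every word a start/end time, so deleting words from the transcript reduces to computing the time ranges to keep and handing them to FFmpeg. A simplified illustration of that mapping, not the repository's actual services code:

```python
def keep_ranges(words: list[dict], deleted: set[int], total: float) -> list[tuple[float, float]]:
    """words: [{'word': 'hello', 'start': 0.32, 'end': 0.58}, ...] with word-level
    timestamps from WhisperX; `deleted` holds the indices removed in the editor.
    Returns the (start, end) ranges of the original media to keep, in order."""
    ranges, cursor = [], 0.0
    for i, w in enumerate(words):
        if i in deleted:
            if w["start"] > cursor:
                ranges.append((cursor, w["start"]))
            cursor = max(cursor, w["end"])
    if cursor < total:
        ranges.append((cursor, total))
    return ranges


# Each kept range can then become an FFmpeg trim/concat step on export.
print(keep_ranges(
    [{"word": "um", "start": 0.0, "end": 0.4}, {"word": "hi", "start": 0.4, "end": 0.7}],
    deleted={0}, total=5.0,
))  # -> [(0.4, 5.0)]
```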
https://github.com/user-attachments/assets/990e63fc-232e-46a0-afdf-ca8836d46a13

## Quick Start

### Prerequisites

## Installation
- Node.js 18+
- Python 3.10+
- FFmpeg (in PATH)
- (Optional) Ollama for local AI features

### 🐳 Docker Installation (Recommended)
### Install

**Benefits**: Isolated environment, no dependency conflicts, easy deployment

#### Option A: Prebuilt Images (Fastest & Most Reliable)
```bash
# 1. Clone repository for config files
git clone https://github.com/DataAnts-AI/VideoTranscriber.git
cd VideoTranscriber
# Root dependencies (Electron, concurrently)
npm install

# 2. Setup environment
cp docker.env.example .env
# Edit .env with your video directory paths
# Frontend dependencies (React, Tailwind, Zustand)
cd frontend && npm install && cd ..

# 3. Ensure Ollama is running on host
ollama serve # In separate terminal
ollama pull llama3

# 4. Start with prebuilt image
docker-compose -f docker-compose.prebuilt.yml up -d

# 5. Access application
# Open browser to: http://localhost:8501
# Backend dependencies
cd backend && pip install -r requirements.txt && cd ..
```

#### Option B: Build from Source (Development)
### Run (Development)

```bash
# Use the local build approach
docker-compose up -d
# Start all three (backend + frontend + electron)
npm run dev
```

See [DOCKER.md](DOCKER.md) for complete Docker setup guide.
Or run them separately:

### Easy Installation (Recommended)
```bash
# Terminal 1: Backend
cd backend && python -m uvicorn main:app --reload --port 8642

#### Windows
1. Download or clone the repository
2. Run `install.bat` by double-clicking it
3. Follow the on-screen instructions
# Terminal 2: Frontend
cd frontend && npm run dev

#### Linux/macOS
1. Download or clone the repository
2. Open a terminal in the project directory
3. Make the install script executable: `chmod +x install.sh`
4. Run the script: `./install.sh`
5. Follow the on-screen instructions

### Manual Installation
1. Clone the repo.
```
git clone https://github.com/DataAnts-AI/VideoTranscriber.git
cd VideoTranscriber
# Terminal 3: Electron
npx electron .
```

2. Install dependencies:
## Project Structure

```
pip install -r requirements.txt
cutscript/
├── electron/            # Electron main process
│   ├── main.js          # App entry, spawns Python backend
│   ├── preload.js       # Secure IPC bridge
│   └── python-bridge.js
├── frontend/            # React + Vite + Tailwind
│   └── src/
│       ├── components/  # VideoPlayer, TranscriptEditor, etc.
│       ├── store/       # Zustand state (editorStore, aiStore)
│       ├── hooks/       # useVideoSync, useKeyboardShortcuts
│       └── types/       # TypeScript interfaces
├── backend/             # FastAPI Python backend
│   ├── main.py
│   ├── routers/         # API endpoints
│   ├── services/        # Core logic (transcription, editing, AI)
│   └── utils/           # GPU, cache, audio helpers
└── shared/              # Project schema
```

Notes:
- Ensure that the versions align with the features you use and your system compatibility.
- torch version should match the capabilities of your hardware (e.g., CUDA support for GPUs).
- For advanced features like speaker diarization, you'll need a HuggingFace token.
- See `INSTALLATION.md` for detailed instructions and troubleshooting.
## Features

3. Run the application:
```
streamlit run app.py
```
| Feature | Status |
|---------|--------|
| Word-level transcription (WhisperX) | Done |
| Text-based video editing | Done |
| Undo/redo | Done |
| Waveform timeline | Done |
| FFmpeg stream-copy export | Done |
| FFmpeg re-encode (up to 4K) | Done |
| AI filler word removal | Done |
| AI clip creation (Shorts) | Done |
| Ollama + OpenAI + Claude | Done |
| Word-level captions (SRT/VTT/ASS) | Done |
| Caption burn-in on export | Done |
| Studio Sound (DeepFilterNet) | Done |
| Keyboard shortcuts (J/K/L) | Done |
| Speaker diarization | Done |
| Virtualized transcript (react-virtuoso) | Done |
| Encrypted API key storage | Done |
| Project save/load (.cutscript) | Done |
| AI background removal | Planned |

## Usage
1. Set your base folder where video/audio recordings are stored
2. Select a recording from the dropdown (supports MP4, AVI, MOV, MKV, M4A)
3. Choose transcription and summarization models
4. Configure performance settings (GPU acceleration, caching)
5. Select export formats and compression options
6. Click "Process Recording" to start
## Keyboard Shortcuts

## Advanced Features
- **Speaker Diarization**: Identify and label different speakers in your recordings
- **Translation**: Automatically detect language and translate to multiple languages
- **Keyword Extraction**: Extract important keywords with timestamp links
- **Interactive Transcript**: Navigate through the transcript with keyword highlighting
- **GPU Acceleration**: Utilize your GPU for faster processing
- **Caching**: Save processing time by caching results
| Key | Action |
|-----|--------|
| Space | Play / Pause |
| J / K / L | Reverse / Pause / Forward |
| ← / → | Seek ±5 seconds |
| Delete | Delete selected words |
| Ctrl+Z | Undo |
| Ctrl+Shift+Z | Redo |
| Ctrl+S | Save project |
| Ctrl+E | Export |
| ? | Shortcut cheatsheet |

## API Endpoints

| Method | Endpoint | Description |
|--------|----------|-------------|
| GET | /health | Health check |
| POST | /transcribe | Transcribe video with WhisperX |
| POST | /export | Export edited video (stream copy or re-encode) |
| POST | /ai/filler-removal | Detect filler words via LLM |
| POST | /ai/create-clip | AI-suggested clips for shorts |
| GET | /ai/ollama-models | List local Ollama models |
| POST | /captions | Generate SRT/VTT/ASS captions |
| POST | /audio/clean | Noise reduction (DeepFilterNet) |
| GET | /audio/capabilities | Check audio processing availability |
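A minimal example of driving the backend from a script. The port matches the dev command above (8642), but the request and response field names shown here are assumptions — check the routers for the actual schemas:

```python
import requests

BASE = "http://127.0.0.1:8642"

# Health check (endpoint from the table above)
print(requests.get(f"{BASE}/health", timeout=5).json())

# Transcription — the payload shape is hypothetical; see backend/routers/transcribe.py
resp = requests.post(
    f"{BASE}/transcribe",
    json={"file_path": "/path/to/recording.mp4", "model": "base"},
    timeout=600,
)
print(resp.status_code, resp.json())
```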
## Key Improvement Areas
## License

### 1. UI Enhancements
- **Implemented:**
  - Responsive layout with columns for better organization
  - Expanded sidebar with categorized settings
  - Custom CSS for improved button styling
  - Spinner for long-running operations
  - Expanded transcript view by default

- **Additional Recommendations:**
  - Add a dark mode toggle
  - Implement progress bars for each processing step
  - Add tooltips for complex options
  - Create a dashboard view for batch processing results
  - Add visualization of transcript segments with timestamps

### 2. Ollama Local API Integration
- **Implemented:**
  - Local API integration for offline summarization
  - Model selection from available Ollama models
  - Chunking for long texts
  - Fallback to online models when Ollama fails

- **Additional Recommendations:**
  - Add temperature and other generation parameters as advanced options
  - Implement streaming responses for real-time feedback
  - Cache results to avoid reprocessing
  - Add support for custom Ollama model creation with specific instructions
  - Implement parallel processing for multiple chunks

### 3. Subtitle Export Formats
- **Implemented:**
  - SRT export with proper formatting
  - ASS export with basic styling
  - Multi-format export options
  - Automatic segment creation from plain text

- **Additional Recommendations:**
  - Add customizable styling options for ASS subtitles
  - Implement subtitle editing before export
  - Add support for VTT format for web videos
  - Implement subtitle timing adjustment
  - Add batch export for multiple files

### 4. Architecture and Code Quality
- **Recommendations:**
  - Implement proper error handling and logging throughout
  - Add unit tests for critical components
  - Create a configuration file for default settings
  - Implement caching for processed files
  - Add type hints throughout the codebase
  - Document API endpoints for potential future web service

### 5. Performance Optimizations
- **Recommendations:**
  - Implement parallel processing for batch operations
  - Add GPU acceleration configuration options
  - Optimize memory usage for large files
  - Implement incremental processing for very long recordings
  - Add compression options for exported files

### 6. Additional Features
- **Recommendations:**
  - Speaker diarization (identifying different speakers)
  - Language detection and translation
  - Keyword extraction and timestamp linking
  - Integration with video editing software
  - Batch processing queue with email notifications
  - Custom vocabulary for domain-specific terminology

## Implementation Roadmap
1. **Phase 1 (Completed):** Basic UI improvements, Ollama integration, and subtitle export
2. **Phase 2 (Completed):** Performance optimizations and additional export formats
   - Added WebVTT export format for web videos
   - Implemented GPU acceleration with automatic device selection
   - Added caching system for faster processing of previously transcribed files
   - Optimized memory usage with configurable memory limits
   - Added compression options for exported files
   - Enhanced ASS subtitle styling options
   - Added progress indicators for better user feedback
3. **Phase 3 (Completed):** Advanced features like speaker diarization and translation
   - Implemented speaker diarization to identify different speakers in recordings
   - Added language detection and translation capabilities
   - Integrated keyword extraction with timestamp linking
   - Created interactive transcript with keyword highlighting
   - Added named entity recognition for better content analysis
   - Generated keyword index with timestamp references
   - Provided speaker statistics and word count analysis
4. **Phase 4:** Integration with other tools and services (In progress)


Reach out to support@dataants.org if you need assistance with any AI solutions — we offer support for n8n workflows, local RAG chatbots, and ERP and financial reporting.
MIT License — see [LICENSE](LICENSE) for details.
app.py (544 changed lines)
@@ -1,544 +0,0 @@
|
||||
import streamlit as st
|
||||
from utils.audio_processing import extract_audio
|
||||
from utils.transcription import transcribe_audio
|
||||
from utils.summarization import summarize_text
|
||||
from utils.validation import validate_environment
|
||||
from utils.export import export_transcript
|
||||
from pathlib import Path
|
||||
import os
|
||||
import logging
|
||||
import humanize
|
||||
from datetime import timedelta
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Try to import Ollama integration, but don't fail if it's not available
|
||||
try:
|
||||
from utils.ollama_integration import check_ollama_available, list_available_models, chunk_and_summarize
|
||||
OLLAMA_AVAILABLE = check_ollama_available()
|
||||
except ImportError:
|
||||
OLLAMA_AVAILABLE = False
|
||||
|
||||
# Try to import GPU utilities, but don't fail if not available
|
||||
try:
|
||||
from utils.gpu_utils import get_gpu_info, configure_gpu
|
||||
GPU_UTILS_AVAILABLE = True
|
||||
except ImportError:
|
||||
GPU_UTILS_AVAILABLE = False
|
||||
|
||||
# Try to import caching utilities, but don't fail if not available
|
||||
try:
|
||||
from utils.cache import get_cache_size, clear_cache
|
||||
CACHE_AVAILABLE = True
|
||||
except ImportError:
|
||||
CACHE_AVAILABLE = False
|
||||
|
||||
# Try to import diarization utilities, but don't fail if not available
|
||||
try:
|
||||
from utils.diarization import transcribe_with_diarization
|
||||
DIARIZATION_AVAILABLE = True
|
||||
except ImportError:
|
||||
DIARIZATION_AVAILABLE = False
|
||||
|
||||
# Try to import translation utilities, but don't fail if not available
|
||||
try:
|
||||
from utils.translation import transcribe_and_translate, get_language_name
|
||||
TRANSLATION_AVAILABLE = True
|
||||
except ImportError:
|
||||
TRANSLATION_AVAILABLE = False
|
||||
|
||||
# Try to import keyword extraction utilities, but don't fail if not available
|
||||
try:
|
||||
from utils.keyword_extraction import extract_keywords_from_transcript, generate_keyword_index, generate_interactive_transcript
|
||||
KEYWORD_EXTRACTION_AVAILABLE = True
|
||||
except ImportError:
|
||||
KEYWORD_EXTRACTION_AVAILABLE = False
|
||||
|
||||
def main():
|
||||
# Set page configuration
|
||||
st.set_page_config(
|
||||
page_title="OBS Recording Transcriber",
|
||||
page_icon="🎥",
|
||||
layout="wide",
|
||||
initial_sidebar_state="expanded"
|
||||
)
|
||||
|
||||
# Custom CSS for better UI
|
||||
st.markdown("""
|
||||
<style>
|
||||
.main .block-container {
|
||||
padding-top: 2rem;
|
||||
padding-bottom: 2rem;
|
||||
}
|
||||
.stButton>button {
|
||||
width: 100%;
|
||||
}
|
||||
.stDownloadButton>button {
|
||||
width: 100%;
|
||||
}
|
||||
.stProgress > div > div > div {
|
||||
background-color: #4CAF50;
|
||||
}
|
||||
.speaker {
|
||||
font-weight: bold;
|
||||
color: #1E88E5;
|
||||
}
|
||||
.timestamp {
|
||||
color: #757575;
|
||||
font-size: 0.9em;
|
||||
margin-right: 8px;
|
||||
}
|
||||
.keyword {
|
||||
background-color: #FFF9C4;
|
||||
padding: 0 2px;
|
||||
border-radius: 3px;
|
||||
}
|
||||
.interactive-transcript p {
|
||||
margin-bottom: 8px;
|
||||
}
|
||||
</style>
|
||||
""", unsafe_allow_html=True)
|
||||
|
||||
st.title("🎥 OBS Recording Transcriber")
|
||||
st.caption("Process your OBS recordings with AI transcription and summarization")
|
||||
|
||||
# Sidebar configuration
|
||||
st.sidebar.header("Settings")
|
||||
|
||||
# Allow the user to select a base folder
|
||||
base_folder = st.sidebar.text_input(
|
||||
"Enter the base folder path:",
|
||||
value=str(Path.home())
|
||||
)
|
||||
|
||||
base_path = Path(base_folder)
|
||||
|
||||
# Model selection
|
||||
st.sidebar.subheader("Model Settings")
|
||||
|
||||
# Transcription model selection
|
||||
transcription_model = st.sidebar.selectbox(
|
||||
"Transcription Model",
|
||||
["tiny", "base", "small", "medium", "large"],
|
||||
index=1,
|
||||
help="Select the Whisper model size. Larger models are more accurate but slower."
|
||||
)
|
||||
|
||||
# Summarization model selection
|
||||
summarization_options = ["Hugging Face (Online)", "Ollama (Local)"] if OLLAMA_AVAILABLE else ["Hugging Face (Online)"]
|
||||
summarization_method = st.sidebar.selectbox(
|
||||
"Summarization Method",
|
||||
summarization_options,
|
||||
index=0,
|
||||
help="Select the summarization method. Ollama runs locally but requires installation."
|
||||
)
|
||||
|
||||
# If Ollama is selected, show model selection
|
||||
ollama_model = None
|
||||
if OLLAMA_AVAILABLE and summarization_method == "Ollama (Local)":
|
||||
available_models = list_available_models()
|
||||
if available_models:
|
||||
ollama_model = st.sidebar.selectbox(
|
||||
"Ollama Model",
|
||||
available_models,
|
||||
index=0 if "llama3" in available_models else 0,
|
||||
help="Select the Ollama model to use for summarization."
|
||||
)
|
||||
else:
|
||||
st.sidebar.warning("No Ollama models found. Please install models using 'ollama pull model_name'.")
|
||||
|
||||
# Advanced features
|
||||
st.sidebar.subheader("Advanced Features")
|
||||
|
||||
# Speaker diarization
|
||||
use_diarization = st.sidebar.checkbox(
|
||||
"Speaker Diarization",
|
||||
value=False,
|
||||
disabled=not DIARIZATION_AVAILABLE,
|
||||
help="Identify different speakers in the recording."
|
||||
)
|
||||
|
||||
# Show HF token input if diarization is enabled
|
||||
hf_token = None
|
||||
if use_diarization and DIARIZATION_AVAILABLE:
|
||||
hf_token = st.sidebar.text_input(
|
||||
"HuggingFace Token",
|
||||
type="password",
|
||||
help="Required for speaker diarization. Get your token at huggingface.co/settings/tokens"
|
||||
)
|
||||
|
||||
num_speakers = st.sidebar.number_input(
|
||||
"Number of Speakers",
|
||||
min_value=1,
|
||||
max_value=10,
|
||||
value=2,
|
||||
help="Specify the number of speakers if known, or leave at default for auto-detection."
|
||||
)
|
||||
|
||||
# Translation
|
||||
use_translation = st.sidebar.checkbox(
|
||||
"Translation",
|
||||
value=False,
|
||||
disabled=not TRANSLATION_AVAILABLE,
|
||||
help="Translate the transcript to another language."
|
||||
)
|
||||
|
||||
# Target language selection if translation is enabled
|
||||
target_lang = None
|
||||
if use_translation and TRANSLATION_AVAILABLE:
|
||||
target_lang = st.sidebar.selectbox(
|
||||
"Target Language",
|
||||
["en", "es", "fr", "de", "it", "pt", "nl", "ru", "zh", "ja", "ko", "ar"],
|
||||
format_func=lambda x: f"{get_language_name(x)} ({x})",
|
||||
help="Select the language to translate to."
|
||||
)
|
||||
|
||||
# Keyword extraction
|
||||
use_keywords = st.sidebar.checkbox(
|
||||
"Keyword Extraction",
|
||||
value=False,
|
||||
disabled=not KEYWORD_EXTRACTION_AVAILABLE,
|
||||
help="Extract keywords and link them to timestamps."
|
||||
)
|
||||
|
||||
if use_keywords and KEYWORD_EXTRACTION_AVAILABLE:
|
||||
max_keywords = st.sidebar.slider(
|
||||
"Max Keywords",
|
||||
min_value=5,
|
||||
max_value=30,
|
||||
value=15,
|
||||
help="Maximum number of keywords to extract."
|
||||
)
|
||||
|
||||
# Performance settings
|
||||
st.sidebar.subheader("Performance Settings")
|
||||
|
||||
# GPU acceleration
|
||||
use_gpu = st.sidebar.checkbox(
|
||||
"Use GPU Acceleration",
|
||||
value=True if GPU_UTILS_AVAILABLE else False,
|
||||
disabled=not GPU_UTILS_AVAILABLE,
|
||||
help="Use GPU for faster processing if available."
|
||||
)
|
||||
|
||||
# Show GPU info if available
|
||||
if GPU_UTILS_AVAILABLE and use_gpu:
|
||||
gpu_info = get_gpu_info()
|
||||
if gpu_info["cuda_available"]:
|
||||
gpu_devices = [f"{d['name']} ({humanize.naturalsize(d['total_memory'])})" for d in gpu_info["cuda_devices"]]
|
||||
st.sidebar.info(f"GPU(s) available: {', '.join(gpu_devices)}")
|
||||
elif gpu_info["mps_available"]:
|
||||
st.sidebar.info("Apple Silicon GPU (MPS) available")
|
||||
else:
|
||||
st.sidebar.warning("No GPU detected. Using CPU.")
|
||||
|
||||
# Memory usage
|
||||
memory_fraction = st.sidebar.slider(
|
||||
"GPU Memory Usage",
|
||||
min_value=0.1,
|
||||
max_value=1.0,
|
||||
value=0.8,
|
||||
step=0.1,
|
||||
disabled=not (GPU_UTILS_AVAILABLE and use_gpu),
|
||||
help="Fraction of GPU memory to use. Lower if you encounter out-of-memory errors."
|
||||
)
|
||||
|
||||
# Caching options
|
||||
use_cache = st.sidebar.checkbox(
|
||||
"Use Caching",
|
||||
value=True if CACHE_AVAILABLE else False,
|
||||
disabled=not CACHE_AVAILABLE,
|
||||
help="Cache transcription results to avoid reprocessing the same files."
|
||||
)
|
||||
|
||||
# Cache management
|
||||
if CACHE_AVAILABLE and use_cache:
|
||||
cache_size, cache_files = get_cache_size()
|
||||
if cache_size > 0:
|
||||
st.sidebar.info(f"Cache: {humanize.naturalsize(cache_size)} ({cache_files} files)")
|
||||
if st.sidebar.button("Clear Cache"):
|
||||
cleared = clear_cache()
|
||||
st.sidebar.success(f"Cleared {cleared} cache files")
|
||||
|
||||
# Export options
|
||||
st.sidebar.subheader("Export Options")
|
||||
export_format = st.sidebar.multiselect(
|
||||
"Export Formats",
|
||||
["TXT", "SRT", "VTT", "ASS"],
|
||||
default=["TXT"],
|
||||
help="Select the formats to export the transcript."
|
||||
)
|
||||
|
||||
# Compression options
|
||||
compress_exports = st.sidebar.checkbox(
|
||||
"Compress Exports",
|
||||
value=False,
|
||||
help="Compress exported files to save space."
|
||||
)
|
||||
|
||||
if compress_exports:
|
||||
compression_type = st.sidebar.radio(
|
||||
"Compression Format",
|
||||
["gzip", "zip"],
|
||||
index=0,
|
||||
help="Select the compression format for exported files."
|
||||
)
|
||||
else:
|
||||
compression_type = None
|
||||
|
||||
# ASS subtitle styling
|
||||
if "ASS" in export_format:
|
||||
st.sidebar.subheader("ASS Subtitle Styling")
|
||||
show_style_options = st.sidebar.checkbox("Customize ASS Style", value=False)
|
||||
|
||||
if show_style_options:
|
||||
ass_style = {}
|
||||
ass_style["fontname"] = st.sidebar.selectbox(
|
||||
"Font",
|
||||
["Arial", "Helvetica", "Times New Roman", "Courier New", "Comic Sans MS"],
|
||||
index=0
|
||||
)
|
||||
ass_style["fontsize"] = st.sidebar.slider("Font Size", 12, 72, 48)
|
||||
ass_style["alignment"] = st.sidebar.selectbox(
|
||||
"Alignment",
|
||||
["2 (Bottom Center)", "1 (Bottom Left)", "3 (Bottom Right)", "8 (Top Center)"],
|
||||
index=0
|
||||
).split()[0] # Extract just the number
|
||||
ass_style["bold"] = "-1" if st.sidebar.checkbox("Bold", value=True) else "0"
|
||||
ass_style["italic"] = "-1" if st.sidebar.checkbox("Italic", value=False) else "0"
|
||||
else:
|
||||
ass_style = None
|
||||
|
||||
# Validate environment
|
||||
env_errors = validate_environment(base_path)
|
||||
if env_errors:
|
||||
st.error("## Environment Issues")
|
||||
for error in env_errors:
|
||||
st.markdown(f"- {error}")
|
||||
return
|
||||
|
||||
# File selection - support multiple video and audio formats
|
||||
supported_extensions = ["*.mp4", "*.avi", "*.mov", "*.mkv", "*.m4a"]
|
||||
recordings = []
|
||||
for extension in supported_extensions:
|
||||
recordings.extend(base_path.glob(extension))
|
||||
|
||||
if not recordings:
|
||||
st.warning(f"📂 No recordings found in the folder: {base_folder}!")
|
||||
st.info("💡 Supported formats: MP4, AVI, MOV, MKV, M4A")
|
||||
return
|
||||
|
||||
selected_file = st.selectbox("Choose a recording", recordings)
|
||||
|
||||
# Process button with spinner
|
||||
if st.button("🚀 Start Processing"):
|
||||
# Create a progress bar
|
||||
progress_bar = st.progress(0)
|
||||
status_text = st.empty()
|
||||
|
||||
try:
|
||||
# Update progress
|
||||
status_text.text("Extracting audio...")
|
||||
progress_bar.progress(10)
|
||||
|
||||
# Process based on selected features
|
||||
if use_diarization and DIARIZATION_AVAILABLE and hf_token:
|
||||
# Transcribe with speaker diarization
|
||||
status_text.text("Transcribing with speaker diarization...")
|
||||
                num_speakers_arg = int(num_speakers) if num_speakers > 0 else None
                diarized_segments, diarized_transcript = transcribe_with_diarization(
                    selected_file,
                    whisper_model=transcription_model,
                    num_speakers=num_speakers_arg,
                    use_gpu=use_gpu,
                    hf_token=hf_token
                )
                segments = diarized_segments
                transcript = diarized_transcript
            elif use_translation and TRANSLATION_AVAILABLE:
                # Transcribe and translate
                status_text.text("Transcribing and translating...")
                original_segments, translated_segments, original_transcript, translated_transcript = transcribe_and_translate(
                    selected_file,
                    whisper_model=transcription_model,
                    target_lang=target_lang,
                    use_gpu=use_gpu
                )
                segments = translated_segments
                transcript = translated_transcript
                # Store original for display
                original_text = original_transcript
            else:
                # Standard transcription
                status_text.text("Transcribing audio...")
                segments, transcript = transcribe_audio(
                    selected_file,
                    model=transcription_model,
                    use_cache=use_cache,
                    use_gpu=use_gpu,
                    memory_fraction=memory_fraction
                )

            progress_bar.progress(50)

            if transcript:
                # Extract keywords if requested
                keyword_timestamps = None
                entity_timestamps = None
                if use_keywords and KEYWORD_EXTRACTION_AVAILABLE:
                    status_text.text("Extracting keywords...")
                    keyword_timestamps, entity_timestamps = extract_keywords_from_transcript(
                        transcript,
                        segments,
                        max_keywords=max_keywords,
                        use_gpu=use_gpu
                    )

                    # Generate keyword index
                    keyword_index = generate_keyword_index(keyword_timestamps, entity_timestamps)

                    # Generate interactive transcript
                    interactive_transcript = generate_interactive_transcript(
                        segments,
                        keyword_timestamps,
                        entity_timestamps
                    )

                # Generate summary based on selected method
                status_text.text("Generating summary...")
                if OLLAMA_AVAILABLE and summarization_method == "Ollama (Local)" and ollama_model:
                    summary = chunk_and_summarize(transcript, model=ollama_model)
                    if not summary:
                        st.warning("Ollama summarization failed. Falling back to Hugging Face.")
                        summary = summarize_text(
                            transcript,
                            use_gpu=use_gpu,
                            memory_fraction=memory_fraction
                        )
                else:
                    summary = summarize_text(
                        transcript,
                        use_gpu=use_gpu,
                        memory_fraction=memory_fraction
                    )

                progress_bar.progress(80)
                status_text.text("Preparing results...")

                # Display results in tabs
                tab1, tab2, tab3 = st.tabs(["Summary", "Transcript", "Advanced"])

                with tab1:
                    st.subheader("🖍 Summary")
                    st.write(summary)

                    # If translation was used, show original language
                    if use_translation and TRANSLATION_AVAILABLE and 'original_text' in locals():
                        with st.expander("Original Language Summary"):
                            original_summary = summarize_text(
                                original_text,
                                use_gpu=use_gpu,
                                memory_fraction=memory_fraction
                            )
                            st.write(original_summary)

                with tab2:
                    st.subheader("📜 Full Transcript")

                    # Show interactive transcript if keywords were extracted
                    if use_keywords and KEYWORD_EXTRACTION_AVAILABLE and 'interactive_transcript' in locals():
                        st.markdown(interactive_transcript, unsafe_allow_html=True)
                    else:
                        st.text(transcript)

                    # If translation was used, show original language
                    if use_translation and TRANSLATION_AVAILABLE and 'original_text' in locals():
                        with st.expander("Original Language Transcript"):
                            st.text(original_text)

                with tab3:
                    # Show keyword index if available
                    if use_keywords and KEYWORD_EXTRACTION_AVAILABLE and 'keyword_index' in locals():
                        st.subheader("🔑 Keyword Index")
                        st.markdown(keyword_index)

                    # Show speaker information if available
                    if use_diarization and DIARIZATION_AVAILABLE:
                        st.subheader("🎙️ Speaker Information")
                        speakers = set(segment.get('speaker', 'UNKNOWN') for segment in segments)
                        st.write(f"Detected {len(speakers)} speakers: {', '.join(speakers)}")

                        # Count words per speaker
                        speaker_words = {}
                        for segment in segments:
                            speaker = segment.get('speaker', 'UNKNOWN')
                            words = len(segment['text'].split())
                            if speaker in speaker_words:
                                speaker_words[speaker] += words
                            else:
                                speaker_words[speaker] = words

                        # Display speaker statistics
                        st.write("### Speaker Statistics")
                        for speaker, words in speaker_words.items():
                            st.write(f"- **{speaker}**: {words} words")

                    # Export options
                    st.subheader("💾 Export Options")
                    export_cols = st.columns(len(export_format))

                    output_base = Path(selected_file).stem

                    for i, format_type in enumerate(export_format):
                        with export_cols[i]:
                            if format_type == "TXT":
                                st.download_button(
                                    label=f"Download {format_type}",
                                    data=transcript,
                                    file_name=f"{output_base}_transcript.txt",
                                    mime="text/plain"
                                )
                            elif format_type in ["SRT", "VTT", "ASS"]:
                                # Export to subtitle format
                                output_path = export_transcript(
                                    transcript,
                                    output_base,
                                    format_type.lower(),
                                    segments=segments,
                                    compress=compress_exports,
                                    compression_type=compression_type,
                                    style=ass_style if format_type == "ASS" and ass_style else None
                                )

                                # Read the exported file for download
                                with open(output_path, 'rb') as f:
                                    subtitle_content = f.read()

                                # Determine file extension
                                file_ext = f".{format_type.lower()}"
                                if compress_exports:
                                    file_ext += ".gz" if compression_type == "gzip" else ".zip"

                                st.download_button(
                                    label=f"Download {format_type}",
                                    data=subtitle_content,
                                    file_name=f"{output_base}{file_ext}",
                                    mime="application/octet-stream"
                                )

                                # Clean up the temporary file
                                os.remove(output_path)

                # Complete progress
                progress_bar.progress(100)
                status_text.text("Processing complete!")
            else:
                st.error("❌ Failed to process recording")
        except Exception as e:
            st.error(f"An error occurred: {e}")
            st.write(e)  # This will show the traceback in the Streamlit app


if __name__ == "__main__":
    main()
117
backend/main.py
Normal file
117
backend/main.py
Normal file
@ -0,0 +1,117 @@
import logging
import os
import stat
from contextlib import asynccontextmanager
from pathlib import Path

from fastapi import FastAPI, Query, Request, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse

from routers import transcribe, export, ai, captions, audio

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@asynccontextmanager
async def lifespan(app: FastAPI):
    logger.info("AI Video Editor backend starting up")
    yield
    logger.info("AI Video Editor backend shutting down")


app = FastAPI(
    title="AI Video Editor Backend",
    version="0.1.0",
    lifespan=lifespan,
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
    expose_headers=["Content-Range", "Accept-Ranges", "Content-Length"],
)

app.include_router(transcribe.router)
app.include_router(export.router)
app.include_router(ai.router)
app.include_router(captions.router)
app.include_router(audio.router)


MIME_MAP = {
    ".mp4": "video/mp4",
    ".mkv": "video/x-matroska",
    ".mov": "video/quicktime",
    ".avi": "video/x-msvideo",
    ".webm": "video/webm",
    ".m4a": "audio/mp4",
    ".wav": "audio/wav",
    ".mp3": "audio/mpeg",
    ".flac": "audio/flac",
}


@app.get("/file")
async def serve_local_file(request: Request, path: str = Query(...)):
    """Stream a local file with HTTP Range support (required for video seeking)."""
    file_path = Path(path)
    if not file_path.is_file():
        raise HTTPException(status_code=404, detail=f"File not found: {path}")

    file_size = file_path.stat().st_size
    content_type = MIME_MAP.get(file_path.suffix.lower(), "application/octet-stream")

    range_header = request.headers.get("range")
    if range_header:
        range_spec = range_header.replace("bytes=", "")
        range_start_str, range_end_str = range_spec.split("-")
        range_start = int(range_start_str) if range_start_str else 0
        range_end = int(range_end_str) if range_end_str else file_size - 1
        range_end = min(range_end, file_size - 1)
        content_length = range_end - range_start + 1

        def iter_range():
            with open(file_path, "rb") as f:
                f.seek(range_start)
                remaining = content_length
                while remaining > 0:
                    chunk = f.read(min(65536, remaining))
                    if not chunk:
                        break
                    remaining -= len(chunk)
                    yield chunk

        return StreamingResponse(
            iter_range(),
            status_code=206,
            media_type=content_type,
            headers={
                "Content-Range": f"bytes {range_start}-{range_end}/{file_size}",
                "Accept-Ranges": "bytes",
                "Content-Length": str(content_length),
            },
        )

    def iter_file():
        with open(file_path, "rb") as f:
            while chunk := f.read(65536):
                yield chunk

    return StreamingResponse(
        iter_file(),
        media_type=content_type,
        headers={
            "Accept-Ranges": "bytes",
            "Content-Length": str(file_size),
        },
    )


@app.get("/health")
async def health():
    return {"status": "ok"}
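Note (illustrative, not part of the commit): a minimal sketch of how a client could exercise the /file endpoint's Range support; the localhost port and the media path are assumptions for the example.

# Assumes the backend is running on localhost:8000 and the file path exists on disk.
import requests

resp = requests.get(
    "http://localhost:8000/file",
    params={"path": "/videos/demo.mp4"},
    headers={"Range": "bytes=0-1023"},   # request only the first 1 KiB
)
print(resp.status_code)                  # 206 Partial Content when the Range is honored
print(resp.headers["Content-Range"])     # e.g. "bytes 0-1023/104857600"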
33
backend/requirements.txt
Normal file
33
backend/requirements.txt
Normal file
@ -0,0 +1,33 @@
# FastAPI backend
fastapi>=0.115.0
uvicorn[standard]>=0.32.0
websockets>=14.0
python-multipart>=0.0.12

# Transcription (WhisperX for word-level alignment)
whisperx>=3.1.0
faster-whisper>=1.0.0

# Audio / Video processing
moviepy>=1.0.3
ffmpeg-python>=0.2.0
soundfile>=0.10.3

# ML / GPU
torch>=2.0.0
torchaudio>=2.0.0
numpy>=1.24.0

# Speaker diarization
pyannote.audio>=3.1.1

# AI providers
openai>=1.50.0
anthropic>=0.39.0
requests>=2.28.0

# Audio cleanup
deepfilternet>=0.5.0

# Utilities
pydantic>=2.0.0
0
backend/routers/__init__.py
Normal file
0
backend/routers/__init__.py
Normal file
83
backend/routers/ai.py
Normal file
83
backend/routers/ai.py
Normal file
@ -0,0 +1,83 @@
"""AI feature endpoints: filler word detection, clip creation, Ollama model listing."""

import logging
from typing import List, Optional

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from services.ai_provider import AIProvider, detect_filler_words, create_clip_suggestion

logger = logging.getLogger(__name__)
router = APIRouter()


class WordInfo(BaseModel):
    index: int
    word: str
    start: Optional[float] = None
    end: Optional[float] = None


class FillerRequest(BaseModel):
    transcript: str
    words: List[WordInfo]
    provider: str = "ollama"
    model: Optional[str] = None
    api_key: Optional[str] = None
    base_url: Optional[str] = None
    custom_filler_words: Optional[str] = None


class ClipRequest(BaseModel):
    transcript: str
    words: List[WordInfo]
    provider: str = "ollama"
    model: Optional[str] = None
    api_key: Optional[str] = None
    base_url: Optional[str] = None
    target_duration: int = 60


@router.post("/ai/filler-removal")
async def filler_removal(req: FillerRequest):
    try:
        words_dicts = [w.model_dump() for w in req.words]
        result = detect_filler_words(
            transcript=req.transcript,
            words=words_dicts,
            provider=req.provider,
            model=req.model,
            api_key=req.api_key,
            base_url=req.base_url,
            custom_filler_words=req.custom_filler_words,
        )
        return result
    except Exception as e:
        logger.error(f"Filler detection failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/ai/create-clip")
async def create_clip(req: ClipRequest):
    try:
        words_dicts = [w.model_dump() for w in req.words]
        result = create_clip_suggestion(
            transcript=req.transcript,
            words=words_dicts,
            target_duration=req.target_duration,
            provider=req.provider,
            model=req.model,
            api_key=req.api_key,
            base_url=req.base_url,
        )
        return result
    except Exception as e:
        logger.error(f"Clip creation failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))


@router.get("/ai/ollama-models")
async def ollama_models(base_url: str = "http://localhost:11434"):
    models = AIProvider.list_ollama_models(base_url)
    return {"models": models}
38
backend/routers/audio.py
Normal file
38
backend/routers/audio.py
Normal file
@ -0,0 +1,38 @@
"""Audio processing endpoint (noise reduction / Studio Sound)."""

import logging
from typing import Optional

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from services.audio_cleaner import clean_audio, is_deepfilter_available

logger = logging.getLogger(__name__)
router = APIRouter()


class AudioCleanRequest(BaseModel):
    input_path: str
    output_path: Optional[str] = None


@router.post("/audio/clean")
async def clean_audio_endpoint(req: AudioCleanRequest):
    try:
        output = clean_audio(req.input_path, req.output_path or "")
        return {
            "status": "ok",
            "output_path": output,
            "engine": "deepfilternet" if is_deepfilter_available() else "ffmpeg_anlmdn",
        }
    except Exception as e:
        logger.error(f"Audio cleaning failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))


@router.get("/audio/capabilities")
async def audio_capabilities():
    return {
        "deepfilternet_available": is_deepfilter_available(),
    }
65
backend/routers/captions.py
Normal file
65
backend/routers/captions.py
Normal file
@ -0,0 +1,65 @@
"""Caption generation endpoint."""

import logging
from typing import List, Optional

from fastapi import APIRouter, HTTPException
from fastapi.responses import PlainTextResponse
from pydantic import BaseModel

from services.caption_generator import generate_srt, generate_vtt, generate_ass, save_captions

logger = logging.getLogger(__name__)
router = APIRouter()


class CaptionWord(BaseModel):
    word: str
    start: float
    end: float
    confidence: float = 0.0


class CaptionStyle(BaseModel):
    fontName: str = "Arial"
    fontSize: int = 48
    fontColor: str = "&H00FFFFFF"
    backgroundColor: str = "&H80000000"
    position: str = "bottom"
    bold: bool = True


class CaptionRequest(BaseModel):
    words: List[CaptionWord]
    deleted_indices: List[int] = []
    format: str = "srt"
    words_per_line: int = 8
    style: Optional[CaptionStyle] = None
    output_path: Optional[str] = None


@router.post("/captions")
async def generate_captions(req: CaptionRequest):
    try:
        words_dicts = [w.model_dump() for w in req.words]
        deleted_set = set(req.deleted_indices)

        if req.format == "srt":
            content = generate_srt(words_dicts, deleted_set, req.words_per_line)
        elif req.format == "vtt":
            content = generate_vtt(words_dicts, deleted_set, req.words_per_line)
        elif req.format == "ass":
            style_dict = req.style.model_dump() if req.style else None
            content = generate_ass(words_dicts, deleted_set, req.words_per_line, style_dict)
        else:
            raise HTTPException(status_code=400, detail=f"Unknown format: {req.format}")

        if req.output_path:
            saved = save_captions(content, req.output_path)
            return {"status": "ok", "output_path": saved}

        return PlainTextResponse(content, media_type="text/plain")

    except Exception as e:
        logger.error(f"Caption generation failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
156
backend/routers/export.py
Normal file
156
backend/routers/export.py
Normal file
@ -0,0 +1,156 @@
"""Export endpoint for video cutting and rendering."""

import logging
import tempfile
import os
from typing import List, Optional

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from services.video_editor import export_stream_copy, export_reencode, export_reencode_with_subs
from services.audio_cleaner import clean_audio
from services.caption_generator import generate_srt, generate_ass, save_captions

logger = logging.getLogger(__name__)
router = APIRouter()


class SegmentModel(BaseModel):
    start: float
    end: float


class ExportWordModel(BaseModel):
    word: str
    start: float
    end: float
    confidence: float = 0.0


class ExportRequest(BaseModel):
    input_path: str
    output_path: str
    keep_segments: List[SegmentModel]
    mode: str = "fast"
    resolution: str = "1080p"
    format: str = "mp4"
    enhanceAudio: bool = False
    captions: str = "none"
    words: Optional[List[ExportWordModel]] = None
    deleted_indices: Optional[List[int]] = None


def _mux_audio(video_path: str, audio_path: str, output_path: str) -> str:
    """Replace video's audio track with cleaned audio using FFmpeg."""
    import subprocess
    cmd = [
        "ffmpeg", "-y",
        "-i", video_path,
        "-i", audio_path,
        "-c:v", "copy",
        "-map", "0:v:0",
        "-map", "1:a:0",
        "-shortest",
        output_path,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"Audio mux failed: {result.stderr[-300:]}")
    return output_path


@router.post("/export")
async def export_video(req: ExportRequest):
    try:
        segments = [{"start": s.start, "end": s.end} for s in req.keep_segments]

        if not segments:
            raise HTTPException(status_code=400, detail="No segments to export")

        use_stream_copy = req.mode == "fast" and len(segments) == 1
        needs_reencode_for_subs = req.captions == "burn-in"

        # Burn-in captions require re-encode
        if needs_reencode_for_subs:
            use_stream_copy = False

        words_dicts = [w.model_dump() for w in req.words] if req.words else []
        deleted_set = set(req.deleted_indices or [])

        # Generate ASS file for burn-in
        ass_path = None
        if req.captions == "burn-in" and words_dicts:
            ass_content = generate_ass(words_dicts, deleted_set)
            tmp = tempfile.NamedTemporaryFile(suffix=".ass", delete=False, mode="w", encoding="utf-8")
            tmp.write(ass_content)
            tmp.close()
            ass_path = tmp.name

        try:
            if use_stream_copy:
                output = export_stream_copy(req.input_path, req.output_path, segments)
            elif ass_path:
                output = export_reencode_with_subs(
                    req.input_path,
                    req.output_path,
                    segments,
                    ass_path,
                    resolution=req.resolution,
                    format_hint=req.format,
                )
            else:
                output = export_reencode(
                    req.input_path,
                    req.output_path,
                    segments,
                    resolution=req.resolution,
                    format_hint=req.format,
                )
        finally:
            if ass_path and os.path.exists(ass_path):
                os.unlink(ass_path)

        # Audio enhancement: clean, then mux back into the exported video
        if req.enhanceAudio:
            try:
                tmp_dir = tempfile.mkdtemp(prefix="cutscript_audio_")
                cleaned_audio = os.path.join(tmp_dir, "cleaned.wav")
                clean_audio(output, cleaned_audio)

                muxed_path = output + ".muxed.mp4"
                _mux_audio(output, cleaned_audio, muxed_path)

                os.replace(muxed_path, output)
                logger.info(f"Audio enhanced and muxed into {output}")

                # Cleanup
                try:
                    os.remove(cleaned_audio)
                    os.rmdir(tmp_dir)
                except OSError:
                    pass
            except Exception as e:
                logger.warning(f"Audio enhancement failed (non-fatal): {e}")

        # Sidecar SRT: generate and save alongside video
        srt_path = None
        if req.captions == "sidecar" and words_dicts:
            srt_content = generate_srt(words_dicts, deleted_set)
            srt_path = req.output_path.rsplit(".", 1)[0] + ".srt"
            save_captions(srt_content, srt_path)
            logger.info(f"Sidecar SRT saved to {srt_path}")

        result = {"status": "ok", "output_path": output}
        if srt_path:
            result["srt_path"] = srt_path
        return result

    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except RuntimeError as e:
        logger.error(f"Export failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
    except Exception as e:
        logger.error(f"Export error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
53
backend/routers/transcribe.py
Normal file
53
backend/routers/transcribe.py
Normal file
@ -0,0 +1,53 @@
"""Transcription endpoint using WhisperX."""

import logging
from typing import Optional

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from services.transcription import transcribe_audio
from services.diarization import diarize_and_label

logger = logging.getLogger(__name__)
router = APIRouter()


class TranscribeRequest(BaseModel):
    file_path: str
    model: str = "base"
    language: Optional[str] = None
    use_gpu: bool = True
    use_cache: bool = True
    diarize: bool = False
    hf_token: Optional[str] = None
    num_speakers: Optional[int] = None


@router.post("/transcribe")
async def transcribe(req: TranscribeRequest):
    try:
        result = transcribe_audio(
            file_path=req.file_path,
            model_name=req.model,
            use_gpu=req.use_gpu,
            use_cache=req.use_cache,
            language=req.language,
        )

        if req.diarize and req.hf_token:
            result = diarize_and_label(
                transcription_result=result,
                audio_path=req.file_path,
                hf_token=req.hf_token,
                num_speakers=req.num_speakers,
                use_gpu=req.use_gpu,
            )

        return result

    except FileNotFoundError:
        raise HTTPException(status_code=404, detail=f"File not found: {req.file_path}")
    except Exception as e:
        logger.error(f"Transcription failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
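Note (illustrative, not part of the commit): a sketch of a request body this route accepts, assuming the backend on localhost:8000 and a local media path.

# Hypothetical client call against the /transcribe route shown above.
import requests

payload = {
    "file_path": "/videos/demo.mp4",
    "model": "base",
    "use_gpu": True,
    "diarize": False,
}
result = requests.post("http://localhost:8000/transcribe", json=payload).json()
print(result["language"], len(result["words"]), "words")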
0
backend/services/__init__.py
Normal file
0
backend/services/__init__.py
Normal file
211
backend/services/ai_provider.py
Normal file
211
backend/services/ai_provider.py
Normal file
@ -0,0 +1,211 @@
"""
Unified AI provider interface for Ollama, OpenAI, and Claude.
"""

import json
import logging
from typing import Optional, List

import requests

logger = logging.getLogger(__name__)


class AIProvider:
    """Routes completion requests to the configured provider."""

    @staticmethod
    def complete(
        prompt: str,
        provider: str = "ollama",
        model: Optional[str] = None,
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
        system_prompt: Optional[str] = None,
        temperature: float = 0.3,
    ) -> str:
        if provider == "ollama":
            return _ollama_complete(prompt, model or "llama3", base_url or "http://localhost:11434", system_prompt, temperature)
        elif provider == "openai":
            return _openai_complete(prompt, model or "gpt-4o", api_key or "", system_prompt, temperature)
        elif provider == "claude":
            return _claude_complete(prompt, model or "claude-sonnet-4-20250514", api_key or "", system_prompt, temperature)
        else:
            raise ValueError(f"Unknown provider: {provider}")

    @staticmethod
    def list_ollama_models(base_url: str = "http://localhost:11434") -> List[str]:
        try:
            resp = requests.get(f"{base_url}/api/tags", timeout=3)
            if resp.status_code == 200:
                return [m["name"] for m in resp.json().get("models", [])]
        except Exception:
            pass
        return []


def _ollama_complete(prompt: str, model: str, base_url: str, system_prompt: Optional[str], temperature: float) -> str:
    body = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": temperature},
    }
    if system_prompt:
        body["system"] = system_prompt

    try:
        resp = requests.post(f"{base_url}/api/generate", json=body, timeout=120)
        resp.raise_for_status()
        return resp.json().get("response", "").strip()
    except Exception as e:
        logger.error(f"Ollama error: {e}")
        raise


def _openai_complete(prompt: str, model: str, api_key: str, system_prompt: Optional[str], temperature: float) -> str:
    try:
        from openai import OpenAI
        client = OpenAI(api_key=api_key)
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        logger.error(f"OpenAI error: {e}")
        raise


def _claude_complete(prompt: str, model: str, api_key: str, system_prompt: Optional[str], temperature: float) -> str:
    try:
        import anthropic
        client = anthropic.Anthropic(api_key=api_key)
        kwargs = {
            "model": model,
            "max_tokens": 4096,
            "temperature": temperature,
            "messages": [{"role": "user", "content": prompt}],
        }
        if system_prompt:
            kwargs["system"] = system_prompt

        response = client.messages.create(**kwargs)
        return response.content[0].text.strip()
    except Exception as e:
        logger.error(f"Claude error: {e}")
        raise


def detect_filler_words(
    transcript: str,
    words: List[dict],
    provider: str = "ollama",
    model: Optional[str] = None,
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
    custom_filler_words: Optional[str] = None,
) -> dict:
    """
    Use an LLM to identify filler words in the transcript.
    Returns {"wordIndices": [...], "fillerWords": [{"index": N, "word": "...", "reason": "..."}]}
    """
    word_list = "\n".join(f"{w['index']}: {w['word']}" for w in words)

    custom_line = ""
    if custom_filler_words and custom_filler_words.strip():
        custom_line = f"\n\nAdditionally, flag these user-specified filler words/phrases: {custom_filler_words.strip()}"

    prompt = f"""Analyze this transcript for filler words and verbal hesitations.

Filler words include: um, uh, uh huh, hmm, like (when used as filler), you know, so (when starting sentences unnecessarily), basically, actually, literally, right, I mean, kind of, sort of, well (when used as filler).

Also flag repeated words that indicate stammering (e.g., "I I I" or "the the").{custom_line}

Here are the words with their indices:
{word_list}

Return ONLY a valid JSON object with this exact structure:
{{"wordIndices": [list of integer indices to remove], "fillerWords": [{{"index": integer, "word": "the word", "reason": "brief reason"}}]}}

Be conservative -- only flag clear filler words, not words that are part of meaningful sentences."""

    system = "You are a precise text analysis tool. Return only valid JSON, no explanation."

    result_text = AIProvider.complete(
        prompt=prompt,
        provider=provider,
        model=model,
        api_key=api_key,
        base_url=base_url,
        system_prompt=system,
        temperature=0.1,
    )

    try:
        start = result_text.find("{")
        end = result_text.rfind("}") + 1
        if start >= 0 and end > start:
            return json.loads(result_text[start:end])
    except json.JSONDecodeError:
        logger.error(f"Failed to parse AI response as JSON: {result_text[:200]}")

    return {"wordIndices": [], "fillerWords": []}


def create_clip_suggestion(
    transcript: str,
    words: List[dict],
    target_duration: int = 60,
    provider: str = "ollama",
    model: Optional[str] = None,
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
) -> dict:
    """
    Use an LLM to find the best clip segments in a transcript.
    """
    word_list = "\n".join(
        f"{w['index']}: \"{w['word']}\" ({w.get('start', 0):.1f}s - {w.get('end', 0):.1f}s)"
        for w in words
    )

    prompt = f"""Analyze this transcript and find the most engaging {target_duration}-second segment(s) that would work well as a YouTube Short or social media clip.

Look for: compelling stories, surprising facts, emotional moments, clear explanations, humor, or quotable statements.

Words with indices and timestamps:
{word_list}

Return ONLY a valid JSON object:
{{"clips": [{{"title": "short catchy title", "startWordIndex": integer, "endWordIndex": integer, "startTime": float, "endTime": float, "reason": "why this segment is engaging"}}]}}

Suggest 1-3 clips, each approximately {target_duration} seconds long."""

    system = "You are a viral content expert. Return only valid JSON, no explanation."

    result_text = AIProvider.complete(
        prompt=prompt,
        provider=provider,
        model=model,
        api_key=api_key,
        base_url=base_url,
        system_prompt=system,
        temperature=0.5,
    )

    try:
        start = result_text.find("{")
        end = result_text.rfind("}") + 1
        if start >= 0 and end > start:
            return json.loads(result_text[start:end])
    except json.JSONDecodeError:
        logger.error(f"Failed to parse clip suggestions: {result_text[:200]}")

    return {"clips": []}
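Note (illustrative, not part of the commit): a minimal sketch of calling the provider layer directly against a local Ollama instance; the model name and word list are made up for the example.

# Hypothetical usage of detect_filler_words as defined above.
from services.ai_provider import detect_filler_words

words = [
    {"index": 0, "word": "So", "start": 0.0, "end": 0.2},
    {"index": 1, "word": "um", "start": 0.2, "end": 0.4},
    {"index": 2, "word": "welcome", "start": 0.4, "end": 0.9},
]
result = detect_filler_words(
    transcript="So um welcome",
    words=words,
    provider="ollama",
    model="llama3",
)
print(result["wordIndices"])  # e.g. [0, 1] if the model flags "So" and "um"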
79
backend/services/audio_cleaner.py
Normal file
79
backend/services/audio_cleaner.py
Normal file
@ -0,0 +1,79 @@
"""
Audio noise reduction using DeepFilterNet.
Falls back to a basic FFmpeg noise filter if DeepFilterNet is not installed.
"""

import logging
import subprocess
import tempfile
from pathlib import Path

logger = logging.getLogger(__name__)

try:
    from df.enhance import enhance, init_df, load_audio, save_audio
    DEEPFILTER_AVAILABLE = True
except ImportError:
    DEEPFILTER_AVAILABLE = False


_df_model = None
_df_state = None


def _init_deepfilter():
    global _df_model, _df_state
    if _df_model is None:
        logger.info("Initializing DeepFilterNet model")
        _df_model, _df_state, _ = init_df()
    return _df_model, _df_state


def clean_audio(
    input_path: str,
    output_path: str = "",
) -> str:
    """
    Apply noise reduction to an audio file.

    If DeepFilterNet is available, uses it for high-quality results.
    Otherwise falls back to FFmpeg's anlmdn filter.

    Returns: path to the cleaned audio file.
    """
    input_path = Path(input_path)
    if not output_path:
        output_path = str(input_path.with_stem(input_path.stem + "_clean"))

    if DEEPFILTER_AVAILABLE:
        return _clean_with_deepfilter(str(input_path), output_path)
    else:
        return _clean_with_ffmpeg(str(input_path), output_path)


def _clean_with_deepfilter(input_path: str, output_path: str) -> str:
    model, state = _init_deepfilter()
    audio, info = load_audio(input_path, sr=state.sr())
    enhanced = enhance(model, state, audio)
    save_audio(output_path, enhanced, sr=state.sr())
    logger.info(f"DeepFilterNet cleaned audio saved to {output_path}")
    return output_path


def _clean_with_ffmpeg(input_path: str, output_path: str) -> str:
    """Fallback: basic noise reduction using FFmpeg's anlmdn filter."""
    cmd = [
        "ffmpeg", "-y",
        "-i", input_path,
        "-af", "anlmdn=s=7:p=0.002:r=0.002:m=15",
        output_path,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"FFmpeg audio cleaning failed: {result.stderr[-300:]}")
    logger.info(f"FFmpeg cleaned audio saved to {output_path}")
    return output_path


def is_deepfilter_available() -> bool:
    return DEEPFILTER_AVAILABLE
59
backend/services/background_removal.py
Normal file
59
backend/services/background_removal.py
Normal file
@ -0,0 +1,59 @@
"""
AI background removal (Phase 5 - future).
Uses MediaPipe or Robust Video Matting for person segmentation.
Export-only -- no real-time preview.
"""

import logging

logger = logging.getLogger(__name__)

# Placeholder for Phase 5 implementation
# Will use mediapipe or rvm for segmentation at export time

MEDIAPIPE_AVAILABLE = False
RVM_AVAILABLE = False

try:
    import mediapipe as mp
    MEDIAPIPE_AVAILABLE = True
except ImportError:
    pass

try:
    pass  # rvm import would go here
except ImportError:
    pass


def is_available() -> bool:
    return MEDIAPIPE_AVAILABLE or RVM_AVAILABLE


def remove_background_on_export(
    input_path: str,
    output_path: str,
    replacement: str = "blur",
    replacement_value: str = "",
) -> str:
    """
    Process video frame-by-frame to remove/replace background.
    Only runs during export (not real-time).

    Args:
        input_path: source video
        output_path: destination
        replacement: 'blur', 'color', 'image', or 'video'
        replacement_value: hex color, image path, or video path

    Returns:
        output_path
    """
    if not is_available():
        raise RuntimeError(
            "Background removal requires mediapipe or robust-video-matting. "
            "Install with: pip install mediapipe"
        )

    # Phase 5 implementation will go here
    raise NotImplementedError("Background removal is planned for Phase 5")
148
backend/services/caption_generator.py
Normal file
148
backend/services/caption_generator.py
Normal file
@ -0,0 +1,148 @@
"""
Generate caption files (SRT, VTT, ASS) from word-level timestamps.
"""

import logging
from pathlib import Path
from typing import List, Optional

logger = logging.getLogger(__name__)


def _format_srt_time(seconds: float) -> str:
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    ms = int((seconds % 1) * 1000)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"


def _format_vtt_time(seconds: float) -> str:
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    ms = int((seconds % 1) * 1000)
    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"


def _format_ass_time(seconds: float) -> str:
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    cs = int((seconds % 1) * 100)
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"


def generate_srt(
    words: List[dict],
    deleted_indices: Optional[set] = None,
    words_per_line: int = 8,
) -> str:
    """Generate SRT caption content from word-level timestamps."""
    deleted_indices = deleted_indices or set()
    active_words = [(i, w) for i, w in enumerate(words) if i not in deleted_indices]

    lines = []
    counter = 1
    for chunk_start in range(0, len(active_words), words_per_line):
        chunk = active_words[chunk_start:chunk_start + words_per_line]
        if not chunk:
            continue

        start_time = chunk[0][1]["start"]
        end_time = chunk[-1][1]["end"]
        text = " ".join(w["word"] for _, w in chunk)

        lines.append(str(counter))
        lines.append(f"{_format_srt_time(start_time)} --> {_format_srt_time(end_time)}")
        lines.append(text)
        lines.append("")
        counter += 1

    return "\n".join(lines)


def generate_vtt(
    words: List[dict],
    deleted_indices: Optional[set] = None,
    words_per_line: int = 8,
) -> str:
    """Generate WebVTT caption content."""
    deleted_indices = deleted_indices or set()
    active_words = [(i, w) for i, w in enumerate(words) if i not in deleted_indices]

    lines = ["WEBVTT", ""]
    for chunk_start in range(0, len(active_words), words_per_line):
        chunk = active_words[chunk_start:chunk_start + words_per_line]
        if not chunk:
            continue

        start_time = chunk[0][1]["start"]
        end_time = chunk[-1][1]["end"]
        text = " ".join(w["word"] for _, w in chunk)

        lines.append(f"{_format_vtt_time(start_time)} --> {_format_vtt_time(end_time)}")
        lines.append(text)
        lines.append("")

    return "\n".join(lines)


def generate_ass(
    words: List[dict],
    deleted_indices: Optional[set] = None,
    words_per_line: int = 8,
    style: Optional[dict] = None,
) -> str:
    """Generate ASS subtitle content with styling."""
    deleted_indices = deleted_indices or set()
    active_words = [(i, w) for i, w in enumerate(words) if i not in deleted_indices]

    s = style or {}
    font = s.get("fontName", "Arial")
    size = s.get("fontSize", 48)
    color = s.get("fontColor", "&H00FFFFFF")
    bold = "-1" if s.get("bold", True) else "0"
    alignment = 2

    header = f"""[Script Info]
Title: AI Video Editor Captions
ScriptType: v4.00+
PlayResX: 1920
PlayResY: 1080

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,{font},{size},{color},&H000000FF,&H00000000,&H80000000,{bold},0,0,0,100,100,0,0,1,2,1,{alignment},20,20,40,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""

    events = []
    for chunk_start in range(0, len(active_words), words_per_line):
        chunk = active_words[chunk_start:chunk_start + words_per_line]
        if not chunk:
            continue

        start_time = chunk[0][1]["start"]
        end_time = chunk[-1][1]["end"]
        text = " ".join(w["word"] for _, w in chunk)

        events.append(
            f"Dialogue: 0,{_format_ass_time(start_time)},{_format_ass_time(end_time)},Default,,0,0,0,,{text}"
        )

    return header + "\n".join(events) + "\n"


def save_captions(
    content: str,
    output_path: str,
) -> str:
    """Write caption content to a file."""
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(content, encoding="utf-8")
    logger.info(f"Saved captions to {output_path}")
    return str(output_path)
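Note (illustrative, not part of the commit): a quick sketch of the SRT output shape produced by generate_srt, using two made-up timed words.

# Hypothetical usage of generate_srt as defined above.
from services.caption_generator import generate_srt

words = [
    {"word": "Hello", "start": 0.0, "end": 0.4},
    {"word": "world", "start": 0.4, "end": 0.8},
]
print(generate_srt(words, deleted_indices=set(), words_per_line=8))
# 1
# 00:00:00,000 --> 00:00:00,800
# Hello world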
98
backend/services/diarization.py
Normal file
98
backend/services/diarization.py
Normal file
@ -0,0 +1,98 @@
"""
Speaker diarization service using pyannote.audio.
Refactored from the original repo -- removed Streamlit dependency.
"""

import logging
import os
from pathlib import Path
from typing import Optional

import torch

from utils.gpu_utils import get_optimal_device

logger = logging.getLogger(__name__)

_pipeline_cache = {}


def _get_pipeline(hf_token: str, device: torch.device):
    cache_key = str(device)
    if cache_key in _pipeline_cache:
        return _pipeline_cache[cache_key]

    try:
        from pyannote.audio import Pipeline

        pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.0",
            use_auth_token=hf_token,
        )
        if device.type == "cuda":
            pipeline = pipeline.to(device)

        _pipeline_cache[cache_key] = pipeline
        return pipeline
    except Exception as e:
        logger.error(f"Failed to load diarization pipeline: {e}")
        return None


def diarize_and_label(
    transcription_result: dict,
    audio_path: str,
    hf_token: Optional[str] = None,
    num_speakers: Optional[int] = None,
    use_gpu: bool = True,
) -> dict:
    """
    Apply speaker diarization to an existing transcription result.
    Adds 'speaker' field to each word and segment.

    Returns the mutated transcription_result with speaker labels.
    """
    hf_token = hf_token or os.environ.get("HF_TOKEN")
    if not hf_token:
        logger.warning("No HuggingFace token provided; skipping diarization")
        return transcription_result

    device = get_optimal_device() if use_gpu else torch.device("cpu")
    pipeline = _get_pipeline(hf_token, device)
    if pipeline is None:
        return transcription_result

    audio_path = Path(audio_path)
    logger.info(f"Running diarization on {audio_path}")

    try:
        diarization = pipeline(str(audio_path), num_speakers=num_speakers)
    except Exception as e:
        logger.error(f"Diarization failed: {e}")
        return transcription_result

    speaker_map = []
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        speaker_map.append((turn.start, turn.end, speaker))

    def _find_speaker(start: float, end: float) -> str:
        best_overlap = 0
        best_speaker = "UNKNOWN"
        for s_start, s_end, speaker in speaker_map:
            overlap_start = max(start, s_start)
            overlap_end = min(end, s_end)
            overlap = max(0, overlap_end - overlap_start)
            if overlap > best_overlap:
                best_overlap = overlap
                best_speaker = speaker
        return best_speaker

    for word in transcription_result.get("words", []):
        word["speaker"] = _find_speaker(word["start"], word["end"])

    for segment in transcription_result.get("segments", []):
        segment["speaker"] = _find_speaker(segment["start"], segment["end"])
        for w in segment.get("words", []):
            w["speaker"] = _find_speaker(w["start"], w["end"])

    return transcription_result
205
backend/services/transcription.py
Normal file
205
backend/services/transcription.py
Normal file
@ -0,0 +1,205 @@
"""
WhisperX-based transcription service with word-level alignment.
Falls back to standard Whisper if WhisperX is not available.
"""

import logging
from pathlib import Path
from typing import Optional

import torch

from utils.gpu_utils import get_optimal_device, configure_gpu
from utils.audio_processing import extract_audio
from utils.cache import load_from_cache, save_to_cache

logger = logging.getLogger(__name__)

_model_cache: dict = {}

try:
    import whisperx
    WHISPERX_AVAILABLE = True
except ImportError:
    WHISPERX_AVAILABLE = False
    import whisper

try:
    HF_TOKEN = None
    import os
    HF_TOKEN = os.environ.get("HF_TOKEN")
except Exception:
    pass


def _get_device(use_gpu: bool = True) -> torch.device:
    if use_gpu:
        return get_optimal_device()
    return torch.device("cpu")


def _load_model(model_name: str, device: torch.device):
    cache_key = f"{model_name}_{device}"
    if cache_key in _model_cache:
        return _model_cache[cache_key]

    logger.info(f"Loading model: {model_name} on {device}")
    if WHISPERX_AVAILABLE:
        compute_type = "float16" if device.type == "cuda" else "int8"
        model = whisperx.load_model(
            model_name,
            device=str(device),
            compute_type=compute_type,
        )
    else:
        model = whisper.load_model(model_name, device=device)

    _model_cache[cache_key] = model
    return model


def transcribe_audio(
    file_path: str,
    model_name: str = "base",
    use_gpu: bool = True,
    use_cache: bool = True,
    language: Optional[str] = None,
) -> dict:
    """
    Transcribe audio/video file and return word-level timestamps.

    Returns:
        dict with keys: words, segments, language
    """
    file_path = Path(file_path)

    if use_cache:
        cached = load_from_cache(file_path, model_name, "transcribe_wx")
        if cached:
            logger.info("Using cached transcription")
            return cached

    video_extensions = {".mp4", ".avi", ".mov", ".mkv", ".webm"}
    if file_path.suffix.lower() in video_extensions:
        audio_path = extract_audio(file_path)
    else:
        audio_path = file_path

    device = _get_device(use_gpu)
    model = _load_model(model_name, device)

    logger.info(f"Transcribing: {file_path}")

    if WHISPERX_AVAILABLE:
        result = _transcribe_whisperx(model, str(audio_path), device, language)
    else:
        result = _transcribe_standard(model, str(audio_path), language)

    if use_cache:
        save_to_cache(file_path, result, model_name, "transcribe_wx")

    return result


def _transcribe_whisperx(model, audio_path: str, device: torch.device, language: Optional[str]) -> dict:
    audio = whisperx.load_audio(audio_path)
    transcribe_opts = {}
    if language:
        transcribe_opts["language"] = language

    result = model.transcribe(audio, batch_size=16, **transcribe_opts)
    detected_language = result.get("language", "en")

    align_model, align_metadata = whisperx.load_align_model(
        language_code=detected_language,
        device=str(device),
    )
    aligned = whisperx.align(
        result["segments"],
        align_model,
        align_metadata,
        audio,
        str(device),
        return_char_alignments=False,
    )

    words = []
    for seg in aligned.get("segments", []):
        for w in seg.get("words", []):
            words.append({
                "word": w.get("word", ""),
                "start": round(w.get("start", 0), 3),
                "end": round(w.get("end", 0), 3),
                "confidence": round(w.get("score", 0), 3),
            })

    segments = []
    for i, seg in enumerate(aligned.get("segments", [])):
        seg_words = []
        for w in seg.get("words", []):
            seg_words.append({
                "word": w.get("word", ""),
                "start": round(w.get("start", 0), 3),
                "end": round(w.get("end", 0), 3),
                "confidence": round(w.get("score", 0), 3),
            })
        segments.append({
            "id": i,
            "start": round(seg.get("start", 0), 3),
            "end": round(seg.get("end", 0), 3),
            "text": seg.get("text", "").strip(),
            "words": seg_words,
        })

    return {
        "words": words,
        "segments": segments,
        "language": detected_language,
    }


def _transcribe_standard(model, audio_path: str, language: Optional[str]) -> dict:
    """Fallback: standard Whisper (segment-level only, synthesized word timestamps)."""
    opts = {}
    if language:
        opts["language"] = language

    result = model.transcribe(audio_path, **opts)
    detected_language = result.get("language", "en")

    words = []
    segments = []

    for i, seg in enumerate(result.get("segments", [])):
        text = seg.get("text", "").strip()
        seg_start = seg.get("start", 0)
        seg_end = seg.get("end", 0)
        seg_words_text = text.split()
        duration = seg_end - seg_start

        seg_words = []
        for j, w_text in enumerate(seg_words_text):
            w_start = seg_start + (j / max(len(seg_words_text), 1)) * duration
            w_end = seg_start + ((j + 1) / max(len(seg_words_text), 1)) * duration
            word_obj = {
                "word": w_text,
                "start": round(w_start, 3),
                "end": round(w_end, 3),
                "confidence": 0.5,
            }
            words.append(word_obj)
            seg_words.append(word_obj)

        segments.append({
            "id": i,
            "start": round(seg_start, 3),
            "end": round(seg_end, 3),
            "text": text,
            "words": seg_words,
        })

    return {
        "words": words,
        "segments": segments,
        "language": detected_language,
    }
271
backend/services/video_editor.py
Normal file
271
backend/services/video_editor.py
Normal file
@ -0,0 +1,271 @@
|
||||
"""
|
||||
FFmpeg-based video cutting engine.
|
||||
Uses stream copy for fast, lossless cuts and falls back to re-encode when needed.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import subprocess
|
||||
import tempfile
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _find_ffmpeg() -> str:
|
||||
"""Locate ffmpeg binary."""
|
||||
for cmd in ["ffmpeg", "ffmpeg.exe"]:
|
||||
try:
|
||||
subprocess.run([cmd, "-version"], capture_output=True, check=True)
|
||||
return cmd
|
||||
except (FileNotFoundError, subprocess.CalledProcessError):
|
||||
continue
|
||||
raise RuntimeError("FFmpeg not found. Install it or add it to PATH.")
|
||||
|
||||
|
||||
def export_stream_copy(
|
||||
input_path: str,
|
||||
output_path: str,
|
||||
keep_segments: List[dict],
|
||||
) -> str:
|
||||
"""
|
||||
Export video using FFmpeg concat demuxer with stream copy.
|
||||
~100x faster than re-encoding. No quality loss.
|
||||
|
||||
Args:
|
||||
input_path: source video file
|
||||
output_path: destination file
|
||||
keep_segments: list of {"start": float, "end": float} to keep
|
||||
|
||||
Returns:
|
||||
output_path on success
|
||||
"""
|
||||
ffmpeg = _find_ffmpeg()
|
||||
input_path = str(Path(input_path).resolve())
|
||||
output_path = str(Path(output_path).resolve())
|
||||
|
||||
if not keep_segments:
|
||||
raise ValueError("No segments to export")
|
||||
|
||||
temp_dir = tempfile.mkdtemp(prefix="aive_export_")
|
||||
|
||||
try:
|
||||
segment_files = []
|
||||
for i, seg in enumerate(keep_segments):
|
||||
seg_file = os.path.join(temp_dir, f"seg_{i:04d}.ts")
|
||||
cmd = [
|
||||
ffmpeg, "-y",
|
||||
"-ss", str(seg["start"]),
|
||||
"-to", str(seg["end"]),
|
||||
"-i", input_path,
|
||||
"-c", "copy",
|
||||
"-avoid_negative_ts", "make_zero",
|
||||
"-f", "mpegts",
|
||||
seg_file,
|
||||
]
|
||||
logger.info(f"Extracting segment {i}: {seg['start']:.2f}s - {seg['end']:.2f}s")
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
if result.returncode != 0:
|
||||
logger.warning(f"Stream copy segment {i} failed, will try re-encode: {result.stderr[-200:]}")
|
||||
return export_reencode(input_path, output_path, keep_segments)
|
||||
segment_files.append(seg_file)
|
||||
|
||||
concat_str = "|".join(segment_files)
|
||||
cmd = [
|
||||
ffmpeg, "-y",
|
||||
"-i", f"concat:{concat_str}",
|
||||
"-c", "copy",
|
||||
"-movflags", "+faststart",
|
||||
output_path,
|
||||
]
|
||||
logger.info(f"Concatenating {len(segment_files)} segments -> {output_path}")
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
if result.returncode != 0:
|
||||
logger.warning(f"Concat failed, falling back to re-encode: {result.stderr[-200:]}")
|
||||
return export_reencode(input_path, output_path, keep_segments)
|
||||
|
||||
return output_path
|
||||
|
||||
finally:
|
||||
for f in os.listdir(temp_dir):
|
||||
try:
|
||||
os.remove(os.path.join(temp_dir, f))
|
||||
except OSError:
|
||||
pass
|
||||
try:
|
||||
os.rmdir(temp_dir)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
|
||||
def export_reencode(
|
||||
input_path: str,
|
||||
output_path: str,
|
||||
keep_segments: List[dict],
|
||||
resolution: str = "1080p",
|
||||
format_hint: str = "mp4",
|
||||
) -> str:
|
||||
"""
|
||||
Export video with full re-encode. Slower but supports resolution changes,
|
||||
format conversion, and avoids stream-copy edge cases.
|
||||
"""
|
||||
ffmpeg = _find_ffmpeg()
|
||||
input_path = str(Path(input_path).resolve())
|
||||
output_path = str(Path(output_path).resolve())
|
||||
|
||||
if not keep_segments:
|
||||
raise ValueError("No segments to export")
|
||||
|
||||
scale_map = {
|
||||
"720p": "scale=-2:720",
|
||||
"1080p": "scale=-2:1080",
|
||||
"4k": "scale=-2:2160",
|
||||
}
|
||||
|
||||
filter_parts = []
|
||||
for i, seg in enumerate(keep_segments):
|
||||
filter_parts.append(
|
||||
f"[0:v]trim=start={seg['start']}:end={seg['end']},setpts=PTS-STARTPTS[v{i}];"
|
||||
f"[0:a]atrim=start={seg['start']}:end={seg['end']},asetpts=PTS-STARTPTS[a{i}];"
|
||||
)
|
||||
|
||||
n = len(keep_segments)
|
||||
concat_inputs = "".join(f"[v{i}][a{i}]" for i in range(n))
|
||||
filter_parts.append(f"{concat_inputs}concat=n={n}:v=1:a=1[outv][outa]")
|
||||
|
||||
filter_complex = "".join(filter_parts)
|
||||
|
||||
scale = scale_map.get(resolution, "")
|
||||
if scale:
|
||||
filter_complex += f";[outv]{scale}[outv_scaled]"
|
||||
video_map = "[outv_scaled]"
|
||||
else:
|
||||
video_map = "[outv]"
|
||||
|
||||
codec_args = ["-c:v", "libx264", "-preset", "medium", "-crf", "18", "-c:a", "aac", "-b:a", "192k"]
|
||||
if format_hint == "webm":
|
||||
codec_args = ["-c:v", "libvpx-vp9", "-crf", "30", "-b:v", "0", "-c:a", "libopus"]
|
||||
|
||||
cmd = [
|
||||
ffmpeg, "-y",
|
||||
"-i", input_path,
|
||||
"-filter_complex", filter_complex,
|
||||
"-map", video_map,
|
||||
"-map", "[outa]",
|
||||
*codec_args,
|
||||
"-movflags", "+faststart",
|
||||
output_path,
|
||||
]
|
||||
|
||||
logger.info(f"Re-encoding {n} segments -> {output_path} ({resolution})")
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
if result.returncode != 0:
|
||||
raise RuntimeError(f"FFmpeg re-encode failed: {result.stderr[-500:]}")
|
||||
|
||||
return output_path
|
||||
|
||||
|
||||
def export_reencode_with_subs(
|
||||
input_path: str,
|
||||
output_path: str,
|
||||
keep_segments: List[dict],
|
||||
subtitle_path: str,
|
||||
resolution: str = "1080p",
|
||||
format_hint: str = "mp4",
|
||||
) -> str:
|
||||
"""
|
||||
Export video with re-encode and burn-in subtitles (ASS format).
|
||||
Applies trim+concat first, then overlays the subtitle file.
|
||||
"""
|
||||
ffmpeg = _find_ffmpeg()
|
||||
input_path = str(Path(input_path).resolve())
|
||||
output_path = str(Path(output_path).resolve())
|
||||
subtitle_path = str(Path(subtitle_path).resolve())
|
||||
|
||||
if not keep_segments:
|
||||
raise ValueError("No segments to export")
|
||||
|
||||
scale_map = {
|
||||
"720p": "scale=-2:720",
|
||||
"1080p": "scale=-2:1080",
|
||||
"4k": "scale=-2:2160",
|
||||
}
|
||||
|
||||
filter_parts = []
|
||||
for i, seg in enumerate(keep_segments):
|
||||
filter_parts.append(
|
||||
f"[0:v]trim=start={seg['start']}:end={seg['end']},setpts=PTS-STARTPTS[v{i}];"
|
||||
f"[0:a]atrim=start={seg['start']}:end={seg['end']},asetpts=PTS-STARTPTS[a{i}];"
|
||||
)
|
||||
|
||||
n = len(keep_segments)
|
||||
concat_inputs = "".join(f"[v{i}][a{i}]" for i in range(n))
|
||||
filter_parts.append(f"{concat_inputs}concat=n={n}:v=1:a=1[outv][outa]")
|
||||
|
||||
filter_complex = "".join(filter_parts)
|
||||
|
||||
# Escape path for FFmpeg subtitle filter (Windows backslashes need escaping)
|
||||
escaped_sub = subtitle_path.replace("\\", "/").replace(":", "\\:")
|
||||
|
||||
scale = scale_map.get(resolution, "")
|
||||
if scale:
|
||||
filter_complex += f";[outv]{scale},ass='{escaped_sub}'[outv_final]"
|
||||
else:
|
||||
filter_complex += f";[outv]ass='{escaped_sub}'[outv_final]"
|
||||
video_map = "[outv_final]"
|
||||
|
||||
codec_args = ["-c:v", "libx264", "-preset", "medium", "-crf", "18", "-c:a", "aac", "-b:a", "192k"]
|
||||
if format_hint == "webm":
|
||||
codec_args = ["-c:v", "libvpx-vp9", "-crf", "30", "-b:v", "0", "-c:a", "libopus"]
|
||||
|
||||
cmd = [
|
||||
ffmpeg, "-y",
|
||||
"-i", input_path,
|
||||
"-filter_complex", filter_complex,
|
||||
"-map", video_map,
|
||||
"-map", "[outa]",
|
||||
*codec_args,
|
||||
"-movflags", "+faststart",
|
||||
output_path,
|
||||
]
|
||||
|
||||
logger.info(f"Re-encoding {n} segments with subtitles -> {output_path} ({resolution})")
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
if result.returncode != 0:
|
||||
raise RuntimeError(f"FFmpeg re-encode with subs failed: {result.stderr[-500:]}")
|
||||
|
||||
return output_path
|
||||
|
||||
|
||||
def get_video_info(input_path: str) -> dict:
|
||||
"""Get basic video metadata using ffprobe."""
|
||||
ffmpeg = _find_ffmpeg()
|
||||
ffprobe = ffmpeg.replace("ffmpeg", "ffprobe")
|
||||
|
||||
cmd = [
|
||||
ffprobe, "-v", "quiet",
|
||||
"-print_format", "json",
|
||||
"-show_format", "-show_streams",
|
||||
str(input_path),
|
||||
]
|
||||
|
||||
try:
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
|
||||
import json
|
||||
data = json.loads(result.stdout)
|
||||
fmt = data.get("format", {})
|
||||
video_stream = next((s for s in data.get("streams", []) if s.get("codec_type") == "video"), {})
|
||||
|
||||
return {
|
||||
"duration": float(fmt.get("duration", 0)),
|
||||
"size": int(fmt.get("size", 0)),
|
||||
"format": fmt.get("format_name", ""),
|
||||
"width": int(video_stream.get("width", 0)),
|
||||
"height": int(video_stream.get("height", 0)),
|
||||
"codec": video_stream.get("codec_name", ""),
|
||||
"fps": eval(video_stream.get("r_frame_rate", "0/1")) if "/" in video_stream.get("r_frame_rate", "") else 0,
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get video info: {e}")
|
||||
return {}
|
||||
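A minimal usage sketch of the two helpers above (the import path backend.utils.ffmpeg_export and the sample file names are illustrative assumptions, not taken from this diff):

# Sketch only: assumes these functions live in backend/utils/ffmpeg_export.py
# and that FFmpeg/ffprobe can be located by _find_ffmpeg().
from backend.utils.ffmpeg_export import export_reencode_with_subs, get_video_info

info = get_video_info("talk.mp4")  # hypothetical input file
print(info.get("duration"), info.get("width"), info.get("height"))

# Keep two spans of the source video and burn in an ASS subtitle track.
export_reencode_with_subs(
    input_path="talk.mp4",
    output_path="talk_edited.mp4",
    keep_segments=[{"start": 0.0, "end": 12.5}, {"start": 30.0, "end": 58.2}],
    subtitle_path="talk.ass",
    resolution="1080p",
)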
backend/utils/__init__.py (new file, 0 lines)
backend/utils/audio_processing.py (new file, 59 lines)
@@ -0,0 +1,59 @@
from pathlib import Path
import tempfile
import os
import logging

try:
    from moviepy import AudioFileClip
except ImportError:
    from moviepy.editor import AudioFileClip

logger = logging.getLogger(__name__)

_temp_audio_files = []


def extract_audio(video_path: Path):
    """Extract audio from a video file into a temp directory for automatic cleanup."""
    try:
        audio = AudioFileClip(str(video_path))
        temp_dir = tempfile.mkdtemp(prefix="videotranscriber_")
        audio_path = Path(temp_dir) / f"{video_path.stem}_audio.wav"
        try:
            audio.write_audiofile(str(audio_path), logger=None)
        except TypeError:
            # moviepy 1.x uses verbose parameter; moviepy 2.x removed it
            audio.write_audiofile(str(audio_path), verbose=False, logger=None)
        audio.close()
        _temp_audio_files.append(str(audio_path))
        return audio_path
    except Exception as e:
        raise RuntimeError(f"Audio extraction failed: {e}")


def cleanup_temp_audio():
    """Remove all temporary audio files created during processing."""
    cleaned = 0
    for fpath in _temp_audio_files:
        try:
            if os.path.exists(fpath):
                os.remove(fpath)
                parent = os.path.dirname(fpath)
                if os.path.isdir(parent) and not os.listdir(parent):
                    os.rmdir(parent)
                cleaned += 1
        except Exception as e:
            logger.warning(f"Could not remove temp file {fpath}: {e}")
    _temp_audio_files.clear()
    return cleaned


def get_video_duration(video_path: Path):
    """Get duration of a video/audio file in seconds."""
    try:
        clip = AudioFileClip(str(video_path))
        duration = clip.duration
        clip.close()
        return duration
    except Exception:
        return None
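A brief sketch of how these helpers would chain together in a transcription pass (the calling code and the transcribe() placeholder are assumptions, not part of this diff):

# Sketch only: extract audio, hand the temp WAV to a transcriber, then clean up.
from pathlib import Path
from backend.utils.audio_processing import (  # assumed import path
    extract_audio, cleanup_temp_audio, get_video_duration,
)

video = Path("recording.mp4")          # hypothetical input
duration = get_video_duration(video)   # None if the file cannot be opened
wav_path = extract_audio(video)        # temp .wav registered for later cleanup
try:
    transcribe(wav_path)               # placeholder for the actual Whisper call
finally:
    cleanup_temp_audio()               # removes the temp .wav and its directory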
@@ -1,12 +1,9 @@
"""
GPU utilities for the OBS Recording Transcriber.
GPU utilities for the Video Transcriber.
Provides functions to detect and configure GPU acceleration.
"""

import logging
import os
import platform
import subprocess
import torch

# Configure logging
@@ -68,8 +65,6 @@ def get_optimal_device():


def set_memory_limits(memory_fraction=0.8):
    global torch
    import torch
    """
    Set memory limits for GPU usage.

@@ -1,70 +0,0 @@
version: '3.8'

services:
  videotranscriber:
    # Use prebuilt image from GitHub Container Registry
    image: ghcr.io/dataants-ai/videotranscriber:latest
    container_name: videotranscriber
    ports:
      - "8501:8501"
    volumes:
      # Mount your video files directory (change the left path to your actual videos folder)
      - "${VIDEO_PATH:-./videos}:/app/data/videos"
      # Mount output directory for transcripts and summaries
      - "${OUTPUT_PATH:-./outputs}:/app/data/outputs"
      # Mount cache directory for model caching (optional, improves performance)
      - "${CACHE_PATH:-./cache}:/app/data/cache"
      # Mount a config directory if needed
      - "${CONFIG_PATH:-./config}:/app/config"
    environment:
      # Ollama configuration for host access
      - OLLAMA_API_URL=${OLLAMA_API_URL:-http://host.docker.internal:11434/api}
      # Optional: HuggingFace token for advanced features
      - HF_TOKEN=${HF_TOKEN:-}
      # GPU configuration
      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}
      # Cache settings
      - TRANSFORMERS_CACHE=/app/data/cache/transformers
      - WHISPER_CACHE=/app/data/cache/whisper
    restart: unless-stopped
    # Use bridge networking for Windows/Mac with host.docker.internal
    networks:
      - videotranscriber-network
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8501/_stcore/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  # Alternative GPU-enabled service (uncomment to use)
  # videotranscriber-gpu:
  #   image: ghcr.io/dataants-ai/videotranscriber:latest-gpu
  #   container_name: videotranscriber-gpu
  #   ports:
  #     - "8501:8501"
  #   volumes:
  #     - "${VIDEO_PATH:-./videos}:/app/data/videos"
  #     - "${OUTPUT_PATH:-./outputs}:/app/data/outputs"
  #     - "${CACHE_PATH:-./cache}:/app/data/cache"
  #     - "${CONFIG_PATH:-./config}:/app/config"
  #   environment:
  #     - OLLAMA_API_URL=${OLLAMA_API_URL:-http://host.docker.internal:11434/api}
  #     - HF_TOKEN=${HF_TOKEN:-}
  #     - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
  #     - TRANSFORMERS_CACHE=/app/data/cache/transformers
  #     - WHISPER_CACHE=/app/data/cache/whisper
  #   deploy:
  #     resources:
  #       reservations:
  #         devices:
  #           - driver: nvidia
  #             count: 1
  #             capabilities: [gpu]
  #   restart: unless-stopped
  #   networks:
  #     - videotranscriber-network

networks:
  videotranscriber-network:
    driver: bridge
@@ -1,51 +0,0 @@
version: '3.8'

services:
  videotranscriber:
    build: .
    container_name: videotranscriber
    ports:
      - "8501:8501"
    volumes:
      # Mount your video files directory (change the left path to your actual videos folder)
      - "${VIDEO_PATH:-./videos}:/app/data/videos"
      # Mount output directory for transcripts and summaries
      - "${OUTPUT_PATH:-./outputs}:/app/data/outputs"
      # Mount cache directory for model caching (optional, improves performance)
      - "${CACHE_PATH:-./cache}:/app/data/cache"
      # Mount a config directory if needed
      - "${CONFIG_PATH:-./config}:/app/config"
    environment:
      # Ollama configuration for host access
      - OLLAMA_API_URL=${OLLAMA_API_URL:-http://host.docker.internal:11434/api}
      # Optional: HuggingFace token for advanced features
      - HF_TOKEN=${HF_TOKEN:-}
      # GPU configuration
      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}
      # Cache settings
      - TRANSFORMERS_CACHE=/app/data/cache/transformers
      - WHISPER_CACHE=/app/data/cache/whisper
    # For GPU access (uncomment if you have NVIDIA GPU and nvidia-docker)
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           count: 1
    #           capabilities: [gpu]
    restart: unless-stopped
    # For Linux hosts, you might prefer host networking for better Ollama access
    # network_mode: host # Uncomment for Linux hosts
    # Use bridge networking for Windows/Mac with host.docker.internal
    networks:
      - videotranscriber-network
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8501/_stcore/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

networks:
  videotranscriber-network:
    driver: bridge
@@ -1,63 +0,0 @@
# VideoTranscriber Docker Configuration
# Copy this file to .env and modify the values as needed

# =============================================================================
# DOCKER VOLUME PATHS (Host Directories)
# =============================================================================

# Path to your video files directory on the host
# This directory will be mounted into the container at /app/data/videos
VIDEO_PATH=./videos

# Path where outputs (transcripts, summaries) will be saved on the host
# This directory will be mounted into the container at /app/data/outputs
OUTPUT_PATH=./outputs

# Path for caching ML models and processed files (improves performance)
# This directory will be mounted into the container at /app/data/cache
CACHE_PATH=./cache

# Optional: Configuration directory for custom settings
CONFIG_PATH=./config

# =============================================================================
# OLLAMA CONFIGURATION
# =============================================================================

# Ollama API URL - how the container accesses your host Ollama service
# For Windows/Mac with Docker Desktop: use host.docker.internal
# For Linux: use host networking or the actual host IP
OLLAMA_API_URL=http://host.docker.internal:11434/api

# =============================================================================
# ML MODEL CONFIGURATION
# =============================================================================

# HuggingFace token for advanced features (speaker diarization, etc.)
# Get your token at: https://huggingface.co/settings/tokens
# Leave empty if not using advanced features
HF_TOKEN=

# GPU Configuration
# Specify which GPU devices to use (leave empty for all available)
# Examples: "0" for first GPU, "0,1" for first two GPUs
CUDA_VISIBLE_DEVICES=

# =============================================================================
# DOCKER-SPECIFIC SETTINGS
# =============================================================================

# Container name (change if you want to run multiple instances)
CONTAINER_NAME=videotranscriber

# Port mapping (host:container)
HOST_PORT=8501

# =============================================================================
# EXAMPLE USAGE
# =============================================================================
# 1. Copy this file: cp docker.env.example .env
# 2. Edit the paths to match your system
# 3. Make sure Ollama is running on your host: ollama serve
# 4. Start the container: docker-compose up -d
# 5. Access the app at: http://localhost:8501
electron/main.js (new file, 131 lines)
@@ -0,0 +1,131 @@
const { app, BrowserWindow, ipcMain, dialog, safeStorage } = require('electron');
const path = require('path');
const { PythonBackend } = require('./python-bridge');

let mainWindow = null;
let pythonBackend = null;

const isDev = !app.isPackaged;
const BACKEND_PORT = 8642;

function createWindow() {
  mainWindow = new BrowserWindow({
    width: 1400,
    height: 900,
    minWidth: 1024,
    minHeight: 700,
    title: 'CutScript',
    webPreferences: {
      preload: path.join(__dirname, 'preload.js'),
      contextIsolation: true,
      nodeIntegration: false,
      webSecurity: isDev ? false : true,
    },
    show: false,
  });

  if (isDev) {
    mainWindow.loadURL('http://localhost:5173');
    mainWindow.webContents.openDevTools();
  } else {
    mainWindow.loadFile(path.join(__dirname, '..', 'frontend', 'dist', 'index.html'));
  }

  mainWindow.once('ready-to-show', () => {
    mainWindow.show();
  });

  mainWindow.on('closed', () => {
    mainWindow = null;
  });
}

app.whenReady().then(async () => {
  pythonBackend = new PythonBackend(BACKEND_PORT, isDev);
  await pythonBackend.start();

  createWindow();

  app.on('activate', () => {
    if (BrowserWindow.getAllWindows().length === 0) {
      createWindow();
    }
  });
});

app.on('window-all-closed', () => {
  if (process.platform !== 'darwin') {
    app.quit();
  }
});

app.on('before-quit', () => {
  if (pythonBackend) {
    pythonBackend.stop();
  }
});

// IPC Handlers

ipcMain.handle('dialog:openFile', async (_event, options) => {
  const result = await dialog.showOpenDialog(mainWindow, {
    properties: ['openFile'],
    filters: [
      { name: 'Video Files', extensions: ['mp4', 'avi', 'mov', 'mkv', 'webm'] },
      { name: 'Audio Files', extensions: ['m4a', 'wav', 'mp3', 'flac'] },
      { name: 'All Files', extensions: ['*'] },
    ],
    ...options,
  });
  return result.canceled ? null : result.filePaths[0];
});

ipcMain.handle('dialog:saveFile', async (_event, options) => {
  const result = await dialog.showSaveDialog(mainWindow, {
    filters: [
      { name: 'Video Files', extensions: ['mp4', 'mov', 'webm'] },
      { name: 'Project Files', extensions: ['aive'] },
    ],
    ...options,
  });
  return result.canceled ? null : result.filePath;
});

ipcMain.handle('dialog:openProject', async () => {
  const result = await dialog.showOpenDialog(mainWindow, {
    properties: ['openFile'],
    filters: [
      { name: 'AI Video Editor Project', extensions: ['aive'] },
    ],
  });
  return result.canceled ? null : result.filePaths[0];
});

ipcMain.handle('safe-storage:encrypt', (_event, data) => {
  if (safeStorage.isEncryptionAvailable()) {
    return safeStorage.encryptString(data).toString('base64');
  }
  return data;
});

ipcMain.handle('safe-storage:decrypt', (_event, encrypted) => {
  if (safeStorage.isEncryptionAvailable()) {
    return safeStorage.decryptString(Buffer.from(encrypted, 'base64'));
  }
  return encrypted;
});

ipcMain.handle('get-backend-url', () => {
  return `http://localhost:${BACKEND_PORT}`;
});

ipcMain.handle('fs:readFile', async (_event, filePath) => {
  const fs = require('fs');
  return fs.readFileSync(filePath, 'utf-8');
});

ipcMain.handle('fs:writeFile', async (_event, filePath, content) => {
  const fs = require('fs');
  fs.writeFileSync(filePath, content, 'utf-8');
  return true;
});
electron/preload.js (new file, 12 lines)
@@ -0,0 +1,12 @@
const { contextBridge, ipcRenderer } = require('electron');

contextBridge.exposeInMainWorld('electronAPI', {
  openFile: (options) => ipcRenderer.invoke('dialog:openFile', options),
  saveFile: (options) => ipcRenderer.invoke('dialog:saveFile', options),
  openProject: () => ipcRenderer.invoke('dialog:openProject'),
  getBackendUrl: () => ipcRenderer.invoke('get-backend-url'),
  encryptString: (data) => ipcRenderer.invoke('safe-storage:encrypt', data),
  decryptString: (encrypted) => ipcRenderer.invoke('safe-storage:decrypt', encrypted),
  readFile: (path) => ipcRenderer.invoke('fs:readFile', path),
  writeFile: (path, content) => ipcRenderer.invoke('fs:writeFile', path, content),
});
electron/python-bridge.js (new file, 105 lines)
@@ -0,0 +1,105 @@
const { spawn } = require('child_process');
const path = require('path');
const http = require('http');

class PythonBackend {
  constructor(port, isDev) {
    this.port = port;
    this.isDev = isDev;
    this.process = null;
  }

  async start() {
    // In dev mode, check if a backend is already running (e.g. from `npm run dev:backend`)
    // If so, reuse it instead of spawning a duplicate.
    if (this.isDev) {
      const alreadyRunning = await this._isPortOpen(2000);
      if (alreadyRunning) {
        console.log(`[backend] Dev backend already running on port ${this.port} — reusing it.`);
        return;
      }
    }

    const backendDir = this.isDev
      ? path.join(__dirname, '..', 'backend')
      : path.join(process.resourcesPath, 'backend');

    const pythonCmd = process.platform === 'win32' ? 'python' : 'python3';

    this.process = spawn(pythonCmd, [
      '-m', 'uvicorn', 'main:app',
      '--host', '127.0.0.1',
      '--port', String(this.port),
    ], {
      cwd: backendDir,
      stdio: ['pipe', 'pipe', 'pipe'],
      env: { ...process.env, PYTHONUNBUFFERED: '1' },
    });

    this.process.stdout.on('data', (data) => {
      console.log(`[backend] ${data.toString().trim()}`);
    });

    this.process.stderr.on('data', (data) => {
      console.error(`[backend] ${data.toString().trim()}`);
    });

    this.process.on('error', (err) => {
      console.error('[backend] Failed to start Python backend:', err.message);
    });

    this.process.on('exit', (code) => {
      console.log(`[backend] Process exited with code ${code}`);
      this.process = null;
    });

    await this._waitForReady(30000);
    console.log(`[backend] Ready on port ${this.port}`);
  }

  _isPortOpen(timeoutMs) {
    return new Promise((resolve) => {
      const req = http.get(`http://127.0.0.1:${this.port}/health`, (res) => {
        resolve(res.statusCode === 200);
      });
      req.on('error', () => resolve(false));
      req.setTimeout(timeoutMs, () => { req.destroy(); resolve(false); });
      req.end();
    });
  }

  stop() {
    if (this.process) {
      if (process.platform === 'win32') {
        spawn('taskkill', ['/pid', String(this.process.pid), '/f', '/t']);
      } else {
        this.process.kill('SIGTERM');
      }
      this.process = null;
    }
  }

  _waitForReady(timeoutMs) {
    const startTime = Date.now();
    return new Promise((resolve, reject) => {
      const check = () => {
        if (Date.now() - startTime > timeoutMs) {
          reject(new Error('Backend startup timed out'));
          return;
        }
        const req = http.get(`http://127.0.0.1:${this.port}/health`, (res) => {
          if (res.statusCode === 200) {
            resolve();
          } else {
            setTimeout(check, 500);
          }
        });
        req.on('error', () => setTimeout(check, 500));
        req.end();
      };
      setTimeout(check, 1000);
    });
  }
}

module.exports = { PythonBackend };
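The bridge above only needs the backend to answer GET /health with a 200 once uvicorn is up. A minimal sketch of such a route (the real backend/main.py is not shown in this diff, so the FastAPI details here are an assumption inferred from the `uvicorn main:app` command):

# Sketch only: a health route compatible with PythonBackend._waitForReady().
from fastapi import FastAPI

app = FastAPI()

@app.get("/health")
def health():
    # Any 200 response is enough for the Electron bridge to consider the backend ready.
    return {"status": "ok"}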
frontend/index.html (new file, 16 lines)
@@ -0,0 +1,16 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <meta http-equiv="Content-Security-Policy" content="default-src 'self'; script-src 'self' 'unsafe-inline'; style-src 'self' 'unsafe-inline' https://fonts.googleapis.com; font-src 'self' https://fonts.gstatic.com; connect-src 'self' http://localhost:* ws://localhost:*; media-src 'self' file: blob: http://localhost:*;" />
    <link rel="preconnect" href="https://fonts.googleapis.com" />
    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet" />
    <title>CutScript</title>
  </head>
  <body class="bg-editor-bg text-editor-text antialiased">
    <div id="root"></div>
    <script type="module" src="/src/main.tsx"></script>
  </body>
</html>
frontend/package-lock.json (generated, new file, 2817 lines)
File diff suppressed because it is too large
frontend/package.json (new file, 31 lines)
@@ -0,0 +1,31 @@
{
  "name": "cutscript-frontend",
  "private": true,
  "version": "0.1.0",
  "type": "module",
  "scripts": {
    "dev": "vite",
    "build": "tsc -b && vite build",
    "lint": "eslint .",
    "preview": "vite preview"
  },
  "dependencies": {
    "lucide-react": "^0.468.0",
    "react": "^19.0.0",
    "react-dom": "^19.0.0",
    "react-virtuoso": "^4.18.3",
    "wavesurfer.js": "^7.8.0",
    "zundo": "^2.3.0",
    "zustand": "^5.0.0"
  },
  "devDependencies": {
    "@types/react": "^19.0.0",
    "@types/react-dom": "^19.0.0",
    "@vitejs/plugin-react": "^4.3.0",
    "autoprefixer": "^10.4.20",
    "postcss": "^8.4.49",
    "tailwindcss": "^3.4.0",
    "typescript": "^5.7.0",
    "vite": "^6.0.0"
  }
}
frontend/postcss.config.js (new file, 6 lines)
@@ -0,0 +1,6 @@
export default {
  plugins: {
    tailwindcss: {},
    autoprefixer: {},
  },
};
frontend/src/App.tsx (new file, 310 lines)
@@ -0,0 +1,310 @@
import { useEffect, useState, useRef } from 'react';
|
||||
import { useEditorStore } from './store/editorStore';
|
||||
import VideoPlayer from './components/VideoPlayer';
|
||||
import TranscriptEditor from './components/TranscriptEditor';
|
||||
import WaveformTimeline from './components/WaveformTimeline';
|
||||
import AIPanel from './components/AIPanel';
|
||||
import ExportDialog from './components/ExportDialog';
|
||||
import SettingsPanel from './components/SettingsPanel';
|
||||
import { useKeyboardShortcuts } from './hooks/useKeyboardShortcuts';
|
||||
import {
|
||||
Film,
|
||||
FolderOpen,
|
||||
Settings,
|
||||
Sparkles,
|
||||
Download,
|
||||
Loader2,
|
||||
FolderSearch,
|
||||
FileInput,
|
||||
} from 'lucide-react';
|
||||
|
||||
const IS_ELECTRON = !!window.electronAPI;
|
||||
|
||||
type Panel = 'ai' | 'settings' | 'export' | null;
|
||||
|
||||
export default function App() {
|
||||
const {
|
||||
videoPath,
|
||||
words,
|
||||
isTranscribing,
|
||||
transcriptionProgress,
|
||||
loadVideo,
|
||||
setBackendUrl,
|
||||
setTranscription,
|
||||
setTranscribing,
|
||||
backendUrl,
|
||||
} = useEditorStore();
|
||||
|
||||
const [activePanel, setActivePanel] = useState<Panel>(null);
|
||||
const [manualPath, setManualPath] = useState('');
|
||||
const [whisperModel, setWhisperModel] = useState('base');
|
||||
const fileInputRef = useRef<HTMLInputElement>(null);
|
||||
|
||||
useKeyboardShortcuts();
|
||||
|
||||
useEffect(() => {
|
||||
if (IS_ELECTRON) {
|
||||
window.electronAPI!.getBackendUrl().then(setBackendUrl);
|
||||
}
|
||||
}, [setBackendUrl]);
|
||||
|
||||
const handleLoadProject = async () => {
|
||||
if (!IS_ELECTRON) return;
|
||||
try {
|
||||
const projectPath = await window.electronAPI!.openProject();
|
||||
if (!projectPath) return;
|
||||
const content = await window.electronAPI!.readFile(projectPath);
|
||||
const data = JSON.parse(content);
|
||||
useEditorStore.getState().loadProject(data);
|
||||
} catch (err) {
|
||||
console.error('Failed to load project:', err);
|
||||
alert(`Failed to load project: ${err}`);
|
||||
}
|
||||
};
|
||||
|
||||
const handleOpenFile = async () => {
|
||||
if (IS_ELECTRON) {
|
||||
const path = await window.electronAPI!.openFile();
|
||||
if (path) {
|
||||
loadVideo(path);
|
||||
await transcribeVideo(path);
|
||||
}
|
||||
} else {
|
||||
// Browser: use the manual path input
|
||||
const path = manualPath.trim();
|
||||
if (path) {
|
||||
loadVideo(path);
|
||||
await transcribeVideo(path);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
const handleManualSubmit = async (e: React.FormEvent) => {
|
||||
e.preventDefault();
|
||||
const path = manualPath.trim();
|
||||
if (!path) return;
|
||||
loadVideo(path);
|
||||
await transcribeVideo(path);
|
||||
};
|
||||
|
||||
const transcribeVideo = async (path: string) => {
|
||||
setTranscribing(true, 0);
|
||||
try {
|
||||
const res = await fetch(`${backendUrl}/transcribe`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ file_path: path, model: whisperModel }),
|
||||
});
|
||||
if (!res.ok) throw new Error(`Transcription failed: ${res.statusText}`);
|
||||
const data = await res.json();
|
||||
setTranscription(data);
|
||||
} catch (err) {
|
||||
console.error('Transcription error:', err);
|
||||
alert(`Transcription failed. Check the console for details.\n\n${err}`);
|
||||
} finally {
|
||||
setTranscribing(false);
|
||||
}
|
||||
};
|
||||
|
||||
const togglePanel = (panel: Panel) =>
|
||||
setActivePanel((prev) => (prev === panel ? null : panel));
|
||||
|
||||
if (!videoPath) {
|
||||
return (
|
||||
<div className="h-screen flex flex-col items-center justify-center gap-8 bg-editor-bg px-6">
|
||||
<div className="flex flex-col items-center gap-3">
|
||||
<Film className="w-14 h-14 text-editor-accent opacity-80" />
|
||||
<h1 className="text-3xl font-semibold tracking-tight">CutScript</h1>
|
||||
<p className="text-editor-text-muted text-sm max-w-sm text-center">
|
||||
Open-source text-based video editing powered by AI.
|
||||
</p>
|
||||
</div>
|
||||
|
||||
{/* Whisper model selector */}
|
||||
<div className="flex items-center gap-3">
|
||||
<label className="text-xs text-editor-text-muted whitespace-nowrap">Whisper model:</label>
|
||||
<select
|
||||
value={whisperModel}
|
||||
onChange={(e) => setWhisperModel(e.target.value)}
|
||||
className="px-3 py-1.5 bg-editor-surface border border-editor-border rounded-lg text-xs text-editor-text focus:outline-none focus:border-editor-accent"
|
||||
>
|
||||
<option value="tiny">tiny (~75 MB, fastest)</option>
|
||||
<option value="base">base (~140 MB, fast)</option>
|
||||
<option value="small">small (~460 MB, good)</option>
|
||||
<option value="medium">medium (~1.5 GB, better)</option>
|
||||
<option value="large">large (~2.9 GB, best)</option>
|
||||
</select>
|
||||
</div>
|
||||
|
||||
{IS_ELECTRON ? (
|
||||
<div className="flex flex-col items-center gap-3">
|
||||
<button
|
||||
onClick={handleOpenFile}
|
||||
className="flex items-center gap-2 px-6 py-3 bg-editor-accent hover:bg-editor-accent-hover rounded-lg text-white font-medium transition-colors"
|
||||
>
|
||||
<FolderOpen className="w-5 h-5" />
|
||||
Open Video File
|
||||
</button>
|
||||
<button
|
||||
onClick={handleLoadProject}
|
||||
className="flex items-center gap-2 px-4 py-2 text-sm text-editor-text-muted hover:text-editor-text hover:bg-editor-surface rounded-lg transition-colors"
|
||||
>
|
||||
<FileInput className="w-4 h-4" />
|
||||
Load Project (.aive)
|
||||
</button>
|
||||
</div>
|
||||
) : (
|
||||
/* Browser: manual path input */
|
||||
<div className="w-full max-w-lg space-y-3">
|
||||
<div className="flex items-center gap-2 px-3 py-1.5 bg-editor-warning/10 border border-editor-warning/30 rounded-lg">
|
||||
<span className="text-editor-warning text-xs">
|
||||
Running in browser — paste the full path to your video file below.
|
||||
</span>
|
||||
</div>
|
||||
<form onSubmit={handleManualSubmit} className="flex gap-2">
|
||||
<div className="flex-1 relative">
|
||||
<FolderSearch className="absolute left-3 top-1/2 -translate-y-1/2 w-4 h-4 text-editor-text-muted pointer-events-none" />
|
||||
<input
|
||||
ref={fileInputRef}
|
||||
type="text"
|
||||
value={manualPath}
|
||||
onChange={(e) => setManualPath(e.target.value)}
|
||||
placeholder="C:\Videos\my-video.mp4"
|
||||
className="w-full pl-9 pr-3 py-2.5 bg-editor-surface border border-editor-border rounded-lg text-sm text-editor-text placeholder:text-editor-text-muted/40 focus:outline-none focus:border-editor-accent"
|
||||
autoFocus
|
||||
/>
|
||||
</div>
|
||||
<button
|
||||
type="submit"
|
||||
disabled={!manualPath.trim()}
|
||||
className="flex items-center gap-2 px-5 py-2.5 bg-editor-accent hover:bg-editor-accent-hover disabled:opacity-40 rounded-lg text-sm text-white font-medium transition-colors whitespace-nowrap"
|
||||
>
|
||||
<Film className="w-4 h-4" />
|
||||
Load & Transcribe
|
||||
</button>
|
||||
</form>
|
||||
<p className="text-[11px] text-editor-text-muted text-center">
|
||||
Supported: MP4, AVI, MOV, MKV, WebM, M4A
|
||||
</p>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
return (
|
||||
<div className="h-screen flex flex-col bg-editor-bg overflow-hidden">
|
||||
{/* Top bar */}
|
||||
<header className="h-12 flex items-center justify-between px-4 border-b border-editor-border shrink-0">
|
||||
<div className="flex items-center gap-3">
|
||||
<Film className="w-5 h-5 text-editor-accent" />
|
||||
<span className="text-sm font-medium truncate max-w-[300px]">
|
||||
{videoPath.split(/[\\/]/).pop()}
|
||||
</span>
|
||||
</div>
|
||||
<div className="flex items-center gap-1">
|
||||
<ToolbarButton
|
||||
icon={<FolderOpen className="w-4 h-4" />}
|
||||
label="Open"
|
||||
onClick={IS_ELECTRON ? handleOpenFile : () => useEditorStore.getState().reset()}
|
||||
/>
|
||||
<ToolbarButton
|
||||
icon={<Sparkles className="w-4 h-4" />}
|
||||
label="AI"
|
||||
active={activePanel === 'ai'}
|
||||
onClick={() => togglePanel('ai')}
|
||||
disabled={words.length === 0}
|
||||
/>
|
||||
<ToolbarButton
|
||||
icon={<Download className="w-4 h-4" />}
|
||||
label="Export"
|
||||
active={activePanel === 'export'}
|
||||
onClick={() => togglePanel('export')}
|
||||
disabled={words.length === 0}
|
||||
/>
|
||||
<ToolbarButton
|
||||
icon={<Settings className="w-4 h-4" />}
|
||||
label="Settings"
|
||||
active={activePanel === 'settings'}
|
||||
onClick={() => togglePanel('settings')}
|
||||
/>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
{/* Main content */}
|
||||
<div className="flex-1 flex overflow-hidden">
|
||||
{/* Left: video + transcript */}
|
||||
<div className="flex-1 flex flex-col min-w-0">
|
||||
<div className="flex-1 flex min-h-0">
|
||||
{/* Video player */}
|
||||
<div className="w-1/2 p-3 flex items-center justify-center bg-black/20">
|
||||
<VideoPlayer />
|
||||
</div>
|
||||
|
||||
{/* Transcript */}
|
||||
<div className="w-1/2 border-l border-editor-border flex flex-col min-h-0">
|
||||
{isTranscribing ? (
|
||||
<div className="flex-1 flex flex-col items-center justify-center gap-4">
|
||||
<Loader2 className="w-8 h-8 text-editor-accent animate-spin" />
|
||||
<p className="text-sm text-editor-text-muted">
|
||||
Transcribing... {Math.round(transcriptionProgress)}%
|
||||
</p>
|
||||
</div>
|
||||
) : words.length > 0 ? (
|
||||
<TranscriptEditor />
|
||||
) : (
|
||||
<div className="flex-1 flex items-center justify-center text-editor-text-muted text-sm">
|
||||
No transcript yet
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Waveform timeline */}
|
||||
<div className="h-32 border-t border-editor-border shrink-0">
|
||||
<WaveformTimeline />
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Right panel (AI / Export / Settings) */}
|
||||
{activePanel && (
|
||||
<div className="w-80 border-l border-editor-border overflow-y-auto shrink-0">
|
||||
{activePanel === 'ai' && <AIPanel />}
|
||||
{activePanel === 'export' && <ExportDialog />}
|
||||
{activePanel === 'settings' && <SettingsPanel />}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function ToolbarButton({
|
||||
icon,
|
||||
label,
|
||||
active,
|
||||
onClick,
|
||||
disabled,
|
||||
}: {
|
||||
icon: React.ReactNode;
|
||||
label: string;
|
||||
active?: boolean;
|
||||
onClick: () => void;
|
||||
disabled?: boolean;
|
||||
}) {
|
||||
return (
|
||||
<button
|
||||
onClick={onClick}
|
||||
disabled={disabled}
|
||||
title={label}
|
||||
className={`flex items-center gap-1.5 px-3 py-1.5 rounded-md text-xs font-medium transition-colors ${
|
||||
active
|
||||
? 'bg-editor-accent text-white'
|
||||
: 'text-editor-text-muted hover:text-editor-text hover:bg-editor-surface'
|
||||
} ${disabled ? 'opacity-40 cursor-not-allowed' : ''}`}
|
||||
>
|
||||
{icon}
|
||||
{label}
|
||||
</button>
|
||||
);
|
||||
}
|
||||
frontend/src/components/AIPanel.tsx (new file, 332 lines)
@@ -0,0 +1,332 @@
import { useCallback, useState } from 'react';
|
||||
import { useEditorStore } from '../store/editorStore';
|
||||
import { useAIStore } from '../store/aiStore';
|
||||
import { Sparkles, Scissors, Film, Loader2, Check, X, Play, Download } from 'lucide-react';
|
||||
import type { ClipSuggestion } from '../types/project';
|
||||
|
||||
export default function AIPanel() {
|
||||
const { words, videoPath, backendUrl, deleteWordRange, setCurrentTime } = useEditorStore();
|
||||
const {
|
||||
defaultProvider,
|
||||
providers,
|
||||
customFillerWords,
|
||||
fillerResult,
|
||||
clipSuggestions,
|
||||
isProcessing,
|
||||
processingMessage,
|
||||
setCustomFillerWords,
|
||||
setFillerResult,
|
||||
setClipSuggestions,
|
||||
setProcessing,
|
||||
} = useAIStore();
|
||||
|
||||
const [activeTab, setActiveTab] = useState<'filler' | 'clips'>('filler');
|
||||
|
||||
const detectFillers = useCallback(async () => {
|
||||
if (words.length === 0) return;
|
||||
setProcessing(true, 'Detecting filler words...');
|
||||
try {
|
||||
const config = providers[defaultProvider];
|
||||
const transcript = words.map((w) => w.word).join(' ');
|
||||
const res = await fetch(`${backendUrl}/ai/filler-removal`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
transcript,
|
||||
words: words.map((w, i) => ({ index: i, word: w.word })),
|
||||
provider: defaultProvider,
|
||||
model: config.model,
|
||||
api_key: config.apiKey || undefined,
|
||||
base_url: config.baseUrl || undefined,
|
||||
custom_filler_words: customFillerWords || undefined,
|
||||
}),
|
||||
});
|
||||
if (!res.ok) throw new Error('Filler detection failed');
|
||||
const data = await res.json();
|
||||
setFillerResult(data);
|
||||
} catch (err) {
|
||||
console.error(err);
|
||||
} finally {
|
||||
setProcessing(false);
|
||||
}
|
||||
}, [words, backendUrl, defaultProvider, providers, customFillerWords, setProcessing, setFillerResult]);
|
||||
|
||||
const createClips = useCallback(async () => {
|
||||
if (words.length === 0) return;
|
||||
setProcessing(true, 'Finding best clip segments...');
|
||||
try {
|
||||
const config = providers[defaultProvider];
|
||||
const transcript = words.map((w) => w.word).join(' ');
|
||||
const res = await fetch(`${backendUrl}/ai/create-clip`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
transcript,
|
||||
words: words.map((w, i) => ({
|
||||
index: i,
|
||||
word: w.word,
|
||||
start: w.start,
|
||||
end: w.end,
|
||||
})),
|
||||
provider: defaultProvider,
|
||||
model: config.model,
|
||||
api_key: config.apiKey || undefined,
|
||||
base_url: config.baseUrl || undefined,
|
||||
target_duration: 60,
|
||||
}),
|
||||
});
|
||||
if (!res.ok) throw new Error('Clip creation failed');
|
||||
const data = await res.json();
|
||||
setClipSuggestions(data.clips || []);
|
||||
} catch (err) {
|
||||
console.error(err);
|
||||
} finally {
|
||||
setProcessing(false);
|
||||
}
|
||||
}, [words, backendUrl, defaultProvider, providers, setProcessing, setClipSuggestions]);
|
||||
|
||||
const applyFillerDeletions = useCallback(() => {
|
||||
if (!fillerResult) return;
|
||||
const sorted = [...fillerResult.fillerWords].sort((a, b) => b.index - a.index);
|
||||
for (const fw of sorted) {
|
||||
deleteWordRange(fw.index, fw.index);
|
||||
}
|
||||
setFillerResult(null);
|
||||
}, [fillerResult, deleteWordRange, setFillerResult]);
|
||||
|
||||
const handlePreviewClip = useCallback(
|
||||
(clip: ClipSuggestion) => {
|
||||
setCurrentTime(clip.startTime);
|
||||
const video = document.querySelector('video');
|
||||
if (video) {
|
||||
video.currentTime = clip.startTime;
|
||||
video.play();
|
||||
}
|
||||
},
|
||||
[setCurrentTime],
|
||||
);
|
||||
|
||||
const [exportingClipIndex, setExportingClipIndex] = useState<number | null>(null);
|
||||
|
||||
const handleExportClip = useCallback(
|
||||
async (clip: ClipSuggestion, index: number) => {
|
||||
if (!videoPath) return;
|
||||
setExportingClipIndex(index);
|
||||
try {
|
||||
const safeName = clip.title.replace(/[^a-zA-Z0-9_-]/g, '_').substring(0, 40);
|
||||
const dirSep = videoPath.lastIndexOf('\\') >= 0 ? '\\' : '/';
|
||||
const dir = videoPath.substring(0, videoPath.lastIndexOf(dirSep));
|
||||
const outputPath = `${dir}${dirSep}${safeName}_clip.mp4`;
|
||||
|
||||
const res = await fetch(`${backendUrl}/export`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
input_path: videoPath,
|
||||
output_path: outputPath,
|
||||
keep_segments: [{ start: clip.startTime, end: clip.endTime }],
|
||||
mode: 'fast',
|
||||
format: 'mp4',
|
||||
}),
|
||||
});
|
||||
if (!res.ok) throw new Error('Export failed');
|
||||
const data = await res.json();
|
||||
alert(`Clip exported to: ${data.output_path}`);
|
||||
} catch (err) {
|
||||
console.error(err);
|
||||
alert('Failed to export clip. Check console for details.');
|
||||
} finally {
|
||||
setExportingClipIndex(null);
|
||||
}
|
||||
},
|
||||
[videoPath, backendUrl],
|
||||
);
|
||||
|
||||
return (
|
||||
<div className="flex flex-col h-full">
|
||||
<div className="flex border-b border-editor-border shrink-0">
|
||||
<TabButton
|
||||
active={activeTab === 'filler'}
|
||||
onClick={() => setActiveTab('filler')}
|
||||
icon={<Scissors className="w-3.5 h-3.5" />}
|
||||
label="Filler Words"
|
||||
/>
|
||||
<TabButton
|
||||
active={activeTab === 'clips'}
|
||||
onClick={() => setActiveTab('clips')}
|
||||
icon={<Film className="w-3.5 h-3.5" />}
|
||||
label="Create Clips"
|
||||
/>
|
||||
</div>
|
||||
|
||||
<div className="flex-1 overflow-y-auto p-4">
|
||||
{activeTab === 'filler' && (
|
||||
<div className="space-y-4">
|
||||
<p className="text-xs text-editor-text-muted">
|
||||
Use AI to detect and remove filler words like "um", "uh", "like", "you know" from
|
||||
your transcript.
|
||||
</p>
|
||||
<div className="space-y-1.5">
|
||||
<label className="text-[11px] text-editor-text-muted font-medium">
|
||||
Custom filler words (comma-separated)
|
||||
</label>
|
||||
<input
|
||||
type="text"
|
||||
value={customFillerWords}
|
||||
onChange={(e) => setCustomFillerWords(e.target.value)}
|
||||
placeholder="e.g. okay, alright, anyway"
|
||||
className="w-full px-2.5 py-1.5 text-xs bg-editor-surface border border-editor-border rounded focus:border-editor-accent focus:outline-none"
|
||||
/>
|
||||
</div>
|
||||
<button
|
||||
onClick={detectFillers}
|
||||
disabled={isProcessing || words.length === 0}
|
||||
className="w-full flex items-center justify-center gap-2 px-4 py-2.5 bg-editor-accent hover:bg-editor-accent-hover disabled:opacity-50 rounded-lg text-sm font-medium transition-colors"
|
||||
>
|
||||
{isProcessing ? (
|
||||
<>
|
||||
<Loader2 className="w-4 h-4 animate-spin" />
|
||||
{processingMessage}
|
||||
</>
|
||||
) : (
|
||||
<>
|
||||
<Sparkles className="w-4 h-4" />
|
||||
Detect Filler Words
|
||||
</>
|
||||
)}
|
||||
</button>
|
||||
|
||||
{fillerResult && fillerResult.fillerWords.length > 0 && (
|
||||
<div className="space-y-3">
|
||||
<div className="flex items-center justify-between">
|
||||
<span className="text-xs font-medium">
|
||||
Found {fillerResult.fillerWords.length} filler words
|
||||
</span>
|
||||
<div className="flex gap-1">
|
||||
<button
|
||||
onClick={applyFillerDeletions}
|
||||
className="flex items-center gap-1 px-2 py-1 text-xs bg-editor-success/20 text-editor-success rounded hover:bg-editor-success/30"
|
||||
>
|
||||
<Check className="w-3 h-3" /> Apply All
|
||||
</button>
|
||||
<button
|
||||
onClick={() => setFillerResult(null)}
|
||||
className="flex items-center gap-1 px-2 py-1 text-xs bg-editor-border text-editor-text-muted rounded hover:bg-editor-surface"
|
||||
>
|
||||
<X className="w-3 h-3" /> Dismiss
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
<div className="space-y-1 max-h-64 overflow-y-auto">
|
||||
{fillerResult.fillerWords.map((fw) => (
|
||||
<div
|
||||
key={fw.index}
|
||||
className="flex items-center justify-between px-2 py-1.5 bg-editor-word-filler rounded text-xs"
|
||||
>
|
||||
<span>
|
||||
<strong>"{fw.word}"</strong>
|
||||
<span className="text-editor-text-muted ml-1">— {fw.reason}</span>
|
||||
</span>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{fillerResult && fillerResult.fillerWords.length === 0 && (
|
||||
<p className="text-xs text-editor-success">No filler words detected.</p>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{activeTab === 'clips' && (
|
||||
<div className="space-y-4">
|
||||
<p className="text-xs text-editor-text-muted">
|
||||
AI analyzes your transcript and suggests the most engaging segments for a
|
||||
YouTube Short or social media clip.
|
||||
</p>
|
||||
<button
|
||||
onClick={createClips}
|
||||
disabled={isProcessing || words.length === 0}
|
||||
className="w-full flex items-center justify-center gap-2 px-4 py-2.5 bg-editor-accent hover:bg-editor-accent-hover disabled:opacity-50 rounded-lg text-sm font-medium transition-colors"
|
||||
>
|
||||
{isProcessing ? (
|
||||
<>
|
||||
<Loader2 className="w-4 h-4 animate-spin" />
|
||||
{processingMessage}
|
||||
</>
|
||||
) : (
|
||||
<>
|
||||
<Film className="w-4 h-4" />
|
||||
Find Best Clips
|
||||
</>
|
||||
)}
|
||||
</button>
|
||||
|
||||
{clipSuggestions.length > 0 && (
|
||||
<div className="space-y-3">
|
||||
{clipSuggestions.map((clip, i) => (
|
||||
<div key={i} className="p-3 bg-editor-surface rounded-lg space-y-2">
|
||||
<div className="flex items-center justify-between">
|
||||
<span className="text-xs font-semibold">{clip.title}</span>
|
||||
<span className="text-[10px] text-editor-text-muted">
|
||||
{Math.round(clip.endTime - clip.startTime)}s
|
||||
</span>
|
||||
</div>
|
||||
<p className="text-[11px] text-editor-text-muted">{clip.reason}</p>
|
||||
<div className="flex gap-2">
|
||||
<button
|
||||
onClick={() => handlePreviewClip(clip)}
|
||||
className="flex-1 flex items-center justify-center gap-1 px-2 py-1.5 text-xs bg-editor-accent/20 text-editor-accent rounded hover:bg-editor-accent/30 transition-colors"
|
||||
>
|
||||
<Play className="w-3 h-3" /> Preview
|
||||
</button>
|
||||
<button
|
||||
onClick={() => handleExportClip(clip, i)}
|
||||
disabled={exportingClipIndex === i}
|
||||
className="flex-1 flex items-center justify-center gap-1 px-2 py-1.5 text-xs bg-editor-success/20 text-editor-success rounded hover:bg-editor-success/30 disabled:opacity-50 transition-colors"
|
||||
>
|
||||
{exportingClipIndex === i ? (
|
||||
<Loader2 className="w-3 h-3 animate-spin" />
|
||||
) : (
|
||||
<Download className="w-3 h-3" />
|
||||
)}
|
||||
Export
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function TabButton({
|
||||
active,
|
||||
onClick,
|
||||
icon,
|
||||
label,
|
||||
}: {
|
||||
active: boolean;
|
||||
onClick: () => void;
|
||||
icon: React.ReactNode;
|
||||
label: string;
|
||||
}) {
|
||||
return (
|
||||
<button
|
||||
onClick={onClick}
|
||||
className={`flex-1 flex items-center justify-center gap-1.5 px-3 py-2.5 text-xs font-medium transition-colors border-b-2 ${
|
||||
active
|
||||
? 'border-editor-accent text-editor-accent'
|
||||
: 'border-transparent text-editor-text-muted hover:text-editor-text'
|
||||
}`}
|
||||
>
|
||||
{icon}
|
||||
{label}
|
||||
</button>
|
||||
);
|
||||
}
|
||||
frontend/src/components/ExportDialog.tsx (new file, 229 lines)
@@ -0,0 +1,229 @@
import { useState, useCallback, useMemo } from 'react';
|
||||
import { useEditorStore } from '../store/editorStore';
|
||||
import { Download, Loader2, Zap, Cog, Info } from 'lucide-react';
|
||||
import type { ExportOptions } from '../types/project';
|
||||
|
||||
export default function ExportDialog() {
|
||||
const { videoPath, words, deletedRanges, isExporting, exportProgress, backendUrl, setExporting, getKeepSegments } =
|
||||
useEditorStore();
|
||||
|
||||
const hasCuts = deletedRanges.length > 0;
|
||||
|
||||
const [options, setOptions] = useState<Omit<ExportOptions, 'outputPath'>>({
|
||||
mode: 'fast',
|
||||
resolution: '1080p',
|
||||
format: 'mp4',
|
||||
enhanceAudio: false,
|
||||
captions: 'none',
|
||||
});
|
||||
|
||||
const handleExport = useCallback(async () => {
|
||||
if (!videoPath) return;
|
||||
|
||||
const outputPath = await window.electronAPI?.saveFile({
|
||||
defaultPath: videoPath.replace(/\.[^.]+$/, '_edited.mp4'),
|
||||
filters: [
|
||||
{ name: 'MP4', extensions: ['mp4'] },
|
||||
{ name: 'MOV', extensions: ['mov'] },
|
||||
{ name: 'WebM', extensions: ['webm'] },
|
||||
],
|
||||
});
|
||||
if (!outputPath) return;
|
||||
|
||||
setExporting(true, 0);
|
||||
try {
|
||||
const keepSegments = getKeepSegments();
|
||||
|
||||
const deletedSet = new Set<number>();
|
||||
for (const range of deletedRanges) {
|
||||
for (const idx of range.wordIndices) deletedSet.add(idx);
|
||||
}
|
||||
|
||||
const res = await fetch(`${backendUrl}/export`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
input_path: videoPath,
|
||||
output_path: outputPath,
|
||||
keep_segments: keepSegments,
|
||||
words: options.captions !== 'none' ? words : undefined,
|
||||
deleted_indices: options.captions !== 'none' ? [...deletedSet] : undefined,
|
||||
...options,
|
||||
}),
|
||||
});
|
||||
if (!res.ok) throw new Error(`Export failed: ${res.statusText}`);
|
||||
setExporting(false, 100);
|
||||
} catch (err) {
|
||||
console.error('Export error:', err);
|
||||
setExporting(false);
|
||||
}
|
||||
}, [videoPath, options, backendUrl, setExporting, getKeepSegments]);
|
||||
|
||||
return (
|
||||
<div className="p-4 space-y-5">
|
||||
<h3 className="text-sm font-semibold">Export Video</h3>
|
||||
|
||||
{/* Mode */}
|
||||
<fieldset className="space-y-2">
|
||||
<legend className="text-xs text-editor-text-muted font-medium">Export Mode</legend>
|
||||
<div className="grid grid-cols-2 gap-2">
|
||||
<ModeCard
|
||||
active={options.mode === 'fast'}
|
||||
onClick={() => setOptions((o) => ({ ...o, mode: 'fast' }))}
|
||||
icon={<Zap className="w-4 h-4" />}
|
||||
title="Fast"
|
||||
desc="Stream copy, seconds"
|
||||
/>
|
||||
<ModeCard
|
||||
active={options.mode === 'reencode'}
|
||||
onClick={() => setOptions((o) => ({ ...o, mode: 'reencode' }))}
|
||||
icon={<Cog className="w-4 h-4" />}
|
||||
title="Re-encode"
|
||||
desc="Custom quality, slower"
|
||||
/>
|
||||
</div>
|
||||
</fieldset>
|
||||
|
||||
{/* Resolution (only for re-encode) */}
|
||||
{options.mode === 'reencode' && (
|
||||
<SelectField
|
||||
label="Resolution"
|
||||
value={options.resolution}
|
||||
onChange={(v) => setOptions((o) => ({ ...o, resolution: v as ExportOptions['resolution'] }))}
|
||||
options={[
|
||||
{ value: '720p', label: '720p (HD)' },
|
||||
{ value: '1080p', label: '1080p (Full HD)' },
|
||||
{ value: '4k', label: '4K (Ultra HD)' },
|
||||
]}
|
||||
/>
|
||||
)}
|
||||
|
||||
{/* Format */}
|
||||
<SelectField
|
||||
label="Format"
|
||||
value={options.format}
|
||||
onChange={(v) => setOptions((o) => ({ ...o, format: v as ExportOptions['format'] }))}
|
||||
options={[
|
||||
{ value: 'mp4', label: 'MP4 (H.264)' },
|
||||
{ value: 'mov', label: 'MOV (QuickTime)' },
|
||||
{ value: 'webm', label: 'WebM (VP9)' },
|
||||
]}
|
||||
/>
|
||||
|
||||
{/* Audio enhancement */}
|
||||
<label className="flex items-center gap-2 cursor-pointer">
|
||||
<input
|
||||
type="checkbox"
|
||||
checked={options.enhanceAudio}
|
||||
onChange={(e) => setOptions((o) => ({ ...o, enhanceAudio: e.target.checked }))}
|
||||
className="w-4 h-4 rounded bg-editor-surface border-editor-border accent-editor-accent"
|
||||
/>
|
||||
<span className="text-xs">Enhance audio (Studio Sound)</span>
|
||||
</label>
|
||||
|
||||
{/* Captions */}
|
||||
<SelectField
|
||||
label="Captions"
|
||||
value={options.captions}
|
||||
onChange={(v) => setOptions((o) => ({ ...o, captions: v as ExportOptions['captions'] }))}
|
||||
options={[
|
||||
{ value: 'none', label: 'No captions' },
|
||||
{ value: 'burn-in', label: 'Burn-in (permanent)' },
|
||||
{ value: 'sidecar', label: 'Sidecar SRT file' },
|
||||
]}
|
||||
/>
|
||||
|
||||
{/* Export button */}
|
||||
<button
|
||||
onClick={handleExport}
|
||||
disabled={isExporting || !videoPath}
|
||||
className="w-full flex items-center justify-center gap-2 px-4 py-3 bg-editor-accent hover:bg-editor-accent-hover disabled:opacity-50 rounded-lg text-sm font-semibold transition-colors"
|
||||
>
|
||||
{isExporting ? (
|
||||
<>
|
||||
<Loader2 className="w-4 h-4 animate-spin" />
|
||||
Exporting... {Math.round(exportProgress)}%
|
||||
</>
|
||||
) : (
|
||||
<>
|
||||
<Download className="w-4 h-4" />
|
||||
Export
|
||||
</>
|
||||
)}
|
||||
</button>
|
||||
|
||||
{options.mode === 'fast' && !hasCuts && (
|
||||
<p className="text-[10px] text-editor-text-muted text-center">
|
||||
Fast mode uses stream copy — no quality loss, exports in seconds.
|
||||
</p>
|
||||
)}
|
||||
{options.mode === 'fast' && hasCuts && (
|
||||
<div className="flex items-start gap-1.5 p-2 bg-editor-accent/10 rounded text-[10px] text-editor-accent">
|
||||
<Info className="w-3.5 h-3.5 shrink-0 mt-0.5" />
|
||||
<span>
|
||||
Word-level cuts require re-encoding for frame-accurate output. Export will
|
||||
automatically use re-encode mode. This takes longer but ensures your cuts are precise.
|
||||
</span>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function ModeCard({
|
||||
active,
|
||||
onClick,
|
||||
icon,
|
||||
title,
|
||||
desc,
|
||||
}: {
|
||||
active: boolean;
|
||||
onClick: () => void;
|
||||
icon: React.ReactNode;
|
||||
title: string;
|
||||
desc: string;
|
||||
}) {
|
||||
return (
|
||||
<button
|
||||
onClick={onClick}
|
||||
className={`flex flex-col items-center gap-1 p-3 rounded-lg border-2 transition-colors ${
|
||||
active
|
||||
? 'border-editor-accent bg-editor-accent/10'
|
||||
: 'border-editor-border hover:border-editor-text-muted'
|
||||
}`}
|
||||
>
|
||||
{icon}
|
||||
<span className="text-xs font-medium">{title}</span>
|
||||
<span className="text-[10px] text-editor-text-muted">{desc}</span>
|
||||
</button>
|
||||
);
|
||||
}
|
||||
|
||||
function SelectField({
|
||||
label,
|
||||
value,
|
||||
onChange,
|
||||
options,
|
||||
}: {
|
||||
label: string;
|
||||
value: string;
|
||||
onChange: (value: string) => void;
|
||||
options: Array<{ value: string; label: string }>;
|
||||
}) {
|
||||
return (
|
||||
<div className="space-y-1">
|
||||
<label className="text-xs text-editor-text-muted font-medium">{label}</label>
|
||||
<select
|
||||
value={value}
|
||||
onChange={(e) => onChange(e.target.value)}
|
||||
className="w-full px-3 py-2 bg-editor-surface border border-editor-border rounded-lg text-xs text-editor-text focus:outline-none focus:border-editor-accent"
|
||||
>
|
||||
{options.map((opt) => (
|
||||
<option key={opt.value} value={opt.value}>
|
||||
{opt.label}
|
||||
</option>
|
||||
))}
|
||||
</select>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
frontend/src/components/SettingsPanel.tsx (new file, 192 lines)
@@ -0,0 +1,192 @@
import { useAIStore } from '../store/aiStore';
|
||||
import { useState, useEffect } from 'react';
|
||||
import type { AIProvider } from '../types/project';
|
||||
import { useEditorStore } from '../store/editorStore';
|
||||
import { Bot, Cloud, Brain, RefreshCw } from 'lucide-react';
|
||||
|
||||
export default function SettingsPanel() {
|
||||
const { providers, defaultProvider, setProviderConfig, setDefaultProvider } = useAIStore();
|
||||
const { backendUrl } = useEditorStore();
|
||||
const [ollamaModels, setOllamaModels] = useState<string[]>([]);
|
||||
const [loadingModels, setLoadingModels] = useState(false);
|
||||
|
||||
const fetchOllamaModels = async () => {
|
||||
setLoadingModels(true);
|
||||
try {
|
||||
const res = await fetch(`${backendUrl}/ai/ollama-models`);
|
||||
if (res.ok) {
|
||||
const data = await res.json();
|
||||
setOllamaModels(data.models || []);
|
||||
}
|
||||
} catch {
|
||||
setOllamaModels([]);
|
||||
} finally {
|
||||
setLoadingModels(false);
|
||||
}
|
||||
};
|
||||
|
||||
useEffect(() => {
|
||||
fetchOllamaModels();
|
||||
}, [backendUrl]);
|
||||
|
||||
const providerIcons: Record<AIProvider, React.ReactNode> = {
|
||||
ollama: <Bot className="w-4 h-4" />,
|
||||
openai: <Cloud className="w-4 h-4" />,
|
||||
claude: <Brain className="w-4 h-4" />,
|
||||
};
|
||||
|
||||
const providerLabels: Record<AIProvider, string> = {
|
||||
ollama: 'Ollama (Local)',
|
||||
openai: 'OpenAI',
|
||||
claude: 'Claude (Anthropic)',
|
||||
};
|
||||
|
||||
return (
|
||||
<div className="p-4 space-y-6">
|
||||
<h3 className="text-sm font-semibold">AI Settings</h3>
|
||||
|
||||
{/* Default provider selector */}
|
||||
<div className="space-y-2">
|
||||
<label className="text-xs text-editor-text-muted font-medium">Default AI Provider</label>
|
||||
<div className="grid grid-cols-3 gap-1.5">
|
||||
{(['ollama', 'openai', 'claude'] as AIProvider[]).map((p) => (
|
||||
<button
|
||||
key={p}
|
||||
onClick={() => setDefaultProvider(p)}
|
||||
className={`flex flex-col items-center gap-1 p-2 rounded-lg border transition-colors text-[10px] ${
|
||||
defaultProvider === p
|
||||
? 'border-editor-accent bg-editor-accent/10 text-editor-accent'
|
||||
: 'border-editor-border text-editor-text-muted hover:text-editor-text'
|
||||
}`}
|
||||
>
|
||||
{providerIcons[p]}
|
||||
{p.charAt(0).toUpperCase() + p.slice(1)}
|
||||
</button>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Ollama settings */}
|
||||
<ProviderSection title="Ollama (Local)" icon={providerIcons.ollama}>
|
||||
<InputField
|
||||
label="Base URL"
|
||||
value={providers.ollama.baseUrl || ''}
|
||||
onChange={(v) => setProviderConfig('ollama', { baseUrl: v })}
|
||||
placeholder="http://localhost:11434"
|
||||
/>
|
||||
<div className="space-y-1">
|
||||
<div className="flex items-center justify-between">
|
||||
<label className="text-xs text-editor-text-muted">Model</label>
|
||||
<button
|
||||
onClick={fetchOllamaModels}
|
||||
disabled={loadingModels}
|
||||
className="text-[10px] text-editor-accent hover:underline flex items-center gap-0.5"
|
||||
>
|
||||
<RefreshCw className={`w-2.5 h-2.5 ${loadingModels ? 'animate-spin' : ''}`} />
|
||||
Refresh
|
||||
</button>
|
||||
</div>
|
||||
{ollamaModels.length > 0 ? (
|
||||
<select
|
||||
value={providers.ollama.model}
|
||||
onChange={(e) => setProviderConfig('ollama', { model: e.target.value })}
|
||||
className="w-full px-3 py-2 bg-editor-surface border border-editor-border rounded-lg text-xs text-editor-text focus:outline-none focus:border-editor-accent"
|
||||
>
|
||||
{ollamaModels.map((m) => (
|
||||
<option key={m} value={m}>{m}</option>
|
||||
))}
|
||||
</select>
|
||||
) : (
|
||||
<InputField
|
||||
label=""
|
||||
value={providers.ollama.model}
|
||||
onChange={(v) => setProviderConfig('ollama', { model: v })}
|
||||
placeholder="llama3"
|
||||
/>
|
||||
)}
|
||||
</div>
|
||||
</ProviderSection>
|
||||
|
||||
{/* OpenAI settings */}
|
||||
<ProviderSection title="OpenAI" icon={providerIcons.openai}>
|
||||
<InputField
|
||||
label="API Key"
|
||||
value={providers.openai.apiKey || ''}
|
||||
onChange={(v) => setProviderConfig('openai', { apiKey: v })}
|
||||
placeholder="sk-..."
|
||||
type="password"
|
||||
/>
|
||||
<InputField
|
||||
label="Model"
|
||||
value={providers.openai.model}
|
||||
onChange={(v) => setProviderConfig('openai', { model: v })}
|
||||
placeholder="gpt-4o"
|
||||
/>
|
||||
</ProviderSection>
|
||||
|
||||
{/* Claude settings */}
|
||||
<ProviderSection title="Claude (Anthropic)" icon={providerIcons.claude}>
|
||||
<InputField
|
||||
label="API Key"
|
||||
value={providers.claude.apiKey || ''}
|
||||
onChange={(v) => setProviderConfig('claude', { apiKey: v })}
|
||||
placeholder="sk-ant-..."
|
||||
type="password"
|
||||
/>
|
||||
<InputField
|
||||
label="Model"
|
||||
value={providers.claude.model}
|
||||
onChange={(v) => setProviderConfig('claude', { model: v })}
|
||||
placeholder="claude-sonnet-4-20250514"
|
||||
/>
|
||||
</ProviderSection>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function ProviderSection({
|
||||
title,
|
||||
icon,
|
||||
children,
|
||||
}: {
|
||||
title: string;
|
||||
icon: React.ReactNode;
|
||||
children: React.ReactNode;
|
||||
}) {
|
||||
return (
|
||||
<div className="space-y-3 p-3 bg-editor-surface rounded-lg">
|
||||
<div className="flex items-center gap-2 text-xs font-medium">
|
||||
{icon}
|
||||
{title}
|
||||
</div>
|
||||
<div className="space-y-2">{children}</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function InputField({
|
||||
label,
|
||||
value,
|
||||
onChange,
|
||||
placeholder,
|
||||
type = 'text',
|
||||
}: {
|
||||
label: string;
|
||||
value: string;
|
||||
onChange: (value: string) => void;
|
||||
placeholder: string;
|
||||
type?: string;
|
||||
}) {
|
||||
return (
|
||||
<div className="space-y-1">
|
||||
{label && <label className="text-xs text-editor-text-muted">{label}</label>}
|
||||
<input
|
||||
type={type}
|
||||
value={value}
|
||||
onChange={(e) => onChange(e.target.value)}
|
||||
placeholder={placeholder}
|
||||
className="w-full px-3 py-2 bg-editor-bg border border-editor-border rounded-lg text-xs text-editor-text placeholder:text-editor-text-muted/50 focus:outline-none focus:border-editor-accent"
|
||||
/>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
204
frontend/src/components/TranscriptEditor.tsx
Normal file
@ -0,0 +1,204 @@
|
||||
import { useCallback, useRef, useEffect, useMemo, useState } from 'react';
|
||||
import { useEditorStore } from '../store/editorStore';
|
||||
import { Virtuoso } from 'react-virtuoso';
|
||||
import { Trash2, RotateCcw } from 'lucide-react';
|
||||
|
||||
export default function TranscriptEditor() {
|
||||
const words = useEditorStore((s) => s.words);
|
||||
const segments = useEditorStore((s) => s.segments);
|
||||
const deletedRanges = useEditorStore((s) => s.deletedRanges);
|
||||
const selectedWordIndices = useEditorStore((s) => s.selectedWordIndices);
|
||||
const hoveredWordIndex = useEditorStore((s) => s.hoveredWordIndex);
|
||||
const setSelectedWordIndices = useEditorStore((s) => s.setSelectedWordIndices);
|
||||
const setHoveredWordIndex = useEditorStore((s) => s.setHoveredWordIndex);
|
||||
const deleteSelectedWords = useEditorStore((s) => s.deleteSelectedWords);
|
||||
const restoreRange = useEditorStore((s) => s.restoreRange);
|
||||
const getWordAtTime = useEditorStore((s) => s.getWordAtTime);
|
||||
|
||||
const selectionStart = useRef<number | null>(null);
|
||||
const wasDragging = useRef(false);
|
||||
const virtuosoRef = useRef<any>(null);
|
||||
|
||||
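// Flatten deletedRanges into a Set of word indices so per-word render checks are O(1)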
const deletedSet = useMemo(() => {
|
||||
const s = new Set<number>();
|
||||
for (const range of deletedRanges) {
|
||||
for (const idx of range.wordIndices) s.add(idx);
|
||||
}
|
||||
return s;
|
||||
}, [deletedRanges]);
|
||||
|
||||
const selectedSet = useMemo(() => new Set(selectedWordIndices), [selectedWordIndices]);
|
||||
|
||||
const [activeWordIndex, setActiveWordIndex] = useState(-1);
|
||||
|
||||
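// Poll the <video> element every 250ms to track which word is currently being spoken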
useEffect(() => {
|
||||
if (words.length === 0) return;
|
||||
const interval = setInterval(() => {
|
||||
const video = document.querySelector('video') as HTMLVideoElement | null;
|
||||
if (!video) return;
|
||||
const idx = getWordAtTime(video.currentTime);
|
||||
setActiveWordIndex((prev) => (prev === idx ? prev : idx));
|
||||
}, 250);
|
||||
return () => clearInterval(interval);
|
||||
}, [words, getWordAtTime]);
|
||||
|
||||
// Auto-scroll to active segment via Virtuoso
|
||||
useEffect(() => {
|
||||
if (activeWordIndex < 0 || segments.length === 0) return;
|
||||
const segIdx = segments.findIndex((seg) => {
|
||||
const start = seg.globalStartIndex ?? 0;
|
||||
return activeWordIndex >= start && activeWordIndex < start + seg.words.length;
|
||||
});
|
||||
if (segIdx >= 0 && virtuosoRef.current) {
|
||||
virtuosoRef.current.scrollIntoView({ index: segIdx, behavior: 'smooth', align: 'center' });
|
||||
}
|
||||
}, [activeWordIndex, segments]);
|
||||
|
||||
const handleWordMouseDown = useCallback(
|
||||
(index: number, e: React.MouseEvent) => {
|
||||
e.preventDefault();
|
||||
wasDragging.current = false;
|
||||
if (e.shiftKey && selectedWordIndices.length > 0) {
|
||||
const first = selectedWordIndices[0];
|
||||
const start = Math.min(first, index);
|
||||
const end = Math.max(first, index);
|
||||
const indices = [];
|
||||
for (let i = start; i <= end; i++) indices.push(i);
|
||||
setSelectedWordIndices(indices);
|
||||
} else {
|
||||
selectionStart.current = index;
|
||||
setSelectedWordIndices([index]);
|
||||
}
|
||||
},
|
||||
[selectedWordIndices, setSelectedWordIndices],
|
||||
);
|
||||
|
||||
const handleWordMouseEnter = useCallback(
|
||||
(index: number) => {
|
||||
setHoveredWordIndex(index);
|
||||
if (selectionStart.current !== null) {
|
||||
wasDragging.current = true;
|
||||
const start = Math.min(selectionStart.current, index);
|
||||
const end = Math.max(selectionStart.current, index);
|
||||
const indices = [];
|
||||
for (let i = start; i <= end; i++) indices.push(i);
|
||||
setSelectedWordIndices(indices);
|
||||
}
|
||||
},
|
||||
[setHoveredWordIndex, setSelectedWordIndices],
|
||||
);
|
||||
|
||||
const handleMouseUp = useCallback(() => {
|
||||
selectionStart.current = null;
|
||||
}, []);
|
||||
|
||||
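// Clicking outside any word clears the selection, unless this click ends a drag-selection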
const handleClickOutside = useCallback(
|
||||
(e: React.MouseEvent) => {
|
||||
if (wasDragging.current) {
|
||||
wasDragging.current = false;
|
||||
return;
|
||||
}
|
||||
if ((e.target as HTMLElement).dataset.wordIndex === undefined) {
|
||||
setSelectedWordIndices([]);
|
||||
}
|
||||
},
|
||||
[setSelectedWordIndices],
|
||||
);
|
||||
|
||||
const getRangeForWord = useCallback(
|
||||
(wordIndex: number) => deletedRanges.find((r) => r.wordIndices.includes(wordIndex)),
|
||||
[deletedRanges],
|
||||
);
|
||||
|
||||
const renderSegment = useCallback(
|
||||
(index: number) => {
|
||||
const segment = segments[index];
|
||||
if (!segment) return null;
|
||||
return (
|
||||
<div className="mb-3 px-4">
|
||||
{segment.speaker && (
|
||||
<div className="text-xs text-editor-accent font-medium mb-1">
|
||||
{segment.speaker}
|
||||
</div>
|
||||
)}
|
||||
<p className="text-sm leading-relaxed flex flex-wrap">
|
||||
{segment.words.map((word, localIndex) => {
|
||||
const globalIndex = (segment.globalStartIndex ?? 0) + localIndex;
|
||||
const isDeleted = deletedSet.has(globalIndex);
|
||||
const isSelected = selectedSet.has(globalIndex);
|
||||
const isActive = globalIndex === activeWordIndex;
|
||||
const isHovered = globalIndex === hoveredWordIndex;
|
||||
const deletedRange = isDeleted ? getRangeForWord(globalIndex) : null;
|
||||
|
||||
return (
|
||||
<span
|
||||
key={globalIndex}
|
||||
id={`word-${globalIndex}`}
|
||||
data-word-index={globalIndex}
|
||||
onMouseDown={(e) => handleWordMouseDown(globalIndex, e)}
|
||||
onMouseEnter={() => handleWordMouseEnter(globalIndex)}
|
||||
onMouseLeave={() => setHoveredWordIndex(null)}
|
||||
className={`
|
||||
relative px-[2px] py-[1px] rounded cursor-pointer transition-colors
|
||||
${isDeleted ? 'line-through text-editor-text-muted/40 bg-editor-word-deleted' : ''}
|
||||
${isSelected && !isDeleted ? 'bg-editor-word-selected text-white' : ''}
|
||||
${isActive && !isDeleted && !isSelected ? 'bg-editor-accent/20 text-editor-accent' : ''}
|
||||
${isHovered && !isDeleted && !isSelected && !isActive ? 'bg-editor-word-hover' : ''}
|
||||
`}
|
||||
>
|
||||
{word.word}{' '}
|
||||
{isDeleted && isHovered && deletedRange && (
|
||||
<button
|
||||
onClick={(e) => {
|
||||
e.stopPropagation();
|
||||
restoreRange(deletedRange.id);
|
||||
}}
|
||||
className="absolute -top-5 left-1/2 -translate-x-1/2 flex items-center gap-0.5 px-1.5 py-0.5 bg-editor-surface border border-editor-border rounded text-[10px] text-editor-success whitespace-nowrap z-10"
|
||||
>
|
||||
<RotateCcw className="w-2.5 h-2.5" /> Restore
|
||||
</button>
|
||||
)}
|
||||
</span>
|
||||
);
|
||||
})}
|
||||
</p>
|
||||
</div>
|
||||
);
|
||||
},
|
||||
[segments, deletedSet, selectedSet, activeWordIndex, hoveredWordIndex, handleWordMouseDown, handleWordMouseEnter, setHoveredWordIndex, getRangeForWord, restoreRange],
|
||||
);
|
||||
|
||||
return (
|
||||
<div className="flex-1 flex flex-col min-h-0">
|
||||
<div className="flex items-center gap-2 px-4 py-2 border-b border-editor-border shrink-0">
|
||||
<span className="text-xs text-editor-text-muted flex-1">
|
||||
{words.length} words · {deletedRanges.length} cuts
|
||||
</span>
|
||||
{selectedWordIndices.length > 0 && (
|
||||
<button
|
||||
onClick={deleteSelectedWords}
|
||||
className="flex items-center gap-1 px-2 py-1 text-xs bg-editor-danger/20 text-editor-danger rounded hover:bg-editor-danger/30 transition-colors"
|
||||
>
|
||||
<Trash2 className="w-3 h-3" />
|
||||
Delete {selectedWordIndices.length} words
|
||||
</button>
|
||||
)}
|
||||
</div>
|
||||
|
||||
<div
|
||||
className="flex-1 min-h-0 select-none"
|
||||
onMouseUp={handleMouseUp}
|
||||
onClick={handleClickOutside}
|
||||
>
|
||||
<Virtuoso
|
||||
ref={virtuosoRef}
|
||||
totalCount={segments.length}
|
||||
itemContent={renderSegment}
|
||||
overscan={200}
|
||||
className="h-full"
|
||||
style={{ height: '100%' }}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
133
frontend/src/components/VideoPlayer.tsx
Normal file
@ -0,0 +1,133 @@
|
||||
import { useRef, useCallback, useState, useEffect } from 'react';
|
||||
import { useEditorStore } from '../store/editorStore';
|
||||
import { useVideoSync } from '../hooks/useVideoSync';
|
||||
import { Play, Pause, SkipBack, SkipForward, Volume2 } from 'lucide-react';
|
||||
|
||||
export default function VideoPlayer() {
|
||||
const videoRef = useRef<HTMLVideoElement>(null);
|
||||
const videoUrl = useEditorStore((s) => s.videoUrl);
|
||||
const isPlaying = useEditorStore((s) => s.isPlaying);
|
||||
const duration = useEditorStore((s) => s.duration);
|
||||
const { seekTo, togglePlay } = useVideoSync(videoRef);
|
||||
|
||||
const [displayTime, setDisplayTime] = useState(0);
|
||||
|
||||
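// Drive the progress readout from requestAnimationFrame rather than the throttled 'timeupdate' event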
useEffect(() => {
|
||||
const video = videoRef.current;
|
||||
if (!video) return;
|
||||
let raf = 0;
|
||||
const tick = () => {
|
||||
setDisplayTime(video.currentTime);
|
||||
raf = requestAnimationFrame(tick);
|
||||
};
|
||||
raf = requestAnimationFrame(tick);
|
||||
return () => cancelAnimationFrame(raf);
|
||||
}, [videoUrl]);
|
||||
|
||||
const formatTime = (seconds: number) => {
|
||||
const m = Math.floor(seconds / 60);
|
||||
const s = Math.floor(seconds % 60);
|
||||
return `${m}:${s.toString().padStart(2, '0')}`;
|
||||
};
|
||||
|
||||
const handleProgressClick = useCallback(
|
||||
(e: React.MouseEvent<HTMLDivElement>) => {
|
||||
const rect = e.currentTarget.getBoundingClientRect();
|
||||
const ratio = (e.clientX - rect.left) / rect.width;
|
||||
seekTo(ratio * duration);
|
||||
},
|
||||
[seekTo, duration],
|
||||
);
|
||||
|
||||
const skip = useCallback(
|
||||
(delta: number) => {
|
||||
const video = videoRef.current;
|
||||
if (!video) return;
|
||||
seekTo(Math.max(0, Math.min(duration, video.currentTime + delta)));
|
||||
},
|
||||
[seekTo, duration],
|
||||
);
|
||||
|
||||
if (!videoUrl) {
|
||||
return (
|
||||
<div className="w-full h-full flex items-center justify-center text-editor-text-muted text-sm">
|
||||
No video loaded
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
return (
|
||||
<div className="w-full h-full flex flex-col">
|
||||
<div className="flex-1 flex items-center justify-center bg-black rounded-lg overflow-hidden min-h-0">
|
||||
<video
|
||||
ref={videoRef}
|
||||
src={videoUrl}
|
||||
className="max-w-full max-h-full object-contain"
|
||||
playsInline
|
||||
onClick={togglePlay}
|
||||
/>
|
||||
</div>
|
||||
|
||||
<div className="pt-2 space-y-1.5 shrink-0">
|
||||
<div
|
||||
className="h-1.5 bg-editor-border rounded-full cursor-pointer group"
|
||||
onClick={handleProgressClick}
|
||||
>
|
||||
<div
|
||||
className="h-full bg-editor-accent rounded-full relative transition-all group-hover:h-2"
|
||||
style={{ width: duration > 0 ? `${(displayTime / duration) * 100}%` : '0%' }}
|
||||
>
|
||||
<div className="absolute right-0 top-1/2 -translate-y-1/2 w-3 h-3 bg-white rounded-full opacity-0 group-hover:opacity-100 transition-opacity" />
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="flex items-center justify-between">
|
||||
<div className="flex items-center gap-1">
|
||||
<ControlButton onClick={() => skip(-5)} title="Back 5s">
|
||||
<SkipBack className="w-4 h-4" />
|
||||
</ControlButton>
|
||||
<ControlButton onClick={togglePlay} title={isPlaying ? 'Pause' : 'Play'} primary>
|
||||
{isPlaying ? <Pause className="w-5 h-5" /> : <Play className="w-5 h-5 ml-0.5" />}
|
||||
</ControlButton>
|
||||
<ControlButton onClick={() => skip(5)} title="Forward 5s">
|
||||
<SkipForward className="w-4 h-4" />
|
||||
</ControlButton>
|
||||
</div>
|
||||
|
||||
<div className="flex items-center gap-3 text-xs text-editor-text-muted">
|
||||
<Volume2 className="w-3.5 h-3.5" />
|
||||
<span className="font-mono">
|
||||
{formatTime(displayTime)} / {formatTime(duration)}
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function ControlButton({
|
||||
children,
|
||||
onClick,
|
||||
title,
|
||||
primary,
|
||||
}: {
|
||||
children: React.ReactNode;
|
||||
onClick: () => void;
|
||||
title: string;
|
||||
primary?: boolean;
|
||||
}) {
|
||||
return (
|
||||
<button
|
||||
onClick={onClick}
|
||||
title={title}
|
||||
className={`p-1.5 rounded-md transition-colors ${
|
||||
primary
|
||||
? 'bg-editor-accent/20 text-editor-accent hover:bg-editor-accent/30'
|
||||
: 'text-editor-text-muted hover:text-editor-text hover:bg-editor-surface'
|
||||
}`}
|
||||
>
|
||||
{children}
|
||||
</button>
|
||||
);
|
||||
}
|
||||
220
frontend/src/components/WaveformTimeline.tsx
Normal file
@ -0,0 +1,220 @@
|
||||
import { useRef, useEffect, useCallback, useState } from 'react';
|
||||
import { useEditorStore } from '../store/editorStore';
|
||||
import { ZoomIn, ZoomOut, AlertTriangle } from 'lucide-react';
|
||||
|
||||
export default function WaveformTimeline() {
|
||||
const waveCanvasRef = useRef<HTMLCanvasElement>(null);
|
||||
const headCanvasRef = useRef<HTMLCanvasElement>(null);
|
||||
const containerRef = useRef<HTMLDivElement>(null);
|
||||
const [audioError, setAudioError] = useState<string | null>(null);
|
||||
|
||||
const videoUrl = useEditorStore((s) => s.videoUrl);
|
||||
const videoPath = useEditorStore((s) => s.videoPath);
|
||||
const duration = useEditorStore((s) => s.duration);
|
||||
const deletedRanges = useEditorStore((s) => s.deletedRanges);
|
||||
const setCurrentTime = useEditorStore((s) => s.setCurrentTime);
|
||||
|
||||
const audioContextRef = useRef<AudioContext | null>(null);
|
||||
const audioBufferRef = useRef<AudioBuffer | null>(null);
|
||||
const zoomRef = useRef(1);
|
||||
const rafRef = useRef(0);
|
||||
|
||||
useEffect(() => {
|
||||
if (!videoUrl || !videoPath) return;
|
||||
setAudioError(null);
|
||||
|
||||
const loadAudio = async () => {
|
||||
try {
|
||||
const ctx = new AudioContext();
|
||||
audioContextRef.current = ctx;
|
||||
|
||||
const response = await fetch(videoUrl);
|
||||
if (!response.ok) throw new Error(`HTTP ${response.status}`);
|
||||
const arrayBuffer = await response.arrayBuffer();
|
||||
const audioBuffer = await ctx.decodeAudioData(arrayBuffer);
|
||||
audioBufferRef.current = audioBuffer;
|
||||
drawStaticWaveform();
|
||||
} catch (err) {
|
||||
console.warn('Could not decode audio for waveform:', err);
|
||||
setAudioError('Waveform unavailable — audio could not be decoded');
|
||||
}
|
||||
};
|
||||
|
||||
loadAudio();
|
||||
|
||||
return () => {
|
||||
audioContextRef.current?.close();
|
||||
};
|
||||
}, [videoUrl, videoPath]);
|
||||
|
||||
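// Draw the min/max waveform and deleted-range overlays once onto the static canvas layer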
const drawStaticWaveform = useCallback(() => {
|
||||
const canvas = waveCanvasRef.current;
|
||||
const buffer = audioBufferRef.current;
|
||||
if (!canvas || !buffer) return;
|
||||
|
||||
const ctx = canvas.getContext('2d');
|
||||
if (!ctx) return;
|
||||
|
||||
const dpr = window.devicePixelRatio || 1;
|
||||
const rect = canvas.getBoundingClientRect();
|
||||
canvas.width = rect.width * dpr;
|
||||
canvas.height = rect.height * dpr;
|
||||
ctx.scale(dpr, dpr);
|
||||
|
||||
const width = rect.width;
|
||||
const height = rect.height;
|
||||
const channelData = buffer.getChannelData(0);
|
||||
const samplesPerPixel = Math.floor(channelData.length / width);
|
||||
|
||||
ctx.clearRect(0, 0, width, height);
|
||||
|
||||
for (const range of deletedRanges) {
|
||||
const x1 = (range.start / buffer.duration) * width;
|
||||
const x2 = (range.end / buffer.duration) * width;
|
||||
ctx.fillStyle = 'rgba(239, 68, 68, 0.15)';
|
||||
ctx.fillRect(x1, 0, x2 - x1, height);
|
||||
}
|
||||
|
||||
const mid = height / 2;
|
||||
ctx.beginPath();
|
||||
ctx.strokeStyle = '#4a4d5e';
|
||||
ctx.lineWidth = 1;
|
||||
|
||||
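// One vertical stroke per pixel column, spanning the min and max sample in that column's window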
for (let x = 0; x < width; x++) {
|
||||
const start = x * samplesPerPixel;
|
||||
const end = Math.min(start + samplesPerPixel, channelData.length);
|
||||
|
||||
let min = 0;
|
||||
let max = 0;
|
||||
for (let i = start; i < end; i++) {
|
||||
if (channelData[i] < min) min = channelData[i];
|
||||
if (channelData[i] > max) max = channelData[i];
|
||||
}
|
||||
|
||||
const yMin = mid + min * mid * 0.9;
|
||||
const yMax = mid + max * mid * 0.9;
|
||||
ctx.moveTo(x, yMin);
|
||||
ctx.lineTo(x, yMax);
|
||||
}
|
||||
ctx.stroke();
|
||||
}, [deletedRanges]);
|
||||
|
||||
// Redraw static layer when deletedRanges change
|
||||
useEffect(() => {
|
||||
drawStaticWaveform();
|
||||
}, [drawStaticWaveform]);
|
||||
|
||||
// Lightweight RAF loop for playhead only -- reads video.currentTime directly,
|
||||
// never triggers React re-renders
|
||||
useEffect(() => {
|
||||
const headCanvas = headCanvasRef.current;
|
||||
const waveCanvas = waveCanvasRef.current;
|
||||
if (!headCanvas || !waveCanvas) return;
|
||||
|
||||
const tick = () => {
|
||||
const ctx = headCanvas.getContext('2d');
|
||||
if (!ctx) { rafRef.current = requestAnimationFrame(tick); return; }
|
||||
|
||||
const buffer = audioBufferRef.current;
|
||||
const video = document.querySelector('video') as HTMLVideoElement | null;
|
||||
const dur = buffer?.duration ?? 0;
|
||||
|
||||
const dpr = window.devicePixelRatio || 1;
|
||||
const rect = headCanvas.getBoundingClientRect();
|
||||
if (headCanvas.width !== waveCanvas.width || headCanvas.height !== waveCanvas.height) {
|
||||
headCanvas.width = rect.width * dpr;
|
||||
headCanvas.height = rect.height * dpr;
|
||||
}
|
||||
ctx.setTransform(dpr, 0, 0, dpr, 0, 0);
|
||||
|
||||
const width = rect.width;
|
||||
const height = rect.height;
|
||||
ctx.clearRect(0, 0, width, height);
|
||||
|
||||
if (dur > 0 && video) {
|
||||
const px = (video.currentTime / dur) * width;
|
||||
ctx.beginPath();
|
||||
ctx.strokeStyle = '#6366f1';
|
||||
ctx.lineWidth = 2;
|
||||
ctx.moveTo(px, 0);
|
||||
ctx.lineTo(px, height);
|
||||
ctx.stroke();
|
||||
}
|
||||
|
||||
rafRef.current = requestAnimationFrame(tick);
|
||||
};
|
||||
|
||||
rafRef.current = requestAnimationFrame(tick);
|
||||
return () => cancelAnimationFrame(rafRef.current);
|
||||
}, [videoUrl]);
|
||||
|
||||
useEffect(() => {
|
||||
const observer = new ResizeObserver(() => {
|
||||
drawStaticWaveform();
|
||||
});
|
||||
if (containerRef.current) observer.observe(containerRef.current);
|
||||
return () => observer.disconnect();
|
||||
}, [drawStaticWaveform]);
|
||||
|
||||
const handleClick = useCallback(
|
||||
(e: React.MouseEvent<HTMLCanvasElement>) => {
|
||||
if (!headCanvasRef.current || duration === 0) return;
|
||||
const rect = headCanvasRef.current.getBoundingClientRect();
|
||||
const ratio = (e.clientX - rect.left) / rect.width;
|
||||
const newTime = ratio * duration;
|
||||
setCurrentTime(newTime);
|
||||
const video = document.querySelector('video');
|
||||
if (video) video.currentTime = newTime;
|
||||
},
|
||||
[duration, setCurrentTime],
|
||||
);
|
||||
|
||||
if (!videoUrl) {
|
||||
return (
|
||||
<div className="w-full h-full flex items-center justify-center text-editor-text-muted text-xs">
|
||||
Load a video to see the waveform
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
return (
|
||||
<div ref={containerRef} className="w-full h-full flex flex-col">
|
||||
<div className="flex items-center justify-between px-3 py-1 shrink-0">
|
||||
<span className="text-[10px] text-editor-text-muted font-medium uppercase tracking-wider">
|
||||
Timeline
|
||||
</span>
|
||||
<div className="flex items-center gap-1">
|
||||
<button
|
||||
onClick={() => { zoomRef.current = Math.max(0.5, zoomRef.current - 0.5); drawStaticWaveform(); }}
|
||||
className="p-0.5 text-editor-text-muted hover:text-editor-text"
|
||||
title="Zoom out"
|
||||
>
|
||||
<ZoomOut className="w-3.5 h-3.5" />
|
||||
</button>
|
||||
<button
|
||||
onClick={() => { zoomRef.current = Math.min(10, zoomRef.current + 0.5); drawStaticWaveform(); }}
|
||||
className="p-0.5 text-editor-text-muted hover:text-editor-text"
|
||||
title="Zoom in"
|
||||
>
|
||||
<ZoomIn className="w-3.5 h-3.5" />
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
{audioError ? (
|
||||
<div className="flex-1 flex items-center justify-center gap-2 text-editor-text-muted text-xs">
|
||||
<AlertTriangle className="w-4 h-4 text-yellow-500" />
|
||||
<span>{audioError}</span>
|
||||
</div>
|
||||
) : (
|
||||
<div className="flex-1 relative">
|
||||
<canvas ref={waveCanvasRef} className="absolute inset-0 w-full h-full" />
|
||||
<canvas
|
||||
ref={headCanvasRef}
|
||||
className="absolute inset-0 w-full h-full cursor-crosshair"
|
||||
onClick={handleClick}
|
||||
/>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
236
frontend/src/hooks/useKeyboardShortcuts.ts
Normal file
@ -0,0 +1,236 @@
|
||||
import { useEffect, useRef } from 'react';
|
||||
import { useEditorStore } from '../store/editorStore';
|
||||
|
||||
export function useKeyboardShortcuts() {
|
||||
const deleteSelectedWords = useEditorStore((s) => s.deleteSelectedWords);
|
||||
const selectedWordIndices = useEditorStore((s) => s.selectedWordIndices);
|
||||
|
||||
const playbackRateRef = useRef(1);
|
||||
|
||||
useEffect(() => {
|
||||
const getVideo = (): HTMLVideoElement | null => document.querySelector('video');
|
||||
|
||||
const handler = (e: KeyboardEvent) => {
|
||||
const target = e.target as HTMLElement;
|
||||
if (target.tagName === 'INPUT' || target.tagName === 'TEXTAREA' || target.tagName === 'SELECT') return;
|
||||
|
||||
const video = getVideo();
|
||||
|
||||
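// switch (true) lets each case pair a key check with its modifier checks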
switch (true) {
|
||||
// --- Undo / Redo ---
|
||||
case e.key === 'z' && (e.ctrlKey || e.metaKey) && e.shiftKey: {
|
||||
e.preventDefault();
|
||||
useEditorStore.temporal.getState().redo();
|
||||
return;
|
||||
}
|
||||
case e.key === 'z' && (e.ctrlKey || e.metaKey): {
|
||||
e.preventDefault();
|
||||
useEditorStore.temporal.getState().undo();
|
||||
return;
|
||||
}
|
||||
|
||||
// --- Delete / Backspace: delete selected words ---
|
||||
case e.key === 'Delete' || e.key === 'Backspace': {
|
||||
if (selectedWordIndices.length > 0) {
|
||||
e.preventDefault();
|
||||
deleteSelectedWords();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// --- Space: play / pause ---
|
||||
case e.key === ' ' && !e.ctrlKey: {
|
||||
e.preventDefault();
|
||||
if (video) {
|
||||
if (video.paused) video.play();
|
||||
else video.pause();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// --- J: reverse / slow down ---
|
||||
case e.key === 'j' || e.key === 'J': {
|
||||
e.preventDefault();
|
||||
if (video) {
|
||||
playbackRateRef.current = Math.max(-2, playbackRateRef.current - 0.5);
|
||||
if (playbackRateRef.current < 0) {
|
||||
// HTML5 video doesn't support negative rates natively; step back
|
||||
video.currentTime = Math.max(0, video.currentTime - 2);
|
||||
} else {
|
||||
video.playbackRate = playbackRateRef.current;
|
||||
if (video.paused) video.play();
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// --- K: pause ---
|
||||
case e.key === 'k' || e.key === 'K': {
|
||||
e.preventDefault();
|
||||
if (video) {
|
||||
video.pause();
|
||||
playbackRateRef.current = 1;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// --- L: forward / speed up ---
|
||||
case e.key === 'l' || e.key === 'L': {
|
||||
e.preventDefault();
|
||||
if (video) {
|
||||
playbackRateRef.current = Math.min(4, playbackRateRef.current + 0.5);
|
||||
video.playbackRate = Math.max(0.25, playbackRateRef.current);
|
||||
if (video.paused) video.play();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// --- Arrow Left: seek back 5s ---
|
||||
case e.key === 'ArrowLeft' && !e.ctrlKey: {
|
||||
e.preventDefault();
|
||||
if (video) video.currentTime = Math.max(0, video.currentTime - 5);
|
||||
return;
|
||||
}
|
||||
|
||||
// --- Arrow Right: seek forward 5s ---
|
||||
case e.key === 'ArrowRight' && !e.ctrlKey: {
|
||||
e.preventDefault();
|
||||
if (video) video.currentTime = Math.min(video.duration, video.currentTime + 5);
|
||||
return;
|
||||
}
|
||||
|
||||
// --- [ mark in-point (home) ---
|
||||
case e.key === '[': {
|
||||
e.preventDefault();
|
||||
if (video) video.currentTime = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
// --- ] mark out-point (end) ---
|
||||
case e.key === ']': {
|
||||
e.preventDefault();
|
||||
if (video) video.currentTime = video.duration;
|
||||
return;
|
||||
}
|
||||
|
||||
// --- Ctrl+S: save project ---
|
||||
case e.key === 's' && (e.ctrlKey || e.metaKey): {
|
||||
e.preventDefault();
|
||||
saveProject();
|
||||
return;
|
||||
}
|
||||
|
||||
// --- Ctrl+E: export ---
|
||||
case e.key === 'e' && (e.ctrlKey || e.metaKey): {
|
||||
e.preventDefault();
|
||||
// Trigger export panel via DOM click
|
||||
const exportBtn = document.querySelector('[title="Export"]') as HTMLButtonElement;
|
||||
if (exportBtn) exportBtn.click();
|
||||
return;
|
||||
}
|
||||
|
||||
// --- ?: show shortcut cheatsheet ---
|
||||
case e.key === '?' || (e.key === '/' && e.shiftKey): {
|
||||
e.preventDefault();
|
||||
toggleCheatsheet();
|
||||
return;
|
||||
}
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
window.addEventListener('keydown', handler);
|
||||
return () => window.removeEventListener('keydown', handler);
|
||||
}, [deleteSelectedWords, selectedWordIndices]);
|
||||
}
|
||||
|
||||
async function saveProject() {
|
||||
const state = useEditorStore.getState();
|
||||
if (!state.videoPath || state.words.length === 0) return;
|
||||
|
||||
try {
|
||||
const projectData = {
|
||||
version: 1,
|
||||
videoPath: state.videoPath,
|
||||
words: state.words,
|
||||
segments: state.segments,
|
||||
deletedRanges: state.deletedRanges,
|
||||
language: state.language,
|
||||
createdAt: new Date().toISOString(),
|
||||
modifiedAt: new Date().toISOString(),
|
||||
};
|
||||
|
||||
const outputPath = await window.electronAPI?.saveFile({
|
||||
defaultPath: state.videoPath.replace(/\.[^.]+$/, '.aive'),
|
||||
filters: [{ name: 'CutScript Project', extensions: ['aive'] }],
|
||||
});
|
||||
|
||||
if (outputPath) {
|
||||
if (window.electronAPI?.writeFile) {
|
||||
await window.electronAPI.writeFile(outputPath, JSON.stringify(projectData, null, 2));
|
||||
} else {
|
||||
const blob = new Blob([JSON.stringify(projectData, null, 2)], { type: 'application/json' });
|
||||
const url = URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = outputPath.split(/[\\/]/).pop() || 'project.aive';
|
||||
a.click();
|
||||
URL.revokeObjectURL(url);
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
console.error('Failed to save project:', err);
|
||||
}
|
||||
}
|
||||
|
||||
let cheatsheetVisible = false;
|
||||
|
||||
function toggleCheatsheet() {
|
||||
const existing = document.getElementById('keyboard-cheatsheet');
|
||||
if (existing) {
|
||||
existing.remove();
|
||||
cheatsheetVisible = false;
|
||||
return;
|
||||
}
|
||||
|
||||
cheatsheetVisible = true;
|
||||
const overlay = document.createElement('div');
|
||||
overlay.id = 'keyboard-cheatsheet';
|
||||
overlay.style.cssText =
|
||||
'position:fixed;inset:0;z-index:9999;display:flex;align-items:center;justify-content:center;background:rgba(0,0,0,0.7);';
|
||||
overlay.onclick = () => {
|
||||
overlay.remove();
|
||||
cheatsheetVisible = false;
|
||||
};
|
||||
|
||||
const shortcuts = [
|
||||
['Space', 'Play / Pause'],
|
||||
['J', 'Reverse / Slow down'],
|
||||
['K', 'Pause'],
|
||||
['L', 'Forward / Speed up'],
|
||||
['\u2190 / \u2192', 'Seek \u00b15 seconds'],
|
||||
['Delete', 'Delete selected words'],
|
||||
['Ctrl+Z', 'Undo'],
|
||||
['Ctrl+Shift+Z', 'Redo'],
|
||||
['Ctrl+S', 'Save project'],
|
||||
['Ctrl+E', 'Export'],
|
||||
['?', 'This cheatsheet'],
|
||||
];
|
||||
|
||||
const rows = shortcuts
|
||||
.map(
|
||||
([key, desc]) =>
|
||||
`<tr><td style="padding:6px 16px 6px 0;font-family:monospace;color:#818cf8;font-weight:600">${key}</td><td style="padding:6px 0;color:#e2e8f0">${desc}</td></tr>`,
|
||||
)
|
||||
.join('');
|
||||
|
||||
overlay.innerHTML = `<div style="background:#1a1d27;border:1px solid #2a2d3a;border-radius:12px;padding:24px 32px;max-width:400px;" onclick="event.stopPropagation()">
|
||||
<h3 style="margin:0 0 16px;font-size:14px;font-weight:600;color:#e2e8f0">Keyboard Shortcuts</h3>
|
||||
<table style="font-size:13px">${rows}</table>
|
||||
<p style="margin:16px 0 0;font-size:11px;color:#94a3b8;text-align:center">Press ? or click outside to close</p>
|
||||
</div>`;
|
||||
|
||||
document.body.appendChild(overlay);
|
||||
}
|
||||
69
frontend/src/hooks/useVideoSync.ts
Normal file
@ -0,0 +1,69 @@
|
||||
import { useCallback, useRef, useEffect } from 'react';
|
||||
import { useEditorStore } from '../store/editorStore';
|
||||
|
||||
export function useVideoSync(videoRef: React.RefObject<HTMLVideoElement | null>) {
|
||||
const rafRef = useRef<number>(0);
|
||||
const {
|
||||
setCurrentTime,
|
||||
setDuration,
|
||||
setIsPlaying,
|
||||
deletedRanges,
|
||||
} = useEditorStore();
|
||||
|
||||
const seekTo = useCallback(
|
||||
(time: number) => {
|
||||
if (videoRef.current) {
|
||||
videoRef.current.currentTime = time;
|
||||
setCurrentTime(time);
|
||||
}
|
||||
},
|
||||
[videoRef, setCurrentTime],
|
||||
);
|
||||
|
||||
const togglePlay = useCallback(() => {
|
||||
if (!videoRef.current) return;
|
||||
if (videoRef.current.paused) {
|
||||
videoRef.current.play();
|
||||
} else {
|
||||
videoRef.current.pause();
|
||||
}
|
||||
}, [videoRef]);
|
||||
|
||||
useEffect(() => {
|
||||
const video = videoRef.current;
|
||||
if (!video) return;
|
||||
|
||||
const onTimeUpdate = () => {
|
||||
cancelAnimationFrame(rafRef.current);
|
||||
rafRef.current = requestAnimationFrame(() => {
|
||||
const t = video.currentTime;
|
||||
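// If playback enters a deleted range, jump the playhead to the end of the range instead of playing the cut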
for (const range of deletedRanges) {
|
||||
if (t >= range.start && t < range.end) {
|
||||
video.currentTime = range.end;
|
||||
return;
|
||||
}
|
||||
}
|
||||
setCurrentTime(t);
|
||||
});
|
||||
};
|
||||
|
||||
const onPlay = () => setIsPlaying(true);
|
||||
const onPause = () => setIsPlaying(false);
|
||||
const onLoadedMetadata = () => setDuration(video.duration);
|
||||
|
||||
video.addEventListener('timeupdate', onTimeUpdate);
|
||||
video.addEventListener('play', onPlay);
|
||||
video.addEventListener('pause', onPause);
|
||||
video.addEventListener('loadedmetadata', onLoadedMetadata);
|
||||
|
||||
return () => {
|
||||
video.removeEventListener('timeupdate', onTimeUpdate);
|
||||
video.removeEventListener('play', onPlay);
|
||||
video.removeEventListener('pause', onPause);
|
||||
video.removeEventListener('loadedmetadata', onLoadedMetadata);
|
||||
cancelAnimationFrame(rafRef.current);
|
||||
};
|
||||
}, [videoRef, deletedRanges, setCurrentTime, setIsPlaying, setDuration]);
|
||||
|
||||
return { seekTo, togglePlay };
|
||||
}
|
||||
37
frontend/src/index.css
Normal file
@ -0,0 +1,37 @@
|
||||
@tailwind base;
|
||||
@tailwind components;
|
||||
@tailwind utilities;
|
||||
|
||||
* {
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
body {
|
||||
font-family: 'Inter', system-ui, -apple-system, sans-serif;
|
||||
overflow: hidden;
|
||||
user-select: none;
|
||||
}
|
||||
|
||||
::-webkit-scrollbar {
|
||||
width: 6px;
|
||||
height: 6px;
|
||||
}
|
||||
|
||||
::-webkit-scrollbar-track {
|
||||
background: transparent;
|
||||
}
|
||||
|
||||
::-webkit-scrollbar-thumb {
|
||||
background: #2a2d3a;
|
||||
border-radius: 3px;
|
||||
}
|
||||
|
||||
::-webkit-scrollbar-thumb:hover {
|
||||
background: #3a3d4a;
|
||||
}
|
||||
|
||||
video::-webkit-media-controls {
|
||||
display: none !important;
|
||||
}
|
||||
10
frontend/src/main.tsx
Normal file
@ -0,0 +1,10 @@
|
||||
import React from 'react';
|
||||
import ReactDOM from 'react-dom/client';
|
||||
import App from './App';
|
||||
import './index.css';
|
||||
|
||||
ReactDOM.createRoot(document.getElementById('root')!).render(
|
||||
<React.StrictMode>
|
||||
<App />
|
||||
</React.StrictMode>,
|
||||
);
|
||||
129
frontend/src/store/aiStore.ts
Normal file
@ -0,0 +1,129 @@
|
||||
import { create } from 'zustand';
|
||||
import { persist } from 'zustand/middleware';
|
||||
import type { AIProvider, AIProviderConfig, FillerWordResult, ClipSuggestion } from '../types/project';
|
||||
|
||||
const ENCRYPTED_KEY_PREFIX = 'aive_enc_';
|
||||
|
||||
interface AIState {
|
||||
providers: Record<AIProvider, AIProviderConfig>;
|
||||
defaultProvider: AIProvider;
|
||||
customFillerWords: string;
|
||||
fillerResult: FillerWordResult | null;
|
||||
clipSuggestions: ClipSuggestion[];
|
||||
isProcessing: boolean;
|
||||
processingMessage: string;
|
||||
_keysHydrated: boolean;
|
||||
}
|
||||
|
||||
interface AIActions {
|
||||
setProviderConfig: (provider: AIProvider, config: Partial<AIProviderConfig>) => void;
|
||||
setDefaultProvider: (provider: AIProvider) => void;
|
||||
setCustomFillerWords: (words: string) => void;
|
||||
setFillerResult: (result: FillerWordResult | null) => void;
|
||||
setClipSuggestions: (suggestions: ClipSuggestion[]) => void;
|
||||
setProcessing: (active: boolean, message?: string) => void;
|
||||
hydrateKeys: () => Promise<void>;
|
||||
}
|
||||
|
||||
async function encryptAndStore(key: string, value: string): Promise<void> {
|
||||
if (!value) {
|
||||
localStorage.removeItem(ENCRYPTED_KEY_PREFIX + key);
|
||||
return;
|
||||
}
|
||||
if (window.electronAPI) {
|
||||
const encrypted = await window.electronAPI.encryptString(value);
|
||||
localStorage.setItem(ENCRYPTED_KEY_PREFIX + key, encrypted);
|
||||
} else {
|
||||
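// Non-Electron fallback: base64 in localStorage is obfuscation only, not real encryption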
localStorage.setItem(ENCRYPTED_KEY_PREFIX + key, btoa(value));
|
||||
}
|
||||
}
|
||||
|
||||
async function loadAndDecrypt(key: string): Promise<string> {
|
||||
const stored = localStorage.getItem(ENCRYPTED_KEY_PREFIX + key);
|
||||
if (!stored) return '';
|
||||
if (window.electronAPI) {
|
||||
try {
|
||||
return await window.electronAPI.decryptString(stored);
|
||||
} catch {
|
||||
return '';
|
||||
}
|
||||
}
|
||||
try {
|
||||
return atob(stored);
|
||||
} catch {
|
||||
return '';
|
||||
}
|
||||
}
|
||||
|
||||
export const useAIStore = create<AIState & AIActions>()(
|
||||
persist(
|
||||
(set, get) => ({
|
||||
providers: {
|
||||
ollama: { provider: 'ollama', baseUrl: 'http://localhost:11434', model: 'llama3' },
|
||||
openai: { provider: 'openai', apiKey: '', model: 'gpt-4o' },
|
||||
claude: { provider: 'claude', apiKey: '', model: 'claude-sonnet-4-20250514' },
|
||||
},
|
||||
defaultProvider: 'ollama',
|
||||
customFillerWords: '',
|
||||
fillerResult: null,
|
||||
clipSuggestions: [],
|
||||
isProcessing: false,
|
||||
processingMessage: '',
|
||||
_keysHydrated: false,
|
||||
|
||||
setProviderConfig: (provider, config) => {
|
||||
set((state) => ({
|
||||
providers: {
|
||||
...state.providers,
|
||||
[provider]: { ...state.providers[provider], ...config },
|
||||
},
|
||||
}));
|
||||
|
||||
if (config.apiKey !== undefined) {
|
||||
encryptAndStore(`${provider}_apiKey`, config.apiKey);
|
||||
}
|
||||
},
|
||||
|
||||
setDefaultProvider: (provider) => set({ defaultProvider: provider }),
|
||||
|
||||
setCustomFillerWords: (words) => set({ customFillerWords: words }),
|
||||
|
||||
setFillerResult: (result) => set({ fillerResult: result }),
|
||||
|
||||
setClipSuggestions: (suggestions) => set({ clipSuggestions: suggestions }),
|
||||
|
||||
setProcessing: (active, message) =>
|
||||
set({ isProcessing: active, processingMessage: message ?? '' }),
|
||||
|
||||
hydrateKeys: async () => {
|
||||
const [openaiKey, claudeKey] = await Promise.all([
|
||||
loadAndDecrypt('openai_apiKey'),
|
||||
loadAndDecrypt('claude_apiKey'),
|
||||
]);
|
||||
const state = get();
|
||||
set({
|
||||
providers: {
|
||||
...state.providers,
|
||||
openai: { ...state.providers.openai, apiKey: openaiKey },
|
||||
claude: { ...state.providers.claude, apiKey: claudeKey },
|
||||
},
|
||||
_keysHydrated: true,
|
||||
});
|
||||
},
|
||||
}),
|
||||
{
|
||||
name: 'aive-ai-settings',
|
||||
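// API keys are excluded from the persisted slice; they are stored separately via encryptAndStore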
partialize: (state) => ({
|
||||
providers: {
|
||||
ollama: { ...state.providers.ollama, apiKey: undefined },
|
||||
openai: { ...state.providers.openai, apiKey: '' },
|
||||
claude: { ...state.providers.claude, apiKey: '' },
|
||||
},
|
||||
defaultProvider: state.defaultProvider,
|
||||
customFillerWords: state.customFillerWords,
|
||||
}),
|
||||
},
|
||||
),
|
||||
);
|
||||
|
||||
useAIStore.getState().hydrateKeys();
|
||||
232
frontend/src/store/editorStore.ts
Normal file
@ -0,0 +1,232 @@
|
||||
import { create } from 'zustand';
|
||||
import { temporal } from 'zundo';
|
||||
import type { Word, Segment, DeletedRange, TranscriptionResult } from '../types/project';
|
||||
|
||||
interface EditorState {
|
||||
videoPath: string | null;
|
||||
videoUrl: string | null;
|
||||
words: Word[];
|
||||
segments: Segment[];
|
||||
deletedRanges: DeletedRange[];
|
||||
language: string;
|
||||
|
||||
currentTime: number;
|
||||
duration: number;
|
||||
isPlaying: boolean;
|
||||
|
||||
selectedWordIndices: number[];
|
||||
hoveredWordIndex: number | null;
|
||||
|
||||
isTranscribing: boolean;
|
||||
transcriptionProgress: number;
|
||||
isExporting: boolean;
|
||||
exportProgress: number;
|
||||
|
||||
backendUrl: string;
|
||||
}
|
||||
|
||||
interface EditorActions {
|
||||
setBackendUrl: (url: string) => void;
|
||||
loadVideo: (path: string) => void;
|
||||
setTranscription: (result: TranscriptionResult) => void;
|
||||
setCurrentTime: (time: number) => void;
|
||||
setDuration: (duration: number) => void;
|
||||
setIsPlaying: (playing: boolean) => void;
|
||||
setSelectedWordIndices: (indices: number[]) => void;
|
||||
setHoveredWordIndex: (index: number | null) => void;
|
||||
deleteSelectedWords: () => void;
|
||||
deleteWordRange: (startIndex: number, endIndex: number) => void;
|
||||
restoreRange: (rangeId: string) => void;
|
||||
setTranscribing: (active: boolean, progress?: number) => void;
|
||||
setExporting: (active: boolean, progress?: number) => void;
|
||||
getKeepSegments: () => Array<{ start: number; end: number }>;
|
||||
getWordAtTime: (time: number) => number;
|
||||
loadProject: (projectData: any) => void;
|
||||
reset: () => void;
|
||||
}
|
||||
|
||||
const initialState: EditorState = {
|
||||
videoPath: null,
|
||||
videoUrl: null,
|
||||
words: [],
|
||||
segments: [],
|
||||
deletedRanges: [],
|
||||
language: '',
|
||||
currentTime: 0,
|
||||
duration: 0,
|
||||
isPlaying: false,
|
||||
selectedWordIndices: [],
|
||||
hoveredWordIndex: null,
|
||||
isTranscribing: false,
|
||||
transcriptionProgress: 0,
|
||||
isExporting: false,
|
||||
exportProgress: 0,
|
||||
backendUrl: 'http://localhost:8642',
|
||||
};
|
||||
|
||||
let nextRangeId = 1;
|
||||
|
||||
export const useEditorStore = create<EditorState & EditorActions>()(
|
||||
temporal(
|
||||
(set, get) => ({
|
||||
...initialState,
|
||||
|
||||
setBackendUrl: (url) => set({ backendUrl: url }),
|
||||
|
||||
loadVideo: (path) => {
|
||||
const backend = get().backendUrl;
|
||||
const url = `${backend}/file?path=${encodeURIComponent(path)}`;
|
||||
set({
|
||||
...initialState,
|
||||
backendUrl: backend,
|
||||
videoPath: path,
|
||||
videoUrl: url,
|
||||
});
|
||||
},
|
||||
|
||||
setTranscription: (result) => {
|
||||
let globalIdx = 0;
|
||||
const annotatedSegments = result.segments.map((seg) => {
|
||||
const annotated = { ...seg, globalStartIndex: globalIdx };
|
||||
globalIdx += seg.words.length;
|
||||
return annotated;
|
||||
});
|
||||
set({
|
||||
words: result.words,
|
||||
segments: annotatedSegments,
|
||||
language: result.language,
|
||||
deletedRanges: [],
|
||||
selectedWordIndices: [],
|
||||
});
|
||||
},
|
||||
|
||||
setCurrentTime: (time) => set({ currentTime: time }),
|
||||
setDuration: (duration) => set({ duration }),
|
||||
setIsPlaying: (playing) => set({ isPlaying: playing }),
|
||||
setSelectedWordIndices: (indices) => set({ selectedWordIndices: indices }),
|
||||
setHoveredWordIndex: (index) => set({ hoveredWordIndex: index }),
|
||||
|
||||
deleteSelectedWords: () => {
|
||||
const { selectedWordIndices, words, deletedRanges } = get();
|
||||
if (selectedWordIndices.length === 0) return;
|
||||
|
||||
const sorted = [...selectedWordIndices].sort((a, b) => a - b);
|
||||
const startWord = words[sorted[0]];
|
||||
const endWord = words[sorted[sorted.length - 1]];
|
||||
|
||||
const newRange: DeletedRange = {
|
||||
id: `dr_${nextRangeId++}`,
|
||||
start: startWord.start,
|
||||
end: endWord.end,
|
||||
wordIndices: sorted,
|
||||
};
|
||||
|
||||
set({
|
||||
deletedRanges: [...deletedRanges, newRange],
|
||||
selectedWordIndices: [],
|
||||
});
|
||||
},
|
||||
|
||||
deleteWordRange: (startIndex, endIndex) => {
|
||||
const { words, deletedRanges } = get();
|
||||
const indices = [];
|
||||
for (let i = startIndex; i <= endIndex; i++) indices.push(i);
|
||||
|
||||
const newRange: DeletedRange = {
|
||||
id: `dr_${nextRangeId++}`,
|
||||
start: words[startIndex].start,
|
||||
end: words[endIndex].end,
|
||||
wordIndices: indices,
|
||||
};
|
||||
|
||||
set({ deletedRanges: [...deletedRanges, newRange] });
|
||||
},
|
||||
|
||||
restoreRange: (rangeId) => {
|
||||
const { deletedRanges } = get();
|
||||
set({ deletedRanges: deletedRanges.filter((r) => r.id !== rangeId) });
|
||||
},
|
||||
|
||||
setTranscribing: (active, progress) =>
|
||||
set({
|
||||
isTranscribing: active,
|
||||
transcriptionProgress: progress ?? (active ? 0 : 100),
|
||||
}),
|
||||
|
||||
setExporting: (active, progress) =>
|
||||
set({
|
||||
isExporting: active,
|
||||
exportProgress: progress ?? (active ? 0 : 100),
|
||||
}),
|
||||
|
||||
getKeepSegments: () => {
|
||||
const { words, deletedRanges, duration } = get();
|
||||
if (words.length === 0) return [{ start: 0, end: duration }];
|
||||
|
||||
const deletedSet = new Set<number>();
|
||||
for (const range of deletedRanges) {
|
||||
for (const idx of range.wordIndices) deletedSet.add(idx);
|
||||
}
|
||||
|
||||
const segments: Array<{ start: number; end: number }> = [];
|
||||
let segStart: number | null = null;
|
||||
|
||||
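// Walk the words in order, opening a keep-segment at the first kept word and closing it at each deleted word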
for (let i = 0; i < words.length; i++) {
|
||||
if (!deletedSet.has(i)) {
|
||||
if (segStart === null) segStart = words[i].start;
|
||||
} else {
|
||||
if (segStart !== null) {
|
||||
segments.push({ start: segStart, end: words[i - 1].end });
|
||||
segStart = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (segStart !== null) {
|
||||
segments.push({ start: segStart, end: words[words.length - 1].end });
|
||||
}
|
||||
|
||||
return segments;
|
||||
},
|
||||
|
||||
getWordAtTime: (time) => {
|
||||
const { words } = get();
|
||||
let lo = 0;
|
||||
let hi = words.length - 1;
|
||||
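// Binary search over word [start, end] times; if no word contains the time, fall through to the next word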
while (lo <= hi) {
|
||||
const mid = (lo + hi) >>> 1;
|
||||
if (words[mid].end < time) lo = mid + 1;
|
||||
else if (words[mid].start > time) hi = mid - 1;
|
||||
else return mid;
|
||||
}
|
||||
return lo < words.length ? lo : words.length - 1;
|
||||
},
|
||||
|
||||
loadProject: (data) => {
|
||||
const backend = get().backendUrl;
|
||||
const url = `${backend}/file?path=${encodeURIComponent(data.videoPath)}`;
|
||||
|
||||
let globalIdx = 0;
|
||||
const annotatedSegments = (data.segments || []).map((seg: Segment) => {
|
||||
const annotated = { ...seg, globalStartIndex: globalIdx };
|
||||
globalIdx += seg.words.length;
|
||||
return annotated;
|
||||
});
|
||||
|
||||
set({
|
||||
...initialState,
|
||||
backendUrl: backend,
|
||||
videoPath: data.videoPath,
|
||||
videoUrl: url,
|
||||
words: data.words || [],
|
||||
segments: annotatedSegments,
|
||||
deletedRanges: data.deletedRanges || [],
|
||||
language: data.language || '',
|
||||
});
|
||||
},
|
||||
|
||||
reset: () => set(initialState),
|
||||
}),
|
||||
{ limit: 100 },
|
||||
),
|
||||
);
|
||||
86
frontend/src/types/project.ts
Normal file
@ -0,0 +1,86 @@
|
||||
export interface Word {
|
||||
word: string;
|
||||
start: number;
|
||||
end: number;
|
||||
confidence: number;
|
||||
speaker?: string;
|
||||
}
|
||||
|
||||
export interface Segment {
|
||||
id: number;
|
||||
start: number;
|
||||
end: number;
|
||||
text: string;
|
||||
words: Word[];
|
||||
speaker?: string;
|
||||
globalStartIndex: number;
|
||||
}
|
||||
|
||||
export interface TimeRange {
|
||||
start: number;
|
||||
end: number;
|
||||
}
|
||||
|
||||
export interface DeletedRange extends TimeRange {
|
||||
id: string;
|
||||
wordIndices: number[];
|
||||
}
|
||||
|
||||
export interface ProjectFile {
|
||||
version: 1;
|
||||
videoPath: string;
|
||||
words: Word[];
|
||||
segments: Segment[];
|
||||
deletedRanges: DeletedRange[];
|
||||
language: string;
|
||||
createdAt: string;
|
||||
modifiedAt: string;
|
||||
}
|
||||
|
||||
export interface TranscriptionResult {
|
||||
words: Word[];
|
||||
segments: Segment[];
|
||||
language: string;
|
||||
}
|
||||
|
||||
export interface ExportOptions {
|
||||
outputPath: string;
|
||||
mode: 'fast' | 'reencode';
|
||||
resolution: '720p' | '1080p' | '4k';
|
||||
format: 'mp4' | 'mov' | 'webm';
|
||||
enhanceAudio: boolean;
|
||||
captions: 'none' | 'burn-in' | 'sidecar';
|
||||
captionStyle?: CaptionStyle;
|
||||
}
|
||||
|
||||
export interface CaptionStyle {
|
||||
fontName: string;
|
||||
fontSize: number;
|
||||
fontColor: string;
|
||||
backgroundColor: string;
|
||||
position: 'bottom' | 'top' | 'center';
|
||||
bold: boolean;
|
||||
}
|
||||
|
||||
export type AIProvider = 'ollama' | 'openai' | 'claude';
|
||||
|
||||
export interface AIProviderConfig {
|
||||
provider: AIProvider;
|
||||
apiKey?: string;
|
||||
baseUrl?: string;
|
||||
model: string;
|
||||
}
|
||||
|
||||
export interface FillerWordResult {
|
||||
wordIndices: number[];
|
||||
fillerWords: Array<{ index: number; word: string; reason: string }>;
|
||||
}
|
||||
|
||||
export interface ClipSuggestion {
|
||||
title: string;
|
||||
startWordIndex: number;
|
||||
endWordIndex: number;
|
||||
startTime: number;
|
||||
endTime: number;
|
||||
reason: string;
|
||||
}
|
||||
16
frontend/src/vite-env.d.ts
vendored
Normal file
@ -0,0 +1,16 @@
|
||||
/// <reference types="vite/client" />
|
||||
|
||||
interface ElectronAPI {
|
||||
openFile: (options?: Record<string, unknown>) => Promise<string | null>;
|
||||
saveFile: (options?: Record<string, unknown>) => Promise<string | null>;
|
||||
openProject: () => Promise<string | null>;
|
||||
getBackendUrl: () => Promise<string>;
|
||||
encryptString: (data: string) => Promise<string>;
|
||||
decryptString: (encrypted: string) => Promise<string>;
|
||||
readFile: (path: string) => Promise<string>;
|
||||
writeFile: (path: string, content: string) => Promise<boolean>;
|
||||
}
|
||||
|
||||
interface Window {
|
||||
electronAPI?: ElectronAPI;
|
||||
}
|
||||
30
frontend/tailwind.config.js
Normal file
@ -0,0 +1,30 @@
|
||||
/** @type {import('tailwindcss').Config} */
|
||||
export default {
|
||||
content: ['./index.html', './src/**/*.{js,ts,jsx,tsx}'],
|
||||
theme: {
|
||||
extend: {
|
||||
colors: {
|
||||
editor: {
|
||||
bg: '#0f1117',
|
||||
surface: '#1a1d27',
|
||||
border: '#2a2d3a',
|
||||
accent: '#6366f1',
|
||||
'accent-hover': '#818cf8',
|
||||
text: '#e2e8f0',
|
||||
'text-muted': '#94a3b8',
|
||||
danger: '#ef4444',
|
||||
success: '#22c55e',
|
||||
warning: '#f59e0b',
|
||||
'word-hover': 'rgba(99, 102, 241, 0.15)',
|
||||
'word-selected': 'rgba(99, 102, 241, 0.3)',
|
||||
'word-deleted': 'rgba(239, 68, 68, 0.2)',
|
||||
'word-filler': 'rgba(245, 158, 11, 0.25)',
|
||||
},
|
||||
},
|
||||
fontFamily: {
|
||||
mono: ['JetBrains Mono', 'Fira Code', 'monospace'],
|
||||
},
|
||||
},
|
||||
},
|
||||
plugins: [],
|
||||
};
|
||||
23
frontend/tsconfig.json
Normal file
@ -0,0 +1,23 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"target": "ES2020",
|
||||
"useDefineForClassFields": true,
|
||||
"lib": ["ES2020", "DOM", "DOM.Iterable"],
|
||||
"module": "ESNext",
|
||||
"skipLibCheck": true,
|
||||
"moduleResolution": "bundler",
|
||||
"allowImportingTsExtensions": true,
|
||||
"isolatedModules": true,
|
||||
"moduleDetection": "force",
|
||||
"noEmit": true,
|
||||
"jsx": "react-jsx",
|
||||
"strict": true,
|
||||
"noUnusedLocals": false,
|
||||
"noUnusedParameters": false,
|
||||
"noFallthroughCasesInSwitch": true,
|
||||
"forceConsistentCasingInFileNames": true,
|
||||
"resolveJsonModule": true,
|
||||
"esModuleInterop": true
|
||||
},
|
||||
"include": ["src", "src/vite-env.d.ts"]
|
||||
}
|
||||
15
frontend/vite.config.ts
Normal file
@ -0,0 +1,15 @@
|
||||
import { defineConfig } from 'vite';
|
||||
import react from '@vitejs/plugin-react';
|
||||
|
||||
export default defineConfig({
|
||||
plugins: [react()],
|
||||
base: './',
|
||||
server: {
|
||||
port: 5173,
|
||||
strictPort: true,
|
||||
},
|
||||
build: {
|
||||
outDir: 'dist',
|
||||
emptyOutDir: true,
|
||||
},
|
||||
});
|
||||
25
install.bat
@ -1,25 +0,0 @@
|
||||
@echo off
|
||||
echo ===================================================
|
||||
echo OBS Recording Transcriber - Windows Installation
|
||||
echo ===================================================
|
||||
echo.
|
||||
|
||||
:: Check for Python
|
||||
python --version > nul 2>&1
|
||||
if %errorlevel% neq 0 (
|
||||
echo Python not found! Please install Python 3.8 or higher.
|
||||
echo Download from: https://www.python.org/downloads/
|
||||
echo Make sure to check "Add Python to PATH" during installation.
|
||||
pause
|
||||
exit /b 1
|
||||
)
|
||||
|
||||
:: Run the installation script
|
||||
echo Running installation script...
|
||||
python install.py
|
||||
|
||||
echo.
|
||||
echo If the installation was successful, you can run the application with:
|
||||
echo streamlit run app.py
|
||||
echo.
|
||||
pause
|
||||
307
install.py
@ -1,307 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Installation script for OBS Recording Transcriber.
|
||||
This script helps install all required dependencies and checks for common issues.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import platform
|
||||
import subprocess
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
def print_header(text):
|
||||
"""Print a formatted header."""
|
||||
print("\n" + "=" * 80)
|
||||
print(f" {text}")
|
||||
print("=" * 80)
|
||||
|
||||
def print_step(text):
|
||||
"""Print a step in the installation process."""
|
||||
print(f"\n>> {text}")
|
||||
|
||||
def run_command(command, check=True):
|
||||
"""Run a shell command and return the result."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
command,
|
||||
shell=True,
|
||||
check=check,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True
|
||||
)
|
||||
return result
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"Error executing command: {command}")
|
||||
print(f"Error message: {e.stderr}")
|
||||
return None
|
||||
|
||||
def check_python_version():
|
||||
"""Check if Python version is 3.8 or higher."""
|
||||
print_step("Checking Python version")
|
||||
version = sys.version_info
|
||||
if version.major < 3 or (version.major == 3 and version.minor < 8):
|
||||
print(f"Python 3.8 or higher is required. You have {sys.version}")
|
||||
print("Please upgrade your Python installation.")
|
||||
return False
|
||||
print(f"Python version: {sys.version}")
|
||||
return True
|
||||
|
||||
def check_ffmpeg():
|
||||
"""Check if FFmpeg is installed."""
|
||||
print_step("Checking FFmpeg installation")
|
||||
result = shutil.which("ffmpeg")
|
||||
if result is None:
|
||||
print("FFmpeg not found in PATH.")
|
||||
print("Please install FFmpeg:")
|
||||
if platform.system() == "Windows":
|
||||
print(" - Download from: https://www.gyan.dev/ffmpeg/builds/")
|
||||
print(" - Extract and add the bin folder to your PATH")
|
||||
elif platform.system() == "Darwin": # macOS
|
||||
print(" - Install with Homebrew: brew install ffmpeg")
|
||||
else: # Linux
|
||||
print(" - Install with apt: sudo apt update && sudo apt install ffmpeg")
|
||||
return False
|
||||
|
||||
# Check FFmpeg version
|
||||
version_result = run_command("ffmpeg -version")
|
||||
if version_result:
|
||||
print(f"FFmpeg is installed: {version_result.stdout.splitlines()[0]}")
|
||||
return True
|
||||
return False
|
||||
|
||||
def check_gpu():
|
||||
"""Check for GPU availability."""
|
||||
print_step("Checking GPU availability")
|
||||
|
||||
# Check for NVIDIA GPU
|
||||
if platform.system() == "Windows":
|
||||
nvidia_smi = shutil.which("nvidia-smi")
|
||||
if nvidia_smi:
|
||||
result = run_command("nvidia-smi", check=False)
|
||||
if result and result.returncode == 0:
|
||||
print("NVIDIA GPU detected:")
|
||||
for line in result.stdout.splitlines()[:10]:
|
||||
print(f" {line}")
|
||||
return "nvidia"
|
||||
|
||||
# Check for Apple Silicon
|
||||
if platform.system() == "Darwin" and platform.machine() == "arm64":
|
||||
print("Apple Silicon (M1/M2) detected")
|
||||
return "apple"
|
||||
|
||||
print("No GPU detected or GPU drivers not installed. CPU will be used for processing.")
|
||||
return "cpu"
|
||||
|
||||
def setup_virtual_env():
|
||||
"""Set up a virtual environment."""
|
||||
print_step("Setting up virtual environment")
|
||||
|
||||
# Check if venv module is available
|
||||
try:
|
||||
import venv
|
||||
print("Python venv module is available")
|
||||
except ImportError:
|
||||
print("Python venv module is not available. Please install it.")
|
||||
return False
|
||||
|
||||
# Create virtual environment if it doesn't exist
|
||||
venv_path = Path("venv")
|
||||
if venv_path.exists():
|
||||
print(f"Virtual environment already exists at {venv_path}")
|
||||
activate_venv()
|
||||
return True
|
||||
|
||||
print(f"Creating virtual environment at {venv_path}")
|
||||
try:
|
||||
subprocess.run([sys.executable, "-m", "venv", "venv"], check=True)
|
||||
print("Virtual environment created successfully")
|
||||
activate_venv()
|
||||
return True
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"Error creating virtual environment: {e}")
|
||||
return False
|
||||
|
||||
def activate_venv():
|
||||
"""Activate the virtual environment."""
|
||||
print_step("Activating virtual environment")
|
||||
|
||||
venv_path = Path("venv")
|
||||
if not venv_path.exists():
|
||||
print("Virtual environment not found")
|
||||
return False
|
||||
|
||||
# Get the path to the activate script
|
||||
if platform.system() == "Windows":
|
||||
activate_script = venv_path / "Scripts" / "activate.bat"
|
||||
activate_cmd = f"call {activate_script}"
|
||||
else:
|
||||
activate_script = venv_path / "bin" / "activate"
|
||||
activate_cmd = f"source {activate_script}"
|
||||
|
||||
print(f"To activate the virtual environment, run:")
|
||||
print(f" {activate_cmd}")
|
||||
|
||||
# We can't actually activate the venv in this script because it would only
|
||||
# affect the subprocess, not the parent process. We just provide instructions.
|
||||
return True
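# Illustrative sketch (not part of the original script): since the venv cannot be
# activated for this process, a caller that wants to be certain packages land inside
# the venv can invoke the venv's own interpreter directly. Paths follow the standard
# venv layout; `platform`, `Path` and `subprocess` are assumed to be imported above.
def venv_python():
    """Return the path to the virtual environment's Python interpreter."""
    if platform.system() == "Windows":
        return Path("venv") / "Scripts" / "python.exe"
    return Path("venv") / "bin" / "python"

# Example:
#   subprocess.run([str(venv_python()), "-m", "pip", "install", "-r", "requirements.txt"], check=True)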
|
||||
|
||||
def install_pytorch(gpu_type):
|
||||
"""Install PyTorch with appropriate GPU support."""
|
||||
print_step("Installing PyTorch")
|
||||
|
||||
if gpu_type == "nvidia":
|
||||
print("Installing PyTorch with CUDA support")
|
||||
cmd = "pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118"
|
||||
elif gpu_type == "apple":
|
||||
print("Installing PyTorch with MPS support")
|
||||
cmd = "pip install torch torchvision torchaudio"
|
||||
else:
|
||||
print("Installing PyTorch (CPU version)")
|
||||
cmd = "pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu"
|
||||
|
||||
result = run_command(cmd)
|
||||
if result and result.returncode == 0:
|
||||
print("PyTorch installed successfully")
|
||||
return True
|
||||
else:
|
||||
print("Failed to install PyTorch")
|
||||
return False
|
||||
|
||||
def install_dependencies():
|
||||
"""Install dependencies from requirements.txt."""
|
||||
print_step("Installing dependencies from requirements.txt")
|
||||
|
||||
requirements_path = Path("requirements.txt")
|
||||
if not requirements_path.exists():
|
||||
print("requirements.txt not found")
|
||||
return False
|
||||
|
||||
result = run_command("pip install -r requirements.txt")
|
||||
if result and result.returncode == 0:
|
||||
print("Dependencies installed successfully")
|
||||
return True
|
||||
else:
|
||||
print("Some dependencies failed to install. See error messages above.")
|
||||
return False
|
||||
|
||||
def install_tokenizers():
|
||||
"""Install tokenizers package separately."""
|
||||
print_step("Installing tokenizers package")
|
||||
|
||||
# First try the normal installation
|
||||
result = run_command("pip install tokenizers", check=False)
|
||||
if result and result.returncode == 0:
|
||||
print("Tokenizers installed successfully")
|
||||
return True
|
||||
|
||||
# If that fails, try the no-binary option
|
||||
print("Standard installation failed, trying alternative method...")
|
||||
result = run_command("pip install tokenizers --no-binary tokenizers", check=False)
|
||||
if result and result.returncode == 0:
|
||||
print("Tokenizers installed successfully with alternative method")
|
||||
return True
|
||||
|
||||
print("Failed to install tokenizers. You may need to install Rust or Visual C++ Build Tools.")
|
||||
if platform.system() == "Windows":
|
||||
print("Download Visual C++ Build Tools: https://visualstudio.microsoft.com/visual-cpp-build-tools/")
|
||||
print("Install Rust: https://rustup.rs/")
|
||||
return False
|
||||
|
||||
def check_installation():
|
||||
"""Verify the installation by importing key packages."""
|
||||
print_step("Verifying installation")
|
||||
|
||||
packages_to_check = [
|
||||
"streamlit",
|
||||
"torch",
|
||||
"transformers",
|
||||
"whisper",
|
||||
"numpy",
|
||||
"sklearn"
|
||||
]
|
||||
|
||||
all_successful = True
|
||||
for package in packages_to_check:
|
||||
try:
|
||||
__import__(package)
|
||||
print(f"✓ {package} imported successfully")
|
||||
except ImportError:
|
||||
print(f"✗ Failed to import {package}")
|
||||
all_successful = False
|
||||
|
||||
# Check optional packages
|
||||
optional_packages = [
|
||||
"pyannote.audio",
|
||||
"iso639"
|
||||
]
|
||||
|
||||
print("\nChecking optional packages:")
|
||||
for package in optional_packages:
|
||||
try:
|
||||
if package == "pyannote.audio":
|
||||
# Just try to import pyannote
|
||||
__import__("pyannote")
|
||||
else:
|
||||
__import__(package)
|
||||
print(f"✓ {package} imported successfully")
|
||||
except ImportError:
|
||||
print(f"⚠ {package} not available (required for some advanced features)")
|
||||
|
||||
return all_successful
|
||||
|
||||
def main():
|
||||
"""Main installation function."""
|
||||
print_header("OBS Recording Transcriber - Installation Script")
|
||||
|
||||
# Check prerequisites
|
||||
if not check_python_version():
|
||||
return
|
||||
|
||||
ffmpeg_available = check_ffmpeg()
|
||||
gpu_type = check_gpu()
|
||||
|
||||
# Setup environment
|
||||
if not setup_virtual_env():
|
||||
print("Failed to set up virtual environment. Continuing with system Python...")
|
||||
|
||||
# Install packages
|
||||
print("\nReady to install packages. Make sure your virtual environment is activated.")
|
||||
input("Press Enter to continue...")
|
||||
|
||||
install_pytorch(gpu_type)
|
||||
install_dependencies()
|
||||
install_tokenizers()
|
||||
|
||||
# Verify installation
|
||||
success = check_installation()
|
||||
|
||||
print_header("Installation Summary")
|
||||
print(f"Python: {'✓ OK' if check_python_version() else '✗ Needs upgrade'}")
|
||||
print(f"FFmpeg: {'✓ Installed' if ffmpeg_available else '✗ Not found'}")
|
||||
print(f"GPU Support: {gpu_type.upper()}")
|
||||
print(f"Dependencies: {'✓ Installed' if success else '⚠ Some issues'}")
|
||||
|
||||
print("\nNext steps:")
|
||||
if not ffmpeg_available:
|
||||
print("1. Install FFmpeg (required for audio processing)")
|
||||
|
||||
print("1. Activate your virtual environment:")
|
||||
if platform.system() == "Windows":
|
||||
print(" venv\\Scripts\\activate")
|
||||
else:
|
||||
print(" source venv/bin/activate")
|
||||
|
||||
print("2. Run the application:")
|
||||
print(" streamlit run app.py")
|
||||
|
||||
print("\nFor advanced features like speaker diarization:")
|
||||
print("1. Get a HuggingFace token: https://huggingface.co/settings/tokens")
|
||||
print("2. Request access to pyannote models: https://huggingface.co/pyannote/speaker-diarization-3.0")
|
||||
|
||||
print("\nSee INSTALLATION.md for more details and troubleshooting.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
26
install.sh
26
install.sh
@ -1,26 +0,0 @@
#!/bin/bash

echo "==================================================="
echo " OBS Recording Transcriber - Unix Installation"
echo "==================================================="
echo

# Check for Python
if ! command -v python3 &> /dev/null; then
    echo "Python 3 not found! Please install Python 3.8 or higher."
    echo "For Ubuntu/Debian: sudo apt update && sudo apt install python3 python3-pip python3-venv"
    echo "For macOS: brew install python3"
    exit 1
fi

# Make the script executable
chmod +x install.py

# Run the installation script
echo "Running installation script..."
python3 ./install.py

echo
echo "If the installation was successful, you can run the application with:"
echo "streamlit run app.py"
echo
49
package.json
Normal file
49
package.json
Normal file
@ -0,0 +1,49 @@
|
||||
{
|
||||
"name": "cutscript",
|
||||
"version": "0.1.0",
|
||||
"private": true,
|
||||
"description": "CutScript — Open-source AI-powered text-based video editor",
|
||||
"main": "electron/main.js",
|
||||
"scripts": {
|
||||
"dev": "concurrently \"npm run dev:backend\" \"npm run dev:frontend\" \"wait-on http://localhost:5173 && npm run dev:electron\"",
|
||||
"dev:frontend": "cd frontend && npm run dev",
|
||||
"dev:electron": "electron .",
|
||||
"dev:backend": "cd backend && python -m uvicorn main:app --reload --port 8642",
|
||||
"build": "cd frontend && npm run build && electron-builder",
|
||||
"lint": "cd frontend && npm run lint"
|
||||
},
|
||||
"devDependencies": {
|
||||
"concurrently": "^9.1.0",
|
||||
"electron": "^33.2.0",
|
||||
"electron-builder": "^25.1.0",
|
||||
"wait-on": "^8.0.0"
|
||||
},
|
||||
"dependencies": {
|
||||
"python-shell": "^5.0.0"
|
||||
},
|
||||
"build": {
|
||||
"appId": "com.dataants.cutscript",
|
||||
"productName": "CutScript",
|
||||
"files": [
|
||||
"electron/**/*",
|
||||
"frontend/dist/**/*",
|
||||
"backend/**/*",
|
||||
"shared/**/*"
|
||||
],
|
||||
"extraResources": [
|
||||
{
|
||||
"from": "backend",
|
||||
"to": "backend"
|
||||
}
|
||||
],
|
||||
"win": {
|
||||
"target": "nsis"
|
||||
},
|
||||
"mac": {
|
||||
"target": "dmg"
|
||||
},
|
||||
"linux": {
|
||||
"target": "AppImage"
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1,53 +0,0 @@
|
||||
# OBS Recording Transcriber Dependencies
|
||||
# Core dependencies with pinned compatible versions
|
||||
streamlit==1.26.0
|
||||
moviepy==1.0.3
|
||||
openai-whisper==20231117
|
||||
requests>=2.28.0
|
||||
humanize>=4.6.0
|
||||
|
||||
# PyTorch ecosystem - updated for SpeechBrain 1.0 compatibility
|
||||
# torchaudio >= 2.1.0 is REQUIRED for diarization to work properly
|
||||
torch==2.1.0
|
||||
torchaudio==2.1.0
|
||||
torchvision==0.16.0
|
||||
|
||||
# Transformers ecosystem - compatible versions
|
||||
transformers==4.35.0
|
||||
tokenizers>=0.14.0
|
||||
|
||||
# ML dependencies with compatible versions
|
||||
numpy==1.24.3
|
||||
scipy==1.10.1
|
||||
scikit-learn==1.3.0
|
||||
|
||||
# Audio processing and ML models
|
||||
# Pin speechbrain for stability with pyannote.audio
|
||||
speechbrain==1.0.0
|
||||
pyannote.audio==3.1.1
|
||||
pytorch-lightning==2.1.0
|
||||
|
||||
# Other dependencies
|
||||
iso639>=0.1.4
|
||||
protobuf>=3.20.0,<5.0.0
|
||||
matplotlib>=3.5.0
|
||||
soundfile>=0.10.3
|
||||
ffmpeg-python>=0.2.0
|
||||
|
||||
# Optional: Ollama Python client (uncomment to install)
|
||||
# ollama
|
||||
|
||||
# Installation notes:
|
||||
# 1. For Windows users, you may need to install PyTorch separately:
|
||||
# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
||||
#
|
||||
# 2. For tokenizers issues, try installing Visual C++ Build Tools:
|
||||
# https://visualstudio.microsoft.com/visual-cpp-build-tools/
|
||||
#
|
||||
# 3. For pyannote.audio, you'll need a HuggingFace token with access to:
|
||||
# https://huggingface.co/pyannote/speaker-diarization-3.0
|
||||
#
|
||||
# 4. FFmpeg is required for audio processing:
|
||||
# Windows: https://www.gyan.dev/ffmpeg/builds/
|
||||
# Mac: brew install ffmpeg
|
||||
# Linux: apt-get install ffmpeg
|
||||
55
shared/project-schema.json
Normal file
55
shared/project-schema.json
Normal file
@ -0,0 +1,55 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"title": "AI Video Editor Project",
|
||||
"type": "object",
|
||||
"required": ["version", "videoPath", "words", "segments", "deletedRanges", "language", "createdAt", "modifiedAt"],
|
||||
"properties": {
|
||||
"version": { "type": "integer", "const": 1 },
|
||||
"videoPath": { "type": "string" },
|
||||
"words": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"required": ["word", "start", "end", "confidence"],
|
||||
"properties": {
|
||||
"word": { "type": "string" },
|
||||
"start": { "type": "number" },
|
||||
"end": { "type": "number" },
|
||||
"confidence": { "type": "number" },
|
||||
"speaker": { "type": "string" }
|
||||
}
|
||||
}
|
||||
},
|
||||
"segments": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"required": ["id", "start", "end", "text", "words"],
|
||||
"properties": {
|
||||
"id": { "type": "integer" },
|
||||
"start": { "type": "number" },
|
||||
"end": { "type": "number" },
|
||||
"text": { "type": "string" },
|
||||
"words": { "$ref": "#/properties/words" },
|
||||
"speaker": { "type": "string" }
|
||||
}
|
||||
}
|
||||
},
|
||||
"deletedRanges": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"required": ["id", "start", "end", "wordIndices"],
|
||||
"properties": {
|
||||
"id": { "type": "string" },
|
||||
"start": { "type": "number" },
|
||||
"end": { "type": "number" },
|
||||
"wordIndices": { "type": "array", "items": { "type": "integer" } }
|
||||
}
|
||||
}
|
||||
},
|
||||
"language": { "type": "string" },
|
||||
"createdAt": { "type": "string", "format": "date-time" },
|
||||
"modifiedAt": { "type": "string", "format": "date-time" }
|
||||
}
|
||||
}
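A minimal document that conforms to this schema, sketched in Python (the video path and
timings are made up; the optional jsonschema package is used here only to illustrate validation):

import json
from jsonschema import validate

project = {
    "version": 1,
    "videoPath": "/path/to/recording.mp4",  # hypothetical path
    "language": "en",
    "createdAt": "2024-01-01T12:00:00Z",
    "modifiedAt": "2024-01-01T12:05:00Z",
    "words": [
        {"word": "Hello", "start": 0.0, "end": 0.4, "confidence": 0.98},
        {"word": "world", "start": 0.4, "end": 0.9, "confidence": 0.97}
    ],
    "segments": [
        {
            "id": 0,
            "start": 0.0,
            "end": 0.9,
            "text": "Hello world",
            "words": [
                {"word": "Hello", "start": 0.0, "end": 0.4, "confidence": 0.98},
                {"word": "world", "start": 0.4, "end": 0.9, "confidence": 0.97}
            ]
        }
    ],
    "deletedRanges": []
}

with open("shared/project-schema.json") as f:
    validate(instance=project, schema=json.load(f))  # raises ValidationError if the document does not conform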
|
||||
@ -1,12 +0,0 @@
|
||||
from moviepy.editor import AudioFileClip
from pathlib import Path


def extract_audio(video_path: Path):
    """Extract audio from a video file and return the path to the WAV file."""
    audio = None
    try:
        audio = AudioFileClip(str(video_path))
        audio_path = video_path.parent / f"{video_path.stem}_audio.wav"
        audio.write_audiofile(str(audio_path), verbose=False, logger=None)
        return audio_path
    except Exception as e:
        raise RuntimeError(f"Audio extraction failed: {e}") from e
    finally:
        # Release the underlying FFmpeg reader so the source file is not left locked.
        if audio is not None:
            audio.close()
|
||||
@ -1,226 +0,0 @@
|
||||
"""
|
||||
Speaker diarization utilities for the OBS Recording Transcriber.
|
||||
Provides functions to identify different speakers in audio recordings.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
import torch
|
||||
from pyannote.audio import Pipeline
|
||||
from pyannote.core import Segment
|
||||
import whisper
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Try to import GPU utilities, but don't fail if not available
|
||||
try:
|
||||
from utils.gpu_utils import get_optimal_device
|
||||
GPU_UTILS_AVAILABLE = True
|
||||
except ImportError:
|
||||
GPU_UTILS_AVAILABLE = False
|
||||
|
||||
# Default HuggingFace auth token environment variable
|
||||
HF_TOKEN_ENV = "HF_TOKEN"
|
||||
|
||||
|
||||
def get_diarization_pipeline(use_gpu=True, hf_token=None):
|
||||
"""
|
||||
Initialize the speaker diarization pipeline.
|
||||
|
||||
Args:
|
||||
use_gpu (bool): Whether to use GPU acceleration if available
|
||||
hf_token (str, optional): HuggingFace API token for accessing the model
|
||||
|
||||
Returns:
|
||||
Pipeline or None: Diarization pipeline if successful, None otherwise
|
||||
"""
|
||||
# Check if token is provided or in environment
|
||||
if hf_token is None:
|
||||
hf_token = os.environ.get(HF_TOKEN_ENV)
|
||||
if hf_token is None:
|
||||
logger.error(f"HuggingFace token not provided. Set {HF_TOKEN_ENV} environment variable or pass token directly.")
|
||||
return None
|
||||
|
||||
try:
|
||||
# Configure device
|
||||
device = torch.device("cpu")
|
||||
if use_gpu and GPU_UTILS_AVAILABLE:
|
||||
device = get_optimal_device()
|
||||
logger.info(f"Using device: {device} for diarization")
|
||||
|
||||
# Initialize the pipeline
|
||||
pipeline = Pipeline.from_pretrained(
|
||||
"pyannote/speaker-diarization-3.0",
|
||||
use_auth_token=hf_token
|
||||
)
|
||||
|
||||
# Move to appropriate device
|
||||
if device.type == "cuda":
|
||||
pipeline = pipeline.to(torch.device(device))
|
||||
|
||||
return pipeline
|
||||
except Exception as e:
|
||||
logger.error(f"Error initializing diarization pipeline: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def diarize_audio(audio_path, pipeline=None, num_speakers=None, use_gpu=True, hf_token=None):
|
||||
"""
|
||||
Perform speaker diarization on an audio file.
|
||||
|
||||
Args:
|
||||
audio_path (Path): Path to the audio file
|
||||
pipeline (Pipeline, optional): Pre-initialized diarization pipeline
|
||||
num_speakers (int, optional): Number of speakers (if known)
|
||||
use_gpu (bool): Whether to use GPU acceleration if available
|
||||
hf_token (str, optional): HuggingFace API token
|
||||
|
||||
Returns:
|
||||
dict: Dictionary mapping time segments to speaker IDs
|
||||
"""
|
||||
audio_path = Path(audio_path)
|
||||
|
||||
# Initialize pipeline if not provided
|
||||
if pipeline is None:
|
||||
pipeline = get_diarization_pipeline(use_gpu, hf_token)
|
||||
if pipeline is None:
|
||||
return None
|
||||
|
||||
try:
|
||||
# Run diarization
|
||||
logger.info(f"Running speaker diarization on {audio_path}")
|
||||
diarization = pipeline(audio_path, num_speakers=num_speakers)
|
||||
|
||||
# Extract speaker segments
|
||||
speaker_segments = {}
|
||||
for turn, _, speaker in diarization.itertracks(yield_label=True):
|
||||
segment = (turn.start, turn.end)
|
||||
speaker_segments[segment] = speaker
|
||||
|
||||
return speaker_segments
|
||||
except Exception as e:
|
||||
logger.error(f"Error during diarization: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def apply_diarization_to_transcript(transcript_segments, speaker_segments):
|
||||
"""
|
||||
Apply speaker diarization results to transcript segments.
|
||||
|
||||
Args:
|
||||
transcript_segments (list): List of transcript segments with timing info
|
||||
speaker_segments (dict): Dictionary mapping time segments to speaker IDs
|
||||
|
||||
Returns:
|
||||
list: Updated transcript segments with speaker information
|
||||
"""
|
||||
if not speaker_segments:
|
||||
return transcript_segments
|
||||
|
||||
# Convert speaker segments to a more usable format
|
||||
speaker_ranges = [(Segment(start, end), speaker)
|
||||
for (start, end), speaker in speaker_segments.items()]
|
||||
|
||||
# Update transcript segments with speaker information
|
||||
for segment in transcript_segments:
|
||||
segment_start = segment['start']
|
||||
segment_end = segment['end']
|
||||
segment_range = Segment(segment_start, segment_end)
|
||||
|
||||
# Find overlapping speaker segments
|
||||
overlaps = []
|
||||
for (spk_range, speaker) in speaker_ranges:
|
||||
overlap = segment_range.intersect(spk_range)
|
||||
if overlap:
|
||||
overlaps.append((overlap.duration, speaker))
|
||||
|
||||
# Assign the speaker with the most overlap
|
||||
if overlaps:
|
||||
overlaps.sort(reverse=True) # Sort by duration (descending)
|
||||
segment['speaker'] = overlaps[0][1]
|
||||
else:
|
||||
segment['speaker'] = "UNKNOWN"
|
||||
|
||||
return transcript_segments
|
||||
|
||||
|
||||
def format_transcript_with_speakers(transcript_segments):
|
||||
"""
|
||||
Format transcript with speaker labels.
|
||||
|
||||
Args:
|
||||
transcript_segments (list): List of transcript segments with speaker info
|
||||
|
||||
Returns:
|
||||
str: Formatted transcript with speaker labels
|
||||
"""
|
||||
formatted_lines = []
|
||||
current_speaker = None
|
||||
|
||||
for segment in transcript_segments:
|
||||
speaker = segment.get('speaker', 'UNKNOWN')
|
||||
text = segment['text'].strip()
|
||||
|
||||
# Add speaker label when speaker changes
|
||||
if speaker != current_speaker:
|
||||
formatted_lines.append(f"\n[{speaker}]")
|
||||
current_speaker = speaker
|
||||
|
||||
formatted_lines.append(text)
|
||||
|
||||
return " ".join(formatted_lines)
|
||||
|
||||
|
||||
def transcribe_with_diarization(audio_path, whisper_model="base", num_speakers=None,
|
||||
use_gpu=True, hf_token=None):
|
||||
"""
|
||||
Transcribe audio with speaker diarization.
|
||||
|
||||
Args:
|
||||
audio_path (Path): Path to the audio file
|
||||
whisper_model (str): Whisper model size to use
|
||||
num_speakers (int, optional): Number of speakers (if known)
|
||||
use_gpu (bool): Whether to use GPU acceleration if available
|
||||
hf_token (str, optional): HuggingFace API token
|
||||
|
||||
Returns:
|
||||
tuple: (diarized_segments, formatted_transcript)
|
||||
"""
|
||||
audio_path = Path(audio_path)
|
||||
|
||||
# Configure device
|
||||
device = torch.device("cpu")
|
||||
if use_gpu and GPU_UTILS_AVAILABLE:
|
||||
device = get_optimal_device()
|
||||
|
||||
try:
|
||||
# Step 1: Transcribe audio with Whisper
|
||||
logger.info(f"Transcribing audio with Whisper model: {whisper_model}")
|
||||
model = whisper.load_model(whisper_model, device=device if device.type != "mps" else "cpu")
|
||||
result = model.transcribe(str(audio_path))
|
||||
transcript_segments = result["segments"]
|
||||
|
||||
# Step 2: Perform speaker diarization
|
||||
logger.info("Performing speaker diarization")
|
||||
pipeline = get_diarization_pipeline(use_gpu, hf_token)
|
||||
if pipeline is None:
|
||||
logger.warning("Diarization pipeline not available, returning transcript without speakers")
|
||||
return transcript_segments, result["text"]
|
||||
|
||||
speaker_segments = diarize_audio(audio_path, pipeline, num_speakers, use_gpu)
|
||||
|
||||
# Step 3: Apply diarization to transcript
|
||||
if speaker_segments:
|
||||
diarized_segments = apply_diarization_to_transcript(transcript_segments, speaker_segments)
|
||||
formatted_transcript = format_transcript_with_speakers(diarized_segments)
|
||||
return diarized_segments, formatted_transcript
|
||||
else:
|
||||
return transcript_segments, result["text"]
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in transcribe_with_diarization: {e}")
|
||||
return None, None
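if __name__ == "__main__":
    # Usage sketch (not part of the original module): the recording path is hypothetical
    # and HF_TOKEN must grant access to the pyannote speaker-diarization model.
    segments, formatted = transcribe_with_diarization(
        "recordings/meeting.mp4",
        whisper_model="base",
        num_speakers=2,
        hf_token=os.environ.get("HF_TOKEN"),
    )
    print(formatted)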
|
||||
284
utils/export.py
284
utils/export.py
@ -1,284 +0,0 @@
|
||||
"""
|
||||
Subtitle export utilities for the OBS Recording Transcriber.
|
||||
Supports exporting transcripts to SRT, ASS, and WebVTT subtitle formats.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
import re
|
||||
from datetime import timedelta
|
||||
import gzip
|
||||
import zipfile
|
||||
import logging
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def format_timestamp_srt(timestamp_ms):
|
||||
"""
|
||||
Format a timestamp in milliseconds to SRT format (HH:MM:SS,mmm).
|
||||
|
||||
Args:
|
||||
timestamp_ms (int): Timestamp in milliseconds
|
||||
|
||||
Returns:
|
||||
str: Formatted timestamp string
|
||||
"""
|
||||
hours, remainder = divmod(timestamp_ms, 3600000)
|
||||
minutes, remainder = divmod(remainder, 60000)
|
||||
seconds, milliseconds = divmod(remainder, 1000)
|
||||
return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d},{int(milliseconds):03d}"
|
||||
|
||||
|
||||
def format_timestamp_ass(timestamp_ms):
|
||||
"""
|
||||
Format a timestamp in milliseconds to ASS format (H:MM:SS.cc).
|
||||
|
||||
Args:
|
||||
timestamp_ms (int): Timestamp in milliseconds
|
||||
|
||||
Returns:
|
||||
str: Formatted timestamp string
|
||||
"""
|
||||
hours, remainder = divmod(timestamp_ms, 3600000)
|
||||
minutes, remainder = divmod(remainder, 60000)
|
||||
seconds, remainder = divmod(remainder, 1000)
|
||||
centiseconds = remainder // 10
|
||||
return f"{int(hours)}:{int(minutes):02d}:{int(seconds):02d}.{int(centiseconds):02d}"
|
||||
|
||||
|
||||
def format_timestamp_vtt(timestamp_ms):
|
||||
"""
|
||||
Format a timestamp in milliseconds to WebVTT format (HH:MM:SS.mmm).
|
||||
|
||||
Args:
|
||||
timestamp_ms (int): Timestamp in milliseconds
|
||||
|
||||
Returns:
|
||||
str: Formatted timestamp string
|
||||
"""
|
||||
hours, remainder = divmod(timestamp_ms, 3600000)
|
||||
minutes, remainder = divmod(remainder, 60000)
|
||||
seconds, milliseconds = divmod(remainder, 1000)
|
||||
return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}.{int(milliseconds):03d}"
|
||||
|
||||
|
||||
def export_to_srt(segments, output_path):
|
||||
"""
|
||||
Export transcript segments to SRT format.
|
||||
|
||||
Args:
|
||||
segments (list): List of transcript segments with start, end, and text
|
||||
output_path (Path): Path to save the SRT file
|
||||
|
||||
Returns:
|
||||
Path: Path to the saved SRT file
|
||||
"""
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
for i, segment in enumerate(segments, 1):
|
||||
start_time = format_timestamp_srt(int(segment['start'] * 1000))
|
||||
end_time = format_timestamp_srt(int(segment['end'] * 1000))
|
||||
|
||||
f.write(f"{i}\n")
|
||||
f.write(f"{start_time} --> {end_time}\n")
|
||||
f.write(f"{segment['text'].strip()}\n\n")
|
||||
|
||||
return output_path
|
||||
|
||||
|
||||
def export_to_ass(segments, output_path, video_width=1920, video_height=1080, style=None):
|
||||
"""
|
||||
Export transcript segments to ASS format with styling.
|
||||
|
||||
Args:
|
||||
segments (list): List of transcript segments with start, end, and text
|
||||
output_path (Path): Path to save the ASS file
|
||||
video_width (int): Width of the video in pixels
|
||||
video_height (int): Height of the video in pixels
|
||||
style (dict, optional): Custom style parameters
|
||||
|
||||
Returns:
|
||||
Path: Path to the saved ASS file
|
||||
"""
|
||||
# Default style
|
||||
default_style = {
|
||||
"fontname": "Arial",
|
||||
"fontsize": "48",
|
||||
"primary_color": "&H00FFFFFF", # White
|
||||
"secondary_color": "&H000000FF", # Blue
|
||||
"outline_color": "&H00000000", # Black
|
||||
"back_color": "&H80000000", # Semi-transparent black
|
||||
"bold": "-1", # True
|
||||
"italic": "0", # False
|
||||
"alignment": "2", # Bottom center
|
||||
}
|
||||
|
||||
# Apply custom style if provided
|
||||
if style:
|
||||
default_style.update(style)
|
||||
|
||||
# ASS header template
|
||||
ass_header = f"""[Script Info]
|
||||
Title: Transcription
|
||||
ScriptType: v4.00+
|
||||
WrapStyle: 0
|
||||
PlayResX: {video_width}
|
||||
PlayResY: {video_height}
|
||||
ScaledBorderAndShadow: yes
|
||||
|
||||
[V4+ Styles]
|
||||
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
|
||||
Style: Default,{default_style['fontname']},{default_style['fontsize']},{default_style['primary_color']},{default_style['secondary_color']},{default_style['outline_color']},{default_style['back_color']},{default_style['bold']},{default_style['italic']},0,0,100,100,0,0,1,2,2,{default_style['alignment']},10,10,10,1
|
||||
|
||||
[Events]
|
||||
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
|
||||
"""
|
||||
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
f.write(ass_header)
|
||||
|
||||
for segment in segments:
|
||||
start_time = format_timestamp_ass(int(segment['start'] * 1000))
|
||||
end_time = format_timestamp_ass(int(segment['end'] * 1000))
|
||||
text = segment['text'].strip().replace('\n', '\\N')
|
||||
|
||||
f.write(f"Dialogue: 0,{start_time},{end_time},Default,,0,0,0,,{text}\n")
|
||||
|
||||
return output_path
|
||||
|
||||
|
||||
def export_to_vtt(segments, output_path):
|
||||
"""
|
||||
Export transcript segments to WebVTT format.
|
||||
|
||||
Args:
|
||||
segments (list): List of transcript segments with start, end, and text
|
||||
output_path (Path): Path to save the WebVTT file
|
||||
|
||||
Returns:
|
||||
Path: Path to the saved WebVTT file
|
||||
"""
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
# WebVTT header
|
||||
f.write("WEBVTT\n\n")
|
||||
|
||||
for i, segment in enumerate(segments, 1):
|
||||
start_time = format_timestamp_vtt(int(segment['start'] * 1000))
|
||||
end_time = format_timestamp_vtt(int(segment['end'] * 1000))
|
||||
|
||||
# Optional cue identifier
|
||||
f.write(f"{i}\n")
|
||||
f.write(f"{start_time} --> {end_time}\n")
|
||||
f.write(f"{segment['text'].strip()}\n\n")
|
||||
|
||||
return output_path
|
||||
|
||||
|
||||
def transcript_to_segments(transcript, segment_duration=5.0):
|
||||
"""
|
||||
Convert a plain transcript to timed segments for subtitle export.
|
||||
Used when the original segments are not available.
|
||||
|
||||
Args:
|
||||
transcript (str): Full transcript text
|
||||
segment_duration (float): Duration of each segment in seconds
|
||||
|
||||
Returns:
|
||||
list: List of segments with start, end, and text
|
||||
"""
|
||||
# Split transcript into sentences
|
||||
sentences = re.split(r'(?<=[.!?])\s+', transcript)
|
||||
segments = []
|
||||
|
||||
current_time = 0.0
|
||||
for sentence in sentences:
|
||||
if not sentence.strip():
|
||||
continue
|
||||
|
||||
# Estimate duration based on word count (approx. 2.5 words per second)
|
||||
word_count = len(sentence.split())
|
||||
duration = max(2.0, word_count / 2.5)
|
||||
|
||||
segments.append({
|
||||
'start': current_time,
|
||||
'end': current_time + duration,
|
||||
'text': sentence
|
||||
})
|
||||
|
||||
current_time += duration
|
||||
|
||||
return segments
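# Worked example of the pacing heuristic above (illustrative, not in the original file):
# an 8-word sentence gets max(2.0, 8 / 2.5) = 3.2 seconds, while a 3-word sentence
# (1.2 seconds at 2.5 words/s) is clamped up to the 2.0-second floor.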
|
||||
|
||||
|
||||
def compress_file(input_path, compression_type='gzip'):
|
||||
"""
|
||||
Compress a file using the specified compression method.
|
||||
|
||||
Args:
|
||||
input_path (Path): Path to the file to compress
|
||||
compression_type (str): Type of compression ('gzip' or 'zip')
|
||||
|
||||
Returns:
|
||||
Path: Path to the compressed file
|
||||
"""
|
||||
input_path = Path(input_path)
|
||||
|
||||
if compression_type == 'gzip':
|
||||
output_path = input_path.with_suffix(input_path.suffix + '.gz')
|
||||
with open(input_path, 'rb') as f_in:
|
||||
with gzip.open(output_path, 'wb') as f_out:
|
||||
f_out.write(f_in.read())
|
||||
return output_path
|
||||
|
||||
elif compression_type == 'zip':
|
||||
output_path = input_path.with_suffix('.zip')
|
||||
with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
|
||||
zipf.write(input_path, arcname=input_path.name)
|
||||
return output_path
|
||||
|
||||
else:
|
||||
logger.warning(f"Unsupported compression type: {compression_type}")
|
||||
return input_path
|
||||
|
||||
|
||||
def export_transcript(transcript, output_path, format_type='srt', segments=None,
|
||||
compress=False, compression_type='gzip', style=None):
|
||||
"""
|
||||
Export transcript to the specified subtitle format.
|
||||
|
||||
Args:
|
||||
transcript (str): Full transcript text
|
||||
output_path (Path): Base path for the output file (without extension)
|
||||
format_type (str): 'srt', 'ass', or 'vtt'
|
||||
segments (list, optional): List of transcript segments with timing information
|
||||
compress (bool): Whether to compress the output file
|
||||
compression_type (str): Type of compression ('gzip' or 'zip')
|
||||
style (dict, optional): Custom style parameters for ASS format
|
||||
|
||||
Returns:
|
||||
Path: Path to the saved subtitle file
|
||||
"""
|
||||
output_path = Path(output_path)
|
||||
|
||||
# If segments are not provided, create them from the transcript
|
||||
if segments is None:
|
||||
segments = transcript_to_segments(transcript)
|
||||
|
||||
if format_type.lower() == 'srt':
|
||||
output_file = output_path.with_suffix('.srt')
|
||||
result_path = export_to_srt(segments, output_file)
|
||||
elif format_type.lower() == 'ass':
|
||||
output_file = output_path.with_suffix('.ass')
|
||||
result_path = export_to_ass(segments, output_file, style=style)
|
||||
elif format_type.lower() == 'vtt':
|
||||
output_file = output_path.with_suffix('.vtt')
|
||||
result_path = export_to_vtt(segments, output_file)
|
||||
else:
|
||||
raise ValueError(f"Unsupported format type: {format_type}. Use 'srt', 'ass', or 'vtt'.")
|
||||
|
||||
# Compress the file if requested
|
||||
if compress:
|
||||
result_path = compress_file(result_path, compression_type)
|
||||
|
||||
return result_path
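if __name__ == "__main__":
    # Usage sketch (not part of the original module): export two hand-written segments
    # to a gzip-compressed SRT file. The output base path is hypothetical.
    demo_segments = [
        {"start": 0.0, "end": 2.5, "text": "Welcome to the recording."},
        {"start": 2.5, "end": 5.0, "text": "Today we cover the export module."},
    ]
    result = export_transcript(
        transcript="Welcome to the recording. Today we cover the export module.",
        output_path="recording_subtitles",   # extension is added per format
        format_type="srt",
        segments=demo_segments,
        compress=True,                        # yields recording_subtitles.srt.gz
    )
    print(f"Wrote {result}")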
|
||||
@ -1,330 +0,0 @@
|
||||
"""
|
||||
Keyword extraction utilities for the OBS Recording Transcriber.
|
||||
Provides functions to extract keywords and link them to timestamps.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import torch
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from collections import Counter
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Try to import GPU utilities, but don't fail if not available
|
||||
try:
|
||||
from utils.gpu_utils import get_optimal_device
|
||||
GPU_UTILS_AVAILABLE = True
|
||||
except ImportError:
|
||||
GPU_UTILS_AVAILABLE = False
|
||||
|
||||
# Default models
|
||||
NER_MODEL = "dslim/bert-base-NER"
|
||||
|
||||
|
||||
def extract_keywords_tfidf(text, max_keywords=10, ngram_range=(1, 2)):
|
||||
"""
|
||||
Extract keywords using TF-IDF.
|
||||
|
||||
Args:
|
||||
text (str): Text to extract keywords from
|
||||
max_keywords (int): Maximum number of keywords to extract
|
||||
ngram_range (tuple): Range of n-grams to consider
|
||||
|
||||
Returns:
|
||||
list: List of (keyword, score) tuples
|
||||
"""
|
||||
try:
|
||||
# Preprocess text
|
||||
text = text.lower()
|
||||
|
||||
# Remove common stopwords - convert to list for scikit-learn compatibility
|
||||
stopwords = ['a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
|
||||
'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those',
|
||||
'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
|
||||
'is', 'of', 'while', 'during', 'to', 'from', 'in', 'out', 'on', 'off', 'by']
|
||||
|
||||
# Create sentences for better TF-IDF analysis
|
||||
sentences = re.split(r'[.!?]', text)
|
||||
sentences = [s.strip() for s in sentences if s.strip()]
|
||||
|
||||
if not sentences:
|
||||
return []
|
||||
|
||||
# Apply TF-IDF
|
||||
vectorizer = TfidfVectorizer(
|
||||
max_features=100,
|
||||
stop_words=stopwords,
|
||||
ngram_range=ngram_range
|
||||
)
|
||||
|
||||
try:
|
||||
tfidf_matrix = vectorizer.fit_transform(sentences)
|
||||
feature_names = vectorizer.get_feature_names_out()
|
||||
|
||||
# Calculate average TF-IDF score across all sentences
|
||||
avg_tfidf = np.mean(tfidf_matrix.toarray(), axis=0)
|
||||
|
||||
# Get top keywords
|
||||
keywords = [(feature_names[i], avg_tfidf[i]) for i in avg_tfidf.argsort()[::-1]]
|
||||
|
||||
# Filter out single-character keywords and limit to max_keywords
|
||||
keywords = [(k, s) for k, s in keywords if len(k) > 1][:max_keywords]
|
||||
|
||||
return keywords
|
||||
except ValueError as e:
|
||||
logger.warning(f"TF-IDF extraction failed: {e}")
|
||||
return []
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error extracting keywords with TF-IDF: {e}")
|
||||
return []
|
||||
|
||||
|
||||
def extract_named_entities(text, model=NER_MODEL, use_gpu=True):
|
||||
"""
|
||||
Extract named entities from text.
|
||||
|
||||
Args:
|
||||
text (str): Text to extract entities from
|
||||
model (str): Model to use for NER
|
||||
use_gpu (bool): Whether to use GPU acceleration if available
|
||||
|
||||
Returns:
|
||||
list: List of (entity, type) tuples
|
||||
"""
|
||||
# Configure device
|
||||
device = torch.device("cpu")
|
||||
if use_gpu and GPU_UTILS_AVAILABLE:
|
||||
device = get_optimal_device()
|
||||
device_arg = 0 if device.type == "cuda" else -1
|
||||
else:
|
||||
device_arg = -1
|
||||
|
||||
try:
|
||||
# Initialize the pipeline
|
||||
ner_pipeline = pipeline("ner", model=model, device=device_arg, aggregation_strategy="simple")
|
||||
|
||||
# Split text into manageable chunks if too long
|
||||
max_length = 512
|
||||
if len(text) > max_length:
|
||||
chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
|
||||
else:
|
||||
chunks = [text]
|
||||
|
||||
# Process each chunk
|
||||
all_entities = []
|
||||
for chunk in chunks:
|
||||
entities = ner_pipeline(chunk)
|
||||
all_entities.extend(entities)
|
||||
|
||||
# Extract entity text and type
|
||||
entity_info = [(entity["word"], entity["entity_group"]) for entity in all_entities]
|
||||
|
||||
return entity_info
|
||||
except Exception as e:
|
||||
logger.error(f"Error extracting named entities: {e}")
|
||||
return []
|
||||
|
||||
|
||||
def find_keyword_timestamps(segments, keywords):
|
||||
"""
|
||||
Find timestamps for keywords in transcript segments.
|
||||
|
||||
Args:
|
||||
segments (list): List of transcript segments with timing info
|
||||
keywords (list): List of keywords to find
|
||||
|
||||
Returns:
|
||||
dict: Dictionary mapping keywords to lists of timestamps
|
||||
"""
|
||||
keyword_timestamps = {}
|
||||
|
||||
# Convert keywords to lowercase for case-insensitive matching
|
||||
# Check if keywords list is not empty before accessing keywords[0]
|
||||
if not keywords:
|
||||
return keyword_timestamps
|
||||
|
||||
if isinstance(keywords[0], tuple):
|
||||
# If keywords is a list of (keyword, score) tuples
|
||||
keywords_lower = [k.lower() for k, _ in keywords]
|
||||
else:
|
||||
# If keywords is just a list of keywords
|
||||
keywords_lower = [k.lower() for k in keywords]
|
||||
|
||||
# Process each segment
|
||||
for segment in segments:
|
||||
segment_text = segment["text"].lower()
|
||||
start_time = segment["start"]
|
||||
end_time = segment["end"]
|
||||
|
||||
# Check each keyword
|
||||
for i, keyword in enumerate(keywords_lower):
|
||||
if keyword in segment_text:
|
||||
# Get the original case of the keyword
|
||||
# Safe access to keywords[0] since we already checked keywords is not empty
|
||||
original_keyword = keywords[i][0] if isinstance(keywords[0], tuple) else keywords[i]
|
||||
|
||||
# Initialize the list if this is the first occurrence
|
||||
if original_keyword not in keyword_timestamps:
|
||||
keyword_timestamps[original_keyword] = []
|
||||
|
||||
# Add the timestamp
|
||||
keyword_timestamps[original_keyword].append({
|
||||
"start": start_time,
|
||||
"end": end_time,
|
||||
"context": segment["text"]
|
||||
})
|
||||
|
||||
return keyword_timestamps
|
||||
|
||||
|
||||
def extract_keywords_from_transcript(transcript, segments, max_keywords=15, use_gpu=True):
|
||||
"""
|
||||
Extract keywords from transcript and link them to timestamps.
|
||||
|
||||
Args:
|
||||
transcript (str): Full transcript text
|
||||
segments (list): List of transcript segments with timing info
|
||||
max_keywords (int): Maximum number of keywords to extract
|
||||
use_gpu (bool): Whether to use GPU acceleration if available
|
||||
|
||||
Returns:
|
||||
tuple: (keyword_timestamps, entities_with_timestamps)
|
||||
"""
|
||||
try:
|
||||
# Extract keywords using TF-IDF
|
||||
tfidf_keywords = extract_keywords_tfidf(transcript, max_keywords=max_keywords)
|
||||
|
||||
# Extract named entities
|
||||
entities = extract_named_entities(transcript, use_gpu=use_gpu)
|
||||
|
||||
# Count entity occurrences and get the most frequent ones
|
||||
entity_counter = Counter([entity for entity, _ in entities])
|
||||
top_entities = [(entity, count) for entity, count in entity_counter.most_common(max_keywords)]
|
||||
|
||||
# Find timestamps for keywords and entities
|
||||
keyword_timestamps = find_keyword_timestamps(segments, tfidf_keywords)
|
||||
entity_timestamps = find_keyword_timestamps(segments, top_entities)
|
||||
|
||||
return keyword_timestamps, entity_timestamps
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error extracting keywords from transcript: {e}")
|
||||
return {}, {}
|
||||
|
||||
|
||||
def generate_keyword_index(keyword_timestamps, entity_timestamps=None):
|
||||
"""
|
||||
Generate a keyword index with timestamps.
|
||||
|
||||
Args:
|
||||
keyword_timestamps (dict): Dictionary mapping keywords to timestamp lists
|
||||
entity_timestamps (dict, optional): Dictionary mapping entities to timestamp lists
|
||||
|
||||
Returns:
|
||||
str: Formatted keyword index
|
||||
"""
|
||||
lines = ["# Keyword Index\n"]
|
||||
|
||||
# Add keywords section
|
||||
if keyword_timestamps:
|
||||
lines.append("## Keywords\n")
|
||||
for keyword, timestamps in sorted(keyword_timestamps.items()):
|
||||
if timestamps:
|
||||
times = [f"{int(ts['start'] // 60):02d}:{int(ts['start'] % 60):02d}" for ts in timestamps]
|
||||
lines.append(f"- **{keyword}**: {', '.join(times)}\n")
|
||||
|
||||
# Add entities section
|
||||
if entity_timestamps:
|
||||
lines.append("\n## Named Entities\n")
|
||||
for entity, timestamps in sorted(entity_timestamps.items()):
|
||||
if timestamps:
|
||||
times = [f"{int(ts['start'] // 60):02d}:{int(ts['start'] % 60):02d}" for ts in timestamps]
|
||||
lines.append(f"- **{entity}**: {', '.join(times)}\n")
|
||||
|
||||
return "".join(lines)
|
||||
|
||||
|
||||
def generate_interactive_transcript(segments, keyword_timestamps=None, entity_timestamps=None):
|
||||
"""
|
||||
Generate an interactive transcript with keyword highlighting.
|
||||
|
||||
Args:
|
||||
segments (list): List of transcript segments with timing info
|
||||
keyword_timestamps (dict, optional): Dictionary mapping keywords to timestamp lists
|
||||
entity_timestamps (dict, optional): Dictionary mapping entities to timestamp lists
|
||||
|
||||
Returns:
|
||||
str: HTML formatted interactive transcript
|
||||
"""
|
||||
# Combine keywords and entities
|
||||
all_keywords = {}
|
||||
if keyword_timestamps:
|
||||
all_keywords.update(keyword_timestamps)
|
||||
if entity_timestamps:
|
||||
all_keywords.update(entity_timestamps)
|
||||
|
||||
# Generate HTML
|
||||
html = ["<div class='interactive-transcript'>"]
|
||||
|
||||
for segment in segments:
|
||||
start_time = segment["start"]
|
||||
end_time = segment["end"]
|
||||
text = segment["text"]
|
||||
|
||||
# Format timestamp
|
||||
timestamp = f"{int(start_time // 60):02d}:{int(start_time % 60):02d}"
|
||||
|
||||
# Add speaker if available
|
||||
speaker = segment.get("speaker", "")
|
||||
speaker_html = f"<span class='speaker'>[{speaker}]</span> " if speaker else ""
|
||||
|
||||
# Highlight keywords in text
|
||||
highlighted_text = text
|
||||
for keyword in all_keywords:
|
||||
# Use regex to match whole words only
|
||||
pattern = r'\b' + re.escape(keyword) + r'\b'
|
||||
replacement = f"<span class='keyword' data-keyword='{keyword}'>{keyword}</span>"
|
||||
highlighted_text = re.sub(pattern, replacement, highlighted_text, flags=re.IGNORECASE)
|
||||
|
||||
# Add segment to HTML
|
||||
html.append(f"<p class='segment' data-start='{start_time}' data-end='{end_time}'>")
|
||||
html.append(f"<span class='timestamp'>{timestamp}</span> {speaker_html}{highlighted_text}")
|
||||
html.append("</p>")
|
||||
|
||||
html.append("</div>")
|
||||
|
||||
return "\n".join(html)
|
||||
|
||||
|
||||
def create_keyword_cloud_data(keyword_timestamps, entity_timestamps=None):
|
||||
"""
|
||||
Create data for a keyword cloud visualization.
|
||||
|
||||
Args:
|
||||
keyword_timestamps (dict): Dictionary mapping keywords to timestamp lists
|
||||
entity_timestamps (dict, optional): Dictionary mapping entities to timestamp lists
|
||||
|
||||
Returns:
|
||||
list: List of (keyword, weight) tuples for visualization
|
||||
"""
|
||||
cloud_data = []
|
||||
|
||||
# Process keywords
|
||||
for keyword, timestamps in keyword_timestamps.items():
|
||||
weight = len(timestamps) # Weight by occurrence count
|
||||
cloud_data.append((keyword, weight))
|
||||
|
||||
# Process entities if provided
|
||||
if entity_timestamps:
|
||||
for entity, timestamps in entity_timestamps.items():
|
||||
weight = len(timestamps) * 1.5 # Give entities slightly higher weight
|
||||
cloud_data.append((entity, weight))
|
||||
|
||||
return cloud_data
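if __name__ == "__main__":
    # Usage sketch (not part of the original module): `transcript` and `segments` would
    # normally come from the transcription step; a single toy segment is used here.
    demo_segments = [{"start": 0.0, "end": 4.0, "text": "We deployed the model to Paris last week."}]
    demo_transcript = demo_segments[0]["text"]

    keywords, entities = extract_keywords_from_transcript(demo_transcript, demo_segments, max_keywords=5)
    print(generate_keyword_index(keywords, entities))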
|
||||
@ -1,155 +0,0 @@
|
||||
"""
|
||||
Ollama integration for local AI model inference.
|
||||
Provides functions to use Ollama's API for text summarization.
|
||||
"""
|
||||
|
||||
import requests
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Default Ollama API endpoint - configurable via environment variable
|
||||
OLLAMA_API_URL = os.environ.get("OLLAMA_API_URL", "http://localhost:11434/api")
|
||||
|
||||
|
||||
def check_ollama_available():
|
||||
"""
|
||||
Check if Ollama service is available.
|
||||
|
||||
Returns:
|
||||
bool: True if Ollama is available, False otherwise
|
||||
"""
|
||||
try:
|
||||
response = requests.get(f"{OLLAMA_API_URL}/tags", timeout=2)
|
||||
return response.status_code == 200
|
||||
except requests.exceptions.RequestException:
|
||||
return False
|
||||
|
||||
|
||||
def list_available_models():
|
||||
"""
|
||||
List available models in Ollama.
|
||||
|
||||
Returns:
|
||||
list: List of available model names
|
||||
"""
|
||||
try:
|
||||
response = requests.get(f"{OLLAMA_API_URL}/tags")
|
||||
if response.status_code == 200:
|
||||
models = response.json().get('models', [])
|
||||
return [model['name'] for model in models]
|
||||
return []
|
||||
except requests.exceptions.RequestException as e:
|
||||
logger.error(f"Error listing Ollama models: {e}")
|
||||
return []
|
||||
|
||||
|
||||
def summarize_with_ollama(text, model="llama3", max_length=150):
|
||||
"""
|
||||
Summarize text using Ollama's local API.
|
||||
|
||||
Args:
|
||||
text (str): Text to summarize
|
||||
model (str): Ollama model to use
|
||||
max_length (int): Maximum length of the summary
|
||||
|
||||
Returns:
|
||||
str: Summarized text or None if failed
|
||||
"""
|
||||
if not check_ollama_available():
|
||||
logger.warning("Ollama service is not available")
|
||||
return None
|
||||
|
||||
    # Check if the model is available (Ollama model names may carry a tag, e.g. "llama3:latest")
    available_models = list_available_models()
    if model not in available_models and not any(m.split(":", 1)[0] == model for m in available_models):
        logger.warning(f"Model {model} not available in Ollama. Available models: {available_models}")
        return None
|
||||
|
||||
# Prepare the prompt for summarization
|
||||
prompt = f"Summarize the following text in about {max_length} words:\n\n{text}"
|
||||
|
||||
try:
|
||||
# Make the API request
|
||||
response = requests.post(
|
||||
f"{OLLAMA_API_URL}/generate",
|
||||
json={
|
||||
"model": model,
|
||||
"prompt": prompt,
|
||||
"stream": False,
|
||||
"options": {
|
||||
"temperature": 0.3,
|
||||
"top_p": 0.9,
|
||||
"max_tokens": max_length * 2 # Approximate token count
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
result = response.json()
|
||||
return result.get('response', '').strip()
|
||||
else:
|
||||
logger.error(f"Ollama API error: {response.status_code} - {response.text}")
|
||||
return None
|
||||
except requests.exceptions.RequestException as e:
|
||||
logger.error(f"Error communicating with Ollama: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def chunk_and_summarize(text, model="llama3", chunk_size=4000, max_length=150):
|
||||
"""
|
||||
Chunk long text and summarize each chunk, then combine the summaries.
|
||||
|
||||
Args:
|
||||
text (str): Text to summarize
|
||||
model (str): Ollama model to use
|
||||
chunk_size (int): Maximum size of each chunk in characters
|
||||
max_length (int): Maximum length of the final summary
|
||||
|
||||
Returns:
|
||||
str: Combined summary or None if failed
|
||||
"""
|
||||
if len(text) <= chunk_size:
|
||||
return summarize_with_ollama(text, model, max_length)
|
||||
|
||||
# Split text into chunks
|
||||
words = text.split()
|
||||
chunks = []
|
||||
current_chunk = []
|
||||
current_length = 0
|
||||
|
||||
for word in words:
|
||||
if current_length + len(word) + 1 <= chunk_size:
|
||||
current_chunk.append(word)
|
||||
current_length += len(word) + 1
|
||||
else:
|
||||
chunks.append(' '.join(current_chunk))
|
||||
current_chunk = [word]
|
||||
current_length = len(word) + 1
|
||||
|
||||
if current_chunk:
|
||||
chunks.append(' '.join(current_chunk))
|
||||
|
||||
# Summarize each chunk
|
||||
chunk_summaries = []
|
||||
for i, chunk in enumerate(chunks):
|
||||
logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")
|
||||
summary = summarize_with_ollama(chunk, model, max_length // len(chunks))
|
||||
if summary:
|
||||
chunk_summaries.append(summary)
|
||||
|
||||
if not chunk_summaries:
|
||||
return None
|
||||
|
||||
# If there's only one chunk summary, return it
|
||||
if len(chunk_summaries) == 1:
|
||||
return chunk_summaries[0]
|
||||
|
||||
# Otherwise, combine the summaries and summarize again
|
||||
combined_summary = " ".join(chunk_summaries)
|
||||
return summarize_with_ollama(combined_summary, model, max_length)
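if __name__ == "__main__":
    # Usage sketch (not part of the original module): summarize a transcript read from a
    # hypothetical text file, handling the case where the Ollama service is offline.
    text = Path("transcript.txt").read_text(encoding="utf-8")
    if check_ollama_available():
        summary = chunk_and_summarize(text, model="llama3", max_length=150)
        print(summary or "Ollama returned no summary.")
    else:
        print("Ollama service is not running at", OLLAMA_API_URL)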
|
||||
@ -1,113 +0,0 @@
|
||||
from transformers import pipeline, AutoTokenizer
|
||||
import torch
|
||||
import logging
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
SUMMARY_MODEL = "Falconsai/text_summarization"
|
||||
|
||||
def chunk_text(text, max_tokens, tokenizer):
|
||||
"""
|
||||
Splits the text into a list of chunks based on token limits.
|
||||
|
||||
Args:
|
||||
text (str): Text to chunk
|
||||
max_tokens (int): Maximum tokens per chunk
|
||||
tokenizer (AutoTokenizer): Tokenizer to use
|
||||
|
||||
Returns:
|
||||
list: List of text chunks
|
||||
"""
|
||||
words = text.split()
|
||||
chunks = []
|
||||
current_chunk = []
|
||||
current_length = 0
|
||||
|
||||
for word in words:
|
||||
hypothetical_length = current_length + len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
|
||||
if hypothetical_length <= max_tokens:
|
||||
current_chunk.append(word)
|
||||
current_length = hypothetical_length
|
||||
else:
|
||||
chunks.append(' '.join(current_chunk))
|
||||
current_chunk = [word]
|
||||
current_length = len(tokenizer(word, return_tensors='pt')['input_ids'][0]) - 2
|
||||
|
||||
if current_chunk:
|
||||
chunks.append(' '.join(current_chunk))
|
||||
|
||||
return chunks
|
||||
|
||||
def summarize_text(text, use_gpu=True, memory_fraction=0.8):
|
||||
"""
|
||||
Summarize text using a Hugging Face pipeline with chunking support.
|
||||
|
||||
Args:
|
||||
text (str): Text to summarize
|
||||
use_gpu (bool): Whether to use GPU if available
|
||||
memory_fraction (float): Fraction of GPU memory to use
|
||||
|
||||
Returns:
|
||||
str: Summarized text
|
||||
"""
|
||||
# Determine device
|
||||
device = -1 # Default to CPU
|
||||
    if use_gpu and torch.cuda.is_available():
        device = 0  # Use the first GPU
        torch.cuda.set_per_process_memory_fraction(memory_fraction)
|
||||
|
||||
logger.info(f"Using device {device} for summarization")
|
||||
|
||||
try:
|
||||
# Initialize the pipeline and tokenizer
|
||||
summarizer = pipeline("summarization", model=SUMMARY_MODEL, device=device)
|
||||
tokenizer = AutoTokenizer.from_pretrained(SUMMARY_MODEL)
|
||||
|
||||
# Check if text needs to be chunked
|
||||
max_tokens = 512
|
||||
tokens = tokenizer(text, return_tensors='pt')
|
||||
num_tokens = len(tokens['input_ids'][0])
|
||||
|
||||
if num_tokens > max_tokens:
|
||||
chunks = chunk_text(text, max_tokens, tokenizer)
|
||||
summaries = []
|
||||
|
||||
for i, chunk in enumerate(chunks):
|
||||
logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")
|
||||
summary_output = summarizer(
|
||||
"summarize: " + chunk,
|
||||
max_length=150,
|
||||
min_length=30,
|
||||
do_sample=False
|
||||
)
|
||||
summaries.append(summary_output[0]['summary_text'])
|
||||
|
||||
# If multiple chunks, summarize the combined summaries
|
||||
if len(summaries) > 1:
|
||||
logger.info("Generating final summary from chunk summaries")
|
||||
combined_text = " ".join(summaries)
|
||||
return summarizer(
|
||||
"summarize: " + combined_text,
|
||||
max_length=150,
|
||||
min_length=30,
|
||||
do_sample=False
|
||||
)[0]['summary_text']
|
||||
return summaries[0]
|
||||
else:
|
||||
return summarizer(
|
||||
"summarize: " + text,
|
||||
max_length=150,
|
||||
min_length=30,
|
||||
do_sample=False
|
||||
)[0]['summary_text']
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error during summarization: {e}")
|
||||
# Fallback to CPU if GPU fails
|
||||
if device != -1:
|
||||
logger.info("Falling back to CPU")
|
||||
return summarize_text(text, use_gpu=False, memory_fraction=memory_fraction)
|
||||
raise
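if __name__ == "__main__":
    # Usage sketch (not part of the original module): the input text is synthetic;
    # inputs longer than the model's token limit are chunked automatically above.
    sample = "Streamlit lets you build data apps quickly from plain Python scripts. " * 40
    print(summarize_text(sample, use_gpu=False))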
|
||||
@ -1,86 +0,0 @@
|
||||
import whisper
|
||||
from pathlib import Path
|
||||
from transformers import pipeline, AutoTokenizer
|
||||
from utils.audio_processing import extract_audio
|
||||
from utils.summarization import summarize_text
|
||||
import logging
|
||||
import torch
|
||||
|
||||
# Try to import GPU utilities, but don't fail if not available
|
||||
try:
|
||||
from utils.gpu_utils import configure_gpu, get_optimal_device
|
||||
GPU_UTILS_AVAILABLE = True
|
||||
except ImportError:
|
||||
GPU_UTILS_AVAILABLE = False
|
||||
|
||||
# Try to import caching utilities, but don't fail if not available
|
||||
try:
|
||||
from utils.cache import load_from_cache, save_to_cache
|
||||
CACHE_AVAILABLE = True
|
||||
except ImportError:
|
||||
CACHE_AVAILABLE = False
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
WHISPER_MODEL = "base"
|
||||
|
||||
def transcribe_audio(audio_path: Path, model=WHISPER_MODEL, use_cache=True, cache_max_age=None,
|
||||
use_gpu=True, memory_fraction=0.8):
|
||||
"""
|
||||
Transcribe audio using Whisper and return both segments and full transcript.
|
||||
|
||||
Args:
|
||||
audio_path (Path): Path to the audio or video file
|
||||
model (str): Whisper model size to use (tiny, base, small, medium, large)
|
||||
use_cache (bool): Whether to use caching
|
||||
cache_max_age (float, optional): Maximum age of cache in seconds
|
||||
use_gpu (bool): Whether to use GPU acceleration if available
|
||||
memory_fraction (float): Fraction of GPU memory to use (0.0 to 1.0)
|
||||
|
||||
Returns:
|
||||
tuple: (segments, transcript) where segments is a list of dicts with timing info
|
||||
"""
|
||||
audio_path = Path(audio_path)
|
||||
|
||||
# Check cache first if enabled
|
||||
if use_cache and CACHE_AVAILABLE:
|
||||
cached_data = load_from_cache(audio_path, model, "transcribe", cache_max_age)
|
||||
if cached_data:
|
||||
logger.info(f"Using cached transcription for {audio_path}")
|
||||
return cached_data.get("segments", []), cached_data.get("transcript", "")
|
||||
|
||||
# Extract audio if the input is a video file (M4A is already audio)
|
||||
video_extensions = ['.mp4', '.avi', '.mov', '.mkv']
|
||||
if audio_path.suffix.lower() in video_extensions:
|
||||
audio_path = extract_audio(audio_path)
|
||||
|
||||
# Configure GPU if available and requested
|
||||
device = torch.device("cpu")
|
||||
if use_gpu and GPU_UTILS_AVAILABLE:
|
||||
gpu_config = configure_gpu(model, memory_fraction)
|
||||
device = gpu_config["device"]
|
||||
logger.info(f"Using device: {device} for transcription")
|
||||
|
||||
# Load the specified Whisper model
|
||||
logger.info(f"Loading Whisper model: {model}")
|
||||
whisper_model = whisper.load_model(model, device=device if device.type != "mps" else "cpu")
|
||||
|
||||
# Transcribe the audio
|
||||
logger.info(f"Transcribing audio: {audio_path}")
|
||||
result = whisper_model.transcribe(str(audio_path))
|
||||
|
||||
# Extract the full transcript and segments
|
||||
transcript = result["text"]
|
||||
segments = result["segments"]
|
||||
|
||||
# Cache the results if caching is enabled
|
||||
if use_cache and CACHE_AVAILABLE:
|
||||
cache_data = {
|
||||
"transcript": transcript,
|
||||
"segments": segments
|
||||
}
|
||||
save_to_cache(audio_path, cache_data, model, "transcribe")
|
||||
|
||||
return segments, transcript
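if __name__ == "__main__":
    # Usage sketch (not part of the original module): the recording path is hypothetical;
    # video containers are converted to WAV first via extract_audio.
    segments, transcript = transcribe_audio(Path("recordings/demo.mkv"), model="base")
    print(f"{len(segments)} segments")
    print(transcript)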
|
||||
@ -1,283 +0,0 @@
|
||||
"""
|
||||
Translation utilities for the OBS Recording Transcriber.
|
||||
Provides functions for language detection and translation.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import torch
|
||||
from pathlib import Path
|
||||
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, M2M100ForConditionalGeneration
|
||||
import whisper
|
||||
import iso639
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Try to import GPU utilities, but don't fail if not available
|
||||
try:
|
||||
from utils.gpu_utils import get_optimal_device
|
||||
GPU_UTILS_AVAILABLE = True
|
||||
except ImportError:
|
||||
GPU_UTILS_AVAILABLE = False
|
||||
|
||||
# Default models
|
||||
TRANSLATION_MODEL = "facebook/m2m100_418M"
|
||||
LANGUAGE_DETECTION_MODEL = "papluca/xlm-roberta-base-language-detection"
|
||||
|
||||
# ISO language code mapping
|
||||
def get_language_name(code):
|
||||
"""
|
||||
Get the language name from ISO code.
|
||||
|
||||
Args:
|
||||
code (str): ISO language code
|
||||
|
||||
Returns:
|
||||
str: Language name or original code if not found
|
||||
"""
|
||||
try:
|
||||
return iso639.languages.get(part1=code).name
|
||||
except (KeyError, AttributeError):
|
||||
try:
|
||||
return iso639.languages.get(part2b=code).name
|
||||
except (KeyError, AttributeError):
|
||||
return code
|
||||
|
||||
|
||||
def detect_language(text, model=LANGUAGE_DETECTION_MODEL, use_gpu=True):
    """
    Detect the language of a text.

    Args:
        text (str): Text to detect language for
        model (str): Model to use for language detection
        use_gpu (bool): Whether to use GPU acceleration if available

    Returns:
        tuple: (language_code, confidence)
    """
    # Configure device
    device = torch.device("cpu")
    if use_gpu and GPU_UTILS_AVAILABLE:
        device = get_optimal_device()
        device_arg = 0 if device.type == "cuda" else -1
    else:
        device_arg = -1

    try:
        # Initialize the pipeline
        classifier = pipeline("text-classification", model=model, device=device_arg)

        # Truncate text if too long
        max_length = 512
        if len(text) > max_length:
            text = text[:max_length]

        # Detect language
        result = classifier(text)[0]
        language_code = result["label"]
        confidence = result["score"]

        return language_code, confidence
    except Exception as e:
        logger.error(f"Error detecting language: {e}")
        return None, 0.0

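# Illustrative demo (not part of the original module): detect_language returns a
# (code, confidence) tuple and (None, 0.0) on failure, so callers should guard
# against the None case before using the code.
def _demo_detect_language():
    """Detect the language of a short sample sentence (illustrative only)."""
    sample = "Das ist ein kurzer Testsatz für die Spracherkennung."
    code, confidence = detect_language(sample, use_gpu=False)
    if code is not None:
        print(f"Detected {get_language_name(code)} ({code}) at {confidence:.2f}")
    else:
        print("Detection failed")
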
def translate_text(text, source_lang=None, target_lang="en", model=TRANSLATION_MODEL, use_gpu=True):
    """
    Translate text from source language to target language.

    Args:
        text (str): Text to translate
        source_lang (str, optional): Source language code (auto-detect if None)
        target_lang (str): Target language code
        model (str): Model to use for translation
        use_gpu (bool): Whether to use GPU acceleration if available

    Returns:
        str: Translated text
    """
    # Auto-detect source language if not provided
    if source_lang is None:
        detected_lang, confidence = detect_language(text, use_gpu=use_gpu)
        if detected_lang and confidence > 0.5:
            source_lang = detected_lang
            logger.info(f"Detected language: {get_language_name(source_lang)} ({source_lang}) with confidence {confidence:.2f}")
        else:
            logger.warning("Could not reliably detect language, defaulting to English")
            source_lang = "en"

    # Skip translation if source and target are the same
    if source_lang == target_lang:
        logger.info(f"Source and target languages are the same ({source_lang}), skipping translation")
        return text

    # Configure device
    device = torch.device("cpu")
    if use_gpu and GPU_UTILS_AVAILABLE:
        device = get_optimal_device()

    try:
        # Load model and tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model)
        model = M2M100ForConditionalGeneration.from_pretrained(model)

        # Move model to device
        model = model.to(device)

        # Prepare for translation
        tokenizer.src_lang = source_lang

        # Split text into manageable chunks if too long
        max_length = 512
        if len(text) > max_length:
            chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
        else:
            chunks = [text]

        # Translate each chunk
        translated_chunks = []
        for chunk in chunks:
            encoded = tokenizer(chunk, return_tensors="pt").to(device)
            generated_tokens = model.generate(
                **encoded,
                forced_bos_token_id=tokenizer.get_lang_id(target_lang),
                max_length=max_length
            )
            translated_chunk = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
            translated_chunks.append(translated_chunk)

        # Combine translated chunks
        translated_text = " ".join(translated_chunks)

        return translated_text
    except Exception as e:
        logger.error(f"Error translating text: {e}")
        return text

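# Illustrative demo (not part of the original module): translate_text can be
# called with or without an explicit source language; on any error it returns
# the input text unchanged, so the call is safe to use inline.
def _demo_translate_text():
    """Translate a short French sentence into English (illustrative only)."""
    translated = translate_text("Bonjour tout le monde", source_lang="fr", target_lang="en", use_gpu=False)
    print(translated)  # expected to be along the lines of "Hello everyone"
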
def translate_segments(segments, source_lang=None, target_lang="en", use_gpu=True):
    """
    Translate transcript segments.

    Args:
        segments (list): List of transcript segments
        source_lang (str, optional): Source language code (auto-detect if None)
        target_lang (str): Target language code
        use_gpu (bool): Whether to use GPU acceleration if available

    Returns:
        list: Translated segments
    """
    if not segments:
        return []

    # Auto-detect source language from combined text if not provided
    if source_lang is None:
        combined_text = " ".join([segment["text"] for segment in segments])
        detected_lang, _ = detect_language(combined_text, use_gpu=use_gpu)
        source_lang = detected_lang if detected_lang else "en"

    # Skip translation if source and target are the same
    if source_lang == target_lang:
        return segments

    try:
        # Initialize translation pipeline
        translated_segments = []

        # Translate each segment
        for segment in segments:
            translated_text = translate_text(
                segment["text"],
                source_lang=source_lang,
                target_lang=target_lang,
                use_gpu=use_gpu
            )

            # Create a new segment with translated text
            translated_segment = segment.copy()
            translated_segment["text"] = translated_text
            translated_segment["original_text"] = segment["text"]
            translated_segment["source_lang"] = source_lang
            translated_segment["target_lang"] = target_lang

            translated_segments.append(translated_segment)

        return translated_segments
    except Exception as e:
        logger.error(f"Error translating segments: {e}")
        return segments

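# Illustrative demo (not part of the original module): translate_segments keeps
# timing information and stores the untranslated text under "original_text";
# the hand-made segment below shows the added keys. On failure the original
# segments come back unchanged, hence the .get() fallback.
def _demo_translate_segments():
    """Translate a single hand-made segment and print the result (illustrative only)."""
    segments = [{"start": 0.0, "end": 2.5, "text": "Hola a todos"}]
    translated = translate_segments(segments, source_lang="es", target_lang="en", use_gpu=False)
    for seg in translated:
        print(seg.get("original_text", seg["text"]), "->", seg["text"])
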
def transcribe_and_translate(audio_path, whisper_model="base", target_lang="en",
                             use_gpu=True, detect_source=True):
    """
    Transcribe audio and translate to target language.

    Args:
        audio_path (Path): Path to the audio file
        whisper_model (str): Whisper model size to use
        target_lang (str): Target language code
        use_gpu (bool): Whether to use GPU acceleration if available
        detect_source (bool): Whether to auto-detect source language

    Returns:
        tuple: (original_segments, translated_segments, original_transcript, translated_transcript)
    """
    audio_path = Path(audio_path)

    # Configure device
    device = torch.device("cpu")
    if use_gpu and GPU_UTILS_AVAILABLE:
        device = get_optimal_device()

    try:
        # Step 1: Transcribe audio with Whisper
        logger.info(f"Transcribing audio with Whisper model: {whisper_model}")
        model = whisper.load_model(whisper_model, device=device if device.type != "mps" else "cpu")

        # Use Whisper's built-in language detection if requested
        if detect_source:
            # First, detect language with Whisper
            audio = whisper.load_audio(str(audio_path))
            audio = whisper.pad_or_trim(audio)
            mel = whisper.log_mel_spectrogram(audio).to(device if device.type != "mps" else "cpu")
            _, probs = model.detect_language(mel)
            source_lang = max(probs, key=probs.get)
            logger.info(f"Whisper detected language: {get_language_name(source_lang)} ({source_lang})")

            # Transcribe with detected language
            result = model.transcribe(str(audio_path), language=source_lang)
        else:
            # Transcribe without language specification
            result = model.transcribe(str(audio_path))
            source_lang = result.get("language", "en")

        original_segments = result["segments"]
        original_transcript = result["text"]

        # Step 2: Translate if needed
        if source_lang != target_lang:
            logger.info(f"Translating from {source_lang} to {target_lang}")
            translated_segments = translate_segments(
                original_segments,
                source_lang=source_lang,
                target_lang=target_lang,
                use_gpu=use_gpu
            )

            # Create full translated transcript
            translated_transcript = " ".join([segment["text"] for segment in translated_segments])
        else:
            logger.info(f"Source and target languages are the same ({source_lang}), skipping translation")
            translated_segments = original_segments
            translated_transcript = original_transcript

        return original_segments, translated_segments, original_transcript, translated_transcript

    except Exception as e:
        logger.error(f"Error in transcribe_and_translate: {e}")
        return None, None, None, None
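# Illustrative demo (not part of the original module): transcribe_and_translate
# returns four values, all of them None on failure, so the demo checks the first
# before printing. The file name below is a placeholder, not a real recording.
def _demo_transcribe_and_translate():
    """Transcribe a placeholder recording and print both transcripts (illustrative only)."""
    originals, translated, original_text, translated_text = transcribe_and_translate(
        "recording.mkv", whisper_model="base", target_lang="en", use_gpu=False
    )
    if originals is None:
        print("Transcription failed")
        return
    print("Original:  ", original_text)
    print("Translated:", translated_text)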
@ -1,8 +0,0 @@
from pathlib import Path

def validate_environment(obs_path: Path):
    """Validate environment and prerequisites."""
    errors = []
    if not obs_path.exists():
        errors.append(f"OBS directory not found: {obs_path}")
    return errors
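# Illustrative usage (not part of the original file): the function returns a
# list of error strings that is empty when everything checks out, so callers
# can treat the list's truthiness as pass/fail. The path below is a placeholder.
def _demo_validate_environment():
    """Run the validation against a placeholder OBS recordings path (illustrative only)."""
    errors = validate_environment(Path("C:/Users/example/Videos/OBS"))
    if errors:
        for message in errors:
            print("ERROR:", message)
    else:
        print("Environment looks good")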