Add local LLM router and service
Some checks failed
CI / rust (push) Has been cancelled
CI / frontend (push) Has been cancelled
CI / python (push) Has been cancelled
Validate All / validate-all (push) Has been cancelled
Release / build (archlinux, ubuntu-24.04, x86_64-unknown-linux-gnu) (push) Has been cancelled
Release / build (deb, ubuntu-24.04, x86_64-unknown-linux-gnu) (push) Has been cancelled
Release / build (msi, windows-latest, x86_64-pc-windows-msvc) (push) Has been cancelled
Release / build (rpm, ubuntu-24.04, x86_64-unknown-linux-gnu) (push) Has been cancelled
backend/routers/local_llm.py (new file, 54 lines)
@@ -0,0 +1,54 @@
"""Local LLM endpoints for bundled Qwen3 inference."""

import logging
from typing import Optional

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from services.local_llm import get_status, download_model, complete

logger = logging.getLogger(__name__)

router = APIRouter()


class CompleteRequest(BaseModel):
    prompt: str
    model_id: str = "qwen3-1.7b"
    system_prompt: Optional[str] = None
    temperature: float = 0.3
    max_tokens: int = 2048


@router.get("/local-llm/status")
async def llm_status():
    try:
        return get_status()
    except Exception as e:
        logger.error(f"Local LLM status failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/local-llm/download")
async def llm_download(model_id: str = "qwen3-1.7b"):
    try:
        return download_model(model_id)
    except Exception as e:
        logger.error(f"Local LLM download failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/local-llm/complete")
async def llm_complete(req: CompleteRequest):
    try:
        result = complete(
            prompt=req.prompt,
            model_id=req.model_id,
            system_prompt=req.system_prompt,
            temperature=req.temperature,
            max_tokens=req.max_tokens,
        )
        return {"response": result}
    except Exception as e:
        logger.error(f"Local LLM completion failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
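Reviewer note: how this router is mounted into the FastAPI app (prefix, host, port) is not part of this diff, so the base URL below is an assumption. A rough sketch of exercising the three endpoints, assuming the backend listens on http://localhost:8000 and the router is included without an extra prefix:

# Reviewer sketch, not part of this commit: calling the new endpoints.
# BASE is an assumption -- adjust to the real app's mount prefix and port.
import httpx

BASE = "http://localhost:8000"

# Check whether llama-cpp-python is importable and which GGUF files exist.
status = httpx.get(f"{BASE}/local-llm/status").json()
print(status["llama_cpp_available"], status["models"])

# Trigger a download of the default model; this blocks until it finishes,
# so give the request an effectively unlimited timeout.
httpx.post(f"{BASE}/local-llm/download", params={"model_id": "qwen3-1.7b"}, timeout=None)

# Run a completion against the bundled model (local inference can be slow).
resp = httpx.post(
    f"{BASE}/local-llm/complete",
    json={"prompt": "Summarise this transcript in one sentence.", "temperature": 0.2},
    timeout=300.0,
)
print(resp.json()["response"])

Note that llm_download runs the download synchronously inside an async handler (subprocess.run below is blocking), so the event loop is tied up for the duration of the download; clients should expect that call to take minutes on a cold cache.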
backend/services/local_llm.py (new file, 125 lines)
@@ -0,0 +1,125 @@
"""
Local LLM inference using llama.cpp via llama-cpp-python.
Handles model download from HuggingFace and text completion.
"""

import json
import logging
import os
import subprocess
import sys
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

LOCAL_MODELS_DIR = Path.home() / ".cache" / "talkedit" / "models"
QWEN_MODELS = {
    "qwen3-1.7b": {
        "repo": "Qwen/Qwen3-1.7B-Instruct-GGUF",
        "file": "qwen3-1.7b-instruct-q4_k_m.gguf",
        "size_gb": 1.0,
    },
    "qwen3-4b": {
        "repo": "Qwen/Qwen3-4B-Instruct-GGUF",
        "file": "qwen3-4b-instruct-q4_k_m.gguf",
        "size_gb": 2.5,
    },
}


def _ensure_llama_cpp() -> bool:
    try:
        from llama_cpp import Llama
        return True
    except ImportError:
        return False


def _model_path(model_id: str) -> Path:
    info = QWEN_MODELS.get(model_id)
    if not info:
        raise ValueError(f"Unknown model: {model_id}")
    return LOCAL_MODELS_DIR / model_id / info["file"]


def get_status() -> dict:
    """Check status of local LLM setup."""
    llama_available = _ensure_llama_cpp()
    models = {}
    for model_id in QWEN_MODELS:
        path = _model_path(model_id)
        models[model_id] = {
            "downloaded": path.exists(),
            "size_bytes": path.stat().st_size if path.exists() else 0,
            "total_gb": QWEN_MODELS[model_id]["size_gb"],
        }

    return {
        "llama_cpp_available": llama_available,
        "models": models,
        "models_dir": str(LOCAL_MODELS_DIR),
    }


def download_model(model_id: str) -> dict:
    """Download a Qwen3 GGUF model from HuggingFace."""
    info = QWEN_MODELS.get(model_id)
    if not info:
        raise ValueError(f"Unknown model: {model_id}")

    model_dir = LOCAL_MODELS_DIR / model_id
    model_dir.mkdir(parents=True, exist_ok=True)
    output_path = model_dir / info["file"]

    if output_path.exists():
        return {"status": "already_downloaded", "path": str(output_path)}

    logger.info(f"Downloading {info['repo']}/{info['file']} ({info['size_gb']} GB)...")
    subprocess.run([
        sys.executable, "-m", "huggingface_hub", "download",
        info["repo"], info["file"],
        "--local-dir", str(model_dir),
        "--local-dir-use-symlinks", "False",
    ], check=True)

    if not output_path.exists():
        raise RuntimeError(f"Download failed: {output_path} not found")

    return {"status": "downloaded", "path": str(output_path)}


def complete(
    prompt: str,
    model_id: str = "qwen3-1.7b",
    system_prompt: Optional[str] = None,
    temperature: float = 0.3,
    max_tokens: int = 2048,
) -> str:
    """Run inference using a local Qwen3 model."""
    model_path = _model_path(model_id)
    if not model_path.exists():
        raise RuntimeError(f"Model not downloaded: {model_id}")

    from llama_cpp import Llama

    llm = Llama(
        model_path=str(model_path),
        n_ctx=4096,
        n_threads=4,
        n_gpu_layers=-1,
        verbose=False,
    )

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    response = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )

    return response["choices"][0]["message"]["content"].strip()
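Reviewer note: complete() constructs a fresh Llama instance on every call, so each request pays the full cost of reloading the GGUF file. If repeated completions turn out to be slow, one option is a small per-model cache inside this module. The sketch below is a suggestion, not part of the commit; _get_llm is a hypothetical helper whose constructor arguments simply mirror the ones already used in complete():

# Reviewer sketch, not part of this commit: cache one Llama instance per
# model so repeated /local-llm/complete calls skip the model reload.
# _get_llm is a hypothetical helper name; it reuses _model_path() above.
from functools import lru_cache


@lru_cache(maxsize=2)
def _get_llm(model_id: str):
    from llama_cpp import Llama

    model_path = _model_path(model_id)
    if not model_path.exists():
        raise RuntimeError(f"Model not downloaded: {model_id}")
    return Llama(
        model_path=str(model_path),
        n_ctx=4096,
        n_threads=4,
        n_gpu_layers=-1,
        verbose=False,
    )

complete() would then obtain the instance via _get_llm(model_id) instead of building Llama inline; the trade-off is that the cached model stays resident in memory for the lifetime of the process.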