From 124f215a0a3ea81e29e8e7ec02fc21127db5c902 Mon Sep 17 00:00:00 2001
From: dillonj
Date: Thu, 7 May 2026 01:32:19 -0600
Subject: [PATCH] Add local LLM router and service

---
 backend/routers/local_llm.py  |  54 +++++++++++++++
 backend/services/local_llm.py | 125 ++++++++++++++++++++++++++++++++++
 2 files changed, 179 insertions(+)
 create mode 100644 backend/routers/local_llm.py
 create mode 100644 backend/services/local_llm.py

diff --git a/backend/routers/local_llm.py b/backend/routers/local_llm.py
new file mode 100644
index 0000000..d3e99e3
--- /dev/null
+++ b/backend/routers/local_llm.py
@@ -0,0 +1,54 @@
+"""Local LLM endpoints for bundled Qwen3 inference."""
+
+import logging
+from typing import Optional
+
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel
+
+from services.local_llm import get_status, download_model, complete
+
+logger = logging.getLogger(__name__)
+router = APIRouter()
+
+
+class CompleteRequest(BaseModel):
+    prompt: str
+    model_id: str = "qwen3-1.7b"
+    system_prompt: Optional[str] = None
+    temperature: float = 0.3
+    max_tokens: int = 2048
+
+
+@router.get("/local-llm/status")
+async def llm_status():
+    try:
+        return get_status()
+    except Exception as e:
+        logger.error(f"Local LLM status failed: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/local-llm/download")
+async def llm_download(model_id: str = "qwen3-1.7b"):
+    try:
+        return download_model(model_id)
+    except Exception as e:
+        logger.error(f"Local LLM download failed: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/local-llm/complete")
+async def llm_complete(req: CompleteRequest):
+    try:
+        result = complete(
+            prompt=req.prompt,
+            model_id=req.model_id,
+            system_prompt=req.system_prompt,
+            temperature=req.temperature,
+            max_tokens=req.max_tokens,
+        )
+        return {"response": result}
+    except Exception as e:
+        logger.error(f"Local LLM completion failed: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
diff --git a/backend/services/local_llm.py b/backend/services/local_llm.py
new file mode 100644
index 0000000..728584b
--- /dev/null
+++ b/backend/services/local_llm.py
@@ -0,0 +1,125 @@
+"""
+Local LLM inference using llama.cpp via llama-cpp-python.
+Handles model download from HuggingFace and text completion.
+"""
+
+import json
+import logging
+import os
+import subprocess
+import sys
+from pathlib import Path
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+LOCAL_MODELS_DIR = Path.home() / ".cache" / "talkedit" / "models"
+QWEN_MODELS = {
+    "qwen3-1.7b": {
+        "repo": "Qwen/Qwen3-1.7B-Instruct-GGUF",
+        "file": "qwen3-1.7b-instruct-q4_k_m.gguf",
+        "size_gb": 1.0,
+    },
+    "qwen3-4b": {
+        "repo": "Qwen/Qwen3-4B-Instruct-GGUF",
+        "file": "qwen3-4b-instruct-q4_k_m.gguf",
+        "size_gb": 2.5,
+    },
+}
+
+
+def _ensure_llama_cpp() -> bool:
+    try:
+        from llama_cpp import Llama
+        return True
+    except ImportError:
+        return False
+
+
+def _model_path(model_id: str) -> Path:
+    info = QWEN_MODELS.get(model_id)
+    if not info:
+        raise ValueError(f"Unknown model: {model_id}")
+    return LOCAL_MODELS_DIR / model_id / info["file"]
+
+
+def get_status() -> dict:
+    """Check status of local LLM setup."""
+    llama_available = _ensure_llama_cpp()
+    models = {}
+    for model_id in QWEN_MODELS:
+        path = _model_path(model_id)
+        models[model_id] = {
+            "downloaded": path.exists(),
+            "size_bytes": path.stat().st_size if path.exists() else 0,
+            "total_gb": QWEN_MODELS[model_id]["size_gb"],
+        }
+
+    return {
+        "llama_cpp_available": llama_available,
+        "models": models,
+        "models_dir": str(LOCAL_MODELS_DIR),
+    }
+
+
+def download_model(model_id: str) -> dict:
+    """Download a Qwen3 GGUF model from HuggingFace."""
+    info = QWEN_MODELS.get(model_id)
+    if not info:
+        raise ValueError(f"Unknown model: {model_id}")
+
+    model_dir = LOCAL_MODELS_DIR / model_id
+    model_dir.mkdir(parents=True, exist_ok=True)
+    output_path = model_dir / info["file"]
+
+    if output_path.exists():
+        return {"status": "already_downloaded", "path": str(output_path)}
+
+    logger.info(f"Downloading {info['repo']}/{info['file']} ({info['size_gb']} GB)...")
+    subprocess.run([
+        sys.executable, "-m", "huggingface_hub", "download",
+        info["repo"], info["file"],
+        "--local-dir", str(model_dir),
+        "--local-dir-use-symlinks", "False",
+    ], check=True)
+
+    if not output_path.exists():
+        raise RuntimeError(f"Download failed: {output_path} not found")
+
+    return {"status": "downloaded", "path": str(output_path)}
+
+
+def complete(
+    prompt: str,
+    model_id: str = "qwen3-1.7b",
+    system_prompt: Optional[str] = None,
+    temperature: float = 0.3,
+    max_tokens: int = 2048,
+) -> str:
+    """Run inference using a local Qwen3 model."""
+    model_path = _model_path(model_id)
+    if not model_path.exists():
+        raise RuntimeError(f"Model not downloaded: {model_id}")
+
+    from llama_cpp import Llama
+
+    llm = Llama(
+        model_path=str(model_path),
+        n_ctx=4096,
+        n_threads=4,
+        n_gpu_layers=-1,
+        verbose=False,
+    )
+
+    messages = []
+    if system_prompt:
+        messages.append({"role": "system", "content": system_prompt})
+    messages.append({"role": "user", "content": prompt})
+
+    response = llm.create_chat_completion(
+        messages=messages,
+        temperature=temperature,
+        max_tokens=max_tokens,
+    )
+
+    return response["choices"][0]["message"]["content"].strip()