Add local LLM router and service
backend/services/local_llm.py (new file, 125 lines)
@@ -0,0 +1,125 @@
"""
Local LLM inference using llama.cpp via llama-cpp-python.
Handles model download from HuggingFace and text completion.
"""

import json
import logging
import os
import subprocess
import sys
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

LOCAL_MODELS_DIR = Path.home() / ".cache" / "talkedit" / "models"
QWEN_MODELS = {
    "qwen3-1.7b": {
        "repo": "Qwen/Qwen3-1.7B-Instruct-GGUF",
        "file": "qwen3-1.7b-instruct-q4_k_m.gguf",
        "size_gb": 1.0,
    },
    "qwen3-4b": {
        "repo": "Qwen/Qwen3-4B-Instruct-GGUF",
        "file": "qwen3-4b-instruct-q4_k_m.gguf",
        "size_gb": 2.5,
    },
}


def _ensure_llama_cpp() -> bool:
    try:
        from llama_cpp import Llama
        return True
    except ImportError:
        return False


def _model_path(model_id: str) -> Path:
    info = QWEN_MODELS.get(model_id)
    if not info:
        raise ValueError(f"Unknown model: {model_id}")
    return LOCAL_MODELS_DIR / model_id / info["file"]


def get_status() -> dict:
    """Check status of local LLM setup."""
    llama_available = _ensure_llama_cpp()
    models = {}
    for model_id in QWEN_MODELS:
        path = _model_path(model_id)
        models[model_id] = {
            "downloaded": path.exists(),
            "size_bytes": path.stat().st_size if path.exists() else 0,
            "total_gb": QWEN_MODELS[model_id]["size_gb"],
        }

    return {
        "llama_cpp_available": llama_available,
        "models": models,
        "models_dir": str(LOCAL_MODELS_DIR),
    }


def download_model(model_id: str) -> dict:
    """Download a Qwen3 GGUF model from HuggingFace."""
    info = QWEN_MODELS.get(model_id)
    if not info:
        raise ValueError(f"Unknown model: {model_id}")

    model_dir = LOCAL_MODELS_DIR / model_id
    model_dir.mkdir(parents=True, exist_ok=True)
    output_path = model_dir / info["file"]

    if output_path.exists():
        return {"status": "already_downloaded", "path": str(output_path)}

    logger.info(f"Downloading {info['repo']}/{info['file']} ({info['size_gb']} GB)...")
    subprocess.run([
        sys.executable, "-m", "huggingface_hub", "download",
        info["repo"], info["file"],
        "--local-dir", str(model_dir),
        "--local-dir-use-symlinks", "False",
    ], check=True)

    if not output_path.exists():
        raise RuntimeError(f"Download failed: {output_path} not found")

    return {"status": "downloaded", "path": str(output_path)}


def complete(
    prompt: str,
    model_id: str = "qwen3-1.7b",
    system_prompt: Optional[str] = None,
    temperature: float = 0.3,
    max_tokens: int = 2048,
) -> str:
    """Run inference using a local Qwen3 model."""
    model_path = _model_path(model_id)
    if not model_path.exists():
        raise RuntimeError(f"Model not downloaded: {model_id}")

    from llama_cpp import Llama

    llm = Llama(
        model_path=str(model_path),
        n_ctx=4096,
        n_threads=4,
        n_gpu_layers=-1,
        verbose=False,
    )

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    response = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )

    return response["choices"][0]["message"]["content"].strip()
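The commit title also mentions a router, but that file is not part of this excerpt. As a rough sketch only, a thin FastAPI router wrapping this service could look like the following; the framework choice, module path, endpoint names, and request model are assumptions for illustration, not code from this commit.

# backend/routers/local_llm.py -- hypothetical sketch, not shown in this diff
from typing import Optional

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from backend.services import local_llm  # assumed import path

router = APIRouter(prefix="/local-llm", tags=["local-llm"])


class CompletionRequest(BaseModel):
    prompt: str
    model_id: str = "qwen3-1.7b"
    system_prompt: Optional[str] = None
    temperature: float = 0.3
    max_tokens: int = 2048


@router.get("/status")
def status() -> dict:
    # Report llama-cpp availability and which GGUF files are on disk.
    return local_llm.get_status()


@router.post("/models/{model_id}/download")
def download(model_id: str) -> dict:
    # Fetch the GGUF file from HuggingFace; unknown model ids become 400s.
    try:
        return local_llm.download_model(model_id)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc))


@router.post("/complete")
def complete(req: CompletionRequest) -> dict:
    # Blocking local inference; a real router would likely offload this to a
    # worker thread, since the service reloads the model on every call.
    try:
        text = local_llm.complete(
            prompt=req.prompt,
            model_id=req.model_id,
            system_prompt=req.system_prompt,
            temperature=req.temperature,
            max_tokens=req.max_tokens,
        )
    except RuntimeError as exc:
        raise HTTPException(status_code=409, detail=str(exc))
    return {"text": text}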