From 124f215a0a3ea81e29e8e7ec02fc21127db5c902 Mon Sep 17 00:00:00 2001
From: dillonj
Date: Thu, 7 May 2026 01:32:19 -0600
Subject: [PATCH] Add local LLM router and service

---
 backend/routers/local_llm.py  |  54 +++++++++++++++
 backend/services/local_llm.py | 125 ++++++++++++++++++++++++++++++++++
 2 files changed, 179 insertions(+)
 create mode 100644 backend/routers/local_llm.py
 create mode 100644 backend/services/local_llm.py

diff --git a/backend/routers/local_llm.py b/backend/routers/local_llm.py
new file mode 100644
index 0000000..d3e99e3
--- /dev/null
+++ b/backend/routers/local_llm.py
@@ -0,0 +1,54 @@
+"""Local LLM endpoints for bundled Qwen3 inference."""
+
+import logging
+from typing import Optional
+
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel
+
+from services.local_llm import get_status, download_model, complete
+
+logger = logging.getLogger(__name__)
+router = APIRouter()
+
+
+class CompleteRequest(BaseModel):
+    prompt: str
+    model_id: str = "qwen3-1.7b"
+    system_prompt: Optional[str] = None
+    temperature: float = 0.3
+    max_tokens: int = 2048
+
+
+@router.get("/local-llm/status")
+async def llm_status():
+    try:
+        return get_status()
+    except Exception as e:
+        logger.error(f"Local LLM status failed: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/local-llm/download")
+async def llm_download(model_id: str = "qwen3-1.7b"):
+    try:
+        return download_model(model_id)
+    except Exception as e:
+        logger.error(f"Local LLM download failed: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/local-llm/complete")
+async def llm_complete(req: CompleteRequest):
+    try:
+        result = complete(
+            prompt=req.prompt,
+            model_id=req.model_id,
+            system_prompt=req.system_prompt,
+            temperature=req.temperature,
+            max_tokens=req.max_tokens,
+        )
+        return {"response": result}
+    except Exception as e:
+        logger.error(f"Local LLM completion failed: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
diff --git a/backend/services/local_llm.py b/backend/services/local_llm.py
new file mode 100644
index 0000000..728584b
--- /dev/null
+++ b/backend/services/local_llm.py
@@ -0,0 +1,125 @@
+"""
+Local LLM inference using llama.cpp via llama-cpp-python.
+Handles model download from HuggingFace and text completion.
+"""
+
+import json
+import logging
+import os
+import subprocess
+import sys
+from pathlib import Path
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+LOCAL_MODELS_DIR = Path.home() / ".cache" / "talkedit" / "models"
+QWEN_MODELS = {
+    "qwen3-1.7b": {
+        "repo": "Qwen/Qwen3-1.7B-Instruct-GGUF",
+        "file": "qwen3-1.7b-instruct-q4_k_m.gguf",
+        "size_gb": 1.0,
+    },
+    "qwen3-4b": {
+        "repo": "Qwen/Qwen3-4B-Instruct-GGUF",
+        "file": "qwen3-4b-instruct-q4_k_m.gguf",
+        "size_gb": 2.5,
+    },
+}
+
+
+def _ensure_llama_cpp() -> bool:
+    try:
+        from llama_cpp import Llama
+        return True
+    except ImportError:
+        return False
+
+
+def _model_path(model_id: str) -> Path:
+    info = QWEN_MODELS.get(model_id)
+    if not info:
+        raise ValueError(f"Unknown model: {model_id}")
+    return LOCAL_MODELS_DIR / model_id / info["file"]
+
+
+def get_status() -> dict:
+    """Check status of local LLM setup."""
+    llama_available = _ensure_llama_cpp()
+    models = {}
+    for model_id in QWEN_MODELS:
+        path = _model_path(model_id)
+        models[model_id] = {
+            "downloaded": path.exists(),
+            "size_bytes": path.stat().st_size if path.exists() else 0,
+            "total_gb": QWEN_MODELS[model_id]["size_gb"],
+        }
+
+    return {
+        "llama_cpp_available": llama_available,
+        "models": models,
+        "models_dir": str(LOCAL_MODELS_DIR),
+    }
+
+
+def download_model(model_id: str) -> dict:
+    """Download a Qwen3 GGUF model from HuggingFace."""
+    info = QWEN_MODELS.get(model_id)
+    if not info:
+        raise ValueError(f"Unknown model: {model_id}")
+
+    model_dir = LOCAL_MODELS_DIR / model_id
+    model_dir.mkdir(parents=True, exist_ok=True)
+    output_path = model_dir / info["file"]
+
+    if output_path.exists():
+        return {"status": "already_downloaded", "path": str(output_path)}
+
+    logger.info(f"Downloading {info['repo']}/{info['file']} ({info['size_gb']} GB)...")
+    subprocess.run([
+        sys.executable, "-m", "huggingface_hub", "download",
+        info["repo"], info["file"],
+        "--local-dir", str(model_dir),
+        "--local-dir-use-symlinks", "False",
+    ], check=True)
+
+    if not output_path.exists():
+        raise RuntimeError(f"Download failed: {output_path} not found")
+
+    return {"status": "downloaded", "path": str(output_path)}
+
+
+def complete(
+    prompt: str,
+    model_id: str = "qwen3-1.7b",
+    system_prompt: Optional[str] = None,
+    temperature: float = 0.3,
+    max_tokens: int = 2048,
+) -> str:
+    """Run inference using a local Qwen3 model."""
+    model_path = _model_path(model_id)
+    if not model_path.exists():
+        raise RuntimeError(f"Model not downloaded: {model_id}")
+
+    from llama_cpp import Llama
+
+    llm = Llama(
+        model_path=str(model_path),
+        n_ctx=4096,
+        n_threads=4,
+        n_gpu_layers=-1,
+        verbose=False,
+    )
+
+    messages = []
+    if system_prompt:
+        messages.append({"role": "system", "content": system_prompt})
+    messages.append({"role": "user", "content": prompt})
+
+    response = llm.create_chat_completion(
+        messages=messages,
+        temperature=temperature,
+        max_tokens=max_tokens,
+    )
+
+    return response["choices"][0]["message"]["content"].strip()