Add local LLM router and service
Some checks failed
CI / rust (push) Has been cancelled
CI / frontend (push) Has been cancelled
CI / python (push) Has been cancelled
Validate All / validate-all (push) Has been cancelled
Release / build (archlinux, ubuntu-24.04, x86_64-unknown-linux-gnu) (push) Has been cancelled
Release / build (deb, ubuntu-24.04, x86_64-unknown-linux-gnu) (push) Has been cancelled
Release / build (msi, windows-latest, x86_64-pc-windows-msvc) (push) Has been cancelled
Release / build (rpm, ubuntu-24.04, x86_64-unknown-linux-gnu) (push) Has been cancelled
backend/routers/local_llm.py (new file, 54 lines)
@@ -0,0 +1,54 @@
"""Local LLM endpoints for bundled Qwen3 inference."""

import logging
from typing import Optional

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from services.local_llm import get_status, download_model, complete

logger = logging.getLogger(__name__)

router = APIRouter()


class CompleteRequest(BaseModel):
    prompt: str
    model_id: str = "qwen3-1.7b"
    system_prompt: Optional[str] = None
    temperature: float = 0.3
    max_tokens: int = 2048


@router.get("/local-llm/status")
async def llm_status():
    try:
        return get_status()
    except Exception as e:
        logger.error(f"Local LLM status failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/local-llm/download")
async def llm_download(model_id: str = "qwen3-1.7b"):
    try:
        return download_model(model_id)
    except Exception as e:
        logger.error(f"Local LLM download failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/local-llm/complete")
async def llm_complete(req: CompleteRequest):
    try:
        result = complete(
            prompt=req.prompt,
            model_id=req.model_id,
            system_prompt=req.system_prompt,
            temperature=req.temperature,
            max_tokens=req.max_tokens,
        )
        return {"response": result}
    except Exception as e:
        logger.error(f"Local LLM completion failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
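Reviewer note: how this router is mounted into the FastAPI app (prefix, host, port) is not part of this diff, so the base URL below is an assumption. A rough sketch of exercising the three endpoints, assuming the backend listens on http://localhost:8000 and the router is included without an extra prefix:

# Reviewer sketch, not part of this commit: calling the new endpoints.
# BASE is an assumption -- adjust to the real app's mount prefix and port.
import httpx

BASE = "http://localhost:8000"

# Check whether llama-cpp-python is importable and which GGUF files exist.
status = httpx.get(f"{BASE}/local-llm/status").json()
print(status["llama_cpp_available"], status["models"])

# Trigger a download of the default model; this blocks until it finishes,
# so give the request an effectively unlimited timeout.
httpx.post(f"{BASE}/local-llm/download", params={"model_id": "qwen3-1.7b"}, timeout=None)

# Run a completion against the bundled model (local inference can be slow).
resp = httpx.post(
    f"{BASE}/local-llm/complete",
    json={"prompt": "Summarise this transcript in one sentence.", "temperature": 0.2},
    timeout=300.0,
)
print(resp.json()["response"])

Note that llm_download runs the download synchronously inside an async handler (subprocess.run below is blocking), so the event loop is tied up for the duration of the download; clients should expect that call to take minutes on a cold cache.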
backend/services/local_llm.py (new file, 125 lines)
@@ -0,0 +1,125 @@
"""
Local LLM inference using llama.cpp via llama-cpp-python.
Handles model download from HuggingFace and text completion.
"""

import json
import logging
import os
import subprocess
import sys
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

LOCAL_MODELS_DIR = Path.home() / ".cache" / "talkedit" / "models"
QWEN_MODELS = {
    "qwen3-1.7b": {
        "repo": "Qwen/Qwen3-1.7B-Instruct-GGUF",
        "file": "qwen3-1.7b-instruct-q4_k_m.gguf",
        "size_gb": 1.0,
    },
    "qwen3-4b": {
        "repo": "Qwen/Qwen3-4B-Instruct-GGUF",
        "file": "qwen3-4b-instruct-q4_k_m.gguf",
        "size_gb": 2.5,
    },
}


def _ensure_llama_cpp() -> bool:
    try:
        from llama_cpp import Llama
        return True
    except ImportError:
        return False


def _model_path(model_id: str) -> Path:
    info = QWEN_MODELS.get(model_id)
    if not info:
        raise ValueError(f"Unknown model: {model_id}")
    return LOCAL_MODELS_DIR / model_id / info["file"]


def get_status() -> dict:
    """Check status of local LLM setup."""
    llama_available = _ensure_llama_cpp()
    models = {}
    for model_id in QWEN_MODELS:
        path = _model_path(model_id)
        models[model_id] = {
            "downloaded": path.exists(),
            "size_bytes": path.stat().st_size if path.exists() else 0,
            "total_gb": QWEN_MODELS[model_id]["size_gb"],
        }

    return {
        "llama_cpp_available": llama_available,
        "models": models,
        "models_dir": str(LOCAL_MODELS_DIR),
    }


def download_model(model_id: str) -> dict:
    """Download a Qwen3 GGUF model from HuggingFace."""
    info = QWEN_MODELS.get(model_id)
    if not info:
        raise ValueError(f"Unknown model: {model_id}")

    model_dir = LOCAL_MODELS_DIR / model_id
    model_dir.mkdir(parents=True, exist_ok=True)
    output_path = model_dir / info["file"]

    if output_path.exists():
        return {"status": "already_downloaded", "path": str(output_path)}

    logger.info(f"Downloading {info['repo']}/{info['file']} ({info['size_gb']} GB)...")
    subprocess.run([
        sys.executable, "-m", "huggingface_hub", "download",
        info["repo"], info["file"],
        "--local-dir", str(model_dir),
        "--local-dir-use-symlinks", "False",
    ], check=True)

    if not output_path.exists():
        raise RuntimeError(f"Download failed: {output_path} not found")

    return {"status": "downloaded", "path": str(output_path)}


def complete(
    prompt: str,
    model_id: str = "qwen3-1.7b",
    system_prompt: Optional[str] = None,
    temperature: float = 0.3,
    max_tokens: int = 2048,
) -> str:
    """Run inference using a local Qwen3 model."""
    model_path = _model_path(model_id)
    if not model_path.exists():
        raise RuntimeError(f"Model not downloaded: {model_id}")

    from llama_cpp import Llama

    llm = Llama(
        model_path=str(model_path),
        n_ctx=4096,
        n_threads=4,
        n_gpu_layers=-1,
        verbose=False,
    )

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    response = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )

    return response["choices"][0]["message"]["content"].strip()
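Reviewer note: complete() constructs a fresh Llama instance on every call, so each request pays the full cost of reloading the GGUF file. If repeated completions turn out to be slow, one option is a small per-model cache inside this module. The sketch below is a suggestion, not part of the commit; _get_llm is a hypothetical helper whose constructor arguments simply mirror the ones already used in complete():

# Reviewer sketch, not part of this commit: cache one Llama instance per
# model so repeated /local-llm/complete calls skip the model reload.
# _get_llm is a hypothetical helper name; it reuses _model_path() above.
from functools import lru_cache


@lru_cache(maxsize=2)
def _get_llm(model_id: str):
    from llama_cpp import Llama

    model_path = _model_path(model_id)
    if not model_path.exists():
        raise RuntimeError(f"Model not downloaded: {model_id}")
    return Llama(
        model_path=str(model_path),
        n_ctx=4096,
        n_threads=4,
        n_gpu_layers=-1,
        verbose=False,
    )

complete() would then obtain the instance via _get_llm(model_id) instead of building Llama inline; the trade-off is that the cached model stays resident in memory for the lifetime of the process.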