Add local LLM router and service
backend/services/local_llm.py (new file, 125 lines)
@@ -0,0 +1,125 @@
"""
Local LLM inference using llama.cpp via llama-cpp-python.
Handles model download from HuggingFace and text completion.
"""

import json
import logging
import os
import subprocess
import sys
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

LOCAL_MODELS_DIR = Path.home() / ".cache" / "talkedit" / "models"
QWEN_MODELS = {
    "qwen3-1.7b": {
        "repo": "Qwen/Qwen3-1.7B-Instruct-GGUF",
        "file": "qwen3-1.7b-instruct-q4_k_m.gguf",
        "size_gb": 1.0,
    },
    "qwen3-4b": {
        "repo": "Qwen/Qwen3-4B-Instruct-GGUF",
        "file": "qwen3-4b-instruct-q4_k_m.gguf",
        "size_gb": 2.5,
    },
}


def _ensure_llama_cpp() -> bool:
    try:
        from llama_cpp import Llama
        return True
    except ImportError:
        return False


def _model_path(model_id: str) -> Path:
    info = QWEN_MODELS.get(model_id)
    if not info:
        raise ValueError(f"Unknown model: {model_id}")
    return LOCAL_MODELS_DIR / model_id / info["file"]


def get_status() -> dict:
    """Check status of local LLM setup."""
    llama_available = _ensure_llama_cpp()
    models = {}
    for model_id in QWEN_MODELS:
        path = _model_path(model_id)
        models[model_id] = {
            "downloaded": path.exists(),
            "size_bytes": path.stat().st_size if path.exists() else 0,
            "total_gb": QWEN_MODELS[model_id]["size_gb"],
        }

    return {
        "llama_cpp_available": llama_available,
        "models": models,
        "models_dir": str(LOCAL_MODELS_DIR),
    }


def download_model(model_id: str) -> dict:
    """Download a Qwen3 GGUF model from HuggingFace."""
    info = QWEN_MODELS.get(model_id)
    if not info:
        raise ValueError(f"Unknown model: {model_id}")

    model_dir = LOCAL_MODELS_DIR / model_id
    model_dir.mkdir(parents=True, exist_ok=True)
    output_path = model_dir / info["file"]

    if output_path.exists():
        return {"status": "already_downloaded", "path": str(output_path)}

    logger.info(f"Downloading {info['repo']}/{info['file']} ({info['size_gb']} GB)...")
    subprocess.run([
        sys.executable, "-m", "huggingface_hub", "download",
        info["repo"], info["file"],
        "--local-dir", str(model_dir),
        "--local-dir-use-symlinks", "False",
    ], check=True)

    if not output_path.exists():
        raise RuntimeError(f"Download failed: {output_path} not found")

    return {"status": "downloaded", "path": str(output_path)}


def complete(
    prompt: str,
    model_id: str = "qwen3-1.7b",
    system_prompt: Optional[str] = None,
    temperature: float = 0.3,
    max_tokens: int = 2048,
) -> str:
    """Run inference using a local Qwen3 model."""
    model_path = _model_path(model_id)
    if not model_path.exists():
        raise RuntimeError(f"Model not downloaded: {model_id}")

    from llama_cpp import Llama

    llm = Llama(
        model_path=str(model_path),
        n_ctx=4096,
        n_threads=4,
        n_gpu_layers=-1,
        verbose=False,
    )

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    response = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )

    return response["choices"][0]["message"]["content"].strip()
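The commit title also mentions a router, but that file is not part of this excerpt. As a rough sketch only, a thin FastAPI router wrapping this service could look like the following; the framework choice, module path, endpoint names, and request model are assumptions for illustration, not code from this commit.

# backend/routers/local_llm.py -- hypothetical sketch, not shown in this diff
from typing import Optional

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from backend.services import local_llm  # assumed import path

router = APIRouter(prefix="/local-llm", tags=["local-llm"])


class CompletionRequest(BaseModel):
    prompt: str
    model_id: str = "qwen3-1.7b"
    system_prompt: Optional[str] = None
    temperature: float = 0.3
    max_tokens: int = 2048


@router.get("/status")
def status() -> dict:
    # Report llama-cpp availability and which GGUF files are on disk.
    return local_llm.get_status()


@router.post("/models/{model_id}/download")
def download(model_id: str) -> dict:
    # Fetch the GGUF file from HuggingFace; unknown model ids become 400s.
    try:
        return local_llm.download_model(model_id)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc))


@router.post("/complete")
def complete(req: CompletionRequest) -> dict:
    # Blocking local inference; a real router would likely offload this to a
    # worker thread, since the service reloads the model on every call.
    try:
        text = local_llm.complete(
            prompt=req.prompt,
            model_id=req.model_id,
            system_prompt=req.system_prompt,
            temperature=req.temperature,
            max_tokens=req.max_tokens,
        )
    except RuntimeError as exc:
        raise HTTPException(status_code=409, detail=str(exc))
    return {"text": text}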