TalkEdit/backend/services/local_llm.py
"""
Local LLM inference using llama.cpp via llama-cpp-python.
Handles model download from HuggingFace and text completion.
"""
import json
import logging
import os
import subprocess
import sys
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

LOCAL_MODELS_DIR = Path.home() / ".cache" / "talkedit" / "models"
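
# Known Qwen3 GGUF checkpoints (4-bit K-quant builds) with approximate download sizes.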
QWEN_MODELS = {
    "qwen3-1.7b": {
        "repo": "Qwen/Qwen3-1.7B-Instruct-GGUF",
        "file": "qwen3-1.7b-instruct-q4_k_m.gguf",
        "size_gb": 1.0,
    },
    "qwen3-4b": {
        "repo": "Qwen/Qwen3-4B-Instruct-GGUF",
        "file": "qwen3-4b-instruct-q4_k_m.gguf",
        "size_gb": 2.5,
    },
}


def _ensure_llama_cpp() -> bool:
    """Return True if llama-cpp-python is importable."""
    try:
        from llama_cpp import Llama  # noqa: F401 (import check only)
        return True
    except ImportError:
        return False


def _model_path(model_id: str) -> Path:
    """Return the expected on-disk path of a known model's GGUF file."""
    info = QWEN_MODELS.get(model_id)
    if not info:
        raise ValueError(f"Unknown model: {model_id}")
    return LOCAL_MODELS_DIR / model_id / info["file"]


def get_status() -> dict:
    """Check status of local LLM setup."""
    llama_available = _ensure_llama_cpp()
    models = {}
    for model_id in QWEN_MODELS:
        path = _model_path(model_id)
        models[model_id] = {
            "downloaded": path.exists(),
            "size_bytes": path.stat().st_size if path.exists() else 0,
            "total_gb": QWEN_MODELS[model_id]["size_gb"],
        }
    return {
        "llama_cpp_available": llama_available,
        "models": models,
        "models_dir": str(LOCAL_MODELS_DIR),
    }


def download_model(model_id: str) -> dict:
    """Download a Qwen3 GGUF model from HuggingFace."""
    info = QWEN_MODELS.get(model_id)
    if not info:
        raise ValueError(f"Unknown model: {model_id}")
    model_dir = LOCAL_MODELS_DIR / model_id
    model_dir.mkdir(parents=True, exist_ok=True)
    output_path = model_dir / info["file"]
    if output_path.exists():
        return {"status": "already_downloaded", "path": str(output_path)}
    logger.info(f"Downloading {info['repo']}/{info['file']} ({info['size_gb']} GB)...")
    subprocess.run([
        sys.executable, "-m", "huggingface_hub", "download",
        info["repo"], info["file"],
        "--local-dir", str(model_dir),
        "--local-dir-use-symlinks", "False",
    ], check=True)
    if not output_path.exists():
        raise RuntimeError(f"Download failed: {output_path} not found")
    return {"status": "downloaded", "path": str(output_path)}


def complete(
    prompt: str,
    model_id: str = "qwen3-1.7b",
    system_prompt: Optional[str] = None,
    temperature: float = 0.3,
    max_tokens: int = 2048,
) -> str:
    """Run inference using a local Qwen3 model."""
    model_path = _model_path(model_id)
    if not model_path.exists():
        raise RuntimeError(f"Model not downloaded: {model_id}")
    from llama_cpp import Llama

    # The model is loaded fresh on every call; n_gpu_layers=-1 offloads all
    # layers to the GPU when llama.cpp was built with GPU support and is a
    # no-op on CPU-only builds.
    llm = Llama(
        model_path=str(model_path),
        n_ctx=4096,
        n_threads=4,
        n_gpu_layers=-1,
        verbose=False,
    )
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})
    response = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return response["choices"][0]["message"]["content"].strip()