"""
Local LLM inference using llama.cpp via llama-cpp-python.

Handles model download from HuggingFace and text completion.
"""

import logging
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

LOCAL_MODELS_DIR = Path.home() / ".cache" / "talkedit" / "models"
QWEN_MODELS = {
    "qwen3-1.7b": {
        "repo": "Qwen/Qwen3-1.7B-Instruct-GGUF",
        "file": "qwen3-1.7b-instruct-q4_k_m.gguf",
        "size_gb": 1.0,
    },
    "qwen3-4b": {
        "repo": "Qwen/Qwen3-4B-Instruct-GGUF",
        "file": "qwen3-4b-instruct-q4_k_m.gguf",
        "size_gb": 2.5,
    },
}


def _ensure_llama_cpp() -> bool:
    """Return True if llama-cpp-python is importable."""
    try:
        from llama_cpp import Llama  # noqa: F401
        return True
    except ImportError:
        return False


def _model_path(model_id: str) -> Path:
    """Return the expected local path of a model's GGUF file."""
    info = QWEN_MODELS.get(model_id)
    if not info:
        raise ValueError(f"Unknown model: {model_id}")
    return LOCAL_MODELS_DIR / model_id / info["file"]


def get_status() -> dict:
    """Check status of local LLM setup."""
    llama_available = _ensure_llama_cpp()
    models = {}
    for model_id in QWEN_MODELS:
        path = _model_path(model_id)
        models[model_id] = {
            "downloaded": path.exists(),
            "size_bytes": path.stat().st_size if path.exists() else 0,
            "total_gb": QWEN_MODELS[model_id]["size_gb"],
        }

    return {
        "llama_cpp_available": llama_available,
        "models": models,
        "models_dir": str(LOCAL_MODELS_DIR),
    }


def download_model(model_id: str) -> dict:
    """Download a Qwen3 GGUF model from HuggingFace."""
    info = QWEN_MODELS.get(model_id)
    if not info:
        raise ValueError(f"Unknown model: {model_id}")

    model_dir = LOCAL_MODELS_DIR / model_id
    model_dir.mkdir(parents=True, exist_ok=True)
    output_path = model_dir / info["file"]

    if output_path.exists():
        return {"status": "already_downloaded", "path": str(output_path)}

    logger.info(f"Downloading {info['repo']}/{info['file']} ({info['size_gb']} GB)...")
    # Use the huggingface_hub Python API directly rather than shelling out:
    # the package has no `python -m huggingface_hub` entry point, so a
    # subprocess invocation of it fails. Imported lazily so get_status()
    # works without huggingface_hub installed.
    from huggingface_hub import hf_hub_download

    hf_hub_download(
        repo_id=info["repo"],
        filename=info["file"],
        local_dir=str(model_dir),
    )

    if not output_path.exists():
        raise RuntimeError(f"Download failed: {output_path} not found")

    return {"status": "downloaded", "path": str(output_path)}


def complete(
    prompt: str,
    model_id: str = "qwen3-1.7b",
    system_prompt: Optional[str] = None,
    temperature: float = 0.3,
    max_tokens: int = 2048,
) -> str:
    """Run inference using a local Qwen3 model."""
    model_path = _model_path(model_id)
    if not model_path.exists():
        raise RuntimeError(f"Model not downloaded: {model_id}")

    from llama_cpp import Llama

    # Note: the model is loaded from disk on every call; cache the Llama
    # instance if you call complete() repeatedly.
    llm = Llama(
        model_path=str(model_path),
        n_ctx=4096,
        n_threads=4,
        n_gpu_layers=-1,  # offload all layers to GPU when one is available
        verbose=False,
    )

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    response = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )

    return response["choices"][0]["message"]["content"].strip()
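

if __name__ == "__main__":
    # Minimal manual smoke test: print setup status, fetch the small model
    # if missing, then run a one-off completion. Illustrative only: it
    # assumes network access for the download, a working llama-cpp-python
    # install, and enough RAM for the 1.7B model; the prompt is arbitrary.
    import json

    logging.basicConfig(level=logging.INFO)
    print(json.dumps(get_status(), indent=2))
    download_model("qwen3-1.7b")
    print(complete("Say hello in five words.", system_prompt="Be concise."))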