""" Local LLM inference using llama.cpp via llama-cpp-python. Handles model download from HuggingFace and text completion. """ import json import logging import os import subprocess import sys from pathlib import Path from typing import Optional logger = logging.getLogger(__name__) LOCAL_MODELS_DIR = Path.home() / ".cache" / "talkedit" / "models" QWEN_MODELS = { "qwen3-1.7b": { "repo": "Qwen/Qwen3-1.7B-Instruct-GGUF", "file": "qwen3-1.7b-instruct-q4_k_m.gguf", "size_gb": 1.0, }, "qwen3-4b": { "repo": "Qwen/Qwen3-4B-Instruct-GGUF", "file": "qwen3-4b-instruct-q4_k_m.gguf", "size_gb": 2.5, }, } def _ensure_llama_cpp() -> bool: try: from llama_cpp import Llama return True except ImportError: return False def _model_path(model_id: str) -> Path: info = QWEN_MODELS.get(model_id) if not info: raise ValueError(f"Unknown model: {model_id}") return LOCAL_MODELS_DIR / model_id / info["file"] def get_status() -> dict: """Check status of local LLM setup.""" llama_available = _ensure_llama_cpp() models = {} for model_id in QWEN_MODELS: path = _model_path(model_id) models[model_id] = { "downloaded": path.exists(), "size_bytes": path.stat().st_size if path.exists() else 0, "total_gb": QWEN_MODELS[model_id]["size_gb"], } return { "llama_cpp_available": llama_available, "models": models, "models_dir": str(LOCAL_MODELS_DIR), } def download_model(model_id: str) -> dict: """Download a Qwen3 GGUF model from HuggingFace.""" info = QWEN_MODELS.get(model_id) if not info: raise ValueError(f"Unknown model: {model_id}") model_dir = LOCAL_MODELS_DIR / model_id model_dir.mkdir(parents=True, exist_ok=True) output_path = model_dir / info["file"] if output_path.exists(): return {"status": "already_downloaded", "path": str(output_path)} logger.info(f"Downloading {info['repo']}/{info['file']} ({info['size_gb']} GB)...") subprocess.run([ sys.executable, "-m", "huggingface_hub", "download", info["repo"], info["file"], "--local-dir", str(model_dir), "--local-dir-use-symlinks", "False", ], check=True) if not output_path.exists(): raise RuntimeError(f"Download failed: {output_path} not found") return {"status": "downloaded", "path": str(output_path)} def complete( prompt: str, model_id: str = "qwen3-1.7b", system_prompt: Optional[str] = None, temperature: float = 0.3, max_tokens: int = 2048, ) -> str: """Run inference using a local Qwen3 model.""" model_path = _model_path(model_id) if not model_path.exists(): raise RuntimeError(f"Model not downloaded: {model_id}") from llama_cpp import Llama llm = Llama( model_path=str(model_path), n_ctx=4096, n_threads=4, n_gpu_layers=-1, verbose=False, ) messages = [] if system_prompt: messages.append({"role": "system", "content": system_prompt}) messages.append({"role": "user", "content": prompt}) response = llm.create_chat_completion( messages=messages, temperature=temperature, max_tokens=max_tokens, ) return response["choices"][0]["message"]["content"].strip()