"""
Local LLM inference using llama.cpp via llama-cpp-python.

Handles model download from HuggingFace and text completion.
"""

import logging
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

LOCAL_MODELS_DIR = Path.home() / ".cache" / "talkedit" / "models"
QWEN_MODELS = {
    "qwen3-1.7b": {
        "repo": "Qwen/Qwen3-1.7B-Instruct-GGUF",
        "file": "qwen3-1.7b-instruct-q4_k_m.gguf",
        "size_gb": 1.0,
    },
    "qwen3-4b": {
        "repo": "Qwen/Qwen3-4B-Instruct-GGUF",
        "file": "qwen3-4b-instruct-q4_k_m.gguf",
        "size_gb": 2.5,
    },
}


def _ensure_llama_cpp() -> bool:
    """Return True if llama-cpp-python is importable."""
    try:
        from llama_cpp import Llama  # noqa: F401
        return True
    except ImportError:
        return False


def _model_path(model_id: str) -> Path:
    """Return the expected local path of a model's GGUF file."""
    info = QWEN_MODELS.get(model_id)
    if not info:
        raise ValueError(f"Unknown model: {model_id}")
    return LOCAL_MODELS_DIR / model_id / info["file"]


def get_status() -> dict:
    """Check status of local LLM setup."""
    llama_available = _ensure_llama_cpp()
    models = {}
    for model_id in QWEN_MODELS:
        path = _model_path(model_id)
        models[model_id] = {
            "downloaded": path.exists(),
            "size_bytes": path.stat().st_size if path.exists() else 0,
            "total_gb": QWEN_MODELS[model_id]["size_gb"],
        }

    return {
        "llama_cpp_available": llama_available,
        "models": models,
        "models_dir": str(LOCAL_MODELS_DIR),
    }


def download_model(model_id: str) -> dict:
    """Download a Qwen3 GGUF model from HuggingFace."""
    info = QWEN_MODELS.get(model_id)
    if not info:
        raise ValueError(f"Unknown model: {model_id}")

    model_dir = LOCAL_MODELS_DIR / model_id
    model_dir.mkdir(parents=True, exist_ok=True)
    output_path = model_dir / info["file"]

    if output_path.exists():
        return {"status": "already_downloaded", "path": str(output_path)}

    logger.info(f"Downloading {info['repo']}/{info['file']} ({info['size_gb']} GB)...")
    # Use the huggingface_hub Python API directly rather than shelling out:
    # the package has no `python -m huggingface_hub` entry point, so a
    # subprocess invocation of it fails. Imported lazily so get_status()
    # works without huggingface_hub installed.
    from huggingface_hub import hf_hub_download

    hf_hub_download(
        repo_id=info["repo"],
        filename=info["file"],
        local_dir=str(model_dir),
    )

    if not output_path.exists():
        raise RuntimeError(f"Download failed: {output_path} not found")

    return {"status": "downloaded", "path": str(output_path)}


def complete(
    prompt: str,
    model_id: str = "qwen3-1.7b",
    system_prompt: Optional[str] = None,
    temperature: float = 0.3,
    max_tokens: int = 2048,
) -> str:
    """Run inference using a local Qwen3 model."""
    model_path = _model_path(model_id)
    if not model_path.exists():
        raise RuntimeError(f"Model not downloaded: {model_id}")

    from llama_cpp import Llama

    # Note: the model is loaded from disk on every call; cache the Llama
    # instance if you call complete() repeatedly.
    llm = Llama(
        model_path=str(model_path),
        n_ctx=4096,
        n_threads=4,
        n_gpu_layers=-1,  # offload all layers to GPU when one is available
        verbose=False,
    )

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    response = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )

    return response["choices"][0]["message"]["content"].strip()
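

if __name__ == "__main__":
    # Minimal manual smoke test: print setup status, fetch the small model
    # if missing, then run a one-off completion. Illustrative only: it
    # assumes network access for the download, a working llama-cpp-python
    # install, and enough RAM for the 1.7B model; the prompt is arbitrary.
    import json

    logging.basicConfig(level=logging.INFO)
    print(json.dumps(get_status(), indent=2))
    download_model("qwen3-1.7b")
    print(complete("Say hello in five words.", system_prompt="Be concise."))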