"""
Local LLM inference using llama.cpp via llama-cpp-python.
Handles model download from HuggingFace and text completion.
"""

import json
import logging
import os
import subprocess
import sys
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

# Directory under the user's home where model files are looked up and stored;
# each model gets its own sub-directory named after its model id.
LOCAL_MODELS_DIR = Path.home().joinpath(".cache", "talkedit", "models")

# Registry of known models, keyed by the short model id used throughout this
# module. Each entry carries the HuggingFace repo, the GGUF file name inside
# that repo, and a size in GB used for logging and status reporting.
QWEN_MODELS = {
    "qwen3-1.7b": dict(
        repo="Qwen/Qwen3-1.7B-Instruct-GGUF",
        file="qwen3-1.7b-instruct-q4_k_m.gguf",
        size_gb=1.0,
    ),
    "qwen3-4b": dict(
        repo="Qwen/Qwen3-4B-Instruct-GGUF",
        file="qwen3-4b-instruct-q4_k_m.gguf",
        size_gb=2.5,
    ),
}


def _ensure_llama_cpp() -> bool:
    """Probe whether ``llama_cpp.Llama`` can be imported.

    Despite the name, nothing is installed or repaired here: the function only
    attempts the import and reports the outcome.

    Returns:
        True when the import succeeds, False when it raises ImportError.
    """
    try:
        # The imported name is not used; the import itself is the probe.
        from llama_cpp import Llama  # noqa: F401
    except ImportError:
        available = False
    else:
        available = True
    return available


def _model_path(model_id: str) -> Path:
    """Return the on-disk location of the GGUF file for ``model_id``.

    The path is ``LOCAL_MODELS_DIR / <model_id> / <file name from QWEN_MODELS>``;
    the file is not required to exist.

    Raises:
        ValueError: If ``model_id`` is not a key of QWEN_MODELS.
    """
    entry = QWEN_MODELS.get(model_id)
    if entry:
        return LOCAL_MODELS_DIR.joinpath(model_id, entry["file"])
    raise ValueError(f"Unknown model: {model_id}")


def get_status() -> dict:
    """Check status of local LLM setup.

    Returns:
        A dict with three keys:

        - ``"llama_cpp_available"``: result of the llama_cpp import probe.
        - ``"models"``: for every id in QWEN_MODELS, a dict holding
          ``"downloaded"`` (whether the model file is present),
          ``"size_bytes"`` (its on-disk size, 0 when absent) and
          ``"total_gb"`` (the size listed in QWEN_MODELS).
        - ``"models_dir"``: LOCAL_MODELS_DIR as a string.
    """
    llama_available = _ensure_llama_cpp()
    models = {}
    for model_id, info in QWEN_MODELS.items():
        # Stat the file exactly once instead of exists() + exists() + stat():
        # a file removed between those calls used to raise FileNotFoundError
        # out of what is meant to be a harmless status query.
        try:
            size_bytes = _model_path(model_id).stat().st_size
        except (FileNotFoundError, NotADirectoryError):
            downloaded = False
            size_bytes = 0
        else:
            downloaded = True
        models[model_id] = {
            "downloaded": downloaded,
            "size_bytes": size_bytes,
            "total_gb": info["size_gb"],
        }

    return {
        "llama_cpp_available": llama_available,
        "models": models,
        "models_dir": str(LOCAL_MODELS_DIR),
    }


def download_model(model_id: str) -> dict:
    """Download a Qwen3 GGUF model from HuggingFace.

    The file is fetched into ``LOCAL_MODELS_DIR / <model_id>`` by a child
    Python process, so ``huggingface_hub`` must be installed for the
    interpreter running this module but is never imported into it.

    Args:
        model_id: Key of QWEN_MODELS identifying the model to fetch.

    Returns:
        ``{"status": "already_downloaded", "path": ...}`` when the file is
        already present, otherwise ``{"status": "downloaded", "path": ...}``.

    Raises:
        ValueError: If ``model_id`` is not a known model.
        subprocess.CalledProcessError: If the download process exits non-zero.
        RuntimeError: If the process succeeds but the expected file is missing.
    """
    # Reuse the shared helper so the id validation and the path layout cannot
    # drift apart from what get_status() and complete() look at.
    output_path = _model_path(model_id)
    info = QWEN_MODELS[model_id]
    model_dir = output_path.parent
    model_dir.mkdir(parents=True, exist_ok=True)

    if output_path.exists():
        return {"status": "already_downloaded", "path": str(output_path)}

    logger.info(
        "Downloading %s/%s (%s GB)...",
        info["repo"], info["file"], info["size_gb"],
    )

    # `python -m huggingface_hub` depends on the package being directly
    # executable, and the `--local-dir-use-symlinks` CLI flag is deprecated.
    # Calling the documented hf_hub_download() API in the child interpreter
    # avoids both. Values travel via argv rather than being formatted into the
    # script text, so no quoting or escaping is involved.
    fetch_script = (
        "import sys\n"
        "from huggingface_hub import hf_hub_download\n"
        "hf_hub_download(repo_id=sys.argv[1], filename=sys.argv[2], "
        "local_dir=sys.argv[3])\n"
    )
    subprocess.run(
        [
            sys.executable, "-c", fetch_script,
            info["repo"], info["file"], str(model_dir),
        ],
        check=True,
    )

    if not output_path.exists():
        raise RuntimeError(f"Download failed: {output_path} not found")

    return {"status": "downloaded", "path": str(output_path)}


def complete(
    prompt: str,
    model_id: str = "qwen3-1.7b",
    system_prompt: Optional[str] = None,
    temperature: float = 0.3,
    max_tokens: int = 2048,
) -> str:
    """Run inference using a local Qwen3 model.

    Args:
        prompt: Text sent as the user message.
        model_id: Key of QWEN_MODELS selecting which downloaded model to use.
        system_prompt: Optional system message placed before the user message;
            omitted when None or empty.
        temperature: Sampling temperature forwarded to the chat completion.
        max_tokens: Generation limit forwarded to the chat completion.

    Returns:
        The assistant message text with surrounding whitespace stripped, or an
        empty string when the response carries no text content.

    Raises:
        ValueError: If ``model_id`` is not a known model.
        RuntimeError: If the model file has not been downloaded.
        ImportError: If llama-cpp-python is not installed.
    """
    model_path = _model_path(model_id)
    if not model_path.exists():
        raise RuntimeError(f"Model not downloaded: {model_id}")

    # Imported lazily so this module stays importable (e.g. for get_status)
    # when llama-cpp-python is not installed.
    from llama_cpp import Llama

    # NOTE(review): a new Llama instance is constructed on every call, which
    # presumably re-reads the model file each time. Consider caching per
    # model_path if callers invoke this repeatedly; confirm thread-safety of a
    # shared instance first.
    llm = Llama(
        model_path=str(model_path),
        n_ctx=4096,
        n_threads=4,
        # -1 requests offloading all layers to the GPU per the llama-cpp-python
        # docs (TODO confirm for the pinned version).
        n_gpu_layers=-1,
        verbose=False,
    )

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    response = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )

    # The message content may be None per the library's response type; fall
    # back to "" so the declared `str` return holds instead of raising
    # AttributeError on `.strip()`.
    content = response["choices"][0]["message"]["content"]
    return (content or "").strip()