# workbench/models/llama_cpp_python_service.py
# GitHub Actions
# Initial ZeroGPU deployment with spaces shim
# 7f9dfed
from __future__ import annotations
import importlib.util
from dataclasses import dataclass
from importlib import import_module
from pathlib import Path
from typing import Any
from models.base import BackendStatus
from models.model_catalog import ModelInfo
from models.response_parsing import extract_chat_response
@dataclass(frozen=True)
class LlamaCppPythonConfig:
    """Immutable bundle of settings used to run llama-cpp-python.

    Every field has a default, so ``LlamaCppPythonConfig()`` is a valid
    (though model-less) configuration. Instances are frozen: build a new
    one instead of mutating an existing one.
    """

    # Filesystem path of the GGUF model; empty means "not configured".
    model_path: str = ""
    # Forwarded as ``n_ctx`` when the Llama object is constructed.
    n_ctx: int = 4096
    # Forwarded as ``n_gpu_layers`` when the Llama object is constructed.
    n_gpu_layers: int = 0
    # Sampling temperature passed to each chat completion request.
    temperature: float = 0.7
    # ``max_tokens`` value passed to each chat completion request.
    max_tokens: int = 512
class LlamaCppPythonService:
    """Direct llama-cpp-python GGUF inference service.

    Runs chat completions in-process through the ``llama_cpp`` package.
    When the backend cannot be used, ``chat`` returns bracketed placeholder
    text describing the problem instead of raising.
    """

    def __init__(
        self,
        model: ModelInfo,
        config: LlamaCppPythonConfig | None = None,
    ) -> None:
        """Store the catalog entry and the runtime configuration.

        Args:
            model: Catalog entry this service is associated with. It is
                stored on the instance but not otherwise read by this class.
            config: Runtime settings; a default ``LlamaCppPythonConfig`` is
                used when omitted.
        """
        self.model = model
        self.config = config or LlamaCppPythonConfig()

    @staticmethod
    def status(model_path: str = "") -> BackendStatus:
        """Report whether the backend can run with ``model_path``.

        Checks, in order, that the ``llama_cpp`` package can be located
        (without importing it) and that ``model_path`` names an existing
        regular file.

        Args:
            model_path: Filesystem path of the GGUF model to check.

        Returns:
            A ``BackendStatus`` constructed with the backend name, an
            availability flag and a human-readable detail message.
        """
        if importlib.util.find_spec("llama_cpp") is None:
            return BackendStatus(
                "llama-cpp-python",
                False,
                "Python package llama-cpp-python is not installed in the current environment.",
            )
        problem = LlamaCppPythonService._model_path_problem(model_path)
        if problem is not None:
            return BackendStatus("llama-cpp-python", False, problem)
        return BackendStatus("llama-cpp-python", True, "llama-cpp-python is ready.")

    @staticmethod
    def _model_path_problem(model_path: str) -> str | None:
        """Return why ``model_path`` is unusable, or ``None`` if it is a file."""
        # Checked first because Path("") is the current directory, which
        # exists and would otherwise slip past the existence test below.
        if not model_path:
            return "llama-cpp-python is installed, but no GGUF model path is configured."
        candidate = Path(model_path)
        if not candidate.exists():
            return f"Configured GGUF model was not found: {model_path}"
        # A directory exists but cannot be loaded as a model; reject it here
        # rather than letting the Llama constructor fail later.
        if not candidate.is_file():
            return f"Configured GGUF model path is not a file: {model_path}"
        return None

    def chat(self, system_prompt: str, user_prompt: str) -> str:
        """Run one system+user chat turn and return the reply text.

        Args:
            system_prompt: Content of the ``system`` message.
            user_prompt: Content of the ``user`` message.

        Returns:
            The text extracted from the completion, or a bracketed
            ``[llama-cpp-python unavailable]`` notice when ``status`` reports
            the backend as unavailable.
        """
        status = self.status(self.config.model_path)
        if not status.available:
            return (
                "[llama-cpp-python unavailable]\n\n"
                f"{status.detail}\n\n"
                "Install llama-cpp-python and configure a local GGUF path before retrying."
            )
        # NOTE(review): a new Llama instance is constructed on every call;
        # confirm whether reloading per request is intended before caching.
        llama = self._load_llama()
        response = llama.create_chat_completion(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=self.config.temperature,
            max_tokens=self.config.max_tokens,
        )
        return self._extract_response(response)

    def vision_chat(self, has_image: bool, prompt: str, image: Any = None) -> str:
        """Answer ``prompt``, declining when an image is attached.

        Args:
            has_image: Whether the request carries an image.
            prompt: User text; sent through ``chat`` with an empty system
                prompt when there is no image.
            image: Accepted but never used by this backend.

        Returns:
            A bracketed ``[llama-cpp-python vision note]`` message when
            ``has_image`` is true, otherwise the result of ``chat``.
        """
        del image
        if has_image:
            return (
                "[llama-cpp-python vision note]\n\n"
                "Direct multimodal llama-cpp-python support requires model-specific mmproj "
                "wiring and image serialization. Use llama-server for the current vision path."
            )
        return self.chat("", prompt)

    def _load_llama(self) -> Any:
        """Construct a ``llama_cpp.Llama`` from the current configuration."""
        # Imported here rather than at module import time, so this module
        # can be loaded even when llama-cpp-python is not installed.
        llama_module = import_module("llama_cpp")
        llama_class = llama_module.Llama
        return llama_class(
            model_path=self.config.model_path,
            n_ctx=self.config.n_ctx,
            n_gpu_layers=self.config.n_gpu_layers,
            verbose=False,
        )

    @staticmethod
    def _extract_response(data: dict[str, Any]) -> str:
        """Delegate reply-text extraction to ``extract_chat_response``."""
        return extract_chat_response(data)