workbench / models /llama_cpp_service.py
GitHub Actions
Initial ZeroGPU deployment with spaces shim
7f9dfed
Raw
History Blame Contribute Delete
4.6 kB
from __future__ import annotations
import shutil
from collections.abc import Callable
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import requests
from models.base import BackendStatus
from models.model_catalog import ModelInfo
from models.response_parsing import extract_chat_response
@dataclass(frozen=True)
class LlamaCppConfig:
    """Immutable settings for talking to (and launching) a local llama-server.

    Every field has a default, so ``LlamaCppConfig()`` is a valid
    configuration. An empty string means "not configured" for the three
    path fields.
    """

    # Base URL of the llama-server HTTP endpoint.
    server_url: str = "http://127.0.0.1:8080"
    # Filesystem path to the llama-server executable; empty means unset.
    server_path: str = ""
    # Filesystem path to the GGUF model file; empty means unset.
    model_path: str = ""
    # Filesystem path to the multimodal projector file; empty means unset.
    mmproj_path: str = ""
class LlamaCppService:
    """llama.cpp HTTP client for local GGUF inference.

    Talks to a running ``llama-server`` over HTTP. Chat methods never raise
    for server-side or transport problems; they return a bracket-tagged
    message string describing the failure instead.
    """

    def __init__(
        self,
        model: ModelInfo,
        config: LlamaCppConfig | None = None,
        timeout_seconds: float = 60,
    ) -> None:
        """Store the model descriptor, server configuration and timeout.

        Args:
            model: Catalog entry this service was created for.
            config: Server settings; ``LlamaCppConfig()`` is used when omitted.
            timeout_seconds: Timeout passed to the chat completion request.
        """
        self.model = model
        self.config = config or LlamaCppConfig()
        self.timeout_seconds = timeout_seconds

    @staticmethod
    def status(
        which_func: Callable[[str], str | None] = shutil.which,
        get_func: Callable[..., requests.Response] = requests.get,
        server_url: str = "http://127.0.0.1:8080",
        server_path: str = "",
    ) -> BackendStatus:
        """Report whether llama-server is installed and answering requests.

        Args:
            which_func: PATH lookup used when ``server_path`` is empty.
            get_func: HTTP GET callable used for the health probe.
            server_url: Base URL of the server; a trailing ``/`` is ignored.
            server_path: Explicit executable path. When given, it must exist
                on disk and the PATH lookup is skipped.

        Returns:
            A ``BackendStatus`` for "llama.cpp" whose flag is true only when
            the executable was located and ``GET <server_url>/health``
            returned a successful response.
        """
        if server_path:
            if not Path(server_path).exists():
                return BackendStatus(
                    "llama.cpp",
                    False,
                    f"Configured llama-server was not found: {server_path}",
                )
        elif which_func("llama-server") is None:
            return BackendStatus("llama.cpp", False, "llama-server was not found on PATH.")

        # Strip a trailing slash so "http://host:8080/" does not become "//health".
        base_url = server_url.rstrip("/")
        try:
            response = get_func(f"{base_url}/health", timeout=2)
        except requests.RequestException as exc:
            return BackendStatus(
                "llama.cpp",
                False,
                f"llama-server is installed but not reachable: {exc}",
            )
        if response.ok:
            return BackendStatus("llama.cpp", True, "llama-server is installed and reachable.")
        return BackendStatus(
            "llama.cpp",
            False,
            f"llama-server responded with HTTP {response.status_code}.",
        )

    def launch_command(self) -> list[str]:
        """Build the argv list that would start llama-server for this config.

        Returns:
            An empty list when no model path is configured. Otherwise
            ``[executable, "-m", model_path]``, followed by
            ``"--mmproj", mmproj_path`` when a projector path is configured.
            The executable falls back to ``"llama-server"`` when
            ``server_path`` is empty.
        """
        if not self.config.model_path:
            return []
        command = [self.config.server_path or "llama-server", "-m", self.config.model_path]
        if self.config.mmproj_path:
            command.extend(["--mmproj", self.config.mmproj_path])
        return command

    def chat(self, system_prompt: str, user_prompt: str) -> str:
        """Send a system + user message pair and return the reply text."""
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        return self._post_chat(messages)

    def vision_chat(self, has_image: bool, prompt: str, image: Any = None) -> str:
        """Answer a prompt that may be accompanied by an image.

        Image payloads are not serialized yet: when ``has_image`` is true a
        fixed explanatory note is returned without contacting the server.
        Otherwise ``prompt`` is sent as a single user message.

        Args:
            has_image: Whether the caller attached an image.
            prompt: User text to send when there is no image.
            image: Accepted for interface compatibility and discarded.
        """
        del image
        if has_image:
            return (
                "[llama.cpp vision note]\n\n"
                "Image upload requires a running llama-server with an mmproj file. "
                "The current scaffold validates the server path but does not yet serialize "
                "Gradio images into llama.cpp multimodal payloads."
            )
        return self._post_chat([{"role": "user", "content": prompt}])

    def _post_chat(self, messages: list[dict[str, str]]) -> str:
        """POST ``messages`` to the chat completions endpoint.

        The backend status is checked first. Any failure (backend
        unavailable, transport/HTTP error, or a response body that is not a
        JSON object) is returned as a bracket-tagged message string rather
        than raised.
        """
        status = self.status(
            server_url=self.config.server_url,
            server_path=self.config.server_path,
        )
        if not status.available:
            return (
                "[llama.cpp unavailable]\n\n"
                f"{status.detail}\n\n"
                "Install llama.cpp, start llama-server with an explicit GGUF model, "
                "then retry."
            )

        # Same trailing-slash normalisation as the health probe in status().
        base_url = self.config.server_url.rstrip("/")
        try:
            response = requests.post(
                f"{base_url}/v1/chat/completions",
                json={
                    "messages": messages,
                    "temperature": 0.7,
                    "max_tokens": 512,
                },
                timeout=self.timeout_seconds,
            )
            response.raise_for_status()
        except requests.RequestException as exc:
            return f"[llama.cpp request failed]\n\n{exc}"

        # A 2xx response can still carry a body that is not JSON (e.g. a proxy
        # error page). JSON decode errors are ValueError subclasses.
        try:
            payload = response.json()
        except ValueError as exc:
            return f"[llama.cpp request failed]\n\nServer returned a non-JSON body: {exc}"
        if not isinstance(payload, dict):
            return (
                "[llama.cpp request failed]\n\n"
                "Server returned an unexpected JSON payload (expected an object)."
            )
        return self._extract_response(payload)

    @staticmethod
    def _extract_response(data: dict[str, Any]) -> str:
        """Delegate to ``extract_chat_response`` to pull the reply text out of ``data``."""
        return extract_chat_response(data)
def local_file_status(path: str) -> str:
    """Summarise the state of an optional local file path.

    Returns ``"not configured"`` for an empty path, ``"found"`` when the
    path exists on disk, and ``"missing"`` otherwise.
    """
    if path:
        exists = Path(path).exists()
        if exists:
            return "found"
        return "missing"
    return "not configured"