Spaces:

build-small-hackathon
/

workbench

Running on Zero

workbench / models /llama_cpp_service.py

GitHub Actions

Initial ZeroGPU deployment with spaces shim

7f9dfed 11 days ago

4.6 kB

	from __future__ import annotations

	import shutil
	from collections.abc import Callable
	from dataclasses import dataclass
	from pathlib import Path
	from typing import Any

	import requests

	from models.base import BackendStatus
	from models.model_catalog import ModelInfo
	from models.response_parsing import extract_chat_response


	@dataclass(frozen=True)
	class LlamaCppConfig:
	    """Runtime configuration for a local llama.cpp server.

	    Instances are immutable (``frozen=True``); build a new one to change a
	    setting. Every field has a default, so ``LlamaCppConfig()`` is valid.
	    """

	    # Base URL of the llama-server HTTP API.
	    server_url: str = "http://127.0.0.1:8080"
	    # Path to the llama-server executable; empty means "llama-server" is
	    # resolved by name instead of by explicit path.
	    server_path: str = ""
	    # Path of the model file handed to the server with "-m"; empty means no
	    # launch command can be built.
	    model_path: str = ""
	    # Optional path handed to the server with "--mmproj"; empty omits the flag.
	    mmproj_path: str = ""


	class LlamaCppService:
	    """llama.cpp HTTP client for local GGUF inference.

	    Talks to a ``llama-server`` process over HTTP: ``/health`` for the
	    availability probe and ``/v1/chat/completions`` for chat requests.
	    Availability and request failures are returned as bracket-tagged message
	    strings rather than raised.
	    """

	    def __init__(
	        self,
	        model: ModelInfo,
	        config: LlamaCppConfig | None = None,
	        timeout_seconds: float = 60,
	    ) -> None:
	        """Store the model descriptor, server configuration and timeout.

	        Args:
	            model: Model descriptor; stored on the instance and not otherwise
	                used by this class.
	            config: Server settings; a default ``LlamaCppConfig`` is used when
	                omitted.
	            timeout_seconds: Timeout applied to chat completion requests.
	        """
	        self.model = model
	        self.config = config or LlamaCppConfig()
	        self.timeout_seconds = timeout_seconds

	    @staticmethod
	    def status(
	        which_func: Callable[[str], str | None] = shutil.which,
	        get_func: Callable[..., requests.Response] = requests.get,
	        server_url: str = "http://127.0.0.1:8080",
	        server_path: str = "",
	    ) -> BackendStatus:
	        """Report whether llama-server is installed and answering health checks.

	        Args:
	            which_func: Executable lookup used when ``server_path`` is empty;
	                injectable so callers can substitute their own lookup.
	            get_func: HTTP GET callable used for the health probe; injectable.
	            server_url: Base URL of the server. A trailing slash is tolerated.
	            server_path: Explicit path to the executable. When set, it must
	                exist on disk and ``which_func`` is not consulted.

	        Returns:
	            A ``BackendStatus`` for the ``"llama.cpp"`` backend whose flag is
	            true only when the executable was located and ``/health`` answered
	            with a successful HTTP status.
	        """
	        if server_path:
	            if not Path(server_path).exists():
	                return BackendStatus(
	                    "llama.cpp",
	                    False,
	                    f"Configured llama-server was not found: {server_path}",
	                )
	        elif which_func("llama-server") is None:
	            return BackendStatus("llama.cpp", False, "llama-server was not found on PATH.")

	        # Strip a trailing slash so "http://host:8080/" does not become "//health".
	        base_url = server_url.rstrip("/")
	        try:
	            # Short fixed timeout: this is only a liveness probe.
	            response = get_func(f"{base_url}/health", timeout=2)
	        except requests.RequestException as exc:
	            return BackendStatus(
	                "llama.cpp",
	                False,
	                f"llama-server is installed but not reachable: {exc}",
	            )

	        if response.ok:
	            return BackendStatus("llama.cpp", True, "llama-server is installed and reachable.")
	        return BackendStatus(
	            "llama.cpp",
	            False,
	            f"llama-server responded with HTTP {response.status_code}.",
	        )

	    def launch_command(self) -> list[str]:
	        """Build the argv list that would start llama-server for this config.

	        Returns:
	            An empty list when no model path is configured; otherwise the
	            executable (configured path, or ``"llama-server"``), ``-m`` with
	            the model path, and ``--mmproj`` with its path when one is set.
	        """
	        if not self.config.model_path:
	            return []

	        command = [self.config.server_path or "llama-server", "-m", self.config.model_path]
	        if self.config.mmproj_path:
	            command.extend(["--mmproj", self.config.mmproj_path])
	        return command

	    def chat(self, system_prompt: str, user_prompt: str) -> str:
	        """Send a system + user message pair and return the reply text.

	        Args:
	            system_prompt: Content of the ``system`` role message.
	            user_prompt: Content of the ``user`` role message.

	        Returns:
	            The extracted reply, or a bracket-tagged message when the server is
	            unavailable or the request fails.
	        """
	        messages = [
	            {"role": "system", "content": system_prompt},
	            {"role": "user", "content": user_prompt},
	        ]
	        return self._post_chat(messages)

	    def vision_chat(self, has_image: bool, prompt: str, image: Any = None) -> str:
	        """Answer a prompt, returning an explanatory note when an image is attached.

	        Args:
	            has_image: Whether the caller attached an image.
	            prompt: User prompt, sent as a single ``user`` message when no
	                image is attached.
	            image: Accepted but unused.

	        Returns:
	            A fixed note when ``has_image`` is true (no request is made);
	            otherwise the result of a text-only chat request.
	        """
	        # The image is intentionally discarded: multimodal payloads are not
	        # serialized yet, as the note below tells the user.
	        del image
	        if has_image:
	            return (
	                "[llama.cpp vision note]\n\n"
	                "Image upload requires a running llama-server with an mmproj file. "
	                "The current scaffold validates the server path but does not yet serialize "
	                "Gradio images into llama.cpp multimodal payloads."
	            )
	        return self._post_chat([{"role": "user", "content": prompt}])

	    def _post_chat(self, messages: list[dict[str, str]]) -> str:
	        """POST ``messages`` to the chat completions endpoint and extract the reply.

	        The server is health-checked first. Availability problems, transport
	        errors, HTTP error statuses and undecodable or non-object response
	        bodies are all returned as bracket-tagged strings.
	        """
	        status = self.status(
	            server_url=self.config.server_url,
	            server_path=self.config.server_path,
	        )
	        if not status.available:
	            return (
	                "[llama.cpp unavailable]\n\n"
	                f"{status.detail}\n\n"
	                "Install llama.cpp, start llama-server with an explicit GGUF model, "
	                "then retry."
	            )

	        # Strip a trailing slash so the endpoint path never starts with "//".
	        base_url = self.config.server_url.rstrip("/")
	        try:
	            response = requests.post(
	                f"{base_url}/v1/chat/completions",
	                json={
	                    "messages": messages,
	                    "temperature": 0.7,
	                    "max_tokens": 512,
	                },
	                timeout=self.timeout_seconds,
	            )
	            response.raise_for_status()
	            # Decode inside the guard: a non-JSON body must surface as a
	            # failure message, not as an uncaught exception.
	            payload = response.json()
	        except (requests.RequestException, ValueError) as exc:
	            return f"[llama.cpp request failed]\n\n{exc}"

	        if not isinstance(payload, dict):
	            return (
	                "[llama.cpp request failed]\n\n"
	                f"Expected a JSON object but received {type(payload).__name__}."
	            )
	        return self._extract_response(payload)

	    @staticmethod
	    def _extract_response(data: dict[str, Any]) -> str:
	        """Delegate reply extraction to ``extract_chat_response``."""
	        return extract_chat_response(data)


	def local_file_status(path: str) -> str:
	    """Describe whether an optional local file path is usable.

	    Args:
	        path: Filesystem path to check; an empty string means the setting
	            is unset.

	    Returns:
	        ``"not configured"`` for an empty path, ``"found"`` when the path
	        exists on disk, otherwise ``"missing"``.
	    """
	    if not path:
	        return "not configured"
	    return "found" if Path(path).exists() else "missing"