Spaces:

build-small-hackathon
/

hackathon-advisor

Running on Zero

App Files Files Community

hackathon-advisor / tests /test_model_runtime.py

JacobLinCool

deploy: sync GitHub main 07450c9

cb3451f verified 12 days ago

Raw

History Blame Contribute Delete

22.1 kB

	import sys
	import types

	import pytest

	from hackathon_advisor.dashboard_chat_contracts import parse_native_tool_call
	from hackathon_advisor.model_runtime import (
	DEFAULT_ADAPTER_ID,
	DEFAULT_ADAPTER_REVISION,
	MiniCPMChatRunner,
	MiniCPMTransformersPlanner,
	RuleBasedChatRunner,
	RuleBasedPlanner,
	create_chat_runner,
	create_tool_planner,
	generation_lock,
	render_context,
	runtime_status,
	system_prompt,
	_best_local_device,
	_minicpm_generation_kwargs,
	_load_minicpm_causal_lm,
	_minicpm_chat_inputs,
	_minicpm_chat_inputs_with_tools,
	_normalize_xml_tool_output,
	_resolve_torch_device,
	_strip_unused_generation_inputs,
	)
	from hackathon_advisor.zerogpu import gpu_task, zero_gpu_duration_seconds, zero_gpu_enabled


	class FakeBackends:
	    """Minimal stand-in for ``torch.backends`` exposing only ``mps.is_available()``."""

	    def __init__(self, mps: bool) -> None:
	        # Build a throwaway class so ``self.mps.is_available()`` reports the flag.
	        namespace = {"is_available": staticmethod(lambda: mps)}
	        mps_type = type("MPS", (), namespace)
	        self.mps = mps_type()


	class FakeTorch:
	    """Tiny ``torch`` look-alike: dtype names plus CUDA/MPS availability probes."""

	    def __init__(self, cuda: bool = False, mps: bool = False) -> None:
	        # Dtypes are plain strings so tests can compare them directly.
	        self.bfloat16, self.float32 = "bfloat16", "float32"
	        cuda_type = type("CUDA", (), {"is_available": staticmethod(lambda: cuda)})
	        self.cuda = cuda_type()
	        self.backends = FakeBackends(mps)


	class FakeInputs(dict):
	    """Dict of tokenizer outputs that records the device it was moved to."""

	    def to(self, device):
	        """Store ``device`` under the ``"device"`` key and return this same mapping."""
	        self.update(device=device)
	        return self


	class FakeTokenizer:
	    """Tokenizer double that remembers how the template and encoder were called."""

	    def __init__(self) -> None:
	        self.template_call = self.tokenizer_call = None

	    def apply_chat_template(self, messages, *, tokenize, add_generation_prompt, enable_thinking):
	        """Record the template arguments and return a fixed rendered prompt."""
	        self.template_call = dict(
	            messages=messages,
	            tokenize=tokenize,
	            add_generation_prompt=add_generation_prompt,
	            enable_thinking=enable_thinking,
	        )
	        return "rendered prompt"

	    def __call__(self, prompts, *, return_tensors):
	        """Record the encode arguments and return canned inputs including ``token_type_ids``."""
	        self.tokenizer_call = dict(prompts=prompts, return_tensors=return_tensors)
	        encoded = {"input_ids": [1], "attention_mask": [1], "token_type_ids": [0]}
	        return FakeInputs(encoded)


	class FakeMiniCPMModel:
	    """Model-class double that captures ``from_pretrained`` arguments and ``.to()`` moves."""

	    # Most recently created instance, so tests can inspect it after a load.
	    last_instance = None

	    @classmethod
	    def from_pretrained(cls, model_id, **kwargs):
	        """Create an instance remembering ``model_id`` and ``kwargs``; device starts as ``None``."""
	        loaded = cls()
	        loaded.model_id, loaded.kwargs, loaded.device = model_id, kwargs, None
	        cls.last_instance = loaded
	        return loaded

	    def to(self, device):
	        """Record the target device and return the model itself."""
	        self.device = device
	        return self


	class FakeToolsTokenizer(FakeTokenizer):
	    """FakeTokenizer that also records the native tools= template path."""

	    def apply_chat_template(
	        self, messages, *, tokenize, add_generation_prompt, enable_thinking, tools=None
	    ):
	        """Record the template arguments, adding a ``"tools"`` entry only when tools are given."""
	        # The parent stores the four common fields and yields the rendered prompt.
	        rendered = super().apply_chat_template(
	            messages,
	            tokenize=tokenize,
	            add_generation_prompt=add_generation_prompt,
	            enable_thinking=enable_thinking,
	        )
	        if tools is not None:
	            self.template_call["tools"] = tools
	        return rendered


	class FakeStreamer:
	    """Stands in for transformers.TextIteratorStreamer in the worker-thread flow."""

	    def __init__(self, tokenizer, *, skip_prompt, skip_special_tokens) -> None:
	        # Constructor arguments are accepted for signature compatibility and ignored.
	        import queue

	        self._queue: queue.Queue = queue.Queue()

	    def put(self, piece) -> None:
	        """Enqueue one streamed piece."""
	        self._queue.put(piece)

	    def end(self) -> None:
	        """Enqueue the ``None`` sentinel that terminates iteration."""
	        self._queue.put(None)

	    def __iter__(self):
	        """Yield queued pieces, blocking on the queue, until the sentinel arrives."""
	        while (item := self._queue.get()) is not None:
	            yield item


	class FakeParameter:
	    """Parameter double whose only attribute is a fixed ``device``."""

	    # Always reports the CPU device.
	    device = "cpu"


	class FakeAdapterContext:
	    """Context manager that logs adapter disable/restore events into a shared list."""

	    def __init__(self, log: list[str]) -> None:
	        self._log = log

	    def __enter__(self):
	        """Log ``"adapter_disabled"`` and return the context itself."""
	        self._log += ["adapter_disabled"]
	        return self

	    def __exit__(self, *exc_info):
	        """Log ``"adapter_restored"``; returning ``False`` lets any exception propagate."""
	        self._log += ["adapter_restored"]
	        return False


	class FakeChatModel:
	def __init__(self, pieces: tuple[str, ...], adapter_log: list[str] \| None = None) -> None:
	self.pieces = pieces
	self.adapter_log = adapter_log
	self.generate_calls: list[dict] = []
	self.lock_was_held: list[bool] = []

	def parameters(self):
	return iter([FakeParameter()])

	def generate(self, **kwargs) -> None:
	self.lock_was_held.append(generation_lock().locked())
	self.generate_calls.append(kwargs)
	streamer = kwargs["streamer"]
	for piece in self.pieces:
	streamer.put(piece)
	streamer.end()


	class FakeAdapterChatModel(FakeChatModel):
	    """FakeChatModel variant that exposes ``disable_adapter``."""

	    def disable_adapter(self):
	        """Return a context manager that logs adapter toggling into ``adapter_log``."""
	        events = self.adapter_log
	        assert events is not None
	        return FakeAdapterContext(events)


	@pytest.fixture
	def fake_transformers(monkeypatch: pytest.MonkeyPatch):
	    """Install a stub ``transformers`` module whose ``TextIteratorStreamer`` is ``FakeStreamer``."""
	    stub = types.SimpleNamespace()
	    stub.TextIteratorStreamer = FakeStreamer
	    # monkeypatch undoes the sys.modules entry when the test finishes.
	    monkeypatch.setitem(sys.modules, "transformers", stub)
	    return stub


	def chat_runner_with(model: FakeChatModel) -> MiniCPMChatRunner:
	    """Build a ``MiniCPMChatRunner`` around ``model`` and a fake tools-aware tokenizer.

	    An adapter id is configured only when ``model`` exposes ``disable_adapter``.
	    """
	    if hasattr(model, "disable_adapter"):
	        adapter_id = "build-small-hackathon/some-lora"
	    else:
	        adapter_id = ""
	    planner = MiniCPMTransformersPlanner("openbmb/MiniCPM5-1B", adapter_id=adapter_id)
	    # Inject the doubles directly so nothing is loaded.
	    planner._model = model
	    planner._tokenizer = FakeToolsTokenizer()
	    return MiniCPMChatRunner(planner)


	def test_chat_inputs_with_tools_passes_native_tools() -> None:
	    """The tools-aware input builder forwards ``tools`` and ``enable_thinking`` to the template."""
	    native_tools = [{"type": "function", "function": {"name": "list_quests"}}]
	    recording_tokenizer = FakeToolsTokenizer()

	    model_inputs = _minicpm_chat_inputs_with_tools(
	        recording_tokenizer,
	        [{"role": "user", "content": "hello"}],
	        tools=native_tools,
	        enable_thinking=False,
	        device="cpu",
	    )

	    recorded = recording_tokenizer.template_call
	    assert recorded["tools"] == native_tools
	    assert recorded["enable_thinking"] is False
	    # The fake tokenizer emits token_type_ids; the result is expected without them.
	    assert model_inputs == {"input_ids": [1], "attention_mask": [1], "device": "cpu"}


	def test_chat_runner_streams_under_lock_with_adapter_disabled(fake_transformers) -> None:
	    """Tool-pass streaming holds the generation lock and toggles the adapter off then back on."""
	    scripted = ("<function ", 'name="list_quests">', "</function>")
	    events: list[str] = []
	    model = FakeAdapterChatModel(scripted, events)
	    runner = chat_runner_with(model)

	    streamed = list(
	        runner.stream(
	            [{"role": "user", "content": "what quests exist"}],
	            tools=[{"type": "function", "function": {"name": "list_quests"}}],
	            max_new_tokens=96,
	        )
	    )

	    # Each streamed item is a (running count, text piece) pair.
	    texts = [text for _index, text in streamed]
	    counts = [index for index, _text in streamed]
	    assert texts == list(scripted)
	    assert counts == [1, 2, 3]
	    assert events == ["adapter_disabled", "adapter_restored"]
	    assert model.lock_was_held == [True]
	    assert generation_lock().locked() is False
	    first_call = model.generate_calls[0]
	    assert first_call["max_new_tokens"] == 96
	    assert first_call["do_sample"] is False
	    assert "tools" in runner._planner._tokenizer.template_call


	def test_chat_runner_forwards_enable_thinking_to_the_template(fake_transformers) -> None:
	    """``enable_thinking=True`` reaches the chat template; only the MiniCPM runner supports thinking."""
	    model = FakeChatModel(("thoughts</think>\n\nanswer",))
	    runner = chat_runner_with(model)

	    # Drain the stream; only the recorded calls matter here.
	    for _item in runner.stream(
	        [{"role": "user", "content": "hi"}],
	        tools=[{"type": "function"}],
	        max_new_tokens=4096,
	        enable_thinking=True,
	    ):
	        pass

	    recorded = runner._planner._tokenizer.template_call
	    assert recorded["enable_thinking"] is True
	    assert model.generate_calls[0]["max_new_tokens"] == 4096
	    assert MiniCPMChatRunner.supports_thinking is True
	    assert RuleBasedChatRunner.supports_thinking is False


	def test_chat_runner_answer_pass_omits_tools_and_adapter_toggle(fake_transformers) -> None:
	    """Streaming without ``tools`` still runs under the lock and renders a template with no tools."""
	    model = FakeChatModel(("The map ", "shows ten projects."))
	    runner = chat_runner_with(model)
	    conversation = [
	        {"role": "user", "content": "what is everyone building"},
	        {"role": "assistant", "content": "", "tool_calls": []},
	        {"role": "tool", "content": "{}"},
	    ]

	    streamed = list(runner.stream(conversation, max_new_tokens=200))

	    answer = "".join(text for _index, text in streamed)
	    assert answer == "The map shows ten projects."
	    assert model.lock_was_held == [True]
	    recorded = runner._planner._tokenizer.template_call
	    assert "tools" not in recorded


	def test_chat_runner_surfaces_generation_errors(fake_transformers) -> None:
	    """An exception raised inside ``generate`` reaches the consumer and the lock ends up free."""

	    class ExplodingModel(FakeChatModel):
	        def generate(self, **kwargs) -> None:
	            # End the stream before failing -- presumably so the consumer can stop iterating.
	            streamer = kwargs["streamer"]
	            streamer.end()
	            raise RuntimeError("boom")

	    failing_runner = chat_runner_with(ExplodingModel(()))

	    with pytest.raises(RuntimeError, match="boom"):
	        list(failing_runner.stream([{"role": "user", "content": "hi"}], max_new_tokens=10))
	    assert generation_lock().locked() is False


	def test_early_close_releases_generation_lock(fake_transformers) -> None:
	    """Closing the stream after a single piece leaves the generation lock released."""
	    scripted = ("tok1 ", "tok2 ", "tok3 ", "tok4 ", "tok5")
	    runner = chat_runner_with(FakeChatModel(scripted))
	    pending = runner.stream([{"role": "user", "content": "hi"}], max_new_tokens=32)

	    # Take exactly one piece, then walk away mid-stream.
	    next(pending)
	    pending.close()

	    assert generation_lock().locked() is False


	def test_rule_chat_runner_escapes_xml_special_characters() -> None:
	    """A query containing ``&`` and ``<...>`` round-trips intact through the emitted tool call."""
	    question = "find projects about A & B <robots>"
	    stream = RuleBasedChatRunner().stream(
	        [{"role": "user", "content": question}],
	        tools=[{"type": "function"}],
	        max_new_tokens=96,
	    )
	    rendered = "".join(text for _index, text in stream)

	    parsed = parse_native_tool_call(rendered)
	    assert parsed.name == "search_projects"
	    assert parsed.arguments["query"] == "find projects about A & B <robots>"


	def test_rule_chat_runner_routes_tools_pass_through_intents() -> None:
	    """A "most quests" question is routed to the ``top_projects_by_quests`` tool."""
	    stream = RuleBasedChatRunner().stream(
	        [{"role": "user", "content": "who completed the most quests"}],
	        tools=[{"type": "function"}],
	        max_new_tokens=96,
	    )
	    rendered = "".join(text for _index, text in stream)

	    parsed = parse_native_tool_call(rendered)
	    assert parsed.name == "top_projects_by_quests"


	def test_rule_chat_runner_answer_pass_is_deterministic() -> None:
	    """Without tools, the rule-based runner's reply mentions "verified data"."""
	    conversation = [{"role": "user", "content": "hi"}, {"role": "tool", "content": "{}"}]
	    stream = RuleBasedChatRunner().stream(conversation, max_new_tokens=200)

	    reply = "".join(text for _index, text in stream)

	    assert "verified data" in reply


	def test_create_chat_runner_matches_advisor_backend() -> None:
	    """``create_chat_runner`` returns the runner type that matches the planner type."""
	    minicpm_runner = create_chat_runner(MiniCPMTransformersPlanner("openbmb/MiniCPM5-1B"))
	    assert isinstance(minicpm_runner, MiniCPMChatRunner)

	    rules_runner = create_chat_runner(RuleBasedPlanner())
	    assert isinstance(rules_runner, RuleBasedChatRunner)


	def test_base_model_context_is_null_without_adapter() -> None:
	    """Entering ``base_model_context`` with no adapter configured must simply not raise."""
	    planner = MiniCPMTransformersPlanner("openbmb/MiniCPM5-1B", adapter_id="")
	    # FakeChatModel has no disable_adapter, so there is nothing to toggle.
	    planner._model = FakeChatModel(())

	    context = planner.base_model_context()
	    with context:
	        pass


	def test_rule_planner_emits_valid_search_call() -> None:
	    """A "search ..." request becomes a ``search_projects`` call carrying the full text as query."""
	    request = "search similar lullaby audio projects"

	    outcome = RuleBasedPlanner().plan(request, {})

	    assert outcome.status == "valid"
	    assert outcome.call.name == "search_projects"
	    assert outcome.call.arguments["query"] == "search similar lullaby audio projects"


	def test_rule_planner_uses_plan_when_idea_exists() -> None:
	    """With an idea already on the board, "make a build plan" resolves to ``make_plan``."""
	    board = {"ideas": [{"title": "A", "pitch": "B"}]}

	    outcome = RuleBasedPlanner().plan("make a build plan", board)

	    assert outcome.status == "valid"
	    assert outcome.call.name == "make_plan"


	def test_rule_planner_keeps_empty_board_commands_as_commands() -> None:
	    """Plan and compare commands stay commands even when the board state is empty."""
	    rules = RuleBasedPlanner()

	    plan_outcome = rules.plan("make a build plan", {})
	    compare_outcome = rules.plan("compare ideas", {})

	    assert plan_outcome.status == "valid"
	    assert plan_outcome.call.name == "make_plan"
	    assert compare_outcome.status == "valid"
	    assert compare_outcome.call.name == "compare_ideas"


	def test_rule_planner_defaults_blank_to_list_projects() -> None:
	    """An empty request falls back to the ``list_projects`` call."""
	    outcome = RuleBasedPlanner().plan("", {})

	    assert outcome.status == "valid"
	    assert outcome.call.name == "list_projects"


	def test_rule_planner_routes_project_reference_commands() -> None:
	    """Map, project-name, and Space-URL commands resolve to list/get project calls."""
	    rules = RuleBasedPlanner()

	    map_outcome = rules.plan("show current map", {})
	    name_outcome = rules.plan("read project lolaby", {})
	    url_outcome = rules.plan(
	        "open space https://huggingface.co/spaces/build-small-hackathon/lolaby", {}
	    )

	    assert map_outcome.status == "valid"
	    assert map_outcome.call.name == "list_projects"

	    assert name_outcome.status == "valid"
	    assert name_outcome.call.name == "get_project"
	    assert name_outcome.call.arguments["id"] == "lolaby"

	    # The expected id from a Space URL is the "owner/name" tail of the URL.
	    assert url_outcome.status == "valid"
	    assert url_outcome.call.name == "get_project"
	    assert url_outcome.call.arguments["id"] == "build-small-hackathon/lolaby"


	def test_rule_planner_keeps_project_words_inside_ideas() -> None:
	    """An idea sentence that merely mentions "show projects" is still saved as an idea."""
	    idea_text = "A dashboard that helps teams show projects to mentors"

	    outcome = RuleBasedPlanner().plan(idea_text, {})

	    assert outcome.status == "valid"
	    assert outcome.call.name == "save_idea"


	def test_rule_planner_does_not_match_commands_inside_idea_words() -> None:
	    """Ideas containing "plant" or "cooking plan" are saved as ideas, not routed as commands."""
	    rules = RuleBasedPlanner()
	    seed_swap_idea = (
	        "A neighborhood seed swap archive that reminds gardeners when to plant shared seeds"
	    )
	    pantry_idea = "A countertop helper that turns pantry leftovers into a weekly cooking plan"

	    seed_outcome = rules.plan(seed_swap_idea, {})
	    pantry_outcome = rules.plan(pantry_idea, {})

	    assert seed_outcome.status == "valid"
	    assert seed_outcome.call.name == "save_idea"
	    assert pantry_outcome.status == "valid"
	    assert pantry_outcome.call.name == "save_idea"


	def test_rule_planner_splits_explicit_idea_pitch() -> None:
	    """An ``idea: <title> -- <pitch>`` request is split into separate title and pitch arguments."""
	    request = "idea: Hands-on science coach -- A lab-notebook companion for household experiments."

	    outcome = RuleBasedPlanner().plan(request, {})

	    assert outcome.status == "valid"
	    assert outcome.call.name == "save_idea"
	    arguments = outcome.call.arguments
	    assert arguments["title"] == "Hands-on science coach"
	    assert arguments["pitch"] == "A lab-notebook companion for household experiments."


	def test_render_context_includes_state() -> None:
	    """The rendered context contains board ideas, trace verdicts, and the tool instructions."""
	    board = {
	        "ideas": [{"title": "Archive Cartographer", "pitch": "Map family memories."}],
	        "trace": [{"input": "first", "verdict": "ECHO x2", "overall": 5.1}],
	    }

	    rendered = render_context("make a plan", board)

	    expected_fragments = (
	        "Archive Cartographer",
	        "ECHO x2",
	        '<function name="tool_name">',
	        "Available tools:",
	        "search_projects",
	    )
	    for fragment in expected_fragments:
	        assert fragment in rendered


	def test_system_prompt_keeps_runtime_role_user_facing() -> None:
	    """The system prompt names "The Unwritten Almanac" and omits the two excluded names."""
	    text = system_prompt()

	    assert "The Unwritten Almanac" in text
	    for excluded in ("Mothback", "Build Small"):
	        assert excluded not in text


	def test_create_tool_planner_defaults_to_minicpm(monkeypatch: pytest.MonkeyPatch) -> None:
	    """With no overriding env vars, the factory yields an unloaded MiniCPM planner with default adapter."""
	    for env_name in (
	        "ADVISOR_MODEL_BACKEND",
	        "ADVISOR_ADAPTER_ID",
	        "ADVISOR_ADAPTER_REVISION",
	    ):
	        monkeypatch.delenv(env_name, raising=False)

	    planner = create_tool_planner()
	    report = runtime_status(planner).to_dict()

	    assert isinstance(planner, MiniCPMTransformersPlanner)
	    assert report["backend"] == "minicpm-transformers"
	    assert report["loaded"] is False
	    assert report["adapter_id"] == DEFAULT_ADAPTER_ID
	    assert report["adapter_revision"] == DEFAULT_ADAPTER_REVISION


	def test_create_tool_planner_accepts_explicit_rules_backend(monkeypatch: pytest.MonkeyPatch) -> None:
	    """``ADVISOR_MODEL_BACKEND=rules`` selects the rule-based planner, which reports as loaded."""
	    monkeypatch.setenv("ADVISOR_MODEL_BACKEND", "rules")

	    planner = create_tool_planner()

	    assert isinstance(planner, RuleBasedPlanner)
	    report = runtime_status(planner).to_dict()
	    assert report["loaded"] is True


	def test_create_tool_planner_accepts_adapter_env(monkeypatch: pytest.MonkeyPatch) -> None:
	    """Backend, model id, adapter id, and adapter revision are all taken from the environment."""
	    environment = {
	        "ADVISOR_MODEL_BACKEND": "minicpm-transformers",
	        "ADVISOR_MODEL_ID": "openbmb/MiniCPM5-1B",
	        "ADVISOR_ADAPTER_ID": DEFAULT_ADAPTER_ID,
	        "ADVISOR_ADAPTER_REVISION": "abc123",
	    }
	    for env_name, env_value in environment.items():
	        monkeypatch.setenv(env_name, env_value)

	    planner = create_tool_planner()
	    report = runtime_status(planner).to_dict()

	    assert isinstance(planner, MiniCPMTransformersPlanner)
	    assert report["backend"] == "minicpm-transformers"
	    assert report["model_id"] == "openbmb/MiniCPM5-1B"
	    assert report["adapter_id"] == DEFAULT_ADAPTER_ID
	    assert report["adapter_revision"] == "abc123"
	    assert report["loaded"] is False


	def test_create_tool_planner_rejects_unknown_backend(monkeypatch: pytest.MonkeyPatch) -> None:
	    """An unrecognised backend name raises ``RuntimeError`` mentioning "Unsupported"."""
	    monkeypatch.setenv("ADVISOR_MODEL_BACKEND", "bogus")

	    expect_failure = pytest.raises(RuntimeError, match="Unsupported")
	    with expect_failure:
	        create_tool_planner()


	def test_minicpm_status_is_lazy() -> None:
	    """A freshly constructed MiniCPM planner reports its config with ``loaded`` still False."""
	    report = runtime_status(
	        MiniCPMTransformersPlanner("openbmb/MiniCPM5-1B", DEFAULT_ADAPTER_ID)
	    ).to_dict()

	    assert report["backend"] == "minicpm-transformers"
	    assert report["adapter_id"] == DEFAULT_ADAPTER_ID
	    # No revision was passed, so an empty string is expected.
	    assert report["adapter_revision"] == ""
	    assert report["loaded"] is False


	def test_zerogpu_disabled_leaves_function_unwrapped(monkeypatch: pytest.MonkeyPatch) -> None:
	    """With ``ADVISOR_ZERO_GPU`` unset, ``gpu_task`` hands back the very same function object."""
	    monkeypatch.delenv("ADVISOR_ZERO_GPU", raising=False)

	    def marker() -> str:
	        return "ok"

	    assert zero_gpu_enabled() is False
	    wrapped = gpu_task(marker)
	    assert wrapped is marker


	def test_zerogpu_duration_validates_positive_values(monkeypatch: pytest.MonkeyPatch) -> None:
	    """The duration env var is parsed as an int; 0 and 121 are each rejected with RuntimeError."""
	    env_name = "ADVISOR_ZERO_GPU_DURATION"

	    monkeypatch.setenv(env_name, "7")
	    assert zero_gpu_duration_seconds() == 7

	    # (raw env value, fragment expected in the error message)
	    rejected = (("0", "positive"), ("121", "at most 120"))
	    for raw_value, message in rejected:
	        monkeypatch.setenv(env_name, raw_value)
	        with pytest.raises(RuntimeError, match=message):
	            zero_gpu_duration_seconds()


	def test_generation_inputs_drop_token_type_ids() -> None:
	    """The helper removes ``token_type_ids`` from the mapping in place."""
	    payload = {"input_ids": [1], "attention_mask": [1], "token_type_ids": [0]}

	    # The return value is ignored on purpose: the mutation itself is under test.
	    _strip_unused_generation_inputs(payload)

	    assert payload == {"input_ids": [1], "attention_mask": [1]}


	def test_minicpm_loader_matches_official_cuda_dtype() -> None:
	    """Loading for ``"cuda"`` passes bfloat16 + trust_remote_code, then moves the model to cuda."""
	    loaded = _load_minicpm_causal_lm(
	        FakeMiniCPMModel,
	        "openbmb/MiniCPM5-1B",
	        "cuda",
	        FakeTorch(),
	    )

	    assert loaded.model_id == "openbmb/MiniCPM5-1B"
	    assert loaded.kwargs == {"torch_dtype": "bfloat16", "trust_remote_code": True}
	    assert loaded.device == "cuda"


	def test_minicpm_loader_uses_device_map_for_auto() -> None:
	    """Loading for ``"auto"`` passes ``device_map="auto"`` and never calls ``.to()``."""
	    loaded = _load_minicpm_causal_lm(
	        FakeMiniCPMModel,
	        "openbmb/MiniCPM5-1B",
	        "auto",
	        FakeTorch(),
	    )

	    expected_kwargs = {
	        "torch_dtype": "bfloat16",
	        "device_map": "auto",
	        "trust_remote_code": True,
	    }
	    assert loaded.kwargs == expected_kwargs
	    # The fake's device stays None unless .to() was invoked.
	    assert loaded.device is None


	def test_minicpm_chat_inputs_follow_official_template_flow() -> None:
	    """Inputs are built by rendering the chat template, tokenizing it, and moving to the device."""
	    recording_tokenizer = FakeTokenizer()

	    model_inputs = _minicpm_chat_inputs(
	        recording_tokenizer,
	        [{"role": "user", "content": "hello"}],
	        enable_thinking=False,
	        device="cuda",
	    )

	    expected_template_call = {
	        "messages": [{"role": "user", "content": "hello"}],
	        "tokenize": False,
	        "add_generation_prompt": True,
	        "enable_thinking": False,
	    }
	    expected_tokenizer_call = {"prompts": ["rendered prompt"], "return_tensors": "pt"}
	    assert recording_tokenizer.template_call == expected_template_call
	    assert recording_tokenizer.tokenizer_call == expected_tokenizer_call
	    assert model_inputs == {"input_ids": [1], "attention_mask": [1], "device": "cuda"}


	def test_minicpm_generation_kwargs_match_demo_sampling_policy() -> None:
	    """Positive temperature enables sampling; temperature 0.0 yields greedy kwargs without sampling keys."""
	    encoded = {"input_ids": [1], "attention_mask": [1]}

	    with_sampling = _minicpm_generation_kwargs(
	        encoded, max_new_tokens=32, temperature=0.9, top_p=0.95
	    )
	    greedy = _minicpm_generation_kwargs(encoded, max_new_tokens=32, temperature=0.0)

	    expected_sampling = {
	        "input_ids": [1],
	        "attention_mask": [1],
	        "max_new_tokens": 32,
	        "temperature": 0.9,
	        "top_p": 0.95,
	        "do_sample": True,
	    }
	    expected_greedy = {
	        "input_ids": [1],
	        "attention_mask": [1],
	        "max_new_tokens": 32,
	        "do_sample": False,
	    }
	    assert with_sampling == expected_sampling
	    assert greedy == expected_greedy


	def test_model_xml_fragment_is_normalized() -> None:
	    """A fragment missing ``<function `` and ``</function>`` is completed into a full element."""
	    fragment = 'name="save_idea">{"title":"A","pitch":"B"}'
	    expected = '<function name="save_idea">{"title":"A","pitch":"B"}</function>'

	    normalized = _normalize_xml_tool_output(fragment)

	    assert normalized == expected


	def test_resolve_device_keeps_auto_and_explicit_cpu() -> None:
	    """``"auto"`` passes through unchanged, and ``"cpu"`` wins even when accelerators exist."""
	    auto_choice = _resolve_torch_device("auto", FakeTorch())
	    assert auto_choice == "auto"

	    cpu_choice = _resolve_torch_device("cpu", FakeTorch(cuda=True, mps=True))
	    assert cpu_choice == "cpu"


	def test_resolve_device_prefers_cuda_then_mps_then_cpu(monkeypatch) -> None:
	    """Local device selection prefers cuda, then mps, then cpu; ``"local"`` follows the same order."""
	    monkeypatch.delenv("ADVISOR_ZERO_GPU", raising=False)

	    # (cuda available, mps available, expected device)
	    ladder = (
	        (True, True, "cuda"),
	        (False, True, "mps"),
	        (False, False, "cpu"),
	    )
	    for has_cuda, has_mps, expected in ladder:
	        assert _best_local_device(FakeTorch(cuda=has_cuda, mps=has_mps)) == expected

	    # "local" resolves through the same ladder
	    assert _resolve_torch_device("local", FakeTorch(cuda=False, mps=True)) == "mps"


	def test_resolve_device_unavailable_request_degrades_gracefully(monkeypatch) -> None:
	    """Requesting cuda where only MPS is available resolves to mps instead of failing."""
	    monkeypatch.delenv("ADVISOR_ZERO_GPU", raising=False)
	    mps_only_torch = FakeTorch(cuda=False, mps=True)

	    # asking for cuda on an MPS-only box lands on mps, not a crash
	    resolved = _resolve_torch_device("cuda", mps_only_torch)

	    assert resolved == "mps"


	def test_resolve_device_skips_cuda_under_zero_gpu(monkeypatch) -> None:
	    """With ``ADVISOR_ZERO_GPU=1`` the local device is cpu even if the fake reports CUDA."""
	    # In a ZeroGPU main process there is no local CUDA, and probing it is avoided.
	    monkeypatch.setenv("ADVISOR_ZERO_GPU", "1")
	    cuda_reporting_torch = FakeTorch(cuda=True, mps=False)

	    chosen = _best_local_device(cuda_reporting_torch)

	    assert chosen == "cpu"


	def test_runtime_status_reports_configured_device() -> None:
	    """Status echoes the MiniCPM planner's configured device; the rule-based planner reports ""."""
	    minicpm_report = runtime_status(
	        MiniCPMTransformersPlanner("openbmb/MiniCPM5-1B", device="local")
	    ).to_dict()
	    assert minicpm_report["device"] == "local"

	    rules_report = runtime_status(RuleBasedPlanner()).to_dict()
	    assert rules_report["device"] == ""