Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket/tinymind-native-8b-remote-handoff/bundle/evaluation/sandbox_tool_core_eval.py

bbkdevops

about 1 month ago

download

raw

8.08 kB

	"""Sandbox Tool Core gate and evidence report."""

	from __future__ import annotations

	from datetime import datetime, timezone
	import json
	from pathlib import Path
	import tempfile

	from model.sandbox_tool_core import SandboxToolCore, SandboxToolPolicy
	from model.sandbox_sdk_profile import build_sdk_inventory


	def build_sandbox_tool_core_eval(out_dir: str | Path) -> dict:
	    """Run the Sandbox Tool Core gate and write JSON/Markdown evidence reports.

	    A fresh sandbox root is created with ``tempfile.mkdtemp`` inside
	    ``out_dir``, a fixed sequence of tool calls is issued against a
	    ``SandboxToolCore`` bound to that root, and each response is turned into
	    a named boolean check. The last three calls are ones the checks expect
	    to be rejected (path escape, non-allowlisted command, unknown tool).

	    Args:
	        out_dir: Directory that receives the sandbox root, the SDK inventory
	            (under ``sdk_inventory``) and the two report files. It is created
	            if it does not exist.

	    Returns:
	        The report dictionary that was also written to
	        ``sandbox_tool_core_eval_report.json``; it includes ``checks``,
	        ``calls``, ``claim_gate`` and the paths of both report files.
	    """
	    out = Path(out_dir)
	    out.mkdir(parents=True, exist_ok=True)
	    sandbox_root = Path(tempfile.mkdtemp(prefix="tinymind_sandbox_tool_core_", dir=out))
	    core = SandboxToolCore(
	        sandbox_root,
	        policy=SandboxToolPolicy(max_write_bytes=16_384, max_files_per_project=16, cmd_timeout_s=3.0),
	    )
	    sdk_inventory = build_sdk_inventory(out / "sdk_inventory")

	    # NOTE: the checks below address these responses by position, so the
	    # order of this list must stay in sync with the indices used in `checks`.
	    calls = [
	        core.call("core.manifest"),  # 0
	        core.call("lua.eval", {"code": "local x = 7\nlocal y = x * 6\nreturn y"}),  # 1
	        core.call("fs.write", {"path": "notes/pure.txt", "content": "sandbox evidence\n"}),  # 2
	        core.call("fs.read", {"path": "notes/pure.txt"}),  # 3
	        core.call(  # 4
	            "project.create",
	            {"name": "demo", "files": {"README.md": "# Demo\n", "src/main.py": "print('ok')\n"}},
	        ),
	        core.call("cmd.run", {"argv": ["echo", "tool-core"]}),  # 5
	        core.call("sandbox.env.create", {"name": "build-a"}),  # 6
	        core.call("sandbox.env.file_put", {"name": "build-a", "path": "input.txt", "content": "artifact\n"}),  # 7
	        core.call("sandbox.env.file_get", {"name": "build-a", "path": "input.txt"}),  # 8
	        core.call("sandbox.env.run", {"name": "build-a", "argv": ["echo", "env-ok"]}),  # 9
	        core.call("sandbox.env.run_detached", {"name": "build-a", "argv": ["echo", "stream-ok"]}),  # 10
	        core.call("sandbox.env.snapshot", {"name": "build-a", "snapshot": "clean-a"}),  # 11
	        core.call("sandbox.env.fork", {"source": "build-a", "child": "build-b"}),  # 12
	        core.call("sandbox.env.stop", {"name": "build-a"}),  # 13
	        core.call("sandbox.env.dashboard"),  # 14
	        core.call("sandbox.env.resources"),  # 15
	        core.call(  # 16
	            "sandbox.run_code",
	            {"code": 'return sandbox_deepresearch_plan("prove native model capability without fake claims", "deep")'},
	        ),
	        core.call(  # 17
	            "sandbox.run_code",
	            {
	                "code": (
	                    'return sandbox_deeprl_choose("need reliable next action", '
	                    '"claim world best|run eval with snapshot|disable evidence gate", "0.1|0.6|0.2")'
	                )
	            },
	        ),
	        core.call(  # 18
	            "sandbox.run_code",
	            {
	                "code": "\n".join(
	                    [
	                        'local c = sandbox_canvas_create("sandbox proof graph")',
	                        'local a = sandbox_canvas_add_node(c, "claim", "tool layer works", "claim")',
	                        'local b = sandbox_canvas_add_node(a, "evidence", "ledger hash exists", "evidence")',
	                        'local d = sandbox_canvas_link(b, "evidence", "claim", "verifies")',
	                        "return sandbox_canvas_snapshot(d)",
	                    ]
	                )
	            },
	        ),
	        # The remaining calls are expected to be refused by the core.
	        core.call("fs.write", {"path": "../escape.txt", "content": "bad"}),  # 19
	        core.call("cmd.run", {"argv": ["powershell", "-NoProfile", "-Command", "Write-Output bad"]}),  # 20
	        core.call("unknown.tool", {}),  # 21
	    ]

	    # The ledger is read as one JSON document per line; blank lines are
	    # skipped so stray whitespace cannot abort the whole report.
	    ledger_lines = core.ledger_path.read_text(encoding="utf-8").splitlines()
	    ledger_records = [json.loads(line) for line in ledger_lines if line.strip()]

	    checks = {
	        "manifest_exposes_policy": calls[0]["ok"] is True and "policy" in calls[0].get("result", {}),
	        "lua_subset_executes": calls[1]["ok"] is True and calls[1].get("result") == 42,
	        "file_roundtrip_inside_root": calls[2]["ok"] is True
	        and calls[3]["ok"] is True
	        and calls[3].get("result") == "sandbox evidence\n",
	        "project_scaffold_inside_root": calls[4]["ok"] is True
	        and (sandbox_root / "demo" / "src" / "main.py").exists(),
	        "allowlisted_cmd_runs": calls[5]["ok"] is True and "tool-core" in calls[5].get("stdout", ""),
	        "isolated_env_created": calls[6]["ok"] is True,
	        "file_api_roundtrip": calls[7]["ok"] is True
	        and calls[8]["ok"] is True
	        and calls[8].get("result", {}).get("content") == "artifact\n",
	        "isolated_env_runs_command": calls[9]["ok"] is True,
	        "detached_command_started": calls[10]["ok"] is True and "stdout_path" in calls[10].get("result", {}),
	        "isolated_env_snapshot_saved": calls[11]["ok"] is True,
	        "isolated_env_forked": calls[12]["ok"] is True,
	        "sandbox_stop_auto_snapshot": calls[13]["ok"] is True
	        and (calls[13].get("result", {}).get("auto_snapshot") or {}).get("ok") is True,
	        "dashboard_present": calls[14]["ok"] is True and "usage" in calls[14].get("result", {}),
	        "resource_accounting_present": calls[15]["ok"] is True
	        and "sandbox_provisioned_memory_mb" in calls[15].get("result", {}),
	        "deepresearch_lua_helper": calls[16]["ok"] is True
	        and calls[16].get("result", {}).get("method") == "deepresearch_plan"
	        and len(calls[16].get("result", {}).get("steps", [])) >= 8,
	        "deeprl_lua_helper_prefers_evidence": calls[17]["ok"] is True
	        and calls[17].get("result", {}).get("selected_action") == "run eval with snapshot",
	        # bool(): without it this entry would hold the hash string (or None)
	        # instead of a pass/fail flag like every other check.
	        "canvas_lua_helper_hashes_graph": calls[18]["ok"] is True
	        and calls[18].get("result", {}).get("node_count") == 2
	        and bool(calls[18].get("result", {}).get("canvas_sha256")),
	        "path_escape_rejected": calls[19]["ok"] is False and calls[19].get("error") == "path_escape",
	        "non_allowlisted_cmd_rejected": calls[20]["ok"] is False
	        and calls[20].get("error") == "command_not_allowlisted",
	        "unknown_tool_rejected": calls[21]["ok"] is False and calls[21].get("error") == "unknown_tool",
	        "ledger_hashes_present": all(
	            row.get("input_sha256") and row.get("output_sha256") for row in ledger_records
	        ),
	        "ledger_count_matches_calls": len(ledger_records) == len(calls),
	    }
	    passed = all(checks.values())

	    report = {
	        "schema_version": "tinymind-sandbox-tool-core-eval-v1",
	        "created_at": datetime.now(timezone.utc).isoformat(),
	        "sandbox_root": str(sandbox_root),
	        "ledger_path": str(core.ledger_path),
	        "tool_manifest": core.manifest(),
	        "sdk_inventory": sdk_inventory,
	        "checks": checks,
	        "calls": calls,
	        "claim_gate": {
	            "sandbox_tool_core_ready": passed,
	            "lua_deepresearch_deeprl_canvas_ready": all(
	                checks[name]
	                for name in (
	                    "deepresearch_lua_helper",
	                    "deeprl_lua_helper_prefers_evidence",
	                    "canvas_lua_helper_hashes_graph",
	                )
	            ),
	            # These two claims are hard-coded to False regardless of results.
	            "host_unrestricted_execution_claim_allowed": False,
	            "world_best_tool_runtime_claim_allowed": False,
	            "reason": "The core is intentionally policy-gated and audited; unrestricted host execution is not a valid safety or reliability claim.",
	        },
	    }

	    json_path = out / "sandbox_tool_core_eval_report.json"
	    md_path = out / "sandbox_tool_core_eval_report.md"
	    # Record the output paths before serialising so both files mention them.
	    report["json_path"] = str(json_path)
	    report["markdown_path"] = str(md_path)
	    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
	    md_path.write_text(_markdown(report), encoding="utf-8")
	    return report


	def _markdown(report: dict) -> str:
	lines = [
	"# TinyMind Sandbox Tool Core Eval",
	"",
	f"- Sandbox tool core ready: {report['claim_gate']['sandbox_tool_core_ready']}",
	f"- Ledger: {report['ledger_path']}",
	f"- SDK inventory: {report['sdk_inventory']['json_path']}",
	f"- Host unrestricted execution claim allowed: {report['claim_gate']['host_unrestricted_execution_claim_allowed']}",
	"",
	"## Checks",
	"",
	]
	for name, passed in report["checks"].items():
	lines.append(f"- {name}: {passed}")
	return "\n".join(lines) + "\n"

Xet Storage Details

Size: 8.08 kB
Xet hash: 351bc0502e1b35cec7de8e2578c1a2c03143aeb1d26793accb8856b5c7c5e23a

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.