Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /evaluation /sandbox_tool_core_eval.py
| """Sandbox Tool Core gate and evidence report.""" | |
| from __future__ import annotations | |
| from datetime import datetime, timezone | |
| import json | |
| from pathlib import Path | |
| import tempfile | |
| from model.sandbox_tool_core import SandboxToolCore, SandboxToolPolicy | |
| from model.sandbox_sdk_profile import build_sdk_inventory | |
def build_sandbox_tool_core_eval(out_dir: str | Path) -> dict:
    """Run the Sandbox Tool Core probe suite and write an evidence report.

    A fresh sandbox root is created as a temporary directory inside
    ``out_dir``; a fixed sequence of tool calls is issued against a
    ``SandboxToolCore`` rooted there; pass/fail checks are derived from the
    call results and from the core's ledger file; and the resulting report is
    written to ``out_dir`` as both JSON and Markdown.

    Args:
        out_dir: Directory that receives the sandbox root, the SDK inventory
            sub-directory and the two report files. Created if missing.

    Returns:
        The report dict that was serialized to JSON, including the paths of
        the JSON and Markdown files under ``json_path`` / ``markdown_path``.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    sandbox_root = Path(tempfile.mkdtemp(prefix="tinymind_sandbox_tool_core_", dir=out))
    core = SandboxToolCore(
        sandbox_root,
        policy=SandboxToolPolicy(max_write_bytes=16_384, max_files_per_project=16, cmd_timeout_s=3.0),
    )
    sdk_inventory = build_sdk_inventory(out / "sdk_inventory")
    calls = _run_probe_calls(core)

    # One JSON document per ledger line; blank lines (e.g. a trailing newline
    # artifact) are skipped instead of crashing json.loads.
    ledger_text = core.ledger_path.read_text(encoding="utf-8")
    ledger_records = [json.loads(line) for line in ledger_text.splitlines() if line.strip()]

    checks = _evaluate_checks(calls, ledger_records, sandbox_root)
    passed = all(checks.values())
    report = {
        "schema_version": "tinymind-sandbox-tool-core-eval-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "sandbox_root": str(sandbox_root),
        "ledger_path": str(core.ledger_path),
        "tool_manifest": core.manifest(),
        "sdk_inventory": sdk_inventory,
        "checks": checks,
        "calls": calls,
        "claim_gate": {
            "sandbox_tool_core_ready": passed,
            "lua_deepresearch_deeprl_canvas_ready": all(
                checks[name]
                for name in (
                    "deepresearch_lua_helper",
                    "deeprl_lua_helper_prefers_evidence",
                    "canvas_lua_helper_hashes_graph",
                )
            ),
            # These two claims are hard-coded to False regardless of results.
            "host_unrestricted_execution_claim_allowed": False,
            "world_best_tool_runtime_claim_allowed": False,
            "reason": "The core is intentionally policy-gated and audited; unrestricted host execution is not a valid safety or reliability claim.",
        },
    }
    json_path = out / "sandbox_tool_core_eval_report.json"
    md_path = out / "sandbox_tool_core_eval_report.md"
    report["json_path"] = str(json_path)
    report["markdown_path"] = str(md_path)
    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    md_path.write_text(_markdown(report), encoding="utf-8")
    return report


def _run_probe_calls(core) -> list:
    """Issue the fixed probe sequence against ``core`` and return the results in order.

    The order matters: ``_evaluate_checks`` unpacks the list positionally, and
    later probes depend on state created by earlier ones (e.g. the file written
    by ``fs.write`` is read back by ``fs.read``).
    """
    return [
        core.call("core.manifest"),
        core.call("lua.eval", {"code": "local x = 7\nlocal y = x * 6\nreturn y"}),
        core.call("fs.write", {"path": "notes/pure.txt", "content": "sandbox evidence\n"}),
        core.call("fs.read", {"path": "notes/pure.txt"}),
        core.call("project.create", {"name": "demo", "files": {"README.md": "# Demo\n", "src/main.py": "print('ok')\n"}}),
        core.call("cmd.run", {"argv": ["echo", "tool-core"]}),
        core.call("sandbox.env.create", {"name": "build-a"}),
        core.call("sandbox.env.file_put", {"name": "build-a", "path": "input.txt", "content": "artifact\n"}),
        core.call("sandbox.env.file_get", {"name": "build-a", "path": "input.txt"}),
        core.call("sandbox.env.run", {"name": "build-a", "argv": ["echo", "env-ok"]}),
        core.call("sandbox.env.run_detached", {"name": "build-a", "argv": ["echo", "stream-ok"]}),
        core.call("sandbox.env.snapshot", {"name": "build-a", "snapshot": "clean-a"}),
        core.call("sandbox.env.fork", {"source": "build-a", "child": "build-b"}),
        core.call("sandbox.env.stop", {"name": "build-a"}),
        core.call("sandbox.env.dashboard"),
        core.call("sandbox.env.resources"),
        core.call(
            "sandbox.run_code",
            {"code": 'return sandbox_deepresearch_plan("prove native model capability without fake claims", "deep")'},
        ),
        core.call(
            "sandbox.run_code",
            {
                "code": (
                    'return sandbox_deeprl_choose("need reliable next action", '
                    '"claim world best|run eval with snapshot|disable evidence gate", "0.1|0.6|0.2")'
                )
            },
        ),
        core.call(
            "sandbox.run_code",
            {
                "code": "\n".join(
                    [
                        'local c = sandbox_canvas_create("sandbox proof graph")',
                        'local a = sandbox_canvas_add_node(c, "claim", "tool layer works", "claim")',
                        'local b = sandbox_canvas_add_node(a, "evidence", "ledger hash exists", "evidence")',
                        'local d = sandbox_canvas_link(b, "evidence", "claim", "verifies")',
                        "return sandbox_canvas_snapshot(d)",
                    ]
                )
            },
        ),
        # The last three probes are expected to be refused.
        core.call("fs.write", {"path": "../escape.txt", "content": "bad"}),
        core.call("cmd.run", {"argv": ["powershell", "-NoProfile", "-Command", "Write-Output bad"]}),
        core.call("unknown.tool", {}),
    ]


def _as_dict(value: object) -> dict:
    """Return ``value`` if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _result(call: dict) -> dict:
    """Return the call's ``result`` payload as a dict (empty when missing or not a dict)."""
    return _as_dict(call.get("result"))


def _succeeded(call: dict) -> bool:
    """Return True only when the call reports ``ok`` as exactly ``True``."""
    return call.get("ok") is True


def _rejected_with(call: dict, error: str) -> bool:
    """Return True when the call reports ``ok`` as ``False`` with the given ``error`` code."""
    return call.get("ok") is False and call.get("error") == error


def _evaluate_checks(calls: list, ledger_records: list, sandbox_root: Path) -> dict:
    """Derive the named boolean checks from the probe results and ledger records.

    Every value in the returned dict is a real ``bool`` so the JSON and
    Markdown reports show ``true``/``false`` rather than leaked payload values.
    A malformed or missing ``result`` payload makes the affected check fail
    instead of raising.

    Raises:
        ValueError: If ``calls`` does not contain exactly the 22 probe results
            produced by ``_run_probe_calls``.
    """
    # Named unpacking replaces positional indices; a mismatch in the number of
    # probes fails loudly here instead of silently shifting every check.
    (
        manifest,
        lua_eval,
        fs_write,
        fs_read,
        project_create,
        cmd_echo,
        env_create,
        env_file_put,
        env_file_get,
        env_run,
        env_run_detached,
        env_snapshot,
        env_fork,
        env_stop,
        dashboard,
        resources,
        deepresearch,
        deeprl,
        canvas,
        escape_write,
        blocked_cmd,
        unknown_tool,
    ) = calls

    stop_auto_snapshot = _as_dict(_result(env_stop).get("auto_snapshot"))
    research_steps = _result(deepresearch).get("steps") or ()

    return {
        "manifest_exposes_policy": _succeeded(manifest) and "policy" in _result(manifest),
        "lua_subset_executes": _succeeded(lua_eval) and lua_eval.get("result") == 42,
        "file_roundtrip_inside_root": _succeeded(fs_write)
        and _succeeded(fs_read)
        and fs_read.get("result") == "sandbox evidence\n",
        "project_scaffold_inside_root": _succeeded(project_create)
        and (sandbox_root / "demo" / "src" / "main.py").exists(),
        "allowlisted_cmd_runs": _succeeded(cmd_echo) and "tool-core" in (cmd_echo.get("stdout") or ""),
        "isolated_env_created": _succeeded(env_create),
        "file_api_roundtrip": _succeeded(env_file_put)
        and _succeeded(env_file_get)
        and _result(env_file_get).get("content") == "artifact\n",
        "isolated_env_runs_command": _succeeded(env_run),
        "detached_command_started": _succeeded(env_run_detached) and "stdout_path" in _result(env_run_detached),
        "isolated_env_snapshot_saved": _succeeded(env_snapshot),
        "isolated_env_forked": _succeeded(env_fork),
        "sandbox_stop_auto_snapshot": _succeeded(env_stop) and stop_auto_snapshot.get("ok") is True,
        "dashboard_present": _succeeded(dashboard) and "usage" in _result(dashboard),
        "resource_accounting_present": _succeeded(resources)
        and "sandbox_provisioned_memory_mb" in _result(resources),
        "deepresearch_lua_helper": _succeeded(deepresearch)
        and _result(deepresearch).get("method") == "deepresearch_plan"
        and len(research_steps) >= 8,
        "deeprl_lua_helper_prefers_evidence": _succeeded(deeprl)
        and _result(deeprl).get("selected_action") == "run eval with snapshot",
        # bool(...) keeps the hash string itself out of the checks mapping.
        "canvas_lua_helper_hashes_graph": _succeeded(canvas)
        and _result(canvas).get("node_count") == 2
        and bool(_result(canvas).get("canvas_sha256")),
        "path_escape_rejected": _rejected_with(escape_write, "path_escape"),
        "non_allowlisted_cmd_rejected": _rejected_with(blocked_cmd, "command_not_allowlisted"),
        "unknown_tool_rejected": _rejected_with(unknown_tool, "unknown_tool"),
        # An empty ledger must not count as "hashes present".
        "ledger_hashes_present": bool(ledger_records)
        and all(bool(row.get("input_sha256") and row.get("output_sha256")) for row in ledger_records),
        "ledger_count_matches_calls": len(ledger_records) == len(calls),
    }
| def _markdown(report: dict) -> str: | |
| lines = [ | |
| "# TinyMind Sandbox Tool Core Eval", | |
| "", | |
| f"- Sandbox tool core ready: {report['claim_gate']['sandbox_tool_core_ready']}", | |
| f"- Ledger: {report['ledger_path']}", | |
| f"- SDK inventory: {report['sdk_inventory']['json_path']}", | |
| f"- Host unrestricted execution claim allowed: {report['claim_gate']['host_unrestricted_execution_claim_allowed']}", | |
| "", | |
| "## Checks", | |
| "", | |
| ] | |
| for name, passed in report["checks"].items(): | |
| lines.append(f"- {name}: {passed}") | |
| return "\n".join(lines) + "\n" | |
Xet Storage Details
- Size: 8.08 kB
- Xet hash: 351bc0502e1b35cec7de8e2578c1a2c03143aeb1d26793accb8856b5c7c5e23a
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.