Spaces:

HuggingAI4Engineering
/

CADGenBench

Running

CADGenBench / tools /generate_gt_edit_diff.py

Michael Rabinovich

Gallery: show GT "answer-key" edit-diff for editing fixtures

49e27be 3 days ago

15.6 kB

	#!/usr/bin/env python3
	"""Generate the ground-truth "answer key" edit-diff turntables (editing fixtures).

	For each editing fixture (one that ships an ``input.step`` seed) this renders
	the reference companion to the per-submission edit diff: the GT drawn as a
	translucent ghost with the correct change painted blue (added material on the
	GT body, removed material as a blue phantom of the input). See
	:func:`cadgenbench.common.edit_diff.build_gt_edit_diff_shapes`.

	Like :mod:`generate_gt_turntables`, the result is a property of the **data
	revision** (GT vs input), not of any submission, so this runs once per data
	revision and both the gallery's ground-truth row and every per-submission report
	reference the same webp via the GT proxy. One clip is written per fixture:

	- ``<fixture>/renders/edit_diff_gt.webp`` -- full turntable.

	The GT mesh comes from the trusted sidecar (no tessellation); the input mesh is
	tessellated once at the GT's deflection so the GT-vs-input edit region is found
	at one consistent scale (mirrors the eval's ``_editing_input_mesh``).

	Run locally (against checkouts), render only::

	python tools/generate_gt_edit_diff.py \
	--gt-root ../cadgenbench-data-gt --inputs-root ../cadgenbench-data \
	--out-dir ../out/gt_edit_diff --no-upload

	Drop ``--no-upload`` (and set an ``HF_TOKEN`` with write scope on the private
	GT dataset) to commit the webps, or run it on an HF GPU job exactly like
	``generate_gt_turntables.py``.
	"""
	from __future__ import annotations

	import argparse
	import os
	import subprocess
	import sys
	import tempfile
	from pathlib import Path

	from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download

	# Allow running straight from the repo without installing the leaderboard pkg;
	# cadgenbench itself must be importable (installed in the env / eval-gpu image).
	# NOTE(review): parents[2] is the directory two levels above this file's own
	# directory; presumably a ``cadgenbench`` checkout lives there -- confirm layout.
	_ANCESTORS = Path(__file__).resolve().parents
	# A shallow install (e.g. ``/tools/x.py``) has fewer than three ancestors;
	# fall back to the topmost one instead of raising IndexError at import time.
	_REPO_ROOT = _ANCESTORS[2] if len(_ANCESTORS) > 2 else _ANCESTORS[len(_ANCESTORS) - 1]
	_SRC = _REPO_ROOT / "cadgenbench" / "src"
	if _SRC.is_dir():
	    # Front of sys.path so the checkout wins over any installed copy.
	    sys.path.insert(0, str(_SRC))

	from cadgenbench.common.artifacts import StepArtifacts # noqa: E402
	from cadgenbench.common.edit_diff import render_gt_edit_diff_turntable # noqa: E402

	# Per-fixture file names: the ground-truth STEP, the mesh sidecar expected
	# next to it, and the seed ``input.step`` whose presence marks an editing
	# fixture.
	GT_STEP_NAME = "ground_truth.step"
	GT_SIDECAR_NAME = "ground_truth.mesh.npz"
	INPUT_STEP_NAME = "input.step"
	# Path of the rendered answer-key clip relative to the fixture directory;
	# it is prefixed with ``<fixture>/`` when committed to the dataset.
	FULL_NAME = "renders/edit_diff_gt.webp"
	# One commit per this many files: keeps an individual commit small and
	# rate-limit friendly.
	COMMIT_CHUNK = 60


	def _default_repo_id() -> str:
	    """Return the default ground-truth dataset repo id.

	    ``HF_DATA_GT_REPO`` wins when set; otherwise the id is
	    ``<HF_ORG>/cadgenbench-data-gt`` with ``HF_ORG`` defaulting to
	    ``HuggingAI4Engineering``. A variable that is set but empty (as CI systems
	    often export unset secrets) is treated as unset, so the result is never an
	    empty or org-less repo id.
	    """
	    org = os.getenv("HF_ORG") or "HuggingAI4Engineering"
	    return os.getenv("HF_DATA_GT_REPO") or f"{org}/cadgenbench-data-gt"


	def _default_inputs_repo_id() -> str:
	    """Return the default inputs dataset repo id (the one holding ``input.step``).

	    ``HF_DATA_REPO`` wins when set; otherwise the id is
	    ``<HF_ORG>/cadgenbench-data`` with ``HF_ORG`` defaulting to
	    ``HuggingAI4Engineering``. A variable that is set but empty is treated as
	    unset, so the result is never an empty or org-less repo id.
	    """
	    org = os.getenv("HF_ORG") or "HuggingAI4Engineering"
	    return os.getenv("HF_DATA_REPO") or f"{org}/cadgenbench-data"


	def _editing_fixture_ids(
	    api: HfApi,
	    gt_repo: str,
	    inputs_repo: str,
	    gt_root: Path | None,
	    inputs_root: Path | None,
	) -> list[str]:
	    """Return fixture ids with BOTH a ``ground_truth.step`` and an ``input.step``.

	    The ``input.step`` is what defines an editing fixture, so the intersection
	    of the two repos (or two checkouts) is exactly the editing set.

	    Args:
	        api: Hub client; only used for a side whose local root is ``None``.
	        gt_repo: Dataset repo id listed when ``gt_root`` is ``None``.
	        inputs_repo: Dataset repo id listed when ``inputs_root`` is ``None``.
	        gt_root: Local ground-truth checkout, or ``None`` to list the Hub.
	        inputs_root: Local inputs checkout, or ``None`` to list the Hub.

	    Returns:
	        Ids sorted by ``(len(id), id)``, so shorter ids come first.
	    """

	    def _ids_with(root: Path | None, repo: str, step_name: str) -> set[str]:
	        # Ids of the top-level fixture directories that directly contain step_name.
	        if root is not None:
	            return {
	                entry.name for entry in root.iterdir()
	                if entry.is_dir() and (entry / step_name).is_file()
	            }
	        ids: set[str] = set()
	        for repo_path in api.list_repo_files(repo, repo_type="dataset"):
	            # Accept only ``<fixture>/<step_name>``. A deeper path such as
	            # ``a/b/<step_name>`` is not a fixture: the local branch ignores it
	            # and the downloaders fetch ``<fixture>/<name>`` directly.
	            fixture, sep, rest = repo_path.partition("/")
	            if sep and fixture and rest == step_name:
	                ids.add(fixture)
	        return ids

	    gt_ids = _ids_with(gt_root, gt_repo, GT_STEP_NAME)
	    in_ids = _ids_with(inputs_root, inputs_repo, INPUT_STEP_NAME)
	    return sorted(gt_ids & in_ids, key=lambda s: (len(s), s))


	def _materialize_gt(
	    api: HfApi, repo_id: str, fixture: str, gt_root: Path | None,
	    cache_dir: Path, token: str | None,
	) -> Path:
	    """Return a local directory containing the fixture's GT STEP and mesh sidecar.

	    With a local checkout this is simply ``gt_root / fixture``. Otherwise both
	    files are downloaded from the Hub dataset and copied side by side into
	    ``cache_dir / "gt" / fixture``, because the sidecar has to sit next to the
	    STEP for ``StepArtifacts`` to take the trusted-mesh path (no tessellation,
	    no validation).
	    """
	    if gt_root is None:
	        fixture_dir = cache_dir / "gt" / fixture
	        fixture_dir.mkdir(parents=True, exist_ok=True)
	        for filename in (GT_STEP_NAME, GT_SIDECAR_NAME):
	            downloaded = hf_hub_download(
	                repo_id=repo_id,
	                filename=f"{fixture}/{filename}",
	                repo_type="dataset",
	                token=token,
	            )
	            copy_path = fixture_dir / filename
	            if copy_path.exists():
	                # Already materialized; keep the existing copy.
	                continue
	            copy_path.write_bytes(Path(downloaded).read_bytes())
	        return fixture_dir
	    return gt_root / fixture


	def _materialize_input(
	    api: HfApi, repo_id: str, fixture: str, inputs_root: Path | None,
	    cache_dir: Path, token: str | None,
	) -> Path:
	    """Return the local path of the fixture's ``input.step``.

	    Uses the local checkout when ``inputs_root`` is given; otherwise downloads
	    the file from the Hub dataset ``repo_id`` and returns the downloaded path.
	    """
	    if inputs_root is None:
	        downloaded = hf_hub_download(
	            repo_id=repo_id,
	            filename=f"{fixture}/{INPUT_STEP_NAME}",
	            repo_type="dataset",
	            token=token,
	        )
	        return Path(downloaded)
	    return inputs_root / fixture / INPUT_STEP_NAME


	def _render_fixture(gt_dir: Path, input_step: Path) -> bytes:
	    """Render the answer-key turntable WebP bytes for a single editing fixture.

	    The GT mesh is loaded first; the input STEP is then meshed with the GT
	    mesh's ``linear_deflection_mm`` as its deflection override, so both meshes
	    are compared at the same scale.
	    """
	    gt_artifacts = StepArtifacts(gt_dir / GT_STEP_NAME, is_ground_truth=True)
	    gt_mesh = gt_artifacts.mesh()
	    deflection = gt_mesh.linear_deflection_mm
	    input_artifacts = StepArtifacts(input_step, deflection_override=deflection)
	    input_mesh = input_artifacts.mesh()
	    return render_gt_edit_diff_turntable(gt_mesh, input_mesh)


	def _commit_in_chunks(api: HfApi, repo_id: str, ops: list[CommitOperationAdd]) -> None:
	    """Commit ``ops`` to the dataset repo in batches of ``COMMIT_CHUNK``.

	    Each batch becomes one commit whose message carries the 1-based range of
	    the operations it contains. An empty ``ops`` list creates no commit.
	    """
	    total = len(ops)
	    for start in range(0, total, COMMIT_CHUNK):
	        batch = ops[start:start + COMMIT_CHUNK]
	        first = start + 1
	        last = start + len(batch)
	        message = f"add GT edit-diff answer-key webp(s) [{first}-{last}]"
	        api.create_commit(
	            repo_id=repo_id,
	            repo_type="dataset",
	            operations=batch,
	            commit_message=message,
	        )
	        print(f" committed {len(batch)} file(s)", flush=True)


	def _resolved_fixtures(
	    parser: argparse.ArgumentParser, args: argparse.Namespace,
	    api: HfApi, gt_root: Path | None, inputs_root: Path | None,
	) -> list[str]:
	    """Return the editing fixtures selected by ``--fixtures`` / ``--limit``.

	    Starts from every editing fixture, keeps only the ids named in the
	    comma-separated ``args.fixtures`` (when given), then truncates to
	    ``args.limit`` (when given). Calls ``parser.error`` if nothing is left.
	    """
	    selected = _editing_fixture_ids(
	        api, args.repo_id, args.inputs_repo_id, gt_root, inputs_root,
	    )
	    if args.fixtures:
	        requested = {part.strip() for part in args.fixtures.split(",")}
	        requested.discard("")  # ignore empty entries such as a trailing comma
	        selected = [fixture_id for fixture_id in selected if fixture_id in requested]
	    if args.limit is not None:
	        selected = selected[: args.limit]
	    if not selected:
	        parser.error("No editing fixtures matched.")
	    return selected


	def _upload_from_out_dir(api: HfApi, repo_id: str, out_dir: Path, fixtures: list[str]) -> None:
	    """Commit the already-rendered webps found under ``out_dir`` to the GT dataset.

	    For each fixture, ``<out_dir>/<fixture>/renders/edit_diff_gt.webp`` is
	    added to the commit if it exists; fixtures without a rendered file are
	    skipped. Prints a notice and returns when no file is found at all.
	    """
	    candidates = (
	        (fixture, out_dir / fixture / "renders" / "edit_diff_gt.webp")
	        for fixture in fixtures
	    )
	    ops: list[CommitOperationAdd] = [
	        CommitOperationAdd(f"{fixture}/{FULL_NAME}", webp.read_bytes())
	        for fixture, webp in candidates
	        if webp.exists()
	    ]
	    if ops:
	        print(f"Uploading {len(ops)} file(s) to {repo_id} ...", flush=True)
	        _commit_in_chunks(api, repo_id, ops)
	    else:
	        print("Nothing to upload (no rendered files found in --out-dir).", flush=True)


	def _run_upload_only(parser: argparse.ArgumentParser, args: argparse.Namespace) -> int:
	    """Upload the ``edit_diff_gt.webp`` files already present under ``--out-dir``.

	    Fixture ids are discovered from the directory layout
	    ``<out_dir>/<fixture>/renders/edit_diff_gt.webp``. Calls ``parser.error``
	    when ``--out-dir`` is missing or holds no rendered webp. Returns 0.
	    """
	    out_arg = args.out_dir
	    if out_arg is None or not out_arg.is_dir():
	        parser.error("--upload-only requires an existing --out-dir.")
	    # With HF_TOKEN unset the client falls back to the stored CLI token.
	    api = HfApi(token=os.environ.get("HF_TOKEN"))
	    out_dir = out_arg.resolve()
	    fixtures = [
	        webp.parent.parent.name
	        for webp in out_dir.glob("*/renders/edit_diff_gt.webp")
	    ]
	    fixtures.sort(key=lambda s: (len(s), s))
	    if not fixtures:
	        parser.error(f"No edit_diff_gt.webp found under {out_dir}")
	    print(f"Uploading {len(fixtures)} fixture webp(s) from {out_dir} -> {args.repo_id}", flush=True)
	    print(f"FIXTURES: {' '.join(fixtures)}", flush=True)
	    _upload_from_out_dir(api, args.repo_id, out_dir, fixtures)
	    print("Done.", flush=True)
	    return 0


	def _run_isolated(parser: argparse.ArgumentParser, args: argparse.Namespace) -> int:
	    """Render each fixture in its own subprocess, then optionally upload once.

	    Re-invokes this script with ``--fixtures <id> --no-upload`` per fixture so
	    every render runs in a fresh process (see the ``--isolate`` help text for
	    the GL-context problem this works around). Workers inherit this process's
	    stdout/stderr, so their progress lands in the same streams.

	    Only fixtures whose worker exited with status 0 are uploaded. A webp left
	    in ``--out-dir`` by an earlier run must not be committed for a fixture
	    that failed this time.

	    Calls ``parser.error`` when ``--out-dir`` is missing, when an upload is
	    requested without ``HF_TOKEN``, when a given root does not exist, or when
	    no fixture matches.

	    Returns:
	        1 if any worker failed, otherwise 0.
	    """
	    if args.out_dir is None:
	        parser.error("--isolate requires --out-dir (workers render to disk).")
	    token = os.environ.get("HF_TOKEN")
	    if not args.no_upload and not token:
	        parser.error("HF_TOKEN required to upload (or pass --no-upload).")
	    api = HfApi(token=token)
	    gt_root = args.gt_root.resolve() if args.gt_root else None
	    inputs_root = args.inputs_root.resolve() if args.inputs_root else None
	    for label, root in (("--gt-root", gt_root), ("--inputs-root", inputs_root)):
	        if root is not None and not root.is_dir():
	            parser.error(f"{label} does not exist: {root}")

	    fixtures = _resolved_fixtures(parser, args, api, gt_root, inputs_root)
	    print(f"Isolated render of {len(fixtures)} editing fixture(s) (one subprocess each).", flush=True)
	    print(f"FIXTURES: {' '.join(fixtures)}", flush=True)

	    # Workers never upload and never recurse: --isolate is deliberately omitted.
	    base_cmd = [
	        sys.executable, str(Path(__file__).resolve()),
	        "--out-dir", str(args.out_dir), "--no-upload",
	        "--repo-id", args.repo_id, "--inputs-repo-id", args.inputs_repo_id,
	    ]
	    if gt_root is not None:
	        base_cmd += ["--gt-root", str(gt_root)]
	    if inputs_root is not None:
	        base_cmd += ["--inputs-root", str(inputs_root)]

	    failures: list[str] = []
	    for i, fixture in enumerate(fixtures, start=1):
	        print(f"=== [{i}/{len(fixtures)}] {fixture} ===", flush=True)
	        # A non-zero exit is recorded rather than raised so later fixtures still run.
	        proc = subprocess.run([*base_cmd, "--fixtures", fixture])  # noqa: S603, PLW1510
	        if proc.returncode != 0:
	            failures.append(fixture)

	    done = len(fixtures) - len(failures)
	    print(f"Isolated render complete: {done}/{len(fixtures)} ok, {len(failures)} failed.", flush=True)
	    if failures:
	        print(f"FAILED: {' '.join(failures)}", flush=True)
	    if not args.no_upload:
	        failed = set(failures)
	        rendered = [fixture for fixture in fixtures if fixture not in failed]
	        _upload_from_out_dir(api, args.repo_id, args.out_dir, rendered)
	    print("Done.", flush=True)
	    return 1 if failures else 0


	def main() -> int:
	    """Parse the command line and run the requested mode.

	    Modes, checked in this order:

	    - ``--upload-only``: commit webps already under ``--out-dir``.
	    - ``--isolate``: render each fixture in its own subprocess.
	    - default: render every selected fixture in this process, optionally
	      writing each webp to ``--out-dir``, then commit them unless
	      ``--no-upload`` is given.

	    A fixture that fails to render is logged and skipped so the rest still
	    run.

	    Returns:
	        Process exit status: 1 if any fixture failed to render, otherwise 0.
	    """
	    parser = argparse.ArgumentParser(
	        description=__doc__,
	        # Keep the module docstring's layout (command example, bullet list)
	        # instead of letting argparse re-wrap it into one paragraph.
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	    )
	    parser.add_argument(
	        "--gt-root", type=Path, default=None,
	        help="Local cadgenbench-data-gt checkout. Omit to download from the Hub.",
	    )
	    parser.add_argument(
	        "--inputs-root", type=Path, default=None,
	        help="Local cadgenbench-data checkout (holds input.step). Omit for Hub.",
	    )
	    parser.add_argument("--repo-id", default=_default_repo_id())
	    parser.add_argument("--inputs-repo-id", default=_default_inputs_repo_id())
	    parser.add_argument("--fixtures", help="Comma-separated fixture ids. Omit for all editing fixtures.")
	    parser.add_argument("--limit", type=int, default=None)
	    parser.add_argument(
	        "--out-dir", type=Path, default=None,
	        help="Also write each webp here (e.g. for local inspection).",
	    )
	    parser.add_argument(
	        "--no-upload", action="store_true",
	        help="Render only; do not commit to the GT dataset.",
	    )
	    parser.add_argument(
	        "--upload-only", action="store_true",
	        help=(
	            "Skip rendering; commit the ``edit_diff_gt.webp`` files already under "
	            "--out-dir to the GT dataset. Use after an isolated render run."
	        ),
	    )
	    parser.add_argument(
	        "--isolate", action="store_true",
	        help=(
	            "Render each fixture in its own subprocess. Works around macOS "
	            "offscreen VTK losing its GL context after many sequential Plotter "
	            "create/close cycles (not needed on the Linux EGL eval job). Implies "
	            "render-to-out-dir; upload, if requested, runs once from --out-dir."
	        ),
	    )
	    args = parser.parse_args()

	    if args.upload_only:
	        return _run_upload_only(parser, args)
	    if args.isolate:
	        return _run_isolated(parser, args)

	    token = os.environ.get("HF_TOKEN")
	    # Fail fast, before any Hub listing or rendering, exactly as the isolated
	    # path does: an upload without a token can never succeed.
	    if not args.no_upload and not token:
	        parser.error("HF_TOKEN required to upload (or pass --no-upload).")
	    api = HfApi(token=token)
	    gt_root = args.gt_root.resolve() if args.gt_root else None
	    inputs_root = args.inputs_root.resolve() if args.inputs_root else None
	    for label, root in (("--gt-root", gt_root), ("--inputs-root", inputs_root)):
	        if root is not None and not root.is_dir():
	            parser.error(f"{label} does not exist: {root}")

	    # Same --fixtures / --limit selection as the isolated path.
	    fixtures = _resolved_fixtures(parser, args, api, gt_root, inputs_root)

	    print(
	        f"Rendering {len(fixtures)} editing GT answer-key turntable(s)"
	        + ("" if args.no_upload else f" -> {args.repo_id} (will upload)"),
	        flush=True,
	    )
	    print(f"FIXTURES: {' '.join(fixtures)}", flush=True)

	    ops: list[CommitOperationAdd] = []
	    failures: list[str] = []
	    # Hub downloads of the GT files are staged in a throwaway directory.
	    with tempfile.TemporaryDirectory(prefix="gt-edit-diff-") as tmp:
	        cache_dir = Path(tmp)
	        for i, fixture in enumerate(fixtures, start=1):
	            print(f"[{i}/{len(fixtures)}] {fixture} ...", flush=True)
	            try:
	                gt_dir = _materialize_gt(
	                    api, args.repo_id, fixture, gt_root, cache_dir, token,
	                )
	                input_step = _materialize_input(
	                    api, args.inputs_repo_id, fixture, inputs_root, cache_dir, token,
	                )
	                full = _render_fixture(gt_dir, input_step)
	            except Exception as e:  # noqa: BLE001 - log and keep going
	                print(f" FAILED {type(e).__name__}: {e}", flush=True)
	                failures.append(fixture)
	                continue

	            print(f" ok: full={len(full) // 1024}KB", flush=True)

	            if args.out_dir is not None:
	                fx_out = args.out_dir / fixture / "renders"
	                fx_out.mkdir(parents=True, exist_ok=True)
	                (fx_out / "edit_diff_gt.webp").write_bytes(full)

	            ops.append(CommitOperationAdd(f"{fixture}/{FULL_NAME}", full))

	    done = len(fixtures) - len(failures)
	    print(
	        f"Rendered {done}/{len(fixtures)} fixture(s) ({len(failures)} failed).",
	        flush=True,
	    )
	    if failures:
	        print(f"FAILED: {' '.join(failures)}", flush=True)
	    if args.no_upload:
	        print("Upload skipped (--no-upload).", flush=True)
	        return 1 if failures else 0
	    print(f"Uploading {len(ops)} file(s) to {args.repo_id} ...", flush=True)
	    _commit_in_chunks(api, args.repo_id, ops)
	    print("Done.", flush=True)
	    return 1 if failures else 0


	# Script entry point: exit with the status returned by main().
	if __name__ == "__main__":
	    sys.exit(main())