Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /evaluation /tfw_optimizer.py

bbkdevops

about 1 month ago

download

raw

9.85 kB

	"""Throughput-per-watt optimizer for TinyMind sparse kernels."""

	from __future__ import annotations

	from datetime import datetime, timezone
	import json
	from pathlib import Path
	import subprocess
	import tempfile


	# Directory one level above the folder that contains this module.
	ROOT = Path(__file__).resolve().parents[1]
	# PowerShell script invoked by _run_int4_bench; benchmark runs use its
	# parent directory as the working directory.
	INT4_BENCH = ROOT / "kernels" / "int4_sparse_ptx" / "run_efficiency_bench.ps1"
	# Default JSON report read by _int6_candidate.
	INT6_REPORT = ROOT / "reports" / "int6_cuda_eval_dll" / "int6_cuda_eval_dll_report.json"
	# Default JSON report read by _int6_bridge_candidate.
	INT6_BRIDGE_REPORT = ROOT / "reports" / "int6_bridge_imma_eval" / "int6_bridge_imma_eval_report.json"


	def _load(path: str \| Path) -> dict:
	p = Path(path)
	return json.loads(p.read_text(encoding="utf-8-sig")) if p.exists() else {}


	def _run_int4_bench(blocks: int, threads: int, iterations: int, passes: int, run_name: str) -> dict:
	    """Run the INT4 benchmark script and return its summary plus diagnostics.

	    The PowerShell script at ``INT4_BENCH`` is executed from its own
	    directory.  When the Visual Studio 2022 Build Tools ``vcvars64.bat`` is
	    present, the command is wrapped in a batch file that calls it first.

	    Args:
	        blocks: Value passed to the script as ``-Blocks``.
	        threads: Value passed to the script as ``-Threads``.
	        iterations: Value passed to the script as ``-Iterations``.
	        passes: Value passed to the script as ``-Passes``.
	        run_name: Value passed as ``-RunName``; also used to locate
	            ``bench_runs/<run_name>-summary.json`` next to the script.

	    Returns:
	        The contents of the summary file (empty if it could not be loaded)
	        with an added ``"invocation"`` entry holding the command, the exit
	        code and the last 4000 characters of stdout and stderr.

	    Raises:
	        subprocess.TimeoutExpired: If the benchmark exceeds 240 seconds.
	        OSError: If the interpreter (``powershell`` / ``cmd.exe``) cannot
	            be started.
	    """
	    command = [
	        "powershell",
	        "-NoLogo",
	        "-NoProfile",
	        "-ExecutionPolicy",
	        "Bypass",
	        "-File",
	        str(INT4_BENCH),
	        "-Blocks",
	        str(blocks),
	        "-Threads",
	        str(threads),
	        "-Iterations",
	        str(iterations),
	        "-Passes",
	        str(passes),
	        "-RunName",
	        run_name,
	    ]
	    run_kwargs = {
	        "cwd": INT4_BENCH.parent,
	        "capture_output": True,
	        "text": True,
	        # Tool output may not match the locale encoding; never fail on decode.
	        "errors": "replace",
	        "timeout": 240,
	        "check": False,
	    }
	    vcvars = Path("C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Auxiliary/Build/vcvars64.bat")
	    if vcvars.exists():
	        quoted = " ".join(f'"{part}"' if (" " in part or "\\" in part or ":" in part) else part for part in command)
	        script_text = f'@echo off\r\ncall "{vcvars}" >nul\r\n{quoted}\r\n'
	        # A private scratch directory keeps concurrent runs from overwriting
	        # each other's wrapper script and removes it afterwards.
	        with tempfile.TemporaryDirectory(prefix="tinymind_tfw_") as scratch:
	            script_path = Path(scratch) / "tinymind_tfw_int4_bench.bat"
	            # Write bytes so the explicit CRLF endings are not translated a
	            # second time by text-mode newline handling on Windows.
	            script_path.write_bytes(script_text.encode("utf-8"))
	            proc = subprocess.run(["cmd.exe", "/d", "/c", str(script_path)], **run_kwargs)
	    else:
	        proc = subprocess.run(command, **run_kwargs)
	    summary = INT4_BENCH.parent / "bench_runs" / f"{run_name}-summary.json"
	    payload = _load(summary)
	    payload["invocation"] = {
	        "command": command,
	        "exit_code": proc.returncode,
	        "stdout_tail": proc.stdout[-4000:],
	        "stderr_tail": proc.stderr[-4000:],
	    }
	    return payload


	def _int6_candidate(report_path: str | Path = INT6_REPORT) -> dict:
	    """Build the candidate row for the INT6 CUDA reference kernel.

	    Args:
	        report_path: JSON report to read; its ``int6_cuda_kernel`` section
	            supplies the pass flag and throughput figures.

	    Returns:
	        A candidate dict.  ``avg_effective_tops`` is the report's
	        ``dense_equivalent_tops``; ``avg_effective_tops_per_watt`` divides
	        that by the power reading taken now, or is 0.0 when no positive
	        reading is available.
	    """
	    report = _load(report_path)
	    # ``or {}`` also covers sections that are present but null in the JSON.
	    kernel = report.get("int6_cuda_kernel") or {}
	    throughput = kernel.get("throughput") or {}
	    # The current INT6 DLL report does not sample power directly. Use present
	    # GPU power draw as a conservative denominator if available.
	    power_w = _query_power_w()
	    dense_tops = float(throughput.get("dense_equivalent_tops") or 0.0)
	    actual_tops = float(throughput.get("actual_sparse_tops") or 0.0)
	    return {
	        "name": "int6_2x4_reference_cuda",
	        "format": "int6_2:4sp",
	        "source": str(report_path),
	        "passed": kernel.get("passed") is True,
	        "avg_effective_tops": dense_tops,
	        "actual_sparse_tops": actual_tops,
	        "avg_power_w": power_w,
	        "avg_effective_tops_per_watt": dense_tops / power_w if power_w > 0 else 0.0,
	        "notes": "INT6 v1 uses packed CUDA reference math, not native Tensor Core INT6.",
	    }


	def _int6_bridge_candidate(report_path: str | Path = INT6_BRIDGE_REPORT) -> dict:
	    """Build the candidate row for the fused INT6 bridge (IMMA) kernel.

	    The report's ``metrics`` section may be split into ``real_data`` and
	    ``compute_peak`` sub-sections; ``real_data`` is used when it reports a
	    positive ``avg_logical_int6_tops``, otherwise ``compute_peak``.  When
	    neither key exists the flat ``metrics`` mapping is used ("legacy").

	    Args:
	        report_path: JSON report to read.

	    Returns:
	        A candidate dict.  Numeric fields fall back to 0.0 when the
	        corresponding report value is missing or null, matching how
	        ``_int6_candidate`` treats its inputs.
	    """
	    report = _load(report_path)
	    # ``or {}`` / ``or 0.0`` also cover keys that are present but null.
	    metrics = report.get("metrics") or {}
	    if "real_data" in metrics or "compute_peak" in metrics:
	        real_data = metrics.get("real_data") or {}
	        has_real_data = float(real_data.get("avg_logical_int6_tops") or 0.0) > 0
	        metric_mode = "real_data" if has_real_data else "compute_peak"
	        selected_metrics = metrics.get(metric_mode) or {}
	    else:
	        metric_mode = "legacy"
	        selected_metrics = metrics
	    gate = report.get("claim_gate") or {}
	    power = report.get("power_samples") or {}
	    logical = float(selected_metrics.get("avg_logical_int6_tops") or 0.0)
	    hardware = float(selected_metrics.get("avg_hardware_imma_tops") or 0.0)
	    logical_w = float(selected_metrics.get("avg_logical_int6_tops_per_watt") or 0.0)
	    quality = power.get("quality_gate") or {}
	    # Prefer the quality gate's active-power average over the plain average.
	    power_w = float(quality.get("active_avg_power_w") or power.get("avg_power_w") or 0.0)
	    return {
	        "name": "int6_bridge_imma_fast",
	        "format": "int6_2:4sp_bridge_imma",
	        "source": str(report_path),
	        "passed": gate.get("int6_bottleneck_removed") is True,
	        "metric_mode": metric_mode,
	        "avg_effective_tops": logical,
	        "avg_hardware_imma_tops": hardware,
	        "avg_power_w": power_w,
	        "power_measurement_representative": gate.get("power_measurement_representative") is True,
	        "real_data_movement_measured": gate.get("real_data_movement_measured") is True,
	        "avg_effective_tops_per_watt": logical_w,
	        "notes": "Fused two-pass IMMA.SP bridge for INT6 logical arithmetic; uses measured hardware IMMA throughput.",
	    }


	def _query_power_w() -> float:
	try:
	proc = subprocess.run(
	["nvidia-smi", "--query-gpu=power.draw", "--format=csv,noheader,nounits"],
	capture_output=True,
	text=True,
	timeout=10,
	check=False,
	)
	except (OSError, subprocess.SubprocessError):
	return 0.0
	first = proc.stdout.strip().splitlines()[0:1]
	if not first:
	return 0.0
	try:
	return float(first[0].strip())
	except ValueError:
	return 0.0


	def _as_float(value: object) -> float:
	    """Convert *value* to float, treating ``None`` (JSON null) as 0.0."""
	    return float(value or 0.0)


	def _int4_candidate(int4: dict) -> dict:
	    """Turn an INT4 benchmark summary from ``_run_int4_bench`` into a candidate row."""
	    invocation = int4.get("invocation") or {}
	    avg_tops = _as_float(int4.get("avg_effective_tops"))
	    return {
	        "name": "int4_sparse_tensor_core_imma_sp",
	        "format": "int4_2:4sp",
	        "source": int4.get("summary_path") or int4.get("bench_log"),
	        # A run only counts when the script exited cleanly and measured something.
	        "passed": invocation.get("exit_code") == 0 and avg_tops > 0,
	        "avg_effective_tops": avg_tops,
	        "best_effective_tops": _as_float(int4.get("best_effective_tops")),
	        "avg_power_w": _as_float(int4.get("avg_power_w")),
	        "max_power_w": _as_float(int4.get("max_power_w")),
	        "avg_temp_c": _as_float(int4.get("avg_temp_c")),
	        "max_temp_c": _as_float(int4.get("max_temp_c")),
	        "avg_effective_tops_per_watt": _as_float(int4.get("avg_effective_tops_per_watt")),
	        "best_effective_tops_per_watt": _as_float(int4.get("best_effective_tops_per_watt")),
	        "notes": "Uses real Ampere IMMA.SP sparse Tensor Core instruction.",
	        "diagnostics": invocation,
	    }


	def build_tfw_optimizer(
	    out_dir: str | Path,
	    run_int4: bool = True,
	    blocks: int = 160,
	    threads: int = 256,
	    iterations: int = 5000,
	    passes: int = 3,
	    int6_report: str | Path = INT6_REPORT,
	    int6_bridge_report: str | Path = INT6_BRIDGE_REPORT,
	) -> dict:
	    """Collect kernel candidates, pick the best TOPS/W one and write reports.

	    Args:
	        out_dir: Directory (created if needed) that receives
	            ``tfw_optimizer_report.json`` and ``tfw_optimizer_report.md``.
	        run_int4: When true, run the INT4 benchmark and include its row.
	        blocks: Forwarded to the INT4 benchmark.
	        threads: Forwarded to the INT4 benchmark.
	        iterations: Forwarded to the INT4 benchmark.
	        passes: Forwarded to the INT4 benchmark.
	        int6_report: Report path for the INT6 reference candidate.
	        int6_bridge_report: Report path for the INT6 bridge candidate.

	    Returns:
	        The report dict that was written, including ``json_path`` and
	        ``markdown_path``.  ``selected`` is ``None`` when no candidate
	        passed.
	    """
	    run_name = datetime.now(timezone.utc).strftime("tfw-%Y%m%d-%H%M%S")
	    candidates: list[dict] = []
	    if run_int4:
	        int4 = _run_int4_bench(blocks, threads, iterations, passes, run_name)
	        candidates.append(_int4_candidate(int4))
	    candidates.append(_int6_candidate(int6_report))
	    candidates.append(_int6_bridge_candidate(int6_bridge_report))

	    runnable = [row for row in candidates if row.get("passed")]
	    # Rank by TOPS/W first; raw TOPS only breaks ties.
	    selected = max(runnable, key=lambda row: (row.get("avg_effective_tops_per_watt", 0.0), row.get("avg_effective_tops", 0.0))) if runnable else None
	    report = {
	        "schema_version": "tinymind-tfw-optimizer-v1",
	        "created_at": datetime.now(timezone.utc).isoformat(),
	        "goal": "maximize measured TF/TOPS per watt first, then raw TF/TOPS, on the current RTX 3090.",
	        "candidates": candidates,
	        "selected": selected,
	        "runtime_recommendation": {
	            "primary": selected["name"] if selected else None,
	            "precision": selected["format"] if selected else None,
	            "use_int6_when": "quality/drift wins and throughput is acceptable",
	            "use_int4_when": "TF/W or latency is the priority",
	            "next_kernel_work": "Promote fused INT6 bridge for performance paths and keep INT6 reference for correctness drift checks.",
	        },
	        "claim_gate": {
	            "local_tfw_winner_measured": selected is not None,
	            "world_highest_tfw_claim_allowed": False,
	            "reason": "World-highest TF/W requires standardized external benchmark comparison across hardware and kernels.",
	        },
	    }
	    out = Path(out_dir)
	    out.mkdir(parents=True, exist_ok=True)
	    json_path = out / "tfw_optimizer_report.json"
	    md_path = out / "tfw_optimizer_report.md"
	    # Record the output locations inside the report before serialising it.
	    report["json_path"] = str(json_path)
	    report["markdown_path"] = str(md_path)
	    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
	    md_path.write_text(_markdown(report), encoding="utf-8")
	    return report


	def _markdown(report: dict) -> str:
	selected = report.get("selected") or {}
	lines = [
	"# TinyMind TF/W Optimizer",
	"",
	f"- Selected: {selected.get('name')}",
	f"- Format: {selected.get('format')}",
	f"- Avg effective TOPS: {selected.get('avg_effective_tops')}",
	f"- Avg power W: {selected.get('avg_power_w')}",
	f"- Avg effective TOPS/W: {selected.get('avg_effective_tops_per_watt')}",
	f"- World-highest TF/W claim allowed: {report['claim_gate']['world_highest_tfw_claim_allowed']}",
	"",
	"## Candidates",
	"",
	"\| Name \| Format \| Passed \| TOPS \| W \| TOPS/W \| Notes \|",
	"\|---\|---\|---:\|---:\|---:\|---:\|---\|",
	]
	for row in report["candidates"]:
	lines.append(
	f"\| {row.get('name')} \| {row.get('format')} \| {row.get('passed')} \| "
	f"{row.get('avg_effective_tops', 0):.6f} \| {row.get('avg_power_w', 0):.2f} \| "
	f"{row.get('avg_effective_tops_per_watt', 0):.6f} \| {row.get('notes')} \|"
	)
	return "\n".join(lines) + "\n"

Xet Storage Details

Size: 9.85 kB
Xet hash: d9adc589d4a15307b720b2675808a3704af72278b6d972ab52b6c19541d00e18

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.