Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /evaluation /tfw_optimizer.py
| """Throughput-per-watt optimizer for TinyMind sparse kernels.""" | |
| from __future__ import annotations | |
| from datetime import datetime, timezone | |
| import json | |
| from pathlib import Path | |
| import subprocess | |
| import tempfile | |
# Bundle root: one directory above the folder that contains this module.
ROOT = Path(__file__).resolve().parents[1]

# Script that drives the INT4 sparse kernel efficiency benchmark.
INT4_BENCH = ROOT.joinpath("kernels", "int4_sparse_ptx", "run_efficiency_bench.ps1")

# Default locations of the previously generated INT6 evaluation reports.
INT6_REPORT = ROOT.joinpath("reports", "int6_cuda_eval_dll", "int6_cuda_eval_dll_report.json")
INT6_BRIDGE_REPORT = ROOT.joinpath(
    "reports", "int6_bridge_imma_eval", "int6_bridge_imma_eval_report.json"
)
def _load(path: str | Path) -> dict:
    """Read a JSON report from *path*, returning ``{}`` when it is unusable.

    The loader is best-effort so that a missing or damaged report downgrades
    a candidate instead of aborting the whole optimizer run.

    Args:
        path: Location of the JSON file. A UTF-8 byte-order mark is accepted.

    Returns:
        The decoded top-level JSON object, or an empty dict when the file is
        missing or unreadable, is not valid JSON, or its top-level value is
        not an object.
    """
    try:
        # Read directly instead of checking exists() first: the file can
        # disappear (or be half-written) between the check and the read.
        payload = json.loads(Path(path).read_text(encoding="utf-8-sig"))
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return {}
    # Callers index the result like a dict; never hand back a list/scalar.
    return payload if isinstance(payload, dict) else {}
def _run_int4_bench(blocks: int, threads: int, iterations: int, passes: int, run_name: str) -> dict:
    """Run the INT4 sparse efficiency benchmark script and return its summary.

    The benchmark script is launched through PowerShell. When the Visual
    Studio Build Tools ``vcvars64.bat`` is present, the command is wrapped in
    a temporary batch file that calls ``vcvars64.bat`` first and is then run
    via ``cmd.exe``.

    Args:
        blocks: Value passed to the script's ``-Blocks`` argument.
        threads: Value passed to the script's ``-Threads`` argument.
        iterations: Value passed to the script's ``-Iterations`` argument.
        passes: Value passed to the script's ``-Passes`` argument.
        run_name: Value passed as ``-RunName``; also selects the
            ``bench_runs/<run_name>-summary.json`` file that is read back.

    Returns:
        The summary JSON as a dict (empty when no usable summary exists) with
        an added ``"invocation"`` entry holding the command, exit code and the
        last 4000 characters of stdout/stderr. If the process could not be
        started or exceeded the 240 second timeout, ``exit_code`` is ``None``
        and ``invocation["error"]`` describes the failure.
    """
    command = [
        "powershell",
        "-NoLogo",
        "-NoProfile",
        "-ExecutionPolicy",
        "Bypass",
        "-File",
        str(INT4_BENCH),
        "-Blocks",
        str(blocks),
        "-Threads",
        str(threads),
        "-Iterations",
        str(iterations),
        "-Passes",
        str(passes),
        "-RunName",
        run_name,
    ]
    bench_dir = INT4_BENCH.parent
    vcvars = Path("C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Auxiliary/Build/vcvars64.bat")

    script_path = None
    exit_code = None
    stdout = ""
    stderr = ""
    error = None
    try:
        if vcvars.exists():
            quoted = " ".join(
                f'"{part}"' if (" " in part or "\\" in part or ":" in part) else part
                for part in command
            )
            # A uniquely named wrapper avoids concurrent runs overwriting each
            # other's script. newline="" keeps the explicit CRLF endings from
            # being translated a second time on Windows.
            with tempfile.NamedTemporaryFile(
                "w",
                prefix="tinymind_tfw_int4_bench_",
                suffix=".bat",
                delete=False,
                encoding="utf-8",
                newline="",
            ) as handle:
                script_path = Path(handle.name)
                handle.write(f'@echo off\r\ncall "{vcvars}" >nul\r\n{quoted}\r\n')
            launch = ["cmd.exe", "/d", "/c", str(script_path)]
        else:
            launch = command
        proc = subprocess.run(launch, cwd=bench_dir, capture_output=True, text=True, timeout=240, check=False)
    except (OSError, subprocess.SubprocessError) as exc:
        # Missing interpreter, unwritable temp dir, or timeout: report the
        # failure in the diagnostics instead of aborting the optimizer.
        error = f"{type(exc).__name__}: {exc}"
    else:
        exit_code = proc.returncode
        stdout = proc.stdout or ""
        stderr = proc.stderr or ""
    finally:
        if script_path is not None:
            try:
                script_path.unlink()
            except OSError:
                # Best-effort cleanup; a leftover wrapper script is harmless.
                pass

    payload = _load(bench_dir / "bench_runs" / f"{run_name}-summary.json")
    if not isinstance(payload, dict):
        payload = {}
    invocation = {
        "command": command,
        "exit_code": exit_code,
        "stdout_tail": stdout[-4000:],
        "stderr_tail": stderr[-4000:],
    }
    if error is not None:
        invocation["error"] = error
    payload["invocation"] = invocation
    return payload
def _int6_candidate(report_path: str | Path = INT6_REPORT) -> dict:
    """Build the candidate row for the INT6 2:4 reference CUDA kernel.

    Throughput figures are read from the ``int6_cuda_kernel.throughput``
    section of the report at *report_path*. The report carries no power
    samples, so the current GPU power draw is queried and used as the
    denominator for TOPS/W.

    NOTE(review): the power reading is taken when this function runs, not
    while the kernel was benchmarked -- confirm it is representative.

    Args:
        report_path: Path to the INT6 CUDA evaluation report JSON.

    Returns:
        A candidate dict; ``avg_effective_tops_per_watt`` is ``0.0`` when no
        positive power reading is available.
    """
    kernel_section = _load(report_path).get("int6_cuda_kernel", {})
    tput = kernel_section.get("throughput", {})
    watts = _query_power_w()
    dense = float(tput.get("dense_equivalent_tops") or 0.0)
    sparse = float(tput.get("actual_sparse_tops") or 0.0)

    # Guard against division by zero when no power reading was obtained.
    if watts > 0:
        per_watt = dense / watts
    else:
        per_watt = 0.0

    candidate = {
        "name": "int6_2x4_reference_cuda",
        "format": "int6_2:4sp",
        "source": str(report_path),
        "passed": kernel_section.get("passed") is True,
        "avg_effective_tops": dense,
        "actual_sparse_tops": sparse,
        "avg_power_w": watts,
        "avg_effective_tops_per_watt": per_watt,
        "notes": "INT6 v1 uses packed CUDA reference math, not native Tensor Core INT6.",
    }
    return candidate
def _int6_bridge_candidate(report_path: str | Path = INT6_BRIDGE_REPORT) -> dict:
    """Build the candidate row for the fused INT6 bridge (IMMA.SP) kernel.

    The report's ``metrics`` section is read in one of three modes:

    * ``real_data`` -- used when that sub-section reports a positive
      ``avg_logical_int6_tops``;
    * ``compute_peak`` -- used when either sub-section key exists but
      ``real_data`` has no positive logical throughput;
    * ``legacy`` -- neither key exists, so ``metrics`` itself is read.

    Missing keys and JSON ``null`` values (sections or numbers) are treated
    as empty / ``0.0`` so an incomplete report yields a zeroed candidate
    instead of raising.

    Args:
        report_path: Path to the INT6 bridge evaluation report JSON.

    Returns:
        A candidate dict. ``avg_effective_tops_per_watt`` is taken from the
        report as-is; it is not recomputed from ``avg_power_w``.
    """
    report = _load(report_path)
    metrics = report.get("metrics") or {}
    if "real_data" in metrics or "compute_peak" in metrics:
        real_data = metrics.get("real_data") or {}
        has_real_data = float(real_data.get("avg_logical_int6_tops") or 0.0) > 0
        metric_mode = "real_data" if has_real_data else "compute_peak"
        selected_metrics = metrics.get(metric_mode) or {}
    else:
        metric_mode = "legacy"
        selected_metrics = metrics

    gate = report.get("claim_gate") or {}
    power = report.get("power_samples") or {}
    quality = power.get("quality_gate") or {}

    logical = float(selected_metrics.get("avg_logical_int6_tops") or 0.0)
    hardware = float(selected_metrics.get("avg_hardware_imma_tops") or 0.0)
    logical_w = float(selected_metrics.get("avg_logical_int6_tops_per_watt") or 0.0)
    # Prefer the quality-gated active power; fall back to the overall average.
    power_w = float(quality.get("active_avg_power_w") or power.get("avg_power_w") or 0.0)

    return {
        "name": "int6_bridge_imma_fast",
        "format": "int6_2:4sp_bridge_imma",
        "source": str(report_path),
        "passed": gate.get("int6_bottleneck_removed") is True,
        "metric_mode": metric_mode,
        "avg_effective_tops": logical,
        "avg_hardware_imma_tops": hardware,
        "avg_power_w": power_w,
        "power_measurement_representative": gate.get("power_measurement_representative") is True,
        "real_data_movement_measured": gate.get("real_data_movement_measured") is True,
        "avg_effective_tops_per_watt": logical_w,
        "notes": "Fused two-pass IMMA.SP bridge for INT6 logical arithmetic; uses measured hardware IMMA throughput.",
    }
def _query_power_w() -> float:
    """Return the power draw reported by ``nvidia-smi``, or ``0.0``.

    Only the first line of ``nvidia-smi`` output is used. ``0.0`` is
    returned when the tool cannot be run, times out after 10 seconds,
    prints nothing, or prints a value that is not a number.
    """
    query = ["nvidia-smi", "--query-gpu=power.draw", "--format=csv,noheader,nounits"]
    try:
        result = subprocess.run(query, capture_output=True, text=True, timeout=10, check=False)
    except (OSError, subprocess.SubprocessError):
        return 0.0

    rows = result.stdout.strip().splitlines()
    if not rows:
        return 0.0

    reading = rows[0].strip()
    try:
        return float(reading)
    except ValueError:
        # Non-numeric reading: treat as "no measurement available".
        return 0.0
def build_tfw_optimizer(
    out_dir: str | Path,
    run_int4: bool = True,
    blocks: int = 160,
    threads: int = 256,
    iterations: int = 5000,
    passes: int = 3,
    int6_report: str | Path = INT6_REPORT,
    int6_bridge_report: str | Path = INT6_BRIDGE_REPORT,
) -> dict:
    """Collect kernel candidates, pick the best TOPS/W one and write reports.

    Candidates are the (optional) freshly run INT4 sparse benchmark, the INT6
    reference CUDA report and the INT6 bridge report. Among candidates whose
    ``passed`` flag is true, the winner maximizes
    ``avg_effective_tops_per_watt`` first and ``avg_effective_tops`` second.

    Args:
        out_dir: Directory (created if needed) that receives
            ``tfw_optimizer_report.json`` and ``tfw_optimizer_report.md``.
        run_int4: When true, run the INT4 benchmark and include it.
        blocks: Forwarded to the INT4 benchmark.
        threads: Forwarded to the INT4 benchmark.
        iterations: Forwarded to the INT4 benchmark.
        passes: Forwarded to the INT4 benchmark.
        int6_report: Path to the INT6 CUDA evaluation report.
        int6_bridge_report: Path to the INT6 bridge evaluation report.

    Returns:
        The report dict that was written, including the output paths.
        ``selected`` is ``None`` when no candidate passed.
    """
    run_name = datetime.now(timezone.utc).strftime("tfw-%Y%m%d-%H%M%S")
    candidates: list[dict] = []
    if run_int4:
        int4 = _run_int4_bench(blocks, threads, iterations, passes, run_name)
        invocation = int4.get("invocation") or {}

        def metric(key: str) -> float:
            # Tolerate absent and JSON-null summary fields: a benchmark that
            # produced no measurement should fail the gate, not raise.
            return float(int4.get(key) or 0.0)

        candidates.append(
            {
                "name": "int4_sparse_tensor_core_imma_sp",
                "format": "int4_2:4sp",
                "source": int4.get("summary_path") or int4.get("bench_log"),
                "passed": invocation.get("exit_code") == 0 and metric("avg_effective_tops") > 0,
                "avg_effective_tops": metric("avg_effective_tops"),
                "best_effective_tops": metric("best_effective_tops"),
                "avg_power_w": metric("avg_power_w"),
                "max_power_w": metric("max_power_w"),
                "avg_temp_c": metric("avg_temp_c"),
                "max_temp_c": metric("max_temp_c"),
                "avg_effective_tops_per_watt": metric("avg_effective_tops_per_watt"),
                "best_effective_tops_per_watt": metric("best_effective_tops_per_watt"),
                "notes": "Uses real Ampere IMMA.SP sparse Tensor Core instruction.",
                "diagnostics": invocation,
            }
        )
    candidates.append(_int6_candidate(int6_report))
    candidates.append(_int6_bridge_candidate(int6_bridge_report))

    # Only candidates that passed their own gate compete for selection.
    runnable = [row for row in candidates if row.get("passed")]
    selected = (
        max(
            runnable,
            key=lambda row: (row.get("avg_effective_tops_per_watt", 0.0), row.get("avg_effective_tops", 0.0)),
        )
        if runnable
        else None
    )

    report = {
        "schema_version": "tinymind-tfw-optimizer-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "goal": "maximize measured TF/TOPS per watt first, then raw TF/TOPS, on the current RTX 3090.",
        "candidates": candidates,
        "selected": selected,
        "runtime_recommendation": {
            "primary": selected["name"] if selected else None,
            "precision": selected["format"] if selected else None,
            "use_int6_when": "quality/drift wins and throughput is acceptable",
            "use_int4_when": "TF/W or latency is the priority",
            "next_kernel_work": "Promote fused INT6 bridge for performance paths and keep INT6 reference for correctness drift checks.",
        },
        "claim_gate": {
            "local_tfw_winner_measured": selected is not None,
            "world_highest_tfw_claim_allowed": False,
            "reason": "World-highest TF/W requires standardized external benchmark comparison across hardware and kernels.",
        },
    }

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    json_path = out / "tfw_optimizer_report.json"
    md_path = out / "tfw_optimizer_report.md"
    # Record the output locations inside the report before serializing it.
    report["json_path"] = str(json_path)
    report["markdown_path"] = str(md_path)
    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    md_path.write_text(_markdown(report), encoding="utf-8")
    return report
def _markdown(report: dict) -> str:
    """Render *report* as Markdown: a summary list plus a candidate table.

    Args:
        report: Optimizer report containing ``claim_gate`` and ``candidates``
            and, optionally, a ``selected`` candidate.

    Returns:
        The Markdown document, terminated by a newline. Summary fields show
        ``None`` when no candidate was selected.
    """
    winner = report.get("selected") or {}
    summary = [
        "# TinyMind TF/W Optimizer",
        "",
        f"- Selected: {winner.get('name')}",
        f"- Format: {winner.get('format')}",
        f"- Avg effective TOPS: {winner.get('avg_effective_tops')}",
        f"- Avg power W: {winner.get('avg_power_w')}",
        f"- Avg effective TOPS/W: {winner.get('avg_effective_tops_per_watt')}",
        f"- World-highest TF/W claim allowed: {report['claim_gate']['world_highest_tfw_claim_allowed']}",
        "",
        "## Candidates",
        "",
        "| Name | Format | Passed | TOPS | W | TOPS/W | Notes |",
        "|---|---|---:|---:|---:|---:|---|",
    ]
    # One table row per candidate; missing numeric fields are shown as zero.
    table = [
        f"| {entry.get('name')} | {entry.get('format')} | {entry.get('passed')} | "
        f"{entry.get('avg_effective_tops', 0):.6f} | {entry.get('avg_power_w', 0):.2f} | "
        f"{entry.get('avg_effective_tops_per_watt', 0):.6f} | {entry.get('notes')} |"
        for entry in report["candidates"]
    ]
    return "\n".join(summary + table) + "\n"
Xet Storage Details
- Size:
- 9.85 kB
- Xet hash:
- d9adc589d4a15307b720b2675808a3704af72278b6d972ab52b6c19541d00e18
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.