bbkdevops's picture
download
raw
9.85 kB
"""Throughput-per-watt optimizer for TinyMind sparse kernels."""
from __future__ import annotations
from datetime import datetime, timezone
import json
from pathlib import Path
import subprocess
import tempfile
# Base directory for the project-relative paths below: the parent of the
# directory that contains this file.
ROOT = Path(__file__).resolve().parents[1]
# PowerShell script launched by ``_run_int4_bench``.
INT4_BENCH = ROOT / "kernels" / "int4_sparse_ptx" / "run_efficiency_bench.ps1"
# Default JSON report read by ``_int6_candidate``.
INT6_REPORT = ROOT / "reports" / "int6_cuda_eval_dll" / "int6_cuda_eval_dll_report.json"
# Default JSON report read by ``_int6_bridge_candidate``.
INT6_BRIDGE_REPORT = ROOT / "reports" / "int6_bridge_imma_eval" / "int6_bridge_imma_eval_report.json"
def _load(path: str | Path) -> dict:
    """Read a JSON report from *path* and return it as a dict.

    The file is decoded as ``utf-8-sig`` so a leading byte-order mark is
    tolerated.

    Args:
        path: Location of the JSON file.

    Returns:
        The parsed JSON object. An empty dict is returned when the file does
        not exist, when its content cannot be decoded or parsed as JSON, or
        when the top-level JSON value is not an object, so callers can always
        call ``.get`` on the result.
    """
    try:
        # Read directly instead of checking ``exists()`` first: the file may
        # disappear between the check and the read.
        data = json.loads(Path(path).read_text(encoding="utf-8-sig"))
    except (FileNotFoundError, NotADirectoryError):
        return {}
    except ValueError:
        # Covers json.JSONDecodeError and UnicodeDecodeError (both are
        # ValueError subclasses): a truncated or corrupt report is treated
        # like a missing one.
        return {}
    return data if isinstance(data, dict) else {}
def _run_int4_bench(blocks: int, threads: int, iterations: int, passes: int, run_name: str) -> dict:
    """Run the INT4 benchmark script and return its summary with diagnostics.

    The PowerShell script at ``INT4_BENCH`` is invoked with the given launch
    parameters. When ``vcvars64.bat`` exists at the hard-coded Visual Studio
    2022 Build Tools location, the command is wrapped in a temporary batch
    file that calls ``vcvars64.bat`` first (presumably so the script sees the
    MSVC build environment).

    Args:
        blocks: Value passed to the script as ``-Blocks``.
        threads: Value passed to the script as ``-Threads``.
        iterations: Value passed to the script as ``-Iterations``.
        passes: Value passed to the script as ``-Passes``.
        run_name: Value passed as ``-RunName``; also selects the summary file
            ``bench_runs/<run_name>-summary.json`` next to the script.

    Returns:
        The parsed summary file (empty if it was not produced) extended with
        an ``"invocation"`` entry holding the command, the exit code and the
        last 4000 characters of stdout and stderr. If the process could not
        be started or exceeded the 240 second timeout, ``exit_code`` is
        ``None`` and an additional ``"error"`` entry describes the failure.
    """

    def _tail(stream) -> str:
        # On timeout the captured output may be ``None`` or raw bytes.
        if stream is None:
            return ""
        if isinstance(stream, bytes):
            stream = stream.decode("utf-8", errors="replace")
        return stream[-4000:]

    command = [
        "powershell",
        "-NoLogo",
        "-NoProfile",
        "-ExecutionPolicy",
        "Bypass",
        "-File",
        str(INT4_BENCH),
        "-Blocks",
        str(blocks),
        "-Threads",
        str(threads),
        "-Iterations",
        str(iterations),
        "-Passes",
        str(passes),
        "-RunName",
        run_name,
    ]
    vcvars = Path("C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Auxiliary/Build/vcvars64.bat")
    script_path: Path | None = None
    exit_code: int | None = None
    error: str | None = None
    stdout = ""
    stderr = ""
    try:
        if vcvars.exists():
            quoted = " ".join(f'"{part}"' if (" " in part or "\\" in part or ":" in part) else part for part in command)
            # A uniquely named script avoids collisions between concurrent
            # runs; newline="" keeps the explicit CRLF line endings from
            # being translated a second time on Windows.
            with tempfile.NamedTemporaryFile(
                "w",
                prefix="tinymind_tfw_int4_bench_",
                suffix=".bat",
                delete=False,
                encoding="utf-8",
                newline="",
            ) as handle:
                script_path = Path(handle.name)
                handle.write(f'@echo off\r\ncall "{vcvars}" >nul\r\n{quoted}\r\n')
            launch = ["cmd.exe", "/d", "/c", str(script_path)]
        else:
            launch = command
        proc = subprocess.run(launch, cwd=INT4_BENCH.parent, capture_output=True, text=True, timeout=240, check=False)
    except subprocess.TimeoutExpired as exc:
        # Record the failure instead of aborting the whole optimizer run.
        error = f"benchmark timed out after {exc.timeout} seconds"
        stdout = _tail(exc.stdout)
        stderr = _tail(exc.stderr)
    except OSError as exc:
        # E.g. PowerShell or cmd.exe is not available on this machine.
        error = f"benchmark could not be run: {exc}"
    else:
        exit_code = proc.returncode
        stdout = _tail(proc.stdout)
        stderr = _tail(proc.stderr)
    finally:
        if script_path is not None:
            try:
                script_path.unlink()
            except OSError:
                # Best-effort cleanup; a leftover temp script is harmless.
                pass
    summary = INT4_BENCH.parent / "bench_runs" / f"{run_name}-summary.json"
    payload = _load(summary)
    if not isinstance(payload, dict):
        # Guard against a summary whose top-level JSON value is not an object.
        payload = {}
    invocation = {
        "command": command,
        "exit_code": exit_code,
        "stdout_tail": stdout,
        "stderr_tail": stderr,
    }
    if error is not None:
        invocation["error"] = error
    payload["invocation"] = invocation
    return payload
def _int6_candidate(report_path: str | Path = INT6_REPORT) -> dict:
    """Build the candidate row for the INT6 CUDA reference kernel.

    Args:
        report_path: JSON report to read; its ``int6_cuda_kernel`` section
            supplies the pass flag and the throughput figures.

    Returns:
        A candidate dict with name, format, source, pass flag, throughput,
        power and throughput-per-watt fields.
    """
    kernel_section = _load(report_path).get("int6_cuda_kernel", {})
    rates = kernel_section.get("throughput", {})
    # The INT6 report carries no power samples of its own, so the GPU power
    # draw at the time of this call is used as the denominator when it can
    # be queried.
    watts = _query_power_w()
    dense = float(rates.get("dense_equivalent_tops") or 0.0)
    sparse = float(rates.get("actual_sparse_tops") or 0.0)
    if watts > 0:
        per_watt = dense / watts
    else:
        per_watt = 0.0
    return {
        "name": "int6_2x4_reference_cuda",
        "format": "int6_2:4sp",
        "source": str(report_path),
        "passed": kernel_section.get("passed") is True,
        "avg_effective_tops": dense,
        "actual_sparse_tops": sparse,
        "avg_power_w": watts,
        "avg_effective_tops_per_watt": per_watt,
        "notes": "INT6 v1 uses packed CUDA reference math, not native Tensor Core INT6.",
    }
def _int6_bridge_candidate(report_path: str | Path = INT6_BRIDGE_REPORT) -> dict:
    """Build the candidate row for the fused INT6 bridge IMMA kernel.

    Metrics are taken from ``metrics["real_data"]`` when that section reports
    a positive ``avg_logical_int6_tops``, otherwise from
    ``metrics["compute_peak"]``; reports with neither section are read in
    "legacy" mode where ``metrics`` itself holds the values.

    Missing, ``null`` or empty values are treated as ``0.0`` and missing or
    ``null`` sections as empty, matching the null handling in
    ``_int6_candidate``.

    Args:
        report_path: JSON report to read.

    Returns:
        A candidate dict with name, format, source, pass flag, the metric
        mode that was used, throughput, power and claim-gate fields.
    """

    def _section(container: dict, key: str) -> dict:
        # A JSON ``null`` (or any non-object) section is treated as empty.
        value = container.get(key)
        return value if isinstance(value, dict) else {}

    def _number(container: dict, key: str) -> float:
        # ``or 0.0`` maps a JSON ``null`` to zero instead of raising.
        return float(container.get(key) or 0.0)

    report = _load(report_path)
    if not isinstance(report, dict):
        report = {}
    metrics = _section(report, "metrics")
    if "real_data" in metrics or "compute_peak" in metrics:
        real_data = _section(metrics, "real_data")
        metric_mode = "real_data" if _number(real_data, "avg_logical_int6_tops") > 0 else "compute_peak"
        selected_metrics = _section(metrics, metric_mode)
    else:
        metric_mode = "legacy"
        selected_metrics = metrics
    gate = _section(report, "claim_gate")
    power = _section(report, "power_samples")
    quality = _section(power, "quality_gate")
    logical = _number(selected_metrics, "avg_logical_int6_tops")
    hardware = _number(selected_metrics, "avg_hardware_imma_tops")
    logical_w = _number(selected_metrics, "avg_logical_int6_tops_per_watt")
    # Prefer the quality gate's active-average power; fall back to the plain
    # average of the power samples.
    power_w = float(quality.get("active_avg_power_w") or power.get("avg_power_w") or 0.0)
    return {
        "name": "int6_bridge_imma_fast",
        "format": "int6_2:4sp_bridge_imma",
        "source": str(report_path),
        "passed": gate.get("int6_bottleneck_removed") is True,
        "metric_mode": metric_mode,
        "avg_effective_tops": logical,
        "avg_hardware_imma_tops": hardware,
        "avg_power_w": power_w,
        "power_measurement_representative": gate.get("power_measurement_representative") is True,
        "real_data_movement_measured": gate.get("real_data_movement_measured") is True,
        "avg_effective_tops_per_watt": logical_w,
        "notes": "Fused two-pass IMMA.SP bridge for INT6 logical arithmetic; uses measured hardware IMMA throughput.",
    }
def _query_power_w() -> float:
    """Return the GPU power draw reported by ``nvidia-smi``.

    Only the first line of the tool's output is used.

    Returns:
        The parsed value as a float, or ``0.0`` when the tool cannot be run,
        times out, prints nothing, or prints something that is not a number.
    """
    query = ["nvidia-smi", "--query-gpu=power.draw", "--format=csv,noheader,nounits"]
    try:
        result = subprocess.run(query, capture_output=True, text=True, timeout=10, check=False)
    except (OSError, subprocess.SubprocessError):
        return 0.0
    # An empty output yields "", which fails float() and falls through to 0.0.
    first_line = next(iter(result.stdout.strip().splitlines()), "")
    try:
        watts = float(first_line.strip())
    except ValueError:
        watts = 0.0
    return watts
def build_tfw_optimizer(
    out_dir: str | Path,
    run_int4: bool = True,
    blocks: int = 160,
    threads: int = 256,
    iterations: int = 5000,
    passes: int = 3,
    int6_report: str | Path = INT6_REPORT,
    int6_bridge_report: str | Path = INT6_BRIDGE_REPORT,
) -> dict:
    """Collect kernel candidates, pick the best one and write the report.

    Candidates are the INT4 benchmark (only when ``run_int4`` is true) and
    the two INT6 report readers. Among candidates marked as passed, the one
    with the highest throughput-per-watt wins; raw throughput breaks ties.

    Args:
        out_dir: Directory that receives ``tfw_optimizer_report.json`` and
            ``tfw_optimizer_report.md``; created if missing.
        run_int4: Whether to launch the INT4 benchmark.
        blocks: Forwarded to the INT4 benchmark.
        threads: Forwarded to the INT4 benchmark.
        iterations: Forwarded to the INT4 benchmark.
        passes: Forwarded to the INT4 benchmark.
        int6_report: Report path for the INT6 reference candidate.
        int6_bridge_report: Report path for the INT6 bridge candidate.

    Returns:
        The report dict, including the paths of the two files written.
    """
    run_name = datetime.now(timezone.utc).strftime("tfw-%Y%m%d-%H%M%S")
    candidates: list[dict] = []

    if run_int4:
        bench = _run_int4_bench(blocks, threads, iterations, passes, run_name)

        def metric(key: str) -> float:
            # Summary values default to zero when the key is absent.
            return float(bench.get(key, 0.0))

        int4_row = {
            "name": "int4_sparse_tensor_core_imma_sp",
            "format": "int4_2:4sp",
            "source": bench.get("summary_path") or bench.get("bench_log"),
            "passed": bench.get("invocation", {}).get("exit_code") == 0 and metric("avg_effective_tops") > 0,
            "avg_effective_tops": metric("avg_effective_tops"),
            "best_effective_tops": metric("best_effective_tops"),
            "avg_power_w": metric("avg_power_w"),
            "max_power_w": metric("max_power_w"),
            "avg_temp_c": metric("avg_temp_c"),
            "max_temp_c": metric("max_temp_c"),
            "avg_effective_tops_per_watt": metric("avg_effective_tops_per_watt"),
            "best_effective_tops_per_watt": metric("best_effective_tops_per_watt"),
            "notes": "Uses real Ampere IMMA.SP sparse Tensor Core instruction.",
            "diagnostics": bench.get("invocation", {}),
        }
        candidates.append(int4_row)

    candidates.append(_int6_candidate(int6_report))
    candidates.append(_int6_bridge_candidate(int6_bridge_report))

    def rank(row: dict) -> tuple:
        # Efficiency first, raw throughput as the tie-breaker.
        return (row.get("avg_effective_tops_per_watt", 0.0), row.get("avg_effective_tops", 0.0))

    passing = [row for row in candidates if row.get("passed")]
    selected = max(passing, key=rank, default=None)

    if selected:
        primary, precision = selected["name"], selected["format"]
    else:
        primary, precision = None, None

    report = {
        "schema_version": "tinymind-tfw-optimizer-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "goal": "maximize measured TF/TOPS per watt first, then raw TF/TOPS, on the current RTX 3090.",
        "candidates": candidates,
        "selected": selected,
        "runtime_recommendation": {
            "primary": primary,
            "precision": precision,
            "use_int6_when": "quality/drift wins and throughput is acceptable",
            "use_int4_when": "TF/W or latency is the priority",
            "next_kernel_work": "Promote fused INT6 bridge for performance paths and keep INT6 reference for correctness drift checks.",
        },
        "claim_gate": {
            "local_tfw_winner_measured": selected is not None,
            "world_highest_tfw_claim_allowed": False,
            "reason": "World-highest TF/W requires standardized external benchmark comparison across hardware and kernels.",
        },
    }

    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    json_path = target_dir / "tfw_optimizer_report.json"
    md_path = target_dir / "tfw_optimizer_report.md"
    # The output paths are recorded before serialising so both files see them.
    report["json_path"] = str(json_path)
    report["markdown_path"] = str(md_path)
    serialized = json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True)
    json_path.write_text(serialized, encoding="utf-8")
    md_path.write_text(_markdown(report), encoding="utf-8")
    return report
def _markdown(report: dict) -> str:
    """Render the optimizer report as a Markdown document.

    Args:
        report: Report dict; ``claim_gate`` and ``candidates`` must be
            present, ``selected`` may be missing or ``None``.

    Returns:
        A summary of the selected candidate followed by a table with one row
        per candidate, terminated by a newline.
    """
    winner = report.get("selected") or {}
    summary = [
        "# TinyMind TF/W Optimizer",
        "",
        "- Selected: {}".format(winner.get("name")),
        "- Format: {}".format(winner.get("format")),
        "- Avg effective TOPS: {}".format(winner.get("avg_effective_tops")),
        "- Avg power W: {}".format(winner.get("avg_power_w")),
        "- Avg effective TOPS/W: {}".format(winner.get("avg_effective_tops_per_watt")),
        "- World-highest TF/W claim allowed: {}".format(report["claim_gate"]["world_highest_tfw_claim_allowed"]),
        "",
        "## Candidates",
        "",
        "| Name | Format | Passed | TOPS | W | TOPS/W | Notes |",
        "|---|---|---:|---:|---:|---:|---|",
    ]
    row_template = "| {name} | {fmt} | {passed} | {tops:.6f} | {watts:.2f} | {per_watt:.6f} | {notes} |"
    table = [
        row_template.format(
            name=entry.get("name"),
            fmt=entry.get("format"),
            passed=entry.get("passed"),
            tops=entry.get("avg_effective_tops", 0),
            watts=entry.get("avg_power_w", 0),
            per_watt=entry.get("avg_effective_tops_per_watt", 0),
            notes=entry.get("notes"),
        )
        for entry in report["candidates"]
    ]
    return "\n".join(summary + table) + "\n"

Xet Storage Details

Size:
9.85 kB
·
Xet hash:
d9adc589d4a15307b720b2675808a3704af72278b6d972ab52b6c19541d00e18

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.