""" Speed estimation: how fast will it actually feel? Two-tier design, with provenance the UI always shows: 1. TRAINED MODEL (when present): an XGBoost regressor trained on real community measurements (LocalScore, ~33k data points), following the methodology of LLM-Pilot (IBM, SC'24, arXiv:2410.02425 — gradient boosting over hardware+model features, validated leave-one-accelerator- out). Loaded from model/speed_model.skops if scripts/train_speed_model.py has been run. method = "measured-model". 2. ROOFLINE BASELINE (always available, fully offline): decode is memory- bandwidth-bound — tok/s ~ bandwidth / bytes-read-per-token (weights + KV), times an empirical efficiency factor. See kipply's "Transformer Inference Arithmetic" and the JAX scaling book inference chapter. method = "roofline". The anti-gimmick rule lives in the training script: the trained model ships only if it beats this baseline on held-out hardware; otherwise the baseline IS the product and the UI says so. Scope note (honest): this predicts LLM/VLM decode speed. Vision (YOLO) and diffusion models are COMPUTE-bound, not bandwidth-bound — FPS scales with TFLOPS / model GFLOPs, a different axis with different data (Ultralytics publishes per-size GFLOPs and official T4 latencies; dbgpu has per-GPU TFLOPS). That path is designed in SPEED-BRICK-RESEARCH.md §8 but not built; non-LLM families keep their provenance-labelled memory verdicts only, rather than getting fake speed numbers. """ import json import re from functools import lru_cache from pathlib import Path _ROOT = Path(__file__).resolve().parent.parent _SPECS_PATH = _ROOT / "data" / "gpu_specs.json" _MODEL_PATH = _ROOT / "model" / "speed_model.skops" # Decode efficiency vs theoretical bandwidth roofline. Real stacks land well # under the ceiling; 0.55-0.70 is the typical consumer-GPU range in community # measurements. We centre conservatively and report a band, never a point. _EFF_MID, _EFF_LO, _EFF_HI = 0.60, 0.42, 0.78 # Conservative system-RAM bandwidth for offload modelling (dual-channel DDR4/5). _RAM_BW_GBS = 48.0 # Reading speed reference: ~4.5 words/s, ~0.75 words per token -> ~6 tok/s. _READING_TPS = 6.0 @lru_cache(maxsize=1) def _specs() -> dict: try: return json.loads(_SPECS_PATH.read_text(encoding="utf-8")) except OSError: return {"gpus": {}, "apple": {}, "sbc": {}} def _norm(s: str) -> str: return re.sub(r"\s+", " ", re.sub(r"[^a-z0-9 ]", " ", (s or "").lower())).strip() @lru_cache(maxsize=1) def _bw_index() -> tuple: idx = [] for name, d in _specs()["gpus"].items(): idx.append((_norm(name), float(d["bw"]), float(d.get("vram", 0)))) idx.sort(key=lambda t: -len(t[0])) # longest first: '4080 super' beats '4080' return tuple(idx) # Apple chips: the UI only knows base/Pro/Max/Ultra, not the generation. We use # M2-generation numbers as the conservative representative (older = slower). _APPLE_TIER_BW = None def _apple_bw(tier_hint: str) -> float: global _APPLE_TIER_BW if _APPLE_TIER_BW is None: a = {k: v["bw"] for k, v in _specs()["apple"].items()} _APPLE_TIER_BW = { "ultra": a.get("m2 ultra") or a.get("m1 ultra") or 800.0, "max": a.get("m2 max") or 400.0, "pro": a.get("m2 pro") or 200.0, "base": a.get("m2") or 100.0, } t = (tier_hint or "").lower() for key in ("ultra", "max", "pro"): if key in t: return _APPLE_TIER_BW[key] return _APPLE_TIER_BW["base"] def bandwidth_for_spec(spec, gpu_label: str = "") -> tuple[float | None, str]: """(memory bandwidth GB/s on the fast path, source-note) for a machine.""" if spec.is_apple_silicon: return _apple_bw(gpu_label or spec.gpu_label), "Apple unified memory (conservative M2-gen figure)" if spec.gpu_vendor in ("nvidia", "amd", "intel") and spec.vram_gb > 0: n = _norm(gpu_label or spec.gpu_label) # pass 1: name + VRAM proximity (disambiguates 8 vs 16 GB variants); # pass 2: name only — a custom VRAM override must not hide the chart. for check_vram in (True, False): for key, bw, vram in _bw_index(): if key and key in n: if check_vram and vram and spec.vram_gb and abs(vram - spec.vram_gb) > 4: continue return bw, "vendor spec sheet" return None, "" return None, "" # -------------------------------------------------------------------------- # Trained model (optional, loaded if scripts/train_speed_model.py produced it) # -------------------------------------------------------------------------- _MODEL_JSON_PATH = _ROOT / "model" / "speed_model.json" @lru_cache(maxsize=1) def _trained_model(): # Prefer XGBoost's NATIVE format: zero extra deps at runtime (the skops # artifact exists for the Hub, but its loading chain dragged in unrelated # imports on the Space). if _MODEL_JSON_PATH.exists(): try: from xgboost import XGBRegressor model = XGBRegressor() model.load_model(_MODEL_JSON_PATH) print(f"[FitCheck] speed predictor loaded from {_MODEL_JSON_PATH.name}", flush=True) return model except Exception as e: # noqa: BLE001 import sys print(f"[FitCheck] WARNING: {_MODEL_JSON_PATH.name} exists but failed " f"to load ({e!r}) — trying the skops artifact", file=sys.stderr, flush=True) if not _MODEL_PATH.exists(): return None try: from skops.io import load as skops_load # skops only loads explicitly-trusted types — exactly these two, which # scripts/train_speed_model.py produces. Anything else is refused. model = skops_load(_MODEL_PATH, trusted=[ "xgboost.core.Booster", "xgboost.sklearn.XGBRegressor", ]) print(f"[FitCheck] speed predictor loaded from {_MODEL_PATH.name}", flush=True) return model except Exception as e: # noqa: BLE001 # The file exists but won't load — say so loudly (a silent fallback # here would hide a broken deploy behind plausible roofline numbers). import sys print(f"[FitCheck] WARNING: {_MODEL_PATH.name} exists but failed to " f"load ({e!r}) — falling back to the labelled roofline estimate", file=sys.stderr, flush=True) return None _METRICS_PATH = _ROOT / "model" / "metrics.json" @lru_cache(maxsize=1) def _envelope() -> dict: """The region of feature space the training data actually covered. Decision trees cannot extrapolate: outside what they saw, they clamp to the nearest seen value and quietly give wrong answers (e.g. a 32B model gets a 14B's speed). The roofline DOES extrapolate — it's physics. So the trained model only answers inside its measured envelope; outside it, the labelled analytical estimate takes over. Bounds come from metrics.json when the training script recorded them, else conservative defaults matching the LocalScore grid (<=14B Q4 models, consumer hardware). """ env = {"bytes_gb": (0.8, 10.0), "eff_bw": (30.0, 1900.0)} try: rec = json.loads(_METRICS_PATH.read_text(encoding="utf-8")).get("envelope") if rec: env.update({k: tuple(v) for k, v in rec.items()}) except OSError: pass return env def _in_envelope(eff_bw: float, bytes_gb: float) -> bool: env = _envelope() return (env["bytes_gb"][0] <= bytes_gb <= env["bytes_gb"][1] and env["eff_bw"][0] <= eff_bw <= env["eff_bw"][1]) # -------------------------------------------------------------------------- # Prediction # -------------------------------------------------------------------------- def predict_decode_tps( *, bandwidth_gbs: float, weights_gb: float, kv_gb: float = 0.0, active_fraction: float = 1.0, offload_fraction: float = 0.0, ) -> dict: """Predict decode tokens/sec. active_fraction: MoE models only read their active experts per token. offload_fraction: share of the model living in system RAM (0 = all on GPU). """ # Bytes read per generated token: the (active) weights + the KV cache. bytes_gb = max(weights_gb * active_fraction + kv_gb, 0.05) if active_fraction < 0.9: # MoE conservatism: expert routing scatters reads across the full # weight file, so real MoE decode lands well under the active-bytes # ideal. 1.5x is a deliberate under-promise until measured data # corrects it (community MoE numbers run ~50-70% of ideal). bytes_gb *= 1.5 eff_bw = bandwidth_gbs if offload_fraction > 0: f = min(max(offload_fraction, 0.0), 1.0) eff_bw = 1.0 / ((1.0 - f) / bandwidth_gbs + f / _RAM_BW_GBS) model = _trained_model() if model is not None and _in_envelope(eff_bw, bytes_gb): try: import numpy as np x = np.array([[eff_bw, bytes_gb, weights_gb, kv_gb, active_fraction, offload_fraction, eff_bw / bytes_gb]]) tps = float(model.predict(x)[0]) return {"tps": round(tps, 1), "lo": round(tps * 0.8, 1), "hi": round(tps * 1.2, 1), "bytes_gb": round(bytes_gb, 2), "eff_bw": round(eff_bw, 1), "method": "measured-model", "note": ("predicted by a model trained on real community " "measurements (LocalScore), LLM-Pilot methodology")} except Exception: # noqa: BLE001 — fall through to roofline pass base = eff_bw / bytes_gb note = ("analytical estimate: decode speed is memory-bandwidth-bound " "(bandwidth divided by bytes read per token)") if model is not None: note += (" — this configuration is outside the measured data's range, " "so the physics formula answers instead of the trained model") return {"tps": round(base * _EFF_MID, 1), "lo": round(base * _EFF_LO, 1), "hi": round(base * _EFF_HI, 1), "bytes_gb": round(bytes_gb, 2), "eff_bw": round(eff_bw, 1), "method": "roofline", "note": note} def feel_text(pred: dict) -> str: """One honest, plain-English line from a prediction.""" tps = pred["tps"] lo, hi = pred["lo"], pred["hi"] if tps >= _READING_TPS * 4: speed_word = "much faster than you read" elif tps >= _READING_TPS * 1.5: speed_word = "faster than you read" elif tps >= _READING_TPS * 0.7: speed_word = "about reading speed" else: speed_word = "slower than reading — fine for short tasks" return f"~{tps:g} tok/s (likely {lo:g}-{hi:g}) — {speed_word}"