"""Pre-window latency check — does the model respond fast enough for a live demo? Run on the ACTUAL target hardware (your laptop / the Space). If a turn takes ~40s, switch to a smaller quant now (gemma4:e2b), not on June 13. Run: `make bench` (optionally CHIEF_ENGINEER_MODEL=gemma4:e2b) """ from __future__ import annotations import sys import time from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) # repo root on path from core import llm from core.models import Environment, Job from core.prompts import build_system_prompt N = 3 def main() -> None: if not llm.is_available(): print("⚠ Ollama not reachable — start `ollama serve` and pull the model, then re-run.") print(f" target model: {llm.MODEL}") return job = Job(geometry_type="overhang", material="PLA", description="45° bracket") env = Environment(temp=28, humidity=50) system = build_system_prompt(job, env, []) times = [] for i in range(N): t0 = time.time() out = llm.chat_json(system, "Give your recommendation for THIS job now.") dt = time.time() - t0 times.append(dt) ok = "ok" if out else "parse-fail" print(f" run {i + 1}: {dt:5.1f}s ({ok})") # Same cold/warm split + bands as preflight G2 (calibrated 6/10: warm <20s # reads fine in a narrated demo). cold, warm = times[0], (times[1:] or times) warm_avg = sum(warm) / len(warm) verdict = ("✅ fine for a live narrated demo" if warm_avg < 20 else ("🟡 long pauses — tighten prompt or use e2b/ZeroGPU" if warm_avg < 35 else "🔴 too slow — use gemma4:e2b")) print(f"\n{llm.MODEL}: warm avg {warm_avg:.1f}s (first call {cold:.1f}s) over {N} runs → {verdict}") if __name__ == "__main__": main()