Spaces:
Runtime error
Runtime error
| """Preflight — gated GO/NO-GO check for the real local stack. Run this FIRST | |
| thing when you sit down locally, before touching anything else: | |
| ollama serve & (if not already running) | |
| make preflight [CHIEF_ENGINEER_MODEL=gemma4:e2b make preflight] | |
| (or: uv run python -m scripts.preflight) | |
| It exercises the REAL model path (the thing the sandbox could never verify) and | |
| grades every gate the demo depends on. Each FAIL points at the matching section | |
| of docs/plan/06-CONTINGENCY.md — so a failure costs minutes, not a night. | |
| Never touches demo state: uses a temp ledger copy. Offline gates still run | |
| without Ollama (reported as SKIP for the live ones). Exit code 1 if any | |
| REQUIRED gate fails — safe to wire into a pre-record ritual. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import os | |
| import shutil | |
| import sys | |
| import tempfile | |
| import time | |
| from pathlib import Path | |
| HERE = Path(__file__).resolve().parent.parent | |
| sys.path.insert(0, str(HERE)) | |
| from core import llm # noqa: E402 | |
| from core.ledger import LedgerManager # noqa: E402 | |
| from core.models import Advice, Environment, Job # noqa: E402 | |
| from core.prompts import REFLECT_SYSTEM, build_reflect_prompt, build_system_prompt # noqa: E402 | |
| from core.spine import SpineValidator # noqa: E402 | |
| from core.models import PrintSettings # noqa: E402 | |
# Every outcome appended by record(), in call order; main() scans this list
# to decide the final GO / NO-GO verdict.
RESULTS: list[tuple[str, str, str]] = []  # (gate, status, detail)
# Recovery playbook that FAIL lines and the NO-GO summary point the operator to.
CONTINGENCY = "docs/plan/06-CONTINGENCY.md"
def record(gate: str, status: str, detail: str, section: str = "") -> None:
    """Store one gate outcome in RESULTS and print it as a single status line.

    Args:
        gate: Display name of the gate (e.g. "G1 env").
        status: One of "PASS", "WARN", "FAIL" or "SKIP"; any other value
            raises KeyError from the icon lookup (after the result is stored).
        detail: Human-readable explanation shown after the status.
        section: Contingency-doc section to point at; only printed when the
            status is "FAIL".
    """
    RESULTS.append((gate, status, detail))
    icons = {"PASS": "✅", "WARN": "🟡", "FAIL": "🔴", "SKIP": "⏭"}
    badge = icons[status]
    # The contingency pointer is only useful on a hard failure.
    if status == "FAIL" and section:
        pointer = f" → see {CONTINGENCY} §{section}"
    else:
        pointer = ""
    print(f"{badge} {gate}: {status} — {detail}{pointer}")
def _temp_ledger() -> LedgerManager:
    """Return a LedgerManager backed by a throwaway copy of the seed lessons.

    A fresh temp directory is created so the preflight never writes to the
    real ledger. The seed file is copied in when it exists; otherwise an empty
    ledger file is created. The temp directory is removed at interpreter exit
    so repeated runs do not pile up `preflight_*` directories.
    """
    import atexit

    workdir = Path(tempfile.mkdtemp(prefix="preflight_"))
    # Best-effort cleanup: ignore_errors keeps a locked file from turning
    # interpreter shutdown into a traceback.
    atexit.register(shutil.rmtree, workdir, ignore_errors=True)

    tmp = workdir / "lessons.jsonl"
    seeds = HERE / "data" / "seed_lessons.jsonl"
    if seeds.exists():
        shutil.copy(seeds, tmp)
    else:
        tmp.touch()
    return LedgerManager(path=tmp)
| # --- G1: environment --------------------------------------------------------- | |
def g1_environment() -> bool:
    """Gate G1: check that the live backend is usable with the configured model.

    Records FAIL and returns False when the model tag is the known-bad "4b",
    when `llm.is_available()` is false, or when the daemon's model list does
    not contain `llm.MODEL` (including the case where nothing is pulled at
    all). If the model list itself cannot be read, a WARN is recorded and the
    presence check is skipped rather than reported as passed.

    Returns:
        True when the live gates should be attempted, False otherwise.
    """
    if "4b" == llm.MODEL.split(":")[-1]:
        record("G1 env", "FAIL", f"model tag '{llm.MODEL}' — gemma4:4b DOES NOT EXIST (Kaggle landmine)", "G1")
        return False
    if not llm.is_available():
        record("G1 env", "FAIL", "Ollama daemon unreachable (is `ollama serve` running?)", "G1")
        return False

    # tags stays None when the listing could not be read, which is different
    # from an empty list (daemon reachable, zero models pulled).
    try:
        import ollama
        tags = [m.get("model") or m.get("name") for m in ollama.list().get("models", [])]
    except Exception as e:
        tags = None
        record("G1 env", "WARN", f"daemon up but list() odd: {e!r}")

    if tags is not None:
        # Substring match so "gemma4:e2b" also matches longer local tag names.
        if not any(llm.MODEL in (t or "") for t in tags):
            record("G1 env", "FAIL", f"'{llm.MODEL}' not pulled. Available: {tags}", "G1")
            return False
        record("G1 env", "PASS", f"daemon up, model '{llm.MODEL}' present ({len(tags)} tags local)")

    _tiny_titan_check()
    return True
def _tiny_titan_check() -> None:
    """Report Tiny Titan ($1.5k ≤4B special award) eligibility from `ollama show`.

    Informational only — it records PASS/WARN/SKIP and never FAIL, so it cannot
    block the demo. Verified 6/10: the field guide's 32B cap counts TOTAL
    params ("not just active"); no ruling was found for MatFormer E-models
    (raw 5.1B/8.0B vs effective ~2B/~4B) on the ≤4B award, so E-models are
    treated as ambiguous: ASK in the org discussions before tagging.
    """
    import re

    try:
        import ollama
        info = ollama.show(llm.MODEL)
    except Exception as e:
        record("Tiny Titan", "SKIP", f"`ollama show` unavailable ({e!r:.60}) — run it by hand")
        return

    def _lookup(container, *names):
        # Accept both mapping-style and attribute-style responses; the first
        # name that resolves wins.
        for name in names:
            if isinstance(container, dict) and name in container:
                return container[name]
            if hasattr(container, name):
                return getattr(container, name)
        return None

    details = _lookup(info, "details") or {}
    modelinfo = _lookup(info, "modelinfo", "model_info") or {}
    psize = _lookup(details, "parameter_size")  # e.g. "4.3B"

    # Prefer the exact numeric count from the model metadata (the last
    # matching key wins); fall back to parsing the human-readable size.
    billions = None
    if isinstance(modelinfo, dict):
        counts = [
            float(value) / 1e9
            for key, value in modelinfo.items()
            if str(key).endswith("parameter_count") and isinstance(value, (int, float))
        ]
        if counts:
            billions = counts[-1]
    if billions is None and isinstance(psize, str):
        try:
            billions = float(psize.strip().upper().rstrip("B"))
        except Exception:
            billions = None

    # Gemma 3n E-models report RAW params via ollama (E4B~8B) but are designed
    # as EFFECTIVE 4B/2B (MatFormer + per-layer embeddings). The badge counts
    # the effective size, so key off the model NAME, not the raw count.
    e_match = re.search(r"e(\d+)b", llm.MODEL.lower())

    if e_match:
        eff = float(e_match.group(1))
        if eff > 4.0:
            record("Tiny Titan", "WARN",
                   f"{llm.MODEL}: effective ~{eff:.0f}B > 4B — outside Tiny Titan either way")
            return
        # Verified 6/10: the guide's 32B cap counts TOTAL params ("not just
        # active") and no ruling exists for E-models on the <=4B award — so
        # effective-params eligibility is genuinely AMBIGUOUS. Ask, don't tag.
        raw = "raw n/a" if billions is None else f"{billions:.1f}B raw"
        record("Tiny Titan", "WARN",
               f"{llm.MODEL}: effective ~{eff:.0f}B but {raw} — $1.5k award counts params "
               f"ambiguously for E-models (32B cap counts TOTAL). ASK in the org "
               f"discussions before tagging tiny-titan")
        return

    if billions is None:
        record("Tiny Titan", "WARN", f"couldn't parse params (details={psize!r}); check `ollama show {llm.MODEL}` by hand")
        return

    if billions <= 4.0:
        record("Tiny Titan", "PASS", f"{billions:.2f}B ≤ 4B → ELIGIBLE; add the tag")
    else:
        record("Tiny Titan", "WARN", f"{billions:.2f}B > 4B — outside Tiny Titan; skip that badge")
| # --- G2-G4: the load-bearing live calls --------------------------------------- | |
def g2_g4_live_calls() -> None:
    """Run the live-model gates G2 (latency), G3 (contract), G4/G4b (reasoning) and G5 (reflection).

    Uses a temp ledger so demo state is never touched. Makes N recommendation
    calls for a precedent-rich job, one call for a novel job, and one
    reflection call, recording one result per gate via record(). Never raises
    on a bad model reply: unparsable or schema-invalid output is graded as
    WARN/FAIL instead.
    """
    lm = _temp_ledger()

    # Case A: precedent-rich (humid PETG stringing — seeds 007/008/012 match)
    job_a = Job(geometry_type="stringing", material="PETG", description="calibration tower, humid day")
    env_a = Environment(temp=25, humidity=65)
    retrieved = lm.retrieve("PETG", "stringing", 25, 65, k=3)
    sys_a = build_system_prompt(job_a, env_a, retrieved)

    # Case B: novel (TPU vase — no precedent in seeds)
    job_b = Job(geometry_type="vase", material="TPU", description="flexible vase")
    env_b = Environment(temp=22, humidity=45)
    sys_b = build_system_prompt(job_b, env_b, lm.retrieve("TPU", "vase", 22, 45, k=3))

    # Prompt-length budget (GEMMA-STEERING Technique 5): small-Gemma attention
    # quality degrades past ~800 tokens. Informational — trim references/k if hot.
    est = len(sys_a) // 4  # rough chars-per-token estimate
    flag = " ⚠ over the ~800-token small-Gemma budget — trim references / k" if est > 800 else ""
    print(f" prompt size: ~{est} tokens (precedent-rich case){flag}")

    times: list[float] = []
    schemas = 0
    advice_a = None  # last schema-valid advice, graded by G4 below
    N = 3
    for i in range(N):
        # perf_counter is monotonic, so a wall-clock adjustment mid-call
        # cannot skew the latency gate.
        t0 = time.perf_counter()
        raw = llm.chat_json(sys_a, "Give your recommendation for THIS job now.")
        dt = time.perf_counter() - t0
        times.append(dt)
        parsed = raw is not None
        print(f" live call {i+1}/{N}: {dt:5.1f}s {'(json ok)' if parsed else '(parse FAIL)'}")
        if parsed:
            try:
                advice_a = Advice(**raw)
                schemas += 1
            except Exception as e:
                print(f" schema reject: {e!s:.120}")

    # G2 latency — separate the one-time COLD model-load from WARM steady-state.
    # The cold call (first) only happens once; you pre-warm before recording, so
    # the demo experience is the warm number. Gate on warm, report cold as a tip.
    cold = times[0]
    warm = times[1:] if len(times) > 1 else times
    warm_avg = sum(warm) / len(warm)
    print(f" cold-start {cold:5.1f}s (one-time model load) · warm avg {warm_avg:.1f}s "
          f"over {len(warm)} — pre-warm with one throwaway call before recording")
    # Bands calibrated against real cockpit driving (Kyle, 6/10): warm ~18s on
    # e4b reads fine in a narrated demo, so <20s is a PASS, not a warning.
    if warm_avg < 20:
        record("G2 latency", "PASS",
               f"warm avg {warm_avg:.1f}s (cold {cold:.1f}s) — fine for a live narrated demo ({llm.MODEL}); pre-warm before recording")
    elif warm_avg < 35:
        record("G2 latency", "WARN",
               f"warm avg {warm_avg:.1f}s (cold {cold:.1f}s) — long pauses; tighten prompt, or gemma4:e2b / ZeroGPU", "G2")
    else:
        record("G2 latency", "FAIL",
               f"warm avg {warm_avg:.1f}s — too slow even warm; use gemma4:e2b or ZeroGPU", "G2")

    # G3 contract
    if schemas == N:
        record("G3 contract", "PASS", f"{schemas}/{N} valid JSON + Advice schema")
    elif schemas >= 1:
        record("G3 contract", "WARN", f"only {schemas}/{N} schema-valid (fallback will cover, but video needs live)", "G3")
    else:
        record("G3 contract", "FAIL", f"0/{N} valid — live path unusable as-is", "G3")

    # G4 reasoning quality — the load-bearing moment, heuristically graded
    if advice_a is not None:
        r = advice_a.reasoning.lower()
        checks = {
            "evaluates precedent (cites a job/precedent/prior)": any(w in r for w in ("precedent", "prior", "job", "seed-", "last time", "before")),
            "reasons about the room (humidity/temp/moisture/dry)": any(w in r for w in ("humid", "moisture", "temp", "°c", " rh", "dry", "wet")),
            "substantive (>120 chars)": len(advice_a.reasoning) > 120,
            "flags at least one risk region": len(advice_a.risks) >= 1,
        }
        failed = [k for k, ok in checks.items() if not ok]
        print(f" reasoning sample: \"{advice_a.reasoning[:180]}...\"")
        if not failed:
            record("G4 reasoning", "PASS", "precedent-evaluation text present and substantive")
        else:
            record("G4 reasoning", "WARN", f"weak on: {'; '.join(failed)} — prompt-tune before recording", "G4")
    else:
        record("G4 reasoning", "FAIL", "no schema-valid advice to grade", "G3")

    # G4b novel case — must NOT hallucinate precedent
    raw_b = llm.chat_json(sys_b, "Give your recommendation for THIS job now.")
    if raw_b is not None:
        try:
            adv_b = Advice(**raw_b)
            rb = adv_b.reasoning.lower()
            honest = any(w in rb for w in ("no close precedent", "no precedent", "no prior", "novel", "material properties", "first "))
            cites_fake = "seed-" in rb
            if honest and not cites_fake:
                record("G4b novel-case", "PASS", "says no-precedent / reasons from material properties")
            else:
                record("G4b novel-case", "WARN", f"novel-job reasoning suspect (honest={honest}, cites_fake={cites_fake}) — check by eye", "G4")
            print(f" novel sample: \"{adv_b.reasoning[:180]}...\"")
        except Exception:
            record("G4b novel-case", "WARN", "novel call returned but schema-invalid", "G3")
    else:
        record("G4b novel-case", "WARN", "novel call failed to parse", "G3")

    # G5 reflection
    raw_r = llm.chat_json(REFLECT_SYSTEM, build_reflect_prompt(
        job_a, env_a, "nozzle 230°C, bed 80°C, retraction 4.5mm, fan 40%, first-layer fan 0%", "success"))
    lesson = raw_r.get("lesson") if isinstance(raw_r, dict) else None
    # The model may return a non-string here (number, list, ...); treat that
    # as "no lesson" instead of crashing in len()/slicing below.
    if not isinstance(lesson, str):
        lesson = None
    if lesson and len(lesson) > 30:
        record("G5 reflection", "PASS", f"lesson distilled: \"{lesson[:100]}...\"")
    elif lesson:
        record("G5 reflection", "WARN", f"lesson thin: \"{lesson}\"", "G4")
    else:
        record("G5 reflection", "WARN", "reflect returned no lesson (deterministic fallback covers it)", "G3")
| # --- G6: spine (offline, always) ---------------------------------------------- | |
def g6_spine() -> None:
    """Gate G6: confirm the Spine validator clamps an unsafe PLA nozzle temperature.

    Feeds a 260°C PLA setting through SpineValidator.check and passes only if
    at least one veto is reported AND the returned nozzle temperature is below
    the requested 260°C. Runs offline.
    """
    validator = SpineValidator()
    unsafe = PrintSettings(
        nozzle_temp=260, bed_temp=60, retraction_mm=5, fan_pct=100, first_layer_fan_pct=0)
    checked = validator.check(unsafe, "PLA")

    clamped = bool(checked.vetoes) and checked.settings.nozzle_temp < 260
    if not clamped:
        record("G6 spine", "FAIL", "Spine did NOT clamp an unsafe setting — demo safety claim broken", "G6")
        return
    record("G6 spine", "PASS", f"unsafe PLA 260°C clamped to {checked.settings.nozzle_temp:.0f}°C ({len(checked.vetoes)} veto)")
| # --- G7: app serves (offline, always) ------------------------------------------- | |
def g7_app() -> None:
    """Gate G7: build the app, serve it on 127.0.0.1:7991 and expect HTTP 200.

    Any exception (import error, build error, port in use, HTTP error,
    timeout) is recorded as a FAIL rather than propagated. The server is
    always closed once it has been built, even when the launch or the HTTP
    probe fails, so a failed run does not leave port 7991 occupied.
    """
    try:
        import urllib.request
        import app as A

        d = A.build()
        try:
            d.launch(prevent_thread_lock=True, server_name="127.0.0.1", server_port=7991, quiet=True)
            # Context manager closes the HTTP response; urlopen raises for
            # non-2xx statuses, which the outer handler reports as FAIL.
            with urllib.request.urlopen("http://127.0.0.1:7991/", timeout=15) as resp:
                code = resp.status
        finally:
            d.close()

        if code == 200:
            record("G7 app", "PASS", "build() + launch + HTTP 200")
        else:
            record("G7 app", "FAIL", f"HTTP {code}", "G7")
    except Exception as e:
        record("G7 app", "FAIL", f"{e!r:.140}", "G7")
| # --- G8: assets + data (offline, always) --------------------------------------- | |
def g8_assets() -> None:
    """Gate G8: check that the four demo meshes and the 12 seed lessons exist.

    FAIL when any mesh under assets/ is missing; WARN when the meshes are
    present but the number of non-blank lines in data/seed_lessons.jsonl is
    not 12 (a missing seed file counts as 0); PASS otherwise. Runs offline.
    """
    mesh_names = ("overhang.glb", "bridge.glb", "vase.glb", "cube.glb")
    missing = [n for n in mesh_names if not (HERE / "assets" / n).exists()]

    seeds = HERE / "data" / "seed_lessons.jsonl"
    n_seeds = 0
    if seeds.exists():
        # Decode explicitly as UTF-8 (the JSON encoding) so the count does not
        # depend on the machine's locale encoding.
        lines = seeds.read_text(encoding="utf-8").splitlines()
        n_seeds = sum(1 for line in lines if line.strip())

    if not missing and n_seeds == 12:
        record("G8 assets", "PASS", "4 meshes present, 12 seed lessons")
    elif missing:
        record("G8 assets", "FAIL", f"missing meshes {missing} — run `make assets`", "G8")
    else:
        record("G8 assets", "WARN", f"seed count {n_seeds} != 12 — verify data/seed_lessons.jsonl", "G8")
def main() -> None:
    """Run every gate in order, print the verdict and set the exit code.

    Live gates (G2-G5) run only when g1_environment() returns True; otherwise
    they are recorded as SKIP. Offline gates (G6-G8) always run.

    Exit codes: 1 when any gate recorded FAIL (via sys.exit); 0 otherwise.
    """
    live_gates = ("G2 latency", "G3 contract", "G4 reasoning", "G4b novel-case", "G5 reflection")

    print(f"Chief Engineer preflight — model={llm.MODEL} ({time.strftime('%Y-%m-%d %H:%M')})")
    print("=" * 70)
    live = g1_environment()
    if live:
        g2_g4_live_calls()
    else:
        for g in live_gates:
            record(g, "SKIP", "no live backend (offline gates still checked below)")
    g6_spine()
    g7_app()
    g8_assets()
    print("=" * 70)

    fails = [g for g, s, _ in RESULTS if s == "FAIL"]
    warns = [g for g, s, _ in RESULTS if s == "WARN"]
    # Only SKIPs of the live gates mean "offline run". Informational checks
    # (e.g. the Tiny Titan report) can record SKIP on a fully live run and
    # must not trigger the offline-only verdict.
    skipped_live = [g for g, s, _ in RESULTS if s == "SKIP" and g in live_gates]

    if fails:
        print(f"🔴 NO-GO: {len(fails)} gate(s) failed: {', '.join(fails)}")
        print(f" Work {CONTINGENCY} top-to-bottom for each, then re-run.")
        sys.exit(1)
    # NOTE(review): every False return from g1_environment() follows a FAIL
    # record, so the NO-GO branch above appears to fire first whenever the
    # live gates are skipped — confirm whether this branch is meant to be
    # reachable.
    if skipped_live:
        print("🟡 OFFLINE-ONLY PASS — fallback demo is safe, but DO NOT record the video")
        print(" until the live gates run green. Start `ollama serve` and re-run.")
        sys.exit(0)
    if warns:
        print(f"🟡 GO with warnings ({', '.join(warns)}) — read them before recording.")
        sys.exit(0)
    print("🟢 GO — all gates green. Record the demo today, not tomorrow.")


if __name__ == "__main__":
    main()