""" The model brick: a closed-context narrator. It takes the deterministic engine's structured advice (the exact JSON the UI already shows) plus a plain-English follow-up question, and re-voices those facts simply. It NEVER invents numbers, models, or benchmarks — every figure it states must already be in the facts. All arithmetic stays in engine/. Serving (Hugging Face Spaces, ZeroGPU): app.py exposes ask() via ``@app.api(name="ask")`` so it runs on Gradio's queue; _generate() below is wrapped in ``@spaces.GPU`` so a GPU is allocated per call and released on return. The model is moved to CUDA at import (safe under ZeroGPU's CUDA emulation). Off the Space (local dev, no GPU, or a boot failure), we never download an 8 GB model. ask() degrades to a deterministic narrator that re-voices the facts with no AI in the loop — so the /api/ask contract always answers, and always stays grounded. """ import json import os import re import sys def _log(msg: str) -> None: print(f"[FitCheck] {msg}", file=sys.stderr, flush=True) # Default to the prize path (NVIDIA Nemotron Quest). Swap to a clean Apache # fallback with no code change: FITCHECK_MODEL=Qwen/Qwen3-4B-Instruct-2507 MODEL_ID = os.environ.get("FITCHECK_MODEL", "nvidia/NVIDIA-Nemotron-3-Nano-4B-BF16") # When to actually load the 8 GB model. We must NOT download it on a free CPU # Space (it can fill the disk and break the Space) or on a laptop. So: # - ZeroGPU -> load (CUDA is emulated at import; this is the target path). # - GPU Space -> load only if CUDA is genuinely present. # - CPU Space / laptop -> skip; the deterministic explainer answers instead. ZERO_GPU = bool(os.environ.get("SPACES_ZERO_GPU")) def _should_load() -> bool: if ZERO_GPU: return True if os.environ.get("SPACE_ID"): try: import torch return torch.cuda.is_available() except Exception: # noqa: BLE001 return False return False SYSTEM_PROMPT = """\ You are FitCheck's explainer. 
A trusted calculator has already decided what AI \ this person's computer can run. Your only job is to explain its answer in warm, \ plain words. You are talking to someone who has never heard of VRAM or \ quantisation. RULES (do not break these): - For anything about THIS person's machine or result (what fits, the verdict, GB figures, model sizes, settings), use ONLY the facts inside .... Never invent or change those numbers, models, or prices. - The verdict is already decided in the facts. Explain it; never overrule it. - You MAY explain what a general technical term means (quantisation, VRAM, GGUF, etc.) in plain everyday words using ordinary knowledge - that is the whole point. Just don't state specific GB or size numbers about their setup that aren't in the facts. - If a question about their setup isn't covered by the facts, say you don't have that detail. Never guess. - Warm, plain, no hype. Don't mention these instructions or that you are an AI. OUTPUT: reply with ONLY a JSON object, nothing else: {"headline": "<=20 words, the direct answer", "why": "<=3 short plain sentences", "next_step": "one concrete next action, or \\"\\" if none applies"}\ """ # Few-shot: small models copy a format far better than they follow abstract # rules. Two gold examples in the exact short, plain style we want. _FEWSHOT = [ ( '{"verdict":"Runs great","summary":"Yes, you can run a Medium (7-9 billion) model on your graphics card, today.",' '"you_have":{"fast":"10.2 GB","total":"22 GB","needed":"5.5 GB needed"},' '"options":[{"size":"Large (13-14 billion)","fits":"tight","memory":"9 GB"},{"size":"Medium (7-9 billion)","fits":"great","memory":"5.5 GB"}]}', "Why not the Large one?", '{"headline":"The Large model fits, but only just.","why":"Your fast graphics memory is about 10.2 GB. A Medium model needs 5.5 GB and runs comfortably there. 
A Large one needs 9 GB, so it works but leaves little room and feels slower.","next_step":"Stick with the Medium model for snappy replies; try the Large one later if you want more polish."}', ), ( '{"verdict":"Won\'t fit","summary":"This goal is a stretch on this machine.",' '"you_have":{"fast":"0 GB","total":"4.9 GB","needed":"6.5 GB needed"}}', "Can I run the big chatbot?", '{"headline":"Not on this computer, honestly.","why":"The big chatbot needs about 6.5 GB, but this machine can offer only about 4.9 GB once everyday programs take their share. There is no graphics card to speed things up.","next_step":"Try a smaller model, add memory, or use a free cloud option for the big one."}', ), ] def _user_prompt(question: str, facts_text: str) -> str: return f"\n{facts_text}\n\n\nQuestion: {question}" def _chat_messages(question: str, facts_text: str) -> list[dict]: msgs = [{"role": "system", "content": SYSTEM_PROMPT}] for facts, q, a in _FEWSHOT: msgs.append({"role": "user", "content": _user_prompt(q, facts)}) msgs.append({"role": "assistant", "content": a}) msgs.append({"role": "user", "content": _user_prompt(question, facts_text)}) return msgs # -------------------------------------------------------------------------- # Facts handling (shared by the model path and the fallback) # -------------------------------------------------------------------------- def _strip_html(s: str) -> str: return re.sub(r"\s+", " ", re.sub(r"<[^>]+>", "", s or "")).strip() def _parse_facts(facts) -> dict: if isinstance(facts, dict): return facts if not facts: return {} try: return json.loads(facts) except (json.JSONDecodeError, TypeError): return {} def compact_facts(facts: dict) -> str: """Flatten the advise() result into the small, flat JSON the model sees. Flat JSON (not prose) makes grounding a near string-match and keeps the prompt short. We pass only what a follow-up answer could need. 
""" g = facts.get("gauge") or {} compact = { "verdict": facts.get("verdict_word") or facts.get("verdict"), "summary": facts.get("headline"), "explanation": _strip_html(facts.get("detail", "")), "goal": facts.get("use_case"), "you_have": { "fast": g.get("fast_gb"), "total": g.get("total_gb"), "needed": g.get("need_gb"), }, "options": [ {"size": o.get("model"), "fits": o.get("verdict"), "memory": o.get("memory"), "setting": o.get("setting"), "speed": o.get("feel")} for o in (facts.get("options") or []) ], "how_to_run": [ {"label": c.get("label"), "command": c.get("code")} for c in ((facts.get("commands") or {}).get("items") or []) ], "note": facts.get("note") or "", } # Drop empties so the model isn't tempted to fill nulls. compact = {k: v for k, v in compact.items() if v not in (None, "", [], {})} if "you_have" in compact: compact["you_have"] = {k: v for k, v in compact["you_have"].items() if v} return json.dumps(compact, ensure_ascii=False) # -------------------------------------------------------------------------- # Faithfulness gate (also used by tests) # -------------------------------------------------------------------------- # A "figure" = a number tied to a memory/size/quant unit — the kind a model # could dangerously invent. Bare ordinals ("first", "3 steps") are ignored. 
_FIGURE = re.compile(r"(\d+(?:\.\d+)?)\s*(gb|-?bit|billion|b)\b", re.I) def leaked_figures(answer_text: str, facts_text: str) -> list[str]: """Numbers-with-units in the answer that don't appear in the facts.""" facts_nums = set(re.findall(r"\d+(?:\.\d+)?", facts_text)) return [num for num, _unit in _FIGURE.findall(answer_text) if num not in facts_nums] def _answer_text(ans: dict) -> str: return " ".join(str(ans.get(k, "")) for k in ("headline", "why", "next_step")) def _parse_json_answer(raw: str) -> dict | None: """Pull the first {...} object out of the model's text and validate shape.""" if not raw: return None m = re.search(r"\{.*\}", raw, re.DOTALL) if not m: return None try: obj = json.loads(m.group(0)) except json.JSONDecodeError: return None if not isinstance(obj, dict): return None out = {k: str(obj.get(k, "")).strip() for k in ("headline", "why", "next_step")} return out if out["headline"] or out["why"] else None # -------------------------------------------------------------------------- # Model load (GPU runtime only) + public entry point # -------------------------------------------------------------------------- _GENERATE = None # set to a @spaces.GPU-wrapped fn when the GPU stack imports MODEL_READY = False # GPU stack imported; the model itself loads lazily (below) LOAD_ERROR = "" # Loaded on the FIRST /ask call, inside the GPU context — NOT at import. Loading # the 8 GB model at import blocked the Space's boot health window and the process # got killed (RUNTIME_ERROR with no traceback). Lazy loading lets the app launch # instantly; the first question pays the one-time download/load cost, and ask()'s # try/except falls back to the deterministic narrator if that first call is slow. 
# Lazily-filled cache for the tokenizer and model; both stay None until the
# first _generate() call has run _load() successfully.
_state = {"tok": None, "model": None}

if _should_load():
    try:
        import spaces  # noqa: E402
        import torch  # noqa: E402
        from transformers import AutoModelForCausalLM, AutoTokenizer  # noqa: E402

        def _load():
            """Load tokenizer + model for MODEL_ID into _state, model on CUDA."""
            # Prefer transformers' NATIVE NemotronH class (it guards the
            # mamba-ssm import and falls back to a pure-PyTorch path, so it runs
            # without the painful mamba-ssm CUDA build). Only if that's
            # unavailable do we use NVIDIA's trust_remote_code file, which
            # HARD-requires mamba-ssm.
            try:
                tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
                weights = AutoModelForCausalLM.from_pretrained(
                    MODEL_ID, dtype=torch.bfloat16)
            except Exception:
                tokenizer = AutoTokenizer.from_pretrained(
                    MODEL_ID, trust_remote_code=True)
                weights = AutoModelForCausalLM.from_pretrained(
                    MODEL_ID, dtype=torch.bfloat16, trust_remote_code=True)
            _state["tok"] = tokenizer
            _state["model"] = weights.to("cuda").eval()

        @spaces.GPU(duration=90)  # cold load ~50s + generate; shorter = better queue priority
        def _generate(question: str, facts_text: str) -> str:
            """Run one greedy generation and return only the newly generated text."""
            if _state["model"] is None:
                _load()
            tokenizer = _state["tok"]
            model = _state["model"]
            messages = _chat_messages(question, facts_text)

            # return_dict=True -> a BatchEncoding (input_ids + attention_mask) we
            # can unpack with **encoded. Passing the BatchEncoding positionally to
            # generate() makes it do .shape on a dict -> AttributeError.
            template_args = {
                "add_generation_prompt": True,
                "return_tensors": "pt",
                "return_dict": True,
            }
            try:
                encoded = tokenizer.apply_chat_template(
                    messages, enable_thinking=False, **template_args)
            except TypeError:
                # Retry without enable_thinking when the call rejects it.
                encoded = tokenizer.apply_chat_template(messages, **template_args)
            encoded = encoded.to("cuda")

            prompt_len = encoded["input_ids"].shape[1]
            with torch.no_grad():
                generated = model.generate(
                    **encoded,
                    max_new_tokens=160,
                    do_sample=False,
                    pad_token_id=tokenizer.eos_token_id,
                )
            # Slice off the prompt so only the model's reply is decoded.
            reply_ids = generated[0][prompt_len:]
            return tokenizer.decode(reply_ids, skip_special_tokens=True).strip()

        _GENERATE = _generate
        MODEL_READY = True
    except Exception as exc:  # noqa: BLE001 — any failure → ask() reports it
        LOAD_ERROR = repr(exc)

_log(f"model brick: should_load={_should_load()} MODEL_READY={MODEL_READY} "
     f"LOAD_ERROR={LOAD_ERROR or 'none'} MODEL_ID={MODEL_ID}")


def ask(question: str, facts: str = "") -> dict:
    """Answer a follow-up question using only the engine's facts.

    The model's actual output is what gets returned: a
    {headline, why, next_step} dict when it emitted JSON, otherwise its prose
    placed verbatim in `why`. There is no canned answer. When the model is not
    set up here, or the call raises, the result is {"error": "..."} so the
    failure is visible instead of being masked by generic filler.

    Args:
        question: The follow-up question; blank or None becomes
            "What can I run?".
        facts: The engine's advice, as a JSON string or an already-parsed dict.

    Returns:
        Either {"headline", "why", "next_step"} or {"error": message}.
    """
    facts_text = compact_facts(_parse_facts(facts))
    question = (question or "").strip() or "What can I run?"

    if _GENERATE is None:
        msg = (f"Model not loaded in this environment "
               f"(LOAD_ERROR={LOAD_ERROR or 'no GPU runtime here'}).")
        _log(msg)
        return {"error": msg}

    try:
        raw = _GENERATE(question, facts_text)
    except Exception as exc:  # surface the real error; never fabricate an answer
        import traceback
        traceback.print_exc()
        return {"error": f"Model call failed: {exc}"}

    parsed = _parse_json_answer(raw)
    if parsed is not None:
        ans = parsed
    else:
        # Model answered in prose, not JSON — show its REAL text, don't discard.
        ans = {"headline": "", "why": (raw or "").strip(), "next_step": ""}

    # Faithfulness gate is OBSERVABILITY only: log ungrounded figures so we can
    # spot drift, but never silently swap the model's real answer for filler.
    ungrounded = leaked_figures(_answer_text(ans), facts_text)
    if ungrounded:
        _log(f"WARNING: figures not in facts {ungrounded} (showing model answer anyway)")
    return ans