FitCheck / engine /catalogue.py
cn0303's picture
Deploy FitCheck: engine + Nemotron model brick on ZeroGPU
12d2e34 verified
"""
Static catalogue: the frozen facts the advisor reasons over.
Everything here is build-time data — no network calls at runtime. That keeps
the tool fully offline-capable (the "Off the Grid" goal) and means the advice
can't silently drift when some external API changes.
Sources for the numbers (so anyone can check our work):
- bits-per-weight for GGUF quant families: llama.cpp / Hugging Face GGUF docs
- "~2 GB per 1B params at fp16": Hugging Face Transformers optimisation guide
- 8-bit ≈ 50% of fp16, 4-bit ≈ 25-30%: bitsandbytes docs
- architecture sizes (layers / hidden): typical published configs per size class
"""
from dataclasses import dataclass, field
# --------------------------------------------------------------------------
# Quantisation tiers
# --------------------------------------------------------------------------
# "Quantisation" = squashing the model's numbers into fewer bits so it takes
# less memory. Fewer bits = smaller + faster, but slightly less sharp.
# gb_per_billion is just bits_per_weight / 8 (bits -> bytes -> GB per 1B params).
@dataclass(frozen=True)
class QuantTier:
    """One quantisation level a model's weights can be stored at.

    Instances are immutable (frozen dataclass). The field order below is the
    positional constructor order, so it must not be rearranged.
    """

    key: str                   # lookup identifier for the tier, e.g. "Q4_K_M"
    plain_name: str            # what a normal person sees
    bits_per_weight: float     # bits used per model weight at this tier
    blurb: str                 # one honest sentence about the trade-off
    recommended: bool = False  # True for the tier suggested by default

    @property
    def gb_per_billion(self) -> float:
        """Return the gigabytes needed per one billion parameters.

        Bits become bytes by dividing by 8, and one byte per parameter over
        a billion parameters is one gigabyte.
        """
        return self.bits_per_weight / 8.0
# Listed from the highest bits-per-weight tier down to the lowest.
QUANT_TIERS: list[QuantTier] = [
    QuantTier(
        key="fp16",
        plain_name="Full quality (fp16)",
        bits_per_weight=16.0,
        blurb="The original, uncompressed model. Biggest and slowest to load.",
    ),
    QuantTier(
        key="Q8_0",
        plain_name="Near-full (8-bit)",
        bits_per_weight=8.5,
        blurb="Practically indistinguishable from full quality, about half the size.",
    ),
    QuantTier(
        key="Q6_K",
        plain_name="High (6-bit)",
        bits_per_weight=6.56,
        blurb="Very close to full quality, a bit smaller again.",
    ),
    QuantTier(
        key="Q5_K_M",
        plain_name="Balanced+ (5-bit)",
        bits_per_weight=5.67,
        blurb="A touch sharper than 4-bit for a little more memory.",
    ),
    QuantTier(
        key="Q4_K_M",
        plain_name="Balanced (4-bit)",
        bits_per_weight=4.83,
        blurb="The sweet spot most people use: small, fast, and still very good.",
        recommended=True,
    ),
    QuantTier(
        key="Q3_K_M",
        plain_name="Compact (3-bit)",
        bits_per_weight=3.91,
        blurb="Smaller still, with a slight, usually-acceptable quality dip.",
    ),
    QuantTier(
        key="Q2_K",
        plain_name="Tiny (2-bit)",
        bits_per_weight=3.35,
        blurb="Last resort to make something fit — noticeably less reliable.",
    ),
]

# Index of the tiers by their `key`.
QUANT_BY_KEY = {tier.key: tier for tier in QUANT_TIERS}

# First tier flagged `recommended`; raises StopIteration if no tier is flagged.
RECOMMENDED_QUANT = next(tier for tier in QUANT_TIERS if tier.recommended)
# --------------------------------------------------------------------------
# Model size classes
# --------------------------------------------------------------------------
# We reason in *size classes* rather than individual models, because the
# memory maths is driven by parameter count + architecture shape. Each class
# carries an approximate architecture so we can estimate the KV cache (chat
# memory) honestly. Layers/hidden are conservative typicals, not exact.
@dataclass(frozen=True)
class ModelClass:
    """A size class of models, with a representative architecture shape.

    Instances are immutable (frozen dataclass). The field order below is the
    positional constructor order, so it must not be rearranged.
    """

    key: str            # lookup identifier for the class, e.g. "medium"
    billions: float     # parameter count in billions (representative)
    plain_name: str     # label shown for the size class
    good_for: str       # plain-English "what it's actually good at"
    n_layers: int       # approximate layer count for this size class
    hidden: int         # approximate hidden size for this size class

    # Example concrete models for the copy-paste commands (real, well-known).
    example_label: str  # display name of the example model
    ollama_tag: str     # what you'd type after `ollama run`
    gguf_repo: str      # a real Hugging Face GGUF repo for llama.cpp
# Listed from the smallest parameter count to the largest.
MODEL_CLASSES: list[ModelClass] = [
    ModelClass(
        key="tiny",
        billions=1.0,
        plain_name="Tiny (around 1 billion)",
        good_for="Quick simple chat, basic questions, tidying text. Runs on almost anything.",
        n_layers=24,
        hidden=2048,
        example_label="Llama 3.2 1B",
        ollama_tag="llama3.2:1b",
        gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
    ),
    ModelClass(
        key="small",
        billions=3.5,
        plain_name="Small (3-4 billion)",
        good_for="Surprisingly capable everyday chat, summarising, and light coding help.",
        n_layers=28,
        hidden=3072,
        example_label="Llama 3.2 3B",
        ollama_tag="llama3.2:3b",
        gguf_repo="bartowski/Llama-3.2-3B-Instruct-GGUF",
    ),
    ModelClass(
        key="medium",
        billions=8.0,
        plain_name="Medium (7-9 billion)",
        good_for="A solid all-rounder: good chat, real coding help, decent reasoning.",
        n_layers=32,
        hidden=4096,
        example_label="Qwen2.5 7B",
        ollama_tag="qwen2.5:7b",
        gguf_repo="bartowski/Qwen2.5-7B-Instruct-GGUF",
    ),
    ModelClass(
        key="large",
        billions=14.0,
        plain_name="Large (13-14 billion)",
        good_for="Noticeably smarter and more reliable. Wants a real graphics card.",
        n_layers=40,
        hidden=5120,
        example_label="Qwen2.5 14B",
        ollama_tag="qwen2.5:14b",
        gguf_repo="bartowski/Qwen2.5-14B-Instruct-GGUF",
    ),
    ModelClass(
        key="xlarge",
        billions=32.0,
        plain_name="Very large (30-34 billion)",
        good_for="Near-premium quality. Needs a strong GPU or a lot of memory.",
        n_layers=48,
        hidden=6656,
        example_label="Qwen2.5 32B",
        ollama_tag="qwen2.5:32b",
        gguf_repo="bartowski/Qwen2.5-32B-Instruct-GGUF",
    ),
    ModelClass(
        key="huge",
        billions=70.0,
        plain_name="Huge (70 billion)",
        good_for="Top-tier open quality. Serious hardware only.",
        n_layers=80,
        hidden=8192,
        example_label="Llama 3.3 70B",
        ollama_tag="llama3.3:70b",
        gguf_repo="bartowski/Llama-3.3-70B-Instruct-GGUF",
    ),
]

# Index of the size classes by their `key`.
MODEL_BY_KEY = {model.key: model for model in MODEL_CLASSES}
# --------------------------------------------------------------------------
# Use cases (jobs people actually want done)
# --------------------------------------------------------------------------
# Each maps to a *minimum* sensible size and a *comfortable* size. We never
# pretend a job works on a model that's too small for it.
@dataclass(frozen=True)
class UseCase:
    """A job a user wants done, with the model sizes that suit it.

    Instances are immutable (frozen dataclass). The field order below is the
    positional constructor order, so it must not be rearranged.
    """

    key: str          # lookup identifier for the job, e.g. "coding"
    plain_name: str   # label shown for the job
    description: str  # one-line explanation of what the job covers
    min_class: str    # smallest model that does an OK job
    good_class: str   # where it starts feeling genuinely useful

    # Extra memory headroom multiplier for this job (RAG/agents need more
    # context; fine-tuning needs much more). 1.0 = normal inference.
    overhead_factor: float = 1.0
    note: str = ""    # optional extra remark about the job; empty when there is none
# Entries that omit overhead_factor / note fall back to the UseCase defaults.
USE_CASES: list[UseCase] = [
    UseCase(
        key="chat",
        plain_name="Just chatting / asking questions",
        description="General conversation, explanations, everyday questions.",
        min_class="tiny",
        good_class="small",
    ),
    UseCase(
        key="writing",
        plain_name="Writing & summarising",
        description="Drafting emails, rewriting, condensing long text.",
        min_class="small",
        good_class="medium",
    ),
    UseCase(
        key="coding",
        plain_name="Coding help",
        description="Explaining code, writing functions, fixing bugs.",
        min_class="small",
        good_class="medium",
        note="Bigger models are much more reliable for code.",
    ),
    UseCase(
        key="agents",
        plain_name="Tool use / agents",
        description="Letting the model call tools, search, or take steps for you.",
        min_class="medium",
        good_class="medium",
        overhead_factor=1.15,
        note="Needs steady instruction-following — go medium or larger.",
    ),
    UseCase(
        key="rag",
        plain_name="Document Q&A (your own files)",
        description="Answering questions over your PDFs/notes (a.k.a. RAG).",
        min_class="small",
        good_class="medium",
        overhead_factor=1.25,
        note="Long documents use extra memory for context.",
    ),
    UseCase(
        key="finetune",
        plain_name="Teaching it your own data (fine-tuning)",
        description="Training a small adapter (LoRA/QLoRA) on your examples.",
        min_class="small",
        good_class="medium",
        overhead_factor=2.2,
        note="Training needs roughly 2-3x the memory of just chatting.",
    ),
]

# Index of the use cases by their `key`.
USE_CASE_BY_KEY = {case.key: case for case in USE_CASES}