Spaces:

build-small-hackathon
/

FitCheck

Running on Zero

App Files Files Community

FitCheck / engine /catalogue.py

cn0303

Deploy FitCheck: engine + Nemotron model brick on ZeroGPU

12d2e34 verified 1 day ago

raw

history blame contribute delete

7.17 kB

	"""
	Static catalogue: the frozen facts the advisor reasons over.

	Everything here is build-time data — no network calls at runtime. That keeps
	the tool fully offline-capable (the "Off the Grid" goal) and means the advice
	can't silently drift when some external API changes.

	Sources for the numbers (so anyone can check our work):
	- bits-per-weight for GGUF quant families: llama.cpp / Hugging Face GGUF docs
	- "~2 GB per 1B params at fp16": Hugging Face Transformers optimisation guide
	- 8-bit ≈ 50% of fp16, 4-bit ≈ 25-30%: bitsandbytes docs
	- architecture sizes (layers / hidden): typical published configs per size class
	"""

	from dataclasses import dataclass, field


	# --------------------------------------------------------------------------
	# Quantisation tiers
	# --------------------------------------------------------------------------
	# "Quantisation" = squashing the model's numbers into fewer bits so it takes
	# less memory. Fewer bits = smaller + faster, but slightly less sharp.
	# gb_per_billion is just bits_per_weight / 8 (bits -> bytes -> GB per 1B params).

	@dataclass(frozen=True)
	class QuantTier:
	    """One quantisation level the advisor can offer.

	    Instances are immutable (frozen dataclass). Field order is part of the
	    interface: the catalogue builds tiers with positional arguments.
	    """

	    key: str                   # identifier used for lookups, e.g. "Q4_K_M"
	    plain_name: str            # what a normal person sees
	    bits_per_weight: float     # bits used per model weight at this tier
	    blurb: str                 # one honest sentence about the trade-off
	    recommended: bool = False  # True for the tier the catalogue recommends

	    @property
	    def gb_per_billion(self) -> float:
	        """Return the weight size in GB per one billion parameters.

	        Dividing by 8 turns bits into bytes; one billion one-byte weights
	        is one GB, so the result is simply ``bits_per_weight / 8``.
	        """
	        return self.bits_per_weight / 8.0


	# Ordered from largest/highest-quality to smallest. Each entry spells out
	# its fields by name so the table can be read without the class definition.
	QUANT_TIERS: list[QuantTier] = [
	    QuantTier(
	        key="fp16",
	        plain_name="Full quality (fp16)",
	        bits_per_weight=16.0,
	        blurb="The original, uncompressed model. Biggest and slowest to load.",
	    ),
	    QuantTier(
	        key="Q8_0",
	        plain_name="Near-full (8-bit)",
	        bits_per_weight=8.5,
	        blurb="Practically indistinguishable from full quality, about half the size.",
	    ),
	    QuantTier(
	        key="Q6_K",
	        plain_name="High (6-bit)",
	        bits_per_weight=6.56,
	        blurb="Very close to full quality, a bit smaller again.",
	    ),
	    QuantTier(
	        key="Q5_K_M",
	        plain_name="Balanced+ (5-bit)",
	        bits_per_weight=5.67,
	        blurb="A touch sharper than 4-bit for a little more memory.",
	    ),
	    QuantTier(
	        key="Q4_K_M",
	        plain_name="Balanced (4-bit)",
	        bits_per_weight=4.83,
	        blurb="The sweet spot most people use: small, fast, and still very good.",
	        recommended=True,
	    ),
	    QuantTier(
	        key="Q3_K_M",
	        plain_name="Compact (3-bit)",
	        bits_per_weight=3.91,
	        blurb="Smaller still, with a slight, usually-acceptable quality dip.",
	    ),
	    QuantTier(
	        key="Q2_K",
	        plain_name="Tiny (2-bit)",
	        bits_per_weight=3.35,
	        blurb="Last resort to make something fit — noticeably less reliable.",
	    ),
	]

	# Lookup table: tier key -> tier.
	QUANT_BY_KEY = {tier.key: tier for tier in QUANT_TIERS}
	# First tier flagged as recommended (StopIteration if none is flagged).
	RECOMMENDED_QUANT = next(tier for tier in QUANT_TIERS if tier.recommended)


	# --------------------------------------------------------------------------
	# Model size classes
	# --------------------------------------------------------------------------
	# We reason in size classes rather than individual models, because the
	# memory maths is driven by parameter count + architecture shape. Each class
	# carries an approximate architecture so we can estimate the KV cache (chat
	# memory) honestly. n_layers/hidden are conservative, typical values for the
	# size class, not the exact config of any one model.

	@dataclass(frozen=True)
	class ModelClass:
	    """A size class of models, with a representative architecture shape.

	    Instances are immutable (frozen dataclass). Field order is part of the
	    interface: the catalogue builds entries with positional arguments.
	    """

	    key: str         # identifier used for lookups, e.g. "medium"
	    billions: float  # parameter count in billions (representative)
	    plain_name: str  # human-readable label for the size class
	    good_for: str    # plain-English "what it's actually good at"
	    n_layers: int    # approximate layer count for this size class
	    hidden: int      # approximate hidden size for this size class
	    # Example concrete models for the copy-paste commands (real, well-known).
	    example_label: str  # display name of the example model
	    ollama_tag: str     # what you'd type after `ollama run`
	    gguf_repo: str      # a real Hugging Face GGUF repo for llama.cpp


	# Ordered from smallest to largest. Fields are named explicitly so each
	# number can be read without counting positional arguments.
	MODEL_CLASSES: list[ModelClass] = [
	    ModelClass(
	        key="tiny",
	        billions=1.0,
	        plain_name="Tiny (around 1 billion)",
	        good_for="Quick simple chat, basic questions, tidying text. Runs on almost anything.",
	        n_layers=24,
	        hidden=2048,
	        example_label="Llama 3.2 1B",
	        ollama_tag="llama3.2:1b",
	        gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
	    ),
	    ModelClass(
	        key="small",
	        billions=3.5,
	        plain_name="Small (3-4 billion)",
	        good_for="Surprisingly capable everyday chat, summarising, and light coding help.",
	        n_layers=28,
	        hidden=3072,
	        example_label="Llama 3.2 3B",
	        ollama_tag="llama3.2:3b",
	        gguf_repo="bartowski/Llama-3.2-3B-Instruct-GGUF",
	    ),
	    ModelClass(
	        key="medium",
	        billions=8.0,
	        plain_name="Medium (7-9 billion)",
	        good_for="A solid all-rounder: good chat, real coding help, decent reasoning.",
	        n_layers=32,
	        hidden=4096,
	        example_label="Qwen2.5 7B",
	        ollama_tag="qwen2.5:7b",
	        gguf_repo="bartowski/Qwen2.5-7B-Instruct-GGUF",
	    ),
	    ModelClass(
	        key="large",
	        billions=14.0,
	        plain_name="Large (13-14 billion)",
	        good_for="Noticeably smarter and more reliable. Wants a real graphics card.",
	        n_layers=40,
	        hidden=5120,
	        example_label="Qwen2.5 14B",
	        ollama_tag="qwen2.5:14b",
	        gguf_repo="bartowski/Qwen2.5-14B-Instruct-GGUF",
	    ),
	    ModelClass(
	        key="xlarge",
	        billions=32.0,
	        plain_name="Very large (30-34 billion)",
	        good_for="Near-premium quality. Needs a strong GPU or a lot of memory.",
	        n_layers=48,
	        hidden=6656,
	        example_label="Qwen2.5 32B",
	        ollama_tag="qwen2.5:32b",
	        gguf_repo="bartowski/Qwen2.5-32B-Instruct-GGUF",
	    ),
	    ModelClass(
	        key="huge",
	        billions=70.0,
	        plain_name="Huge (70 billion)",
	        good_for="Top-tier open quality. Serious hardware only.",
	        n_layers=80,
	        hidden=8192,
	        example_label="Llama 3.3 70B",
	        ollama_tag="llama3.3:70b",
	        gguf_repo="bartowski/Llama-3.3-70B-Instruct-GGUF",
	    ),
	]

	# Lookup table: size-class key -> size class.
	MODEL_BY_KEY = {model.key: model for model in MODEL_CLASSES}


	# --------------------------------------------------------------------------
	# Use cases (jobs people actually want done)
	# --------------------------------------------------------------------------
	# Each maps to a minimum sensible size and a comfortable size. We never
	# pretend a job works on a model that's too small for it.

	@dataclass(frozen=True)
	class UseCase:
	    """A job a user wants done, with the model sizes that suit it.

	    Instances are immutable (frozen dataclass). ``min_class`` and
	    ``good_class`` hold size-class keys as plain strings; nothing in this
	    class checks that they name an existing size class.
	    """

	    key: str          # identifier used for lookups, e.g. "coding"
	    plain_name: str   # human-readable label for the job
	    description: str  # one sentence on what the job involves
	    min_class: str    # smallest model that does an OK job
	    good_class: str   # where it starts feeling genuinely useful
	    # Extra memory headroom multiplier for this job (RAG/agents need more
	    # context; fine-tuning needs much more). 1.0 = normal inference.
	    overhead_factor: float = 1.0
	    note: str = ""    # optional extra caveat for this job; empty when none


	# Every job the advisor knows about. overhead_factor and note are given
	# only where they differ from the defaults (1.0 and "").
	USE_CASES: list[UseCase] = [
	    UseCase(
	        key="chat",
	        plain_name="Just chatting / asking questions",
	        description="General conversation, explanations, everyday questions.",
	        min_class="tiny",
	        good_class="small",
	    ),
	    UseCase(
	        key="writing",
	        plain_name="Writing & summarising",
	        description="Drafting emails, rewriting, condensing long text.",
	        min_class="small",
	        good_class="medium",
	    ),
	    UseCase(
	        key="coding",
	        plain_name="Coding help",
	        description="Explaining code, writing functions, fixing bugs.",
	        min_class="small",
	        good_class="medium",
	        note="Bigger models are much more reliable for code.",
	    ),
	    UseCase(
	        key="agents",
	        plain_name="Tool use / agents",
	        description="Letting the model call tools, search, or take steps for you.",
	        min_class="medium",
	        good_class="medium",
	        overhead_factor=1.15,
	        note="Needs steady instruction-following — go medium or larger.",
	    ),
	    UseCase(
	        key="rag",
	        plain_name="Document Q&A (your own files)",
	        description="Answering questions over your PDFs/notes (a.k.a. RAG).",
	        min_class="small",
	        good_class="medium",
	        overhead_factor=1.25,
	        note="Long documents use extra memory for context.",
	    ),
	    UseCase(
	        key="finetune",
	        plain_name="Teaching it your own data (fine-tuning)",
	        description="Training a small adapter (LoRA/QLoRA) on your examples.",
	        min_class="small",
	        good_class="medium",
	        overhead_factor=2.2,
	        note="Training needs roughly 2-3x the memory of just chatting.",
	    ),
	]

	# Lookup table: use-case key -> use case.
	USE_CASE_BY_KEY = {use_case.key: use_case for use_case in USE_CASES}