Spaces:

build-small-hackathon
/

FitCheck

Running on Zero

File size: 8,090 Bytes

12d2e34

"""
The advisor: turn a machine + a goal into an honest verdict.

Output is organised into three plain bands, because that is what makes the
tool trustworthy instead of hypey:

  - WORKS NOW          : runs well, on the fast path, today.
  - WORKS WITH COMPROMISES : it'll run, but slower or smaller than ideal.
  - DON'T BOTHER       : not realistic on this machine — said plainly.

No fake promises. If something doesn't fit, we say so and explain why.
"""

from dataclasses import dataclass, field

from .catalogue import (
    MODEL_CLASSES,
    QUANT_TIERS,
    RECOMMENDED_QUANT,
    QUANT_BY_KEY,
    MODEL_BY_KEY,
    ModelClass,
    QuantTier,
    UseCase,
    USE_CASE_BY_KEY,
)
from .estimator import MemoryEstimate, estimate_memory
from .hardware import HardwareSpec
from .runtimes import Runtime, pick_runtimes


# Assumed context length for each kind of job, measured in tokens
# (roughly 750 words of text for every 1000 tokens).
_CONTEXT_FOR_USE_CASE = dict(
    chat=4096,
    writing=4096,
    coding=4096,
    agents=4096,
    rag=8192,
    finetune=2048,
)

# A memory budget is never filled beyond this fraction; the remainder is
# deliberately kept free as headroom.
_SAFETY_FILL = 0.90

# The three verdict bands a model can land in.
VERDICT_WORKS = "works_now"
VERDICT_COMPROMISE = "compromises"
VERDICT_NO = "dont_bother"


@dataclass
class ModelVerdict:
    """The outcome of checking one model class against one machine and goal.

    Field order is part of the contract: instances are constructed
    positionally, so new fields must only ever be appended.
    """
    model: ModelClass            # the model size class that was evaluated
    verdict: str                 # one of the VERDICT_* constants
    quant: QuantTier             # the quant we'd actually recommend
    estimate: MemoryEstimate     # memory estimate for `model` at `quant`
    full_quality_on_fast: bool   # True if it runs on the GPU at fp16/near-full
    notes: list[str] = field(default_factory=list)  # plain-language remarks explaining the verdict


@dataclass
class Advice:
    """The full recommendation for one machine and one goal.

    Holds the per-model verdict table, the single headline pick (if any),
    the suggested runtimes, and convenience views of the table split by
    verdict band.
    """

    spec: HardwareSpec                      # the machine the advice is for
    use_case: UseCase                       # the goal the advice is for
    context_tokens: int                     # context length assumed for the estimates
    verdicts: list[ModelVerdict]            # one entry per model class, biggest first
    headline: ModelVerdict | None           # the single "just use this" pick, if there is one
    runtimes: list[Runtime]
    meets_goal: bool                        # whether the headline is big enough for the use case

    def _in_band(self, band: str) -> list[ModelVerdict]:
        """Return the verdicts whose band equals *band*, in table order."""
        return [entry for entry in self.verdicts if entry.verdict == band]

    @property
    def works_now(self) -> list[ModelVerdict]:
        """Verdicts in the WORKS NOW band."""
        return self._in_band(VERDICT_WORKS)

    @property
    def compromises(self) -> list[ModelVerdict]:
        """Verdicts in the WORKS WITH COMPROMISES band."""
        return self._in_band(VERDICT_COMPROMISE)

    @property
    def dont_bother(self) -> list[ModelVerdict]:
        """Verdicts in the DON'T BOTHER band."""
        return self._in_band(VERDICT_NO)


def _evaluate_model(
    model: ModelClass, spec: HardwareSpec, use_case: UseCase, context_tokens: int
) -> ModelVerdict:
    """Decide which verdict band one model class lands in on this machine.

    Three outcomes are tried in order; the first that applies is returned:

    1. WORKS NOW: the machine has a fast path and some quant at or above
       the recommended 4-bit floor fits inside the fast budget.
    2. WORKS WITH COMPROMISES: the recommended quant (or, failing that,
       Q3_K_M, then Q2_K) fits inside the total budget.
    3. DON'T BOTHER: not even Q2_K fits the total budget.

    Every fit check only fills a budget to ``_SAFETY_FILL`` of its size.

    Args:
        model: The model size class to evaluate.
        spec: The machine; supplies the fast and total memory budgets.
        use_case: The goal; supplies the job overhead factor.
        context_tokens: Context length passed to the memory estimate.

    Returns:
        A ``ModelVerdict`` carrying the band, the quant chosen, its memory
        estimate, and plain-language notes explaining the outcome.
    """
    fast = spec.fast_budget_gb
    total = spec.total_budget_gb
    usable_total = total * _SAFETY_FILL
    overhead = use_case.overhead_factor
    q4_bpw = RECOMMENDED_QUANT.bits_per_weight  # the 4-bit quality floor

    def estimate(quant: QuantTier) -> MemoryEstimate:
        # One place for the estimate call so every path uses identical inputs.
        return estimate_memory(model, quant, context_tokens=context_tokens,
                               job_overhead_factor=overhead)

    # --- Fast path: best *quality* quant that fits on the GPU/shared mem ---
    # Only a fit at 4-bit or better counts as "Works now". Cramming a big
    # model down to 2-bit just to claim it "fits" is exactly the kind of
    # overpromise this tool refuses to make.
    if spec.has_fast_path:
        fast_limit = fast * _SAFETY_FILL
        for q in QUANT_TIERS:  # ordered best-quality -> smallest
            if q.bits_per_weight < q4_bpw:
                break  # don't accept sub-4-bit as a clean "works now"
            est = estimate(q)
            if est.total_gb <= fast_limit:
                full_q = q.key in ("fp16", "Q8_0", "Q6_K")
                notes = []
                if q is not RECOMMENDED_QUANT and not full_q:
                    notes.append(f"Runs at {q.plain_name} — even a touch sharper than the usual 4-bit.")
                return ModelVerdict(model, VERDICT_WORKS, q, est, full_q, notes)

    # --- Compromise path: fits if we let it use ordinary RAM (slower) ------
    # Prefer the everyday 4-bit; drop smaller only if needed.
    # NOTE(review): if a sub-4-bit quant would fit the fast budget entirely,
    # the "part runs on slower memory" note below may overstate the slowdown;
    # confirm against how HardwareSpec relates the fast and total budgets.
    for q in (RECOMMENDED_QUANT, QUANT_BY_KEY["Q3_K_M"], QUANT_BY_KEY["Q2_K"]):
        est = estimate(q)
        if est.total_gb <= usable_total:
            notes = []
            if not spec.has_fast_path:
                notes.append("Runs on the processor (no graphics card to speed it up) — expect slow replies.")
            else:
                notes.append("Too big to fit the graphics card on its own — part runs on slower memory, so replies come more slowly.")
            if q is not RECOMMENDED_QUANT:
                notes.append(f"Had to shrink it to {q.plain_name} to fit — some quality is lost.")
            return ModelVerdict(model, VERDICT_COMPROMISE, q, est, False, notes)

    # --- Doesn't fit even at the smallest setting --------------------------
    # The loop above ended on Q2_K, so `q` and `est` already describe the
    # smallest setting; there is no need to estimate it a second time.
    #
    # The shortfall is measured against the same usable budget the fit test
    # used. Measuring against the raw total would report zero or a negative
    # number of GB for a model that only just misses the safe budget. The
    # 0.1 floor stops a tiny miss from being rounded down to "0 GB more".
    short_by = max(round(est.total_gb - usable_total, 1), 0.1)
    notes = [f"Needs about {est.total_gb:g} GB even squeezed down — "
             f"around {short_by:g} GB more than this machine can give it."]
    return ModelVerdict(model, VERDICT_NO, q, est, False, notes)


def _rank(model_key: str) -> int:
    """Return the position of the model class with key *model_key*.

    The position is the index into ``MODEL_CLASSES``, so ranks compare in
    the same order as that catalogue.

    Raises:
        ValueError: If no model class has that key. A bare ``next()`` would
            leak ``StopIteration`` instead, which turns into an opaque
            ``RuntimeError`` if it ever escapes inside a generator.
    """
    for position, model_class in enumerate(MODEL_CLASSES):
        if model_class.key == model_key:
            return position
    raise ValueError(f"Unknown model class key: {model_key!r}")


def advise(spec: HardwareSpec, use_case_key: str = "chat") -> Advice:
    """Build the complete recommendation for one machine and one goal.

    An unrecognised ``use_case_key`` is treated as "chat". Every model class
    is evaluated (largest first), then one headline pick is chosen by the
    first rule that yields a candidate:

      1. The largest WORKS NOW model that is at least the use case's
         minimum size. Fast and capable is the best answer.
      2. A COMPROMISE model of at least the minimum size: the largest one
         that does not exceed the ideal size, or the smallest one if they
         all exceed it. Candidates at 4-bit or better are preferred.
      3. The largest WORKS NOW model of any size, or else a COMPROMISE
         model picked as in rule 2; either is flagged as below par.

    If nothing fits at all, the headline is ``None``.
    """
    chat_fallback = USE_CASE_BY_KEY["chat"]
    use_case = USE_CASE_BY_KEY.get(use_case_key, chat_fallback)
    context_tokens = _CONTEXT_FOR_USE_CASE.get(use_case.key, 4096)

    # Largest size class first, so the resulting table reads top-down.
    verdicts = [
        _evaluate_model(model_class, spec, use_case, context_tokens)
        for model_class in reversed(MODEL_CLASSES)
    ]

    ideal_rank = _rank(use_case.good_class)
    floor_rank = _rank(use_case.min_class)
    floor_bpw = RECOMMENDED_QUANT.bits_per_weight

    def size_rank(candidate):
        return _rank(candidate.model.key)

    def biggest(candidates):
        return max(candidates, key=size_rank)

    def closest_to_ideal(candidates):
        # Stay at or under the ideal size where possible; an oversized
        # model on the slow path is needlessly slow.
        not_oversized = [c for c in candidates if size_rank(c) <= ideal_rank]
        if not_oversized:
            return biggest(not_oversized)
        return min(candidates, key=size_rank)

    def prefer_clean_quant(candidates):
        # Avoid headlining a desperate sub-4-bit squeeze when a cleaner
        # option exists; fall back to the full list only if none does.
        clean = [c for c in candidates if c.quant.bits_per_weight >= floor_bpw]
        return clean or candidates

    fast_fits = [v for v in verdicts if v.verdict == VERDICT_WORKS]
    slow_fits = [v for v in verdicts if v.verdict == VERDICT_COMPROMISE]
    fast_big_enough = [v for v in fast_fits if size_rank(v) >= floor_rank]
    slow_big_enough = [v for v in slow_fits if size_rank(v) >= floor_rank]

    if fast_big_enough:
        headline = biggest(fast_big_enough)
        meets_goal = True
    elif slow_big_enough:
        headline = closest_to_ideal(prefer_clean_quant(slow_big_enough))
        meets_goal = True
    elif fast_fits:
        headline = biggest(fast_fits)
        meets_goal = False
    elif slow_fits:
        headline = closest_to_ideal(prefer_clean_quant(slow_fits))
        meets_goal = False
    else:
        headline = None
        meets_goal = False

    if not meets_goal and headline is not None:
        # Lead with the caveat so it is the first thing the reader sees.
        caveat = (
            f"This is the best this machine can do, but it's on the small "
            f"side for {use_case.plain_name.lower()} — treat results as 'okay', not great."
        )
        headline.notes.insert(0, caveat)

    runtimes = pick_runtimes(spec)
    return Advice(
        spec=spec,
        use_case=use_case,
        context_tokens=context_tokens,
        verdicts=verdicts,
        headline=headline,
        runtimes=runtimes,
        meets_goal=meets_goal,
    )