File size: 8,090 Bytes
12d2e34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
"""
The advisor: turn a machine + a goal into an honest verdict.

Output is organised into three plain bands, because that is what makes the
tool trustworthy instead of hypey:

  - WORKS NOW          : runs well, on the fast path, today.
  - WORKS WITH COMPROMISES : it'll run, but slower or smaller than ideal.
  - DON'T BOTHER       : not realistic on this machine — said plainly.

No fake promises. If something doesn't fit, we say so and explain why.
"""

from dataclasses import dataclass, field

from .catalogue import (
    MODEL_CLASSES,
    QUANT_TIERS,
    RECOMMENDED_QUANT,
    QUANT_BY_KEY,
    MODEL_BY_KEY,
    ModelClass,
    QuantTier,
    UseCase,
    USE_CASE_BY_KEY,
)
from .estimator import MemoryEstimate, estimate_memory
from .hardware import HardwareSpec
from .runtimes import Runtime, pick_runtimes


# Assumed context length per job, in tokens, keyed by use-case key.
# Rule of thumb: roughly 750 words per 1,000 tokens.
# Use cases missing from this table fall back to 4096 where it is read.
_CONTEXT_FOR_USE_CASE = {
    "chat": 4096,
    "writing": 4096,
    "coding": 4096,
    "agents": 4096,
    "rag": 8192,
    "finetune": 2048,
}

# We only ever fill a budget to this fraction — the rest is breathing room.
# It scales both the fast and the total memory budget when checking fit.
_SAFETY_FILL = 0.90

# The three verdict bands; one of these is stored in ModelVerdict.verdict.
VERDICT_WORKS = "works_now"
VERDICT_COMPROMISE = "compromises"
VERDICT_NO = "dont_bother"


@dataclass
class ModelVerdict:
    """The outcome of checking one model size class against one machine.

    Field order is part of the interface: instances are built positionally.
    """

    model: ModelClass            # the size class that was evaluated
    verdict: str                 # one of the VERDICT_* constants
    quant: QuantTier             # the quant we'd actually recommend
    estimate: MemoryEstimate     # memory estimate for `model` at `quant`
    full_quality_on_fast: bool   # True if it runs on the GPU at fp16/near-full
    # Plain-language explanations attached to the verdict; each instance gets
    # its own fresh list.
    notes: list[str] = field(default_factory=list)


@dataclass
class Advice:
    """Complete advice for one machine and one goal.

    Holds the per-model verdicts plus the single headline pick, and exposes
    the verdicts split into the three bands via read-only properties.
    """

    spec: HardwareSpec
    use_case: UseCase
    context_tokens: int
    verdicts: list[ModelVerdict]            # one per model class, big→small order kept
    headline: ModelVerdict | None           # the single best pick for this goal
    runtimes: list[Runtime]
    meets_goal: bool                         # does the headline satisfy the use case?

    def _in_band(self, band: str) -> list[ModelVerdict]:
        # Keep the original table order; just filter down to one band.
        return [entry for entry in self.verdicts if entry.verdict == band]

    @property
    def works_now(self) -> list[ModelVerdict]:
        """Verdicts in the WORKS NOW band."""
        return self._in_band(VERDICT_WORKS)

    @property
    def compromises(self) -> list[ModelVerdict]:
        """Verdicts in the WORKS WITH COMPROMISES band."""
        return self._in_band(VERDICT_COMPROMISE)

    @property
    def dont_bother(self) -> list[ModelVerdict]:
        """Verdicts in the DON'T BOTHER band."""
        return self._in_band(VERDICT_NO)


def _evaluate_model(
    model: ModelClass, spec: HardwareSpec, use_case: UseCase, context_tokens: int
) -> ModelVerdict:
    """Classify one model size class for this machine and job.

    Three outcomes are tried in order; the first that applies is returned:

    1. ``VERDICT_WORKS``: some quant at or above the recommended 4-bit tier
       fits inside the safety-filled fast budget. Only attempted when
       ``spec.has_fast_path`` is true.
    2. ``VERDICT_COMPROMISE``: the recommended quant, then Q3_K_M, then Q2_K,
       fits inside the safety-filled total budget.
    3. ``VERDICT_NO``: even Q2_K exceeds the usable total budget; the note
       states roughly how far short the machine is.

    Args:
        model: The model size class to evaluate.
        spec: The machine; ``fast_budget_gb``, ``total_budget_gb`` and
            ``has_fast_path`` are read from it.
        use_case: The job; its ``overhead_factor`` is passed to the estimator.
        context_tokens: Context length assumed for the memory estimate.

    Returns:
        A ``ModelVerdict`` carrying the chosen quant, its memory estimate and
        plain-language notes.
    """
    fast = spec.fast_budget_gb
    total = spec.total_budget_gb
    of = use_case.overhead_factor
    q4_bpw = RECOMMENDED_QUANT.bits_per_weight  # the 4-bit quality floor

    # --- Fast path: best *quality* quant that fits on the GPU/shared mem ---
    # We only call it "Works now" if it fits fast at 4-bit or better. Cramming
    # a big model down to 2-bit just to claim it "fits" is exactly the kind of
    # overpromise this tool refuses to make — that path becomes a compromise.
    if spec.has_fast_path:
        for q in QUANT_TIERS:  # ordered best-quality -> smallest
            if q.bits_per_weight < q4_bpw:
                break  # don't accept sub-4-bit as a clean "works now"
            est = estimate_memory(model, q, context_tokens=context_tokens,
                                  job_overhead_factor=of)
            if est.total_gb <= fast * _SAFETY_FILL:
                full_q = q.key in ("fp16", "Q8_0", "Q6_K")
                notes = []
                if q is not RECOMMENDED_QUANT and not full_q:
                    notes.append(f"Runs at {q.plain_name} — even a touch sharper than the usual 4-bit.")
                return ModelVerdict(model, VERDICT_WORKS, q, est, full_q, notes)

    # The ceiling every remaining fit check (and the shortfall figure) uses.
    usable_total = total * _SAFETY_FILL

    # --- Compromise path: fits if we let it use ordinary RAM (slower) ------
    # Prefer the everyday 4-bit; drop smaller only if needed.
    for q in (RECOMMENDED_QUANT, QUANT_BY_KEY["Q3_K_M"], QUANT_BY_KEY["Q2_K"]):
        est = estimate_memory(model, q, context_tokens=context_tokens,
                              job_overhead_factor=of)
        if est.total_gb <= usable_total:
            notes = []
            if not spec.has_fast_path:
                notes.append("Runs on the processor (no graphics card to speed it up) — expect slow replies.")
            else:
                notes.append("Too big to fit the graphics card on its own — part runs on slower memory, so replies come more slowly.")
            if q is not RECOMMENDED_QUANT:
                notes.append(f"Had to shrink it to {q.plain_name} to fit — some quality is lost.")
            return ModelVerdict(model, VERDICT_COMPROMISE, q, est, False, notes)

    # --- Doesn't fit even at the smallest setting --------------------------
    smallest = QUANT_BY_KEY["Q2_K"]
    est = estimate_memory(model, smallest, context_tokens=context_tokens,
                          job_overhead_factor=of)
    # Measure the shortfall against the same safety-filled ceiling that
    # rejected the model. Comparing with the raw total could report zero or a
    # negative number when the estimate sits between 90% and 100% of it. The
    # floor keeps a tiny overshoot from rounding down to "0 GB more".
    short_by = max(round(est.total_gb - usable_total, 1), 0.1)
    notes = [f"Needs about {est.total_gb:g} GB even squeezed down — "
             f"around {short_by:g} GB more than this machine can give it."]
    return ModelVerdict(model, VERDICT_NO, smallest, est, False, notes)


def _rank(model_key: str) -> int:
    """Return the index in ``MODEL_CLASSES`` of the class with this key.

    The first match wins. Raises ``StopIteration`` when no class has the key.
    """
    matching_positions = (
        position
        for position, candidate in enumerate(MODEL_CLASSES)
        if candidate.key == model_key
    )
    return next(matching_positions)


def advise(spec: HardwareSpec, use_case_key: str = "chat") -> Advice:
    """Build the full recommendation for one machine and one goal.

    An unrecognised ``use_case_key`` falls back to the "chat" use case. Every
    model class gets a verdict (listed biggest first), and one of them is
    chosen as the headline "just use this" pick.

    Args:
        spec: The machine being advised on.
        use_case_key: Key of the goal; defaults to "chat".

    Returns:
        An ``Advice`` with all verdicts, the headline (``None`` when nothing
        works or compromises), the runtimes picked for ``spec``, and whether
        the headline is big enough for the goal.
    """
    fallback_case = USE_CASE_BY_KEY["chat"]
    use_case = USE_CASE_BY_KEY.get(use_case_key, fallback_case)
    context_tokens = _CONTEXT_FOR_USE_CASE.get(use_case.key, 4096)

    # Walk the size classes from the biggest down so the table reads top-down.
    verdicts = []
    for model_class in reversed(MODEL_CLASSES):
        verdicts.append(_evaluate_model(model_class, spec, use_case, context_tokens))

    ideal_rank = _rank(use_case.good_class)
    floor_rank = _rank(use_case.min_class)
    floor_bpw = RECOMMENDED_QUANT.bits_per_weight

    def size_rank(candidate):
        return _rank(candidate.model.key)

    def largest(candidates):
        return max(candidates, key=size_rank)

    def nearest_good(candidates):
        # Closest to the ideal size without overshooting into needless
        # slowness; if everything overshoots, take the smallest on offer.
        not_oversized = [c for c in candidates if size_rank(c) <= ideal_rank]
        if not_oversized:
            return largest(not_oversized)
        return min(candidates, key=size_rank)

    def decent(candidates):
        # Drop anything that only fits at a sub-4-bit squeeze — quality
        # matters more than the size printed on the box.
        return [c for c in candidates if c.quant.bits_per_weight >= floor_bpw]

    def best_compromise(candidates):
        # Use the cleaner-quant subset when there is one, else everything.
        return nearest_good(decent(candidates) or candidates)

    fast_fits = [v for v in verdicts if v.verdict == VERDICT_WORKS]
    slow_fits = [v for v in verdicts if v.verdict == VERDICT_COMPROMISE]
    fast_big_enough = [v for v in fast_fits if size_rank(v) >= floor_rank]
    slow_big_enough = [v for v in slow_fits if size_rank(v) >= floor_rank]

    # --- Headline: the single "just use this" pick -----------------------
    # Tried top to bottom; the first non-empty pool wins:
    #   1. Biggest WORKS NOW model that is big enough for the job.
    #   2. Best COMPROMISE that is big enough, sized close to ideal.
    #   3. Biggest WORKS NOW model at all, flagged as below-par.
    #   4. Best COMPROMISE at all, flagged as below-par.
    priority_ladder = (
        (fast_big_enough, largest, True),
        (slow_big_enough, best_compromise, True),
        (fast_fits, largest, False),
        (slow_fits, best_compromise, False),
    )

    headline = None
    meets_goal = False
    for pool, choose, satisfies_goal in priority_ladder:
        if pool:
            headline = choose(pool)
            meets_goal = satisfies_goal
            break

    # The headline is one of the objects in `verdicts`, so this caveat is
    # visible through both.
    if headline is not None and not meets_goal:
        caveat = (
            f"This is the best this machine can do, but it's on the small "
            f"side for {use_case.plain_name.lower()} — treat results as 'okay', not great."
        )
        headline.notes.insert(0, caveat)

    runtimes = pick_runtimes(spec)
    return Advice(
        spec=spec,
        use_case=use_case,
        context_tokens=context_tokens,
        verdicts=verdicts,
        headline=headline,
        runtimes=runtimes,
        meets_goal=meets_goal,
    )