"""
Putting it in plain words.
The advisor produces structured facts; this module turns them into sentences a
non-technical person actually understands, and into commands they can copy and
paste. No jargon survives here without being explained.
"""
from .advisor import (
Advice,
ModelVerdict,
VERDICT_WORKS,
VERDICT_COMPROMISE,
VERDICT_NO,
)
# One row per verdict: (verdict constant, coloured-circle emoji, short label).
# Both lookup tables below are derived from it so they cannot drift apart.
_VERDICT_DISPLAY = (
    (VERDICT_WORKS, "🟢", "Works now"),
    (VERDICT_COMPROMISE, "🟡", "Works, with compromises"),
    (VERDICT_NO, "🔴", "Don't bother"),
)

# Verdict -> coloured-circle emoji.
VERDICT_EMOJI = {verdict: emoji for verdict, emoji, _label in _VERDICT_DISPLAY}

# Verdict -> short plain-English label.
VERDICT_WORD = {verdict: label for verdict, _emoji, label in _VERDICT_DISPLAY}
def speed_hint(v: ModelVerdict, spec) -> str:
    """Describe, in one short phrase, how quick replies are likely to feel.

    A "no" verdict yields a bare dash and a "compromise" verdict a fixed
    "slow" message. For any other verdict the phrase is chosen from the
    model's size in billions of parameters. ``spec`` is accepted but not
    consulted by this function.
    """
    outcome = v.verdict
    if outcome == VERDICT_NO:
        return "—"
    if outcome == VERDICT_COMPROMISE:
        return "Slow — usable for short tasks, not snappy chat."

    # Remaining case: the model works. Larger models still answer more slowly,
    # so walk the size ceilings from smallest to largest.
    size = v.model.billions
    tiers = (
        (4, "Fast — replies feel instant."),
        (14, "Comfortable — quick enough for live chat."),
    )
    for ceiling, phrase in tiers:
        if size <= ceiling:
            return phrase
    return "Steady — fine, just not instant on big answers."
# --------------------------------------------------------------------------
# Commands
# --------------------------------------------------------------------------
def ollama_command(v: ModelVerdict) -> str:
    """Return the copy-paste ``ollama run`` command for the verdict's model."""
    tag = v.model.ollama_tag
    return f"ollama run {tag}"
def llamacpp_command(v: ModelVerdict) -> str:
    """Return the copy-paste ``llama-server`` command for this verdict.

    The command names the model as ``<gguf_repo>:<quant key>`` after ``-hf``
    and passes the estimate's context size in tokens after ``-c``.
    """
    # llama.cpp can fetch a GGUF directly from Hugging Face given repo:quant.
    hf_ref = f"{v.model.gguf_repo}:{v.quant.key}"
    context = v.estimate.context_tokens
    return f"llama-server -hf {hf_ref} -c {context}"
# --------------------------------------------------------------------------
# Headline summary, in human words
# --------------------------------------------------------------------------
def headline_text(advice: Advice) -> str:
    """Build the markdown headline summary for one piece of advice.

    When ``advice.headline`` is ``None`` the text says the machine cannot
    comfortably run a local model for the use case. Otherwise the text is, in
    order: a bold one-line lead chosen by the headline verdict; a paragraph
    naming the model, its quant setting and the memory it needs next to the
    machine's memory budget; then the use-case note and the headline's own
    notes (as a bullet list), each only when non-empty.
    """
    spec = advice.spec
    uc = advice.use_case
    pick = advice.headline

    # Guard clause: nothing fits, so say so plainly and stop here.
    if pick is None:
        return (
            f"**Honest answer: this machine can't comfortably run local AI "
            f"for {uc.plain_name.lower()} yet.**\n\n"
            f"Even the smallest models need more memory than the "
            f"{spec.ram_gb:g} GB available here once everything else is "
            f"running. That's not a failure — small computers just have small "
            f"budgets. A free cloud option, or adding memory, would open this up."
        )

    model = pick.model
    quant = pick.quant

    # The graphics card is only mentioned when the machine has a fast path
    # AND the verdict is a clean "works"; every other case says processor.
    if spec.has_fast_path and pick.verdict == VERDICT_WORKS:
        where = "on the graphics card"
    else:
        where = "on the processor"

    if pick.verdict == VERDICT_WORKS:
        lead = f"**Yes — you can run a {model.plain_name} model {where}, today.**"
    elif pick.verdict == VERDICT_COMPROMISE:
        lead = f"**Sort of — a {model.plain_name} model will run, but with trade-offs.**"
    else:
        lead = f"**Not really — even a {model.plain_name} model is a stretch here.**"

    pieces = [lead]

    # Recommendation paragraph: which model and setting, and what it is good for.
    pieces.append(
        f"\n\nFor **{uc.plain_name.lower()}**, the sweet spot on your machine is a "
        f"**{model.plain_name}** model at the **{quant.plain_name}** setting. "
        f"{model.good_for}\n\n"
    )

    # Memory paragraph: what the model needs versus what the machine has.
    est = pick.estimate
    pieces.append(
        f"That needs about **{est.total_gb:g} GB** of memory "
        f"(model {est.weights_gb:g} GB + chat memory "
        f"{est.kv_cache_gb:g} GB + working space {est.overhead_gb:g} GB), "
        f"and you have roughly **{spec.fast_budget_gb:g} GB** fast / "
        f"**{spec.total_budget_gb:g} GB** total to play with."
    )

    # Optional extras, each separated from the text above by a blank line.
    if uc.note:
        pieces.append(f"\n\n*Note for this job:* {uc.note}")
    if pick.notes:
        bullet_list = "\n".join(f"- {item}" for item in pick.notes)
        pieces.append("\n\n" + bullet_list)

    return "".join(pieces)
def jargon_glossary() -> str:
    """Return the markdown glossary explaining the technical terms used.

    The result is a bold title, a blank line, then one
    ``- **term** — meaning`` bullet per entry, separated by newlines with no
    trailing newline.
    """
    entries = (
        ("Model", "the AI's 'brain'. Bigger = smarter but heavier."),
        (
            "Parameters (e.g. 7B)",
            "how big the brain is. 7B = 7 billion. "
            "More = smarter and hungrier for memory.",
        ),
        (
            "Quantisation (4-bit, 8-bit)",
            "shrinking the model so it fits. "
            "4-bit is the popular sweet spot: much smaller, "
            "barely-noticeable quality loss.",
        ),
        (
            "VRAM",
            "the fast memory on a graphics card. The single biggest "
            "factor in what you can run quickly.",
        ),
        (
            "RAM",
            "your computer's normal memory. Models can use it too, "
            "but it's slower.",
        ),
        (
            "KV cache / 'chat memory'",
            "scratch space the model uses to "
            "remember the current conversation. Longer chats use more.",
        ),
        ("GGUF", "a single-file model format made for running locally."),
        (
            "llama.cpp / Ollama",
            "the programs that actually run the model on your machine.",
        ),
    )
    bullets = "\n".join(f"- **{term}** — {meaning}" for term, meaning in entries)
    return "**Plain-English glossary**\n\n" + bullets
def how_to_find_specs(os_hint: str = "windows") -> str:
    """Return markdown steps for looking up RAM and graphics card on an OS.

    Args:
        os_hint: Which operating system to describe. Matching ignores case
            and surrounding whitespace. ``"macos"`` (also ``"mac"`` or
            ``"darwin"``) and ``"linux"`` get their own steps; anything else,
            including ``None`` or an empty string, gets the Windows steps.

    Returns:
        A markdown string: a bold heading followed by bullet points.
    """
    heading = "**Not sure of your specs? Here's how to check:**\n\n"

    # Normalise before comparing so values such as "macOS", "Linux " or
    # platform.system()'s "Darwin" do not silently fall through to the
    # Windows instructions.
    key = str(os_hint or "").strip().lower()

    if key in ("macos", "mac", "darwin"):
        return heading + (
            "- Click the Apple menu (top-left) → **About This Mac**.\n"
            "- It shows your chip (e.g. *Apple M2*) and **Memory** (e.g. *16 GB*).\n"
            "- On a Mac, that one memory number is all you need — the graphics "
            "share it."
        )
    if key == "linux":
        return heading + (
            "- RAM: run `free -h` in a terminal.\n"
            "- Graphics card: run `nvidia-smi` (NVIDIA) or `lspci | grep VGA`.\n"
        )
    # Default: Windows steps, also used for any unrecognised hint.
    return heading + (
        "- **RAM:** press `Ctrl + Shift + Esc` → **Performance** tab → **Memory**.\n"
        "- **Graphics card:** same window → **GPU**. The name is at the top "
        "right (e.g. *NVIDIA RTX 3060*).\n"
        "- No GPU section showing a real card? You likely have built-in "
        "graphics — that's fine, just pick the 'built-in' option."
    )
|