# Spaces: Runtime error
# File size: 9,233 Bytes · 17a8f31
import gradio as gr
import pandas as pd
# Pre-computed quant comparison data.
#
# The table is written as compact rows and expanded below into
# QUANT_DATA[model][quant] -> {"size_mb", "quality", "speed_tps", "ram_mb"}.
_METRIC_FIELDS = ("size_mb", "quality", "speed_tps", "ram_mb")

# model -> quant level -> (size_mb, quality, speed_tps, ram_mb)
_QUANT_ROWS = {
    "SmolLM2-135M": {
        "FP16": (270, 100, 25.5, 400),
        "Q8_0": (100, 98, 28.2, 250),
        "Q5_K_M": (75, 95, 30.1, 200),
        "Q4_K_M": (60, 92, 32.0, 180),
        "Q3_K_M": (50, 85, 33.5, 160),
        "Q2_K": (40, 75, 35.0, 140),
    },
    "Llama-3.2-1B": {
        "FP16": (2500, 100, 12.0, 3000),
        "Q8_0": (1050, 98, 15.5, 1500),
        "Q6_K": (850, 97, 16.8, 1300),
        "Q5_K_M": (750, 95, 17.5, 1200),
        "Q4_K_M": (650, 92, 18.2, 1100),
        "Q3_K_M": (550, 85, 19.0, 1000),
        "Q2_K": (450, 75, 20.0, 900),
    },
    "Qwen2.5-0.5B": {
        "FP16": (1000, 100, 20.0, 1500),
        "Q8_0": (450, 98, 24.0, 800),
        "Q5_K_M": (350, 95, 25.5, 700),
        "Q4_K_M": (300, 92, 26.8, 650),
        "Q3_K_M": (250, 85, 27.5, 600),
        "Q2_K": (200, 75, 28.5, 550),
    },
    "Qwen2.5-1.5B": {
        "FP16": (3000, 100, 10.5, 3500),
        "Q8_0": (1600, 98, 13.0, 2200),
        "Q5_K_M": (1100, 95, 14.5, 1700),
        "Q4_K_M": (950, 92, 15.2, 1500),
        "Q3_K_M": (800, 85, 16.0, 1400),
        "Q2_K": (650, 75, 17.0, 1200),
    },
    "Gemma-2-2B": {
        "FP16": (5000, 100, 8.0, 5500),
        "Q8_0": (2200, 98, 10.5, 2800),
        "Q5_K_M": (1500, 95, 12.0, 2200),
        "Q4_K_M": (1300, 92, 12.8, 2000),
        "Q3_K_M": (1100, 85, 13.5, 1800),
        "Q2_K": (900, 75, 14.5, 1600),
    },
    "Phi-3.5-3.8B": {
        "FP16": (7600, 100, 5.5, 8000),
        "Q8_0": (3300, 98, 7.0, 4000),
        "Q5_K_M": (2400, 95, 8.5, 3200),
        "Q4_K_M": (2100, 92, 9.0, 3000),
        "Q3_K_M": (1700, 85, 9.8, 2700),
        "Q2_K": (1400, 75, 10.5, 2400),
    },
}

# Expand each row into a metrics dict; insertion order of models, quant
# levels and metric keys follows the table above.
QUANT_DATA = {
    model: {
        quant: dict(zip(_METRIC_FIELDS, row))
        for quant, row in levels.items()
    }
    for model, levels in _QUANT_ROWS.items()
}
# Hard-coded sample completions, keyed by quant level. Several entries share
# the same opening sentences, so those are spelled out once and reused.
_SHARED_OPENING = (
    "The capital of France is Paris. It is the largest city in France and "
    "serves as the country's political, cultural, and economic center. "
)

# FP16, Q8_0 and Q6_K carry exactly the same text.
_LONGEST_ANSWER = (
    _SHARED_OPENING
    + "Paris is known for landmarks like the Eiffel Tower, the Louvre Museum, "
    "and Notre-Dame Cathedral."
)

SAMPLE_OUTPUTS = {
    "FP16": _LONGEST_ANSWER,
    "Q8_0": _LONGEST_ANSWER,
    "Q6_K": _LONGEST_ANSWER,
    "Q5_K_M": (
        _SHARED_OPENING
        + "Paris is known for landmarks like the Eiffel Tower and the Louvre Museum."
    ),
    "Q4_K_M": (
        _SHARED_OPENING
        + "Paris is famous for the Eiffel Tower and the Louvre."
    ),
    "Q3_K_M": (
        "The capital of France is Paris. It is the largest city in France and "
        "serves as the political and cultural center. "
        "Paris is famous for the Eiffel Tower."
    ),
    "Q2_K": (
        "The capital of France is Paris. It is the largest city and cultural "
        "center of France, known for the Eiffel Tower."
    ),
}
def get_comparison(model_name):
    """Build the quant comparison table and sample-output text for one model.

    Args:
        model_name: Key into ``QUANT_DATA`` (for example ``"Llama-3.2-1B"``).

    Returns:
        A ``(table, markdown)`` tuple. ``table`` is a DataFrame with one row
        per quant level of the model; ``markdown`` lists the sample output of
        each of those quant levels that has an entry in ``SAMPLE_OUTPUTS``.
        For an unknown model the result is an empty DataFrame and the string
        ``"Model not found"``.
    """
    data = QUANT_DATA.get(model_name, {})
    if not data:
        return pd.DataFrame(), "Model not found"

    # FP16 is the reference for the relative-size column. Look it up once and
    # degrade to "n/a" instead of raising if a model has no FP16 entry.
    baseline_size = data.get("FP16", {}).get("size_mb")

    rows = []
    for quant, metrics in data.items():
        if baseline_size:
            relative_size = f'{metrics["size_mb"] / baseline_size * 100:.0f}%'
        else:
            relative_size = "n/a"
        rows.append({
            "Quant": quant,
            "Size (MB)": metrics["size_mb"],
            "Quality Score": metrics["quality"],
            "Speed (tok/s)": metrics["speed_tps"],
            "RAM Needed (MB)": metrics["ram_mb"],
            "Size vs FP16": relative_size,
        })
    df = pd.DataFrame(rows)

    # Build output comparison.
    parts = [
        "### Sample Output Comparison\n\n",
        "**Prompt:** 'The capital of France is'\n\n",
    ]
    # Walk the model's own quant levels (not a fixed list) so the text always
    # matches the table: levels such as Q6_K are included when the model has
    # them, and levels the model lacks are left out.
    parts.extend(
        f"**{quant}:** {SAMPLE_OUTPUTS[quant]}\n\n"
        for quant in data
        if quant in SAMPLE_OUTPUTS
    )
    return df, "".join(parts)
def get_recommendation(ram_mb, task):
    """Recommend the best model+quant for a given RAM budget.

    Args:
        ram_mb: Available RAM in MB. Only entries whose ``ram_mb`` is less
            than or equal to this value are considered.
        task: Primary task selected in the UI. Currently not used for
            filtering or ranking: ``QUANT_DATA`` holds no per-task metrics.

    Returns:
        A DataFrame with at most 10 rows, ordered by ``Quality`` descending
        and, among equal quality scores, by ``Speed (tok/s)`` descending.
        If nothing fits the budget, a one-row DataFrame with a single
        ``Error`` column is returned instead.
    """
    recommendations = []
    for model, quants in QUANT_DATA.items():
        for quant, metrics in quants.items():
            if metrics["ram_mb"] > ram_mb:
                continue
            recommendations.append({
                "Model": model,
                "Quant": quant,
                "Size (MB)": metrics["size_mb"],
                "Quality": metrics["quality"],
                "Speed (tok/s)": metrics["speed_tps"],
                "RAM (MB)": metrics["ram_mb"],
            })

    if not recommendations:
        return pd.DataFrame([{"Error": "No models fit in that RAM budget"}])

    df = pd.DataFrame(recommendations)
    # Many entries share the same quality score. Sorting on quality alone
    # leaves the order of those ties unspecified (the default single-column
    # sort is not stable), so the top 10 could vary. Break ties by speed to
    # make the ranking deterministic.
    ranked = df.sort_values(
        ["Quality", "Speed (tok/s)"],
        ascending=[False, False],
    )
    return ranked.head(10)
# Markdown bodies for the UI. They are defined at column 0 so the text handed
# to gr.Markdown carries no leading indentation.
_INTRO_MARKDOWN = """
# 🎯 dispatchAI Quantization Playground
Compare GGUF quantization levels side-by-side. See how size, speed, and quality trade off.
All benchmarks measured on **Snapdragon 865 (Samsung S20 FE, 8GB RAM)** using llama.cpp.
"""

_ABOUT_MARKDOWN = """
## About These Benchmarks
All measurements taken on real hardware:
- **Phone:** Samsung S20 FE 5G
- **SoC:** Snapdragon 865
- **RAM:** 8GB
- **Runtime:** llama.cpp (4 threads)
- **Prompt length:** 32 tokens
- **Generation:** 64 tokens
### Quality Score
Quality is measured as a relative score (100 = FP16 baseline) using:
- Perplexity on a standard eval set
- Human evaluation of coherence
- Repetition penalty
### Quant Level Guide
- **Q4_K_M** = Best balance for mobile (40% size, 92% quality)
- **Q5_K_M** = Quality-sensitive mobile (50% size, 95% quality)
- **Q2_K** = Ultra-low RAM (25% size, 75% quality)
---
🚀 [dispatchAI](https://huggingface.co/dispatchAI) — Small. Mobile. Free. UAE-built.
"""

# Three-tab layout: per-model quant comparison, RAM-based recommender, and a
# static "About" page.
with gr.Blocks(
    theme=gr.themes.Soft(primary_hue="blue"),
    title="dispatchAI Quant Playground",
) as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    # Tab 1: pick a model, then show its quant table and sample outputs.
    with gr.Tab("📊 Quant Comparison"):
        model_dropdown = gr.Dropdown(
            label="Select Model",
            choices=list(QUANT_DATA),
            value="Llama-3.2-1B",
        )
        compare_btn = gr.Button("Compare Quant Levels", variant="primary")
        comparison_table = gr.DataFrame(label="Quantization Comparison")
        output_comparison = gr.Markdown(label="Output Quality Comparison")
        # get_comparison returns (table, markdown), matching the two outputs.
        compare_btn.click(
            fn=get_comparison,
            inputs=[model_dropdown],
            outputs=[comparison_table, output_comparison],
        )

    # Tab 2: list model/quant combinations that fit a RAM budget.
    with gr.Tab("📱 Phone RAM Recommender"):
        gr.Markdown("### Find the best model for your phone's RAM")
        ram_slider = gr.Slider(
            minimum=512,
            maximum=8192,
            value=2048,
            step=256,
            label="Available RAM (MB)",
        )
        task_dropdown = gr.Dropdown(
            choices=["Chat", "Code", "Summarization", "Any"],
            value="Any",
            label="Primary Task",
        )
        rec_btn = gr.Button("Find Best Models", variant="primary")
        rec_table = gr.DataFrame(label="Recommended Models")
        rec_btn.click(
            fn=get_recommendation,
            inputs=[ram_slider, task_dropdown],
            outputs=[rec_table],
        )

    # Tab 3: static description of the benchmark setup.
    with gr.Tab("ℹ️ About"):
        gr.Markdown(_ABOUT_MARKDOWN)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|