Spaces:

dispatchAI
/

model-recommender

Runtime error

File size: 10,605 Bytes

#!/usr/bin/env python3
"""
#301: On-device readiness checker — a Gradio Space that evaluates whether a
given model will run on a mobile device.

Paste a HuggingFace model ID to get a "will it run on a phone?" report:
- Parameter count vs memory budget
- Architecture compatibility
- Quantization recommendations
- Estimated phone farm performance
- Recommended dispatchAI model alternatives
"""
import gradio as gr
import json
import requests
from huggingface_hub import hf_hub_download, HfApi
import os

# Hub access token taken from the HF_TOKEN environment variable. An unset or
# blank variable is normalised to None so that an empty string is never passed
# to the huggingface_hub calls as if it were an explicit credential.
token = (os.environ.get("HF_TOKEN") or "").strip() or None

# Phone farm specs, keyed by the device label offered in the UI dropdown.
# Each row lists, after the label: chipset name, total RAM (GB), RAM usable
# after OS overhead (GB), CPU core count, and the safe model-size limit (GB).
_PHONE_SPEC_FIELDS = (
    "chipset",
    "ram_gb",
    "usable_ram_gb",
    "cpu_cores",
    "max_model_size_gb",
)
_PHONE_SPEC_ROWS = (
    ("Samsung S20 FE (Snapdragon 865, 8GB)", "Snapdragon 865", 8, 6, 8, 4),
    ("Samsung S23 (Snapdragon 8 Gen 2, 8GB)", "Snapdragon 8 Gen 2", 8, 6, 8, 4),
    ("iPhone 15 Pro (A17 Pro, 8GB)", "Apple A17 Pro", 8, 6, 6, 4),
    ("Budget Android (4GB RAM)", "Mid-range", 4, 3, 8, 2),
)
PHONE_SPECS = {
    label: dict(zip(_PHONE_SPEC_FIELDS, values))
    for label, *values in _PHONE_SPEC_ROWS
}

# dispatchAI model catalog used for recommendations.
# Each row is (repo id, parameter count in millions, size in MB, task tag).
_CATALOG_ROWS = (
    ("dispatchAI/SmolLM2-135M-Instruct-mobile", 135, 270, "chat"),
    ("dispatchAI/SmolLM2-360M-Instruct-mobile", 360, 720, "chat"),
    ("dispatchAI/Qwen2.5-0.5B-Instruct-mobile-int4", 500, 350, "chat"),
    ("dispatchAI/Qwen2.5-0.5B-Coder-mobile", 500, 350, "code"),
    ("dispatchAI/Llama-3.2-1B-Instruct-mobile", 1000, 2000, "chat"),
    ("dispatchAI/TinyLlama-1.1B-Chat-Q5-mobile", 1100, 450, "chat"),
    ("dispatchAI/Qwen2.5-1.5B-Instruct-Q5-mobile", 1500, 900, "chat"),
    ("dispatchAI/Gemma-2-2B-IT-Q5-mobile", 2000, 1300, "chat"),
    ("dispatchAI/Phi-3.5-mini-instruct-Q5-mobile", 2000, 1300, "chat"),
    ("dispatchAI/Gemma-2B-Arabic-mobile", 2000, 1300, "arabic"),
)
DISPATCHAI_MODELS = [
    {"id": repo_id, "params_m": params_m, "size_mb": size_mb, "task": task}
    for repo_id, params_m, size_mb, task in _CATALOG_ROWS
]

def fetch_model_info(model_id):
    """Fetch a model's ``config.json`` and the total size of its weight files.

    Args:
        model_id: HuggingFace Hub repo ID, e.g. ``"Qwen/Qwen2.5-0.5B-Instruct"``.

    Returns:
        ``(config, size_mb)`` on success: ``config`` is the parsed
        ``config.json`` and ``size_mb`` is the summed size, in units of 1e6
        bytes, of every ``.safetensors`` / ``.bin`` / ``.gguf`` file listed in
        the repo (``0`` when the sizes could not be retrieved).
        ``(None, error_message)`` when the config cannot be downloaded or
        parsed, or the repo file listing fails; ``error_message`` is never
        empty.
    """
    weight_suffixes = (".safetensors", ".bin", ".gguf")
    try:
        config_path = hf_hub_download(model_id, "config.json", token=token)
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(config_path, "r", encoding="utf-8") as config_file:
            config = json.load(config_file)

        api = HfApi(token=token)
        weight_files = [
            name
            for name in api.list_repo_files(model_id, token=token)
            if name.endswith(weight_suffixes)
        ]

        size_mb = 0
        if weight_files:
            try:
                # One request for all weight files instead of one per file.
                entries = api.get_paths_info(
                    model_id, weight_files, repo_type="model", token=token
                )
            except Exception:
                # Size lookup is best-effort: a failure here must not discard
                # the config that was already fetched successfully.
                entries = []
            for entry in entries or []:
                size_mb += (getattr(entry, "size", None) or 0) / 1e6

        return config, size_mb
    except Exception as e:
        # Callers distinguish failure by a falsy config plus a string message.
        return None, str(e) or type(e).__name__

def estimate_params(config):
    """Estimate a transformer's parameter count, in millions, from its config.

    The estimate is deliberately rough and sums three terms:

    * attention: ``4 * hidden^2`` per layer (Q, K, V and O projections),
    * MLP: ``2 * hidden * intermediate`` per layer,
    * embeddings: ``vocab * hidden``.

    Args:
        config: Parsed ``config.json`` mapping. Sizes are read from
            ``hidden_size`` (or ``n_embd``), ``num_hidden_layers`` (or
            ``num_layers`` / ``n_layer``), ``vocab_size`` and
            ``intermediate_size`` (falling back to ``4 * hidden``). A key that
            is absent, ``null`` or zero is treated as missing.

    Returns:
        The estimated parameter count in millions, or ``0`` when ``config`` is
        not a mapping or one of the sizes is not numeric.
    """
    def first_set(*keys):
        # First key holding a usable (non-null, non-zero) value, else 0.
        for key in keys:
            value = config.get(key)
            if value:
                return value
        return 0

    try:
        hidden = first_set("hidden_size", "n_embd")
        layers = first_set("num_hidden_layers", "num_layers", "n_layer")
        vocab = first_set("vocab_size")
        # An explicit null must not poison the whole estimate; use the
        # conventional 4x expansion instead.
        intermediate = first_set("intermediate_size") or hidden * 4

        attention_params = 4 * hidden * hidden * layers
        mlp_params = 2 * hidden * intermediate * layers
        embed_params = vocab * hidden

        return (attention_params + mlp_params + embed_params) / 1e6
    except (AttributeError, TypeError):
        # Not a mapping, or a size field holds a non-numeric value.
        return 0

def check_readiness(model_id, target_device):
    """Build a Markdown "will it run on a phone?" report for a Hub model.

    The model's config is fetched with ``fetch_model_info``, its parameter
    count is estimated with ``estimate_params``, and the resulting per-format
    size estimates are compared against the target device's
    ``max_model_size_gb`` limit. Speed tiers are derived from the estimated
    parameter count alone. The repo file size returned by
    ``fetch_model_info`` is not used in the report.

    Args:
        model_id: HuggingFace Hub repo ID typed by the user. ``None``, empty
            and whitespace-only values produce a prompt to enter an ID.
        target_device: A key of ``PHONE_SPECS``. Any other value (including
            ``None``) falls back to the Samsung S20 FE profile.

    Returns:
        A Markdown string: either the full readiness report or a short
        error/notice message.
    """
    model_id = (model_id or "").strip()
    if not model_id:
        return "Please enter a HuggingFace model ID."

    config, detail = fetch_model_info(model_id)
    if not config:
        # On failure the second element carries the error message.
        if isinstance(detail, str):
            return f"❌ **Error fetching model info**: {detail}\n\nCheck the model ID and try again."
        return f"❌ Could not fetch config for `{model_id}`"

    # Unknown or missing device: report against the default profile and label
    # the report accordingly, so name and specs always agree.
    if target_device not in PHONE_SPECS:
        target_device = "Samsung S20 FE (Snapdragon 865, 8GB)"
    specs = PHONE_SPECS[target_device]
    device_name = target_device.split("(")[0].strip()

    params_m = estimate_params(config)
    model_type = config.get("model_type", "unknown")
    hidden_size = config.get("hidden_size", 0)
    num_layers = config.get("num_hidden_layers", config.get("num_layers", 0))

    if not params_m:
        # Without an estimate every size would be 0 MB and the model would be
        # rated "Excellent"; say so instead of producing a misleading report.
        return (
            f"⚠️ Could not estimate the parameter count for `{model_id}` "
            f"from its `config.json` (model type: `{model_type}`), "
            "so no readiness report can be produced."
        )

    # Size estimates in MB: bytes per parameter times millions of parameters.
    size_fp16_mb = params_m * 2
    size_q8_mb = params_m * 1
    size_q5_mb = params_m * 0.625
    size_q4_mb = params_m * 0.5

    # Memory check against the device's safe model-size limit.
    limit_mb = specs["max_model_size_gb"] * 1024
    fits_fp16 = size_fp16_mb < limit_mb
    fits_q8 = size_q8_mb < limit_mb
    fits_q5 = size_q5_mb < limit_mb
    fits_q4 = size_q4_mb < limit_mb

    def mark(fits):
        return "✅" if fits else "❌"

    ready = "✅ **This model is ready for mobile deployment.** Use Q4_K_M or Q5_K_M GGUF for best size/quality balance."
    usable = "⚠️ **This model is usable but may be slow.** Consider Q4_K_M quantization and test on target hardware."
    slow = "⚠️ **This model will be slow on mobile.** Consider a smaller alternative from dispatchAI."
    too_large = "❌ **This model is too large for mobile deployment.** Use a dispatchAI alternative above."

    # Phone farm performance estimate (based on real benchmarks)
    # S20 FE: ~18 t/s for 135M, ~10 t/s for 500M, ~6 t/s for 1B, ~3 t/s for 2B
    # Rows: (exclusive upper bound in millions of params, speed, rating, advice).
    tiers = (
        (200, "15-20 t/s", "🟢 Excellent", ready),
        (600, "8-12 t/s", "🟢 Good", ready),
        (1200, "5-7 t/s", "🟡 Usable", usable),
        (2500, "2-4 t/s", "🟠 Slow", slow),
    )
    est_tps, rating, verdict = "< 2 t/s", "🔴 Too large", too_large
    for upper_bound, tps, label, advice in tiers:
        if params_m < upper_bound:
            est_tps, rating, verdict = tps, label, advice
            break

    # Catalog models between 50% and 120% of the estimated size; if none,
    # fall back to the three largest catalog models that are smaller.
    recommendations = [
        m for m in DISPATCHAI_MODELS
        if params_m * 0.5 <= m["params_m"] <= params_m * 1.2
    ]
    if not recommendations:
        smaller = [m for m in DISPATCHAI_MODELS if m["params_m"] < params_m]
        recommendations = sorted(smaller, key=lambda m: m["params_m"], reverse=True)[:3]

    rec_text = "\n".join(
        f"- [`{m['id']}`](https://huggingface.co/{m['id']}) — {m['params_m']}M params, {m['size_mb']}MB"
        for m in recommendations[:5]
    )

    return f"""## 📱 On-Device Readiness Report

### Model: `{model_id}`

| Property | Value |
|----------|-------|
| Architecture | {model_type} |
| Hidden size | {hidden_size} |
| Layers | {num_layers} |
| Estimated params | ~{params_m:.0f}M |

### Size estimates by quantization

| Format | Size | Fits {device_name}? |
|--------|------|------|
| FP16 | {size_fp16_mb:.0f}MB | {mark(fits_fp16)} |
| Q8 | {size_q8_mb:.0f}MB | {mark(fits_q8)} |
| Q5_K_M | {size_q5_mb:.0f}MB | {mark(fits_q5)} |
| Q4_K_M | {size_q4_mb:.0f}MB | {mark(fits_q4)} |

### Performance estimate (Snapdragon 865)

| Metric | Value |
|--------|-------|
| Estimated speed | {est_tps} |
| Readiness | {rating} |

### Target device: {target_device}

| Property | Value |
|----------|-------|
| Chipset | {specs['chipset']} |
| RAM | {specs['ram_gb']}GB |
| Max model size | {specs['max_model_size_gb']}GB |

### Recommended dispatchAI alternatives

{rec_text if rec_text else "No close matches found."}

### Recommendation

{verdict}"""

# Custom CSS passed to gr.Blocks: dark page background with light text,
# cyan headings and gradient buttons. One stylesheet rule per entry.
_CSS_RULES = (
    ".gradio-container { background: #0A0F1A !important; color: #F5F7FA !important; }",
    "h1, h2, h3 { color: #1FE0E6 !important; }",
    ".gr-button { background: linear-gradient(135deg, #2E6BFF, #1FE0E6) !important; color: #0A0F1A !important; }",
)
custom_css = "\n" + "\n".join(_CSS_RULES) + "\n"

# Build the Gradio UI. Components are laid out in the order they are created
# inside the Blocks context: intro text, input row, report area, help text.
with gr.Blocks(css=custom_css, title="On-Device Readiness Checker") as demo:
    # Page header / intro.
    gr.Markdown("""
    # 📱 On-Device Readiness Checker
    
    **Will your model run on a phone?** Paste a HuggingFace model ID and find out.
    
    Powered by [dispatchAI](https://huggingface.co/dispatchAI) — mobile AI that runs.
    """)
    
    # Input row: model ID textbox, target device dropdown and the trigger
    # button, sharing the row width in a 3:2:1 ratio via `scale`.
    with gr.Row():
        model_input = gr.Textbox(
            label="HuggingFace Model ID",
            placeholder="e.g., Qwen/Qwen2.5-0.5B-Instruct",
            scale=3
        )
        # Choices are the PHONE_SPECS keys, so the selected value can be used
        # directly as a lookup key in check_readiness.
        device_input = gr.Dropdown(
            choices=list(PHONE_SPECS.keys()),
            value="Samsung S20 FE (Snapdragon 865, 8GB)",
            label="Target Device",
            scale=2
        )
        check_btn = gr.Button("Check Readiness", variant="primary", scale=1)
    
    # Markdown area that displays the string returned by check_readiness.
    report_output = gr.Markdown(label="Readiness Report")
    
    # Clicking the button calls check_readiness(model_id, target_device) and
    # renders its return value in report_output.
    check_btn.click(
        fn=check_readiness,
        inputs=[model_input, device_input],
        outputs=report_output
    )
    
    # Static help text and example model IDs.
    gr.Markdown("""
    ---
    ### How it works
    
    1. Fetches the model's `config.json` from HuggingFace
    2. Estimates parameter count and size for each quantization level
    3. Compares against the target device's memory budget
    4. Estimates inference speed based on real phone farm benchmarks
    5. Recommends dispatchAI mobile-optimized alternatives
    
    ### Try these models
    
    - `Qwen/Qwen2.5-0.5B-Instruct` — small model, should pass
    - `Qwen/Qwen2.5-7B-Instruct` — large model, should fail
    - `meta-llama/Llama-3.2-1B-Instruct` — borderline
    - `HuggingFaceTB/SmolLM2-135M-Instruct` — tiny, excellent
    
    ---
    *Dispatch AI (FZE), Sharjah SRTI Free Zone, License No. 10818.*
    """)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()