Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import spaces | |
| import json | |
| import torch | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
# Hugging Face Hub repository that the tokenizer and model are loaded from.
MODEL_ID = "dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile"

# Process-wide cache: both names stay None until load_model() fills them in.
tokenizer = model = None
def load_model():
    """Return the shared ``(tokenizer, model)`` pair, loading it on first use.

    The pair is cached in the module-level ``tokenizer`` and ``model`` globals
    so the download/initialisation cost is paid at most once per process.

    Returns:
        A ``(tokenizer, model)`` tuple; neither element is ``None``.

    Raises:
        Whatever ``from_pretrained`` raises. In that case the cache is left
        untouched, so the next call retries the full load.
    """
    global tokenizer, model
    # Check both halves of the cache. Testing only the tokenizer would treat a
    # half-initialised state (tokenizer set, model still None) as "loaded".
    if tokenizer is None or model is None:
        loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        # NOTE(review): the About tab in this file tells users to download
        # `model.gguf` from this same repo. If the repo ships only GGUF
        # weights, from_pretrained needs `gguf_file="model.gguf"` - confirm.
        loaded_model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        # Publish only after both loads succeeded, so a failure part-way
        # through never leaves one global set and the other None.
        tokenizer, model = loaded_tokenizer, loaded_model
    return tokenizer, model
def _history_to_messages(history):
    """Convert Gradio chat history into a list of role/content message dicts.

    Accepts both history layouts Gradio can hand to a chat function:
    ``(user, assistant)`` pairs and ``{"role": ..., "content": ...}`` dicts.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            role = turn.get("role")
            content = turn.get("content")
            if isinstance(content, list):
                # Structured content: keep only the textual parts.
                parts = (p.get("text", "") if isinstance(p, dict) else p for p in content)
                content = "".join(p for p in parts if isinstance(p, str))
            if role in ("user", "assistant") and isinstance(content, str) and content:
                messages.append({"role": role, "content": content})
        else:
            user_text, bot_text = turn[0], turn[1]
            messages.append({"role": "user", "content": user_text})
            # A pending turn has no assistant reply yet; skip the empty half.
            if bot_text:
                messages.append({"role": "assistant", "content": bot_text})
    return messages


# ZeroGPU attaches a GPU only while a function decorated with spaces.GPU runs.
@spaces.GPU
def agent_respond(task: str, history: list) -> str:
    """A mobile-optimized agent that can answer questions, write code, and solve tasks.

    Powered by dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile — a 1B parameter model
    quantized to Q4, designed to run on phones. This proves real agents can run
    on pocket-sized models.

    Args:
        task: The user's new message or instruction.
        history: Earlier turns of the conversation, either as
            ``(user, assistant)`` pairs or as role/content dicts.

    Returns:
        The assistant's reply text, without the prompt or special tokens.
    """
    tok, llm = load_model()
    messages = [{"role": "system", "content": "You are a helpful mobile AI assistant. You are running on a 1B parameter model optimized for phones. Be concise and helpful."}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": task})

    prompt_text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # The rendered template already contains the special tokens; adding them
    # again here would duplicate the beginning-of-sequence token.
    inputs = tok(prompt_text, return_tensors="pt", add_special_tokens=False).to(llm.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = llm.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tok.eos_token_id,
        )

    # generate() returns prompt + completion; decode only the completion.
    return tok.decode(outputs[0][prompt_len:], skip_special_tokens=True)
# ZeroGPU attaches a GPU only while a function decorated with spaces.GPU runs.
@spaces.GPU
def agent_code(instruction: str) -> str:
    """Generate Python code for an instruction using a mobile-optimized model.

    The prompt ends with an opened ```python fence, so the model's completion
    is expected to be the code itself followed by a closing fence.

    Args:
        instruction: Plain-language description of the code to write.

    Returns:
        The generated code with surrounding whitespace stripped and anything
        from the first closing fence onward removed.
    """
    tok, llm = load_model()
    prompt = f"Write Python code for: {instruction}\n\n```python\n"
    inputs = tok(prompt, return_tensors="pt").to(llm.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = llm.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.3,
            do_sample=True,
            pad_token_id=tok.eos_token_id,
        )

    # Decode only the newly generated tokens. Decoding the prompt as well and
    # then searching for "```python" breaks when the instruction itself
    # contains a code fence.
    completion = tok.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    # The prompt already opened the fence, so the code ends at the next one.
    code = completion.split("```")[0]
    return code.strip()
# Markdown shown above the tabs. Blank lines separate Markdown blocks so the
# headings, paragraphs and lists render as distinct elements.
_INTRO_MD = """
# 🤖 dispatchAI Mobile Agent

**A real AI agent running on a 1B parameter model — small enough for your pocket.**

Model: [Llama-3.2-1B-Instruct-Q4-mobile](https://huggingface.co/dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile)

This agent runs on a model quantized to Q4 (700MB file size), designed to run on
Snapdragon 865 phones. It can answer questions, write code, and solve tasks —
all on a model 1/100th the size of GPT-4.

## Try It

- **Chat**: Ask the agent anything
- **Code**: Ask it to write Python code

## The Point

This isn't about matching GPT-4. It's about proving that a 1B model on a phone
can be genuinely useful. For the tasks people actually do on phones — quick answers,
code snippets, summaries, classifications — a 1B model is enough.
"""

# Markdown for the About tab.
_ABOUT_MD = """
## How This Works

This Space runs a **1 billion parameter Llama-3.2 model** quantized to 4-bit.

| Metric | Value |
|--------|-------|
| Model | Llama-3.2-1B-Instruct |
| Params | 1B |
| Quantization | Q4 (4-bit) |
| File size | 700MB |
| RAM needed | ~1.1GB |
| Speed on Snapdragon 865 | ~18 tokens/sec |
| Speed on this Space (ZeroGPU) | Faster |

## Run This On Your Phone

```bash
# Download the GGUF
hf download dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile model.gguf
# Run with llama.cpp
llama-cli -m model.gguf -p "Hello!" -n 100 -t 4
```

## The Thesis

> A 1B model on a phone is not a compromise. It's a victory.

6.8 billion smartphones. Most can't run a cloud LLM. But they CAN run a 1B model
at 18 tokens/sec. That's fast enough for real-time chat, code completion,
summarization, and classification.

---

[dispatchAI](https://huggingface.co/dispatchAI) — Small. Mobile. Free. UAE-built.
"""

# Build the three-tab UI: chat, code generation, and a static About page.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), title="dispatchAI Mobile Agent") as demo:
    gr.Markdown(_INTRO_MD)

    with gr.Tab("💬 Chat"):
        chat = gr.ChatInterface(
            fn=agent_respond,
            title="Chat with a 1B Mobile Agent",
            description="Powered by Llama-3.2-1B-Instruct-Q4-mobile (700MB)",
        )

    with gr.Tab("👨‍💻 Code"):
        code_input = gr.Textbox(label="What should I code?", placeholder="A function that reverses a string")
        code_btn = gr.Button("Generate Code", variant="primary")
        code_output = gr.Code(label="Generated Code", language="python")
        code_btn.click(fn=agent_code, inputs=code_input, outputs=code_output)

    with gr.Tab("ℹ️ About"):
        gr.Markdown(_ABOUT_MD)


# Launch only when run as a script, so importing this module (for example
# from a test) builds `demo` without starting a server.
if __name__ == "__main__":
    demo.launch(mcp_server=True)