import gradio as gr
import spaces
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile"

# Lazily-populated singletons; filled in by load_model() on first use.
tokenizer = None
model = None

SYSTEM_PROMPT = (
    "You are a helpful mobile AI assistant. You are running on a 1B parameter "
    "model optimized for phones. Be concise and helpful."
)

# Markdown bodies are kept at column 0 so rendering never depends on how the
# UI layer dedents indented triple-quoted strings.
INTRO_MARKDOWN = """
# 🤖 dispatchAI Mobile Agent

**A real AI agent running on a 1B parameter model — small enough for your pocket.**

Model: [Llama-3.2-1B-Instruct-Q4-mobile](https://huggingface.co/dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile)

This agent runs on a model quantized to Q4 (700MB file size), designed to run on Snapdragon 865 phones.
It can answer questions, write code, and solve tasks — all on a model 1/100th the size of GPT-4.

## Try It

- **Chat**: Ask the agent anything
- **Code**: Ask it to write Python code

## The Point

This isn't about matching GPT-4. It's about proving that a 1B model on a phone can be genuinely useful.
For the tasks people actually do on phones — quick answers, code snippets, summaries, classifications — a 1B model is enough.
"""

ABOUT_MARKDOWN = """
## How This Works

This Space runs a **1 billion parameter Llama-3.2 model** quantized to 4-bit.

| Metric | Value |
|--------|-------|
| Model | Llama-3.2-1B-Instruct |
| Params | 1B |
| Quantization | Q4 (4-bit) |
| File size | 700MB |
| RAM needed | ~1.1GB |
| Speed on Snapdragon 865 | ~18 tokens/sec |
| Speed on this Space (ZeroGPU) | Faster |

## Run This On Your Phone

```bash
# Download the GGUF
hf download dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile model.gguf

# Run with llama.cpp
llama-cli -m model.gguf -p "Hello!" -n 100 -t 4
```

## The Thesis

> A 1B model on a phone is not a compromise. It's a victory.

6.8 billion smartphones. Most can't run a cloud LLM. But they CAN run a 1B model at 18 tokens/sec.
That's fast enough for real-time chat, code completion, summarization, and classification.

---

🚀 [dispatchAI](https://huggingface.co/dispatchAI) — Small. Mobile. Free. UAE-built.
"""


def load_model():
    """Return the shared ``(tokenizer, model)`` pair, loading it on first use.

    Both objects are loaded into locals and only then published to the module
    globals, so a failure while loading the model cannot leave a cached
    tokenizer paired with ``model = None`` for later calls.

    Returns:
        A ``(tokenizer, model)`` tuple for ``MODEL_ID``.
    """
    global tokenizer, model
    if tokenizer is None or model is None:
        loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        loaded_model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        tokenizer, model = loaded_tokenizer, loaded_model
    return tokenizer, model


def _history_to_messages(history) -> list:
    """Convert chat history into a list of ``{"role", "content"}`` messages.

    Accepts both history shapes a chat UI may hand over: ``(user, assistant)``
    pairs, or dicts that already carry ``role`` and ``content`` keys.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            role = turn.get("role")
            content = turn.get("content")
            # Only plain-text user/assistant turns can be fed to the chat
            # template; anything else (e.g. non-string content) is skipped.
            if role in ("user", "assistant") and isinstance(content, str) and content:
                messages.append({"role": role, "content": content})
            continue
        messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})
    return messages


def _generate(
    prompt: str,
    *,
    max_new_tokens: int,
    temperature: float,
    add_special_tokens: bool = True,
) -> str:
    """Sample a completion for ``prompt`` and return only the new text."""
    tok, lm = load_model()
    inputs = tok(
        prompt,
        return_tensors="pt",
        add_special_tokens=add_special_tokens,
    ).to(lm.device)
    with torch.no_grad():
        outputs = lm.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tok.eos_token_id,
        )
    # Drop the echoed prompt tokens so callers never have to strip them.
    prompt_length = inputs["input_ids"].shape[1]
    return tok.decode(outputs[0][prompt_length:], skip_special_tokens=True)


@spaces.GPU
def agent_respond(task: str, history: list) -> str:
    """A mobile-optimized agent that can answer questions, write code, and solve tasks.

    Powered by dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile — a 1B parameter model
    quantized to Q4, designed to run on phones. This proves real agents can run
    on pocket-sized models.

    Args:
        task: The user's new message.
        history: Earlier turns of the conversation, either as
            ``(user, assistant)`` pairs or as ``{"role", "content"}`` dicts.

    Returns:
        The assistant's reply text.
    """
    tok, _ = load_model()
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": task})
    input_text = tok.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # The rendered template already contains the special tokens, so the
    # tokenizer must not add them a second time.
    return _generate(
        input_text,
        max_new_tokens=256,
        temperature=0.7,
        add_special_tokens=False,
    )


@spaces.GPU
def agent_code(instruction: str) -> str:
    """Generate code using a mobile-optimized model.

    Args:
        instruction: Plain-language description of the Python code to write.

    Returns:
        The generated Python source, without the surrounding Markdown fence.
    """
    prompt = f"Write Python code for: {instruction}\n\n```python\n"
    generated = _generate(prompt, max_new_tokens=200, temperature=0.3)
    # The prompt ends with an opening fence, so the completion is the code
    # itself; keep everything up to the closing fence if the model emits one.
    code, _, _ = generated.partition("```")
    return code.strip()


with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), title="dispatchAI Mobile Agent") as demo:
    gr.Markdown(INTRO_MARKDOWN)

    with gr.Tab("💬 Chat"):
        chat = gr.ChatInterface(
            fn=agent_respond,
            title="Chat with a 1B Mobile Agent",
            description="Powered by Llama-3.2-1B-Instruct-Q4-mobile (700MB)",
        )

    with gr.Tab("👨‍💻 Code"):
        code_input = gr.Textbox(
            label="What should I code?",
            placeholder="A function that reverses a string",
        )
        code_btn = gr.Button("Generate Code", variant="primary")
        code_output = gr.Code(label="Generated Code", language="python")
        code_btn.click(fn=agent_code, inputs=code_input, outputs=code_output)

    with gr.Tab("ℹ️ About"):
        gr.Markdown(ABOUT_MARKDOWN)

demo.launch(mcp_server=True)