Spaces:

dispatchAI
/

mobile-agent

Runtime error

App Files Files Community

mobile-agent / app.py

3morixd

Upload app.py with huggingface_hub

3b7ea55 verified 1 day ago

Raw

History Blame Contribute Delete

5.46 kB

	import gradio as gr
	import spaces
	import json
	import torch
	from transformers import AutoTokenizer, AutoModelForCausalLM

	# Hugging Face Hub repository id that load_model() passes to from_pretrained().
	MODEL_ID = "dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile"

	# Module-level cache filled in by load_model(); both stay None until its first call.
	tokenizer = None
	model = None

	def load_model():
	    """Return the shared ``(tokenizer, model)`` pair, loading each on first use.

	    The loaded objects are cached in the module-level ``tokenizer`` and
	    ``model`` globals, so later calls return the same instances without
	    reloading.

	    Returns:
	        A ``(tokenizer, model)`` tuple created from ``MODEL_ID``.
	    """
	    global tokenizer, model
	    if tokenizer is None:
	        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
	    # Checked separately from the tokenizer: if a previous call loaded the
	    # tokenizer but the model load raised, the next call must retry the model
	    # instead of handing back ``(tokenizer, None)``.
	    if model is None:
	        # NOTE(review): the About tab tells users to download ``model.gguf``
	        # from this repo. If the repo ships only GGUF weights, from_pretrained
	        # presumably needs ``gguf_file=...`` -- confirm against the repo.
	        model = AutoModelForCausalLM.from_pretrained(
	            MODEL_ID,
	            torch_dtype=torch.float16,
	            device_map="auto",
	        )
	    return tokenizer, model

	def _content_to_text(content):
	    """Flatten one history entry's content into plain text for the chat template."""
	    if content is None:
	        return ""
	    if isinstance(content, str):
	        return content
	    if isinstance(content, dict):
	        # Structured part: keep its text, ignore non-text payloads.
	        return str(content.get("text") or "")
	    if isinstance(content, (list, tuple)):
	        pieces = [_content_to_text(part) for part in content]
	        return "\n".join(piece for piece in pieces if piece)
	    return str(content)


	def _history_to_messages(history):
	    """Convert chat history (user/assistant pairs or role/content dicts) to chat messages."""
	    converted = []
	    for turn in history or []:
	        if isinstance(turn, dict):
	            # Messages-style entry: already carries its own role.
	            role = turn.get("role")
	            text = _content_to_text(turn.get("content"))
	            if role in ("user", "assistant") and text:
	                converted.append({"role": role, "content": text})
	            continue
	        # Pair-style entry: (user_message, assistant_message_or_None).
	        converted.append({"role": "user", "content": turn[0]})
	        if turn[1]:
	            converted.append({"role": "assistant", "content": turn[1]})
	    return converted


	@spaces.GPU
	def agent_respond(task: str, history: list) -> str:
	    """A mobile-optimized agent that can answer questions, write code, and solve tasks.

	    Powered by dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile — a 1B parameter model
	    quantized to Q4, designed to run on phones. This proves real agents can run
	    on pocket-sized models.

	    Args:
	        task: The new user message to answer.
	        history: Earlier conversation turns, either ``(user, assistant)`` pairs
	            or ``{"role": ..., "content": ...}`` dicts. May be empty or None.

	    Returns:
	        The newly generated reply text, without the prompt.
	    """
	    tokenizer, model = load_model()

	    messages = [{"role": "system", "content": "You are a helpful mobile AI assistant. You are running on a 1B parameter model optimized for phones. Be concise and helpful."}]
	    messages.extend(_history_to_messages(history))
	    messages.append({"role": "user", "content": task})

	    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	    # The rendered template is expected to contain its own special tokens, so
	    # do not let the tokenizer prepend them a second time (duplicate BOS).
	    inputs = tokenizer(input_text, return_tensors="pt", add_special_tokens=False).to(model.device)
	    prompt_length = inputs["input_ids"].shape[1]

	    with torch.no_grad():
	        outputs = model.generate(
	            **inputs,
	            max_new_tokens=256,
	            temperature=0.7,
	            do_sample=True,
	            pad_token_id=tokenizer.eos_token_id,
	        )

	    # generate() returns prompt + completion; keep only the completion tokens.
	    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

	@spaces.GPU
	def agent_code(instruction: str) -> str:
	    """Generate code using a mobile-optimized model.

	    Args:
	        instruction: Plain-language description of the Python code to write.

	    Returns:
	        The generated code up to the first closing Markdown fence, with
	        surrounding whitespace stripped.
	    """
	    tokenizer, model = load_model()

	    # The prompt ends inside an opened ```python fence so the model continues
	    # directly with code.
	    prompt = f"Write Python code for: {instruction}\n\n```python\n"
	    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
	    prompt_length = inputs["input_ids"].shape[1]

	    with torch.no_grad():
	        outputs = model.generate(
	            **inputs,
	            max_new_tokens=200,
	            temperature=0.3,
	            do_sample=True,
	            pad_token_id=tokenizer.eos_token_id,
	        )

	    # Decode only the newly generated tokens. Decoding the prompt as well and
	    # splitting on "```python" picks the wrong span whenever the instruction
	    # itself contains a code fence.
	    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
	    # Everything before the first fence is the code; the rest is trailing chatter.
	    code, _, _ = completion.partition("```")
	    return code.strip()

	# Gradio UI: an intro banner followed by Chat, Code and About tabs.
	with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), title="dispatchAI Mobile Agent") as demo:
	    gr.Markdown("""
	# 🤖 dispatchAI Mobile Agent

	A real AI agent running on a 1B parameter model — small enough for your pocket.

	Model: [Llama-3.2-1B-Instruct-Q4-mobile](https://huggingface.co/dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile)

	This agent runs on a model quantized to Q4 (700MB file size), designed to run on
	Snapdragon 865 phones. It can answer questions, write code, and solve tasks —
	all on a model 1/100th the size of GPT-4.

	## Try It

	- Chat: Ask the agent anything
	- Code: Ask it to write Python code

	## The Point

	This isn't about matching GPT-4. It's about proving that a 1B model on a phone
	can be genuinely useful. For the tasks people actually do on phones — quick answers,
	code snippets, summaries, classifications — a 1B model is enough.
	""")

	    # Chat tab: ChatInterface calls agent_respond(message, history).
	    with gr.Tab("💬 Chat"):
	        chat = gr.ChatInterface(
	            fn=agent_respond,
	            title="Chat with a 1B Mobile Agent",
	            description="Powered by Llama-3.2-1B-Instruct-Q4-mobile (700MB)",
	        )

	    # Code tab: one textbox in, one code viewer out, wired to agent_code.
	    with gr.Tab("👨‍💻 Code"):
	        code_input = gr.Textbox(label="What should I code?", placeholder="A function that reverses a string")
	        code_btn = gr.Button("Generate Code", variant="primary")
	        code_output = gr.Code(label="Generated Code", language="python")
	        code_btn.click(fn=agent_code, inputs=code_input, outputs=code_output)

	    # About tab. The table uses plain "|" delimiters: a backslash-escaped pipe
	    # is an invalid escape in a Python string and reaches Markdown as an
	    # escaped (literal) pipe, so the table would not render.
	    with gr.Tab("ℹ️ About"):
	        gr.Markdown("""
	## How This Works

	This Space runs a 1 billion parameter Llama-3.2 model quantized to 4-bit.

	| Metric | Value |
	|--------|-------|
	| Model | Llama-3.2-1B-Instruct |
	| Params | 1B |
	| Quantization | Q4 (4-bit) |
	| File size | 700MB |
	| RAM needed | ~1.1GB |
	| Speed on Snapdragon 865 | ~18 tokens/sec |
	| Speed on this Space (ZeroGPU) | Faster |

	## Run This On Your Phone

	```bash
	# Download the GGUF
	hf download dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile model.gguf

	# Run with llama.cpp
	llama-cli -m model.gguf -p "Hello!" -n 100 -t 4
	```

	## The Thesis

	> A 1B model on a phone is not a compromise. It's a victory.

	6.8 billion smartphones. Most can't run a cloud LLM. But they CAN run a 1B model
	at 18 tokens/sec. That's fast enough for real-time chat, code completion,
	summarization, and classification.

	---
	🚀 [dispatchAI](https://huggingface.co/dispatchAI) — Small. Mobile. Free. UAE-built.
	""")

	# Start the app; mcp_server=True asks Gradio to also serve the functions as MCP tools.
	demo.launch(mcp_server=True)