import gradio as gr
import spaces
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face model identifier passed to both from_pretrained() calls in load_model().
MODEL_ID = "dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile"

# Module-level cache: both start as None and are assigned by load_model() on first use.
tokenizer = None
model = None

def load_model():
    """Return the shared ``(tokenizer, model)`` pair, loading each on first use.

    The objects are cached in the module-level ``tokenizer`` and ``model``
    globals, so only the first successful call pays the loading cost.

    Returns:
        A ``(tokenizer, model)`` tuple loaded from ``MODEL_ID``.

    Raises:
        Whatever ``from_pretrained`` raises if loading fails; a failed load
        leaves the corresponding global as ``None`` so the next call retries.
    """
    global tokenizer, model
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    # Checked independently of the tokenizer: if a previous call loaded the
    # tokenizer but the model load raised, a single "tokenizer is None" guard
    # would skip this branch forever and hand callers model=None.
    if model is None:
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.float16,
            device_map="auto",
        )
    return tokenizer, model

def _content_to_text(content):
    """Reduce one chat message's content to plain text ('' when there is none)."""
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        # Structured content: keep only the parts that carry a "text" string.
        return "\n".join(
            part["text"]
            for part in content
            if isinstance(part, dict) and isinstance(part.get("text"), str)
        )
    return ""


def _history_to_messages(history):
    """Convert chat history into a list of ``{"role", "content"}`` dicts.

    Accepts both history layouts: ``[user, assistant]`` pairs and
    message dicts that already carry ``role`` and ``content`` keys.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            role = turn.get("role")
            text = _content_to_text(turn.get("content"))
            if role in ("user", "assistant") and text:
                messages.append({"role": role, "content": text})
            continue
        # Pair layout: index 0 is the user message, index 1 the reply (may be empty).
        user_text = _content_to_text(turn[0])
        assistant_text = _content_to_text(turn[1])
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})
    return messages


@spaces.GPU
def agent_respond(task: str, history: list) -> str:
    """A mobile-optimized agent that can answer questions, write code, and solve tasks.

    Powered by dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile — a 1B parameter model
    quantized to Q4, designed to run on phones. This proves real agents can run
    on pocket-sized models.

    Args:
        task: The new user message to answer.
        history: Earlier turns of the conversation, either as [user, assistant]
            pairs or as dicts with "role" and "content" keys. May be empty.

    Returns:
        The assistant's reply text (newly generated tokens only).
    """
    tok, lm = load_model()

    system_prompt = "You are a helpful mobile AI assistant. You are running on a 1B parameter model optimized for phones. Be concise and helpful."
    messages = [{"role": "system", "content": system_prompt}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": task})

    input_text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # The rendered chat template already contains the special tokens it needs;
    # letting the tokenizer add its own again would duplicate them (e.g. BOS).
    inputs = tok(input_text, return_tensors="pt", add_special_tokens=False).to(lm.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = lm.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tok.eos_token_id,
        )

    # generate() returns prompt + completion; slice off the prompt tokens.
    return tok.decode(outputs[0][prompt_len:], skip_special_tokens=True)

@spaces.GPU
def agent_code(instruction: str) -> str:
    """Generate code using a mobile-optimized model.

    Args:
        instruction: Plain-language description of the Python code to write.

    Returns:
        The generated Python source, without the surrounding Markdown fence
        and with leading/trailing whitespace stripped.
    """
    tok, lm = load_model()

    # The prompt ends with an opened ```python fence so the model continues
    # straight into code.
    prompt = f"Write Python code for: {instruction}\n\n```python\n"
    inputs = tok(prompt, return_tensors="pt").to(lm.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = lm.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.3,
            do_sample=True,
            pad_token_id=tok.eos_token_id,
        )

    # Decode only the newly generated tokens. Decoding prompt + completion and
    # then searching for "```python" picks the wrong segment whenever the
    # user's instruction itself contains that fence.
    completion = tok.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    body = completion.lstrip()
    if body.startswith("```"):
        # The model re-opened a fence instead of continuing ours; drop that line.
        body = body.partition("\n")[2]
    # Everything up to the closing fence (or the whole text if none) is code.
    code = body.partition("```")[0]
    return code.strip()

# Build the web UI: an intro Markdown panel followed by three tabs
# (Chat, Code, About). Statements inside the `with` blocks are laid out in
# the order they appear.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), title="dispatchAI Mobile Agent") as demo:
    # Intro text shown above the tabs.
    gr.Markdown("""
    # 🤖 dispatchAI Mobile Agent
    
    **A real AI agent running on a 1B parameter model — small enough for your pocket.**
    
    Model: [Llama-3.2-1B-Instruct-Q4-mobile](https://huggingface.co/dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile)
    
    This agent runs on a model quantized to Q4 (700MB file size), designed to run on 
    Snapdragon 865 phones. It can answer questions, write code, and solve tasks — 
    all on a model 1/100th the size of GPT-4.
    
    ## Try It
    
    - **Chat**: Ask the agent anything
    - **Code**: Ask it to write Python code
    
    ## The Point
    
    This isn't about matching GPT-4. It's about proving that a 1B model on a phone 
    can be genuinely useful. For the tasks people actually do on phones — quick answers, 
    code snippets, summaries, classifications — a 1B model is enough.
    """)
    
    # Chat tab: a ChatInterface whose handler is agent_respond.
    with gr.Tab("💬 Chat"):
        chat = gr.ChatInterface(
            fn=agent_respond,
            title="Chat with a 1B Mobile Agent",
            description="Powered by Llama-3.2-1B-Instruct-Q4-mobile (700MB)",
        )
    
    # Code tab: textbox -> button -> code viewer, wired to agent_code.
    with gr.Tab("👨‍💻 Code"):
        code_input = gr.Textbox(label="What should I code?", placeholder="A function that reverses a string")
        code_btn = gr.Button("Generate Code", variant="primary")
        code_output = gr.Code(label="Generated Code", language="python")
        # Clicking the button sends the textbox value to agent_code and shows
        # its return value in the code viewer.
        code_btn.click(fn=agent_code, inputs=code_input, outputs=code_output)
    
    # About tab: static Markdown describing the model and how to run it locally.
    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
        ## How This Works
        
        This Space runs a **1 billion parameter Llama-3.2 model** quantized to 4-bit.
        
        | Metric | Value |
        |--------|-------|
        | Model | Llama-3.2-1B-Instruct |
        | Params | 1B |
        | Quantization | Q4 (4-bit) |
        | File size | 700MB |
        | RAM needed | ~1.1GB |
        | Speed on Snapdragon 865 | ~18 tokens/sec |
        | Speed on this Space (ZeroGPU) | Faster |
        
        ## Run This On Your Phone
        
        ```bash
        # Download the GGUF
        hf download dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile model.gguf
        
        # Run with llama.cpp
        llama-cli -m model.gguf -p "Hello!" -n 100 -t 4
        ```
        
        ## The Thesis
        
        > A 1B model on a phone is not a compromise. It's a victory.
        
        6.8 billion smartphones. Most can't run a cloud LLM. But they CAN run a 1B model 
        at 18 tokens/sec. That's fast enough for real-time chat, code completion, 
        summarization, and classification.
        
        ---
        🚀 [dispatchAI](https://huggingface.co/dispatchAI) — Small. Mobile. Free. UAE-built.
        """)

# Start the app at import/run time. NOTE(review): mcp_server=True is expected to
# additionally enable Gradio's MCP server — confirm against the installed Gradio version.
demo.launch(mcp_server=True)