"""
dispatchAI Inference API — Main Gateway Server
Runs on the Windows PC (112 cores). Acts as load balancer + API gateway.

Architecture:
    Customer → api.dispatchai.ai:8081 → This server → Routes to phone
    Phone runs phone_server.py (HTTP server + llama.cpp)

This server:
    1. Receives OpenAI-compatible API requests
    2. Finds an available phone
    3. Routes the request to that phone
    4. Returns the response to the customer
    5. Tracks token usage for billing
"""
| import os |
| import json |
| import time |
| import asyncio |
| import httpx |
| from datetime import datetime |
| from typing import Optional |
| from fastapi import FastAPI, HTTPException, Depends, Header |
| from fastapi.middleware.cors import CORSMiddleware |
| from pydantic import BaseModel |
|
|
| |
| |
| |
|
|
| |
| |
| |
# Phone serial -> local port. Starts empty; get_phone_port() fills it in the
# first time each serial is seen, counting up from BASE_PHONE_PORT.
PHONE_PORTS: dict = dict()
BASE_PHONE_PORT: int = 5000

# JSON stores for API keys and per-key usage (paths are relative to the
# process working directory).
API_KEYS_FILE: str = "data/api_keys.json"
USAGE_FILE: str = "data/api_usage.json"

# Public model id -> settings for the phone. The public id is always
# "dispatchAI/" + the phone-side model name.
# NOTE(review): SmolLM2 is tagged "llama-3" while the Llama-3.2 build is tagged
# "chatml" — this looks swapped; confirm against what phone_server.py expects.
MODELS = {
    f"dispatchAI/{phone_model}": {"phone_model": phone_model, "chat_format": chat_format}
    for phone_model, chat_format in (
        ("SmolLM2-135M-Instruct-mobile", "llama-3"),
        ("Qwen2.5-0.5B-Instruct-mobile-int4", "chatml"),
        ("Llama-3.2-1B-Instruct-Q4-mobile", "chatml"),
        ("TinyLlama-1.1B-Chat-Q5-mobile", "chatml"),
        ("Qwen2.5-0.5B-Coder-mobile", "chatml"),
    )
}

# Price per 1K tokens, shown to customers by the "/" endpoint as "$<rate>/1K tokens".
PRICING = dict(input=0.001, output=0.002)
|
|
| |
| |
| |
|
|
# One turn of a chat conversation, as sent in the "messages" array of a
# chat-completion request. (Kept as `#` comments rather than a docstring so the
# generated OpenAPI schema is unchanged.)
class ChatMessage(BaseModel):
    role: str     # speaker of this turn; not validated here
    content: str  # message text
|
|
# Request body for POST /v1/chat/completions. (Kept as `#` comments rather than
# a docstring so the generated OpenAPI schema is unchanged.)
class ChatCompletionRequest(BaseModel):
    # Public model id; the endpoint rejects ids that are not keys of MODELS.
    model: str = "dispatchAI/SmolLM2-135M-Instruct-mobile"
    # Conversation so far, forwarded to the phone as role/content pairs.
    messages: list[ChatMessage]
    # Generation limit and sampling temperature, forwarded to the phone as-is.
    max_tokens: int = 100
    temperature: float = 0.7
    # NOTE(review): accepted but not read by the chat endpoint in this file —
    # responses are always returned as a single JSON body.
    stream: bool = False
|
|
| |
| |
| |
|
|
def load_api_keys(path: Optional[str] = None) -> dict:
    """Load the API-key table from disk, seeding a demo key on first run.

    Args:
        path: JSON file to read. Defaults to the module-level ``API_KEYS_FILE``.

    Returns:
        The mapping stored in the file (API key -> metadata dict). If the file
        does not exist yet, a table containing only the demo key is written
        to ``path`` and returned.

    Raises:
        json.JSONDecodeError: the file exists but does not contain valid JSON.
        OSError: the file cannot be read, or the seed file cannot be written.
    """
    if path is None:
        path = API_KEYS_FILE

    # Open directly instead of exists()-then-open(): one less race, and the
    # context manager guarantees the handle is closed.
    try:
        with open(path, encoding="utf-8") as fh:
            return json.load(fh)
    except FileNotFoundError:
        pass  # first run — fall through and create the file

    # NOTE(review): this seeds a well-known key on any fresh deployment;
    # confirm that is intended outside of demos.
    keys = {
        "da-demo-key-0001": {
            "name": "Demo Key",
            "created": datetime.now().isoformat(),
            "balance": 1000,
        }
    }

    # The data directory may not exist on a fresh checkout.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(keys, fh, indent=2)
    return keys
|
|
def load_usage(path: Optional[str] = None) -> dict:
    """Read the per-key usage table from disk.

    Args:
        path: JSON file to read. Defaults to the module-level ``USAGE_FILE``.

    Returns:
        The mapping stored in the file, or an empty dict if the file does not
        exist yet.

    Raises:
        json.JSONDecodeError: the file exists but does not contain valid JSON.
    """
    if path is None:
        path = USAGE_FILE

    # Open directly (no exists() pre-check) and let the context manager close
    # the handle; a missing file simply means nothing has been recorded yet.
    try:
        with open(path, encoding="utf-8") as fh:
            return json.load(fh)
    except FileNotFoundError:
        return {}
|
|
def save_usage(usage: dict, path: Optional[str] = None) -> None:
    """Persist the per-key usage table as indented JSON.

    Args:
        usage: Mapping of API key -> usage counters; must be JSON-serializable.
        path: Destination file. Defaults to the module-level ``USAGE_FILE``.

    Raises:
        OSError: the destination cannot be created or written.
        TypeError: ``usage`` contains values ``json`` cannot serialize.
    """
    if path is None:
        path = USAGE_FILE

    # The data directory may not exist on a fresh checkout.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Write to a sibling temp file and swap it in, so a crash mid-write can
    # never leave a truncated (unparseable) usage file behind.
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as fh:
            json.dump(usage, fh, indent=2)
        os.replace(tmp_path, path)
    except BaseException:
        # Don't leave the partial temp file lying around on failure.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
|
|
def verify_api_key(authorization: Optional[str] = Header(None)):
    """FastAPI dependency: validate the ``Authorization`` header.

    Accepts ``Authorization: Bearer <key>`` (scheme matched case-insensitively)
    and, for backward compatibility, a bare ``<key>`` with no scheme.

    Args:
        authorization: Raw ``Authorization`` header value, injected by FastAPI.

    Returns:
        The API key string, guaranteed to be present in the key table.

    Raises:
        HTTPException: 401 if the header is missing or the key is unknown.
    """
    # Tell clients which scheme to use on every 401.
    challenge = {"WWW-Authenticate": "Bearer"}

    if not authorization:
        raise HTTPException(
            status_code=401,
            detail="Missing API key. Add 'Authorization: Bearer da-xxx' header.",
            headers=challenge,
        )

    # Parse "<scheme> <credentials>" rather than str.replace(): replace() was
    # case-sensitive (rejecting "bearer <key>") and removed "Bearer " from
    # anywhere in the value, not just the front.
    header_value = authorization.strip()
    scheme, _, credentials = header_value.partition(" ")
    if scheme.lower() == "bearer":
        key = credentials.strip()
    else:
        # Legacy: a bare key without a scheme has always been accepted.
        key = header_value

    # The key table is re-read per request, so edits to the file apply
    # without a restart.
    api_keys = load_api_keys()

    if not key or key not in api_keys:
        raise HTTPException(status_code=401, detail="Invalid API key", headers=challenge)

    return key
|
|
| |
| |
| |
|
|
def get_available_phones(adb_path: str = "adb") -> list[str]:
    """List the serials of phones that ``adb devices`` reports as ready.

    Args:
        adb_path: The adb executable to invoke (name on PATH or full path).

    Returns:
        Serials whose state column is exactly ``device``, in adb's output
        order. Returns an empty list if adb cannot be started or does not
        answer within 10 seconds, so callers see "no phones" rather than an
        unhandled exception.
    """
    import logging
    import subprocess

    try:
        result = subprocess.run(
            [adb_path, "devices"],
            capture_output=True,
            text=True,
            timeout=10,
        )
    except (OSError, subprocess.TimeoutExpired) as exc:
        # adb missing / not executable / hung: log it and report no phones.
        logging.getLogger(__name__).warning("'%s devices' failed: %s", adb_path, exc)
        return []

    phones = []
    for line in result.stdout.splitlines():
        # Device rows are "<serial>\t<state>". The header and any daemon
        # start-up chatter contain no tab, so they are skipped naturally.
        serial, _, state = line.partition("\t")
        if serial and state.strip() == "device":
            phones.append(serial.strip())
    return phones
|
|
def get_phone_port(serial: str) -> int:
    """Return the port recorded for ``serial``, assigning one on first use.

    A serial not yet in ``PHONE_PORTS`` gets ``BASE_PHONE_PORT`` plus the
    number of serials already registered, and the assignment is remembered.
    """
    try:
        return PHONE_PORTS[serial]
    except KeyError:
        assigned = BASE_PHONE_PORT + len(PHONE_PORTS)
        PHONE_PORTS[serial] = assigned
        return assigned
|
|
| |
| |
| |
|
|
# The ASGI application; every route below is registered on this object.
app = FastAPI(
    version="1.0.0",
    title="dispatchAI Inference API",
    description="Mobile-optimized LLM inference. Small. Mobile. Free. UAE-built.",
)

# CORS: only the listed sites may call the API from a browser, and only with
# GET/POST; any request header is allowed.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["GET", "POST"],
    allow_origins=[
        "https://dispatchai.ai",
        "https://www.dispatchai.ai",
        "https://huggingface.co",
    ],
)
|
|
@app.get("/")
async def root():
    """Return public service metadata (no authentication required).

    Returns:
        A dict with the service name/version/status, the number of phones
        currently reported by adb, the available model ids, the pricing
        strings, and documentation links.
    """
    # `adb devices` is a blocking subprocess call (up to its 10 s timeout).
    # Run it in a worker thread so the event loop keeps serving other requests.
    phones = await asyncio.to_thread(get_available_phones)
    return {
        "name": "dispatchAI Inference API",
        "version": "1.0.0",
        "status": "running",
        "phones_connected": len(phones),
        "models": list(MODELS.keys()),
        "pricing": {
            "input": f"${PRICING['input']}/1K tokens",
            "output": f"${PRICING['output']}/1K tokens",
        },
        "docs": "/docs",
        "website": "https://huggingface.co/dispatchAI",
    }
|
|
@app.get("/v1/models")
async def list_models(api_key: str = Depends(verify_api_key)):
    """Return every configured model id in the OpenAI "list" envelope."""
    catalog = []
    for model_id in MODELS:
        catalog.append(
            {
                "id": model_id,
                "object": "model",
                "created": 1719500000,  # fixed timestamp shared by all entries
                "owned_by": "dispatchAI",
            }
        )
    return {"object": "list", "data": catalog}
|
|
# Number of requests routed so far; drives round-robin phone selection.
_phone_rotation = 0


def _next_phone_port(phones: list) -> int:
    """Pick the next connected phone round-robin and return its assigned port."""
    global _phone_rotation
    serial = phones[_phone_rotation % len(phones)]
    _phone_rotation += 1
    return get_phone_port(serial)


def _price_tokens(token_usage: dict) -> tuple[int, float]:
    """Return ``(total_tokens, cost)`` for one response's ``usage`` object.

    Prompt tokens are billed at ``PRICING["input"]`` and completion tokens at
    ``PRICING["output"]`` (both per 1K tokens), matching the rates advertised
    by the "/" endpoint. Tokens reported only via ``total_tokens`` with no
    prompt/completion breakdown are billed at the output rate.
    """
    prompt_tokens = token_usage.get("prompt_tokens") or 0
    completion_tokens = token_usage.get("completion_tokens") or 0
    total_tokens = token_usage.get("total_tokens") or (prompt_tokens + completion_tokens)
    unattributed = max(total_tokens - prompt_tokens - completion_tokens, 0)
    cost = (
        prompt_tokens * PRICING["input"]
        + (completion_tokens + unattributed) * PRICING["output"]
    ) / 1000
    return total_tokens, cost


def _record_usage(api_key: str, result: dict) -> None:
    """Add one response's tokens and cost to the running totals for ``api_key``."""
    tokens_used, cost = _price_tokens(result.get("usage") or {})

    # load -> mutate -> save has no `await` in between, so requests handled on
    # the same event loop cannot interleave inside this read-modify-write.
    usage = load_usage()
    entry = usage.setdefault(api_key, {"total_tokens": 0, "requests": 0, "cost": 0.0})
    entry["total_tokens"] += tokens_used
    entry["requests"] += 1
    entry["cost"] += cost
    entry["last_request"] = datetime.now().isoformat()
    save_usage(usage)


@app.post("/v1/chat/completions")
async def chat_completions(
    request: ChatCompletionRequest,
    api_key: str = Depends(verify_api_key),
):
    """Create a chat completion (OpenAI-compatible).

    Validates the model, picks a connected phone round-robin, forwards the
    request to that phone's local port, records token usage for billing and
    returns the phone's JSON response unchanged.

    Raises:
        HTTPException: 400 unknown model; 503 no phones connected; 502 the
            phone answered with a non-200 status or could not be reached;
            504 the phone timed out; 500 any other failure.
    """
    if request.model not in MODELS:
        raise HTTPException(
            status_code=400,
            detail=f"Model '{request.model}' not available. Use GET /v1/models to see available models.",
        )

    # `adb devices` blocks (up to 10 s); keep it off the event loop.
    phones = await asyncio.to_thread(get_available_phones)
    if not phones:
        raise HTTPException(status_code=503, detail="No phones available. Try again later.")

    # Route only to ports that belong to phones that are actually connected.
    # The previous fixed 5000-5004 list, indexed by the wall-clock second,
    # could pick a port with no phone behind it and sent every request in the
    # same second to the same port.
    phone_port = _next_phone_port(phones)

    model_info = MODELS[request.model]

    # NOTE(review): the public model id is forwarded as-is and
    # model_info["phone_model"] is never used — confirm which one
    # phone_server.py expects.
    phone_request = {
        "model": request.model,
        "messages": [{"role": m.role, "content": m.content} for m in request.messages],
        "max_tokens": request.max_tokens,
        "temperature": request.temperature,
        "chat_format": model_info["chat_format"],
        "raw_completion": True,
    }

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                f"http://127.0.0.1:{phone_port}/v1/chat/completions",
                json=phone_request,
            )

        if response.status_code != 200:
            raise HTTPException(status_code=502, detail=f"Phone error: {response.text[:200]}")

        result = response.json()
        _record_usage(api_key, result)
        return result

    except HTTPException:
        # Let deliberate HTTP errors (the 502 above) through untouched; the
        # generic handler below used to rewrite them into a 500.
        raise
    except httpx.TimeoutException as exc:
        raise HTTPException(
            status_code=504,
            detail="Phone inference timed out. Try a smaller max_tokens.",
        ) from exc
    except httpx.HTTPError as exc:
        # Connection refused / reset etc.: an upstream failure, not ours.
        raise HTTPException(status_code=502, detail=f"Phone unreachable: {str(exc)[:200]}") from exc
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Internal error: {str(exc)[:200]}") from exc
|
|
@app.get("/v1/usage")
async def get_usage(api_key: str = Depends(verify_api_key)):
    """Return the stored usage counters for the caller's key, or zeros if none."""
    all_usage = load_usage()
    if api_key in all_usage:
        return all_usage[api_key]
    # Nothing recorded for this key yet.
    return {"total_tokens": 0, "requests": 0, "cost": 0.0}
|
|
@app.get("/admin/phones")
async def admin_phones(api_key: str = Depends(verify_api_key)):
    """Get phone farm status (requires a valid API key).

    Returns:
        A dict with the number of connected phones, each phone's serial and
        assigned local port, and a capacity estimate of 20 tokens/sec per phone.
    """
    # NOTE(review): any valid API key passes this check — there is no separate
    # admin credential in this file. Confirm that is intended.

    # `adb devices` blocks (up to 10 s); keep it off the event loop.
    phones = await asyncio.to_thread(get_available_phones)
    return {
        "phones_connected": len(phones),
        "phones": [{"serial": p, "port": get_phone_port(p)} for p in phones],
        "total_capacity_tokens_per_sec": len(phones) * 20,
    }
|
|
| |
| |
| |
|
|
# Script entry point: print a start-up banner, then serve on all interfaces.
if __name__ == "__main__":
    import uvicorn

    for banner_line in (
        "🚀 dispatchAI Inference API — Starting...",
        " Endpoint: http://api.dispatchai.ai:8081",
        " Docs: http://api.dispatchai.ai:8081/docs",
    ):
        print(banner_line)
    # Queried after the banner lines above have been printed.
    connected = len(get_available_phones())
    print(f" Phones: {connected} connected")
    print()
    uvicorn.run(app, host="0.0.0.0", port=8081)
|
|