dispatchAI
/

dispatchAI-API

Model card Files Files and versions

xet

Community

3morixd committed about 16 hours ago

Commit

369004c

verified ·

1 Parent(s): 3da1e6b

Upload api/gateway.py with huggingface_hub

Browse files

Files changed (1) hide show

api/gateway.py +266 -0

api/gateway.py ADDED Viewed

	@@ -0,0 +1,266 @@

+"""
+dispatchAI Inference API — Main Gateway Server
+Runs on the Windows PC (112 cores). Acts as load balancer + API gateway.
+Architecture:
+  Customer → api.dispatchai.ai:8081 → This server → Routes to phone
+  Phone runs phone_server.py (HTTP server + llama.cpp)
+This server:
+1. Receives OpenAI-compatible API requests
+2. Finds an available phone
+3. Routes the request to that phone
+4. Returns the response to the customer
+5. Tracks token usage for billing
+"""
+import os
+import json
+import time
+import asyncio
+import httpx
+from datetime import datetime
+from typing import Optional
+from fastapi import FastAPI, HTTPException, Depends, Header
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+# ============================================================
+# Configuration
+# ============================================================
# --- Phone farm -------------------------------------------------------------
# Every phone runs phone_server.py; the gateway reaches each one through its
# own local port. Serials are discovered through ADB and ports are handed out
# lazily, starting at BASE_PHONE_PORT (5000, 5001, 5002, ...).
BASE_PHONE_PORT = 5000
PHONE_PORTS = {}  # serial -> port, populated on first use

# --- Persistence (flat JSON files; use a real database in production) -------
API_KEYS_FILE = "data/api_keys.json"
USAGE_FILE = "data/api_usage.json"

# --- Model catalogue ---------------------------------------------------------
# (public model id, chat format forwarded to the phone)
# NOTE(review): SmolLM2 is tagged "llama-3" while the Llama-3.2 entry is tagged
# "chatml" -- these look swapped; confirm against phone_server.py.
_MODEL_CHAT_FORMATS = (
    ("dispatchAI/SmolLM2-135M-Instruct-mobile", "llama-3"),
    ("dispatchAI/Qwen2.5-0.5B-Instruct-mobile-int4", "chatml"),
    ("dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile", "chatml"),
    ("dispatchAI/TinyLlama-1.1B-Chat-Q5-mobile", "chatml"),
    ("dispatchAI/Qwen2.5-0.5B-Coder-mobile", "chatml"),
)
# The on-phone model name is the public id without its "dispatchAI/" namespace.
MODELS = {
    model_id: {"phone_model": model_id.split("/", 1)[1], "chat_format": chat_format}
    for model_id, chat_format in _MODEL_CHAT_FORMATS
}

# --- Pricing, in dollars per 1K tokens ---------------------------------------
PRICING = dict(
    input=0.001,   # $0.001 per 1K input tokens
    output=0.002,  # $0.002 per 1K output tokens
)
+# ============================================================
+# Data Models (OpenAI-compatible)
+# ============================================================
class ChatMessage(BaseModel):
    """A single entry of the ``messages`` array in a chat completion request."""

    # Who is speaking; forwarded to the phone unchanged.
    role: str
    # The text of the message; forwarded to the phone unchanged.
    content: str
class ChatCompletionRequest(BaseModel):
    """Request body accepted by ``POST /v1/chat/completions``."""

    # Public model id; must be one of the keys of MODELS.
    model: str = "dispatchAI/SmolLM2-135M-Instruct-mobile"
    # Conversation so far, in order.
    messages: list[ChatMessage]
    # Generation limit forwarded to the phone.
    max_tokens: int = 100
    # Sampling temperature forwarded to the phone.
    temperature: float = 0.7
    # Accepted for client compatibility; the gateway handler never reads it,
    # so responses are always returned in one piece.
    stream: bool = False
+# ============================================================
+# API Key Management
+# ============================================================
def load_api_keys():
    """Return the API-key table stored in ``API_KEYS_FILE``.

    If the file does not exist yet, a table containing a single demo key is
    created, written to disk and returned.

    Returns:
        dict: API key -> metadata (``name``, ``created``, ``balance``).

    Raises:
        json.JSONDecodeError: if the existing file is not valid JSON.
        OSError: if the file cannot be read or created.
    """
    if os.path.exists(API_KEYS_FILE):
        # Context manager so the handle is closed even if parsing fails.
        with open(API_KEYS_FILE, encoding="utf-8") as fh:
            return json.load(fh)

    # First run: bootstrap with a well-known demo key.
    # NOTE(review): this key is public knowledge; rotate it before production.
    keys = {
        "da-demo-key-0001": {
            "name": "Demo Key",
            "created": datetime.now().isoformat(),
            "balance": 1000,
        }
    }
    # The data directory may not exist on a fresh checkout.
    parent = os.path.dirname(API_KEYS_FILE)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(API_KEYS_FILE, "w", encoding="utf-8") as fh:
        json.dump(keys, fh, indent=2)
    return keys
def load_usage():
    """Return the per-key usage table stored in ``USAGE_FILE``.

    Returns:
        dict: API key -> usage counters, or an empty dict when the file does
        not exist yet.

    Raises:
        json.JSONDecodeError: if the file exists but is not valid JSON.
    """
    if not os.path.exists(USAGE_FILE):
        return {}
    # Context manager so the handle is closed even if parsing fails.
    with open(USAGE_FILE, encoding="utf-8") as fh:
        return json.load(fh)
def save_usage(usage):
    """Write the per-key usage table to ``USAGE_FILE`` as indented JSON.

    Args:
        usage: JSON-serialisable mapping of API key -> usage counters.

    Raises:
        OSError: if the file or its directory cannot be written.
    """
    # The data directory may not exist on a fresh checkout.
    parent = os.path.dirname(USAGE_FILE)
    if parent:
        os.makedirs(parent, exist_ok=True)
    # Context manager guarantees the data is flushed and the handle closed.
    with open(USAGE_FILE, "w", encoding="utf-8") as fh:
        json.dump(usage, fh, indent=2)
def verify_api_key(authorization: Optional[str] = Header(None)):
    """FastAPI dependency that authenticates a request by its API key.

    The key is read from the ``Authorization`` header. Both
    ``Authorization: Bearer <key>`` (scheme matched case-insensitively) and a
    bare ``Authorization: <key>`` are accepted.

    Args:
        authorization: raw value of the ``Authorization`` header, if any.

    Returns:
        str: the validated API key.

    Raises:
        HTTPException: 401 when the header is missing or the key is unknown.
    """
    if not authorization:
        raise HTTPException(status_code=401, detail="Missing API key. Add 'Authorization: Bearer da-xxx' header.")

    header_value = authorization.strip()
    # Strip only a leading auth scheme; a blanket str.replace() would also
    # remove "Bearer " occurring anywhere else in the header value.
    scheme, _, remainder = header_value.partition(" ")
    key = remainder.strip() if scheme.lower() == "bearer" else header_value

    if key not in load_api_keys():
        raise HTTPException(status_code=401, detail="Invalid API key")
    return key
+# ============================================================
+# Phone Pool Management
+# ============================================================
def get_available_phones() -> list[str]:
    """Return the serials of phones that ADB reports in the ``device`` state.

    Runs ``adb devices`` (10 s timeout) and parses its tab-separated output.
    This call blocks; async callers should run it in a worker thread.

    Returns:
        list[str]: serial numbers, in the order ADB lists them. Empty when no
        phone is ready, or when ``adb`` cannot be started or times out.
    """
    import subprocess

    try:
        result = subprocess.run(
            ["adb", "devices"], capture_output=True, text=True, timeout=10
        )
    except (OSError, subprocess.SubprocessError):
        # adb missing or hung: report "no phones" so callers can answer with
        # their normal no-capacity response instead of crashing.
        return []

    phones = []
    for line in result.stdout.splitlines():
        # Device rows look like "<serial>\t<state>"; the header and any
        # daemon chatter contain no tab and are skipped by the check below.
        serial, tab, state = line.partition("\t")
        if tab and state.strip() == "device":
            phones.append(serial)
    return phones
def get_phone_port(serial: str) -> int:
    """Return the port mapped to ``serial``, allocating one on first sight.

    A serial seen for the first time receives ``BASE_PHONE_PORT`` plus the
    number of serials already registered in ``PHONE_PORTS``; the assignment
    is remembered for later calls.
    """
    try:
        return PHONE_PORTS[serial]
    except KeyError:
        assigned = BASE_PHONE_PORT + len(PHONE_PORTS)
        PHONE_PORTS[serial] = assigned
        return assigned
+# ============================================================
+# FastAPI App
+# ============================================================
# The ASGI application object; the metadata below is what FastAPI is given
# for its generated documentation.
app = FastAPI(
    version="1.0.0",
    title="dispatchAI Inference API",
    description="Mobile-optimized LLM inference. Small. Mobile. Free. UAE-built.",
)

# Cross-origin access is limited to the project's own sites and Hugging Face,
# and to the two HTTP methods the API actually exposes.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["GET", "POST"],
    allow_origins=[
        "https://dispatchai.ai",
        "https://www.dispatchai.ai",
        "https://huggingface.co",
    ],
)
@app.get("/")
async def root():
    """Return public API info: status, connected phone count, models, pricing.

    No authentication is required for this endpoint.
    """
    # `adb devices` is a blocking subprocess call (up to 10 s); run it in a
    # worker thread so other requests are not stalled on the event loop.
    phones = await asyncio.to_thread(get_available_phones)
    return {
        "name": "dispatchAI Inference API",
        "version": "1.0.0",
        "status": "running",
        "phones_connected": len(phones),
        "models": list(MODELS.keys()),
        "pricing": {"input": f"${PRICING['input']}/1K tokens", "output": f"${PRICING['output']}/1K tokens"},
        "docs": "/docs",
        "website": "https://huggingface.co/dispatchAI",
    }
@app.get("/v1/models")
async def list_models(api_key: str = Depends(verify_api_key)):
    """Return every entry of MODELS as an OpenAI-style model list.

    Requires a valid API key; the key itself is not otherwise used.
    """
    entries = []
    for model_id in MODELS:
        entries.append(
            {
                "id": model_id,
                "object": "model",
                "created": 1719500000,  # fixed timestamp shared by all models
                "owned_by": "dispatchAI",
            }
        )
    return {"object": "list", "data": entries}
# Position of the next phone to use; advanced once per routed request.
_next_phone_index = 0


def _pick_phone_port(phones: list[str]) -> int:
    """Return the port of the next connected phone in round-robin order."""
    global _next_phone_index
    serial = phones[_next_phone_index % len(phones)]
    _next_phone_index += 1
    return get_phone_port(serial)


def _usage_delta(result) -> tuple[int, float]:
    """Return ``(tokens, cost_in_dollars)`` for one phone response."""
    usage_block = result.get("usage") if isinstance(result, dict) else None
    usage_block = usage_block or {}
    prompt_tokens = usage_block.get("prompt_tokens")
    completion_tokens = usage_block.get("completion_tokens")
    total_tokens = usage_block.get("total_tokens")
    if total_tokens is None:
        total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)

    if prompt_tokens is None and completion_tokens is None:
        # No input/output breakdown reported: keep the legacy combined rate.
        cost = (total_tokens / 1000) * (PRICING["input"] + PRICING["output"])
    else:
        # Bill each direction at its own rate.
        cost = ((prompt_tokens or 0) / 1000) * PRICING["input"] + (
            (completion_tokens or 0) / 1000
        ) * PRICING["output"]
    return total_tokens, cost


def _record_usage(api_key: str, result) -> None:
    """Add one request's tokens and cost to the usage file for ``api_key``."""
    tokens_used, cost = _usage_delta(result)
    # Load-modify-save runs without an intervening await, so concurrent
    # requests on the same event loop cannot interleave inside it.
    usage = load_usage()
    entry = usage.setdefault(api_key, {"total_tokens": 0, "requests": 0, "cost": 0.0})
    entry["total_tokens"] += tokens_used
    entry["requests"] += 1
    entry["cost"] += cost
    entry["last_request"] = datetime.now().isoformat()
    save_usage(usage)


@app.post("/v1/chat/completions")
async def chat_completions(
    request: ChatCompletionRequest,
    api_key: str = Depends(verify_api_key),
):
    """Create a chat completion (OpenAI-compatible).

    Picks one connected phone in round-robin order, forwards the request to
    that phone's local port, records token usage for the calling API key and
    returns the phone's JSON response unchanged.

    NOTE(review): ``request.stream`` is not honoured; the response is always
    returned in one piece.

    Raises:
        HTTPException: 400 for an unknown model; 503 when no phone is
            connected; 504 when the phone times out; 502 when the phone is
            unreachable or answers with a non-200 status; 500 when the
            response cannot be parsed or usage cannot be recorded.
    """
    if request.model not in MODELS:
        raise HTTPException(status_code=400, detail=f"Model '{request.model}' not available. Use GET /v1/models to see available models.")

    # `adb devices` blocks; keep it off the event loop.
    phones = await asyncio.to_thread(get_available_phones)
    if not phones:
        raise HTTPException(status_code=503, detail="No phones available. Try again later.")

    # Route only to phones that are actually connected, using the same
    # serial -> port mapping that /admin/phones reports.
    phone_port = _pick_phone_port(phones)
    model_info = MODELS[request.model]

    phone_request = {
        "model": request.model,
        "messages": [{"role": m.role, "content": m.content} for m in request.messages],
        "max_tokens": request.max_tokens,
        "temperature": request.temperature,
        "chat_format": model_info["chat_format"],
        "raw_completion": True,  # Use raw text completion, not chat template
    }

    # Only the network call lives in this try, so HTTPExceptions raised
    # below are never rewritten into a generic 500.
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                f"http://127.0.0.1:{phone_port}/v1/chat/completions",
                json=phone_request,
            )
    except httpx.TimeoutException:
        raise HTTPException(status_code=504, detail="Phone inference timed out. Try a smaller max_tokens.")
    except httpx.HTTPError as e:
        # Connection refused/reset etc.: an upstream failure, not ours.
        raise HTTPException(status_code=502, detail=f"Phone unreachable: {str(e)[:200]}") from e

    if response.status_code != 200:
        raise HTTPException(status_code=502, detail=f"Phone error: {response.text[:200]}")

    try:
        result = response.json()
        _record_usage(api_key, result)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Internal error: {str(e)[:200]}") from e
    return result
@app.get("/v1/usage")
async def get_usage(api_key: str = Depends(verify_api_key)):
    """Return the usage counters recorded for the calling API key.

    A key with no recorded requests gets an all-zero record.
    """
    records = load_usage()
    if api_key in records:
        return records[api_key]
    return {"total_tokens": 0, "requests": 0, "cost": 0.0}
@app.get("/admin/phones")
async def admin_phones(api_key: str = Depends(verify_api_key)):
    """Return phone farm status: connected serials, ports, rough capacity.

    NOTE(review): any valid API key passes the check; there is no separate
    admin role, so every customer key can list device serials.
    """
    # `adb devices` is a blocking subprocess call; keep it off the event loop.
    phones = await asyncio.to_thread(get_available_phones)
    return {
        "phones_connected": len(phones),
        "phones": [{"serial": p, "port": get_phone_port(p)} for p in phones],
        "total_capacity_tokens_per_sec": len(phones) * 20,  # ~20 t/s per phone
    }
+# ============================================================
+# Startup
+# ============================================================
if __name__ == "__main__":
    # Script entry point: print a short banner, then serve on all interfaces.
    import uvicorn

    public_url = "http://api.dispatchai.ai:8081"
    print("🚀 dispatchAI Inference API — Starting...")
    print("   Endpoint: " + public_url)
    print("   Docs: " + public_url + "/docs")
    connected = len(get_available_phones())
    print(f"   Phones: {connected} connected")
    print()
    uvicorn.run(app, host="0.0.0.0", port=8081)