3morixd committed on
Commit
369004c
·
verified ·
1 Parent(s): 3da1e6b

Upload api/gateway.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. api/gateway.py +266 -0
api/gateway.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ dispatchAI Inference API — Main Gateway Server
3
+ Runs on the Windows PC (112 cores). Acts as load balancer + API gateway.
4
+
5
+ Architecture:
6
+ Customer → api.dispatchai.ai:8081 → This server → Routes to phone
7
+ Phone runs phone_server.py (HTTP server + llama.cpp)
8
+
9
+ This server:
10
+ 1. Receives OpenAI-compatible API requests
11
+ 2. Finds an available phone
12
+ 3. Routes the request to that phone
13
+ 4. Returns the response to the customer
14
+ 5. Tracks token usage for billing
15
+ """
16
+ import os
17
+ import json
18
+ import time
19
+ import asyncio
20
+ import httpx
21
+ from datetime import datetime
22
+ from typing import Optional
23
+ from fastapi import FastAPI, HTTPException, Depends, Header
24
+ from fastapi.middleware.cors import CORSMiddleware
25
+ from pydantic import BaseModel
26
+
27
+ # ============================================================
28
+ # Configuration
29
+ # ============================================================
30
+
31
# Phone farm: each phone runs phone_server.py and is reached through a local
# port. Ports are handed out lazily by get_phone_port(), starting at
# BASE_PHONE_PORT, in the order serials are first seen.
PHONE_PORTS: dict[str, int] = {}  # ADB serial -> port, filled by get_phone_port()
BASE_PHONE_PORT = 5000  # First phone gets port 5000, second 5001, etc.

# API keys (simple auth — in production use a database)
API_KEYS_FILE = "data/api_keys.json"
USAGE_FILE = "data/api_usage.json"

# Available models: public model id -> settings used when talking to a phone.
# "chat_format" is forwarded to the phone with every request.
# NOTE(review): SmolLM2 is tagged "llama-3" while Llama-3.2 is tagged "chatml";
# that looks swapped relative to the templates those model families normally
# ship with — confirm against phone_server.py before changing anything.
MODELS = {
    "dispatchAI/SmolLM2-135M-Instruct-mobile": {"phone_model": "SmolLM2-135M-Instruct-mobile", "chat_format": "llama-3"},
    "dispatchAI/Qwen2.5-0.5B-Instruct-mobile-int4": {"phone_model": "Qwen2.5-0.5B-Instruct-mobile-int4", "chat_format": "chatml"},
    "dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile": {"phone_model": "Llama-3.2-1B-Instruct-Q4-mobile", "chat_format": "chatml"},
    "dispatchAI/TinyLlama-1.1B-Chat-Q5-mobile": {"phone_model": "TinyLlama-1.1B-Chat-Q5-mobile", "chat_format": "chatml"},
    "dispatchAI/Qwen2.5-0.5B-Coder-mobile": {"phone_model": "Qwen2.5-0.5B-Coder-mobile", "chat_format": "chatml"},
}

# Pricing (USD per 1K tokens)
PRICING = {
    "input": 0.001,  # $0.001 per 1K input tokens
    "output": 0.002,  # $0.002 per 1K output tokens
}
55
+
56
+ # ============================================================
57
+ # Data Models (OpenAI-compatible)
58
+ # ============================================================
59
+
60
class ChatMessage(BaseModel):
    """One message of a chat conversation (OpenAI-compatible shape)."""

    role: str  # forwarded to the phone unchanged
    content: str  # forwarded to the phone unchanged
63
+
64
class ChatCompletionRequest(BaseModel):
    """Request body accepted by POST /v1/chat/completions."""

    model: str = "dispatchAI/SmolLM2-135M-Instruct-mobile"
    messages: list[ChatMessage]
    max_tokens: int = 100
    temperature: float = 0.7
    # Accepted for OpenAI compatibility; the handler in this file never reads it.
    stream: bool = False
70
+
71
+ # ============================================================
72
+ # API Key Management
73
+ # ============================================================
74
+
75
def load_api_keys():
    """Return the API-key table, seeding it with a demo key on first use.

    Returns:
        dict mapping each API key to its metadata ("name", "created",
        "balance").
    """
    if os.path.exists(API_KEYS_FILE):
        with open(API_KEYS_FILE, encoding="utf-8") as fh:
            return json.load(fh)

    # Create default key.
    # NOTE(review): this is a fixed, well-known key; anyone can authenticate
    # with it until it is removed from the key file.
    keys = {"da-demo-key-0001": {"name": "Demo Key", "created": datetime.now().isoformat(), "balance": 1000}}

    # The data directory may not exist yet on a fresh checkout.
    parent = os.path.dirname(API_KEYS_FILE)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(API_KEYS_FILE, "w", encoding="utf-8") as fh:
        json.dump(keys, fh, indent=2)
    return keys
82
+
83
def load_usage():
    """Return the per-key usage table from USAGE_FILE, or {} if it is absent."""
    if not os.path.exists(USAGE_FILE):
        return {}
    # Context manager so the handle is closed even if parsing fails.
    with open(USAGE_FILE, encoding="utf-8") as fh:
        return json.load(fh)
87
+
88
def save_usage(usage):
    """Persist the per-key usage table to USAGE_FILE as indented JSON.

    Args:
        usage: JSON-serialisable dict mapping API key -> usage stats.
    """
    parent = os.path.dirname(USAGE_FILE)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Write to a sibling temp file and swap it in, so a crash mid-write cannot
    # leave a truncated billing file behind.
    tmp_path = f"{USAGE_FILE}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as fh:
        json.dump(usage, fh, indent=2)
    os.replace(tmp_path, USAGE_FILE)
90
+
91
def verify_api_key(authorization: Optional[str] = Header(None)):
    """FastAPI dependency: validate the Authorization header, return the key.

    Accepts "Bearer <key>" (scheme matched case-insensitively) and, as before,
    a bare key with no scheme.

    Raises:
        HTTPException: 401 when the header is missing or the key is unknown.
    """
    if not authorization:
        raise HTTPException(status_code=401, detail="Missing API key. Add 'Authorization: Bearer da-xxx' header.")

    value = authorization.strip()
    # Strip only a leading scheme; str.replace would also remove "Bearer "
    # occurring anywhere inside the credential.
    scheme, _, remainder = value.partition(" ")
    if remainder and scheme.lower() == "bearer":
        key = remainder.strip()
    else:
        key = value

    api_keys = load_api_keys()
    if key not in api_keys:
        raise HTTPException(status_code=401, detail="Invalid API key")

    return key
102
+
103
+ # ============================================================
104
+ # Phone Pool Management
105
+ # ============================================================
106
+
107
def get_available_phones():
    """Get list of connected phones via ADB.

    Returns:
        list of device serials that `adb devices` lists in the "device"
        state. Empty when none are listed, or when adb cannot be started or
        does not answer within 10 seconds.
    """
    import subprocess

    try:
        result = subprocess.run(["adb", "devices"], capture_output=True, text=True, timeout=10)
    except (OSError, subprocess.TimeoutExpired):
        # adb missing / not executable / hung: report "no phones" so callers
        # answer with their normal no-capacity path instead of crashing.
        return []

    # The first output line is a header; the rest are "<serial>\t<state>".
    device_lines = result.stdout.strip().split("\n")[1:]
    return [line.split("\t")[0] for line in device_lines if "\tdevice" in line]
117
+
118
def get_phone_port(serial: str) -> int:
    """Get or assign a port for a phone.

    A serial seen for the first time receives BASE_PHONE_PORT plus the number
    of serials already registered; afterwards the same port is returned.
    """
    port = PHONE_PORTS.get(serial)
    if port is None:
        port = BASE_PHONE_PORT + len(PHONE_PORTS)
        PHONE_PORTS[serial] = port
    return port
123
+
124
+ # ============================================================
125
+ # FastAPI App
126
+ # ============================================================
127
+
128
app = FastAPI(
    title="dispatchAI Inference API",
    description="Mobile-optimized LLM inference. Small. Mobile. Free. UAE-built.",
    version="1.0.0",
)

# Cross-origin browser calls are allowed only from the origins listed here,
# and only for GET and POST.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["https://dispatchai.ai", "https://www.dispatchai.ai", "https://huggingface.co"],
    allow_methods=["GET", "POST"],
    allow_headers=["*"],
)
140
+
141
@app.get("/")
async def root():
    """API info: service status, connected phone count, models and pricing."""
    # `adb devices` is a blocking subprocess call (up to 10 s); run it in a
    # worker thread so other requests keep being served meanwhile.
    # NOTE(review): this endpoint is unauthenticated and spawns adb on every
    # call — consider caching the phone count.
    phones = await asyncio.to_thread(get_available_phones)
    return {
        "name": "dispatchAI Inference API",
        "version": "1.0.0",
        "status": "running",
        "phones_connected": len(phones),
        "models": list(MODELS.keys()),
        "pricing": {"input": f"${PRICING['input']}/1K tokens", "output": f"${PRICING['output']}/1K tokens"},
        "docs": "/docs",
        "website": "https://huggingface.co/dispatchAI",
    }
155
+
156
@app.get("/v1/models")
async def list_models(api_key: str = Depends(verify_api_key)):
    """List available models (OpenAI-compatible)."""
    models = [
        {
            "id": model_id,
            "object": "model",
            "created": 1719500000,  # fixed placeholder timestamp
            "owned_by": "dispatchAI",
        }
        for model_id in MODELS
    ]
    return {"object": "list", "data": models}
171
+
172
# Per-process counter behind the round-robin phone selection.
_next_phone_index = 0


def _pick_phone_port(phones):
    """Return the port of the next connected phone, cycling through `phones`."""
    global _next_phone_index
    serial = phones[_next_phone_index % len(phones)]
    _next_phone_index += 1
    return get_phone_port(serial)


def _request_cost(usage_block):
    """Price one response's token usage with PRICING (rates are per 1K tokens)."""
    prompt_tokens = usage_block.get("prompt_tokens")
    completion_tokens = usage_block.get("completion_tokens")
    if prompt_tokens is None and completion_tokens is None:
        # The phone reported no input/output split: keep the previous
        # behavior of billing the total at the combined rate.
        total = usage_block.get("total_tokens") or 0
        return (total / 1000) * (PRICING["input"] + PRICING["output"])
    input_cost = ((prompt_tokens or 0) / 1000) * PRICING["input"]
    output_cost = ((completion_tokens or 0) / 1000) * PRICING["output"]
    return input_cost + output_cost


def _record_usage(api_key, result):
    """Add the token usage reported in `result` to the stored stats of `api_key`."""
    usage_block = (result.get("usage") or {}) if isinstance(result, dict) else {}
    tokens_used = usage_block.get("total_tokens")
    if tokens_used is None:
        tokens_used = (usage_block.get("prompt_tokens") or 0) + (usage_block.get("completion_tokens") or 0)

    # Load-modify-save runs without an await in between, so concurrent
    # requests on the same event loop cannot interleave here.
    usage = load_usage()
    entry = usage.setdefault(api_key, {"total_tokens": 0, "requests": 0, "cost": 0.0})
    entry["total_tokens"] += tokens_used
    entry["requests"] += 1
    entry["cost"] += _request_cost(usage_block)
    entry["last_request"] = datetime.now().isoformat()
    save_usage(usage)


@app.post("/v1/chat/completions")
async def chat_completions(
    request: ChatCompletionRequest,
    api_key: str = Depends(verify_api_key),
):
    """Create a chat completion (OpenAI-compatible).

    Forwards the request to one connected phone, records token usage for the
    calling API key, and returns the phone's JSON response unchanged.

    Raises:
        HTTPException: 400 unknown model; 503 no phone connected; 502 phone
            unreachable or answered with a non-200 status; 504 phone timed
            out; 500 any other failure.
    """
    if request.model not in MODELS:
        raise HTTPException(status_code=400, detail=f"Model '{request.model}' not available. Use GET /v1/models to see available models.")

    # adb is a blocking subprocess call; keep it off the event loop.
    phones = await asyncio.to_thread(get_available_phones)
    if not phones:
        raise HTTPException(status_code=503, detail="No phones available. Try again later.")

    # Round-robin over the phones that are actually connected. The previous
    # clock-based pick over a fixed port list could select ports with no
    # phone behind them and sent every request within a second to one port.
    phone_port = _pick_phone_port(phones)

    model_info = MODELS[request.model]

    # Prepare request for phone
    phone_request = {
        "model": request.model,
        "messages": [{"role": m.role, "content": m.content} for m in request.messages],
        "max_tokens": request.max_tokens,
        "temperature": request.temperature,
        "chat_format": model_info["chat_format"],
        "raw_completion": True,  # Use raw text completion, not chat template
    }

    # Send to phone
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                f"http://127.0.0.1:{phone_port}/v1/chat/completions",
                json=phone_request,
            )

        if response.status_code != 200:
            raise HTTPException(status_code=502, detail=f"Phone error: {response.text[:200]}")

        result = response.json()
        _record_usage(api_key, result)
        return result

    except HTTPException:
        # Deliberate HTTP errors (the 502 above) must not be rewrapped as 500.
        raise
    except httpx.TimeoutException as e:
        raise HTTPException(status_code=504, detail="Phone inference timed out. Try a smaller max_tokens.") from e
    except httpx.RequestError as e:
        # Connection refused/reset etc.: the phone is at fault, not this server.
        raise HTTPException(status_code=502, detail=f"Phone error: {str(e)[:200]}") from e
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Internal error: {str(e)[:200]}") from e
238
+
239
@app.get("/v1/usage")
async def get_usage(api_key: str = Depends(verify_api_key)):
    """Get API usage stats for the calling key (zeros if it has no history)."""
    empty_stats = {"total_tokens": 0, "requests": 0, "cost": 0.0}
    return load_usage().get(api_key, empty_stats)
244
+
245
@app.get("/admin/phones")
async def admin_phones(api_key: str = Depends(verify_api_key)):
    """Get phone farm status (requires auth).

    NOTE(review): any valid API key passes this check, so every customer can
    read the device serials — confirm whether an admin-only key is intended.
    """
    # adb is a blocking subprocess call; keep it off the event loop.
    phones = await asyncio.to_thread(get_available_phones)
    return {
        "phones_connected": len(phones),
        "phones": [{"serial": p, "port": get_phone_port(p)} for p in phones],
        "total_capacity_tokens_per_sec": len(phones) * 20,  # ~20 t/s per phone
    }
254
+
255
+ # ============================================================
256
+ # Startup
257
+ # ============================================================
258
+
259
if __name__ == "__main__":
    import uvicorn

    # Startup banner; the phone count is a one-off adb query.
    print("🚀 dispatchAI Inference API — Starting...")
    print(" Endpoint: http://api.dispatchai.ai:8081")
    print(" Docs: http://api.dispatchai.ai:8081/docs")
    print(f" Phones: {len(get_available_phones())} connected")
    print()
    # Listen on all interfaces on the port advertised above.
    uvicorn.run(app, host="0.0.0.0", port=8081)