3morixd committed on
Commit
0ad01db
·
verified ·
1 Parent(s): 369004c

Upload api/phone_proxy_v2.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. api/phone_proxy_v2.py +313 -0
api/phone_proxy_v2.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ dispatchAI Phone Proxy v2 — Clean output extraction
3
+ Instead of parsing the messy llama-cli output, we use --log-disable
4
+ and capture only the response text between known markers.
5
+ """
6
+ import os
7
+ import sys
8
+ import json
9
+ import subprocess
10
+ import re
11
+ import time
12
+ from http.server import HTTPServer, BaseHTTPRequestHandler
13
+
14
# Port used when no (valid) port is given on the command line.
DEFAULT_PORT = 5000


def parse_port(argv, default=DEFAULT_PORT):
    """Return the TCP port given as ``argv[2]``, or *default*.

    A missing argument yields *default* silently.  A non-numeric or
    out-of-range value also yields *default*, after a warning on stderr, so
    that importing this module from a process whose own ``argv[2]`` is not a
    port number does not raise at import time.
    """
    if len(argv) <= 2:
        return default
    try:
        port = int(argv[2])
    except ValueError:
        port = -1
    if not 0 <= port <= 65535:
        print(
            f"Ignoring invalid port {argv[2]!r}; using {default}",
            file=sys.stderr,
        )
        return default
    return port


# Command line: <serial> [port].  Read at import time because the request
# handler below uses these module-level values directly.
PHONE_SERIAL = sys.argv[1] if len(sys.argv) > 1 else ""
PORT = parse_port(sys.argv)

# On-device locations.  NOTE(review): LLAMA_CLI is not referenced by the
# handler visible in this file, which invokes ./llama-cli after a cd.
LLAMA_CLI = "/data/local/tmp/llama-cli"
MODEL_PATH = "/data/local/tmp/model.gguf"

# Environment handed to every adb invocation.
# NOTE(review): MSYS_NO_PATHCONV=1 is presumably set so that MSYS/Git-Bash
# does not rewrite the /data/... device paths -- confirm on the target host.
ENV = os.environ.copy()
ENV["MSYS_NO_PATHCONV"] = "1"
20
+
21
class PhoneHandler(BaseHTTPRequestHandler):
    """Chat-completions style HTTP endpoint backed by llama-cli on a phone.

    ``POST /v1/chat/completions`` turns the request's ``messages`` into a
    plain-text prompt, runs ``llama-cli`` on the device selected by the
    module-level ``PHONE_SERIAL`` through ``adb shell``, scrubs loader,
    spinner and chat-template noise from the captured output and replies
    with a ``chat.completion`` JSON object.  ``GET /health`` reports the
    configured serial and port.

    Only ``messages``, ``max_tokens`` and ``model`` are read from the
    request; other fields (e.g. ``temperature``, ``chat_format``) are
    ignored.
    """

    DEFAULT_MAX_TOKENS = 100
    # Below this many characters the first answer is treated as "only noise"
    # and a second run without --log-disable is attempted.
    MIN_RESPONSE_CHARS = 5
    # Host-side limit for one adb call; the on-device command is itself
    # wrapped in `timeout 60`.
    ADB_TIMEOUT_S = 90

    # A line containing any of these ends the response text.
    _STATS_MARKERS = ("t/s", "Exiting")

    # Block/box glyphs of the loading animation, written as escapes so the
    # pattern cannot be damaged by a re-encoding of this file.
    _BLOCK_GLYPHS = (
        "\u2584\u2588\u2580\u258c\u2590\u2592\u2591"
        "\u2502\u2551\u2554\u2557\u255a\u255d\u2550"
    )
    _BLOCK_RE = re.compile("[" + _BLOCK_GLYPHS + "]")
    # Spinner frames are removed only when adjacent to a backspace, so '-',
    # '/', '|' and '\' inside the model's answer are preserved.
    # NOTE(review): assumes the spinner is drawn as glyph + backspace (in
    # either order) -- confirm against real llama-cli output.
    _SPINNER_BEFORE_BS_RE = re.compile(r"[|/\\-]\x08")
    _SPINNER_AFTER_BS_RE = re.compile(r"\x08[|/\\-]")

    _PIPE_MARKER_RE = re.compile(r"<\|[^>]+\|>")
    _TURN_MARKER_RE = re.compile(r"<start_of_turn>|<end_of_turn>")
    _BARE_MARKER_RE = re.compile(
        r"eot_id|begin_of_text|start_header_id|end_header_id"
    )
    _ANY_TAG_RE = re.compile(r"<[^>]*>")
    _BARE_WORD_MARKER_RE = re.compile(
        r"\beot_id\b|\bbegin_of_text\b|\bstart_header_id\b"
        r"|\bend_header_id\b|\bim_start\b|\bim_end\b"
    )
    _ROLE_ECHO_RE = re.compile(r"^(User:|Assistant:|System:)\s*")
    _WHITESPACE_RE = re.compile(r"\s+")
    _GEN_TPS_RE = re.compile(r"Generation:\s*([\d.]+)\s*t/s")
    _PROMPT_TPS_RE = re.compile(r"Prompt:\s*([\d.]+)\s*t/s")

    # ------------------------------------------------------------------
    # HTTP entry points
    # ------------------------------------------------------------------
    def do_POST(self):
        """Handle ``POST /v1/chat/completions``.

        Replies 404 for any other path, 400 for a malformed request, 504
        when the adb call exceeds ``ADB_TIMEOUT_S`` and 500 for any other
        failure while running inference.
        """
        if self.path != "/v1/chat/completions":
            self.send_error(404)
            return

        try:
            content_length = int(self.headers.get("Content-Length", 0))
        except ValueError:
            self.send_error(400, "Invalid Content-Length")
            return
        # A negative length would make read() block until the peer closes.
        body = self.rfile.read(max(content_length, 0))

        try:
            req = json.loads(body)
        except ValueError:  # JSONDecodeError and UnicodeDecodeError
            self.send_error(400, "Invalid JSON")
            return
        if not isinstance(req, dict):
            self.send_error(400, "Invalid JSON")
            return

        messages = req.get("messages", [])
        if not isinstance(messages, list) or not all(
            isinstance(m, dict) for m in messages
        ):
            self.send_error(400, "Invalid messages")
            return

        raw_max_tokens = req.get("max_tokens")
        if raw_max_tokens is None:
            raw_max_tokens = self.DEFAULT_MAX_TOKENS
        try:
            # Forced to int: the value is interpolated into a shell command.
            max_tokens = int(raw_max_tokens)
        except (TypeError, ValueError, OverflowError):
            self.send_error(400, "Invalid max_tokens")
            return

        prompt = self._plain_prompt(messages)

        try:
            output = self._run_llama(prompt, max_tokens, quiet=True)
            gen_tps = self._tps(self._GEN_TPS_RE, output)
            prompt_tps = self._tps(self._PROMPT_TPS_RE, output)
            generated_text = self._extract_response(output)

            if len(generated_text) < self.MIN_RESPONSE_CHARS:
                # Nothing usable survived the cleanup: retry with the bare
                # content of the last message and without --log-disable.
                raw_prompt = (
                    self._content_text(messages[-1]) if messages else "Hello"
                )
                retry_output = self._run_llama(
                    raw_prompt, max_tokens, quiet=False
                )
                # Keep the first answer if the retry produced nothing.
                generated_text = (
                    self._extract_raw(retry_output) or generated_text
                )
                if len(generated_text) > 10:
                    generated_text = self._strip_prompt_echo(
                        generated_text, messages
                    )
        except subprocess.TimeoutExpired:
            self.send_error(504, "Inference timed out")
            return
        except Exception as exc:  # request boundary: report, do not crash
            # The detail goes into the response body; arbitrary exception
            # text is not safe inside the HTTP status line.
            self.send_error(500, "Inference failed", str(exc)[:200])
            return

        # Rough estimate: about four characters per token.
        prompt_tokens = len(prompt) // 4
        completion_tokens = len(generated_text) // 4

        now = int(time.time())
        self._send_json(200, {
            "id": f"chatcmpl-{now}",
            "object": "chat.completion",
            "created": now,
            "model": req.get("model", "dispatchAI"),
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": generated_text},
                "finish_reason": "stop",
            }],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
            "phone_info": {
                "serial": PHONE_SERIAL,
                "generation_tps": gen_tps,
                "prompt_tps": prompt_tps,
            },
        })

    def do_GET(self):
        """Handle ``GET /health``; every other path gets a 404."""
        if self.path == "/health":
            self._send_json(200, {
                "status": "ok", "phone": PHONE_SERIAL, "port": PORT
            })
        else:
            self.send_error(404)

    def build_prompt(self, messages, fmt):
        """Render *messages* with a chat template and open an assistant turn.

        *fmt* selects ``"llama-3"`` or ``"gemma"``; any other value uses
        ChatML.  The header that opens the assistant turn is appended exactly
        once, after all messages.  Roles a template has no branch for are
        skipped for that template.
        """
        parts = []
        for msg in messages:
            role = msg.get("role", "user")
            content = msg.get("content", "")
            if fmt == "llama-3":
                if role == "system":
                    parts.append(
                        "<|begin_of_text|><|start_header_id|>system"
                        f"<|end_header_id|>\n{content}<|eot_id|>"
                    )
                elif role in ("user", "assistant"):
                    parts.append(
                        f"<|start_header_id|>{role}<|end_header_id|>"
                        f"\n{content}<|eot_id|>"
                    )
            elif fmt == "gemma":
                if role == "user":
                    parts.append(
                        f"<start_of_turn>user\n{content}<end_of_turn>\n"
                    )
                elif role == "assistant":
                    parts.append(
                        f"<start_of_turn>model\n{content}<end_of_turn>\n"
                    )
            else:  # chatml
                parts.append(f"<|im_start|>{role}\n{content}<|im_end|>\n")

        if fmt == "llama-3":
            parts.append("<|start_header_id|>assistant<|end_header_id|>\n")
        elif fmt == "gemma":
            parts.append("<start_of_turn>model\n")
        else:
            parts.append("<|im_start|>assistant\n")
        return "".join(parts)

    def log_message(self, format, *args):
        """Suppress the base class's per-request logging."""
        pass

    # ------------------------------------------------------------------
    # Request helpers
    # ------------------------------------------------------------------
    def _send_json(self, status, payload):
        """Send *payload* as a JSON response with the given status code."""
        body = json.dumps(payload).encode()
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    @staticmethod
    def _content_text(msg):
        """Return the textual content of one message dict, or ``""``.

        A list-valued ``content`` is flattened by joining the string
        ``text`` fields of its dict items.
        NOTE(review): that list shape is assumed from OpenAI-style clients;
        nothing in this file produces it.
        """
        content = msg.get("content", "")
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            return "".join(
                part["text"] for part in content
                if isinstance(part, dict)
                and isinstance(part.get("text"), str)
            )
        return ""

    @classmethod
    def _plain_prompt(cls, messages):
        """Build the template-free prompt passed to llama-cli.

        Uses the most recent user message (``"Hello"`` when there is none).
        With more than one message the prompt is wrapped as
        ``User: ...\\nAssistant:``, preceded by the first system message if
        one exists; a single message is used verbatim.
        """
        user_message = ""
        for msg in reversed(messages):
            if msg.get("role") == "user":
                user_message = cls._content_text(msg)
                break
        if not user_message:
            user_message = "Hello"

        if len(messages) <= 1:
            return user_message

        system_msg = ""
        for msg in messages:
            if msg.get("role") == "system":
                system_msg = cls._content_text(msg)
                break
        if system_msg:
            return f"{system_msg}\n\nUser: {user_message}\nAssistant:"
        return f"User: {user_message}\nAssistant:"

    @staticmethod
    def _llama_command(prompt, max_tokens, model_path, quiet=True):
        """Return the shell command line executed on the device.

        *prompt* and *model_path* are quoted with ``shlex.quote`` and
        *max_tokens* is forced to ``int``, so request data cannot break out
        of its argument.  ``quiet`` adds ``--log-disable``.
        """
        import shlex  # local import: not among the file's top-level imports

        parts = [
            "cd /data/local/tmp && LD_LIBRARY_PATH=/data/local/tmp",
            "timeout 60 ./llama-cli",
            f"-m {shlex.quote(model_path)}",
            f"-p {shlex.quote(prompt)}",
            f"-n {int(max_tokens)}",
            "-t 4",
            "-st",
            "--no-display-prompt",
        ]
        if quiet:
            parts.append("--log-disable")
        parts.append("2>&1")
        return " ".join(parts)

    def _run_llama(self, prompt, max_tokens, quiet):
        """Run llama-cli on the phone and return its combined output.

        Raises ``subprocess.TimeoutExpired`` when adb does not finish within
        ``ADB_TIMEOUT_S`` seconds.
        """
        cmd = self._llama_command(prompt, max_tokens, MODEL_PATH, quiet=quiet)
        result = subprocess.run(
            ["adb", "-s", PHONE_SERIAL, "shell", cmd],
            env=ENV,
            capture_output=True,
            # Decode explicitly instead of relying on the host locale, which
            # could garble or reject non-ASCII model output.
            encoding="utf-8",
            errors="replace",
            timeout=self.ADB_TIMEOUT_S,
        )
        return result.stdout + result.stderr

    # ------------------------------------------------------------------
    # Output parsing
    # ------------------------------------------------------------------
    @staticmethod
    def _tps(pattern, output):
        """Return the tokens/second figure matched by *pattern*, else 0."""
        match = pattern.search(output)
        if not match:
            return 0
        try:
            return float(match.group(1))
        except ValueError:  # e.g. the pattern matched a lone "."
            return 0

    @classmethod
    def _strip_animation(cls, line):
        """Remove spinner frames, backspaces and block glyphs from *line*."""
        text = cls._SPINNER_BEFORE_BS_RE.sub("", line)
        text = cls._SPINNER_AFTER_BS_RE.sub("", text)
        text = text.replace("\b", "")
        return cls._BLOCK_RE.sub("", text)

    @staticmethod
    def _is_noise(line):
        """Return True for loader/banner/help lines that are not model text.

        These are substring heuristics: a genuine answer line that happens to
        match one of them (for example one containing both "model" and ":")
        is dropped as well.
        """
        stripped = line.strip()
        return (
            not stripped
            or stripped == ">"
            or stripped.startswith("/")
            or "Loading model" in line
            or "llama_context" in line
            or "llama_kv_cache" in line
            or ("build" in line and ":" in line)
            or ("model" in line and ":" in line and "dispatchAI" not in line)
            or "modalities" in line
            or "available commands" in line
        )

    @classmethod
    def _clean_line(cls, line):
        """Strip animation residue and template markers from one line."""
        text = cls._strip_animation(line)
        text = cls._PIPE_MARKER_RE.sub("", text)
        text = cls._TURN_MARKER_RE.sub("", text)
        text = cls._BARE_MARKER_RE.sub("", text)
        text = text.strip(" <>|")
        return text.lstrip("> ").strip()

    @classmethod
    def _extract_response(cls, output):
        """Return the model's answer from a ``--log-disable`` run.

        Lines are consumed up to the first stats/exit line, noise lines are
        skipped, the remainder is cleaned, joined with spaces and finally
        scrubbed of any remaining ``<...>`` content, bare template marker
        words, a leading role label and repeated whitespace.
        """
        kept = []
        for line in output.split("\n"):
            if any(marker in line for marker in cls._STATS_MARKERS):
                break
            if cls._is_noise(line):
                continue
            cleaned = cls._clean_line(line)
            if cleaned:
                kept.append(cleaned)

        text = " ".join(kept).strip()
        # Deliberately aggressive: drops *everything* between angle brackets.
        text = cls._ANY_TAG_RE.sub("", text)
        text = cls._BARE_WORD_MARKER_RE.sub("", text)
        text = cls._ROLE_ECHO_RE.sub("", text)
        return cls._WHITESPACE_RE.sub(" ", text).strip()

    @classmethod
    def _extract_raw(cls, output):
        """Return the answer from the retry run (no ``--log-disable``).

        Uses coarser line filters than ``_extract_response`` and does not
        remove template markers.
        """
        kept = []
        for line in output.split("\n"):
            if any(marker in line for marker in cls._STATS_MARKERS):
                break
            if "Loading" in line or "llama_" in line or "build" in line:
                continue
            if "available commands" in line or line.strip().startswith("/"):
                continue
            cleaned = cls._strip_animation(line).strip()
            if len(cleaned) > 1 and cleaned != ">":
                kept.append(cleaned)
        return " ".join(kept).strip()

    @classmethod
    def _strip_prompt_echo(cls, text, messages):
        """Drop everything up to the last echo of a message found in *text*.

        Messages are checked newest first; the first one whose content
        occurs in *text* decides.  *text* is returned unchanged when nothing
        matches or when nothing follows the echo.
        """
        for msg in reversed(messages):
            content = cls._content_text(msg)
            if content and content in text:
                after = text[text.rfind(content) + len(content):].strip()
                return after or text
        return text
296
+
297
def adb_device_ready(devices_output, serial):
    """Return True if *devices_output* lists *serial* in state ``device``.

    *devices_output* is the text printed by ``adb devices``.  The serial must
    match a whole first column, so a prefix of another serial, or a phone
    reported in any other state (e.g. ``offline``), does not count.
    """
    for line in devices_output.splitlines():
        fields = line.split()
        if len(fields) >= 2 and fields[0] == serial and fields[1] == "device":
            return True
    return False


def main():
    """Check that the configured phone is attached, then serve until Ctrl-C.

    Exits with status 1 when no serial was given, when ``adb devices`` cannot
    be run, or when the phone is not listed as ready.
    """
    if not PHONE_SERIAL:
        print("Usage: python phone_proxy_v2.py <serial> [port]")
        sys.exit(1)

    # Check phone
    try:
        result = subprocess.run(
            ["adb", "devices"],
            capture_output=True, text=True, timeout=10, env=ENV,
        )
    except (OSError, subprocess.TimeoutExpired) as exc:
        print(f"Could not run adb: {exc}")
        sys.exit(1)
    if not adb_device_ready(result.stdout, PHONE_SERIAL):
        print(f"Phone {PHONE_SERIAL} not found")
        sys.exit(1)

    print(f"Phone proxy v2 on port {PORT} for {PHONE_SERIAL}")
    # NOTE(review): binds every interface and the handler does no
    # authentication -- anyone who can reach the port can run inference on
    # the phone.  Consider "127.0.0.1" unless remote access is required.
    server = HTTPServer(("0.0.0.0", PORT), PhoneHandler)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        pass
    finally:
        # serve_forever() has already returned here; release the socket.
        server.server_close()


if __name__ == "__main__":
    main()