"""
dispatchAI Phone Proxy v2 — Clean output extraction
Instead of parsing the messy llama-cli output, we use --log-disable
and capture only the response text between known markers.
"""
import json
import os
import re
import shlex
import subprocess
import sys
import time
from http.server import HTTPServer, BaseHTTPRequestHandler


| PHONE_SERIAL = sys.argv[1] if len(sys.argv) > 1 else "" |
| PORT = int(sys.argv[2]) if len(sys.argv) > 2 else 5000 |
| LLAMA_CLI = "/data/local/tmp/llama-cli" |
| MODEL_PATH = "/data/local/tmp/model.gguf" |
| ENV = os.environ.copy() |
| ENV["MSYS_NO_PATHCONV"] = "1" |
|
|
class PhoneHandler(BaseHTTPRequestHandler):
    """Serve an OpenAI-style chat completion endpoint backed by llama-cli on a phone.

    ``POST /v1/chat/completions`` flattens the request's messages into a
    plain-text prompt, runs ``./llama-cli`` on the device ``PHONE_SERIAL``
    through ``adb shell``, scrubs the captured console output and returns it
    as a ``chat.completion`` JSON document.  ``GET /health`` returns a small
    status document.

    The request fields ``temperature`` and ``chat_format`` are accepted but
    not used, and ``build_prompt`` is not called by ``do_POST``.
    """

    # Substrings that mark a line as llama-cli banner/log output, never reply text.
    _BANNER_MARKERS = (
        "Loading model",
        "llama_context",
        "llama_kv_cache",
        "modalities",
        "available commands",
    )

    # Throughput figures printed by llama-cli after generation.
    _GEN_TPS_RE = re.compile(r'Generation:\s*([\d.]+)\s*t/s')
    _PROMPT_TPS_RE = re.compile(r'Prompt:\s*([\d.]+)\s*t/s')

    # NOTE(review): this strips every pipe, slash, backslash and hyphen from the
    # reply (presumably to remove loading-spinner frames), which also mangles
    # legitimate text such as "well-known" or URLs.  Left as-is because the
    # exact spinner output format cannot be confirmed from this file.
    _SPINNER_RE = re.compile(r'[|/\\\-]')
    # NOTE(review): this character class looks mis-encoded in the source (one
    # glyph repeated); presumably it listed the block-drawing glyphs of the
    # llama-cli banner.  Kept byte-identical until the original set is confirmed.
    _BLOCK_GLYPH_RE = re.compile(r'[ββββββββββββββ]')
    _FALLBACK_NOISE_RE = re.compile(r'[|/\\\-ββββββββββββββ]')

    # Chat-template control tokens that can leak into the output.
    _CHAT_TOKEN_RE = re.compile(r'<\|[^>]+\|>')
    _TURN_TOKEN_RE = re.compile(r'<start_of_turn>|<end_of_turn>')
    _TOKEN_NAME_RE = re.compile(r'eot_id|begin_of_text|start_header_id|end_header_id')
    _TAG_RE = re.compile(r'<[^>]*>')
    _TOKEN_WORD_RE = re.compile(
        r'\beot_id\b|\bbegin_of_text\b|\bstart_header_id\b|\bend_header_id\b|\bim_start\b|\bim_end\b'
    )
    _ROLE_PREFIX_RE = re.compile(r'^(User:|Assistant:|System:)\s*')
    _WHITESPACE_RE = re.compile(r'\s+')

    def do_POST(self):
        """Handle ``POST /v1/chat/completions``.

        Responds 404 for any other path, 400 for a malformed request, 504 when
        the adb call times out, 500 for any other failure while running or
        parsing llama-cli, and 200 with a ``chat.completion`` document otherwise.
        """
        if self.path != "/v1/chat/completions":
            self.send_error(404)
            return

        req = self._read_json_object()
        if req is None:
            return  # a 400 response has already been sent

        messages = req.get("messages", [])
        if not isinstance(messages, list):
            self.send_error(400, "messages must be a list")
            return

        # max_tokens ends up inside a shell command line, so it must be a
        # plain integer rather than whatever the client sent.
        raw_max_tokens = req.get("max_tokens")
        try:
            max_tokens = 100 if raw_max_tokens is None else int(raw_max_tokens)
        except (TypeError, ValueError, OverflowError):
            self.send_error(400, "max_tokens must be an integer")
            return

        prompt = self._compose_prompt(messages)

        try:
            output = self._run_llama(prompt, max_tokens, log_disable=True)
            gen_tps, prompt_tps = self._parse_tps(output)
            generated_text = self._clean_output(output)

            if len(generated_text) < 5:
                # Almost nothing survived the scrub: retry with the raw content
                # of the last message and without --log-disable, then apply the
                # more lenient scrub.  Throughput figures stay from the first run.
                raw_prompt = self._message_text(messages[-1]) if messages else "Hello"
                retry_output = self._run_llama(raw_prompt, max_tokens, log_disable=False)
                generated_text = self._clean_fallback_output(retry_output)

            generated_text = self._strip_echo(generated_text, messages)
        except subprocess.TimeoutExpired:
            self.send_error(504, "Inference timed out")
            return
        except Exception as e:
            # The detail goes into the response body, not the status line: the
            # status line is latin-1 only and must not contain line breaks.
            self.send_error(500, "Inference failed", str(e)[:200])
            return

        # Rough estimate: about four characters per token.
        prompt_tokens = len(prompt) // 4
        completion_tokens = len(generated_text) // 4
        created = int(time.time())

        self._send_json(200, {
            "id": f"chatcmpl-{created}",
            "object": "chat.completion",
            "created": created,
            "model": req.get("model", "dispatchAI"),
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": generated_text},
                "finish_reason": "stop",
            }],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
            "phone_info": {
                "serial": PHONE_SERIAL,
                "generation_tps": gen_tps,
                "prompt_tps": prompt_tps,
            },
        })

    def do_GET(self):
        """Answer ``GET /health`` with a status document; 404 for any other path."""
        if self.path != "/health":
            self.send_error(404)
            return
        self._send_json(200, {"status": "ok", "phone": PHONE_SERIAL, "port": PORT})

    def build_prompt(self, messages, fmt):
        """Render *messages* with a chat template and return the prompt string.

        ``fmt`` selects the template: ``"llama-3"``, ``"gemma"``, or anything
        else for ChatML.  Each message contributes one turn; the header that
        asks the model for the assistant reply is appended exactly once, after
        all turns.  The gemma template only renders user and assistant
        messages, the llama-3 template only system, user and assistant ones.
        """
        parts = []
        for msg in messages:
            role = msg.get("role", "user")
            content = msg.get("content", "")
            if fmt == "llama-3":
                if role == "system":
                    parts.append(
                        f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{content}<|eot_id|>"
                    )
                elif role in ("user", "assistant"):
                    parts.append(f"<|start_header_id|>{role}<|end_header_id|>\n{content}<|eot_id|>")
            elif fmt == "gemma":
                if role == "user":
                    parts.append(f"<start_of_turn>user\n{content}<end_of_turn>\n")
                elif role == "assistant":
                    parts.append(f"<start_of_turn>model\n{content}<end_of_turn>\n")
            else:
                parts.append(f"<|im_start|>{role}\n{content}<|im_end|>\n")

        if fmt == "llama-3":
            parts.append("<|start_header_id|>assistant<|end_header_id|>\n")
        elif fmt == "gemma":
            parts.append("<start_of_turn>model\n")
        else:
            parts.append("<|im_start|>assistant\n")
        return "".join(parts)

    def log_message(self, format, *args):
        """Discard the per-request log line the base class would emit."""
        pass

    def _read_json_object(self):
        """Read the request body and return it as a dict.

        Sends a 400 response and returns ``None`` when the Content-Length
        header is not a number or the body is not a JSON object.
        """
        try:
            length = int(self.headers.get("Content-Length", 0))
        except ValueError:
            self.send_error(400, "Invalid Content-Length")
            return None

        body = self.rfile.read(length) if length > 0 else b""
        try:
            req = json.loads(body)
        except ValueError:  # covers JSONDecodeError and UnicodeDecodeError
            self.send_error(400, "Invalid JSON")
            return None

        if not isinstance(req, dict):
            self.send_error(400, "JSON body must be an object")
            return None
        return req

    def _send_json(self, status, payload):
        """Send *payload* as a JSON response with the given status code."""
        body = json.dumps(payload).encode()
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def _run_llama(self, prompt, max_tokens, log_disable):
        """Run llama-cli on the phone and return its stdout followed by stderr.

        Raises ``subprocess.TimeoutExpired`` when adb does not finish within
        90 seconds.
        """
        result = subprocess.run(
            ["adb", "-s", PHONE_SERIAL, "shell",
             self._build_command(prompt, max_tokens, log_disable)],
            env=ENV,
            capture_output=True,
            # Decode explicitly as UTF-8 instead of the locale codec, and never
            # fail the request over an undecodable byte.
            encoding="utf-8",
            errors="replace",
            timeout=90,
        )
        return result.stdout + result.stderr

    @staticmethod
    def _build_command(prompt, max_tokens, log_disable=True, model_path=None):
        """Return the shell command line that runs llama-cli on the device.

        *prompt* is quoted with ``shlex.quote`` so no character in it can end
        the argument or start another command.  *model_path* defaults to the
        module-level ``MODEL_PATH``.  ``--log-disable`` is only passed when
        *log_disable* is true.
        """
        model = MODEL_PATH if model_path is None else model_path
        log_flag = "--log-disable " if log_disable else ""
        return (
            "cd /data/local/tmp && LD_LIBRARY_PATH=/data/local/tmp "
            "timeout 60 ./llama-cli "
            f"-m {shlex.quote(model)} "
            f"-p {shlex.quote(prompt)} "
            f"-n {int(max_tokens)} "
            "-t 4 "
            "-st "
            "--no-display-prompt "
            f"{log_flag}2>&1"
        )

    @staticmethod
    def _message_text(msg):
        """Return the textual content of one chat message, or ``""``.

        String content is returned unchanged.  For a list of parts the
        ``"text"`` values of the dict parts are concatenated.  Anything else
        (including a message that is not a dict) yields an empty string.
        """
        if not isinstance(msg, dict):
            return ""
        content = msg.get("content", "")
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            return "".join(
                part["text"]
                for part in content
                if isinstance(part, dict) and isinstance(part.get("text"), str)
            )
        return ""

    @classmethod
    def _compose_prompt(cls, messages):
        """Flatten *messages* into the plain-text prompt given to llama-cli.

        Uses the most recent user message (``"Hello"`` when there is none or it
        is empty).  A single-message conversation is passed through verbatim;
        longer ones become ``User: ...`` followed by ``Assistant:``, prefixed
        by the first system message when it has content.
        """
        user_message = ""
        for msg in reversed(messages):
            if isinstance(msg, dict) and msg.get("role") == "user":
                user_message = cls._message_text(msg)
                break
        if not user_message:
            user_message = "Hello"

        if len(messages) <= 1:
            return user_message

        system_msg = ""
        for msg in messages:
            if isinstance(msg, dict) and msg.get("role") == "system":
                system_msg = cls._message_text(msg)
                break

        if system_msg:
            return f"{system_msg}\n\nUser: {user_message}\nAssistant:"
        return f"User: {user_message}\nAssistant:"

    @classmethod
    def _parse_tps(cls, output):
        """Return ``(generation_tps, prompt_tps)`` parsed from *output*; 0 when absent."""
        gen_match = cls._GEN_TPS_RE.search(output)
        prompt_match = cls._PROMPT_TPS_RE.search(output)
        gen_tps = float(gen_match.group(1)) if gen_match else 0
        prompt_tps = float(prompt_match.group(1)) if prompt_match else 0
        return gen_tps, prompt_tps

    @classmethod
    def _is_banner_line(cls, line):
        """Return True when *line* is llama-cli banner/prompt noise or blank."""
        if any(marker in line for marker in cls._BANNER_MARKERS):
            return True
        if ":" in line and ("build" in line or ("model" in line and "dispatchAI" not in line)):
            return True
        stripped = line.strip()
        return not stripped or stripped == ">" or stripped.startswith("/")

    @classmethod
    def _clean_output(cls, output):
        """Extract the reply text from raw llama-cli console output.

        Reading stops at the first line containing ``t/s`` or ``Exiting``.
        Banner lines are skipped, control tokens and spinner characters are
        removed, the remaining lines are joined with spaces, and a leading
        ``User:``/``Assistant:``/``System:`` label is dropped.
        """
        kept = []
        for line in output.split('\n'):
            if 't/s' in line or 'Exiting' in line:
                break
            if cls._is_banner_line(line):
                continue

            # Remove <|...|> tokens first: the spinner pass below deletes every
            # pipe, after which these tokens could no longer be recognised and
            # would leave fragments such as a stray "<" behind.
            clean = cls._CHAT_TOKEN_RE.sub('', line)
            clean = cls._SPINNER_RE.sub('', clean)
            clean = clean.replace('\b', '')
            clean = cls._BLOCK_GLYPH_RE.sub('', clean)
            clean = cls._TURN_TOKEN_RE.sub('', clean)
            clean = cls._TOKEN_NAME_RE.sub('', clean)
            clean = clean.strip(' <>|').lstrip('> ').strip()
            if clean:
                kept.append(clean)

        text = ' '.join(kept).strip()
        text = cls._TAG_RE.sub('', text)
        text = cls._TOKEN_WORD_RE.sub('', text)
        text = cls._ROLE_PREFIX_RE.sub('', text)
        return cls._WHITESPACE_RE.sub(' ', text).strip()

    @classmethod
    def _clean_fallback_output(cls, output):
        """Lenient scrub used for the retry run (no control-token removal)."""
        kept = []
        for line in output.split('\n'):
            if 't/s' in line or 'Exiting' in line:
                break
            if 'Loading' in line or 'llama_' in line or 'build' in line:
                continue
            if 'available commands' in line or line.strip().startswith('/'):
                continue
            clean = cls._FALLBACK_NOISE_RE.sub('', line).replace('\b', '').strip()
            # Single characters (including a lone ">") are treated as noise.
            if len(clean) > 1:
                kept.append(clean)
        return ' '.join(kept).strip()

    @classmethod
    def _strip_echo(cls, text, messages):
        """Drop an echoed copy of a request message from the start of *text*.

        Only texts longer than 10 characters are examined.  Messages are
        checked newest first; for the first one found inside *text*, whatever
        follows its last occurrence replaces *text* unless that remainder is
        empty.
        """
        if len(text) <= 10:
            return text
        for msg in reversed(messages):
            content = cls._message_text(msg)
            if content and content in text:
                remainder = text[text.rfind(content) + len(content):].strip()
                return remainder or text
        return text


if __name__ == "__main__":
    # Script entry point: check that the requested phone is attached, then
    # serve the proxy until interrupted.
    if not PHONE_SERIAL:
        print("Usage: python phone_proxy_v2.py <serial> [port]")
        sys.exit(1)

    result = subprocess.run(["adb", "devices"], capture_output=True, text=True, timeout=10, env=ENV)
    # Compare against the first column of each output row instead of searching
    # the whole text, so a partial serial (or a word from the header line)
    # cannot be mistaken for an attached device.
    attached_serials = {
        fields[0]
        for fields in (row.split() for row in result.stdout.splitlines())
        if fields
    }
    if PHONE_SERIAL not in attached_serials:
        print(f"Phone {PHONE_SERIAL} not found")
        sys.exit(1)

    print(f"Phone proxy v2 on port {PORT} for {PHONE_SERIAL}")
    server = HTTPServer(("0.0.0.0", PORT), PhoneHandler)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        # serve_forever() has already returned here; shutdown() is meant to be
        # called from another thread, so only the socket needs releasing.
        pass
    finally:
        server.server_close()