""" dispatchAI Phone Proxy v2 — Clean output extraction Instead of parsing the messy llama-cli output, we use --log-disable and capture only the response text between known markers. """ import os import sys import json import subprocess import re import time from http.server import HTTPServer, BaseHTTPRequestHandler PHONE_SERIAL = sys.argv[1] if len(sys.argv) > 1 else "" PORT = int(sys.argv[2]) if len(sys.argv) > 2 else 5000 LLAMA_CLI = "/data/local/tmp/llama-cli" MODEL_PATH = "/data/local/tmp/model.gguf" ENV = os.environ.copy() ENV["MSYS_NO_PATHCONV"] = "1" class PhoneHandler(BaseHTTPRequestHandler): def do_POST(self): if self.path != "/v1/chat/completions": self.send_error(404) return content_length = int(self.headers.get("Content-Length", 0)) body = self.rfile.read(content_length) try: req = json.loads(body) except: self.send_error(400, "Invalid JSON") return messages = req.get("messages", []) max_tokens = req.get("max_tokens", 100) temperature = req.get("temperature", 0.7) chat_format = req.get("chat_format", "chatml") # Build prompt — use SIMPLE raw completion (no chat template) # This avoids template markers being echoed in output # Just use the user's last message as the prompt user_message = "" for msg in reversed(messages): if msg.get("role") == "user": user_message = msg.get("content", "") break if not user_message: user_message = "Hello" # For chat models, prefix with a natural prompt if len(messages) > 1: # Multi-turn: include system + user system_msg = "" for msg in messages: if msg.get("role") == "system": system_msg = msg.get("content", "") break if system_msg: prompt = f"{system_msg}\n\nUser: {user_message}\nAssistant:" else: prompt = f"User: {user_message}\nAssistant:" else: # Single turn — just use the message directly # For SmolLM2/Llama, raw continuation works: "The capital of France is" # For chat-style, use "User: ... Assistant:" prompt = user_message escaped_prompt = prompt.replace("'", "'\\''").replace("\\", "\\\\") # Run llama-cli with prompt from stdin to avoid escaping issues # Actually, let's use a simpler approach: write prompt to file, then use -p file # But llama-cli doesn't support file input. Let's use a different approach. # Use the -p flag but with careful escaping cmd = ( f'cd /data/local/tmp && LD_LIBRARY_PATH=/data/local/tmp ' f'timeout 60 ./llama-cli ' f'-m {MODEL_PATH} ' f'-p \'{escaped_prompt}\' ' f'-n {max_tokens} ' f'-t 4 ' f'-st ' f'--no-display-prompt ' f'--log-disable 2>&1' ) try: result = subprocess.run( ["adb", "-s", PHONE_SERIAL, "shell", cmd], env=ENV, capture_output=True, text=True, timeout=90 ) output = result.stdout + result.stderr # With --log-disable and --no-display-prompt, the output should be cleaner # But there may still be some noise. Extract just the response. # Parse speed gen_match = re.search(r'Generation:\s*([\d.]+)\s*t/s', output) prompt_match = re.search(r'Prompt:\s*([\d.]+)\s*t/s', output) gen_tps = float(gen_match.group(1)) if gen_match else 0 prompt_tps = float(prompt_match.group(1)) if prompt_match else 0 # Extract response: everything between the prompt marker and the stats line # With --no-display-prompt, the response starts right after loading lines = output.split('\n') response_lines = [] in_response = False for line in lines: # Stop at stats if 't/s' in line or 'Exiting' in line: break # Skip loading animation if 'Loading model' in line: continue if 'llama_context' in line: continue if 'llama_kv_cache' in line: continue if 'build' in line and ':' in line: continue if 'model' in line and ':' in line and 'dispatchAI' not in line: continue if 'modalities' in line: continue if 'available commands' in line: continue if line.strip().startswith('/'): continue if line.strip() == '>': continue if not line.strip(): continue # Clean the line clean = line # Remove spinner characters clean = re.sub(r'[|/\\\-]', '', clean) # Remove backspace clean = clean.replace('\b', '') # Remove block characters (loading animation) clean = re.sub(r'[▄█▀▌▐▒░│║╔╗╚╝═]', '', clean) # Remove prompt template markers clean = re.sub(r'<\|[^>]+\|>', '', clean) clean = re.sub(r'|', '', clean) clean = re.sub(r'eot_id|begin_of_text|start_header_id|end_header_id', '', clean) # Remove leading/trailing whitespace and special chars clean = clean.strip(' <>|') # Remove leading > if present clean = clean.lstrip('> ').strip() if clean and len(clean) > 0: response_lines.append(clean) generated_text = ' '.join(response_lines).strip() # AGGRESSIVE final cleanup — strip ALL template markers from final text # The model outputs markers like , , etc (without pipes) import re as re_final # Remove ALL angle-bracket content (catches , <|eot_id|>, etc) generated_text = re_final.sub(r'<[^>]*>', '', generated_text) # Also remove bare markers without brackets generated_text = re_final.sub(r'\beot_id\b|\bbegin_of_text\b|\bstart_header_id\b|\bend_header_id\b|\bim_start\b|\bim_end\b', '', generated_text) # Remove "User:" and "Assistant:" echoes generated_text = re_final.sub(r'^(User:|Assistant:|System:)\s*', '', generated_text) # Collapse whitespace generated_text = re_final.sub(r'\s+', ' ', generated_text).strip() # If the output is just template markers (empty after cleanup), # try raw completion approach if len(generated_text) < 5: # The prompt template approach failed — try raw text completion raw_prompt = messages[-1].get("content", "") if messages else "Hello" cmd_raw = ( f'cd /data/local/tmp && LD_LIBRARY_PATH=/data/local/tmp ' f'timeout 60 ./llama-cli ' f'-m {MODEL_PATH} ' f'-p \'{raw_prompt.replace(chr(39), chr(92)+chr(39))}\' ' f'-n {max_tokens} ' f'-t 4 ' f'-st ' f'--no-display-prompt 2>&1' ) result2 = subprocess.run( ["adb", "-s", PHONE_SERIAL, "shell", cmd_raw], env=ENV, capture_output=True, text=True, timeout=90 ) output2 = result2.stdout + result2.stderr # For raw completion, take everything before the stats line lines2 = output2.split('\n') raw_lines = [] for line in lines2: if 't/s' in line or 'Exiting' in line: break if 'Loading' in line or 'llama_' in line or 'build' in line: continue if 'available commands' in line or line.strip().startswith('/'): continue clean2 = re.sub(r'[|/\\\-▄█▀▌▐▒░│║╔╗╚╝═]', '', line).replace('\b', '').strip() if clean2 and len(clean2) > 1 and clean2 != '>': raw_lines.append(clean2) generated_text = ' '.join(raw_lines).strip() # If still has noise, try extracting just the part after the last prompt echo if len(generated_text) > 10: # Find the last occurrence of the user's message and take everything after for msg in reversed(messages): content = msg.get("content", "") if content and content in generated_text: idx = generated_text.rfind(content) after = generated_text[idx + len(content):].strip() if after: generated_text = after break # Estimate tokens prompt_tokens = len(prompt) // 4 completion_tokens = len(generated_text) // 4 total_tokens = prompt_tokens + completion_tokens response = { "id": f"chatcmpl-{int(time.time())}", "object": "chat.completion", "created": int(time.time()), "model": req.get("model", "dispatchAI"), "choices": [{ "index": 0, "message": {"role": "assistant", "content": generated_text}, "finish_reason": "stop", }], "usage": { "prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, "total_tokens": total_tokens, }, "phone_info": { "serial": PHONE_SERIAL, "generation_tps": gen_tps, "prompt_tps": prompt_tps, }, } self.send_response(200) self.send_header("Content-Type", "application/json") self.end_headers() self.wfile.write(json.dumps(response).encode()) except subprocess.TimeoutExpired: self.send_error(504, "Inference timed out") except Exception as e: self.send_error(500, str(e)[:200]) def do_GET(self): if self.path == "/health": self.send_response(200) self.send_header("Content-Type", "application/json") self.end_headers() self.wfile.write(json.dumps({ "status": "ok", "phone": PHONE_SERIAL, "port": PORT }).encode()) else: self.send_error(404) def build_prompt(self, messages, fmt): prompt = "" for msg in messages: role = msg.get("role", "user") content = msg.get("content", "") if fmt == "llama-3": if role == "system": prompt += f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{content}<|eot_id|>" elif role == "user": prompt += f"<|start_header_id|>user<|end_header_id|>\n{content}<|eot_id|>" elif role == "assistant": prompt += f"<|start_header_id|>assistant<|end_header_id|>\n{content}<|eot_id|>" prompt += "<|start_header_id|>assistant<|end_header_id|>\n" elif fmt == "gemma": if role == "user": prompt += f"user\n{content}\n" elif role == "assistant": prompt += f"model\n{content}\n" prompt += "model\n" else: # chatml prompt += f"<|im_start|>{role}\n{content}<|im_end|>\n" prompt += "<|im_start|>assistant\n" return prompt def log_message(self, format, *args): pass # Suppress logs if __name__ == "__main__": if not PHONE_SERIAL: print("Usage: python phone_proxy_v2.py [port]") sys.exit(1) # Check phone result = subprocess.run(["adb", "devices"], capture_output=True, text=True, timeout=10, env=ENV) if PHONE_SERIAL not in result.stdout: print(f"Phone {PHONE_SERIAL} not found") sys.exit(1) print(f"Phone proxy v2 on port {PORT} for {PHONE_SERIAL}") server = HTTPServer(("0.0.0.0", PORT), PhoneHandler) try: server.serve_forever() except KeyboardInterrupt: server.shutdown()