# dispatchAI-API / api /phone_proxy_v2.py
# 3morixd's picture
# Upload api/phone_proxy_v2.py with huggingface_hub
# 0ad01db verified
# Raw
# History Blame Contribute Delete
# 13.4 kB
"""
dispatchAI Phone Proxy v2 — Clean output extraction
Instead of parsing the messy llama-cli output, we use --log-disable
and capture only the response text between known markers.
"""
import json
import os
import re
import shlex
import subprocess
import sys
import time
from http.server import HTTPServer, BaseHTTPRequestHandler
# adb serial of the target device, taken from the first command-line
# argument; empty string when the script is started without one.
PHONE_SERIAL = sys.argv[1] if len(sys.argv) > 1 else ""
# TCP port the proxy listens on: second command-line argument, default 5000.
PORT = int(sys.argv[2]) if len(sys.argv) > 2 else 5000
# On-device location of the llama-cli binary (not referenced elsewhere in the
# visible code, which runs ./llama-cli after cd-ing into /data/local/tmp).
LLAMA_CLI = "/data/local/tmp/llama-cli"
# On-device model file; interpolated into the llama-cli command line as -m.
MODEL_PATH = "/data/local/tmp/model.gguf"
# Environment passed to every adb subprocess: a copy of this process's
# environment with MSYS_NO_PATHCONV=1 added.
# NOTE(review): presumably this stops MSYS/Git Bash from rewriting the
# /data/... paths on Windows hosts - confirm.
ENV = os.environ.copy()
ENV["MSYS_NO_PATHCONV"] = "1"
class PhoneHandler(BaseHTTPRequestHandler):
    """Serve an OpenAI-style chat endpoint backed by llama-cli on a phone.

    ``POST /v1/chat/completions`` turns the request's messages into a
    plain-text prompt, runs ``llama-cli`` on the device selected by
    ``PHONE_SERIAL`` through ``adb shell``, scrubs loader/banner noise from
    the captured output and returns what is left as the assistant message.
    ``GET /health`` reports the configured serial and port.
    """

    # One loading-spinner frame: a | / - \ character next to the backspace
    # that erases it.  Anchoring on the backspace keeps ordinary hyphens,
    # slashes and pipes in the model's answer intact.
    # NOTE(review): assumes the spinner is drawn with backspaces - confirm
    # against the llama-cli build on the device.
    _SPINNER_FRAME_RE = re.compile(r'\x08[|/\\\-]|[|/\\\-]\x08')
    # Block / box-drawing glyphs used by the start-up banner, written as
    # escapes so the set survives any re-encoding of this file.
    _BLOCK_CHARS_RE = re.compile(
        '[\u2584\u2588\u2580\u258c\u2590\u2592\u2591'
        '\u2502\u2551\u2554\u2557\u255a\u255d\u2550]'
    )
    _PIPE_MARKER_RE = re.compile(r'<\|[^>]+\|>')
    _TURN_MARKER_RE = re.compile(r'<start_of_turn>|<end_of_turn>')
    _BARE_MARKER_RE = re.compile(
        r'eot_id|begin_of_text|start_header_id|end_header_id'
    )
    _ANY_TAG_RE = re.compile(r'<[^>]*>')
    _BARE_MARKER_WORD_RE = re.compile(
        r'\beot_id\b|\bbegin_of_text\b|\bstart_header_id\b'
        r'|\bend_header_id\b|\bim_start\b|\bim_end\b'
    )
    _ROLE_ECHO_RE = re.compile(r'^(User:|Assistant:|System:)\s*')
    _WHITESPACE_RE = re.compile(r'\s+')
    _GEN_TPS_RE = re.compile(r'Generation:\s*([\d.]+)\s*t/s')
    _PROMPT_TPS_RE = re.compile(r'Prompt:\s*([\d.]+)\s*t/s')

    # ------------------------------------------------------------------
    # HTTP entry points
    # ------------------------------------------------------------------
    def do_POST(self):
        """Handle ``POST /v1/chat/completions``.

        Responds 404 for any other path, 400 for a malformed body or
        parameters, 504 when the adb call times out, 500 for any other
        inference failure, and otherwise 200 with a ``chat.completion``
        JSON document.
        """
        if self.path != "/v1/chat/completions":
            self.send_error(404)
            return

        req = self._read_json_body()
        if req is None:
            return  # a 400 has already been sent

        messages = req.get("messages", [])
        if not isinstance(messages, list):
            self.send_error(400, "messages must be a list")
            return
        # Entries that are not objects cannot carry a role or content.
        messages = [m for m in messages if isinstance(m, dict)]

        # Both values end up on a shell command line, so they must be
        # real numbers before they get anywhere near it.
        max_tokens = self._coerce_max_tokens(req.get("max_tokens", 100))
        if max_tokens is None:
            self.send_error(400, "max_tokens must be a positive integer")
            return
        temperature = None
        if req.get("temperature") is not None:
            temperature = self._coerce_temperature(req["temperature"])
            if temperature is None:
                self.send_error(400, "temperature must be a non-negative number")
                return

        prompt = self._compose_prompt(messages)

        try:
            output = self._run_on_phone(
                self._build_llama_command(prompt, max_tokens, temperature)
            )
            generated_text = self._extract_response(output)

            # Nothing usable came back: retry as a raw completion of the
            # last message, this time without --log-disable.
            if len(generated_text) < 5:
                raw_prompt = (
                    self._message_text(messages[-1]) if messages else ""
                ) or "Hello"
                output = self._run_on_phone(
                    self._build_llama_command(
                        raw_prompt, max_tokens, temperature, log_disable=False
                    )
                )
                generated_text = self._strip_prompt_echo(
                    self._extract_raw_response(output), messages
                )
        except subprocess.TimeoutExpired:
            self.send_error(504, "Inference timed out")
            return
        except Exception as e:
            # Keep arbitrary exception text out of the HTTP status line; it
            # goes into the (escaped) explanation body instead.
            self.send_error(500, "Inference failed", str(e)[:200])
            return

        # Rough estimate: about four characters per token.
        prompt_tokens = len(prompt) // 4
        completion_tokens = len(generated_text) // 4
        now = int(time.time())

        self._send_json(200, {
            "id": f"chatcmpl-{now}",
            "object": "chat.completion",
            "created": now,
            "model": req.get("model", "dispatchAI"),
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": generated_text},
                "finish_reason": "stop",
            }],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
            "phone_info": {
                "serial": PHONE_SERIAL,
                "generation_tps": self._parse_rate(self._GEN_TPS_RE, output),
                "prompt_tps": self._parse_rate(self._PROMPT_TPS_RE, output),
            },
        })

    def do_GET(self):
        """Handle ``GET /health``; every other path gets a 404."""
        if self.path == "/health":
            self._send_json(200, {
                "status": "ok", "phone": PHONE_SERIAL, "port": PORT
            })
        else:
            self.send_error(404)

    # ------------------------------------------------------------------
    # Request helpers
    # ------------------------------------------------------------------
    def _read_json_body(self):
        """Return the request body as a dict, or send a 400 and return None."""
        try:
            length = int(self.headers.get("Content-Length", 0))
        except ValueError:
            length = -1
        if length < 0:
            # A negative length would make rfile.read() block until EOF.
            self.send_error(400, "Invalid Content-Length")
            return None
        try:
            req = json.loads(self.rfile.read(length))
        except ValueError:  # covers JSONDecodeError and UnicodeDecodeError
            self.send_error(400, "Invalid JSON")
            return None
        if not isinstance(req, dict):
            self.send_error(400, "Request body must be a JSON object")
            return None
        return req

    def _send_json(self, status, payload):
        """Write *payload* as a JSON response with the given status code."""
        body = json.dumps(payload).encode()
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    @staticmethod
    def _coerce_max_tokens(value):
        """Return *value* as a positive int, or None when it is not one."""
        if isinstance(value, bool):
            return None
        try:
            count = int(value)
        except (TypeError, ValueError, OverflowError):
            return None
        return count if count >= 1 else None

    @staticmethod
    def _coerce_temperature(value):
        """Return *value* as a finite float >= 0, or None when it is not one."""
        if isinstance(value, bool):
            return None
        try:
            temp = float(value)
        except (TypeError, ValueError, OverflowError):
            return None
        # The chained comparison is False for NaN and for infinity.
        return temp if 0.0 <= temp < float("inf") else None

    @staticmethod
    def _message_text(msg):
        """Return the message's ``content`` when it is a string, else ''."""
        content = msg.get("content", "")
        return content if isinstance(content, str) else ""

    @classmethod
    def _compose_prompt(cls, messages):
        """Build the plain-text prompt handed to llama-cli.

        No chat template is applied, so that template markers are not
        echoed back in the output.  The latest user message is used
        ("Hello" when there is none); when the conversation has more than
        one message it is wrapped as ``User: ... Assistant:``, preceded by
        the first system message if there is one.
        """
        user_message = next(
            (cls._message_text(m) for m in reversed(messages)
             if m.get("role") == "user"),
            "",
        ) or "Hello"
        if len(messages) <= 1:
            # Single turn: raw continuation of the user's text.
            return user_message
        system_msg = next(
            (cls._message_text(m) for m in messages
             if m.get("role") == "system"),
            "",
        )
        if system_msg:
            return f"{system_msg}\n\nUser: {user_message}\nAssistant:"
        return f"User: {user_message}\nAssistant:"

    # ------------------------------------------------------------------
    # Running llama-cli on the device
    # ------------------------------------------------------------------
    @staticmethod
    def _build_llama_command(prompt, max_tokens, temperature=None,
                             log_disable=True, model_path=None):
        """Return the shell command line executed on the device.

        *prompt* and the model path are quoted with ``shlex.quote`` so that
        no character in them can terminate the argument.  ``--temp`` is only
        added when *temperature* is given; ``--log-disable`` is dropped for
        the raw-completion retry.  *model_path* defaults to ``MODEL_PATH``.
        """
        if model_path is None:
            model_path = MODEL_PATH
        parts = [
            "cd /data/local/tmp && LD_LIBRARY_PATH=/data/local/tmp",
            "timeout 60 ./llama-cli",
            f"-m {shlex.quote(model_path)}",
            f"-p {shlex.quote(prompt)}",
            f"-n {int(max_tokens)}",
        ]
        if temperature is not None:
            parts.append(f"--temp {float(temperature)}")
        parts += ["-t 4", "-st", "--no-display-prompt"]
        if log_disable:
            parts.append("--log-disable")
        parts.append("2>&1")
        return " ".join(parts)

    def _run_on_phone(self, shell_cmd):
        """Run *shell_cmd* via ``adb shell`` and return stdout + stderr.

        Output is decoded as UTF-8 with replacement so that model text is
        not subject to the host's locale encoding.  Raises
        ``subprocess.TimeoutExpired`` after 90 seconds.
        """
        result = subprocess.run(
            ["adb", "-s", PHONE_SERIAL, "shell", shell_cmd],
            env=ENV, capture_output=True, timeout=90,
            encoding="utf-8", errors="replace",
        )
        return result.stdout + result.stderr

    # ------------------------------------------------------------------
    # Output cleaning
    # ------------------------------------------------------------------
    @classmethod
    def _strip_terminal_noise(cls, line):
        """Drop spinner frames, stray backspaces and banner glyphs."""
        line = cls._SPINNER_FRAME_RE.sub('', line).replace('\b', '')
        return cls._BLOCK_CHARS_RE.sub('', line)

    @staticmethod
    def _is_noise_line(line):
        """Return True for loader, banner, command-list and prompt lines.

        NOTE: these are substring heuristics; an answer line that contains
        e.g. "model" and ":" is dropped as well.
        """
        stripped = line.strip()
        if not stripped or stripped == '>' or stripped.startswith('/'):
            return True
        if any(tag in line for tag in (
                'Loading model', 'llama_context', 'llama_kv_cache',
                'modalities', 'available commands')):
            return True
        if ':' in line and (
                'build' in line
                or ('model' in line and 'dispatchAI' not in line)):
            return True
        return False

    @classmethod
    def _extract_response(cls, output):
        """Return the model's answer from captured llama-cli output.

        Reading stops at the first line containing ``t/s`` or ``Exiting``
        (the statistics / shutdown lines).  Noise lines are skipped, chat
        template markers are removed, and the surviving lines are joined
        with single spaces.
        """
        kept = []
        for line in output.split('\n'):
            if 't/s' in line or 'Exiting' in line:
                break
            line = cls._strip_terminal_noise(line)
            if cls._is_noise_line(line):
                continue
            # Markers are removed while their pipes are still present.
            text = cls._PIPE_MARKER_RE.sub('', line)
            text = cls._TURN_MARKER_RE.sub('', text)
            text = cls._BARE_MARKER_RE.sub('', text)
            text = text.strip(' <>|').lstrip('> ').strip()
            if text:
                kept.append(text)

        text = ' '.join(kept)
        # Aggressive final pass: drops ALL <...> spans, so legitimate
        # angle-bracket content in an answer is removed too.
        text = cls._ANY_TAG_RE.sub('', text)
        text = cls._BARE_MARKER_WORD_RE.sub('', text)
        # Normalise whitespace first so the ^-anchored role pattern can
        # see an echo that was preceded by a removed marker.
        text = cls._WHITESPACE_RE.sub(' ', text).strip()
        text = cls._ROLE_ECHO_RE.sub('', text)
        return text.strip()

    @classmethod
    def _extract_raw_response(cls, output):
        """Lighter cleaning used for the raw-completion retry."""
        kept = []
        for line in output.split('\n'):
            if 't/s' in line or 'Exiting' in line:
                break
            if 'Loading' in line or 'llama_' in line or 'build' in line:
                continue
            if 'available commands' in line or line.strip().startswith('/'):
                continue
            text = cls._strip_terminal_noise(line).strip()
            if len(text) > 1 and text != '>':
                kept.append(text)
        return ' '.join(kept).strip()

    @classmethod
    def _strip_prompt_echo(cls, text, messages):
        """Cut *text* down to what follows the last echoed message, if any.

        Messages are tried newest first; the first one whose content occurs
        in *text* decides.  *text* is returned unchanged when it is 10
        characters or shorter, when no message occurs in it, or when
        nothing follows the echo.
        """
        if len(text) <= 10:
            return text
        for msg in reversed(messages):
            content = cls._message_text(msg)
            if content and content in text:
                tail = text[text.rfind(content) + len(content):].strip()
                return tail or text
        return text

    @staticmethod
    def _parse_rate(pattern, output):
        """Return the tokens/second figure matched by *pattern*, or 0."""
        match = pattern.search(output)
        if not match:
            return 0
        try:
            return float(match.group(1))
        except ValueError:  # e.g. "1.2.3" satisfies [\d.]+ but is no float
            return 0

    # ------------------------------------------------------------------
    # Chat templates (not used by do_POST, which sends plain text)
    # ------------------------------------------------------------------
    def build_prompt(self, messages, fmt):
        """Render *messages* with a chat template and open the assistant turn.

        *fmt* is ``"llama-3"``, ``"gemma"`` or anything else for ChatML.
        The generation header is appended exactly once, after the last
        message.  Roles a template has no branch for are skipped (gemma:
        everything but user/assistant; llama-3: everything but
        system/user/assistant).
        """
        parts = []
        for msg in messages:
            role = msg.get("role", "user")
            content = msg.get("content", "")
            if fmt == "llama-3":
                if role == "system":
                    parts.append(
                        "<|begin_of_text|><|start_header_id|>system"
                        f"<|end_header_id|>\n{content}<|eot_id|>"
                    )
                elif role in ("user", "assistant"):
                    parts.append(
                        f"<|start_header_id|>{role}<|end_header_id|>"
                        f"\n{content}<|eot_id|>"
                    )
            elif fmt == "gemma":
                if role == "user":
                    parts.append(f"<start_of_turn>user\n{content}<end_of_turn>\n")
                elif role == "assistant":
                    parts.append(f"<start_of_turn>model\n{content}<end_of_turn>\n")
            else:  # chatml
                parts.append(f"<|im_start|>{role}\n{content}<|im_end|>\n")

        if fmt == "llama-3":
            parts.append("<|start_header_id|>assistant<|end_header_id|>\n")
        elif fmt == "gemma":
            parts.append("<start_of_turn>model\n")
        else:
            parts.append("<|im_start|>assistant\n")
        return "".join(parts)

    def log_message(self, format, *args):
        """Suppress the base class's per-request logging."""
        pass
def adb_device_ready(devices_output, serial):
    """Return True when ``adb devices`` output lists *serial* as ``device``.

    Each line is split into fields and the serial is compared exactly, so a
    longer serial that merely contains *serial* does not count, and neither
    does a device reported as ``offline`` or ``unauthorized``.
    """
    for line in devices_output.splitlines():
        fields = line.split()
        if len(fields) >= 2 and fields[0] == serial and fields[1] == "device":
            return True
    return False


if __name__ == "__main__":
    if not PHONE_SERIAL:
        print("Usage: python phone_proxy_v2.py <serial> [port]")
        sys.exit(1)
    # Check that the phone is attached and usable before binding the port.
    try:
        result = subprocess.run(
            ["adb", "devices"],
            capture_output=True, text=True, timeout=10, env=ENV,
        )
    except (OSError, subprocess.TimeoutExpired) as err:
        # adb missing from PATH, not executable, or hung.
        print(f"Could not run adb: {err}")
        sys.exit(1)
    if not adb_device_ready(result.stdout, PHONE_SERIAL):
        print(f"Phone {PHONE_SERIAL} not found")
        sys.exit(1)
    print(f"Phone proxy v2 on port {PORT} for {PHONE_SERIAL}")
    server = HTTPServer(("0.0.0.0", PORT), PhoneHandler)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        # serve_forever() has already returned; shutdown() is meant to be
        # called from another thread, so only the socket is left to release.
        pass
    finally:
        server.server_close()