Upload api/phone_proxy_v2.py with huggingface_hub

0ad01db verified about 16 hours ago

13.4 kB

	"""
	dispatchAI Phone Proxy v2 — Clean output extraction
	Instead of parsing the messy llama-cli output, we use --log-disable
	and capture only the response text between known markers.
	"""
	import os
	import sys
	import json
	import subprocess
	import re
	import time
	from http.server import HTTPServer, BaseHTTPRequestHandler

	PHONE_SERIAL = sys.argv[1] if len(sys.argv) > 1 else ""
	PORT = int(sys.argv[2]) if len(sys.argv) > 2 else 5000
	LLAMA_CLI = "/data/local/tmp/llama-cli"
	MODEL_PATH = "/data/local/tmp/model.gguf"
	ENV = os.environ.copy()
	ENV["MSYS_NO_PATHCONV"] = "1"

	class PhoneHandler(BaseHTTPRequestHandler):
	    """Serve an OpenAI-style chat endpoint by running llama-cli on a phone over adb.

	    ``POST /v1/chat/completions`` builds a plain-text prompt from the request
	    messages, runs ``llama-cli`` through ``adb shell`` and scrapes the generated
	    text out of the CLI output with line-based heuristics. ``GET /health``
	    returns a small status document.

	    The request fields ``temperature`` and ``chat_format`` are accepted but not
	    used by this version: the prompt is always a raw "User:/Assistant:" text.
	    """

	    # A cleaned response shorter than this triggers one retry with the raw prompt.
	    MIN_RESPONSE_CHARS = 5

	    # Throughput figures printed by llama-cli in its final stats line.
	    _GEN_SPEED_RE = re.compile(r'Generation:\s([\d.]+)\st/s')
	    _PROMPT_SPEED_RE = re.compile(r'Prompt:\s([\d.]+)\st/s')
	    # A backspace plus the spinner frame next to it. Only backspace-adjacent
	    # characters are removed so that '|', '/', '\' and '-' inside the generated
	    # text survive.
	    _SPINNER_RE = re.compile(r'[|/\\-]?\x08[|/\\-]?')
	    # Block-drawing characters used by the loading banner.
	    _BLOCK_CHARS_RE = re.compile(r'[▄█▀▌▐▒░│║╔╗╚╝═]')
	    # Chat-template markers in their various spellings.
	    _PIPE_MARKER_RE = re.compile(r'<\|[^>]+\|>')
	    _TURN_MARKER_RE = re.compile(r'<start_of_turn>|<end_of_turn>')
	    _ANGLE_RE = re.compile(r'<[^>]*>')
	    _BARE_MARKER_RE = re.compile(
	        r'\b(?:eot_id|begin_of_text|start_header_id|end_header_id|im_start|im_end)\b'
	    )
	    # A role label echoed at the very start of the response.
	    _ROLE_ECHO_RE = re.compile(r'^\s*(?:User:|Assistant:|System:)\s*')
	    _WHITESPACE_RE = re.compile(r'\s+')

	    def do_POST(self):
	        """Handle ``POST /v1/chat/completions`` and reply with a chat.completion body.

	        Responds 404 for any other path, 400 for a malformed request, 504 when
	        inference exceeds the subprocess timeout and 500 for any other failure.
	        """
	        if self.path != "/v1/chat/completions":
	            self.send_error(404)
	            return

	        try:
	            # Clamp at 0: a negative length would make read() block until EOF.
	            content_length = max(int(self.headers.get("Content-Length", 0)), 0)
	        except ValueError:
	            self.send_error(400, "Invalid Content-Length")
	            return

	        try:
	            req = json.loads(self.rfile.read(content_length))
	        except ValueError:  # JSONDecodeError and UnicodeDecodeError derive from it
	            self.send_error(400, "Invalid JSON")
	            return

	        if not isinstance(req, dict) or not isinstance(req.get("messages", []), list):
	            self.send_error(400, "Invalid request body")
	            return

	        # Ignore entries that are not JSON objects instead of crashing on them.
	        messages = [m for m in req.get("messages", []) if isinstance(m, dict)]

	        # max_tokens ends up in a shell command line, so it must be a real integer.
	        requested_tokens = req.get("max_tokens")
	        try:
	            max_tokens = 100 if requested_tokens is None else int(requested_tokens)
	        except (TypeError, ValueError, OverflowError):
	            self.send_error(400, "max_tokens must be an integer")
	            return

	        prompt = self._compose_prompt(messages)

	        try:
	            generated_text, gen_tps, prompt_tps = self._generate(prompt, messages, max_tokens)
	        except subprocess.TimeoutExpired:
	            self.send_error(504, "Inference timed out")
	            return
	        except Exception as e:  # request boundary: report rather than drop the connection
	            self.send_error(500, self._status_text(e))
	            return

	        # Rough token estimate: about four characters per token.
	        prompt_tokens = len(prompt) // 4
	        completion_tokens = len(generated_text) // 4

	        response = {
	            "id": f"chatcmpl-{int(time.time())}",
	            "object": "chat.completion",
	            "created": int(time.time()),
	            "model": req.get("model", "dispatchAI"),
	            "choices": [{
	                "index": 0,
	                "message": {"role": "assistant", "content": generated_text},
	                "finish_reason": "stop",
	            }],
	            "usage": {
	                "prompt_tokens": prompt_tokens,
	                "completion_tokens": completion_tokens,
	                "total_tokens": prompt_tokens + completion_tokens,
	            },
	            "phone_info": {
	                "serial": PHONE_SERIAL,
	                "generation_tps": gen_tps,
	                "prompt_tps": prompt_tps,
	            },
	        }

	        self.send_response(200)
	        self.send_header("Content-Type", "application/json")
	        self.end_headers()
	        self.wfile.write(json.dumps(response).encode())

	    def do_GET(self):
	        """Handle ``GET /health`` with a JSON status document; 404 for other paths."""
	        if self.path == "/health":
	            self.send_response(200)
	            self.send_header("Content-Type", "application/json")
	            self.end_headers()
	            self.wfile.write(json.dumps({
	                "status": "ok", "phone": PHONE_SERIAL, "port": PORT
	            }).encode())
	        else:
	            self.send_error(404)

	    def build_prompt(self, messages, fmt):
	        """Render *messages* with a chat template and return the prompt string.

	        *fmt* selects the template: ``"llama-3"``, ``"gemma"``, or anything else
	        for ChatML. The marker that opens the assistant turn is appended exactly
	        once, after the last message. Not called by ``do_POST`` in this version.
	        """
	        parts = []
	        for msg in messages:
	            role = msg.get("role", "user")
	            content = msg.get("content", "")
	            if fmt == "llama-3":
	                if role == "system":
	                    parts.append(
	                        f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{content}<|eot_id|>"
	                    )
	                elif role in ("user", "assistant"):
	                    parts.append(f"<|start_header_id|>{role}<|end_header_id|>\n{content}<|eot_id|>")
	            elif fmt == "gemma":
	                if role == "user":
	                    parts.append(f"<start_of_turn>user\n{content}<end_of_turn>\n")
	                elif role == "assistant":
	                    parts.append(f"<start_of_turn>model\n{content}<end_of_turn>\n")
	            else:  # chatml
	                parts.append(f"<|im_start|>{role}\n{content}<|im_end|>\n")

	        # Open the assistant turn once, at the end of the conversation.
	        if fmt == "llama-3":
	            parts.append("<|start_header_id|>assistant<|end_header_id|>\n")
	        elif fmt == "gemma":
	            parts.append("<start_of_turn>model\n")
	        else:
	            parts.append("<|im_start|>assistant\n")
	        return "".join(parts)

	    def log_message(self, format, *args):
	        """Suppress the per-request log lines the base class would write."""
	        pass

	    # ------------------------------------------------------------------ helpers

	    def _generate(self, prompt, messages, max_tokens):
	        """Run inference and return ``(generated_text, generation_tps, prompt_tps)``.

	        When the cleaned output is shorter than ``MIN_RESPONSE_CHARS`` the last
	        message is retried as a raw prompt without ``--log-disable``. The first
	        result is kept if the retry produces nothing. Speeds always come from
	        the first run.
	        """
	        output = self._run_llama(prompt, max_tokens)
	        gen_tps, prompt_tps = self._parse_speeds(output)
	        text = self._strip_template_markers(self._extract_response(output))

	        if len(text) < self.MIN_RESPONSE_CHARS:
	            raw_prompt = self._message_text(messages[-1]) if messages else "Hello"
	            retry_output = self._run_llama(raw_prompt, max_tokens, log_disable=False)
	            text = self._extract_raw(retry_output) or text

	        return self._strip_prompt_echo(text, messages), gen_tps, prompt_tps

	    @classmethod
	    def _run_llama(cls, prompt, max_tokens, log_disable=True):
	        """Run llama-cli on the phone via ``adb shell``; return stdout followed by stderr."""
	        result = subprocess.run(
	            ["adb", "-s", PHONE_SERIAL, "shell",
	             cls._llama_command(prompt, max_tokens, log_disable)],
	            env=ENV, capture_output=True, timeout=90,
	            # Decode as UTF-8 explicitly; the locale codec can fail on the
	            # banner's block characters.
	            encoding="utf-8", errors="replace",
	        )
	        return result.stdout + result.stderr

	    @staticmethod
	    def _llama_command(prompt, max_tokens, log_disable=True, model_path=None):
	        """Build the device-side shell command line for one llama-cli run.

	        The prompt and model path are quoted with ``shlex.quote`` and
	        *max_tokens* is forced through ``int`` so request data cannot break out
	        of the command. Raises ``ValueError``/``TypeError`` for a non-integer
	        *max_tokens*.
	        """
	        import shlex

	        if model_path is None:
	            model_path = MODEL_PATH
	        parts = [
	            'cd /data/local/tmp && LD_LIBRARY_PATH=/data/local/tmp',
	            'timeout 60 ./llama-cli',
	            f'-m {shlex.quote(model_path)}',
	            f'-p {shlex.quote(prompt)}',
	            f'-n {int(max_tokens)}',
	            '-t 4',
	            '-st',
	            '--no-display-prompt',
	        ]
	        if log_disable:
	            parts.append('--log-disable')
	        parts.append('2>&1')
	        return ' '.join(parts)

	    @staticmethod
	    def _message_text(msg):
	        """Return the text of one message; list-style content parts are joined."""
	        content = msg.get("content", "")
	        if isinstance(content, str):
	            return content
	        if content is None:
	            return ""
	        if isinstance(content, list):
	            return "".join(
	                part["text"] for part in content
	                if isinstance(part, dict) and isinstance(part.get("text"), str)
	            )
	        return str(content)

	    @classmethod
	    def _compose_prompt(cls, messages):
	        """Build the raw completion prompt from the last user and first system message.

	        A single-message conversation is passed through verbatim; longer ones
	        use a "User: ... Assistant:" frame, prefixed by the system message when
	        one exists. A missing or empty user message becomes "Hello".
	        """
	        user_message = ""
	        for msg in reversed(messages):
	            if msg.get("role") == "user":
	                user_message = cls._message_text(msg)
	                break
	        user_message = user_message or "Hello"

	        if len(messages) <= 1:
	            return user_message

	        system_msg = next(
	            (cls._message_text(m) for m in messages if m.get("role") == "system"), ""
	        )
	        if system_msg:
	            return f"{system_msg}\n\nUser: {user_message}\nAssistant:"
	        return f"User: {user_message}\nAssistant:"

	    @classmethod
	    def _parse_speeds(cls, output):
	        """Return ``(generation_tps, prompt_tps)`` from the stats line, 0 when absent."""
	        gen_match = cls._GEN_SPEED_RE.search(output)
	        prompt_match = cls._PROMPT_SPEED_RE.search(output)
	        gen_tps = float(gen_match.group(1)) if gen_match else 0
	        prompt_tps = float(prompt_match.group(1)) if prompt_match else 0
	        return gen_tps, prompt_tps

	    @staticmethod
	    def _is_noise_line(line):
	        """Return True for banner, log and prompt-marker lines that are not model output."""
	        stripped = line.strip()
	        if not stripped or stripped == '>' or stripped.startswith('/'):
	            return True
	        if any(tag in line for tag in (
	            'Loading model', 'llama_context', 'llama_kv_cache',
	            'modalities', 'available commands',
	        )):
	            return True
	        if ':' in line:
	            # "build : ..." and "model : ..." header lines.
	            if 'build' in line:
	                return True
	            if 'model' in line and 'dispatchAI' not in line:
	                return True
	        return False

	    @classmethod
	    def _clean_line(cls, line):
	        """Strip spinner frames, banner glyphs and template markers from one line."""
	        cleaned = cls._SPINNER_RE.sub('', line)
	        cleaned = cls._BLOCK_CHARS_RE.sub('', cleaned)
	        cleaned = cls._PIPE_MARKER_RE.sub('', cleaned)
	        cleaned = cls._TURN_MARKER_RE.sub('', cleaned)
	        cleaned = cls._BARE_MARKER_RE.sub('', cleaned)
	        cleaned = cleaned.strip(' <>|')
	        # Drop a leading "> " prompt marker if one is left.
	        return cleaned.lstrip('> ').strip()

	    @classmethod
	    def _extract_response(cls, output):
	        """Join the model-output lines found before the stats / exit line."""
	        kept = []
	        for line in output.split('\n'):
	            if 't/s' in line or 'Exiting' in line:
	                break
	            if cls._is_noise_line(line):
	                continue
	            cleaned = cls._clean_line(line)
	            if cleaned:
	                kept.append(cleaned)
	        return ' '.join(kept).strip()

	    @classmethod
	    def _strip_template_markers(cls, text):
	        """Remove all angle-bracket spans, bare markers and a leading role label."""
	        text = cls._ANGLE_RE.sub('', text)
	        text = cls._BARE_MARKER_RE.sub('', text)
	        text = cls._ROLE_ECHO_RE.sub('', text)
	        return cls._WHITESPACE_RE.sub(' ', text).strip()

	    @classmethod
	    def _extract_raw(cls, output):
	        """Looser extraction for the retry run: keep lines before the stats line."""
	        kept = []
	        for line in output.split('\n'):
	            if 't/s' in line or 'Exiting' in line:
	                break
	            if 'Loading' in line or 'llama_' in line or 'build' in line:
	                continue
	            if 'available commands' in line or line.strip().startswith('/'):
	                continue
	            cleaned = cls._BLOCK_CHARS_RE.sub('', cls._SPINNER_RE.sub('', line)).strip()
	            if len(cleaned) > 1 and cleaned != '>':
	                kept.append(cleaned)
	        return ' '.join(kept).strip()

	    @classmethod
	    def _strip_prompt_echo(cls, text, messages):
	        """Drop everything up to the last echo of the most recent message found in *text*.

	        Only applied to texts longer than 10 characters. Messages are checked
	        newest first and the search stops at the first one whose content occurs
	        in *text*.
	        NOTE(review): a very short message (e.g. "a") can match inside a genuine
	        answer and truncate it — confirm whether the echo can still occur with
	        --no-display-prompt.
	        """
	        if len(text) <= 10:
	            return text
	        for msg in reversed(messages):
	            content = cls._message_text(msg)
	            if content and content in text:
	                after = text[text.rfind(content) + len(content):].strip()
	                return after or text
	        return text

	    @staticmethod
	    def _status_text(error):
	        """Flatten an exception message into a short single-line ASCII string.

	        The text goes into the HTTP status line, which must not contain line
	        breaks and is encoded as latin-1 by ``http.server``.
	        """
	        flat = " ".join(str(error).split())
	        return flat.encode("ascii", "replace").decode("ascii")[:200]

	def adb_device_ready(devices_output, serial):
	    """Return True when ``adb devices`` output lists *serial* in the ``device`` state.

	    Matching is on the whole serial column, so a device that is ``offline`` or
	    ``unauthorized``, or whose serial merely contains *serial*, is not accepted.
	    """
	    for line in devices_output.splitlines():
	        fields = line.split()
	        if len(fields) >= 2 and fields[0] == serial and fields[1] == "device":
	            return True
	    return False


	if __name__ == "__main__":
	    if not PHONE_SERIAL:
	        print("Usage: python phone_proxy_v2.py <serial> [port]")
	        sys.exit(1)

	    # Check that the phone is attached and usable before starting the server.
	    try:
	        result = subprocess.run(["adb", "devices"], capture_output=True, text=True, timeout=10, env=ENV)
	    except (OSError, subprocess.TimeoutExpired) as exc:
	        print(f"Could not run 'adb devices': {exc}")
	        sys.exit(1)
	    if not adb_device_ready(result.stdout, PHONE_SERIAL):
	        print(f"Phone {PHONE_SERIAL} not found or not in 'device' state")
	        sys.exit(1)

	    print(f"Phone proxy v2 on port {PORT} for {PHONE_SERIAL}")
	    # NOTE(review): binds every interface with no authentication; restrict the
	    # address if the proxy should not be reachable from the network.
	    server = HTTPServer(("0.0.0.0", PORT), PhoneHandler)
	    try:
	        server.serve_forever()
	    except KeyboardInterrupt:
	        pass
	    finally:
	        # shutdown() is meant to be called from another thread while
	        # serve_forever() runs; here the loop has already exited, so just
	        # release the listening socket.
	        server.server_close()