dispatchAI
/

dispatchAI-API

Model card Files Files and versions

xet

Community

3morixd committed about 16 hours ago

Commit

0ad01db

verified ·

1 Parent(s): 369004c

Upload api/phone_proxy_v2.py with huggingface_hub

Browse files

Files changed (1) hide show

api/phone_proxy_v2.py +313 -0

api/phone_proxy_v2.py ADDED Viewed

	@@ -0,0 +1,313 @@

+"""
+dispatchAI Phone Proxy v2 — Clean output extraction
+Instead of parsing the messy llama-cli output, we use --log-disable
+and capture only the response text between known markers.
+"""
+import json
+import os
+import re
+import shlex
+import subprocess
+import sys
+import time
+from http.server import HTTPServer, BaseHTTPRequestHandler
+# Command line: <adb serial> [listen port]. A missing serial becomes "" and a
+# missing port becomes 5000.
+PHONE_SERIAL = sys.argv[1] if sys.argv[1:] else ""
+PORT = int(sys.argv[2]) if sys.argv[2:] else 5000
+# On-device paths. NOTE(review): LLAMA_CLI is not referenced anywhere in the
+# visible code; the shell command invokes ./llama-cli after a cd instead.
+LLAMA_CLI = "/data/local/tmp/llama-cli"
+MODEL_PATH = "/data/local/tmp/model.gguf"
+# Environment passed to every adb subprocess: this process's environment plus
+# MSYS_NO_PATHCONV=1 (presumably to stop MSYS/Git Bash from rewriting the
+# /data/... paths - confirm).
+ENV = {**os.environ, "MSYS_NO_PATHCONV": "1"}
+class PhoneHandler(BaseHTTPRequestHandler):
+    """OpenAI-style chat-completions endpoint backed by llama-cli on an adb device.
+
+    ``POST /v1/chat/completions`` turns the request's messages into a
+    plain-text prompt, runs ``llama-cli`` on the device selected by
+    ``PHONE_SERIAL`` through ``adb shell``, strips loader, spinner and
+    template noise from the captured output and answers with a
+    ``chat.completion`` JSON document. ``GET /health`` returns a small
+    status document.
+
+    The request fields ``temperature`` and ``chat_format`` are accepted but
+    ignored: no sampling flags are passed to llama-cli and prompts are always
+    sent as raw completions (``build_prompt`` is not called by ``do_POST``).
+    """
+    # Lines containing any of these substrings are loader noise, not model text.
+    _NOISE_MARKERS = (
+        "Loading model",
+        "llama_context",
+        "llama_kv_cache",
+        "modalities",
+        "available commands",
+    )
+    # Assistant-turn opener appended once after the last message, per format.
+    # Formats not listed here fall back to the ChatML opener.
+    _GENERATION_MARKERS = {
+        "llama-3": "<|start_header_id|>assistant<|end_header_id|>\n",
+        "gemma": "<start_of_turn>model\n",
+    }
+    def do_POST(self):
+        """Handle ``POST /v1/chat/completions``.
+
+        Responds 404 for any other path, 400 for a malformed request, 504
+        when the adb call exceeds its timeout and 500 for any other failure.
+        """
+        if self.path != "/v1/chat/completions":
+            self.send_error(404)
+            return
+        try:
+            content_length = int(self.headers.get("Content-Length", 0))
+        except ValueError:
+            content_length = -1
+        if content_length < 0:
+            # A negative length would make rfile.read() block until EOF.
+            self.send_error(400, "Invalid Content-Length")
+            return
+        body = self.rfile.read(content_length)
+        try:
+            req = json.loads(body)
+        except (ValueError, RecursionError):
+            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+            self.send_error(400, "Invalid JSON")
+            return
+        if not isinstance(req, dict):
+            self.send_error(400, "Invalid JSON")
+            return
+        messages = req.get("messages", [])
+        if not isinstance(messages, list):
+            self.send_error(400, "messages must be a list")
+            return
+        # Entries that are not JSON objects cannot carry a role or content.
+        messages = [msg for msg in messages if isinstance(msg, dict)]
+        try:
+            max_tokens = self._parse_max_tokens(req.get("max_tokens"))
+        except ValueError:
+            self.send_error(400, "max_tokens must be an integer")
+            return
+        prompt, user_message = self._compose_prompt(messages)
+        try:
+            generated_text, gen_tps, prompt_tps = self._generate(
+                prompt, user_message, messages, max_tokens
+            )
+        except subprocess.TimeoutExpired:
+            self.send_error(504, "Inference timed out")
+            return
+        except Exception as exc:
+            # Request boundary: report the failure instead of dropping the connection.
+            self.send_error(500, self._status_text(exc))
+            return
+        # Rough estimate: about four characters per token.
+        prompt_tokens = len(prompt) // 4
+        completion_tokens = len(generated_text) // 4
+        self._send_json({
+            "id": f"chatcmpl-{int(time.time())}",
+            "object": "chat.completion",
+            "created": int(time.time()),
+            "model": req.get("model", "dispatchAI"),
+            "choices": [{
+                "index": 0,
+                "message": {"role": "assistant", "content": generated_text},
+                "finish_reason": "stop",
+            }],
+            "usage": {
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": prompt_tokens + completion_tokens,
+            },
+            "phone_info": {
+                "serial": PHONE_SERIAL,
+                "generation_tps": gen_tps,
+                "prompt_tps": prompt_tps,
+            },
+        })
+    def do_GET(self):
+        """Handle ``GET /health``; every other path gets a 404."""
+        if self.path == "/health":
+            self._send_json({
+                "status": "ok", "phone": PHONE_SERIAL, "port": PORT
+            })
+        else:
+            self.send_error(404)
+    def _send_json(self, payload):
+        """Send *payload* as a 200 ``application/json`` response."""
+        data = json.dumps(payload).encode()
+        self.send_response(200)
+        self.send_header("Content-Type", "application/json")
+        self.send_header("Content-Length", str(len(data)))
+        self.end_headers()
+        self.wfile.write(data)
+    @staticmethod
+    def _status_text(exc):
+        """Return *exc* as a single-line, latin-1-safe message of at most 200 chars."""
+        text = " ".join(str(exc).split())[:200]
+        # send_error() encodes the status line as latin-1 and would raise otherwise.
+        return text.encode("latin-1", "replace").decode("latin-1")
+    @staticmethod
+    def _parse_max_tokens(value, default=100):
+        """Coerce the request's ``max_tokens`` to an int.
+
+        ``None`` (field absent or null) yields *default*. Anything that is not
+        integer-like raises ``ValueError`` so it can never reach the shell.
+        """
+        if value is None:
+            return default
+        if isinstance(value, bool):
+            raise ValueError("max_tokens must be an integer")
+        try:
+            return int(value)
+        except (TypeError, OverflowError) as exc:
+            raise ValueError("max_tokens must be an integer") from exc
+    @staticmethod
+    def _text_of(content):
+        """Return a message's ``content`` as plain text.
+
+        Strings pass through, ``None`` becomes ``""``, a list of parts is
+        reduced to the newline-joined ``text`` fields of its dict entries,
+        and any other value is converted with ``str()``.
+        """
+        if isinstance(content, str):
+            return content
+        if content is None:
+            return ""
+        if isinstance(content, list):
+            return "\n".join(
+                part["text"] for part in content
+                if isinstance(part, dict) and isinstance(part.get("text"), str)
+            )
+        return str(content)
+    @classmethod
+    def _compose_prompt(cls, messages):
+        """Build the raw completion prompt; return ``(prompt, user_message)``.
+
+        The user message is the content of the last ``user`` entry, or
+        ``"Hello"`` when there is none or it is empty. With a single message
+        the prompt is that text alone; otherwise it is wrapped as
+        ``User: ...`` / ``Assistant:``, preceded by the first system
+        message's content when that is non-empty.
+        """
+        user_message = ""
+        for msg in reversed(messages):
+            if msg.get("role") == "user":
+                user_message = cls._text_of(msg.get("content", ""))
+                break
+        user_message = user_message or "Hello"
+        if len(messages) <= 1:
+            return user_message, user_message
+        system_msg = next(
+            (cls._text_of(msg.get("content", "")) for msg in messages
+             if msg.get("role") == "system"),
+            "",
+        )
+        if system_msg:
+            return f"{system_msg}\n\nUser: {user_message}\nAssistant:", user_message
+        return f"User: {user_message}\nAssistant:", user_message
+    @staticmethod
+    def _build_command(prompt, max_tokens, log_disable, model_path=None):
+        """Return the device-side shell command that runs llama-cli once.
+
+        The prompt and model path are quoted with ``shlex.quote`` and
+        *max_tokens* is forced through ``int`` so request data cannot inject
+        shell syntax. *model_path* defaults to the module's ``MODEL_PATH``.
+        """
+        if model_path is None:
+            model_path = MODEL_PATH
+        return (
+            f'cd /data/local/tmp && LD_LIBRARY_PATH=/data/local/tmp '
+            f'timeout 60 ./llama-cli '
+            f'-m {shlex.quote(model_path)} '
+            f'-p {shlex.quote(prompt)} '
+            f'-n {int(max_tokens)} '
+            f'-t 4 '
+            f'-st '
+            f'--no-display-prompt '
+            + ('--log-disable ' if log_disable else '')
+            + '2>&1'
+        )
+    def _run_llama(self, prompt, max_tokens, log_disable):
+        """Run llama-cli on the phone via ``adb shell``; return stdout + stderr.
+
+        Raises ``subprocess.TimeoutExpired`` when adb takes longer than 90 s.
+        """
+        result = subprocess.run(
+            ["adb", "-s", PHONE_SERIAL, "shell",
+             self._build_command(prompt, max_tokens, log_disable)],
+            env=ENV, capture_output=True, timeout=90,
+            # Decode as UTF-8 regardless of the host locale so the
+            # block-character cleanup can match; never fail on stray bytes.
+            encoding="utf-8", errors="replace",
+        )
+        return result.stdout + result.stderr
+    def _generate(self, prompt, user_message, messages, max_tokens):
+        """Run inference and return ``(generated_text, gen_tps, prompt_tps)``.
+
+        When the cleaned output of the first run is shorter than five
+        characters, llama-cli is run a second time without ``--log-disable``
+        on the last message's text (or on *user_message* when that is empty).
+        """
+        output = self._run_llama(prompt, max_tokens, True)
+        gen_tps, prompt_tps = self._parse_speeds(output)
+        text = self._final_cleanup(self._extract_response(output))
+        if len(text) < 5:
+            last_text = self._text_of(messages[-1].get("content", "")) if messages else ""
+            retry_output = self._run_llama(last_text or user_message, max_tokens, False)
+            text = self._extract_raw(retry_output)
+            retry_gen, retry_prompt = self._parse_speeds(retry_output)
+            gen_tps = retry_gen or gen_tps
+            prompt_tps = retry_prompt or prompt_tps
+        return self._strip_prompt_echo(text, messages), gen_tps, prompt_tps
+    @staticmethod
+    def _parse_speeds(output):
+        """Return ``(generation_tps, prompt_tps)`` from the stats line, 0 when absent."""
+        def rate(label):
+            match = re.search(label + r':\s*([\d.]+)\s*t/s', output)
+            try:
+                return float(match.group(1)) if match else 0
+            except ValueError:
+                # The pattern also matches things like "1.2.3" or ".".
+                return 0
+        return rate('Generation'), rate('Prompt')
+    @classmethod
+    def _extract_response(cls, output):
+        """Return the response text found before the stats line of *output*.
+
+        NOTE(review): the spinner cleanup removes every ``|``, ``/``, ``\\``
+        and ``-`` on a line, so those characters are also lost from genuine
+        model text (e.g. hyphenated words, URLs). Behaviour kept as-is.
+        """
+        response_lines = []
+        for line in output.split('\n'):
+            # Everything from the stats/exit line onwards is not model text.
+            if 't/s' in line or 'Exiting' in line:
+                break
+            stripped = line.strip()
+            if not stripped or stripped == '>' or stripped.startswith('/'):
+                continue
+            if any(marker in line for marker in cls._NOISE_MARKERS):
+                continue
+            if ':' in line and (
+                'build' in line or ('model' in line and 'dispatchAI' not in line)
+            ):
+                continue
+            # Spinner characters, backspaces and loading-bar block characters.
+            clean = re.sub(r'[|/\\\-]', '', line)
+            clean = clean.replace('\b', '')
+            clean = re.sub(r'[▄█▀▌▐▒░│║╔╗╚╝═]', '', clean)
+            # Prompt template markers.
+            clean = re.sub(r'<\|[^>]+\|>', '', clean)
+            clean = re.sub(r'<start_of_turn>|<end_of_turn>', '', clean)
+            clean = re.sub(r'eot_id|begin_of_text|start_header_id|end_header_id', '', clean)
+            clean = clean.strip(' <>|')
+            clean = clean.lstrip('> ').strip()
+            if clean:
+                response_lines.append(clean)
+        return ' '.join(response_lines).strip()
+    @staticmethod
+    def _final_cleanup(text):
+        """Strip leftover template markers and role echoes; collapse whitespace."""
+        # Any angle-bracket span, e.g. <eot_id> or <|eot_id|>.
+        text = re.sub(r'<[^>]*>', '', text)
+        # Bare marker names without brackets.
+        text = re.sub(
+            r'\beot_id\b|\bbegin_of_text\b|\bstart_header_id\b|\bend_header_id\b|\bim_start\b|\bim_end\b',
+            '', text)
+        # A leading "User:" / "Assistant:" / "System:" echo.
+        text = re.sub(r'^(User:|Assistant:|System:)\s*', '', text)
+        return re.sub(r'\s+', ' ', text).strip()
+    @staticmethod
+    def _extract_raw(output):
+        """Return the text before the stats line of the fallback run's *output*."""
+        raw_lines = []
+        for line in output.split('\n'):
+            if 't/s' in line or 'Exiting' in line:
+                break
+            if 'Loading' in line or 'llama_' in line or 'build' in line:
+                continue
+            if 'available commands' in line or line.strip().startswith('/'):
+                continue
+            clean = re.sub(r'[|/\\\-▄█▀▌▐▒░│║╔╗╚╝═]', '', line).replace('\b', '').strip()
+            if len(clean) > 1:
+                raw_lines.append(clean)
+        return ' '.join(raw_lines).strip()
+    @classmethod
+    def _strip_prompt_echo(cls, text, messages):
+        """Drop everything up to the last echo of a message's content in *text*.
+
+        Only texts longer than ten characters are examined. The newest message
+        whose content occurs in *text* decides; if nothing follows its last
+        occurrence the text is returned unchanged.
+        """
+        if len(text) <= 10:
+            return text
+        for msg in reversed(messages):
+            content = cls._text_of(msg.get("content", ""))
+            if content and content in text:
+                after = text[text.rfind(content) + len(content):].strip()
+                return after or text
+        return text
+    def build_prompt(self, messages, fmt):
+        """Render *messages* with the chat template named by *fmt*.
+
+        *fmt* is ``"llama-3"``, ``"gemma"`` or anything else for ChatML. The
+        assistant-turn opener is appended once, after the last message.
+        Roles a template has no branch for (e.g. ``system`` under ``gemma``)
+        are skipped. An empty *messages* yields ``""``.
+        """
+        if not messages:
+            return ""
+        parts = []
+        for msg in messages:
+            role = msg.get("role", "user")
+            content = msg.get("content", "")
+            if fmt == "llama-3":
+                if role == "system":
+                    parts.append(f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{content}<|eot_id|>")
+                elif role in ("user", "assistant"):
+                    parts.append(f"<|start_header_id|>{role}<|end_header_id|>\n{content}<|eot_id|>")
+            elif fmt == "gemma":
+                if role == "user":
+                    parts.append(f"<start_of_turn>user\n{content}<end_of_turn>\n")
+                elif role == "assistant":
+                    parts.append(f"<start_of_turn>model\n{content}<end_of_turn>\n")
+            else:  # chatml
+                parts.append(f"<|im_start|>{role}\n{content}<|im_end|>\n")
+        parts.append(self._GENERATION_MARKERS.get(fmt, "<|im_start|>assistant\n"))
+        return "".join(parts)
+    def log_message(self, format, *args):
+        """Suppress the default per-request logging."""
+        pass
+def _device_is_ready(adb_devices_output, serial):
+    """Return True when ``adb devices`` lists *serial* in the ``device`` state.
+
+    Each listing line is split on whitespace and compared field by field, so
+    the header line, partial serial matches and devices reported as
+    ``offline`` or ``unauthorized`` are all rejected.
+    """
+    for line in adb_devices_output.splitlines():
+        fields = line.split()
+        if len(fields) >= 2 and fields[0] == serial and fields[1] == "device":
+            return True
+    return False
+if __name__ == "__main__":
+    if not PHONE_SERIAL:
+        print("Usage: python phone_proxy_v2.py <serial> [port]")
+        sys.exit(1)
+    # Check that the phone is attached and usable before binding the port.
+    try:
+        result = subprocess.run(["adb", "devices"], capture_output=True, text=True, timeout=10, env=ENV)
+    except (OSError, subprocess.TimeoutExpired) as exc:
+        # adb missing from PATH, not executable, or not answering.
+        print(f"Could not run adb: {exc}")
+        sys.exit(1)
+    if not _device_is_ready(result.stdout, PHONE_SERIAL):
+        print(f"Phone {PHONE_SERIAL} not found")
+        sys.exit(1)
+    print(f"Phone proxy v2 on port {PORT} for {PHONE_SERIAL}")
+    # NOTE(review): this listens on every interface and the handler performs
+    # no authentication; bind to 127.0.0.1 if remote access is not required.
+    server = HTTPServer(("0.0.0.0", PORT), PhoneHandler)
+    try:
+        server.serve_forever()
+    except KeyboardInterrupt:
+        pass
+    finally:
+        # serve_forever() has already returned here; release the socket.
+        server.server_close()