"""
OpenAI-compatible API server for Ternary Transformer Engine.
Drop-in replacement for llama-server.

(c) 2026 OpenTransformers Ltd / Scott Bisset
"""
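
# Example requests (illustrative; assumes the default HOST/PORT configured
# below):
#
#   curl http://127.0.0.1:8080/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Hello"}], "max_tokens": 64}'
#
#   curl http://127.0.0.1:8080/health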

import json
import os
import threading
import time
from http.server import HTTPServer, BaseHTTPRequestHandler

from inference import TernaryQwen, Tokenizer, load_kernel

MODEL_DIR = os.environ.get("TERNARY_MODEL_DIR", "deepseek-r1-1.5b-ternary")
TOKENIZER_DIR = os.environ.get("TOKENIZER_DIR", "deepseek-r1-1.5b-hf")
HOST = os.environ.get("HOST", "127.0.0.1")
PORT = int(os.environ.get("PORT", "8080"))
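
# Example override (hypothetical paths; assumes this file is saved as
# server.py):
#   TERNARY_MODEL_DIR=/models/custom-ternary PORT=8081 python server.py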

print("Loading ternary kernel...")
kernel = load_kernel(os.path.join(os.path.dirname(__file__), "ternary_kernel.so"))

print(f"Loading model from {MODEL_DIR}...")
model = TernaryQwen(MODEL_DIR, kernel)

print(f"Loading tokenizer from {TOKENIZER_DIR}...")
tokenizer = Tokenizer(TOKENIZER_DIR)

# A single lock serializes generation: the model handles one request at a time.
lock = threading.Lock()
print("Ready!")


class Handler(BaseHTTPRequestHandler):
    def do_POST(self):
        if self.path == "/v1/chat/completions":
            length = int(self.headers.get("Content-Length", 0))
            body = json.loads(self.rfile.read(length))

            messages = body.get("messages", [])
            max_tokens = body.get("max_tokens", 256)
            # 0.6 / 0.95 are DeepSeek-R1's recommended sampling settings.
            temperature = body.get("temperature", 0.6)
            top_p = body.get("top_p", 0.95)
            # Note: the OpenAI "stream" option is ignored; replies are sent whole.

            prompt = tokenizer.apply_chat_template(messages)
            input_ids = tokenizer.encode(prompt)

            # Concurrent requests queue on the lock; one generation at a time.
            with lock:
                gen_ids, stats = model.generate(
                    input_ids,
                    max_new_tokens=max_tokens,
                    temperature=temperature,
                    top_p=top_p,
                )

            text = tokenizer.decode(gen_ids)

            response = {
                "id": f"chatcmpl-ternary-{int(time.time())}",
                "object": "chat.completion",
                "created": int(time.time()),
                "model": "DeepSeek-R1-Distill-Qwen-1.5B-TERNARY",
                "choices": [{
                    "index": 0,
                    "message": {"role": "assistant", "content": text},
                    "finish_reason": "stop",
                }],
                "usage": {
                    "prompt_tokens": len(input_ids),
                    "completion_tokens": stats["tokens_generated"],
                    "total_tokens": len(input_ids) + stats["tokens_generated"],
                },
                # Field names mirror llama-server's "timings" block so existing
                # clients and dashboards can read them unchanged.
                "timings": {
                    "prompt_n": stats["prefill_tokens"],
                    "prompt_ms": stats["prefill_ms"],
                    "predicted_n": stats["tokens_generated"],
                    "predicted_ms": stats["decode_ms"],
                    "predicted_per_second": stats["tok_per_sec"],
                },
            }

            payload = json.dumps(response).encode()
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(payload)))
            self.end_headers()
            self.wfile.write(payload)
        else:
            self.send_response(404)
            self.end_headers()

    def do_GET(self):
        if self.path == "/health":
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.end_headers()
            self.wfile.write(b'{"status":"ok","engine":"ternary-avx512"}')
        else:
            self.send_response(404)
            self.end_headers()

    # Silence BaseHTTPRequestHandler's default per-request stderr logging.
    def log_message(self, format, *args):
        pass


if __name__ == "__main__":
    server = HTTPServer((HOST, PORT), Handler)
    print(f"Ternary engine serving on {HOST}:{PORT}")
    server.serve_forever()
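
# Note: HTTPServer handles one connection at a time, so the lock above never
# actually contends. A possible variant (a sketch, not part of the original
# design): the stdlib's ThreadingHTTPServer (Python 3.7+) accepts concurrent
# connections while the lock still serializes generate() calls:
#
#     from http.server import ThreadingHTTPServer
#     server = ThreadingHTTPServer((HOST, PORT), Handler)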