#!/usr/bin/env python3
"""Minimal CPU chat with tilelli_chat_v4.pt — what the README points new users at.

Uses TilelliLiteLM.generate_with_cache so long prompts + replies stay within
the 256-byte context window. Greedy decoding, deliberately tiny."""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent / "src"))

import torch
from tilelli.eval.metacog_probe import load_bridge

CKPT = Path(__file__).parent / "checkpoints" / "tilelli_chat_v4.pt"
# Join every CLI argument so an unquoted message (`script.py Hello there`) is
# not silently cut down to its first word. A single quoted argument yields
# exactly the same string as before.
MSG = " ".join(sys.argv[1:]) if len(sys.argv) > 1 else "Hello, who are you?"
PROMPT = f"USER: {MSG}\nTILELLI:"
MAX_NEW = 120

# load_bridge returns three values; only the model and tokenizer are used here.
model, _abstain, tok = load_bridge(str(CKPT))
# Encode the prompt and add a leading batch dimension of size 1.
ids = tok.encode(PROMPT).long().unsqueeze(0)

# Trim the prompt from the left so the prompt + generated tokens stay within
# the 256-byte context window the bundled v4 was trained on (or whatever
# window the loaded model reports through `max_seq_len`).
max_ctx = getattr(model, "max_seq_len", 256)
margin = 4  # slack kept below the window; value carried over unchanged
n_new = MAX_NEW
budget = max_ctx - n_new - margin
if budget < 1:
    # The window is too small to hold MAX_NEW new tokens plus any prompt.
    # A non-positive budget must never reach the slice below: `-0:` would
    # keep the whole prompt untrimmed, and a negative budget turns into a
    # positive start index that can slice a short prompt down to nothing.
    # Split the window roughly in half between prompt and reply instead.
    n_new = max(1, max_ctx // 2)
    budget = max(1, max_ctx - n_new - margin)
if ids.size(1) > budget:
    # Keep the most recent `budget` positions, which end with "TILELLI:".
    ids = ids[:, -budget:]

# Stop on newline (10) or null (0). generate_with_cache handles the rest.
with torch.no_grad():
    full, _generated, _confs = model.generate_with_cache(
        ids, n_new_tokens=n_new, stop_ids=(10, 0)
    )

# `full` is decoded as a whole, so the output is whatever sequence
# generate_with_cache returned first, not only `_generated`.
print(tok.decode(full[0].tolist()))