#!/usr/bin/env python3
"""
Smoke test: load Qwen (or any causal LM) from a local folder and run one generation.

Usage:
    python3 training/local_model_inference_check.py --model-dir ./model
    python3 training/local_model_inference_check.py --model-dir /path/to/model --device cpu
"""
from __future__ import annotations

import argparse
import errno
import sys


def main() -> int:
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument(
        "--model-dir",
        default="model",
        help="Path to local folder with config + weights (e.g. ./model)",
    )
    ap.add_argument("--device", default="auto", help="auto | cuda | cpu")
    ap.add_argument("--max-new-tokens", type=int, default=128)
    args = ap.parse_args()

    # Heavy imports live inside main() so `--help` stays fast.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    dev = args.device
    if dev == "auto":
        dev = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"[load] {args.model_dir!r} | device={dev}", flush=True)

    tok = AutoTokenizer.from_pretrained(
        args.model_dir, local_files_only=True, trust_remote_code=True
    )
    if tok.pad_token is None:
        # Some checkpoints ship without a pad token; reuse EOS for generation.
        tok.pad_token = tok.eos_token

    # fp16 on GPU is enough for a 1.5B model; CPU stays fp32 (fp16 matmuls
    # are slow or unsupported on many CPUs).
    dtype = torch.float16 if dev.startswith("cuda") else torch.float32
    model = AutoModelForCausalLM.from_pretrained(
        args.model_dir,
        local_files_only=True,
        trust_remote_code=True,
        torch_dtype=dtype,
    )
    model = model.to(dev)  # handles "cpu", "cuda", and "cuda:N" alike
    model.eval()

    messages = [
        {"role": "system", "content": "You reply briefly."},
        {"role": "user", "content": "Say the capital of France in one line."},
    ]
    # Prefer the tokenizer's chat template; fall back to plain concatenation
    # for base models that don't define one.
    if getattr(tok, "chat_template", None) is not None:
        prompt = tok.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
    else:
        prompt = messages[0]["content"] + "\n" + messages[1]["content"]

    inputs = tok(prompt, return_tensors="pt")
    inputs = {k: v.to(dev) for k, v in inputs.items()}

    with torch.inference_mode():
        out = model.generate(
            **inputs,
            max_new_tokens=args.max_new_tokens,
            do_sample=False,  # greedy decoding: deterministic smoke test
            pad_token_id=tok.pad_token_id,
        )
    # Strip the prompt tokens; decode only what the model generated.
    new_tokens = out[0, inputs["input_ids"].shape[1]:]
    text = tok.decode(new_tokens, skip_special_tokens=True)
    print("\n[ok] generation:\n" + text.strip() + "\n", flush=True)
    return 0


if __name__ == "__main__":
    try:
        raise SystemExit(main())
    except OSError as e:
        # transformers raises OSError both for missing files and for
        # local_files_only misses; point at the most common cause.
        if "local_files_only" in str(e) or e.errno in (errno.ENOENT, errno.ENOTDIR):
            print(
                "Hint: run from the parent of `model/`, e.g.:\n"
                "  cd autodatalab-plus && python3 training/local_model_inference_check.py --model-dir model",
                file=sys.stderr,
            )
        raise