{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# AutoDataLab++ — CoS evaluation on Kaggle GPU\n", "\n", "Runs the **same evaluation idea** as `cos_grpo_colab.ipynb` / the CoS eval Space: load a **base model** (plus an optional **LoRA** adapter from the Hub), take the model’s **first JSON action**, then **finish the episode** with a deterministic continuation so the **terminal grader** score matches your local pipeline.\n", "\n", "**Setup**\n", "1. **Settings → Accelerator → GPU** (a T4 is enough for 1.5B / a small LoRA).\n", "2. **Settings → Internet → On** (for `pip`, `git clone`, and Hub weights).\n", "3. Either add this repo as a **Kaggle Dataset** and set `ENV_LOCAL_PATH` below, **or** set `ENV_REPO_URL` to `git clone` into `/kaggle/working`.\n", "4. **HF token**: paste it in the config cell, or add a Kaggle secret named `HF_TOKEN` (read access is enough for public weights; a **write** token if you push adapters).\n", "\n", "No Hugging Face **Space** is required — the env runs **in this notebook**.\n", "\n", "**Noise you can ignore:** lines like `Unable to register cuFFT/cuDNN factory` or `computation placer already registered` usually mean TensorFlow/JAX and PyTorch both touched CUDA in the same process — they do not stop the run.\n", "\n", "**If you see** `No module named 'triton.backends'` **or** `cannot import name 'ir' from 'triton._C.libtriton'`: it is a **torch ↔ triton ABI mismatch** triggered by `torchvision`. Run the install cell below (it removes `torchvision`, which is not needed for LLM inference), then **Session → Restart session** and run all cells from the top." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# We do NOT upgrade torch/torchvision/triton on Kaggle (mismatched ABIs cause:\n",
"#   ImportError: cannot import name 'ir' from 'triton._C.libtriton'\n",
"#   AttributeError: module 'triton' has no attribute 'backends'\n",
"# transformers triggers torchvision → torch._dynamo → triton on import. We only need\n",
"# inference (no compile, no vision), so the safest fix is to remove torchvision.\n",
"import subprocess\n",
"import sys\n",
"\n",
"_PY = sys.executable\n",
"_PKGS = [\n",
"    \"transformers>=4.45,<4.49\",\n",
"    \"peft>=0.13,<0.16\",  # adapters trained on newer peft set fields like eva_config\n",
"    \"accelerate>=0.33,<1.1\",\n",
"    \"bitsandbytes>=0.45.0\",  # Kaggle CUDA 12.8 needs a newer bnb wheel\n",
"    \"huggingface_hub>=0.24,<1.0\",\n",
"    \"pydantic>=2\",\n",
"    \"tqdm\",\n",
"    \"pandas\",\n",
"]\n",
"subprocess.check_call(\n",
"    [_PY, \"-m\", \"pip\", \"install\", \"-q\", \"--upgrade-strategy\", \"only-if-needed\"] + _PKGS,\n",
")\n",
"subprocess.run([_PY, \"-m\", \"pip\", \"uninstall\", \"-y\", \"torchvision\"], check=False)\n",
"print(\"pip ok — now: Session → Restart session, then Run All from the top.\")" ] },
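{ "cell_type": "markdown", "metadata": {}, "source": [ "Optional sanity check (a minimal sketch, not part of the original pipeline): after the restart, confirm the pinned libraries are active, a GPU is visible, and `torchvision` is really gone so the triton import path cannot be triggered." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Optional: verify the runtime after Session → Restart session.\n",
"import importlib.util\n",
"\n",
"import torch\n",
"import transformers\n",
"\n",
"print(\"torch:\", torch.__version__)\n",
"print(\"transformers:\", transformers.__version__)\n",
"print(\"cuda available:\", torch.cuda.is_available())\n",
"if torch.cuda.is_available():\n",
"    print(\"gpu:\", torch.cuda.get_device_name(0))\n",
"# torchvision should be absent after the uninstall in the install cell\n",
"print(\"torchvision installed:\", importlib.util.find_spec(\"torchvision\") is not None)" ] },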
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 1) Config — HF token, model weights, env location" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"import os\n",
"from pathlib import Path\n",
"\n",
"# --- Hugging Face token (optional if all repos/models are public) ---\n",
"HF_TOKEN = \"\"  # paste here, OR leave empty and set a Kaggle secret named \"HF_TOKEN\"\n",
"\n",
"try:\n",
"    from kaggle_secrets import UserSecretsClient\n",
"\n",
"    if not HF_TOKEN.strip():\n",
"        HF_TOKEN = UserSecretsClient().get_secret(\"HF_TOKEN\")\n",
"except Exception:\n",
"    pass  # no secret configured; anonymous access still works for public repos\n",
"\n",
"os.environ[\"HF_TOKEN\"] = HF_TOKEN or \"\"\n",
"os.environ[\"HUGGING_FACE_HUB_TOKEN\"] = HF_TOKEN or \"\"\n",
"\n",
"# --- Model weights ---\n",
"MODEL_ID = \"Qwen/Qwen2.5-1.5B-Instruct\"  # base checkpoint on the Hub\n",
"ADAPTER_ID = \"\"  # optional: e.g. \"your-user/your-lora-repo\"\n",
"ADAPTER_SUBFOLDER = \"\"  # optional subfolder inside the adapter repo (e.g. \"final\")\n",
"USE_4BIT = True  # set False if the 4-bit load fails or you run an A100 with headroom\n",
"\n",
"# --- Env code (AutoDataLab++ root with ceo_brief_env/) ---\n",
"# Option A: dataset mounted at /kaggle/input/your-dataset-name/autodatalab-plus\n",
"ENV_LOCAL_PATH = \"/kaggle/input/autodatalab-plus\"  # change if you zip-uploaded the repo\n",
"# Option B: git clone if A is missing\n",
"ENV_REPO_URL = \"https://github.com/Uchihakamal1816/AutoDataLab-.git\"  # AutoDataLab++ env repo (ceo_brief_env/ at root)\n",
"ENV_REPO_REF = \"main\"\n",
"ENV_CLONE_DIR = Path(\"/kaggle/working/autodatalab-plus\")\n",
"\n",
"# --- Eval ---\n",
"TASKS = [\"easy_brief\", \"medium_brief\", \"hard_brief\", \"expert_brief\"]\n",
"EPISODES_PER_TASK = 3\n",
"USE_RAG = False\n",
"\n",
"print(\"MODEL_ID:\", MODEL_ID)\n",
"print(\"ADAPTER_ID:\", ADAPTER_ID or \"(none)\")\n",
"print(\"HF_TOKEN set:\", bool((HF_TOKEN or \"\").strip()))" ] },
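{ "cell_type": "markdown", "metadata": {}, "source": [ "Optional token check (a minimal sketch, not part of the original flow): `huggingface_hub.whoami` confirms the Hub accepts the token before any large download; public repos still load anonymously." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Optional: confirm the token is valid before pulling weights.\n",
"from huggingface_hub import whoami\n",
"\n",
"if (HF_TOKEN or \"\").strip():\n",
"    try:\n",
"        print(\"Hub login ok as:\", whoami(token=HF_TOKEN).get(\"name\"))\n",
"    except Exception as e:\n",
"        print(\"Token check failed (public weights may still load):\", e)\n",
"else:\n",
"    print(\"No HF_TOKEN set; proceeding anonymously (public repos only).\")" ] },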
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 2) Put `ceo_brief_env` on `sys.path` (clone if needed) + `pip install -e`" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"import subprocess\n",
"import sys\n",
"from pathlib import Path\n",
"\n",
"\n",
"def resolve_env_root() -> Path:\n",
"    # Prefer the mounted dataset; fall back to a shallow git clone.\n",
"    p = Path(ENV_LOCAL_PATH)\n",
"    if p.is_dir() and (p / \"ceo_brief_env\").is_dir():\n",
"        return p.resolve()\n",
"    ENV_CLONE_DIR.parent.mkdir(parents=True, exist_ok=True)\n",
"    if ENV_CLONE_DIR.is_dir():\n",
"        subprocess.run([\"rm\", \"-rf\", str(ENV_CLONE_DIR)], check=False)\n",
"    cmd = [\"git\", \"clone\", \"--depth\", \"1\", \"-b\", ENV_REPO_REF, ENV_REPO_URL, str(ENV_CLONE_DIR)]\n",
"    r = subprocess.run(cmd, capture_output=True, text=True)\n",
"    if r.returncode != 0:\n",
"        # Retry without the branch pin in case ENV_REPO_REF does not exist.\n",
"        cmd2 = [\"git\", \"clone\", \"--depth\", \"1\", ENV_REPO_URL, str(ENV_CLONE_DIR)]\n",
"        r2 = subprocess.run(cmd2, capture_output=True, text=True)\n",
"        if r2.returncode != 0:\n",
"            raise RuntimeError(f\"git clone failed:\\n{r.stderr}\\n{r2.stderr}\")\n",
"    root = ENV_CLONE_DIR.resolve()\n",
"    if not (root / \"ceo_brief_env\").is_dir():\n",
"        raise RuntimeError(f\"No ceo_brief_env under {root}\")\n",
"    return root\n",
"\n",
"\n",
"ENV_ROOT = resolve_env_root()\n",
"if str(ENV_ROOT) not in sys.path:\n",
"    sys.path.insert(0, str(ENV_ROOT))\n",
"\n",
"subprocess.run(\n",
"    [sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-e\", str(ENV_ROOT)],\n",
"    check=True,\n",
")\n",
"print(\"ENV_ROOT:\", ENV_ROOT)" ] },
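{ "cell_type": "markdown", "metadata": {}, "source": [ "Optional smoke test (a minimal sketch reusing the `CEOBriefEnvironment` API from section 3): reset one episode and print a few observation fields, so a broken install or a bad task name fails here instead of mid-evaluation." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Optional: fail fast if ceo_brief_env is not importable or the task list is wrong.\n",
"from ceo_brief_env.environment import CEOBriefEnvironment\n",
"\n",
"_env = CEOBriefEnvironment()\n",
"_obs = _env.reset(task=TASKS[0], use_rag=USE_RAG)\n",
"print(\"task:\", _obs.task_name, \"| max_steps:\", _obs.max_steps, \"| experts:\", _obs.available_experts)" ] },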
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 3) Load model + run evaluation (first action from LLM, then deterministic finish)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"import gc\n",
"import json\n",
"import os\n",
"import re\n",
"from typing import Any\n",
"\n",
"# Before importing torch: avoid compile paths that hard-require triton on some images\n",
"os.environ.setdefault(\"TORCH_COMPILE_DISABLE\", \"1\")\n",
"os.environ.setdefault(\"TORCHDYNAMO_DISABLE\", \"1\")\n",
"\n",
"import torch\n",
"from tqdm.auto import tqdm\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n",
"\n",
"from ceo_brief_env.environment import CEOBriefEnvironment, required_experts_for_task\n",
"from ceo_brief_env.models import CoSAction, CoSObservation\n",
"\n",
"VALID_ACTIONS = {\"consult\", \"ask\", \"summarize\", \"submit\", \"noop\"}\n",
"VALID_EXPERTS = {\"analyst\", \"finance\", \"hr\", \"strategy\"}\n",
"_JSON_RE = re.compile(r\"\\{[^{}]*\\}\", re.S)\n",
"\n",
"SYSTEM_PROMPT = (\n",
"    \"You are the Chief of Staff in AutoDataLab++. You orchestrate four specialists: \"\n",
"    \"analyst, finance, strategy, hr. Reply with STRICT JSON only.\\n\"\n",
"    'Schema: {\"action_type\": one of [consult, ask, summarize, submit, noop], '\n",
"    '\"expert_id\": one of [analyst, finance, hr, strategy] or null}.\\n'\n",
"    \"Rules: consult each required expert at most once -> summarize -> submit.\"\n",
")\n",
"\n",
"\n",
"def render_obs(obs: CoSObservation) -> str:\n",
"    return (\n",
"        f\"task={obs.task_name} step={obs.step_count}/{obs.max_steps} \"\n",
"        f\"rag={obs.rag_enabled} consulted={obs.consulted_experts} \"\n",
"        f\"brief_done={obs.current_brief is not None} available={obs.available_experts}\"\n",
"    )\n",
"\n",
"\n",
"def parse_action(text: str) -> CoSAction:\n",
"    # Take the first {...} blob; anything unparseable degrades to a safe noop.\n",
"    m = _JSON_RE.search(text or \"\")\n",
"    if not m:\n",
"        return CoSAction(action_type=\"noop\")\n",
"    try:\n",
"        a = json.loads(m.group(0))\n",
"    except Exception:\n",
"        return CoSAction(action_type=\"noop\")\n",
"    at = a.get(\"action_type\")\n",
"    if at not in VALID_ACTIONS:\n",
"        return CoSAction(action_type=\"noop\")\n",
"    eid = a.get(\"expert_id\")\n",
"    if eid is not None and eid not in VALID_EXPERTS:\n",
"        eid = None\n",
"    return CoSAction(action_type=at, expert_id=eid)\n",
"\n",
"\n",
"def deterministic_continuation(env: CEOBriefEnvironment, obs: CoSObservation, task: str) -> float:\n",
"    # Finish the episode exactly like the local pipeline: consult the missing\n",
"    # required experts, then summarize, then submit.\n",
"    while not obs.done and obs.step_count < obs.max_steps:\n",
"        missing = [e for e in required_experts_for_task(task) if e not in obs.consulted_experts]\n",
"        if missing:\n",
"            act = CoSAction(action_type=\"consult\", expert_id=missing[0])  # type: ignore[arg-type]\n",
"        elif obs.current_brief is None:\n",
"            act = CoSAction(action_type=\"summarize\")\n",
"        else:\n",
"            act = CoSAction(action_type=\"submit\")\n",
"        obs = env.step(act)\n",
"    return float(obs.terminal_grader_score or 0.0)\n",
"\n",
"\n",
"def load_model():\n",
"    tok = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN or None)\n",
"    if tok.pad_token is None:\n",
"        tok.pad_token = tok.eos_token\n",
"    bnb = None\n",
"    if USE_4BIT:\n",
"        bnb = BitsAndBytesConfig(\n",
"            load_in_4bit=True,\n",
"            bnb_4bit_quant_type=\"nf4\",\n",
"            bnb_4bit_compute_dtype=torch.bfloat16,\n",
"            bnb_4bit_use_double_quant=True,\n",
"        )\n",
"    try:\n",
"        model = AutoModelForCausalLM.from_pretrained(\n",
"            MODEL_ID,\n",
"            token=HF_TOKEN or None,\n",
"            device_map=\"auto\",\n",
"            quantization_config=bnb,\n",
"            torch_dtype=torch.bfloat16,\n",
"        )\n",
"    except Exception:\n",
"        # Retry without quantization (e.g. if bitsandbytes fails to load).\n",
"        model = AutoModelForCausalLM.from_pretrained(\n",
"            MODEL_ID,\n",
"            token=HF_TOKEN or None,\n",
"            device_map=\"auto\",\n",
"            torch_dtype=torch.bfloat16,\n",
"        )\n",
"    model.eval()\n",
"    if (ADAPTER_ID or \"\").strip():\n",
"        from peft import PeftModel\n",
"\n",
"        kw: dict[str, Any] = {\"token\": HF_TOKEN or None}\n",
"        if (ADAPTER_SUBFOLDER or \"\").strip():\n",
"            kw[\"subfolder\"] = ADAPTER_SUBFOLDER.strip()\n",
"        model = PeftModel.from_pretrained(model, ADAPTER_ID.strip(), **kw)\n",
"        model.eval()\n",
"    return tok, model\n",
"\n",
"\n",
"@torch.no_grad()\n",
"def generate_action(model, tok, obs: CoSObservation):\n",
"    msgs = [\n",
"        {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
"        {\"role\": \"user\", \"content\": render_obs(obs)},\n",
"    ]\n",
"    text = tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)\n",
"    ids = tok(text, return_tensors=\"pt\").to(model.device)\n",
"    gen = model.generate(\n",
"        **ids,\n",
"        max_new_tokens=48,\n",
"        do_sample=False,\n",
"        pad_token_id=tok.pad_token_id,\n",
"    )\n",
"    comp = tok.decode(gen[0, ids.input_ids.shape[1] :], skip_special_tokens=True)\n",
"    return parse_action(comp), comp.strip()[:300]\n",
"\n",
"\n",
"def run_evaluation():\n",
"    tok, model = load_model()\n",
"    rows = []\n",
"    raw = []\n",
"    for task in TASKS:\n",
"        scores = []\n",
"        for ep in tqdm(range(EPISODES_PER_TASK), desc=task):\n",
"            env = CEOBriefEnvironment()\n",
"            obs = env.reset(task=task, use_rag=USE_RAG)\n",
"            try:\n",
"                action, completion = generate_action(model, tok, obs)\n",
"                obs = env.step(action)\n",
"                term = deterministic_continuation(env, obs, task)\n",
"            except Exception as e:\n",
"                # Record the failure instead of crashing the whole sweep.\n",
"                action = CoSAction(action_type=\"noop\")\n",
"                completion = f\"<error: {e}>\"\n",
"                term = 0.0\n",
"            scores.append(term)\n",
"            raw.append(\n",
"                {\n",
"                    \"task\": task,\n",
"                    \"episode\": ep,\n",
"                    \"first_action\": action.model_dump(exclude_none=True),\n",
"                    \"completion_preview\": completion,\n",
"                    \"terminal\": round(float(term), 4),\n",
"                }\n",
"            )\n",
"        mean = round(sum(scores) / len(scores), 4)\n",
"        rows.append({\"task\": task, \"episodes\": EPISODES_PER_TASK, \"mean_terminal\": mean, \"scores\": scores})\n",
"    overall = round(sum(r[\"mean_terminal\"] for r in rows) / len(rows), 4)\n",
"    del model\n",
"    gc.collect()\n",
"    if torch.cuda.is_available():\n",
"        torch.cuda.empty_cache()\n",
"    return rows, raw, overall\n",
"\n",
"\n",
"import pandas as pd\n",
"from IPython.display import display\n",
"\n",
"eval_rows, eval_raw, mean_overall = run_evaluation()\n",
"\n",
"display(pd.DataFrame(eval_rows))\n",
"print(\"mean_overall (avg of per-task means):\", mean_overall)\n",
"print(\"--- sample raw traces (first 3) ---\")\n",
"for x in eval_raw[:3]:\n",
"    print(x)" ] },
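{ "cell_type": "markdown", "metadata": {}, "source": [ "Optional: the same per-episode traces printed above, tabulated for easier scanning (a small convenience view over `eval_raw`; the column names come from `run_evaluation`)." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Optional: tabulate the per-episode traces produced by run_evaluation().\n",
"import pandas as pd\n",
"from IPython.display import display\n",
"\n",
"display(pd.DataFrame(eval_raw)[[\"task\", \"episode\", \"terminal\", \"first_action\"]])" ] },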
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 4) (Optional) Save results JSON to `/kaggle/working` for download" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"out = {\n",
"    \"model_id\": MODEL_ID,\n",
"    \"adapter_id\": ADAPTER_ID or None,\n",
"    \"mean_overall\": mean_overall,\n",
"    \"per_task\": eval_rows,\n",
"    \"raw\": eval_raw,\n",
"}\n",
"p = Path(\"/kaggle/working/cos_eval_results.json\")\n",
"p.write_text(json.dumps(out, indent=2), encoding=\"utf-8\")\n",
"print(\"Wrote\", p)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.10.0" } }, "nbformat": 4, "nbformat_minor": 5 }