Fix label leakage: temporal split — use first 70% of events as input, predict purchase in last 30%. Replace full-sequence n_purchases/purchase_rate features with history-only n_hist_purchases/hist_purchase_rate.

Browse files

Files changed (1) hide show

notebooks/03_ecommerce_finetune.ipynb +111 -85

notebooks/03_ecommerce_finetune.ipynb CHANGED Viewed

@@ -4,11 +4,13 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# 03 — E-Commerce Fine-Tuning: Next-Purchase Prediction\n",
     "\n",
-    "**Goal:** Fine-tune the pre-trained DomainTransformer for predicting whether a user will make a purchase, and compare against a LightGBM baseline on hand-crafted features.\n",
     "\n",
-    "**Task:** Binary classification — given a user's event sequence, predict if they will purchase (1) or not (0).\n",
     "\n",
     "**Pre-trained model:** [rtferraz/ecommerce-domain-24m](https://huggingface.co/rtferraz/ecommerce-domain-24m)\n",
     "\n",
@@ -46,7 +48,7 @@
     "import matplotlib.pyplot as plt\n",
     "import torch\n",
     "from sklearn.model_selection import train_test_split\n",
-    "from sklearn.metrics import roc_auc_score, classification_report\n",
     "\n",
     "if os.path.exists('../src'): sys.path.insert(0, '../src')\n",
     "elif os.path.exists('src'): sys.path.insert(0, 'src')\n",
@@ -54,7 +56,7 @@
     "from domain_tokenizer import (\n",
     "    DomainTokenizerBuilder, DomainTransformerConfig,\n",
     "    DomainTransformerForCausalLM, JointFusionModel,\n",
-    "    DomainFinetuneDataset, prepare_finetune_dataset, finetune_domain_model,\n",
     ")\n",
     "from domain_tokenizer.schema import DomainSchema, FieldSpec, FieldType\n",
     "\n",
@@ -82,9 +84,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Step 1 — Load Pre-trained Artifacts\n",
-    "\n",
-    "Load the artifacts saved by `02_ecommerce_pretrain.ipynb`."
    ]
   },
   {
@@ -93,20 +93,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Load user sequences from pre-training notebook\n",
     "with open('./ecommerce_artifacts.pkl', 'rb') as f:\n",
     "    artifacts = pickle.load(f)\n",
-    "\n",
     "user_sequences = artifacts['user_sequences']\n",
     "user_ids = artifacts['user_ids']\n",
     "print(f'Loaded {len(user_sequences):,} users')\n",
     "\n",
-    "# Load tokenizer\n",
     "from transformers import PreTrainedTokenizerFast\n",
     "hf_tokenizer = PreTrainedTokenizerFast.from_pretrained('./ecommerce_tokenizer')\n",
     "print(f'Tokenizer vocab: {hf_tokenizer.vocab_size}')\n",
     "\n",
-    "# Rebuild the schema and builder (needed for tokenize_event)\n",
     "ECOMMERCE_REES46_SCHEMA = DomainSchema(\n",
     "    name='ecommerce_rees46',\n",
     "    fields=[\n",
@@ -130,13 +126,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Load pre-trained model using from_pretrained (handles safetensors natively)\n",
-    "# Option A: from local checkpoint saved by notebook 02\n",
     "model = DomainTransformerForCausalLM.from_pretrained('./ecommerce_pretrain_checkpoints/final/')\n",
-    "\n",
-    "# Option B: from HuggingFace Hub (if local not available)\n",
-    "# model = DomainTransformerForCausalLM.from_pretrained('rtferraz/ecommerce-domain-24m')\n",
-    "\n",
     "print(f'Pre-trained model loaded: {sum(p.numel() for p in model.parameters()):,} params')"
    ]
   },
@@ -144,11 +134,15 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Step 2 — Create Labels and Tabular Features\n",
     "\n",
-    "**Label:** Binary — did the user make at least one purchase? (1=yes, 0=no)\n",
     "\n",
-    "**Tabular features:** Hand-crafted from user sequences (for the DCNv2 branch and LightGBM baseline)."
    ]
   },
   {
@@ -157,49 +151,87 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def compute_user_features(events):\n",
-    "    \"\"\"Extract tabular features from a user's event sequence.\"\"\"\n",
     "    n_events = len(events)\n",
     "    n_views = sum(1 for e in events if e['event_type'] == 'view')\n",
     "    n_carts = sum(1 for e in events if e['event_type'] == 'cart')\n",
-    "    n_purchases = sum(1 for e in events if e['event_type'] == 'purchase')\n",
     "    n_removes = sum(1 for e in events if e['event_type'] == 'remove_from_cart')\n",
     "    \n",
     "    prices = [e['price'] for e in events if e['price'] > 0]\n",
     "    avg_price = np.mean(prices) if prices else 0\n",
     "    max_price = max(prices) if prices else 0\n",
     "    std_price = np.std(prices) if len(prices) > 1 else 0\n",
     "    \n",
-    "    categories = set(e['category'] for e in events)\n",
-    "    n_unique_categories = len(categories)\n",
-    "    \n",
-    "    hours = [e['timestamp'].hour for e in events]\n",
-    "    avg_hour = np.mean(hours)\n",
     "    \n",
     "    cart_rate = n_carts / max(n_views, 1)\n",
-    "    purchase_rate = n_purchases / max(n_events, 1)\n",
     "    remove_rate = n_removes / max(n_carts, 1) if n_carts > 0 else 0\n",
     "    \n",
     "    return [\n",
-    "        n_events, n_views, n_carts, n_purchases, n_removes,\n",
     "        avg_price, max_price, std_price,\n",
     "        n_unique_categories, avg_hour,\n",
-    "        cart_rate, purchase_rate, remove_rate,\n",
     "    ]\n",
     "\n",
     "FEATURE_NAMES = [\n",
-    "    'n_events', 'n_views', 'n_carts', 'n_purchases', 'n_removes',\n",
     "    'avg_price', 'max_price', 'std_price',\n",
     "    'n_unique_categories', 'avg_hour',\n",
-    "    'cart_rate', 'purchase_rate', 'remove_rate',\n",
     "]\n",
     "\n",
-    "print(f'Computing features for {len(user_sequences):,} users...')\n",
-    "tabular_features = np.array([compute_user_features(seq) for seq in user_sequences], dtype=np.float32)\n",
-    "labels = np.array([1.0 if any(e['event_type'] == 'purchase' for e in seq) else 0.0 for seq in user_sequences])\n",
-    "\n",
-    "print(f'Features shape: {tabular_features.shape}')\n",
-    "print(f'Labels: {labels.sum():.0f} purchasers / {len(labels)} total ({labels.mean()*100:.1f}%)')"
    ]
   },
   {
@@ -210,27 +242,25 @@
    "source": [
     "# Train/test split (80/20, stratified)\n",
     "train_idx, test_idx = train_test_split(\n",
-    "    range(len(user_sequences)), test_size=0.2, random_state=42, stratify=labels\n",
     ")\n",
     "\n",
-    "train_seqs = [user_sequences[i] for i in train_idx]\n",
-    "test_seqs = [user_sequences[i] for i in test_idx]\n",
     "train_features = tabular_features[train_idx]\n",
     "test_features = tabular_features[test_idx]\n",
-    "train_labels = labels[train_idx]\n",
-    "test_labels = labels[test_idx]\n",
     "\n",
-    "print(f'Train: {len(train_seqs):,} ({train_labels.mean()*100:.1f}% positive)')\n",
-    "print(f'Test: {len(test_seqs):,} ({test_labels.mean()*100:.1f}% positive)')"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Step 3 — LightGBM Baseline\n",
-    "\n",
-    "Standard ML baseline: LightGBM on hand-crafted tabular features. This is what we need to beat."
    ]
   },
   {
@@ -244,19 +274,15 @@
     "lgb_model = lgb.LGBMClassifier(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42, verbose=-1)\n",
     "lgb_model.fit(train_features, train_labels)\n",
     "\n",
-    "lgb_train_probs = lgb_model.predict_proba(train_features)[:, 1]\n",
     "lgb_test_probs = lgb_model.predict_proba(test_features)[:, 1]\n",
-    "\n",
-    "lgb_train_auc = roc_auc_score(train_labels, lgb_train_probs)\n",
     "lgb_test_auc = roc_auc_score(test_labels, lgb_test_probs)\n",
     "\n",
-    "print(f'LightGBM Baseline:')\n",
-    "print(f'  Train AUC: {lgb_train_auc:.4f}')\n",
-    "print(f'  Test AUC:  {lgb_test_auc:.4f}')\n",
     "\n",
     "importance = pd.Series(lgb_model.feature_importances_, index=FEATURE_NAMES).sort_values(ascending=False)\n",
     "print(f'\\nTop features:')\n",
-    "for feat, imp in importance.head(5).items(): print(f'  {feat}: {imp}')"
    ]
   },
   {
@@ -265,10 +291,7 @@
    "source": [
     "## Step 4 — JointFusionModel Fine-Tuning\n",
     "\n",
-    "Combines:\n",
-    "- **Transaction branch:** Pre-trained DomainTransformer → user embedding\n",
-    "- **Tabular branch:** DCNv2 with PLR embeddings on hand-crafted features\n",
-    "- **Joint head:** MLP on concatenated embeddings → binary prediction"
    ]
   },
   {
@@ -284,8 +307,7 @@
     "test_dataset = DomainFinetuneDataset(\n",
     "    test_seqs, test_features, test_labels, builder, hf_tokenizer, max_length=MAX_LENGTH)\n",
     "\n",
-    "print(f'Train: {len(train_dataset)}, Test: {len(test_dataset)}')\n",
-    "print(f'Sample keys: {set(train_dataset[0].keys())}')"
    ]
   },
   {
@@ -328,11 +350,11 @@
     "    learning_rate=1e-4,\n",
     "    warmup_steps=50,\n",
     "    logging_steps=20,\n",
-    "    eval_steps=100 if USE_GPU else 50,\n",
     "    save_strategy='no',\n",
     "    bf16=USE_BF16, fp16=USE_FP16,\n",
     "    report_to='wandb',\n",
-    "    run_name='ecommerce-finetune-joint-5ep',\n",
     "    seed=42,\n",
     ")"
    ]
@@ -366,9 +388,7 @@
     "        all_probs.extend(probs.cpu().numpy())\n",
     "        all_labels_eval.extend(labels_batch.cpu().numpy())\n",
     "\n",
-    "all_probs = np.array(all_probs)\n",
-    "all_labels_eval = np.array(all_labels_eval)\n",
-    "fusion_test_auc = roc_auc_score(all_labels_eval, all_probs)\n",
     "print(f'JointFusion Test AUC: {fusion_test_auc:.4f}')"
    ]
   },
@@ -378,19 +398,23 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "print('=' * 50)\n",
-    "print('MODEL COMPARISON — Purchase Prediction (AUC)')\n",
-    "print('=' * 50)\n",
-    "print(f'  LightGBM (tabular only):        {lgb_test_auc:.4f}')\n",
-    "print(f'  JointFusion (Transformer+DCNv2): {fusion_test_auc:.4f}')\n",
-    "print(f'  Difference:                      {fusion_test_auc - lgb_test_auc:+.4f}')\n",
-    "print('=' * 50)\n",
     "\n",
     "if fusion_test_auc > lgb_test_auc:\n",
-    "    print(f'\\n✅ JointFusion beats LightGBM by {(fusion_test_auc - lgb_test_auc)*100:.2f} percentage points')\n",
     "else:\n",
-    "    print(f'\\n⚠️ LightGBM still leads by {(lgb_test_auc - fusion_test_auc)*100:.2f} percentage points')\n",
-    "    print(f'    (More pre-training epochs and longer context would improve transformer embeddings.)')"
    ]
   },
   {
@@ -405,9 +429,9 @@
     "fig, ax = plt.subplots(figsize=(10, 5))\n",
     "ax.plot(losses, label='Train Loss', alpha=0.7)\n",
     "if eval_losses:\n",
-    "    eval_steps_x = np.linspace(0, len(losses), len(eval_losses))\n",
-    "    ax.plot(eval_steps_x, eval_losses, 'ro-', label='Eval Loss', markersize=4)\n",
-    "ax.set_xlabel('Step'); ax.set_ylabel('Loss'); ax.set_title('Fine-Tuning Loss')\n",
     "ax.legend(); ax.grid(True, alpha=0.3); plt.tight_layout(); plt.show()"
    ]
   },
@@ -427,12 +451,14 @@
    "source": [
     "## Summary\n",
     "\n",
-    "| Model | Test AUC | Notes |\n",
     "|-------|----------|-------|\n",
-    "| LightGBM (tabular) | *see above* | 13 hand-crafted features |\n",
-    "| JointFusion (Transformer+DCNv2) | *see above* | Pre-trained domain tokens + same 13 features |\n",
     "\n",
-    "The pre-trained DomainTransformer captures sequential behavioral patterns (view→cart→purchase funnels, category stickiness, temporal habits) that hand-crafted features cannot fully represent."
    ]
   }
  ],

    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "# 03 — E-Commerce Fine-Tuning: Future Purchase Prediction\n",
     "\n",
+    "**Goal:** Fine-tune the pre-trained DomainTransformer for predicting whether a user will purchase in the future, using only their past browsing history.\n",
     "\n",
+    "**Task:** Binary classification — given the first 70% of a user's events, predict if they purchase in the remaining 30%.\n",
+    "\n",
+    "**Why temporal split:** Avoids label leakage. The previous version used `n_purchases` as a feature to predict `has_purchase` → trivial AUC 1.0. This version simulates the real production scenario: predict future behavior from past behavior.\n",
     "\n",
     "**Pre-trained model:** [rtferraz/ecommerce-domain-24m](https://huggingface.co/rtferraz/ecommerce-domain-24m)\n",
     "\n",
     "import matplotlib.pyplot as plt\n",
     "import torch\n",
     "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import roc_auc_score\n",
     "\n",
     "if os.path.exists('../src'): sys.path.insert(0, '../src')\n",
     "elif os.path.exists('src'): sys.path.insert(0, 'src')\n",
     "from domain_tokenizer import (\n",
     "    DomainTokenizerBuilder, DomainTransformerConfig,\n",
     "    DomainTransformerForCausalLM, JointFusionModel,\n",
+    "    DomainFinetuneDataset, finetune_domain_model,\n",
     ")\n",
     "from domain_tokenizer.schema import DomainSchema, FieldSpec, FieldType\n",
     "\n",
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "## Step 1 — Load Pre-trained Artifacts"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
     "with open('./ecommerce_artifacts.pkl', 'rb') as f:\n",
     "    artifacts = pickle.load(f)\n",
     "user_sequences = artifacts['user_sequences']\n",
     "user_ids = artifacts['user_ids']\n",
     "print(f'Loaded {len(user_sequences):,} users')\n",
     "\n",
     "from transformers import PreTrainedTokenizerFast\n",
     "hf_tokenizer = PreTrainedTokenizerFast.from_pretrained('./ecommerce_tokenizer')\n",
     "print(f'Tokenizer vocab: {hf_tokenizer.vocab_size}')\n",
     "\n",
     "ECOMMERCE_REES46_SCHEMA = DomainSchema(\n",
     "    name='ecommerce_rees46',\n",
     "    fields=[\n",
    "metadata": {},
    "outputs": [],
    "source": [
     "model = DomainTransformerForCausalLM.from_pretrained('./ecommerce_pretrain_checkpoints/final/')\n",
     "print(f'Pre-trained model loaded: {sum(p.numel() for p in model.parameters()):,} params')"
    ]
   },
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "## Step 2 — Temporal Split: Labels and Features\n",
     "\n",
+    "**The key design (avoids leakage):**\n",
+    "- Split each user's time-ordered events at the 70% mark (by event count, not by elapsed time)\n",
+    "- **Input to model:** first 70% of events (history)\n",
+    "- **Label:** did the user purchase in the last 30%? (future)\n",
+    "- **Tabular features:** computed only from the first 70% (no future info)\n",
     "\n",
+    "This matches Nubank's setup: predict future behavior from past history."
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Build one (history, label) pair per user: the first SPLIT_RATIO of the user's events\n",
+    "# is the model input, and the label says whether a purchase occurs in the remainder.\n",
+    "SPLIT_RATIO = 0.7  # 70% history, 30% future\n",
+    "MIN_HISTORY = 5    # need at least 5 events in history\n",
+    "MIN_FUTURE = 3     # need at least 3 events in future\n",
+    "\n",
+    "history_sequences = []  # input to model\n",
+    "future_labels = []      # target: purchased in future?\n",
+    "valid_user_ids = []     # ids of the users kept, aligned with the two lists above\n",
+    "\n",
+    "for i, events in enumerate(user_sequences):\n",
+    "    # The positional split below is only a temporal split when events are in time order.\n",
+    "    # sorted() is stable, so an already-ordered sequence comes back unchanged; an unordered\n",
+    "    # one would otherwise put later events into the history slice and leak the label.\n",
+    "    # This rebinds the loop variable only; user_sequences itself is not modified.\n",
+    "    events = sorted(events, key=lambda e: e['timestamp'])\n",
+    "\n",
+    "    split_idx = int(len(events) * SPLIT_RATIO)\n",
+    "    history = events[:split_idx]\n",
+    "    future = events[split_idx:]\n",
+    "\n",
+    "    # Skip users without enough events on either side of the split\n",
+    "    if len(history) < MIN_HISTORY or len(future) < MIN_FUTURE:\n",
+    "        continue\n",
+    "\n",
+    "    # Label: did user purchase in the future window?\n",
+    "    has_future_purchase = any(e['event_type'] == 'purchase' for e in future)\n",
+    "\n",
+    "    history_sequences.append(history)\n",
+    "    future_labels.append(1.0 if has_future_purchase else 0.0)\n",
+    "    valid_user_ids.append(user_ids[i])\n",
+    "\n",
+    "future_labels = np.array(future_labels)\n",
+    "print(f'Valid users (enough history + future): {len(history_sequences):,}')\n",
+    "print(f'Future purchasers: {future_labels.sum():.0f} / {len(future_labels)} ({future_labels.mean()*100:.1f}%)')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def compute_history_features(events):\n",
+    "    \"\"\"Build the tabular feature vector from one user's history events.\n",
+    "\n",
+    "    Only the events passed in are read, so calling this on the history slice\n",
+    "    keeps future information out of the features.\n",
+    "\n",
+    "    Args:\n",
+    "        events: sequence of event dicts; the 'event_type', 'price', 'category'\n",
+    "            and 'timestamp' keys are read. An empty sequence makes avg_hour NaN\n",
+    "            (mean of an empty list). The time span uses the first and last\n",
+    "            element, so events are assumed to be in time order — TODO confirm.\n",
+    "\n",
+    "    Returns:\n",
+    "        A list of 14 numbers, in the same order as FEATURE_NAMES.\n",
+    "    \"\"\"\n",
     "    n_events = len(events)\n",
     "    n_views = sum(1 for e in events if e['event_type'] == 'view')\n",
     "    n_carts = sum(1 for e in events if e['event_type'] == 'cart')\n",
     "    n_removes = sum(1 for e in events if e['event_type'] == 'remove_from_cart')\n",
+    "    # NOTE: n_purchases in HISTORY is allowed — it's past behavior, not future\n",
+    "    n_hist_purchases = sum(1 for e in events if e['event_type'] == 'purchase')\n",
     "    \n",
+    "    # Price statistics use strictly positive prices only; all three are 0 when none qualify\n",
+    "    # (std_price is also 0 when only one price qualifies)\n",
     "    prices = [e['price'] for e in events if e['price'] > 0]\n",
     "    avg_price = np.mean(prices) if prices else 0\n",
     "    max_price = max(prices) if prices else 0\n",
     "    std_price = np.std(prices) if len(prices) > 1 else 0\n",
     "    \n",
+    "    n_unique_categories = len(set(e['category'] for e in events))\n",
+    "    avg_hour = np.mean([e['timestamp'].hour for e in events])\n",
     "    \n",
+    "    # Funnel ratios from history\n",
+    "    # Denominators are floored at 1 so a user with no views/events gets 0 rather than a division error;\n",
+    "    # in remove_rate the max() is redundant because that branch already requires n_carts > 0\n",
     "    cart_rate = n_carts / max(n_views, 1)\n",
     "    remove_rate = n_removes / max(n_carts, 1) if n_carts > 0 else 0\n",
+    "    hist_purchase_rate = n_hist_purchases / max(n_events, 1)\n",
+    "    \n",
+    "    # Session intensity (events per day approximation)\n",
+    "    # A span shorter than one day is clamped to 1, so events_per_day never exceeds n_events\n",
+    "    if len(events) >= 2:\n",
+    "        time_span = (events[-1]['timestamp'] - events[0]['timestamp']).total_seconds() / 86400  # days\n",
+    "        events_per_day = n_events / max(time_span, 1)\n",
+    "    else:\n",
+    "        events_per_day = 0\n",
     "    \n",
     "    return [\n",
+    "        n_events, n_views, n_carts, n_removes, n_hist_purchases,\n",
     "        avg_price, max_price, std_price,\n",
     "        n_unique_categories, avg_hour,\n",
+    "        cart_rate, remove_rate, hist_purchase_rate, events_per_day,\n",
     "    ]\n",
     "\n",
+    "# Column names for the feature matrix; must stay in the same order as the list returned above\n",
     "FEATURE_NAMES = [\n",
+    "    'n_events', 'n_views', 'n_carts', 'n_removes', 'n_hist_purchases',\n",
     "    'avg_price', 'max_price', 'std_price',\n",
     "    'n_unique_categories', 'avg_hour',\n",
+    "    'cart_rate', 'remove_rate', 'hist_purchase_rate', 'events_per_day',\n",
     "]\n",
     "\n",
+    "# One float32 row per entry of history_sequences, in the same order\n",
+    "print(f'Computing features from history only...')\n",
+    "tabular_features = np.array([compute_history_features(seq) for seq in history_sequences], dtype=np.float32)\n",
+    "print(f'Features: {tabular_features.shape}, {len(FEATURE_NAMES)} features')\n",
+    "print(f'Feature names: {FEATURE_NAMES}')"
    ]
   },
   {
    "source": [
     "# Train/test split (80/20, stratified)\n",
     "train_idx, test_idx = train_test_split(\n",
+    "    range(len(history_sequences)), test_size=0.2, random_state=42, stratify=future_labels\n",
     ")\n",
     "\n",
+    "train_seqs = [history_sequences[i] for i in train_idx]\n",
+    "test_seqs = [history_sequences[i] for i in test_idx]\n",
     "train_features = tabular_features[train_idx]\n",
     "test_features = tabular_features[test_idx]\n",
+    "train_labels = future_labels[train_idx]\n",
+    "test_labels = future_labels[test_idx]\n",
     "\n",
+    "print(f'Train: {len(train_seqs):,} ({train_labels.mean()*100:.1f}% will purchase in future)')\n",
+    "print(f'Test: {len(test_seqs):,} ({test_labels.mean()*100:.1f}% will purchase in future)')"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "## Step 3 — LightGBM Baseline (history features only)"
    ]
   },
   {
     "lgb_model = lgb.LGBMClassifier(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42, verbose=-1)\n",
     "lgb_model.fit(train_features, train_labels)\n",
     "\n",
     "lgb_test_probs = lgb_model.predict_proba(test_features)[:, 1]\n",
     "lgb_test_auc = roc_auc_score(test_labels, lgb_test_probs)\n",
     "\n",
+    "print(f'LightGBM Baseline (history features only):')\n",
+    "print(f'  Test AUC: {lgb_test_auc:.4f}')\n",
     "\n",
     "importance = pd.Series(lgb_model.feature_importances_, index=FEATURE_NAMES).sort_values(ascending=False)\n",
     "print(f'\\nTop features:')\n",
+    "for feat, imp in importance.head(7).items(): print(f'  {feat}: {imp}')"
    ]
   },
   {
    "source": [
     "## Step 4 — JointFusionModel Fine-Tuning\n",
     "\n",
+    "The transformer sees the **raw event sequence** (history only). The DCNv2 branch sees the **hand-crafted features** (also history only). The question: does the raw sequence add signal beyond what the features capture?"
    ]
   },
   {
     "test_dataset = DomainFinetuneDataset(\n",
     "    test_seqs, test_features, test_labels, builder, hf_tokenizer, max_length=MAX_LENGTH)\n",
     "\n",
+    "print(f'Train: {len(train_dataset)}, Test: {len(test_dataset)}')"
    ]
   },
   {
     "    learning_rate=1e-4,\n",
     "    warmup_steps=50,\n",
     "    logging_steps=20,\n",
+    "    eval_steps=200 if USE_GPU else 50,\n",
     "    save_strategy='no',\n",
     "    bf16=USE_BF16, fp16=USE_FP16,\n",
     "    report_to='wandb',\n",
+    "    run_name='ecommerce-finetune-temporal-5ep',\n",
     "    seed=42,\n",
     ")"
    ]
     "        all_probs.extend(probs.cpu().numpy())\n",
     "        all_labels_eval.extend(labels_batch.cpu().numpy())\n",
     "\n",
+    "fusion_test_auc = roc_auc_score(np.array(all_labels_eval), np.array(all_probs))\n",
     "print(f'JointFusion Test AUC: {fusion_test_auc:.4f}')"
    ]
   },
    "metadata": {},
    "outputs": [],
    "source": [
+    "print('=' * 60)\n",
+    "print('MODEL COMPARISON — Future Purchase Prediction (AUC)')\n",
+    "print('=' * 60)\n",
+    "print(f'  LightGBM (history features only):     {lgb_test_auc:.4f}')\n",
+    "print(f'  JointFusion (Transformer + features): {fusion_test_auc:.4f}')\n",
+    "print(f'  Difference:                           {fusion_test_auc - lgb_test_auc:+.4f}')\n",
+    "print('=' * 60)\n",
     "\n",
     "if fusion_test_auc > lgb_test_auc:\n",
+    "    print(f'\\n✅ JointFusion beats LightGBM by {(fusion_test_auc - lgb_test_auc)*100:.2f} pp')\n",
+    "    print(f'   The sequential patterns from domain tokens add value beyond tabular features.')\n",
+    "elif abs(fusion_test_auc - lgb_test_auc) < 0.005:\n",
+    "    print(f'\\n≈ Roughly tied. The transformer embeddings match LightGBM.')\n",
+    "    print(f'   More pre-training epochs would likely push JointFusion ahead.')\n",
     "else:\n",
+    "    print(f'\\n⚠️ LightGBM leads by {(lgb_test_auc - fusion_test_auc)*100:.2f} pp')\n",
+    "    print(f'   More pre-training (10+ epochs) and longer context (1024+) needed.')"
    ]
   },
   {
     "fig, ax = plt.subplots(figsize=(10, 5))\n",
     "ax.plot(losses, label='Train Loss', alpha=0.7)\n",
     "if eval_losses:\n",
+    "    eval_x = np.linspace(0, len(losses), len(eval_losses))\n",
+    "    ax.plot(eval_x, eval_losses, 'ro-', label='Eval Loss', markersize=4)\n",
+    "ax.set_xlabel('Step'); ax.set_ylabel('Loss'); ax.set_title('Fine-Tuning Loss (Temporal Split)')\n",
     "ax.legend(); ax.grid(True, alpha=0.3); plt.tight_layout(); plt.show()"
    ]
   },
    "source": [
     "## Summary\n",
     "\n",
+    "| Model | Test AUC | Input |\n",
     "|-------|----------|-------|\n",
+    "| LightGBM | *see above* | 14 history-only features |\n",
+    "| JointFusion | *see above* | Pre-trained domain token sequence + same 14 features |\n",
+    "\n",
+    "**Task:** Predict future purchase from past browsing history (temporal split, no leakage).\n",
     "\n",
+    "The pre-trained DomainTransformer captures sequential patterns (browsing funnels, category stickiness, temporal habits) that may add predictive signal beyond aggregate features."
    ]
   }
  ],