"""
RTB Bidding Algorithm Comparison Framework
==========================================

Runs all bidding algorithms on first-price auction simulations
and produces comprehensive comparison results.

Algorithms:
- DualOGD: Lagrangian dual + online gradient descent (Wang et al. 2023)
- TwoSidedDual: budget cap + spend floor (spend at least k*100% of budget)
- ValueShading: value shading for first-price auctions
- RLB: MDP-based reinforcement learning (Cai et al. 2017)
- Linear: proportional bidding baseline
- Threshold: fixed bid whenever pCTR exceeds a threshold (baseline)
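
Example invocation (script name assumed; substitute this file's actual name):

    python run_benchmark.py --max_rows 100000 --T 10000 --budget 5000 \
        --vpc 50 --k 0.8 --n_runs 3 --output results/benchmark_results.json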
"""
import sys
import os
import json

import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

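# Make the repo root (one level above this file) importable so the
# `src.*` packages used below resolve when the script is run directly.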
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


def load_and_prepare_data(max_rows=100000):
    """Load Criteo_x4 and prepare features + labels."""
    print("=" * 70)
    print("LOADING CRITEO DATA")
    print("=" * 70)

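    # Stream the split so only the first max_rows rows are materialized.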
    ds = load_dataset("reczoo/Criteo_x4", split="train", streaming=True)
    rows = []
    for i, row in enumerate(ds):
        if i >= max_rows:
            break
        rows.append(row)

    df = pd.DataFrame(rows)
    print(f"Loaded {len(df)} rows, CTR: {df['Label'].mean():.4f}")

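    # Criteo fields: 13 numeric columns I1..I13, 26 categorical columns C1..C26.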
    dense_cols = [f'I{i}' for i in range(1, 14)]
    sparse_cols = [f'C{i}' for i in range(1, 27)]

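    # Impute missing values: median for dense features, sentinel for categoricals.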
    for col in dense_cols:
        df[col] = df[col].fillna(df[col].median())
    for col in sparse_cols:
        df[col] = df[col].fillna("MISSING")

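    # Encode categorical strings as integer IDs.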
    for col in sparse_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))

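    # Z-score the dense features.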
    scaler = StandardScaler()
    dense_data = scaler.fit_transform(df[dense_cols].values)
    for i, col in enumerate(dense_cols):
        df[col] = dense_data[:, i]

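    # Z-score the label-encoded IDs as well. This treats arbitrary category IDs
    # as ordinal values -- a crude featurization, but cheap and workable for a
    # linear CTR baseline.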
    sparse_data = df[sparse_cols].values.astype(np.float32)
    sparse_data = (sparse_data - sparse_data.mean(axis=0)) / (sparse_data.std(axis=0) + 1e-8)
    for i, col in enumerate(sparse_cols):
        df[col] = sparse_data[:, i]

    feature_cols = dense_cols + sparse_cols
    X = df[feature_cols].values.astype(np.float32)
    y = df['Label'].values.astype(np.float32)

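    # 70/30 split: fit the CTR model on one part, simulate auctions on the other.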
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    return X_train, X_test, y_train, y_test, df, feature_cols


def train_ctr_model(X_train, y_train):
    """Train a CTR prediction model (Logistic Regression baseline)."""
    print("\n" + "=" * 70)
    print("TRAINING CTR MODEL")
    print("=" * 70)

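    # L2-regularized logistic regression baseline (C=0.1, up to 500 iterations).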
    model = LogisticRegression(max_iter=500, C=0.1, random_state=42)
    model.fit(X_train, y_train)

    train_auc = roc_auc_score_safe(y_train, model.predict_proba(X_train)[:, 1])
    print(f"Train AUC: {train_auc:.4f}")

    return model


def roc_auc_score_safe(y_true, y_pred):
    """AUC that degrades gracefully when only one class is present."""
    from sklearn.metrics import roc_auc_score
    if len(np.unique(y_true)) < 2:
        return 0.5
    return roc_auc_score(y_true, y_pred)


def run_benchmark(
    X_test, y_test, ctr_model,
    budget=5000.0,
    T=10000,
    value_per_click=50.0,
    k=0.8,
    n_runs=3,
    seed=42
):
    """Run all bidding algorithms and compare."""
    print("\n" + "=" * 70)
    print("RUNNING BIDDING BENCHMARK")
    print("=" * 70)
    print(f"Budget: {budget}, T: {T}, Value/Click: {value_per_click}")
    print(f"Minimum spend: {k*100:.0f}%, Runs: {n_runs}")

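    # Project-local imports; the repo root was added to sys.path above.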
    from src.benchmark.auction_simulator import FirstPriceAuctionSimulator
    from src.algorithms.dual_ogd import DualOGDBidder, TwoSidedDualBidder
    from src.algorithms.baselines import LinearBidder, ThresholdBidder, ValueShadingBidder, RLBBidder

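    # Score the held-out impressions; these pCTRs are what every bidder sees.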
    pctr_test = ctr_model.predict_proba(X_test)[:, 1]
    print(f"pCTR range: [{pctr_test.min():.4f}, {pctr_test.max():.4f}]")
    print(f"pCTR mean: {pctr_test.mean():.4f}")

    all_results = {}

    for run in range(n_runs):
        run_seed = seed + run
        print(f"\n--- Run {run + 1}/{n_runs} (seed={run_seed}) ---")

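        # The impression stream is identical across runs; the per-run seed
        # varies the simulator's stochastic parts (e.g. market-price noise).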
        sim = FirstPriceAuctionSimulator(
            features=X_test[:T],
            pctr_true=pctr_test[:T],
            click_labels=y_test[:T],
            value_per_click=value_per_click,
            market_price_config={
                'base_mean': 20.0,
                'ctr_correlation': 10.0,
                'noise_std': 0.6,
            },
            seed=run_seed
        )

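        # Fresh bidder instances every run so no state leaks across seeds.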
        algorithms = {
            'DualOGD': DualOGDBidder(budget, T, value_per_click),
            'TwoSidedDual': TwoSidedDualBidder(budget, T, value_per_click, k=k),
            'ValueShading': ValueShadingBidder(budget, T, value_per_click),
            'RLB': RLBBidder(budget, T, value_per_click),
            'Linear': LinearBidder(20.0, float(pctr_test.mean())),
            'Threshold': ThresholdBidder(0.3, 30.0),
        }

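        # Defensive reset: every budget-aware bidder starts with a full budget.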
        for algo in algorithms.values():
            if hasattr(algo, 'B'):
                algo.B = budget
                algo.remaining_budget = budget

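        # run_comparison is expected to return {algo_name: metrics}; the
        # aggregation below reads 'total_clicks', 'cpc', 'budget_used_frac'
        # and 'win_rate' from each metrics dict.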
        run_results = sim.run_comparison(algorithms)

        for name, results in run_results.items():
            if name not in all_results:
                all_results[name] = []
            all_results[name].append(results)

    return all_results, pctr_test


def aggregate_results(all_results):
    """Aggregate results across runs (mean and std per metric)."""
    print("\n" + "=" * 70)
    print("AGGREGATED RESULTS")
    print("=" * 70)

    aggregated = {}

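    # Collect per-run metrics; keys missing from a run default to 0.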
    for name, runs in all_results.items():
        clicks = [r['total_clicks'] for r in runs]
        cpc = [r.get('cpc', 0) for r in runs]
        budget_used = [r.get('budget_used_frac', 0) for r in runs]
        win_rate = [r.get('win_rate', 0) for r in runs]

        aggregated[name] = {
            'clicks_mean': np.mean(clicks),
            'clicks_std': np.std(clicks),
            'cpc_mean': np.mean(cpc),
            'cpc_std': np.std(cpc),
            'budget_used_mean': np.mean(budget_used),
            'budget_used_std': np.std(budget_used),
            'win_rate_mean': np.mean(win_rate),
            'win_rate_std': np.std(win_rate),
        }

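    # Comparison table, sorted by mean clicks (best algorithm first).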
    print(f"\n{'Algorithm':<18} {'Clicks':>10} {'CPC':>10} {'Budget%':>10} {'WinRate':>10}")
    print("-" * 62)

    sorted_algos = sorted(aggregated.items(), key=lambda x: x[1]['clicks_mean'], reverse=True)

    for name, stats in sorted_algos:
        clicks_str = f"{stats['clicks_mean']:.0f}±{stats['clicks_std']:.0f}"
        print(f"{name:<18} {clicks_str:>10} {stats['cpc_mean']:>10.2f} "
              f"{stats['budget_used_mean']:>10.1%} {stats['win_rate_mean']:>10.1%}")

    return aggregated


def main():
    import argparse
    parser = argparse.ArgumentParser(description='RTB Bidding Benchmark')
    parser.add_argument('--max_rows', type=int, default=100000)
    parser.add_argument('--budget', type=float, default=5000.0)
    parser.add_argument('--T', type=int, default=10000)
    parser.add_argument('--vpc', type=float, default=50.0)
    parser.add_argument('--k', type=float, default=0.8)
    parser.add_argument('--n_runs', type=int, default=3)
    parser.add_argument('--output', type=str, default='/app/results/benchmark_results.json')
    parser.add_argument('--seed', type=int, default=42)
    args = parser.parse_args()

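    # 1. Load and preprocess the Criteo sample.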
    X_train, X_test, y_train, y_test, df, feature_cols = load_and_prepare_data(
        max_rows=args.max_rows
    )

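    # 2. Fit the CTR model on the training split.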
    ctr_model = train_ctr_model(X_train, y_train)

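    # 3. Run the auction benchmark on the held-out split.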
    all_results, pctr_test = run_benchmark(
        X_test, y_test, ctr_model,
        budget=args.budget,
        T=args.T,
        value_per_click=args.vpc,
        k=args.k,
        n_runs=args.n_runs,
        seed=args.seed
    )

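    # 4. Aggregate metrics across runs and print the comparison table.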
    aggregated = aggregate_results(all_results)

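    # 5. Persist config and results; numpy scalars are cast to native Python
    #    types so the dict is JSON-serializable.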
    os.makedirs(os.path.dirname(args.output), exist_ok=True)
    output = {
        'config': {
            'max_rows': args.max_rows,
            'budget': args.budget,
            'T': args.T,
            'value_per_click': args.vpc,
            'k': args.k,
            'n_runs': args.n_runs,
            'seed': args.seed,
        },
        'aggregated': {k: {kk: float(vv) if isinstance(vv, (np.floating, np.integer)) else vv
                           for kk, vv in v.items()}
                       for k, v in aggregated.items()},
        'raw_runs': {k: [{kk: float(vv) if isinstance(vv, (np.floating, np.integer)) else vv
                          for kk, vv in r.items()}
                         for r in runs]
                     for k, runs in all_results.items()},
    }

    with open(args.output, 'w') as f:
        json.dump(output, f, indent=2)

    print(f"\nResults saved to {args.output}")


if __name__ == '__main__':
    main()