"""Train an XGBoost classifier for the face-orientation task.

Loads numpy splits, fits a tuned XGBClassifier with early stopping,
reports test metrics (accuracy / weighted F1 / ROC-AUC), saves a model
checkpoint, and writes a JSON training log with the same schema as the MLP.
"""

import json
import os
import random

import numpy as np
# from clearml import Task
from sklearn.metrics import f1_score, roc_auc_score
from xgboost import XGBClassifier

from data_preparation.prepare_dataset import get_numpy_splits

_PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))

CFG = {
    "model_name": "face_orientation",
    "seed": 42,
    "split_ratios": (0.7, 0.15, 0.15),
    "scale": False,  # XGBoost is tree-based — scaling is unnecessary
    "checkpoints_dir": os.path.join(_PROJECT_ROOT, "checkpoints"),
    "logs_dir": os.path.join(_PROJECT_ROOT, "evaluation", "logs"),
    # XGBoost hyperparameters chosen by F1 score in 40 trials of Optuna HPO
    "n_estimators": 600,
    "max_depth": 8,
    "learning_rate": 0.1489,
    "subsample": 0.9625,
    "colsample_bytree": 0.9013,
    "reg_alpha": 1.1407,
    "reg_lambda": 2.4181,
    "eval_metric": "logloss",
}

# ClearML disabled (uncomment + set credentials to re-enable)
# task = Task.init(
#     project_name="FocusGuards Large Group Project",
#     task_name="XGBoost Model Training",
#     tags=["training", "xgboost"]
# )
# task.connect(CFG)
task = None


def set_seed(seed: int):
    """Seed Python's and NumPy's RNGs for a reproducible run."""
    random.seed(seed)
    np.random.seed(seed)


def main():
    """End-to-end training run: data → fit → evaluate → checkpoint → log."""
    set_seed(CFG["seed"])
    print("[TRAIN] Model: XGBoost")
    print(f"[TRAIN] Task: {CFG['model_name']}")

    # ── Data ──────────────────────────────────────────────────────
    splits, num_features, num_classes, scaler = get_numpy_splits(
        model_name=CFG["model_name"],
        split_ratios=CFG["split_ratios"],
        seed=CFG["seed"],
        scale=CFG["scale"],
    )
    X_train, y_train = splits["X_train"], splits["y_train"]
    X_val, y_val = splits["X_val"], splits["y_val"]
    X_test, y_test = splits["X_test"], splits["y_test"]

    # ── Model ─────────────────────────────────────────────────────
    # Pull the tuned hyperparameters straight out of CFG.
    hp = {
        k: CFG[k]
        for k in (
            "n_estimators",
            "max_depth",
            "learning_rate",
            "subsample",
            "colsample_bytree",
            "reg_alpha",
            "reg_lambda",
            "eval_metric",
        )
    }
    model = XGBClassifier(
        **hp,
        early_stopping_rounds=30,
        random_state=CFG["seed"],
        verbosity=1,
    )
    model.fit(
        X_train,
        y_train,
        # validation_0 = train curve, validation_1 = val curve (early stopping)
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=10,
    )
    print(f"[TRAIN] Best iteration: {model.best_iteration} / {CFG['n_estimators']}")

    # ── Evaluation ────────────────────────────────────────────────
    curves = model.evals_result()
    metric = CFG["eval_metric"]
    train_curve = curves["validation_0"][metric]
    val_curve = curves["validation_1"][metric]

    # Test metrics
    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)
    test_acc = float(np.mean(preds == y_test))
    test_f1 = float(f1_score(y_test, preds, average='weighted'))
    # Multiclass AUC uses one-vs-rest over class probabilities; binary uses
    # the positive-class column only.
    test_auc = float(
        roc_auc_score(y_test, probs, multi_class='ovr', average='weighted')
        if num_classes > 2
        else roc_auc_score(y_test, probs[:, 1])
    )

    print(f"\n[TEST] Accuracy: {test_acc:.2%}")
    print(f"[TEST] F1: {test_f1:.4f}")
    print(f"[TEST] ROC-AUC: {test_auc:.4f}")

    # ClearML logging (no-op when task is None)
    if task is not None:
        for step, (tl, vl) in enumerate(zip(train_curve, val_curve), start=1):
            task.logger.report_scalar("Loss", "Train", tl, iteration=step)
            task.logger.report_scalar("Loss", "Val", vl, iteration=step)
        task.logger.report_single_value("test_accuracy", test_acc)
        task.logger.report_single_value("test_f1", test_f1)
        task.logger.report_single_value("test_auc", test_auc)
        task.logger.flush()

    # ── Save checkpoint ───────────────────────────────────────────
    os.makedirs(CFG["checkpoints_dir"], exist_ok=True)
    model_path = os.path.join(
        CFG["checkpoints_dir"], f"xgboost_{CFG['model_name']}_best.json"
    )
    model.save_model(model_path)
    print(f"\n[CKPT] Model saved to: {model_path}")

    # ── Write JSON log (same schema as MLP) ───────────────────────
    history = {
        "model_name": f"xgboost_{CFG['model_name']}",
        "param_count": int(model.get_booster().trees_to_dataframe().shape[0]),  # total tree nodes
        "n_estimators": CFG["n_estimators"],
        "max_depth": CFG["max_depth"],
        "epochs": list(range(1, len(train_curve) + 1)),
        "train_loss": [round(v, 4) for v in train_curve],
        "val_loss": [round(v, 4) for v in val_curve],
        "test_acc": round(test_acc, 4),
        "test_f1": round(test_f1, 4),
        "test_auc": round(test_auc, 4),
    }
    os.makedirs(CFG["logs_dir"], exist_ok=True)
    log_path = os.path.join(
        CFG["logs_dir"], f"xgboost_{CFG['model_name']}_training_log.json"
    )
    with open(log_path, "w") as f:
        json.dump(history, f, indent=2)
    print(f"[LOG] Training history saved to: {log_path}")


if __name__ == "__main__":
    main()