Spaces:
Sleeping
Sleeping
| import json | |
| import os | |
| import random | |
| import numpy as np | |
| # from clearml import Task | |
| from sklearn.metrics import f1_score, roc_auc_score | |
| from xgboost import XGBClassifier | |
| from data_preparation.prepare_dataset import get_numpy_splits | |
# Repository root: two directory levels above the directory containing this file.
_PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))

# Single source of truth for the run: dataset selection, output locations and
# the XGBoost hyperparameters consumed by main().
CFG = {
    "model_name": "face_orientation",
    "seed": 42,
    "split_ratios": (0.7, 0.15, 0.15),
    "scale": False,  # XGBoost is tree-based - scaling is unnecessary
    "checkpoints_dir": os.path.join(_PROJECT_ROOT, "checkpoints"),
    "logs_dir": os.path.join(_PROJECT_ROOT, "evaluation", "logs"),
    # XGBoost hyperparameters chosen by F1 score in 40 trials of Optuna HPO
    "n_estimators": 600,
    "max_depth": 8,
    "learning_rate": 0.1489,
    "subsample": 0.9625,
    "colsample_bytree": 0.9013,
    "reg_alpha": 1.1407,
    "reg_lambda": 2.4181,
    "eval_metric": "logloss",
}

# ClearML disabled (uncomment + set credentials to re-enable)
# task = Task.init(
#     project_name="FocusGuards Large Group Project",
#     task_name="XGBoost Model Training",
#     tags=["training", "xgboost"]
# )
# task.connect(CFG)
# While ClearML is disabled, `task` stays None and main() skips all ClearML
# reporting.
task = None
def set_seed(seed: int) -> None:
    """Seed Python's ``random`` module and NumPy's global RNG with *seed*.

    Args:
        seed: Value passed unchanged to ``random.seed`` and ``np.random.seed``.
    """
    random.seed(seed)
    np.random.seed(seed)
# XGBoost's "logloss" and "error" metrics are defined for binary problems;
# these are their multi-class counterparts.
_MULTICLASS_EVAL_METRICS = {"logloss": "mlogloss", "error": "merror"}


def _resolve_eval_metric(metric: str, num_classes: int) -> str:
    """Return the eval metric name to use for a problem with *num_classes* classes.

    For more than two classes a binary-only metric is swapped for its
    multi-class counterpart; any other metric, and every metric for binary
    problems, is returned unchanged.
    """
    if num_classes > 2:
        return _MULTICLASS_EVAL_METRICS.get(metric, metric)
    return metric


def main() -> None:
    """Train, evaluate and persist the XGBoost classifier described by ``CFG``.

    Steps, in order:
      1. Seed the RNGs and load the train/val/test splits via
         ``get_numpy_splits``.
      2. Fit an ``XGBClassifier`` with early stopping (30 rounds), recording
         the eval metric on the training and validation splits.
      3. Compute accuracy, weighted F1 and ROC-AUC on the test split and
         print them.
      4. Report loss curves and test metrics to ClearML when ``task`` is set.
      5. Save the model under ``CFG["checkpoints_dir"]`` and a JSON training
         log under ``CFG["logs_dir"]``.
    """
    set_seed(CFG["seed"])
    print("[TRAIN] Model: XGBoost")
    print(f"[TRAIN] Task: {CFG['model_name']}")

    # --- Data ---------------------------------------------------------------
    # The feature count and scaler returned by the loader are not needed here.
    splits, _num_features, num_classes, _scaler = get_numpy_splits(
        model_name=CFG["model_name"],
        split_ratios=CFG["split_ratios"],
        seed=CFG["seed"],
        scale=CFG["scale"],
    )
    X_train, y_train = splits["X_train"], splits["y_train"]
    X_val, y_val = splits["X_val"], splits["y_val"]
    X_test, y_test = splits["X_test"], splits["y_test"]

    # --- Model --------------------------------------------------------------
    # Binary-only metrics such as "logloss" are rejected by XGBoost on
    # multi-class targets, so pick the matching multi-class metric when needed.
    # The resolved name is also the key used in evals_result() below.
    eval_metric = _resolve_eval_metric(CFG["eval_metric"], num_classes)
    model = XGBClassifier(
        n_estimators=CFG["n_estimators"],
        max_depth=CFG["max_depth"],
        learning_rate=CFG["learning_rate"],
        subsample=CFG["subsample"],
        colsample_bytree=CFG["colsample_bytree"],
        reg_alpha=CFG["reg_alpha"],
        reg_lambda=CFG["reg_lambda"],
        eval_metric=eval_metric,
        early_stopping_rounds=30,
        random_state=CFG["seed"],
        verbosity=1,
    )
    # Order matters: "validation_0" is the training split and "validation_1"
    # the validation split. Per the XGBoost docs, early stopping monitors the
    # last entry of eval_set.
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=10,
    )
    print(f"[TRAIN] Best iteration: {model.best_iteration} / {CFG['n_estimators']}")

    # --- Evaluation ---------------------------------------------------------
    evals = model.evals_result()
    train_losses = evals["validation_0"][eval_metric]
    val_losses = evals["validation_1"][eval_metric]

    # Test metrics
    test_preds = model.predict(X_test)
    test_probs = model.predict_proba(X_test)
    test_acc = float(np.mean(test_preds == y_test))
    test_f1 = float(f1_score(y_test, test_preds, average='weighted'))
    if num_classes > 2:
        # Multi-class: one-vs-rest AUC over the full probability matrix.
        test_auc = float(roc_auc_score(y_test, test_probs, multi_class='ovr', average='weighted'))
    else:
        # Binary: score with the second probability column only.
        test_auc = float(roc_auc_score(y_test, test_probs[:, 1]))
    print(f"\n[TEST] Accuracy: {test_acc:.2%}")
    print(f"[TEST] F1: {test_f1:.4f}")
    print(f"[TEST] ROC-AUC: {test_auc:.4f}")

    # ClearML logging (no-op when task is None)
    if task is not None:
        for i, (tl, vl) in enumerate(zip(train_losses, val_losses)):
            task.logger.report_scalar("Loss", "Train", tl, iteration=i + 1)
            task.logger.report_scalar("Loss", "Val", vl, iteration=i + 1)
        task.logger.report_single_value("test_accuracy", test_acc)
        task.logger.report_single_value("test_f1", test_f1)
        task.logger.report_single_value("test_auc", test_auc)
        task.logger.flush()

    # --- Save checkpoint ----------------------------------------------------
    ckpt_dir = CFG["checkpoints_dir"]
    os.makedirs(ckpt_dir, exist_ok=True)
    model_path = os.path.join(ckpt_dir, f"xgboost_{CFG['model_name']}_best.json")
    model.save_model(model_path)
    print(f"\n[CKPT] Model saved to: {model_path}")

    # --- Write JSON log (same schema as MLP) --------------------------------
    history = {
        "model_name": f"xgboost_{CFG['model_name']}",
        "param_count": int(model.get_booster().trees_to_dataframe().shape[0]),  # total tree nodes
        "n_estimators": CFG["n_estimators"],
        "max_depth": CFG["max_depth"],
        # One "epoch" per recorded boosting round, numbered from 1.
        "epochs": list(range(1, len(train_losses) + 1)),
        "train_loss": [round(v, 4) for v in train_losses],
        "val_loss": [round(v, 4) for v in val_losses],
        "test_acc": round(test_acc, 4),
        "test_f1": round(test_f1, 4),
        "test_auc": round(test_auc, 4),
    }
    logs_dir = CFG["logs_dir"]
    os.makedirs(logs_dir, exist_ok=True)
    log_path = os.path.join(logs_dir, f"xgboost_{CFG['model_name']}_training_log.json")
    with open(log_path, "w", encoding="utf-8") as f:
        json.dump(history, f, indent=2)
    print(f"[LOG] Training history saved to: {log_path}")
# Run the training pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()