# Spaces: FocusGuard / IntegrationTest (status: Sleeping)
# File size: 5,601 Bytes
# 8bbb872  (presumably the commit hash shown by the hosting page)

import json
import os
import random

import numpy as np
# from clearml import Task
from sklearn.metrics import f1_score, roc_auc_score
from xgboost import XGBClassifier

from data_preparation.prepare_dataset import get_numpy_splits

# Repository root, resolved as two directory levels above this script's folder.
_PROJECT_ROOT = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
)

# Run-level settings: task name, seeding, data split, and output locations.
_RUN_SETTINGS = {
    "model_name": "face_orientation",
    "seed": 42,
    "split_ratios": (0.7, 0.15, 0.15),
    "scale": False,  # XGBoost is tree-based — scaling is unnecessary
    "checkpoints_dir": os.path.join(_PROJECT_ROOT, "checkpoints"),
    "logs_dir": os.path.join(_PROJECT_ROOT, "evaluation", "logs"),
}

# XGBoost hyperparameters chosen by F1 score in 40 trials of Optuna HPO.
_XGB_HYPERPARAMS = {
    "n_estimators": 600,
    "max_depth": 8,
    "learning_rate": 0.1489,
    "subsample": 0.9625,
    "colsample_bytree": 0.9013,
    "reg_alpha": 1.1407,
    "reg_lambda": 2.4181,
    "eval_metric": "logloss",
}

# Single flat configuration mapping read by the rest of the script.
# Key order is run settings first, then hyperparameters.
CFG = {**_RUN_SETTINGS, **_XGB_HYPERPARAMS}


# ClearML experiment tracking is disabled. To re-enable it, restore the
# `from clearml import Task` import, uncomment the lines below, and set
# credentials.
# task = Task.init(
#     project_name="FocusGuards Large Group Project",
#     task_name="XGBoost Model Training",
#     tags=["training", "xgboost"]
# )
# task.connect(CFG)
# While `task` is None, the ClearML logging step in main() is skipped.
task = None

def set_seed(seed: int):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``.

    Args:
        seed: Integer handed unchanged to ``random.seed`` and then to
            ``np.random.seed``.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


def _evaluate_on_test(model, X_test, y_test, num_classes):
    """Score a fitted classifier on the held-out test split.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: Test features.
        y_test: Test labels.
        num_classes: Number of target classes; selects the ROC-AUC variant.

    Returns:
        Tuple ``(accuracy, weighted_f1, roc_auc)`` of Python floats.
    """
    test_preds = model.predict(X_test)
    test_probs = model.predict_proba(X_test)
    test_acc = float(np.mean(test_preds == y_test))
    test_f1 = float(f1_score(y_test, test_preds, average="weighted"))

    if num_classes > 2:
        # Multi-class: one-vs-rest AUC over the full probability matrix.
        test_auc = float(
            roc_auc_score(y_test, test_probs, multi_class="ovr", average="weighted")
        )
    else:
        # Binary: score with the probability column at index 1.
        test_auc = float(roc_auc_score(y_test, test_probs[:, 1]))
    return test_acc, test_f1, test_auc


def _log_to_clearml(train_losses, val_losses, test_acc, test_f1, test_auc):
    """Report loss curves and test metrics to ClearML.

    Does nothing when the module-level ``task`` is None (ClearML disabled).
    """
    if task is None:
        return
    for step, (train_loss, val_loss) in enumerate(zip(train_losses, val_losses), start=1):
        task.logger.report_scalar("Loss", "Train", train_loss, iteration=step)
        task.logger.report_scalar("Loss", "Val", val_loss, iteration=step)
    task.logger.report_single_value("test_accuracy", test_acc)
    task.logger.report_single_value("test_f1", test_f1)
    task.logger.report_single_value("test_auc", test_auc)
    task.logger.flush()


def _write_training_log(model, train_losses, val_losses, test_acc, test_f1, test_auc):
    """Write the JSON training history to ``CFG["logs_dir"]`` and return its path.

    The original note on this step says the schema matches the MLP log.
    Loss values and test metrics are rounded to four decimals.
    """
    history = {
        "model_name": f"xgboost_{CFG['model_name']}",
        # Total number of tree nodes across the booster (rows of the tree dump).
        "param_count": int(model.get_booster().trees_to_dataframe().shape[0]),
        "n_estimators": CFG["n_estimators"],
        "max_depth": CFG["max_depth"],
        "epochs": list(range(1, len(train_losses) + 1)),
        "train_loss": [round(v, 4) for v in train_losses],
        "val_loss": [round(v, 4) for v in val_losses],
        "test_acc": round(test_acc, 4),
        "test_f1": round(test_f1, 4),
        "test_auc": round(test_auc, 4),
    }

    logs_dir = CFG["logs_dir"]
    os.makedirs(logs_dir, exist_ok=True)
    log_path = os.path.join(logs_dir, f"xgboost_{CFG['model_name']}_training_log.json")

    # Explicit encoding so the file does not depend on the platform locale.
    with open(log_path, "w", encoding="utf-8") as f:
        json.dump(history, f, indent=2)
    return log_path


def main() -> None:
    """Train, evaluate, and persist the XGBoost classifier described by ``CFG``.

    Steps: seed the RNGs, load the train/val/test splits, fit with early
    stopping, print test metrics, optionally report to ClearML, save the
    model under ``CFG["checkpoints_dir"]``, and write a JSON training log
    under ``CFG["logs_dir"]``.
    """
    set_seed(CFG["seed"])

    print("[TRAIN] Model: XGBoost")
    print(f"[TRAIN] Task: {CFG['model_name']}")

    # ── Data ──────────────────────────────────────────────────────
    # The feature count and scaler returned by the loader are not needed here.
    splits, _num_features, num_classes, _scaler = get_numpy_splits(
        model_name=CFG["model_name"],
        split_ratios=CFG["split_ratios"],
        seed=CFG["seed"],
        scale=CFG["scale"],
    )

    X_train, y_train = splits["X_train"], splits["y_train"]
    X_val, y_val = splits["X_val"], splits["y_val"]
    X_test, y_test = splits["X_test"], splits["y_test"]

    # ── Model ─────────────────────────────────────────────────────
    model = XGBClassifier(
        n_estimators=CFG["n_estimators"],
        max_depth=CFG["max_depth"],
        learning_rate=CFG["learning_rate"],
        subsample=CFG["subsample"],
        colsample_bytree=CFG["colsample_bytree"],
        reg_alpha=CFG["reg_alpha"],
        reg_lambda=CFG["reg_lambda"],
        eval_metric=CFG["eval_metric"],
        # Patience can be overridden through CFG; 30 is the previous fixed value.
        early_stopping_rounds=CFG.get("early_stopping_rounds", 30),
        random_state=CFG["seed"],
        verbosity=1,
    )

    # The validation split is listed last in eval_set; XGBoost uses the last
    # entry for early stopping. The training split is included so its loss
    # curve is recorded as "validation_0".
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=10,
    )
    # NOTE: per the XGBoost docs, best_iteration is a zero-based round index.
    print(f"[TRAIN] Best iteration: {model.best_iteration} / {CFG['n_estimators']}")

    # ── Evaluation ────────────────────────────────────────────────
    evals = model.evals_result()
    train_losses = evals["validation_0"][CFG["eval_metric"]]
    val_losses = evals["validation_1"][CFG["eval_metric"]]

    test_acc, test_f1, test_auc = _evaluate_on_test(model, X_test, y_test, num_classes)

    print(f"\n[TEST] Accuracy: {test_acc:.2%}")
    print(f"[TEST] F1:       {test_f1:.4f}")
    print(f"[TEST] ROC-AUC:  {test_auc:.4f}")

    # ClearML logging (no-op when task is None)
    _log_to_clearml(train_losses, val_losses, test_acc, test_f1, test_auc)

    # ── Save checkpoint ───────────────────────────────────────────
    ckpt_dir = CFG["checkpoints_dir"]
    os.makedirs(ckpt_dir, exist_ok=True)
    model_path = os.path.join(ckpt_dir, f"xgboost_{CFG['model_name']}_best.json")
    model.save_model(model_path)
    print(f"\n[CKPT] Model saved to: {model_path}")

    # ── Write JSON log ────────────────────────────────────────────
    log_path = _write_training_log(
        model, train_losses, val_losses, test_acc, test_f1, test_auc
    )

    print(f"[LOG] Training history saved to: {log_path}")


# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()