# File size: 5,601 Bytes
# 8bbb872
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import json
import os
import random

import numpy as np
# from clearml import Task
from sklearn.metrics import f1_score, roc_auc_score
from xgboost import XGBClassifier

from data_preparation.prepare_dataset import get_numpy_splits

# Repository root: two directory levels above the folder containing this file.
_PROJECT_ROOT = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
)

# Run-level settings: task name, reproducibility, data handling, output folders.
_RUN_SETTINGS = {
    "model_name": "face_orientation",
    "seed": 42,
    # Passed straight to get_numpy_splits (presumably train/val/test fractions).
    "split_ratios": (0.7, 0.15, 0.15),
    "scale": False,  # XGBoost is tree-based — scaling is unnecessary
    "checkpoints_dir": os.path.join(_PROJECT_ROOT, "checkpoints"),
    "logs_dir": os.path.join(_PROJECT_ROOT, "evaluation", "logs"),
}

# XGBoost hyperparameters chosen by F1 score in 40 trials of Optuna HPO,
# plus the metric that is tracked on the evaluation sets during training.
_XGB_SETTINGS = {
    "n_estimators": 600,
    "max_depth": 8,
    "learning_rate": 0.1489,
    "subsample": 0.9625,
    "colsample_bytree": 0.9013,
    "reg_alpha": 1.1407,
    "reg_lambda": 2.4181,
    "eval_metric": "logloss",
}

# Single flat configuration mapping consumed by the rest of the script.
CFG = {**_RUN_SETTINGS, **_XGB_SETTINGS}


# ClearML experiment tracking is currently disabled. To re-enable it, uncomment
# the `from clearml import Task` import at the top of this file and the block
# below, and set the ClearML credentials.
# task = Task.init(
#     project_name="FocusGuards Large Group Project",
#     task_name="XGBoost Model Training",
#     tags=["training", "xgboost"]
# )
# task.connect(CFG)
# With `task` left as None, the ClearML reporting branch in main() is skipped.
task = None

def set_seed(seed: int):
    """Seed the global Python ``random`` and NumPy random generators.

    Args:
        seed: Integer seed applied to both generators so that repeated runs
            draw the same pseudo-random sequences.
    """
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)


def main():
    """Train, evaluate, checkpoint and log the XGBoost classifier described by CFG.

    Steps:
      1. Seed the RNGs and load the train/val/test arrays via get_numpy_splits.
      2. Fit an XGBClassifier with early stopping, tracking the evaluation
         metric on both the training and validation sets.
      3. Compute accuracy, weighted F1 and ROC-AUC on the test set and print them.
      4. Report curves and test metrics to ClearML when ``task`` is not None.
      5. Save the model as JSON under CFG["checkpoints_dir"] and write a JSON
         training log under CFG["logs_dir"].

    Returns:
        None. Results are printed and written to disk.
    """
    set_seed(CFG["seed"])

    print("[TRAIN] Model: XGBoost")
    print(f"[TRAIN] Task: {CFG['model_name']}")

    # ── Data ──────────────────────────────────────────────────────
    # The feature count and scaler returned by the loader are not needed here.
    splits, _num_features, num_classes, _scaler = get_numpy_splits(
        model_name=CFG["model_name"],
        split_ratios=CFG["split_ratios"],
        seed=CFG["seed"],
        scale=CFG["scale"],
    )

    X_train, y_train = splits["X_train"], splits["y_train"]
    X_val,   y_val   = splits["X_val"],   splits["y_val"]
    X_test,  y_test  = splits["X_test"],  splits["y_test"]

    # ── Model ─────────────────────────────────────────────────────
    # "logloss" and "error" are binary-only metrics in XGBoost; with more than
    # two classes training fails unless the multi-class variants are used.
    eval_metric = CFG["eval_metric"]
    if num_classes > 2:
        multiclass_metric = {"logloss": "mlogloss", "error": "merror"}
        eval_metric = multiclass_metric.get(eval_metric, eval_metric)

    model = XGBClassifier(
        n_estimators=CFG["n_estimators"],
        max_depth=CFG["max_depth"],
        learning_rate=CFG["learning_rate"],
        subsample=CFG["subsample"],
        colsample_bytree=CFG["colsample_bytree"],
        reg_alpha=CFG["reg_alpha"],
        reg_lambda=CFG["reg_lambda"],
        eval_metric=eval_metric,
        # Optional CFG override; defaults to the previously hard-coded patience.
        early_stopping_rounds=CFG.get("early_stopping_rounds", 30),
        random_state=CFG["seed"],
        verbosity=1,
    )

    # The training set is included only to record a training-loss curve; the
    # validation set is listed last so that early stopping monitors it.
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=10,
    )
    # NOTE: best_iteration is a 0-based boosting-round index.
    print(f"[TRAIN] Best iteration: {model.best_iteration} / {CFG['n_estimators']}")

    # ── Evaluation ────────────────────────────────────────────────
    # validation_0 / validation_1 follow the order of eval_set above.
    evals = model.evals_result()
    train_losses = evals["validation_0"][eval_metric]
    val_losses   = evals["validation_1"][eval_metric]

    # Test metrics
    test_preds = model.predict(X_test)
    test_probs = model.predict_proba(X_test)
    test_acc   = float(np.mean(test_preds == y_test))
    test_f1    = float(f1_score(y_test, test_preds, average='weighted'))

    if num_classes > 2:
        # Multi-class: one-vs-rest AUC over the full probability matrix.
        test_auc = float(roc_auc_score(y_test, test_probs, multi_class='ovr', average='weighted'))
    else:
        # Binary: AUC is computed from the probability of class index 1.
        test_auc = float(roc_auc_score(y_test, test_probs[:, 1]))

    print(f"\n[TEST] Accuracy: {test_acc:.2%}")
    print(f"[TEST] F1:       {test_f1:.4f}")
    print(f"[TEST] ROC-AUC:  {test_auc:.4f}")

    # ClearML logging (no-op when task is None)
    if task is not None:
        for i, (tl, vl) in enumerate(zip(train_losses, val_losses), start=1):
            task.logger.report_scalar("Loss", "Train", tl, iteration=i)
            task.logger.report_scalar("Loss", "Val",   vl, iteration=i)
        task.logger.report_single_value("test_accuracy", test_acc)
        task.logger.report_single_value("test_f1",       test_f1)
        task.logger.report_single_value("test_auc",      test_auc)
        task.logger.flush()

    # ── Save checkpoint ───────────────────────────────────────────
    ckpt_dir = CFG["checkpoints_dir"]
    os.makedirs(ckpt_dir, exist_ok=True)
    model_path = os.path.join(ckpt_dir, f"xgboost_{CFG['model_name']}_best.json")
    model.save_model(model_path)
    print(f"\n[CKPT] Model saved to: {model_path}")

    # ── Write JSON log (same schema as MLP) ───────────────────────
    history = {
        "model_name": f"xgboost_{CFG['model_name']}",
        "param_count": int(model.get_booster().trees_to_dataframe().shape[0]),  # total tree nodes
        "n_estimators": CFG["n_estimators"],
        "max_depth": CFG["max_depth"],
        # One "epoch" per recorded boosting round.
        "epochs": list(range(1, len(train_losses) + 1)),
        "train_loss": [round(v, 4) for v in train_losses],
        "val_loss":   [round(v, 4) for v in val_losses],
        "test_acc":   round(test_acc, 4),
        "test_f1":    round(test_f1, 4),
        "test_auc":   round(test_auc, 4),
    }

    logs_dir = CFG["logs_dir"]
    os.makedirs(logs_dir, exist_ok=True)
    log_path = os.path.join(logs_dir, f"xgboost_{CFG['model_name']}_training_log.json")

    with open(log_path, "w", encoding="utf-8") as f:
        json.dump(history, f, indent=2)

    print(f"[LOG] Training history saved to: {log_path}")


# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()