# Upload partially updated files (commit 8bbb872, Yingtao-Zheng)
import json
import os
import random
import numpy as np
# from clearml import Task
from sklearn.metrics import f1_score, roc_auc_score
from xgboost import XGBClassifier
from data_preparation.prepare_dataset import get_numpy_splits
# Repository root, resolved two directories above this file — TODO confirm the
# script actually lives two levels below the project root.
_PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))

# Central configuration for the whole training run (data split, model
# hyperparameters, and output locations).
CFG = {
    "model_name": "face_orientation",  # task identifier passed to get_numpy_splits
    "seed": 42,                        # seeds Python/NumPy RNGs and the model
    "split_ratios": (0.7, 0.15, 0.15),  # train / val / test fractions
    "scale": False,  # XGBoost is tree-based — scaling is unnecessary
    "checkpoints_dir": os.path.join(_PROJECT_ROOT, "checkpoints"),
    "logs_dir": os.path.join(_PROJECT_ROOT, "evaluation", "logs"),
    # XGBoost hyperparameters chosen by F1 score in 40 trials of Optuna HPO
    "n_estimators": 600,
    "max_depth": 8,
    "learning_rate": 0.1489,
    "subsample": 0.9625,
    "colsample_bytree": 0.9013,
    "reg_alpha": 1.1407,
    "reg_lambda": 2.4181,
    "eval_metric": "logloss",  # metric tracked on the eval_set during fit
}

# ClearML disabled (uncomment + set credentials to re-enable)
# task = Task.init(
# project_name="FocusGuards Large Group Project",
# task_name="XGBoost Model Training",
# tags=["training", "xgboost"]
# )
# task.connect(CFG)
# Sentinel: downstream logging code checks `task is not None` before reporting.
task = None
def set_seed(seed: int):
    """Seed Python's and NumPy's global RNGs so runs are reproducible."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
def _build_and_train(X_train, y_train, X_val, y_val):
    """Construct the XGBoost classifier from CFG and fit it with early stopping.

    The eval_set tracks the configured metric on both the training split
    (reported as ``validation_0``) and the validation split (``validation_1``)
    so learning curves can be extracted afterwards via ``evals_result()``.
    """
    model = XGBClassifier(
        n_estimators=CFG["n_estimators"],
        max_depth=CFG["max_depth"],
        learning_rate=CFG["learning_rate"],
        subsample=CFG["subsample"],
        colsample_bytree=CFG["colsample_bytree"],
        reg_alpha=CFG["reg_alpha"],
        reg_lambda=CFG["reg_lambda"],
        eval_metric=CFG["eval_metric"],
        early_stopping_rounds=30,  # stop when validation logloss stalls
        random_state=CFG["seed"],
        verbosity=1,
    )
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=10,
    )
    return model


def _test_metrics(model, X_test, y_test, num_classes):
    """Return (accuracy, weighted F1, ROC-AUC) on the held-out test split."""
    test_preds = model.predict(X_test)
    test_probs = model.predict_proba(X_test)
    test_acc = float(np.mean(test_preds == y_test))
    test_f1 = float(f1_score(y_test, test_preds, average='weighted'))
    if num_classes > 2:
        test_auc = float(roc_auc_score(y_test, test_probs, multi_class='ovr', average='weighted'))
    else:
        # Binary case: roc_auc_score expects the positive-class probability only.
        test_auc = float(roc_auc_score(y_test, test_probs[:, 1]))
    return test_acc, test_f1, test_auc


def _report_to_clearml(train_losses, val_losses, test_acc, test_f1, test_auc):
    """Push per-iteration losses and final test metrics to ClearML.

    No-op while ClearML is disabled (module-level ``task`` is None).
    """
    if task is None:
        return
    for i, (tl, vl) in enumerate(zip(train_losses, val_losses)):
        task.logger.report_scalar("Loss", "Train", tl, iteration=i + 1)
        task.logger.report_scalar("Loss", "Val", vl, iteration=i + 1)
    task.logger.report_single_value("test_accuracy", test_acc)
    task.logger.report_single_value("test_f1", test_f1)
    task.logger.report_single_value("test_auc", test_auc)
    task.logger.flush()


def main():
    """Train, evaluate, checkpoint, and log the XGBoost face-orientation model."""
    set_seed(CFG["seed"])
    print("[TRAIN] Model: XGBoost")
    print(f"[TRAIN] Task: {CFG['model_name']}")

    # ── Data ──────────────────────────────────────────────────────
    splits, num_features, num_classes, scaler = get_numpy_splits(
        model_name=CFG["model_name"],
        split_ratios=CFG["split_ratios"],
        seed=CFG["seed"],
        scale=CFG["scale"],
    )
    X_train, y_train = splits["X_train"], splits["y_train"]
    X_val, y_val = splits["X_val"], splits["y_val"]
    X_test, y_test = splits["X_test"], splits["y_test"]

    # ── Model ─────────────────────────────────────────────────────
    model = _build_and_train(X_train, y_train, X_val, y_val)
    print(f"[TRAIN] Best iteration: {model.best_iteration} / {CFG['n_estimators']}")

    # ── Evaluation ────────────────────────────────────────────────
    evals = model.evals_result()
    train_losses = evals["validation_0"][CFG["eval_metric"]]
    val_losses = evals["validation_1"][CFG["eval_metric"]]
    test_acc, test_f1, test_auc = _test_metrics(model, X_test, y_test, num_classes)
    print(f"\n[TEST] Accuracy: {test_acc:.2%}")
    print(f"[TEST] F1: {test_f1:.4f}")
    print(f"[TEST] ROC-AUC: {test_auc:.4f}")

    # ClearML logging (no-op when task is None)
    _report_to_clearml(train_losses, val_losses, test_acc, test_f1, test_auc)

    # ── Save checkpoint ───────────────────────────────────────────
    ckpt_dir = CFG["checkpoints_dir"]
    os.makedirs(ckpt_dir, exist_ok=True)
    model_path = os.path.join(ckpt_dir, f"xgboost_{CFG['model_name']}_best.json")
    model.save_model(model_path)
    print(f"\n[CKPT] Model saved to: {model_path}")

    # ── Write JSON log (same schema as MLP) ───────────────────────
    history = {
        "model_name": f"xgboost_{CFG['model_name']}",
        # "param_count" mirrors the MLP log schema; for a tree ensemble the
        # total number of tree nodes is the closest analogue.
        "param_count": int(model.get_booster().trees_to_dataframe().shape[0]),  # total tree nodes
        "n_estimators": CFG["n_estimators"],
        "max_depth": CFG["max_depth"],
        "epochs": list(range(1, len(train_losses) + 1)),
        "train_loss": [round(v, 4) for v in train_losses],
        "val_loss": [round(v, 4) for v in val_losses],
        "test_acc": round(test_acc, 4),
        "test_f1": round(test_f1, 4),
        "test_auc": round(test_auc, 4),
    }
    logs_dir = CFG["logs_dir"]
    os.makedirs(logs_dir, exist_ok=True)
    log_path = os.path.join(logs_dir, f"xgboost_{CFG['model_name']}_training_log.json")
    with open(log_path, "w") as f:
        json.dump(history, f, indent=2)
    print(f"[LOG] Training history saved to: {log_path}")
# Run training only when executed as a script, so the module can be imported
# (e.g. by HPO or evaluation code) without side effects.
if __name__ == "__main__":
    main()