# Upload partially updated files (commit 8bbb872, Yingtao-Zheng)
import json
import os
import random
import numpy as np
# from clearml import Task
from sklearn.metrics import f1_score, roc_auc_score
from xgboost import XGBClassifier
from data_preparation.prepare_dataset import get_numpy_splits
# Repository root, resolved two directories above this file — TODO confirm the
# script actually lives two levels below the project root.
_PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))

# Central configuration for the whole training run (data split, model
# hyperparameters, and output locations).
CFG = {
    "model_name": "face_orientation",  # task identifier passed to get_numpy_splits
    "seed": 42,                        # seeds Python/NumPy RNGs and the model
    "split_ratios": (0.7, 0.15, 0.15),  # train / val / test fractions
    "scale": False,  # XGBoost is tree-based — scaling is unnecessary
    "checkpoints_dir": os.path.join(_PROJECT_ROOT, "checkpoints"),
    "logs_dir": os.path.join(_PROJECT_ROOT, "evaluation", "logs"),
    # XGBoost hyperparameters chosen by F1 score in 40 trials of Optuna HPO
    "n_estimators": 600,
    "max_depth": 8,
    "learning_rate": 0.1489,
    "subsample": 0.9625,
    "colsample_bytree": 0.9013,
    "reg_alpha": 1.1407,
    "reg_lambda": 2.4181,
    "eval_metric": "logloss",  # metric tracked on the eval_set during fit
}

# ClearML disabled (uncomment + set credentials to re-enable)
# task = Task.init(
# project_name="FocusGuards Large Group Project",
# task_name="XGBoost Model Training",
# tags=["training", "xgboost"]
# )
# task.connect(CFG)
# Sentinel: downstream logging code checks `task is not None` before reporting.
task = None
def set_seed(seed: int):
    """Seed Python's and NumPy's global RNGs so runs are reproducible."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
def _build_and_train(X_train, y_train, X_val, y_val):
    """Construct the XGBoost classifier from CFG and fit it with early stopping.

    The eval_set tracks the configured metric on both the training split
    (reported as ``validation_0``) and the validation split (``validation_1``)
    so learning curves can be extracted afterwards via ``evals_result()``.
    """
    model = XGBClassifier(
        n_estimators=CFG["n_estimators"],
        max_depth=CFG["max_depth"],
        learning_rate=CFG["learning_rate"],
        subsample=CFG["subsample"],
        colsample_bytree=CFG["colsample_bytree"],
        reg_alpha=CFG["reg_alpha"],
        reg_lambda=CFG["reg_lambda"],
        eval_metric=CFG["eval_metric"],
        early_stopping_rounds=30,  # stop when validation logloss stalls
        random_state=CFG["seed"],
        verbosity=1,
    )
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=10,
    )
    return model


def _test_metrics(model, X_test, y_test, num_classes):
    """Return (accuracy, weighted F1, ROC-AUC) on the held-out test split."""
    test_preds = model.predict(X_test)
    test_probs = model.predict_proba(X_test)
    test_acc = float(np.mean(test_preds == y_test))
    test_f1 = float(f1_score(y_test, test_preds, average='weighted'))
    if num_classes > 2:
        test_auc = float(roc_auc_score(y_test, test_probs, multi_class='ovr', average='weighted'))
    else:
        # Binary case: roc_auc_score expects the positive-class probability only.
        test_auc = float(roc_auc_score(y_test, test_probs[:, 1]))
    return test_acc, test_f1, test_auc


def _report_to_clearml(train_losses, val_losses, test_acc, test_f1, test_auc):
    """Push per-iteration losses and final test metrics to ClearML.

    No-op while ClearML is disabled (module-level ``task`` is None).
    """
    if task is None:
        return
    for i, (tl, vl) in enumerate(zip(train_losses, val_losses)):
        task.logger.report_scalar("Loss", "Train", tl, iteration=i + 1)
        task.logger.report_scalar("Loss", "Val", vl, iteration=i + 1)
    task.logger.report_single_value("test_accuracy", test_acc)
    task.logger.report_single_value("test_f1", test_f1)
    task.logger.report_single_value("test_auc", test_auc)
    task.logger.flush()


def main():
    """Train, evaluate, checkpoint, and log the XGBoost face-orientation model."""
    set_seed(CFG["seed"])
    print("[TRAIN] Model: XGBoost")
    print(f"[TRAIN] Task: {CFG['model_name']}")

    # ── Data ──────────────────────────────────────────────────────
    splits, num_features, num_classes, scaler = get_numpy_splits(
        model_name=CFG["model_name"],
        split_ratios=CFG["split_ratios"],
        seed=CFG["seed"],
        scale=CFG["scale"],
    )
    X_train, y_train = splits["X_train"], splits["y_train"]
    X_val, y_val = splits["X_val"], splits["y_val"]
    X_test, y_test = splits["X_test"], splits["y_test"]

    # ── Model ─────────────────────────────────────────────────────
    model = _build_and_train(X_train, y_train, X_val, y_val)
    print(f"[TRAIN] Best iteration: {model.best_iteration} / {CFG['n_estimators']}")

    # ── Evaluation ────────────────────────────────────────────────
    evals = model.evals_result()
    train_losses = evals["validation_0"][CFG["eval_metric"]]
    val_losses = evals["validation_1"][CFG["eval_metric"]]
    test_acc, test_f1, test_auc = _test_metrics(model, X_test, y_test, num_classes)
    print(f"\n[TEST] Accuracy: {test_acc:.2%}")
    print(f"[TEST] F1: {test_f1:.4f}")
    print(f"[TEST] ROC-AUC: {test_auc:.4f}")

    # ClearML logging (no-op when task is None)
    _report_to_clearml(train_losses, val_losses, test_acc, test_f1, test_auc)

    # ── Save checkpoint ───────────────────────────────────────────
    ckpt_dir = CFG["checkpoints_dir"]
    os.makedirs(ckpt_dir, exist_ok=True)
    model_path = os.path.join(ckpt_dir, f"xgboost_{CFG['model_name']}_best.json")
    model.save_model(model_path)
    print(f"\n[CKPT] Model saved to: {model_path}")

    # ── Write JSON log (same schema as MLP) ───────────────────────
    history = {
        "model_name": f"xgboost_{CFG['model_name']}",
        # "param_count" mirrors the MLP log schema; for a tree ensemble the
        # total number of tree nodes is the closest analogue.
        "param_count": int(model.get_booster().trees_to_dataframe().shape[0]),  # total tree nodes
        "n_estimators": CFG["n_estimators"],
        "max_depth": CFG["max_depth"],
        "epochs": list(range(1, len(train_losses) + 1)),
        "train_loss": [round(v, 4) for v in train_losses],
        "val_loss": [round(v, 4) for v in val_losses],
        "test_acc": round(test_acc, 4),
        "test_f1": round(test_f1, 4),
        "test_auc": round(test_auc, 4),
    }
    logs_dir = CFG["logs_dir"]
    os.makedirs(logs_dir, exist_ok=True)
    log_path = os.path.join(logs_dir, f"xgboost_{CFG['model_name']}_training_log.json")
    with open(log_path, "w") as f:
        json.dump(history, f, indent=2)
    print(f"[LOG] Training history saved to: {log_path}")
# Run training only when executed as a script, so the module can be imported
# (e.g. by HPO or evaluation code) without side effects.
if __name__ == "__main__":
    main()