# Upload-viewer metadata accidentally pasted into the file (not code);
# commented out so the module parses: Yingtao-Zheng — "Upload partially
# updated files" — commit 8bbb872
"""
XGBoost Hyperparameter Sweep β€” ClearML HPO Controller
======================================================
Run this script once to launch the full sweep. It clones the base
XGBoost training Task (identified by BASE_TASK_ID) and dispatches one
clone per trial, each with a different hyperparameter configuration.
Prerequisites
-------------
1. Run train.py at least once so a completed Task exists in ClearML.
Copy its Task-ID into BASE_TASK_ID below.
2. Have at least one `clearml-agent` listening on the "default" queue,
OR set EXECUTE_LOCALLY = True to run trials in-process (slower but
needs no agent).
Usage
-----
python models/xgboost/sweep.py
"""
import time  # NOTE(review): unused in this file — presumably kept for a future polling loop; confirm before removing
from clearml import Task
from clearml.automation import (
    HyperParameterOptimizer,
    UniformParameterRange,
    DiscreteParameterRange,
)
# Pick the search strategy: prefer the Optuna-backed optimizer when the
# optional optuna dependency is installed, otherwise fall back to BOHB.
try:
    from clearml.automation.optuna import OptimizerOptuna  # preferred
    OPTIMIZER_CLASS = OptimizerOptuna
except ImportError:
    from clearml.automation import OptimizerBOHB  # fallback
    OPTIMIZER_CLASS = OptimizerBOHB
# ─ Configuration ──────────────────────────────────────────────────────────────
# Paste the Task-ID of a successfully completed XGBoost training run here.
# Find it in the ClearML UI: Projects → Focus Guard → XGBoost Model Training
# → right-click the task → Copy ID.
BASE_TASK_ID = "0f42afbb3396400babc7a1a0728e7326"
# Set True to run trials in the same process (no agent needed, but serial).
# Set False to dispatch to clearml-agents on the "default" queue (parallel).
EXECUTE_LOCALLY = True
# Total budget and concurrency
MAX_TRIALS = 40      # total number of hyperparameter configurations to try
MAX_CONCURRENT = 4   # how many trials to run in parallel (match agent count)
# ─ Search space ───────────────────────────────────────────────────────────────
# These names must match the keys in CFG inside train.py exactly, because
# task.connect(CFG) registers them under those names ("General/" is the
# section ClearML assigns to the connected parameter dict).
SEARCH_SPACE = [
    DiscreteParameterRange("General/n_estimators", values=[100, 200, 400, 600]),
    DiscreteParameterRange("General/max_depth", values=[3, 4, 5, 6, 8]),
    UniformParameterRange( "General/learning_rate", min_value=0.01, max_value=0.30),
    UniformParameterRange( "General/subsample", min_value=0.50, max_value=1.00),
    UniformParameterRange( "General/colsample_bytree",min_value=0.50, max_value=1.00),
    UniformParameterRange( "General/reg_alpha", min_value=0.00, max_value=2.00),
    UniformParameterRange( "General/reg_lambda", min_value=0.50, max_value=5.00),
]
# Objective: the per-epoch validation logloss scalar curve ("Loss"/"Val"),
# since the Optuna integration struggles to use single-value metrics natively.
OBJECTIVE_METRIC_TITLE = "Loss"
OBJECTIVE_METRIC_SERIES = "Val"
OBJECTIVE_SIGN = "min"  # minimize logloss
# ── Main ──────────────────────────────────────────────────────────────────────
def main():
    """Launch the full hyperparameter sweep and report the best trials.

    Registers this controller as its own ClearML Task, builds a
    HyperParameterOptimizer over SEARCH_SPACE, runs the trial budget
    (in-process when EXECUTE_LOCALLY, otherwise via agents on the
    "default" queue), then prints the top results and closes the Task.
    """
    # Register this controller as its own ClearML Task so it is tracked too.
    controller_task = Task.init(
        project_name="FocusGuards Large Group Project",
        task_name="XGBoost HPO Sweep Controller",
        task_type=Task.TaskTypes.optimizer,
        tags=["sweep", "xgboost", "hpo"],
    )
    optimizer = HyperParameterOptimizer(
        base_task_id=BASE_TASK_ID,
        hyper_parameters=SEARCH_SPACE,
        objective_metric_title=OBJECTIVE_METRIC_TITLE,
        objective_metric_series=OBJECTIVE_METRIC_SERIES,
        objective_metric_sign=OBJECTIVE_SIGN,
        optimizer_class=OPTIMIZER_CLASS,
        # Execution
        execution_queue="default",
        max_number_of_concurrent_tasks=MAX_CONCURRENT,
        total_max_jobs=MAX_TRIALS,
        # Early stop a trial if validation loss hasn't improved in 10 rounds
        # (relies on the per-round "Loss/Val" scalars logged in train.py).
        min_iteration_per_job=10,
        max_iteration_per_job=600,
    )
    if EXECUTE_LOCALLY:
        optimizer.start_locally(job_complete_callback=_on_trial_done)
    else:
        optimizer.start(job_complete_callback=_on_trial_done)
    print("[SWEEP] Optimizer started. Waiting for trials to complete …")
    print(f"[SWEEP] Budget: {MAX_TRIALS} trials, {MAX_CONCURRENT} concurrent")
    # Block until the trial budget is exhausted, then shut the workers down.
    optimizer.wait()
    optimizer.stop()
    _print_top_trials(optimizer)
    controller_task.close()


def _scalar(metrics, title, series, default=0.0):
    """Return metrics[title][series]["last"] as a float.

    Falls back to *default* when the title/series is missing or the stored
    value is not numeric, so a partially-logged trial cannot crash reporting.
    """
    try:
        return float(metrics.get(title, {}).get(series, {}).get("last", default))
    except (TypeError, ValueError):
        return default


def _print_top_trials(optimizer, top_k=5):
    """Print the *top_k* best trials: hyperparameter config + key val metrics."""
    header = f"\n[SWEEP] ─ Top-{top_k} trials by Validation Loss "
    print(header + "─" * 38)
    for rank, task in enumerate(optimizer.get_top_experiments(top_k=top_k), 1):
        params = task.get_parameters()
        # Strip the "General/" section prefix so the config reads like CFG in train.py.
        cfg = {k.split("/")[-1]: v for k, v in params.items()
               if k.startswith("General/")}
        metrics = task.get_last_scalar_metrics()
        val_loss = _scalar(metrics, OBJECTIVE_METRIC_TITLE, OBJECTIVE_METRIC_SERIES,
                           default=float('inf'))
        val_acc = _scalar(metrics, "Summary", "val_accuracy")
        val_f1 = _scalar(metrics, "Summary", "val_f1")
        print(f" #{rank} Val_Loss={val_loss:.4f} Val_Acc={val_acc:.2%} "
              f"Val_F1={val_f1:.4f} task_id={task.id}")
        print(f"   {cfg}")
def _on_trial_done(job_id, objective=None, objective_iteration=None,
                   job_parameters=None, top_performance_job_id=None):
    """Callback fired by the optimizer each time a trial finishes.

    ClearML invokes ``job_complete_callback`` with five positional arguments
    (job id, objective value, objective iteration, job parameters, and the
    current best job id); the previous two-parameter signature raised
    TypeError as soon as the callback fired. The extra parameters are
    defaulted, so any caller passing only (job_id, objective) still works.
    """
    # A failed/aborted trial may report no objective — don't crash on format.
    shown = "n/a" if objective is None else f"{objective:.4f}"
    print(f"[SWEEP] Trial {job_id} finished → {OBJECTIVE_METRIC_TITLE}={shown}")
# Script entry point: launch the sweep only when executed directly, not on import.
if __name__ == "__main__":
    main()