# Upload-viewer metadata accidentally pasted into the file (not code);
# commented out so the module parses: Yingtao-Zheng — "Upload partially
# updated files" — commit 8bbb872
"""
XGBoost Hyperparameter Sweep β€” ClearML HPO Controller
======================================================
Run this script once to launch the full sweep. It clones the base
XGBoost training Task (identified by BASE_TASK_ID) and dispatches one
clone per trial, each with a different hyperparameter configuration.
Prerequisites
-------------
1. Run train.py at least once so a completed Task exists in ClearML.
Copy its Task-ID into BASE_TASK_ID below.
2. Have at least one `clearml-agent` listening on the "default" queue,
OR set EXECUTE_LOCALLY = True to run trials in-process (slower but
needs no agent).
Usage
-----
python models/xgboost/sweep.py
"""
import time  # NOTE(review): unused in this file — presumably kept for a future polling loop; confirm before removing
from clearml import Task
from clearml.automation import (
    HyperParameterOptimizer,
    UniformParameterRange,
    DiscreteParameterRange,
)
# Pick the search strategy: prefer the Optuna-backed optimizer when the
# optional optuna dependency is installed, otherwise fall back to BOHB.
try:
    from clearml.automation.optuna import OptimizerOptuna  # preferred
    OPTIMIZER_CLASS = OptimizerOptuna
except ImportError:
    from clearml.automation import OptimizerBOHB  # fallback
    OPTIMIZER_CLASS = OptimizerBOHB
# ─ Configuration ──────────────────────────────────────────────────────────────
# Paste the Task-ID of a successfully completed XGBoost training run here.
# Find it in the ClearML UI: Projects → Focus Guard → XGBoost Model Training
# → right-click the task → Copy ID.
BASE_TASK_ID = "0f42afbb3396400babc7a1a0728e7326"
# Set True to run trials in the same process (no agent needed, but serial).
# Set False to dispatch to clearml-agents on the "default" queue (parallel).
EXECUTE_LOCALLY = True
# Total budget and concurrency
MAX_TRIALS = 40      # total number of hyperparameter configurations to try
MAX_CONCURRENT = 4   # how many trials to run in parallel (match agent count)
# ─ Search space ───────────────────────────────────────────────────────────────
# These names must match the keys in CFG inside train.py exactly, because
# task.connect(CFG) registers them under those names ("General/" is the
# section ClearML assigns to the connected parameter dict).
SEARCH_SPACE = [
    DiscreteParameterRange("General/n_estimators", values=[100, 200, 400, 600]),
    DiscreteParameterRange("General/max_depth", values=[3, 4, 5, 6, 8]),
    UniformParameterRange( "General/learning_rate", min_value=0.01, max_value=0.30),
    UniformParameterRange( "General/subsample", min_value=0.50, max_value=1.00),
    UniformParameterRange( "General/colsample_bytree",min_value=0.50, max_value=1.00),
    UniformParameterRange( "General/reg_alpha", min_value=0.00, max_value=2.00),
    UniformParameterRange( "General/reg_lambda", min_value=0.50, max_value=5.00),
]
# Objective: the per-epoch validation logloss scalar curve ("Loss"/"Val"),
# since the Optuna integration struggles to use single-value metrics natively.
OBJECTIVE_METRIC_TITLE = "Loss"
OBJECTIVE_METRIC_SERIES = "Val"
OBJECTIVE_SIGN = "min"  # minimize logloss
# ── Main ──────────────────────────────────────────────────────────────────────
def main():
    """Launch the full hyperparameter sweep and report the best trials.

    Registers this controller as its own ClearML Task, builds a
    HyperParameterOptimizer over SEARCH_SPACE, runs the trial budget
    (in-process when EXECUTE_LOCALLY, otherwise via agents on the
    "default" queue), then prints the top results and closes the Task.
    """
    # Register this controller as its own ClearML Task so it is tracked too.
    controller_task = Task.init(
        project_name="FocusGuards Large Group Project",
        task_name="XGBoost HPO Sweep Controller",
        task_type=Task.TaskTypes.optimizer,
        tags=["sweep", "xgboost", "hpo"],
    )
    optimizer = HyperParameterOptimizer(
        base_task_id=BASE_TASK_ID,
        hyper_parameters=SEARCH_SPACE,
        objective_metric_title=OBJECTIVE_METRIC_TITLE,
        objective_metric_series=OBJECTIVE_METRIC_SERIES,
        objective_metric_sign=OBJECTIVE_SIGN,
        optimizer_class=OPTIMIZER_CLASS,
        # Execution
        execution_queue="default",
        max_number_of_concurrent_tasks=MAX_CONCURRENT,
        total_max_jobs=MAX_TRIALS,
        # Early stop a trial if validation loss hasn't improved in 10 rounds
        # (relies on the per-round "Loss/Val" scalars logged in train.py).
        min_iteration_per_job=10,
        max_iteration_per_job=600,
    )
    if EXECUTE_LOCALLY:
        optimizer.start_locally(job_complete_callback=_on_trial_done)
    else:
        optimizer.start(job_complete_callback=_on_trial_done)
    print("[SWEEP] Optimizer started. Waiting for trials to complete …")
    print(f"[SWEEP] Budget: {MAX_TRIALS} trials, {MAX_CONCURRENT} concurrent")
    # Block until the trial budget is exhausted, then shut the workers down.
    optimizer.wait()
    optimizer.stop()
    _print_top_trials(optimizer)
    controller_task.close()


def _scalar(metrics, title, series, default=0.0):
    """Return metrics[title][series]["last"] as a float.

    Falls back to *default* when the title/series is missing or the stored
    value is not numeric, so a partially-logged trial cannot crash reporting.
    """
    try:
        return float(metrics.get(title, {}).get(series, {}).get("last", default))
    except (TypeError, ValueError):
        return default


def _print_top_trials(optimizer, top_k=5):
    """Print the *top_k* best trials: hyperparameter config + key val metrics."""
    header = f"\n[SWEEP] ─ Top-{top_k} trials by Validation Loss "
    print(header + "─" * 38)
    for rank, task in enumerate(optimizer.get_top_experiments(top_k=top_k), 1):
        params = task.get_parameters()
        # Strip the "General/" section prefix so the config reads like CFG in train.py.
        cfg = {k.split("/")[-1]: v for k, v in params.items()
               if k.startswith("General/")}
        metrics = task.get_last_scalar_metrics()
        val_loss = _scalar(metrics, OBJECTIVE_METRIC_TITLE, OBJECTIVE_METRIC_SERIES,
                           default=float('inf'))
        val_acc = _scalar(metrics, "Summary", "val_accuracy")
        val_f1 = _scalar(metrics, "Summary", "val_f1")
        print(f" #{rank} Val_Loss={val_loss:.4f} Val_Acc={val_acc:.2%} "
              f"Val_F1={val_f1:.4f} task_id={task.id}")
        print(f"   {cfg}")
def _on_trial_done(job_id, objective=None, objective_iteration=None,
                   job_parameters=None, top_performance_job_id=None):
    """Callback fired by the optimizer each time a trial finishes.

    ClearML invokes ``job_complete_callback`` with five positional arguments
    (job id, objective value, objective iteration, job parameters, and the
    current best job id); the previous two-parameter signature raised
    TypeError as soon as the callback fired. The extra parameters are
    defaulted, so any caller passing only (job_id, objective) still works.
    """
    # A failed/aborted trial may report no objective — don't crash on format.
    shown = "n/a" if objective is None else f"{objective:.4f}"
    print(f"[SWEEP] Trial {job_id} finished → {OBJECTIVE_METRIC_TITLE}={shown}")
# Script entry point: launch the sweep only when executed directly, not on import.
if __name__ == "__main__":
    main()