""" XGBoost Hyperparameter Sweep — ClearML HPO Controller ====================================================== Run this script once to launch the full sweep. It clones the base XGBoost training Task (identified by BASE_TASK_ID) and dispatches one clone per trial, each with a different hyperparameter configuration. Prerequisites ------------- 1. Run train.py at least once so a completed Task exists in ClearML. Copy its Task-ID into BASE_TASK_ID below. 2. Have at least one `clearml-agent` listening on the "default" queue, OR set EXECUTE_LOCALLY = True to run trials in-process (slower but needs no agent). Usage ----- python models/xgboost/sweep.py """ import time from clearml import Task from clearml.automation import ( HyperParameterOptimizer, UniformParameterRange, DiscreteParameterRange, ) try: from clearml.automation.optuna import OptimizerOptuna # preferred OPTIMIZER_CLASS = OptimizerOptuna except ImportError: from clearml.automation import OptimizerBOHB # fallback OPTIMIZER_CLASS = OptimizerBOHB # ── Configuration ───────────────────────────────────────────────────────────── # Paste the Task-ID of a successfully completed XGBoost training run here. # Find it in the ClearML UI: Projects → Focus Guard → XGBoost Model Training # → right-click the task → Copy ID. BASE_TASK_ID = "0f42afbb3396400babc7a1a0728e7326" # Set True to run trials in the same process (no agent needed, but serial). # Set False to dispatch to clearml-agents on the "default" queue (parallel). EXECUTE_LOCALLY = True # Total budget and concurrency MAX_TRIALS = 40 # total number of hyperparameter configurations to try MAX_CONCURRENT = 4 # how many trials to run in parallel (match agent count) # ── Search space ────────────────────────────────────────────────────────────── # These names must match the keys in CFG inside train.py exactly, because # task.connect(CFG) registers them under those names. 
SEARCH_SPACE = [
    DiscreteParameterRange("General/n_estimators", values=[100, 200, 400, 600]),
    DiscreteParameterRange("General/max_depth", values=[3, 4, 5, 6, 8]),
    UniformParameterRange("General/learning_rate", min_value=0.01, max_value=0.30),
    UniformParameterRange("General/subsample", min_value=0.50, max_value=1.00),
    UniformParameterRange("General/colsample_bytree", min_value=0.50, max_value=1.00),
    UniformParameterRange("General/reg_alpha", min_value=0.00, max_value=2.00),
    UniformParameterRange("General/reg_lambda", min_value=0.50, max_value=5.00),
]

# Switch back to the per-epoch validation logloss scalar curve since Optuna
# integration struggles to use single-value metrics natively.
OBJECTIVE_METRIC_TITLE = "Loss"
OBJECTIVE_METRIC_SERIES = "Val"
OBJECTIVE_SIGN = "min"  # minimize logloss


# ── Main ──────────────────────────────────────────────────────────────────────
def main():
    """Launch the HPO sweep, wait for completion, and report the top trials."""
    # Register this controller as its own ClearML Task so it is tracked too.
    controller_task = Task.init(
        project_name="FocusGuards Large Group Project",
        task_name="XGBoost HPO Sweep Controller",
        task_type=Task.TaskTypes.optimizer,
        tags=["sweep", "xgboost", "hpo"],
    )

    optimizer = HyperParameterOptimizer(
        base_task_id=BASE_TASK_ID,
        hyper_parameters=SEARCH_SPACE,
        objective_metric_title=OBJECTIVE_METRIC_TITLE,
        objective_metric_series=OBJECTIVE_METRIC_SERIES,
        objective_metric_sign=OBJECTIVE_SIGN,
        optimizer_class=OPTIMIZER_CLASS,
        # Execution
        execution_queue="default",
        max_number_of_concurrent_tasks=MAX_CONCURRENT,
        total_max_jobs=MAX_TRIALS,
        # Early stop a trial if validation loss hasn't improved in 10 rounds
        # (relies on the per-round "Loss/Val" scalars logged in train.py)
        min_iteration_per_job=10,
        max_iteration_per_job=600,
    )

    if EXECUTE_LOCALLY:
        optimizer.start_locally(job_complete_callback=_on_trial_done)
    else:
        optimizer.start(job_complete_callback=_on_trial_done)

    # NOTE: string reassembled — the original source had this literal broken
    # across two physical lines, which is a SyntaxError in Python.
    print("[SWEEP] Optimizer started. Waiting for trials to complete …")
    print(f"[SWEEP] Budget: {MAX_TRIALS} trials, {MAX_CONCURRENT} concurrent")

    # Block until the trial budget is exhausted, then shut the optimizer down.
    optimizer.wait()
    optimizer.stop()

    # ── Print best result ─────────────────────────────────────────────────
    top_k = optimizer.get_top_experiments(top_k=5)
    print("\n[SWEEP] ── Top-5 trials by Validation Loss ──────────────────────────────")
    for rank, task in enumerate(top_k, 1):
        params = task.get_parameters()
        # Strip the "General/" section prefix so the printed config matches
        # the raw CFG keys used in train.py.
        cfg = {k.split("/")[-1]: v for k, v in params.items() if k.startswith("General/")}
        metrics = task.get_last_scalar_metrics()
        val_loss = (
            metrics.get(OBJECTIVE_METRIC_TITLE, {})
            .get(OBJECTIVE_METRIC_SERIES, {})
            .get("last", float("inf"))
        )
        val_acc = metrics.get("Summary", {}).get("val_accuracy", {}).get("last", 0.0)
        val_f1 = metrics.get("Summary", {}).get("val_f1", {}).get("last", 0.0)
        print(
            f"  #{rank}  Val_Loss={val_loss:.4f}  Val_Acc={val_acc:.2%}  "
            f"Val_F1={val_f1:.4f}  task_id={task.id}"
        )
        print(f"       {cfg}")

    controller_task.close()


def _on_trial_done(
    job_id: str,
    objective: float,
    objective_iteration=None,
    job_parameters=None,
    top_performance_job_id=None,
):
    """Callback fired each time a trial finishes.

    BUG FIX: ClearML invokes ``job_complete_callback`` with five positional
    arguments (job_id, objective_value, objective_iteration, job_parameters,
    top_performance_job_id). The original two-parameter signature raised
    ``TypeError`` on every completed trial. The extra parameters default to
    ``None`` so direct two-argument calls remain backward compatible.

    Also guards against ``objective is None`` (a failed trial reports no
    metric), which would crash the ``:.4f`` format spec.
    """
    if objective is None:
        print(f"[SWEEP] Trial {job_id} finished → no objective reported")
    else:
        print(f"[SWEEP] Trial {job_id} finished → {OBJECTIVE_METRIC_TITLE}={objective:.4f}")


if __name__ == "__main__":
    main()