Spaces:
Sleeping
Sleeping
| """ | |
| XGBoost Hyperparameter Sweep β ClearML HPO Controller | |
| ====================================================== | |
| Run this script once to launch the full sweep. It clones the base | |
| XGBoost training Task (identified by BASE_TASK_ID) and dispatches one | |
| clone per trial, each with a different hyperparameter configuration. | |
| Prerequisites | |
| ------------- | |
| 1. Run train.py at least once so a completed Task exists in ClearML. | |
| Copy its Task-ID into BASE_TASK_ID below. | |
| 2. Have at least one `clearml-agent` listening on the "default" queue, | |
| OR set EXECUTE_LOCALLY = True to run trials in-process (slower but | |
| needs no agent). | |
| Usage | |
| ----- | |
| python models/xgboost/sweep.py | |
| """ | |
import time

from clearml import Task
from clearml.automation import (
    DiscreteParameterRange,
    HyperParameterOptimizer,
    UniformParameterRange,
)

# Prefer the Optuna-backed search strategy; fall back to BOHB when the
# optuna extra is not installed.
try:
    from clearml.automation.optuna import OptimizerOptuna  # preferred

    OPTIMIZER_CLASS = OptimizerOptuna
except ImportError:
    from clearml.automation import OptimizerBOHB  # fallback

    OPTIMIZER_CLASS = OptimizerBOHB
# ── Configuration ─────────────────────────────────────────────────────────────
# Paste the Task-ID of a successfully completed XGBoost training run here.
# Find it in the ClearML UI: Projects → Focus Guard → XGBoost Model Training
# → right-click the task → Copy ID.
BASE_TASK_ID = "0f42afbb3396400babc7a1a0728e7326"

# True: run trials serially in this process (no agent needed).
# False: dispatch to clearml-agents on the "default" queue (parallel).
EXECUTE_LOCALLY = True

# Trial budget and concurrency.
MAX_TRIALS = 40      # total number of hyperparameter configurations to try
MAX_CONCURRENT = 4   # trials to run in parallel (match the agent count)

# ── Search space ──────────────────────────────────────────────────────────────
# These names must match the keys in CFG inside train.py exactly, because
# task.connect(CFG) registers them under those names.
SEARCH_SPACE = [
    DiscreteParameterRange("General/n_estimators", values=[100, 200, 400, 600]),
    DiscreteParameterRange("General/max_depth", values=[3, 4, 5, 6, 8]),
    UniformParameterRange("General/learning_rate", min_value=0.01, max_value=0.30),
    UniformParameterRange("General/subsample", min_value=0.50, max_value=1.00),
    UniformParameterRange("General/colsample_bytree", min_value=0.50, max_value=1.00),
    UniformParameterRange("General/reg_alpha", min_value=0.00, max_value=2.00),
    UniformParameterRange("General/reg_lambda", min_value=0.50, max_value=5.00),
]

# Objective: the per-round validation logloss curve ("Loss"/"Val") logged by
# train.py. The Optuna integration struggles with single-value metrics, so we
# track the scalar curve instead.
OBJECTIVE_METRIC_TITLE = "Loss"
OBJECTIVE_METRIC_SERIES = "Val"
OBJECTIVE_SIGN = "min"  # minimize logloss
| # ββ Main ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def main():
    """Launch the hyperparameter sweep.

    Registers this controller as its own ClearML Task, builds the
    HyperParameterOptimizer over SEARCH_SPACE, runs trials until the
    MAX_TRIALS budget is exhausted, then prints a top-5 report.

    Fixes over the original: ``optimizer.wait()`` is wrapped in
    try/finally so ``optimizer.stop()`` always runs (e.g. on
    KeyboardInterrupt), and the report loop is extracted into
    ``_report_top_trials``.
    """
    # Register this controller as its own ClearML Task so it is tracked too.
    controller_task = Task.init(
        project_name="FocusGuards Large Group Project",
        task_name="XGBoost HPO Sweep Controller",
        task_type=Task.TaskTypes.optimizer,
        tags=["sweep", "xgboost", "hpo"],
    )

    optimizer = HyperParameterOptimizer(
        base_task_id=BASE_TASK_ID,
        hyper_parameters=SEARCH_SPACE,
        objective_metric_title=OBJECTIVE_METRIC_TITLE,
        objective_metric_series=OBJECTIVE_METRIC_SERIES,
        objective_metric_sign=OBJECTIVE_SIGN,
        optimizer_class=OPTIMIZER_CLASS,
        # Execution
        execution_queue="default",
        max_number_of_concurrent_tasks=MAX_CONCURRENT,
        total_max_jobs=MAX_TRIALS,
        # Early stop a trial if validation loss hasn't improved in 10 rounds
        # (relies on the per-round "Loss/Val" scalars logged in train.py).
        min_iteration_per_job=10,
        max_iteration_per_job=600,
    )

    if EXECUTE_LOCALLY:
        optimizer.start_locally(job_complete_callback=_on_trial_done)
    else:
        optimizer.start(job_complete_callback=_on_trial_done)
    print("[SWEEP] Optimizer started. Waiting for trials to complete …")
    print(f"[SWEEP] Budget: {MAX_TRIALS} trials, {MAX_CONCURRENT} concurrent")

    # Block until the budget is exhausted; always tear down the optimizer so
    # no trial Tasks are left running if the controller is interrupted.
    try:
        optimizer.wait()
    finally:
        optimizer.stop()

    _report_top_trials(optimizer)
    controller_task.close()


def _report_top_trials(optimizer, top_k=5):
    """Print the *top_k* best trials by the sweep objective.

    For each trial Task, shows the last recorded validation loss
    (OBJECTIVE_METRIC_TITLE/OBJECTIVE_METRIC_SERIES), the "Summary"
    accuracy/F1 scalars when present, and the General/* hyperparameters.
    """
    best = optimizer.get_top_experiments(top_k=top_k)
    print(f"\n[SWEEP] ── Top-{top_k} trials by Validation Loss ──────────────────")
    for rank, trial in enumerate(best, 1):
        cfg = {
            key.split("/")[-1]: value
            for key, value in trial.get_parameters().items()
            if key.startswith("General/")
        }
        scalars = trial.get_last_scalar_metrics()
        # Missing metrics degrade gracefully: inf loss / zero accuracy+F1.
        val_loss = (
            scalars.get(OBJECTIVE_METRIC_TITLE, {})
            .get(OBJECTIVE_METRIC_SERIES, {})
            .get("last", float("inf"))
        )
        summary = scalars.get("Summary", {})
        val_acc = summary.get("val_accuracy", {}).get("last", 0.0)
        val_f1 = summary.get("val_f1", {}).get("last", 0.0)
        print(
            f"  #{rank} Val_Loss={val_loss:.4f} Val_Acc={val_acc:.2%} "
            f"Val_F1={val_f1:.4f} task_id={trial.id}"
        )
        print(f"      {cfg}")
def _on_trial_done(
    job_id: str,
    objective: float,
    objective_iteration=None,
    job_parameters=None,
    top_performance_job_id=None,
):
    """Callback fired each time a trial finishes.

    Bug fix: ClearML invokes ``job_complete_callback`` with FIVE positional
    arguments (job id, objective value, objective iteration, the trial's
    parameters, and the id of the best job so far). The original
    two-parameter signature raised ``TypeError`` on every trial completion.
    The extra parameters default to ``None`` so a direct two-argument call
    still works.
    """
    print(f"[SWEEP] Trial {job_id} finished — {OBJECTIVE_METRIC_TITLE}={objective:.4f}")
| if __name__ == "__main__": | |
| main() | |