Spaces:
Sleeping
Sleeping
File size: 14,818 Bytes
9244b7e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 | """
pipeline_parser.py — infer which AutoML pipeline step is active from live logs.
Each framework has a sequence of steps. This module parses log lines
to determine which step is "done", which is "active", and which is "pending".
"""
from typing import Optional
# ── Step definitions per framework ───────────────────────────────────────────
# Each step has:
#   label       — displayed name
#   keywords    — log keywords that signal this step has STARTED or is active
#   done_kw     — log keywords that signal this step is DONE (optional)
#   description — tooltip / explainer text
# Step catalogue: lower-case framework key -> ordered list of step dicts.
# List order is the pipeline order; infer_pipeline_steps() relies on it when it
# marks every step up to the last matched one as done.
# Besides label/keywords/done_kw/description, every step carries an "icon"
# string that is copied unchanged into the result for display.
# All keywords are written in lower case because they are matched as plain
# substrings against the lower-cased log text.
_STEPS: dict[str, list[dict]] = {
# AutoGluon: data preparation -> model fitting -> ensembling -> evaluation -> MLflow logging.
"autogluon": [
{
"label": "Data Preparation",
"icon": "π",
"keywords": ["preprocessing", "converting", "fitting", "loading data", "train_data"],
# "fitting model:" doubles as a start keyword of the next step.
"done_kw": ["beginning automl", "fitting model:"],
"description": "Validates and preprocesses the dataset. Handles missing values, categorical encoding and feature types.",
},
{
"label": "Fitting Models",
"icon": "π€",
"keywords": ["fitting model:", "training model for", "fitting with cpus"],
"done_kw": ["weightedensemble", "autogluon training complete"],
"description": "Trains each individual model (LightGBM, XGBoost, CatBoost, RF, etc.) within the time budget.",
},
{
"label": "Stacking / Ensembling",
"icon": "ποΈ",
"keywords": ["weightedensemble", "ensemble weights", "stacking"],
"done_kw": ["autogluon training complete"],
"description": "Combines the best models using weighted ensembling or multi-layer stacking.",
},
{
"label": "Evaluation",
"icon": "π",
"keywords": ["leaderboard", "best model:", "validation score", "score_val"],
"done_kw": ["tabularpredictor saved", "best model logged"],
"description": "Evaluates all models on the validation set and builds the final leaderboard.",
},
{
"label": "MLflow Logging",
"icon": "π",
"keywords": ["mlflow", "log_artifacts", "logged successfully", "artifacts logged"],
"done_kw": ["thread finished"],
"description": "Persists model artifacts, parameters, and metrics to MLflow for tracking and versioning.",
},
],
# FLAML: data preparation -> hyperparameter search -> best config -> model saving -> MLflow logging.
"flaml": [
{
"label": "Data Preparation",
"icon": "π",
"keywords": ["data ready", "preprocessing", "starting flaml"],
"done_kw": ["executing hyperparameter search"],
"description": "Validates the dataset, detects feature types, and prepares inputs for FLAML's optimizer.",
},
{
"label": "Hyperparameter Search",
"icon": "π",
"keywords": ["executing hyperparameter search", "automl.fit", "[flaml.automl", "trial", "best config"],
"done_kw": ["search finished"],
"description": "FLAML runs a cost-effective search over hyperparameter configurations using Bayesian optimization.",
},
{
"label": "Best Config Selection",
"icon": "π",
"keywords": ["search finished", "best estimator", "best loss", "best final"],
"done_kw": ["saving best model"],
"description": "Identifies the best-performing estimator and its configuration from the search results.",
},
{
"label": "Model Saving",
"icon": "πΎ",
"keywords": ["saving best model", "model_path", "artifact_path"],
# Substring matching means any "mlflow" mention in the logs counts as done here.
"done_kw": ["mlflow", "logged successfully"],
"description": "Serializes the trained model to disk using pickle.",
},
{
"label": "MLflow Logging",
"icon": "π",
"keywords": ["mlflow", "log_artifact", "logged successfully"],
"done_kw": ["thread finished"],
"description": "Persists model artifacts, parameters, and metrics to MLflow for tracking and versioning.",
},
],
# H2O: cluster init -> data preparation -> AutoML training -> leaderboard -> MLflow logging.
"h2o": [
{
"label": "H2O Cluster Init",
"icon": "π",
"keywords": ["h2o cluster initialized", "initializing h2o", "h2o init"],
# This signal is a prefix of the next step's done signal, so a
# "starting h2o automl training" line satisfies both.
"done_kw": ["starting h2o automl"],
"description": "Starts the local H2O Java cluster and allocates memory for distributed model training.",
},
{
"label": "Data Preparation",
"icon": "π",
"keywords": ["preparing data", "h2oframe", "feature engineering", "asfactor"],
"done_kw": ["starting h2o automl training"],
"description": "Converts Pandas DataFrames to H2O frames and applies type casting for features/targets.",
},
{
"label": "AutoML Training",
"icon": "π€",
"keywords": ["starting h2o automl training", "automl session", "training completed", "aml.train"],
"done_kw": ["training completed in"],
"description": "H2O trains multiple model families (GBM, XGBoost, GLM, DRF, DeepLearning) and their variants.",
},
{
"label": "Leaderboard & Scoring",
"icon": "π",
"keywords": ["top 5 models", "leaderboard", "best model score", "auc", "total_models_trained"],
"done_kw": ["model saved at", "log model to mlflow"],
"description": "Ranks all trained models and evaluates the leader on the validation/test set.",
},
{
"label": "MLflow Logging",
"icon": "π",
"keywords": ["mlflow", "log_artifacts", "logged successfully", "artifacts logged"],
"done_kw": ["thread finished"],
"description": "Persists model artifacts, parameters, and metrics to MLflow for tracking and versioning.",
},
],
# TPOT: data preparation -> genetic pipeline search -> pipeline selection -> export -> MLflow logging.
"tpot": [
{
"label": "Data Preparation",
"icon": "π",
"keywords": ["problem type:", "training data shape", "test data shape", "label encoder"],
"done_kw": ["starting tpot training"],
"description": "Applies feature engineering pipelines: TF-IDF for text, ordinal encoding, and standard scaling.",
},
{
"label": "Pipeline Generation (GA)",
"icon": "π§¬",
"keywords": ["starting tpot training", "generation:", "pipeline score:", "optimizing pipeline"],
"done_kw": ["training completed"],
"description": "TPOT uses a Genetic Algorithm to evolve and select the best scikit-learn pipeline configurations.",
},
{
"label": "Pipeline Selection",
"icon": "π",
"keywords": ["training completed", "best pipeline", "fitted_pipeline_", "accuracy:", "f1_macro:"],
"done_kw": ["pipeline exported"],
"description": "Identifies the highest-scoring pipeline from the genetic search as the final model.",
},
{
"label": "Export & Analysis",
"icon": "π€",
"keywords": ["pipeline exported", "export", "classification report"],
# Substring matching means any "mlflow" mention in the logs counts as done here.
"done_kw": ["mlflow"],
"description": "Exports the best pipeline as a .py file and generates a classification/regression report.",
},
{
"label": "MLflow Logging",
"icon": "π",
"keywords": ["mlflow", "tpot automl model", "registered_model_name", "logged successfully"],
"done_kw": ["thread finished"],
"description": "Persists model artifacts, parameters, and metrics to MLflow for tracking and versioning.",
},
],
# PyCaret: setup -> model comparison -> tuning -> blending -> MLflow logging.
# The "step: ..." variants below already contain the shorter keyword next to
# them, so under substring matching they never match on their own.
"pycaret": [
{
"label": "Environment Setup",
"icon": "βοΈ",
"keywords": ["setting up pycaret", "dataset shape"],
"done_kw": ["comparing models", "step: comparing models..."],
"description": "Initializes the PyCaret setup, handling normalization, encoding, and train/test splits internally.",
},
{
"label": "Model Comparison",
"icon": "βοΈ",
"keywords": ["comparing models", "including fast/robust models"],
"done_kw": ["tuning best model", "step: tuning best model..."],
"description": "Trains and evaluates a fast baseline of multiple estimators to find the top candidates.",
},
{
"label": "Hyperparameter Tuning",
"icon": "π§",
"keywords": ["tuning best model", "step: tuning best model..."],
"done_kw": ["blending top models", "step: blending top models..."],
"description": "Applies randomized search to optimize hyperparameters of the best performing model.",
},
{
"label": "Model Blending",
"icon": "πͺοΈ",
"keywords": ["blending top models", "step: blending top models..."],
"done_kw": ["saving model", "pycaret experiment completed"],
"description": "Creates an ensemble of the top models to improve generalized performance via voting/averaging.",
},
{
"label": "MLflow Logging",
"icon": "π",
"keywords": ["saving model to", "pycaret experiment completed", "thread finished"],
"done_kw": ["thread finished"],
"description": "Persists model artifacts, parameters, and metrics to MLflow for tracking and versioning.",
},
],
# Lale: pipeline definition -> hyperopt tuning -> optimizer fit -> best model extraction -> MLflow logging.
"lale": [
{
"label": "Pipeline Definition",
"icon": "βοΈ",
"keywords": ["defining lale planned pipeline", "dataset shape"],
"done_kw": ["tuning with hyperopt", "step: tuning with hyperopt..."],
"description": "Maps a search space over transformers (PCA, Scalers) and estimators (LR, RF, KNN).",
},
{
"label": "Hyperopt Tuning",
"icon": "π§",
"keywords": ["tuning with hyperopt", "step: tuning with hyperopt..."],
"done_kw": ["fitting lale optimizer", "step: fitting lale optimizer"],
"description": "Configures Tree-structured Parzen Estimators (TPE) algorithm for intelligent hyperparameter search.",
},
{
"label": "Fitting Optimizer",
"icon": "π",
"keywords": ["fitting lale optimizer", "step: fitting lale optimizer"],
# Same done signals as "Best Model Extraction" below, so both steps
# are reported as done at the same moment.
"done_kw": ["saving model locally", "step: saving model locally"],
"description": "Executes identical cross-validation folds on generated pipelines within the set budget.",
},
{
"label": "Best Model Extraction",
"icon": "π",
"keywords": ["best pipeline structure:", "best f1 (macro) score"],
"done_kw": ["saving model locally", "step: saving model locally"],
"description": "Decodes the structure and metrics of the optimized pipeline graph.",
},
{
"label": "MLflow Logging",
"icon": "π",
"keywords": ["saving model locally", "lale experiment completed", "thread finished"],
"done_kw": ["thread finished"],
"description": "Persists model artifacts, parameters, and metrics to MLflow for tracking and versioning.",
},
],
}
# ── Public API ────────────────────────────────────────────────────────────────
def get_framework_steps(framework_key: str) -> list[dict]:
    """Look up the ordered step definitions registered for a framework.

    The key is lower-cased before the lookup, so ``"H2O"`` and ``"h2o"`` are
    equivalent. An unknown framework yields an empty list rather than an error.
    """
    normalized_key = framework_key.lower()
    try:
        return _STEPS[normalized_key]
    except KeyError:
        return []
def _step_with_status(step: dict, step_status: str) -> dict:
    """Return the display fields of *step* together with a ``status`` entry."""
    return {
        "label": step["label"],
        "icon": step["icon"],
        "description": step["description"],
        "status": step_status,
    }


def infer_pipeline_steps(framework_key: str, logs: list[str], status: str) -> list[dict]:
    """Attach a per-step status to the framework's pipeline steps.

    Args:
        framework_key: Framework name; looked up via ``get_framework_steps``.
        logs: Log lines collected so far. ``None`` is treated as no logs.
        status: Overall run status. ``"completed"``, ``"failed"``,
            ``"cancelled"`` and ``"running"`` are handled specially; any other
            value (e.g. a queued run) never produces an ``"active"`` step.

    Returns:
        One dict per step, in pipeline order, with the keys ``label``,
        ``icon``, ``description`` and ``status``. ``status`` is one of
        ``"done"``, ``"active"``, ``"pending"``, ``"failed"`` or
        ``"cancelled"``:

        * completed run: every step is ``"done"``.
        * failed / cancelled run: the last step whose start keywords appear
          in the logs gets the run status, earlier steps are ``"done"`` and
          later ones ``"pending"``. If no keyword matched at all, the first
          step gets the run status, mirroring the running case where the
          first step is the active one until a done signal shows up.
        * otherwise: steps up to the last one whose done signal appears are
          ``"done"``; the step after it is ``"active"`` when the run is
          running; everything else is ``"pending"``.

        An unknown framework yields an empty list.
    """
    steps = get_framework_steps(framework_key)
    if not steps:
        return []

    if status == "completed":
        # The logs are irrelevant here: a completed run finished every step.
        return [_step_with_status(step, "done") for step in steps]

    # Keywords are lower-case, so one lower-cased blob allows substring checks.
    log_blob = " ".join(logs or []).lower()

    if status in ("failed", "cancelled"):
        last_seen_idx = -1
        for i, step in enumerate(steps):
            if any(kw in log_blob for kw in step.get("keywords", [])):
                last_seen_idx = i
        # With no evidence in the logs the run stopped in the first step.
        stopped_idx = max(last_seen_idx, 0)

        result = []
        for i, step in enumerate(steps):
            if i < stopped_idx:
                step_status = "done"
            elif i == stopped_idx:
                step_status = status
            else:
                step_status = "pending"
            result.append(_step_with_status(step, step_status))
        return result

    # Running or queued: the highest step with a done signal closes every
    # step before it as well.
    last_done_idx = -1
    for i, step in enumerate(steps):
        if any(kw in log_blob for kw in step.get("done_kw", [])):
            last_done_idx = i

    # When the final step is already done this index is past the end and
    # simply never matches, so no step is reported as active.
    active_idx = last_done_idx + 1
    is_running = status == "running"

    result = []
    for i, step in enumerate(steps):
        if i <= last_done_idx:
            step_status = "done"
        elif is_running and i == active_idx:
            step_status = "active"
        else:
            step_status = "pending"
        result.append(_step_with_status(step, step_status))
    return result
def extract_best_tpot_pipeline(logs: list[str]) -> Optional[str]:
    """Find the most recent log line that mentions the TPOT best pipeline.

    Lines are scanned from newest to oldest. The first one that contains any
    of the markers (compared case-insensitively) is returned with surrounding
    whitespace removed. ``None`` is returned when no line matches.
    """
    markers = ("best pipeline:", "fitted_pipeline_", "pipeline(")
    for entry in reversed(logs):
        lowered = entry.lower()
        if any(marker in lowered for marker in markers):
            return entry.strip()
    return None
def extract_autogluon_leaderboard_text(logs: list[str]) -> Optional[str]:
    """Pull the AutoGluon leaderboard table out of the log lines.

    The table starts at the first line that contains both "model" and
    "score_val" (case-insensitive). That header and the lines following it,
    at most 16 lines in total, are joined with newlines. ``None`` is returned
    when no header line is present.
    """
    max_lines = 16  # header line plus up to 15 following lines

    def looks_like_header(entry: str) -> bool:
        lowered = entry.lower()
        return "model" in lowered and "score_val" in lowered

    header_pos = next(
        (pos for pos, entry in enumerate(logs) if looks_like_header(entry)),
        None,
    )
    if header_pos is None:
        return None
    return "\n".join(logs[header_pos:header_pos + max_lines])
|