Spaces:

PedroM2626
/

Multi-AutoML-Interface

Runtime error

Multi-AutoML-Interface / src /tpot_utils.py

feat: add support for multiple AutoML frameworks (TPOT, H2O, AutoGluon, FLAML) including data preprocessing and MLflow integration.

9c720d9 15 days ago

raw

history blame contribute delete

20.2 kB

	import os
	import pandas as pd
	import numpy as np
	import mlflow
	import mlflow.sklearn
	import shutil
	import logging
	import time
	import warnings
	from sklearn.model_selection import train_test_split
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.compose import ColumnTransformer
	from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
	from sklearn.pipeline import Pipeline
	from sklearn.metrics import accuracy_score, f1_score, classification_report, mean_squared_error, r2_score

	# Monkeypatch for scikit-learn >= 1.2 compatibility with TPOT
	import sklearn.metrics
	if not hasattr(sklearn.metrics, 'SCORERS'):
	try:
	from sklearn.metrics import get_scorer_names
	sklearn.metrics.SCORERS = {name: name for name in get_scorer_names()}
	except ImportError:
	pass

	from tpot import TPOTClassifier, TPOTRegressor
	from src.mlflow_utils import safe_set_experiment

	logger = logging.getLogger(__name__)

	def detect_problem_type(y):
	"""Detect if problem is classification or regression"""
	if pd.api.types.is_numeric_dtype(y):
	unique_values = y.nunique()
	if unique_values <= 20 and all(y % 1 == 0 for val in y.dropna()):
	return 'classification'
	else:
	return 'regression'
	else:
	return 'classification'

	def create_feature_pipeline(df, target_column, text_columns=None, tfidf_max_features=500, tfidf_ngram_range=(1, 2)):
	"""Create feature engineering pipeline for TPOT"""
	# Identify column types
	categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
	numerical_columns = df.select_dtypes(include=[np.number]).columns.tolist()

	# Remove target column from features
	if target_column in categorical_columns:
	categorical_columns.remove(target_column)
	if target_column in numerical_columns:
	numerical_columns.remove(target_column)

	# Handle text columns separately
	if text_columns is None:
	text_columns = []
	# Auto-detect text columns (high cardinality object columns)
	for col in categorical_columns:
	if df[col].nunique() > len(df) * 0.5 and df[col].dtype == 'object':
	text_columns.append(col)
	categorical_columns.remove(col)

	transformers = []

	# Text processing
	if text_columns:
	for col in text_columns:
	transformers.append((f'tfidf_{col}', TfidfVectorizer(
	max_features=tfidf_max_features,
	ngram_range=tfidf_ngram_range,
	stop_words='english',
	dtype=np.float64,
	token_pattern=r'(?u)\b\w+\b' # Handle empty strings better
	), col))

	# Numerical features
	if numerical_columns:
	transformers.append(('num', StandardScaler(), numerical_columns))

	# Categorical features (non-text)
	if categorical_columns:
	# For TPOT, we use OrdinalEncoder to prevent dimension explosion on high cardinality
	transformers.append(('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_columns))

	if transformers:
	# TPOT requires dense arrays for most operations, so we force sparse_threshold=0
	preprocessor = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0)
	else:
	preprocessor = None

	return preprocessor, text_columns, categorical_columns, numerical_columns

	def prepare_data_for_tpot(df, target_column, test_data=None, test_size=0.2, random_state=42):
	"""Prepare data for TPOT training"""
	# Drop rows with missing target
	df_clean = df.dropna(subset=[target_column]).copy()

	# Handle missing values in features
	# For text columns, fill with empty string
	# For numerical columns, fill with median
	# For categorical columns, fill with mode
	for col in df_clean.columns:
	if col != target_column:
	if df_clean[col].dtype == 'object':
	df_clean[col] = df_clean[col].fillna('').astype(str)
	else:
	df_clean[col] = df_clean[col].fillna(df_clean[col].median())

	# Convert all object columns to string to avoid mixed types
	for col in df_clean.select_dtypes(include=['object']).columns:
	if col != target_column:
	df_clean[col] = df_clean[col].astype(str)

	# Process test_data if provided
	if test_data is not None:
	if target_column not in test_data.columns:
	raise ValueError(f"Target column '{target_column}' not found in Test data.")
	test_clean = test_data.dropna(subset=[target_column]).copy()
	for col in test_clean.columns:
	if col != target_column:
	if test_clean[col].dtype == 'object':
	test_clean[col] = test_clean[col].fillna('').astype(str)
	else:
	test_clean[col] = test_clean[col].fillna(test_clean[col].median())
	for col in test_clean.select_dtypes(include=['object']).columns:
	if col != target_column:
	test_clean[col] = test_clean[col].astype(str)

	# Split features and target
	X_train = df_clean.drop(columns=[target_column])
	y_train = df_clean[target_column].copy()

	if test_data is not None:
	X_test = test_clean.drop(columns=[target_column])
	y_test = test_clean[target_column].copy()

	# Handle target encoding for classification
	problem_type = detect_problem_type(y_train)
	label_encoder = None
	if problem_type == 'classification' and y_train.dtype == 'object':
	label_encoder = LabelEncoder()
	y_train = label_encoder.fit_transform(y_train)
	if test_data is not None:
	# Handle unknown labels in test by assigning them to a special class or throwing error
	# For simplicity in AutoML we fit_transform on train and transform on test, catching unknown label cases if needed
	try:
	y_test = label_encoder.transform(y_test)
	except ValueError:
	# If there are new labels in test, handle them gracefully by forcing a combined fit
	combined_y = pd.concat([df_clean[target_column], test_clean[target_column]])
	label_encoder.fit(combined_y)
	y_train = label_encoder.transform(df_clean[target_column])
	y_test = label_encoder.transform(test_clean[target_column])

	# Split data if test_data is not explicitly provided
	if test_data is None:
	X_train, X_test, y_train, y_test = train_test_split(
	X_train, y_train, test_size=test_size, random_state=random_state, stratify=y_train if problem_type == 'classification' else None
	)

	return X_train, X_test, y_train, y_test, problem_type, label_encoder

	def train_tpot_model(df, target_column, run_name,
	valid_data=None, test_data=None,
	generations=5, population_size=20, cv=5,
	scoring=None, max_time_mins=30, max_eval_time_mins=5, random_state=42,
	verbosity=2, n_jobs=-1, config_dict='TPOT sparse',
	tfidf_max_features=500, tfidf_ngram_range=(1, 2)):
	"""
	Train TPOT model with MLflow tracking
	"""
	try:
	# TPOT handles validation automatically via CV. If validation is passed, concatenate to train for larger pool
	if valid_data is not None:
	if target_column not in valid_data.columns:
	raise ValueError(f"Target column '{target_column}' not found in Validation data.")
	df = pd.concat([df, valid_data], ignore_index=True)
	mlflow.log_param("has_validation_data", True)

	# Setup experiment
	safe_set_experiment("TPOT_Experiments")

	# Prepare data
	X_train, X_test, y_train, y_test, problem_type, label_encoder = prepare_data_for_tpot(
	df, target_column, test_data=test_data, random_state=random_state
	)

	# Create feature pipeline
	preprocessor, text_columns, cat_columns, num_columns = create_feature_pipeline(
	X_train, target_column,
	tfidf_max_features=tfidf_max_features,
	tfidf_ngram_range=tfidf_ngram_range
	)

	# Process features
	if preprocessor is not None:
	X_train_processed = preprocessor.fit_transform(X_train)
	X_test_processed = preprocessor.transform(X_test)
	else:
	X_train_processed = X_train
	X_test_processed = X_test

	logger.info(f"Problem type: {problem_type}")
	logger.info(f"Training data shape: {X_train_processed.shape}")
	logger.info(f"Test data shape: {X_test_processed.shape}")

	# Check for high dimensional data and adjust parameters
	n_features = X_train_processed.shape[1]
	n_samples = X_train_processed.shape[0]

	if n_features > 10000:
	logger.warning(f"High dimensional data detected: {n_features} features")
	# Reduce complexity for high dimensional data
	generations = min(generations, 2)
	population_size = min(population_size, 10)
	max_eval_time_mins = min(max_eval_time_mins, 2)
	config_dict = 'TPOT light'
	logger.info(f"Adjusted parameters for high dimensional data: generations={generations}, population_size={population_size}")

	if n_samples > 50000:
	logger.warning(f"Large dataset detected: {n_samples} samples")
	# Reduce complexity for large datasets
	cv = min(cv, 3)
	max_eval_time_mins = min(max_eval_time_mins, 1)
	logger.info(f"Adjusted parameters for large dataset: cv={cv}, max_eval_time_mins={max_eval_time_mins}")

	# Default scoring based on problem type
	if scoring is None:
	if problem_type == 'classification':
	scoring = 'f1_macro'
	else:
	scoring = 'neg_mean_squared_error'

	# Ensure there are no loose active runs that could cause errors on start
	while mlflow.active_run():
	mlflow.end_run()

	with mlflow.start_run(run_name=run_name) as run:
	logger.info(f"Starting TPOT training for run: {run_name}")

	# Choose TPOT class based on problem type
	if problem_type == 'classification':
	tpot = TPOTClassifier(
	generations=generations,
	population_size=population_size,
	cv=cv,
	scoring=scoring,
	max_time_mins=max_time_mins,
	max_eval_time_mins=max_eval_time_mins,
	random_state=random_state,
	verbosity=verbosity,
	n_jobs=n_jobs,
	config_dict=config_dict
	)
	else:
	tpot = TPOTRegressor(
	generations=generations,
	population_size=population_size,
	cv=cv,
	scoring=scoring,
	max_time_mins=max_time_mins,
	max_eval_time_mins=max_eval_time_mins,
	random_state=random_state,
	verbosity=verbosity,
	n_jobs=n_jobs,
	config_dict=config_dict
	)

	# Log parameters
	mlflow.log_param("problem_type", problem_type)
	mlflow.log_param("generations", generations)
	mlflow.log_param("population_size", population_size)
	mlflow.log_param("cv", cv)
	mlflow.log_param("scoring", scoring)
	mlflow.log_param("max_time_mins", max_time_mins)
	mlflow.log_param("max_eval_time_mins", max_eval_time_mins)
	mlflow.log_param("random_state", random_state)
	mlflow.log_param("config_dict", config_dict)
	mlflow.log_param("n_jobs", n_jobs)
	mlflow.log_param("target", target_column)
	mlflow.log_param("n_features", n_features)
	mlflow.log_param("n_samples", n_samples)
	mlflow.log_param("text_columns", text_columns)
	mlflow.log_param("categorical_columns", cat_columns)
	mlflow.log_param("numerical_columns", num_columns)

	# Train model with error handling
	try:
	start_time = time.time()
	tpot.fit(X_train_processed, y_train)
	training_duration = time.time() - start_time

	logger.info(f"Training completed in {training_duration:.2f} seconds")

	except Exception as tpot_error:
	logger.error(f"Error during TPOT training: {tpot_error}")

	# Try with simpler configuration
	logger.info("Trying with simpler configuration...")
	tpot = TPOTClassifier(
	generations=1,
	population_size=5,
	cv=2,
	scoring='accuracy' if problem_type == 'classification' else 'neg_mean_squared_error',
	max_time_mins=min(max_time_mins, 5),
	max_eval_time_mins=1,
	random_state=random_state,
	verbosity=1,
	n_jobs=1,
	config_dict='TPOT light'
	)

	tpot.fit(X_train_processed, y_train)
	training_duration = time.time() - start_time
	logger.info(f"Simplified training completed in {training_duration:.2f} seconds")

	# Predictions
	y_pred = tpot.predict(X_test_processed)

	# Calculate metrics
	if problem_type == 'classification':
	accuracy = accuracy_score(y_test, y_pred)
	f1_macro = f1_score(y_test, y_pred, average='macro')
	f1_weighted = f1_score(y_test, y_pred, average='weighted')

	logger.info(f"Accuracy: {accuracy:.4f}")
	logger.info(f"F1-Score (macro): {f1_macro:.4f}")
	logger.info(f"F1-Score (weighted): {f1_weighted:.4f}")

	# Log metrics
	mlflow.log_metric("accuracy", accuracy)
	mlflow.log_metric("f1_macro", f1_macro)
	mlflow.log_metric("f1_weighted", f1_weighted)

	# Classification report
	try:
	if label_encoder is not None:
	# Convert back to original labels for report
	y_test_orig = label_encoder.inverse_transform(y_test)
	y_pred_orig = label_encoder.inverse_transform(y_pred)
	class_names = label_encoder.classes_
	else:
	y_test_orig = y_test
	y_pred_orig = y_pred
	class_names = [f"Class_{i}" for i in np.unique(y_test)]

	report = classification_report(y_test_orig, y_pred_orig, target_names=class_names)
	logger.info(f"\nClassification Report:\n{report}")

	# Save classification report
	report_path = f"classification_report_{run_name}.txt"
	with open(report_path, "w") as f:
	f.write(f"TPOT AutoML Classification Report\n")
	f.write(f"{'='*50}\n\n")
	f.write(f"Problem Type: {problem_type}\n")
	f.write(f"Best Pipeline: {tpot.fitted_pipeline_}\n\n")
	f.write(report)

	mlflow.log_artifact(report_path)

	except Exception as e:
	logger.warning(f"Could not generate classification report: {e}")

	else: # Regression
	mse = mean_squared_error(y_test, y_pred)
	rmse = np.sqrt(mse)
	r2 = r2_score(y_test, y_pred)

	logger.info(f"MSE: {mse:.4f}")
	logger.info(f"RMSE: {rmse:.4f}")
	logger.info(f"R²: {r2:.4f}")

	# Log metrics
	mlflow.log_metric("mse", mse)
	mlflow.log_metric("rmse", rmse)
	mlflow.log_metric("r2", r2)

	mlflow.log_metric("training_duration", training_duration)

	# Create complete pipeline for saving
	if preprocessor is not None:
	final_pipeline = Pipeline([
	('preprocessor', preprocessor),
	('classifier', tpot.fitted_pipeline_)
	])
	else:
	final_pipeline = tpot.fitted_pipeline_

	# Save model info
	model_info = {
	"problem_type": problem_type,
	"best_pipeline": str(tpot.fitted_pipeline_),
	"generations": generations,
	"population_size": population_size,
	"cv": cv,
	"scoring": scoring,
	"training_duration": training_duration,
	"n_features": n_features,
	"n_samples": n_samples
	}

	if problem_type == 'classification':
	model_info.update({
	"accuracy": accuracy if 'accuracy' in locals() else None,
	"f1_macro": f1_macro if 'f1_macro' in locals() else None,
	"f1_weighted": f1_weighted if 'f1_weighted' in locals() else None
	})
	else:
	model_info.update({
	"mse": mse if 'mse' in locals() else None,
	"rmse": rmse if 'rmse' in locals() else None,
	"r2": r2 if 'r2' in locals() else None
	})

	# Export pipeline
	pipeline_path = f"tpot_models/best_pipeline_{run_name}.py"
	os.makedirs("tpot_models", exist_ok=True)
	tpot.export(pipeline_path)
	logger.info(f"Pipeline exported to {pipeline_path}")

	# Save model info
	info_path = f"tpot_models/model_info_{run_name}.txt"
	with open(info_path, "w") as f:
	f.write("TPOT AutoML Model Information\n")
	f.write(f"{'='*50}\n\n")
	for key, value in model_info.items():
	f.write(f"{key}: {value}\n")

	mlflow.log_artifact(pipeline_path)
	mlflow.log_artifact(info_path)

	# Log the fitted pipeline
	mlflow.sklearn.log_model(final_pipeline, "model", registered_model_name=f"TPOT_{run_name}")

	logger.info("TPOT model successfully registered in MLflow")

	return tpot, final_pipeline, run.info.run_id, model_info

	except Exception as e:
	logger.error(f"Error during TPOT training: {e}")
	raise

	def load_tpot_model(run_id, model_path="model"):
	"""Load TPOT model from MLflow"""
	try:
	model = mlflow.sklearn.load_model(f"runs:/{run_id}/{model_path}")
	return model
	except Exception as e:
	logger.error(f"Error loading TPOT model: {e}")
	raise

	def predict_with_tpot(model, data, preprocessor=None):
	"""Make predictions with TPOT model"""
	try:
	if preprocessor is not None:
	data_processed = preprocessor.transform(data)
	else:
	data_processed = data

	predictions = model.predict(data_processed)
	return predictions
	except Exception as e:
	logger.error(f"Error during TPOT prediction: {e}")
	raise