import os import pandas as pd import mlflow import shutil import logging import time import numpy as np from sklearn.metrics import accuracy_score, f1_score, classification_report from src.mlflow_utils import safe_set_experiment logger = logging.getLogger(__name__) def check_java_availability(): """Checks if Java is available in the system""" try: import subprocess import os # Try to find Java in PATH result = subprocess.run(['java', '-version'], capture_output=True, text=True, timeout=5) if result.returncode == 0: return True # If not found in PATH, try common paths on Windows java_paths = [ r"C:\Program Files\Eclipse Adoptium\jdk-11.0.30.7-hotspot\bin\java.exe", r"C:\Program Files\Eclipse Adoptium\jdk-11.0.23.9-hotspot\bin\java.exe", r"C:\Program Files\Java\jdk-11\bin\java.exe", r"C:\Program Files\Java\jdk-17\bin\java.exe", ] for java_path in java_paths: if os.path.exists(java_path): result = subprocess.run([java_path, '-version'], capture_output=True, text=True, timeout=5) if result.returncode == 0: return True return False except (subprocess.TimeoutExpired, FileNotFoundError, Exception): return False def initialize_h2o(): """Initializes the H2O cluster with Java check""" if not check_java_availability(): raise RuntimeError( "Java is not installed on the system. H2O AutoML requires Java to function.\n\n" "Options:\n" "1. Install Java locally (JRE/JDK)\n" "2. Use Docker: docker build -t multi-automl-interface . && docker run -p 8501:8501 multi-automl-interface\n" "3. Use AutoGluon or FLAML as alternatives (they do not require Java)\n" "\nTo install Java on Windows:\n" "- Download from: https://adoptium.net/\n" "- Or use: winget install EclipseAdoptium.Temurin.11.JDK" ) try: import h2o h2o.init(max_mem_size="4G", nthreads=-1) logger.info("H2O Cluster initialized successfully") return h2o except Exception as e: logger.error(f"Error initializing H2O: {e}") raise def cleanup_h2o(): """Finalizes the H2O cluster""" try: import h2o h2o.cluster().shutdown() logger.info("H2O Cluster finalized") except Exception as e: logger.warning(f"Error finalizing H2O: {e}") def prepare_data_for_h2o(train_data: pd.DataFrame, target: str): """Prepares data for H2O AutoML""" import h2o # Drop null values train_data_clean = train_data.dropna(subset=[target]) # For textual data, create basic numerical features if train_data_clean.select_dtypes(include=['object']).shape[1] > 0: logger.info("Text columns detected, generating basic numerical features...") # For each text column, build basic features for col in train_data_clean.select_dtypes(include=['object']).columns: if col != target: # Text length train_data_clean[f'{col}_length'] = train_data_clean[col].astype(str).str.len() # Word count train_data_clean[f'{col}_word_count'] = train_data_clean[col].astype(str).str.split().str.len() # Drop text columns except target text_cols = train_data_clean.select_dtypes(include=['object']).columns text_cols = [col for col in text_cols if col != target] train_data_clean = train_data_clean.drop(columns=text_cols) # Convert to H2OFrame h2o_frame = h2o.H2OFrame(train_data_clean) # Convert target to factor (categorical) if classification if train_data_clean[target].dtype == 'object' or train_data_clean[target].nunique() < 20: h2o_frame[target] = h2o_frame[target].asfactor() return h2o_frame, train_data_clean def train_h2o_model(train_data: pd.DataFrame, target: str, run_name: str, valid_data: pd.DataFrame = None, test_data: pd.DataFrame = None, max_runtime_secs: int = 300, max_models: int = 10, nfolds: int = 3, balance_classes: bool = True, seed: int = 42, sort_metric: str = "AUTO", exclude_algos: list = None): """ Trains H2O AutoML model and registers in MLflow """ import h2o from h2o.automl import H2OAutoML safe_set_experiment("H2O_Experiments") logging.info(f"Starting H2O AutoML training for run: {run_name}") # Initialize H2O h2o_instance = initialize_h2o() try: with mlflow.start_run(run_name=run_name) as run: # Prepare data h2o_frame, clean_data = prepare_data_for_h2o(train_data, target) # Log parameters mlflow.log_param("target", target) mlflow.log_param("max_runtime_secs", max_runtime_secs) mlflow.log_param("max_models", max_models) mlflow.log_param("nfolds", nfolds) mlflow.log_param("balance_classes", balance_classes) mlflow.log_param("seed", seed) mlflow.log_param("sort_metric", sort_metric) mlflow.log_param("model_type", "h2o_automl") if exclude_algos: mlflow.log_param("exclude_algos", exclude_algos) # Define features (all except target) features = [col for col in clean_data.columns if col != target] mlflow.log_param("features", features) # Configurar AutoML aml = H2OAutoML( max_runtime_secs=max_runtime_secs, max_models=max_models, seed=seed, nfolds=nfolds, balance_classes=balance_classes, keep_cross_validation_predictions=True, keep_cross_validation_models=False, verbosity='info', sort_metric=sort_metric, exclude_algos=exclude_algos or [] ) # Prepare test and validation data if present h2o_valid = None if valid_data is not None: if target not in valid_data.columns: raise ValueError(f"Target column '{target}' not found in Validation data.") valid_data = valid_data.dropna(subset=[target]) h2o_valid, _ = prepare_data_for_h2o(valid_data, target) mlflow.log_param("has_validation_data", True) h2o_test = None if test_data is not None: if target not in test_data.columns: raise ValueError(f"Target column '{target}' not found in Test data.") test_data = test_data.dropna(subset=[target]) h2o_test, _ = prepare_data_for_h2o(test_data, target) mlflow.log_param("has_test_data", True) # Train model logger.info("Starting H2O AutoML training...") start_time = time.time() train_kwargs = {"x": features, "y": target, "training_frame": h2o_frame} if h2o_valid is not None: train_kwargs["validation_frame"] = h2o_valid if h2o_test is not None: train_kwargs["leaderboard_frame"] = h2o_test aml.train(**train_kwargs) training_duration = time.time() - start_time logger.info(f"Training completed in {training_duration:.2f} seconds") # Get leaderboard leaderboard = aml.leaderboard # Check if leaderboard is empty if leaderboard.nrow == 0: logger.warning("⚠️ No models trained. Leaderboard is empty.") logger.warning("This can happen if:") logger.warning("1. Max runtime is too short") logger.warning("2. Data is not adequate for algorithms") logger.warning("3. Data has underlying issues") # Log basic metrics even without models mlflow.log_metric("total_models_trained", 0) mlflow.log_metric("training_duration", training_duration) mlflow.log_metric("best_model_score", 0.0) # Return AutoML even without models return aml, run.info.run_id logger.info("\nTop 5 models:") print(leaderboard.head(5)) # Save leaderboard as metric with safe wrapper try: # Check available columns in leaderboard leaderboard_df = None try: leaderboard_df = leaderboard.as_data_frame() logger.info(f"Available columns: {list(leaderboard_df.columns)}") except Exception as e: logger.warning(f"Could not convert leaderboard to DataFrame: {e}") # Try to get the best available metric best_model_score = 0.0 if leaderboard_df is not None and len(leaderboard_df) > 0: # Search for metrics in preference order for metric in ['auc', 'logloss', 'rmse', 'mae', 'r2']: if metric in leaderboard_df.columns: best_model_score = leaderboard_df.iloc[0][metric] logger.info(f"Using metric '{metric}': {best_model_score}") break mlflow.log_metric("total_models_trained", len(leaderboard_df)) else: # Fallback: use the first value in H2O leaderboard try: available_columns = leaderboard.columns logger.info(f"Available H2O columns: {available_columns}") # Try accessing first row, first metric col if len(available_columns) > 0: first_col = available_columns[0] best_model_score = leaderboard[0, first_col] logger.info(f"Using first available column '{first_col}': {best_model_score}") mlflow.log_metric("total_models_trained", leaderboard.nrow) except Exception as e: logger.warning(f"Could not extract metrics from leaderboard: {e}") mlflow.log_metric("total_models_trained", 0) mlflow.log_metric("best_model_score", best_model_score) mlflow.log_metric("training_duration", training_duration) except Exception as e: logger.warning(f"Error processing leaderboard metrics: {e}") # Default fallback mlflow.log_metric("best_model_score", 0.0) mlflow.log_metric("training_duration", training_duration) mlflow.log_metric("total_models_trained", 0) # Try saving leaderboard with error handling try: leaderboard_df = leaderboard.as_data_frame() leaderboard_path = f"h2o_leaderboard_{run_name}.csv" leaderboard_df.to_csv(leaderboard_path, index=False) mlflow.log_artifact(leaderboard_path) except Exception as e: logger.warning(f"Could not save leaderboard as CSV: {e}") # Save as plain text if CSV fails try: leaderboard_text = str(leaderboard.head(10)) leaderboard_path = f"h2o_leaderboard_{run_name}.txt" with open(leaderboard_path, "w") as f: f.write(f"H2O AutoML Leaderboard - {run_name}\n") f.write("=" * 50 + "\n") f.write(leaderboard_text) mlflow.log_artifact(leaderboard_path) except Exception as e2: logger.warning(f"Could not save leaderboard as text: {e2}") # Save local model (only if there are models) if hasattr(aml, 'leader') and aml.leader is not None: model_dir = "models/h2o_models" os.makedirs(model_dir, exist_ok=True) model_path = f"{model_dir}/h2o_model_{run_name}" # Save best model (leader) rather than AutoML object best_model = aml.leader h2o.save_model(best_model, path=model_path) logger.info(f"Model saved at: {model_path}") # Log model to MLflow temp_model_path = f"temp_h2o_model_{run_name}" os.makedirs(temp_model_path, exist_ok=True) h2o.save_model(best_model, path=temp_model_path) mlflow.log_artifacts(temp_model_path, artifact_path="model") # Clean temp directory import shutil if os.path.exists(temp_model_path): shutil.rmtree(temp_model_path) else: logger.warning("⚠️ No model to save (no models were trained)") # Create a placeholder file explaining the situation no_model_path = f"no_model_{run_name}.txt" with open(no_model_path, "w") as f: f.write(f"H2O AutoML - {run_name}\n") f.write("=" * 50 + "\n") f.write("No models were trained during this run.\n") f.write("Possible causes:\n") f.write("1. Insufficient training time\n") f.write("2. Data inadequate for algorithms\n") f.write("3. Data quality issues\n") f.write(f"Training time: {training_duration:.2f} seconds\n") mlflow.log_artifact(no_model_path) # Generate classification report for classification tasks (only if models exist) if (clean_data[target].dtype == 'object' or clean_data[target].nunique() < 20) and hasattr(aml, 'leader') and aml.leader is not None: try: best_model = aml.leader predictions = best_model.predict(h2o_frame) pred_array = predictions['predict'].as_data_frame()['predict'].values true_labels = clean_data[target].values # Calculate metrics accuracy = accuracy_score(true_labels, pred_array) f1_macro = f1_score(true_labels, pred_array, average='macro') f1_weighted = f1_score(true_labels, pred_array, average='weighted') logger.info(f"\nValidation metrics:") logger.info(f"Accuracy: {accuracy:.4f}") logger.info(f"F1-Score (macro): {f1_macro:.4f}") logger.info(f"F1-Score (weighted): {f1_weighted:.4f}") # Log validation metrics mlflow.log_metric("validation_accuracy", accuracy) mlflow.log_metric("validation_f1_macro", f1_macro) mlflow.log_metric("validation_f1_weighted", f1_weighted) # Generate report class_report = classification_report(true_labels, pred_array) report_path = f"classification_report_{run_name}.txt" with open(report_path, "w") as f: f.write(f"Classification Report - H2O AutoML\n") f.write(f"{'='*50}\n\n") f.write(class_report) mlflow.log_artifact(report_path) except Exception as e: logger.warning(f"Could not generate classification report: {e}") else: logger.info("Skipping report generation (no models trained or not a classification problem)") # Clean temporary files if os.path.exists(leaderboard_path): os.remove(leaderboard_path) report_path_temp = f"classification_report_{run_name}.txt" if os.path.exists(report_path_temp): os.remove(report_path_temp) return aml, run.info.run_id except Exception as e: logger.error(f"Error during H2O training: {e}") raise def load_h2o_model(run_id: str): """ Loads H2O model from MLflow """ import h2o # Initialize H2O if not active try: h2o.init(max_mem_size="2G", nthreads=-1) except: pass # H2O might already be active try: # Download artifact local_path = mlflow.artifacts.download_artifacts(run_id=run_id, artifact_path="model") # Find and load the model for root, dirs, files in os.walk(local_path): for file in files: if file.endswith(".zip"): model_path = os.path.join(root, file) logger.info(f"Loading H2O model from: {model_path}") model = h2o.load_model(model_path) # Check if model loaded correctly if model is None: raise ValueError("Loaded model is None") logger.info(f"H2O model loaded successfully: {type(model)}") return model raise FileNotFoundError("H2O model not found in artifacts.") except Exception as e: logger.error(f"Error loading H2O model: {e}") raise def predict_with_h2o(model, data: pd.DataFrame): """ Makes predictions using an H2O model """ import h2o # Check if model is valid if model is None: raise ValueError("H2O model is None. Ensure the model was loaded correctly.") try: logger.info(f"Starting prediction with H2O model: {type(model)}") # Prepare data the same way as training h2o_frame, _ = prepare_data_for_h2o(data, target="dummy") # target not used for prediction # Do predictions predictions = model.predict(h2o_frame) pred_array = predictions['predict'].as_data_frame()['predict'].values logger.info(f"Prediction complete: {len(pred_array)} predictions") return pred_array except Exception as e: logger.error(f"Error in H2O prediction: {e}") raise finally: # Clean H2O frame to release memory try: if 'h2o_frame' in locals(): h2o_frame = None except: pass