"""Scikit-learn model probing: lazy model search plus hyperparameter tuning.

Provides ScikitProbe, which (a) runs a LazyPredict sweep over many models to
find a promising model family, (b) tunes that family's hyperparameters with
RandomizedSearchCV, and (c) scores the tuned model with the shared metric
functions so metric computation has a single source of truth.
"""
import json

import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from typing import Dict, Any, Tuple, Optional

try:
    from metrics import (
        get_regression_scorer,
        get_classification_scorer,
        classification_scorer,
        regression_scorer,
        compute_single_label_classification_metrics,
        compute_regression_metrics,
    )
    from utils import print_message
    from seed_utils import get_global_seed
except ImportError:
    from ..metrics import (
        get_regression_scorer,
        get_classification_scorer,
        classification_scorer,
        regression_scorer,
        compute_single_label_classification_metrics,
        compute_regression_metrics,
    )
    from ..utils import print_message
    from ..seed_utils import get_global_seed

from transformers import EvalPrediction
from .lazy_predict import (
    LazyRegressor,
    LazyClassifier,
    CLASSIFIER_DICT,
    REGRESSOR_DICT,
    ALL_MODEL_DICT,
)
from .scikit_hypers import HYPERPARAMETER_DISTRIBUTIONS


class ScikitArguments:
    """
    Combined arguments class for scikit-learn model training and tuning.

    Args:
        n_iter: Number of parameter settings sampled by RandomizedSearchCV.
        cv: Number of cross-validation folds during tuning.
        random_state: Seed for the search; falls back to the global seed
            when not given.
        model_name: Name of a specific model to run (programmatic arg).
        scikit_model_name: Same as ``model_name`` but supplied via CLI;
            takes precedence when both are set.
        scikit_model_args: JSON string of model hyperparameters (CLI).
            When provided, tuning is skipped.
        model_args: Dict of model hyperparameters (programmatic arg); used
            only when ``scikit_model_args`` is absent.
        production_model: When True, train and validation splits are merged
            before fitting (see ScikitProbe.run_specific_model).

    Raises:
        ValueError: If ``scikit_model_args`` is not valid JSON.
    """

    def __init__(
        self,
        # Tuning arguments
        n_iter: int = 100,
        cv: int = 3,
        random_state: Optional[int] = None,
        # Specific model arguments (optional)
        model_name: Optional[str] = None,
        scikit_model_name: Optional[str] = None,  # CLI arg name
        scikit_model_args: Optional[str] = None,  # CLI arg - JSON string
        model_args: Optional[Dict[str, Any]] = None,
        production_model: bool = False,
        **kwargs,
    ):
        # Tuning arguments
        self.n_iter = n_iter
        self.cv = cv
        # Only treat None as "unset" so an explicit seed of 0 is honored
        # (the previous `random_state or get_global_seed()` discarded 0).
        self.random_state = (
            random_state if random_state is not None else get_global_seed()
        )

        # Specific model arguments - scikit_model_name takes precedence (CLI arg)
        self.model_name = scikit_model_name or model_name

        # Parse scikit_model_args JSON string if provided (CLI), otherwise use model_args dict
        if scikit_model_args is not None:
            try:
                self.model_args = json.loads(scikit_model_args)
                print_message(f"Using pre-specified hyperparameters (skipping tuning): {self.model_args}")
            except json.JSONDecodeError as e:
                # Chain the original decode error for easier debugging.
                raise ValueError(f"Failed to parse --scikit_model_args JSON: {e}") from e
        else:
            self.model_args = model_args if model_args is not None else {}

        self.production_model = production_model


class ModelResults:
    """
    Container for the outcome of a model search / tuning run.

    Args:
        initial_scores: LazyPredict leaderboard (None when a specific model
            was run directly, i.e. no sweep happened).
        best_model_name: Name of the selected model class.
        best_params: Hyperparameters of the selected model (None when tuning
            used library defaults).
        final_scores: Metric-name -> value mapping on the test split.
        best_model: The fitted estimator itself.
    """

    def __init__(
        self,
        initial_scores: Optional[pd.DataFrame],
        best_model_name: str,
        best_params: Optional[Dict[str, Any]],
        final_scores: Dict[str, float],
        best_model: Any,
    ):
        self.initial_scores = initial_scores
        self.best_model_name = best_model_name
        self.best_params = best_params
        self.final_scores = final_scores
        self.best_model = best_model

    def __str__(self) -> str:
        return (
            f"Best Model: {self.best_model_name}\n"
            f"Best Parameters: {self.best_params}\n"
            f"Final Scores: {self.final_scores}"
        )


class ScikitProbe:
    """
    A class for finding and tuning the best scikit-learn models for a given dataset.
    """

    def __init__(self, args: ScikitArguments):
        self.args = args
        # Single-process search; raise for parallel CV if the models are cheap.
        self.n_jobs = 1

    def _tune_hyperparameters(
        self,
        model_class: Any,
        model_name: str,
        X_train: np.ndarray,
        y_train: np.ndarray,
        custom_scorer: Any,
    ) -> Tuple[Any, Dict[str, Any]]:
        """
        Perform hyperparameter tuning using RandomizedSearchCV.

        Args:
            model_class: Estimator class (instantiated with defaults here).
            model_name: Key into HYPERPARAMETER_DISTRIBUTIONS.
            X_train: Training features.
            y_train: Training targets.
            custom_scorer: sklearn scorer used to rank candidates.

        Returns:
            Tuple of (best fitted estimator, best parameter dict). When no
            search space is defined for the model, returns a default-constructed
            estimator (unfitted) and an empty dict.
        """
        param_distributions = HYPERPARAMETER_DISTRIBUTIONS.get(model_name, {})
        if not param_distributions:
            print_message(f"No hyperparameter distributions defined for {model_name}, using defaults")
            return model_class(), {}

        print_message(f"Running RandomizedSearchCV with {self.args.n_iter} iterations, {self.args.cv}-fold CV...")
        print_message(f"Hyperparameter search space: {list(param_distributions.keys())}")

        random_search = RandomizedSearchCV(
            model_class(),
            param_distributions=param_distributions,
            n_iter=self.args.n_iter,
            scoring=custom_scorer,
            cv=self.args.cv,
            random_state=self.args.random_state,
            n_jobs=self.n_jobs,
            verbose=2,  # Show progress for each fit
        )
        random_search.fit(X_train, y_train)
        print_message(f"Best CV score: {random_search.best_score_:.4f}")
        return random_search.best_estimator_, random_search.best_params_

    def _find_best_model(
        self,
        lazy_cls: Any,
        custom_metric: Any,
        scorer: Any,
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_test: np.ndarray,
        y_test: np.ndarray,
    ) -> ModelResults:
        """
        Shared driver for find_best_regressor / find_best_classifier.

        Runs the LazyPredict sweep, tunes the top-ranked model's
        hyperparameters, refits it on the training split, and scores it on
        the test split.
        """
        # Initial lazy prediction
        print_message("Initial lazy prediction started")
        lazy = lazy_cls(
            verbose=0,
            ignore_warnings=False,
            custom_metric=custom_metric,
        )
        initial_scores = lazy.fit(X_train, X_test, y_train, y_test)
        # LazyPredict may return (scores_df, predictions); keep only the scores.
        # Fixed: check against builtin `tuple` — isinstance with typing.Tuple
        # is deprecated and unreliable on modern Python.
        if isinstance(initial_scores, tuple):
            initial_scores = initial_scores[0]

        # Leaderboard rows are ranked best-first, so the first index wins.
        best_model_name = initial_scores.index[0]
        # Models are now stored directly (not as Pipeline) after optimization
        best_model_class = lazy.models[best_model_name].__class__

        print_message(f"Best model name: {best_model_name}")
        print_message(f"Best model class: {best_model_class}")
        print_message(f"Initial scores: \n{initial_scores}")
        print_message("Tuning hyperparameters")

        # Tune hyperparameters
        best_model, best_params = self._tune_hyperparameters(
            best_model_class,
            best_model_name,
            X_train,
            y_train,
            scorer,
        )

        # Refit on the full training split, then score on the held-out test split.
        best_model.fit(X_train, y_train)
        final_scores = self._calculate_metrics(best_model, X_test, y_test, best_model_name)
        print_message(f"Final scores: {final_scores}")
        print_message(f"Best params: \n{best_params}")

        return ModelResults(
            initial_scores=initial_scores,
            best_model_name=best_model_name,
            best_params=best_params,
            final_scores=final_scores,
            best_model=best_model,
        )

    def find_best_regressor(
        self,
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_test: np.ndarray,
        y_test: np.ndarray,
    ) -> ModelResults:
        """
        Find the best regression model through lazy prediction and hyperparameter tuning.

        Args:
            X_train: Training features
            y_train: Training targets
            X_test: Test features
            y_test: Test targets

        Returns:
            ModelResults object containing all results and the best model
        """
        return self._find_best_model(
            LazyRegressor,
            regression_scorer(),
            get_regression_scorer(),
            X_train,
            y_train,
            X_test,
            y_test,
        )

    def find_best_classifier(
        self,
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_test: np.ndarray,
        y_test: np.ndarray,
    ) -> ModelResults:
        """
        Find the best classification model through lazy prediction and hyperparameter tuning.

        Args:
            X_train: Training features
            y_train: Training targets
            X_test: Test features
            y_test: Test targets

        Returns:
            ModelResults object containing all results and the best model
        """
        return self._find_best_model(
            LazyClassifier,
            classification_scorer(),
            get_classification_scorer(),
            X_train,
            y_train,
            X_test,
            y_test,
        )

    def _calculate_metrics(
        self,
        model: Any,
        X: np.ndarray,
        y: np.ndarray,
        model_name: str,
    ) -> Dict[str, float]:
        """
        Delegate to the shared metric functions in metrics.py via EvalPrediction,
        keeping a single source of truth for metric calculation across the codebase.

        Returns an empty dict when ``model_name`` is in neither CLASSIFIER_DICT
        nor REGRESSOR_DICT.
        """
        if model_name in CLASSIFIER_DICT:
            if hasattr(model, 'predict_proba'):
                predictions = model.predict_proba(X)
            else:
                # Fall back to one-hot hard predictions for models without predict_proba.
                # NOTE(review): n_classes is derived from the labels in `y`; assumes
                # y contains every class the model can predict — confirm upstream.
                y_pred = model.predict(X)
                n_classes = len(np.unique(y))
                predictions = np.eye(n_classes)[y_pred.astype(int)]
            p = EvalPrediction(predictions=predictions, label_ids=y)
            return compute_single_label_classification_metrics(p)
        elif model_name in REGRESSOR_DICT:
            y_pred = model.predict(X)
            p = EvalPrediction(predictions=y_pred, label_ids=y)
            return compute_regression_metrics(p)
        return {}

    def run_specific_model(
        self,
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_valid: np.ndarray,
        y_valid: np.ndarray,
        X_test: np.ndarray,
        y_test: np.ndarray,
        model_results: Optional[ModelResults] = None,
    ) -> ModelResults:
        """
        Run a specific model with given arguments or based on a previous ModelResults.

        Args:
            X_train: Training features
            y_train: Training targets
            X_valid: Validation features
            y_valid: Validation targets
            X_test: Test features
            y_test: Test targets
            model_results: Optional ModelResults from find_best_classifier or find_best_regressor
                If provided, will use the best model type and parameters from it

        Returns:
            ModelResults object containing results and the model

        Raises:
            ValueError: If neither ``model_results`` nor ``args.model_name`` is
                given, or the named model is in neither model dict.
        """
        print_message("Running specific model")
        if self.args.production_model:
            print_message(f"Running in production mode, train and validation are combined")
            X_train = np.concatenate([X_train, X_valid])
            y_train = np.concatenate([y_train, y_valid])

        # If model_results is provided, use its best model type and parameters
        if model_results is not None:
            model_name = model_results.best_model_name
            model_params = model_results.best_params if model_results.best_params is not None else {}

            # Get the model class
            model_class = ALL_MODEL_DICT[model_name]

            # Create and train the model with the best parameters
            cls = model_class(**model_params)
            print_message(f"Training model {cls}")
            cls.fit(X_train, y_train)
            print_message(f"Model trained")
            final_scores = self._calculate_metrics(cls, X_test, y_test, model_name)
            print_message(f"Final scores: {final_scores}")

            return ModelResults(
                initial_scores=None,
                best_model_name=model_name,
                best_params=model_params,
                final_scores=final_scores,
                best_model=cls,
            )

        # Original functionality when no model_results is provided
        elif self.args.model_name is not None:
            model_name = self.args.model_name
            # Pick the scorer by task type; unknown names are rejected up front.
            if model_name in CLASSIFIER_DICT:
                scorer = get_classification_scorer()
            elif model_name in REGRESSOR_DICT:
                scorer = get_regression_scorer()
            else:
                raise ValueError(f"Model {model_name} not supported")

            model_class = ALL_MODEL_DICT[model_name]

            # Skip tuning if model_args is already provided
            if self.args.model_args:
                print_message(f"Skipping hyperparameter tuning - using provided args: {self.args.model_args}")
                best_model = model_class(**self.args.model_args)
                best_params = self.args.model_args
            else:
                # Run hyperparameter tuning
                print_message(f"Tuning hyperparameters for {model_name}")
                best_model, best_params = self._tune_hyperparameters(
                    model_class,
                    model_name,
                    X_train,
                    y_train,
                    scorer,
                )
                print_message(f"Best parameters: {best_params}")

            # Train final model with best parameters
            print_message(f"Training final model with best parameters")
            best_model.fit(X_train, y_train)
            final_scores = self._calculate_metrics(best_model, X_test, y_test, model_name)
            print_message(f"Final scores: {final_scores}")

            return ModelResults(
                initial_scores=None,
                best_model_name=model_name,
                best_params=best_params,
                final_scores=final_scores,
                best_model=best_model,
            )
        else:
            raise ValueError("Either model_name must be specified in args or model_results must be provided")