# =============================================================================
# AQI PREDICTION - CHAMPION MODEL TRAINING SCRIPT
# =============================================================================
#
# Description:
#   This script automates the process of training the champion AQI prediction
#   model. It performs the following steps:
#   1. Loads the latest daily data.
#   2. Preprocesses the data (handles timestamps).
#   3. Performs two stages of feature engineering (lags, rolling stats,
#      interactions, etc.).
#   4. Defines the top 3 optimized base models (RandomForest, CatBoost, XGBoost).
#   5. Trains a Weighted Averaging Ensemble model on the entire dataset.
#   6. Saves the final, trained model object to a joblib file for use in
#      prediction.
#
# Exit status:
#   0 when the pipeline completes, 1 when any step fails, so that a scheduler
#   running this script can tell a failed retrain from a successful one.

import os
import sys
import time

import joblib
import numpy as np
import pandas as pd

# --- Model Imports ---
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
import xgboost as xgb
import catboost as cb

# --- CONFIGURATION ---
# Define file paths here to make them easy to change.
DATA_FILE_PATH = 'data/karachi_daily_data_5_years.csv'
MODEL_OUTPUT_DIR = 'models'
MODEL_FILENAME = 'MAIN MODEL.joblib'


# --- DATA PROCESSING FUNCTIONS ---

def load_and_preprocess_data(file_path: str) -> pd.DataFrame:
    """Load the raw CSV and return it indexed and sorted by timestamp.

    Args:
        file_path: Path of a CSV file that contains a 'timestamp' column.

    Returns:
        The CSV contents with 'timestamp' parsed to datetimes, moved into the
        index, and the rows sorted by that index.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist (raised by pandas).
        KeyError: If the CSV has no 'timestamp' column.
    """
    print(f"1/4: Loading and preprocessing data from '{file_path}'...")
    df = pd.read_csv(file_path)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df.set_index('timestamp', inplace=True)
    # Lag and rolling features built later rely on chronological row order.
    df.sort_index(inplace=True)
    print(" ...Data loaded and preprocessed.")
    return df


def create_base_features(df: pd.DataFrame, lags: int = 7) -> pd.DataFrame:
    """Add AQI lag columns and calendar columns to a copy of ``df``.

    Args:
        df: Frame with a DatetimeIndex and an 'aqi' column.
        lags: Number of lag columns to create ('aqi_lag_1' .. 'aqi_lag_<lags>').

    Returns:
        A copy of ``df`` with the lag columns plus 'month', 'day_of_year' and
        'day_of_week' derived from the index. The input frame is not modified.
        The leading rows contain NaN in the lag columns; they are not dropped
        here.
    """
    print("2/4: Creating base features (lags and time)...")
    df_featured = df.copy()

    # Lag Features for AQI: row t receives the AQI value of row t - i.
    for i in range(1, lags + 1):
        df_featured[f'aqi_lag_{i}'] = df_featured['aqi'].shift(i)

    # Time-Based Features
    df_featured['month'] = df_featured.index.month
    df_featured['day_of_year'] = df_featured.index.dayofyear
    df_featured['day_of_week'] = df_featured.index.dayofweek

    print(" ...Base features created.")
    return df_featured


def create_advanced_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add rolling statistics, interaction and cyclical features.

    Args:
        df: Frame that already contains the columns 'aqi', 'pm25',
            'carbon_monoxide', 'wind_speed', 'humidity', 'temperature',
            'month' and 'day_of_week'.

    Returns:
        A copy of ``df`` with the new feature columns added, the raw 'month'
        and 'day_of_week' columns removed (replaced by their sin/cos
        encodings), and every row that contains a NaN in any column dropped.
        The input frame is not modified.
    """
    print("3/4: Creating advanced features (rolling stats, interactions, cyclical)...")
    df_advanced = df.copy()

    # Rolling Window Features
    window_sizes = [3, 7]
    cols_to_roll = ['aqi', 'pm25', 'carbon_monoxide', 'wind_speed', 'humidity']
    for window in window_sizes:
        for col in cols_to_roll:
            # shift(1) keeps the current row's own value out of its window,
            # so each statistic is computed from preceding rows only.
            rolling = df_advanced[col].shift(1).rolling(window=window)
            df_advanced[f'{col}_rolling_mean_{window}'] = rolling.mean()
            df_advanced[f'{col}_rolling_std_{window}'] = rolling.std()

    # Interaction Features
    # The +1 in the denominator avoids division by zero when wind_speed is 0.
    df_advanced['pm25_x_wind_interaction'] = (
        df_advanced['pm25'] / (df_advanced['wind_speed'] + 1)
    )
    df_advanced['temp_x_humidity_interaction'] = (
        df_advanced['temperature'] * df_advanced['humidity']
    )

    # Cyclical Features: sin/cos encoding over a period of 12 (month) and
    # 7 (day of week).
    df_advanced['month_sin'] = np.sin(2 * np.pi * df_advanced['month'] / 12)
    df_advanced['month_cos'] = np.cos(2 * np.pi * df_advanced['month'] / 12)
    df_advanced['day_of_week_sin'] = np.sin(2 * np.pi * df_advanced['day_of_week'] / 7)
    df_advanced['day_of_week_cos'] = np.cos(2 * np.pi * df_advanced['day_of_week'] / 7)
    df_advanced.drop(['month', 'day_of_week'], axis=1, inplace=True)

    # Drop NaNs created by the feature engineering process
    df_advanced.dropna(inplace=True)

    print(" ...Advanced features created.")
    return df_advanced


def train_champion_model(df: pd.DataFrame, output_path: str) -> None:
    """Train the weighted ensemble on all rows of ``df`` and save it.

    Every column of ``df`` except 'aqi' is used as a feature; 'aqi' is the
    target. The fitted ``VotingRegressor`` is written to ``output_path`` with
    ``joblib.dump``.

    Args:
        df: Fully engineered frame containing the 'aqi' target column.
        output_path: Destination file for the model. Its parent directory is
            created when missing; a bare file name (no directory part) is
            written to the current working directory.
    """
    print("4/4: Training the champion model...")

    # --- a. Define the top-performing base models with their best parameters ---
    rf_model = RandomForestRegressor(
        n_estimators=200,
        max_depth=20,
        max_features='sqrt',
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42,
        n_jobs=-1,
    )
    catboost_model = cb.CatBoostRegressor(
        iterations=300,
        learning_rate=0.05,
        depth=4,
        l2_leaf_reg=3,
        random_state=42,
        verbose=0,
    )
    xgboost_model = xgb.XGBRegressor(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.1,
        subsample=0.7,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    )

    # --- b. Define the Weighted Averaging Ensemble (VotingRegressor) ---
    # The weights correspond to the confidence in each model (40%, 40%, 20%)
    estimators = [
        ('Optimized RandomForest', rf_model),
        ('Optimized CatBoost', catboost_model),
        ('Optimized XGBoost', xgboost_model),
    ]
    weights = [0.4, 0.4, 0.2]
    ensemble_model = VotingRegressor(estimators=estimators, weights=weights)

    # --- c. Prepare final data and train the model ---
    X_full = df.drop('aqi', axis=1)
    y_full = df['aqi']
    ensemble_model.fit(X_full, y_full)

    # --- d. Save the trained model object ---
    # os.path.dirname() returns '' for a bare file name and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when one is named.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    joblib.dump(ensemble_model, output_path)

    print(" ...Model training complete.")


# =============================================================================
# --- MAIN EXECUTION BLOCK ---
# =============================================================================

def main() -> int:
    """Run the full retraining pipeline and return a process exit status.

    Returns:
        0 when the model was trained and saved, 1 when any step failed. The
        failure is reported on stdout before returning.
    """
    start_time = time.time()
    print("--- Starting Daily Model Retraining Pipeline ---")

    try:
        # Step 1: Load and preprocess
        df_clean = load_and_preprocess_data(DATA_FILE_PATH)

        # Step 2: Create base features
        df_featured = create_base_features(df_clean)

        # Step 3: Create advanced features
        df_final_features = create_advanced_features(df_featured)

        # Step 4: Train and save the champion model
        model_output_path = os.path.join(MODEL_OUTPUT_DIR, MODEL_FILENAME)
        train_champion_model(df_final_features, model_output_path)

        end_time = time.time()
        print("\n--- PIPELINE COMPLETED SUCCESSFULLY ---")
        print(f"Final model saved to: {model_output_path}")
        print(f"Total runtime: {end_time - start_time:.2f} seconds")
    except FileNotFoundError:
        print(f"\nERROR: Input data file not found at '{DATA_FILE_PATH}'. Aborting pipeline.")
        return 1
    except Exception as e:
        # Top-level boundary: report the failure and signal it through the
        # exit status instead of letting the process end with status 0.
        print(f"\nAn unexpected error occurred during the pipeline: {e}")
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())