"""Evaluate a pickled LinearRegression model on the held-out AimoScore test set.

The script loads the model, predicts on the test CSV, checks the error metrics
against fixed quality thresholds, writes the predictions to a CSV file and
saves an actual-vs-predicted scatter plot.
"""

import datetime
import pickle
import re  # For using regular expressions

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

train_path = "../Datasets_all/A2_dataset_80.csv"
test_path = "../Datasets_all/A2_dataset_20.csv"

# Quality gates applied to the test-set metrics.
MAX_MAE = 0.15
MIN_R2 = 0.5


def save_prediction_plot(y_test, y_test_pred_baseline, baseline_test_r2):
    """Save an actual-vs-predicted scatter plot as a timestamped PNG.

    The file is written to the current working directory as
    ``baseline_actual_vs_predicted_<YYYYmmdd_HHMMSS>.png``.

    Args:
        y_test: Actual target values; must provide ``.min()`` and ``.max()``.
        y_test_pred_baseline: Predicted values, same length as ``y_test``.
        baseline_test_r2: R^2 score shown in the plot title.
    """
    # Visualize baseline predictions
    fig, axes = plt.subplots(figsize=(5, 5))
    try:
        # Actual vs Predicted
        axes.scatter(y_test, y_test_pred_baseline, alpha=0.5)

        # Dashed diagonal marks where a perfect prediction would fall.
        lo, hi = y_test.min(), y_test.max()
        axes.plot([lo, hi], [lo, hi], 'r--', lw=2)

        axes.set_xlabel('Actual AimoScore')
        axes.set_ylabel('Predicted AimoScore')
        axes.set_title(f'Baseline: Actual vs Predicted (R²={baseline_test_r2:.4f})')
        axes.grid(True, alpha=0.3)

        # Save the figure
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")  # e.g., 20260130_143210
        fig_path = f"baseline_actual_vs_predicted_{timestamp}.png"
        # Save through the figure object rather than pyplot's implicit
        # "current figure", which other code could have changed.
        fig.savefig(fig_path, dpi=300, bbox_inches='tight')
        print(f"Figure saved to {fig_path}")
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)


def extract_missing_feature(error_message):
    """Return the feature names listed as unseen at fit time in an error message.

    Looks for the ``Feature names unseen at fit time:`` header and collects
    every ``- name`` bullet that follows it, stopping at the first line that
    is not a bullet. Presumably this matches scikit-learn's feature-name
    validation message (one bullet per line, long lists cut short with a
    ``- ...`` bullet) -- confirm against the installed version.

    Args:
        error_message: Text of the ``ValueError`` raised during prediction.

    Returns:
        List of feature names, or an empty list when the header is absent.
        A bullet containing ``", "`` is split into several names, so a
        comma-separated list on a single line is also accepted.
    """
    header = re.search(r"Feature names unseen at fit time:", error_message)
    if not header:
        return []

    features = []
    for line in error_message[header.end():].splitlines():
        entry = line.strip()
        if not entry:
            continue  # tolerate blank lines around the bullet list
        if not entry.startswith("-"):
            break  # next section of the message: the list is over
        name = entry[1:].strip()
        # "..." is only a truncation marker, not a column name; the hidden
        # names show up in the next error once the listed ones are removed.
        if not name or name == "...":
            continue
        features.extend(name.split(', '))
    return features


def load_and_evaluate_model(model_path):
    """Load a pickled model, evaluate it on the test set and save the results.

    Columns the model rejects as unseen at fit time are dropped from the test
    features and the prediction is retried. When predictions are obtained they
    are written to ``predicted_test.csv`` (column ``Predicted_AimoScore``) and
    an actual-vs-predicted plot is saved.

    Args:
        model_path: Path to the pickled ``LinearRegression`` model.

    Raises:
        AssertionError: If the unpickled object is not a ``LinearRegression``,
            or if the test-set MAE is not below ``MAX_MAE`` or the R^2 score
            is not above ``MIN_R2``. In the latter cases nothing is saved.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserializing; only point this at model files from a trusted source.
    with open(model_path, "rb") as f:
        model = pickle.load(f)

    # Check the model type. Raised explicitly rather than via `assert` so the
    # check still runs under `python -O`.
    if not isinstance(model, LinearRegression):
        raise AssertionError(
            f"Expected a LinearRegression model, got {type(model).__name__}"
        )

    # Load data
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Define target and features dynamically
    target_col = "AimoScore"
    unwanted_cols = ["EstimatedScore"]
    features_cols = [
        col for col in train_df.columns
        if col not in unwanted_cols and col != target_col
    ]

    # Initialize features for prediction
    X_test = test_df[features_cols]
    y_test = test_df[target_col]

    # None means "no prediction was produced"; NaN means "score not computed".
    y_pred = None
    r2 = float("nan")

    # Continue to predict until no ValueErrors occur
    while True:
        try:
            # Predict on test set
            y_pred = model.predict(X_test)

            # Evaluate
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
        except ValueError as e:
            print(f"Error during prediction: {e}")

            # Extract missing features from the error message
            missing_features = extract_missing_feature(str(e))
            rejected = set(missing_features)
            remaining = [col for col in features_cols if col not in rejected]

            # Stop when nothing can be removed. This also covers names that
            # were reported but are not among our columns, which would
            # otherwise reproduce the same error forever.
            if len(remaining) == len(features_cols):
                print("No more features can be removed, stopping execution.")
                break

            print(f"Removing missing features from test set: {missing_features}")
            features_cols = remaining
            X_test = X_test[features_cols]
        else:
            print(f"Mean Absolute Error on test set: {mae:.4f}")
            print(f"R^2 score on test set: {r2:.4f}")

            # Enforce the threshold values. `not (a < b)` also rejects NaN,
            # exactly as the equivalent assert statement would.
            if not mae < MAX_MAE:
                raise AssertionError("Mean Absolute Error is too high")
            if not r2 > MIN_R2:
                raise AssertionError("R^2 score is too low")
            break  # Exit the loop if no errors occur

    if y_pred is None:
        # Nothing to save or plot when every prediction attempt failed.
        print("no predictions!!!")
        return

    # Save predictions to CSV
    test_df["Predicted_AimoScore"] = y_pred
    test_df.to_csv("predicted_test.csv", index=False)

    save_prediction_plot(y_test, y_pred, r2)


if __name__ == "__main__":
    model_path = "linear_regression_model.pkl"
    load_and_evaluate_model(model_path)