import pickle
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
import re  # For using regular expressions
import matplotlib.pyplot as plt
import datetime

# Dataset locations, relative to the current working directory.
# NOTE(review): the file names suggest an 80/20 train/test split — confirm.
# Training CSV: in this file it is read only to derive the feature column names.
train_path = "../Datasets_all/A2_dataset_80.csv"
# Test CSV: the loaded model predicts on it and is scored against its target column.
test_path = "../Datasets_all/A2_dataset_20.csv"

def save_prediction_plot(y_test, y_test_pred_baseline, baseline_test_r2):
    """Scatter actual vs. predicted AimoScore and save the figure as a PNG.

    The image is written to the current working directory as
    ``baseline_actual_vs_predicted_<YYYYmmdd_HHMMSS>.png`` (local time).

    Args:
        y_test: Actual target values. Must provide ``.min()`` and ``.max()``,
            which are used to draw the red dashed reference diagonal.
        y_test_pred_baseline: Predicted values, plotted against ``y_test``.
        baseline_test_r2: R² score displayed in the plot title.

    Returns:
        The path of the saved PNG file.
    """
    fig, axes = plt.subplots(figsize=(5, 5))
    try:
        # Actual vs Predicted
        axes.scatter(y_test, y_test_pred_baseline, alpha=0.5)

        # Reference diagonal: points on this line are perfect predictions.
        lo, hi = y_test.min(), y_test.max()
        axes.plot([lo, hi], [lo, hi], 'r--', lw=2)

        axes.set_xlabel('Actual AimoScore')
        axes.set_ylabel('Predicted AimoScore')
        axes.set_title(f'Baseline: Actual vs Predicted (R²={baseline_test_r2:.4f})')
        axes.grid(True, alpha=0.3)

        # Timestamped name so repeated runs do not overwrite each other,
        # e.g. baseline_actual_vs_predicted_20260130_143210.png
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        fig_path = f"baseline_actual_vs_predicted_{timestamp}.png"

        # Save through the figure object rather than pyplot's implicit
        # "current figure", so the right figure is written even if another
        # one has been created in the meantime.
        fig.savefig(fig_path, dpi=300, bbox_inches='tight')
    finally:
        # pyplot keeps a reference to every figure until it is closed; close
        # it here so repeated calls do not accumulate open figures.
        plt.close(fig)

    print(f"Figure saved to {fig_path}")
    return fig_path

def extract_missing_feature(error_message):
    """Return the feature names listed as unseen at fit time in an error message.

    The message is expected to contain the header
    ``Feature names unseen at fit time:`` followed by one or more entries that
    start with ``-``, either on the same line or one entry per following line.
    Every consecutive ``-`` entry after the header is collected; parsing stops
    at the first non-empty line that is not such an entry (for example the
    header of a following section).

    Args:
        error_message: Text of the exception raised during prediction.

    Returns:
        A list of feature names in the order they appear, or an empty list
        when the header is absent or no names follow it.
    """
    _, header, tail = error_message.partition("Feature names unseen at fit time:")
    if not header:
        return []

    features = []
    for line in tail.splitlines():
        entry = line.strip()
        if not entry:
            # Tolerate blank lines between the header and the entries.
            continue
        if not entry.startswith("-"):
            # First non-entry line: the list of unseen features has ended.
            break
        # The ", " split is kept for backward compatibility with a
        # single-line "- a, b" form of the message.
        for name in entry[1:].strip().split(", "):
            # "..." looks like a truncation marker rather than a real column
            # name (NOTE(review): confirm against the installed scikit-learn).
            if name and name != "...":
                features.append(name)
    return features

def load_and_evaluate_model(model_path):
    """Load a pickled LinearRegression, score it on the test CSV and save outputs.

    Steps:
      1. Unpickle the model at ``model_path`` and check its type.
      2. Build the feature list from the training CSV's columns, excluding the
         target ``AimoScore`` and the unwanted ``EstimatedScore`` column.
      3. Predict on the test CSV. When a ``ValueError`` is raised whose message
         names features unseen at fit time, those columns are dropped from the
         test features and the prediction is retried.
      4. Print MAE and R² and enforce the thresholds (MAE < 0.15, R² > 0.5).
      5. Write the test data plus a ``Predicted_AimoScore`` column to
         ``predicted_test.csv`` and save the actual-vs-predicted plot.

    If no prediction could be produced, a message is printed and neither the
    CSV nor the plot is written.

    Args:
        model_path: Path to the pickle file containing the fitted model.

    Raises:
        AssertionError: If the unpickled object is not a ``LinearRegression``,
            or if MAE / R² do not meet the thresholds.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only load model files that come from a trusted source.
    with open(model_path, "rb") as f:
        model = pickle.load(f)

    # Raised explicitly instead of via `assert` so the check is not stripped
    # when Python runs with -O.
    if not isinstance(model, LinearRegression):
        raise AssertionError(
            f"Expected a LinearRegression model, got {type(model).__name__}"
        )

    # The training data is only used to determine the feature column names.
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Define target and features dynamically
    target_col = "AimoScore"
    unwanted_cols = ["EstimatedScore"]

    features_cols = [
        col for col in train_df.columns
        if col not in unwanted_cols and col != target_col
    ]

    X_test = test_df[features_cols]
    y_test = test_df[target_col]

    # None means "no successful prediction yet"; the output step below relies
    # on this to avoid writing a placeholder value as if it were a prediction.
    y_pred = None
    r2 = 0

    # Retry prediction, dropping unseen features, until it succeeds or no
    # further progress is possible.
    while True:
        try:
            y_pred = model.predict(X_test)

            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            print(f"Mean Absolute Error on test set: {mae:.4f}")
            print(f"R^2 score on test set: {r2:.4f}")

            # Quality gates; written as `not (a < b)` so NaN metrics fail too.
            if not (mae < 0.15):
                raise AssertionError("Mean Absolute Error is too high")
            if not (r2 > 0.5):
                raise AssertionError("R^2 score is too low")
            break  # Exit the loop if no errors occur

        except ValueError as e:
            print(f"Error during prediction: {e}")

            # Extract missing features from the error message
            missing_features = extract_missing_feature(str(e))
            remaining_cols = [
                col for col in features_cols if col not in missing_features
            ]

            # Stop when nothing would be removed. This covers both "no names
            # found in the message" and "names found but none match a column";
            # retrying in the latter case would loop forever on the same error.
            if len(remaining_cols) == len(features_cols):
                print("No more features can be removed, stopping execution.")
                break

            print(f"Removing missing features from test set: {missing_features}")
            features_cols = remaining_cols
            X_test = X_test[features_cols]

    if y_pred is None:
        print("no predictions!!!")
        return

    # Save predictions to CSV
    test_df["Predicted_AimoScore"] = y_pred
    test_df.to_csv("predicted_test.csv", index=False)

    save_prediction_plot(y_test, y_pred, r2)


if __name__ == "__main__":
    # Script entry point: evaluate the pickled model file, resolved relative
    # to the current working directory.
    load_and_evaluate_model("linear_regression_model.pkl")