Spaces:

Bachstelze
/

github_sync

Running

github_sync / test /test_model.py

Bachstelze

save plot figure

9182b44 5 days ago

3.95 kB

	import pickle
	import pandas as pd
	from sklearn.linear_model import LinearRegression
	from sklearn.metrics import mean_absolute_error, r2_score
	import re # For using regular expressions
	import matplotlib.pyplot as plt
	import datetime

	# CSV files read by load_and_evaluate_model. The paths are relative, so they
	# are resolved against the current working directory, not this file's location.
	# NOTE(review): presumably the 80/20 suffixes denote a train/test split - confirm.
	train_path = "../Datasets_all/A2_dataset_80.csv"
	test_path = "../Datasets_all/A2_dataset_20.csv"

	def save_prediction_plot(y_test, y_test_pred_baseline, baseline_test_r2):
	    """Scatter actual against predicted scores and save the figure as a PNG.

	    The file is written to the current working directory under a name that
	    embeds the current local time at one-second resolution, so two calls
	    within the same second write to the same file name.

	    Args:
	        y_test: Actual target values. Must provide ``.min()`` and ``.max()``
	            (e.g. a pandas Series or NumPy array).
	        y_test_pred_baseline: Predicted values, aligned with ``y_test``.
	        baseline_test_r2: R^2 score shown in the plot title.

	    Returns:
	        The path of the PNG file that was written.
	    """
	    fig, axes = plt.subplots(figsize=(5, 5))
	    try:
	        # Actual vs Predicted
	        axes.scatter(y_test, y_test_pred_baseline, alpha=0.5)

	        # Dashed diagonal marks where predicted == actual.
	        lowest, highest = y_test.min(), y_test.max()
	        axes.plot([lowest, highest], [lowest, highest], 'r--', lw=2)

	        axes.set_xlabel('Actual AimoScore')
	        axes.set_ylabel('Predicted AimoScore')
	        axes.set_title(f'Baseline: Actual vs Predicted (R²={baseline_test_r2:.4f})')
	        axes.grid(True, alpha=0.3)

	        # Save the figure
	        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")  # e.g., 20260130_143210
	        fig_path = f"baseline_actual_vs_predicted_{timestamp}.png"
	        # Save through the figure object rather than pyplot's implicit
	        # "current figure", so the right figure is always the one written.
	        fig.savefig(fig_path, dpi=300, bbox_inches='tight')
	    finally:
	        # pyplot keeps every figure alive until it is closed explicitly.
	        plt.close(fig)

	    print(f"Figure saved to {fig_path}")
	    return fig_path

	def extract_missing_feature(error_message):
	    """Return the feature names listed as unseen at fit time in an error message.

	    The names are expected after the header "Feature names unseen at fit
	    time:", either one per line as "- name" bullets or as a single
	    comma-separated bullet. Collection stops at the first line that is not a
	    bullet, so names from any following section are not included. The "..."
	    placeholder that marks a truncated list is skipped.

	    Args:
	        error_message: Text of the ValueError raised during prediction.

	    Returns:
	        A list of feature names; empty when the header is not present.
	    """
	    header = re.search(r"Feature names unseen at fit time:\s*", error_message)
	    if header is None:
	        return []

	    features = []
	    for line in error_message[header.end():].splitlines():
	        bullet = re.match(r"\s*-\s+(.+)", line)
	        if bullet is None:
	            # First non-bullet line ends the "unseen" section.
	            break
	        for name in bullet.group(1).strip().split(', '):
	            # "..." is a truncation marker, not a column name.
	            if name and name != "...":
	                features.append(name)
	    return features

	def load_and_evaluate_model(model_path):
	    """Load a pickled linear model, score it on the test CSV and save the results.

	    The candidate feature columns are every column of the training CSV except
	    the target ("AimoScore") and "EstimatedScore". If prediction raises a
	    ValueError naming features the model was not fitted on, those columns are
	    dropped from the test features and prediction is retried. When a
	    prediction is obtained and the metric thresholds hold, the predictions are
	    written to "predicted_test.csv" and a scatter plot is saved via
	    ``save_prediction_plot``. When no prediction can be obtained, a message is
	    printed and nothing is written.

	    Args:
	        model_path: Path to a pickle file containing a ``LinearRegression``.

	    Raises:
	        AssertionError: If the unpickled object is not a ``LinearRegression``,
	            or if the test-set MAE is not below 0.15, or R^2 is not above 0.5.
	    """
	    # SECURITY: unpickling executes code embedded in the file; only load
	    # model files that come from a trusted source.
	    with open(model_path, "rb") as f:
	        model = pickle.load(f)

	    # Check the model type
	    assert isinstance(model, LinearRegression)

	    # Load data (the training CSV is only used for its column names)
	    train_df = pd.read_csv(train_path)
	    test_df = pd.read_csv(test_path)

	    # Define target and features dynamically
	    target_col = "AimoScore"
	    unwanted_cols = ["EstimatedScore"]

	    features_cols = [
	        col for col in train_df.columns
	        if col not in unwanted_cols and col != target_col
	    ]

	    # Initialize features for prediction
	    X_test = test_df[features_cols]
	    y_test = test_df[target_col]

	    # None means "no prediction yet"; a numeric placeholder would be
	    # indistinguishable from a real result further down.
	    y_pred = None

	    # Retry prediction, dropping the columns the model complains about.
	    # Only predict() sits inside the try so that errors from the metrics are
	    # not mistaken for a feature mismatch.
	    while True:
	        try:
	            y_pred = model.predict(X_test)
	        except ValueError as e:
	            print(f"Error during prediction: {e}")
	            # Extract missing features from the error message
	            missing_features = extract_missing_feature(str(e))
	        else:
	            break

	        remaining_cols = [col for col in features_cols if col not in missing_features]
	        if len(remaining_cols) == len(features_cols):
	            # Nothing was (or could be) removed: retrying would raise the same
	            # error forever, so give up.
	            print("No more features can be removed, stopping execution.")
	            break

	        print(f"Removing missing features from test set: {missing_features}")
	        features_cols = remaining_cols
	        X_test = X_test[features_cols]

	    if y_pred is None:
	        print("no predictions!!!")
	        return

	    # Evaluate
	    mae = mean_absolute_error(y_test, y_pred)
	    r2 = r2_score(y_test, y_pred)

	    print(f"Mean Absolute Error on test set: {mae:.4f}")
	    print(f"R^2 score on test set: {r2:.4f}")

	    # Assert the threshold values
	    assert mae < 0.15, "Mean Absolute Error is too high"
	    assert r2 > 0.5, "R^2 score is too low"

	    # Save predictions to CSV
	    test_df["Predicted_AimoScore"] = y_pred
	    test_df.to_csv("predicted_test.csv", index=False)

	    save_prediction_plot(y_test, y_pred, r2)


	if __name__ == "__main__":
	    # Script entry point: evaluate the pickled model file named below,
	    # looked up relative to the current working directory.
	    load_and_evaluate_model(model_path="linear_regression_model.pkl")