# src/models/feature_selection.py
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from scipy.stats import pearsonr
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    cross_val_score,
)
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from factor_analyzer import FactorAnalyzer
import pickle
def train(df):
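    """Train and compare regressors on EFA-reduced features, including opening_week."""
    # One-hot encode every genre token found in the space-separated "genres" column.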
    unique_genres = set(
        genre for sublist in df["genres"].str.split() for genre in sublist
    )
    for genre in unique_genres:
        df[genre] = df["genres"].apply(lambda x: 1 if genre in x.split() else 0)
    df = df.drop(columns=["genres"])
    selected_columns = [
        "month",
        "year",
        "mpaa",
        "budget",
        "runtime",
        "screens",
        "opening_week",
        "domestic_box_office",
        "user_vote",
        "ratings",
        "critic_vote",
        "meta_score",
        "country",
        "sequel",
    ] + list(unique_genres)
    df = df[selected_columns]
    genre_columns = list(unique_genres)
    genre_data = df[genre_columns]
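    # Exploratory factor analysis on the genre indicators: fit once to get the
    # eigenvalues, retain factors with eigenvalue > 1 (Kaiser criterion), then
    # refit with a varimax rotation and replace the genre columns with scores.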
    scaler = StandardScaler()
    genre_data_scaled = scaler.fit_transform(genre_data)
    fa = FactorAnalyzer()
    fa.fit(genre_data_scaled)
    eigenvalues, _ = fa.get_eigenvalues()
    n_factors = sum(eigenvalues > 1)
    print(f"Number of factors to retain: {n_factors}")
    fa = FactorAnalyzer(n_factors=n_factors, rotation="varimax")
    fa.fit(genre_data_scaled)
    factor_scores = fa.transform(genre_data_scaled)
    # Build the factor-score frame with df's index so the axis=1 concat aligns
    # row-for-row instead of introducing NaNs on a non-default index.
    factor_scores_df = pd.DataFrame(
        factor_scores,
        columns=[f"Factor{i + 1}" for i in range(n_factors)],
        index=df.index,
    )
    df = pd.concat([df, factor_scores_df], axis=1)
    df = df.drop(columns=genre_columns)
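    # Integer-encode the categorical columns so the tree-based models can use them.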
    mpaa_label_encoder = LabelEncoder()
    country_label_encoder = LabelEncoder()
    df["mpaa"] = mpaa_label_encoder.fit_transform(df["mpaa"])
    df["country"] = country_label_encoder.fit_transform(df["country"])
    df.to_csv("merge_data/preprocess_data.csv", index=False)
    X = df.drop("domestic_box_office", axis=1)
    y = df["domestic_box_office"]
    # log1p, not log: predictions are inverted with np.expm1 below, and log1p
    # also tolerates zero-revenue rows.
    y_log = np.log1p(y)
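    # Keep only features whose absolute Pearson correlation with the log target
    # clears the threshold.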
    correlation_threshold = 0.2
    selected_features = [
        column
        for column in X.columns
        if abs(pearsonr(X[column], y_log)[0]) > correlation_threshold
    ]
    X = X[selected_features]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_log, test_size=0.2, random_state=42
    )
    numeric_features = selected_features
    numeric_transformer = StandardScaler()
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
        ]
    )
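    # Wrap each candidate model in a scaling pipeline and tune it with 5-fold
    # grid search, scored on negative mean squared error.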
    def grid_search(model, param_grid):
        pipeline = Pipeline(
            steps=[
                ("preprocessor", preprocessor),
                ("regressor", model),
            ]
        )
        search = GridSearchCV(
            pipeline,
            param_grid,
            cv=5,
            n_jobs=-1,
            scoring="neg_mean_squared_error",
        )
        search.fit(X_train, y_train)
        return search
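    # Hyperparameter grids; the "regressor__" prefix routes each parameter to
    # the pipeline's regressor step.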
    param_grid_rf = {
        "regressor__n_estimators": [50, 100, 150],
        "regressor__max_depth": [None, 10, 20, 30],
        "regressor__min_samples_split": [2, 5, 10],
    }
    param_grid_gb = {
        "regressor__n_estimators": [50, 100, 150],
        "regressor__max_depth": [3, 5, 7],
        "regressor__learning_rate": [0.01, 0.1, 0.2],
    }
    param_grid_xgb = {
        "regressor__n_estimators": [50, 100, 150],
        "regressor__max_depth": [3, 5, 7],
        "regressor__learning_rate": [0.01, 0.1, 0.2],
        "regressor__subsample": [0.8, 0.9, 1.0],
    }
    param_grid_lgbm = {
        "regressor__n_estimators": [50, 100, 150],
        "regressor__max_depth": [-1, 10, 20],
        "regressor__learning_rate": [0.01, 0.1, 0.2],
        "regressor__num_leaves": [31, 50, 100],
    }
    param_grid_cb = {
        "regressor__iterations": [50, 100, 150],
        "regressor__depth": [4, 6, 10],
        "regressor__learning_rate": [0.01, 0.1, 0.2],
        "regressor__l2_leaf_reg": [1, 3, 5],
    }
    models = [
        (RandomForestRegressor(random_state=42), param_grid_rf),
        (GradientBoostingRegressor(random_state=42), param_grid_gb),
        (XGBRegressor(random_state=42), param_grid_xgb),
        (LGBMRegressor(random_state=42), param_grid_lgbm),
        (CatBoostRegressor(random_state=42, verbose=0), param_grid_cb),
    ]
    best_score = float("inf")
    best_model = None
    best_params = None
    list_file_name = [
        "model_efa/model_rf.pkl",
        "model_efa/model_gb.pkl",
        "model_efa/model_xgb.pkl",
        "model_efa/model_lgbm.pkl",
        "model_efa/model_cb.pkl",
    ]
    for (model, param_grid), file_name in zip(models, list_file_name):
        search = grid_search(model, param_grid)
        score = -search.best_score_
        with open(file_name, "wb") as f:
            pickle.dump(search.best_estimator_, f)
        # Track the model with the lowest cross-validated MSE instead of
        # keeping whichever model happened to be tuned last.
        if score < best_score:
            best_score = score
            best_model = search.best_estimator_
            best_params = search.best_params_
    print(f"Best model: {best_model}")
    print(f"Best parameters: {best_params}")
    print(f"Best score: {best_score}")
    y_pred_log = best_model.predict(X_test)
    y_pred = np.expm1(y_pred_log)
    y_test_actual = np.expm1(y_test)
    mse = mean_squared_error(y_test_actual, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test_actual, y_pred)
    r2 = r2_score(y_test_actual, y_pred)
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"R^2 Score: {r2}")
    scores = cross_val_score(
        best_model, X, y_log, cv=5, scoring="neg_mean_squared_error"
    )
    rmse_scores = np.sqrt(-scores)
    print(f"Cross-validated RMSE scores: {rmse_scores}")
    print(f"Mean RMSE: {rmse_scores.mean()}")
    print(f"Standard deviation of RMSE: {rmse_scores.std()}")
with open("gridsearch_result/result_with_opening.txt", "a") as f:
print(f"Best model: {best_model}", file=f)
print(f"Best parameters: {best_params}", file=f)
print(f"Best score: {best_score}", file=f)
print(f"Mean Squared Error (MSE): {mse}", file=f)
print(f"Root Mean Squared Error (RMSE): {rmse}", file=f)
print(f"Mean Absolute Error (MAE): {mae}", file=f)
print(f"R^2 Score: {r2}", file=f)
print(f"Cross-validated RMSE scores: {rmse_scores}", file=f)
print(f"Mean RMSE: {rmse_scores.mean()}", file=f)
print(f"Standard deviation of RMSE: {rmse_scores.std()}", file=f)
print("----------------------------------------------------------------\n\n",file=f)
with open("model_efa/mpaa_label_encoder.pkl", "wb") as f:
pickle.dump(mpaa_label_encoder, f)
with open("model_efa/country_label_encoder.pkl", "wb") as f:
pickle.dump(country_label_encoder, f)
with open("model_efa/scaler.pkl", "wb") as f:
pickle.dump(scaler, f)
with open("model_efa/factor_analyzer.pkl", "wb") as f:
pickle.dump(fa, f)
with open("model_efa/unique_genres.pkl", "wb") as f:
pickle.dump(unique_genres, f)
with open("model_efa/selected_features.pkl", "wb") as f:
pickle.dump(selected_features, f)
def train_without_opening_week(df):
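    """Same pipeline as train(), but without opening_week (or the user_vote and
    ratings columns), for predictions made before release."""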
    unique_genres = set(
        genre for sublist in df["genres"].str.split() for genre in sublist
    )
    for genre in unique_genres:
        df[genre] = df["genres"].apply(lambda x: 1 if genre in x.split() else 0)
    df = df.drop(columns=["genres"])
    selected_columns = [
        "month",
        "year",
        "mpaa",
        "budget",
        "runtime",
        "screens",
        "domestic_box_office",
        "critic_vote",
        "meta_score",
        "country",
        "sequel",
    ] + list(unique_genres)
    df = df[selected_columns]
    genre_columns = list(unique_genres)
    genre_data = df[genre_columns]
    scaler = StandardScaler()
    genre_data_scaled = scaler.fit_transform(genre_data)
    fa = FactorAnalyzer()
    fa.fit(genre_data_scaled)
    eigenvalues, _ = fa.get_eigenvalues()
    n_factors = sum(eigenvalues > 1)
    print(f"Number of factors to retain: {n_factors}")
    fa = FactorAnalyzer(n_factors=n_factors, rotation="varimax")
    fa.fit(genre_data_scaled)
    factor_scores = fa.transform(genre_data_scaled)
    factor_scores_df = pd.DataFrame(
        factor_scores,
        columns=[f"Factor{i + 1}" for i in range(n_factors)],
        index=df.index,  # align with df for the axis=1 concat
    )
    df = pd.concat([df, factor_scores_df], axis=1)
    df = df.drop(columns=genre_columns)
    mpaa_label_encoder = LabelEncoder()
    country_label_encoder = LabelEncoder()
    df["mpaa"] = mpaa_label_encoder.fit_transform(df["mpaa"])
    df["country"] = country_label_encoder.fit_transform(df["country"])
    df.to_csv("merge_data/preprocess_data_without_opening_week.csv", index=False)
    X = df.drop("domestic_box_office", axis=1)
    y = df["domestic_box_office"]
    y_log = np.log1p(y)  # matched by np.expm1 at evaluation time
    correlation_threshold = 0.2
    selected_features = [
        column
        for column in X.columns
        if abs(pearsonr(X[column], y_log)[0]) > correlation_threshold
    ]
    X = X[selected_features]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_log, test_size=0.2, random_state=42
    )
    numeric_features = selected_features
    numeric_transformer = StandardScaler()
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
        ]
    )
    def grid_search(model, param_grid):
        pipeline = Pipeline(
            steps=[
                ("preprocessor", preprocessor),
                ("regressor", model),
            ]
        )
        search = GridSearchCV(
            pipeline,
            param_grid,
            cv=5,
            n_jobs=-1,
            scoring="neg_mean_squared_error",
        )
        search.fit(X_train, y_train)
        return search
    param_grid_rf = {
        "regressor__n_estimators": [50, 100, 150],
        "regressor__max_depth": [None, 10, 20, 30],
        "regressor__min_samples_split": [2, 5, 10],
    }
    param_grid_gb = {
        "regressor__n_estimators": [50, 100, 150],
        "regressor__max_depth": [3, 5, 7],
        "regressor__learning_rate": [0.01, 0.1, 0.2],
    }
    param_grid_xgb = {
        "regressor__n_estimators": [50, 100, 150],
        "regressor__max_depth": [3, 5, 7],
        "regressor__learning_rate": [0.01, 0.1, 0.2],
        "regressor__subsample": [0.8, 0.9, 1.0],
    }
    param_grid_lgbm = {
        "regressor__n_estimators": [50, 100, 150],
        "regressor__max_depth": [-1, 10, 20],
        "regressor__learning_rate": [0.01, 0.1, 0.2],
        "regressor__num_leaves": [31, 50, 100],
    }
    param_grid_cb = {
        "regressor__iterations": [50, 100, 150],
        "regressor__depth": [4, 6, 10],
        "regressor__learning_rate": [0.01, 0.1, 0.2],
        "regressor__l2_leaf_reg": [1, 3, 5],
    }
    models = [
        (RandomForestRegressor(random_state=42), param_grid_rf),
        (GradientBoostingRegressor(random_state=42), param_grid_gb),
        (XGBRegressor(random_state=42), param_grid_xgb),
        (LGBMRegressor(random_state=42), param_grid_lgbm),
        (CatBoostRegressor(random_state=42, verbose=0), param_grid_cb),
    ]
    best_score = float("inf")
    best_model = None
    best_params = None
    list_file_name = [
        "model_efa/model_rf_without_opening_week.pkl",
        "model_efa/model_gb_without_opening_week.pkl",
        "model_efa/model_xgb_without_opening_week.pkl",
        "model_efa/model_lgbm_without_opening_week.pkl",
        "model_efa/model_cb_without_opening_week.pkl",
    ]
    for (model, param_grid), file_name in zip(models, list_file_name):
        search = grid_search(model, param_grid)
        score = -search.best_score_
        with open(file_name, "wb") as f:
            pickle.dump(search.best_estimator_, f)
        # Keep the model with the lowest cross-validated MSE.
        if score < best_score:
            best_score = score
            best_model = search.best_estimator_
            best_params = search.best_params_
    print(f"Best model: {best_model}")
    print(f"Best parameters: {best_params}")
    print(f"Best score: {best_score}")
    y_pred_log = best_model.predict(X_test)
    y_pred = np.expm1(y_pred_log)
    y_test_actual = np.expm1(y_test)
    mse = mean_squared_error(y_test_actual, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test_actual, y_pred)
    r2 = r2_score(y_test_actual, y_pred)
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"R^2 Score: {r2}")
    scores = cross_val_score(
        best_model, X, y_log, cv=5, scoring="neg_mean_squared_error"
    )
    rmse_scores = np.sqrt(-scores)
    print(f"Cross-validated RMSE scores: {rmse_scores}")
    print(f"Mean RMSE: {rmse_scores.mean()}")
    print(f"Standard deviation of RMSE: {rmse_scores.std()}")
with open("gridsearch_result/result_without_opening.txt", "a") as f:
print(f"Best model: {best_model}", file=f)
print(f"Best parameters: {best_params}", file=f)
print(f"Best score: {best_score}", file=f)
print(f"Mean Squared Error (MSE): {mse}", file=f)
print(f"Root Mean Squared Error (RMSE): {rmse}", file=f)
print(f"Mean Absolute Error (MAE): {mae}", file=f)
print(f"R^2 Score: {r2}", file=f)
print(f"Cross-validated RMSE scores: {rmse_scores}", file=f)
print(f"Mean RMSE: {rmse_scores.mean()}", file=f)
print(f"Standard deviation of RMSE: {rmse_scores.std()}", file=f)
print(
"----------------------------------------------------------------\n\n",file=f
)
with open("model_efa/mpaa_label_encoder.pkl", "wb") as f:
pickle.dump(mpaa_label_encoder, f)
with open("model_efa/country_label_encoder.pkl", "wb") as f:
pickle.dump(country_label_encoder, f)
with open("model_efa/scaler.pkl", "wb") as f:
pickle.dump(scaler, f)
with open("model_efa/factor_analyzer.pkl", "wb") as f:
pickle.dump(fa, f)
with open("model_efa/unique_genres.pkl", "wb") as f:
pickle.dump(unique_genres, f)
with open("model_efa/selected_features_without_opening_week.pkl", "wb") as f:
pickle.dump(selected_features, f)
if __name__ == "__main__":
    df = pd.read_csv("merge_data/final_merged.csv")
    # with open("gridsearch_result/result_with_opening.txt", "w") as f:
    #     pass
    # with open("gridsearch_result/result_without_opening.txt", "w") as f:
    #     pass
    train(df.copy())
    train_without_opening_week(df.copy())