Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| from sklearn.preprocessing import LabelEncoder, StandardScaler | |
| from sklearn.compose import ColumnTransformer | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor | |
| from xgboost import XGBRegressor | |
| from lightgbm import LGBMRegressor | |
| from catboost import CatBoostRegressor | |
| from scipy.stats import pearsonr | |
| from sklearn.model_selection import ( | |
| train_test_split, | |
| GridSearchCV, | |
| cross_val_score, | |
| ) | |
| from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error | |
| from factor_analyzer import FactorAnalyzer | |
| import pickle | |
# Directory that receives every pickled model and preprocessing artifact.
_MODEL_DIR = "model_efa"

# Non-genre columns kept by each training variant, in the order they are fed
# to the feature selection step.
_BASE_COLUMNS_WITH_OPENING = [
    "month",
    "year",
    "mpaa",
    "budget",
    "runtime",
    "screens",
    "opening_week",
    "domestic_box_office",
    "user_vote",
    "ratings",
    "critic_vote",
    "meta_score",
    "country",
    "sequel",
]
_BASE_COLUMNS_WITHOUT_OPENING = [
    "month",
    "year",
    "mpaa",
    "budget",
    "runtime",
    "screens",
    "domestic_box_office",
    "critic_vote",
    "meta_score",
    "country",
    "sequel",
]


def _encode_genres(df):
    """Add one 0/1 column per genre token and drop the raw "genres" column.

    The "genres" column is split on whitespace. Returns the new frame and the
    genre column names as a sorted list, so the column order is identical on
    every run (iterating a set of strings is not stable across processes).
    Note that the indicator columns are added to the frame passed in.
    """
    genre_tokens = df["genres"].str.split()
    genre_columns = sorted({genre for tokens in genre_tokens for genre in tokens})
    for genre in genre_columns:
        df[genre] = genre_tokens.apply(
            lambda tokens, genre=genre: int(genre in tokens)
        )
    return df.drop(columns=["genres"]), genre_columns


def _genre_factor_scores(df, genre_columns):
    """Reduce the genre indicators to factor scores.

    Returns the fitted StandardScaler, the fitted varimax FactorAnalyzer and
    a DataFrame of scores (columns "Factor1".."FactorN") indexed like ``df``.
    The number of factors is the count of eigenvalues greater than 1.
    """
    scaler = StandardScaler()
    genre_data_scaled = scaler.fit_transform(df[genre_columns])

    # First fit is only used to read the eigenvalues.
    probe = FactorAnalyzer()
    probe.fit(genre_data_scaled)
    eigenvalues, _ = probe.get_eigenvalues()
    n_factors = int((eigenvalues > 1).sum())
    print(f"Number of factors to retain: {n_factors}")

    fa = FactorAnalyzer(n_factors=n_factors, rotation="varimax")
    fa.fit(genre_data_scaled)
    factor_scores_df = pd.DataFrame(
        fa.transform(genre_data_scaled),
        columns=[f"Factor{i+1}" for i in range(n_factors)],
        # Keep the caller's index so the later axis=1 concat lines rows up
        # even when df does not carry a default RangeIndex.
        index=df.index,
    )
    return scaler, fa, factor_scores_df


def _model_candidates():
    """Return (file tag, estimator, parameter grid) for each model to tune."""
    return [
        (
            "rf",
            RandomForestRegressor(random_state=42),
            {
                "regressor__n_estimators": [50, 100, 150],
                "regressor__max_depth": [None, 10, 20, 30],
                "regressor__min_samples_split": [2, 5, 10],
            },
        ),
        (
            "gb",
            GradientBoostingRegressor(random_state=42),
            {
                "regressor__n_estimators": [50, 100, 150],
                "regressor__max_depth": [3, 5, 7],
                "regressor__learning_rate": [0.01, 0.1, 0.2],
            },
        ),
        (
            "xgb",
            XGBRegressor(random_state=42),
            {
                "regressor__n_estimators": [50, 100, 150],
                "regressor__max_depth": [3, 5, 7],
                "regressor__learning_rate": [0.01, 0.1, 0.2],
                "regressor__subsample": [0.8, 0.9, 1.0],
            },
        ),
        (
            "lgbm",
            LGBMRegressor(random_state=42),
            {
                "regressor__n_estimators": [50, 100, 150],
                "regressor__max_depth": [-1, 10, 20],
                "regressor__learning_rate": [0.01, 0.1, 0.2],
                "regressor__num_leaves": [31, 50, 100],
            },
        ),
        (
            "cb",
            CatBoostRegressor(random_state=42, verbose=0),
            {
                "regressor__iterations": [50, 100, 150],
                "regressor__depth": [4, 6, 10],
                "regressor__learning_rate": [0.01, 0.1, 0.2],
                "regressor__l2_leaf_reg": [1, 3, 5],
            },
        ),
    ]


def _run_training(
    df, base_columns, preprocessed_csv, model_suffix, result_path, features_path
):
    """Preprocess ``df``, grid-search every candidate model and save artifacts.

    Args:
        df: raw merged data; must contain "genres" and every name in
            ``base_columns``. Genre indicator columns are added to it.
        base_columns: non-genre columns to keep (including the target
            "domestic_box_office").
        preprocessed_csv: where the preprocessed frame is written.
        model_suffix: appended to each "model_<tag>" pickle file name.
        result_path: text file the per-model report is appended to.
        features_path: pickle path for the list of selected feature names.
    """
    import os

    # Create output directories first: a missing directory would otherwise
    # only surface after the first (expensive) grid search has finished.
    for path in (preprocessed_csv, result_path, features_path, f"{_MODEL_DIR}/"):
        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)

    # --- preprocessing -----------------------------------------------------
    df, genre_columns = _encode_genres(df)
    df = df[base_columns + genre_columns]
    scaler, fa, factor_scores_df = _genre_factor_scores(df, genre_columns)
    df = pd.concat([df.drop(columns=genre_columns), factor_scores_df], axis=1)

    mpaa_label_encoder = LabelEncoder()
    country_label_encoder = LabelEncoder()
    df["mpaa"] = mpaa_label_encoder.fit_transform(df["mpaa"])
    df["country"] = country_label_encoder.fit_transform(df["country"])
    df.to_csv(preprocessed_csv, index=False)

    # --- feature selection and split -----------------------------------------
    X = df.drop("domestic_box_office", axis=1)
    y_log = np.log(df["domestic_box_office"])

    # NOTE(review): the correlation filter (like the genre scaler and factor
    # analysis above) is fitted on the full data set before the train/test
    # split, so the held-out metrics are somewhat optimistic.
    correlation_threshold = 0.2
    selected_features = [
        column
        for column in X.columns
        if abs(pearsonr(X[column], y_log)[0]) > correlation_threshold
    ]
    X = X[selected_features]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_log, test_size=0.2, random_state=42
    )

    preprocessor = ColumnTransformer(
        transformers=[("num", StandardScaler(), selected_features)]
    )

    # --- per-model tuning, evaluation and report -----------------------------
    # NOTE(review): the original file's indentation was lost; evaluation and
    # report are treated as part of the per-model loop (the report file is
    # opened in append mode and ends with a separator).
    for tag, estimator, param_grid in _model_candidates():
        pipeline = Pipeline(
            steps=[("preprocessor", preprocessor), ("regressor", estimator)]
        )
        search = GridSearchCV(
            pipeline,
            param_grid,
            cv=5,
            n_jobs=-1,
            scoring="neg_mean_squared_error",
        )
        search.fit(X_train, y_train)
        best_model = search.best_estimator_

        with open(f"{_MODEL_DIR}/model_{tag}{model_suffix}.pkl", "wb") as f:
            pickle.dump(best_model, f)

        # The target was transformed with np.log, so np.exp (not np.expm1)
        # is the inverse that brings predictions back to the original scale.
        y_pred = np.exp(best_model.predict(X_test))
        y_test_actual = np.exp(y_test)
        mse = mean_squared_error(y_test_actual, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test_actual, y_pred)
        r2 = r2_score(y_test_actual, y_pred)

        # Cross-validated RMSE is on the log scale (scored against y_log).
        scores = cross_val_score(
            best_model, X, y_log, cv=5, scoring="neg_mean_squared_error"
        )
        rmse_scores = np.sqrt(-scores)

        report = [
            f"Best model: {best_model}",
            f"Best parameters: {search.best_params_}",
            f"Best score: {-search.best_score_}",
            f"Mean Squared Error (MSE): {mse}",
            f"Root Mean Squared Error (RMSE): {rmse}",
            f"Mean Absolute Error (MAE): {mae}",
            f"R^2 Score: {r2}",
            f"Cross-validated RMSE scores: {rmse_scores}",
            f"Mean RMSE: {rmse_scores.mean()}",
            f"Standard deviation of RMSE: {rmse_scores.std()}",
        ]
        for line in report:
            print(line)
        with open(result_path, "a") as f:
            for line in report:
                print(line, file=f)
            print(
                "----------------------------------------------------------------\n\n",
                file=f,
            )

    # --- preprocessing artifacts needed at inference time --------------------
    artifacts = {
        f"{_MODEL_DIR}/mpaa_label_encoder.pkl": mpaa_label_encoder,
        f"{_MODEL_DIR}/country_label_encoder.pkl": country_label_encoder,
        f"{_MODEL_DIR}/scaler.pkl": scaler,
        f"{_MODEL_DIR}/factor_analyzer.pkl": fa,
        # Stored as a sorted list (previously a set) so consumers rebuild the
        # genre columns in exactly the order the scaler and factor analyzer
        # were fitted with.
        f"{_MODEL_DIR}/unique_genres.pkl": genre_columns,
        features_path: selected_features,
    }
    for path, obj in artifacts.items():
        with open(path, "wb") as f:
            pickle.dump(obj, f)


def train(df):
    """Train and save all models using every feature, including opening week.

    Uses the "opening_week", "user_vote" and "ratings" columns in addition to
    the columns shared with ``train_without_opening_week``. Writes
    merge_data/preprocess_data.csv, model_efa/model_<tag>.pkl for each model,
    the shared preprocessing pickles in model_efa/, and appends a report to
    gridsearch_result/result_with_opening.txt.

    Args:
        df: raw merged DataFrame; genre indicator columns are added to it,
            so pass a copy if the caller needs the original untouched.
    """
    _run_training(
        df,
        _BASE_COLUMNS_WITH_OPENING,
        preprocessed_csv="merge_data/preprocess_data.csv",
        model_suffix="",
        result_path="gridsearch_result/result_with_opening.txt",
        features_path="model_efa/selected_features.pkl",
    )


def train_without_opening_week(df):
    """Train and save all models without opening week, user votes or ratings.

    Same pipeline as ``train`` but without the "opening_week", "user_vote"
    and "ratings" columns. Writes
    merge_data/preprocess_data_without_opening_week.csv,
    model_efa/model_<tag>_without_opening_week.pkl for each model, the shared
    preprocessing pickles in model_efa/ (same file names as ``train``), and
    appends a report to gridsearch_result/result_without_opening.txt.

    Args:
        df: raw merged DataFrame; genre indicator columns are added to it,
            so pass a copy if the caller needs the original untouched.
    """
    _run_training(
        df,
        _BASE_COLUMNS_WITHOUT_OPENING,
        preprocessed_csv="merge_data/preprocess_data_without_opening_week.csv",
        model_suffix="_without_opening_week",
        result_path="gridsearch_result/result_without_opening.txt",
        features_path="model_efa/selected_features_without_opening_week.pkl",
    )
if __name__ == "__main__":
    # Script entry point: load the merged data set once and run both training
    # variants, each on its own copy because the trainers add columns to the
    # frame they receive.
    #
    # The report files gridsearch_result/result_with_opening.txt and
    # gridsearch_result/result_without_opening.txt are opened in append mode
    # by the trainers; truncate them by hand before a run to start fresh.
    df = pd.read_csv("merge_data/final_merged.csv")
    for run_variant in (train, train_without_opening_week):
        run_variant(df.copy())