import pandas as pd import numpy as np import pickle import os import warnings warnings.filterwarnings("ignore", message="X does not have valid feature names") # Dope,R,7000000.0,103.0,2002.0,6100010.0,17506470.0,7.2,89000.0,United States,Adventure Comedy Crime Drama,200.0,85.04,0,6.0,2015.0 # Dora and the Lost City of Gold,PG,49000000.0,102.0,3735.0,17431588.0,60477943.0,6.1,35000.0,"Australia, United States",Action Adventure Comedy Family Fantasy Mystery,179.0,82.17,0,8.0,2019.0 # Double Take,PG-13,24000000.0,88.0,1631.0,11736236.0,29831583.0,5.4,88000.0,United States,Action Comedy Crime Thriller,100.0,14.86,0,1.0,2001.0 # Doubt,PG-13,20000000.0,104.0,1287.0,507226.0,33446470.0,7.5,136000.0,United States,Drama Mystery,257.0,77.46,0,12.0,2008.0 # Down to Earth,PG-13,49000000.0,87.0,2521.0,20027309.0,64186502.0,5.4,25000.0,United States,Comedy Fantasy,124.0,22.71,0,2.0,2001.0 # Downsizing,R,68000000.0,135.0,2668.0,4954287.0,24449754.0,5.8,125000.0,United States,Drama Fantasy Sci-Fi,349.0,49.34,0,12.0,2017.0 # Downton Abbey,PG,13000000.0,122.0,3548.0,31033665.0,96853865.0,7.4,63000.0,United Kingdom,Drama Romance,303.0,81.23,0,9.0,2019.0 # Dr. Dolittle 2,PG,72000000.0,87.0,3053.0,25037039.0,112952899.0,4.7,47000.0,United States,Comedy Family Fantasy,135.0,42.66,0,6.0,2001.0 # Dracula 2000,R,54000000.0,100.0,2204.0,8636567.0,33022767.0,4.9,37000.0,"Canada, United States",Action Fantasy Horror Thriller,14.0,26.0,0,12.0,2000.0 # Dracula Untold,PG-13,70000000.0,92.0,2900.0,23514615.0,56280355.0,6.2,208000.0,United States,Action Drama Fantasy Horror,168.0,27.68,0,10.0,2014.0 # Draft Day,PG-13,25000000.0,110.0,2781.0,9783603.0,28842237.0,6.8,67000.0,United States,Drama Sport,196.0,58.99,0,4.0,2014.0 # Drag Me to Hell,PG-13,30000000.0,99.0,2510.0,15825480.0,42100625.0,6.6,218000.0,United States,Horror,302.0,91.05,0,5.0,2009.0 # Dragon Wars: D-War,PG-13,32000000.0,107.0,2277.0,5376000.0,10977721.0,3.5,25000.0,"Republic of Korea, United States",Action Drama Fantasy Thriller,47.0,29.77,0,9.0,2007.0 # Dragonball Evolution,PG,30000000.0,85.0,2181.0,4756488.0,9362785.0,2.5,79000.0,United States,Action Adventure Fantasy Sci-Fi Thriller,10.0,45.0,0,4.0,2009.0 # Dragonfly,PG-13,60000000.0,104.0,2507.0,10216025.0,30323400.0,6.1,40000.0,"Germany, United States",Drama Fantasy Mystery Romance Thriller,158.0,10.76,0,2.0,2002.0 def predict_with_feature_selection(model_file_name, month, year, mpaa, budget, runtime, screens, opening_week, user_vote, ratings, critic_vote, meta_score, sequel, genres, country): movie = {} movie["month"] = float(month) movie["year"] = float(year) movie["mpaa"] = mpaa movie["budget"] = float(budget) movie["runtime"] = float(runtime) movie["screens"] = float(screens) movie["opening_week"] = float(opening_week) movie["user_vote"] = float(user_vote) movie["ratings"] = float(ratings) movie["critic_vote"] = float(critic_vote) movie["meta_score"] = float(meta_score) movie["sequel"] = float(sequel) movie["genres"] = genres movie["country"] = country with open(model_file_name, "rb") as f: model = pickle.load(f) with open("../model_efa/mpaa_label_encoder.pkl", "rb") as f: mpaa_label_encoder = pickle.load(f) with open("../model_efa/country_label_encoder.pkl", "rb") as f: country_label_encoder = pickle.load(f) with open("../model_efa/scaler.pkl", "rb") as f: scaler = pickle.load(f) with open("../model_efa/factor_analyzer.pkl", "rb") as f: fa = pickle.load(f) with open("../model_efa/unique_genres.pkl", "rb") as f: unique_genres = pickle.load(f) with open("../model_efa/selected_features.pkl", "rb") as f: selected_features = pickle.load(f) movie["mpaa"] = mpaa_label_encoder.transform([movie["mpaa"]])[0] movie["country"] = country_label_encoder.transform([movie["country"]])[0] new_movie_genres = np.array( [ 1 if genre in movie.get("genres", "").split() else 0 for genre in unique_genres ] ).reshape(1, -1) new_movie_genres_scaled = scaler.transform(new_movie_genres) new_movie_factors = fa.transform(new_movie_genres_scaled) movie.update( { f"Factor{i+1}": new_movie_factors[0, i] for i in range(new_movie_factors.shape[1]) } ) movie_df = pd.DataFrame([movie]) movie_df = movie_df[selected_features] prediction_log = model.predict(movie_df) prediction = np.expm1(prediction_log) return prediction[0] def predict_with_feature_selection_without_opening_week(model_file_name, month, year, mpaa, budget, runtime, screens, critic_vote, meta_score, sequel, genres, country): movie = {} movie["month"] = float(month) movie["year"] = float(year) movie["mpaa"] = mpaa movie["budget"] = float(budget) movie["runtime"] = float(runtime) movie["screens"] = float(screens) movie["critic_vote"] = float(critic_vote) movie["meta_score"] = float(meta_score) movie["sequel"] = float(sequel) movie["genres"] = genres movie["country"] = country with open(model_file_name, "rb") as f: model = pickle.load(f) with open("../model_efa/mpaa_label_encoder.pkl", "rb") as f: mpaa_label_encoder = pickle.load(f) with open("../model_efa/country_label_encoder.pkl", "rb") as f: country_label_encoder = pickle.load(f) with open("../model_efa/scaler.pkl", "rb") as f: scaler = pickle.load(f) with open("../model_efa/factor_analyzer.pkl", "rb") as f: fa = pickle.load(f) with open("../model_efa/unique_genres.pkl", "rb") as f: unique_genres = pickle.load(f) with open("../model_efa/selected_features_without_opening_week.pkl", "rb") as f: selected_features = pickle.load(f) movie["mpaa"] = mpaa_label_encoder.transform([movie["mpaa"]])[0] movie["country"] = country_label_encoder.transform([movie["country"]])[0] new_movie_genres = np.array( [ 1 if genre in movie.get("genres", "").split() else 0 for genre in unique_genres ] ).reshape(1, -1) new_movie_genres_scaled = scaler.transform(new_movie_genres) new_movie_factors = fa.transform(new_movie_genres_scaled) movie.update( { f"Factor{i+1}": new_movie_factors[0, i] for i in range(new_movie_factors.shape[1]) } ) movie_df = pd.DataFrame([movie]) movie_df = movie_df[selected_features] prediction_log = model.predict(movie_df) prediction = np.expm1(prediction_log) return prediction[0] if __name__ == '__main__': list_file_name = ["../model_efa/model_rf.pkl", "../model_efa/model_gb.pkl", "../model_efa/model_xgb.pkl", "../model_efa/model_lgbm.pkl", "../model_efa/model_cb.pkl"] list_file_name_without_opening_week = ["../model_efa/model_rf_without_opening_week.pkl", "../model_efa/model_gb_without_opening_week.pkl", "../model_efa/model_xgb_without_opening_week.pkl", "../model_efa/model_lgbm_without_opening_week.pkl", "../model_efa/model_cb_without_opening_week.pkl"] for file_name in list_file_name: print(predict_with_feature_selection(file_name, 1, 2021, "PG-13", 15000000, 103, 3427, 24727437, 72082999, 7.2, 355000, 88.32, 0, "Drama Horror Mystery Sci-Fi Thriller", "United States")) for file_name in list_file_name_without_opening_week: print(predict_with_feature_selection_without_opening_week(file_name, 1, 2021, "PG-13", 15000000, 103, 3427, 355000, 88.32, 0, "Drama Horror Mystery Sci-Fi Thriller", "United States"))