Project_MLops / src /app /predict_with_efa.py
QuanTH02's picture
feat: dir structure
e964b12
import pandas as pd
import numpy as np
import pickle
import os
import warnings
# Silence any warning whose message starts with the text below, so repeated
# predictions do not flood the console.
# NOTE(review): presumably this is the scikit-learn notice emitted when a fitted
# transformer/estimator receives input without feature names — confirm.
warnings.filterwarnings("ignore", message="X does not have valid feature names")
# Dope,R,7000000.0,103.0,2002.0,6100010.0,17506470.0,7.2,89000.0,United States,Adventure Comedy Crime Drama,200.0,85.04,0,6.0,2015.0
# Dora and the Lost City of Gold,PG,49000000.0,102.0,3735.0,17431588.0,60477943.0,6.1,35000.0,"Australia, United States",Action Adventure Comedy Family Fantasy Mystery,179.0,82.17,0,8.0,2019.0
# Double Take,PG-13,24000000.0,88.0,1631.0,11736236.0,29831583.0,5.4,88000.0,United States,Action Comedy Crime Thriller,100.0,14.86,0,1.0,2001.0
# Doubt,PG-13,20000000.0,104.0,1287.0,507226.0,33446470.0,7.5,136000.0,United States,Drama Mystery,257.0,77.46,0,12.0,2008.0
# Down to Earth,PG-13,49000000.0,87.0,2521.0,20027309.0,64186502.0,5.4,25000.0,United States,Comedy Fantasy,124.0,22.71,0,2.0,2001.0
# Downsizing,R,68000000.0,135.0,2668.0,4954287.0,24449754.0,5.8,125000.0,United States,Drama Fantasy Sci-Fi,349.0,49.34,0,12.0,2017.0
# Downton Abbey,PG,13000000.0,122.0,3548.0,31033665.0,96853865.0,7.4,63000.0,United Kingdom,Drama Romance,303.0,81.23,0,9.0,2019.0
# Dr. Dolittle 2,PG,72000000.0,87.0,3053.0,25037039.0,112952899.0,4.7,47000.0,United States,Comedy Family Fantasy,135.0,42.66,0,6.0,2001.0
# Dracula 2000,R,54000000.0,100.0,2204.0,8636567.0,33022767.0,4.9,37000.0,"Canada, United States",Action Fantasy Horror Thriller,14.0,26.0,0,12.0,2000.0
# Dracula Untold,PG-13,70000000.0,92.0,2900.0,23514615.0,56280355.0,6.2,208000.0,United States,Action Drama Fantasy Horror,168.0,27.68,0,10.0,2014.0
# Draft Day,PG-13,25000000.0,110.0,2781.0,9783603.0,28842237.0,6.8,67000.0,United States,Drama Sport,196.0,58.99,0,4.0,2014.0
# Drag Me to Hell,PG-13,30000000.0,99.0,2510.0,15825480.0,42100625.0,6.6,218000.0,United States,Horror,302.0,91.05,0,5.0,2009.0
# Dragon Wars: D-War,PG-13,32000000.0,107.0,2277.0,5376000.0,10977721.0,3.5,25000.0,"Republic of Korea, United States",Action Drama Fantasy Thriller,47.0,29.77,0,9.0,2007.0
# Dragonball Evolution,PG,30000000.0,85.0,2181.0,4756488.0,9362785.0,2.5,79000.0,United States,Action Adventure Fantasy Sci-Fi Thriller,10.0,45.0,0,4.0,2009.0
# Dragonfly,PG-13,60000000.0,104.0,2507.0,10216025.0,30323400.0,6.1,40000.0,"Germany, United States",Drama Fantasy Mystery Romance Thriller,158.0,10.76,0,2.0,2002.0
def _load_pickle(path):
    """Return the object deserialized from the pickle file at *path*."""
    # NOTE(security): unpickling can execute arbitrary code. Every path given
    # to this helper must point at a trusted, locally produced artifact.
    with open(path, "rb") as f:
        return pickle.load(f)


def _predict_from_movie(movie, model_file_name, selected_features_file):
    """Run the shared preprocessing + prediction pipeline for one movie.

    Args:
        movie: Dict of raw movie fields. It must contain "mpaa", "genres" and
            "country" plus every non-factor column named in the selected
            feature list. The dict is modified in place: "mpaa" and "country"
            are replaced by their encoded values and "Factor1".."FactorN"
            entries are added.
        model_file_name: Path of the pickled model to use.
        selected_features_file: Path of the pickled list of column names the
            model is fed with.

    Returns:
        The first element of ``np.expm1(model.predict(...))``.
    """
    # Artifacts are loaded in the same order as before so that a missing file
    # is reported for the same path as in the original implementation. The
    # "../model_efa" paths are relative to the current working directory.
    model = _load_pickle(model_file_name)
    mpaa_label_encoder = _load_pickle("../model_efa/mpaa_label_encoder.pkl")
    country_label_encoder = _load_pickle("../model_efa/country_label_encoder.pkl")
    scaler = _load_pickle("../model_efa/scaler.pkl")
    fa = _load_pickle("../model_efa/factor_analyzer.pkl")
    unique_genres = _load_pickle("../model_efa/unique_genres.pkl")
    selected_features = _load_pickle(selected_features_file)

    # The encoders work on sequences, hence the one-element list and the [0].
    movie["mpaa"] = mpaa_label_encoder.transform([movie["mpaa"]])[0]
    movie["country"] = country_label_encoder.transform([movie["country"]])[0]

    # Tokenize the whitespace-separated genre string once (it used to be
    # re-split for every known genre). A missing value (None) or an empty
    # string means "no genres" instead of raising AttributeError.
    movie_genres = set((movie.get("genres") or "").split())

    # One 0/1 indicator per known genre, shaped as a single row.
    genre_flags = np.array(
        [1 if genre in movie_genres else 0 for genre in unique_genres]
    ).reshape(1, -1)

    # Scale the indicators, then project them with the pickled factor object.
    genre_factors = fa.transform(scaler.transform(genre_flags))
    movie.update(
        {
            f"Factor{i+1}": genre_factors[0, i]
            for i in range(genre_factors.shape[1])
        }
    )

    # Keep only the columns the model expects, in the stored order.
    movie_df = pd.DataFrame([movie])[selected_features]

    # The model output is mapped back with expm1.
    # NOTE(review): presumably the models were trained on log1p(target) — confirm.
    prediction_log = model.predict(movie_df)
    return np.expm1(prediction_log)[0]


def predict_with_feature_selection(
    model_file_name,
    month,
    year,
    mpaa,
    budget,
    runtime,
    screens,
    opening_week,
    user_vote,
    ratings,
    critic_vote,
    meta_score,
    sequel,
    genres,
    country,
):
    """Predict a value for one movie using the full selected feature set.

    Args:
        model_file_name: Path of the pickled model to load.
        month, year, budget, runtime, screens, opening_week, user_vote,
        ratings, critic_vote, meta_score, sequel: Numeric inputs; each is
            converted with ``float()``.
        mpaa: Rating label, passed to the pickled MPAA label encoder.
        genres: Whitespace-separated genre names; ``None`` or ``""`` is
            treated as no genres.
        country: Country label, passed to the pickled country label encoder.

    Returns:
        ``np.expm1`` of the model's first prediction.

    Raises:
        ValueError, TypeError: If a numeric argument cannot be converted by
            ``float()``.
        OSError: If the model or one of the ``../model_efa`` artifacts cannot
            be opened.
    """
    movie = {
        "month": float(month),
        "year": float(year),
        "mpaa": mpaa,
        "budget": float(budget),
        "runtime": float(runtime),
        "screens": float(screens),
        "opening_week": float(opening_week),
        "user_vote": float(user_vote),
        "ratings": float(ratings),
        "critic_vote": float(critic_vote),
        "meta_score": float(meta_score),
        "sequel": float(sequel),
        "genres": genres,
        "country": country,
    }
    return _predict_from_movie(
        movie, model_file_name, "../model_efa/selected_features.pkl"
    )


def predict_with_feature_selection_without_opening_week(
    model_file_name,
    month,
    year,
    mpaa,
    budget,
    runtime,
    screens,
    critic_vote,
    meta_score,
    sequel,
    genres,
    country,
):
    """Predict a value for one movie without opening-week inputs.

    Same pipeline as ``predict_with_feature_selection`` but the movie record
    carries no ``opening_week``, ``user_vote`` or ``ratings`` fields, and the
    feature list is read from ``selected_features_without_opening_week.pkl``.

    Args:
        model_file_name: Path of the pickled model to load.
        month, year, budget, runtime, screens, critic_vote, meta_score,
        sequel: Numeric inputs; each is converted with ``float()``.
        mpaa: Rating label, passed to the pickled MPAA label encoder.
        genres: Whitespace-separated genre names; ``None`` or ``""`` is
            treated as no genres.
        country: Country label, passed to the pickled country label encoder.

    Returns:
        ``np.expm1`` of the model's first prediction.

    Raises:
        ValueError, TypeError: If a numeric argument cannot be converted by
            ``float()``.
        OSError: If the model or one of the ``../model_efa`` artifacts cannot
            be opened.
    """
    movie = {
        "month": float(month),
        "year": float(year),
        "mpaa": mpaa,
        "budget": float(budget),
        "runtime": float(runtime),
        "screens": float(screens),
        "critic_vote": float(critic_vote),
        "meta_score": float(meta_score),
        "sequel": float(sequel),
        "genres": genres,
        "country": country,
    }
    return _predict_from_movie(
        movie,
        model_file_name,
        "../model_efa/selected_features_without_opening_week.pkl",
    )
if __name__ == '__main__':
    # Demo run: print one prediction per pickled model, first for the models
    # that use opening-week inputs, then for the ones trained without them.
    list_file_name = [
        "../model_efa/model_rf.pkl",
        "../model_efa/model_gb.pkl",
        "../model_efa/model_xgb.pkl",
        "../model_efa/model_lgbm.pkl",
        "../model_efa/model_cb.pkl",
    ]
    list_file_name_without_opening_week = [
        "../model_efa/model_rf_without_opening_week.pkl",
        "../model_efa/model_gb_without_opening_week.pkl",
        "../model_efa/model_xgb_without_opening_week.pkl",
        "../model_efa/model_lgbm_without_opening_week.pkl",
        "../model_efa/model_cb_without_opening_week.pkl",
    ]

    # Inputs shared by both predictors, spelled as keywords so the binding of
    # each value to its parameter is visible.
    # NOTE(review): these are exactly the values the positional calls passed.
    # user_vote=72082999 and critic_vote=355000 look large next to the sample
    # rows commented at the top of the file — TODO confirm the argument order.
    shared_inputs = dict(
        month=1,
        year=2021,
        mpaa="PG-13",
        budget=15000000,
        runtime=103,
        screens=3427,
        critic_vote=355000,
        meta_score=88.32,
        sequel=0,
        genres="Drama Horror Mystery Sci-Fi Thriller",
        country="United States",
    )

    for file_name in list_file_name:
        estimate = predict_with_feature_selection(
            file_name,
            opening_week=24727437,
            user_vote=72082999,
            ratings=7.2,
            **shared_inputs,
        )
        print(estimate)

    for file_name in list_file_name_without_opening_week:
        estimate = predict_with_feature_selection_without_opening_week(
            file_name, **shared_inputs
        )
        print(estimate)