File size: 7,605 Bytes
e964b12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import pandas as pd
import numpy as np
import pickle
import os
import warnings
warnings.filterwarnings("ignore", message="X does not have valid feature names")

# Dope,R,7000000.0,103.0,2002.0,6100010.0,17506470.0,7.2,89000.0,United States,Adventure Comedy Crime Drama,200.0,85.04,0,6.0,2015.0
# Dora and the Lost City of Gold,PG,49000000.0,102.0,3735.0,17431588.0,60477943.0,6.1,35000.0,"Australia, United States",Action Adventure Comedy Family Fantasy Mystery,179.0,82.17,0,8.0,2019.0
# Double Take,PG-13,24000000.0,88.0,1631.0,11736236.0,29831583.0,5.4,88000.0,United States,Action Comedy Crime Thriller,100.0,14.86,0,1.0,2001.0
# Doubt,PG-13,20000000.0,104.0,1287.0,507226.0,33446470.0,7.5,136000.0,United States,Drama Mystery,257.0,77.46,0,12.0,2008.0
# Down to Earth,PG-13,49000000.0,87.0,2521.0,20027309.0,64186502.0,5.4,25000.0,United States,Comedy Fantasy,124.0,22.71,0,2.0,2001.0
# Downsizing,R,68000000.0,135.0,2668.0,4954287.0,24449754.0,5.8,125000.0,United States,Drama Fantasy Sci-Fi,349.0,49.34,0,12.0,2017.0
# Downton Abbey,PG,13000000.0,122.0,3548.0,31033665.0,96853865.0,7.4,63000.0,United Kingdom,Drama Romance,303.0,81.23,0,9.0,2019.0
# Dr. Dolittle 2,PG,72000000.0,87.0,3053.0,25037039.0,112952899.0,4.7,47000.0,United States,Comedy Family Fantasy,135.0,42.66,0,6.0,2001.0
# Dracula 2000,R,54000000.0,100.0,2204.0,8636567.0,33022767.0,4.9,37000.0,"Canada, United States",Action Fantasy Horror Thriller,14.0,26.0,0,12.0,2000.0
# Dracula Untold,PG-13,70000000.0,92.0,2900.0,23514615.0,56280355.0,6.2,208000.0,United States,Action Drama Fantasy Horror,168.0,27.68,0,10.0,2014.0
# Draft Day,PG-13,25000000.0,110.0,2781.0,9783603.0,28842237.0,6.8,67000.0,United States,Drama Sport,196.0,58.99,0,4.0,2014.0
# Drag Me to Hell,PG-13,30000000.0,99.0,2510.0,15825480.0,42100625.0,6.6,218000.0,United States,Horror,302.0,91.05,0,5.0,2009.0
# Dragon Wars: D-War,PG-13,32000000.0,107.0,2277.0,5376000.0,10977721.0,3.5,25000.0,"Republic of Korea, United States",Action Drama Fantasy Thriller,47.0,29.77,0,9.0,2007.0
# Dragonball Evolution,PG,30000000.0,85.0,2181.0,4756488.0,9362785.0,2.5,79000.0,United States,Action Adventure Fantasy Sci-Fi Thriller,10.0,45.0,0,4.0,2009.0
# Dragonfly,PG-13,60000000.0,104.0,2507.0,10216025.0,30323400.0,6.1,40000.0,"Germany, United States",Drama Fantasy Mystery Romance Thriller,158.0,10.76,0,2.0,2002.0

# Directory holding the shared preprocessing artifacts (encoders, scaler,
# factor analyzer, genre vocabulary, selected-feature lists). It is resolved
# against the current working directory, exactly like the paths it replaces.
_ARTIFACTS_DIR = "../model_efa"


def _load_pickle(path):
    """Return the object unpickled from ``path``.

    NOTE: ``pickle.load`` can execute arbitrary code embedded in the file.
    Only point this at artifacts produced by a trusted training run.
    """
    with open(path, "rb") as f:
        return pickle.load(f)


def _predict_from_movie(model_file_name, movie, selected_features_file, artifacts_dir):
    """Run the shared encode -> genre factors -> predict pipeline for one movie.

    Args:
        model_file_name: Path of the pickled model; it must expose ``predict``.
        movie: Raw feature dict. ``"mpaa"`` and ``"country"`` hold the
            un-encoded labels and ``"genres"`` a whitespace-separated string
            (or ``None``).
        selected_features_file: File name, inside ``artifacts_dir``, of the
            pickled list of columns handed to the model.
        artifacts_dir: Directory containing the pickled preprocessing objects.

    Returns:
        ``expm1`` of the first value returned by ``model.predict`` (the model
        output is treated as being on a log1p scale).
    """
    # Load order matches the original code so a missing file is reported in
    # the same sequence (model first, then the preprocessing artifacts).
    model = _load_pickle(model_file_name)
    mpaa_label_encoder = _load_pickle(os.path.join(artifacts_dir, "mpaa_label_encoder.pkl"))
    country_label_encoder = _load_pickle(os.path.join(artifacts_dir, "country_label_encoder.pkl"))
    scaler = _load_pickle(os.path.join(artifacts_dir, "scaler.pkl"))
    fa = _load_pickle(os.path.join(artifacts_dir, "factor_analyzer.pkl"))
    unique_genres = _load_pickle(os.path.join(artifacts_dir, "unique_genres.pkl"))
    selected_features = _load_pickle(os.path.join(artifacts_dir, selected_features_file))

    row = dict(movie)
    row["mpaa"] = mpaa_label_encoder.transform([row["mpaa"]])[0]
    row["country"] = country_label_encoder.transform([row["country"]])[0]

    # Split the genre string once (not once per known genre) and treat a
    # missing/None value as "no genres" instead of raising AttributeError.
    genre_tokens = set((row.get("genres") or "").split())
    genre_indicators = np.array(
        [1 if genre in genre_tokens else 0 for genre in unique_genres]
    ).reshape(1, -1)
    factors = fa.transform(scaler.transform(genre_indicators))

    # One "Factor<k>" column per factor score, numbered from 1.
    row.update({f"Factor{i + 1}": factors[0, i] for i in range(factors.shape[1])})

    # Only the columns named in the selected-features list reach the model.
    features = pd.DataFrame([row])[selected_features]
    return np.expm1(model.predict(features))[0]


def predict_with_feature_selection(
    model_file_name: str,
    month: float,
    year: float,
    mpaa: str,
    budget: float,
    runtime: float,
    screens: float,
    opening_week: float,
    user_vote: float,
    ratings: float,
    critic_vote: float,
    meta_score: float,
    sequel: float,
    genres: str,
    country: str,
    *,
    artifacts_dir: str = _ARTIFACTS_DIR,
) -> float:
    """Predict a value for one movie using the full feature set.

    Numeric arguments are converted with ``float()`` before any file is read,
    so a non-numeric value raises ``ValueError``/``TypeError`` up front.

    Args:
        model_file_name: Path of the pickled model to use.
        month, year, budget, runtime, screens, opening_week, user_vote,
            ratings, critic_vote, meta_score, sequel: Numeric features;
            anything accepted by ``float()``.
        mpaa: Rating label passed to the pickled MPAA label encoder.
        genres: Whitespace-separated genre names, matched against the pickled
            genre vocabulary; ``None`` or ``""`` means no genres.
        country: Country label passed to the pickled country label encoder.
        artifacts_dir: Directory containing the pickled preprocessing objects
            and ``selected_features.pkl``. Defaults to ``"../model_efa"``.

    Returns:
        ``expm1`` of the model's first prediction. The target is presumably a
        box-office figure trained on a log1p scale; confirm against the
        training code.

    Raises:
        FileNotFoundError: If the model or any artifact file is missing.
    """
    movie = {
        "month": float(month),
        "year": float(year),
        "mpaa": mpaa,
        "budget": float(budget),
        "runtime": float(runtime),
        "screens": float(screens),
        "opening_week": float(opening_week),
        "user_vote": float(user_vote),
        "ratings": float(ratings),
        "critic_vote": float(critic_vote),
        "meta_score": float(meta_score),
        "sequel": float(sequel),
        "genres": genres,
        "country": country,
    }
    return _predict_from_movie(
        model_file_name, movie, "selected_features.pkl", artifacts_dir
    )


def predict_with_feature_selection_without_opening_week(
    model_file_name: str,
    month: float,
    year: float,
    mpaa: str,
    budget: float,
    runtime: float,
    screens: float,
    critic_vote: float,
    meta_score: float,
    sequel: float,
    genres: str,
    country: str,
    *,
    artifacts_dir: str = _ARTIFACTS_DIR,
) -> float:
    """Predict a value for one movie without opening-week/user-vote/rating inputs.

    Same pipeline as ``predict_with_feature_selection`` but the feature row
    omits ``opening_week``, ``user_vote`` and ``ratings``, and the column list
    is read from ``selected_features_without_opening_week.pkl``.

    Args:
        model_file_name: Path of the pickled model to use.
        month, year, budget, runtime, screens, critic_vote, meta_score,
            sequel: Numeric features; anything accepted by ``float()``.
        mpaa: Rating label passed to the pickled MPAA label encoder.
        genres: Whitespace-separated genre names; ``None`` or ``""`` means no
            genres.
        country: Country label passed to the pickled country label encoder.
        artifacts_dir: Directory containing the pickled preprocessing objects.
            Defaults to ``"../model_efa"``.

    Returns:
        ``expm1`` of the model's first prediction.

    Raises:
        FileNotFoundError: If the model or any artifact file is missing.
    """
    movie = {
        "month": float(month),
        "year": float(year),
        "mpaa": mpaa,
        "budget": float(budget),
        "runtime": float(runtime),
        "screens": float(screens),
        "critic_vote": float(critic_vote),
        "meta_score": float(meta_score),
        "sequel": float(sequel),
        "genres": genres,
        "country": country,
    }
    return _predict_from_movie(
        model_file_name,
        movie,
        "selected_features_without_opening_week.pkl",
        artifacts_dir,
    )

if __name__ == '__main__':
    # Smoke run: score one hard-coded sample movie with every saved model,
    # first with the full feature set, then with the reduced one.
    model_suffixes = ("rf", "gb", "xgb", "lgbm", "cb")
    list_file_name = [
        f"../model_efa/model_{suffix}.pkl" for suffix in model_suffixes
    ]
    list_file_name_without_opening_week = [
        f"../model_efa/model_{suffix}_without_opening_week.pkl"
        for suffix in model_suffixes
    ]

    # Features shared by both prediction variants.
    # NOTE(review): critic_vote=355000 (and user_vote=72082999 below) are what
    # the original positional calls bind to these parameters; they look large
    # next to the commented sample rows above -- confirm the intended values.
    sample_movie = dict(
        month=1,
        year=2021,
        mpaa="PG-13",
        budget=15000000,
        runtime=103,
        screens=3427,
        critic_vote=355000,
        meta_score=88.32,
        sequel=0,
        genres="Drama Horror Mystery Sci-Fi Thriller",
        country="United States",
    )

    for file_name in list_file_name:
        result = predict_with_feature_selection(
            file_name,
            opening_week=24727437,
            user_vote=72082999,
            ratings=7.2,
            **sample_movie,
        )
        print(result)

    for file_name in list_file_name_without_opening_week:
        result = predict_with_feature_selection_without_opening_week(
            file_name, **sample_movie
        )
        print(result)