notebook.community

Edit and run



In [1]:

    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



In [4]:

    
df = pd.read_csv("../data_forlr.csv",encoding="latin1")



In [3]:

    
df_act_dir = pd.read_csv("FinalClean.csv",encoding="latin1")



In [2]:

    
df_act_dir = df_act_dir[["actor_popularity","director_popularity"]]
df = pd.concat([df,df_act_dir],axis=1)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-88aa909104d6> in <module>()
----> 1 df_act_dir = df_act_dir[["actor_popularity","director_popularity"]]
      2 df = pd.concat([df,df_act_dir],axis=1)

NameError: name 'df_act_dir' is not defined



In [5]:

    
df.shape









    Out[5]:





(2384, 21)



In [6]:

    
df.isnull().sum()









    Out[6]:





Unnamed: 0               0
imdb_id                  0
X.x                      0
Actors                   1
Country                  0
Director                 1
Genre                    1
IMDB.Rating              1
IMDB.Votes               1
Language                 2
Production             157
Rated                  198
Released                 0
Runtime                  6
Title                    0
Year                     0
X.y                      0
budget                   0
revenue                  0
actor_popularity         0
director_popularity      0
dtype: int64



In [7]:

    
col_replace = {"Actors":"actors","Country":"country","Director":"director","Genre":"genre",
               "IMDB.Rating":"imdb_rating","IMDB.Votes":"imdb_votes","Language":"language",
               "Released":"released","Runtime":"runtime","Year":"year",
               "Production":"production","Rated":"rated"}
df = df.drop(["Unnamed: 0","imdb_id","Title","X.x","X.y"],axis=1).rename(columns=col_replace)



In [8]:

    
# drop_missing_value
mis_val_col = ["actors","director","genre","imdb_votes","runtime","imdb_rating","language"]
for col in mis_val_col:
    df = df.drop(df[df[col].isnull()].index)



In [9]:

    
df.isnull().sum(),df.shape









    Out[9]:





(actors                   0
 country                  0
 director                 0
 genre                    0
 imdb_rating              0
 imdb_votes               0
 language                 0
 production             148
 rated                  189
 released                 0
 runtime                  0
 year                     0
 budget                   0
 revenue                  0
 actor_popularity         0
 director_popularity      0
 dtype: int64, (2373, 16))



In [10]:

    
num_feat = []
cate_feat = []
for i in df.columns:
    if (df[i]).dtype == "int64" or (df[i]).dtype == "float64":
        num_feat.append(i)
    else:
        cate_feat.append(i)



In [11]:

    
print(num_feat,len(num_feat))
print(cate_feat, len(cate_feat))









    



['imdb_rating', 'year', 'budget', 'revenue', 'actor_popularity', 'director_popularity'] 6
['actors', 'country', 'director', 'genre', 'imdb_votes', 'language', 'production', 'rated', 'released', 'runtime'] 10

Numeric features

imdb_rating



In [12]:

    
sns.distplot(df["imdb_rating"])
# df["imdb_rating"].hist()









    Out[12]:





<matplotlib.axes._subplots.AxesSubplot at 0x18f193a8eb8>

budget



In [14]:

    
import math
df["budget"] = df["budget"].map(lambda x:math.log(x))



In [15]:

    
# df["budget"].hist()
sns.distplot(df["budget"])









    Out[15]:





<matplotlib.axes._subplots.AxesSubplot at 0x18f18da0d68>

revenue (target)



In [16]:

    
# df["revenue"].hist()
# sns.distplot(df["revenue"],kde=False)
df["revenue"] = df["revenue"].map(lambda x:math.log(x))



In [17]:

    
sns.distplot(df["revenue"])









    Out[17]:





<matplotlib.axes._subplots.AxesSubplot at 0x18f18f8e780>



In [18]:

    
sns.jointplot(x="budget",y="revenue",data=df,kind="reg")









    Out[18]:





<seaborn.axisgrid.JointGrid at 0x18f191ad9b0>

Categorical features

country



In [19]:

    
# # regions/countries involved in producing movies
df["country"] = df["country"].map(lambda x:len(str(x).split(",")))
# df["country"].value_counts().plot.bar(figsize=(16,6))
# df["country"].nunique()
#  /df.shape[0]).plot.bar(figsize=(16,6))
# sns.boxplot(x="country",y="revenue",data=df)



In [ ]:

    
# df["country"].value_counts()
# df = df.drop("country",axis=1)



In [20]:

    
num_feat.append("country")
print(num_feat,len(num_feat))
cate_feat.remove("country")
print(cate_feat, len(cate_feat))









    



['imdb_rating', 'year', 'budget', 'revenue', 'actor_popularity', 'director_popularity', 'country'] 7
['actors', 'director', 'genre', 'imdb_votes', 'language', 'production', 'rated', 'released', 'runtime'] 9

genre



In [21]:

    
df = pd.concat([df, df['genre'].str.get_dummies(sep=', ')], axis=1) 
df['Thriller'] = df[['Thriller', 'Horror']].sum(axis=1)
df['Fantasy'] = df[['Fantasy', 'Sci-Fi']].sum(axis=1)
df['Other_genre'] = df[['Music', 'History', 'Sport', 'War', 'Western', 'Musical', 'Documentary', 'News']].sum(axis=1)
df.drop(['Music', 'History', 'Sport', 'War', 'Western', 'Musical', 'Documentary', 'News', 'Horror', 'Sci-Fi'], 
        axis=1, inplace=True)
genre_lst = list(df)[19:32]
for x in genre_lst:
    #print(x)
    df.loc[df['%s' % x] > 1, '%s' % x] = 1
    #print(df['%s' % x].value_counts())
df = df.drop("genre",axis=1)



In [22]:

    
genre_dict = {}
for i in df.columns[14:]:
    genre_dict.update({i:i.lower()})
df = df.rename(columns = genre_dict)



In [23]:

    
for i in df.columns[14:]:
    num_feat.append(i)
print(num_feat,len(num_feat))
cate_feat.remove("genre")
print(cate_feat, len(cate_feat))









    



['imdb_rating', 'year', 'budget', 'revenue', 'actor_popularity', 'director_popularity', 'country', 'director_popularity', 'action', 'adventure', 'animation', 'biography', 'comedy', 'crime', 'drama', 'family', 'fantasy', 'mystery', 'romance', 'thriller', 'other_genre'] 21
['actors', 'director', 'imdb_votes', 'language', 'production', 'rated', 'released', 'runtime'] 8

imdb_votes



In [24]:

    
df["imdb_votes"] = df["imdb_votes"].astype(str).str.replace("\D+","").astype(int)
# df["imdb_votes"].hist()
sns.distplot(df["imdb_votes"],kde=False)









    Out[24]:





<matplotlib.axes._subplots.AxesSubplot at 0x18f1a500fd0>



In [25]:

    
df["imdb_votes"] = df["imdb_votes"].map(lambda x:math.log(x))



In [26]:

    
sns.distplot(df["imdb_votes"])









    Out[26]:





<matplotlib.axes._subplots.AxesSubplot at 0x18f1a6a5cf8>



In [27]:

    
num_feat.append("imdb_votes")
cate_feat.remove("imdb_votes")
print(num_feat,len(num_feat))
print(cate_feat, len(cate_feat))









    



['imdb_rating', 'year', 'budget', 'revenue', 'actor_popularity', 'director_popularity', 'country', 'director_popularity', 'action', 'adventure', 'animation', 'biography', 'comedy', 'crime', 'drama', 'family', 'fantasy', 'mystery', 'romance', 'thriller', 'other_genre', 'imdb_votes'] 22
['actors', 'director', 'language', 'production', 'rated', 'released', 'runtime'] 7

language



In [28]:

    
# list length
df["language"] = df["language"].map(lambda x:len(str(x).split(",")))



In [29]:

    
df["language"].value_counts()









    Out[29]:





1     1466
2      533
3      229
4       83
5       42
6       11
9        6
19       1
7        1
8        1
Name: language, dtype: int64



In [30]:

    
num_feat.append("language")
cate_feat.remove("language")
print(num_feat,len(num_feat))
print(cate_feat, len(cate_feat))









    



['imdb_rating', 'year', 'budget', 'revenue', 'actor_popularity', 'director_popularity', 'country', 'director_popularity', 'action', 'adventure', 'animation', 'biography', 'comedy', 'crime', 'drama', 'family', 'fantasy', 'mystery', 'romance', 'thriller', 'other_genre', 'imdb_votes', 'language'] 23
['actors', 'director', 'production', 'rated', 'released', 'runtime'] 6

production



In [31]:

    
# frequency encoding
df["production"] = df["production"].replace(np.nan, "Unknown")\
                    .map(lambda x: x.split(" ")[0] if len(x) > 1 else x)
# (df["production"].value_counts()/df.shape[0])[:100].plot.bar(figsize=(16,6))
# zip_freq = list(df_2014['addrzip'].value_counts()[:20].index)
# df_2014['addrzip'] = df_2014['addrzip'].map(lambda s:'others' if s not in zip_freq else s)
# list(df_2014['addrzip'].value_counts().index)
# zip_map = {'others':20,'750':0,'945':1,'112':2,'606':3,'300':4,'070':5,'331':6,'100':7,'770':8,
#            '900':9,'117':10,'917':11,'104':12,'891':13,'330':14,'852':15,'921':16,'913':17,'926':18,'925':19}
# df_2014['addrzip'] = df_2014['addrzip'].map(lambda s: zip_map.get(s) if s in zip_map else s)



In [32]:

    
prod_freq = list(df["production"].value_counts()[:20].index)



In [33]:

    
df["production"] = df["production"].map(lambda s:"other_productions" if s not in prod_freq else s)



In [34]:

    
prod_counts = df["production"].value_counts()
prod_dict = prod_counts.to_dict()



In [35]:

    
df["production"] = df["production"].map(lambda s:prod_dict.get(s) if s in prod_dict else s)



In [36]:

    
df["production"].unique()









    Out[36]:





array([102, 163,  21, 210, 701,  27,  20, 127,  50,  60, 155, 206,  80,
        37,  78, 149,  57,  46,  24,  32,  28], dtype=int64)



In [37]:

    
# high cardinality: may use frequency encoding
# (df["production"].value_counts()/df.shape[0])[:].sum()



In [38]:

    
num_feat.append("production")
cate_feat.remove("production")
print(num_feat,len(num_feat))
print(cate_feat, len(cate_feat))









    



['imdb_rating', 'year', 'budget', 'revenue', 'actor_popularity', 'director_popularity', 'country', 'director_popularity', 'action', 'adventure', 'animation', 'biography', 'comedy', 'crime', 'drama', 'family', 'fantasy', 'mystery', 'romance', 'thriller', 'other_genre', 'imdb_votes', 'language', 'production'] 24
['actors', 'director', 'rated', 'released', 'runtime'] 5

rated



In [39]:

    
df["rated"].value_counts()









    Out[39]:





R            936
PG-13        774
PG           285
NOT RATED    129
G             29
UNRATED       22
TV-MA          4
TV-14          2
NC-17          2
TV-PG          1
Name: rated, dtype: int64



In [40]:

    
# df["rated"].value_counts().plot.bar()
sns.countplot(x="rated", data=df)









    Out[40]:





<matplotlib.axes._subplots.AxesSubplot at 0x18f1a3ddc88>



In [41]:

    
sns.set_style('ticks')
fig, ax = plt.subplots()
# the size of A4 paper
fig.set_size_inches(11.7, 8.27)
sns.boxplot(x="rated", y="revenue", data=df)









    Out[41]:





<matplotlib.axes._subplots.AxesSubplot at 0x18f1a9a3940>



In [42]:

    
df["rated"] = df["rated"].replace(np.nan, "UNRATED")\
            .replace("NOT RATED", "UNRATED")



In [43]:

    
df = pd.concat([df, df['rated'].str.get_dummies(sep=', ')], axis=1)



In [44]:

    
df.columns[28:]









    Out[44]:





Index(['G', 'NC-17', 'PG', 'PG-13', 'R', 'TV-14', 'TV-MA', 'TV-PG', 'UNRATED'], dtype='object')



In [45]:

    
for i in df.columns[28:]:
    num_feat.append(i)
cate_feat.remove("rated")
print(num_feat,len(num_feat))
print(cate_feat, len(cate_feat))









    



['imdb_rating', 'year', 'budget', 'revenue', 'actor_popularity', 'director_popularity', 'country', 'director_popularity', 'action', 'adventure', 'animation', 'biography', 'comedy', 'crime', 'drama', 'family', 'fantasy', 'mystery', 'romance', 'thriller', 'other_genre', 'imdb_votes', 'language', 'production', 'G', 'NC-17', 'PG', 'PG-13', 'R', 'TV-14', 'TV-MA', 'TV-PG', 'UNRATED'] 33
['actors', 'director', 'released', 'runtime'] 4



In [46]:

    
df = df.drop("rated",axis=1)

released



In [47]:

    
# index of released date col
index = df.columns.get_loc("released")
#change date data to timestamp
release_dates = pd.to_datetime(df["released"])
# released date is weekend of not
weekend_list = []
for each in release_dates:
    day_ofweek = each.dayofweek
    if day_ofweek >= 4 and day_ofweek <= 6:
        tag = 1
    else:
        tag = 0
    weekend_list.append(tag)
# released date is on dump months
dumpmonth_list = []
for each in release_dates:
    month = each.month
    if month == 12 or month == 1 or month == 2 or month == 8 or month ==9:
        tag = 1
    else:
        tag = 0
    dumpmonth_list.append(tag)
df.insert(loc=index+1,column = "released_on_weekend",value=weekend_list)
df.insert(loc=index+2,column = "released_on_dump_month",value=dumpmonth_list)



In [48]:

    
num_feat.append("released_on_weekend")
num_feat.append("released_on_dump_month")
cate_feat.remove("released")
print(num_feat,len(num_feat))
print(cate_feat, len(cate_feat))









    



['imdb_rating', 'year', 'budget', 'revenue', 'actor_popularity', 'director_popularity', 'country', 'director_popularity', 'action', 'adventure', 'animation', 'biography', 'comedy', 'crime', 'drama', 'family', 'fantasy', 'mystery', 'romance', 'thriller', 'other_genre', 'imdb_votes', 'language', 'production', 'G', 'NC-17', 'PG', 'PG-13', 'R', 'TV-14', 'TV-MA', 'TV-PG', 'UNRATED', 'released_on_weekend', 'released_on_dump_month'] 35
['actors', 'director', 'runtime'] 3



In [49]:

    
df = df.drop("released",axis=1)

runtime



In [50]:

    
df["runtime"].dtype









    Out[50]:





dtype('O')



In [51]:

    
df["runtime"] = df["runtime"].map(lambda x:int(x.strip("min")))



In [52]:

    
sns.distplot(df["runtime"])









    Out[52]:





<matplotlib.axes._subplots.AxesSubplot at 0x18f1ac32ba8>



In [53]:

    
num_feat.append("runtime")
cate_feat.remove("runtime")
print(num_feat,len(num_feat))
print(cate_feat, len(cate_feat))









    



['imdb_rating', 'year', 'budget', 'revenue', 'actor_popularity', 'director_popularity', 'country', 'director_popularity', 'action', 'adventure', 'animation', 'biography', 'comedy', 'crime', 'drama', 'family', 'fantasy', 'mystery', 'romance', 'thriller', 'other_genre', 'imdb_votes', 'language', 'production', 'G', 'NC-17', 'PG', 'PG-13', 'R', 'TV-14', 'TV-MA', 'TV-PG', 'UNRATED', 'released_on_weekend', 'released_on_dump_month', 'runtime'] 36
['actors', 'director'] 2

actors & directors



In [54]:

    
df = df.drop(["actors","director"],axis=1)



In [55]:

    
sns.distplot(df["actor_popularity"])









    Out[55]:





<matplotlib.axes._subplots.AxesSubplot at 0x18f1ad6ccc0>



In [83]:

    
sns.distplot(df["actor_popularity"].map(lambda x:math.log(x)+0.01))









    Out[83]:





<matplotlib.axes._subplots.AxesSubplot at 0x18f1b6b41d0>



In [81]:

    
sns.distplot(df["director_popularity"].map(lambda x:math.log(x)))









    Out[81]:





<matplotlib.axes._subplots.AxesSubplot at 0x18f1e36bf98>



In [57]:

    
sns.distplot(df["director_popularity"])









    Out[57]:





<matplotlib.axes._subplots.AxesSubplot at 0x18f1b056630>



In [58]:

    
cate_feat.remove("actors")
cate_feat.remove("director")
print(num_feat,len(num_feat))
print(cate_feat, len(cate_feat))









    



['imdb_rating', 'year', 'budget', 'revenue', 'actor_popularity', 'director_popularity', 'country', 'director_popularity', 'action', 'adventure', 'animation', 'biography', 'comedy', 'crime', 'drama', 'family', 'fantasy', 'mystery', 'romance', 'thriller', 'other_genre', 'imdb_votes', 'language', 'production', 'G', 'NC-17', 'PG', 'PG-13', 'R', 'TV-14', 'TV-MA', 'TV-PG', 'UNRATED', 'released_on_weekend', 'released_on_dump_month', 'runtime'] 36
[] 0

Standardization



In [60]:

    
x_train = df[df["year"] <= 2013].drop("revenue",axis=1)
x_test = df[df["year"] > 2013].drop("revenue",axis=1)
y_train = df[df["year"] <= 2013]["revenue"]
y_test = df[df["year"] > 2013]["revenue"]



In [61]:

    
# num_feat.remove("revenue")
# stand_feat = []
# nonstand_feat = []
# for feat in num_feat:
#     if X[feat].nunique() > 2:
#         stand_feat.append(feat)
#     else:
#         nonstand_feat.append(feat)



In [62]:

    
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# scaler_feat = scaler.fit_transform(X[stand_feat])
# X_feat = pd.DataFrame(scaler_feat,columns=X[stand_feat].columns)
# pd.concat([X_feat,X[nonstand_feat]],axis=1)



In [63]:

    
df.shape









    Out[63]:





(2373, 35)



In [65]:

    
fig, ax = plt.subplots()
fig.set_size_inches(16, 10)
sns.heatmap(df.drop("revenue",axis=1).corr())









    Out[65]:





<matplotlib.axes._subplots.AxesSubplot at 0x18f1b22d198>



In [66]:

    
# sns.pairplot(df.drop("revenue",axis=1))

Regression Model



In [67]:

    
# from sklearn.model_selection import train_test_split
# x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)



In [68]:

    
from sklearn.linear_model import LinearRegression



In [71]:

    
lrm = LinearRegression()
lrm.fit(x_train,y_train)
print(lrm.intercept_)
lrm.coef_
cdf = pd.DataFrame(lrm.coef_,x_train.columns,columns=["Coeff"])
predictions = lrm.predict(x_test)
# plt.scatter(y_test, predictions)
sns.distplot((y_test-predictions)) # should be normal distribution









    



22.5238818816






    Out[71]:





<matplotlib.axes._subplots.AxesSubplot at 0x18f1e11a358>



In [73]:

    
cdf









    Out[73]:







  
    
      
      Coeff
    
  
  
    
      country
      -0.109148
    
    
      imdb_rating
      -0.164874
    
    
      imdb_votes
      0.868435
    
    
      language
      0.048905
    
    
      production
      -0.000542
    
    
      released_on_weekend
      0.239644
    
    
      released_on_dump_month
      0.220722
    
    
      runtime
      0.008917
    
    
      year
      -0.011654
    
    
      budget
      0.577497
    
    
      actor_popularity
      -0.011572
    
    
      director_popularity
      0.002933
    
    
      action
      -0.101040
    
    
      adventure
      -0.168098
    
    
      animation
      0.259178
    
    
      biography
      0.064321
    
    
      comedy
      0.020002
    
    
      crime
      -0.348593
    
    
      drama
      -0.482759
    
    
      family
      0.074942
    
    
      fantasy
      -0.356117
    
    
      mystery
      -0.084075
    
    
      romance
      -0.060345
    
    
      thriller
      -0.220852
    
    
      other_genre
      0.126382
    
    
      G
      0.088266
    
    
      NC-17
      -1.243084
    
    
      PG
      -0.035578
    
    
      PG-13
      -0.325429
    
    
      R
      -0.984154
    
    
      TV-14
      1.117981
    
    
      TV-MA
      0.007996
    
    
      TV-PG
      0.762937
    
    
      UNRATED
      0.611066



In [84]:

    
from sklearn import metrics
print(metrics.mean_absolute_error(y_test, predictions))
print(metrics.mean_squared_error(y_test, predictions))
print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))









    



1.08764196967
2.22099405906
1.49029998962



In [ ]:

	Coeff
country	-0.109148
imdb_rating	-0.164874
imdb_votes	0.868435
language	0.048905
production	-0.000542
released_on_weekend	0.239644
released_on_dump_month	0.220722
runtime	0.008917
year	-0.011654
budget	0.577497
actor_popularity	-0.011572
director_popularity	0.002933
action	-0.101040
adventure	-0.168098
animation	0.259178
biography	0.064321
comedy	0.020002
crime	-0.348593
drama	-0.482759
family	0.074942
fantasy	-0.356117
mystery	-0.084075
romance	-0.060345
thriller	-0.220852
other_genre	0.126382
G	0.088266
NC-17	-1.243084
PG	-0.035578
PG-13	-0.325429
R	-0.984154
TV-14	1.117981
TV-MA	0.007996
TV-PG	0.762937
UNRATED	0.611066