In [1]:
import os
import pandas as pd
import sklearn as skl
import holcrawl.shared
In [2]:
# Directory holding the crawled dataset, resolved through holcrawl's helper.
# NOTE(review): `_get_dataset_dir_path` is underscore-prefixed, so presumably
# private to holcrawl.shared — confirm there is no public accessor.
dataset_dir = holcrawl.shared._get_dataset_dir_path()
In [3]:
# Full path of the movies CSV inside the dataset directory.
dataset_path = os.path.join(dataset_dir, 'movies_dataset.csv')
In [4]:
# Load the movies CSV; `df` is the raw frame every later cell derives from.
df = pd.read_csv(dataset_path)
In [5]:
# Number of rows for each distinct value of the `year` column.
df.year.value_counts()
Out[5]:
In [6]:
# All column names of the loaded frame.
list(df.columns)
Out[6]:
In [8]:
# Gross income expressed as a multiple of the budget.
df['norm_gross'] = df['gross_income'].div(df['budget'])
In [9]:
# Absolute profit: gross income minus budget.
df['profit'] = df['gross_income'].sub(df['budget'])
In [10]:
# Return on investment: profit relative to the budget.
df['ROI'] = df['gross_income'].sub(df['budget']).div(df['budget'])
In [11]:
# Title length in characters. The vectorised `.str.len()` leaves a missing
# name as NaN instead of raising the way `len()` on a non-string would.
df['name_length'] = df['name'].str.len()
In [12]:
# Row count of the raw frame.
len(df)
Out[12]:
In [13]:
# Number of missing values in each column.
df.isnull().sum()
Out[13]:
In [14]:
# How many rows have a non-null `avg_mc_critic_by_opening` value.
len(df[df['avg_mc_critic_by_opening'].notnull()])
Out[14]:
In [15]:
# df['opening_weekend_date']
In [16]:
# Non-genre columns retained as features.
# ('avg_mc_user_by_opening' is currently excluded.)
BASE_FEAT_TO_KEEP = [
    'duration',
    'budget',
    'opening_month',
    'opening_day',
    'opening_day_of_year',
    'year',
    'avg_mc_critic_by_opening',
    'num_mc_critic_by_opening',
    'name_length',
    'num_imdb_user_by_opening',
    'avg_imdb_user_by_opening',
    'opening_weekend_screens',
]
In [17]:
# Base features followed by every column whose name contains 'genres'.
FEAT_TO_KEEP = list(BASE_FEAT_TO_KEEP)
FEAT_TO_KEEP.extend(name for name in df.columns if 'genres' in name)
In [18]:
# Frame restricted to the base (non-genre) feature columns.
features = df.drop(df.columns.difference(BASE_FEAT_TO_KEEP), axis=1)
In [19]:
# Frame restricted to the base features plus the genre columns.
dataset = df.drop(df.columns.difference(FEAT_TO_KEEP), axis=1)
In [20]:
# letter_dummies = pd.get_dummies(df['starting_letter'], drop_first=True, prefix='fl')
In [21]:
# dataset = dataset.assign(**{col: letter_dummies[col] for col in letter_dummies.columns})
In [22]:
# Keep only the rows that have a value in every retained column.
# The original row labels are preserved, which later cells rely on when they
# look rows up in `df` through `dataset.index`.
dataset = dataset.dropna(axis=0)
In [23]:
# Rows remaining after dropping incomplete records.
len(dataset)
Out[23]:
In [24]:
import seaborn as sns
%matplotlib inline
In [25]:
# Distinct `year` values present in the filtered dataset.
dataset.year.unique()
Out[25]:
In [26]:
# Rows per `year` value in the filtered dataset.
dataset.year.value_counts()
Out[26]:
In [27]:
# Distribution plot of the `year` column in the filtered dataset.
# NOTE(review): `distplot` is deprecated in recent seaborn releases; consider
# `histplot`/`displot` once the installed version allows — confirm version.
sns.distplot(dataset.year)
Out[27]:
In [28]:
# Columns that survived the feature selection.
dataset.columns
Out[28]:
In [29]:
# pd.options.display.max_columns = 999
# dataset
In [30]:
import numpy as np
In [31]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [32]:
def draw_pearson_correlation(feature_df, mask_upper=False):
    """Draw an annotated heatmap of the pairwise Pearson correlations.

    Parameters
    ----------
    feature_df : pandas.DataFrame
        One column per feature. The coefficients are computed with
        ``np.corrcoef`` on the transposed frame; the notebook drops rows with
        missing values before calling this.
    mask_upper : bool, optional
        When True, hide every cell whose row index is <= its column index
        (the diagonal and the upper-right triangle) so each pair appears once.
        Defaults to False, which draws the full matrix.

    Returns
    -------
    None
        The heatmap is drawn on a newly created 13x10 matplotlib figure.
    """
    pearson_df = pd.DataFrame(
        data=np.corrcoef(feature_df.T),
        index=feature_df.columns,
        columns=feature_df.columns
    )
    # seaborn hides the cells where the mask is True.
    mask = np.triu(np.ones(pearson_df.shape, dtype=bool)) if mask_upper else None
    plt.figure(figsize=(13,10))
    with sns.axes_style("white"):
        heatmap = sns.heatmap(
            pearson_df,
            annot=True,
            fmt=".2f",
            linewidths=.5,
            mask=mask
        )
    # Re-apply the tick labels to enlarge them; x labels are slanted to avoid overlap.
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), fontsize=14)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=14)
In [33]:
# Work on a copy so that adding target columns for the correlation plot does
# not also add them to `features`.
features_for_pearson = features.copy()
In [34]:
# Attach ROI for the same rows (label-based lookup; `.ix` no longer exists in pandas).
features_for_pearson['ROI'] = df.loc[features_for_pearson.index, 'ROI']
In [35]:
# Attach gross income for the same rows (label-based lookup; `.ix` no longer exists in pandas).
features_for_pearson['gross_income'] = df.loc[features_for_pearson.index, 'gross_income']
In [36]:
# Drop every row with a missing value so the correlation matrix is computed
# on complete records only.
features_for_pearson = features_for_pearson.dropna(axis=0)
In [37]:
# Columns that will appear in the correlation heatmap.
features_for_pearson.columns
Out[37]:
In [38]:
# Correlation heatmap of the base features together with ROI and gross income.
draw_pearson_correlation(features_for_pearson)
In [39]:
# Raise pandas' global row display limit (this affects every later cell too),
# then list budget, gross income and name for the rows flagged as action.
# NOTE(review): 155 is presumably chosen to fit this listing — confirm.
pd.options.display.max_rows = 155
df[df['genres.action'] == 1][['budget', 'gross_income', 'name']]
Out[39]:
In [40]:
# Column names again, now including the derived columns added above.
list(df.columns)
Out[40]:
In [41]:
# Every column whose name contains 'genres'; read as a global by
# `_average_col_by_budget`.
GENRES_COLS = [col for col in df.columns if 'genres' in col]
In [42]:
def _average_col_by_budget(df, colnames, norm_factor=1, labels=None, figsize=None, genre_cols=None):
    """Bar-plot the mean of the given columns for each genre.

    NOTE: despite its name, this groups by genre column, not by budget.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the genre columns and the columns to average. A row
        counts towards a genre when that genre column equals 1.
    colnames : list of str
        Columns whose per-genre mean is plotted (one bar series per column).
    norm_factor : number, optional
        Every mean is divided by this value (e.g. 1000000 to plot millions).
    labels : list of str, optional
        Series names used in the legend and title; defaults to ``colnames``.
    figsize : tuple, optional
        Figure size; defaults to (15, 10).
    genre_cols : list of str, optional
        Genre columns to group by; defaults to the module-level ``GENRES_COLS``.

    Returns
    -------
    None
        The chart is drawn with pandas/matplotlib as a side effect.
    """
    if genre_cols is None:
        genre_cols = GENRES_COLS
    series_names = labels or colnames
    # One row per requested column, one entry per genre; transposed below so
    # that genres become the rows (bar groups) of the plotted frame.
    means_by_column = [
        [df.loc[df[genre] == 1, colname].mean() / norm_factor for genre in genre_cols]
        for colname in colnames
    ]
    coldf = pd.DataFrame(
        data=np.array(means_by_column).transpose(),
        # Label each genre with the part of the column name after the last '.'.
        index=[col.rsplit('.', 1)[-1] for col in genre_cols],
        columns=series_names
    )
    coldf.plot(kind='bar', figsize=(figsize or (15, 10)), legend=True, fontsize=13)
    plt.xticks(rotation=45, ha='right')
    plt.title("Average {} by genre".format(', '.join(series_names)), fontsize=16)
    plt.legend(prop={'size':14})
In [43]:
# Mean ROI per genre.
_average_col_by_budget(df, ['ROI'])
In [44]:
# Number of rows flagged as documentary.
len(df[df['genres.documentary'] == 1])
Out[44]:
In [45]:
# Mean budget and gross income per genre, both divided by one million.
_average_col_by_budget(
    df,
    ['budget', 'gross_income'],
    norm_factor=1000000,
    labels=['Budget', 'Gross Income'],
)
In [46]:
# Multiply both user scores by 10 so they can share an axis with `metascore`
# in the per-genre comparison plot.
df['mc_avg_user_score_scaled'] = df['mc_avg_user_score'].mul(10)
df['rating_scaled'] = df['rating'].mul(10)
In [47]:
# Mean critic and user scores per genre, with the y axis fixed to 0-90.
_average_col_by_budget(
    df,
    ['metascore', 'mc_avg_user_score_scaled', 'rating_scaled'],
    figsize=(10, 8),
    labels=['Metascore', 'Metacritic user rating', 'IMDB user rating'],
)
plt.ylim(0, 90);
In [48]:
# Mean duration per genre.
_average_col_by_budget(df, ['duration'], labels=['Duration'])
In [49]:
# Mean opening month per genre.
_average_col_by_budget(df, ['opening_month'], labels=['Month of release'])
In [50]:
# Mean opening day (of month) per genre.
_average_col_by_budget(df, ['opening_day'], labels=['Release day of month'])
In [51]:
# Pearson correlation between the IMDB rating and the number of ratings.
np.corrcoef(df.rating, df.rating_count)[0, 1]
Out[51]:
In [52]:
# IMDB rating against the number of IMDB ratings, with a fitted regression line.
plt.figure(figsize=(9,8))
rating_ax = sns.regplot(x=df.rating, y=df.rating_count)
# Lower the bottom limit only; the automatically chosen top limit is kept.
rating_ax.set_ylim(bottom=-50000)
rating_ax.set_xlabel('IMDB Rating')
rating_ax.set_ylabel('Number of IMDB ratings')
rating_ax.set_title('IMDB rating vs # of IMDB ratings');
In [53]:
# Pearson correlation between the opening-weekend IMDB rating and gross income.
# `Series.corr` skips pairs with a missing value, so gaps in `gross_income`
# no longer turn the whole result into NaN; `.loc` replaces the removed `.ix`.
dataset.avg_imdb_user_by_opening.corr(df.loc[dataset.index, 'gross_income'])
Out[53]:
In [54]:
# Opening-weekend IMDB rating against gross income (in millions of $).
plt.figure(figsize=(9,8))
# `.loc` is the label-based replacement for the removed `.ix` indexer.
sns.regplot(x=dataset.avg_imdb_user_by_opening, y=df.loc[dataset.index, 'gross_income'] / 1000000)
plt.xlabel('IMDB rating by opening weekend')
plt.ylabel('Gross Income (in millions of $)');
In [55]:
# Pearson correlation between the number of opening-weekend IMDB ratings and
# gross income. `Series.corr` skips pairs with a missing value, so gaps in
# `gross_income` no longer yield NaN; `.loc` replaces the removed `.ix`.
dataset.num_imdb_user_by_opening.corr(df.loc[dataset.index, 'gross_income'])
Out[55]:
In [56]:
# Number of opening-weekend IMDB ratings against gross income (in millions of $).
plt.figure(figsize=(9,8))
# `.loc` is the label-based replacement for the removed `.ix` indexer.
sns.regplot(x=dataset.num_imdb_user_by_opening, y=df.loc[dataset.index, 'gross_income'] / 1000000, color='seagreen')
plt.xlabel('# of IMDB ratings by opening weekend')
plt.ylabel('Gross Income (in millions of $)');
In [57]:
# Pearson correlation between the opening-weekend average critic score and
# gross income. `Series.corr` skips pairs with a missing value, so gaps in
# `gross_income` no longer yield NaN; `.loc` replaces the removed `.ix`.
dataset.avg_mc_critic_by_opening.corr(df.loc[dataset.index, 'gross_income'])
Out[57]:
In [58]:
# Opening-weekend average critic score against gross income (in millions of $).
plt.figure(figsize=(9,8))
# `.loc` is the label-based replacement for the removed `.ix` indexer.
sns.regplot(x=dataset.avg_mc_critic_by_opening, y=df.loc[dataset.index, 'gross_income'] / 1000000, color='seagreen')
plt.xlabel('Average critic rating by opening weekend')
plt.ylabel('Gross Income (in millions of $)');
In [59]:
# Pearson correlation between the number of opening-weekend critic reviews and
# gross income. `Series.corr` skips pairs with a missing value, so gaps in
# `gross_income` no longer yield NaN; `.loc` replaces the removed `.ix`.
dataset.num_mc_critic_by_opening.corr(df.loc[dataset.index, 'gross_income'])
Out[59]:
In [60]:
# Number of opening-weekend critic reviews against gross income (in millions of $).
plt.figure(figsize=(9,8))
# `.loc` is the label-based replacement for the removed `.ix` indexer.
sns.regplot(x=dataset.num_mc_critic_by_opening, y=df.loc[dataset.index, 'gross_income'] / 1000000, color='seagreen')
plt.xlabel('# of critic ratings by opening weekend')
plt.ylabel('Gross Income (in millions of $)');
In [61]:
# Rows that have both a budget and a gross income.
sub_df = df.dropna(subset=['budget', 'gross_income'])
In [62]:
# Number of rows with both values present.
len(sub_df)
Out[62]:
In [63]:
# Pearson correlation between budget and gross income on the complete rows.
np.corrcoef(sub_df.budget, sub_df.gross_income)[0, 1]
Out[63]:
In [64]:
# Budget against gross income with a fitted regression line.
plt.figure(figsize=(11,9))
sns.regplot(x=sub_df.budget / 1000000, y=sub_df.gross_income / 1000000, color='red')
# Both series are divided by one million, so state the unit on the axes
# instead of showing the bare column names.
plt.xlabel('Budget (in millions of $)')
plt.ylabel('Gross Income (in millions of $)');
Out[64]:
In [65]:
# Budget (in millions) against ROI on the rows where both are present,
# followed by their Pearson correlation as the cell's displayed value.
sub_df = df.dropna(subset=['budget', 'ROI'])
plt.figure(figsize=(11,9))
sns.regplot(x=sub_df.budget / 1000000, y=sub_df.ROI, color='red')
np.corrcoef(sub_df.budget, sub_df.ROI)[0, 1]
Out[65]:
In [69]:
# Metascore values with the missing entries removed.
# NOTE(review): `sub_df` is rebound here from a DataFrame to a Series; the
# next cell only uses its index.
sub_df = df.metascore.dropna()
In [71]:
# Pearson correlation between metascore and gross income for the rows that
# have a metascore. `Series.corr` skips pairs where gross income is missing
# (which would otherwise make the result NaN); `.loc` replaces the removed `.ix`.
df.loc[sub_df.index, 'metascore'].corr(df.loc[sub_df.index, 'gross_income'])
Out[71]:
In [67]:
# Pearson correlation between IMDB rating and gross income.
# `gross_income` has missing values, which make `np.corrcoef` return NaN;
# `Series.corr` computes the coefficient over the complete pairs instead.
df.rating.corr(df.gross_income)
Out[67]:
In [66]:
# IMDB rating (multiplied by 10) and Metascore against log gross income,
# drawn on the same axes.
plt.figure(figsize=(13,9))
# `regplot` returns the Axes it drew on, not a legend handle, so both series
# are tagged with `label=` and the legend is built from those labels.
sns.regplot(x=df['rating']*10, y=np.log(df['gross_income']), color='yellow', label='IMDB')
sns.regplot(x=df['metascore'], y=np.log(df['gross_income']), color='Green', label='Metascore')
plt.xlabel('IMDB rating and Metascore')
plt.ylabel('Gross Income [log]')
plt.title('IMDB rating and Metascore vs Gross Income [log]');
plt.legend(loc= 'upper center', fontsize= 'small');