In [166]:
import os
import pandas as pd
import sklearn as skl
# import holcrawl.shared
In [40]:
# dataset_dir = holcrawl.shared._get_dataset_dir_path()
In [41]:
# dataset_path = os.path.join(dataset_dir, 'movies_dataset.csv')
In [245]:
# Load the movies table from a CSV file resolved relative to the current working directory.
df = pd.read_csv('movies_dataset.csv')
In [246]:
# How many rows fall on each distinct value of the `year` column.
df['year'].value_counts()
Out[246]:
In [247]:
# list(df.columns)
In [169]:
# Character count of each `name`; listed in BASE_FEAT_TO_KEEP as a model feature.
# The vectorised `.str.len()` yields NaN for a missing name, whereas calling
# `len()` on a NaN inside a Python-level lambda raises TypeError.
df['name_length'] = df['name'].str.len()
In [170]:
# Total number of rows in the raw table.
df.shape[0]
Out[170]:
In [171]:
# Missing-value count for every column (summed down the rows).
df.isnull().sum(axis=0)
Out[171]:
In [172]:
# Number of rows that have a non-null `avg_mc_critic_by_opening` value.
int(df['avg_mc_critic_by_opening'].count())
Out[172]:
In [174]:
# Non-genre columns retained as candidate model features.
BASE_FEAT_TO_KEEP = [
    'duration',
    'budget',
    'opening_month',
    'opening_day',
    'opening_day_of_year',
    'year',
    'avg_mc_critic_by_opening',
    'num_mc_critic_by_opening',
    'name_length',
    'num_imdb_user_by_opening',
    'avg_imdb_user_by_opening',
    'opening_weekend_screens',
    # 'avg_mc_user_by_opening'  (currently excluded)
]
In [175]:
# Full feature list: the base features followed by every column whose name contains 'genres'.
genre_columns = [column for column in df.columns if 'genres' in column]
FEAT_TO_KEEP = BASE_FEAT_TO_KEEP + genre_columns
In [176]:
# Copy of `df` restricted to the base feature columns (original column order kept).
features = df.loc[:, df.columns.isin(BASE_FEAT_TO_KEEP)]
In [177]:
# Copy of `df` restricted to base + genre feature columns (original column order kept).
dataset = df.loc[:, df.columns.isin(FEAT_TO_KEEP)]
In [180]:
# Discard every row that is missing a value in any retained feature column.
dataset = dataset.dropna(how='any')
In [181]:
# Rows remaining after the incomplete ones were dropped.
dataset.shape[0]
Out[181]:
In [186]:
# Show which columns made it into the modelling table.
dataset.columns
Out[186]:
In [187]:
# Raise the column display limit so wide frames are shown in full,
# then preview the first three rows of the modelling table.
pd.set_option('display.max_columns', 999)
dataset.head(3)
Out[187]:
In [276]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
In [277]:
# Feature matrix: the NaN-free modelling table.
X = dataset
# Target: `gross_income` for exactly the rows that survived the dropna.
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.loc` performs the
# same label-based lookup on `dataset.index`.
# NOTE(review): `gross_income` was not among the columns checked by dropna,
# so Y may still contain NaN — confirm.
# Y = df.loc[dataset.index, 'ROI']
Y = df.loc[dataset.index, 'gross_income']
In [278]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation. The fixed `random_state` makes the
# split (and every number derived from it below) reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42)
In [279]:
# Number of top-ranked (by mutual information) columns kept for each regression.
FEATURES_TO_SELECT = 10
In [280]:
from sklearn.feature_selection import f_regression, mutual_info_regression
# Estimated mutual information between each training feature and the target.
# The estimator is stochastic; seeding it keeps the feature ranking stable
# between runs.
mi = mutual_info_regression(X_train, y_train, random_state=42)
In [281]:
# One-column table of MI scores, indexed by the training feature names.
mi_df = pd.DataFrame({'MI': mi}, index=X_train.columns)
In [282]:
# Rank features from most to least informative.
# `DataFrame.sort` was removed in pandas 0.20; `sort_values` is the replacement.
mi_df = mi_df.sort_values('MI', ascending=False)
mi_df
Out[282]:
In [283]:
# Names of the FEATURES_TO_SELECT highest-ranked features.
support = mi_df.index[:FEATURES_TO_SELECT]
In [284]:
# Training features narrowed to the selected columns.
selected_X_train = X_train.loc[:, X_train.columns.isin(support)]
In [285]:
# Test features narrowed to the same selected columns.
selected_X_test = X_test.loc[:, X_test.columns.isin(support)]
In [286]:
# Ordinary least squares on the selected features.
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, so passing it raises TypeError on current releases. For plain OLS it
# only rescaled the inputs internally and mapped the coefficients back, so
# omitting it leaves `coef_` and the predictions unchanged for full-rank
# inputs (up to floating-point error).
regr = linear_model.LinearRegression(fit_intercept=True)
regr = regr.fit(selected_X_train, y_train)
In [287]:
# Fitted coefficient per selected feature, largest first.
# `DataFrame.sort` was removed in pandas 0.20; `sort_values` is the replacement.
coef_df = pd.DataFrame({'coef': regr.coef_}, index=selected_X_train.columns)
coef_df.sort_values('coef', ascending=False)
Out[287]:
In [288]:
# Predicted target values for the held-out test rows.
y_predict = regr.predict(selected_X_test)
In [289]:
# Score the fitted model on the held-out rows (LinearRegression.score reports R^2).
regr.score(selected_X_test, y_test)
Out[289]:
In [290]:
# Scatter of true vs. predicted gross income; both axes are divided by 1,000,000.
# NOTE(review): `plt` is not imported in any visible cell — presumably
# matplotlib.pyplot; confirm it is imported before this cell runs.
fig, ax = plt.subplots(figsize=(12, 9))
ax.scatter(y_test / 1000000, y_predict / 1000000)
ax.set_xlabel('True Gross Income')
ax.set_ylabel('Predicted Gross Income');
In [256]:
# Extend the train/test feature tables with the opening-weekend income column.
# `.assign` returns a new frame, so X_train / X_test are left untouched; the
# previous plain aliasing followed by column assignment silently added the
# column to them as well. `.loc` replaces `.ix`, which was removed in pandas 1.0.
# NOTE(review): `opening_weekend_income` was not among the columns checked by
# dropna, so it may still contain NaN — confirm.
X_train_weekend_inc = X_train.assign(
    opening_weekend_income=df.loc[X_train.index, 'opening_weekend_income'])
X_test_weekend_inc = X_test.assign(
    opening_weekend_income=df.loc[X_test.index, 'opening_weekend_income'])
In [257]:
# Re-estimate mutual information with the opening-weekend income column included.
# The estimator is stochastic; seeding it keeps the feature ranking stable
# between runs.
mi = mutual_info_regression(X_train_weekend_inc, y_train, random_state=42)
In [258]:
# One-column table of MI scores, indexed by the extended feature names.
mi_df = pd.DataFrame({'MI': mi}, index=X_train_weekend_inc.columns)
In [259]:
# Rank the extended feature set from most to least informative.
# `DataFrame.sort` was removed in pandas 0.20; `sort_values` is the replacement.
mi_df = mi_df.sort_values('MI', ascending=False)
mi_df
Out[259]:
In [260]:
# Names of the FEATURES_TO_SELECT highest-ranked features in the extended set.
support = mi_df.index[:FEATURES_TO_SELECT]
In [261]:
# Narrow the extended train/test tables to the selected columns.
# (Two placeholder assignments that were overwritten immediately have been removed.)
selected_X_train_weekend_inc = X_train_weekend_inc.drop(
    [col for col in X_train_weekend_inc if col not in support], axis=1)
selected_X_test_weekend_inc = X_test_weekend_inc.drop(
    [col for col in X_test_weekend_inc if col not in support], axis=1)
In [262]:
# Ordinary least squares on the selected features of the extended set.
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, so passing it raises TypeError on current releases. For plain OLS it
# only rescaled the inputs internally and mapped the coefficients back, so
# omitting it leaves `coef_` and the predictions unchanged for full-rank
# inputs (up to floating-point error).
regr2 = linear_model.LinearRegression(fit_intercept=True)
regr2 = regr2.fit(selected_X_train_weekend_inc, y_train)
In [263]:
# Fitted coefficient per selected feature for the second model, largest first.
# `DataFrame.sort` was removed in pandas 0.20; `sort_values` is the replacement.
coef_df2 = pd.DataFrame({'coef': regr2.coef_}, index=selected_X_train_weekend_inc.columns)
coef_df2.sort_values('coef', ascending=False)
Out[263]:
In [266]:
# Predict on the held-out rows with the second model, then report its score
# on the same rows (LinearRegression.score reports R^2).
y_predict2 = regr2.predict(selected_X_test_weekend_inc)
regr2.score(selected_X_test_weekend_inc, y_test)
Out[266]:
In [267]:
# Scatter of true vs. predicted gross income for the second model; both axes
# are divided by 1,000,000.
# NOTE(review): `plt` is not imported in any visible cell — presumably
# matplotlib.pyplot; confirm it is imported before this cell runs.
fig, ax = plt.subplots(figsize=(12, 9))
ax.scatter(y_test / 1000000, y_predict2 / 1000000)
ax.set_xlabel('True Gross Income')
ax.set_ylabel('Predicted Gross Income');
# ax.set_title('Predicted vs True Gross Income');
In [ ]: