In [36]:
import os
import pandas as pd
import numpy as np
import itertools
import operator
import string
from collections import defaultdict, Counter
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF
In [46]:
# Plotly credentials live in a local text file of the form "<username>, <api key>".
with open('../_credentials/plotly.txt', 'r') as infile:
    user, pw = infile.read().strip().split(', ')
plotly.tools.set_credentials_file(username=user, api_key=pw)

# Shared colour for axis titles and tick labels.
text_color = 'rgb(107, 107, 107)'

# Named palette used by the charts below.
colors_dict = dict(
    grey='rgb(189, 195, 199)',
    aqua='rgb( 54, 215, 183)',
    navy='rgb( 31, 58, 147)',
    purple='rgb(142, 68, 173)',
    blue='rgb( 25, 181, 254)',
    green='rgb( 46, 204, 113)',
    yellow='rgb(253, 231, 76)',
    orange='rgb(250, 121, 33)',
    red='rgb(242, 38, 19)',
)

# The same palette as an ordered list, for indexing by trace position.
colors_lst = [
    colors_dict[name]
    for name in ('yellow', 'orange', 'red', 'green', 'blue',
                 'purple', 'navy', 'aqua', 'grey')
]
In [47]:
from src.my_aws import S3

# S3 bucket and object keys for the project data sets.
BUCKET = 'movie-torrents'
KEY_OMDB_TOR = 'OMDB_Torrents.csv'
KEY_NUM = 'TheNumbers_budgets.csv'
KEY_FINAL = 'Final_Data.csv'

s3_obj = S3()
df = s3_obj.get_data(KEY_FINAL, BUCKET)

# Parse release dates, then derive numeric year and month columns from them.
df['Released'] = pd.to_datetime(df['Released'])
released_index = pd.DatetimeIndex(df['Released'])
df['Year'] = pd.to_numeric(released_index.year)
df['Month'] = pd.to_numeric(released_index.month)
In [48]:
# Year range covered by the data set (used in the chart title).
yr_start = df['Year'].min(axis=0)
yr_stop = df['Year'].max(axis=0)

# Number of titles per release year, as a two-column frame for plotting.
df_yr = df['Year'].value_counts().reset_index()
df_yr.columns = ['Year', 'Count']

trace = go.Bar(
    x=df_yr['Year'],
    y=df_yr['Count'],
    marker={'color': colors_dict['red']},
)
In [49]:
# Bar chart of titles per release year over the full data set.
data = [trace]
layout = go.Layout(
    title='Movies Released Annually ({0}-{1})'.format(yr_start, yr_stop),
    xaxis={
        'title': 'Release Year',
        'tickfont': {'size': 14, 'color': text_color},
    },
    yaxis={
        'title': 'Number of Movies',
        'titlefont': {'size': 16, 'color': text_color},
        'tickfont': {'size': 14, 'color': text_color},
    },
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1,
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='style-bar')
Out[49]:
In [50]:
def df_year_limit(start, stop, df):
    """Return the rows of `df` whose 'Year' lies in the closed range [start, stop].

    The input frame is not modified; a row-filtered selection is returned.
    """
    in_range = df['Year'].between(start, stop)
    return df.loc[in_range]
In [54]:
# Restrict the data set to a fixed window of release years and report the loss.
yr_start, yr_stop = 1940, 2015

len_before = len(df)
df = df_year_limit(yr_start, yr_stop, df)
len_after = len(df)

lost = len_before - len_after
message = '{0} entries lost ({1}%) due to date cutoff between {2} and {3}'
print(message.format(lost, round(lost/len_before *100, 2), yr_start, yr_stop))
In [55]:
# Number of titles per release year after the year cutoff.
df_yr = df['Year'].value_counts().reset_index()
df_yr.columns = ['Year', 'Count']

trace = go.Bar(
    x=df_yr['Year'],
    y=df_yr['Count'],
    marker={'color': colors_dict['orange']},
)
In [56]:
# Bar chart of titles per release year, restricted to yr_start..yr_stop.
data = [trace]
layout = go.Layout(
    title='Movies Released Annually ({0}-{1})'.format(yr_start, yr_stop),
    xaxis=dict(
        title='Release Year',
        tickfont=dict(size=14, color=text_color)),
    yaxis=dict(
        title='Number of Movies',
        titlefont=dict(size=16, color=text_color),
        tickfont=dict(size=14, color=text_color)),
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1)
fig = go.Figure(data=data, layout=layout)
# Use a filename of its own: plotly stores the figure under this name, and
# several cells in this notebook used 'style-bar', overwriting one another.
py.iplot(fig, filename='movies-per-year-filtered')
Out[56]:
In [57]:
# split a genre string into a Series of individual genre names
def split_to_array(ser):
    """Strip `ser`, remove commas, split on single spaces and return a pandas Series."""
    tokens = ser.strip().replace(',', '').split(' ')
    return pd.Series(np.array(tokens))
# Flatten the per-title genre lists and count how often each genre occurs,
# most frequent first.
genres = (
    pd.Series(df['Genre'].apply(split_to_array).values.ravel())
    .dropna()
    .value_counts()
    .sort_values(ascending=False)
)

# Two-column frame (genre, count) for plotting.
genre_ser = genres.reset_index()
genre_ser.columns = ['Genre', 'Count']

# Bar chart trace of every genre in the data set.
trace = go.Bar(
    x=genre_ser['Genre'],
    y=genre_ser['Count'],
    marker={'color': colors_dict['yellow']},
)
In [58]:
# Bar chart of how often each genre appears in the genre lists.
data = [trace]
layout = go.Layout(
    title='Genre Occurences ({0}-{1})'.format(yr_start, yr_stop),
    xaxis=dict(
        title='Genre',
        tickfont=dict(size=14, color=text_color)),
    yaxis=dict(
        title='Occurences in Classification',
        titlefont=dict(size=16, color=text_color),
        tickfont=dict(size=14, color=text_color)),
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1)
fig = go.Figure(data=data, layout=layout)
# Use a filename of its own: plotly stores the figure under this name, and
# several cells in this notebook used 'style-bar', overwriting one another.
py.iplot(fig, filename='genre-occurrences')
Out[58]:
In [59]:
def convert_frequency(ser, genres=genres):
    """Collapse a genre string to the single most frequent genre it contains.

    Parameters
    ----------
    ser : str
        Genre string for one title; commas are removed and the rest is split
        on single spaces.
    genres : pandas.Series
        Occurrence count per genre, indexed by genre name. The default is the
        module-level `genres`, bound when this function is defined.

    Returns
    -------
    The index label of `genres` with the largest count among the genres in `ser`.

    Raises
    ------
    KeyError
        If `ser` contains a genre that is not in the index of `genres`.
    """
    split_array = np.array(ser.strip().replace(',', '').split(' '))
    # idxmax returns the index label (the genre name). Series.argmax returns
    # an integer position in current pandas, which would store numbers
    # instead of genre names.
    return genres.loc[split_array].idxmax()
# Add a 'Genre_Single' column: each title's genre string is reduced to the
# single genre returned by convert_frequency.
df['Genre_Single'] = df['Genre'].apply(convert_frequency)
In [60]:
# Number of titles per extracted single genre.
df_count = df['Genre_Single'].value_counts().reset_index()
df_count.columns = ['Genre_Single', 'Count']

# Bar chart trace of the single-genre counts.
trace = go.Bar(
    x=df_count['Genre_Single'],
    y=df_count['Count'],
    marker={'color': colors_dict['blue']},
)
In [61]:
# Bar chart of titles per single (dominant) genre.
data = [trace]
layout = go.Layout(
    title='Dominate Genre Count ({0}-{1})'.format(yr_start, yr_stop),
    xaxis=dict(
        title='Genre',
        tickfont=dict(size=14, color=text_color)),
    yaxis=dict(
        title='Genre Occurences',
        titlefont=dict(size=16, color=text_color),
        tickfont=dict(size=14, color=text_color)),
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1)
fig = go.Figure(data=data, layout=layout)
# Use a filename of its own: plotly stores the figure under this name, and
# several cells in this notebook used 'style-bar', overwriting one another.
py.iplot(fig, filename='dominant-genre-count')
Out[61]:
In [64]:
def make_bar_trace(x_years, y_counts, name_genre, i):
    """Build a plotly Bar trace for one genre at 80% opacity.

    `i` is accepted but currently unused: per-trace colouring via
    colors_lst[i] is disabled, so plotly picks the colours.
    """
    bar_kwargs = dict(x=x_years, y=y_counts, name=name_genre, opacity=0.8)
    return go.Bar(**bar_kwargs)
In [65]:
# One bar trace per single genre: titles per year, in ascending year order.
traces = []
for i, genre in enumerate(df_count['Genre_Single'].unique().tolist()):
    yearly_counts = df.loc[df['Genre_Single'] == genre, 'Year'].value_counts()
    pairs = sorted(yearly_counts.to_dict().items())
    year_lst = [pair[0] for pair in pairs]
    count_lst = [pair[1] for pair in pairs]
    traces.append(make_bar_trace(year_lst, count_lst, genre, i))
In [66]:
# Stacked bar chart of titles per year, split by single genre.
# The trace list is reversed before plotting.
data = list(reversed(traces))
layout = go.Layout(
    title='Genres Annually ({0}-{1})'.format(yr_start, yr_stop),
    xaxis={
        'title': 'Year',
        'tickfont': {'size': 14, 'color': 'rgb(107, 107, 107)'},
    },
    yaxis={
        'title': 'Number of Films',
        'titlefont': {'size': 16, 'color': 'rgb(107, 107, 107)'},
        'tickfont': {'size': 14, 'color': 'rgb(107, 107, 107)'},
        'dtick': 20,
    },
    barmode='stack',
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='stacked-bar')
Out[66]:
In [67]:
def fill_missing_year(tup_lst):
    """Append a (year, 0) pair for every year in df['Year'] missing from `tup_lst`.

    `tup_lst` is a list of (year, count) pairs. It is extended in place and
    also returned. The appended pairs are not sorted.
    """
    years = df['Year'].sort_values().unique().tolist()
    present = {yr for yr, _ in tup_lst}
    tup_lst.extend((yr, 0) for yr in set(years) - present)
    return tup_lst
In [68]:
def tri_tup_dict():
    """Return a dict mapping (genre, year) to the number of titles.

    Genres come from df_count['Genre_Single'] and counts from the module-level
    `df`. Years with no title for a genre are included with a count of 0
    (see fill_missing_year).
    """
    lookup = {}
    for genre_name in df_count['Genre_Single'].unique().tolist():
        per_year = df.loc[df['Genre_Single'] == genre_name, 'Year'].value_counts()
        # sorted (year, count) pairs, padded with zero-count years
        pairs = fill_missing_year(sorted(per_year.to_dict().items()))
        for yr, ct in pairs:
            lookup[(genre_name, yr)] = ct
    return lookup
In [69]:
def percentage_dict(tri_dict):
    """Convert per-(genre, year) counts to percentages of each year's total.

    Parameters
    ----------
    tri_dict : dict
        Maps (genre, year) tuples to counts.

    Returns
    -------
    dict
        The same dict object, updated in place, so that the values sharing a
        year add up to 100. If a year's total is 0, its entries are set to
        0.0 instead of dividing by zero.

    The year totals are taken from the keys of `tri_dict` in a single pass,
    so the function does not depend on the module-level `df` and does not
    rescan the dict once per year.
    """
    year_totals = defaultdict(int)
    for (_genre, year), ct in tri_dict.items():
        year_totals[year] += ct
    # snapshot the items so values can be replaced while looping
    for key, ct in list(tri_dict.items()):
        total = year_totals[key[1]]
        tri_dict[key] = ct / total * 100 if total else 0.0
    return tri_dict
In [70]:
def normalized_check(_dict):
    """Print, for every year from df's first to last, the sum of the values
    whose key contains that year (a check that percentages add up)."""
    first_year, last_year = df['Year'].min(), df['Year'].max()
    for year in range(first_year, last_year + 1):
        year_total = sum(ct for key, ct in _dict.items() if year in key)
        print('For {0} the total is: {1}'.format(year, year_total))
In [71]:
# normalized_check(percentage_dict(tri_tup_dict()))
In [72]:
def make_filled_line_trace(x_years, y_counts, name_genre, color):
    """Build a thin plotly line trace that is filled down to the previous trace."""
    line_style = {'width': 0.5, 'color': color}
    return go.Scatter(
        x=x_years,
        y=y_counts,
        name=name_genre,
        mode='lines',
        line=line_style,
        fill='tonexty',
    )
In [73]:
# Running (stacked) percentage per year for each genre, in plotting order.
ord_lst = ['Crime', 'Adventure', 'Thriller', 'Horror', 'Action', 'Comedy', 'Drama']

# Build the (genre, year) -> percentage table once. It does not depend on
# the loop variable, so it is not recomputed for every genre.
# Sorting the items orders them by genre, then by year.
pct_items = sorted(percentage_dict(tri_tup_dict()).items())

new_dict = {}
ttl = np.array(0)
for _gen in ord_lst:
    # this genre's percentages, in ascending year order
    temp = [pct for key, pct in pct_items if _gen in key]
    # running total across genres, so each entry is the top of its stacked band
    ttl = ttl + temp
    new_dict[_gen] = ttl
In [74]:
# One filled line trace per genre, coloured by its position in ord_lst.
ord_lst = ['Crime', 'Adventure', 'Thriller', 'Horror', 'Action', 'Comedy', 'Drama']
years = list(range(df['Year'].min(), df['Year'].max()+1))
yr_lst = years

traces = []
for genre, color in zip(ord_lst, colors_lst):
    ct_lst = new_dict[genre].tolist()
    traces.append(make_filled_line_trace(yr_lst, ct_lst, genre, color))
In [75]:
# Stacked area chart of each genre's cumulative share of titles per year.
data = traces
layout = go.Layout(
    title='Normalized Genres Annually ({0}-{1})'.format(yr_start, yr_stop),
    showlegend=True,
    autosize=False,
    width=1000,
    height=600,
    margin=go.Margin(l=60, r=40, b=100, t=100, pad=4),
    xaxis={
        'title': 'Year',
        'tickfont': {'size': 14, 'color': 'rgb(107,107,107)'},
        'type': 'category',
        'dtick': 2,
    },
    yaxis={
        'title': 'Normalized Film Count % (Accumulative)',
        'titlefont': {'size': 16, 'color': 'rgb(107,107,107)'},
        'tickfont': {'size': 14, 'color': 'rgb(107,107,107)'},
        'type': 'linear',
        'range': [1, 100],
        'dtick': 20,
        'ticksuffix': '%',
    },
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='stacked-area-plot')
Out[75]:
In [76]:
# Keep only titles with one of the listed ratings and report the loss.
ratings = ['PG-13', 'PG', 'G', 'R']

len_before = len(df)
keep_mask = df['Rated'].isin(ratings)
df = df.loc[keep_mask]
len_after = len(df)

lost = len_before - len_after
message = '{0} entries lost ({1}%) due to limiting to only {2} ratings'
print(message.format(lost, round(lost/len_before *100, 2), ', '.join(ratings)))
In [83]:
from patsy import dmatrices

# Response on the left of '~', predictors on the right.
patsy_formula = 'Total_Torrents ~ ProductionBudget + Year + Month + Runtime + Genre_Single'
y, x = dmatrices(formula_like=patsy_formula, data=df, return_type='dataframe')
In [84]:
import statsmodels.api as sm

# Ordinary least squares of y on the design matrix x; display the summary.
model = sm.OLS(endog=y, exog=x)
results = model.fit()
results.summary()
Out[84]:
In [90]:
from sklearn.linear_model import LinearRegression

# Fit on the full data and keep the coefficients and the score on that same data.
model = LinearRegression().fit(x, y)
mod_lr_coef = model.coef_
mod_lr_score = model.score(x, y)
mod_lr_score
Out[90]:
In [91]:
# Hold out 20% of the rows and evaluate the model on data it was not fitted on.
try:
    from sklearn import model_selection as cv  # scikit-learn >= 0.18
except ImportError:
    from sklearn import cross_validation as cv  # older releases
from sklearn import metrics

x_train, x_test, y_train, y_test = cv.train_test_split(
    x, y, test_size=0.20, random_state=1234)
model = LinearRegression().fit(x_train, y_train)

# Store results, computed on the held-out split. Scoring the training split
# (as before) only measures how well the model fits data it has already seen.
mean_sq_err = metrics.mean_squared_error(y_test, model.predict(x_test))
cv_mod_score = model.score(x_test, y_test)
In [94]:
# Rebuild x and y from the formula so they are fresh DataFrames.
y, x = dmatrices(patsy_formula, data=df, return_type='dataframe')

# 10-fold cross-validation, shuffled with a fixed seed so re-runs agree.
# Folds are taken over len(x), not len(df): the design matrix can have fewer
# rows than df when patsy drops rows with missing values.
try:
    from sklearn.model_selection import KFold  # scikit-learn >= 0.18
    kf = KFold(n_splits=10, shuffle=True, random_state=1234).split(x)
except ImportError:
    from sklearn.cross_validation import KFold  # older releases
    kf = KFold(len(x), n_folds=10, shuffle=True, random_state=1234)

fold_mse = []
fold_scores = []
for train_index, test_index in kf:
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf2 = LinearRegression().fit(x_train, y_train)
    # evaluate this fold's own model (clf2) on this fold's held-out rows
    fold_mse.append(metrics.mean_squared_error(y_test, clf2.predict(x_test)))
    fold_scores.append(clf2.score(x_test, y_test))

# Store results: averages over the 10 folds.
mean_sq_errKf = np.mean(fold_mse)
cvKf_mod_score = np.mean(fold_scores)
In [95]:
# NORMAL RESULTS: one line per stored metric of the untransformed model.
for template, value in (
    ('Model Linear Regression Score = {0}', mod_lr_score),
    (' Mean Square Error = {0}', mean_sq_err),
    (' Cross Validation Model Score = {0}', cv_mod_score),
    (' Mean Squred Error K-Fold = {0}', mean_sq_errKf),
    ('Cross Val. K-Fold Model Score = {0}', cvKf_mod_score),
):
    print(template.format(value))
In [103]:
% matplotlib inline
from matplotlib import pyplot as plt
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(results,'ProductionBudget', fig=fig)
In [104]:
# Display the column labels of df.
df.columns
Out[104]:
In [106]:
# scatter_matrix lives in pandas.plotting; pandas.tools.plotting was removed
# from later pandas releases.
try:
    from pandas.plotting import scatter_matrix  # pandas >= 0.20
except ImportError:
    from pandas.tools.plotting import scatter_matrix  # older pandas

# Natural-log transforms of budget and torrent count. assign() returns a new
# frame, which avoids writing columns into the filtered selection that df is
# at this point.
# NOTE(review): np.log gives -inf for 0 — confirm both columns are strictly positive.
df = df.assign(log_budg=np.log(df['ProductionBudget']),
               log_tor=np.log(df['Total_Torrents']))

# Pairwise scatter plots of the transformed variables and release year.
trans = df[['log_budg', 'Year', 'log_tor']]
plt.rcParams['figure.figsize'] = (15, 15)
scatter_matrix(trans)
In [108]:
# Log-model formula: log torrent count as a function of log budget, year and month.
log_patsy_formula = 'log_tor ~ log_budg + Year + Month'
y, x = dmatrices(formula_like=log_patsy_formula, data=df, return_type='dataframe')
In [109]:
import plotly.plotly as py
from plotly.tools import FigureFactory as FF

# Scatterplot matrix of the log-model variables with histograms on the
# diagonal; 'Month' is passed as the index column.
scatter_cols = ['log_budg', 'Year', 'Month', 'log_tor']
df_a = df[scatter_cols]
fig = FF.create_scatterplotmatrix(
    df_a,
    diag='histogram',
    index='Month',
    height=800,
    width=800,
)
py.iplot(fig, filename='Histograms along Diagonal Subplots')
Out[109]:
In [110]:
import statsmodels.formula.api as smf

# Fit the log-model with the statsmodels formula interface and display the summary.
log_ols = smf.ols(formula=log_patsy_formula, data=df)
results = log_ols.fit()
results.summary()
Out[110]:
In [ ]:
from sklearn.linear_model import LinearRegression

# Fit on the full log-model data and store the score on that same data.
model = LinearRegression().fit(x, y)
log_mod_lr_score = model.score(x, y)
In [ ]:
# Hold out 20% of the rows and evaluate the log-model on data it was not fitted on.
try:
    from sklearn import model_selection as cv  # scikit-learn >= 0.18
except ImportError:
    from sklearn import cross_validation as cv  # older releases
from sklearn import metrics

x_train, x_test, y_train, y_test = cv.train_test_split(
    x, y, test_size=0.20, random_state=1234)
model = LinearRegression().fit(x_train, y_train)

# Store results, computed on the held-out split. Scoring the training split
# (as before) only measures how well the model fits data it has already seen.
log_mean_sq_err = metrics.mean_squared_error(y_test, model.predict(x_test))
log_cv_mod_score = model.score(x_test, y_test)
In [ ]:
# Rebuild x and y from the log formula so they are fresh DataFrames.
# Uses df: `df_sub` is not defined anywhere in this notebook.
y, x = dmatrices(log_patsy_formula, data=df, return_type='dataframe')

# 10-fold cross-validation, shuffled with a fixed seed so re-runs agree.
# Folds are taken over len(x): the design matrix can have fewer rows than df
# when patsy drops rows with missing values.
try:
    from sklearn.model_selection import KFold  # scikit-learn >= 0.18
    kf = KFold(n_splits=10, shuffle=True, random_state=1234).split(x)
except ImportError:
    from sklearn.cross_validation import KFold  # older releases
    kf = KFold(len(x), n_folds=10, shuffle=True, random_state=1234)

log_fold_mse = []
log_fold_scores = []
for train_index, test_index in kf:
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf2 = LinearRegression().fit(x_train, y_train)
    # evaluate this fold's own model (clf2) on this fold's held-out rows
    log_fold_mse.append(metrics.mean_squared_error(y_test, clf2.predict(x_test)))
    log_fold_scores.append(clf2.score(x_test, y_test))

# Store results: averages over the 10 folds.
log_mean_sq_errKf = np.mean(log_fold_mse)
log_cvKf_mod_score = np.mean(log_fold_scores)
In [ ]:
# LOG RESULTS: one line per stored metric of the log-transformed model.
for template, value in (
    ('Log Model Linear Regression Score = {0}', log_mod_lr_score),
    (' Log Mean Square Error = {0}', log_mean_sq_err),
    (' Log Cross Validation Model Score = {0}', log_cv_mod_score),
    (' Log Mean Squred Error K-Fold = {0}', log_mean_sq_errKf),
    ('Log Cross Val. K-Fold Model Score = {0}', log_cvKf_mod_score),
):
    print(template.format(value))
In [ ]:
# Load the separate test file and add the same log-transformed columns that
# the training frame has.
df_TEST = pd.read_csv('data/test_data2.csv', encoding='latin-1')
df_TEST['log_budg'] = np.log(df_TEST.Prod_Budget)
df_TEST['log_run'] = np.log(df_TEST.Runtime)
df_TEST['log_tor'] = np.log(df_TEST.Total_Torrents)


# Same helper as defined earlier in the notebook, repeated so this cell is
# self-contained.
def split_to_array(ser):
    """Strip `ser`, remove commas, split on single spaces and return a pandas Series."""
    split_array = np.array(ser.strip().replace(',', '').split(' '))
    return pd.Series(split_array)


# Genre frequencies come from df. df_yr only has 'Year' and 'Count' columns,
# so df_yr.Genre raised AttributeError.
genres = df['Genre'].apply(split_to_array)
genres = pd.Series(genres.values.ravel()).dropna()
genres = genres.value_counts().sort_values(ascending=False)


def convert_frequency(ser, genres=genres):
    """Return the genre in `ser` with the highest count in `genres`.

    `genres` is a Series of counts indexed by genre name; the default is the
    Series computed just above, bound when this function is defined.
    """
    split_array = np.array(ser.strip().replace(',', '').split(' '))
    # idxmax returns the genre label; Series.argmax returns an integer
    # position in current pandas.
    return genres.loc[split_array].idxmax()


df_TEST['Genre_Single'] = df_TEST.Genre.apply(convert_frequency)

# Kept for reference. The fitted models do not include log_run or
# Genre_Single, so this formula cannot be used to score them.
log_patsy_formula_test = 'log_tor ~ log_budg + Year + Month + log_run + Genre_Single'

# `model` and `clf2` were fitted on matrices built from log_patsy_formula, so
# the test matrices must use the same terms.
# NOTE(review): confirm that this is the intended model specification.
y, x = dmatrices(log_patsy_formula, data=df_TEST, return_type='dataframe')

# Score on the test file itself. x_test / y_test are left over from the last
# cross-validation fold and are rows of the training data.
print(clf2.score(x, y))
print(metrics.mean_squared_error(y, model.predict(x)))
In [ ]:
# Plot y against model.predict(x) as blue circles ('bo'). Assigning the result
# to _ keeps the returned list of line objects out of the cell output.
_ = plt.plot(y, model.predict(x), 'bo')
In [ ]:
# Sorted (year, count) pairs built directly from df. The names `yr_dict` and
# `yr_lst` this cell used to read are not defined as (year, count) data
# anywhere in the notebook, so it failed on a fresh kernel.
# NOTE(review): the counts are rows of df per release year — confirm that this
# is what the chart title means.
year_counts = sorted(df['Year'].value_counts().to_dict().items())
bar_year = [year for year, count in year_counts]
bar_count = [count for year, count in year_counts]

plt.figure(figsize=(25,10))
ind = np.arange(len(year_counts))
width = 0.35
plt.bar(ind, bar_count, width, color='r')
plt.ylabel('Count')
plt.xlabel('Year')
plt.title('Number of Torrents per Year')
# centre each year label under its bar
plt.xticks(ind + width/2., bar_year, rotation='vertical')
plt.yticks(np.arange(0, 91, 5))
plt.show()
In [ ]:
#log_tor ~ log_budg + Year + Month + log_run + Genre_Single'
In [ ]:
# Regression diagnostic plots of the fitted results against log_budg.
fig = sm.graphics.plot_regress_exog(
    results, 'log_budg', fig=plt.figure(figsize=(12, 8)))
In [ ]:
# Regression diagnostic plots of the fitted results against Year.
fig = sm.graphics.plot_regress_exog(
    results, 'Year', fig=plt.figure(figsize=(12, 8)))
In [ ]:
# Regression diagnostic plots of the fitted results against Month.
fig = sm.graphics.plot_regress_exog(
    results, 'Month', fig=plt.figure(figsize=(12, 8)))
In [ ]:
In [ ]: