In [1]:
import os
import pandas as pd
import numpy as np
import itertools
import operator
import string
from collections import defaultdict, Counter
from s3fs.core import S3FileSystem
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF
In [2]:
# Register plotly credentials; the text file holds "username, api_key" on one line
with open('../_credentials/plotly.txt', 'r') as infile:
    raw_pair = infile.read()
user, pw = raw_pair.strip().split(', ')
plotly.tools.set_credentials_file(username=user, api_key=pw)
# Shared colour for axis titles and tick labels
text_color = 'rgb(107, 107, 107)'
# Named palette used by every chart in the notebook
colors_dict = {
    'grey': 'rgb(189, 195, 199)',
    'aqua': 'rgb( 54, 215, 183)',
    'navy': 'rgb( 31, 58, 147)',
    'purple': 'rgb(142, 68, 173)',
    'blue': 'rgb( 25, 181, 254)',
    'green': 'rgb( 46, 204, 113)',
    'yellow': 'rgb(253, 231, 76)',
    'orange': 'rgb(250, 121, 33)',
    'red': 'rgb(242, 38, 19)',
}
# Same colours as an ordered list, indexed by trace position in stacked charts
colors_lst = [colors_dict[shade] for shade in
              ('yellow', 'orange', 'red', 'green', 'blue', 'purple', 'navy', 'aqua', 'grey')]
In [3]:
# aws keys stored in ini file in same path
os.environ['AWS_CONFIG_FILE'] = 'aws_config.ini'
s3 = S3FileSystem(anon=False)
key = 'data.csv'
bucket = 'luther-02'
# Read through a context manager so the S3 file handle is closed once pandas is done with it
with s3.open('{}/{}'.format(bucket, key), mode='rb') as s3_file:
    df = pd.read_csv(s3_file)
# update dates to datetime objects
df['Released'] = pd.to_datetime(df['Released'])
df['Year'] = pd.DatetimeIndex(df['Released']).year
df['Year_Int'] = pd.to_numeric(df['Year'])
df['Month'] = pd.DatetimeIndex(df['Released']).month
# year extremities (reused in chart titles and plot filenames below)
yr_start = df['Year'].min(axis=0)
yr_stop = df['Year'].max(axis=0)
In [4]:
# Tally how many titles fall in each release year
year_counts = df['Year'].value_counts()
df_yr = year_counts.reset_index()
df_yr.columns = ['Year', 'Count']
# Bar trace (red) for the per-year tallies
red_marker = dict(color=colors_dict['red'])
trace = go.Bar(x=df_yr['Year'], y=df_yr['Count'], marker=red_marker)
In [5]:
def bar_plot_data(_dataframe, _label, color):
    '''
    _dataframe -- dataframe holding the column to count
    _label -- column whose value counts become the bars
    color -- key into colors_dict for the bar colour
    Return: result of py.iplot for the assembled bar chart

    The chart title, axis titles and plotly filename are fixed to the
    "titles by release year" wording and use the module-level yr_start / yr_stop.
    '''
    counts = _dataframe[_label].value_counts().reset_index()
    counts.columns = [_label, 'Count']
    bars = go.Bar(
        x=counts[_label],
        y=counts['Count'],
        marker=dict(color=colors_dict[color]))
    x_axis = dict(
        title='Release Year',
        tickfont=dict(size=14, color=text_color))
    y_axis = dict(
        title='Number of Titles',
        titlefont=dict(size=16, color=text_color),
        tickfont=dict(size=14, color=text_color))
    layout = go.Layout(
        title='Quantity of Torrent Titles by Year Released ({0}-{1})'.format(yr_start, yr_stop),
        xaxis=x_axis,
        yaxis=y_axis,
        barmode='group',
        bargap=0.15,
        bargroupgap=0.1)
    fig = go.Figure(data=[bars], layout=layout)
    return py.iplot(fig, filename='luther_titles_annually({0}-{1})'.format(yr_start, yr_stop))
In [6]:
# Plot the value counts of df['Year'] as red bars using bar_plot_data
bar_plot_data(df, 'Year', 'red')
Out[6]:
In [7]:
# Full-range titles-per-year chart built from the current `trace`
data = [trace]
x_axis = {'title': 'Release Year',
          'tickfont': {'size': 14, 'color': text_color}}
y_axis = {'title': 'Number of Titles',
          'titlefont': {'size': 16, 'color': text_color},
          'tickfont': {'size': 14, 'color': text_color}}
layout = go.Layout(
    title='Quantity of Torrent Titles by Year Released ({0}-{1})'.format(yr_start, yr_stop),
    xaxis=x_axis,
    yaxis=y_axis,
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='luther_titles_annually({0}-{1})'.format(yr_start, yr_stop))
Out[7]:
In [8]:
def df_year_limit(start, stop, df):
    '''
    start -- first year to keep (inclusive)
    stop -- last year to keep (inclusive)
    df -- dataframe with a 'Year' column
    Return: rows of df whose 'Year' lies within [start, stop]
    '''
    years = df['Year']
    in_window = years.ge(start) & years.le(stop)
    return df.loc[in_window]
In [9]:
# Row count before the year cutoff; the baseline for the loss percentages printed later
yr_before = df.shape[0]
print('{0} records in dataframe before trimming by year cutoff'.format(yr_before))
In [10]:
# Restrict the dataset to the 1995-2015 release window and report how much was dropped
yr_start, yr_stop = (1995, 2015)
df = df_year_limit(yr_start, yr_stop, df)
yr_after = len(df)
yr_lost = yr_before - yr_after
yr_lost_pct = round(yr_lost/yr_before *100, 2)
print('{0} entries lost ({1}%) due to date cutoff between {2} and {3}'.format(
    yr_lost, yr_lost_pct, yr_start, yr_stop))
In [11]:
# Recount titles per year now that the year cutoff has been applied
trimmed_year_counts = df['Year'].value_counts()
df_yr = trimmed_year_counts.reset_index()
df_yr.columns = ['Year', 'Count']
blue_marker = dict(color=colors_dict['blue'])
trace = go.Bar(x=df_yr['Year'], y=df_yr['Count'], marker=blue_marker)
In [12]:
# Titles-per-year chart for the trimmed year window
data = [trace]
x_axis = {'title': 'Release Year',
          'tickfont': {'size': 14, 'color': text_color}}
y_axis = {'title': 'Number of Titles',
          'titlefont': {'size': 16, 'color': text_color},
          'tickfont': {'size': 14, 'color': text_color}}
layout = go.Layout(
    title='Number of Torrent Titles by Release Year ({0}-{1})'.format(yr_start, yr_stop),
    xaxis=x_axis,
    yaxis=y_axis,
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='luther_films_annually({0}-{1})'.format(yr_start, yr_stop))
Out[12]:
In [13]:
def split_to_array(ser):
    '''
    ser -- one genre string, e.g. 'Action, Drama'
    Return: pandas Series with one entry per space-separated token,
            after trimming outer whitespace and deleting commas
    '''
    cleaned = ser.strip().replace(',', '')
    tokens = cleaned.split(' ')
    return pd.Series(np.array(tokens))
# One row per title, one column per genre token (NaN where a title has fewer genres)
genre_table = df['Genre'].apply(split_to_array)
# Flatten, drop the padding NaNs, and count occurrences of each genre (most frequent first)
flat_genres = pd.Series(genre_table.values.ravel()).dropna()
genres = flat_genres.value_counts().sort_values(ascending=False)
# Two-column frame for plotting
genre_ser = genres.reset_index()
genre_ser.columns = ['Genre', 'Count']
# Bar trace with one bar per genre
yellow_marker = dict(color=colors_dict['yellow'])
trace = go.Bar(x=genre_ser['Genre'], y=genre_ser['Count'], marker=yellow_marker)
In [14]:
# Chart of how often each genre label appears
data = [trace]
x_axis = {'title': 'Genre',
          'tickfont': {'size': 14, 'color': text_color}}
y_axis = {'title': 'Number of Classifications',
          'titlefont': {'size': 16, 'color': text_color},
          'tickfont': {'size': 14, 'color': text_color}}
layout = go.Layout(
    title='Count of Genre Classifications ({0}-{1})'.format(yr_start, yr_stop),
    xaxis=x_axis,
    yaxis=y_axis,
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='luther_genre_quantity({0}-{1})'.format(yr_start, yr_stop))
Out[14]:
In [15]:
def convert_frequency(ser, genres=genres):
    '''
    ser -- one genre string, e.g. 'Action, Drama'
    genres -- Series of counts indexed by genre label; the default is bound
              to the module-level `genres` at the moment this def is executed
    Return: the genre label from `ser` with the highest count in `genres`
    '''
    split_array = np.array(ser.strip().replace(',','').split(' '))
    # idxmax returns the index label in every pandas version; Series.argmax
    # returns an integer position on pandas >= 1.0, which would store numbers
    # instead of genre names.
    genre = genres.loc[split_array].idxmax()
    return genre
# add new column to dataframe classifying genre list as single genre of significance
# (convert_frequency is applied to each 'Genre' string; the result is stored per row)
df['Genre_Single'] = df['Genre'].apply(convert_frequency)
In [16]:
# Count how many titles ended up under each dominant genre
single_genre_counts = df['Genre_Single'].value_counts()
df_count = single_genre_counts.reset_index()
df_count.columns = ['Genre_Single', 'Count']
# Bar trace with one bar per dominant genre
yellow_marker = dict(color=colors_dict['yellow'])
trace = go.Bar(x=df_count['Genre_Single'], y=df_count['Count'], marker=yellow_marker)
In [17]:
# Chart of dominant-genre counts
data = [trace]
x_axis = {'title': 'Genre',
          'tickfont': {'size': 14, 'color': text_color}}
y_axis = {'title': 'Quantity of Classifications',
          'titlefont': {'size': 16, 'color': text_color},
          'tickfont': {'size': 14, 'color': text_color}}
layout = go.Layout(
    title='Quantity of Dominant Genre Classifications ({0}-{1})'.format(yr_start, yr_stop),
    xaxis=x_axis,
    yaxis=y_axis,
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='luther_dominant_genres({0}-{1})'.format(yr_start, yr_stop))
Out[17]:
In [18]:
def df_genre_limit(df, genres):
    '''
    df -- dataframe with a 'Genre_Single' column
    genres -- iterable of dominant genres to discard
    Return: rows of df whose 'Genre_Single' is NOT in genres
    '''
    is_cut = df['Genre_Single'].isin(genres)
    return df[~is_cut]
In [19]:
# Row count before dropping dominant genres; the baseline for the loss percentage
genre_before = df.shape[0]
print('{0} records in dataframe before trimming by genres'.format(genre_before))
In [20]:
# Drop titles whose dominant genre is in the cut list and report the loss
cut_genres = ['Romance', 'Western']
df = df_genre_limit(df, cut_genres)
genre_after = len(df)
str_genres = ', '.join(cut_genres)
genre_lost = genre_before - genre_after
genre_lost_pct = round(genre_lost/genre_before *100, 2)
print('{0} entries lost ({1}%) due to droppping dominant genres {2}'.format(
    genre_lost, genre_lost_pct, str_genres))
In [21]:
def get_stackedBar_trace(x_category, y_counts, _name, ind):
    '''
    x_category -- category from feature set
    y_counts -- count of x_category in feature set
    _name -- _name of x_category
    ind -- number indices for color list (wraps around when it exceeds the palette size)
    Return: Plotly data trace for bar chart
    '''
    # Wrap the index so features with more categories than colours reuse the
    # palette instead of raising IndexError; indices inside the palette are unchanged.
    palette_slot = ind % len(colors_lst)
    return go.Bar(x=x_category, y=y_counts, name=_name,
                  marker=dict(color=colors_lst[palette_slot]), opacity=0.8)
In [22]:
def get_stackedBar_traces(df, feature, count_label):
    '''
    df -- source dataframe
    feature -- column whose unique values each become one stacked trace
    count_label -- column whose value counts form the bars of each trace
    Return: list of Plotly bar traces, in order of first appearance of each feature value
    '''
    traces = []
    unique_values = df[feature].unique().tolist()
    for position, value in enumerate(unique_values):
        subset = df[df[feature] == value]
        tallies = subset[count_label].value_counts().to_dict()
        # sort by the counted label (e.g. year) so bars line up across traces
        ordered = sorted(tallies.items())
        labels = [pair[0] for pair in ordered]
        totals = [pair[1] for pair in ordered]
        traces.append(get_stackedBar_trace(labels, totals, value, position))
    return traces
In [23]:
def get_stackedBar(_dataframe, feature, count_label, _title, _x_title, _y_title, _filename='stackedBar'):
    '''
    _dataframe -- source dataframe
    feature -- column whose unique values become the stacked traces
    count_label -- column counted along the x axis
    _title -- chart title
    _x_title -- x axis title
    _y_title -- y axis title
    _filename -- plotly filename the figure is saved under
    Return: result of py.iplot for the stacked bar chart
    '''
    # Bind the traces to `data`: the original stored them in `date`, so the
    # figure silently used whatever global `data` an earlier cell left behind.
    data = get_stackedBar_traces(_dataframe, feature, count_label)
    layout = go.Layout(
        title=_title,
        xaxis=dict(
            title=_x_title,
            tickfont=dict(size=14, color='rgb(107, 107, 107)')
        ),
        yaxis=dict(
            title=_y_title,
            titlefont=dict(size=16, color='rgb(107, 107, 107)'),
            tickfont=dict(size=14, color='rgb(107, 107, 107)'),
            dtick=20,
        ),
        barmode='stack',)
    fig = go.Figure(data=data, layout=layout)
    return py.iplot(fig, filename=_filename)
In [24]:
# Stacked bar chart of dominant genres per release year
_title = 'Genres Annually ({0}-{1})'.format(yr_start, yr_stop)
_x_title = 'Year'
_y_title = 'Number of Films'
_filename = 'luther_stackedGenres_years({0}-{1})'.format(yr_start, yr_stop)
# Pass the filename computed above; the original passed the literal 'stackedBar' and left it unused
get_stackedBar(df, 'Genre_Single', 'Year', _title, _x_title, _y_title, _filename=_filename)
Out[24]:
In [27]:
# One stacked-bar trace per dominant genre, counting titles per year.
# The original loop called `make_bar_trace`, which is not defined anywhere in
# this notebook; get_stackedBar_traces performs the same per-genre tally
# using get_stackedBar_trace.
traces = get_stackedBar_traces(df, 'Genre_Single', 'Year')
In [28]:
# Stack the genre traces in reverse order
data = list(reversed(traces))
grey = 'rgb(107, 107, 107)'
x_axis = {'title': 'Year',
          'tickfont': {'size': 14, 'color': grey}}
y_axis = {'title': 'Number of Films',
          'titlefont': {'size': 16, 'color': grey},
          'tickfont': {'size': 14, 'color': grey},
          'dtick': 20}
layout = go.Layout(
    title='Genres Annually ({0}-{1})'.format(yr_start, yr_stop),
    xaxis=x_axis,
    yaxis=y_axis,
    barmode='stack')
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='stacked-bar')
In [29]:
# Count of titles per rating, shown as a table
rating_counts = df['Rated'].value_counts()
df_rated = rating_counts.reset_index()
df_rated.columns = ['Rated', 'Count']
df_rated
Out[29]:
In [30]:
def df_genre_limit(df, ratings):
    '''
    df -- dataframe with a 'Rated' column
    ratings -- iterable of ratings to discard
    Return: rows of df whose 'Rated' is NOT in ratings

    Note: this reuses the name of the genre-based filter defined earlier in
    the notebook, so from this cell on the name filters on 'Rated'.
    '''
    is_removed = df['Rated'].isin(ratings)
    return df[~is_removed]
In [31]:
# Drop ratings that are out of scope, then recount titles per remaining rating
ratings_remove = ['NOT RATED', 'X', 'TV-14', 'NC-17']
df = df_genre_limit(df, ratings_remove)
df_rated = df['Rated'].value_counts().reset_index()
df_rated.columns = ['Rated', 'Count']
# bar chart of ratings -- y must come from df_rated; the original read
# df_count['Count'], i.e. the dominant-genre counts, against the rating labels
rated_traces = go.Bar(x=df_rated['Rated'], y=df_rated['Count'], marker=dict(color=colors_dict['blue']))
In [32]:
# Chart of title counts per rating. The title, axis labels and filename were
# copied from the dominant-genre chart; the reused filename
# 'luther_dominant_genres(...)' would overwrite that plot on plotly.
data = [rated_traces]
layout = go.Layout(
    title='Quantity of Titles by Rating ({0}-{1})'.format(yr_start, yr_stop),
    xaxis=dict(
        title='Rating',
        tickfont=dict(size=14, color=text_color)),
    yaxis=dict(
        title='Number of Titles',
        titlefont=dict(size=16, color=text_color),
        tickfont=dict(size=14, color=text_color)),
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='luther_ratings({0}-{1})'.format(yr_start, yr_stop))
Out[32]:
In [33]:
# Row count before restricting to the kept ratings
rated_before = df.shape[0]
print('{0} records in dataframe before trimming by rating'.format(rated_before))
In [34]:
# Keep only the four listed ratings, then report the loss for this step and overall
ratings = ['PG-13', 'PG', 'G', 'R']
keep_rating = df['Rated'].isin(ratings)
df = df.loc[keep_rating]
rated_after = len(df)
rated_lost = rated_before - rated_after
print('{0} entries lost ({1}%) due to limiting to only {2} ratings'.format(
    rated_lost, round(rated_lost/rated_before *100, 2), ', '.join(ratings)))
total_lost = yr_before - rated_after
print('{0} entries lost total ({1}%)'.format(
    total_lost, round(total_lost/yr_before *100, 2)))
In [35]:
# Build composite text labels: genre + rating, then that label joined with runtime / budget
genre_and_rating = df['Genre_Single'] + ' ' + df['Rated']
df['Genre_Rated'] = genre_and_rating
df['Gen_Rat_Run'] = df['Genre_Rated'] + ' ' + df['Runtime'].apply(str)
df['Gen_Rat_Bud'] = df['Genre_Rated'] + ' ' + df['Prod_Budget'].apply(str)
# Short alias of the dominant-genre column, used as the scatter-matrix index below
df['Gen_Sin'] = df['Genre_Single']
In [36]:
# Display the column index of df after the derived label columns were added
df.columns
Out[36]:
In [37]:
# Palette without its last two colours, reversed; flipped back again when passed as colormap
colors_scat = list(reversed(colors_lst[:-2]))
scatter_columns = ['Prod_Budget', 'Runtime', 'Gen_Rat_Bud', 'Gen_Rat_Run', 'Gen_Sin']
df_scat = df[scatter_columns]
fig = FF.create_scatterplotmatrix(
    df_scat,
    diag='histogram',
    index='Gen_Sin',
    height=1000,
    width=1000,
    colormap=colors_scat[::-1])
py.iplot(fig, filename='Luther Scatterplot Matrix')
Out[37]:
In [38]:
# Natural-log versions of budget, runtime and torrent count
for log_col, raw_col in (('Log_Prod_Bud', 'Prod_Budget'),
                         ('Log_Runtime', 'Runtime'),
                         ('Log_Ttl_Tor', 'Total_Torrents')):
    df[log_col] = np.log(df[raw_col])
In [39]:
# Scatter matrix over the log-transformed columns, coloured by dominant genre
colors_scat = list(reversed(colors_lst[:-2]))
log_scatter_columns = ['Log_Ttl_Tor', 'Log_Prod_Bud', 'Log_Runtime',
                       'Gen_Rat_Bud', 'Gen_Rat_Run', 'Gen_Sin']
df_scat = df[log_scatter_columns]
fig = FF.create_scatterplotmatrix(
    df_scat,
    diag='histogram',
    index='Gen_Sin',
    height=1000,
    width=1000,
    colormap=colors_scat[::-1])
# assign to _ so the cell shows no output
_ = py.iplot(fig, filename='Log Luther Scatterplot Matrix')
In [40]:
# Drama-only subset with a fresh 0..n-1 index (drop=True discards the old index
# instead of materialising it as an 'index' column that then has to be dropped)
df_drama = df[df['Genre_Single'] == 'Drama'].reset_index(drop=True)
# Build the label from df_drama's own columns. The original read from `df`,
# whose index no longer matches the re-indexed df_drama, so values were
# aligned to the wrong rows (or NaN).
df_drama['Log_Bud_Rated'] = df_drama['Log_Prod_Bud'].apply(lambda x: str(x)) + ' ' + df_drama['Rated']
In [41]:
# Scatter matrix for the Drama subset on the untransformed columns.
# Disabled log-column alternative kept from the original:
#df_scat = df_drama[['Log_Ttl_Tor', 'Log_Prod_Bud', 'Log_Runtime', 'Log_Bud_Rated', 'Gen_Sin']]
drama_columns = ['Total_Torrents', 'Prod_Budget', 'Runtime', 'Rated', 'Gen_Sin']
df_scat = df_drama[drama_columns]
fig = FF.create_scatterplotmatrix(
    df_scat,
    diag='histogram',
    index='Gen_Sin',
    height=1000,
    width=1000,
    colormap=colors_scat[::-1])
# assign to _ so the cell shows no output
_ = py.iplot(fig, filename='Log Drama Luther Scatterplot Matrix')
In [42]:
# Build the response (y) and design matrix (x) for the untransformed linear model.
# NOTE(review): `df_sub` is not assigned in any visible cell of this notebook --
# presumably a subset of `df` from a deleted cell; confirm before running on a fresh kernel.
from patsy import dmatrices
patsy_formula = 'Total_Torrents ~ Prod_Budget + Year + Month + Runtime + Genre_Single'
y, x = dmatrices(patsy_formula, data=df_sub, return_type='dataframe')
In [43]:
# Fit an OLS model on the patsy matrices and display the statsmodels summary;
# `results` is reused by the plot_regress_exog cell further down.
import statsmodels.api as sm
model = sm.OLS(y, x)
results = model.fit()
results.summary()
In [44]:
# Fit the same model with scikit-learn and keep its R^2 and coefficients
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x, y)
mod_lr_score = model.score(x, y)
mod_lr_coef = model.coef_
# LinearRegression has no `results` attribute (the original `model.results`
# raised AttributeError); display the stored score and coefficients instead
mod_lr_score, mod_lr_coef
In [45]:
# 80/20 hold-out evaluation of the linear model
from sklearn import cross_validation as cv
from sklearn import metrics
x_train, x_test, y_train, y_test = cv.train_test_split(x,y,test_size=0.20,random_state=1234)
model = LinearRegression().fit(x_train, y_train)
# store results -- measured on the held-out 20%; the original scored the
# training rows, which left the test split unused
mean_sq_err = metrics.mean_squared_error(y_test, model.predict(x_test))
cv_mod_score = model.score(x_test, y_test)
In [46]:
# reset x, y otherwise errors occur
y, x = dmatrices(patsy_formula, data=df_sub, return_type='dataframe')
from sklearn.cross_validation import KFold
# Size the folds from the design matrix, which is what .iloc indexes below,
# and seed the shuffle so the reported numbers are reproducible
kf = KFold(len(x), n_folds=10, shuffle=True, random_state=1234)
fold_mse = []
fold_r2 = []
for train_index, test_index in kf:
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf2 = LinearRegression().fit(x_train, y_train)
    # Score each fold's own model on its held-out rows. The original used the
    # stale `model` from the hold-out cell on training rows and scored clf2 on
    # the full data.
    fold_mse.append(metrics.mean_squared_error(y_test, clf2.predict(x_test)))
    fold_r2.append(clf2.score(x_test, y_test))
# store results -- averaged over the 10 folds rather than keeping only the last one
mean_sq_errKf = np.mean(fold_mse)
cvKf_mod_score = np.mean(fold_r2)
In [47]:
# NORMAL RESULTS: print every stored metric for the untransformed model
normal_report = [
    ('Model Linear Regression Score = {0}', mod_lr_score),
    (' Mean Square Error = {0}', mean_sq_err),
    (' Cross Validation Model Score = {0}', cv_mod_score),
    (' Mean Squred Error K-Fold = {0}', mean_sq_errKf),
    ('Cross Val. K-Fold Model Score = {0}', cvKf_mod_score),
]
for template, value in normal_report:
    print(template.format(value))
In [48]:
# Regression diagnostic plots for the 'Prod_Budget' regressor of the fitted `results`.
# NOTE(review): `plt` is never imported in the visible cells (presumably
# matplotlib.pyplot) -- confirm and add the import to the first cell.
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(results,'Prod_Budget', fig=fig)
In [49]:
# Display the column index of df
df.columns
Out[49]:
In [50]:
# Add log-transformed budget and torrent-count columns to df_sub
df_sub['log_budg'] = np.log(df_sub['Prod_Budget'])
#df_sub['log_year']=np.log(df_sub.Year)
#df_sub['log_run']=np.log(df_sub.Runtime)
df_sub['log_tor'] = np.log(df_sub['Total_Torrents'])
# Pairwise scatter matrix of the transformed columns plus Year
trans_columns = ['log_budg', 'Year', 'log_tor']
trans = df_sub[trans_columns]
plt.rcParams['figure.figsize'] = (15, 15)
pd.tools.plotting.scatter_matrix(trans)
In [51]:
# Rebuild y / x for the log model: log torrents explained by log budget, Year and Month
log_patsy_formula = 'log_tor ~ log_budg + Year + Month'
y, x = dmatrices(log_patsy_formula, data=df_sub, return_type='dataframe')
In [52]:
# Scatter matrix of the log-model variables, grouped by Month
import plotly.plotly as py
from plotly.tools import FigureFactory as FF
log_model_columns = ['log_budg', 'Year', 'Month', 'log_tor']
df_a = df_sub[log_model_columns]
fig = FF.create_scatterplotmatrix(
    df_a,
    diag='histogram',
    index='Month',
    height=800,
    width=800)
py.iplot(fig, filename='Histograms along Diagonal Subplots')
In [53]:
# Fit the log model through the statsmodels formula API and display its summary;
# this rebinds `results`, which the plot_regress_exog cells at the end read.
import statsmodels.formula.api as smf
results = smf.ols(formula=log_patsy_formula, data=df_sub,).fit()
results.summary()
In [54]:
# Fit the log model with scikit-learn on the full design matrix
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(x, y)
# R^2 on the same data the model was fitted on
log_mod_lr_score = model.score(x, y)
In [55]:
# 80/20 hold-out evaluation of the log model
from sklearn import cross_validation as cv
from sklearn import metrics
x_train, x_test, y_train, y_test = cv.train_test_split(x,y,test_size=0.20,random_state=1234)
model = LinearRegression().fit(x_train, y_train)
# store results -- measured on the held-out 20%; the original scored the
# training rows, which left the test split unused
log_mean_sq_err = metrics.mean_squared_error(y_test, model.predict(x_test))
log_cv_mod_score = model.score(x_test, y_test)
In [56]:
# reset x, y otherwise errors occur
y, x = dmatrices(log_patsy_formula, data=df_sub, return_type='dataframe')
from sklearn.cross_validation import KFold
# Size the folds from the design matrix, which is what .iloc indexes below,
# and seed the shuffle so the reported numbers are reproducible
kf = KFold(len(x), n_folds=10, shuffle=True, random_state=1234)
log_fold_mse = []
log_fold_r2 = []
for train_index, test_index in kf:
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf2 = LinearRegression().fit(x_train, y_train)
    # Score each fold's own model on its held-out rows. The original used the
    # stale `model` from the hold-out cell on training rows and scored clf2 on
    # the full data.
    log_fold_mse.append(metrics.mean_squared_error(y_test, clf2.predict(x_test)))
    log_fold_r2.append(clf2.score(x_test, y_test))
# store results -- averaged over the 10 folds rather than keeping only the last one
log_mean_sq_errKf = np.mean(log_fold_mse)
log_cvKf_mod_score = np.mean(log_fold_r2)
In [57]:
# LOG RESULTS: print every stored metric for the log-transformed model
log_report = [
    ('Log Model Linear Regression Score = {0}', log_mod_lr_score),
    (' Log Mean Square Error = {0}', log_mean_sq_err),
    (' Log Cross Validation Model Score = {0}', log_cv_mod_score),
    (' Log Mean Squred Error K-Fold = {0}', log_mean_sq_errKf),
    ('Log Cross Val. K-Fold Model Score = {0}', log_cvKf_mod_score),
]
for template, value in log_report:
    print(template.format(value))
In [58]:
# Load the separate test file and add the same log-transformed columns used for training
df_TEST = pd.read_csv('data/test_data2.csv', encoding='latin-1')
for log_col, raw_col in (('log_budg', 'Prod_Budget'),
                         ('log_run', 'Runtime'),
                         ('log_tor', 'Total_Torrents')):
    df_TEST[log_col] = np.log(df_TEST[raw_col])
def split_to_array(ser):
    '''
    ser -- one genre string, e.g. 'Action, Drama'
    Return: pandas Series with one entry per space-separated token,
            after trimming outer whitespace and deleting commas
    '''
    cleaned = ser.strip().replace(',', '')
    tokens = cleaned.split(' ')
    return pd.Series(np.array(tokens))
# Genre frequency table for the test set. The original read `df_yr.Genre`, but
# df_yr as built in this notebook only has 'Year' and 'Count' columns, so that
# raised AttributeError.
# NOTE(review): df_TEST is used so every genre looked up below exists in the
# table; switch to the training frame if training frequencies were intended.
genres = df_TEST.Genre.apply(split_to_array)
genres = pd.Series(genres.values.ravel()).dropna()
genres = genres.value_counts().sort_values(ascending=False)
def convert_frequency(ser, genres=genres):
    '''
    ser -- one genre string, e.g. 'Action, Drama'
    genres -- Series of counts indexed by genre label; the default is bound
              to the module-level `genres` at the moment this def is executed
    Return: the genre label from `ser` with the highest count in `genres`
    '''
    split_array = np.array(ser.strip().replace(',','').split(' '))
    # idxmax returns the index label in every pandas version; Series.argmax
    # returns an integer position on pandas >= 1.0, which would store numbers
    # instead of genre names.
    genre = genres.loc[split_array].idxmax()
    return genre
df_TEST['Genre_Single'] = df_TEST.Genre.apply(convert_frequency)
log_patsy_formula_test = 'log_tor ~ log_budg + Year + Month + log_run + Genre_Single'
# `model` and `clf2` were fitted on the design matrix of log_patsy_formula, so
# the test matrix must be built from that same formula to have matching columns.
# NOTE(review): log_patsy_formula_test is kept for reference; refit on it if
# the extra predictors are meant to be used.
y, x = dmatrices(log_patsy_formula, data=df_TEST, return_type='dataframe')
# Evaluate on the test-file matrices; the original scored the stale
# x_test / y_test left over from the last K-fold split and never used df_TEST.
print(clf2.score(x, y))
print(metrics.mean_squared_error(y, model.predict(x)))
In [59]:
# Plot model.predict(x) against y as blue dots; assigning to _ hides the returned artists.
# NOTE(review): `plt` is never imported in the visible cells (presumably matplotlib.pyplot) -- confirm.
_ = plt.plot(y, model.predict(x), 'bo')
In [60]:
# Matplotlib bar chart of counts per year.
# NOTE(review): `plt`, `yr_dict` and `yr_lst` are not defined in any visible cell.
# yr_lst is unpacked below as (year, count) pairs; presumably both came from a
# deleted cell -- confirm, or rebuild them from the year counts before running.
plt.figure(figsize=(25,10))
ind = np.arange(len(yr_dict))
width = 0.35
bar_year = [year for year, count in yr_lst]
bar_count = [count for year, count in yr_lst]
plt.bar(ind, bar_count, width, color='r')
plt.ylabel('Count')
plt.xlabel('Year')
plt.title('Number of Torrents per Year')
# centre each year label under its bar
plt.xticks(ind + width/2., (bar_year), rotation='vertical')
# fixed y ticks from 0 to 90 in steps of 5
plt.yticks(np.arange(0, 91, 5))
plt.show()
In [61]:
# Formula with all predictors (same string as log_patsy_formula_test): 'log_tor ~ log_budg + Year + Month + log_run + Genre_Single'
In [62]:
# Regression diagnostic plots for the 'log_budg' regressor of the fitted `results`.
# NOTE(review): `plt` is never imported in the visible cells (presumably matplotlib.pyplot) -- confirm.
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(results,'log_budg', fig=fig)
In [63]:
# Regression diagnostic plots for the 'Year' regressor of the fitted `results`.
# NOTE(review): `plt` is never imported in the visible cells (presumably matplotlib.pyplot) -- confirm.
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(results,'Year', fig=fig)
In [64]:
# Regression diagnostic plots for the 'Month' regressor of the fitted `results`.
# NOTE(review): `plt` is never imported in the visible cells (presumably matplotlib.pyplot) -- confirm.
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(results,'Month', fig=fig)
In [ ]:
In [ ]: