In [2]:

    
import pandas as pd
import numpy as np
import string
from collections import defaultdict
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
matplotlib.style.use('ggplot')

import plotly
import plotly.plotly as py
plotly.tools.set_credentials_file(username='bbiggs', api_key='jrgs2lfv3u')
import seaborn as sns

Read in TRAIN data set and select pertinent columns



In [3]:

    
df = pd.read_csv('data/test_data2.csv', encoding='latin-1')
print(len(df))
df.head()









    



332






    Out[3]:






  
    
      
      Title
      Prod_Budget
      Dom_Gross
      World_Gross
      Released
      Year
      Month
      Rated
      Runtime
      Genre
      Director
      Actors
      Total_Torrents
    
  
  
    
      0
      Lakeview Terrace
      20000000
      39263506
      44821299
      2008-09-19
      2008.0
      9.0
      PG-13
      110
      Crime, Drama, Thriller
      Neil LaBute
      Samuel L. Jackson, Patrick Wilson, Kerry Washi...
      291.0
    
    
      1
      Phantasm II
      3000000
      7000000
      7000000
      1988-07-08
      1988.0
      7.0
      R
      97
      Action, Fantasy, Horror
      Don Coscarelli
      James Le Gros, Reggie Bannister, Angus Scrimm,...
      34.0
    
    
      2
      They Live
      4000000
      13000000
      13000000
      1988-11-04
      1988.0
      11.0
      R
      93
      Sci-Fi, Thriller
      John Carpenter
      Roddy Piper, Keith David, Meg Foster, George '...
      138.0
    
    
      3
      Employee of the Month
      10000000
      28444855
      38364855
      2006-10-06
      2006.0
      10.0
      PG-13
      103
      Comedy, Romance
      Greg Coolidge
      Dane Cook, Jessica Simpson, Dax Shepard, Andy ...
      91.0
    
    
      4
      The Transporter Refueled
      22000000
      16029670
      69698495
      2015-09-04
      2015.0
      9.0
      PG-13
      96
      Action, Crime, Thriller
      Camille Delamarre
      Ed Skrein, Ray Stevenson, Loan Chabanol, Gabri...
      265.0

Convert dates to datetime objects



In [4]:

    
df['Released'] = pd.to_datetime(df['Released'])
df['Year'] = pd.DatetimeIndex(df['Released']).year
df['Month'] = pd.DatetimeIndex(df['Released']).month
df.head()









    Out[4]:






  
    
      
      Title
      Prod_Budget
      Dom_Gross
      World_Gross
      Released
      Year
      Month
      Rated
      Runtime
      Genre
      Director
      Actors
      Total_Torrents
    
  
  
    
      0
      Lakeview Terrace
      20000000
      39263506
      44821299
      2008-09-19
      2008
      9
      PG-13
      110
      Crime, Drama, Thriller
      Neil LaBute
      Samuel L. Jackson, Patrick Wilson, Kerry Washi...
      291.0
    
    
      1
      Phantasm II
      3000000
      7000000
      7000000
      1988-07-08
      1988
      7
      R
      97
      Action, Fantasy, Horror
      Don Coscarelli
      James Le Gros, Reggie Bannister, Angus Scrimm,...
      34.0
    
    
      2
      They Live
      4000000
      13000000
      13000000
      1988-11-04
      1988
      11
      R
      93
      Sci-Fi, Thriller
      John Carpenter
      Roddy Piper, Keith David, Meg Foster, George '...
      138.0
    
    
      3
      Employee of the Month
      10000000
      28444855
      38364855
      2006-10-06
      2006
      10
      PG-13
      103
      Comedy, Romance
      Greg Coolidge
      Dane Cook, Jessica Simpson, Dax Shepard, Andy ...
      91.0
    
    
      4
      The Transporter Refueled
      22000000
      16029670
      69698495
      2015-09-04
      2015
      9
      PG-13
      96
      Action, Crime, Thriller
      Camille Delamarre
      Ed Skrein, Ray Stevenson, Loan Chabanol, Gabri...
      265.0



In [5]:

    
import plotly.plotly as py
from plotly.tools import FigureFactory as FF

df_a = df[['Prod_Budget', 'Year', 'Month', 'Total_Torrents']]

fig = FF.create_scatterplotmatrix(df_a, diag='box', index='Prod_Budget', colormap='Portland', colormap_type='seq', endpts=[-1, 0, 1], height=800, width=800, size=12, marker=dict(symbol=25)) py.iplot(fig, filename = 'Partition Numeric Data into Intervals')



In [6]:

    
import plotly.plotly as py
from plotly.tools import FigureFactory as FF

df_a = df[['Prod_Budget', 'Year', 'Month', 'Total_Torrents']]
fig = FF.create_scatterplotmatrix(df_a, diag='histogram', index='Prod_Budget',
                                  height=800, width=800)
py.iplot(fig, filename='Histograms along Diagonal Subplots')









    



This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]  [ (1,3) x3,y3 ]
[ (2,1) x4,y4 ]  [ (2,2) x5,y5 ]  [ (2,3) x6,y6 ]
[ (3,1) x7,y7 ]  [ (3,2) x8,y8 ]  [ (3,3) x9,y9 ]







    Out[6]:



In [7]:

    
# look at current data set AFTER year cutoff
plt.rcParams['figure.figsize'] = (15, 15)
_ = pd.tools.plotting.scatter_matrix(df_yr)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-7-aee19d6f05e7> in <module>()
      1 # look at current data set AFTER year cutoff
      2 plt.rcParams['figure.figsize'] = (15, 15)
----> 3 _ = pd.tools.plotting.scatter_matrix(df_yr)

NameError: name 'df_yr' is not defined



In [8]:

    
# unique list of grouped genres as strings
unq_genres = df_yr['Genre'].unique()
unq_genres = unq_genres.tolist()

#print(len(unq_genres))
#print(unq_genres[:10])

# unique list of grouped genres as lists
lst_grp_genres = []
for lst in unq_genres:
    temp = []
    for genre in lst.split(','):
         temp.append(genre)
    lst_grp_genres.append(temp)

#print(len(lst_grp_genres))
#print(lst_grp_genres)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-04efbc8f959b> in <module>()
      1 # unique list of grouped genres as strings
----> 2 unq_genres = df_yr['Genre'].unique()
      3 unq_genres = unq_genres.tolist()
      4 
      5 #print(len(unq_genres))

NameError: name 'df_yr' is not defined



In [9]:

    
# unique list of individual genres
ind_genre = set()

for lst in unq_genres:
    for genre in lst.split(','):
        ind_genre.add(genre.strip())
ind_genre = sorted(ind_genre)

#print(len(ind_genre))
#print(ind_genre)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-9-cfb997e10f21> in <module>()
      2 ind_genre = set()
      3 
----> 4 for lst in unq_genres:
      5     for genre in lst.split(','):
      6         ind_genre.add(genre.strip())

NameError: name 'unq_genres' is not defined



In [10]:

    
from patsy import dmatrices
patsy_formula = 'Total_Torrents ~ Prod_Budget + Year + Genre_Single'
y, x = dmatrices(patsy_formula, data=df_sub, return_type='dataframe')









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-e3338ec6fb51> in <module>()
      1 from patsy import dmatrices
      2 patsy_formula = 'Total_Torrents ~ Prod_Budget + Year + Genre_Single'
----> 3 y, x = dmatrices(patsy_formula, data=df_sub, return_type='dataframe')

NameError: name 'df_sub' is not defined



In [45]:

    
import statsmodels.api as sm
model = sm.OLS(y, x)
results = model.fit()
results.summary()









    Out[45]:





OLS Regression Results

  Dep. Variable:      Total_Torrents     R-squared:             0.604 


  Model:                    OLS          Adj. R-squared:        0.602 


  Method:              Least Squares     F-statistic:           245.9 


  Date:              Fri, 15 Jul 2016    Prob (F-statistic):  2.09e-159


  Time:                  10:16:12        Log-Likelihood:      -5006.2 


  No. Observations:          812         AIC:                1.002e+04


  Df Residuals:              806         BIC:                1.005e+04


  Df Model:                    5                                      


  Covariance Type:       nonrobust                                    




                               coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  Intercept                  -4.461e+04   1769.933    -25.205   0.000  -4.81e+04 -4.11e+04


  Genre_Single[T.Adventure]    -83.9253     27.968     -3.001   0.003   -138.825   -29.026


  Genre_Single[T.Comedy]       -96.9206     12.151     -7.976   0.000   -120.772   -73.069


  Genre_Single[T.Drama]        -59.5208     12.595     -4.726   0.000    -84.244   -34.797


  Prod_Budget                 1.566e-06   9.88e-08     15.851   0.000   1.37e-06  1.76e-06


  Year                          22.3358      0.882     25.317   0.000     20.604    24.068




  Omnibus:        137.527    Durbin-Watson:         2.063


  Prob(Omnibus):   0.000     Jarque-Bera (JB):    348.609


  Skew:            0.886     Prob(JB):           2.00e-76


  Kurtosis:        5.676     Cond. No.           3.08e+10



In [9]:









    



This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]  [ (1,3) x3,y3 ]
[ (2,1) x4,y4 ]  [ (2,2) x5,y5 ]  [ (2,3) x6,y6 ]
[ (3,1) x7,y7 ]  [ (3,2) x8,y8 ]  [ (3,3) x9,y9 ]







    Out[9]:



In [46]:

    
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x, y)
mod_lr_score = model.score(x, y)
mod_lr_coef = model.coef_



In [1]:

    
from sklearn import cross_validation as cv
from sklearn import metrics

x_train, x_test, y_train, y_test = cv.train_test_split(x,y,test_size=0.20,random_state=1234)

model = LinearRegression().fit(x_train, y_train)
model.summary
# store results
mean_sq_err = metrics.mean_squared_error(y_train,model.predict(x_train))
cv_mod_score = model.score(x_train, y_train)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-806c61cb26a7> in <module>()
      2 from sklearn import metrics
      3 
----> 4 x_train, x_test, y_train, y_test = cv.train_test_split(x,y,test_size=0.20,random_state=1234)
      5 
      6 model = LinearRegression().fit(x_train, y_train)

NameError: name 'x' is not defined



In [48]:

    
# reset x, y otherwise errors occur
y, x = dmatrices(patsy_formula, data=df_sub, return_type='dataframe')

from sklearn.cross_validation import KFold
kf = KFold(len(df_sub), n_folds=10, shuffle=True)

for train_index, test_index in kf:
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf2 = LinearRegression().fit(x.iloc[train_index], y.iloc[train_index])

# store results
mean_sq_errKf = metrics.mean_squared_error(y_train,model.predict(x_train))
cvKf_mod_score = clf2.score(x,y)



In [49]:

    
#NORMAL RESULTS
print('Model Linear Regression Score = {0}'.format(mod_lr_score))
print('            Mean Square Error = {0}'.format(mean_sq_err))
print(' Cross Validation Model Score = {0}'.format(cv_mod_score))
print('     Mean Squred Error K-Fold = {0}'.format(mean_sq_errKf))
print('Cross Val. K-Fold Model Score = {0}'.format(cvKf_mod_score))









    



Model Linear Regression Score = 0.6040207184539521
            Mean Square Error = 13255.981394370103
 Cross Validation Model Score = 0.6023914268766934
     Mean Squred Error K-Fold = 13390.435101340325
Cross Val. K-Fold Model Score = 0.6039762636127388



In [50]:

    
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(results,'Prod_Budget', fig=fig)



In [51]:

    
_ = plt.plot(y, model.predict(x), 'ro')

Log Transform



In [52]:

    
df.columns









    Out[52]:





Index(['Title', 'Prod_Budget', 'Released', 'Year', 'Month', 'Rated', 'Runtime',
       'Genre', 'Director', 'Actors', 'Total_Torrents'],
      dtype='object')



In [53]:

    
df_sub['log_budg']=np.log(df_sub.Prod_Budget)
#df_sub['log_year']=np.log(df_sub.Year)
#df_sub['log_run']=np.log(df_sub.Runtime)
df_sub['log_tor']=np.log(df_sub.Total_Torrents)

trans = df_sub[['log_budg', 'Year', 'log_tor']]
plt.rcParams['figure.figsize'] = (15, 15)
_ = pd.tools.plotting.scatter_matrix(trans)



In [54]:

    
log_patsy_formula = 'log_tor ~ log_budg + Year + Genre_Single'
y, x = dmatrices(log_patsy_formula, data=df_sub, return_type='dataframe')



In [55]:

    
import statsmodels.formula.api as smf
results = smf.ols(formula=log_patsy_formula, data=df_sub,).fit()
results.summary()









    Out[55]:





OLS Regression Results

  Dep. Variable:          log_tor        R-squared:             0.631 


  Model:                    OLS          Adj. R-squared:        0.629 


  Method:              Least Squares     F-statistic:           275.8 


  Date:              Fri, 15 Jul 2016    Prob (F-statistic):  9.06e-172


  Time:                  10:16:50        Log-Likelihood:      -598.39 


  No. Observations:          812         AIC:                   1209. 


  Df Residuals:              806         BIC:                   1237. 


  Df Model:                    5                                      


  Covariance Type:       nonrobust                                    




                               coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  Intercept                   -243.2689      7.746    -31.407   0.000   -258.473  -228.065


  Genre_Single[T.Adventure]     -0.2497      0.122     -2.048   0.041     -0.489    -0.010


  Genre_Single[T.Comedy]        -0.4982      0.053     -9.442   0.000     -0.602    -0.395


  Genre_Single[T.Drama]         -0.2787      0.055     -5.101   0.000     -0.386    -0.171


  log_budg                       0.2583      0.021     12.158   0.000      0.217     0.300


  Year                           0.1217      0.004     31.588   0.000      0.114     0.129




  Omnibus:         6.785    Durbin-Watson:         2.164


  Prob(Omnibus):   0.034    Jarque-Bera (JB):      6.813


  Skew:           -0.224    Prob(JB):             0.0332


  Kurtosis:        2.998    Cond. No.           8.73e+05



In [56]:

    
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x, y)

# store results
log_mod_lr_score = model.score(x,y)



In [57]:

    
from sklearn import cross_validation as cv
from sklearn import metrics

x_train, x_test, y_train, y_test = cv.train_test_split(x,y,test_size=0.20,random_state=1234)

model = LinearRegression().fit(x_train, y_train)

# store results
log_mean_sq_err = metrics.mean_squared_error(y_train,model.predict(x_train))
log_cv_mod_score = model.score(x_train, y_train)



In [58]:

    
# reset x, y otherwise errors occur
y, x = dmatrices(log_patsy_formula, data=df_sub, return_type='dataframe')

from sklearn.cross_validation import KFold
kf = KFold(len(df_sub), n_folds=10, shuffle=True)

for train_index, test_index in kf:
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf2 = LinearRegression().fit(x.iloc[train_index], y.iloc[train_index])

# store results
log_mean_sq_errKf = metrics.mean_squared_error(y_train,model.predict(x_train))
log_cvKf_mod_score = clf2.score(x,y)



In [59]:

    
#LOG RESULTS
print('Log Model Linear Regression Score = {0}'.format(log_mod_lr_score))
print('            Log Mean Square Error = {0}'.format(log_mean_sq_err))
print(' Log Cross Validation Model Score = {0}'.format(log_cv_mod_score))
print('     Log Mean Squred Error K-Fold = {0}'.format(log_mean_sq_errKf))
print('Log Cross Val. K-Fold Model Score = {0}'.format(log_cvKf_mod_score))









    



Log Model Linear Regression Score = 0.6310883914653996
            Log Mean Square Error = 0.24651528811136103
 Log Cross Validation Model Score = 0.6410213030729475
     Log Mean Squred Error K-Fold = 0.25022641720858635
Log Cross Val. K-Fold Model Score = 0.6303238660429156



In [60]:

    
_ = plt.plot(y, model.predict(x), 'bo')



In [61]:

    
plt.figure(figsize=(25,10))

ind = np.arange(len(yr_dict))
width = 0.35

bar_year = [year for year, count in yr_lst]
bar_count = [count for year, count in yr_lst]

plt.bar(ind, bar_count, width, color='r')

plt.ylabel('Count')
plt.xlabel('Year')
plt.title('Number of Torrents per Year')
plt.xticks(ind + width/2., (bar_year), rotation='vertical')
plt.yticks(np.arange(0, 91, 5))

plt.show()



In [62]:

    
#log_tor ~ log_budg + Year + Month + log_run + Genre_Single'



In [63]:

    
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(results,'log_budg', fig=fig)



In [64]:

    
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(results,'Year', fig=fig)



In [66]:

    
#fig = plt.figure(figsize=(12,8))
#fig = sm.graphics.plot_regress_exog(results,'Month', fig=fig)



In [71]:

    
plt.figure(figsize=(15,10))
plt.axis([0, 450000000, 0, 1500])
plt.xlabel('Production Budget')
plt.ylabel('Torrents')
plt.title('Torrents vs. Production Budget - All')
plt.scatter(df_sub['Prod_Budget'], df_sub['Total_Torrents'], alpha=0.5, s=50)









    Out[71]:





<matplotlib.collections.PathCollection at 0x10fdf7198>



In [10]:

    
import plotly.plotly as py
from plotly.tools import FigureFactory as FF

df_yr = df[['Prod_Budget', 'Year', 'Month', 'Total_Torrents']]
fig = FF.create_scatterplotmatrix(df_sub, diag='histogram', index='Prod_Budget',
                                  height=800, width=800)
py.iplot(fig, filename='Histograms along Diagonal Subplots')









    



This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]  [ (1,3) x3,y3 ]
[ (2,1) x4,y4 ]  [ (2,2) x5,y5 ]  [ (2,3) x6,y6 ]
[ (3,1) x7,y7 ]  [ (3,2) x8,y8 ]  [ (3,3) x9,y9 ]







    Out[10]:



In [ ]:

	Title	Prod_Budget	Dom_Gross	World_Gross	Released	Year	Month	Rated	Runtime	Genre	Director	Actors	Total_Torrents
0	Lakeview Terrace	20000000	39263506	44821299	2008-09-19	2008.0	9.0	PG-13	110	Crime, Drama, Thriller	Neil LaBute	Samuel L. Jackson, Patrick Wilson, Kerry Washi...	291.0
1	Phantasm II	3000000	7000000	7000000	1988-07-08	1988.0	7.0	R	97	Action, Fantasy, Horror	Don Coscarelli	James Le Gros, Reggie Bannister, Angus Scrimm,...	34.0
2	They Live	4000000	13000000	13000000	1988-11-04	1988.0	11.0	R	93	Sci-Fi, Thriller	John Carpenter	Roddy Piper, Keith David, Meg Foster, George '...	138.0
3	Employee of the Month	10000000	28444855	38364855	2006-10-06	2006.0	10.0	PG-13	103	Comedy, Romance	Greg Coolidge	Dane Cook, Jessica Simpson, Dax Shepard, Andy ...	91.0
4	The Transporter Refueled	22000000	16029670	69698495	2015-09-04	2015.0	9.0	PG-13	96	Action, Crime, Thriller	Camille Delamarre	Ed Skrein, Ray Stevenson, Loan Chabanol, Gabri...	265.0

Dep. Variable:	Total_Torrents	R-squared:	0.604
Model:	OLS	Adj. R-squared:	0.602
Method:	Least Squares	F-statistic:	245.9
Date:	Fri, 15 Jul 2016	Prob (F-statistic):	2.09e-159
Time:	10:16:12	Log-Likelihood:	-5006.2
No. Observations:	812	AIC:	1.002e+04
Df Residuals:	806	BIC:	1.005e+04
Df Model:	5
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
Intercept	-4.461e+04	1769.933	-25.205	0.000	-4.81e+04 -4.11e+04
Genre_Single[T.Adventure]	-83.9253	27.968	-3.001	0.003	-138.825 -29.026
Genre_Single[T.Comedy]	-96.9206	12.151	-7.976	0.000	-120.772 -73.069
Genre_Single[T.Drama]	-59.5208	12.595	-4.726	0.000	-84.244 -34.797
Prod_Budget	1.566e-06	9.88e-08	15.851	0.000	1.37e-06 1.76e-06
Year	22.3358	0.882	25.317	0.000	20.604 24.068

Omnibus:	137.527	Durbin-Watson:	2.063
Prob(Omnibus):	0.000	Jarque-Bera (JB):	348.609
Skew:	0.886	Prob(JB):	2.00e-76
Kurtosis:	5.676	Cond. No.	3.08e+10

Dep. Variable:	log_tor	R-squared:	0.631
Model:	OLS	Adj. R-squared:	0.629
Method:	Least Squares	F-statistic:	275.8
Date:	Fri, 15 Jul 2016	Prob (F-statistic):	9.06e-172
Time:	10:16:50	Log-Likelihood:	-598.39
No. Observations:	812	AIC:	1209.
Df Residuals:	806	BIC:	1237.
Df Model:	5
Covariance Type:	nonrobust

Omnibus:	6.785	Durbin-Watson:	2.164
Prob(Omnibus):	0.034	Jarque-Bera (JB):	6.813
Skew:	-0.224	Prob(JB):	0.0332
Kurtosis:	2.998	Cond. No.	8.73e+05