In [106]:
import time

import pandas as pd
import numpy as np

import statsmodels.api as sm    #statsmodels for statistical models
import scipy.stats    #scipy.stats for density estimation and other statistical functions
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LassoLarsCV
from sklearn.linear_model import LassoLarsIC
from sklearn.cross_validation import train_test_split    # note: lives in sklearn.model_selection in newer scikit-learn
from patsy import dmatrices

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [107]:
# load dataframe
pickle_name = 'consolidated.pickle'
df = pd.read_pickle(pickle_name)
df.head(3)


Out[107]:
       budget_i      margin  metascore  month  percent_int      profit_i                        title  userscore  year  seq_remake
0  6.401666e+07   32.561261         82     12     0.491308  2.084463e+09                 the-exorcist        8.8  1973           0
1  6.727761e+05  188.803136         78     10     0.000000  1.270222e+08  the-texas-chainsaw-massacre        7.9  1974           0
2  1.159215e+07   10.976187         38     10     0.254168  1.272376e+08  the-texas-chainsaw-massacre        6.6  2003           1

In [108]:
y = df['margin']    #extract dependent variable
X = df[['budget_i', 'percent_int', 'year', 'month', 'metascore', 'userscore', 'seq_remake']]

In [109]:
X.head()


Out[109]:
       budget_i  percent_int  year  month  metascore  userscore  seq_remake
0  6.401666e+07     0.491308  1973     12         82        8.8           0
1  6.727761e+05     0.000000  1974     10         78        7.9           0
2  1.159215e+07     0.254168  2003     10         38        6.6           1
3  5.283679e+07     0.447631  1975      6         79        8.7           0
4  7.492860e+06     0.000000  1976     11         93        7.6           0

In [110]:
# get mean, median, max, min
y_mean = np.mean(y)
y_median = np.median(y)
y_max = np.max(y)
y_min = np.min(y)

print y_mean
print y_median
print y_max
print y_min


14.0196777763
2.0462829
430.517853333
-0.9983406
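
The same summary, plus quartiles and standard deviation, is available from a single pandas call; a minimal alternative, not run here:

In [ ]:
# describe() reports count, mean, std, min, quartiles and max for the margin column
print(df['margin'].describe())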

In [111]:
# get mean, median, max, min for originals and sequels

def filter_not_equal(df, df_col, criterion):
    the_filter = df[df_col] != criterion
    df = df[the_filter]
    return df

df_original = filter_not_equal(df, 'seq_remake', 1)
df_sequel = filter_not_equal(df, 'seq_remake', 0)

y_original = df_original['margin']
y_sequel = df_sequel['margin']

y_original_mean = np.mean(y_original)
y_original_median = np.median(y_original)
y_original_max = np.max(y_original)
y_original_min = np.min(y_original)

y_sequel_mean = np.mean(y_sequel)
y_sequel_median = np.median(y_sequel)
y_sequel_max = np.max(y_sequel)
y_sequel_min = np.min(y_sequel)

print 'Originals'
print 'mean:   %s' % y_original_mean
print 'median: %s' % y_original_median
print 'max:    %s' % y_original_max
print 'min:    %s' % y_original_min
print ' '

print 'Sequels'
print 'mean:   %s' % y_sequel_mean
print 'median: %s' % y_sequel_median
print 'max:    %s' % y_sequel_max
print 'min:    %s' % y_sequel_min


Originals
mean:   15.0961595115
median: 1.89842143333
max:    430.517853333
min:    -0.9983406
 
Sequels
mean:   7.37357315011
median: 3.77182336
max:    31.384303
min:    0.174355285714
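
The same originals-vs-sequels summary can be obtained without the filtering helper by grouping on the flag; a minimal alternative, not run here:

In [ ]:
# summarise margin separately for originals (seq_remake == 0) and sequels/remakes (seq_remake == 1)
print(df.groupby('seq_remake')['margin'].describe())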

In [112]:
ys = np.linspace(min(y), max(y))    # x-axis grid for the density plot (50 points by default; pass a third argument for a finer grid)

In [113]:
#compute the density of the dependent variable
density_y = scipy.stats.gaussian_kde(y)    #Gaussian kernel density estimate from scipy
density_y.covariance_factor = lambda : .25    #narrow the bandwidth so the curve resembles R's default density plot
density_y._compute_covariance()    #recompute the covariance using the new factor

plt.figure(figsize = (40,16))    #initiate the plot
plt.plot(ys,density_y(ys),'b--',lw=4)    #generate the plot

#plt.fill(ys,density_y(ys),'b',alpha=0.3)    #fill in the curves
plt.xlabel('Profit Margin',fontsize=30)    #add an x label (y here is the profit margin, not raw profit)
plt.ylabel('Density of Profit Margin',fontsize=30)    #add a y label
plt.text(np.mean(y),max(density_y(ys))/2.0,r'$\widehat{y}$',fontsize=30)    #add LaTeX text for location of the mean

#plt.title('A Simple Density',fontsize=40)    #add a title to the plot
plt.grid(True)
plt.savefig('Density_of_Profit.pdf')    #save the plot to file
plt.show()    #and then show the plot
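
Since seaborn is already imported, an equivalent density plot can be drawn with kdeplot; a minimal sketch (seaborn chooses its own bandwidth, so the curve will not exactly match the 0.25 covariance factor used above):

In [ ]:
plt.figure(figsize=(12, 6))
sns.kdeplot(y)    # kernel density of the profit margin with seaborn's default bandwidth
plt.xlabel('Profit Margin')
plt.ylabel('Density')
plt.show()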



In [114]:
#The variables vs y
plt.figure(figsize = (20,10))    #initiate the plot

plt.subplot(3,3,1)    #plot subplot 1,1 in the 3x3 area
plt.plot(X['budget_i'],y,'bs',lw=4)    #generate the plot
plt.xlabel('Budget',fontsize=10)    #plain-text label (mathtext collapses spaces in multi-word labels)
plt.ylabel(r'$y$',fontsize=10)    #add a y label

plt.subplot(3,3,2)    #plot subplot 1,2 in the 3x3 area
plt.plot(X['percent_int'],y,'bs',lw=4)    #generate the plot
plt.xlabel('Percent Box Office International',fontsize=10)    #add an x label
plt.ylabel(r'$y$',fontsize=10)    #add a y label

plt.subplot(3,3,3)    #plot subplot 1,3 in the 3x3 area
plt.plot(X['year'],y,'bs',lw=4)    #generate the plot
plt.xlabel('Year',fontsize=10)    #add an x label
plt.ylabel(r'$y$',fontsize=10)    #add a y label

plt.subplot(3,3,4)    #plot subplot 2,1 in the 3x3 area
plt.plot(X['month'],y,'bs',lw=4)    #generate the plot
plt.xlabel('Month',fontsize=10)    #add an x label
plt.ylabel(r'$y$',fontsize=10)    #add a y label

plt.subplot(3,3,5)    #plot subplot 2,2 in the 3x3 area
plt.plot(X['metascore'],y,'bs',lw=4)    #generate the plot
plt.xlabel('Metascore',fontsize=10)    #add an x label
plt.ylabel(r'$y$',fontsize=10)    #add a y label

plt.subplot(3,3,6)    #plot subplot 2,3 in the 3x3 area
plt.plot(X['userscore'],y,'bs',lw=4)    #generate the plot
plt.xlabel('User Score',fontsize=10)    #add an x label
plt.ylabel(r'$y$',fontsize=10)    #add a y label

plt.subplot(3,3,7)    #plot subplot 3,1 in the 3x3 area
plt.plot(X['seq_remake'],y,'bs',lw=4)    #generate the plot
plt.xlabel('Sequel / Remake',fontsize=10)    #add an x label
plt.ylabel(r'$y$',fontsize=10)    #add a y label

plt.savefig('Variable_Relationships_v_Profit.pdf')    #save the plot to file



In [115]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state = 3000)
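
The lasso penalty treats all coefficients alike, so features on very different scales (budget_i is in the tens of millions, userscore in single digits) are shrunk unevenly. A minimal sketch of one common option, standardizing with statistics from the training split only (not part of the original analysis):

In [ ]:
from sklearn.preprocessing import StandardScaler

# fit the scaler on the training split, then apply the same transform to both splits
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

# the scaled copies could be passed to LassoCV / LassoLarsCV in place of X_train / X_test
print(X_train_std.mean(axis=0))    # roughly zero for every column
print(X_train_std.std(axis=0))     # roughly one for every column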

In [116]:
# LassoCV: coordinate descent

# Compute paths
print("Computing regularization path using the coordinate descent lasso...")
t1 = time.time()
model = LassoCV(cv=10).fit(X_train, y_train)
t_lasso_cv = time.time() - t1

# Display results
m_log_alphas = -np.log10(model.alphas_)

plt.figure()
ymin, ymax = 2300, 3800
plt.plot(m_log_alphas, model.mse_path_, ':')
plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha: CV estimate')

plt.legend()

plt.xlabel('-log(alpha)')
plt.ylabel('Mean square error')
plt.title('Mean square error on each fold: coordinate descent '
          '(train time: %.2fs)' % t_lasso_cv)
plt.axis('tight')
plt.ylim(ymin, ymax)
print model.coef_


Computing regularization path using the coordinate descent lasso...
[ -5.18332708e-07   0.00000000e+00  -0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00  -0.00000000e+00]
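
As a quick check, the CV-selected model can be evaluated on the held-out 20% split; a minimal sketch using the estimator's built-in R^2 score:

In [ ]:
print(model.alpha_)                 # alpha selected by 10-fold cross-validation
print(model.score(X_test, y_test))  # R^2 of the selected lasso on the test split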

In [117]:
# LassoLarsCV: least angle regression

# Compute paths
print("Computing regularization path using the Lars lasso...")
t1 = time.time()
model = LassoLarsCV(cv=3).fit(X_train, y_train)
t_lasso_lars_cv = time.time() - t1

# Display results
m_log_alphas = -np.log10(model.cv_alphas_)

plt.figure()
plt.plot(m_log_alphas, model.cv_mse_path_, ':')
plt.plot(m_log_alphas, model.cv_mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()

plt.xlabel('-log(alpha)')
plt.ylabel('Mean square error')
plt.title('Mean square error on each fold: Lars (train time: %.2fs)'
          % t_lasso_lars_cv)
plt.axis('tight')
plt.ylim(ymin, ymax)

plt.show()
print model.coef_


Computing regularization path using the Lars lasso...
[ -3.43947562e-07   0.00000000e+00  -2.70970947e-01   1.58860835e+00
   5.47739525e-01   0.00000000e+00   0.00000000e+00]

In [118]:
# LassoLarsIC: least angle regression with BIC/AIC criterion

model_bic = LassoLarsIC(criterion='bic')
t1 = time.time()
model_bic.fit(X_train, y_train)
t_bic = time.time() - t1
alpha_bic_ = model_bic.alpha_

model_aic = LassoLarsIC(criterion='aic')
model_aic.fit(X_train, y_train)
alpha_aic_ = model_aic.alpha_


def plot_ic_criterion(model, name, color):
    alpha_ = model.alpha_
    alphas_ = model.alphas_
    criterion_ = model.criterion_
    plt.plot(-np.log10(alphas_), criterion_, '--', color=color,
             linewidth=3, label='%s criterion' % name)
    plt.axvline(-np.log10(alpha_), color=color, linewidth=3,
                label='alpha: %s estimate' % name)
    plt.xlabel('-log(alpha)')
    plt.ylabel('criterion')

plt.figure()
plot_ic_criterion(model_aic, 'AIC', 'b')
plot_ic_criterion(model_bic, 'BIC', 'r')
plt.legend()
plt.title('Information-criterion for model selection (training time %.3fs)'
          % t_bic)
print model.coef_    # note: `model` here is still the LassoLarsCV fit from the previous cell, not model_aic or model_bic


[ -3.43947562e-07   0.00000000e+00  -2.70970947e-01   1.58860835e+00
   5.47739525e-01   0.00000000e+00   0.00000000e+00]
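
The coefficients printed above are therefore from the LassoLarsCV fit; the AIC- and BIC-selected models themselves can be inspected directly, for example:

In [ ]:
print('alpha (AIC): %s' % alpha_aic_)
print('alpha (BIC): %s' % alpha_bic_)
print(model_aic.coef_)    # coefficients of the AIC-selected lasso
print(model_bic.coef_)    # coefficients of the BIC-selected lasso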

In [119]:
# background on Least Angle Regression (LARS) and when to use it:
# https://www.quora.com/What-is-Least-Angle-Regression-and-when-should-it-be-used
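
As context for that link, scikit-learn also exposes the LARS lasso path directly; a minimal sketch (not part of the original analysis) that plots how each coefficient enters the model as regularization is relaxed, following the scikit-learn lasso-path example:

In [ ]:
from sklearn.linear_model import lars_path

# lasso path computed with least angle regression on the training data
alphas_lars, active, coefs_lars = lars_path(np.asarray(X_train), np.asarray(y_train), method='lasso')

# plot against the normalised L1 norm of the coefficients, as in the scikit-learn example
xx = np.sum(np.abs(coefs_lars.T), axis=1)
xx /= xx[-1]

plt.figure()
plt.plot(xx, coefs_lars.T)
plt.xlabel('|coef| / max|coef|')
plt.ylabel('Coefficient value')
plt.title('Lasso path computed with LARS')
plt.show()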

In [ ]: