In [393]:
import operator
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats as st
import statsmodels.api as sm
import scipy.optimize as op
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt
%matplotlib inline
filename = '/Users/excalibur/py/nanodegree/intro_ds/final_project/improved-dataset/turnstile_weather_v2.csv'
# import data
data = pd.read_csv(filename)
In [251]:
def describe_variables(features, X, y):
    # summary statistics for each explanatory variable
    for i, x in enumerate(features):
        size, min_max, mean, var, skew, kurt = st.describe(X[:, i])
        med = np.median(X[:, i])
        std = np.std(X[:, i])
        print "x{0} ({1}):\n min = {2}, max = {3},\n mean = {4}, median = {5}, var = {6}, std = {7}".format(i, x, min_max[0], min_max[1], np.round(mean, decimals=2), med, np.round(var, decimals=2), np.round(std, decimals=2))
    # summary statistics for the response variable
    size, min_max, mean, var, skew, kurt = st.describe(y)
    med = np.median(y)
    std = np.std(y)
    print "y (ENTRIESn_hourly):\n min = {0}, max = {1},\n mean = {2}, median = {3}, var = {4}, std = {5}".format(min_max[0], min_max[1], np.round(mean, decimals=2), med, np.round(var, decimals=2), np.round(std, decimals=2))
In [252]:
print "Shape of data:" +str(data.shape)
N = data.shape[0]
print "N = " + str(N)
print "0.05 * N = " + str(0.05 * N)
In [253]:
class CreateDataSample:

    quantitative_features = ['hour', 'day_week', 'latitude', 'longitude', 'rain', 'tempi']
    categorical_features = ['UNIT', 'DATEn', 'station']

    def __init__(self, n):
        # simple random sample of n rows, drawn without replacement
        self.n = n
        self.simple_random_sample_row_indices = np.random.choice(data['ENTRIESn_hourly'].index.values, size=n, replace=False)
        self.sample_df = data.loc[self.simple_random_sample_row_indices]
        self.X = self.sample_df[self.quantitative_features].values
        self.y = self.sample_df['ENTRIESn_hourly'].values

    def add_categorical_dummies(self, categorical_features):
        # append dummy-coded columns for each categorical feature
        C = []
        for c in categorical_features:
            if categorical_features.index(c) == 0:
                C = sm.categorical(self.sample_df[c].values, drop=True)
            else:
                C = np.hstack((C, sm.categorical(self.sample_df[c].values, drop=True)))
        self.X = np.hstack((self.X, C))
In [447]:
class learn:

    def __init__(self, sample_sizes, method, test_training_data):
        self.sample_sizes = sample_sizes
        self.method = method
        self.test_training_data = test_training_data
        self.sample = []
        self.X_train = []
        self.y_train = []
        self.X_test = []
        self.y_test = []
        # for OLS
        self.model = []
        self.results = []
    def trial(self, display_results, compare):
        for sample_size in self.sample_sizes:
            self.sample = CreateDataSample(sample_size)
            self.sample.add_categorical_dummies(CreateDataSample.categorical_features)
            # training sample size
            n = self.sample.X.shape[0]
            # number of features
            num_of_features = self.sample.X.shape[1]
            # Feature Scaling
            # mean normalization (necessary for minimization not to return NaNs)
            x_i_bar = []
            s_i = []
            for i in np.arange(self.sample.X.shape[1]):
                x_i_bar.append(np.mean(self.sample.X[:, i]))
                s_i.append(np.std(self.sample.X[:, i]))
                self.sample.X[:, i] = np.true_divide(np.subtract(self.sample.X[:, i], x_i_bar[i]), s_i[i])
            # final design matrix
            self.sample.X = sm.add_constant(self.sample.X)
            # theta parameters
            theta = np.zeros((num_of_features + 1, 1))
            # randomly selected row indices to gather samples
            indices = [i for i in np.arange(n)]
            random_indices = np.random.choice(indices, size=(sample_size / 2), replace=False)
            # select samples
            self.X_train = self.sample.X[random_indices]
            self.y_train = self.sample.y[random_indices]
            # depending on method, implement method
            if self.method == 'OLS':
                model = sm.OLS(self.y_train, self.X_train)
                results = model.fit()
                #results = model.fit_regularized()
                self.model = model
                self.results = results
            # scikit-learn Linear Regression
            elif self.method == 'LinReg':
                regr = linear_model.LinearRegression()
                regr.fit(self.X_train, self.y_train)
            # scikit-learn Ridge Regression
            elif self.method == 'Ridge':
                clf = Ridge(alpha=1.0)
                clf.fit(self.X_train, self.y_train)
            elif self.method == 'BFGS':
                # cost function
                def J(theta):
                    return (1.0 / (2 * n)) * ((self.X_train.dot(theta) - self.y_train).T).dot(self.X_train.dot(theta) - self.y_train)
                # gradient estimated
                solution = op.minimize(J, theta)
                theta = solution.x
            elif self.method == 'NormEq':
                theta = np.linalg.pinv(self.X_train).dot(self.y_train)
            elif self.method == 'compare':
                # OLS
                model = sm.OLS(self.y_train, self.X_train)
                results = model.fit()
                # LinReg
                regr = linear_model.LinearRegression()
                regr.fit(self.X_train, self.y_train)
                # Ridge
                clf = Ridge(alpha=1.0)
                clf.fit(self.X_train, self.y_train)
                # NormEq
                theta = np.linalg.pinv(self.X_train.T.dot(self.X_train)).dot(self.X_train.T).dot(self.y_train)
            # randomly selected row indices to gather test samples
            random_indices = np.random.choice(indices, size=(sample_size / 2), replace=False)
            if self.test_training_data == False:
                self.X_test = self.sample.X[random_indices]
                self.y_test = self.sample.y[random_indices]
            else:
                self.X_test = self.X_train
                self.y_test = self.y_train
            if display_results == True:
                print "\nn = {0}".format(n / 2)
                if self.method == 'OLS':
                    print "r = {0:.2f}".format(np.sqrt(results.rsquared))
                    print "R^2 = {0:.2f}".format(results.rsquared)
                elif self.method == 'LinReg':
                    print "r = {0:.2f}".format(np.sqrt(regr.score(self.X_test, self.y_test)))
                    print "R^2 = {0:.2f}".format(regr.score(self.X_test, self.y_test))
                elif self.method == 'Ridge':
                    print "r = {0:.2f}".format(np.sqrt(clf.score(self.X_test, self.y_test)))
                    print "R^2 = {0:.2f}".format(clf.score(self.X_test, self.y_test))
                # too slow!
                elif self.method == 'BFGS':
                    y_hat = self.X_test.dot(theta)
                    print "y_hat[:5] = {0}".format(np.round(y_hat[:5], 2))
                elif self.method == 'NormEq':
                    y_hat = self.X_test.dot(theta)
                    y_bar = np.mean(self.y_test)
                    total_variation = np.sum((self.y_test - y_bar)**2)
                    unexplained_variation = np.sum((self.y_test - y_hat)**2)
                    #explained_variation = np.sum((y_hat - y_bar)**2)
                    r_squared = 1 - np.true_divide(unexplained_variation, total_variation)
                    print "r = {0:.2f}".format(np.sqrt(r_squared))
                    print "R^2 = {0:.2f}".format(r_squared)
            if compare == True:
                print "\nn = {0}".format(n / 2)
                rsquared_values = {}
                r_values = {}
                rsquared_values['ols_R2'] = results.rsquared
                r_values['ols_r'] = np.sqrt(rsquared_values['ols_R2'])
                #rsquared_values['lin_reg_R2'] = regr.score(self.X_test, self.y_test)
                #r_values['lin_reg_r'] = np.sqrt(rsquared_values['lin_reg_R2'])
                rsquared_values['ridge_R2'] = clf.score(self.X_test, self.y_test)
                r_values['ridge_r'] = np.sqrt(rsquared_values['ridge_R2'])
                # normal equation
                y_hat = self.X_test.dot(theta)
                y_bar = np.mean(self.y_test)
                total_variation = np.sum((self.y_test - y_bar)**2)
                unexplained_variation = np.sum((self.y_test - y_hat)**2)
                rsquared_values['norm_eq_R2'] = 1 - np.true_divide(unexplained_variation, total_variation)
                r_values['norm_eq_r'] = np.sqrt(rsquared_values['norm_eq_R2'])
                # report the best-performing method by r and by R^2
                max_rsquared = max(rsquared_values.iteritems(), key=operator.itemgetter(1))
                max_r = max(r_values.iteritems(), key=operator.itemgetter(1))
                print "{0} = {1:.2f}".format(max_r[0], max_r[1])
                print "{0} = {1:.2f}".format(max_rsquared[0], max_rsquared[1])
After comparing a few different methods (Ordinary Least Squares [OLS] from \textit{StatsModels}, two regression estimators from \textit{scikit-learn}, the Broyden–Fletcher–Goldfarb–Shanno [BFGS] optimization algorithm via \textit{scipy.optimize}, and an algebraic normal-equations solution), OLS from \textit{StatsModels} was chosen because of its consistently higher $r$ and $R^{2}$ values across various test sample sizes.
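As a quick sanity check on that comparison, the following is a minimal sketch on synthetic data (not the turnstile sample): in exact arithmetic, \textit{StatsModels} OLS and the closed-form normal-equations solution $\theta = (X^{T}X)^{-1}X^{T}y$ fit on the same training matrix should return the same coefficients. It is also worth noting that in the compare branch of learn.trial above, the OLS $R^{2}$ comes from results.rsquared on the training rows, while the ridge and normal-equations values are computed on the separately drawn test rows.
In [ ]:
# minimal sketch (synthetic data, not the turnstile sample): statsmodels OLS
# coefficients and the closed-form normal-equations solution should agree
import numpy as np
import statsmodels.api as sm
np.random.seed(0)
X = sm.add_constant(np.random.rand(50, 3))          # intercept + 3 features
beta_true = np.array([2.0, 1.0, -3.0, 0.5])
y = X.dot(beta_true) + 0.1 * np.random.randn(50)    # small Gaussian noise
ols_params = sm.OLS(y, X).fit().params
norm_eq_params = np.linalg.pinv(X.T.dot(X)).dot(X.T).dot(y)
print np.allclose(ols_params, norm_eq_params)       # expected: True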
In [437]:
sample_sizes = [60, 200, 1000, 3000, 10000, 20000]
compare = learn(sample_sizes, method='compare', test_training_data=False)
compare.trial(display_results=False, compare=True)
In [511]:
sample_sizes = [1000]
ols = learn(sample_sizes, method='OLS', test_training_data=False)
ols.trial(display_results=True, compare=False)
Linear correlation coefficient
$-1 \leq r \leq 1$ : if $r = +1$, a perfect positive linear relation exists between the explanatory and response variables; if $r = -1$, a perfect negative linear relation exists between them.
Coefficient of determination
$0 \leq R^{2} \leq 1$ : if $R^{2} = 0$, the least-squares regression line has no explanatory value; if $R^{2} = 1$, the least-squares regression line explains $100\%$ of the variation in the response variable.
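To tie these definitions to the model above, here is a minimal sketch (assuming the ols trial in the previous cell has been run) that computes $R^{2} = 1 - \frac{SS_{res}}{SS_{tot}}$ on the test rows and takes $r$ as its square root:
In [ ]:
# minimal sketch: R^2 and r computed from their definitions on the test rows
# (on separately drawn test rows R^2 can be negative for a poor fit,
#  in which case r is undefined)
y_hat = ols.results.predict(ols.X_test)
total_variation = np.sum((ols.y_test - np.mean(ols.y_test)) ** 2)
unexplained_variation = np.sum((ols.y_test - y_hat) ** 2)
r_squared = 1 - np.true_divide(unexplained_variation, total_variation)
print "R^2 = {0:.2f}, r = {1:.2f}".format(r_squared, np.sqrt(r_squared))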
In [521]:
print ols.results.params
In [519]:
y_hat_test = ols.results.predict(ols.X_test)
diff = (y_hat_test - ols.y_test)
y = pd.DataFrame(data={'y_test': ols.y_test, 'y_hat_test': y_hat_test, 'diff': diff}, columns=['y_test', 'y_hat_test', 'diff'])
# label a prediction 'close' if it is within 50 entries of the observed value
proximity = ['close' if abs(d) < 50 else 'far' for d in diff]
y['proximity'] = proximity
y['proximity'].describe()
Out[519]:
Through these statistical tests, assuming they have been implemented and interpreted correctly, it seems clear that much of the computational power in the world of big data lies in the ability to reproduce results from samples, not merely in using large data sets. As this relates to big data and to more complex and accurate modeling, significantly larger data sets may well have more power, but they would gain it largely through their ever-increasing knowledge base rather than through their $model$, which is, in many ways, how human brains seem to work.