Analyzing the NYC Subway Dataset

Intro to Data Science: Final Project 1, Part 2

(Short Questions)

Section 2. Linear Regression

Austin J. Alexander


Import Directives and Initial DataFrame Creation


In [1]:
import operator

import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats as st
import statsmodels.api as sm
import scipy.optimize as op

from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

import matplotlib.pyplot as plt
%matplotlib inline

filename = '/Users/excalibur/py/nanodegree/intro_ds/final_project/improved-dataset/turnstile_weather_v2.csv'

# import data
data = pd.read_csv(filename)

Function for Basic Statistics


In [2]:
def describe_variables(features, X, y):
    
    for i, x in enumerate(features):
        size, min_max, mean, var, skew, kurt = st.describe(X[:, i])
        med = np.median(X[:, i])
        std = np.std(X[:, i])
        print "x{0} ({1}):\n  min = {2}, max = {3},\n  mean = {4}, median = {5}, var = {6}, std = {7}".format(i, x, min_max[0], min_max[1], np.round(mean, decimals=2), med, np.round(var, decimals=2), np.round(std, decimals=2))
        
    size, min_max, mean, var, skew, kurt = st.describe(y)
    med = np.median(y)
    std = np.std(y)
    print "y (ENTRIESn_hourly):\n  min = {0}, max = {1},\n  mean = {2}, median = {3}, var = {4}, std = {5}".format(min_max[0], min_max[1], np.round(mean, decimals=2), med, np.round(var, decimals=2), np.round(std, decimals=2))

Extract Relevant Data


In [3]:
print "Shape of data:" +str(data.shape)
N = data.shape[0]
print "N = " + str(N)
print "0.05 * N = " + str(0.05 * N)


Shape of data:(42649, 27)
N = 42649
0.05 * N = 2132.45

Class for Creating Data Samples


In [4]:
class CreateDataSample:
    
    quantitative_features = ['hour','day_week','tempi']
    categorical_features = ['UNIT'] 

    def __init__(self,n):
        self.n = n
        self.simple_random_sample_row_indices = np.random.choice(data['ENTRIESn_hourly'].index.values, size=n, replace=False)
        self.sample_df = data.loc[self.simple_random_sample_row_indices]
        self.X = self.sample_df[self.quantitative_features].values
        self.y = self.sample_df['ENTRIESn_hourly'].values
    
    def add_categorical_dummies(self,categorical_features):
        C = []
        for c in categorical_features:
            if categorical_features.index(c) == 0:
                C = sm.categorical(self.sample_df[c].values, drop=True)
            else:
                C = np.hstack((C,sm.categorical(self.sample_df[c].values, drop=True)))

        self.X = np.hstack((self.X,C))

Formulas Implemented

(i.e., not included in modules/packages)

Normal Equation (an alternative to gradient descent)

Find $\theta$, a vector of coefficients for the design matrix $X$, that minimizes the sum of squared differences between the predictions $\theta^{T}x^{(i)}$ and the observed values $y^{(i)}$:
$$\theta = (X^{T}X)^{-1}X^{T}y = X^{+}y$$
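
As a quick illustration (a minimal sketch using randomly generated toy data, not the subway sample itself), the normal equation can be evaluated directly with NumPy's pseudoinverse; the two equivalent forms below match the ones used in the learner class further down:

import numpy as np

# toy design matrix (intercept column plus two random features) and a noisy response
np.random.seed(0)
X = np.hstack((np.ones((50, 1)), np.random.rand(50, 2)))
y = X.dot(np.array([2.0, -1.0, 3.0])) + 0.1 * np.random.randn(50)

# theta = (X^T X)^{-1} X^T y, evaluated via the pseudoinverse for numerical stability
theta = np.linalg.pinv(X.T.dot(X)).dot(X.T).dot(y)

# equivalently, theta = X^+ y
theta_alt = np.linalg.pinv(X).dot(y)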

Class for Creating Learners


In [5]:
class learn:
    
    def __init__(self, sample_sizes, method, test_training_data):
        
        self.sample_sizes = sample_sizes
        self.method = method
        self.test_training_data = test_training_data
        
        self.sample = []
        self.X_train = []
        self.y_train = []
        self.X_test = []
        self.y_test = []
        
        # for OLS
        self.model = []
        self.results = []

    def trial(self, display_results, compare):
        for sample_size in self.sample_sizes:
            # double the size to create both a training and a testing set
            self.sample = CreateDataSample(sample_size*2) 
            self.sample.add_categorical_dummies(self.sample.categorical_features)

            # total sample size (training and testing rows combined)
            n = self.sample.X.shape[0]
            
            # number of features
            num_of_features = self.sample.X.shape[1]
            
            # Feature Scaling
            # mean normalization (necessary for minimization not to return NaNs)
            x_i_bar = []
            s_i = []

            for i in np.arange(self.sample.X.shape[1]):
                x_i_bar.append(np.mean(self.sample.X[:,i]))
                s_i.append(np.std(self.sample.X[:,i]))

                self.sample.X[:,i] = np.true_divide((np.subtract(self.sample.X[:,i],x_i_bar[i])),s_i[i])
    
            # final design matrix
            self.sample.X = sm.add_constant(self.sample.X)
            
            # initial theta parameters (flat vector, one entry per column of the final design matrix)
            theta = np.zeros(num_of_features+1)

            # randomly selected row indices to gather samples
            indices = [i for i in np.arange(n)]
            random_indices = np.random.choice(indices, size=(sample_size), replace=False) 

            # select samples
            self.X_train = self.sample.X[random_indices]
            self.y_train = self.sample.y[random_indices]

            # depending on method, implement method
            if self.method == 'OLS':
                model = sm.OLS(self.y_train, self.X_train)
                results = model.fit()
                #results = model.fit_regularized()
                self.model = model
                self.results = results
                
            # scikit-learn Linear Regression
            elif self.method == 'LinReg':
                regr = linear_model.LinearRegression()
                regr.fit(self.X_train, self.y_train)
            # scikit-learn Ridge Regression
            elif self.method == 'Ridge':
                clf = Ridge(alpha=1.0)
                clf.fit(self.X_train, self.y_train)
                
            elif self.method == 'BFGS':
                # cost function
                def J(theta):
                    return (1.0/(2*n)) * (((self.X_train.dot(theta)) - self.y_train).T).dot(self.X_train.dot(theta) - self.y_train)
                # gradient estimated
                solution = op.minimize(J, theta)
                theta = solution.x
                
            elif self.method == 'NormEq':
                theta = (np.linalg.pinv(self.X_train).dot(self.y_train))
                
            elif self.method == 'compare':
                # OLS
                model = sm.OLS(self.y_train, self.X_train)
                results = model.fit()
                # LinReg
                regr = linear_model.LinearRegression()
                regr.fit(self.X_train, self.y_train)
                # Ridge
                clf = Ridge(alpha=1.0)
                clf.fit(self.X_train, self.y_train)
                # NormEq
                theta = (np.linalg.pinv(self.X_train.T.dot(self.X_train)).dot(self.X_train.T)).dot(self.y_train)
                
            random_indices = np.random.choice(indices, size=(sample_size), replace=False)

            if self.test_training_data == False:
                self.X_test = self.sample.X[random_indices]
                self.y_test = self.sample.y[random_indices]
            else:
                self.X_test = self.X_train
                self.y_test = self.y_train
                
            if display_results == True:
                print "\nn = {0}".format(n/2)
                if self.method == 'OLS':
                    print "r = {0:.2f}".format(np.sqrt(results.rsquared))
                    print "R^2 = {0:.2f}".format(results.rsquared)
                    
                elif self.method == 'LinReg':
                    print "r = {0:.2f}".format(np.sqrt(regr.score(self.X_test, self.y_test)))
                    print "R^2 = {0:.2f}".format(regr.score(self.X_test, self.y_test))
                elif self.method == 'Ridge':
                    print "r = {0:.2f}".format(np.sqrt(clf.score(self.X_test, self.y_test)))
                    print "R^2 = {0:.2f}".format(clf.score(self.X_test, self.y_test))
                
                # too slow!
                elif self.method == 'BFGS':
                    y_hat = self.X_test.dot(theta)
                    print "mean(y_hat) = {0:.2f}".format(np.mean(y_hat))
                
                elif self.method == 'NormEq':
                    y_hat = self.X_test.dot(theta.T)
                    y_bar = np.mean(self.y_test)
                    
                    total_variation = np.sum((self.y_test - y_bar)**2)
                    unexplained_variation = np.sum((self.y_test - y_hat)**2)
                    #explained_variation = np.sum((y_hat - y_bar)**2)
                    
                    r_squared = (1 - (np.true_divide(unexplained_variation,total_variation)))
                    
                    print "r = {0:.2f}".format(np.sqrt(r_squared))
                    print "R^2 = {0:.2f}".format(r_squared)
                    
            if compare == True:
                print "\nn = {0}".format(sample_size)
                
                rsquared_values = {}
                r_values = {}
                
                rsquared_values['ols_R2'] = results.rsquared
                r_values['ols_r'] = np.sqrt(rsquared_values['ols_R2'])
                
                #rsquared_values['lin_reg_R2'] = regr.score(self.X_test, self.y_test)    
                #r_values['lin_reg_r'] = np.sqrt(rsquared_values['lin_reg_R2'])
                
                rsquared_values['ridge_R2'] = clf.score(self.X_test, self.y_test)   
                r_values['ridge_r'] = np.sqrt(rsquared_values['ridge_R2'])
            
                # normal equation
                y_hat = self.X_test.dot(theta.T)
                y_bar = np.mean(self.y_test)
                total_variation = np.sum((self.y_test - y_bar)**2)
                unexplained_variation = np.sum((self.y_test - y_hat)**2)
                    
                rsquared_values['norm_eq_R2'] = (1 - (np.true_divide(unexplained_variation,total_variation)))
                r_values['norm_eq_r'] = np.sqrt(rsquared_values['norm_eq_R2'])
                
                max_rsquared = max(rsquared_values.iteritems(), key=operator.itemgetter(1))
                max_r = max(r_values.iteritems(), key=operator.itemgetter(1))
                
                print "{0} = {1:.2f}".format(max_r[0], max_r[1])
                print "{0} = {1:.2f}".format(max_rsquared[0], max_rsquared[1])

Section 2. Linear Regression

2.1 What approach did you use to compute the coefficients theta and produce prediction for ENTRIESn_hourly in your regression model?

After comparing several methods (ordinary least squares [OLS] from StatsModels, two regression techniques from scikit-learn, the Broyden–Fletcher–Goldfarb–Shanno [BFGS] optimization algorithm from scipy.optimize, and a direct normal-equation solution), OLS from StatsModels was chosen because it produced consistently higher $r$ and $R^{2}$ values (see notes 1 and 2 below) across the test sample sizes ( $n = \{30, 100, 500, 1500, 5000, 10000\}$ ).

Notes

1 The linear correlation coefficient ($r$) can take on the following values: $-1 \leq r \leq 1$. If $r = +1$, then a perfect positive linear relation exists between the explanatory and response variables. If $r = -1$, then a perfect negative linear relation exists between the explanatory and response variables.

2 The coefficient of determination ($R^{2}$) can take on the following values: $0 \leq R^{2} \leq 1$. If $R^{2} = 0$, the least-squares regression line has no explanatory value; if $R^{2} = 1$, the least-squares regression line explains $100\%$ of the variation in the response variable.
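
To make the definitions in notes 1 and 2 concrete, the sketch below (using hypothetical observed and predicted arrays, not the project data) computes $R^{2}$ as one minus the ratio of unexplained to total variation, which is the same formula used in the normal-equation branch of the learner class above:

# hypothetical observed and predicted values (not the subway data)
y_obs = np.array([10.0, 12.0, 9.0, 15.0, 11.0])
y_hat = np.array([10.5, 11.0, 9.5, 14.0, 12.0])

y_bar = np.mean(y_obs)
total_variation = np.sum((y_obs - y_bar)**2)
unexplained_variation = np.sum((y_obs - y_hat)**2)

r_squared = 1 - np.true_divide(unexplained_variation, total_variation)
r = np.sqrt(r_squared)  # the same sqrt(R^2) transformation the learner class reports as r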


In [6]:
# each value is a training-set size; CreateDataSample draws twice as many rows (half for training, half for testing)
sample_sizes = [30, 100, 500, 1500, 5000, 10000]
compare = learn(sample_sizes, method='compare', test_training_data=False)
compare.trial(display_results=False, compare=True)



2.2 What features (input variables) did you use in your model? Did you use any dummy variables as part of your features?

Quantitative features used: 'hour', 'day_week', 'tempi'.

Categorical features used: 'UNIT'. As a categorical feature, this variable required the use of dummy variables.
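
The CreateDataSample class above builds the dummy columns with sm.categorical; an equivalent approach with pandas (shown only as an illustrative sketch, not the method actually used above) would be:

# one 0/1 indicator column per distinct UNIT value
unit_dummies = pd.get_dummies(data['UNIT'], prefix='unit')

# append the indicator columns to the quantitative features
X_with_units = np.hstack((data[['hour', 'day_week', 'tempi']].values,
                          unit_dummies.values))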

2.3 Why did you select these features in your model?

Based on the findings presented in the *DataExploration* supplement, it was clear that location significantly affects the number of entries; hour and day of the week were also important, and temperature appeared to have some relationship with entries, so it was included. Based on that exploration and on the statistical and practical evidence offered in Section 1. Statistical Test, rain was not included as a feature (a number of test runs showed it had marginal importance, if any).

As for the location and day/time variables, a station's location can be captured quantitatively by latitude and longitude, which, as numeric values, might be expected to reveal a trend. However, numerous test runs showed that latitude and longitude are effectively redundant once UNIT is included as a feature; UNIT proved more significant than latitude and longitude (presumably because, among other things, it also captures station-specific factors such as layout).

Each DATEn value is a one-off, so it is unclear how such dates could help with modeling or prediction (those exact dates literally never occur again). day_week is a better choice here, since the day of the week recurs and therefore generalizes; a minimal sketch of deriving it from DATEn is shown below.
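
The sketch below assumes DATEn is stored as a parseable date string (an assumption about the file format rather than something verified here) and uses the pandas import from the first cell:

# derive the recurring day of week (Monday = 0 ... Sunday = 6) from the one-off DATEn strings
day_of_week = pd.to_datetime(data['DATEn']).apply(lambda d: d.dayofweek)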

Using StatsModels OLS to Create a Model (100 trials)


In [ ]:
sample_size = [500]
ols = None

for i in np.arange(100):
    temp_ols = learn(sample_size, method='OLS', test_training_data=False)
    temp_ols.trial(display_results=False, compare=False)
    # keep the fitted model with the highest R^2 across the 100 trials
    if ols is None or (temp_ols.results.rsquared > ols.results.rsquared):
        ols = temp_ols
        
print "\nn = {0}".format(sample_size[0])
print "r = {0:.2f}".format(np.sqrt(ols.results.rsquared))
print "R^2 = {0:.2f}".format(ols.results.rsquared)

In [ ]:
observed = ols.y_test
predictions = ols.results.predict(ols.X_test)
plt.title('Observed Values vs Fitted Predictions')
plt.xlabel('observed values')
plt.ylabel('predictions')
plt.scatter(observed, predictions, alpha=0.7, color='green', edgecolors='black')
plt.show()

2.4 What are the coefficients (or weights) of the features in your linear regression model?


In [ ]:
print ols.results.params

2.5 What is your model’s $R^{2}$ (coefficient of determination) value?

For $n = 500$, the best $R^{2}$ value witnessed was $0.85$ (with the best $r$ value seen at $0.92$).

2.6.a What does this $R^{2}$ value mean for the goodness of fit for your regression model?

This $R^{2}$ value means that $85\%$ of the total variation in the response variable (ENTRIESn_hourly) is explained by the least-squares regression model created above.

2.6.b Do you think this linear model to predict ridership is appropriate for this dataset, given this $R^{2}$ value?

It's better than guessing in the dark, but too much shouldn't be staked on its predictions:

Predictions and their Residual Differences from Observed Values


In [ ]:
y_hat_test = ols.results.predict(ols.X_test)
residuals = (y_hat_test - ols.y_test)
y = pd.DataFrame(data={'y_test':ols.y_test,'y_hat_test':y_hat_test,'residuals':residuals}, columns=['y_test','y_hat_test','residuals'])

entries_diff = [1, 100, 1000]

for i in entries_diff:
    # a prediction counts as 'close' if its absolute residual is below the threshold i
    proximity = ['close' if abs(d) < i else 'far' for d in residuals]
    y['proximity'] = proximity

    total = y['proximity'].count()
    close = y[y['proximity'] == 'close']['proximity'].count()

    print "residuals < {0} ( {1}% )".format(i, np.true_divide(close,total)*100)
    print y['proximity'].describe()
    print

The thresholds above are somewhat arbitrary: a prediction is counted as close when its absolute residual is below the given threshold. Because hourly entries span such a large range of values $[0, 32814]$, proportions for differences of less than $100$ and $1000$ are reported alongside the strict threshold of $1$.

Residual Analysis


In [ ]:
plt.boxplot(residuals, vert=False)
plt.title('Boxplot of Residuals')
plt.xlabel('residuals')

plt.show()

In [ ]:
plt.scatter(y_hat_test,residuals, alpha=0.7, color='green', edgecolors='black')
plt.title('Residual Plot')
plt.plot([np.min(y_hat_test),np.max(y_hat_test)], [0, 0], color='red')
plt.xlabel('predictions')
plt.ylabel('residuals')
plt.show()

Since the residuals show a discernible, increasing pattern with respect to the predictions (rather than the random scatter around zero expected under the linear-model assumptions), there does not appear to be a simple linear relationship between the explanatory and response variables; a linear model is therefore not well suited to this data set. One way to quantify this pattern is sketched below.
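
As a quantitative supplement (a minimal sketch, not part of the original analysis), the correlation between the test-set predictions and their residuals can be computed with scipy.stats, already imported as st; a value far from zero supports the visual impression that the residuals are not random noise:

# Pearson correlation between test-set predictions and their residuals;
# for a well-specified linear model this should be close to zero
r_resid, p_value = st.pearsonr(y_hat_test, residuals)
print "correlation between predictions and residuals: {0:.2f} (p = {1:.4f})".format(r_resid, p_value)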