In [1]:
import operator
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats as st
import statsmodels.api as sm
import scipy.optimize as op
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt
%matplotlib inline
filename = '/Users/excalibur/py/nanodegree/intro_ds/final_project/improved-dataset/turnstile_weather_v2.csv'
# import data
data = pd.read_csv(filename)
In [2]:
def describe_variables(features, X, y):
for x in features:
size, min_max, mean, var, skew, kurt = st.describe(X[:, features.index(x)])
med = np.median(X[:, features.index(x)])
std = np.std(X[:, features.index(x)])
print "x{0} ({1}):\n min = {2}, max = {3},\n mean = {4}, median = {5}, var = {6}, std = {7}".format(features.index(x), x, min_max[0], min_max[1], np.round(mean, decimals=2), med, np.round(var, decimals=2), np.round(std, decimals=2))
size, min_max, mean, var, skew, kurt = st.describe(y)
med = np.median(y)
std = np.std(y)
print "y (ENTRIESn_hourly):\n min = {0}, max = {1},\n mean = {2}, median = {3}, var = {4}, std = {5}".format(min_max[0], min_max[1], np.round(mean, decimals=2), med, np.round(var, decimals=2), np.round(std, decimals=2))
In [3]:
print "Shape of data:" +str(data.shape)
N = data.shape[0]
print "N = " + str(N)
print "0.05 * N = " + str(0.05 * N)
In [4]:
class CreateDataSample:
quantitative_features = ['hour','day_week','tempi']
categorical_features = ['UNIT']
def __init__(self,n):
self.n = n
self.simple_random_sample_row_indices = np.random.choice(data['ENTRIESn_hourly'].index.values, size=n, replace=False)
self.sample_df = data.loc[self.simple_random_sample_row_indices]
self.X = self.sample_df[quantitative_features].values
self.y = self.sample_df['ENTRIESn_hourly'].values
def add_categorical_dummies(self,categorical_features):
C = []
for c in categorical_features:
if categorical_features.index(c) == 0:
C = sm.categorical(self.sample_df[c].values, drop=True)
else:
C = np.hstack((C,sm.categorical(self.sample_df[c].values, drop=True)))
self.X = np.hstack((self.X,C))
In [5]:
class learn:
def __init__(self, sample_sizes, method, test_training_data):
self.sample_sizes = sample_sizes
self.method = method
self.test_training_data = test_training_data
self.sample = []
self.X_train = []
self.y_train = []
self.X_test = []
self.y_test = []
# for OLS
self.model = []
self.results = []
def trial(self, display_results, compare):
for sample_size in self.sample_sizes:
# double the size to create both a training and a testing set
self.sample = CreateDataSample(sample_size*2)
self.sample.add_categorical_dummies(categorical_features)
# training sample size
n = self.sample.X.shape[0]
# number of features
num_of_features = self.sample.X.shape[1]
# Feature Scaling
# mean normalization (necessary for minimization not to return NaNs)
x_i_bar = []
s_i = []
for i in np.arange(self.sample.X.shape[1]):
x_i_bar.append(np.mean(self.sample.X[:,i]))
s_i.append(np.std(self.sample.X[:,i]))
self.sample.X[:,i] = np.true_divide((np.subtract(self.sample.X[:,i],x_i_bar[i])),s_i[i])
# final design matrix
self.sample.X = sm.add_constant(self.sample.X)
# theta parameters
theta = np.zeros(((num_of_features+1),1))
# randomly selected row indices to gather samples
indices = [i for i in np.arange(n)]
random_indices = np.random.choice(indices, size=(sample_size), replace=False)
# select samples
self.X_train = self.sample.X[random_indices]
self.y_train = self.sample.y[random_indices]
# depending on method, implement method
if self.method == 'OLS':
model = sm.OLS(self.y_train, self.X_train)
results = model.fit()
#results = model.fit_regularized()
self.model = model
self.results = results
# scikit-learn Linear Regression
elif self.method == 'LinReg':
regr = linear_model.LinearRegression()
regr.fit(self.X_train, self.y_train)
# scikit-learn Ridge Regression
elif self.method == 'Ridge':
clf = Ridge(alpha=1.0)
clf.fit(self.X_train, self.y_train)
elif self.method == 'BFGS':
# cost function
def J(theta):
return (1.0/(2*n)) * (((self.X_train.dot(theta)) - self.y_train).T).dot(self.X_train.dot(theta) - self.y_train)
# gradient estimated
solution = op.minimize(J, theta)
theta = solution.x
elif self.method == 'NormEq':
theta = (np.linalg.pinv(self.X_train).dot(self.y_train))
elif self.method == 'compare':
# OLS
model = sm.OLS(self.y_train, self.X_train)
results = model.fit()
# LinReg
regr = linear_model.LinearRegression()
regr.fit(self.X_train, self.y_train)
# Ridge
clf = Ridge(alpha=1.0)
clf.fit(self.X_train, self.y_train)
# NormEq
theta = (np.linalg.pinv(self.X_train.T.dot(self.X_train)).dot(self.X_train.T)).dot(self.y_train)
random_indices = np.random.choice(indices, size=(sample_size), replace=False)
if self.test_training_data == False:
self.X_test = self.sample.X[random_indices]
self.y_test = self.sample.y[random_indices]
else:
self.X_test = self.X_train
self.y_test = self.y_train
if display_results == True:
print "\nn = {0}".format(n/2)
if self.method == 'OLS':
print "r = {0:.2f}".format(np.sqrt(results.rsquared))
print "R^2 = {0:.2f}".format(results.rsquared)
elif self.method == 'LinReg':
print "r = {0:.2f}".format(np.sqrt(regr.score(self.X_test, self.y_test)))
print "R^2 = {0:.2f}".format(regr.score(self.X_test, self.y_test))
elif self.method == 'Ridge':
print "r = {0:.2f}".format(np.sqrt(clf.score(self.X_test, self.y_test)))
print "R^2 = {0:.2f}".format(clf.score(self.X_test, self.y_test))
# too slow!
elif self.method == 'BFGS':
y_hat = theta.T.dot(self.X_test)
print "y_hat = {0:.2f}".format(y_hat)
elif self.method == 'NormEq':
y_hat = self.X_test.dot(theta.T)
y_bar = np.mean(self.y_test)
total_variation = np.sum((self.y_test - y_bar)**2)
unexplained_variation = np.sum((self.y_test - y_hat)**2)
#explained_variation = np.sum((y_hat - y_bar)**2)
r_squared = (1 - (np.true_divide(unexplained_variation,total_variation)))
print "r = {0:.2f}".format(np.sqrt(r_squared))
print "R^2 = {0:.2f}".format(r_squared)
if compare == True:
print "\nn = {0}".format(sample_size)
rsquared_values = {}
r_values = {}
rsquared_values['ols_R2'] = results.rsquared
r_values['ols_r'] = np.sqrt(rsquared_values['ols_R2'])
#rsquared_values['lin_reg_R2'] = regr.score(self.X_test, self.y_test)
#r_values['lin_reg_r'] = np.sqrt(rsquared_values['lin_reg_R2'])
rsquared_values['ridge_R2'] = clf.score(self.X_test, self.y_test)
r_values['ridge_r'] = np.sqrt(rsquared_values['ridge_R2'])
# normal equation
y_hat = self.X_test.dot(theta.T)
y_bar = np.mean(self.y_test)
total_variation = np.sum((self.y_test - y_bar)**2)
unexplained_variation = np.sum((self.y_test - y_hat)**2)
rsquared_values['norm_eq_R2'] = (1 - (np.true_divide(unexplained_variation,total_variation)))
r_values['norm_eq_r'] = np.mean(rsquared_values['norm_eq_R2'])
max_rsquared = max(rsquared_values.iteritems(), key=operator.itemgetter(1))
max_r = max(r_values.iteritems(), key=operator.itemgetter(1))
print "{0} = {1:.2f}".format(max_r[0], max_r[1])
print "{0} = {1:.2f}".format(max_rsquared[0], max_rsquared[1])
After comparing a few different methods (Ordinary Least Squares [OLS] from StatsModels, two different regression techniques from scikit-learn, the Broyden–Fletcher–Goldfarb–Shanno [BFGS] optimization algorithm from Scipy.optimize, and a Normal Equations algebraic attempt), OLS from StatsModels was chosen due to its consistently higher $r$ and $R^{2}$ values (see notes 1 and 2 below) throughout various test sample sizes ( $n = \{30, 100, 500, 1500, 5000, 10000\}$ ).
1 The linear correlation coefficient ($r$) can take on the following values: $-1 \leq r \leq 1$. If $r = +1$, then a perfect positive linear relation exists between the explanatory and response variables. If $r = -1$, then a perfect negative linear relation exists between the explanatory and response variables.
2 The coefficient of determination ($R^{2}$) can take on the following values: $0 \leq R^{2} \leq 1$. If $R^{2} = 0$, the least-squares regression line has no explanatory value; if $R^{2} = 1$, the least-squares regression line explains $100\%$ of the variation in the response variable.
In [6]:
# sample-size values are halved
sample_sizes = [30, 100, 500, 1500, 5000, 10000]
compare = learn(sample_sizes, method='compare', test_training_data=False)
compare.trial(display_results=False, compare=True)
Quantitative features used: 'hour','day_week','rain','tempi'.
Categorical features used: 'UNIT'. As a categorical feature, this variable required the use of dummy variables.
Due to the findings presented in the *DataExploration* supplement, it seemed clear that location significantly impacted the number of entries. In addition, the hour and day of the week showed importance. Temperature appeared to have some relationship with entries as well, and so it was included. Based on that exploration and on the statistical and practical evidence offered in Section 1. Statistical Test, rain was not included as a feature (and, as evidenced by a number of test runs, had marginal if any importance).
As far as the selection of location and day/time variables were concerned, station can be captured quantitatively by latitude and longitude, both of which, as numeric values, should offer a better sense of trend toward something. However, as witnessed by numerous test runs, latitude and longitude in fact appear to be redundant when using UNIT as a feature, which is in fact more signficant (as test runs indicated and, as one might assume, due to, for example, station layouts) than latitude and longitude.
Each DATEn is a 'one-off', so it's unclear how any could be helpful for modeling/predicting (as those dates literally never occur again). day_week seemed to be a better selection in this case.
In [ ]:
sample_size = [500]
ols = []
for i in np.arange(100):
temp_ols = learn(sample_size, method='OLS', test_training_data=False)
temp_ols.trial(display_results=False, compare=False)
if not ols or (temp_ols.results.rsquared > ols.results.rsquared):
ols = temp_ols
print "\nn = {0}".format(sample_size[0])
print "r = {0:.2f}".format(np.sqrt(ols.results.rsquared))
print "R^2 = {0:.2f}".format(ols.results.rsquared)
In [ ]:
observed = ols.y_test
predictions = ols.results.predict(ols.X_test)
plt.title('Observed Values vs Fitted Predictions')
plt.xlabel('observed values')
plt.ylabel('predictions')
plt.scatter(observed, predictions, alpha=0.7, color='green', edgecolors='black')
plt.show()
In [ ]:
print ols.results.params
For $n = 500$, the best $R^{2}$ value witnessed was $0.85$ (with the best $r$ value seen at $0.92$).
This $R^{2}$ value means that $85\%$ of the proportion of total variation in the response variable is explained by the least-squares regression line (i.e., model) that was created above.
It's better than guessing in the dark, but too much shouldn't be staked on its predictions:
In [ ]:
y_hat_test = ols.results.predict(ols.X_test)
residuals = (y_hat_test - ols.y_test)
y = pd.DataFrame(data={'y_test':ols.y_test,'y_hat_test':y_hat_test,'residuals':residuals}, columns=['y_test','y_hat_test','residuals'])
y['residuals'] = residuals
entries_diff = [1, 100, 1000]
for i in entries_diff:
proximity = ['close' if d < i else 'far' for d in residuals]
y['proximity'] = proximity
total = y['proximity'].count()
close = y[y['proximity'] == 'close']['proximity'].count()
print "residuals < {0} ( {1}% )".format(i, np.true_divide(close,total)*100)
print y['proximity'].describe()
print
As can be seen from the above, somewhat arbitrarily-selected, values, the number of close predictions is a little over $50\%$ when close is defined as a prediction with a difference that is less than $1$ from the actual observed value. Given that the value of entries can take on such a large range of values $[0, 32814]$, differences less than $100$ and $1000$ are shown as well.
In [ ]:
plt.boxplot(residuals, vert=False)
plt.title('Boxplot of Residuals')
plt.xlabel('residuals')
plt.show()
In [ ]:
plt.scatter(y_hat_test,residuals, alpha=0.7, color='green', edgecolors='black')
plt.title('RESIDUAL PLOT')
plt.plot([np.min(y_hat_test),np.max(y_hat_test)], [0, 0], color='red')
plt.xlabel('predictions')
plt.ylabel('residuals')
plt.show()
Since the above predictions show a discernible, linear, and increasing pattern (and, thus, are not stochastic), it seems apparent that there is in fact not a linear relationship between the explanatory and response variables. Thus, a linear model is not appropriate for the current data set.