In [17]:
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats as st
import statsmodels.api as sm
import scipy.optimize as op
from sklearn.cross_validation import train_test_split  # moved to sklearn.model_selection in newer scikit-learn
import matplotlib.pyplot as plt
%matplotlib inline
filename = '/Users/excalibur/py/nanodegree/intro_ds/final_project/improved-dataset/turnstile_weather_v2.csv'
# import data
data = pd.read_csv(filename)
print data.columns.values
In [18]:
data['ENTRIESn_hourly'].describe()
Out[18]:
In [19]:
plt.boxplot(data['ENTRIESn_hourly'], vert=False)
plt.show()
In [20]:
data[data['ENTRIESn_hourly'] == 0].count()[0]
Out[20]:
In [21]:
data[data['ENTRIESn_hourly'] > 500].count()[0]
Out[21]:
In [22]:
data[data['ENTRIESn_hourly'] > 1000].count()[0]
Out[22]:
In [23]:
data[data['ENTRIESn_hourly'] > 5000].count()[0]
Out[23]:
In [24]:
data[data['ENTRIESn_hourly'] > 10000].count()[0]
Out[24]:
In [25]:
plt.figure(figsize = (10,10))
plt.hist(data['ENTRIESn_hourly'], bins=100)
plt.show()
In [26]:
plt.boxplot(data['ENTRIESn_hourly'], vert=False)
plt.show()
In [27]:
# the overwhelming majority of the action is occurring below 10000;
# this filter was tried but ultimately left disabled
#data = data[(data['ENTRIESn_hourly'] <= 10000)]
In [28]:
plt.figure(figsize = (10,10))
plt.hist(data['ENTRIESn_hourly'].values, bins=100)
plt.show()
In [29]:
plt.boxplot(data['ENTRIESn_hourly'].values, vert=False)
plt.show()
In [30]:
class SampleCreator:
    def __init__(self, data, categorical_features, quantitative_features):
        # response vector
        y = data['ENTRIESn_hourly'].values
        # quantitative features, cast to float so the in-place scaling
        # below doesn't truncate integer-typed columns
        X = data[quantitative_features].values.astype(float)
        # feature scaling: mean normalization
        x_i_bar = []
        s_i = []
        for i in np.arange(X.shape[1]):
            x_i_bar.append(np.mean(X[:, i]))
            s_i.append(np.std(X[:, i]))
            X[:, i] = np.true_divide(np.subtract(X[:, i], x_i_bar[i]), s_i[i])
        # create dummy variables for categorical features
        for feature in categorical_features:
            dummies = sm.categorical(data[feature].values, drop=True)
            X = np.hstack((X, dummies))
        # final design matrix
        X = sm.add_constant(X)
        # hold out 20% of the data for testing; no fixed random_state,
        # so repeated trials draw different train/test splits
        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(X, y, test_size=0.2)
After comparing a few different methods (Ordinary Least Squares [OLS] from StatsModels, two different regression techniques from scikit-learn, the Broyden–Fletcher–Goldfarb–Shanno [BFGS] optimization algorithm from scipy.optimize, and an algebraic Normal Equations approach), OLS from StatsModels was chosen due to its consistently higher $r$ and $R^{2}$ values (see notes 1 and 2 below) across various test sample sizes ($n=\{30,100,500,1500,5000,10000\}$).
1 The linear correlation coefficient ($r$) can take on the following values: $-1 \leq r \leq 1$. If $r = +1$, then a perfect positive linear relation exists between the explanatory and response variables. If $r = -1$, then a perfect negative linear relation exists between the explanatory and response variables.
2 The coefficient of determination ($R^{2}$) can take on the following values: $0 \leq R^{2} \leq 1$. If $R^{2} = 0$, the least-squares regression line has no explanatory value; if $R^{2} = 1$, the least-squares regression line explains $100\%$ of the variation in the response variable.
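To make the two statistics concrete, here is a minimal sketch of how each is computed from observed and predicted values (y_obs and y_hat are made-up vectors, not values from this dataset; for a least-squares fit, $r$ is recoverable as the square root of $R^{2}$, which is how it is computed later in this notebook):
y_obs = np.array([3.0, 5.0, 7.0, 10.0])   # hypothetical observed values
y_hat = np.array([2.8, 5.2, 7.1, 9.9])    # hypothetical predictions
ss_res = np.sum((y_obs - y_hat) ** 2)           # residual sum of squares
ss_tot = np.sum((y_obs - np.mean(y_obs)) ** 2)  # total sum of squares
r_squared = 1 - ss_res / ss_tot
print "R^2 = {0:.2f}".format(r_squared)
print "r = {0:.2f}".format(np.sqrt(r_squared))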
Quantitative features used: 'rain', 'tempi'.
Categorical features used: 'UNIT', 'hour', 'day_week', 'station'. As categorical features, these variables required the use of so-called dummy variables.
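For instance, `sm.categorical` (the call used inside `SampleCreator` above) expands a vector of labels into one indicator column per level; a minimal sketch with made-up unit labels:
labels = np.array(['R001', 'R002', 'R001', 'R003'])  # hypothetical UNIT labels
dummies = sm.categorical(labels, drop=True)  # drop=True omits the original column
print dummies
# each row contains a 1 in the column for its level and 0s elsewhere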
Due to the findings presented in the *DataExploration* supplement, it seemed clear that location significantly impacted the number of entries; the hour and day of the week also showed importance. Temperature appeared to have some relationship with entries as well, so it was included. Based on that exploration and on the statistical and practical evidence offered in Section 1 (Statistical Test), rain was expected to contribute little, and a number of test runs bore that out: its importance was marginal at best.
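A minimal sketch of the kind of check that evidence rests on, comparing hourly entries on rainy versus non-rainy observations with a Mann-Whitney U test (note that older SciPy versions return a one-sided p-value):
# compare the distributions of hourly entries with and without rain
rain_entries = data[data['rain'] == 1]['ENTRIESn_hourly']
no_rain_entries = data[data['rain'] == 0]['ENTRIESn_hourly']
u_stat, p_value = st.mannwhitneyu(rain_entries, no_rain_entries)
print "U = {0}, p = {1}".format(u_stat, p_value)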
As for the location and day/time variables: station location can be captured quantitatively by latitude and longitude, which, as numeric values, might be expected to give a better sense of geographic trend. However, as witnessed by numerous test runs, latitude and longitude turn out to be redundant once UNIT is used as a feature, and UNIT is in fact the more significant predictor (plausibly due, for example, to station layouts, where some UNITs see more use than others). One way to check that redundancy is sketched below.
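A sketch of that comparison, reusing `SampleCreator` and assuming the dataset's latitude and longitude columns (each call draws its own train/test split, so the two $R^{2}$ values are only roughly comparable):
# fit once with latitude/longitude alongside UNIT dummies, once without
with_latlon = SampleCreator(data, ['UNIT'], ['latitude', 'longitude', 'tempi'])
without_latlon = SampleCreator(data, ['UNIT'], ['tempi'])
r2_with = sm.OLS(with_latlon.y_train, with_latlon.X_train).fit().rsquared
r2_without = sm.OLS(without_latlon.y_train, without_latlon.X_train).fit().rsquared
print "R^2 with lat/lon: {0:.3f}".format(r2_with)
print "R^2 without lat/lon: {0:.3f}".format(r2_without)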
Each DATEn is a one-off, so it's unclear how any particular date could help with modeling or prediction (those exact dates literally never occur again); day_week seemed the better selection in this case.
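(Had day_week not been provided, a recurring feature could still be derived from the one-off dates; a minimal sketch, assuming DATEn parses as a date and that day_week follows the Monday=0 convention:)
# derive a day-of-week feature from the one-off DATEn values
derived_day_week = pd.to_datetime(data['DATEn']).apply(lambda d: d.dayofweek)
print derived_day_week.head()  # 0 = Monday, ..., 6 = Sunday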
In [48]:
categorical_features = ['UNIT', 'hour', 'day_week', 'station']
#categorical_features = ['UNIT']
quantitative_features = ['rain', 'tempi']
#quantitative_features = []
# for tracking during trials
best_rsquared = 0
best_results = None
best_samples = None
# perform 5 trials; keep the model (and its train/test split) with the best R^2
for x in xrange(0, 5):
    samples = SampleCreator(data, categorical_features, quantitative_features)
    model = sm.OLS(samples.y_train, samples.X_train)
    results = model.fit()
    if results.rsquared > best_rsquared:
        best_rsquared = results.rsquared
        best_results = results
        best_samples = samples
print "r = {0:.2f}".format(np.sqrt(best_results.rsquared))
print "R^2 = {0:.2f}".format(best_results.rsquared)
In [49]:
X_train = best_samples.X_train
print X_train.shape
y_train = best_samples.y_train
print y_train.shape
y_train.shape = (y_train.shape[0], 1)
print y_train.shape
X_test = best_samples.X_test
print X_test.shape
y_test = best_samples.y_test
print y_test.shape
y_test.shape = (y_test.shape[0], 1)
print y_test.shape
In [50]:
ols_y_hat = best_results.predict(X_test)
ols_y_hat.shape = (ols_y_hat.shape[0],1)
plt.title('Observed Values vs Fitted Predictions')
plt.xlabel('observed values')
plt.ylabel('predictions')
plt.scatter(y_test, ols_y_hat, alpha=0.7, color='green', edgecolors='black')
plt.show()
In [51]:
print best_results.params
For $n = 500$, the best $R^{2}$ value witnessed was $0.85$ (with the best $r$ value seen at $0.92$).
This $R^{2}$ value means that $85\%$ of the total variation in the response variable is explained by the least-squares regression model created above.
It's better than guessing in the dark, but too much shouldn't be staked on its predictions.
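One way to ground that caution is to compare the model's root-mean-square error against a naive baseline that always predicts the mean of the training entries (a minimal sketch, not a cell from the original run):
# RMSE of the model versus a predict-the-mean baseline
model_rmse = np.sqrt(np.mean((ols_y_hat - y_test) ** 2))
baseline_rmse = np.sqrt(np.mean((np.mean(y_train) - y_test) ** 2))
print "model RMSE: {0:.2f}".format(model_rmse)
print "baseline RMSE: {0:.2f}".format(baseline_rmse)
The residuals give a more direct picture: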
In [52]:
ols_residuals = (ols_y_hat - y_test)
ols_residuals.shape
Out[52]:
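The cell that tallied 'close' predictions is not shown in this extract; a minimal sketch of the computation the next paragraph describes:
abs_diff = np.abs(ols_residuals)
for threshold in [1, 100, 1000]:
    # fraction of predictions within `threshold` of the observed value
    print "within {0}: {1:.2%}".format(threshold, np.mean(abs_diff < threshold))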
As can be seen from the above, somewhat arbitrarily selected, threshold values, the number of close predictions is a little over $50\%$ when close is defined as a prediction differing by less than $1$ from the actual observed value. Given that entries can take on such a large range of values $[0, 32814]$, differences of less than $100$ and $1000$ are shown as well.
In [53]:
plt.boxplot(ols_residuals, vert=False)
plt.title('Boxplot of Residuals')
plt.xlabel('residuals')
plt.show()
In [54]:
plt.scatter(ols_y_hat,ols_residuals, alpha=0.7, color='purple', edgecolors='black')
plt.title('RESIDUAL PLOT')
plt.plot([np.min(ols_y_hat),np.max(ols_y_hat)], [0, 0], color='red')
plt.xlabel('predictions')
plt.ylabel('residuals')
plt.show()
In [55]:
plt.hist(y_test, color='purple', alpha=0.7, label='observations')
plt.hist(ols_y_hat, color='green', alpha=0.5, bins=6, label='ols predictions')
plt.title('OBSERVATIONS vs OLS PREDICTIONS')
plt.ylabel('frequency')
plt.legend()
plt.show()
In [56]:
plt.hist(ols_residuals, color='gray', alpha=0.7)
plt.title('OLS RESIDUALS')
plt.ylabel('frequency')
plt.show()
Since the residuals above show a discernible, linear, and increasing pattern (rather than the random scatter a well-specified model would leave), it seems apparent that there is in fact not a linear relationship between the explanatory and response variables. Thus, a linear model is not appropriate for the current data set.
In [57]:
best_results.summary()
Out[57]: