In [393]:
import operator
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats as st
import statsmodels.api as sm
import scipy.optimize as op
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt
%matplotlib inline
filename = '/Users/excalibur/py/nanodegree/intro_ds/final_project/improved-dataset/turnstile_weather_v2.csv'
# import data
data = pd.read_csv(filename)
In [251]:
def describe_variables(features, X, y):
    # summary statistics for each explanatory variable
    for i, x in enumerate(features):
        size, min_max, mean, var, skew, kurt = st.describe(X[:, i])
        med = np.median(X[:, i])
        std = np.std(X[:, i])
        print "x{0} ({1}):\n min = {2}, max = {3},\n mean = {4}, median = {5}, var = {6}, std = {7}".format(i, x, min_max[0], min_max[1], np.round(mean, decimals=2), med, np.round(var, decimals=2), np.round(std, decimals=2))
    # summary statistics for the response variable
    size, min_max, mean, var, skew, kurt = st.describe(y)
    med = np.median(y)
    std = np.std(y)
    print "y (ENTRIESn_hourly):\n min = {0}, max = {1},\n mean = {2}, median = {3}, var = {4}, std = {5}".format(min_max[0], min_max[1], np.round(mean, decimals=2), med, np.round(var, decimals=2), np.round(std, decimals=2))
In [252]:
print "Shape of data:" +str(data.shape)
N = data.shape[0]
print "N = " + str(N)
print "0.05 * N = " + str(0.05 * N)
In [253]:
class CreateDataSample:

    quantitative_features = ['hour', 'day_week', 'latitude', 'longitude', 'rain', 'tempi']
    categorical_features = ['UNIT', 'DATEn', 'station']

    def __init__(self, n):
        # simple random sample of n rows, drawn without replacement
        self.n = n
        self.simple_random_sample_row_indices = np.random.choice(data['ENTRIESn_hourly'].index.values, size=n, replace=False)
        self.sample_df = data.loc[self.simple_random_sample_row_indices]
        self.X = self.sample_df[self.quantitative_features].values
        self.y = self.sample_df['ENTRIESn_hourly'].values

    def add_categorical_dummies(self, categorical_features):
        # append dummy-coded columns for each categorical feature
        C = []
        for c in categorical_features:
            if categorical_features.index(c) == 0:
                C = sm.categorical(self.sample_df[c].values, drop=True)
            else:
                C = np.hstack((C, sm.categorical(self.sample_df[c].values, drop=True)))
        self.X = np.hstack((self.X, C))
In [447]:
class learn:

    def __init__(self, sample_sizes, method, test_training_data):
        self.sample_sizes = sample_sizes
        self.method = method
        self.test_training_data = test_training_data
        self.sample = []
        self.X_train = []
        self.y_train = []
        self.X_test = []
        self.y_test = []
        # for OLS
        self.model = []
        self.results = []
    def trial(self, display_results, compare):
        for sample_size in self.sample_sizes:
            self.sample = CreateDataSample(sample_size)
            self.sample.add_categorical_dummies(CreateDataSample.categorical_features)
            # training sample size
            n = self.sample.X.shape[0]
            # number of features
            num_of_features = self.sample.X.shape[1]
            # Feature Scaling
            # mean normalization (necessary for minimization not to return NaNs)
            x_i_bar = []
            s_i = []
            for i in np.arange(self.sample.X.shape[1]):
                x_i_bar.append(np.mean(self.sample.X[:, i]))
                s_i.append(np.std(self.sample.X[:, i]))
                self.sample.X[:, i] = np.true_divide(np.subtract(self.sample.X[:, i], x_i_bar[i]), s_i[i])
            # final design matrix
            self.sample.X = sm.add_constant(self.sample.X)
            # theta parameters
            theta = np.zeros((num_of_features + 1, 1))
            # randomly selected row indices to gather samples
            indices = [i for i in np.arange(n)]
            random_indices = np.random.choice(indices, size=(sample_size / 2), replace=False)
            # select samples
            self.X_train = self.sample.X[random_indices]
            self.y_train = self.sample.y[random_indices]
            # depending on method, implement method
            if self.method == 'OLS':
                model = sm.OLS(self.y_train, self.X_train)
                results = model.fit()
                #results = model.fit_regularized()
                self.model = model
                self.results = results
            # scikit-learn Linear Regression
            elif self.method == 'LinReg':
                regr = linear_model.LinearRegression()
                regr.fit(self.X_train, self.y_train)
            # scikit-learn Ridge Regression
            elif self.method == 'Ridge':
                clf = Ridge(alpha=1.0)
                clf.fit(self.X_train, self.y_train)
            elif self.method == 'BFGS':
                # cost function
                def J(theta):
                    return (1.0 / (2 * n)) * ((self.X_train.dot(theta) - self.y_train).T).dot(self.X_train.dot(theta) - self.y_train)
                # gradient estimated
                solution = op.minimize(J, theta)
                theta = solution.x
            elif self.method == 'NormEq':
                theta = np.linalg.pinv(self.X_train).dot(self.y_train)
            elif self.method == 'compare':
                # OLS
                model = sm.OLS(self.y_train, self.X_train)
                results = model.fit()
                # LinReg
                regr = linear_model.LinearRegression()
                regr.fit(self.X_train, self.y_train)
                # Ridge
                clf = Ridge(alpha=1.0)
                clf.fit(self.X_train, self.y_train)
                # NormEq
                theta = np.linalg.pinv(self.X_train.T.dot(self.X_train)).dot(self.X_train.T).dot(self.y_train)
            # randomly selected row indices to gather test samples
            random_indices = np.random.choice(indices, size=(sample_size / 2), replace=False)
            if self.test_training_data == False:
                self.X_test = self.sample.X[random_indices]
                self.y_test = self.sample.y[random_indices]
            else:
                self.X_test = self.X_train
                self.y_test = self.y_train
            if display_results == True:
                print "\nn = {0}".format(n / 2)
                if self.method == 'OLS':
                    print "r = {0:.2f}".format(np.sqrt(results.rsquared))
                    print "R^2 = {0:.2f}".format(results.rsquared)
                elif self.method == 'LinReg':
                    print "r = {0:.2f}".format(np.sqrt(regr.score(self.X_test, self.y_test)))
                    print "R^2 = {0:.2f}".format(regr.score(self.X_test, self.y_test))
                elif self.method == 'Ridge':
                    print "r = {0:.2f}".format(np.sqrt(clf.score(self.X_test, self.y_test)))
                    print "R^2 = {0:.2f}".format(clf.score(self.X_test, self.y_test))
                # too slow!
                elif self.method == 'BFGS':
                    y_hat = self.X_test.dot(theta)
                    print "y_hat[:5] = {0}".format(np.round(y_hat[:5], 2))
                elif self.method == 'NormEq':
                    y_hat = self.X_test.dot(theta)
                    y_bar = np.mean(self.y_test)
                    total_variation = np.sum((self.y_test - y_bar)**2)
                    unexplained_variation = np.sum((self.y_test - y_hat)**2)
                    #explained_variation = np.sum((y_hat - y_bar)**2)
                    r_squared = 1 - np.true_divide(unexplained_variation, total_variation)
                    print "r = {0:.2f}".format(np.sqrt(r_squared))
                    print "R^2 = {0:.2f}".format(r_squared)
            if compare == True:
                print "\nn = {0}".format(n / 2)
                rsquared_values = {}
                r_values = {}
                rsquared_values['ols_R2'] = results.rsquared
                r_values['ols_r'] = np.sqrt(rsquared_values['ols_R2'])
                #rsquared_values['lin_reg_R2'] = regr.score(self.X_test, self.y_test)
                #r_values['lin_reg_r'] = np.sqrt(rsquared_values['lin_reg_R2'])
                rsquared_values['ridge_R2'] = clf.score(self.X_test, self.y_test)
                r_values['ridge_r'] = np.sqrt(rsquared_values['ridge_R2'])
                # normal equation
                y_hat = self.X_test.dot(theta)
                y_bar = np.mean(self.y_test)
                total_variation = np.sum((self.y_test - y_bar)**2)
                unexplained_variation = np.sum((self.y_test - y_hat)**2)
                rsquared_values['norm_eq_R2'] = 1 - np.true_divide(unexplained_variation, total_variation)
                r_values['norm_eq_r'] = np.sqrt(rsquared_values['norm_eq_R2'])
                # report the best-performing method by r and by R^2
                max_rsquared = max(rsquared_values.iteritems(), key=operator.itemgetter(1))
                max_r = max(r_values.iteritems(), key=operator.itemgetter(1))
                print "{0} = {1:.2f}".format(max_r[0], max_r[1])
                print "{0} = {1:.2f}".format(max_rsquared[0], max_rsquared[1])
After comparing a few different methods (Ordinary Least Squares [OLS] from \textit{StatsModels}, two regression estimators from \textit{scikit-learn}, the Broyden–Fletcher–Goldfarb–Shanno [BFGS] optimization algorithm via \textit{scipy.optimize}, and an algebraic normal-equations solution), OLS from \textit{StatsModels} was chosen because of its consistently higher $r$ and $R^{2}$ values across various test sample sizes.
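As a quick sanity check on that comparison, the following is a minimal sketch on synthetic data (not the turnstile sample): in exact arithmetic, \textit{StatsModels} OLS and the closed-form normal-equations solution $\theta = (X^{T}X)^{-1}X^{T}y$ fit on the same training matrix should return the same coefficients. It is also worth noting that in the compare branch of learn.trial above, the OLS $R^{2}$ comes from results.rsquared on the training rows, while the ridge and normal-equations values are computed on the separately drawn test rows.
In [ ]:
# minimal sketch (synthetic data, not the turnstile sample): statsmodels OLS
# coefficients and the closed-form normal-equations solution should agree
import numpy as np
import statsmodels.api as sm
np.random.seed(0)
X = sm.add_constant(np.random.rand(50, 3))          # intercept + 3 features
beta_true = np.array([2.0, 1.0, -3.0, 0.5])
y = X.dot(beta_true) + 0.1 * np.random.randn(50)    # small Gaussian noise
ols_params = sm.OLS(y, X).fit().params
norm_eq_params = np.linalg.pinv(X.T.dot(X)).dot(X.T).dot(y)
print np.allclose(ols_params, norm_eq_params)       # expected: True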
In [437]:
sample_sizes = [60, 200, 1000, 3000, 10000, 20000]
compare = learn(sample_sizes, method='compare', test_training_data=False)
compare.trial(display_results=False, compare=True)
In [511]:
sample_sizes = [1000]
ols = learn(sample_sizes, method='OLS', test_training_data=False)
ols.trial(display_results=True, compare=False)
Linear correlation coefficient
$-1 \leq r \leq 1$ : if $r = +1$, a perfect positive linear relation exists between the explanatory and response variables; if $r = -1$, a perfect negative linear relation exists between them.
Coefficient of determination
$0 \leq R^{2} \leq 1$ : if $R^{2} = 0$, the least-squares regression line has no explanatory value; if $R^{2} = 1$, the least-squares regression line explains $100\%$ of the variation in the response variable.
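To tie these definitions to the model above, here is a minimal sketch (assuming the ols trial in the previous cell has been run) that computes $R^{2} = 1 - \frac{SS_{res}}{SS_{tot}}$ on the test rows and takes $r$ as its square root:
In [ ]:
# minimal sketch: R^2 and r computed from their definitions on the test rows
# (on separately drawn test rows R^2 can be negative for a poor fit,
#  in which case r is undefined)
y_hat = ols.results.predict(ols.X_test)
total_variation = np.sum((ols.y_test - np.mean(ols.y_test)) ** 2)
unexplained_variation = np.sum((ols.y_test - y_hat) ** 2)
r_squared = 1 - np.true_divide(unexplained_variation, total_variation)
print "R^2 = {0:.2f}, r = {1:.2f}".format(r_squared, np.sqrt(r_squared))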
In [521]:
print ols.results.params
In [519]:
y_hat_test = ols.results.predict(ols.X_test)
diff = (y_hat_test - ols.y_test)
y = pd.DataFrame(data={'y_test': ols.y_test, 'y_hat_test': y_hat_test, 'diff': diff}, columns=['y_test', 'y_hat_test', 'diff'])
# label a prediction 'close' if it is within 50 entries of the observed value
proximity = ['close' if abs(d) < 50 else 'far' for d in diff]
y['proximity'] = proximity
y['proximity'].describe()
Out[519]:
Through these statistical tests, assuming they have been implemented and interpreted correctly, it seems clear that much of the computational power in the world of big data lies in the ability to reproduce results from samples, not merely in using large data sets. As this relates to big data and to more complex and accurate modeling, significantly larger data sets may well have more power, but they would gain it largely through their ever-increasing knowledge base rather than through their $model$, which is, in many ways, how human brains seem to work.