In [393]:
    
import operator
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats as st
import statsmodels.api as sm
import scipy.optimize as op
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt
%matplotlib inline
filename = '/Users/excalibur/py/nanodegree/intro_ds/final_project/improved-dataset/turnstile_weather_v2.csv'
# import data
data = pd.read_csv(filename)
    
In [251]:
    
def describe_variables(features, X, y):
    
    # summary statistics for each explanatory variable
    for i, x in enumerate(features):
        size, min_max, mean, var, skew, kurt = st.describe(X[:, i])
        med = np.median(X[:, i])
        std = np.std(X[:, i])
        print "x{0} ({1}):\n  min = {2}, max = {3},\n  mean = {4}, median = {5}, var = {6}, std = {7}".format(i, x, min_max[0], min_max[1], np.round(mean, decimals=2), med, np.round(var, decimals=2), np.round(std, decimals=2))
        
    size, min_max, mean, var, skew, kurt = st.describe(y)
    med = np.median(y)
    std = np.std(y)
    print "y (ENTRIESn_hourly):\n  min = {0}, max = {1},\n  mean = {2}, median = {3}, var = {4}, std = {5}".format(min_max[0], min_max[1], np.round(mean, decimals=2), med, np.round(var, decimals=2), np.round(std, decimals=2))
    
In [252]:
    
print "Shape of data:" +str(data.shape)
N = data.shape[0]
print "N = " + str(N)
print "0.05 * N = " + str(0.05 * N)
    
    
In [253]:
    
class CreateDataSample:
    
    quantitative_features = ['hour','day_week','latitude','longitude','rain','tempi']
    categorical_features = ['UNIT','DATEn', 'station']
    def __init__(self,n):
        self.n = n
        self.simple_random_sample_row_indices = np.random.choice(data['ENTRIESn_hourly'].index.values, size=n, replace=False)
        self.sample_df = data.loc[self.simple_random_sample_row_indices]
        self.X = self.sample_df[self.quantitative_features].values
        self.y = self.sample_df['ENTRIESn_hourly'].values
    
    def add_categorical_dummies(self,categorical_features):
        C = []
        for c in categorical_features:
            if categorical_features.index(c) == 0:
                C = sm.categorical(self.sample_df[c].values, drop=True)
            else:
                C = np.hstack((C,sm.categorical(self.sample_df[c].values, drop=True)))
        self.X = np.hstack((self.X,C))
    
In [447]:
    
class learn:
    
    def __init__(self, sample_sizes, method, test_training_data):
        
        self.sample_sizes = sample_sizes
        self.method = method
        self.test_training_data = test_training_data
        
        self.sample = []
        self.X_train = []
        self.y_train = []
        self.X_test = []
        self.y_test = []
        
        # for OLS
        self.model = []
        self.results = []
    def trial(self, display_results, compare):
        for sample_size in self.sample_sizes:
            self.sample = CreateDataSample(sample_size)
            self.sample.add_categorical_dummies(self.sample.categorical_features)
            # training sample size
            n = self.sample.X.shape[0]
            
            # number of features
            num_of_features = self.sample.X.shape[1]
            
            # Feature Scaling
            # mean normalization (necessary for minimization not to return NaNs)
            x_i_bar = []
            s_i = []
            for i in np.arange(self.sample.X.shape[1]):
                x_i_bar.append(np.mean(self.sample.X[:,i]))
                s_i.append(np.std(self.sample.X[:,i]))
                self.sample.X[:,i] = np.true_divide((np.subtract(self.sample.X[:,i],x_i_bar[i])),s_i[i])
    
            # final design matrix
            self.sample.X = sm.add_constant(self.sample.X)
            
            # theta parameters (flat vector, so the dot products in J and op.minimize behave)
            theta = np.zeros(num_of_features + 1)
            # randomly selected row indices to gather training samples
            indices = np.arange(n)
            random_indices = np.random.choice(indices, size=(sample_size/2), replace=False)
            # select samples
            self.X_train = self.sample.X[random_indices]
            self.y_train = self.sample.y[random_indices]
            # depending on method, implement method
            if self.method == 'OLS':
                model = sm.OLS(self.y_train, self.X_train)
                results = model.fit()
                #results = model.fit_regularized()
                self.model = model
                self.results = results
                
            # scikit-learn Linear Regression
            elif self.method == 'LinReg':
                regr = linear_model.LinearRegression()
                regr.fit(self.X_train, self.y_train)
            # scikit-learn Ridge Regression
            elif self.method == 'Ridge':
                clf = Ridge(alpha=1.0)
                clf.fit(self.X_train, self.y_train)
                
            elif self.method == 'BFGS':
                # cost function
                def J(theta):
                    return (1.0/(2*n)) * (((self.X_train.dot(theta)) - self.y_train).T).dot(self.X_train.dot(theta) - self.y_train)
                # gradient estimated
                solution = op.minimize(J, theta)
                theta = solution.x
                
            elif self.method == 'NormEq':
                theta = (np.linalg.pinv(self.X_train).dot(self.y_train))
                
            elif self.method == 'compare':
                # OLS
                model = sm.OLS(self.y_train, self.X_train)
                results = model.fit()
                # LinReg
                regr = linear_model.LinearRegression()
                regr.fit(self.X_train, self.y_train)
                # Ridge
                clf = Ridge(alpha=1.0)
                clf.fit(self.X_train, self.y_train)
                # NormEq
                theta = (np.linalg.pinv(self.X_train.T.dot(self.X_train)).dot(self.X_train.T)).dot(self.y_train)
                
            random_indices = np.random.choice(indices, size=(sample_size/2), replace=False)
            if not self.test_training_data:
                self.X_test = self.sample.X[random_indices]
                self.y_test = self.sample.y[random_indices]
            else:
                self.X_test = self.X_train
                self.y_test = self.y_train
                
            if display_results:
                print "\nn = {0}".format(n/2)
                if self.method == 'OLS':
                    print "r = {0:.2f}".format(np.sqrt(results.rsquared))
                    print "R^2 = {0:.2f}".format(results.rsquared)
                    
                elif self.method == 'LinReg':
                    print "r = {0:.2f}".format(np.sqrt(regr.score(self.X_test, self.y_test)))
                    print "R^2 = {0:.2f}".format(regr.score(self.X_test, self.y_test))
                elif self.method == 'Ridge':
                    print "r = {0:.2f}".format(np.sqrt(clf.score(self.X_test, self.y_test)))
                    print "R^2 = {0:.2f}".format(clf.score(self.X_test, self.y_test))
                
                # too slow!
                elif self.method == 'BFGS':
                    y_hat = self.X_test.dot(theta)
                    print "mean y_hat = {0:.2f}".format(np.mean(y_hat))
                
                elif self.method == 'NormEq':
                    y_hat = self.X_test.dot(theta.T)
                    y_bar = np.mean(self.y_test)
                    
                    total_variation = np.sum((self.y_test - y_bar)**2)
                    unexplained_variation = np.sum((self.y_test - y_hat)**2)
                    #explained_variation = np.sum((y_hat - y_bar)**2)
                    
                    r_squared = (1 - (np.true_divide(unexplained_variation,total_variation)))
                    
                    print "r = {0:.2f}".format(np.sqrt(r_squared))
                    print "R^2 = {0:.2f}".format(r_squared)
                    
            if compare:
                print "\nn = {0}".format(n/2)
                
                rsquared_values = {}
                r_values = {}
                
                rsquared_values['ols_R2'] = results.rsquared
                r_values['ols_r'] = np.sqrt(rsquared_values['ols_R2'])
                
                #rsquared_values['lin_reg_R2'] = regr.score(self.X_test, self.y_test)    
                #r_values['lin_reg_r'] = np.sqrt(rsquared_values['lin_reg_R2'])
                
                rsquared_values['ridge_R2'] = clf.score(self.X_test, self.y_test)   
                r_values['ridge_r'] = np.sqrt(rsquared_values['ridge_R2'])
            
                # normal equation
                y_hat = self.X_test.dot(theta.T)
                y_bar = np.mean(self.y_test)
                total_variation = np.sum((self.y_test - y_bar)**2)
                unexplained_variation = np.sum((self.y_test - y_hat)**2)
                    
                rsquared_values['norm_eq_R2'] = (1 - (np.true_divide(unexplained_variation,total_variation)))
                r_values['norm_eq_r'] = np.sqrt(rsquared_values['norm_eq_R2'])
                
                max_rsquared = max(rsquared_values.iteritems(), key=operator.itemgetter(1))
                max_r = max(r_values.iteritems(), key=operator.itemgetter(1))
                
                print "{0} = {1:.2f}".format(max_r[0], max_r[1])
                print "{0} = {1:.2f}".format(max_rsquared[0], max_rsquared[1])
    
After comparing a few different methods (Ordinary Least Squares [OLS] from \textit{StatsModels}, two regression techniques from \textit{scikit-learn} [LinearRegression and Ridge], the Broyden–Fletcher–Goldfarb–Shanno [BFGS] optimization algorithm via \textit{scipy.optimize}, and a closed-form normal-equation solution), OLS from \textit{StatsModels} was chosen because it consistently produced the highest $r$ and $R^{2}$ values across the various test sample sizes.
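For reference, the BFGS and normal-equation branches above target the same least-squares objective: the cost function $J(\theta)$ coded in the trial method is minimized numerically by BFGS, while the normal-equation branch computes the closed-form solution (using NumPy's pseudo-inverse rather than an explicit matrix inverse):
$$J(\theta) = \frac{1}{2n}\,(X\theta - y)^{T}(X\theta - y), \qquad \hat{\theta} = \left(X^{T}X\right)^{-1}X^{T}y$$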
In [437]:
    
sample_sizes = [60, 200, 1000, 3000, 10000, 20000]
compare = learn(sample_sizes, method='compare', test_training_data=False)
compare.trial(display_results=False, compare=True)
    
    
In [511]:
    
sample_sizes = [1000]
ols = learn(sample_sizes, method='OLS', test_training_data=False)
ols.trial(display_results=True, compare=False)
    
    
Linear correlation coefficient
$-1 \leq r \leq 1$ : if $r = +1$, a perfect positive linear relation exists between the explanatory and response variables; if $r = -1$, a perfect negative linear relation exists between them.
Coefficient of determination
$0 \leq R^{2} \leq 1$ : if $R^{2} = 0$, the least-squares regression line has no explanatory value; if $R^{2} = 1$, the least-squares regression line explains $100\%$ of the variation in the response variable.
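Concretely, for the hand-rolled normal-equation fit above, $R^{2}$ is computed as one minus the ratio of unexplained to total variation, and the reported $r$ is its non-negative square root:
$$R^{2} = 1 - \frac{\sum_{i}\left(y_{i} - \hat{y}_{i}\right)^{2}}{\sum_{i}\left(y_{i} - \bar{y}\right)^{2}}, \qquad r = \sqrt{R^{2}}$$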
In [521]:
    
print ols.results.params
    
    
In [519]:
    
y_hat_test = ols.results.predict(ols.X_test)
diff = (y_hat_test - ols.y_test)
y = pd.DataFrame(data={'y_test':ols.y_test, 'y_hat_test':y_hat_test, 'diff':diff}, columns=['y_test','y_hat_test','diff'])
# label predictions within 50 entries of the observed value as 'close'
proximity = ['close' if abs(d) < 50 else 'far' for d in diff]
y['proximity'] = proximity
y['proximity'].describe()
    
Through these statistical tests, assuming they have been implemented and interpreted correctly, it seems clear that computational power in the world of big data lies in the ability to reproduce results, not merely in the use of large data sets. As this relates to big data and to more complex, accurate modeling, perhaps significantly larger data sets do have more power, but then they would have it simply by virtue of their ever-increasing knowledge base, and less because of their $model$, which is, in many ways, how human brains seem to work as well.