ADS PublicSpace-Citibike Project
a. Test whether the data follow a normal distribution using a Q-Q (quantile-quantile) plot
b. Normalize the data ((x - mean) / std); split both the original and normalized datasets into training and testing sets
a. Bivariate regression on each regressor; evaluate R-squared
b. Multivariate linear regression on all regressors; evaluate coefficients using p-values, R-squared, and the residual sum of squares
c. Ridge regression (4-fold CV); evaluate coefficients using p-values, R-squared, and the residual sum of squares
d. Lasso regression (4-fold CV); evaluate coefficients using p-values, R-squared, and the residual sum of squares
a*. OLS regression model with polynomial regressors (x)
b*. OLS regression model with transformed y
In [2]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
from scipy import stats
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
master = pd.read_csv('../data/processed/master.csv')
In [4]:
master.head()
Out[4]:
In [5]:
master.bike_lane_score.unique()
Out[5]:
In [6]:
master.shape
Out[6]:
In [33]:
# Create a new data frame with station_id, monthly ridership for June-October 2015, and all study features
df0 = master[['station_id','ridership_0615','ridership_0715','ridership_0815','ridership_0915','ridership_1015',\
'bike_lane_score', 'park', 'street_quality_score', 'subway_entrance', 'tree_score', 'traffic_volume',\
'median_hh_income', 'pop_density']]
In [34]:
df0.head(3)
Out[34]:
In [35]:
df0.subway_entrance.unique()
Out[35]:
A Q-Q (quantile-quantile) plot compares the quantiles of x against the quantiles/ppf of a specified theoretical distribution (the normal distribution by default). scipy's probplot generates this probability plot of the sample data and optionally calculates a best-fit line for it.
--- R-squared of the data being normally distributed ---
'traffic_volume': R_2 = 0.71
'median_hh_income': R_2 = 0.81
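The R-squared values above come from probplot's best-fit line; a minimal sketch (assuming the df0 columns above) of how to obtain them programmatically:
In [ ]:
# Sketch (not in the original notebook): with fit=True (the default),
# stats.probplot returns (osm, osr), (slope, intercept, r), where r is the
# correlation between the ordered sample and the theoretical normal
# quantiles; r**2 is the R_2 reported above.
(osm, osr), (slope, intercept, r) = stats.probplot(df0['traffic_volume'].dropna(), dist="norm")
print("R_2 = {:.2f}".format(r**2))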
In [10]:
#sm.qqplot(df0.iloc[:, 6], fit=True, line='q', ax=None)  # column 6 is bike_lane_score
In [36]:
measurements = df0.iloc[:, 13]  # column 13 is pop_density
stats.probplot(measurements, dist="norm", plot=plt)
Out[36]:
In [31]:
#df0 = df0.loc[:, ~df0.columns.isin(['park', 'subway_entrance'])]
#df0.head(3)
In [47]:
# Normalize the numeric (continuous-valued) columns
# using z-score standardization: (X - mean) / std
df0_n = df0.copy()
In [48]:
def normalize1(df1, df0, col):
    col_mean = df0[col].mean()
    col_std = df0[col].std()
    df1[col] = df0[col].apply(lambda c: (c - col_mean) / col_std)
    return col_mean, col_std
In [49]:
# Create a dictionary that stores the mean and std of each selected column's original values
col_n = ['ridership_0615', 'ridership_0715','ridership_0815', 'ridership_0915','ridership_1015',\
'bike_lane_score', 'street_quality_score', 'tree_score', 'traffic_volume', 'median_hh_income', 'pop_density']
col_meanstd = {}
for c in col_n:
    vmean, vstd = normalize1(df0_n, df0, c)
    col_meanstd[c] = np.array([vmean, vstd])
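For reference, the same z-score standardization could be done with scikit-learn's StandardScaler; a sketch, not part of the original pipeline (note that StandardScaler uses the population std, ddof=0, while pandas .std() uses ddof=1, so the values differ slightly):
In [ ]:
# Sketch (not used below): standardize the same columns with sklearn.
# scaler.mean_ and scaler.scale_ then play the role of col_meanstd.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df0_clean = df0.dropna(subset=col_n)  # StandardScaler does not accept NaNs
df0_sk = df0_clean.copy()
df0_sk[col_n] = scaler.fit_transform(df0_clean[col_n])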
In [50]:
df0_n.head()
Out[50]:
In [51]:
col_meanstd = pd.DataFrame(col_meanstd.items(), columns=['column', 'mean_std'])
In [52]:
col_meanstd
Out[52]:
In [55]:
# Use the 2015/09 ridership and the original factor data to run the regression
lm = smf.ols(formula = 'ridership_0915 ~ \
bike_lane_score + park + street_quality_score + subway_entrance + tree_score + traffic_volume + \
median_hh_income + pop_density', \
data = df0).fit()
In [56]:
lm.summary()
Out[56]:
In [66]:
# Drop NaNs -- normalized data
df0_nV = df0_n.dropna(subset=['ridership_0915', 'bike_lane_score', 'park', 'street_quality_score', 'subway_entrance', 'tree_score', 'traffic_volume',\
'median_hh_income', 'pop_density'])
df0_nV.shape
Out[66]:
In [67]:
# Drop NaNs -- original data
df0_V = df0.dropna(subset=['ridership_0915', 'bike_lane_score', 'park', 'street_quality_score', 'subway_entrance', 'tree_score', 'traffic_volume',\
'median_hh_income', 'pop_density'])
df0_V.shape
Out[67]:
In [104]:
######## Set training and testing sets on normalized data
X0n = df0_nV[['bike_lane_score', 'park', 'street_quality_score', 'subway_entrance', 'tree_score', 'traffic_volume',\
'median_hh_income', 'pop_density']]
y0n = df0_nV[['ridership_0915']]
In [105]:
X0n.shape
Out[105]:
In [106]:
# for standardized data: ~70/30 split (yields 300 training and 126 test rows)
X0n_train, X0n_test, y0n_train, y0n_test = train_test_split(X0n, y0n, test_size = 0.295, random_state = 222)
X0n_train.shape, X0n_test.shape, y0n_train.shape, y0n_test.shape
Out[106]:
In [107]:
# Stack X0n and y0n for convenience
Xyn_train = pd.concat([y0n_train, X0n_train], axis=1)
Xyn_test = pd.concat([y0n_test, X0n_test], axis=1)
Xyn_test.head(3)
Out[107]:
In [108]:
######## Set training and testing sets on original data
X0 = df0_V[['bike_lane_score', 'park', 'street_quality_score', 'subway_entrance', 'tree_score', 'traffic_volume',\
'median_hh_income', 'pop_density']]
y0 = df0_V[['ridership_0915']]
In [109]:
# for original data
X0_train, X0_test, y0_train, y0_test = train_test_split(X0, y0, test_size = 0.295, random_state = 222)
# Check the shapes of each
X0_train.shape, X0_test.shape, y0_train.shape, y0_test.shape
Out[109]:
In [110]:
# Stack X0 and y0 for convenience
Xy_train = pd.concat([y0_train, X0_train], axis=1)
Xy_test = pd.concat([y0_test, X0_test], axis=1)
Xy_train.head(3)
Out[110]:
In [141]:
publicspace_factor = ['bike_lane_score', 'park', 'street_quality_score', 'subway_entrance', 'tree_score', 'traffic_volume',\
'median_hh_income', 'pop_density']
dic_lm = {}   # for creating the summary table
dic_lm0 = {}  # for storing the fitted models
for i in range(0, 8):
    lm_ = smf.ols(formula = 'ridership_0915 ~ %s' % (publicspace_factor[i]), data = Xy_train).fit()
    dic_lm[publicspace_factor[i]] = [lm_.params[1], lm_.pvalues.values[1], \
                                     lm_.params[0], lm_.pvalues.values[0], \
                                     lm_.rsquared]
    dic_lm0[i] = lm_
df_lm = pd.DataFrame.from_dict(dic_lm)
df_lm.index = (['coefficient','coeff_p-value','intercept', 'intcp_p-value','r-squared'])
In [142]:
df_lm
Out[142]:
In [138]:
# Save the table as a csv file
#df_lm.to_csv('../reports/figures/ols-bivar-regre-table.csv')
In [152]:
# To better visualize how well the fitted lines match the data points, use the normalized data
publicspace_factor = ['bike_lane_score', 'park', 'street_quality_score', 'subway_entrance', 'tree_score', 'traffic_volume',\
'median_hh_income', 'pop_density']
dic_lm0_n = {}  # for storing the fitted models
for i in range(0, 8):
    lm_ = smf.ols(formula = 'ridership_0915 ~ %s' % (publicspace_factor[i]), data = Xyn_train).fit()
    dic_lm0_n[i] = lm_  # store the fitted model
In [280]:
# Plot the data points and fitted line for each factor (using normalized data)
fig = plt.figure(figsize=(12,20))
for i in range(0,8):
    plt.subplot(421+i)
    xfit = np.linspace(-2, 4, 200)
    yfit = xfit * dic_lm0_n[i].params[1] + dic_lm0_n[i].params[0]
    plt.plot(xfit, yfit, 'r',linewidth=2)
    plt.plot(Xyn_train[publicspace_factor[i]], Xyn_train.ridership_0915, '.')
    plt.title('Fig2a-{}: In-Sample points and OLS fitted Line for\n {}'.format(i+1, publicspace_factor[i]), fontsize=15)
    plt.xlabel('{}'.format(publicspace_factor[i]), fontsize=12)
    plt.ylabel('Normalized Ridership (2015/09)', fontsize=12)
plt.tight_layout(pad=3)
In [281]:
fig.savefig('../reports/figures/3.1-kz-OLS Bi-variate Regressions Figures.png')
In [276]:
# Run OLS multivariate regression on all eight factors (original training data)
lm0 = smf.ols(formula = 'ridership_0915 ~ \
bike_lane_score + park + street_quality_score + \
subway_entrance + tree_score + traffic_volume + \
median_hh_income + pop_density', \
data = Xy_train).fit()
In [277]:
lm0.summary()
Out[277]:
In [164]:
# Compute in-sample and out-of-sample RSS and R^2
y0_pred_IS = lm0.predict(X0_train)
err_IS = (y0_pred_IS - np.asarray(y0_train).T).T
RSS_IS = sum(err_IS**2)
R_2_IS = 1 - np.var(err_IS)/np.var(y0_train)
print("The in-sample residual sum of squares for OLS regression is: {0}\nThe in-sample R-squared is: {1}".format(RSS_IS, R_2_IS))
y0_pred_OS = lm0.predict(X0_test)  # compute predictions for the test set
err_OS = (y0_pred_OS - np.asarray(y0_test).T).T
RSS_OS = sum(err_OS**2)
R_2_OS = 1 - np.var(err_OS)/np.var(y0_test)
print("\nThe out-of-sample residual sum of squares for OLS regression is: {0}\nThe out-of-sample R-squared is: {1}".format(RSS_OS, R_2_OS))
In [251]:
# Plot residuals
fig = plt.figure(figsize=(12,4))
ax1 = fig.add_subplot(121, title='Fig2b-1: OLS Model In-Sample Residuals',\
                      xlabel='Citibike Station', ylabel='Ridership Residual')
ax1.plot(err_IS, '.')
ax1.plot([0,300], [0,0], 'r',linewidth=3)
ax2 = fig.add_subplot(122, title='Fig2b-2: OLS Model Out-of-Sample Residuals',\
                      xlabel='Citibike Station', ylabel='Ridership Residual')
ax2.plot(err_OS, '.')
ax2.plot([0,150], [0,0], 'r',linewidth=3)
fig.tight_layout(pad= 2)
Figure 2b-1, 2b-2: The residuals in both graphs are not evenly distributed on either side of the zero line. This means that our multivariate OLS model does not fit the data as well as we expected. It might be due to undetected outliers, or the correlation between ridership and the public space factors may simply be weak (a quick numerical check follows the save below).
In [252]:
fig.savefig('../reports/figures/3.1-kz-OLS-Multiv Regression IS & OS Residue Figures.png')
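To corroborate the visual impression, the in-sample residuals can be checked for normality directly; a minimal sketch (not in the original notebook), reusing probplot from above:
In [ ]:
# Sketch: Q-Q plot of the in-sample residuals against the normal distribution.
plt.figure(figsize=(5,4))
stats.probplot(np.asarray(err_IS).ravel(), dist="norm", plot=plt)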
In [262]:
# 4-fold cross-validation over the candidate lambdas below
# (note: normalize=True was removed in scikit-learn 1.2; see the pipeline sketch below)
alphas = [0.1, 1.0, 2.0, 5.0, 10.0, 20.0, 30.0, 50, 80, 1e2, 2*1e2, 5*1e2, 8*1e2, 1e3]
Ridge = linear_model.RidgeCV(alphas=alphas, fit_intercept=True, normalize=True, cv=4, store_cv_values=False)
lm_r = Ridge.fit(X0_train, y0_train)
In [263]:
#linear_model.RidgeCV?
In [264]:
print ('Selected Lambda by 4-fold cross validation:\n {0} \
\n\nRidge Model intercept: {1} \
\nRidge Model coefficients: {2}'
.format(lm_r.alpha_, lm_r.intercept_, lm_r.coef_))
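On scikit-learn >= 1.2, where normalize=True no longer exists, the closest replacement is standardizing inside a pipeline; a sketch (not exactly equivalent, since normalize=True divided by the l2-norm rather than the standard deviation):
In [ ]:
# Sketch for newer scikit-learn: scale the features, then run RidgeCV.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
ridge_pipe = make_pipeline(StandardScaler(), linear_model.RidgeCV(alphas=alphas, cv=4))
ridge_pipe.fit(X0_train, np.asarray(y0_train).ravel())
print(ridge_pipe[-1].alpha_)  # the selected lambda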
In [208]:
# Compute in-sample and out-of-sample RSS and R^2
y0_pred_IS_r = lm_r.predict(X0_train)
err_IS_r = y0_pred_IS_r - y0_train
RSS_IS_r = sum(err_IS_r**2)
R_2_IS_r = 1 - np.var(err_IS_r)/np.var(y0_train)
print("The residual sum of squares for IS Ridge is: {0}\nThe R-squared for IS Ridge is: {1}".format(RSS_IS_r.values, R_2_IS_r.values))
y0_pred_OS_r = lm_r.predict(X0_test)  # compute predictions for the test set
err_OS_r = y0_pred_OS_r - y0_test
RSS_OS_r = sum(err_OS_r**2)
R_2_OS_r = 1 - np.var(err_OS_r)/np.var(y0_test)
print("\nThe residual sum of squares for OS Ridge is: {0}\nThe R-squared for OS Ridge is: {1}".format(RSS_OS_r.values, R_2_OS_r.values))
In [257]:
# Plot residuals
fig = plt.figure(figsize=(12,4))
ax1 = fig.add_subplot(121, title='Fig2c-1: Ridge Model In-Sample Residuals',\
                      xlabel='Citibike Station', ylabel='Ridership Residual')
ax1.plot(err_IS_r, '.')
ax1.plot([0,450], [0,0], 'r',linewidth=3)
ax2 = fig.add_subplot(122, title='Fig2c-2: Ridge Model Out-of-Sample Residuals',\
                      xlabel='Citibike Station', ylabel='Ridership Residual')
ax2.plot(err_OS_r, '.')
ax2.plot([0,450], [0,0], 'r',linewidth=3)
fig.tight_layout(pad=3)
Figure 2c-1, 2c-2: The residuals in both graphs are not evenly distributed on either side of the zero line. This means that our Ridge model does not fit the data as well as we expected. It might be due to undetected outliers, or the correlation between ridership and the public space factors may simply be weak.
In [258]:
fig.savefig('../reports/figures/3.1-kz-Ridge Regression IS & OS Residue Figures.png')
In [198]:
# 4-fold cross-validation; with alphas=None, LassoCV chooses 20 candidate
# lambdas itself. positive=True constrains all coefficients to be non-negative.
LASSO = linear_model.LassoCV(n_alphas=20, alphas=None, \
fit_intercept=True, normalize=True, cv=4, positive=True, random_state=222, selection='random')
lm_l = LASSO.fit(X0_train, np.asarray(y0_train).ravel())
In [199]:
lm_l.alphas_
Out[199]:
In [200]:
lm_l.mse_path_
Out[200]:
In [201]:
print ('Selected Lambda by 4-fold cross validation:\n {0} \
\n\nLasso Model intercept: {1} \
\nLasso Model coefficients: {2}'
.format(lm_l.alpha_, lm_l.intercept_, lm_l.coef_))
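To see which lambda the 4-fold CV favored, the stored error path can be plotted; a sketch (not in the original notebook):
In [ ]:
# Sketch: mean 4-fold CV MSE versus each candidate lambda.
# lm_l.mse_path_ has shape (n_alphas, n_folds).
plt.semilogx(lm_l.alphas_, lm_l.mse_path_.mean(axis=1), '.-')
plt.axvline(lm_l.alpha_, color='r', linestyle='--', label='selected lambda')
plt.xlabel('lambda')
plt.ylabel('mean CV MSE')
plt.legend()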
In [279]:
# Compute in-sample and out-of-sample RSS and R^2
y0_pred_IS_l = lm_l.predict(X0_train)
y0_pred_IS_l = y0_pred_IS_l.reshape(-1, 1)  # reshape from (300,) to (300,1) to align with y0_train
err_IS_l = y0_pred_IS_l - y0_train
RSS_IS_l = (err_IS_l**2).sum()
R_2_IS_l = 1 - np.var(err_IS_l)/np.var(y0_train)
print("The residual sum of squares for IS Lasso is: {0}\nThe R-squared for IS Lasso is: {1}".format(RSS_IS_l.values, R_2_IS_l.values))
y0_pred_OS_l = lm_l.predict(X0_test)  # compute predictions for the test set
y0_pred_OS_l = y0_pred_OS_l.reshape(-1, 1)  # reshape from (126,) to (126,1)
err_OS_l = y0_pred_OS_l - y0_test
RSS_OS_l = (err_OS_l**2).sum()
R_2_OS_l = 1 - np.var(err_OS_l)/np.var(y0_test)
print("\nThe residual sum of squares for OS Lasso is: {0}\nThe R-squared for OS Lasso is: {1}".format(RSS_OS_l.values, R_2_OS_l.values))
In [260]:
# Plot residuals
fig = plt.figure(figsize=(12,4))
ax1 = fig.add_subplot(121, title='Fig2d-1: Lasso Model In-Sample Residuals',\
                      xlabel='Citibike Station', ylabel='Ridership Residual')
ax1.plot(err_IS_l, '.')
ax1.plot([0,450], [0,0], 'r',linewidth=3)
ax2 = fig.add_subplot(122, title='Fig2d-2: Lasso Model Out-of-Sample Residuals',\
                      xlabel='Citibike Station', ylabel='Ridership Residual')
ax2.plot(err_OS_l, '.')
ax2.plot([0,450], [0,0], 'r',linewidth=3)
fig.tight_layout(pad=3)
Figure 2d-1, 2d-2: The residuals in both graphs are not evenly distributed on either side of the zero line. This means that our Lasso model does not fit the data as well as we expected. It might be due to undetected outliers, or the correlation between ridership and the public space factors may simply be weak.
In [261]:
fig.savefig('../reports/figures/3.1-kz-LASSO Regression IS & OS Residue Figures.png')