In [1]:
import pandas as pd
import matplotlib
import pylab as pl
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
%pylab inline


Populating the interactive namespace from numpy and matplotlib

Train merged data on first 300 rows of September rows and predict ridership of the remainder in September

Merged dataset

Calculate the regression in September

  • Did not remove any stations
  • Selected 300 stations for train

In [2]:
merged = pd.read_csv('../data/processed/master.csv')
merged_norm = pd.read_csv('../data/processed/master_norm.csv')

In [3]:
print(len(merged))


452

In [4]:
#dropping all months except September
merged = merged.drop(['ridership_0115', 'ridership_0215', 'ridership_0315', 'ridership_0415', 'ridership_0515', 'ridership_0615', 'ridership_0715', 'ridership_0815', 'ridership_1015', 'ridership_1115', 'ridership_1215'], axis=1)

In [5]:
merged.head()


Out[5]:
station_id ridership_0915 avg_ridership_2015 bike_lane_score park street_quality_score subway_entrance tree_score traffic_volume median_hh_income pop_density
0 72 3667.0 2131.615385 0.0 1.0 8.000000 0.0 17.364799 14870.500000 90174.000000 0.000807
1 79 3011.0 1760.538462 3.0 0.0 8.571429 1.0 9.573955 9484.666667 86523.139535 0.000631
2 82 1166.0 766.538462 0.0 1.0 7.333333 0.0 35.070325 16812.500000 73988.000000 0.000511
3 83 1505.0 863.307692 0.0 0.0 7.500000 0.0 0.000000 41976.000000 85199.000000 0.000231
4 116 6558.0 3576.692308 4.0 1.0 8.500000 0.0 47.824344 15948.000000 104974.000000 0.000742

In [6]:
#regress without dropping NA
results = smf.ols('ridership_0915 ~ bike_lane_score + park + street_quality_score + subway_entrance + tree_score + traffic_volume + median_hh_income + pop_density', data=merged).fit()
results.summary()


Out[6]:
OLS Regression Results
Dep. Variable: ridership_0915 R-squared: 0.224
Model: OLS Adj. R-squared: 0.209
Method: Least Squares F-statistic: 15.01
Date: Thu, 15 Dec 2016 Prob (F-statistic): 2.25e-19
Time: 09:26:15 Log-Likelihood: -3839.5
No. Observations: 426 AIC: 7697.
Df Residuals: 417 BIC: 7734.
Df Model: 8
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 152.8324 1092.630 0.140 0.889 -1994.917 2300.582
bike_lane_score 255.1303 58.344 4.373 0.000 140.445 369.816
park 55.3926 154.598 0.358 0.720 -248.496 359.281
street_quality_score -46.4576 139.690 -0.333 0.740 -321.043 228.127
subway_entrance 1074.3487 303.004 3.546 0.000 478.744 1669.954
tree_score -8.6051 4.769 -1.804 0.072 -17.979 0.769
traffic_volume 0.0011 0.008 0.144 0.885 -0.014 0.017
median_hh_income 0.0200 0.003 6.827 0.000 0.014 0.026
pop_density 1.602e+06 3.45e+05 4.646 0.000 9.24e+05 2.28e+06
Omnibus: 87.393 Durbin-Watson: 1.431
Prob(Omnibus): 0.000 Jarque-Bera (JB): 178.557
Skew: 1.097 Prob(JB): 1.69e-39
Kurtosis: 5.291 Cond. No. 3.32e+08

In [7]:
merged = merged.dropna()

In [8]:
print(len(merged))


426

In [9]:
#regress after dropping NA
results = smf.ols('ridership_0915 ~ bike_lane_score + park + street_quality_score + subway_entrance + tree_score + traffic_volume + median_hh_income + pop_density', data=merged).fit()
results.summary()


Out[9]:
OLS Regression Results
Dep. Variable: ridership_0915 R-squared: 0.224
Model: OLS Adj. R-squared: 0.209
Method: Least Squares F-statistic: 15.01
Date: Thu, 15 Dec 2016 Prob (F-statistic): 2.25e-19
Time: 09:26:15 Log-Likelihood: -3839.5
No. Observations: 426 AIC: 7697.
Df Residuals: 417 BIC: 7734.
Df Model: 8
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 152.8324 1092.630 0.140 0.889 -1994.917 2300.582
bike_lane_score 255.1303 58.344 4.373 0.000 140.445 369.816
park 55.3926 154.598 0.358 0.720 -248.496 359.281
street_quality_score -46.4576 139.690 -0.333 0.740 -321.043 228.127
subway_entrance 1074.3487 303.004 3.546 0.000 478.744 1669.954
tree_score -8.6051 4.769 -1.804 0.072 -17.979 0.769
traffic_volume 0.0011 0.008 0.144 0.885 -0.014 0.017
median_hh_income 0.0200 0.003 6.827 0.000 0.014 0.026
pop_density 1.602e+06 3.45e+05 4.646 0.000 9.24e+05 2.28e+06
Omnibus: 87.393 Durbin-Watson: 1.431
Prob(Omnibus): 0.000 Jarque-Bera (JB): 178.557
Skew: 1.097 Prob(JB): 1.69e-39
Kurtosis: 5.291 Cond. No. 3.32e+08

In [10]:
# cross-validation on droppedna
R_IS=[]
R_OS=[]
n=1000
for i in range(n):
    X_train, X_test, y_train, y_test = train_test_split(merged.iloc[:,[3, 4, 5, 6, 7, 8, 9, 10]], merged.iloc[:,1], train_size=300)
    res=LinearRegression(fit_intercept=False)
    res.fit(X_train,y_train)
    R_IS.append(1-((np.asarray(res.predict(X_train))-y_train)**2).sum()/((y_train-np.mean(y_train))**2).sum())                                                                     
    R_OS.append(1-((np.asarray(res.predict(X_test))-y_test)**2).sum()/((y_test-np.mean(y_test))**2).sum())
print("IS R-squared for {} times is {}".format(n,np.mean(R_IS)))
print("OS R-squared for {} times is {}".format(n,np.mean(R_OS)))


/Users/kristikorsberg/devel/venv/lib/python2.7/site-packages/scipy/linalg/basic.py:884: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  warnings.warn(mesg, RuntimeWarning)
IS R-squared for 1000 times is 0.228405486455
OS R-squared for 1000 times is 0.179018444373

In [11]:
#plot significant explanatory variables
#NA still dropped
results_bl = smf.ols('ridership_0915 ~ bike_lane_score', data=merged).fit()
pl.figure(figsize=(10,10))
pl.plot(merged['bike_lane_score'], merged['ridership_0915'], 'r.')
pl.plot(merged['bike_lane_score'], results_bl.predict(sm.add_constant(merged['bike_lane_score'])), '-')
pl.title('Figure 1:\nCiti Bike Ridership in September 2015 as a Function of Bike Lane Score')
pl.xlabel('Bike Lane Score')
pl.ylabel('Citi Bike Ridership')
pl.xlim(-.5,5.5)


Out[11]:
(-0.5, 5.5)

In [12]:
results_se = smf.ols('ridership_0915 ~ subway_entrance', data=merged).fit()
pl.figure(figsize=(10,10))
pl.plot(merged['subway_entrance'], merged['ridership_0915'], 'r.')
pl.plot(merged['subway_entrance'], results_se.predict(sm.add_constant(merged['subway_entrance'])), '-')
pl.title('Figure 2:\nCiti Bike Ridership in September 2015 as a Function of Subway Entrances')
pl.xlabel('Subway Entrance')
pl.ylabel('Citi Bike Ridership')
pl.xlim(-.5,1.5)


Out[12]:
(-0.5, 1.5)

In [13]:
results_hi = smf.ols('ridership_0915 ~ median_hh_income', data=merged).fit()
pl.figure(figsize=(10,10))
pl.plot(merged['median_hh_income'], merged['ridership_0915'], 'r.')
pl.plot(merged['median_hh_income'], results_hi.predict(sm.add_constant(merged['median_hh_income'])), '-')
pl.title('Figure 3:\nCiti Bike Ridership by Station in September 2015 as a Function of Median Household Income')
pl.xlabel('Median Household Income')
pl.ylabel('Citi Bike Ridership')
pl.xlim(25000,255000)


Out[13]:
(25000, 255000)

In [14]:
results_pd = smf.ols('ridership_0915 ~ pop_density', data=merged).fit()
pl.figure(figsize=(10,10))
pl.plot(merged['pop_density'], merged['ridership_0915'], 'r.')
pl.plot(merged['pop_density'], results_pd.predict(sm.add_constant(merged['pop_density'])), '-')
pl.title('Figure 4:\nCiti Bike Ridership in September 2015 as a Function of Population Density')
pl.xlabel('Population Density')
pl.ylabel('Citi Bike Ridership')
pl.xlim(-.0001,.0016)


Out[14]:
(-0.0001, 0.0016)

Captions for Figures 1 - 4

Each plot is a scatter represntation of a count of Citi Bike ridership from each active station in September 2015 as a function of the four statistically significant explanatory variables. Variables 'subway entrance' and 'park' are binary, with a zero indicating that there was no subway or park within the buffer zone of the Citi Bike station, or a one, indicating that these features were there.


In [15]:
#conduct analysis with isolated ridership for weekends in September
wknds = pd.read_csv('../data/processed/sept-wknds.csv')
wknds = wknds.drop('Unnamed: 0', axis=1)
wknds = wknds.rename(columns={'start station id': 'station_id', 'tripduration': 'sept_wkndcounts'})
wknds.head()


Out[15]:
station_id sept_wkndcounts
0 72 893
1 79 598
2 82 262
3 83 519
4 116 1212

In [16]:
merged = merged.drop(['avg_ridership_2015', 'ridership_0915'], axis=1)
wkndmrg = pd.merge(merged, wknds, on='station_id', how='outer')
wkndmrg = wkndmrg.dropna()
wkndmrg.head()


Out[16]:
station_id bike_lane_score park street_quality_score subway_entrance tree_score traffic_volume median_hh_income pop_density sept_wkndcounts
0 72 0.0 1.0 8.000000 0.0 17.364799 14870.500000 90174.000000 0.000807 893.0
1 79 3.0 0.0 8.571429 1.0 9.573955 9484.666667 86523.139535 0.000631 598.0
2 82 0.0 1.0 7.333333 0.0 35.070325 16812.500000 73988.000000 0.000511 262.0
3 83 0.0 0.0 7.500000 0.0 0.000000 41976.000000 85199.000000 0.000231 519.0
4 116 4.0 1.0 8.500000 0.0 47.824344 15948.000000 104974.000000 0.000742 1212.0

In [17]:
#regress after dropping NA
results_wknd = smf.ols('sept_wkndcounts ~ bike_lane_score + park + street_quality_score + subway_entrance + tree_score + traffic_volume + median_hh_income + pop_density', data=wkndmrg).fit()
results_wknd.summary()


Out[17]:
OLS Regression Results
Dep. Variable: sept_wkndcounts R-squared: 0.247
Model: OLS Adj. R-squared: 0.233
Method: Least Squares F-statistic: 16.92
Date: Thu, 15 Dec 2016 Prob (F-statistic): 9.15e-22
Time: 09:26:23 Log-Likelihood: -3177.3
No. Observations: 421 AIC: 6373.
Df Residuals: 412 BIC: 6409.
Df Model: 8
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept -158.8738 256.688 -0.619 0.536 -663.455 345.708
bike_lane_score 74.9617 13.526 5.542 0.000 48.372 101.551
park 93.9931 35.773 2.627 0.009 23.673 164.313
street_quality_score 3.0267 32.770 0.092 0.926 -61.391 67.444
subway_entrance 180.2052 70.042 2.573 0.010 42.521 317.889
tree_score 0.0460 1.113 0.041 0.967 -2.142 2.234
traffic_volume -0.0008 0.002 -0.449 0.654 -0.004 0.003
median_hh_income 0.0047 0.001 6.872 0.000 0.003 0.006
pop_density 3.842e+05 8.03e+04 4.784 0.000 2.26e+05 5.42e+05
Omnibus: 99.072 Durbin-Watson: 1.709
Prob(Omnibus): 0.000 Jarque-Bera (JB): 219.606
Skew: 1.211 Prob(JB): 2.06e-48
Kurtosis: 5.580 Cond. No. 3.33e+08

In [18]:
#cross-validation
R_IS=[]
R_OS=[]
n=1000
for i in range(n):
    X_train, X_test, y_train, y_test = train_test_split(wkndmrg.iloc[:,[1, 2, 3, 4, 5, 6, 7, 8]], wkndmrg.iloc[:,9], train_size=300)
    res=LinearRegression(fit_intercept=False)
    res.fit(X_train,y_train)
    R_IS.append(1-((np.asarray(res.predict(X_train))-y_train)**2).sum()/((y_train-np.mean(y_train))**2).sum())                                                                     
    R_OS.append(1-((np.asarray(res.predict(X_test))-y_test)**2).sum()/((y_test-np.mean(y_test))**2).sum())
print("IS R-squared for {} times is {}".format(n,np.mean(R_IS)))
print("OS R-squared for {} times is {}".format(n,np.mean(R_OS)))


IS R-squared for 1000 times is 0.252101741816
OS R-squared for 1000 times is 0.192740812142

In [19]:
#plot significant explanatory variables
#NA still dropped
results_wknd_bl = smf.ols('sept_wkndcounts ~ bike_lane_score', data=wkndmrg).fit()
pl.figure(figsize=(10,10))
pl.plot(wkndmrg['bike_lane_score'], wkndmrg['sept_wkndcounts'], 'r.')
pl.plot(wkndmrg['bike_lane_score'], results_wknd_bl.predict(sm.add_constant(wkndmrg['bike_lane_score'])), '-')
pl.title('Figure 5:\nCiti Bike Ridership in September 2015 as a Function of Bike Lane Score')
pl.xlabel('Bike Lane Score')
pl.ylabel('Weekend Citi Bike Ridership')
pl.xlim(-.5,5.5)


Out[19]:
(-0.5, 5.5)

In [35]:
results_se = smf.ols('sept_wkndcounts ~ park', data=wkndmrg).fit()
pl.figure(figsize=(10,10))
pl.plot(wkndmrg['park'], wkndmrg['sept_wkndcounts'], 'r.')
pl.plot(wkndmrg['park'], results_se.predict(sm.add_constant(wkndmrg['park'])), '-')
pl.title('Figure 6:\nCiti Bike Ridership in September 2015 as a Function of the Proximity of Parks')
pl.xlabel('Park')
pl.ylabel('Citi Bike Ridership')
pl.xlim(-.1,4.1)


Out[35]:
(-0.1, 4.1)

In [27]:
results_se = smf.ols('sept_wkndcounts ~ subway_entrance', data=wkndmrg).fit()
pl.figure(figsize=(10,10))
pl.plot(wkndmrg['subway_entrance'], wkndmrg['sept_wkndcounts'], 'r.')
pl.plot(wkndmrg['subway_entrance'], results_se.predict(sm.add_constant(wkndmrg['subway_entrance'])), '-')
pl.title('Figure 7:\nCiti Bike Ridership in September 2015 as a Function of Subway Entrances')
pl.xlabel('Subway Entrance')
pl.ylabel('Citi Bike Ridership')
pl.xlim(-.5,1.5)


Out[27]:
(-0.5, 1.5)

In [32]:
results_wknd_bl = smf.ols('sept_wkndcounts ~ median_hh_income', data=wkndmrg).fit()
pl.figure(figsize=(10,10))
pl.plot(wkndmrg['median_hh_income'], wkndmrg['sept_wkndcounts'], 'r.')
pl.plot(wkndmrg['median_hh_income'], results_wknd_bl.predict(sm.add_constant(wkndmrg['median_hh_income'])), '-')
pl.title('Figure 8:\nCiti Bike Ridership in September 2015 as a Function of Median Household Income')
pl.xlabel('Median Household Income')
pl.ylabel('Weekend Citi Bike Ridership')
pl.xlim(25000,255000)


Out[32]:
(25000, 255000)

In [37]:
results_wknd_bl = smf.ols('sept_wkndcounts ~ pop_density', data=wkndmrg).fit()
pl.figure(figsize=(10,10))
pl.plot(wkndmrg['pop_density'], wkndmrg['sept_wkndcounts'], 'r.')
pl.plot(wkndmrg['pop_density'], results_wknd_bl.predict(sm.add_constant(wkndmrg['pop_density'])), '-')
pl.title('Figure 9:\nCiti Bike Ridership in September 2015 as a Function of Population Density')
pl.xlabel('Population Density')
pl.ylabel('Weekend Citi Bike Ridership')
pl.xlim(-.0001,.0016)


Out[37]:
(-0.0001, 0.0016)