In [1]:

    
import pandas as pd
import matplotlib
import pylab as pl
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib

Train merged data on first 300 rows of September rows and predict ridership of the remainder in September

Merged dataset

Calculate the regression in September

Did not remove any stations
Selected 300 stations for train



In [2]:

    
merged = pd.read_csv('../data/processed/master.csv')
merged_norm = pd.read_csv('../data/processed/master_norm.csv')



In [3]:

    
print(len(merged))



In [4]:

    
#dropping all months except September
merged = merged.drop(['ridership_0115', 'ridership_0215', 'ridership_0315', 'ridership_0415', 'ridership_0515', 'ridership_0615', 'ridership_0715', 'ridership_0815', 'ridership_1015', 'ridership_1115', 'ridership_1215'], axis=1)



In [5]:

    
merged.head()









    Out[5]:






  
    
      
      station_id
      ridership_0915
      avg_ridership_2015
      bike_lane_score
      park
      street_quality_score
      subway_entrance
      tree_score
      traffic_volume
      median_hh_income
      pop_density
    
  
  
    
      0
      72
      3667.0
      2131.615385
      0.0
      1.0
      8.000000
      0.0
      17.364799
      14870.500000
      90174.000000
      0.000807
    
    
      1
      79
      3011.0
      1760.538462
      3.0
      0.0
      8.571429
      1.0
      9.573955
      9484.666667
      86523.139535
      0.000631
    
    
      2
      82
      1166.0
      766.538462
      0.0
      1.0
      7.333333
      0.0
      35.070325
      16812.500000
      73988.000000
      0.000511
    
    
      3
      83
      1505.0
      863.307692
      0.0
      0.0
      7.500000
      0.0
      0.000000
      41976.000000
      85199.000000
      0.000231
    
    
      4
      116
      6558.0
      3576.692308
      4.0
      1.0
      8.500000
      0.0
      47.824344
      15948.000000
      104974.000000
      0.000742



In [6]:

    
#regress without dropping NA
results = smf.ols('ridership_0915 ~ bike_lane_score + park + street_quality_score + subway_entrance + tree_score + traffic_volume + median_hh_income + pop_density', data=merged).fit()
results.summary()









    Out[6]:





OLS Regression Results

  Dep. Variable:      ridership_0915     R-squared:             0.224


  Model:                    OLS          Adj. R-squared:        0.209


  Method:              Least Squares     F-statistic:           15.01


  Date:              Thu, 15 Dec 2016    Prob (F-statistic):  2.25e-19


  Time:                  09:26:15        Log-Likelihood:      -3839.5


  No. Observations:          426         AIC:                   7697.


  Df Residuals:              417         BIC:                   7734.


  Df Model:                    8                                     


  Covariance Type:       nonrobust                                   




                          coef      std err       t       P>|t|   [0.025     0.975]  


  Intercept               152.8324   1092.630      0.140   0.889  -1994.917   2300.582


  bike_lane_score         255.1303     58.344      4.373   0.000    140.445    369.816


  park                     55.3926    154.598      0.358   0.720   -248.496    359.281


  street_quality_score    -46.4576    139.690     -0.333   0.740   -321.043    228.127


  subway_entrance        1074.3487    303.004      3.546   0.000    478.744   1669.954


  tree_score               -8.6051      4.769     -1.804   0.072    -17.979      0.769


  traffic_volume            0.0011      0.008      0.144   0.885     -0.014      0.017


  median_hh_income          0.0200      0.003      6.827   0.000      0.014      0.026


  pop_density            1.602e+06   3.45e+05      4.646   0.000   9.24e+05   2.28e+06




  Omnibus:        87.393    Durbin-Watson:         1.431


  Prob(Omnibus):   0.000    Jarque-Bera (JB):    178.557


  Skew:            1.097    Prob(JB):           1.69e-39


  Kurtosis:        5.291    Cond. No.           3.32e+08



In [7]:

    
merged = merged.dropna()



In [8]:

    
print(len(merged))



In [9]:

    
#regress after dropping NA
results = smf.ols('ridership_0915 ~ bike_lane_score + park + street_quality_score + subway_entrance + tree_score + traffic_volume + median_hh_income + pop_density', data=merged).fit()
results.summary()









    Out[9]:





OLS Regression Results

  Dep. Variable:      ridership_0915     R-squared:             0.224


  Model:                    OLS          Adj. R-squared:        0.209


  Method:              Least Squares     F-statistic:           15.01


  Date:              Thu, 15 Dec 2016    Prob (F-statistic):  2.25e-19


  Time:                  09:26:15        Log-Likelihood:      -3839.5


  No. Observations:          426         AIC:                   7697.


  Df Residuals:              417         BIC:                   7734.


  Df Model:                    8                                     


  Covariance Type:       nonrobust                                   




                          coef      std err       t       P>|t|   [0.025     0.975]  


  Intercept               152.8324   1092.630      0.140   0.889  -1994.917   2300.582


  bike_lane_score         255.1303     58.344      4.373   0.000    140.445    369.816


  park                     55.3926    154.598      0.358   0.720   -248.496    359.281


  street_quality_score    -46.4576    139.690     -0.333   0.740   -321.043    228.127


  subway_entrance        1074.3487    303.004      3.546   0.000    478.744   1669.954


  tree_score               -8.6051      4.769     -1.804   0.072    -17.979      0.769


  traffic_volume            0.0011      0.008      0.144   0.885     -0.014      0.017


  median_hh_income          0.0200      0.003      6.827   0.000      0.014      0.026


  pop_density            1.602e+06   3.45e+05      4.646   0.000   9.24e+05   2.28e+06




  Omnibus:        87.393    Durbin-Watson:         1.431


  Prob(Omnibus):   0.000    Jarque-Bera (JB):    178.557


  Skew:            1.097    Prob(JB):           1.69e-39


  Kurtosis:        5.291    Cond. No.           3.32e+08



In [10]:

    
# cross-validation on droppedna
R_IS=[]
R_OS=[]
n=1000
for i in range(n):
    X_train, X_test, y_train, y_test = train_test_split(merged.iloc[:,[3, 4, 5, 6, 7, 8, 9, 10]], merged.iloc[:,1], train_size=300)
    res=LinearRegression(fit_intercept=False)
    res.fit(X_train,y_train)
    R_IS.append(1-((np.asarray(res.predict(X_train))-y_train)**2).sum()/((y_train-np.mean(y_train))**2).sum())                                                                     
    R_OS.append(1-((np.asarray(res.predict(X_test))-y_test)**2).sum()/((y_test-np.mean(y_test))**2).sum())
print("IS R-squared for {} times is {}".format(n,np.mean(R_IS)))
print("OS R-squared for {} times is {}".format(n,np.mean(R_OS)))









    



/Users/kristikorsberg/devel/venv/lib/python2.7/site-packages/scipy/linalg/basic.py:884: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  warnings.warn(mesg, RuntimeWarning)






    



IS R-squared for 1000 times is 0.228405486455
OS R-squared for 1000 times is 0.179018444373



In [11]:

    
#plot significant explanatory variables
#NA still dropped
results_bl = smf.ols('ridership_0915 ~ bike_lane_score', data=merged).fit()
pl.figure(figsize=(10,10))
pl.plot(merged['bike_lane_score'], merged['ridership_0915'], 'r.')
pl.plot(merged['bike_lane_score'], results_bl.predict(sm.add_constant(merged['bike_lane_score'])), '-')
pl.title('Figure 1:\nCiti Bike Ridership in September 2015 as a Function of Bike Lane Score')
pl.xlabel('Bike Lane Score')
pl.ylabel('Citi Bike Ridership')
pl.xlim(-.5,5.5)









    Out[11]:





(-0.5, 5.5)



In [12]:

    
results_se = smf.ols('ridership_0915 ~ subway_entrance', data=merged).fit()
pl.figure(figsize=(10,10))
pl.plot(merged['subway_entrance'], merged['ridership_0915'], 'r.')
pl.plot(merged['subway_entrance'], results_se.predict(sm.add_constant(merged['subway_entrance'])), '-')
pl.title('Figure 2:\nCiti Bike Ridership in September 2015 as a Function of Subway Entrances')
pl.xlabel('Subway Entrance')
pl.ylabel('Citi Bike Ridership')
pl.xlim(-.5,1.5)









    Out[12]:





(-0.5, 1.5)



In [13]:

    
results_hi = smf.ols('ridership_0915 ~ median_hh_income', data=merged).fit()
pl.figure(figsize=(10,10))
pl.plot(merged['median_hh_income'], merged['ridership_0915'], 'r.')
pl.plot(merged['median_hh_income'], results_hi.predict(sm.add_constant(merged['median_hh_income'])), '-')
pl.title('Figure 3:\nCiti Bike Ridership by Station in September 2015 as a Function of Median Household Income')
pl.xlabel('Median Household Income')
pl.ylabel('Citi Bike Ridership')
pl.xlim(25000,255000)









    Out[13]:





(25000, 255000)



In [14]:

    
results_pd = smf.ols('ridership_0915 ~ pop_density', data=merged).fit()
pl.figure(figsize=(10,10))
pl.plot(merged['pop_density'], merged['ridership_0915'], 'r.')
pl.plot(merged['pop_density'], results_pd.predict(sm.add_constant(merged['pop_density'])), '-')
pl.title('Figure 4:\nCiti Bike Ridership in September 2015 as a Function of Population Density')
pl.xlabel('Population Density')
pl.ylabel('Citi Bike Ridership')
pl.xlim(-.0001,.0016)









    Out[14]:





(-0.0001, 0.0016)

Captions for Figures 1 - 4

Each plot is a scatter represntation of a count of Citi Bike ridership from each active station in September 2015 as a function of the four statistically significant explanatory variables. Variables 'subway entrance' and 'park' are binary, with a zero indicating that there was no subway or park within the buffer zone of the Citi Bike station, or a one, indicating that these features were there.



In [15]:

    
#conduct analysis with isolated ridership for weekends in September
wknds = pd.read_csv('../data/processed/sept-wknds.csv')
wknds = wknds.drop('Unnamed: 0', axis=1)
wknds = wknds.rename(columns={'start station id': 'station_id', 'tripduration': 'sept_wkndcounts'})
wknds.head()









    Out[15]:






  
    
      
      station_id
      sept_wkndcounts
    
  
  
    
      0
      72
      893
    
    
      1
      79
      598
    
    
      2
      82
      262
    
    
      3
      83
      519
    
    
      4
      116
      1212



In [16]:

    
merged = merged.drop(['avg_ridership_2015', 'ridership_0915'], axis=1)
wkndmrg = pd.merge(merged, wknds, on='station_id', how='outer')
wkndmrg = wkndmrg.dropna()
wkndmrg.head()









    Out[16]:






  
    
      
      station_id
      bike_lane_score
      park
      street_quality_score
      subway_entrance
      tree_score
      traffic_volume
      median_hh_income
      pop_density
      sept_wkndcounts
    
  
  
    
      0
      72
      0.0
      1.0
      8.000000
      0.0
      17.364799
      14870.500000
      90174.000000
      0.000807
      893.0
    
    
      1
      79
      3.0
      0.0
      8.571429
      1.0
      9.573955
      9484.666667
      86523.139535
      0.000631
      598.0
    
    
      2
      82
      0.0
      1.0
      7.333333
      0.0
      35.070325
      16812.500000
      73988.000000
      0.000511
      262.0
    
    
      3
      83
      0.0
      0.0
      7.500000
      0.0
      0.000000
      41976.000000
      85199.000000
      0.000231
      519.0
    
    
      4
      116
      4.0
      1.0
      8.500000
      0.0
      47.824344
      15948.000000
      104974.000000
      0.000742
      1212.0



In [17]:

    
#regress after dropping NA
results_wknd = smf.ols('sept_wkndcounts ~ bike_lane_score + park + street_quality_score + subway_entrance + tree_score + traffic_volume + median_hh_income + pop_density', data=wkndmrg).fit()
results_wknd.summary()









    Out[17]:





OLS Regression Results

  Dep. Variable:      sept_wkndcounts    R-squared:             0.247


  Model:                    OLS          Adj. R-squared:        0.233


  Method:              Least Squares     F-statistic:           16.92


  Date:              Thu, 15 Dec 2016    Prob (F-statistic):  9.15e-22


  Time:                  09:26:23        Log-Likelihood:      -3177.3


  No. Observations:          421         AIC:                   6373.


  Df Residuals:              412         BIC:                   6409.


  Df Model:                    8                                     


  Covariance Type:       nonrobust                                   




                          coef      std err       t       P>|t|   [0.025     0.975]  


  Intercept              -158.8738    256.688     -0.619   0.536   -663.455    345.708


  bike_lane_score          74.9617     13.526      5.542   0.000     48.372    101.551


  park                     93.9931     35.773      2.627   0.009     23.673    164.313


  street_quality_score      3.0267     32.770      0.092   0.926    -61.391     67.444


  subway_entrance         180.2052     70.042      2.573   0.010     42.521    317.889


  tree_score                0.0460      1.113      0.041   0.967     -2.142      2.234


  traffic_volume           -0.0008      0.002     -0.449   0.654     -0.004      0.003


  median_hh_income          0.0047      0.001      6.872   0.000      0.003      0.006


  pop_density            3.842e+05   8.03e+04      4.784   0.000   2.26e+05   5.42e+05




  Omnibus:        99.072    Durbin-Watson:         1.709


  Prob(Omnibus):   0.000    Jarque-Bera (JB):    219.606


  Skew:            1.211    Prob(JB):           2.06e-48


  Kurtosis:        5.580    Cond. No.           3.33e+08



In [18]:

    
#cross-validation
R_IS=[]
R_OS=[]
n=1000
for i in range(n):
    X_train, X_test, y_train, y_test = train_test_split(wkndmrg.iloc[:,[1, 2, 3, 4, 5, 6, 7, 8]], wkndmrg.iloc[:,9], train_size=300)
    res=LinearRegression(fit_intercept=False)
    res.fit(X_train,y_train)
    R_IS.append(1-((np.asarray(res.predict(X_train))-y_train)**2).sum()/((y_train-np.mean(y_train))**2).sum())                                                                     
    R_OS.append(1-((np.asarray(res.predict(X_test))-y_test)**2).sum()/((y_test-np.mean(y_test))**2).sum())
print("IS R-squared for {} times is {}".format(n,np.mean(R_IS)))
print("OS R-squared for {} times is {}".format(n,np.mean(R_OS)))









    



IS R-squared for 1000 times is 0.252101741816
OS R-squared for 1000 times is 0.192740812142



In [19]:

    
#plot significant explanatory variables
#NA still dropped
results_wknd_bl = smf.ols('sept_wkndcounts ~ bike_lane_score', data=wkndmrg).fit()
pl.figure(figsize=(10,10))
pl.plot(wkndmrg['bike_lane_score'], wkndmrg['sept_wkndcounts'], 'r.')
pl.plot(wkndmrg['bike_lane_score'], results_wknd_bl.predict(sm.add_constant(wkndmrg['bike_lane_score'])), '-')
pl.title('Figure 5:\nCiti Bike Ridership in September 2015 as a Function of Bike Lane Score')
pl.xlabel('Bike Lane Score')
pl.ylabel('Weekend Citi Bike Ridership')
pl.xlim(-.5,5.5)









    Out[19]:





(-0.5, 5.5)



In [35]:

    
results_se = smf.ols('sept_wkndcounts ~ park', data=wkndmrg).fit()
pl.figure(figsize=(10,10))
pl.plot(wkndmrg['park'], wkndmrg['sept_wkndcounts'], 'r.')
pl.plot(wkndmrg['park'], results_se.predict(sm.add_constant(wkndmrg['park'])), '-')
pl.title('Figure 6:\nCiti Bike Ridership in September 2015 as a Function of the Proximity of Parks')
pl.xlabel('Park')
pl.ylabel('Citi Bike Ridership')
pl.xlim(-.1,4.1)









    Out[35]:





(-0.1, 4.1)



In [27]:

    
results_se = smf.ols('sept_wkndcounts ~ subway_entrance', data=wkndmrg).fit()
pl.figure(figsize=(10,10))
pl.plot(wkndmrg['subway_entrance'], wkndmrg['sept_wkndcounts'], 'r.')
pl.plot(wkndmrg['subway_entrance'], results_se.predict(sm.add_constant(wkndmrg['subway_entrance'])), '-')
pl.title('Figure 7:\nCiti Bike Ridership in September 2015 as a Function of Subway Entrances')
pl.xlabel('Subway Entrance')
pl.ylabel('Citi Bike Ridership')
pl.xlim(-.5,1.5)









    Out[27]:





(-0.5, 1.5)



In [32]:

    
results_wknd_bl = smf.ols('sept_wkndcounts ~ median_hh_income', data=wkndmrg).fit()
pl.figure(figsize=(10,10))
pl.plot(wkndmrg['median_hh_income'], wkndmrg['sept_wkndcounts'], 'r.')
pl.plot(wkndmrg['median_hh_income'], results_wknd_bl.predict(sm.add_constant(wkndmrg['median_hh_income'])), '-')
pl.title('Figure 8:\nCiti Bike Ridership in September 2015 as a Function of Median Household Income')
pl.xlabel('Median Household Income')
pl.ylabel('Weekend Citi Bike Ridership')
pl.xlim(25000,255000)









    Out[32]:





(25000, 255000)



In [37]:

    
results_wknd_bl = smf.ols('sept_wkndcounts ~ pop_density', data=wkndmrg).fit()
pl.figure(figsize=(10,10))
pl.plot(wkndmrg['pop_density'], wkndmrg['sept_wkndcounts'], 'r.')
pl.plot(wkndmrg['pop_density'], results_wknd_bl.predict(sm.add_constant(wkndmrg['pop_density'])), '-')
pl.title('Figure 9:\nCiti Bike Ridership in September 2015 as a Function of Population Density')
pl.xlabel('Population Density')
pl.ylabel('Weekend Citi Bike Ridership')
pl.xlim(-.0001,.0016)









    Out[37]:





(-0.0001, 0.0016)

	station_id	ridership_0915	avg_ridership_2015	bike_lane_score	park	street_quality_score	subway_entrance	tree_score	traffic_volume	median_hh_income	pop_density
0	72	3667.0	2131.615385	0.0	1.0	8.000000	0.0	17.364799	14870.500000	90174.000000	0.000807
1	79	3011.0	1760.538462	3.0	0.0	8.571429	1.0	9.573955	9484.666667	86523.139535	0.000631
2	82	1166.0	766.538462	0.0	1.0	7.333333	0.0	35.070325	16812.500000	73988.000000	0.000511
3	83	1505.0	863.307692	0.0	0.0	7.500000	0.0	0.000000	41976.000000	85199.000000	0.000231
4	116	6558.0	3576.692308	4.0	1.0	8.500000	0.0	47.824344	15948.000000	104974.000000	0.000742

Dep. Variable:	ridership_0915	R-squared:	0.224
Model:	OLS	Adj. R-squared:	0.209
Method:	Least Squares	F-statistic:	15.01
Date:	Thu, 15 Dec 2016	Prob (F-statistic):	2.25e-19
Time:	09:26:15	Log-Likelihood:	-3839.5
No. Observations:	426	AIC:	7697.
Df Residuals:	417	BIC:	7734.
Df Model:	8
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	152.8324	1092.630	0.140	0.889	-1994.917	2300.582
bike_lane_score	255.1303	58.344	4.373	0.000	140.445	369.816
park	55.3926	154.598	0.358	0.720	-248.496	359.281
street_quality_score	-46.4576	139.690	-0.333	0.740	-321.043	228.127
subway_entrance	1074.3487	303.004	3.546	0.000	478.744	1669.954
tree_score	-8.6051	4.769	-1.804	0.072	-17.979	0.769
traffic_volume	0.0011	0.008	0.144	0.885	-0.014	0.017
median_hh_income	0.0200	0.003	6.827	0.000	0.014	0.026
pop_density	1.602e+06	3.45e+05	4.646	0.000	9.24e+05	2.28e+06

Omnibus:	87.393	Durbin-Watson:	1.431
Prob(Omnibus):	0.000	Jarque-Bera (JB):	178.557
Skew:	1.097	Prob(JB):	1.69e-39
Kurtosis:	5.291	Cond. No.	3.32e+08

Dep. Variable:	sept_wkndcounts	R-squared:	0.247
Model:	OLS	Adj. R-squared:	0.233
Method:	Least Squares	F-statistic:	16.92
Date:	Thu, 15 Dec 2016	Prob (F-statistic):	9.15e-22
Time:	09:26:23	Log-Likelihood:	-3177.3
No. Observations:	421	AIC:	6373.
Df Residuals:	412	BIC:	6409.
Df Model:	8
Covariance Type:	nonrobust

Omnibus:	99.072	Durbin-Watson:	1.709
Prob(Omnibus):	0.000	Jarque-Bera (JB):	219.606
Skew:	1.211	Prob(JB):	2.06e-48
Kurtosis:	5.580	Cond. No.	3.33e+08