In [1]:
import pandas as pd
import matplotlib
import pylab as pl
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
%pylab inline
In [2]:
merged = pd.read_csv('../data/processed/master.csv')
merged_norm = pd.read_csv('../data/processed/master_norm.csv')
In [3]:
print(len(merged))
In [4]:
#dropping all months except September
merged = merged.drop(['ridership_0115', 'ridership_0215', 'ridership_0315', 'ridership_0415', 'ridership_0515', 'ridership_0615', 'ridership_0715', 'ridership_0815', 'ridership_1015', 'ridership_1115', 'ridership_1215'], axis=1)
In [5]:
merged.head()
Out[5]:
In [6]:
#regress without dropping NA
results = smf.ols('ridership_0915 ~ bike_lane_score + park + street_quality_score + subway_entrance + tree_score + traffic_volume + median_hh_income + pop_density', data=merged).fit()
results.summary()
Out[6]:
In [7]:
merged = merged.dropna()
In [8]:
print(len(merged))
In [9]:
#regress after dropping NA
results = smf.ols('ridership_0915 ~ bike_lane_score + park + street_quality_score + subway_entrance + tree_score + traffic_volume + median_hh_income + pop_density', data=merged).fit()
results.summary()
Out[9]:
In [10]:
# cross-validation on droppedna
R_IS=[]
R_OS=[]
n=1000
for i in range(n):
X_train, X_test, y_train, y_test = train_test_split(merged.iloc[:,[3, 4, 5, 6, 7, 8, 9, 10]], merged.iloc[:,1], train_size=300)
res=LinearRegression(fit_intercept=False)
res.fit(X_train,y_train)
R_IS.append(1-((np.asarray(res.predict(X_train))-y_train)**2).sum()/((y_train-np.mean(y_train))**2).sum())
R_OS.append(1-((np.asarray(res.predict(X_test))-y_test)**2).sum()/((y_test-np.mean(y_test))**2).sum())
print("IS R-squared for {} times is {}".format(n,np.mean(R_IS)))
print("OS R-squared for {} times is {}".format(n,np.mean(R_OS)))
In [11]:
#plot significant explanatory variables
#NA still dropped
results_bl = smf.ols('ridership_0915 ~ bike_lane_score', data=merged).fit()
pl.figure(figsize=(10,10))
pl.plot(merged['bike_lane_score'], merged['ridership_0915'], 'r.')
pl.plot(merged['bike_lane_score'], results_bl.predict(sm.add_constant(merged['bike_lane_score'])), '-')
pl.title('Figure 1:\nCiti Bike Ridership in September 2015 as a Function of Bike Lane Score')
pl.xlabel('Bike Lane Score')
pl.ylabel('Citi Bike Ridership')
pl.xlim(-.5,5.5)
Out[11]:
In [12]:
results_se = smf.ols('ridership_0915 ~ subway_entrance', data=merged).fit()
pl.figure(figsize=(10,10))
pl.plot(merged['subway_entrance'], merged['ridership_0915'], 'r.')
pl.plot(merged['subway_entrance'], results_se.predict(sm.add_constant(merged['subway_entrance'])), '-')
pl.title('Figure 2:\nCiti Bike Ridership in September 2015 as a Function of Subway Entrances')
pl.xlabel('Subway Entrance')
pl.ylabel('Citi Bike Ridership')
pl.xlim(-.5,1.5)
Out[12]:
In [13]:
results_hi = smf.ols('ridership_0915 ~ median_hh_income', data=merged).fit()
pl.figure(figsize=(10,10))
pl.plot(merged['median_hh_income'], merged['ridership_0915'], 'r.')
pl.plot(merged['median_hh_income'], results_hi.predict(sm.add_constant(merged['median_hh_income'])), '-')
pl.title('Figure 3:\nCiti Bike Ridership by Station in September 2015 as a Function of Median Household Income')
pl.xlabel('Median Household Income')
pl.ylabel('Citi Bike Ridership')
pl.xlim(25000,255000)
Out[13]:
In [14]:
results_pd = smf.ols('ridership_0915 ~ pop_density', data=merged).fit()
pl.figure(figsize=(10,10))
pl.plot(merged['pop_density'], merged['ridership_0915'], 'r.')
pl.plot(merged['pop_density'], results_pd.predict(sm.add_constant(merged['pop_density'])), '-')
pl.title('Figure 4:\nCiti Bike Ridership in September 2015 as a Function of Population Density')
pl.xlabel('Population Density')
pl.ylabel('Citi Bike Ridership')
pl.xlim(-.0001,.0016)
Out[14]:
Each plot is a scatter represntation of a count of Citi Bike ridership from each active station in September 2015 as a function of the four statistically significant explanatory variables. Variables 'subway entrance' and 'park' are binary, with a zero indicating that there was no subway or park within the buffer zone of the Citi Bike station, or a one, indicating that these features were there.
In [15]:
#conduct analysis with isolated ridership for weekends in September
wknds = pd.read_csv('../data/processed/sept-wknds.csv')
wknds = wknds.drop('Unnamed: 0', axis=1)
wknds = wknds.rename(columns={'start station id': 'station_id', 'tripduration': 'sept_wkndcounts'})
wknds.head()
Out[15]:
In [16]:
merged = merged.drop(['avg_ridership_2015', 'ridership_0915'], axis=1)
wkndmrg = pd.merge(merged, wknds, on='station_id', how='outer')
wkndmrg = wkndmrg.dropna()
wkndmrg.head()
Out[16]:
In [17]:
#regress after dropping NA
results_wknd = smf.ols('sept_wkndcounts ~ bike_lane_score + park + street_quality_score + subway_entrance + tree_score + traffic_volume + median_hh_income + pop_density', data=wkndmrg).fit()
results_wknd.summary()
Out[17]:
In [18]:
#cross-validation
R_IS=[]
R_OS=[]
n=1000
for i in range(n):
X_train, X_test, y_train, y_test = train_test_split(wkndmrg.iloc[:,[1, 2, 3, 4, 5, 6, 7, 8]], wkndmrg.iloc[:,9], train_size=300)
res=LinearRegression(fit_intercept=False)
res.fit(X_train,y_train)
R_IS.append(1-((np.asarray(res.predict(X_train))-y_train)**2).sum()/((y_train-np.mean(y_train))**2).sum())
R_OS.append(1-((np.asarray(res.predict(X_test))-y_test)**2).sum()/((y_test-np.mean(y_test))**2).sum())
print("IS R-squared for {} times is {}".format(n,np.mean(R_IS)))
print("OS R-squared for {} times is {}".format(n,np.mean(R_OS)))
In [19]:
#plot significant explanatory variables
#NA still dropped
results_wknd_bl = smf.ols('sept_wkndcounts ~ bike_lane_score', data=wkndmrg).fit()
pl.figure(figsize=(10,10))
pl.plot(wkndmrg['bike_lane_score'], wkndmrg['sept_wkndcounts'], 'r.')
pl.plot(wkndmrg['bike_lane_score'], results_wknd_bl.predict(sm.add_constant(wkndmrg['bike_lane_score'])), '-')
pl.title('Figure 5:\nCiti Bike Ridership in September 2015 as a Function of Bike Lane Score')
pl.xlabel('Bike Lane Score')
pl.ylabel('Weekend Citi Bike Ridership')
pl.xlim(-.5,5.5)
Out[19]:
In [35]:
results_se = smf.ols('sept_wkndcounts ~ park', data=wkndmrg).fit()
pl.figure(figsize=(10,10))
pl.plot(wkndmrg['park'], wkndmrg['sept_wkndcounts'], 'r.')
pl.plot(wkndmrg['park'], results_se.predict(sm.add_constant(wkndmrg['park'])), '-')
pl.title('Figure 6:\nCiti Bike Ridership in September 2015 as a Function of the Proximity of Parks')
pl.xlabel('Park')
pl.ylabel('Citi Bike Ridership')
pl.xlim(-.1,4.1)
Out[35]:
In [27]:
results_se = smf.ols('sept_wkndcounts ~ subway_entrance', data=wkndmrg).fit()
pl.figure(figsize=(10,10))
pl.plot(wkndmrg['subway_entrance'], wkndmrg['sept_wkndcounts'], 'r.')
pl.plot(wkndmrg['subway_entrance'], results_se.predict(sm.add_constant(wkndmrg['subway_entrance'])), '-')
pl.title('Figure 7:\nCiti Bike Ridership in September 2015 as a Function of Subway Entrances')
pl.xlabel('Subway Entrance')
pl.ylabel('Citi Bike Ridership')
pl.xlim(-.5,1.5)
Out[27]:
In [32]:
results_wknd_bl = smf.ols('sept_wkndcounts ~ median_hh_income', data=wkndmrg).fit()
pl.figure(figsize=(10,10))
pl.plot(wkndmrg['median_hh_income'], wkndmrg['sept_wkndcounts'], 'r.')
pl.plot(wkndmrg['median_hh_income'], results_wknd_bl.predict(sm.add_constant(wkndmrg['median_hh_income'])), '-')
pl.title('Figure 8:\nCiti Bike Ridership in September 2015 as a Function of Median Household Income')
pl.xlabel('Median Household Income')
pl.ylabel('Weekend Citi Bike Ridership')
pl.xlim(25000,255000)
Out[32]:
In [37]:
results_wknd_bl = smf.ols('sept_wkndcounts ~ pop_density', data=wkndmrg).fit()
pl.figure(figsize=(10,10))
pl.plot(wkndmrg['pop_density'], wkndmrg['sept_wkndcounts'], 'r.')
pl.plot(wkndmrg['pop_density'], results_wknd_bl.predict(sm.add_constant(wkndmrg['pop_density'])), '-')
pl.title('Figure 9:\nCiti Bike Ridership in September 2015 as a Function of Population Density')
pl.xlabel('Population Density')
pl.ylabel('Weekend Citi Bike Ridership')
pl.xlim(-.0001,.0016)
Out[37]: