In this notebook I will attempt to answer the following questions using 1 year (~3.25 million rides in 386MB) of Capital Bikeshare data
In [1]:
import functools
import pickle
from datetime import datetime, timedelta

import folium
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
from geopy.distance import vincenty
matplotlib.style.use('ggplot')
%matplotlib inline
Load data which has been previously imported and cleaned in other notebooks:
- `bike_location` is a dictionary of all Capital Bikeshare stations (key) and their geo-coordinates (value)
- `station_data` is a 2D dictionary of the metro stations (2nd level key) along each line (1st level key) and their geo-coordinates (2nd level values)
- `metro_delays` is a pandas dataframe listing every delay time, affected line, and duration from WMATA
- `bikeshare_rides` is a pandas dataframe listing every ride for the most recent available year (2015-Q4 through 2016-Q3)
In [2]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code, so only load files
    # that were written by a trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)


bike_location = _load_pickle("bike_location.p")
station_data = _load_pickle("station_data.p")
metro_delays = _load_pickle("metro_delays.p")
bikeshare_rides = _load_pickle("bikeshare_rides_all.p")
Create a list of all bike share stations for which I have location data, then drop all rides which use a station that is not in this list
In [3]:
# The station names are simply the keys of the location dictionary.
bikestation_list = list(bike_location)
In [4]:
print('There are {:.0f} total rides in the data'.format(len(bikeshare_rides)))
# Keep only rides whose start AND end stations have known coordinates.
# Boolean indexing (rather than .where(...).dropna()) leaves the column dtypes
# untouched and does not discard a ride merely because some unrelated column
# holds a NaN.  .copy() gives an independent frame so that the columns added
# to it afterwards do not trigger chained-assignment warnings.
has_start_location = bikeshare_rides['Start station'].isin(bikestation_list)
has_end_location = bikeshare_rides['End station'].isin(bikestation_list)
bikeshare_rides = bikeshare_rides[has_start_location & has_end_location].copy()
print('There are {:.0f} rides to which we have location data'.format(len(bikeshare_rides)))
create some columns to help with indexing later
In [5]:
# Convenience columns derived from the ride timestamp index.
bikeshare_rides['Hour'] = bikeshare_rides.index.hour
bikeshare_rides['Minute'] = bikeshare_rides.index.minute
# NOTE(review): `weekday_name` was removed in pandas 1.0; on newer pandas use
# `bikeshare_rides.index.day_name()` instead.
bikeshare_rides['Day'] = bikeshare_rides.index.weekday_name
bikeshare_rides['Route'] = bikeshare_rides['Start station'] + " to " + bikeshare_rides['End station']

# Count the calendar days covered by the data.  Both ranges are built from the
# same midnight-normalised end points, and the weekend days are counted
# exactly instead of being estimated as 2/5 of the business days.
first_day = bikeshare_rides.index.min().normalize()
last_day = bikeshare_rides.index.max().normalize()
business_days = pd.bdate_range(first_day, last_day)
calendar_days = pd.date_range(first_day, last_day)
weekend_days = int((calendar_days.dayofweek >= 5).sum())  # 5 = Saturday, 6 = Sunday
print('Number of weekend days: {}'.format(int(weekend_days)))
print('Number of weekday days: {}'.format(int(len(business_days))))
Now let's determine which bikeshare stations are within a given distance to ANY metro station.
Then we can create a flag which indicates whether each bikeshare ride originates, or terminates, near a metro station (this will be useful later on in our analysis).
Define functions which were initially written and tested in other notebooks
In [6]:
def close_stations(distance, lines=('RD', 'YL', 'GR', 'BL', 'OR', 'SV')):
    """Find the bikeshare stations that lie close to any stop of each metro line.

    Parameters
    ----------
    distance : float
        Largest allowed distance, in miles, between a metro stop and a
        bikeshare station (the bound itself is included).
    lines : iterable of str, optional
        Keys of `station_data` to scan.  Defaults to the six line codes the
        notebook has always used.

    Returns
    -------
    dict
        Maps each line code to a sorted list of the bikeshare station names
        (keys of `bike_location`) that are within `distance` miles of at
        least one stop on that line.

    Raises
    ------
    KeyError
        If a requested line code is not a key of `station_data`.
    """
    bikes_close = dict()
    for line in lines:
        nearby = set()  # a set, so a station near several stops is kept once
        for metro_coords in station_data[line].values():
            for bike_name, bike_coords in bike_location.items():
                if vincenty(metro_coords, bike_coords).miles <= distance:
                    nearby.add(bike_name)
        # Sorted so the result is reproducible from run to run.
        bikes_close[line] = sorted(nearby)
    return bikes_close
In [7]:
# Per metro line, the bikeshare stations within 0.15 miles of one of its stops.
stations_close_dict = close_stations(0.15)
In [8]:
# Pool the per-line lists into one de-duplicated, sorted list of station names.
nearby_stations = sorted(set().union(*stations_close_dict.values()))
# Every station that is not "nearby" is "far"; the original printed the total
# number of stations here instead.
far_station_count = len(bike_location) - len(nearby_stations)
print('This is how many bike share stations are considered "nearby": {}'.format(str(len(nearby_stations))))
print('This is how many bike share stations are considered "far": {}'.format(str(far_station_count)))
In [9]:
# Tag every ride with 'near' / 'far' according to whether its start (and,
# separately, its end) station is one of the metro-adjacent stations.
for station_col, flag_col in (('Start station', 'Start dist'), ('End station', 'End dist')):
    mask = bikeshare_rides[station_col].isin(nearby_stations)
    bikeshare_rides[flag_col] = np.where(mask, 'near', 'far')
What is the ratio of Registered vs. Casual riders for all rides during this 1 year period?
In [10]:
# Number of rides recorded for each membership category.
bikeshare_rides.groupby("Member Type")['Member Type'].count()
Out[10]:
In [11]:
# Pick out the 'Registered' rides by label.  Taking `.iloc[1]` of the grouped
# counts only works while 'Registered' happens to sort second among the
# membership categories.
registered_share = (bikeshare_rides['Member Type'] == 'Registered').mean() * 100
print('{:.0f}% of all rides are from Registered riders'.format(registered_share))
Slice bikeshare data to get the following 'windows':
In [12]:
# Monday-Friday start with M, T, W, T, F; Saturday and Sunday both start with S.
is_weekday = bikeshare_rides['Day'].str[0].isin(list('MTWF'))
bikeshare_weekday = bikeshare_rides[is_weekday]
bikeshare_weekend = bikeshare_rides[~is_weekday]
In [13]:
# Sanity check: list the day names that landed in each subset.
for subset in (bikeshare_weekday, bikeshare_weekend):
    print(subset['Day'].unique())

weekday_ride_count = bikeshare_weekday['Start station'].count()
weekend_ride_count = bikeshare_weekend['Start station'].count()
print('Weekday rides: ', weekday_ride_count)
print('Weekend rides: ', weekend_ride_count)

weekday_pct = weekday_ride_count / bikeshare_rides['Start station'].count() * 100
print('{:.0f}% of the rides are during the work week'.format(weekday_pct))
Now that we have the different 'windows', let's look at weekday vs. weekend ridership over a 24-hour period
In [14]:
# Weekday rides starting in each hour, averaged over the number of business days.
(bikeshare_weekday.groupby('Hour')['Hour'].count() / len(business_days))
Out[14]:
In [15]:
# Bar chart of the mean number of weekday rides starting in each hour.
weekday_hourly_avg = bikeshare_weekday.groupby('Hour')['Hour'].count() / len(business_days)
fig, ax = plt.subplots(figsize=[6, 4])
ax.set(ylabel='Average number of rides per hour per day',
       title='Capital Bikeshare Weekday Ridership')
weekday_hourly_avg.plot(kind='bar', alpha=0.5, color='b', ax=ax)
Out[15]:
In [16]:
# Weekend rides starting in each hour, averaged over the number of weekend days.
(bikeshare_weekend.groupby('Hour')['Hour'].count() / weekend_days)
Out[16]:
In [17]:
# Bar chart of the mean number of weekend rides starting in each hour.
weekend_hourly_avg = bikeshare_weekend.groupby('Hour')['Hour'].count() / weekend_days
fig, ax = plt.subplots(figsize=[6, 4])
ax.set(ylabel='Average number of rides per hour per day',
       title='Capital Bikeshare Weekend Ridership')
weekend_hourly_avg.plot(kind='bar', alpha=0.5, color='r', ax=ax)
plt.show()
In [18]:
# Overlay the weekday and weekend hourly averages on one set of axes.
fig, ax = plt.subplots(figsize=[16, 8])
ax.set(ylabel='Average number of rides per hour per day',
       title='Capital Bikeshare Weekday vs Weekend Ridership')
overlays = [
    (bikeshare_weekday, len(business_days), 'b', "Weekday"),
    (bikeshare_weekend, weekend_days, 'r', "Weekend"),
]
for rides, day_count, colour, tag in overlays:
    hourly_avg = rides.groupby('Hour')['Hour'].count() / day_count
    hourly_avg.plot(kind='bar', alpha=0.5, color=colour, ax=ax, label=tag)
plt.legend(frameon=False, fontsize=14, loc=9)
plt.savefig('weekday_v_weekend.png', dpi=600)
A couple of clear takeaways: there are two spikes during the work week, one for each rush hour (centered on 8 AM and 5 PM), whereas the weekend data just shows gradual changes with a maximum between 12:00 PM and 3:00 PM.
What can we learn about 'Registered' vs 'Casual' riders during each of these time frames?
(Capital Bikeshare groups its riders based on whether they have a long-term (registered: Annual Member, 30-Day Member or Day Key Member) or short-term (casual: Single Trip, 24-Hour Pass, 3-Day Pass or 5-Day Pass) membership.)
In [19]:
# Split the weekday and weekend rides by membership category.
# bikeshare_weekday_casual gets a 'Ride dist' column assigned to it later on,
# so it is taken as an independent copy; assigning into a plain slice would be
# a chained assignment (SettingWithCopyWarning).
bikeshare_weekday_casual = bikeshare_weekday[bikeshare_weekday['Member Type'] == 'Casual'].copy()
bikeshare_weekday_registered = bikeshare_weekday[bikeshare_weekday['Member Type'] == 'Registered']
bikeshare_weekend_casual = bikeshare_weekend[bikeshare_weekend['Member Type'] == 'Casual']
bikeshare_weekend_registered = bikeshare_weekend[bikeshare_weekend['Member Type'] == 'Registered']
In [20]:
print("Average rides of each type per day \n")
# (message template, rides in the group, number of days the group spans)
daily_groups = [
    ('Weekday Casual: {}', bikeshare_weekday_casual, len(business_days)),
    ('Weekday Registered: {}', bikeshare_weekday_registered, len(business_days)),
    ('Weekend Casual: {}', bikeshare_weekend_casual, weekend_days),
    ('Weekend Registered: {}', bikeshare_weekend_registered, weekend_days),
]
for template, rides, day_count in daily_groups:
    print(template.format(int(len(rides) / day_count)))
In [21]:
# Weekday hourly averages: casual vs registered riders.
fig, ax = plt.subplots(figsize=[16, 8])
ax.set(ylabel='Average number of rides per hour per day',
       title='Capital Bikeshare Weekday Casual vs Registered Ridership')
weekday_day_count = len(business_days)
for rides, colour, tag in ((bikeshare_weekday_casual, 'b', "Casual"),
                           (bikeshare_weekday_registered, 'r', "Registered")):
    hourly_avg = rides.groupby('Hour')['Hour'].count() / weekday_day_count
    hourly_avg.plot(kind='bar', alpha=0.5, color=colour, ax=ax, label=tag)
plt.legend(frameon=False, fontsize=14, loc=9)
plt.savefig('weekday casual v registered.png', dpi=600)
In [22]:
# Weekend hourly averages: casual vs registered riders.
fig, ax = plt.subplots(figsize=[16, 8])
ax.set(ylabel='Average number of rides per hour per day',
       title='Capital Bikeshare Weekend Casual vs Registered Ridership')
for rides, colour, tag in ((bikeshare_weekend_casual, 'b', "Casual"),
                           (bikeshare_weekend_registered, 'r', "Registered")):
    hourly_avg = rides.groupby('Hour')['Hour'].count() / weekend_days
    hourly_avg.plot(kind='bar', alpha=0.5, color=colour, ax=ax, label=tag)
plt.legend(frameon=False, fontsize=14, loc=1)
Out[22]:
In [23]:
# Casual riders only: weekend vs weekday hourly averages.
fig, ax = plt.subplots(figsize=[16, 8])
ax.set(ylabel='Average number of rides per hour per day',
       title='Capital Bikeshare Casual Weekend vs Weekday Ridership')
overlays = [
    (bikeshare_weekend_casual, weekend_days, 'b', "Weekend Casual"),
    (bikeshare_weekday_casual, len(business_days), 'r', "Weekday Casual"),
]
for rides, day_count, colour, tag in overlays:
    hourly_avg = rides.groupby('Hour')['Hour'].count() / day_count
    hourly_avg.plot(kind='bar', alpha=0.5, color=colour, ax=ax, label=tag)
plt.legend(frameon=False, fontsize=14, loc=1)
Out[23]:
In [24]:
# Registered riders only: weekend vs weekday hourly averages.
fig, ax = plt.subplots(figsize=[16, 8])
ax.set(ylabel='Average number of rides per hour per day',
       title='Capital Bikeshare Registered Weekend vs. Weekday Ridership')
overlays = [
    (bikeshare_weekend_registered, weekend_days, 'b', "Weekend Registered"),
    (bikeshare_weekday_registered, len(business_days), 'r', "Weekday Registered"),
]
for rides, day_count, colour, tag in overlays:
    hourly_avg = rides.groupby('Hour')['Hour'].count() / day_count
    hourly_avg.plot(kind='bar', alpha=0.5, color=colour, ax=ax, label=tag)
plt.legend(frameon=False, fontsize=14, loc=1)
Out[24]:
Take aways:
Now, let's slice up the data based on the rush hour patterns for registered users that we saw above.
In [25]:
# Split the registered weekday rides into three windows by starting hour:
# morning rush (06-09), evening rush (16-19) and everything else.
# Series.between includes both end points, matching the original >= / <= tests.
registered_hour = bikeshare_weekday_registered['Hour']
is_morning_rush = registered_hour.between(6, 9)
is_evening_rush = registered_hour.between(16, 19)

# .copy(): each window has a 'Ride dist' column assigned to it later on.
# Assigning into a plain slice would be a chained assignment
# (SettingWithCopyWarning), so each window is made an independent frame.
bikeshare_morning_rush = bikeshare_weekday_registered[is_morning_rush].copy()
bikeshare_evening_rush = bikeshare_weekday_registered[is_evening_rush].copy()
bikeshare_no_rush = bikeshare_weekday_registered[~(is_morning_rush | is_evening_rush)].copy()
In [26]:
# Sanity check: list the starting hours that landed in each window.
for window in (bikeshare_morning_rush, bikeshare_evening_rush, bikeshare_no_rush):
    print(window['Hour'].unique())
Create label for each trip indicating if it originated or concluded near a metro station (first determine which bike share stations are within 0.15 miles of a metro station)
In [27]:
# Split each time window by whether the ride STARTED at a metro-adjacent station.
morning_start_near = bikeshare_morning_rush['Start station'].isin(nearby_stations)
bikeshare_morning_rush_close = bikeshare_morning_rush[morning_start_near]
bikeshare_morning_rush_far = bikeshare_morning_rush[~morning_start_near]

evening_start_near = bikeshare_evening_rush['Start station'].isin(nearby_stations)
bikeshare_evening_rush_close = bikeshare_evening_rush[evening_start_near]
bikeshare_evening_rush_far = bikeshare_evening_rush[~evening_start_near]

no_rush_start_near = bikeshare_no_rush['Start station'].isin(nearby_stations)
bikeshare_no_rush_close = bikeshare_no_rush[no_rush_start_near]
bikeshare_no_rush_far = bikeshare_no_rush[~no_rush_start_near]
In [28]:
# Morning rush: hourly averages for rides starting near vs far from a metro station.
fig, ax = plt.subplots(figsize=[12, 6])
ax.set(ylabel='Average number of rides per hour per day',
       title='Capital Bikeshare Rides from Near and Far from Metro Stations During Morning Rush for Registered Riders')
weekday_day_count = len(business_days)
for rides, colour, tag in ((bikeshare_morning_rush_close, 'b', "Close to Metro Stations"),
                           (bikeshare_morning_rush_far, 'r', "Far from Metro Stations")):
    hourly_avg = rides.groupby('Hour')['Hour'].count() / weekday_day_count
    hourly_avg.plot(kind='bar', alpha=0.5, color=colour, ax=ax, label=tag)
plt.legend(frameon=False, fontsize=14, loc=2)
Out[28]:
In [29]:
# Morning rush: ride counts by start proximity, and the share of rides that
# start at the metro-adjacent share of stations.
close_count = len(bikeshare_morning_rush_close)
far_count = len(bikeshare_morning_rush_far)
ride_pct = close_count / len(bikeshare_morning_rush) * 100
station_pct = len(nearby_stations) / len(bike_location) * 100
print('This is how many bike trips started from close to a metro station: {}'.format(str(close_count)))
print('This is how many bike trips started from far from a metro station: {}'.format(str(far_count)))
print('{:.2f}% of rides orginiated from {:.2f}% of stations'.format(ride_pct, station_pct))
The fact that 20% of bike stations (those within 0.15 miles of a metro station) account for ~25% of outgoing rides during the morning rush suggests that many commuters use the two transportation systems together.
In [30]:
# Evening rush: hourly averages for rides starting near vs far from a metro station.
fig, ax = plt.subplots(figsize=[12, 6])
ax.set(ylabel='Average number of rides per hour per day',
       title='Capital Bikeshare Rides from Near and Far from Metro Stations During Evening Rush for Registered Riders')
weekday_day_count = len(business_days)
for rides, colour, tag in ((bikeshare_evening_rush_close, 'b', "Close to Metro Stations"),
                           (bikeshare_evening_rush_far, 'r', "Far from Metro Stations")):
    hourly_avg = rides.groupby('Hour')['Hour'].count() / weekday_day_count
    hourly_avg.plot(kind='bar', alpha=0.5, color=colour, ax=ax, label=tag)
plt.legend(frameon=False, fontsize=14, loc=1)
Out[30]:
In [31]:
# Evening rush: ride counts by start proximity, and the share of rides that
# start at the metro-adjacent share of stations.
close_count = len(bikeshare_evening_rush_close)
far_count = len(bikeshare_evening_rush_far)
ride_pct = close_count / len(bikeshare_evening_rush) * 100
station_pct = len(nearby_stations) / len(bike_location) * 100
print('This is how many bike trips started from close to a metro station: {}'.format(str(close_count)))
print('This is how many bike trips started from far from a metro station: {}'.format(str(far_count)))
print('{:.2f}% of rides orginiated from {:.2f}% of stations'.format(ride_pct, station_pct))
Does this mean that people do indeed use bike share together with the metro? Perhaps they ride a bike to the metro in the morning, then metro to work. In the evening, they metro to near home, then bike the rest of the way.
In [32]:
# Non-rush hours: hourly averages for rides starting near vs far from a metro station.
fig, ax = plt.subplots(figsize=[14, 6])
ax.set(ylabel='Average number of rides per hour per day',
       title='Capital Bikeshare Rides from Near and Far from Metro Stations During Non-Rush Hours for Registered Riders')
weekday_day_count = len(business_days)
for rides, colour, tag in ((bikeshare_no_rush_close, 'b', "Close to Metro Stations"),
                           (bikeshare_no_rush_far, 'r', "Far from Metro Stations")):
    hourly_avg = rides.groupby('Hour')['Hour'].count() / weekday_day_count
    hourly_avg.plot(kind='bar', alpha=0.5, color=colour, ax=ax, label=tag)
plt.legend(frameon=False, fontsize=14, loc=2)
Out[32]:
In [33]:
# Non-rush hours: ride counts by start proximity, and the share of rides that
# start at the metro-adjacent share of stations.
close_count = len(bikeshare_no_rush_close)
far_count = len(bikeshare_no_rush_far)
ride_pct = close_count / len(bikeshare_no_rush) * 100
station_pct = len(nearby_stations) / len(bike_location) * 100
print('This is how many bike trips started from close to a metro station: {}'.format(str(close_count)))
print('This is how many bike trips started from far from a metro station: {}'.format(str(far_count)))
print('{:.2f}% of rides orginiated from {:.2f}% of stations'.format(ride_pct, station_pct))
What about where bike share trips conclude?
In [34]:
# Split each time window by whether the ride ENDED at a metro-adjacent station.
morning_end_near = bikeshare_morning_rush['End station'].isin(nearby_stations)
bikeshare_morning_rush_close_end = bikeshare_morning_rush[morning_end_near]
bikeshare_morning_rush_far_end = bikeshare_morning_rush[~morning_end_near]

evening_end_near = bikeshare_evening_rush['End station'].isin(nearby_stations)
bikeshare_evening_rush_close_end = bikeshare_evening_rush[evening_end_near]
bikeshare_evening_rush_far_end = bikeshare_evening_rush[~evening_end_near]

no_rush_end_near = bikeshare_no_rush['End station'].isin(nearby_stations)
bikeshare_no_rush_close_end = bikeshare_no_rush[no_rush_end_near]
bikeshare_no_rush_far_end = bikeshare_no_rush[~no_rush_end_near]
In [35]:
# Morning rush: hourly averages for rides ending near vs far from a metro station.
fig, ax = plt.subplots(figsize=[12, 6])
ax.set(ylabel='Average number of rides per hour per day',
       title='Capital Bikeshare Rides Ending at Near and Far Metro Stations During Morning Rush for Registered Riders')
weekday_day_count = len(business_days)
for rides, colour, tag in ((bikeshare_morning_rush_close_end, 'b', "Close to Metro Stations"),
                           (bikeshare_morning_rush_far_end, 'r', "Far from Metro Stations")):
    hourly_avg = rides.groupby('Hour')['Hour'].count() / weekday_day_count
    hourly_avg.plot(kind='bar', alpha=0.5, color=colour, ax=ax, label=tag)
plt.legend(frameon=False, fontsize=14, loc=2)
Out[35]:
In [36]:
# Morning rush: ride counts by end proximity, and the share of rides that end
# at the metro-adjacent share of stations.
close_count = len(bikeshare_morning_rush_close_end)
far_count = len(bikeshare_morning_rush_far_end)
ride_pct = close_count / len(bikeshare_morning_rush) * 100
station_pct = len(nearby_stations) / len(bike_location) * 100
print('This is how many bike trips ended at a close metro station: {}'.format(str(close_count)))
print('This is how many bike trips ended at a far metro station: {}'.format(str(far_count)))
print('{:.2f}% of rides ending at {:.2f}% of stations'.format(ride_pct, station_pct))
In [37]:
# Evening rush: hourly averages for rides ending near vs far from a metro station.
fig, ax = plt.subplots(figsize=[12, 6])
ax.set(ylabel='Average number of rides per hour per day',
       title='Capital Bikeshare Rides Ending at Near and Far Metro Stations During Evening Rush for Registered Riders')
weekday_day_count = len(business_days)
for rides, colour, tag in ((bikeshare_evening_rush_close_end, 'b', "Close to Metro Stations"),
                           (bikeshare_evening_rush_far_end, 'r', "Far from Metro Stations")):
    hourly_avg = rides.groupby('Hour')['Hour'].count() / weekday_day_count
    hourly_avg.plot(kind='bar', alpha=0.5, color=colour, ax=ax, label=tag)
plt.legend(frameon=False, fontsize=14, loc=1)
Out[37]:
In [38]:
# Evening rush: ride counts by end proximity, and the share of rides that end
# at the metro-adjacent share of stations.
close_count = len(bikeshare_evening_rush_close_end)
far_count = len(bikeshare_evening_rush_far_end)
ride_pct = close_count / len(bikeshare_evening_rush) * 100
station_pct = len(nearby_stations) / len(bike_location) * 100
print('This is how many bike trips ended at a close metro station: {}'.format(str(close_count)))
print('This is how many bike trips ended at a far metro station: {}'.format(str(far_count)))
print('{:.2f}% of rides ending at {:.2f}% of stations'.format(ride_pct, station_pct))
In [39]:
# Non-rush hours: hourly averages for rides ending near vs far from a metro station.
fig, ax = plt.subplots(figsize=[14, 6])
ax.set(ylabel='Average number of rides per hour per day',
       title='Capital Bikeshare Rides Ending at Near and Far Metro Stations During Non-Rush Hours for Registered Riders')
weekday_day_count = len(business_days)
for rides, colour, tag in ((bikeshare_no_rush_close_end, 'b', "Close to Metro Stations"),
                           (bikeshare_no_rush_far_end, 'r', "Far from Metro Stations")):
    hourly_avg = rides.groupby('Hour')['Hour'].count() / weekday_day_count
    hourly_avg.plot(kind='bar', alpha=0.5, color=colour, ax=ax, label=tag)
plt.legend(frameon=False, fontsize=14, loc=2)
Out[39]:
In [40]:
# Non-rush hours: ride counts by END proximity.  The counts must come from the
# *_end frames; the original printed the start-based bikeshare_no_rush_close /
# bikeshare_no_rush_far counts under these "ended" messages.
close_count = len(bikeshare_no_rush_close_end)
far_count = len(bikeshare_no_rush_far_end)
ride_pct = close_count / len(bikeshare_no_rush) * 100
station_pct = len(nearby_stations) / len(bike_location) * 100
print('This is how many bike trips ended close to a metro station: {}'.format(str(close_count)))
print('This is how many bike trips ended far from a metro station: {}'.format(str(far_count)))
print('{:.2f}% of rides ending at {:.2f}% of stations'.format(ride_pct, station_pct))
In [41]:
@functools.lru_cache(maxsize=None)
def route_dist(x, y):
    """Return the Vincenty distance, in miles, between two bikeshare stations.

    Meant to be applied row by row to a ride dataframe.  Results are memoised
    per (x, y) pair, so the distance for a given route is computed only once
    however many rides share that route.

    Parameters
    ----------
    x : str
        Name of the start station (a key of `bike_location`).
    y : str
        Name of the end station (a key of `bike_location`).

    Returns
    -------
    float
        Distance between the two stations in miles.

    Raises
    ------
    KeyError
        If either name is not a key of `bike_location`.
    """
    start = bike_location[x]
    end = bike_location[y]
    return vincenty(start, end).miles
In [42]:
# This step is very slow because the distance is evaluated once per ride;
# computing it once per unique route and mapping it back would be faster.
bikeshare_morning_rush['Ride dist'] = [
    route_dist(start_name, end_name)
    for start_name, end_name in zip(bikeshare_morning_rush['Start station'],
                                    bikeshare_morning_rush['End station'])
]
In [43]:
# Summary statistics of the morning-rush ride distances (miles).
bikeshare_morning_rush['Ride dist'].describe()
Out[43]:
In [44]:
# Station-to-station distance (miles) of every evening-rush ride.
bikeshare_evening_rush['Ride dist'] = [
    route_dist(start_name, end_name)
    for start_name, end_name in zip(bikeshare_evening_rush['Start station'],
                                    bikeshare_evening_rush['End station'])
]
In [45]:
# Summary statistics of the evening-rush ride distances (miles).
bikeshare_evening_rush['Ride dist'].describe()
Out[45]:
In [46]:
# Station-to-station distance (miles) of every non-rush ride.
bikeshare_no_rush['Ride dist'] = [
    route_dist(start_name, end_name)
    for start_name, end_name in zip(bikeshare_no_rush['Start station'],
                                    bikeshare_no_rush['End station'])
]
In [47]:
# Summary statistics of the non-rush ride distances (miles).
bikeshare_no_rush['Ride dist'].describe()
Out[47]:
In [48]:
# Normalised histograms of ride distance (rides under 4 miles) for the three
# time windows, overlaid on one set of axes.
fig, ax = plt.subplots(figsize=[14, 6])
windows = [
    (bikeshare_morning_rush, 'r', 'Morning'),
    (bikeshare_evening_rush, 'b', 'Evening'),
    (bikeshare_no_rush, 'g', 'Non-Rush'),
]
for rides, colour, tag in windows:
    short_rides = rides[rides['Ride dist'] < 4]['Ride dist']
    short_rides.plot.hist(bins=30, alpha=0.5, normed=True, ax=ax, color=colour, label=tag)
# The labels are set after plotting, as in the original cell.
ax.set_ylabel('% of total rides')
ax.set_title('Ride distance for all registered rides during the 3 time period windows')
plt.legend(frameon=False, fontsize=14, loc=1)
Out[48]:
While rides appear to get slightly shorter throughout the day, there is not a strong dependence.
What about those near and far from a metro station?
In [49]:
# Morning rush: ride-distance histograms (rides under 6 miles) split by whether
# the ride ended near or far from a metro station.
fig, ax = plt.subplots(figsize=[14, 6])
morning_by_end = bikeshare_morning_rush.groupby('End dist')
for proximity, colour in (('near', 'r'), ('far', 'b')):
    rides = morning_by_end.get_group(proximity)
    rides[rides['Ride dist'] < 6]['Ride dist'].plot.hist(
        bins=30, alpha=0.5, normed=True, ax=ax, color=colour, label=proximity)
plt.legend(frameon=False, fontsize=14, loc=1)
ax.set_ylabel('% of total rides')
ax.set_title('Ride distance during the morning rush by ending bike station distance to a metro station')
plt.savefig('ride distance morning rush.png', dpi=600)
There is a spike in the distances around 0.5 miles for morning rides that end at stations near metro stations. This spike is also present in evening rides beginning at near metro stations.
In [50]:
# Evening rush: ride-distance histograms (rides under 6 miles) split by whether
# the ride started near or far from a metro station.
fig, ax = plt.subplots(figsize=[14, 6])
evening_by_start = bikeshare_evening_rush.groupby('Start dist')
for proximity, colour in (('near', 'r'), ('far', 'b')):
    rides = evening_by_start.get_group(proximity)
    rides[rides['Ride dist'] < 6]['Ride dist'].plot.hist(
        bins=30, alpha=0.5, normed=True, ax=ax, color=colour, label=proximity)
plt.legend(frameon=False, fontsize=14, loc=1)
ax.set_ylabel('% of total rides')
ax.set_title('Ride distance during the evening rush by starting bike station distance to a metro station')
plt.savefig('ride distance evening rush.png', dpi=600)
This suggests that there are a significant number of commuters who use the bike share to travel about 0.5 miles to near metro stations in the morning, and away from metro stations in the evening
In [51]:
# Compare evening rides that START near a metro station with morning rides
# that END near one (rides under 6 miles).  Both series are 'near' groups: the
# original legend mislabelled the morning series as 'Morning far', and the
# title was copied unchanged from the evening-only plot.
fig, ax = plt.subplots(figsize=[14, 6])
evening_near = bikeshare_evening_rush.groupby('Start dist').get_group('near')
morning_near = bikeshare_morning_rush.groupby('End dist').get_group('near')
for rides, colour, tag in ((evening_near, 'r', 'Evening near'),
                           (morning_near, 'b', 'Morning near')):
    rides[rides['Ride dist'] < 6]['Ride dist'].plot.hist(
        bins=30, alpha=0.5, normed=True, ax=ax, color=colour, label=tag)
plt.legend(frameon=False, fontsize=14, loc=1)
ax.set_ylabel('% of total rides')
ax.set_title('Ride distance for evening rides starting near vs morning rides ending near a metro station')
plt.savefig('ride distance morning v evening rush.png', dpi=600)
In [52]:
# Median morning-rush ride distance (miles).
bikeshare_morning_rush['Ride dist'].quantile(0.50)
Out[52]:
In [53]:
# Summary statistics of the morning-rush ride distances (miles); same table as shown earlier.
bikeshare_morning_rush['Ride dist'].describe()
Out[53]:
In [54]:
# Station-to-station distance (miles) of every weekday casual ride.
bikeshare_weekday_casual['Ride dist'] = [
    route_dist(start_name, end_name)
    for start_name, end_name in zip(bikeshare_weekday_casual['Start station'],
                                    bikeshare_weekday_casual['End station'])
]
In [55]:
# Evening rush, rides starting near a metro station: registered vs casual
# riders (rides under 6 miles).
fig, ax = plt.subplots(figsize=[14, 6])
registered_near = bikeshare_evening_rush.groupby('Start dist').get_group('near')
# The title and the accompanying text describe an evening-rush comparison, so
# the casual rides are limited to the same 16-19 starting hours used to build
# bikeshare_evening_rush.  The original plotted casual rides from all hours.
casual_evening = bikeshare_weekday_casual[bikeshare_weekday_casual['Hour'].between(16, 19)]
casual_near = casual_evening.groupby('Start dist').get_group('near')
for rides, colour, tag in ((registered_near, 'r', 'Registered'),
                           (casual_near, 'b', 'Casual')):
    rides[rides['Ride dist'] < 6]['Ride dist'].plot.hist(
        bins=30, alpha=0.5, normed=True, ax=ax, color=colour, label=tag)
plt.legend(frameon=False, fontsize=14, loc=1)
ax.set_ylabel('% of total rides')
ax.set_title('Ride distance during the evening rush for rides starting near metro stations Registered vs Casual')
plt.savefig('ride distance evening casual v registered rush.png', dpi=600)
The above plot compares the distance traveled by registered and casual riders departing from bike share stations near metro stations during the evening rush.
The spike at < 0.2 miles is likely people starting and ending in the same bike share stations
In [56]:
# Ten most frequent end stations during the morning rush.
bikeshare_morning_rush.groupby('End station')['End station'].count().sort_values(ascending=False)[:10]
Out[56]:
In [57]:
# Ten most frequent start stations during the morning rush.
bikeshare_morning_rush.groupby('Start station')['Start station'].count().sort_values(ascending=False)[:10]
Out[57]:
In [58]:
# Ten most frequent routes (start station to end station) during the morning rush.
bikeshare_morning_rush.groupby('Route')['Route'].count().sort_values(ascending=False)[:10]
Out[58]:
In [59]:
# Ten most frequent start stations during the evening rush.
bikeshare_evening_rush.groupby('Start station')['Start station'].count().sort_values(ascending=False)[:10]
Out[59]:
In [60]:
# Ten most frequent end stations during the evening rush.
bikeshare_evening_rush.groupby('End station')['End station'].count().sort_values(ascending=False)[:10]
Out[60]:
In [61]:
# Ten most frequent routes (start station to end station) during the evening rush.
bikeshare_evening_rush.groupby('Route')['Route'].count().sort_values(ascending=False)[:10]
Out[61]:
In [62]:
# Ten most frequent evening-rush routes among rides that start near a metro station.
(bikeshare_evening_rush.groupby('Start dist').get_group('near')
.groupby('Route')['Route'].count().sort_values(ascending=False)[:10])
Out[62]:
ALL of the top 10 most popular routes during the evening rush hour originate from bike stations close to the metro.
In [63]:
# Ten most frequent end stations outside the rush hours.
bikeshare_no_rush.groupby('End station')['End station'].count().sort_values(ascending=False)[:10]
Out[63]:
In [64]:
# Ten most frequent start stations outside the rush hours.
bikeshare_no_rush.groupby('Start station')['Start station'].count().sort_values(ascending=False)[:10]
Out[64]:
In [65]:
# Ten most frequent start stations for weekday casual riders.
bikeshare_weekday_casual.groupby('Start station')['Start station'].count().sort_values(ascending=False)[:10]
Out[65]:
In [66]:
# Ten most frequent routes for weekday casual riders.
(bikeshare_weekday_casual.groupby('Route')['Route'].count().sort_values(ascending=False)[:10])
Out[66]:
A quick review of the above lists suggests a couple of things.
Let's try a t-test on the evening rush hour data to see if there is a statistically significant difference
In [67]:
# Evening-rush rides per start station per business day, split by whether the
# station is near or far from a metro stop.  Both samples are scaled by the
# SAME constant: the original divided each group by its own number of
# stations, which rescales the two samples differently and biases every
# comparison between them.
evening_by_start = bikeshare_evening_rush.groupby('Start dist')
near = (evening_by_start.get_group('near')
        .groupby('Start station')['Start station'].count() / len(business_days))
far = (evening_by_start.get_group('far')
       .groupby('Start station')['Start station'].count() / len(business_days))

# Same argument order for both tests so the signs of the statistics agree.
print(scipy.stats.ttest_ind(near, far, equal_var=False))  # unequal variances
print(scipy.stats.ttest_ind(near, far, equal_var=True))   # pooled variance
print("\nMood's median test\n")
print(scipy.stats.median_test(near, far))
print('\n', near.describe(), '\n')
print(far.describe())
In [68]:
# Rides per start station per business day at metro-adjacent stations:
# evening rush vs morning rush.  Both samples hold 'near' stations, so they
# are named for their time window (the original stored the morning sample in
# a variable called `far`).
evening_near = (bikeshare_evening_rush.groupby('Start dist').get_group('near')
                .groupby('Start station')['Start station'].count() / len(business_days))
morning_near = (bikeshare_morning_rush.groupby('Start dist').get_group('near')
                .groupby('Start station')['Start station'].count() / len(business_days))

# Same argument order for both tests so the signs of the statistics agree.
print(scipy.stats.ttest_ind(evening_near, morning_near, equal_var=False))  # unequal variances
print(scipy.stats.ttest_ind(evening_near, morning_near, equal_var=True))   # pooled variance
print("\nMood's median test\n")
print(scipy.stats.median_test(evening_near, morning_near))
print('\n', evening_near.describe(), '\n')
print(morning_near.describe())
In [69]:
def line_map_all_near(metro = station_data, bikestations = nearby_stations, bike_loc = bike_location):
    """Map every metro stop (green) and the given bikeshare stations (red).

    Parameters
    ----------
    metro : dict
        Two-level mapping: line -> stop name -> coordinates.
    bikestations : iterable of str
        Names of the bikeshare stations to draw.
    bike_loc : dict
        Mapping of bikeshare station name -> coordinates.  Added so this
        function takes its locations the same way as the other map helpers
        instead of reading the global directly.

    Returns
    -------
    folium.Map
        The map with all markers added.

    Raises
    ------
    KeyError
        If a name in `bikestations` is not a key of `bike_loc`.
    """
    map_osm = folium.Map(location=[38.889931, -77.009003], tiles='Stamen Toner',
                         zoom_start=13)
    for line_stops in metro.values():
        for stop_name, stop_coords in line_stops.items():
            folium.Marker(stop_coords, popup=stop_name,
                          icon=folium.Icon(color='green')).add_to(map_osm)
    for bike_name in bikestations:
        folium.Marker(bike_loc[bike_name], popup=bike_name,
                      icon=folium.Icon(color='red')).add_to(map_osm)
    return map_osm
def line_map_all(metro = station_data, bikestations = nearby_stations, bike_loc = bike_location):
    """Map every metro stop and every bikeshare station.

    Metro stops are drawn green.  Bikeshare stations listed in `bikestations`
    are drawn red; every other station in `bike_loc` is drawn blue.
    Returns the folium map.
    """
    map_osm = folium.Map(location=[38.889931, -77.009003], tiles='Stamen Toner',
                         zoom_start=13)
    for line_stops in metro.values():
        for stop_name, stop_coords in line_stops.items():
            folium.Marker(stop_coords, popup=stop_name,
                          icon=folium.Icon(color='green')).add_to(map_osm)
    for bike_name, bike_coords in bike_loc.items():
        marker_colour = 'red' if bike_name in bikestations else 'blue'
        folium.Marker(bike_coords, popup=bike_name,
                      icon=folium.Icon(color=marker_colour)).add_to(map_osm)
    return map_osm
In [70]:
# Map of the metro stops and the metro-adjacent bikeshare stations: build it,
# save it as a standalone HTML file, then show it inline.
near_map = line_map_all_near()
near_map.save('metro_nearbikes_map.html')
near_map
Out[70]:
In [71]:
# Map of the metro stops and all bikeshare stations: build it, save it as a
# standalone HTML file, then show it inline.
map_all_stations = line_map_all()
map_all_stations.save('all_stations_map.html')
map_all_stations
Out[71]:
In [72]:
# Ten most frequent morning-rush routes, each split back into [start, end].
# The pattern is a raw string (a bare '\s' is an invalid escape sequence) and
# also consumes the space after "to", so the end station has no leading blank.
(bikeshare_morning_rush.groupby(by='Route')['Route'].count()
 .sort_values(ascending=False).index.str.split(r'\sto\s')[:10])
Out[72]:
In [73]:
def line_map_popular(metro = station_data, bikestations = nearby_stations, bike_loc = bike_location):
    """Map every metro stop together with the two ends of the given routes.

    `bikestations` is a sequence of routes, each indexable as
    (start station name, end station name).  Metro stops are drawn green,
    route start stations red and route end stations blue.
    Returns the folium map.
    """
    map_osm = folium.Map(location=[38.889931, -77.009003], tiles='Stamen Toner',
                         zoom_start=13)
    for line_stops in metro.values():
        for stop_name, stop_coords in line_stops.items():
            folium.Marker(stop_coords, popup=stop_name,
                          icon=folium.Icon(color='green')).add_to(map_osm)
    for route in bikestations:
        # Indexed rather than unpacked, so only the first two items are used.
        start_name = route[0]
        end_name = route[1]
        folium.Marker(bike_loc[start_name], popup=start_name,
                      icon=folium.Icon(color='red')).add_to(map_osm)
        folium.Marker(bike_loc[end_name], popup=end_name,
                      icon=folium.Icon(color='blue')).add_to(map_osm)
    return map_osm
In [74]:
# Map the ten most frequent morning-rush routes.  The split pattern is a raw
# string because a bare '\s' is an invalid escape sequence in a normal literal.
morning_top_routes = (bikeshare_morning_rush.groupby(by='Route')['Route'].count()
                      .sort_values(ascending=False).index.str.split(r'\sto\s')[:10])
morning_10_map = line_map_popular(bikestations=morning_top_routes)
morning_10_map.save('morning_10_map.html')
morning_10_map
Out[74]:
In [75]:
# Map the ten most frequent evening-rush routes.  The split pattern is a raw
# string because a bare '\s' is an invalid escape sequence in a normal literal.
evening_top_routes = (bikeshare_evening_rush.groupby(by='Route')['Route'].count()
                      .sort_values(ascending=False).index.str.split(r'\sto\s')[:10])
evening_10_map = line_map_popular(bikestations=evening_top_routes)
evening_10_map.save('evening_10_map.html')
evening_10_map
Out[75]:
In [76]:
# Ten most frequent evening-rush routes, each split back into [start, end].
# The pattern is a raw string (a bare '\s' is an invalid escape sequence) and
# also consumes the space after "to", so the end station has no leading blank.
(bikeshare_evening_rush.groupby(by='Route')['Route'].count()
 .sort_values(ascending=False).index.str.split(r'\sto\s')[:10])
Out[76]:
In [77]:
# Map the ten most frequent weekday casual routes.  The split pattern is a raw
# string because a bare '\s' is an invalid escape sequence in a normal literal.
casual_top_routes = (bikeshare_weekday_casual.groupby(by='Route')['Route'].count()
                     .sort_values(ascending=False).index.str.split(r'\sto\s')[:10])
casual_10_map = line_map_popular(bikestations=casual_top_routes)
casual_10_map.save('casual_10_map.html')
casual_10_map
Out[77]:
In [ ]: