In [1]:
# Borrowed from https://jakevdp.github.io/blog/2015/10/17/analyzing-pronto-cycleshare-data-with-python-and-pandas/
In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
In [3]:
trips = pd.read_csv('DATA/babs_master/trip_master.csv',
parse_dates=['Start Date', 'End Date'],
infer_datetime_format=True)
trips.head()
Out[3]:
In [4]:
# Find the start date
ind = pd.DatetimeIndex(trips['Start Date'])
trips['date'] = ind.date.astype('datetime64')
trips['hour'] = ind.hour
In [5]:
# Count trips by date
by_date = trips.pivot_table('Trip ID', aggfunc='count',
index='date',
columns='Subscription Type', )
In [16]:
fig, ax = plt.subplots(2, figsize=(16, 8))
fig.subplots_adjust(hspace=0.4)
by_date.iloc[:, 0].plot(ax=ax[0], title='Annual Members', color='green');
by_date.iloc[:, 1].plot(ax=ax[1], title='Short-term Users');
In [8]:
by_weekday = by_date.groupby([by_date.index.year,
by_date.index.dayofweek]).mean()
by_weekday.columns.name = None # remove label for plot
fig, ax = plt.subplots(1, 2, figsize=(16, 6), sharey=True)
by_weekday.loc[2014].plot(title='Average Use by Day of Week (2014)', ax=ax[0]);
by_weekday.loc[2015].plot(title='Average Use by Day of Week (2015)', ax=ax[1]);
for axi in ax:
axi.set_xticklabels(['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])
In [9]:
# count trips by date and by hour
by_hour = trips.pivot_table('Trip ID', aggfunc='count',
index=['date', 'hour'],
columns='Subscription Type').fillna(0).reset_index('hour')
# average these counts by weekend
by_hour['weekend'] = (by_hour.index.dayofweek >= 5)
by_hour = by_hour.groupby(['weekend', 'hour']).mean()
by_hour.index.set_levels([['weekday', 'weekend'],
["{0}:00".format(i) for i in range(24)]],
inplace=True);
by_hour.columns.name = None
In [10]:
fig, ax = plt.subplots(1, 2, figsize=(16, 6), sharey=True)
by_hour.loc['weekday'].plot(title='Average Hourly Use (Mon-Fri)', ax=ax[0])
by_hour.loc['weekend'].plot(title='Average Hourly Use (Sat-Sun)', ax=ax[1])
ax[0].set_ylabel('Average Trips per Hour');
In [48]:
trips['minutes'] = trips['Duration'] /60
# trips['log_minutes'] = trips.minutes.apply(lambda x: np.log(x))
trips.groupby('Subscription Type')['minutes'].hist(bins=np.arange(61), alpha=0.5, normed=True, Log=True);
plt.xlabel('Duration (minutes)')
plt.ylabel('relative frequency')
plt.title('Trip Durations')
plt.text(34, 0.09, "Free Trips\n\nAdditional Fee", ha='right',size=14, rotation=90, alpha=0.5, color='red')
plt.legend(['Short-term Users', 'Annual Members'])
plt.axvline(30, linestyle='--', color='red', alpha=0.3);
In [23]:
stations = pd.read_csv('DATA/babs_open_data_year_1/201402_babs_open_data/201402_station_data.csv')
pronto_shop = dict(id=54, name="Pronto shop",
terminal="Pronto shop",
lat=47.6173156, long=-122.3414776,
dockcount=100, online='10/13/2014')
stations = stations.append(pronto_shop, ignore_index=True)
In [ ]:
from time import sleep
def query_distances(stations=stations):
"""Query the Google API for bicycling distances"""
latlon_list = ['{0},{1}'.format(lat, lng)
for (lat, lng) in zip(stations.lat, stations.long)]
def create_url(i):
URL = ('https://maps.googleapis.com/maps/api/distancematrix/json?'
'origins={origins}&destinations={destinations}&mode=bicycling')
return URL.format(origins=latlon_list[i],
destinations='|'.join(latlon_list[i + 1:]))
for i in range(len(latlon_list) - 1):
url = create_url(i)
filename = "distances_{0}.json".format(stations.terminal.iloc[i])
print(i, filename)
!curl "{url}" -o {filename}
sleep(11) # only one query per 10 seconds!
def build_distance_matrix(stations=stations):
"""Build a matrix from the Google API results"""
dist = np.zeros((len(stations), len(stations)), dtype=float)
for i, term in enumerate(stations.terminal[:-1]):
filename = 'queried_distances/distances_{0}.json'.format(term)
row = json.load(open(filename))
dist[i, i + 1:] = [el['distance']['value'] for el in row['rows'][0]['elements']]
dist += dist.T
distances = pd.DataFrame(dist, index=stations.terminal,
columns=stations.terminal)
distances.to_csv('station_distances.csv')
return distances
# only call this the first time
import os
if not os.path.exists('station_distances.csv'):
# Note: you can call this function at most ~twice per day!
query_distances()
# Move all the queried files into a directory
# so we don't accidentally overwrite them
if not os.path.exists('queried_distances'):
os.makedirs('queried_distances')
!mv distances_*.json queried_distances
# Build distance matrix and save to CSV
distances = build_distance_matrix()
In [ ]:
distances = pd.read_csv('station_distances.csv', index_col='terminal')
distances.iloc[:5, :5]
In [ ]:
stacked = distances.stack() / 1609.34 # convert meters to miles
stacked.name = 'distance'
trips = trips.join(stacked, on=['from_station_id', 'to_station_id'])
In [ ]:
fig, ax = plt.subplots(figsize=(12, 4))
trips.groupby('Subscription Type')['distance'].hist(bins=np.linspace(0, 6.99, 50),
alpha=0.5, ax=ax);
plt.xlabel('Distance between start & end (miles)')
plt.ylabel('relative frequency')
plt.title('Minimum Distance of Trip')
plt.legend(['Annual Members', 'Short-term Pass']);
In [ ]:
trips['speed'] = trips.distance * 60 / trips.minutes
trips.groupby('Subscription Type')['speed'].hist(bins=np.linspace(0, 15, 50), alpha=0.5, normed=True);
plt.xlabel('lower bound riding speed (MPH)')
plt.ylabel('relative frequency')
plt.title('Rider Speed Lower Bound (MPH)')
plt.legend(['Annual Members', 'Short-term Pass']);