Pronto Cycleshare



In [1]:

    
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()

Downloading Pronto's Data

Data release page: http://www.prontocycleshare.com/datachallenge



In [2]:

    
import os
if not os.path.exists('open_data_year_one.zip'):
    !curl -O https://s3.amazonaws.com/pronto-data/open_data_year_one.zip
    !unzip open_data_year_one.zip



In [3]:

    
trips = pd.read_csv('2015_trip_data.csv',
                    parse_dates=['starttime', 'stoptime'],
                    infer_datetime_format=True)
trips.columns









    Out[3]:





Index(['trip_id', 'starttime', 'stoptime', 'bikeid', 'tripduration',
       'from_station_name', 'to_station_name', 'from_station_id',
       'to_station_id', 'usertype', 'gender', 'birthyear'],
      dtype='object')

Exploring Trips over Time

Trips Over the Year



In [4]:

    
# Find the start date
ind = pd.DatetimeIndex(trips.starttime)
trips['date'] = ind.date.astype('datetime64')
trips['hour'] = ind.hour



In [5]:

    
# Count trips by date
by_date = trips.pivot_table('trip_id', aggfunc='count',
                            index='date',
                            columns='usertype', )

# Count trips by weekday
weekly = by_date.pivot_table(['Annual Member', 'Short-Term Pass Holder'],
                             index=by_date.index.weekofyear,
                             columns=by_date.index.dayofweek)



In [6]:

    
by_date.sort_values('Short-Term Pass Holder', ascending=False)









    Out[6]:






  
    
      usertype
      Annual Member
      Short-Term Pass Holder
    
    
      date
      
      
    
  
  
    
      2015-04-19
      132
      622
    
    
      2015-07-04
      143
      616
    
    
      2015-04-20
      358
      583
    
    
      2015-05-09
      218
      575
    
    
      2015-04-18
      195
      551
    
    
      2015-08-01
      155
      519
    
    
      2015-06-13
      220
      496
    
    
      2015-07-18
      216
      470
    
    
      2015-08-09
      156
      444
    
    
      2015-07-19
      108
      442
    
    
      2014-10-19
      239
      427
    
    
      2015-07-03
      209
      421
    
    
      2015-07-25
      154
      409
    
    
      2015-06-20
      150
      406
    
    
      2015-05-30
      179
      406
    
    
      2015-08-16
      159
      398
    
    
      2015-07-05
      147
      398
    
    
      2015-07-12
      153
      397
    
    
      2015-05-02
      160
      388
    
    
      2015-05-23
      153
      384
    
    
      2015-08-15
      176
      374
    
    
      2015-09-12
      144
      370
    
    
      2015-06-14
      137
      370
    
    
      2015-05-31
      174
      364
    
    
      2015-08-08
      203
      363
    
    
      2015-08-02
      137
      358
    
    
      2015-07-11
      170
      354
    
    
      2015-06-06
      171
      347
    
    
      2015-09-26
      143
      345
    
    
      2015-07-17
      361
      339
    
    
      ...
      ...
      ...
    
    
      2014-11-03
      212
      25
    
    
      2014-10-30
      203
      24
    
    
      2015-01-27
      225
      24
    
    
      2014-12-20
      54
      24
    
    
      2015-02-25
      183
      23
    
    
      2014-12-03
      229
      23
    
    
      2015-02-11
      283
      23
    
    
      2015-02-10
      264
      23
    
    
      2014-12-08
      182
      23
    
    
      2014-12-24
      107
      22
    
    
      2015-01-22
      254
      22
    
    
      2015-01-20
      298
      22
    
    
      2014-11-28
      40
      22
    
    
      2015-01-23
      193
      21
    
    
      2015-02-03
      221
      21
    
    
      2015-01-02
      116
      20
    
    
      2014-11-25
      154
      20
    
    
      2014-12-05
      190
      19
    
    
      2015-10-07
      234
      19
    
    
      2015-01-05
      162
      18
    
    
      2014-12-23
      109
      18
    
    
      2014-12-10
      154
      17
    
    
      2015-01-15
      189
      17
    
    
      2015-02-05
      156
      16
    
    
      2014-12-17
      175
      16
    
    
      2015-02-02
      168
      14
    
    
      2014-12-11
      175
      10
    
    
      2015-03-15
      26
      8
    
    
      2015-01-04
      43
      8
    
    
      2014-12-18
      158
      7
    
  

365 rows × 2 columns



In [7]:

    
fig, ax = plt.subplots(2, 1, figsize=(16, 8), sharex=True, sharey=True)
fig.subplots_adjust(hspace=0.1)

def add_inset(ax, rect, *args, **kwargs):
    box = ax.get_position()
    inax_position = ax.transAxes.transform(rect[0:2])
    infig_position = ax.figure.transFigure.inverted().transform(inax_position)
    new_rect = list(infig_position) + [box.width * rect[2], box.height * rect[3]]
    return fig.add_axes(new_rect, *args, **kwargs)

color_cycle = plt.rcParams['axes.color_cycle']
for i, col in enumerate(['Annual Member', 'Short-Term Pass Holder']):
    by_date[col].plot(ax=ax[i], title=col, color=color_cycle[i])
    ax[i].set_title(col + 's')

with sns.axes_style('whitegrid'):
    inset = [add_inset(ax[0], [0.07, 0.6, 0.2, 0.32]),
             add_inset(ax[1], [0.07, 0.6, 0.2, 0.32])]

for i, col in enumerate(['Annual Member', 'Short-Term Pass Holder']):
    inset[i].plot(range(7), weekly[col].values.T, color=color_cycle[i], lw=2, alpha=0.05);
    inset[i].plot(range(7), weekly[col].mean(0), color=color_cycle[i], lw=3)
    inset[i].set_xticklabels(['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])
    inset[i].yaxis.set_major_locator(plt.MaxNLocator(5))
    inset[i].set_ylim(0, 500)
    inset[i].set_title('Average by Day of Week')
    
fig.savefig('figs/daily_trend.png', bbox_inches='tight')

Trips Over a Day



In [8]:

    
ind = pd.DatetimeIndex(trips.date)
ind.dayofweek









    Out[8]:





array([0, 0, 0, ..., 0, 0, 0], dtype=int32)



In [9]:

    
trips['weekend'] = (ind.dayofweek > 4)
hourly = trips.pivot_table('trip_id', aggfunc='count',
                           index=['date'], columns=['usertype', 'weekend', 'hour'])
fig, ax = plt.subplots(1, 2, figsize=(16, 6), sharex=True, sharey=True)
fig.subplots_adjust(wspace=0.1)
fmt = plt.FuncFormatter(lambda x, *args: '{0}:00'.format(int(x)))

for weekend in (False, True):
    axi = ax[int(weekend)]
    for i, col in enumerate(['Annual Member', 'Short-Term Pass Holder']):
        vals = hourly[col][weekend].values
        vals = np.hstack([vals, vals[:, :1]])
        axi.plot(range(25), vals.T,
                 color=color_cycle[i], lw=1, alpha=0.05)
        axi.plot(range(25), np.nanmean(vals, 0),
                 color=color_cycle[i], lw=3, label=col)
    axi.xaxis.set_major_locator(plt.MultipleLocator(4))
    axi.xaxis.set_major_formatter(fmt)
    axi.set_ylim(0, 60)
    axi.set_title('Saturday - Sunday' if weekend else 'Monday - Friday')
    axi.legend(loc='upper left')
    axi.set_xlabel('Time of Day')
ax[0].set_ylabel('Number of Trips')
fig.suptitle('Hourly Trends: Weekdays and Weekends', size=14);

fig.savefig('figs/hourly_trend.png', bbox_inches='tight')

Trip Distances & Rider Speed



In [10]:

    
stations = pd.read_csv('2015_station_data.csv')
pronto_shop = dict(id=54, name="Pronto shop",
                   terminal="Pronto shop",
                   lat=47.6173156, long=-122.3414776,
                   dockcount=100, online='10/13/2014')
stations = stations.append(pronto_shop, ignore_index=True)



In [11]:

    
# Here we query the Google Maps API for distances between trips

from time import sleep

def query_distances(stations=stations):
    """Query the Google API for bicycling distances"""
    latlon_list = ['{0},{1}'.format(lat, long)
                   for (lat, long) in zip(stations.lat, stations.long)]

    def create_url(i):
        URL = ('https://maps.googleapis.com/maps/api/distancematrix/json?'
               'origins={origins}&destinations={destinations}&mode=bicycling')
        return URL.format(origins=latlon_list[i],
                          destinations='|'.join(latlon_list[i + 1:]))

    for i in range(len(latlon_list) - 1):
        url = create_url(i)
        filename = "distances_{0}.json".format(stations.terminal.iloc[i])
        print(i, filename)
        !curl "{url}" -o {filename}
        sleep(11) # only one query per 10 seconds!


def build_distance_matrix(stations=stations):
    """Build a matrix from the Google API results"""
    dist = np.zeros((len(stations), len(stations)), dtype=float)
    for i, term in enumerate(stations.terminal[:-1]):
        filename = 'queried_distances/distances_{0}.json'.format(term)
        row = json.load(open(filename))
        dist[i, i + 1:] = [el['distance']['value'] for el in row['rows'][0]['elements']]
    dist += dist.T
    distances = pd.DataFrame(dist, index=stations.terminal,
                             columns=stations.terminal)
    distances.to_csv('station_distances.csv')
    return distances

# only call this the first time
import os
if not os.path.exists('station_distances.csv'):
    # Note: you can call this function at most ~twice per day!
    query_distances()

    # Move all the queried files into a directory
    # so we don't accidentally overwrite them
    if not os.path.exists('queried_distances'):
        os.makedirs('queried_distances')
    !mv distances_*.json queried_distances

    # Build distance matrix and save to CSV
    distances = build_distance_matrix()



In [12]:

    
distances = pd.read_csv('station_distances.csv', index_col='terminal')
distances.iloc[:5, :5]



In [13]:

    
stacked = distances.stack() / 1609.34  # convert meters to miles
stacked.name = 'distance'
tmp = trips.join(stacked, on=['from_station_id', 'to_station_id'])
trips['distance'] = tmp['distance']



In [14]:

    
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(wspace=0.1)

trips['minutes'] = trips.tripduration / 60
trips.groupby('usertype')['minutes'].hist(bins=np.arange(61), alpha=0.5, normed=True, ax=ax[0]);
ax[0].set_xlabel('Duration (minutes)')
ax[0].set_ylabel('relative frequency')
ax[0].set_title('Trip Durations')
ax[0].text(34, 0.09, "Free Trips\n\nAdditional Fee", ha='right',
           size=18, rotation=90, alpha=0.5, color='red')
ax[0].legend(['Annual Members', 'Short-term Pass'])
ax[0].axvline(30, linestyle='--', color='red', alpha=0.3);

trips['minutes'] = trips['tripduration'] / 60
trips['speed'] = trips.distance * 60 / trips.minutes
trips.groupby('usertype')['speed'].hist(bins=np.linspace(0, 20, 50), normed=True,
                                        alpha=0.5, ax=ax[1])
ax[1].set_xlabel('Minimum Speed (mph)')
ax[1].legend(['Annual Members', 'Short-term Pass Holders'])
ax[1].set_title('Rider Speed');

fig.savefig('figs/duration_and_speed.png', bbox_inches='tight')



In [15]:

    
g = sns.FacetGrid(trips, col="usertype", hue='usertype', size=6)
g.map(plt.scatter, "distance", "speed", s=4, alpha=0.2)
g.axes[0, 0].axis([0, 10, 0, 25]);

for axi, col in zip(g.axes.flat, ['Annual Member', 'Short-Term Pass Holder']):
    axi.text(9.5, 16.6, "Free Trips\n\nAdditional Fee", ha='right',
             size=17, rotation=45, alpha=0.5, color='red')
    axi.plot([0, 20], [0, 40], '--r', alpha=0.3)
    axi.axis([0, 10, 0, 20])
    axi.set_title(col + 's')
    axi.set_xlabel('Minimum Distance (miles)')
g.axes[0, 0].set_ylabel('Minimum Speed (MPH)');

g.fig.savefig('figs/distance_vs_speed.png', bbox_inches='tight')









    



/Users/jakevdp/anaconda/envs/python3.4/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

Trend with Elevation



In [16]:

    
# Get elevations from the Google Maps API

def get_station_elevations(stations):
    """Get station elevations via Google Maps API"""
    URL = "https://maps.googleapis.com/maps/api/elevation/json?locations="
    locs = '|'.join(['{0},{1}'.format(lat, long)
                     for (lat, long) in zip(stations.lat, stations.long)])
    URL += locs
    !curl "{URL}" -o elevations.json


def process_station_elevations():
    """Convert Elevations JSON output to CSV"""
    import json
    D = json.load(open('elevations.json'))
    def unnest(D):
        loc = D.pop('location')
        loc.update(D)
        return loc
    elevs = pd.DataFrame([unnest(item) for item in D['results']])
    elevs.to_csv('station_elevations.csv')
    return elevs

# only run this the first time:
import os
if not os.path.exists('station_elevations.csv'):
    get_station_elevations(stations)
    process_station_elevations()



In [17]:

    
elevs = pd.read_csv('station_elevations.csv', index_col=0)
elevs.head()









    Out[17]:






  
    
      
      elevation
      lat
      lng
      resolution
    
  
  
    
      0
      37.351780
      47.618418
      -122.350964
      76.351616
    
    
      1
      33.815830
      47.615829
      -122.348564
      76.351616
    
    
      2
      34.274055
      47.616094
      -122.341102
      76.351616
    
    
      3
      44.283257
      47.613110
      -122.344208
      76.351616
    
    
      4
      42.460381
      47.610185
      -122.339641
      76.351616



In [18]:

    
# double check that locations match
print(np.allclose(stations.long, elevs.lng))
print(np.allclose(stations.lat, elevs.lat))









    



True
True



In [19]:

    
stations['elevation'] = elevs['elevation']
elevs.index = stations['terminal']

trips['elevation_start'] = trips.join(elevs, on='from_station_id')['elevation']
trips['elevation_end'] = trips.join(elevs, on='to_station_id')['elevation']
trips['elevation_gain'] = trips['elevation_end'] - trips['elevation_start']



In [20]:

    
len(trips)









    Out[20]:





142846



In [21]:

    
g = sns.FacetGrid(trips, col="usertype", hue='usertype')
g.map(plt.hist, "elevation_gain", bins=np.arange(-145, 150, 10))
g.fig.set_figheight(6)
g.fig.set_figwidth(16);

# plot some lines to guide the eye
for lim in range(60, 150, 20):
    x = np.linspace(-lim, lim, 3)
    for ax in g.axes.flat:
        ax.fill(x, 100 * (lim - abs(x)),
                color='gray', alpha=0.1, zorder=0)
        
g.axes[0, 0].set_title('Annual Members')
g.axes[0, 0].set_xlabel('Elevation Gain (meters)')
g.axes[0, 0].set_ylabel('Number of Rides')
g.axes[0, 1].set_title('Short-Term Pass Holders')
g.axes[0, 1].set_xlabel('Elevation Gain (meters)');

counts = trips.groupby(['usertype', np.sign(trips.elevation_gain)])['trip_id'].count().unstack()
percents = (100 * (counts.T / counts.sum(axis=1)).T).astype(int)


for i, col in enumerate(['Annual Member', 'Short-Term Pass Holder']):
    g.axes[0, i].text(0.98, 0.98,
                      ("Total downhill trips: {c[0]} ({p[0]}%)\n"
                       "Total flat trips: {c[1]} ({p[1]}%)\n"
                       "Total uphill trips: {c[2]} ({p[2]}%)".format(c=counts.loc[col].values,
                                                                     p=percents.loc[col].values)),
                      transform=g.axes[0, i].transAxes, ha='right', va='top', fontsize=14)

g.fig.savefig('figs/elevation.png', bbox_inches='tight')









    



/Users/jakevdp/anaconda/envs/python3.4/lib/python3.4/site-packages/pandas/core/index.py:4281: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  return np.sum(name == np.asarray(self.names)) > 1



In [22]:

    
print("total downhill trips:", (trips.elevation_gain < 0).sum())
print("total uphill trips:  ", (trips.elevation_gain > 0).sum())









    



total downhill trips: 80532
total uphill trips:   50493

Trend with Weather



In [23]:

    
weather = pd.read_csv('2015_weather_data.csv', index_col='Date', parse_dates=True)
weather.columns









    Out[23]:





Index(['Max_Temperature_F', 'Mean_Temperature_F', 'Min_TemperatureF',
       'Max_Dew_Point_F', 'MeanDew_Point_F', 'Min_Dewpoint_F', 'Max_Humidity',
       'Mean_Humidity ', 'Min_Humidity ', 'Max_Sea_Level_Pressure_In ',
       'Mean_Sea_Level_Pressure_In ', 'Min_Sea_Level_Pressure_In ',
       'Max_Visibility_Miles ', 'Mean_Visibility_Miles ',
       'Min_Visibility_Miles ', 'Max_Wind_Speed_MPH ', 'Mean_Wind_Speed_MPH ',
       'Max_Gust_Speed_MPH', 'Precipitation_In ', 'Events'],
      dtype='object')



In [24]:

    
by_date = trips.groupby(['date', 'usertype'])['trip_id'].count()
by_date.name = 'count'
by_date = by_date.reset_index('usertype').join(weather)



In [25]:

    
# add a flag indicating weekend
by_date['weekend'] = (by_date.index.dayofweek >= 5)

#----------------------------------------------------------------
# Plot Temperature Trend
g = sns.FacetGrid(by_date, col="weekend", hue='usertype', size=6)
g.map(sns.regplot, "Mean_Temperature_F", "count")
g.add_legend();

# do some formatting
g.axes[0, 0].set_title('')
g.axes[0, 1].set_title('')
g.axes[0, 0].text(0.05, 0.95, 'Monday - Friday', va='top', size=14,
                  transform=g.axes[0, 0].transAxes)
g.axes[0, 1].text(0.05, 0.95, 'Saturday - Sunday', va='top', size=14,
                  transform=g.axes[0, 1].transAxes)
g.fig.text(0.45, 1, "Trend With Temperature", ha='center', va='top', size=16);
for ax in g.axes.flat:
    ax.set_xlabel('Mean Temperature (F)')
g.axes.flat[0].set_ylabel('Rider Count')
g.fig.savefig('figs/temperature.png', bbox_inches='tight')









    



/Users/jakevdp/anaconda/envs/python3.4/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):



In [26]:

    
#----------------------------------------------------------------
# Plot Precipitation
g = sns.FacetGrid(by_date, col="weekend", hue='usertype', size=6)
g.map(sns.regplot, "Precipitation_In ", "count")
g.add_legend();

# do some formatting
g.axes[0, 0].set_ylim(-50, 600);
g.axes[0, 0].set_title('')
g.axes[0, 1].set_title('')
g.axes[0, 0].text(0.95, 0.95, 'Monday - Friday', ha='right', va='top', size=14,
                  transform=g.axes[0, 0].transAxes)
g.axes[0, 1].text(0.95, 0.95, 'Saturday - Sunday', ha='right', va='top', size=14,
                  transform=g.axes[0, 1].transAxes)
g.fig.text(0.45, 1, "Trend With Precipitation", ha='center', va='top', size=16);
for ax in g.axes.flat:
    ax.set_xlabel('Precipitation (inches)')
g.axes.flat[0].set_ylabel('Rider Count');
g.fig.savefig('figs/precipitation.png', bbox_inches='tight')









    



/Users/jakevdp/anaconda/envs/python3.4/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

	BT-01	BT-03	BT-04	BT-05	CBD-13
terminal
BT-01	0	422	1067	867	1342
BT-03	422	0	838	445	920
BT-04	1067	838	0	1094	1121
BT-05	867	445	1094	0	475
CBD-13	1342	920	1121	475	0

usertype	Annual Member	Short-Term Pass Holder
date
2015-04-19	132	622
2015-07-04	143	616
2015-04-20	358	583
2015-05-09	218	575
2015-04-18	195	551
2015-08-01	155	519
2015-06-13	220	496
2015-07-18	216	470
2015-08-09	156	444
2015-07-19	108	442
2014-10-19	239	427
2015-07-03	209	421
2015-07-25	154	409
2015-06-20	150	406
2015-05-30	179	406
2015-08-16	159	398
2015-07-05	147	398
2015-07-12	153	397
2015-05-02	160	388
2015-05-23	153	384
2015-08-15	176	374
2015-09-12	144	370
2015-06-14	137	370
2015-05-31	174	364
2015-08-08	203	363
2015-08-02	137	358
2015-07-11	170	354
2015-06-06	171	347
2015-09-26	143	345
2015-07-17	361	339
...	...	...
2014-11-03	212	25
2014-10-30	203	24
2015-01-27	225	24
2014-12-20	54	24
2015-02-25	183	23
2014-12-03	229	23
2015-02-11	283	23
2015-02-10	264	23
2014-12-08	182	23
2014-12-24	107	22
2015-01-22	254	22
2015-01-20	298	22
2014-11-28	40	22
2015-01-23	193	21
2015-02-03	221	21
2015-01-02	116	20
2014-11-25	154	20
2014-12-05	190	19
2015-10-07	234	19
2015-01-05	162	18
2014-12-23	109	18
2014-12-10	154	17
2015-01-15	189	17
2015-02-05	156	16
2014-12-17	175	16
2015-02-02	168	14
2014-12-11	175	10
2015-03-15	26	8
2015-01-04	43	8
2014-12-18	158	7

	elevation	lat	lng	resolution
0	37.351780	47.618418	-122.350964	76.351616
1	33.815830	47.615829	-122.348564	76.351616
2	34.274055	47.616094	-122.341102	76.351616
3	44.283257	47.613110	-122.344208	76.351616
4	42.460381	47.610185	-122.339641	76.351616