Analyzing Anual Members(subscriber) vs Day-pass members(customer)


In [1]:
# Borrowed from https://jakevdp.github.io/blog/2015/10/17/analyzing-pronto-cycleshare-data-with-python-and-pandas/

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()


/Users/nandini/anaconda/envs/263n/lib/python2.7/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

In [3]:
trips = pd.read_csv('DATA/babs_master/trip_master.csv',
                    parse_dates=['Start Date', 'End Date'],
                    infer_datetime_format=True)
trips.head()


Out[3]:
Trip ID Duration Start Date Start Station Start Terminal End Date End Station End Terminal Bike # Subscription Type Zip Code
0 4576 63 2013-08-29 14:13:00 South Van Ness at Market 66 2013-08-29 14:14:00 South Van Ness at Market 66 520 Subscriber 94127
1 4607 70 2013-08-29 14:42:00 San Jose City Hall 10 2013-08-29 14:43:00 San Jose City Hall 10 661 Subscriber 95138
2 4130 71 2013-08-29 10:16:00 Mountain View City Hall 27 2013-08-29 10:17:00 Mountain View City Hall 27 48 Subscriber 97214
3 4251 77 2013-08-29 11:29:00 San Jose City Hall 10 2013-08-29 11:30:00 San Jose City Hall 10 26 Subscriber 95060
4 4299 83 2013-08-29 12:02:00 South Van Ness at Market 66 2013-08-29 12:04:00 Market at 10th 67 319 Subscriber 94103

In [4]:
# Find the start date
ind = pd.DatetimeIndex(trips['Start Date'])
trips['date'] = ind.date.astype('datetime64')
trips['hour'] = ind.hour

In [5]:
# Count trips by date
by_date = trips.pivot_table('Trip ID', aggfunc='count',
                            index='date',
                            columns='Subscription Type', )

In [16]:
fig, ax = plt.subplots(2, figsize=(16, 8))
fig.subplots_adjust(hspace=0.4)
by_date.iloc[:, 0].plot(ax=ax[0], title='Annual Members', color='green');
by_date.iloc[:, 1].plot(ax=ax[1], title='Short-term Users');



In [8]:
by_weekday = by_date.groupby([by_date.index.year,
                              by_date.index.dayofweek]).mean()
by_weekday.columns.name = None  # remove label for plot

fig, ax = plt.subplots(1, 2, figsize=(16, 6), sharey=True)
by_weekday.loc[2014].plot(title='Average Use by Day of Week (2014)', ax=ax[0]);
by_weekday.loc[2015].plot(title='Average Use by Day of Week (2015)', ax=ax[1]);
for axi in ax:
    axi.set_xticklabels(['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])



In [9]:
# count trips by date and by hour
by_hour = trips.pivot_table('Trip ID', aggfunc='count',
                            index=['date', 'hour'],
                            columns='Subscription Type').fillna(0).reset_index('hour')

# average these counts by weekend
by_hour['weekend'] = (by_hour.index.dayofweek >= 5)
by_hour = by_hour.groupby(['weekend', 'hour']).mean()
by_hour.index.set_levels([['weekday', 'weekend'],
                          ["{0}:00".format(i) for i in range(24)]],
                         inplace=True);
by_hour.columns.name = None

In [10]:
fig, ax = plt.subplots(1, 2, figsize=(16, 6), sharey=True)
by_hour.loc['weekday'].plot(title='Average Hourly Use (Mon-Fri)', ax=ax[0])
by_hour.loc['weekend'].plot(title='Average Hourly Use (Sat-Sun)', ax=ax[1])
ax[0].set_ylabel('Average Trips per Hour');



In [48]:
trips['minutes'] = trips['Duration'] /60
# trips['log_minutes'] = trips.minutes.apply(lambda x: np.log(x))
trips.groupby('Subscription Type')['minutes'].hist(bins=np.arange(61), alpha=0.5, normed=True, Log=True);
plt.xlabel('Duration (minutes)')
plt.ylabel('relative frequency')
plt.title('Trip Durations')
plt.text(34, 0.09, "Free Trips\n\nAdditional Fee", ha='right',size=14, rotation=90, alpha=0.5, color='red')
plt.legend(['Short-term Users', 'Annual Members'])

plt.axvline(30, linestyle='--', color='red', alpha=0.3);


Estimating Trip Distances


In [23]:
stations = pd.read_csv('DATA/babs_open_data_year_1/201402_babs_open_data/201402_station_data.csv')
pronto_shop = dict(id=54, name="Pronto shop",
                   terminal="Pronto shop",
                   lat=47.6173156, long=-122.3414776,
                   dockcount=100, online='10/13/2014')
stations = stations.append(pronto_shop, ignore_index=True)

In [ ]:
from time import sleep

def query_distances(stations=stations):
    """Query the Google API for bicycling distances"""
    latlon_list = ['{0},{1}'.format(lat, lng)
                   for (lat, lng) in zip(stations.lat, stations.long)]

    def create_url(i):
        URL = ('https://maps.googleapis.com/maps/api/distancematrix/json?'
               'origins={origins}&destinations={destinations}&mode=bicycling')
        return URL.format(origins=latlon_list[i],
                          destinations='|'.join(latlon_list[i + 1:]))

    for i in range(len(latlon_list) - 1):
        url = create_url(i)
        filename = "distances_{0}.json".format(stations.terminal.iloc[i])
        print(i, filename)
        !curl "{url}" -o {filename}
        sleep(11) # only one query per 10 seconds!


def build_distance_matrix(stations=stations):
    """Build a matrix from the Google API results"""
    dist = np.zeros((len(stations), len(stations)), dtype=float)
    for i, term in enumerate(stations.terminal[:-1]):
        filename = 'queried_distances/distances_{0}.json'.format(term)
        row = json.load(open(filename))
        dist[i, i + 1:] = [el['distance']['value'] for el in row['rows'][0]['elements']]
    dist += dist.T
    distances = pd.DataFrame(dist, index=stations.terminal,
                             columns=stations.terminal)
    distances.to_csv('station_distances.csv')
    return distances

# only call this the first time
import os
if not os.path.exists('station_distances.csv'):
    # Note: you can call this function at most ~twice per day!
    query_distances()

    # Move all the queried files into a directory
    # so we don't accidentally overwrite them
    if not os.path.exists('queried_distances'):
        os.makedirs('queried_distances')
    !mv distances_*.json queried_distances

    # Build distance matrix and save to CSV
    distances = build_distance_matrix()

In [ ]:
distances = pd.read_csv('station_distances.csv', index_col='terminal')
distances.iloc[:5, :5]

In [ ]:
stacked = distances.stack() / 1609.34  # convert meters to miles
stacked.name = 'distance'
trips = trips.join(stacked, on=['from_station_id', 'to_station_id'])

In [ ]:
fig, ax = plt.subplots(figsize=(12, 4))
trips.groupby('Subscription Type')['distance'].hist(bins=np.linspace(0, 6.99, 50),
                                           alpha=0.5, ax=ax);
plt.xlabel('Distance between start & end (miles)')
plt.ylabel('relative frequency')
plt.title('Minimum Distance of Trip')
plt.legend(['Annual Members', 'Short-term Pass']);

Rider Speed


In [ ]:
trips['speed'] = trips.distance * 60 / trips.minutes
trips.groupby('Subscription Type')['speed'].hist(bins=np.linspace(0, 15, 50), alpha=0.5, normed=True);
plt.xlabel('lower bound riding speed (MPH)')
plt.ylabel('relative frequency')
plt.title('Rider Speed Lower Bound (MPH)')
plt.legend(['Annual Members', 'Short-term Pass']);