This is a work in progress analyzing the open data from Pronto Cycle Share's Data Challenge.
See ProntoData.ipynb for an introduction to the data; this notebook builds on quantities derived there.
In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
In [2]:
trips = pd.read_csv('2015_trip_data.csv',
                    parse_dates=['starttime', 'stoptime'],
                    infer_datetime_format=True)
t_start = pd.DatetimeIndex(trips['starttime'])
t_stop = pd.DatetimeIndex(trips['stoptime'])
trips['date'] = t_start.date.astype('datetime64')
trips['starttime'] = t_start.time
trips['stoptime'] = t_stop.time
trips['minuteofday'] = t_start.hour * 60 + t_start.minute   # minutes since midnight
trips['minutes'] = trips.tripduration / 60.                 # trip duration in minutes
In [3]:
stations = pd.read_csv('2015_station_data.csv')
pronto_shop = dict(id=54, name="Pronto shop",
                   terminal="Pronto shop",
                   lat=47.6173156, long=-122.3414776,
                   dockcount=100, online='10/13/2014')
stations = stations.append(pronto_shop, ignore_index=True)
distances = pd.read_csv('station_distances.csv', index_col='terminal')
distances /= 1609.34 # convert meters to miles
In [4]:
trips['distance'] = [distances.loc[ind] for ind in
                     zip(trips.from_station_id, trips.to_station_id)]
trips['speed'] = trips.distance * 60 / trips.minutes   # miles per hour
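The list comprehension above does one .loc lookup per trip. A vectorized sketch of the same lookup is below; it assumes the pandas version in use still provides DataFrame.lookup and that the columns of station_distances.csv are labeled by terminal ID, just like its index. It is an equivalent alternative, not part of the original analysis.
# hedged sketch: same pairwise distance lookup, vectorized
# (DataFrame.lookup is available in older pandas releases)
trips['distance'] = distances.lookup(trips.from_station_id, trips.to_station_id)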
In [5]:
trips.head()
Out[5]:
In [6]:
groups = trips.groupby(['from_station_id', 'to_station_id'])
paired = groups.aggregate({'distance': 'mean', 'trip_id': 'count',
                           'from_station_name': 'first',
                           'to_station_name': 'first'})
paired.rename(columns={'trip_id': 'count'}, inplace=True)
In [7]:
countmat = paired['count'].unstack()
total = countmat + countmat.T                  # trips between each pair, in either direction
total.values.flat[::total.shape[0] + 1] /= 2   # the diagonal (round trips) was double-counted
paired['total'] = total.stack()
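As a sanity check on the symmetrization above, here is a toy example (made-up numbers, not the Pronto data) showing why only the diagonal needs halving: off-diagonal entries of C + C.T already combine the two directions, while round trips that start and end at the same station are counted twice.
import numpy as np
C = np.array([[4, 1],
              [3, 0]])                   # C[i, j] = trips from station i to station j
total = C + C.T                          # [[8, 4], [4, 0]]: diagonal is doubled
total.flat[::total.shape[0] + 1] //= 2   # same striding trick as above
print(total)                             # [[4, 4], [4, 0]]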
In [8]:
fig = plt.figure()
ax = plt.axes(yscale='log')
ax.plot(paired['distance'], paired['total'], '.k')
ax.set_xlabel('distance between stations (miles)')
ax.set_ylabel('number of trips');
# highlight the outlying station pair: roughly 6.8 miles apart with ~95 trips
ax.plot(6.83, 95, 's', ms=30, mec='red', mfc='none', mew=1)
fig.savefig('figs/trips_by_distance.png', bbox_inches='tight')
In [9]:
station_id_map = trips.groupby('from_station_id')['from_station_name'].first()
def get_group(id1, id2, include_reverse=False):
    query = '(usertype == "Annual Member")'
    if include_reverse:
        query += (' & ((from_station_id == "{0}" & to_station_id == "{1}") |'
                  '(from_station_id == "{1}" & to_station_id == "{0}"))')
    else:
        query += ' & (from_station_id == "{0}" & to_station_id == "{1}")'
    return trips.query(query.format(id1, id2))
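For a concrete sense of what this builds, the station pair used in the compactness check below expands to a single pandas query string:
# annual-member trips between BT-03 and UD-01, in either direction
both_ways = get_group('BT-03', 'UD-01', include_reverse=True)
# query string passed to trips.query():
# (usertype == "Annual Member") & ((from_station_id == "BT-03" & to_station_id == "UD-01") |
#                                  (from_station_id == "UD-01" & to_station_id == "BT-03"))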
In [10]:
fig, ax = plt.subplots()
id1, id2 = paired.query('distance > 6 & total > 50').reset_index()['from_station_id']

for route in [(id1, id2), (id2, id1)]:
    group = get_group(*route)
    names = station_id_map[route[0]], station_id_map[route[1]]
    lines = ax.plot(group['starttime'], group['minutes'], 'o', ms=5,
                    label="{0} $\\to$ {1}".format(*(' '.join(n.split()[:2]) for n in names)))
    color = lines[0].get_color()
    ax.plot(group['stoptime'], group['minutes'], 'o', ms=5, color=color)
    for i in range(group.shape[0]):
        ax.plot([group['starttime'].values[i], group['stoptime'].values[i]],
                2 * [group['minutes'].values[i]], '-', color=color, alpha=0.3)

ax.text(0.98, 0.02, "{0} born in {1}".format(group['gender'].iloc[0], int(group['birthyear'].iloc[0])),
        ha='right', va='bottom', transform=ax.transAxes, fontsize=12)
ax.xaxis.set_major_locator(plt.MultipleLocator(2 * 60 * 60))
ax.legend(loc='best', fontsize=12)
ax.set_title('{0} \nto\n {1}'.format(*names))
ax.set_xlabel('ride start time')
ax.set_ylabel('ride duration (minutes)');

fig.savefig('figs/power_user.png', bbox_inches='tight')
In [11]:
from sklearn.cluster import MeanShift
def compute_compactness(group, min_samples=25, bandwidth=5):
    """Return a measure of the compactness of the group"""
    # arrange data to cluster: divide minuteofday by 10
    # to increase effective bandwidth by a factor of 10
    X = np.vstack([group.minuteofday / 10,
                   group.minutes]).T
    if X.shape[0] < min_samples:
        return 0
    # compute the meanshift clusters, and count number of points in each
    c = MeanShift(bandwidth=bandwidth).fit_predict(X)
    counts = pd.Series(c).groupby(c).count()
    # Select only the points from the dominant cluster
    c = pd.Series(c).map(counts == counts.max())
    if c.sum() < min_samples or c.sum() < 0.9 * len(c):
        return 0
    else:
        return bandwidth / X[c.values].std(1).max()
compute_compactness(get_group('BT-03', 'UD-01'))
Out[11]:
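A side note on the clustering step: MeanShift assigns every point a cluster label, and the group-by count picks out the dominant cluster. Below is a minimal, self-contained sketch of that label-counting logic on toy data (not the trip data); the blob positions and sizes are made up for illustration.
import numpy as np
import pandas as pd
from sklearn.cluster import MeanShift

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, (40, 2)),     # one tight cluster of 40 points
               rng.normal(30, 1, (5, 2))])    # a small, distant second cluster
labels = MeanShift(bandwidth=5).fit_predict(X)
counts = pd.Series(labels).groupby(labels).count()
dominant = pd.Series(labels).map(counts == counts.max())  # True for points in the biggest cluster
print(counts.to_dict())   # e.g. {0: 40, 1: 5}
print(dominant.sum())     # 40 points belong to the dominant cluster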
In [12]:
subset = trips.query('usertype == "Annual Member"')
groups = subset.groupby(['from_station_id', 'to_station_id'])
compactness = groups.apply(compute_compactness).fillna(0).unstack()
ranked = compactness.unstack().sort_values(ascending=False)
ranked.iloc[:10]
Out[12]:
In [13]:
def analyze_pair(id1, id2):
    subset = get_group(id1, id2, True).copy()

    # keep only the dominant rider on this route (the most common birth year)
    unique = pd.value_counts(subset['birthyear'])
    subset = subset[subset['birthyear'] == unique.index[0]]

    def mean_time(col):
        """Mean time of day, in seconds after midnight"""
        return np.mean([c.hour * 3600 + c.minute * 60 + c.second
                        for c in col])

    # label each trip "morning" or "afternoon" by which end of the route it starts from
    AMPM = subset.groupby('from_station_id')['starttime'].aggregate(mean_time)
    AMPM.sort_values(inplace=True)
    subset['AMPM'] = np.where(subset.from_station_id == np.argmin(AMPM),
                              'morning', 'afternoon')

    print('{0} -> {1}'.format(id1, id2))
    print(pd.value_counts(subset['birthyear']))
    print(pd.value_counts(subset['gender']))
    print("distance:", distances.loc[id1, id2])
    print("Date Range:", subset.date.min(), "to", subset.date.max())
    print("-----------------------------")

    # left panel: trip duration vs. time of day; right panel: trip duration vs. date
    fig, ax = plt.subplots(1, 2, figsize=(16, 6), sharey=True)
    fig.subplots_adjust(left=0.05, right=0.95, wspace=0.05)
    fig.suptitle('{0} $\\longrightarrow$ {1}'.format(station_id_map[id1],
                                                     station_id_map[id2]),
                 size=14)
    ax[0].text(0.02, 0.98,
               "{1:d} total trips\ndistance = {0:.2f} mi".format(distances.loc[id1, id2],
                                                                 len(subset)),
               ha='left', va='top', transform=ax[0].transAxes, fontsize=14)

    names = station_id_map[AMPM.index[0]], station_id_map[AMPM.index[1]]
    colors = plt.rcParams['axes.color_cycle']
    for from_station, color in zip(AMPM.index, colors):
        half = subset[subset.from_station_id == from_station]
        if AMPM.index[0] == from_station:
            order = names
        else:
            order = names[::-1]
        ax[0].scatter(half['starttime'].values, half['minutes'].values,
                      c=color,
                      label="{0} $\\to$ {1}".format(*(' '.join(n.split()[:2]) for n in order)))
        ax[0].scatter(half['stoptime'].values, half['minutes'].values,
                      c=color)
        for i in range(half.shape[0]):
            ax[0].plot([half['starttime'].values[i], half['stoptime'].values[i]],
                       2 * [half['minutes'].values[i]], '-', color=color, alpha=0.3)

    ax[0].legend(loc='lower right', fontsize=14)
    ax[0].xaxis.set_major_locator(plt.MultipleLocator(4 * 60 * 60))
    ax[0].set_ylabel('trip duration (minutes)')

    ns_in_day = 24 * 60 * 60 * 1E9
    subset['daynumber'] = (subset.date - subset.date.iloc[0]).astype(int) / ns_in_day

    def dateformat(x, *args):
        return str(subset.date.iloc[0] + pd.datetools.timedelta(days=int(x))).split()[0]

    ax[1].xaxis.set_major_locator(plt.MaxNLocator(6))
    ax[1].xaxis.set_major_formatter(plt.FuncFormatter(dateformat))
    for AMPM in ['morning', 'afternoon']:
        sns.regplot('daynumber', 'minutes', data=subset.query('AMPM == "{0}"'.format(AMPM)),
                    ax=ax[1])
    ax[1].set_ylabel('')
    ax[1].set_xlabel('Date')
    ax[1].text(0.98, 0.98,
               "{0} born in {1}".format(subset['gender'].iloc[0], int(subset['birthyear'].iloc[0])),
               ha='right', va='top', transform=ax[1].transAxes, fontsize=14)

    return fig
In [14]:
for i, ind in enumerate([0, 1, 3, 4, 7]):
    fig = analyze_pair(*ranked.index[ind])
    fig.savefig('figs/extreme-user-{0}.png'.format(i + 1))