In [1]:
    
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
    
In [2]:
    
trips = pd.read_csv('2015_trip_data.csv',
                    parse_dates=['starttime', 'stoptime'],
                    infer_datetime_format=True)
    
In [3]:
    
ind = pd.DatetimeIndex(trips.starttime)
trips['date'] = ind.date.astype('datetime64')
trips['hour'] = ind.hour
    
In [4]:
    
hourly = trips.pivot_table('trip_id', aggfunc='count',
                           index=['usertype', 'date'], columns='hour').fillna(0)
hourly.head()
    
    Out[4]:
In [5]:
    
from sklearn.decomposition import PCA
data = hourly[np.arange(24)].values
data_pca = PCA(2).fit_transform(data)
hourly['projection1'], hourly['projection2'] = data_pca.T
    
In [6]:
    
hourly['total rides'] = hourly.sum(axis=1)
    
In [7]:
    
hourly.plot('projection1', 'projection2', kind='scatter', c='total rides', cmap='Blues_r');
plt.savefig('figs/pca_raw.png', bbox_inches='tight')
    
    
    
In [8]:
    
from sklearn.mixture import GMM
gmm = GMM(3, covariance_type='full', random_state=2)
data = hourly[['projection1', 'projection2']]
gmm.fit(data)
# require high-probability cluster membership
hourly['cluster'] = (gmm.predict_proba(data)[:, 0] > 0.6).astype(int)
    
In [9]:
    
from datetime import time
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(wspace=0.1)
times = pd.date_range('0:00', '23:59', freq='H').time
times = np.hstack([times, time(23, 59, 59)])
hourly.plot('projection1', 'projection2', c='cluster', kind='scatter', 
            cmap='rainbow', colorbar=False, ax=ax[0]);
for i in range(2):
    vals = hourly.query("cluster == " + str(i))[np.arange(24)]
    vals[24] = vals[0]
    ax[1].plot(times, vals.T, color=plt.cm.rainbow(255 * i), alpha=0.05, lw=0.5)
    ax[1].plot(times, vals.mean(0), color=plt.cm.rainbow(255 * i), lw=3)
    ax[1].set_xticks(4 * 60 * 60 * np.arange(6))
    
ax[1].set_ylim(0, 60);
ax[1].set_ylabel('Rides per hour');
fig.savefig('figs/pca_clustering.png', bbox_inches='tight')
    
    
    
In [10]:
    
fig, ax = plt.subplots(1, 2, figsize=(16, 6), sharex=True, sharey=True)
fig.subplots_adjust(wspace=0.05)
for i, col in enumerate(['Annual Member', 'Short-Term Pass Holder']):
    hourly.loc[col].plot('projection1', 'projection2',  c='cluster', kind='scatter', 
                         cmap='rainbow', colorbar=False, ax=ax[i]);
    ax[i].set_title(col + 's')
    
fig.savefig('figs/pca_annual_vs_shortterm.png', bbox_inches='tight')
    
    
    
In [11]:
    
usertype = hourly.index.get_level_values('usertype')
weekday = hourly.index.get_level_values('date').dayofweek < 5
hourly['commute'] = (weekday & (usertype == "Annual Member"))
fig, ax = plt.subplots()
hourly.plot('projection1', 'projection2', c='commute', kind='scatter', 
            cmap='binary', colorbar=False, ax=ax);
ax.set_title("Annual Member Weekdays vs Other")
fig.savefig('figs/pca_true_weekends.png', bbox_inches='tight')
    
    
    
In [12]:
    
mismatch = hourly.query('cluster == 0 & commute')
mismatch = mismatch.reset_index('usertype')[['usertype', 'projection1', 'projection2']]
mismatch
    
    Out[12]:
In [13]:
    
from pandas.tseries.holiday import USFederalHolidayCalendar
cal = USFederalHolidayCalendar()
holidays = cal.holidays('2014-08', '2015-10', return_name=True)
holidays_all = pd.concat([holidays,
                          "2 Days Before " + holidays.shift(-2, 'D'),
                          "Day Before " + holidays.shift(-1, 'D'),
                          "Day After " + holidays.shift(1, 'D')])
holidays_all = holidays_all.sort_index()
holidays_all.head()
    
    Out[13]:
In [14]:
    
holidays_all.name = 'holiday name'  # required for join
joined = mismatch.join(holidays_all)
joined['holiday name']
    
    Out[14]:
In [15]:
    
set(holidays) - set(joined['holiday name'])
    
    Out[15]:
In [16]:
    
fig, ax = plt.subplots()
hourly.plot('projection1', 'projection2', c='cluster', kind='scatter', 
            cmap='binary', colorbar=False, ax=ax);
ax.set_title("Holidays in Projected Results")
for i, ind in enumerate(joined.sort_values('projection1').index):
    x, y = hourly.loc['Annual Member', ind][['projection1', 'projection2']]
    if i % 2:
        ytext = 20 + 3 * i
    else:
        ytext = -8 - 4 * i
    ax.annotate(joined.loc[ind, 'holiday name'], [x, y], [x , ytext], color='black',
                ha='center', arrowprops=dict(arrowstyle='-', color='black'))
    ax.scatter([x], [y], c='red')
    
for holiday in (set(holidays) - set(joined['holiday name'])):
    ind = holidays[holidays == holiday].index[0]
    #ind = ind.strftime('%Y-%m-%d')
    x, y = hourly.loc['Annual Member', ind][['projection1', 'projection2']]
    ax.annotate(holidays.loc[ind], [x, y], [x + 20, y + 30], color='black',
                ha='center', arrowprops=dict(arrowstyle='-', color='black'))
    ax.scatter([x], [y], c='#00FF00')
ax.set_xlim([-60, 60])
ax.set_ylim([-60, 60])
fig.savefig('figs/pca_holiday_labels.png', bbox_inches='tight')