Mobi data scraped using custom code from Mobi's public API. Weather data similarly pulled from Environment Canada with custom code.
Inspired by an example in Jake Vanderplas' Python Data Science Handbook
In [1]:
%matplotlib notebook
from vanweather import get_weather_range
In [2]:
#Load mobi daily data
thdf = pd.read_pickle('taken_hourly_df.p')
thdf = thdf.drop_duplicates()
tddf = thdf.groupby(pd.TimeGrouper(freq='D')).sum()
tddf.sum(1).tail()
Out[2]:
In [3]:
# Missing data days. Days when my computer was down and only partial data was acquired
tddf.loc['2017-06-24'] = np.nan
tddf.loc['2017-06-25'] = np.nan
tddf.loc['2017-04-17'] = np.nan
tddf.loc['2017-05-19'] = np.nan
tddf.loc['2017-05-10'] = np.nan
In [10]:
# Arbitrary cutoff at November 1st
tddf = tddf.loc[:'2017-11-01']
In [14]:
# Get weather data from custom code
wdf = get_weather_range((2017,4),(2017,11)).fillna(method='pad')
wdf = wdf.tz_localize('US/Pacific') # Needs to match mobi df or they won't interact well
wdf.head()
Out[14]:
In [22]:
# only take dates that we have in the mobi data
wdf = wdf[tddf.index[0]:tddf.index[-1]]
# Add trips data to weather dataframe
wdf['Trips'] = tddf.sum(1)
In [23]:
# Functions for highlighting weekends on plots
from datetime import datetime, timedelta
def find_weekend_indices(datetime_array):
indices=[]
for i in range(len(datetime_array)):
if datetime_array[i].weekday() in [4,5] :
indices.append(i+1)
return indices
def highlight_weekend(datetime_array,weekend_indices,ax):
td = timedelta(hours=12)
for i in weekend_indices:
try:
ax.axvspan(datetime_array[i]-td, datetime_array[i+1]-td, facecolor='gray', edgecolor='none', alpha=.2)
except:
pass
In [24]:
def make_weather_plot(date1='2017-04-20',date2=datetime.today().strftime('%Y-%m-%d')):
f,(ax,wax) = plt.subplots(2,sharex=True,gridspec_kw={'height_ratios':[4.5,1]})
df = wdf.loc[date1:date2]
ax.plot(df.index,df['Trips'],color='green')
ax.fill_between(df.index,df['Trips'],color='green',alpha=0.5)
#ax.set_ylim([0,4000])
ax.set_ylabel('Number of trips')
wax.bar(df.index,df['Total Rainmm'],color='blue',alpha=0.5)
wax.plot(df.index,df['Max Temp'],color='yellow')
wax.set_ylabel('$^\circ$C, mm')
#wax.plot(wdf.index,wdf['Max Temp'])
weekend_indices = find_weekend_indices(wdf.index)
highlight_weekend(df.index,weekend_indices,ax)
highlight_weekend(df.index,weekend_indices,wax)
for tick in wax.get_xticklabels():
tick.set_rotation(45)
f.subplots_adjust(wspace=0, hspace=0)
f.tight_layout()
f.savefig('weather_plot_{}-{}'.format(date1,date2))
In [25]:
make_weather_plot()
In [27]:
make_weather_plot('2017-11')
In [81]:
make_weather_plot('2017-06-02','2017-06-21')
In [82]:
# Add columns for weekday vs weekend and day of week
wdf['isWeekday'] = (wdf.index.weekday < 5).astype(float)
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
for i in range(7):
wdf[days[i]] = (wdf.index.dayofweek == i).astype(float)
In [83]:
wdf.head()
Out[83]:
In [84]:
# Add a column for stat holidays, and call holidays weekends
holidays = ['2017-05-22','2017-07-03','2017-08-07','2017-09-04','2017-10-09']
wdf['Holiday'] = 0
# holidays are weekend-like
for holiday in holidays:
wdf.loc[holiday,'isWeekday'] = 0
wdf.loc[holiday,'Holiday'] = 1
In [85]:
wdf.loc['2017-06-11']
Out[85]:
In [86]:
import seaborn as sns
f,ax = plt.subplots()
sns.boxplot(x=wdf['isWeekday'],y=tddf.sum(1))
ax.xaxis.set_ticklabels(['Weekend','Weekday'])
ax.set_ylabel('Daily Trips')
ax.set_xlabel('')
f.savefig('weekdays_vs_weekends_boxplot.png')
In [87]:
f,ax = plt.subplots()
#ax.boxplot([tddf[wdf['isWeekday']==True].sum(1).dropna().values,tddf[wdf['isWeekday']==False].sum(1).dropna().values])
#ax.boxplot(tddf[wdf['isWeekday']==False].sum(1).dropna().values)
sns.boxplot(x=wdf['Total Rainmm']==0.,y=tddf.sum(1))
ax.xaxis.set_ticklabels(['Rain' ,'No Rain'])
ax.set_ylabel('Daily Trips')
ax.set_xlabel('')
f.savefig('rain_vs_norain.png')
In [25]:
# Add column for dry vs wet days
wdf['Dry'] = (wdf['Total Rainmm']==0).astype(int)
In [26]:
wdf[['Dry','Total Rainmm']].head()
Out[26]:
In [89]:
wdf.loc['2017-10-12']
Out[89]:
In [27]:
f,ax = plt.subplots()
wdf.plot('Total Rainmm','Trips',ax=ax,kind='scatter',cmap='Reds',c='Max Temp',s=50)
ax.set_xlabel('Rain (mm)')
f.savefig('trips_vs_rainfall.png')
In [28]:
wdf = wdf.dropna(subset=['Trips'])
In [29]:
m,resids,rank, sing, rcond = np.polyfit(wdf['Total Rainmm'],wdf['Trips'],1,full=True)
In [30]:
rainfit = np.polyval(m,wdf['Total Rainmm'])
In [31]:
f,ax = plt.subplots()
wdf.plot('Total Rainmm','Trips',ax=ax,kind='scatter',cmap='Reds',c='Max Temp',s=50)
ax.set_xlabel('Rain (mm)')
f.savefig('trips_vs_rainfall.png')
ax.plot(wdf['Total Rainmm'],rainfit,c='darkred')
Out[31]:
In [32]:
f,ax = plt.subplots()
ax.scatter(wdf['Total Rainmm'],rainfit-wdf['Trips'])
ax.hlines(0,0,35)
Out[32]:
In [33]:
from scipy.stats import normaltest
In [34]:
normaltest(rainfit)
Out[34]:
In [35]:
wdf['logRain'] = np.log(wdf['Total Rainmm']+0.01)
In [36]:
f,ax = plt.subplots()
wdf.plot('logRain','Trips',ax=ax,kind='scatter',cmap='Reds',c='Max Temp',s=50)
ax.set_xlabel('log(Rain (mm))')
f.savefig('trips_vs_lograin.png')
In [37]:
m = np.polyfit(wdf['logRain'],wdf['Trips'],1)
In [38]:
f,ax = plt.subplots()
wdf.plot('logRain','Trips',ax=ax,kind='scatter',cmap='Reds',c='Max Temp',s=50)
ax.set_xlabel('log(Rain (mm) + 1)')
ax.plot(wdf['logRain'],np.polyval(m,wdf['logRain']))
#f.savefig('trips_vs_lograin.png')
Out[38]:
In [39]:
f,ax = plt.subplots()
ax.scatter(wdf['Total Rainmm'],np.polyval(m,wdf['logRain'])-wdf['Trips'])
plt.hlines(0,0,35)
Out[39]:
In [94]:
f,ax = plt.subplots()
wdf.plot('Max Temp','Trips',ax=ax,kind='scatter',cmap='winter_r',c='Total Rainmm',s=50)
ax.set_xlabel('Daily High ($^\circ$C)')
f.get_axes()[-1].set_ylabel('Rain (mm)')
f.savefig('trips_vs_temp.png')
In [95]:
f,ax = plt.subplots()
wdf.plot('Max Temp','Total Rainmm',ax=ax,kind='scatter',s=50)
ax.set_xlabel('Daily High ($^\circ$C)')
f.get_axes()[-1].set_ylabel('Rain (mm)')
f.savefig('rain_vs_temp.png')
In [42]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
In [47]:
wdf2 = sm.add_constant(wdf)
md = smf.OLS(wdf['Trips'],wdf2[['Max Temp','Total Rainmm','isWeekday','const']])
mdf = md.fit()
s = mdf.summary()
print(s)
In [48]:
wdf['Trips Predicted'] = mdf.predict(wdf2[['Max Temp','Total Rainmm','isWeekday','const']])
f,ax = plt.subplots()
ax.plot(wdf['Trips'])
ax.plot(wdf['Trips Predicted'])
Out[48]:
In [49]:
plt.figure()
plt.scatter(wdf['Trips'],mdf.resid)
plt.hlines(0,450,4000)
plt.gcf().savefig('ols_linlin_resids.png')
normaltest(mdf.resid)
Out[49]:
In [51]:
cols = ['Max Temp', 'Total Rainmm','Spd of Max Gustkm/h',
'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Dry','Holiday']
md2 = smf.OLS(wdf['Trips'],wdf[cols])
mdf2 = md2.fit()
s = mdf2.summary()
print(s)
In [52]:
wdf['Trips Predicted2'] = mdf2.predict(wdf[cols])
f,ax = plt.subplots()
ax.plot(wdf['Trips'])
ax.plot(wdf['Trips Predicted2'])
Out[52]:
In [53]:
mdf.rsquared_adj
Out[53]:
In [54]:
mdf2.rsquared_adj
Out[54]:
In [55]:
plt.figure()
plt.scatter(wdf['Trips'],mdf2.resid)
plt.hlines(0,450,4000)
plt.gcf().savefig('ols_linlog_resids2.png')
normaltest(mdf2.resid)
Out[55]:
In [56]:
sm.qqplot(mdf.resid)
In [57]:
d = pd.datetime(2000, 12, 21)
Hours of daylight could also affect bike usage. Function taken from https://github.com/jakevdp/PythonDataScienceHandbook
In [ ]:
In [58]:
def hours_of_daylight(date, axis=23.44, latitude=49.):
"""Compute the hours of daylight for the given date"""
days = (date - pd.datetime(2015, 12, 21)).days
m = (1. - np.tan(np.radians(latitude))
* np.tan(np.radians(axis) * np.cos(days * 2 * np.pi / 365.25)))
return 24. * np.degrees(np.arccos(1 - np.clip(m, 0, 2))) / 180.
wdf['daylight_hrs'] = list(map(hours_of_daylight, wdf.index.tz_localize(None)))
In [59]:
wdf['daylight_hrs']
Out[59]:
In [60]:
cols2 = ['Max Temp', 'Total Rainmm',
'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Dry','Holiday','daylight_hrs']
md3 = smf.OLS(wdf['Trips'],wdf[cols2])
mdf3 = md3.fit()
s = mdf3.summary()
print(s)
In [134]:
wdf['Trips Predicted3'] = mdf3.predict(wdf[cols])
f,ax = plt.subplots()
ax.plot(wdf['Trips'],label='Measured')
ax.plot(wdf['Trips Predicted3'],label='Predicted')
ax.set_xlabel('Date')
ax.set_ylabel('Trips')
ax.legend()
f.savefig('/Users/msj/Dropbox/mobi/trip_vs_predicted_full_model.png')
In [62]:
f,ax = plt.subplots()
sm.qqplot(mdf3.resid,ax=ax)
f.savefig('/Users/msj/Dropbox/mobi/qqplot.png')
In [69]:
wdf[(mdf3.resid > 1000) | (mdf3.resid < -1000)][cols2+['Trips','Trips Predicted3']]
Out[69]:
In [68]:
wdf['Spd of Max Gustkm/h'].mean()
Out[68]:
In [64]:
wdf[mdf3.resid < -1000][cols+['Trips','Trips Predicted3']]
Out[64]:
In [65]:
thdf = pd.read_pickle('/Users/msj/Dropbox/mobi/taken_hourly_df.p')
In [66]:
thdf['2017-07-11'].sum(1)
Out[66]:
In [122]:
thdfn = pd.read_pickle('taken_hourly_df.p')
thdfn = thdfn.loc['2017-11-01':]
In [130]:
wdfn = get_weather_range((2017,11),(2017,11)).fillna(method='pad')
wdfn = wdfn.tz_localize('US/Pacific') # Needs to match mobi df or they won't interact well
wdfn.head()
Out[130]:
In [131]:
wdfn = wdfn.tz_localize(None)
thdfn = thdfn.tz_localize(None)
In [132]:
wdfn['Trips'] = thdfn.groupby(pd.TimeGrouper(freq='D')).sum().sum(1)
In [133]:
wdfn
Out[133]:
In [136]:
wdfn['daylight_hrs'] = list(map(hours_of_daylight, wdfn.index.tz_localize(None)))
In [137]:
wdfn['Dry'] = (wdfn['Total Rainmm']==0).astype(int)
In [138]:
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
for i in range(7):
wdfn[days[i]] = (wdfn.index.dayofweek == i).astype(float)
In [140]:
wdfn['Holiday'] = 0
wdfn.loc['2017-11-13','Holiday'] = 1
In [143]:
wdfn['Trips Predicted'] = mdf2.predict(wdfn[cols])
In [ ]:
In [144]:
f,ax = plt.subplots()
ax.plot(wdfn['Trips'])
ax.plot(wdfn['Trips Predicted'])
Out[144]:
In [ ]: