Modeling Mobi Bike Usage

Mobi data scraped using custom code from Mobi's public API. Weather data similarly pulled from Environment Canada with custom code.

Inspired by an example in Jake Vanderplas' Python Data Science Handbook



In [1]:

    
%matplotlib notebook
from vanweather import get_weather_range



In [2]:

    
#Load mobi daily data
thdf = pd.read_pickle('taken_hourly_df.p')
thdf = thdf.drop_duplicates()
tddf = thdf.groupby(pd.TimeGrouper(freq='D')).sum()
tddf.sum(1).tail()









    Out[2]:





time
2017-12-17 00:00:00-08:00    225.0
2017-12-18 00:00:00-08:00    974.0
2017-12-19 00:00:00-08:00    284.0
2017-12-20 00:00:00-08:00    755.0
2017-12-21 00:00:00-08:00    481.0
Freq: D, dtype: float64



In [3]:

    
# Missing data days. Days when my computer was down and only partial data was acquired
tddf.loc['2017-06-24'] = np.nan
tddf.loc['2017-06-25'] = np.nan
tddf.loc['2017-04-17'] = np.nan
tddf.loc['2017-05-19'] = np.nan
tddf.loc['2017-05-10'] = np.nan



In [10]:

    
# Arbitrary cutoff at November 1st
tddf = tddf.loc[:'2017-11-01']



In [14]:

    
# Get weather data from custom code
wdf = get_weather_range((2017,4),(2017,11)).fillna(method='pad')
wdf = wdf.tz_localize('US/Pacific') # Needs to match mobi df or they won't interact well
wdf.head()









    Out[14]:







  
    
      
      Max Temp
      Min Temp
      Mean Temp
      Heat Deg Days
      Cool Deg Days
      Total Rainmm
      Total Snowcm
      Total Precipmm
      Snow on Grndcm
      Dir of Max Gust10's deg
      Spd of Max Gustkm/h
    
    
      Day
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      2017-04-01 00:00:00-07:00
      10.8
      6.4
      8.6
      9.4
      0.0
      11.2
      0.0
      11.2
      NaN
      26.0
      41.0
    
    
      2017-04-02 00:00:00-07:00
      10.6
      5.7
      8.2
      9.8
      0.0
      0.0
      0.0
      0.0
      NaN
      27.0
      52.0
    
    
      2017-04-03 00:00:00-07:00
      10.3
      1.7
      6.0
      12.0
      0.0
      0.0
      0.0
      0.0
      NaN
      27.0
      52.0
    
    
      2017-04-04 00:00:00-07:00
      13.8
      4.7
      9.3
      8.7
      0.0
      7.2
      0.0
      7.2
      NaN
      14.0
      57.0
    
    
      2017-04-05 00:00:00-07:00
      11.1
      8.2
      9.7
      8.3
      0.0
      13.4
      0.0
      13.4
      NaN
      15.0
      41.0



In [22]:

    
# only take dates that we have in the mobi data
wdf = wdf[tddf.index[0]:tddf.index[-1]]

# Add trips data to weather dataframe
wdf['Trips'] = tddf.sum(1)



In [23]:

    
# Functions for highlighting weekends on plots

from datetime import datetime, timedelta

def find_weekend_indices(datetime_array):
    indices=[]
    for i in range(len(datetime_array)):
        if datetime_array[i].weekday() in [4,5] :
            indices.append(i+1)
    return indices

def highlight_weekend(datetime_array,weekend_indices,ax):
    td = timedelta(hours=12)
    for i in weekend_indices:
        try:
            ax.axvspan(datetime_array[i]-td, datetime_array[i+1]-td, facecolor='gray', edgecolor='none', alpha=.2)
        except:
            pass



In [24]:

    
def make_weather_plot(date1='2017-04-20',date2=datetime.today().strftime('%Y-%m-%d')):
    f,(ax,wax) = plt.subplots(2,sharex=True,gridspec_kw={'height_ratios':[4.5,1]})

    df = wdf.loc[date1:date2]
    
    
    ax.plot(df.index,df['Trips'],color='green')
    ax.fill_between(df.index,df['Trips'],color='green',alpha=0.5)
    #ax.set_ylim([0,4000])
    ax.set_ylabel('Number of trips')

    wax.bar(df.index,df['Total Rainmm'],color='blue',alpha=0.5)
    wax.plot(df.index,df['Max Temp'],color='yellow')
    wax.set_ylabel('$^\circ$C, mm')
    #wax.plot(wdf.index,wdf['Max Temp'])

    weekend_indices = find_weekend_indices(wdf.index)
    highlight_weekend(df.index,weekend_indices,ax)
    highlight_weekend(df.index,weekend_indices,wax)

    for tick in wax.get_xticklabels():
        tick.set_rotation(45)
    f.subplots_adjust(wspace=0, hspace=0)
    f.tight_layout()
    f.savefig('weather_plot_{}-{}'.format(date1,date2))



In [25]:

    
make_weather_plot()



In [27]:

    
make_weather_plot('2017-11')



In [81]:

    
make_weather_plot('2017-06-02','2017-06-21')

Correlations

First, look at individual correlations between different weather measures



In [82]:

    
# Add columns for weekday vs weekend and day of week
wdf['isWeekday'] = (wdf.index.weekday < 5).astype(float)
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
for i in range(7):
    wdf[days[i]] = (wdf.index.dayofweek == i).astype(float)



In [83]:

    
wdf.head()









    Out[83]:







  
    
      
      Max Temp
      Min Temp
      Mean Temp
      Heat Deg Days
      Cool Deg Days
      Total Rainmm
      Total Snowcm
      Total Precipmm
      Snow on Grndcm
      Dir of Max Gust10's deg
      Spd of Max Gustkm/h
      Trips
      isWeekday
      Mon
      Tue
      Wed
      Thu
      Fri
      Sat
      Sun
    
    
      Day
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      2017-04-17 00:00:00-07:00
      14.0
      8.6
      11.3
      6.7
      0.0
      5.4
      0.0
      5.4
      NaN
      14.0
      33.0
      NaN
      1.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      2017-04-18 00:00:00-07:00
      14.3
      9.1
      11.7
      6.3
      0.0
      5.2
      0.0
      5.2
      NaN
      21.0
      32.0
      1112.0
      1.0
      0.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      2017-04-19 00:00:00-07:00
      12.8
      7.5
      10.2
      7.8
      0.0
      0.8
      0.0
      0.8
      NaN
      10.0
      48.0
      1296.0
      1.0
      0.0
      0.0
      1.0
      0.0
      0.0
      0.0
      0.0
    
    
      2017-04-20 00:00:00-07:00
      13.2
      8.5
      10.9
      7.1
      0.0
      3.6
      0.0
      3.6
      NaN
      19.0
      33.0
      1528.0
      1.0
      0.0
      0.0
      0.0
      1.0
      0.0
      0.0
      0.0
    
    
      2017-04-21 00:00:00-07:00
      14.1
      6.5
      10.3
      7.7
      0.0
      0.0
      0.0
      0.0
      NaN
      20.0
      32.0
      1962.0
      1.0
      0.0
      0.0
      0.0
      0.0
      1.0
      0.0
      0.0



In [84]:

    
# Add a column for stat holidays, and call holidays weekends

holidays = ['2017-05-22','2017-07-03','2017-08-07','2017-09-04','2017-10-09']

wdf['Holiday'] = 0 


# holidays are weekend-like
for holiday in holidays:
    wdf.loc[holiday,'isWeekday'] = 0
    wdf.loc[holiday,'Holiday'] = 1



In [85]:

    
wdf.loc['2017-06-11']









    Out[85]:





Max Temp                     18.8
Min Temp                      9.3
Mean Temp                    14.1
Heat Deg Days                 3.9
Cool Deg Days                 0.0
Total Rainmm                  0.0
Total Snowcm                  0.0
Total Precipmm                0.0
Snow on Grndcm                NaN
Dir of Max Gust10's deg      23.0
Spd of Max Gustkm/h          26.0
Trips                      2374.0
isWeekday                     0.0
Mon                           0.0
Tue                           0.0
Wed                           0.0
Thu                           0.0
Fri                           0.0
Sat                           0.0
Sun                           1.0
Holiday                       0.0
Name: 2017-06-11 00:00:00-07:00, dtype: float64



In [86]:

    
import seaborn as sns
f,ax = plt.subplots()

sns.boxplot(x=wdf['isWeekday'],y=tddf.sum(1))
ax.xaxis.set_ticklabels(['Weekend','Weekday'])
ax.set_ylabel('Daily Trips')
ax.set_xlabel('')
f.savefig('weekdays_vs_weekends_boxplot.png')



In [87]:

    
f,ax = plt.subplots()

#ax.boxplot([tddf[wdf['isWeekday']==True].sum(1).dropna().values,tddf[wdf['isWeekday']==False].sum(1).dropna().values])
#ax.boxplot(tddf[wdf['isWeekday']==False].sum(1).dropna().values)
sns.boxplot(x=wdf['Total Rainmm']==0.,y=tddf.sum(1))
ax.xaxis.set_ticklabels(['Rain' ,'No Rain'])
ax.set_ylabel('Daily Trips')
ax.set_xlabel('')
f.savefig('rain_vs_norain.png')



In [25]:

    
# Add column for dry vs wet days

wdf['Dry'] = (wdf['Total Rainmm']==0).astype(int)



In [26]:

    
wdf[['Dry','Total Rainmm']].head()









    Out[26]:







  
    
      
      Dry
      Total Rainmm
    
    
      Day
      
      
    
  
  
    
      2017-04-17 00:00:00-07:00
      0
      5.4
    
    
      2017-04-18 00:00:00-07:00
      0
      5.2
    
    
      2017-04-19 00:00:00-07:00
      0
      0.8
    
    
      2017-04-20 00:00:00-07:00
      0
      3.6
    
    
      2017-04-21 00:00:00-07:00
      1
      0.0



In [89]:

    
wdf.loc['2017-10-12']









    Out[89]:





Max Temp                      9.6
Min Temp                      6.1
Mean Temp                     7.9
Heat Deg Days                10.1
Cool Deg Days                 0.0
Total Rainmm                 31.4
Total Snowcm                  0.0
Total Precipmm               31.4
Snow on Grndcm                NaN
Dir of Max Gust10's deg       6.0
Spd of Max Gustkm/h          35.0
Trips                      1201.0
isWeekday                     1.0
Mon                           0.0
Tue                           0.0
Wed                           0.0
Thu                           1.0
Fri                           0.0
Sat                           0.0
Sun                           0.0
Holiday                       0.0
Name: 2017-10-12 00:00:00-07:00, dtype: float64



In [27]:

    
f,ax = plt.subplots()
wdf.plot('Total Rainmm','Trips',ax=ax,kind='scatter',cmap='Reds',c='Max Temp',s=50)
ax.set_xlabel('Rain (mm)')
f.savefig('trips_vs_rainfall.png')



In [28]:

    
wdf = wdf.dropna(subset=['Trips'])



In [29]:

    
m,resids,rank, sing, rcond = np.polyfit(wdf['Total Rainmm'],wdf['Trips'],1,full=True)



In [30]:

    
rainfit = np.polyval(m,wdf['Total Rainmm'])



In [31]:

    
f,ax = plt.subplots()
wdf.plot('Total Rainmm','Trips',ax=ax,kind='scatter',cmap='Reds',c='Max Temp',s=50)
ax.set_xlabel('Rain (mm)')
f.savefig('trips_vs_rainfall.png')
ax.plot(wdf['Total Rainmm'],rainfit,c='darkred')









    














    











    Out[31]:





[<matplotlib.lines.Line2D at 0x118f91eb8>]



In [32]:

    
f,ax = plt.subplots()
ax.scatter(wdf['Total Rainmm'],rainfit-wdf['Trips'])
ax.hlines(0,0,35)









    














    











    Out[32]:





<matplotlib.collections.LineCollection at 0x11c6382b0>



In [33]:

    
from scipy.stats import normaltest



In [34]:

    
normaltest(rainfit)









    Out[34]:





NormaltestResult(statistic=210.09470567685815, pvalue=2.3906407811994729e-46)



In [35]:

    
wdf['logRain'] = np.log(wdf['Total Rainmm']+0.01)



In [36]:

    
f,ax = plt.subplots()
wdf.plot('logRain','Trips',ax=ax,kind='scatter',cmap='Reds',c='Max Temp',s=50)
ax.set_xlabel('log(Rain (mm))')
f.savefig('trips_vs_lograin.png')



In [37]:

    
m = np.polyfit(wdf['logRain'],wdf['Trips'],1)



In [38]:

    
f,ax = plt.subplots()
wdf.plot('logRain','Trips',ax=ax,kind='scatter',cmap='Reds',c='Max Temp',s=50)
ax.set_xlabel('log(Rain (mm) + 1)')
ax.plot(wdf['logRain'],np.polyval(m,wdf['logRain']))
#f.savefig('trips_vs_lograin.png')









    














    











    Out[38]:





[<matplotlib.lines.Line2D at 0x11d33bbe0>]



In [39]:

    
f,ax = plt.subplots()
ax.scatter(wdf['Total Rainmm'],np.polyval(m,wdf['logRain'])-wdf['Trips'])
plt.hlines(0,0,35)









    














    











    Out[39]:





<matplotlib.collections.LineCollection at 0x12161b3c8>



In [94]:

    
f,ax = plt.subplots()
wdf.plot('Max Temp','Trips',ax=ax,kind='scatter',cmap='winter_r',c='Total Rainmm',s=50)
ax.set_xlabel('Daily High ($^\circ$C)')
f.get_axes()[-1].set_ylabel('Rain (mm)')
f.savefig('trips_vs_temp.png')



In [95]:

    
f,ax = plt.subplots()
wdf.plot('Max Temp','Total Rainmm',ax=ax,kind='scatter',s=50)
ax.set_xlabel('Daily High ($^\circ$C)')
f.get_axes()[-1].set_ylabel('Rain (mm)')
f.savefig('rain_vs_temp.png')

Statsmodels Linear Regression

Trips ~ m1*Temp + m2*Rain + m3*isWeekday



In [42]:

    
import statsmodels.api as sm
import statsmodels.formula.api as smf









    



/Users/msj/miniconda3/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools

First, a simple model just accounting for rain, temperature and weekday vs weekend



In [47]:

    
wdf2 = sm.add_constant(wdf)

md = smf.OLS(wdf['Trips'],wdf2[['Max Temp','Total Rainmm','isWeekday','const']])

mdf = md.fit()

s = mdf.summary()

print(s)









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  Trips   R-squared:                       0.773
Model:                            OLS   Adj. R-squared:                  0.770
Method:                 Least Squares   F-statistic:                     210.2
Date:                Tue, 21 Nov 2017   Prob (F-statistic):           2.38e-59
Time:                        13:56:33   Log-Likelihood:                -1394.8
No. Observations:                 189   AIC:                             2798.
Df Residuals:                     185   BIC:                             2811.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Max Temp       138.4343      6.975     19.847      0.000     124.673     152.195
Total Rainmm   -41.8400      6.380     -6.558      0.000     -54.427     -29.253
isWeekday      169.0331     62.147      2.720      0.007      46.425     291.641
const         -247.5381    144.689     -1.711      0.089    -532.991      37.914
==============================================================================
Omnibus:                        4.583   Durbin-Watson:                   1.178
Prob(Omnibus):                  0.101   Jarque-Bera (JB):                5.714
Skew:                          -0.115   Prob(JB):                       0.0574
Kurtosis:                       3.820   Cond. No.                         100.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.



In [48]:

    
wdf['Trips Predicted'] = mdf.predict(wdf2[['Max Temp','Total Rainmm','isWeekday','const']])
f,ax = plt.subplots()
ax.plot(wdf['Trips'])
ax.plot(wdf['Trips Predicted'])









    














    











    Out[48]:





[<matplotlib.lines.Line2D at 0x123cb4898>]



In [49]:

    
plt.figure()
plt.scatter(wdf['Trips'],mdf.resid)
plt.hlines(0,450,4000)
plt.gcf().savefig('ols_linlin_resids.png')
normaltest(mdf.resid)









    














    











    Out[49]:





NormaltestResult(statistic=4.5832530934180937, pvalue=0.10110188110903791)

Now add wind, day of week, wet vs dry and holidays.



In [51]:

    
cols = ['Max Temp', 'Total Rainmm','Spd of Max Gustkm/h', 
       'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Dry','Holiday']

md2 = smf.OLS(wdf['Trips'],wdf[cols])

mdf2 = md2.fit()

s = mdf2.summary()

print(s)









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  Trips   R-squared:                       0.807
Model:                            OLS   Adj. R-squared:                  0.794
Method:                 Least Squares   F-statistic:                     67.07
Date:                Tue, 21 Nov 2017   Prob (F-statistic):           3.59e-57
Time:                        13:57:14   Log-Likelihood:                -1379.8
No. Observations:                 189   AIC:                             2784.
Df Residuals:                     177   BIC:                             2823.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
=======================================================================================
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Max Temp              126.6582      7.223     17.536      0.000     112.404     140.912
Total Rainmm          -29.9256      6.828     -4.383      0.000     -43.400     -16.452
Spd of Max Gustkm/h     1.1685      2.910      0.402      0.689      -4.574       6.911
Mon                  -212.6901    183.953     -1.156      0.249    -575.713     150.333
Tue                  -222.6356    188.624     -1.180      0.239    -594.877     149.605
Wed                  -198.5704    194.018     -1.023      0.307    -581.458     184.317
Thu                  -119.2990    190.448     -0.626      0.532    -495.139     256.541
Fri                   -59.6566    184.578     -0.323      0.747    -423.914     304.600
Sat                  -177.1645    184.020     -0.963      0.337    -540.321     185.992
Sun                  -453.0209    188.147     -2.408      0.017    -824.321     -81.720
Dry                   349.9727     76.998      4.545      0.000     198.021     501.925
Holiday              -288.7273    184.412     -1.566      0.119    -652.657      75.202
==============================================================================
Omnibus:                        7.511   Durbin-Watson:                   1.082
Prob(Omnibus):                  0.023   Jarque-Bera (JB):               13.136
Skew:                           0.076   Prob(JB):                      0.00140
Kurtosis:                       4.282   Cond. No.                         693.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.



In [52]:

    
wdf['Trips Predicted2'] = mdf2.predict(wdf[cols])
f,ax = plt.subplots()
ax.plot(wdf['Trips'])
ax.plot(wdf['Trips Predicted2'])









    














    











    Out[52]:





[<matplotlib.lines.Line2D at 0x124932b70>]



In [53]:

    
mdf.rsquared_adj









    Out[53]:





0.76953414734813186



In [54]:

    
mdf2.rsquared_adj









    Out[54]:





0.79448927521599111



In [55]:

    
plt.figure()
plt.scatter(wdf['Trips'],mdf2.resid)
plt.hlines(0,450,4000)
plt.gcf().savefig('ols_linlog_resids2.png')
normaltest(mdf2.resid)









    














    











    Out[55]:





NormaltestResult(statistic=7.5111801905263915, pvalue=0.023386646187512416)



In [56]:

    
sm.qqplot(mdf.resid)



In [57]:

    
d = pd.datetime(2000, 12, 21)

Add daylight

Hours of daylight could also affect bike usage. Function taken from https://github.com/jakevdp/PythonDataScienceHandbook



In [ ]:



In [58]:

    
def hours_of_daylight(date, axis=23.44, latitude=49.):
    """Compute the hours of daylight for the given date"""
    
    days = (date - pd.datetime(2015, 12, 21)).days
    m = (1. - np.tan(np.radians(latitude))
         * np.tan(np.radians(axis) * np.cos(days * 2 * np.pi / 365.25)))
    return 24. * np.degrees(np.arccos(1 - np.clip(m, 0, 2))) / 180.

wdf['daylight_hrs'] = list(map(hours_of_daylight, wdf.index.tz_localize(None)))



In [59]:

    
wdf['daylight_hrs']









    Out[59]:





Day
2017-04-18 00:00:00-07:00    13.666948
2017-04-19 00:00:00-07:00    13.725229
2017-04-20 00:00:00-07:00    13.783220
2017-04-21 00:00:00-07:00    13.840906
2017-04-22 00:00:00-07:00    13.898273
2017-04-23 00:00:00-07:00    13.955305
2017-04-24 00:00:00-07:00    14.011987
2017-04-25 00:00:00-07:00    14.068302
2017-04-26 00:00:00-07:00    14.124234
2017-04-27 00:00:00-07:00    14.179766
2017-04-28 00:00:00-07:00    14.234880
2017-04-29 00:00:00-07:00    14.289560
2017-04-30 00:00:00-07:00    14.343785
2017-05-01 00:00:00-07:00    14.397538
2017-05-02 00:00:00-07:00    14.450799
2017-05-03 00:00:00-07:00    14.503549
2017-05-04 00:00:00-07:00    14.555768
2017-05-05 00:00:00-07:00    14.607434
2017-05-06 00:00:00-07:00    14.658528
2017-05-07 00:00:00-07:00    14.709027
2017-05-08 00:00:00-07:00    14.758910
2017-05-09 00:00:00-07:00    14.808154
2017-05-11 00:00:00-07:00    14.904638
2017-05-12 00:00:00-07:00    14.951830
2017-05-13 00:00:00-07:00    14.998293
2017-05-20 00:00:00-07:00    15.301034
2017-05-21 00:00:00-07:00    15.340773
2017-05-22 00:00:00-07:00    15.379559
2017-05-23 00:00:00-07:00    15.417368
2017-05-24 00:00:00-07:00    15.454174
                               ...    
2017-10-03 00:00:00-07:00    11.210470
2017-10-04 00:00:00-07:00    11.149363
2017-10-05 00:00:00-07:00    11.088376
2017-10-06 00:00:00-07:00    11.027517
2017-10-07 00:00:00-07:00    10.966797
2017-10-08 00:00:00-07:00    10.906226
2017-10-09 00:00:00-07:00    10.845815
2017-10-10 00:00:00-07:00    10.785574
2017-10-11 00:00:00-07:00    10.725514
2017-10-12 00:00:00-07:00    10.665647
2017-10-13 00:00:00-07:00    10.605984
2017-10-14 00:00:00-07:00    10.546537
2017-10-15 00:00:00-07:00    10.487319
2017-10-16 00:00:00-07:00    10.428342
2017-10-17 00:00:00-07:00    10.369619
2017-10-18 00:00:00-07:00    10.311163
2017-10-19 00:00:00-07:00    10.252990
2017-10-20 00:00:00-07:00    10.195111
2017-10-21 00:00:00-07:00    10.137543
2017-10-22 00:00:00-07:00    10.080300
2017-10-23 00:00:00-07:00    10.023398
2017-10-24 00:00:00-07:00     9.966851
2017-10-25 00:00:00-07:00     9.910678
2017-10-26 00:00:00-07:00     9.854894
2017-10-27 00:00:00-07:00     9.799516
2017-10-28 00:00:00-07:00     9.744563
2017-10-29 00:00:00-07:00     9.690052
2017-10-30 00:00:00-07:00     9.636001
2017-10-31 00:00:00-07:00     9.582430
2017-11-01 00:00:00-07:00     9.529358
Name: daylight_hrs, Length: 189, dtype: float64



In [60]:

    
cols2 = ['Max Temp', 'Total Rainmm', 
       'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Dry','Holiday','daylight_hrs']

md3 = smf.OLS(wdf['Trips'],wdf[cols2])

mdf3 = md3.fit()

s = mdf3.summary()

print(s)









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  Trips   R-squared:                       0.812
Model:                            OLS   Adj. R-squared:                  0.801
Method:                 Least Squares   F-statistic:                     69.65
Date:                Tue, 21 Nov 2017   Prob (F-statistic):           2.49e-58
Time:                        13:57:32   Log-Likelihood:                -1376.9
No. Observations:                 189   AIC:                             2778.
Df Residuals:                     177   BIC:                             2817.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Max Temp       117.3302      8.079     14.522      0.000     101.386     133.274
Total Rainmm   -29.8730      6.715     -4.449      0.000     -43.124     -16.622
Mon           -523.1535    209.403     -2.498      0.013    -936.401    -109.906
Tue           -523.9437    206.488     -2.537      0.012    -931.439    -116.448
Wed           -497.6093    209.458     -2.376      0.019    -910.967     -84.252
Thu           -429.2738    213.234     -2.013      0.046    -850.082      -8.465
Fri           -368.8536    207.548     -1.777      0.077    -778.440      40.733
Sat           -487.4666    206.544     -2.360      0.019    -895.072     -79.861
Sun           -755.6349    207.620     -3.640      0.000   -1165.364    -345.906
Dry            362.8911     75.665      4.796      0.000     213.569     512.214
Holiday       -277.1649    181.679     -1.526      0.129    -635.701      81.371
daylight_hrs    37.2471     15.672      2.377      0.019       6.319      68.175
==============================================================================
Omnibus:                        9.879   Durbin-Watson:                   1.087
Prob(Omnibus):                  0.007   Jarque-Bera (JB):               20.264
Skew:                           0.116   Prob(JB):                     3.98e-05
Kurtosis:                       4.587   Cond. No.                         472.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.



In [134]:

    
wdf['Trips Predicted3'] = mdf3.predict(wdf[cols])
f,ax = plt.subplots()
ax.plot(wdf['Trips'],label='Measured')
ax.plot(wdf['Trips Predicted3'],label='Predicted')
ax.set_xlabel('Date')
ax.set_ylabel('Trips')
ax.legend()
f.savefig('/Users/msj/Dropbox/mobi/trip_vs_predicted_full_model.png')









    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-134-2b2db520f1ea> in <module>()
----> 1 wdf['Trips Predicted3'] = mdf3.predict(wdf[cols])
      2 f,ax = plt.subplots()
      3 ax.plot(wdf['Trips'],label='Measured')
      4 ax.plot(wdf['Trips Predicted3'],label='Predicted')
      5 ax.set_xlabel('Date')

~/miniconda3/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   1956         if isinstance(key, (Series, np.ndarray, Index, list)):
   1957             # either boolean or fancy integer index
-> 1958             return self._getitem_array(key)
   1959         elif isinstance(key, DataFrame):
   1960             return self._getitem_frame(key)

~/miniconda3/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_array(self, key)
   2000             return self.take(indexer, axis=0, convert=False)
   2001         else:
-> 2002             indexer = self.loc._convert_to_indexer(key, axis=1)
   2003             return self.take(indexer, axis=1, convert=True)
   2004 

~/miniconda3/lib/python3.6/site-packages/pandas/core/indexing.py in _convert_to_indexer(self, obj, axis, is_setter)
   1229                 mask = check == -1
   1230                 if mask.any():
-> 1231                     raise KeyError('%s not in index' % objarr[mask])
   1232 
   1233                 return _values_from_object(indexer)

KeyError: "['Dry'] not in index"



In [62]:

    
f,ax = plt.subplots()
sm.qqplot(mdf3.resid,ax=ax)
f.savefig('/Users/msj/Dropbox/mobi/qqplot.png')

Outlier checks



In [69]:

    
wdf[(mdf3.resid > 1000) | (mdf3.resid < -1000)][cols2+['Trips','Trips Predicted3']]









    Out[69]:







  
    
      
      Max Temp
      Total Rainmm
      Mon
      Tue
      Wed
      Thu
      Fri
      Sat
      Sun
      Dry
      Holiday
      daylight_hrs
      Trips
      Trips Predicted3
    
    
      Day
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      2017-07-11 00:00:00-07:00
      21.9
      0.0
      0.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      1
      0
      15.702729
      1583.0
      2949.059880
    
    
      2017-08-12 00:00:00-07:00
      21.6
      0.2
      0.0
      0.0
      0.0
      0.0
      0.0
      1.0
      0.0
      0
      0
      14.330272
      3846.0
      2604.081174
    
    
      2017-10-12 00:00:00-07:00
      9.6
      31.4
      0.0
      0.0
      0.0
      1.0
      0.0
      0.0
      0.0
      0
      0
      10.665647
      1201.0
      197.852458



In [68]:

    
wdf['Spd of Max Gustkm/h'].mean()









    Out[68]:





34.444444444444443



In [64]:

    
wdf[mdf3.resid < -1000][cols+['Trips','Trips Predicted3']]









    Out[64]:







  
    
      
      Max Temp
      Total Rainmm
      Spd of Max Gustkm/h
      Mon
      Tue
      Wed
      Thu
      Fri
      Sat
      Sun
      Dry
      Holiday
      Trips
      Trips Predicted3
    
    
      Day
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      2017-07-11 00:00:00-07:00
      21.9
      0.0
      41.0
      0.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      1
      0
      1583.0
      2949.05988



In [65]:

    
thdf = pd.read_pickle('/Users/msj/Dropbox/mobi/taken_hourly_df.p')



In [66]:

    
thdf['2017-07-11'].sum(1)









    Out[66]:





time
2017-07-11 00:00:00-07:00     20.0
2017-07-11 01:00:00-07:00      2.0
2017-07-11 02:00:00-07:00      2.0
2017-07-11 04:00:00-07:00      5.0
2017-07-11 05:00:00-07:00      5.0
2017-07-11 06:00:00-07:00     44.0
2017-07-11 07:00:00-07:00    106.0
2017-07-11 08:00:00-07:00    239.0
2017-07-11 09:00:00-07:00    197.0
2017-07-11 10:00:00-07:00    143.0
2017-07-11 11:00:00-07:00    149.0
2017-07-11 12:00:00-07:00    156.0
2017-07-11 13:00:00-07:00    166.0
2017-07-11 14:00:00-07:00    183.0
2017-07-11 15:00:00-07:00    120.0
2017-07-11 16:00:00-07:00     17.0
2017-07-11 17:00:00-07:00      3.0
2017-07-11 20:00:00-07:00     22.0
2017-07-11 22:00:00-07:00      4.0
dtype: float64

Use the model to predict November trips



In [122]:

    
thdfn = pd.read_pickle('taken_hourly_df.p')
thdfn = thdfn.loc['2017-11-01':]



In [130]:

    
wdfn = get_weather_range((2017,11),(2017,11)).fillna(method='pad')
wdfn = wdfn.tz_localize('US/Pacific') # Needs to match mobi df or they won't interact well
wdfn.head()









    Out[130]:







  
    
      
      Max Temp
      Min Temp
      Mean Temp
      Heat Deg Days
      Cool Deg Days
      Total Rainmm
      Total Snowcm
      Total Precipmm
      Snow on Grndcm
      Dir of Max Gust10's deg
      Spd of Max Gustkm/h
    
    
      Day
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      2017-11-01 00:00:00-07:00
      12.2
      2.4
      7.3
      10.7
      0.0
      NaN
      0.0
      NaN
      NaN
      27.0
      39.0
    
    
      2017-11-02 00:00:00-07:00
      8.8
      2.4
      5.6
      12.4
      0.0
      20.8
      0.0
      20.8
      NaN
      12.0
      46.0
    
    
      2017-11-03 00:00:00-07:00
      4.3
      -1.3
      1.5
      16.5
      0.0
      0.4
      0.0
      0.4
      NaN
      9.0
      37.0
    
    
      2017-11-04 00:00:00-07:00
      2.1
      -2.1
      0.0
      18.0
      0.0
      0.0
      0.0
      0.4
      NaN
      10.0
      24.0
    
    
      2017-11-05 00:00:00-07:00
      6.7
      -0.4
      3.2
      14.8
      0.0
      0.0
      0.0
      0.0
      NaN
      6.0
      24.0



In [131]:

    
wdfn = wdfn.tz_localize(None)
thdfn = thdfn.tz_localize(None)



In [132]:

    
wdfn['Trips'] = thdfn.groupby(pd.TimeGrouper(freq='D')).sum().sum(1)



In [133]:

    
wdfn









    Out[133]:







  
    
      
      Max Temp
      Min Temp
      Mean Temp
      Heat Deg Days
      Cool Deg Days
      Total Rainmm
      Total Snowcm
      Total Precipmm
      Snow on Grndcm
      Dir of Max Gust10's deg
      Spd of Max Gustkm/h
      Trips
    
    
      Day
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      2017-11-01
      12.2
      2.4
      7.3
      10.7
      0.0
      NaN
      0.0
      NaN
      NaN
      27.0
      39.0
      1799.0
    
    
      2017-11-02
      8.8
      2.4
      5.6
      12.4
      0.0
      20.8
      0.0
      20.8
      NaN
      12.0
      46.0
      726.0
    
    
      2017-11-03
      4.3
      -1.3
      1.5
      16.5
      0.0
      0.4
      0.0
      0.4
      NaN
      9.0
      37.0
      1352.0
    
    
      2017-11-04
      2.1
      -2.1
      0.0
      18.0
      0.0
      0.0
      0.0
      0.4
      NaN
      10.0
      24.0
      755.0
    
    
      2017-11-05
      6.7
      -0.4
      3.2
      14.8
      0.0
      0.0
      0.0
      0.0
      NaN
      6.0
      24.0
      1065.0
    
    
      2017-11-06
      6.5
      -4.3
      1.1
      16.9
      0.0
      0.0
      0.0
      0.0
      NaN
      7.0
      22.0
      1460.0
    
    
      2017-11-07
      6.5
      -4.3
      1.1
      16.9
      0.0
      0.0
      0.0
      0.0
      NaN
      7.0
      22.0
      1338.0
    
    
      2017-11-08
      6.5
      -4.3
      1.1
      16.9
      0.0
      0.0
      0.0
      0.0
      NaN
      21.0
      30.0
      1520.0
    
    
      2017-11-09
      9.4
      6.0
      7.7
      10.3
      0.0
      3.6
      0.0
      3.6
      NaN
      23.0
      35.0
      1179.0
    
    
      2017-11-10
      9.4
      6.0
      7.7
      10.3
      0.0
      3.6
      0.0
      3.6
      NaN
      23.0
      35.0
      1767.0
    
    
      2017-11-11
      9.2
      5.7
      7.5
      10.5
      0.0
      7.2
      0.0
      7.2
      NaN
      11.0
      35.0
      706.0
    
    
      2017-11-12
      9.5
      8.1
      8.8
      9.2
      0.0
      14.0
      0.0
      14.0
      NaN
      9.0
      35.0
      472.0
    
    
      2017-11-13
      10.9
      8.1
      9.5
      8.5
      0.0
      8.2
      0.0
      8.2
      NaN
      12.0
      69.0
      469.0
    
    
      2017-11-14
      10.3
      7.0
      8.7
      9.3
      0.0
      5.2
      0.0
      5.2
      NaN
      12.0
      61.0
      635.0
    
    
      2017-11-15
      10.8
      6.7
      8.8
      9.2
      0.0
      12.4
      0.0
      12.4
      NaN
      13.0
      69.0
      1008.0
    
    
      2017-11-16
      8.4
      4.3
      6.4
      11.6
      0.0
      5.0
      0.0
      5.0
      NaN
      16.0
      37.0
      942.0
    
    
      2017-11-17
      8.1
      3.9
      6.0
      12.0
      0.0
      0.6
      0.0
      0.6
      NaN
      27.0
      37.0
      1346.0
    
    
      2017-11-18
      9.7
      6.1
      7.9
      10.1
      0.0
      4.0
      0.0
      4.0
      NaN
      16.0
      32.0
      818.0
    
    
      2017-11-19
      7.8
      5.6
      6.7
      11.3
      0.0
      28.0
      0.0
      28.0
      NaN
      12.0
      59.0
      210.0
    
    
      2017-11-20
      8.9
      3.8
      6.4
      11.6
      0.0
      4.4
      0.0
      4.4
      NaN
      6.0
      35.0
      1717.0
    
    
      2017-11-21
      12.0
      3.3
      7.7
      10.3
      0.0
      24.2
      0.0
      24.2
      NaN
      6.0
      26.0
      300.0



In [136]:

    
wdfn['daylight_hrs'] = list(map(hours_of_daylight, wdfn.index.tz_localize(None)))



In [137]:

    
wdfn['Dry'] = (wdfn['Total Rainmm']==0).astype(int)



In [138]:

    
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
for i in range(7):
    wdfn[days[i]] = (wdfn.index.dayofweek == i).astype(float)



In [140]:

    
wdfn['Holiday'] = 0
wdfn.loc['2017-11-13','Holiday'] = 1



In [143]:

    
wdfn['Trips Predicted'] = mdf2.predict(wdfn[cols])



In [ ]:



In [144]:

    
f,ax = plt.subplots()
ax.plot(wdfn['Trips'])
ax.plot(wdfn['Trips Predicted'])









    














    











    Out[144]:





[<matplotlib.lines.Line2D at 0x125a1b080>]



In [ ]:

	Max Temp	Min Temp	Mean Temp	Heat Deg Days	Cool Deg Days	Total Rainmm	Total Snowcm	Total Precipmm	Snow on Grndcm	Dir of Max Gust10's deg	Spd of Max Gustkm/h
Day
2017-04-01 00:00:00-07:00	10.8	6.4	8.6	9.4	0.0	11.2	0.0	11.2	NaN	26.0	41.0
2017-04-02 00:00:00-07:00	10.6	5.7	8.2	9.8	0.0	0.0	0.0	0.0	NaN	27.0	52.0
2017-04-03 00:00:00-07:00	10.3	1.7	6.0	12.0	0.0	0.0	0.0	0.0	NaN	27.0	52.0
2017-04-04 00:00:00-07:00	13.8	4.7	9.3	8.7	0.0	7.2	0.0	7.2	NaN	14.0	57.0
2017-04-05 00:00:00-07:00	11.1	8.2	9.7	8.3	0.0	13.4	0.0	13.4	NaN	15.0	41.0

	Max Temp	Min Temp	Mean Temp	Heat Deg Days	Cool Deg Days	Total Rainmm	Total Snowcm	Total Precipmm	Snow on Grndcm	Dir of Max Gust10's deg	Spd of Max Gustkm/h	Trips	isWeekday	Mon	Tue	Wed	Thu	Fri	Sat	Sun
Day
2017-04-17 00:00:00-07:00	14.0	8.6	11.3	6.7	0.0	5.4	0.0	5.4	NaN	14.0	33.0	NaN	1.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0
2017-04-18 00:00:00-07:00	14.3	9.1	11.7	6.3	0.0	5.2	0.0	5.2	NaN	21.0	32.0	1112.0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0
2017-04-19 00:00:00-07:00	12.8	7.5	10.2	7.8	0.0	0.8	0.0	0.8	NaN	10.0	48.0	1296.0	1.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0
2017-04-20 00:00:00-07:00	13.2	8.5	10.9	7.1	0.0	3.6	0.0	3.6	NaN	19.0	33.0	1528.0	1.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0
2017-04-21 00:00:00-07:00	14.1	6.5	10.3	7.7	0.0	0.0	0.0	0.0	NaN	20.0	32.0	1962.0	1.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0

	Max Temp	Total Rainmm	Mon	Tue	Wed	Thu	Fri	Sat	Sun	Dry	Holiday	daylight_hrs	Trips	Trips Predicted3
Day
2017-07-11 00:00:00-07:00	21.9	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	1	0	15.702729	1583.0	2949.059880
2017-08-12 00:00:00-07:00	21.6	0.2	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0	0	14.330272	3846.0	2604.081174
2017-10-12 00:00:00-07:00	9.6	31.4	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0	0	10.665647	1201.0	197.852458

	Max Temp	Min Temp	Mean Temp	Heat Deg Days	Cool Deg Days	Total Rainmm	Total Snowcm	Total Precipmm	Snow on Grndcm	Dir of Max Gust10's deg	Spd of Max Gustkm/h
Day
2017-11-01 00:00:00-07:00	12.2	2.4	7.3	10.7	0.0	NaN	0.0	NaN	NaN	27.0	39.0
2017-11-02 00:00:00-07:00	8.8	2.4	5.6	12.4	0.0	20.8	0.0	20.8	NaN	12.0	46.0
2017-11-03 00:00:00-07:00	4.3	-1.3	1.5	16.5	0.0	0.4	0.0	0.4	NaN	9.0	37.0
2017-11-04 00:00:00-07:00	2.1	-2.1	0.0	18.0	0.0	0.0	0.0	0.4	NaN	10.0	24.0
2017-11-05 00:00:00-07:00	6.7	-0.4	3.2	14.8	0.0	0.0	0.0	0.0	NaN	6.0	24.0

	Max Temp	Min Temp	Mean Temp	Heat Deg Days	Cool Deg Days	Total Rainmm	Total Snowcm	Total Precipmm	Snow on Grndcm	Dir of Max Gust10's deg	Spd of Max Gustkm/h	Trips
Day
2017-11-01	12.2	2.4	7.3	10.7	0.0	NaN	0.0	NaN	NaN	27.0	39.0	1799.0
2017-11-02	8.8	2.4	5.6	12.4	0.0	20.8	0.0	20.8	NaN	12.0	46.0	726.0
2017-11-03	4.3	-1.3	1.5	16.5	0.0	0.4	0.0	0.4	NaN	9.0	37.0	1352.0
2017-11-04	2.1	-2.1	0.0	18.0	0.0	0.0	0.0	0.4	NaN	10.0	24.0	755.0
2017-11-05	6.7	-0.4	3.2	14.8	0.0	0.0	0.0	0.0	NaN	6.0	24.0	1065.0
2017-11-06	6.5	-4.3	1.1	16.9	0.0	0.0	0.0	0.0	NaN	7.0	22.0	1460.0
2017-11-07	6.5	-4.3	1.1	16.9	0.0	0.0	0.0	0.0	NaN	7.0	22.0	1338.0
2017-11-08	6.5	-4.3	1.1	16.9	0.0	0.0	0.0	0.0	NaN	21.0	30.0	1520.0
2017-11-09	9.4	6.0	7.7	10.3	0.0	3.6	0.0	3.6	NaN	23.0	35.0	1179.0
2017-11-10	9.4	6.0	7.7	10.3	0.0	3.6	0.0	3.6	NaN	23.0	35.0	1767.0
2017-11-11	9.2	5.7	7.5	10.5	0.0	7.2	0.0	7.2	NaN	11.0	35.0	706.0
2017-11-12	9.5	8.1	8.8	9.2	0.0	14.0	0.0	14.0	NaN	9.0	35.0	472.0
2017-11-13	10.9	8.1	9.5	8.5	0.0	8.2	0.0	8.2	NaN	12.0	69.0	469.0
2017-11-14	10.3	7.0	8.7	9.3	0.0	5.2	0.0	5.2	NaN	12.0	61.0	635.0
2017-11-15	10.8	6.7	8.8	9.2	0.0	12.4	0.0	12.4	NaN	13.0	69.0	1008.0
2017-11-16	8.4	4.3	6.4	11.6	0.0	5.0	0.0	5.0	NaN	16.0	37.0	942.0
2017-11-17	8.1	3.9	6.0	12.0	0.0	0.6	0.0	0.6	NaN	27.0	37.0	1346.0
2017-11-18	9.7	6.1	7.9	10.1	0.0	4.0	0.0	4.0	NaN	16.0	32.0	818.0
2017-11-19	7.8	5.6	6.7	11.3	0.0	28.0	0.0	28.0	NaN	12.0	59.0	210.0
2017-11-20	8.9	3.8	6.4	11.6	0.0	4.4	0.0	4.4	NaN	6.0	35.0	1717.0
2017-11-21	12.0	3.3	7.7	10.3	0.0	24.2	0.0	24.2	NaN	6.0	26.0	300.0