Explanatory Model for Car2go Demand

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import matplotlib
import datetime as dt
import scipy.stats as stats
from scipy.stats import norm
import numpy as np
import math
import seaborn as sns
from InvarianceTestEllipsoid import InvarianceTestEllipsoid  # local helper module
from autocorrelation import autocorrelation  # local helper module
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import statsmodels.tsa.ar_model as ar_model
import pickle
%matplotlib inline

0. The model:

The demand of zone $i$ will be modeled as a linear combination of $n$ factors:

$T_i = \sum_{k=1}^{n} \beta_k X_{ki} + \epsilon_i$
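As a minimal illustration of this specification on synthetic data (the factor values, coefficients, and noise level below are all made up), demand is just a noisy linear combination of zone-level factors:

# Synthetic illustration of T_i = sum_k beta_k * X_ki + eps_i (all numbers made up).
import numpy as np

rng = np.random.RandomState(42)
n_zones, n_factors = 190, 8               # mirrors the 190 zones x 8 factors used below
X_sim = rng.randn(n_zones, n_factors)     # standardized factor values per zone
beta_sim = rng.randn(n_factors)           # factor loadings
eps_sim = 0.5 * rng.randn(n_zones)        # zone-level noise
T_sim = X_sim @ beta_sim + eps_sim        # simulated trip demand per zone
print(T_sim[:5])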


In [2]:
T = pd.read_csv("DDMFactors.csv")
T.drop('Total Seconds Available', axis = 1,inplace=True)
T = T.fillna(0)
means = T.mean()
stds = T.std()
T = (T-means)/stds
T.hist()


Out[2]:
[Grid of histograms, one per standardized column of T]

1. Regression to obtain the mean component $m(t)$


In [21]:
Y = T["Trip Demand"]
#generating the factor space
X = T[T.columns[1:-1]]
X


Out[21]:
Pop AgeAvg IncMed PropDet PropHigh PropLow Availability Designated Car2goSpots
0 -0.857757 -0.331377 -0.094794 -0.576339 -0.765997 -0.370147 -0.974131 -0.152059
1 -1.103064 0.385285 -0.493928 -0.687591 -0.433778 -0.994234 1.769440 1.210198
2 -0.873739 0.643283 2.092294 -0.284039 -0.587276 -0.713360 1.827284 -0.152059
3 -0.004648 -1.420704 -0.684681 -0.674806 0.274749 -0.153117 0.529086 -0.106650
4 -1.166940 0.643283 2.092294 -0.515795 -0.689914 -0.934710 -0.946346 -0.152059
5 0.931858 1.675276 -0.873016 -0.636449 1.602598 0.319720 0.508394 -0.083946
6 -1.338867 -0.216712 0.940522 -0.677978 -0.765997 -1.025127 -0.811803 -0.152059
7 0.270814 -0.732708 0.057712 -0.680712 1.176592 -0.314909 0.500246 0.074984
8 -1.288664 -0.216712 0.940522 -0.667352 -0.765997 -0.943697 0.192938 -0.152059
9 -1.071833 0.213286 0.428846 -0.687591 -0.338228 -1.098792 -0.524670 -0.083946
10 -1.344107 -0.216712 0.940522 -0.679087 -0.765997 -1.033627 -0.728779 -0.152059
11 -0.459437 -0.474710 0.063599 -0.687591 0.625285 -1.071029 0.593405 -0.083946
12 -0.928213 0.385285 -0.493928 -0.687591 -0.227218 -0.929223 -0.977944 -0.152059
13 -0.635323 0.729282 0.396772 -0.687591 0.271874 -1.038768 0.275266 -0.095298
14 -1.125443 0.385285 -0.493928 -0.687591 -0.460216 -1.002555 -0.977944 -0.152059
15 -0.582821 -0.503376 -0.800406 -0.687591 0.285803 -0.977868 0.713174 2.856259
16 -0.983149 0.385285 -0.493928 -0.687591 -0.292116 -0.949649 -0.594456 -0.152059
17 0.036627 -0.732708 0.057712 -0.681685 0.901727 -0.425825 -0.606671 -0.152059
18 0.526800 -1.277371 -1.392997 -0.687591 0.798711 -0.351198 -0.820104 -0.152059
19 -0.662487 1.675276 0.096627 -0.534165 -0.712165 0.262213 -0.333149 -0.152059
20 -0.868062 1.388612 1.481376 -0.483023 -0.765997 -0.159507 -0.747889 -0.152059
21 4.283486 -0.392263 -0.998814 -0.674806 5.582555 -0.197845 -0.363256 -0.152059
22 0.248132 1.015947 -1.682928 -0.546951 0.131198 0.613646 -0.602409 -0.152059
23 0.152197 0.155953 -1.795185 -0.674806 0.640805 -0.121169 -0.442732 -0.152059
24 -0.898243 0.385285 -0.493928 -0.687591 -0.191813 -0.918080 0.257124 -0.152059
25 -1.234947 0.385285 -0.493928 -0.687591 -0.589579 -1.043269 -0.977944 -0.152059
26 -1.164427 0.385285 -0.493928 -0.687591 -0.506270 -1.017049 -0.868559 -0.152059
27 -1.230792 0.385285 -0.493928 -0.687591 -0.584670 -1.041724 -0.977944 -0.152059
28 -0.831417 0.213286 0.428846 -0.687591 -0.009078 -1.098792 -0.976639 -0.152059
29 -1.141210 0.213286 0.428846 -0.687591 -0.433210 -1.098792 -0.934851 -0.152059
... ... ... ... ... ... ... ... ...
160 0.775012 -0.079787 1.491313 2.700569 -0.715754 0.677543 0.504282 -0.152059
161 0.199403 0.757949 -0.046939 0.309678 -0.765997 1.802130 -0.061351 -0.152059
162 0.714101 -0.331377 0.545525 0.974525 -0.611679 1.578490 1.227147 -0.152059
163 0.447616 0.327952 -0.375817 0.846670 -0.410708 1.150381 0.232701 -0.152059
164 1.090227 -0.484296 0.027692 0.872241 -0.661922 2.441100 0.036024 -0.152059
165 0.828310 -0.092709 0.005724 0.475890 -0.374820 1.469866 5.396567 -0.152059
166 0.915108 0.491770 0.486743 4.004691 -0.722931 0.281382 -0.242048 -0.152059
167 -0.335091 0.528617 0.347517 1.959009 -0.765997 -0.702631 -0.513136 -0.152059
168 0.609030 0.114573 0.453878 2.189148 -0.765997 0.690323 0.089118 -0.152059
169 0.701919 0.049966 0.089733 0.284107 -0.572203 1.623218 1.455881 -0.152059
170 -0.211746 -0.675375 -0.453200 0.053968 -0.701399 0.460294 -0.290853 -0.152059
171 0.590757 -0.646709 -0.141822 0.744386 -0.654745 1.431527 3.200519 -0.152059
172 -0.392956 0.528617 -0.591675 -0.546951 -0.694221 0.447514 -0.643142 -0.152059
173 -0.635077 -0.704042 0.357285 -0.572522 -0.460950 -0.108389 -0.319528 -0.152059
174 0.746080 -0.137362 -0.225933 0.156252 -0.733698 1.910755 3.603145 -0.152059
175 0.925767 -0.248261 0.293402 -0.073887 0.199385 1.374020 0.453230 -0.152059
176 0.744557 -0.188045 -0.302507 -0.227313 -0.579380 2.128004 0.726071 -0.152059
177 0.114128 -0.159379 0.854166 0.066754 -0.765997 0.939521 -0.454705 -0.152059
178 0.013625 -0.732708 -0.801010 -0.176171 -0.428651 0.421955 0.008378 -0.152059
179 -0.427980 -0.245378 0.200897 0.066754 -0.765997 0.134419 0.743238 -0.152059
180 -0.903389 0.671950 -0.557056 -0.448322 -0.673650 -0.473501 -0.774606 -0.152059
181 -0.631729 0.671950 -0.557056 -0.313156 -0.621483 -0.120270 -0.353606 -0.152059
182 -0.324431 -0.016046 0.601653 0.181823 -0.765997 0.255823 -0.769368 -0.152059
183 -0.315295 0.757949 -0.256338 -0.227313 -0.263568 -0.044492 -0.748283 -0.152059
184 -0.167585 0.413951 -0.477637 0.066754 -0.145138 0.083301 2.493228 0.903690
185 -0.223928 0.270619 0.544634 0.169038 -0.722931 0.313331 -0.587652 -0.152059
186 1.320166 0.477423 -0.651743 2.189148 0.195796 0.805337 0.127866 -0.152059
187 2.118100 0.831664 -0.726999 1.485945 1.620542 0.811727 0.565771 -0.118003
188 -0.886754 0.098620 1.891359 0.350259 -0.741720 -0.865388 1.367692 -0.083946
189 -1.146311 0.098620 1.891359 -0.191180 -0.754385 -0.987153 -0.687240 -0.152059

190 rows × 8 columns

1.1 Using Lasso Regression to shrink factors to zero

The plot below varies the magnitude of the lasso regularization term to see which coefficients are driven to zero.

Training data: $(x_t, y_t)$

Model specification: $Y = \beta X + C$

Lasso regularization: $\underset{\beta}{\operatorname{argmin}} \sum_t (y_t - (\beta x_t + C))^2 + \lambda\|\beta\|_1$

Depending on the value of $\lambda$, some of the coefficients in $\beta$ shrink exactly to zero; a standalone sketch of a pure-lasso path follows.
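As a self-contained sketch of the shrinkage mechanics on synthetic data (all names and numbers below are illustrative), statsmodels' fit_regularized with L1_wt=1.0 fits a pure lasso. Note that the notebook cell below uses L1_wt=0.5, i.e. an elastic net that mixes the lasso and ridge penalties:

# Lasso path on synthetic data: watch the irrelevant coefficients hit exactly zero.
import numpy as np
import statsmodels.api as sm

rng = np.random.RandomState(0)
X_demo = rng.randn(200, 5)
beta_true = np.array([2.0, -1.0, 0.5, 0.0, 0.0])   # last two factors are irrelevant
y_demo = X_demo @ beta_true + rng.randn(200)

path = []
for alpha in np.linspace(0.0, 1.0, 11):
    res = sm.OLS(y_demo, X_demo).fit_regularized(alpha=alpha, L1_wt=1.0)  # L1_wt=1.0 -> pure lasso
    path.append(res.params)
print(np.round(np.array(path), 3))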


In [26]:
y = Y
L = []
model = sm.OLS(y, X)
for i in range(10):
    # note: L1_wt=0.5 is an elastic net (equal mix of L1 and L2 penalties);
    # L1_wt=1.0 would be the pure lasso described above
    results = model.fit_regularized(method='elastic_net', alpha=i/40, L1_wt=0.5)
    L.append(results.params)

In [40]:
L = pd.DataFrame(L)
L = L/L.max(axis=0)
L.plot(title = "Coefficient values as the regularization term is varied")
L


Out[40]:
Pop AgeAvg IncMed PropDet PropHigh PropLow Availability Designated Car2goSpots
0 NaN NaN 1.000000 0.319504 1.000000 1.000000 1.000000 1.000000
1 NaN NaN 0.606925 1.000000 0.192349 0.500735 0.976983 0.122558
2 NaN NaN 0.000000 0.824317 0.154649 0.446335 0.957231 0.000000
3 NaN NaN 0.000000 0.716082 0.140695 0.417975 0.933679 0.000000
4 NaN NaN 0.000000 0.611721 0.126672 0.390502 0.910655 0.000000
5 NaN NaN 0.000000 0.507699 0.110500 0.364872 0.887569 0.000000
6 NaN NaN 0.000000 0.405948 0.093526 0.340393 0.864783 0.000000
7 NaN NaN 0.000000 0.307665 0.076631 0.316612 0.842518 0.000000
8 NaN NaN 0.000000 0.212679 0.059827 0.293494 0.820753 0.000000
9 NaN NaN 0.000000 0.120832 0.043121 0.271010 0.799471 0.000000

In [41]:
# keep only the factors whose normalized coefficient survives the strongest regularization
# (.ix is deprecated; .iloc[-1] selects the last row of the path)
cols = L.columns[L.iloc[-1] > 0.001]
Xs = X[cols]

1.2 Mean Regression Results (p-values, coefficients, ...)


In [42]:
model = sm.OLS(y,Xs)
results = model.fit()
print(results.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:            Trip Demand   R-squared:                       0.826
Model:                            OLS   Adj. R-squared:                  0.823
Method:                 Least Squares   F-statistic:                     221.2
Date:                Mon, 06 Nov 2017   Prob (F-statistic):           1.59e-69
Time:                        13:34:20   Log-Likelihood:                -102.82
No. Observations:                 190   AIC:                             213.6
Df Residuals:                     186   BIC:                             226.6
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
=================================================================================
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
PropDet          -0.2258      0.034     -6.717      0.000      -0.292      -0.159
PropHigh          0.0851      0.032      2.667      0.008       0.022       0.148
PropLow           0.2876      0.037      7.777      0.000       0.215       0.361
Availability      0.7653      0.037     20.924      0.000       0.693       0.837
==============================================================================
Omnibus:                       30.643   Durbin-Watson:                   1.607
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               73.865
Skew:                           0.697   Prob(JB):                     9.13e-17
Kurtosis:                       5.718   Cond. No.                         1.98
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [37]:
Comparison = pd.DataFrame(results.predict(Xs))
Comparison["Actual"] = y
Comparison.rename(columns={Comparison.columns[0]: 'Predicted'}, inplace=True)
Comparison.ix[len(y)-365:len(y)].plot(title = "Normalized Zonal Demand for Car2Go in Toronto")


Out[37]:
[Line plot: predicted vs. actual normalized zonal demand for Car2Go in Toronto]

2. AR(1) Process for the Residuals

Let's check the residuals to make sure that they are approximately i.i.d.

$X_{t+1} = \alpha X_t + \sigma(t)\epsilon_{t+1}$

2.1 The code below motivates an AR process for the residuals obtained above: there is significant autocorrelation among the residuals of the mean process. (An alternative check with standard statsmodels tools follows.)
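The autocorrelation and InvarianceTestEllipsoid imports above are local helper modules. For readers without them, a hedged alternative is statsmodels' Ljung-Box test, which checks the same null hypothesis (no autocorrelation up to a chosen lag). A minimal sketch; the iid_check helper is mine, not the notebook's:

# Alternative autocorrelation check via the Ljung-Box test;
# small p-values reject the null of no autocorrelation up to each lag.
import numpy as np
from statsmodels.stats.diagnostic import acorr_ljungbox

def iid_check(residuals, lags=10):
    # return_df=True assumes a reasonably recent statsmodels; drop it on older versions
    return acorr_ljungbox(np.asarray(residuals), lags=lags, return_df=True)

# usage, once the residuals are built in the cell below:
# print(iid_check(Comparison['Actual'] - Comparison['Predicted']))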


In [44]:
epsi = Comparison['Actual'] - Comparison['Predicted']
epsi = np.array(epsi)
epsi = np.expand_dims(epsi, axis=0)

lag_ = 10  # number of lags (for auto correlation test)
acf = autocorrelation(epsi, lag_)

lag = 10 # lag to be printed
ell_scale = 2  # ellipsoid radius coefficient
fit = 0  # normal fitting
InvarianceTestEllipsoid(epsi, acf[0,1:], lag, fit, ell_scale);



In [45]:
epsi = Comparison['Actual'] - Comparison['Predicted']
epsi = np.array(epsi)
model = sm.tsa.AR(epsi)
AResults = model.fit(maxlag=30, ic="bic", method='cmle')
print("The maximum number of required lags for the residuals above according to the Bayes Information Criterion is:")
sm.tsa.AR(epsi).select_order(maxlag=10, ic='bic', method='cmle')


The maximum number of required lags for the residuals above according to the Bayes Information Criterion is:
Out[45]:
2
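A side note: sm.tsa.AR was later deprecated in statsmodels. On versions >= 0.11 the same lag selection and fit can be done with ar_select_order and AutoReg; a sketch under that assumption, reusing the epsi array from the cell above:

# BIC-based lag selection and AR fit with the newer statsmodels API.
from statsmodels.tsa.ar_model import AutoReg, ar_select_order

sel = ar_select_order(epsi, maxlag=10, ic='bic')
print(sel.ar_lags)                                # lags chosen by BIC
ar_fit = AutoReg(epsi, lags=2, trend='n').fit()   # two lags, no constant, as in the OLS fit below
print(ar_fit.params)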

In [54]:
# shape check: stacking the two lagged series gives (2, 188),
# while the target epsi[2:] has shape (188,), so sm.OLS needs the transpose
np.array([epsi[1:-1], epsi[:-2]]).shape
epsi[2:].shape


Out[54]:
(188,)

In [49]:
# BIC selected two lags, so regress eps_t on eps_{t-1} and eps_{t-2};
# sm.OLS expects exog of shape (nobs, k), hence the transpose
ar_mod = sm.OLS(epsi[2:], np.array([epsi[1:-1], epsi[:-2]]).T)
ar_res = ar_mod.fit()
print(ar_res.summary())

ep = ar_res.predict()
print(len(ep), len(epsi))
z = ep - epsi[2:]

plt.plot(epsi[2:], color='black', label='mean-model residuals')
plt.plot(ep, color='blue', linewidth=3, label='AR fit')
plt.title('AR(2) Process')
plt.ylabel("Normalized residual")
plt.xlabel("Days")
plt.legend()


2.2 Invariance check for the residuals of the AR process


In [15]:
z = np.expand_dims(z, axis=0)

lag_ = 10  # number of lags (for auto correlation test)
acf = autocorrelation(z, lag_)

lag = 10  # lag to be printed
ell_scale = 2  # ellipsoid radius coefficient
fit = 0  # normal fitting
InvarianceTestEllipsoid(z, acf[0,1:], lag, fit, ell_scale);


2.3 As per Benth, let's see what the squared residuals of the AR process are doing...


In [16]:
z = ep - epsi[2:]  # match lengths: ep has len(epsi) - 2 entries with the two-lag fit
plt.plot(z**2)


Out[16]:
[Plot: squared residuals of the AR process over time]
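If the squared residuals show clustering or seasonality, a simple first estimate of the $\sigma(t)$ term in the AR equation above is a rolling standard deviation. A minimal sketch reusing z from the cell above; the 30-day window is an illustrative choice, not taken from the notebook:

# Rough time-varying volatility estimate: 30-day rolling std of the AR residuals.
sigma_t = pd.Series(z).rolling(window=30, min_periods=10).std()
plt.plot(z**2, color='lightgray', label='squared residuals')
plt.plot(sigma_t**2, color='red', linewidth=2, label='rolling variance (30-day window)')
plt.xlabel("Days")
plt.legend()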