In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import datetime as dt
import scipy.stats as stats
from scipy.stats import norm
import numpy as np
import math
import seaborn as sns
from InvarianceTestEllipsoid import InvarianceTestEllipsoid
from autocorrelation import autocorrelation
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import statsmodels.tsa.ar_model as ar_model
import pickle
%matplotlib inline
In [2]:
# Load the demand factors, drop the capacity column, zero-fill missing
# entries, then z-score every column so the factors are comparable.
# NOTE(review): filling NaNs with 0 *before* standardizing pulls the means
# toward zero — confirm this is intentional.
T = pd.read_csv("DDMFactors.csv")
T = T.drop('Total Seconds Available', axis=1)
T = T.fillna(0)
means = T.mean()   # kept for potential de-normalization later
stds = T.std()
T = (T - means) / stds
T.hist()
Out[2]:
In [21]:
# Response variable: normalized zonal trip demand.
Y = T["Trip Demand"]
# Factor space: every column except the response (first) and the last column.
X = T.iloc[:, 1:-1]
X
Out[21]:
The plot below varies the magnitude of the regularization penalty (an elastic net with an equal lasso/ridge mix, so the L1 term drives sparsity) to see which coefficients shrink to zero
Training data: $(x_t,y_t)$
Model Specification: $Y = \beta X + C$
Lasso regularization: $\underset{\beta}{\operatorname{argmin}} \sum_t \big(y_t - (\beta x_t + C)\big)^2 + \lambda\,\|\beta\|_1$
Depending on the value of $\lambda$, some of the coefficients in $\beta$ will shrink to zero
In [26]:
# Sweep the penalty strength alpha over 0, 1/40, ..., 9/40 and record the
# fitted coefficients at each level; L1_wt=0.5 gives an even lasso/ridge mix.
y = Y
penalized = sm.OLS(y, X)
L = [
    penalized.fit_regularized(method='elastic_net', alpha=step / 40, L1_wt=0.5).params
    for step in range(10)
]
In [40]:
# Stack the per-alpha coefficient vectors into a frame (one row per alpha)
# and rescale each column by its maximum so the paths are comparable.
L = pd.DataFrame(L)
L = L.div(L.max(axis=0))
L.plot(title="Coefficient values as the regularization term is varied")
L
Out[40]:
In [41]:
# Keep only the factors whose (normalized) coefficient survives at the
# strongest regularization level, i.e. the last row of the path frame.
# Fix: `.ix` was deprecated and removed from pandas — use positional `.iloc`.
cols = L.columns[L.iloc[-1] > 0.001]
Xs = X[cols]
In [42]:
# Refit an unpenalized OLS using only the regressors that survived
# the elastic-net screen, and report the full summary table.
model = sm.OLS(y, Xs)
results = model.fit()   # `results` is reused by the cells below
print(results.summary())
In [37]:
# Side-by-side frame of fitted vs. observed demand, plotting the last year.
# Fixes: name the prediction column at construction time instead of a
# positional inplace rename, and replace the removed `.ix` indexer with
# label-based `.loc` (inclusive slice on the default integer index).
Comparison = pd.DataFrame({'Predicted': results.predict(Xs)})
Comparison["Actual"] = y
Comparison.loc[len(y) - 365:len(y)].plot(title="Normalized Zonal Demand for Car2Go in Toronto")
Out[37]:
In [44]:
# Test the regression residuals for invariance (i.i.d. behaviour) using
# the autocorrelation ellipsoid diagnostic. The helpers expect a 2-D
# (1, T) array of observations.
epsi = np.array(Comparison['Actual'] - Comparison['Predicted'])
epsi = np.expand_dims(epsi, axis=0)

n_lags = 10      # number of lags (for auto correlation test)
acf = autocorrelation(epsi, n_lags)

lag = 10         # lag to be printed
ell_scale = 2    # ellipsoid radius coefficient
fit = 0          # normal fitting
InvarianceTestEllipsoid(epsi, acf[0, 1:], lag, fit, ell_scale);
In [45]:
# Fit an autoregressive model to the residuals and report the lag order
# chosen by BIC. Fix: reuse the already-constructed AR model for
# `select_order` instead of building (and refitting) a second identical one.
epsi = Comparison['Actual'] - Comparison['Predicted']
epsi = np.array(epsi)
model = sm.tsa.AR(epsi)
AResults = model.fit(maxlag=30, ic="bic", method='cmle')
print("The maximum number of required lags for the residuals above according to the Bayes Information Criterion is:")
model.select_order(maxlag=10, ic='bic', method='cmle')
Out[45]:
In [54]:
# Sanity-check the dimensions fed into the AR(2) regression below:
# the stacked lag array is (2, T-2) and the target vector is (T-2,).
np.vstack((epsi[1:-1], epsi[:-2])).shape
epsi[2:].shape
Out[54]:
In [49]:
# Regress the residuals on their first two lags — an AR(2) fit via OLS.
# Fixes:
#  * statsmodels OLS expects exog with shape (nobs, k); the original passed
#    the (2, T-2) stacked array un-transposed — use column_stack instead.
#  * the title said 'AR(1)' although two lags are used.
#  * label the plotted lines so plt.legend() has entries to show.
lagged = np.column_stack((epsi[1:-1], epsi[:-2]))
ar_mod = sm.OLS(epsi[2:], lagged)
ar_res = ar_mod.fit()
print(ar_res.summary())

ep = ar_res.predict()
print(len(ep), len(epsi))

# Innovations (one-step-ahead errors) of the fitted AR model; reused below.
z = ep - epsi[2:]

plt.plot(epsi[2:], color='black', label='Residuals')
plt.plot(ep, color='blue', linewidth=3, label='AR(2) fit')
plt.title('AR(2) Process')
plt.ylabel(" ")
plt.xlabel("Days")
plt.legend()
In [15]:
# Run the invariance (i.i.d.) test on the AR innovations.
# Fix: the original did `z = np.expand_dims(z, ...)` — re-running the cell
# kept adding dimensions. Assign the 2-D view to a new name so the cell
# is idempotent under Restart-&-Run-All.
z_2d = np.expand_dims(z, axis=0)
lag_ = 10        # number of lags (for auto correlation test)
acf = autocorrelation(z_2d, lag_)
lag = 10         # lag to be printed
ell_scale = 2    # ellipsoid radius coefficient
fit = 0         # normal fitting
InvarianceTestEllipsoid(z_2d, acf[0, 1:], lag, fit, ell_scale);
In [16]:
# Plot the squared innovations (a quick heteroskedasticity check).
# Fix: `ep` has len(epsi) - 2 entries (two lags are consumed by the AR(2)
# fit), so it must be compared against epsi[2:]; `epsi[1:]` has one element
# too many and raises a broadcasting error.
z = ep - epsi[2:]
plt.plot(z**2)
Out[16]: