In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from scipy.stats import multivariate_normal
from scipy.stats import norm
import statsmodels as sm
import statsmodels.tsa.api as tsa
from statsmodels.tsa.base.datetools import dates_from_str
import statsmodels.formula.api as smf
from sklearn.linear_model import Ridge
colors = sns.color_palette()
plt.rc('text', usetex=True)
plt.rc('font', family='serif')
In [2]:
# Example (but real) data from the statsmodel database
mdata = sm.datasets.macrodata.load_pandas().data
# prepare the dates index
dates = mdata[['year', 'quarter']].astype(int).astype(str)
quarterly = dates["year"] + "Q" + dates["quarter"]
quarterly = dates_from_str(quarterly)
mdata = mdata[['realgdp','realcons','realinv']]
mdata.index = pd.DatetimeIndex(quarterly)
data = np.log(mdata).diff().dropna()
In [3]:
true_lag = 3
model = tsa.VAR(data)
results = model.fit(true_lag)
In [4]:
M = len(results.names)
L = results.k_ar
mu = results.intercept
A = results.coefs
error = np.asarray(results.resid)
T = error.shape[0]
Sigma = (error.T @ error)/T
In [5]:
def stationary_dist(mu, A, Sigma):
M, L = A.shape[2], A.shape[0]
K = M*L
mu_comp = np.zeros((K, 1))
mu_comp[:M, 0] = mu
A_row = np.hstack([A[i, :, :] for i in range(L)])
A_comp = np.vstack([A_row,
np.hstack([np.eye(M*(L-1)), np.zeros((M*(L-1), M))])])
Sigma_comp = np.zeros((M*L, M*L))
Sigma_comp[:M, :M] = Sigma
mu_stationary = np.linalg.solve(np.eye(K) - A_comp, mu_comp)
Sigma_stationary = sp.linalg.solve_discrete_lyapunov(A_comp, Sigma_comp)
return mu_stationary, Sigma_stationary
In [6]:
# data generating process
def true_model(N, mu, A, Sigma):
'''Simulating the true model'''
M, L = A.shape[2], A.shape[0]
K = M*L
mu_stationary, Sigma_stationary = stationary_dist(mu, A, Sigma)
initial_x = multivariate_normal(mu_stationary.squeeze(), Sigma_stationary).rvs()
shocks = multivariate_normal(np.zeros(len(mu)), Sigma)
error = shocks.rvs(N - L).T
X = np.zeros((M, N))
X[:, :L] = initial_x.reshape(L, M).T
for t in range(N - L):
AX = np.zeros((M, 1))
for lag in range(L):
AX += A[lag, :, :] @ X[:, t + L - 1 - lag].reshape(M, 1)
X[:, L + t] = (mu.reshape(M, 1) + AX + error[:, t].reshape(M, 1)).squeeze()
return pd.DataFrame(data = X.T, index = data.index[-N:])
In [7]:
#----------------------------------------------------------
# Construct the stationary distribution for the plot
#----------------------------------------------------------
mu_stationary, Sigma_stationary = stationary_dist(mu, A, Sigma)
std_stationary = np.sqrt(np.diag(Sigma_stationary[:M]))
In [8]:
fig, ax = plt.subplots(3, 1, figsize = (12, 10))
data['realgdp'].plot(ax = ax[0], color = sns.color_palette()[0], label = 'Sample')
ax[0].set_title('Real GDP')
ax[0].axhline(mu_stationary[0], color = 'g', alpha = .4, label = 'Stationary mean')
ax[0].axhline(mu_stationary[0] + 2*std_stationary[0], linestyle = '--', color = 'g',
alpha = .4, label = r'2$\pm$ stationary stdev')
ax[0].axhline(0, color = 'k', alpha = .8)
ax[0].axhline(mu_stationary[0] - 2*std_stationary[0], linestyle = '--', color = 'g', alpha = .4)
ax[0].set_ylim([-.04, .04])
ax[0].legend(loc = 'best')
data['realcons'].plot(ax = ax[1], color = sns.color_palette()[0])
ax[1].set_title('Real Consumption')
ax[1].axhline(0, color = 'k', alpha = .8)
ax[1].axhline(mu_stationary[1], color = 'g', alpha = .4, label = 'mu_stationary')
ax[1].axhline(mu_stationary[1] + 2*std_stationary[1], linestyle = '--', color = 'g', alpha = .4)
ax[1].axhline(mu_stationary[1] - 2*std_stationary[1], linestyle = '--', color = 'g', alpha = .4)
data['realinv'].plot(ax = ax[2], color = sns.color_palette()[0])
ax[2].set_title('Real Investment')
ax[2].axhline(0, color = 'k', alpha = .8)
ax[2].axhline(mu_stationary[2], color = 'g', alpha = .4, label = 'mu_stationary')
ax[2].axhline(mu_stationary[2] + 2*std_stationary[2], linestyle = '--', color = 'g', alpha = .4)
ax[2].axhline(mu_stationary[2] - 2*std_stationary[2], linestyle = '--', color = 'g', alpha = .4)
ax[2].set_ylim([-.2, .2])
for i in range(4):
simul = true_model(T, mu, A, Sigma)
simul[0].plot(ax=ax[0], color = 'g', alpha = .2)
simul[1].plot(ax=ax[1], color = 'g', alpha = .2)
simul[2].plot(ax=ax[2], color = 'g', alpha = .2)
plt.tight_layout()
plt.savefig('./alternative_samples.png', dpi=800)
In [9]:
def Ezz_inv_gen(lag, mu, A, Sigma):
"""
Generates the population moment E[\tilde{z}\tilde{z}'] and calculates its inverse
"""
M, L = Sigma.shape[0], lag
La = A.shape[0]
K = M*L
mu_comp = np.zeros((K, 1))
mu_comp[:M, 0] = mu
A_row = np.hstack([A[i, :, :] for i in range(La)])
if lag > La:
A_row2 = np.hstack([A_row, np.zeros((M, M*(lag-La)))])
elif lag == La:
A_row2 = A_row
A_comp = np.vstack([A_row2,
np.hstack([np.eye(M*(L-1)), np.zeros((M*(L-1), M))])])
Sigma_comp = np.zeros((K, K))
Sigma_comp[:M, :M] = Sigma
mu_stationary = np.linalg.solve(np.eye(K) - A_comp, mu_comp)
Sigma_stationary = sp.linalg.solve_discrete_lyapunov(A_comp, Sigma_comp)
Ezz = np.vstack([np.hstack([[[1]], mu_stationary.T]),
np.hstack([mu_stationary, Sigma_stationary])])
return np.linalg.inv(Ezz)
In [10]:
def comparison(N, lags, M=3, finite_adj=False):
"""
N : effective sample size (number of dependend obs)
lags : list containg the number of lags for which we calculate relative se
=> sample size used for the calculation is N + lag
"""
store_relative_se = []
store_true_se = []
store_asympt_se = []
for lag in lags:
X_test = true_model(N + lag, mu, A, Sigma)
var_test = tsa.VAR(X_test)
result_test = var_test.fit(lag)
if finite_adj:
k = 1 + M*lag
else:
k = 0
#-------------------------------------------------
# (1) Asymptotic standard errors (Hamilton p.298-299)
#-------------------------------------------------
residuals = np.asarray(result_test.resid)
Omega_hat = (residuals.T @ residuals)/(N-k)
storeX = np.asarray(X_test)
XX = np.ones((N, 1+lag*M))
for j in range(lag):
XX[:, j*M+1:(j+1)*M+1] = storeX[lag-(j+1):-(j+1), :]
Q_hat = (XX.T @ XX)/N
se_asympt = np.sqrt(np.diag(np.kron(Omega_hat, np.linalg.inv(Q_hat)))/N)
store_asympt_se.append(se_asympt)
#-------------------------------------------------
# (2) True standard error (using MC)
#-------------------------------------------------
nn = 1000
store = np.zeros((nn, M*(M*lag + 1)))
for j in range(nn):
var = tsa.VAR(true_model(N + lag, mu, A, Sigma))
res = var.fit(lag)
store[j, :] = np.asarray(res.params).T.flatten()
se_MC = store.std(0)
store_true_se.append(se_MC)
#-------------------------------------------------
# Relative standard values
#-------------------------------------------------
store_relative_se.append(se_MC/se_asympt)
print("Done with lag = {l}".format(l=lag))
return store_relative_se, store_true_se, store_asympt_se
In [11]:
np.random.seed(123)
In [12]:
lags = [3, 8, 15]
relative_se, true_se, asympt_se = comparison(100, lags)
relative_se_adj, true_se_adj, asympt_se_adj = comparison(100, lags, finite_adj=True)
store_AS = []
for ll in lags:
store_AS.append(np.sqrt(np.diag(np.kron(Sigma, Ezz_inv_gen(ll, mu, A, Sigma)))/100))
In [13]:
fig, ax = plt.subplots(1, 3, figsize = (18, 5))
cols = sns.color_palette()
for i, lag in enumerate(lags):
Mlag = M*(M*lag + 1)
ax[i].plot(np.arange(Mlag), relative_se[i], 'o', color = 'k')
ax[i].vlines(np.arange(Mlag), 1, relative_se[i], lw = 1, label = r'$s_n/\widehat{AS}_n$')
ax[i].plot(np.arange(Mlag), true_se[i]/store_AS[i], 'o', color = cols[1])
ax[i].vlines(np.arange(Mlag), 1, true_se[i]/store_AS[i], color = cols[1], lw = 1, label = r'$s_n/AS_n$')
ax[i].axhline(y = 1, color = 'k', alpha = .4)
ax[i].set_title('Number of lags: {l}'.format(l=lag), fontsize = 17)
ax[i].set_ylim([.8, 2.0])
ax[i].set_xlabel('Parameters', fontsize = 15)
ax[i].legend(loc = 'best', fontsize = 15)
plt.tight_layout()
plt.savefig("./relative_se1.png", dpi=800)
In [14]:
fig, ax = plt.subplots(1, 3, figsize = (18, 5))
cols = sns.color_palette()
for i, lag in enumerate(lags):
Mlag = M*(M*lag + 1)
ax[i].plot(np.arange(Mlag), relative_se[i], 'o', color = 'k')
ax[i].vlines(np.arange(Mlag), 1, relative_se[i], lw = 1, label = r'$s_n/\widehat{AS}_n$')
ax[i].plot(np.arange(Mlag), relative_se_adj[i], 'o', color = cols[4])
ax[i].vlines(np.arange(Mlag), 1, relative_se_adj[i], color = cols[4], lw = 1, label = r'$s_n/\widehat{AS}^{adj}_n$')
ax[i].axhline(y = 1, color = 'k', alpha = .4)
ax[i].set_title('Number of lags: {l}'.format(l=lag), fontsize = 17)
ax[i].set_ylim([.8, 2.0])
ax[i].set_xlabel('Parameters', fontsize = 15)
ax[i].legend(loc = 'best', fontsize = 15)
plt.tight_layout()
plt.savefig("./relative_se2.png", dpi=800)
In [15]:
# data generating process
def f(X):
'''True regression function.'''
return 3 + .1*(X[:, 0] + X[:, 0]*X[:, 1] - X[:, 1]**2 + 1.5*X[:, 0]**2)
# generate artificial training data
mu = np.array([1, 3]) # mean
sigma = np.array([[ 3, -3],
[-3, 5]]) # covariance matrix
n = 100 # sample size
np.random.seed(1234)
# joint normal rv for X
mvnorm = multivariate_normal(mu, sigma)
# noise
noise = norm(0, 2)
X = mvnorm.rvs(n)
eps = noise.rvs(n)
y = (f(X) + eps).reshape(n,1)
# put data in dataframe
df = pd.DataFrame(data=np.hstack((y, X)), columns=['y', 'x1', 'x2'])
In [16]:
# Plot partial effects on regression function
n_linsp = 500
# constant for means
O = np.ones((n_linsp, 1))
# standard deviation for relevant range
sd1 = np.sqrt(sigma[0, 0])
sd2 = np.sqrt(sigma[1, 1])
# sd3 = np.sqrt(sigma[2, 2])
# range where data fall with 99% chance
R1 = np.linspace(mu[0]-3*sd1, mu[0]+3*sd1, n_linsp).reshape((n_linsp,1))
R2 = np.linspace(mu[1]-3*sd2, mu[1]+3*sd2, n_linsp).reshape((n_linsp,1))
# R3 = np.linspace(mu[2]-3*sd3, mu[2]+3*sd3, n_linsp).reshape((n_linsp,1))
cols = sns.color_palette()
fig, ax = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
ax[0].plot(R1, f(np.hstack((R1, O*mu[1]))), c=cols[0], label='$x_1$')
ax[0].scatter(df['x1'], df['y'], s=1, c='r', label='data')
ax[0].set_title('Marginal effect of $x_1$ \non regression function', fontsize=14)
ax[0].set_xlabel('Range of $x_1$ -- covering 99\% probability', fontsize=12)
ax[1].plot(R2, f(np.hstack((O*mu[0], R2))), c=cols[1], label='$x_2$')
ax[1].scatter(df['x2'], df['y'], s=1, c='r', label='')
ax[1].set_title('Marginal effect of $x_2$ \non regression function', fontsize=14)
ax[1].set_xlabel('Range of $x_2$ -- covering 99\% probability', fontsize=12)
#ax[2].plot(R3, f(np.hstack((O*mu[0], O*mu[1], R3))), label='x3')
#ax[2].set_title('Marginal effect of $x_3$ at means')
ax[0].legend(bbox_to_anchor=(2.1, 0.75), loc=2, fontsize = 12, frameon = False)
ax[1].legend(bbox_to_anchor=(1.03, 0.9), loc=2, fontsize = 12, frameon = False)
#plt.legend(bbox_to_anchor=(1.05, 0.9), loc=2, fontsize = 18, frameon = False)
plt.tight_layout()
plt.savefig('finite_marg_effect.png', format = 'png', dpi = 800, bbox_inches='tight')
plt.show()
In [17]:
# test sample
n_out = 10000
X_out = mvnorm.rvs(n_out)
eps_out = noise.rvs(n_out)
y_out = (f(X_out) + eps_out)#.reshape(n_out,1)
X_out = pd.DataFrame(data=X_out, columns=['x1', 'x2'])
In [18]:
# fit model with interactions and second order terms
# define regression formula
form = 'y ~ x1 + x2 + I(x1**2) + I(x2**2) + x1:x2'
# estimate and get risk for OLS
# =============================
mod = smf.ols(formula=form, data=df)
res = mod.fit()
y_OLS_hat = res.predict(X_out)
R_OLS = ((y_out - y_OLS_hat)**2).mean() - noise.var()
# estimate an get risk for Ridge
# (for multiple penalty parameters)
# =================================
# range of penalty parameter
n_penalty = 50
penalty_end = .1
penalty = np.linspace(0, penalty_end, n_penalty)
R_Ridge = []
for a in penalty:
res_reg = mod.fit_regularized(alpha=a, L1_wt=0)
y_Ridge_hat = res_reg.predict(X_out)
R_Ridge.append(((y_out - y_Ridge_hat)**2).mean() - noise.var())
R_Ridge = np.array(R_Ridge)
In [19]:
# plot risk for OLS and Ridge
R_OLS_vect = np.ones(n_penalty)*R_OLS
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(penalty, R_OLS_vect, c=cols[0], label='OLS')
ax.plot(penalty, R_Ridge, c=cols[1], label='Ridge')
ax.axvline(x = penalty[R_Ridge.argmin()], ls='--' , alpha=.4, color='r', label='optimal model\nfor sample')
ax.set_title('Comparison of true excess losses', fontsize=14)
ax.set_xlabel('Tuning parameter for Ridge -- $\lambda$', fontsize=12)
ax.set_ylabel('Excess loss', fontsize=12)
ax.set_ylim([.17, .22])
ax.set_xlim([-.05*penalty_end, penalty_end])
plt.legend(loc=4, fontsize = 10)
plt.tight_layout()
plt.savefig('finite_ridge_tuning.png', format = 'png', dpi = 800, bbox_inches='tight')
plt.show()
In [20]:
x = np.linspace(3, 32)
approx_er = 1/(.01*x)
estim_er = np.exp(.1*x)
estim_risk = approx_er + estim_er
In [21]:
sns.set(style="white", palette="muted", color_codes=True)
plt.rc('text', usetex=True)
plt.rc('font', family='serif')
f, ax = plt.subplots(figsize=(6, 4))
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.plot(x, approx_er, label='Approximation Error')
plt.plot(x, estim_er, label='Estimation Error')
plt.plot(x, estim_risk, label='Risk of Estimator')
plt.tick_params(
axis='both', # changes apply to the both axis
which='both', # both major and minor ticks are affected
bottom='off', # ticks along the bottom edge are off
top='off', # ticks along the top edge are off
left='off',
right='off',
labelbottom='off',
labelleft='off')
ax.set_ylim([0,35])
ax.set_xlim([0, 44])
ax.text(6, 28, r'Underfitting', fontsize=17, color='black')
ax.text(22, 28, r'Overfitting', fontsize=17, color='black')
ax.text(34, 23, r'Estimation Error', fontsize=17)
ax.text(34, 28, r'Excess Risk', fontsize=17)
ax.text(34, 3, r'Approximation Error', fontsize=17)
plt.xlabel(r'Size of $\mathcal{A}$', fontsize = 18)
plt.ylabel(r'Risk', fontsize = 18)
plt.savefig('decomp.png', format='png', dpi=800, bbox_inches='tight')
In [ ]: