Code

Date: February 2017



In [1]:

    
%matplotlib inline

import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from scipy.stats import multivariate_normal
from scipy.stats import norm

import statsmodels as sm
import statsmodels.tsa.api as tsa
from statsmodels.tsa.base.datetools import dates_from_str
import statsmodels.formula.api as smf
from sklearn.linear_model import Ridge

colors = sns.color_palette()
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

Var example



In [2]:

    
# Example (but real) data from the statsmodel database
mdata = sm.datasets.macrodata.load_pandas().data

# prepare the dates index
dates = mdata[['year', 'quarter']].astype(int).astype(str)
quarterly = dates["year"] + "Q" + dates["quarter"]
quarterly = dates_from_str(quarterly)

mdata = mdata[['realgdp','realcons','realinv']]
mdata.index = pd.DatetimeIndex(quarterly)
data = np.log(mdata).diff().dropna()

Let the true DGP be an estimated VAR with `true_lag` number of lags



In [3]:

    
true_lag = 3

model = tsa.VAR(data)
results = model.fit(true_lag)



In [4]:

    
M = len(results.names)
L = results.k_ar
mu = results.intercept
A = results.coefs    

error = np.asarray(results.resid)
T = error.shape[0]
Sigma = (error.T @ error)/T



In [5]:

    
def stationary_dist(mu, A, Sigma):

    M, L = A.shape[2], A.shape[0] 
    K = M*L
    
    mu_comp = np.zeros((K, 1))
    mu_comp[:M, 0] = mu
    A_row = np.hstack([A[i, :, :] for i in range(L)])
    A_comp = np.vstack([A_row, 
                        np.hstack([np.eye(M*(L-1)), np.zeros((M*(L-1), M))])])
    Sigma_comp = np.zeros((M*L, M*L))
    Sigma_comp[:M, :M] = Sigma

    mu_stationary = np.linalg.solve(np.eye(K) - A_comp, mu_comp)
    Sigma_stationary = sp.linalg.solve_discrete_lyapunov(A_comp, Sigma_comp)

    return mu_stationary, Sigma_stationary



In [6]:

    
# data generating process
def true_model(N, mu, A, Sigma):
    '''Simulating the true model'''
    
    M, L = A.shape[2], A.shape[0] 
    K = M*L
    
    mu_stationary, Sigma_stationary = stationary_dist(mu, A, Sigma)
        
    initial_x = multivariate_normal(mu_stationary.squeeze(), Sigma_stationary).rvs()
    shocks = multivariate_normal(np.zeros(len(mu)), Sigma)
    error = shocks.rvs(N - L).T
    
    X = np.zeros((M, N))
    X[:, :L] = initial_x.reshape(L, M).T
    
    for t in range(N - L):
        AX = np.zeros((M, 1))
        for lag in range(L):
            AX += A[lag, :, :] @ X[:, t + L - 1 - lag].reshape(M, 1)
        X[:, L + t] = (mu.reshape(M, 1) + AX + error[:, t].reshape(M, 1)).squeeze()
    
    return pd.DataFrame(data = X.T, index = data.index[-N:])



In [7]:

    
#----------------------------------------------------------
# Construct the stationary distribution for the plot
#----------------------------------------------------------

mu_stationary, Sigma_stationary = stationary_dist(mu, A, Sigma)
std_stationary = np.sqrt(np.diag(Sigma_stationary[:M]))



In [8]:

    
fig, ax = plt.subplots(3, 1, figsize = (12, 10))
data['realgdp'].plot(ax = ax[0], color = sns.color_palette()[0], label = 'Sample')
ax[0].set_title('Real GDP')
ax[0].axhline(mu_stationary[0], color = 'g', alpha = .4, label = 'Stationary mean')
ax[0].axhline(mu_stationary[0] + 2*std_stationary[0], linestyle = '--', color = 'g', 
              alpha = .4, label = r'2$\pm$ stationary stdev')
ax[0].axhline(0, color = 'k', alpha = .8)
ax[0].axhline(mu_stationary[0] - 2*std_stationary[0], linestyle = '--', color = 'g', alpha = .4)
ax[0].set_ylim([-.04, .04])
ax[0].legend(loc = 'best')

data['realcons'].plot(ax = ax[1], color = sns.color_palette()[0])
ax[1].set_title('Real Consumption')
ax[1].axhline(0, color = 'k', alpha = .8)
ax[1].axhline(mu_stationary[1], color = 'g', alpha = .4, label = 'mu_stationary')
ax[1].axhline(mu_stationary[1] + 2*std_stationary[1], linestyle = '--', color = 'g', alpha = .4)
ax[1].axhline(mu_stationary[1] - 2*std_stationary[1], linestyle = '--', color = 'g', alpha = .4)

data['realinv'].plot(ax = ax[2], color = sns.color_palette()[0])
ax[2].set_title('Real Investment')
ax[2].axhline(0, color = 'k', alpha = .8)
ax[2].axhline(mu_stationary[2], color = 'g', alpha = .4, label = 'mu_stationary')
ax[2].axhline(mu_stationary[2] + 2*std_stationary[2], linestyle = '--', color = 'g', alpha = .4)
ax[2].axhline(mu_stationary[2] - 2*std_stationary[2], linestyle = '--', color = 'g', alpha = .4)
ax[2].set_ylim([-.2, .2])

for i in range(4):
    simul = true_model(T, mu, A, Sigma)
    simul[0].plot(ax=ax[0], color = 'g', alpha = .2)
    simul[1].plot(ax=ax[1], color = 'g', alpha = .2)
    simul[2].plot(ax=ax[2], color = 'g', alpha = .2)

plt.tight_layout()
plt.savefig('./alternative_samples.png', dpi=800)



In [9]:

    
def Ezz_inv_gen(lag, mu, A, Sigma):
    """
    Generates the population moment E[\tilde{z}\tilde{z}'] and calculates its inverse
    """
    M, L = Sigma.shape[0], lag 
    La = A.shape[0]
    K = M*L
    
    mu_comp = np.zeros((K, 1))
    mu_comp[:M, 0] = mu
    A_row = np.hstack([A[i, :, :] for i in range(La)])
    
    if lag > La:
        A_row2 = np.hstack([A_row, np.zeros((M, M*(lag-La)))])
    elif lag == La:
        A_row2 = A_row
    
    A_comp = np.vstack([A_row2, 
                        np.hstack([np.eye(M*(L-1)), np.zeros((M*(L-1), M))])])
    Sigma_comp = np.zeros((K, K))
    Sigma_comp[:M, :M] = Sigma

    mu_stationary = np.linalg.solve(np.eye(K) - A_comp, mu_comp)
    Sigma_stationary = sp.linalg.solve_discrete_lyapunov(A_comp, Sigma_comp)

    Ezz = np.vstack([np.hstack([[[1]], mu_stationary.T]),
                 np.hstack([mu_stationary, Sigma_stationary])])

    return np.linalg.inv(Ezz)



In [10]:

    
def comparison(N, lags, M=3, finite_adj=False):
    """
    N       : effective sample size (number of dependend obs)
    lags    : list containg the number of lags for which we calculate relative se
         
         => sample size used for the calculation is N + lag
    """
    
    store_relative_se = []
    store_true_se = []
    store_asympt_se = []
        
    for lag in lags:
        X_test = true_model(N + lag, mu, A, Sigma)
        var_test = tsa.VAR(X_test)
        result_test = var_test.fit(lag)
        
        if finite_adj:
            k = 1 + M*lag
        else:
            k = 0

        #-------------------------------------------------
        # (1) Asymptotic standard errors (Hamilton p.298-299)
        #-------------------------------------------------
        residuals = np.asarray(result_test.resid)
        Omega_hat = (residuals.T @ residuals)/(N-k)

        storeX = np.asarray(X_test)
        XX = np.ones((N, 1+lag*M))
        for j in range(lag):
            XX[:, j*M+1:(j+1)*M+1] = storeX[lag-(j+1):-(j+1), :]
        Q_hat = (XX.T @ XX)/N

        se_asympt = np.sqrt(np.diag(np.kron(Omega_hat, np.linalg.inv(Q_hat)))/N)
        store_asympt_se.append(se_asympt)
        
        #-------------------------------------------------
        # (2) True standard error (using MC)
        #-------------------------------------------------
        nn = 1000
        store = np.zeros((nn, M*(M*lag + 1))) 

        for j in range(nn):
            var = tsa.VAR(true_model(N + lag, mu, A, Sigma))
            res = var.fit(lag)
            store[j, :] = np.asarray(res.params).T.flatten()
            
        se_MC = store.std(0)
        store_true_se.append(se_MC)
        
        #-------------------------------------------------
        # Relative standard values
        #-------------------------------------------------        
        store_relative_se.append(se_MC/se_asympt)
        print("Done with lag = {l}".format(l=lag))
    
    return store_relative_se, store_true_se, store_asympt_se



In [11]:

    
np.random.seed(123)



In [12]:

    
lags = [3, 8, 15]
relative_se, true_se, asympt_se = comparison(100, lags)
relative_se_adj, true_se_adj, asympt_se_adj = comparison(100, lags, finite_adj=True)

store_AS = []
for ll in lags:
    store_AS.append(np.sqrt(np.diag(np.kron(Sigma, Ezz_inv_gen(ll, mu, A, Sigma)))/100))









    



Done with lag = 3
Done with lag = 8
Done with lag = 15
Done with lag = 3
Done with lag = 8
Done with lag = 15



In [13]:

    
fig, ax = plt.subplots(1, 3, figsize = (18, 5))
cols = sns.color_palette()

for i, lag in enumerate(lags):
    Mlag = M*(M*lag + 1)
    ax[i].plot(np.arange(Mlag), relative_se[i], 'o', color = 'k')
    ax[i].vlines(np.arange(Mlag), 1, relative_se[i], lw = 1, label = r'$s_n/\widehat{AS}_n$')
    ax[i].plot(np.arange(Mlag), true_se[i]/store_AS[i], 'o', color = cols[1])
    ax[i].vlines(np.arange(Mlag), 1, true_se[i]/store_AS[i], color = cols[1], lw = 1, label = r'$s_n/AS_n$')
    ax[i].axhline(y = 1, color = 'k', alpha = .4)
    ax[i].set_title('Number of lags: {l}'.format(l=lag), fontsize = 17)
    ax[i].set_ylim([.8, 2.0])
    ax[i].set_xlabel('Parameters', fontsize = 15)
    ax[i].legend(loc = 'best', fontsize = 15)
plt.tight_layout()
plt.savefig("./relative_se1.png", dpi=800)



In [14]:

    
fig, ax = plt.subplots(1, 3, figsize = (18, 5))
cols = sns.color_palette()

for i, lag in enumerate(lags):
    Mlag = M*(M*lag + 1)
    ax[i].plot(np.arange(Mlag), relative_se[i], 'o', color = 'k')
    ax[i].vlines(np.arange(Mlag), 1, relative_se[i], lw = 1, label = r'$s_n/\widehat{AS}_n$')
    ax[i].plot(np.arange(Mlag), relative_se_adj[i], 'o', color = cols[4])
    ax[i].vlines(np.arange(Mlag), 1, relative_se_adj[i], color = cols[4], lw = 1, label = r'$s_n/\widehat{AS}^{adj}_n$')
    ax[i].axhline(y = 1, color = 'k', alpha = .4)
    ax[i].set_title('Number of lags: {l}'.format(l=lag), fontsize = 17)
    ax[i].set_ylim([.8, 2.0])
    ax[i].set_xlabel('Parameters', fontsize = 15)
    ax[i].legend(loc = 'best', fontsize = 15)
plt.tight_layout()
plt.savefig("./relative_se2.png", dpi=800)

Ridge example



In [15]:

    
# data generating process
def f(X):
    '''True regression function.'''
    return 3 + .1*(X[:, 0] + X[:, 0]*X[:, 1] - X[:, 1]**2 + 1.5*X[:, 0]**2)

# generate artificial training data
mu = np.array([1, 3])         # mean
sigma = np.array([[ 3, -3],
                  [-3, 5]])   # covariance matrix
n = 100                       # sample size

np.random.seed(1234)

# joint normal rv for X
mvnorm = multivariate_normal(mu, sigma)

# noise
noise = norm(0, 2)

X = mvnorm.rvs(n)
eps = noise.rvs(n)
y = (f(X) + eps).reshape(n,1)

# put data in dataframe
df = pd.DataFrame(data=np.hstack((y, X)), columns=['y', 'x1', 'x2'])



In [16]:

    
# Plot partial effects on regression function

n_linsp = 500

# constant for means
O = np.ones((n_linsp, 1))

# standard deviation for relevant range
sd1 = np.sqrt(sigma[0, 0])
sd2 = np.sqrt(sigma[1, 1])
# sd3 = np.sqrt(sigma[2, 2])

# range where data fall with 99% chance
R1 = np.linspace(mu[0]-3*sd1, mu[0]+3*sd1, n_linsp).reshape((n_linsp,1))
R2 = np.linspace(mu[1]-3*sd2, mu[1]+3*sd2, n_linsp).reshape((n_linsp,1))
# R3 = np.linspace(mu[2]-3*sd3, mu[2]+3*sd3, n_linsp).reshape((n_linsp,1))

cols = sns.color_palette()

fig, ax = plt.subplots(1, 2, figsize=(10, 4), sharey=True)

ax[0].plot(R1, f(np.hstack((R1, O*mu[1]))), c=cols[0], label='$x_1$')
ax[0].scatter(df['x1'], df['y'], s=1, c='r', label='data')
ax[0].set_title('Marginal effect of $x_1$ \non regression function', fontsize=14)
ax[0].set_xlabel('Range of $x_1$ -- covering 99\% probability', fontsize=12)
ax[1].plot(R2, f(np.hstack((O*mu[0], R2))), c=cols[1], label='$x_2$')
ax[1].scatter(df['x2'], df['y'], s=1, c='r', label='')
ax[1].set_title('Marginal effect of $x_2$ \non regression function', fontsize=14)
ax[1].set_xlabel('Range of $x_2$ -- covering 99\% probability', fontsize=12)
#ax[2].plot(R3, f(np.hstack((O*mu[0], O*mu[1], R3))), label='x3')
#ax[2].set_title('Marginal effect of $x_3$ at means')
ax[0].legend(bbox_to_anchor=(2.1, 0.75), loc=2, fontsize = 12, frameon = False)
ax[1].legend(bbox_to_anchor=(1.03, 0.9), loc=2, fontsize = 12, frameon = False)
#plt.legend(bbox_to_anchor=(1.05, 0.9), loc=2, fontsize = 18, frameon = False)
plt.tight_layout()
plt.savefig('finite_marg_effect.png', format = 'png', dpi = 800, bbox_inches='tight')
plt.show()



In [17]:

    
# test sample

n_out = 10000
X_out = mvnorm.rvs(n_out)
eps_out = noise.rvs(n_out)
y_out = (f(X_out) + eps_out)#.reshape(n_out,1)

X_out = pd.DataFrame(data=X_out, columns=['x1', 'x2'])



In [18]:

    
# fit model with interactions and second order terms

# define regression formula 
form = 'y ~ x1 + x2 + I(x1**2) + I(x2**2) + x1:x2'

# estimate and get risk for OLS
# =============================

mod = smf.ols(formula=form, data=df)
res = mod.fit()
y_OLS_hat = res.predict(X_out)
R_OLS = ((y_out - y_OLS_hat)**2).mean() - noise.var()

# estimate an get risk for Ridge
# (for multiple penalty parameters)
# =================================

# range of penalty parameter
n_penalty = 50
penalty_end = .1
penalty = np.linspace(0, penalty_end, n_penalty)

R_Ridge = []
for a in penalty:
    res_reg = mod.fit_regularized(alpha=a, L1_wt=0)
    y_Ridge_hat = res_reg.predict(X_out)
    R_Ridge.append(((y_out - y_Ridge_hat)**2).mean() - noise.var())

R_Ridge = np.array(R_Ridge)



In [19]:

    
# plot risk for OLS and Ridge

R_OLS_vect = np.ones(n_penalty)*R_OLS

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(penalty, R_OLS_vect, c=cols[0], label='OLS')
ax.plot(penalty, R_Ridge, c=cols[1], label='Ridge')
ax.axvline(x = penalty[R_Ridge.argmin()], ls='--' , alpha=.4, color='r', label='optimal model\nfor sample')
ax.set_title('Comparison of true excess losses', fontsize=14)
ax.set_xlabel('Tuning parameter for Ridge -- $\lambda$', fontsize=12)
ax.set_ylabel('Excess loss', fontsize=12)
ax.set_ylim([.17, .22])
ax.set_xlim([-.05*penalty_end, penalty_end])
plt.legend(loc=4, fontsize = 10)
plt.tight_layout()
plt.savefig('finite_ridge_tuning.png', format = 'png', dpi = 800, bbox_inches='tight')
plt.show()

Decomposition figure



In [20]:

    
x = np.linspace(3, 32)
approx_er = 1/(.01*x)
estim_er = np.exp(.1*x)
estim_risk = approx_er + estim_er



In [21]:

    
sns.set(style="white", palette="muted", color_codes=True)

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
f, ax = plt.subplots(figsize=(6, 4))

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.plot(x, approx_er, label='Approximation Error')
plt.plot(x, estim_er, label='Estimation Error')
plt.plot(x, estim_risk, label='Risk of Estimator')
plt.tick_params(
    axis='both',       # changes apply to the both axis
    which='both',      # both major and minor ticks are affected
    bottom='off',      # ticks along the bottom edge are off
    top='off',         # ticks along the top edge are off
    left='off',
    right='off',
    labelbottom='off',
    labelleft='off')
ax.set_ylim([0,35])
ax.set_xlim([0, 44])
ax.text(6, 28, r'Underfitting', fontsize=17, color='black')
ax.text(22, 28, r'Overfitting', fontsize=17, color='black')
ax.text(34, 23, r'Estimation Error', fontsize=17)
ax.text(34, 28, r'Excess Risk', fontsize=17)
ax.text(34, 3, r'Approximation Error', fontsize=17)
plt.xlabel(r'Size of $\mathcal{A}$', fontsize = 18)
plt.ylabel(r'Risk', fontsize = 18)
plt.savefig('decomp.png', format='png', dpi=800, bbox_inches='tight')



In [ ]:

Code

Var example

Let the true DGP be an estimated VAR with true_lag number of lags

Ridge example

Decomposition figure

Let the true DGP be an estimated VAR with `true_lag` number of lags