In [10]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import multivariate_normal
from scipy.stats import norm
import pandas as pd
import statsmodels.formula.api as smf
from sklearn.linear_model import Ridge
colors = sns.color_palette("Blues")
plt.rc('text', usetex=True)
plt.rc('font', family='serif')
In [11]:
# data generating process
def f(X):
    '''True regression function.'''
    return .1*(X[:, 0] + X[:, 0]*X[:, 1] - X[:, 1]**2 - .2*X[:, 2] + 1.5*X[:, 0]**2)
# generate artificial training data
mu = np.array([1, 3, 5]) # mean
sigma = np.array([[ 9, -5, -4],
                  [-5, 11,  9],
                  [-4,  9, 13]])  # covariance matrix
n = 500 # sample size
np.random.seed(1234)
# joint normal rv for X
mvnorm = multivariate_normal(mu, sigma)
# noise
noise = norm(0, 1)
X = mvnorm.rvs(n)
eps = noise.rvs(n)
y = (f(X) + eps).reshape(n, 1)
# put data in dataframe
df = pd.DataFrame(data=np.hstack((y, X)), columns=['y', 'x1', 'x2', 'x3'])
In [12]:
# Plot partial effects on regression function
n_linsp = 500
# constant for means
O = np.ones((n_linsp, 1))
# standard deviation for relevant range
sd1 = np.sqrt(sigma[0, 0])
sd2 = np.sqrt(sigma[1, 1])
sd3 = np.sqrt(sigma[2, 2])
# range covering roughly 99.7% of each marginal distribution (mean ± 3 sd)
R1 = np.linspace(mu[0]-3*sd1, mu[0]+3*sd1, n_linsp).reshape((n_linsp, 1))
R2 = np.linspace(mu[1]-3*sd2, mu[1]+3*sd2, n_linsp).reshape((n_linsp, 1))
R3 = np.linspace(mu[2]-3*sd3, mu[2]+3*sd3, n_linsp).reshape((n_linsp, 1))
fig, ax = plt.subplots(1, 3, figsize=(12, 3))
ax[0].plot(R1, f(np.hstack((R1, O*mu[1], O*mu[2]))), label='x1')
ax[0].set_title('Marginal effect of $x_1$ at means')
ax[1].plot(R2, f(np.hstack((O*mu[0], R2, O*mu[2]))), label='x2')
ax[1].set_title('Marginal effect of $x_2$ at means')
ax[2].plot(R3, f(np.hstack((O*mu[0], O*mu[1], R3))), label='x3')
ax[2].set_title('Marginal effect of $x_3$ at means')
plt.tight_layout()
plt.savefig('finite_marg_effect.png', format='png', dpi=800, bbox_inches='tight')
plt.show()
In [13]:
# test sample
n_out = 10000
X_out = mvnorm.rvs(n_out)
eps_out = noise.rvs(n_out)
y_out = f(X_out) + eps_out
X_out = pd.DataFrame(data=X_out, columns=['x1', 'x2', 'x3'])
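The cells below evaluate each fit by its simulated excess risk on this test sample: the out-of-sample mean squared error minus the irreducible noise variance,

$$\widehat{R}(\hat f) \;=\; \frac{1}{n_{\text{out}}} \sum_{i=1}^{n_{\text{out}}} \bigl(y_i - \hat f(X_i)\bigr)^2 \;-\; \sigma_\varepsilon^2,$$

which estimates $\mathbb{E}\bigl[(f(X) - \hat f(X))^2\bigr]$ because the noise is independent with mean zero and $\sigma_\varepsilon^2 = 1$ by construction. This is what the `- noise.var()` term in the next cells implements.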
In [16]:
# fit model with interactions and second order terms
# define regression formula
form = 'y ~ x1 + x2 + x3 + I(x1**2) + I(x2**2) + I(x3**2) + x1:x2 + x1:x3 + x2:x3'
# estimate and get risk for OLS
# =============================
mod = smf.ols(formula=form, data=df)
res = mod.fit()
y_OLS_hat = res.predict(X_out)
R_OLS = ((y_out - y_OLS_hat)**2).mean() - noise.var()
# estimate and get risk for Ridge
# (for multiple penalty parameters)
# =================================
# range of penalty parameter
n_penalty = 30
penalty_end = 2
penalty = np.linspace(0, penalty_end, n_penalty)
R_Ridge = []
for a in penalty:
    res_reg = mod.fit_regularized(alpha=a, L1_wt=0)  # L1_wt=0 gives a pure ridge penalty
    y_Ridge_hat = res_reg.predict(X_out)
    R_Ridge.append(((y_out - y_Ridge_hat)**2).mean() - noise.var())
R_Ridge = np.array(R_Ridge)
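Since `Ridge` from scikit-learn is imported above but never used, a rough cross-check of the regularized fit could look like the sketch below. The penalty scalings of statsmodels and scikit-learn differ, so the `alpha` values are not directly comparable; `alpha=1.0` here is an arbitrary example.

In [ ]:
# sketch: cross-check with scikit-learn's Ridge (penalty scaling differs
# from statsmodels, so alpha values are not directly comparable)
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=False)  # same terms as the formula
Z = poly.fit_transform(df[['x1', 'x2', 'x3']])
Z_out = poly.transform(X_out)

ridge = Ridge(alpha=1.0)  # arbitrary example penalty
ridge.fit(Z, df['y'])
R_sk = ((y_out - ridge.predict(Z_out))**2).mean() - noise.var()
print(R_sk)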
In [17]:
# plot risk for OLS and Ridge
R_OLS_vect = np.full(n_penalty, R_OLS)
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(penalty, R_OLS_vect, label='OLS')
ax.plot(penalty, R_Ridge, label='Ridge')
ax.set_title('Comparison of simulated losses', fontsize=14)
ax.set_xlabel('Tuning parameter for Ridge -- effective size of the action space', fontsize=14)
ax.set_ylabel('Simulated excess loss', fontsize=12)
ax.set_xlim([-.05*penalty_end, penalty_end])
plt.legend(loc='lower right')
plt.tight_layout()
plt.savefig('finite_ridge_tuning.png', format='png', dpi=800, bbox_inches='tight')
plt.show()
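A natural follow-up, sketched below, reads off the risk-minimizing penalty from the grid:

In [ ]:
# sketch: penalty on the grid with the smallest simulated excess risk
best = penalty[np.argmin(R_Ridge)]
print(f'best penalty: {best:.3f}, Ridge risk: {R_Ridge.min():.4f}, OLS risk: {R_OLS:.4f}')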