In [10]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import multivariate_normal
from scipy.stats import norm

import pandas as pd
import statsmodels.formula.api as smf

colors = sns.color_palette("Blues")
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

In [11]:
# data generating process
def f(X):
    '''True regression function.'''
    return .1*(X[:, 0] + X[:, 0]*X[:, 1] - X[:, 1]**2 - .2*X[:, 2] + 1.5*X[:, 0]**2)

# generate artificial training data
mu = np.array([1, 3, 5])         # mean
sigma = np.array([[ 9, -5, -4],
                  [-5, 11,  9],
                  [-4,  9, 13]]) # covariance matrix
n = 500                          # sample size

np.random.seed(1234)

# joint normal rv for X
mvnorm = multivariate_normal(mu, sigma)

# noise
noise = norm(0, 1)

X = mvnorm.rvs(n)
eps = noise.rvs(n)
y = (f(X) + eps).reshape(n,1)

# put data in dataframe
df = pd.DataFrame(data=np.hstack((y, X)), columns=['y', 'x1', 'x2', 'x3'])
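
One way to sanity-check the simulation is to compare the sample moments of the draws with the chosen mu and sigma; a minimal check:

In [ ]:
# sanity check: sample moments should be close to the chosen parameters
print(df[['x1', 'x2', 'x3']].mean().values)   # roughly mu = [1, 3, 5]
print(np.cov(X, rowvar=False))                # roughly sigma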

In [12]:
# Plot the regression function along each regressor, holding the others at their means

n_linsp = 500

# column of ones, used to hold the other regressors fixed at their means
O = np.ones((n_linsp, 1))

# standard deviation for relevant range
sd1 = np.sqrt(sigma[0, 0])
sd2 = np.sqrt(sigma[1, 1])
sd3 = np.sqrt(sigma[2, 2])

# range covering roughly 99.7% of each marginal distribution (mean +/- 3 sd)
R1 = np.linspace(mu[0]-3*sd1, mu[0]+3*sd1, n_linsp).reshape((n_linsp,1))
R2 = np.linspace(mu[1]-3*sd2, mu[1]+3*sd2, n_linsp).reshape((n_linsp,1))
R3 = np.linspace(mu[2]-3*sd3, mu[2]+3*sd3, n_linsp).reshape((n_linsp,1))

fig, ax = plt.subplots(1, 3, figsize=(12, 3))

ax[0].plot(R1, f(np.hstack((R1, O*mu[1], O*mu[2]))), label='x1')
ax[0].set_title('Marginal effect of $x_1$ at means')
ax[1].plot(R2, f(np.hstack((O*mu[0], R2, O*mu[2]))), label='x2')
ax[1].set_title('Marginal effect of $x_2$ at means')
ax[2].plot(R3, f(np.hstack((O*mu[0], O*mu[1], R3))), label='x3')
ax[2].set_title('Marginal effect of $x_3$ at means')
plt.tight_layout()
plt.savefig('finite_marg_effect.png', format='png', dpi=800, bbox_inches='tight')
plt.show()
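
Since f is a known polynomial, the plotted slopes can be cross-checked by hand: differentiating f gives df/dx1 = .1*(1 + x2 + 3*x1), df/dx2 = .1*(x1 - 2*x2), and df/dx3 = -.02. A minimal check at the means:

In [ ]:
# analytic slopes of f at the means, for comparison with the panels above
print(.1*(1 + mu[1] + 3*mu[0]))   # slope of the x1-profile at the means: 0.7
print(.1*(mu[0] - 2*mu[1]))       # slope of the x2-profile at the means: -0.5
print(-.02)                       # slope of the x3-profile (f is linear in x3)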



In [13]:
# test sample

n_out = 10000
X_out = mvnorm.rvs(n_out)
eps_out = noise.rvs(n_out)
y_out = f(X_out) + eps_out

X_out = pd.DataFrame(data=X_out, columns=['x1', 'x2', 'x3'])
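
As a reference point, predicting with the true regression function f should leave only the noise, so its simulated excess risk (test MSE minus the noise variance, the quantity computed below) should be close to zero; a quick check:

In [ ]:
# oracle benchmark: excess risk of the true f should be near zero
# (.values because f indexes its argument as a numpy array)
R_oracle = ((y_out - f(X_out.values))**2).mean() - noise.var()
print(R_oracle)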

In [16]:
# fit model with interactions and second order terms

# define regression formula 
form = 'y ~ x1 + x2 + x3 + I(x1**2) + I(x2**2) + I(x3**2) + x1:x2 + x1:x3 + x2:x3'

# estimate and compute simulated excess risk for OLS
# ===================================================

mod = smf.ols(formula=form, data=df)
res = mod.fit()
y_OLS_hat = res.predict(X_out)
# excess risk: test MSE minus the irreducible noise variance
R_OLS = ((y_out - y_OLS_hat)**2).mean() - noise.var()

# estimate and compute simulated excess risk for Ridge
# (over a grid of penalty parameters)
# ====================================================

# range of penalty parameter
n_penalty = 30
penalty_end = 2
penalty = np.linspace(0, penalty_end, n_penalty)

R_Ridge = []
for a in penalty:
    # L1_wt=0 makes statsmodels' elastic-net penalty a pure ridge penalty
    res_reg = mod.fit_regularized(alpha=a, L1_wt=0)
    y_Ridge_hat = res_reg.predict(X_out)
    R_Ridge.append(((y_out - y_Ridge_hat)**2).mean() - noise.var())

R_Ridge = np.array(R_Ridge)
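
With the grid computed, the risk-minimizing penalty can simply be read off; a small follow-up using only the quantities above:

In [ ]:
# penalty with the smallest simulated excess risk, compared with OLS
a_star = penalty[np.argmin(R_Ridge)]
print(a_star, R_Ridge.min(), R_OLS)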

In [17]:
# plot risk for OLS and Ridge

R_OLS_vect = np.ones(n_penalty)*R_OLS

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(penalty, R_OLS_vect, label='OLS')
ax.plot(penalty, R_Ridge, label='Ridge')
ax.set_title('Comparison of simulated losses', fontsize=14)
ax.set_xlabel('Ridge penalty -- controls the effective size of the action space', fontsize=14)
ax.set_ylabel('Simulated excess loss', fontsize=12)
ax.set_xlim([-.05*penalty_end, penalty_end])
plt.legend(loc='lower right')
plt.tight_layout()
plt.savefig('finite_ridge_tuning.png', format='png', dpi=800, bbox_inches='tight')
plt.show()
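
For a one-number summary of the figure, one could report how much of the penalty grid improves on OLS:

In [ ]:
# share of penalty values where Ridge beats OLS in simulated excess risk
print((R_Ridge < R_OLS).mean())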


