In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import scipy.optimize   # needed for sp.optimize.bisect below
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# For linear regression
from scipy.stats import multivariate_normal
from scipy.integrate import dblquad
# Suppress warnings for nicer output
import warnings
warnings.filterwarnings('ignore')
colors = sns.color_palette()
plt.rc('text', usetex=True)
plt.rc('font', family='serif')
Coin tossing example
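Throughout this example, a sample of $n$ coin tosses with success probability $\theta_0$ is summarized by $Y$, the number of ones, so $Y \sim \text{Binomial}(n, \theta_0)$. The two decision rules and the two loss functions implemented in the cells below are
$$d_{MLE}(Y) = \frac{Y}{n}, \qquad d_{Bayes}(Y) = \frac{Y + \alpha}{n + \alpha + \beta},$$
$$L_{RE}(\theta_0, a) = \theta_0 \log\frac{\theta_0}{a} + (1 - \theta_0)\log\frac{1 - \theta_0}{1 - a}, \qquad L_{Q}(\theta_0, a) = (a - \theta_0)^2.$$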
In [2]:
#===================================================
# FUNCTIONS
#===================================================
def relative_entropy(theta0, a):
return theta0 * np.log(theta0/a) + (1 - theta0) * np.log((1 - theta0)/(1 - a))
def quadratic_loss(theta0, a):
return (a - theta0)**2
def loss_distribution(l, dr, loss, true_dist, theta0, y_grid):
"""
    Uses the change-of-variables formula for a discrete random variable. It takes
    care of the fact that the relative entropy loss is not monotone in the action.
"""
eps = 1e-16
if loss == 'relative_entropy':
a1 = sp.optimize.bisect(lambda a: relative_entropy(theta0, a) - l, a = eps, b = theta0)
a2 = sp.optimize.bisect(lambda a: relative_entropy(theta0, a) - l, a = theta0, b = 1 - eps)
elif loss == 'quadratic':
a1 = theta0 - np.sqrt(l)
a2 = theta0 + np.sqrt(l)
if np.isclose(a1, dr).any():
y1 = y_grid[np.isclose(a1, dr)][0]
prob1 = true_dist.pmf(y1)
else:
prob1 = 0.0
if np.isclose(a2, dr).any():
y2 = y_grid[np.isclose(a2, dr)][0]
prob2 = true_dist.pmf(y2)
else:
prob2 = 0.0
if np.isclose(a1, a2):
# around zero loss, the two sides might find the same a
return prob1
else:
return prob1 + prob2
def risk_quadratic(theta0, n, alpha=0, beta=0):
"""
See Casella and Berger, p.332
"""
first_term = n * theta0 * (1 - theta0)/(alpha + beta + n)**2
second_term = ((n * theta0 + alpha)/(alpha + beta + n) - theta0)**2
return first_term + second_term
def loss_figures(theta0, n, alpha, beta, mle=True, entropy=True):
true_dist = stats.binom(n, theta0)
y_grid = np.arange(n + 1) # sum of ones in a sample
a_grid = np.linspace(0, 1, 100) # action space represented as [0, 1]
# The two decision functions (as a function of Y)
decision_rule = y_grid/n
decision_rule_bayes = (y_grid + alpha)/(n + alpha + beta)
if mle and entropy:
"""
MLE with relative entropy loss
"""
loss = relative_entropy(theta0, decision_rule)
loss_dist = np.asarray([loss_distribution(i, decision_rule, "relative_entropy",
true_dist, theta0,
y_grid) for i in loss[1:-1]])
loss_dist = np.hstack([true_dist.pmf(y_grid[0]), loss_dist, true_dist.pmf(y_grid[-1])])
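        # note: the endpoint losses (at a = 0 and a = 1) are infinite, so the
        # entropy risk of the MLE computed below evaluates to inf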
risk = loss @ loss_dist
elif mle and not entropy:
"""
MLE with quadratic loss
"""
loss = quadratic_loss(theta0, decision_rule)
loss_dist = np.asarray([loss_distribution(i, decision_rule, "quadratic",
true_dist, theta0,
y_grid) for i in loss])
risk = risk_quadratic(theta0, n)
elif not mle and entropy:
"""
        Bayes with relative entropy loss
"""
loss = relative_entropy(theta0, decision_rule_bayes)
loss_dist = np.asarray([loss_distribution(i, decision_rule_bayes, "relative_entropy",
true_dist, theta0, y_grid) for i in loss])
risk = loss @ loss_dist
elif not mle and not entropy:
"""
Bayes with quadratic loss
"""
loss = quadratic_loss(theta0, decision_rule_bayes)
loss_dist = np.asarray([loss_distribution(i, decision_rule_bayes, "quadratic",
true_dist, theta0, y_grid) for i in loss])
risk = risk_quadratic(theta0, n, alpha, beta)
return loss, loss_dist, risk
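For a quick sanity check, the risk implied by the induced loss distribution of the MLE under quadratic loss should agree with the closed-form expression $\theta_0(1-\theta_0)/n$ used in risk_quadratic. A minimal sketch, using the same $\theta_0 = 0.79$ and $n = 25$ as below:
In [ ]:
loss_q, dist_q, risk_q = loss_figures(0.79, 25, 7, 2, mle=True, entropy=False)
print(loss_q @ dist_q, risk_q)   # both should be close to 0.79 * 0.21 / 25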
In [3]:
theta0 = .79
n = 25
alpha, beta = 7, 2
#=========================
# Elements of Figure 1
#=========================
true_dist = stats.binom(n, theta0)
y_grid = np.arange(n + 1) # sum of ones in a sample
a_grid = np.linspace(0, 1, 100) # action space represented as [0, 1]
rel_ent = relative_entropy(theta0, a_grid) # form of the loss function
quadratic = quadratic_loss(theta0, a_grid) # form of the loss function
#=========================
# Elements of Figure 2
#=========================
theta0_alt = .39
true_dist_alt = stats.binom(n, theta0_alt)
# The two decision functions (as a function of Y)
decision_rule = y_grid/n
decision_rule_bayes = (y_grid + alpha)/(n + alpha + beta)
#=========================
# Elements of Figure 3
#=========================
loss_re_mle, loss_dist_re_mle, risk_re_mle = loss_figures(theta0, n, alpha, beta)
loss_quad_mle, loss_dist_quad_mle, risk_quad_mle = loss_figures(theta0, n, alpha, beta,
entropy=False)
loss_re_bayes, loss_dist_re_bayes, risk_re_bayes = loss_figures(theta0, n, alpha, beta,
mle=False)
loss_quad_bayes, loss_dist_quad_bayes, risk_quad_bayes = loss_figures(theta0, n, alpha, beta,
mle=False, entropy=False)
loss_re_mle_alt, loss_dist_re_mle_alt, risk_re_mle_alt = loss_figures(theta0_alt,
n, alpha, beta)
loss_quad_mle_alt, loss_dist_quad_mle_alt, risk_quad_mle_alt = loss_figures(theta0_alt, n,
alpha, beta, entropy=False)
loss_re_bayes_alt, loss_dist_re_bayes_alt, risk_re_bayes_alt = loss_figures(theta0_alt, n,
alpha, beta, mle=False)
loss_quad_bayes_alt, loss_dist_quad_bayes_alt, risk_quad_bayes_alt = loss_figures(theta0_alt,
n, alpha, beta,
mle=False, entropy=False)
In [4]:
fig, ax = plt.subplots(1, 2, figsize = (12, 4))
ax[0].set_title('True distribution over Y', fontsize = 14)
ax[0].plot(y_grid, true_dist.pmf(y_grid), 'o', color = sns.color_palette()[3])
ax[0].vlines(y_grid, 0, true_dist.pmf(y_grid), lw = 4, color = sns.color_palette()[3], alpha = .7)
ax[0].set_xlabel(r'Number of ones in the sample', fontsize = 12)
ax[1].set_title('Loss functions over the action space', fontsize = 14)
ax[1].plot(a_grid, rel_ent, lw = 2, label = 'relative entropy loss')
ax[1].plot(a_grid, quadratic, lw = 2, label = 'quadratic loss')
ax[1].axvline(theta0, color = sns.color_palette()[2], lw = 2, label = r'$\theta_0=${t}'.format(t=theta0))
ax[1].legend(loc = 'best', fontsize = 12)
ax[1].set_xlabel(r'Actions $(a)$', fontsize = 12)
plt.tight_layout()
plt.savefig("./example1_fig1.png", dpi=800)
In [5]:
fig, ax = plt.subplots(1, 2, figsize = (12, 4))
ax[0].set_title('Induced action distribution of the MLE estimator', fontsize = 14)
# Small bias
ax[0].plot(decision_rule, true_dist.pmf(y_grid), 'o')
ax[0].vlines(decision_rule, 0, true_dist.pmf(y_grid), lw = 5, alpha = .9, color = sns.color_palette()[0])
ax[0].axvline(theta0, color = sns.color_palette()[2], lw = 2, label = r'$\theta_0=${t}'.format(t=theta0))
# Alternative theta (under which the Bayes estimator in the right panel is badly biased)
ax[0].plot(decision_rule, true_dist_alt.pmf(y_grid), 'o', color = sns.color_palette()[0], alpha = .4)
ax[0].vlines(decision_rule, 0, true_dist_alt.pmf(y_grid), lw = 4, alpha = .4, color = sns.color_palette()[0])
ax[0].axvline(theta0_alt, color = sns.color_palette()[2], lw = 2, alpha = .4,
label = r'$\theta=${t}'.format(t=theta0_alt))
ax[0].legend(loc = 'best', fontsize = 12)
ax[0].set_ylim([0, .2])
ax[0].set_xlim([0, 1])
ax[0].set_xlabel(r'Actions $(a)$', fontsize = 12)
ax[1].set_title('Induced action distribution of the Bayes estimator', fontsize = 14)
# Small bias
ax[1].plot(decision_rule_bayes, true_dist.pmf(y_grid), 'o', color = sns.color_palette()[1])
ax[1].vlines(decision_rule_bayes, 0, true_dist.pmf(y_grid), lw = 5, alpha = .9,
color = sns.color_palette()[1])
ax[1].axvline(theta0, color = sns.color_palette()[2], lw = 2, label = r'$\theta_0=${t}'.format(t=theta0))
# Large bias for Bayes
ax[1].plot(decision_rule_bayes, true_dist_alt.pmf(y_grid), 'o', color = sns.color_palette()[1], alpha = .4)
ax[1].vlines(decision_rule_bayes, 0, true_dist_alt.pmf(y_grid), lw = 4, alpha = .4,
color = sns.color_palette()[1])
ax[1].axvline(theta0_alt, color = sns.color_palette()[2], lw = 2, alpha = .4,
label = r'$\theta=${t}'.format(t=theta0_alt))
ax[1].legend(loc = 'best', fontsize = 12)
ax[1].set_ylim([0, .2])
ax[1].set_xlim([0, 1])
ax[1].set_xlabel(r'Actions $(a)$', fontsize = 12)
plt.tight_layout()
plt.savefig("./example1_fig2.png", dpi=800)
In [6]:
fig, ax = plt.subplots(2, 2, figsize = (12, 6))
ax[0, 0].set_title('Induced entropy loss distribution (MLE estimator)', fontsize = 14)
ax[0, 0].vlines(loss_re_mle, 0, loss_dist_re_mle, lw = 9, alpha = .9, color = sns.color_palette()[0])
ax[0, 0].axvline(risk_re_mle, lw = 3, linestyle = '--',
color = sns.color_palette()[0], label = r"Entropy risk ($\theta_0={t}$)".format(t=theta0))
ax[0, 0].vlines(loss_re_mle_alt, 0, loss_dist_re_mle_alt, lw = 9, alpha = .3, color = sns.color_palette()[0])
ax[0, 0].axvline(risk_re_mle_alt, lw = 3, linestyle = '--', alpha = .4,
color = sns.color_palette()[0], label = r"Entropy risk ($\theta={t}$)".format(t=theta0_alt))
ax[0, 0].set_xlim([0, .1])
ax[0, 0].set_ylim([0, .2])
ax[0, 0].set_xlabel('Loss', fontsize=12)
ax[0, 0].legend(loc = 'best', fontsize = 12)
ax[1, 0].set_title('Induced entropy loss distribution (Bayes estimator)', fontsize=14)
ax[1, 0].vlines(loss_re_bayes, 0, loss_dist_re_bayes, lw=9, alpha=.9, color=sns.color_palette()[1])
ax[1, 0].axvline(risk_re_bayes, lw=3, linestyle='--',
color = sns.color_palette()[1], label=r"Entropy risk ($\theta_0={t}$)".format(t=theta0))
ax[1, 0].vlines(loss_re_bayes_alt, 0, loss_dist_re_bayes_alt, lw=9, alpha=.3, color=sns.color_palette()[1])
ax[1, 0].axvline(risk_re_bayes_alt, lw=3, linestyle='--', alpha=.4, color=sns.color_palette()[1],
label=r"Entropy risk ($\theta={t}$)".format(t=theta0_alt))
ax[1, 0].set_xlim([0, .1])
ax[1, 0].set_ylim([0, .2])
ax[1, 0].set_xlabel('Loss')
ax[1, 0].legend(loc='best', fontsize=12)
ax[0, 1].set_title('Induced quadratic loss distribution (MLE estimator)', fontsize=14)
ax[0, 1].vlines(loss_quad_mle, 0, loss_dist_quad_mle, lw=9, alpha=.9, color=sns.color_palette()[0])
ax[0, 1].axvline(risk_quad_mle, lw=3, linestyle='--',
color = sns.color_palette()[0], label=r"Quadratic risk ($\theta_0={t}$)".format(t=theta0))
ax[0, 1].vlines(loss_quad_mle_alt, 0, loss_dist_quad_mle_alt, lw=9, alpha=.3, color=sns.color_palette()[0])
ax[0, 1].axvline(risk_quad_mle_alt, lw=3, linestyle='--', alpha=.4,
color=sns.color_palette()[0], label=r"Quadratic risk ($\theta={t}$)".format(t=theta0_alt))
ax[0, 1].set_xlim([0, .05])
ax[0, 1].set_ylim([0, .2])
ax[0, 1].set_xlabel('Loss', fontsize=12)
ax[0, 1].legend(loc='best', fontsize=12)
ax[1, 1].set_title('Induced quadratic loss distribution (Bayes estimator)', fontsize=14)
ax[1, 1].vlines(loss_quad_bayes, 0, loss_dist_quad_bayes, lw=9, alpha=.9, color=sns.color_palette()[1])
ax[1, 1].axvline(risk_quad_bayes, lw=3, linestyle='--',
color = sns.color_palette()[1], label=r"Quadratic risk ($\theta_0={t}$)".format(t=theta0))
ax[1, 1].vlines(loss_quad_bayes_alt, 0, loss_dist_quad_bayes_alt, lw=9, alpha=.3, color=sns.color_palette()[1])
ax[1, 1].axvline(risk_quad_bayes_alt, lw=3, linestyle = '--', alpha=.4,
color=sns.color_palette()[1], label=r"Quadratic risk ($\theta={t}$)".format(t=theta0_alt))
ax[1, 1].set_xlim([0, .05])
ax[1, 1].set_ylim([0, .2])
ax[1, 1].set_xlabel('Loss', fontsize=12)
ax[1, 1].legend(loc='best', fontsize=12)
plt.tight_layout()
plt.savefig("./example1_fig3.png", dpi=800)
Bayes OLS example
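The data are $n = 50$ i.i.d. draws of a jointly normal pair $(Y, X)$ with mean $\mu$ and covariance matrix $\Sigma$ set below. An action is a coefficient vector $b = (b_0, b_1)$, and the loss of an action is the expected squared prediction error under the true joint distribution,
$$\ell(b) = \mathbb{E}\left[(Y - b_0 - b_1 X)^2\right].$$
The two decision rules compared below are the OLS estimator and the Bayes estimator
$$d_{OLS}(Z) = (X'X)^{-1}X'Y, \qquad d_{Bayes}(Z) = (X'X + \Lambda)^{-1}\left(\Lambda \mu_b + X'Y\right),$$
where $\mu_b$ is the prior mean mu_bayes and $\Lambda$ is the prior precision matrix precis_bayes.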
In [7]:
mu = np.array([1, 3]) # mean
sigma = np.array([[4, 1], [1, 8]]) # covariance matrix
n = 50 # sample size
# Bayes priors
mu_bayes = np.array([2, 2])
precis_bayes = np.array([[6, -3], [-3, 6]])
# joint normal rv for (Y,X)
mvnorm = multivariate_normal(mu, sigma)
# decision rule -- OLS estimator
def d_OLS(Z, n):
Y = Z[:, 0]
X = np.stack((np.ones(n), Z[:,1]), axis=-1)
return np.linalg.inv(X.T @ X) @ X.T @ Y
# decision rule -- Bayes
def d_bayes(Z, n):
Y = Z[:, 0]
X = np.stack((np.ones(n), Z[:,1]), axis=-1)
return np.linalg.inv(X.T @ X + precis_bayes) @ (precis_bayes @ mu_bayes + X.T @ Y)
# loss -- define integrand
def loss_int(y, x, b):
'''Defines the integrand under mvnorm distribution.'''
return (y - b[0] - b[1]*x)**2*mvnorm.pdf((y,x))
# simulate distribution over actions and over losses
B_OLS = []
L_OLS = []
B_bayes = []
L_bayes = []
for i in range(1000):
# generate sample
Z = mvnorm.rvs(n)
# get OLS action corresponding to the realized sample
b_OLS = d_OLS(Z, n)
# get Bayes action
b_bayes = d_bayes(Z, n)
# get loss through integration
l_OLS = dblquad(loss_int, -np.inf, np.inf, lambda x: -np.inf, lambda x: np.inf, args=(b_OLS,)) # get loss
l_bayes = dblquad(loss_int, -np.inf, np.inf, lambda x: -np.inf, lambda x: np.inf, args=(b_bayes,)) # get loss
# record action
B_OLS.append(b_OLS)
B_bayes.append(b_bayes)
# record loss
L_OLS.append(l_OLS)
L_bayes.append(l_bayes)
# dblquad returns (value, abserr) pairs; keep only the integral values
L_OLS = np.array(L_OLS)[:, 0]
L_bayes = np.array(L_bayes)[:, 0]
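Because $(Y, X)$ is jointly normal, the loss integral also has a closed form: the variance of the prediction error plus its squared mean. This can serve as a cross-check on the numerical integration; a minimal sketch (the helper loss_closed_form and the test point b_check are purely illustrative):
In [ ]:
def loss_closed_form(b):
    """E[(Y - b0 - b1*X)^2] for (Y, X) ~ N(mu, sigma), without numerical integration."""
    mean_err = mu[0] - b[0] - b[1] * mu[1]
    var_err = sigma[0, 0] - 2 * b[1] * sigma[0, 1] + b[1]**2 * sigma[1, 1]
    return var_err + mean_err**2
b_check = np.array([0.5, 0.1])
print(loss_closed_form(b_check),
      dblquad(loss_int, -np.inf, np.inf, lambda x: -np.inf, lambda x: np.inf, args=(b_check,))[0])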
In [49]:
B_OLS = pd.DataFrame(B_OLS, columns=["$\\beta_0$", "$\\beta_1$"])
B_bayes = pd.DataFrame(B_bayes, columns=["$\\beta_0$", "$\\beta_1$"])
g1 = sns.jointplot(x = "$\\beta_0$", y = "$\\beta_1$", data=B_OLS, kind="kde",
space=0.3, color = sns.color_palette()[0], size=5, xlim = (-.5, 1.6), ylim = (-.1, .4))
g1.ax_joint.plot([mu[0] - sigma[0,1]/sigma[1,1]*mu[1]], [sigma[0,1]/sigma[1,1]], 'ro', label='best-in-class')
g1.set_axis_labels(r'$\beta_0$', r'$\beta_1$', fontsize=14)
g1.fig.suptitle('Induced action distribution -- OLS', fontsize=14, y=1.04)
plt.savefig("./example2_fig1a.png", dpi=800)
g2 = sns.jointplot(x = "$\\beta_0$", y = "$\\beta_1$", data=B_bayes, kind="kde",
space=0.3, color = sns.color_palette()[0], size=5, xlim = (-.5, 1.6), ylim = (-.1, .4))
g2.ax_joint.plot([mu[0] - sigma[0,1]/sigma[1,1]*mu[1]], [sigma[0,1]/sigma[1,1]], 'ro', label='best-in-class')
g2.set_axis_labels(r'$\beta_0$', r'$\beta_1$', fontsize=14)
g2.fig.suptitle('Induced action distribution -- Bayes', fontsize=14, y=1.04)
plt.savefig("./example2_fig1b.png", dpi=800)
plt.show()
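The point marked 'best-in-class' in both panels is the population linear projection of $Y$ on $X$, i.e. the action minimizing the expected squared prediction error:
$$\beta_1 = \frac{\mathrm{Cov}(Y, X)}{\mathrm{Var}(X)} = \frac{\Sigma_{12}}{\Sigma_{22}}, \qquad \beta_0 = \mu_Y - \beta_1 \mu_X.$$
Its loss, computed in the next cell, is a lower bound for the loss of any linear action, and hence for the risk of any decision rule with linear actions.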
In [28]:
b_best = [mu[0] - sigma[0,1]/sigma[1,1]*mu[1], sigma[0,1]/sigma[1,1]]
l_best = dblquad(loss_int, -np.inf, np.inf, lambda x: -np.inf, lambda x: np.inf, args=(b_best,))
print(l_best[0])
In [50]:
plt.figure(figsize=(11, 5))
plt.axvline(x=l_best[0], ymin=0, ymax=1, linewidth=3, color = colors[4], label='Best-in-class loss')
plt.axvline(x=L_OLS.mean(), ymin=0, ymax=1, linewidth=3, color = colors[2], label='Risk of OLS')
plt.axvline(x=L_bayes.mean(), ymin=0, ymax=1, linewidth=3, color = colors[3], label='Risk of Bayes')
sns.distplot(L_OLS, bins=50, kde=False, color = colors[0], label='OLS')
sns.distplot(L_bayes, bins=50, kde=False, color = colors[1], label='Bayes')
plt.title('Induced loss distribution', fontsize = 14, y=1.02)
plt.legend(fontsize=12)
plt.xlabel('Loss', fontsize=12)
plt.xlim([3.8, 4.5])
plt.tight_layout()
plt.savefig("./example2_fig2.png", dpi=800)
In [44]:
beta_0 = mu[0] - sigma[0,1]/sigma[1,1]*mu[1]
beta_1 = sigma[0,1]/sigma[1,1]
print('Bias of OLS')
print('==========================')
print('{:.4f} - {:.4f} = {:.4f}'.format(beta_0, B_OLS.mean().iloc[0], beta_0 - B_OLS.mean().iloc[0]))
print('{:.4f} - {:.4f} = {:.4f}\n\n'.format(beta_1, B_OLS.mean().iloc[1], beta_1 - B_OLS.mean().iloc[1]))
print('Bias of Bayes')
print('==========================')
print('{:.4f} - {:.4f} = {:.4f}'.format(beta_0, B_bayes.mean().iloc[0], beta_0 - B_bayes.mean().iloc[0]))
print('{:.4f} - {:.4f} = {:.4f}'.format(beta_1, B_bayes.mean().iloc[1], beta_1 - B_bayes.mean().iloc[1]))
In [45]:
print('Variance of OLS')
print('======================')
print(B_OLS.var())
print('\n\nVariance of Bayes')
print('======================')
print(B_bayes.var())
In [46]:
print('Risk of OLS: {:.4f} \nRisk of Bayes: {:.4f}'.format(L_OLS.mean(), L_bayes.mean()))
In [ ]: