In [1]:
%load_ext autoreload
%autoreload 2
from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (6.0, 6.0)
plt.rcParams['savefig.dpi'] = 100
In [2]:
from straightline_utils import *
In [3]:
(x,y,sigmay) = generate_data()
plot_yerr(x, y, sigmay)
An easy way to fit a straight line to data like these is weighted least squares, which we can do with a little linear algebra (numpy.linalg, acting on numpy arrays). We might say that we want to find $m$ and $b$ that minimize the misfit statistic $W^2 = \sum_k \frac{(y_k - m x_k - b)^2}{\sigma_k^2}$; let's see what np.linalg.lstsq is doing.
In [4]:
# Linear algebra to implement weighted least squares.
N = len(x)
A = np.zeros((N,2))
A[:,0] = 1. / sigmay
A[:,1] = x / sigmay
yy = y / sigmay
theta,nil,nil,nil = np.linalg.lstsq(A,yy)
b_ls,m_ls = theta
print('Weighted least-squares estimated b,m:', b_ls,m_ls)
plot_yerr(x, y, sigmay)
plot_line(m_ls, b_ls);
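To make the linear algebra explicit, here is a minimal sketch that solves the same weighted least-squares problem via the normal equations, reusing the A and yy arrays from the cell above; it should reproduce b_ls and m_ls.
In [ ]:
# What np.linalg.lstsq is doing: solving the weighted normal equations
# (A^T A) theta = A^T yy directly. Reuses the A and yy arrays defined above.
ATA = A.T.dot(A)
ATy = A.T.dot(yy)
b_ne, m_ne = np.linalg.solve(ATA, ATy)
print('Normal-equations estimated b,m:', b_ne, m_ne)
# These should agree with b_ls, m_ls to within numerical precision.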
In [5]:
# import straightline_pgm
# straightline_pgm.draw();
from IPython.display import Image
Image(filename="straightline_pgm.png",width=600)
Out[5]:
${\rm Pr}(m,b,\{y_k\},\{y^{\rm true}_k\}\,|\,\{x_k\},\{\sigma_k\},H_1) = \prod_k {\rm Pr}(y_k\,|\,y^{\rm true}_k,\sigma_k,H_1) \;{\rm Pr}(y^{\rm true}_k\,|\,x_k,m,b,H_1) \;\;\;{\rm Pr}(m,b\,|\,H_1)$
Where did that product come from? What assumption does it encode?
Why are the $\{x_k\}$ and the $\{\sigma_k\}$ always on the right hand side of the bar, but $\{y_k\}$ is sometimes on the left?
What is the meaning of "$H_1$"? Hint: notice that it too is always on the right hand side of the bar.
What functional form does the conditional PDF ${\rm Pr}(y^{\rm true}_k\,|\,x_k,m,b,H_1)$ have?
What functional form do you think we should assume for the sampling distribution, ${\rm Pr}(y_k\,|\,y^{\rm true}_k,x_k,\sigma_k,H_1)$? Hint: imagine you were generating mock data.
What functional form should we assume for ${\rm Pr}(m,b\,|\,H_1)$?
Talk to your neighbor(s) for 5 minutes about these, note down some brief answers, and be prepared to provide them when called upon.
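To get a feel for the sampling distribution question, it can help to actually generate mock data. Here is a minimal sketch, assuming the x and sigmay arrays from above and some illustrative (not fitted) parameter values m_true and b_true.
In [ ]:
# Forward-simulate a mock dataset under H1: each y_k is drawn from a
# Gaussian centered on y_true_k = m_true * x_k + b_true, with width sigma_k.
# m_true and b_true are illustrative choices, not inferred values.
m_true, b_true = 1.0, 100.0
y_true = m_true * x + b_true
y_mock = y_true + sigmay * np.random.randn(len(x))
plot_yerr(x, y_mock, sigmay)
plot_line(m_true, b_true);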
In [6]:
# %load straightline_answers.md
${\rm Pr}(m,b,\{y_k\}|H_1) = \int\, \prod_k {\rm Pr}(y_k\,|\,y^{\rm true}_k,H_1) \;{\rm Pr}(y^{\rm true}_k\,|\,m,b,H_1) \;\;\;{\rm Pr}(m,b\,|\,H_1) \; d y^{\rm true}_k $
$\longrightarrow {\rm Pr}(m,b,\{y_k\}|H_1) = \prod_k {\rm Pr}(y_k\,|\,(m x_k + b),H_1) \;\;\;{\rm Pr}(m,b\,|\,H_1) $
${\rm Pr}(m,b\,|\,\{y_k\},H_1) = \frac{1}{Z} \prod_k {\rm Pr}(y_k\,|\,(m x_k + b),H_1) \;\;\;{\rm Pr}(m,b\,|\,H_1) $
$Z = {\rm Pr}(\{y_k\}\,|\,H_1) = \int \prod_k {\rm Pr}(y_k\,|\,(m x_k + b),H_1) \;{\rm Pr}(m,b\,|\,H_1)\;dm\,db$
and $\prod_k {\rm Pr}(y_k\,|\,(m x_k + b),H_1)$ is the joint likelihood for the parameters. ${\rm Pr}(m,b\,|\,H_1)$ is the prior PDF for the parameters.
In [7]:
def straight_line_log_likelihood(theta, x, y, sigmay):
    '''
    Returns the log-likelihood of drawing data values *y* at
    known values *x*, given Gaussian measurement noise with known
    standard deviation *sigmay*, where the "true" y values are
    *y_t = m * x + b*

    x: list of x coordinates
    y: list of y coordinates
    sigmay: list of y uncertainties
    m: scalar slope
    b: scalar line intercept

    and the parameter vector theta = (m, b).

    Returns: scalar log likelihood
    '''
    m, b = theta
    L = 1.0
    for k in range(len(y)):
        # Multiply in the Gaussian probability density of each datapoint.
        # Note: accumulating a product of many small numbers like this can
        # underflow; the next cell works in log space instead.
        L *= np.exp(-(y[k] - m*x[k] - b)**2 / (2.0*sigmay[k]**2))
        L /= np.sqrt(2.0 * np.pi * sigmay[k]**2)
    return np.log(L)
In [9]:
# %load straightline_log_likelihood.py
def straight_line_log_likelihood(theta, x, y, sigmay):
    '''
    Returns the log-likelihood of drawing data values *y* at
    known values *x*, given Gaussian measurement noise with known
    standard deviation *sigmay*, where the "true" y values are
    *y_t = m * x + b*

    x: list of x coordinates
    y: list of y coordinates
    sigmay: list of y uncertainties
    m: scalar slope
    b: scalar line intercept

    Returns: scalar log likelihood
    '''
    m, b = theta
    return (np.sum(np.log(1./(np.sqrt(2.*np.pi) * sigmay))) +
            np.sum(-0.5 * (y - (m*x + b))**2 / sigmay**2))
In [10]:
def straight_line_log_prior(theta, theta_limits):
    m, b = theta
    mlimits, blimits = theta_limits

    # Uniform in m:
    if (m < mlimits[0]) | (m > mlimits[1]):
        log_m_prior = -np.inf
    else:
        log_m_prior = np.log(1.0/(mlimits[1] - mlimits[0]))

    # Uniform in b:
    if (b < blimits[0]) | (b > blimits[1]):
        log_b_prior = -np.inf
    else:
        log_b_prior = np.log(1.0/(blimits[1] - blimits[0]))

    return log_m_prior + log_b_prior


def straight_line_log_posterior(theta, x, y, sigmay, theta_limits):
    return (straight_line_log_likelihood(theta, x, y, sigmay) +
            straight_line_log_prior(theta, theta_limits))
In [11]:
# Evaluate log P(m,b | x,y,sigmay) on a grid.
# Define uniform prior limits, enforcing positivity in both parameters:
mlimits = [0.0, 2.0]
blimits = [0.0, 200.0]
theta_limits = (mlimits, blimits)
# Set up grid:
mgrid = np.linspace(mlimits[0], mlimits[1], 101)
bgrid = np.linspace(blimits[0], blimits[1], 101)
log_posterior = np.zeros((len(mgrid),len(bgrid)))
# Evaluate log posterior PDF:
for im, m in enumerate(mgrid):
    for ib, b in enumerate(bgrid):
        theta = (m, b)
        log_posterior[im,ib] = straight_line_log_posterior(theta, x, y, sigmay, theta_limits)
# Convert to probability density and plot, taking care with very small values:
posterior = np.exp(log_posterior - log_posterior.max())
plt.imshow(posterior, extent=[blimits[0],blimits[1],mlimits[0],mlimits[1]],cmap='Blues',
interpolation='none', origin='lower', aspect=(blimits[1]-blimits[0])/(mlimits[1]-mlimits[0]),
vmin=0, vmax=1)
plt.contour(bgrid, mgrid, posterior, pdf_contour_levels(posterior), colors='k')
i = np.argmax(posterior)
i,j = np.unravel_index(i, posterior.shape)
print('Grid maximum posterior values (m,b) =', mgrid[i], bgrid[j])
plt.title('Straight line: posterior PDF for parameters');
plt.plot(b_ls, m_ls, 'r+', ms=12, mew=4);
plot_mb_setup(*theta_limits);
In [12]:
def metropolis(log_posterior, theta, theta_limits, stepsize, nsteps=10000):
    '''
    log_posterior: function of theta
    theta_limits: uniform prior ranges
    stepsize: scalar or vector proposal distribution width
    nsteps: desired number of samples

    Note: the data (x, y, sigmay) are taken from the enclosing scope.
    '''
    log_prob = log_posterior(theta, x, y, sigmay, theta_limits)

    # Store Markov chain as an array of samples:
    chain = np.empty((nsteps, len(theta)))
    log_probs = np.empty(nsteps)

    # Count our accepted proposals:
    naccept = 0

    for i in range(nsteps):
        # Propose a Gaussian step in parameter space:
        theta_new = theta + stepsize * np.random.randn(len(theta))
        log_prob_new = log_posterior(theta_new, x, y, sigmay, theta_limits)

        if np.log(np.random.rand()) < (log_prob_new - log_prob):
            # accept, and move to the proposed position:
            theta = theta_new
            log_prob = log_prob_new
            naccept += 1
        else:
            # reject, and store the same sample as before:
            pass

        chain[i] = theta
        log_probs[i] = log_prob

    acceptance_rate = naccept/float(nsteps)

    return chain, log_probs, acceptance_rate
In [17]:
# Initialize m, b at the center of prior:
m = 0.5*(mlimits[0]+mlimits[1])
b = 0.5*(blimits[0]+blimits[1])
theta = np.array([m,b])
# Step sizes, 2% or 5% of the prior
mstep = 0.02*(mlimits[1]-mlimits[0])
bstep = 0.05*(blimits[1]-blimits[0])
stepsize = np.array([mstep,bstep])
# How many steps?
nsteps = 10000
print('Running Metropolis Sampler for', nsteps, 'steps...')
chain, log_probs, acceptance_rate = metropolis(
straight_line_log_posterior, theta, theta_limits, stepsize, nsteps=nsteps
)
print('Acceptance fraction:', acceptance_rate)
In [18]:
# Pull m and b arrays out of the Markov chain and plot them:
mm = [m for m,b in chain]
bb = [b for m,b in chain]
# Traces, for convergence inspection:
plt.figure(figsize=(8,5))
plt.subplot(2,1,1)
plt.plot(mm, 'k-')
plt.ylim(mlimits)
plt.ylabel('m')
plt.subplot(2,1,2)
plt.plot(bb, 'k-')
plt.ylabel('Intercept b')
plt.ylim(blimits)
Out[18]:
In [19]:
# First show contours from gridding calculation:
plt.contour(bgrid, mgrid, posterior, pdf_contour_levels(posterior), colors='k')
plt.gca().set_aspect((blimits[1]-blimits[0])/(mlimits[1]-mlimits[0]))
# Scatterplot of m,b posterior samples, overlaid:
plt.plot(bb, mm, 'b.', alpha=0.01)
plot_mb_setup(mlimits,blimits)
In [20]:
!pip install --upgrade --no-deps corner
In [23]:
import corner
corner.corner(chain[5000:], labels=['m','b'], range=[mlimits,blimits],quantiles=[0.16,0.5,0.84],
show_titles=True, title_args={"fontsize": 12},
plot_datapoints=True, fill_contours=True, levels=[0.68, 0.95],
color='b', bins=80, smooth=1.0);
In [24]:
# Generate a straight line model for each parameter combination, and plot:
X = np.linspace(xlimits[0],xlimits[1],50)
for i in (np.random.rand(100)*len(chain)).astype(int):
m,b = chain[i]
plt.plot(X, b+X*m, 'b-', alpha=0.1)
plot_line(m_ls, b_ls)
# Overlay the data, for comparison:
plot_yerr(x, y, sigmay);
${\rm Pr}(T(d^{\rm rep})>T(d)\,|\,d) = \int I(T(d^{\rm rep})>T(d))\,{\rm Pr}(d^{\rm rep}\,|\,\theta)\,{\rm Pr}(\theta\,|\,d)\;d\theta\,dd^{\rm rep}$
In [25]:
# Test statistics: functions of the data, not the parameters.
# 1) Reduced chisq for the best fit model:
# def test_statistic(x,y,sigmay,b_ls,m_ls):
# return np.sum((y - m_ls*x - b_ls)**2.0/sigmay**2.0)/(len(y)-2)
# 2) Reduced chisq for the best fit m=0 model:
# def test_statistic(x,y,sigmay,dummy1,dummy2):
# return np.sum((y - np.mean(y))**2.0/sigmay**2.0)/(len(y)-1)
# 3) Weighted mean y:
# def test_statistic(x,y,sigmay,dummy1,dummy2):
# return np.sum(y/sigmay**2.0)/np.sum(1.0/sigmay**2.0)
# 4) Variance of y:
# def test_statistic(x,y,sigmay,dummy1,dummy2):
# return np.var(y)
# 5) Correlation coefficient:
import scipy.stats
def test_statistic(x, y, dummy1, dummy2, dummy3):
    '''
    Pearson r correlation coefficient:
    r12 = \sum [(xi - xbar)*(yi - ybar)] / [\sum (xi - xbar)^2 * \sum (yi - ybar)^2]^1/2
    '''
    r12 = np.sum((x - np.mean(x))*(y - np.mean(y))) / \
          np.sqrt(np.sum((x - np.mean(x))**2) * np.sum((y - np.mean(y))**2))
    return r12
# Approximate the posterior predictive distribution for T,
# by drawing a replica dataset for each sample (m,b) and computing its T:
T = np.zeros(len(chain))
for k,(m,b) in enumerate(chain):
    yrep = b + m*x + np.random.randn(len(x)) * sigmay
    T[k] = test_statistic(x, yrep, sigmay, b_ls, m_ls)
# Compare with the test statistic of the data, on a plot:
Td = test_statistic(x, y, sigmay, b_ls, m_ls)
plt.hist(T, 100, histtype='step', color='blue', lw=2, range=(0.0,np.percentile(T,99.0)))
plt.axvline(Td, color='black', linestyle='--', lw=2)
plt.xlabel('Test statistic')
plt.ylabel('Posterior predictive distribution')
# What is Pr(T>T(d)|d)?
greater = (T > Td)
P = 100*len(T[greater])/(1.0*len(T))
print("Pr(T>T(d)|d) = ",P,"%")
${\rm Pr}(T(d^{\rm rep},\theta)>T(d,\theta)\,|\,d) = \int I(T(d^{\rm rep},\theta)>T(d,\theta))\,{\rm Pr}(d^{\rm rep}\,|\,\theta)\,{\rm Pr}(\theta\,|\,d)\;d\theta\,dd^{\rm rep}$
In [26]:
# Discrepancy: functions of the data AND parameters.
# 1) Reduced chisq for the model:
def test_statistic(x, y, sigmay, b, m):
    return np.sum((y - m*x - b)**2.0/sigmay**2.0)/(len(y)-2)
# Approximate the posterior predictive distribution for T,
# by drawing a replica dataset for each sample (m,b) and computing its T,
# AND ALSO its Td (which now depends on the parameters, too):
T = np.zeros(len(chain))
Td = np.zeros(len(chain))
for k,(m,b) in enumerate(chain):
    yrep = b + m*x + np.random.randn(len(x)) * sigmay
    T[k] = test_statistic(x, yrep, sigmay, b, m)
    Td[k] = test_statistic(x, y, sigmay, b, m)
# Compare T with Td, on a scatter plot - how often is T>Td?
plt.scatter(Td, T, color='blue',alpha=0.1)
plt.plot([0.0, 100.0], [0.0, 100.], color='k', linestyle='--', linewidth=2)
plt.xlabel('Observed discrepancy $T(d,\\theta)$')
plt.ylabel('Replicated discrepancy $T(d^{\\rm rep},\\theta)$')
plt.ylim([0.0,np.percentile(Td,99.0)])
plt.xlim([0.0,np.percentile(Td,99.0)])
Out[26]:
In [27]:
# Histogram of differences:
diff = T-Td
plt.hist(diff, 100, histtype='step', color='blue', lw=2, range=(np.percentile(diff,1.0),np.percentile(diff,99.0)))
plt.axvline(0.0, color='black', linestyle='--', lw=2)
plt.xlabel('Difference $T(d^{\\rm rep},\\theta) - T(d,\\theta)$')
plt.ylabel('Posterior predictive distribution')
# What is Pr(T>T(d)|d)?
greater = (T > Td)
Pline = 100*len(T[greater])/(1.0*len(T))
print("Pr(T>T(d)|d) = ",Pline,"%")
Now let's compare the straight line with an expanded model that includes a quadratic term: $y = m x + b + q x^2$
In [28]:
def quadratic_log_likelihood(theta, x, y, sigmay):
    '''
    Returns the log-likelihood of drawing data values y at
    known values x, given Gaussian measurement noise with known
    standard deviation sigmay, where the "true" y values are
    y_t = m*x + b + q*x**2

    x: list of x coordinates
    y: list of y coordinates
    sigmay: list of y uncertainties
    m: scalar slope
    b: scalar line intercept
    q: quadratic term coefficient

    where theta = (m, b, q)

    Returns: scalar log likelihood
    '''
    m, b, q = theta
    return (np.sum(np.log(1./(np.sqrt(2.*np.pi) * sigmay))) +
            np.sum(-0.5 * (y - (m*x + b + q*x**2))**2 / sigmay**2))


def quadratic_log_prior(theta, theta_limits):
    m, b, q = theta
    mlimits, blimits, qpars = theta_limits

    # m and b have uniform priors, as before:
    log_mb_prior = straight_line_log_prior(np.array([m,b]), np.array([mlimits, blimits]))

    # q has a Gaussian prior with mean qpars[0] and width qpars[1]:
    log_q_prior = np.log(1./(np.sqrt(2.*np.pi) * qpars[1])) - \
                  0.5 * (q - qpars[0])**2 / qpars[1]**2

    return log_mb_prior + log_q_prior


def quadratic_log_posterior(theta, x, y, sigmay, theta_limits):
    return (quadratic_log_likelihood(theta, x, y, sigmay) +
            quadratic_log_prior(theta, theta_limits))
In [29]:
# Define uniform prior limits, enforcing positivity in m and b:
mlimits = [0.0, 2.0]
blimits = [0.0, 200.0]
# Define Gaussian prior centered on zero for q:
qpars = [0.0,0.003]
# Initialize m, b at the center of prior:
m = 0.5*(mlimits[0]+mlimits[1])
b = 0.5*(blimits[0]+blimits[1])
q = qpars[0]
# Arrays to pass to the sampler:
qtheta = np.array([m,b,q])
qtheta_limits = (mlimits, blimits, qpars)
# Step sizes, small fractions of the prior width:
mstep = 0.02*(mlimits[1]-mlimits[0])
bstep = 0.05*(blimits[1]-blimits[0])
qstep = 0.02*qpars[1]
stepsize = np.array([mstep,bstep,qstep])
# How many steps?
nsteps = 10000
print('Running Metropolis Sampler for', nsteps, 'steps...')
qchain, log_probs, acceptance_rate = metropolis(
quadratic_log_posterior, qtheta, qtheta_limits, stepsize, nsteps=nsteps
)
print('Acceptance fraction:', acceptance_rate)
In [30]:
# Pull m, b and q arrays out of the Markov chain and plot them:
mm = [m for m,b,q in qchain]
bb = [b for m,b,q in qchain]
qq = [q for m,b,q in qchain]
# Traces, for convergence inspection:
plt.figure(figsize=(8,5))
plt.subplot(3,1,1)
plt.plot(mm, 'k-')
plt.ylim(mlimits)
plt.ylabel('Slope m')
plt.subplot(3,1,2)
plt.plot(bb, 'k-')
plt.ylim(blimits)
plt.ylabel('Intercept b')
plt.subplot(3,1,3)
plt.plot(qq, 'k-')
plt.ylim([qpars[0]-3*qpars[1],qpars[0]+3*qpars[1]])
plt.ylabel('Quadratic coefficient q')
Out[30]:
In [31]:
corner.corner(qchain, labels=['m','b','q'], range=[mlimits,blimits,(qpars[0]-3*qpars[1],qpars[0]+3*qpars[1])],quantiles=[0.16,0.5,0.84],
show_titles=True, title_args={"fontsize": 12},
plot_datapoints=True, fill_contours=True, levels=[0.68, 0.95],
color='green', bins=80, smooth=1.0);
In [32]:
# Posterior visual check, in data space:
X = np.linspace(xlimits[0],xlimits[1],100)
for i in (np.random.rand(100)*len(qchain)).astype(int):
    m, b, q = qchain[i]
    plt.plot(X, b + X*m + q*X**2, 'g-', alpha=0.1)
plot_line(m_ls, b_ls);
plot_yerr(x, y, sigmay)
In [33]:
# Discrepancy: functions of the data AND parameters.
# 1) Reduced chisq for the model:
def test_statistic(x, y, sigmay, m, b, q):
    return np.sum((y - m*x - b - q*x**2)**2.0/sigmay**2.0)/(len(y)-3)
# 2) Some other discrepancy measure:
# Approximate the posterior predictive distribution for T,
# by drawing a replica dataset for each sample (m,b) and computing its T,
# AND ALSO its Td:
T = np.zeros(len(qchain))
Td = np.zeros(len(qchain))
for k,(m,b,q) in enumerate(qchain):
    yp = b + m*x + q*x**2 + sigmay*np.random.randn(len(x))
    T[k] = test_statistic(x, yp, sigmay, m, b, q)
    Td[k] = test_statistic(x, y, sigmay, m, b, q)
# Histogram of differences:
diff = T - Td
plt.hist(diff, 100, histtype='step', color='green', lw=2, range=(np.percentile(diff,1.0),np.percentile(diff,99.0)))
plt.axvline(0.0, color='black', linestyle='--', lw=2)
plt.xlabel('Difference $T(d^{\\rm rep},\\theta) - T(d,\\theta)$')
plt.ylabel('Posterior predictive distribution')
# What is Pr(T>T(d)|d)?
greater = (T > Td)
Pquad = 100*len(T[greater])/(1.0*len(T))
print("Pr(T>T(d)|d,quadratic) = ",Pquad,"%, cf. Pr(T>T(d)|d,straightline) = ",Pline,"%")
All of the above discussion was about model checking - assessment of the ability of a model to make accurate predictions.
Frequentist: assume the null hypothesis $H_0$, and calculate the probability of getting a test statistic (designed with the alternative hypothesis $H_1$ in mind) at least as large as the one observed, by studying the distribution of the test statistic over a large ensemble of replica datasets (which under certain assumptions is a standard distribution).
Bayesian: assume the alternative hypothesis $H_1$, and calculate the posterior predictive probability distribution for the test statistic given the data, relative to the one test statistic we observed, marginalizing over the model parameters. In general this distribution will be non-standard, but we can often draw samples from the posterior PDF anyway.
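For comparison with the Bayesian checks above, here is a minimal sketch of the corresponding frequentist calculation, assuming the null hypothesis $H_0$ that $y$ is constant ($m = 0$): under $H_0$ with Gaussian errors, the minimum weighted misfit follows a chi-squared distribution with $N-1$ degrees of freedom, so the p-value comes straight from scipy.stats.
In [ ]:
import scipy.stats
# Frequentist test of H0: y = constant (m = 0).
# The best-fit constant under H0 is the inverse-variance weighted mean of y.
w = 1.0 / sigmay**2
ybar = np.sum(w * y) / np.sum(w)
chisq = np.sum((y - ybar)**2 / sigmay**2)
dof = len(y) - 1
# p-value: probability of a chi-squared value this large or larger under H0.
p_value = scipy.stats.chi2.sf(chisq, dof)
print('chi-squared =', chisq, 'for', dof, 'dof, p-value =', p_value)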
$R = \frac{{\rm Pr}(d\,|\,H_1)}{{\rm Pr}(d\,|\,H_0)}$
${\rm Pr}(d\,|\,H) = \int\;{\rm Pr}(d\,|\,\theta,H)\;{\rm Pr}(\theta\,|\,H)\;d\theta$
The consequence of this is that the evidence depends on both model accuracy (goodness of fit, ability to predict data) and efficiency (not over-fitting by having implausible flexibility).
The following figure might help illustrate how the evidence depends on both goodness of fit (through the likelihood) and the complexity of the model (via the prior). In this 1D case, a Gaussian likelihood (red) is integrated over a uniform prior (blue): the evidence can be shown to be given by $E = f \times L_{\rm max}$, where $L_{\rm max}$ is the maximum possible likelihood, and $f$ is the fraction of the blue dashed area that is shaded red. $f$ is 0.31, 0.98, and 0.07 in each case.
In [34]:
from IPython.display import Image
Image('evidence.png')
Out[34]:
The illustration above shows us a few things:
1) The evidence can be made arbitrarily small by increasing the prior volume: the evidence is more conservative than focusing on the goodness of fit ($L_{\rm max}$) alone. Of course if you assign a prior you don't believe, then you should not expect to get out a meaningful answer for ${\rm Pr}(d\,|\,H)$.
2) The evidence is linearly sensitive to prior volume ($f$), but exponentially sensitive to goodness of fit ($L_{\rm max}$). It's still a likelihood, after all.
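As a sanity check on the $E = f \times L_{\rm max}$ relation, here is a minimal 1D sketch with an assumed Gaussian likelihood and an assumed uniform prior (the particular widths are arbitrary): the evidence from direct numerical integration should match $L_{\rm max}$ times the fraction $f$ of the prior "box" that lies under the likelihood curve.
In [ ]:
import scipy.stats
# 1D illustration of E = f * L_max: a Gaussian likelihood of width sigma_L,
# integrated over a uniform prior of width W centered on the peak.
# The numbers below are arbitrary, illustrative choices.
L_max, theta_0, sigma_L = 1.0, 0.0, 1.0
W = 10.0
lo, hi = theta_0 - W/2.0, theta_0 + W/2.0

# Evidence by direct numerical integration (uniform prior density = 1/W):
theta = np.linspace(lo, hi, 10001)
L = L_max * np.exp(-0.5 * (theta - theta_0)**2 / sigma_L**2)
E = np.trapz(L, theta) / W

# f = (area under the likelihood inside the prior) / (L_max * W):
area = L_max * np.sqrt(2.0*np.pi) * sigma_L * \
       (scipy.stats.norm.cdf(hi, theta_0, sigma_L) - scipy.stats.norm.cdf(lo, theta_0, sigma_L))
f = area / (L_max * W)

print('f =', f)
print('Evidence by integration:', E, ' f * L_max:', f * L_max)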
$\frac{{\rm Pr}(H_1|d)}{{\rm Pr}(H_0|d)} = \frac{{\rm Pr}(d|H_1)}{{\rm Pr}(d|H_0)} \; \frac{{\rm Pr}(H_1)}{{\rm Pr}(H_0)}$
Prior probabilities are very difficult to assign in most practical problems (notice that no theorist ever provides them). So, one way to interpret the evidence ratio is as the factor by which the data should update whatever prior odds on the two models you choose to assign.
A number of sampling algorithms have been developed that calculate the evidence during the process of sampling. The simplest of these estimates it as the mean likelihood over samples drawn from the prior:
${\rm Pr}(d\,|\,H) \approx \frac{1}{N_s} \sum_s\;{\rm Pr}(d\,|\,\theta_s,H)$
Simple Monte Carlo:
Consider the simplest possible posterior inference, the mean of parameter $\theta$: $\bar{\theta} = \int\,\theta\,{\rm Pr}(\theta\,|\,d,H)\,d\theta$
This is approximated by an average over posterior samples:
$\bar{\theta} \approx \frac{1}{N_s}\,\sum_s\,\theta_s$
Now replace the posterior PDF with Bayes' Theorem:
$\bar{\theta} = \frac{1}{Z} \int\,\theta\,{\rm Pr}(d\,|\,\theta,H)\,{\rm Pr}(\theta\,|\,H)\,d\theta$
and the posterior samples with prior samples:
$\bar{\theta} \approx \frac{1}{(Z N_s)}\,\sum_s\,\theta_s\,{\rm Pr}(d\,|\,\theta_s,H)$
This is a weighted mean: if we don't want to compute $Z$, we can just define weights $w_s = {\rm Pr}(d\,|\,\theta_s,H)$ and compute the approximate posterior mean as the likelihood-weighted prior mean:
$\bar{\theta} \approx \frac{\sum_s\,w_s\,\theta_s}{\sum_s\,w_s}$
To compute Z, we multiply both sides by $Z$ and replace $\theta$ with 1:
$Z \approx \frac{1}{N_s} \sum_s\,\,w_s$
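Before applying this to the line-fitting problem, here is a minimal, self-contained 1D sketch of the Simple Monte Carlo estimator, with an assumed Gaussian likelihood for a single datum and an assumed uniform prior, so that both the posterior mean and the evidence are known (approximately) analytically.
In [ ]:
import scipy.stats
# Toy problem: one datum d with likelihood Pr(d|theta) = N(d; theta, sigma),
# and a uniform prior for theta on [lo, hi]. All numbers are illustrative.
d, sigma = 1.0, 0.5
lo, hi = -5.0, 5.0
Ns = 100000

theta_s = np.random.uniform(lo, hi, size=Ns)            # samples from the prior
ws = scipy.stats.norm.pdf(d, loc=theta_s, scale=sigma)  # weights w_s = Pr(d|theta_s)

theta_mean = np.sum(ws * theta_s) / np.sum(ws)   # likelihood-weighted prior mean
Z = np.mean(ws)                                  # evidence estimate, (1/Ns) * sum_s w_s

# The prior is much wider than the likelihood, so the posterior mean should be
# close to d, and the evidence close to the prior density 1/(hi - lo).
print('Posterior mean: SMC =', theta_mean, ', approx exact =', d)
print('Evidence:       SMC =', Z, ', approx exact =', 1.0/(hi - lo))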
$R = \frac{{\rm Pr}(d\,|\,{\rm quadratic})}{{\rm Pr}(d\,|\,{\rm straight line})}$
In [35]:
# Draw a large number of prior samples and calculate the log likelihood for each one:
N = 50000
# Set the priors:
mlimits = [0.0, 2.0]
blimits = [0.0, 200.0]
qpars = [0.0,0.003]
# Sample from the prior:
mm = np.random.uniform(mlimits[0],mlimits[1], size=N)
bb = np.random.uniform(blimits[0],blimits[1], size=N)
qq = qpars[0] + qpars[1]*np.random.randn(N)
# We'll store the posterior samples as a "chain" again
schain = []
log_likelihood_straight_line = np.zeros(N)
log_likelihood_quadratic = np.zeros(N)
for i in range(N):
    theta = np.array([mm[i], bb[i]])
    log_likelihood_straight_line[i] = straight_line_log_likelihood(theta, x, y, sigmay)
    qtheta = np.array([mm[i], bb[i], qq[i]])
    log_likelihood_quadratic[i] = quadratic_log_likelihood(qtheta, x, y, sigmay)
    schain.append((mm[i],bb[i],qq[i]))
Now that we have the log likelihood for each sample, let's check that we did actually sample the posterior well. Here are the corner plots (note that for plotting, the weights don't need to be correctly normalized - and also that we do not want to plot the points as well as the contours, since the points are prior samples not posterior ones!):
In [36]:
# Unnormalized likelihoods for plotting:
unnormalized_likelihood_straight_line = np.exp(log_likelihood_straight_line - log_likelihood_straight_line.max())
unnormalized_likelihood_quadratic = np.exp(log_likelihood_quadratic - log_likelihood_quadratic.max())
In [37]:
corner.corner(schain, labels=['m','b','q'], range=[mlimits,blimits,(qpars[0]-3*qpars[1],qpars[0]+3*qpars[1])],quantiles=[0.16,0.5,0.84],
weights=unnormalized_likelihood_straight_line,
show_titles=True, title_args={"fontsize": 12},
plot_datapoints=False, fill_contours=True, levels=[0.68, 0.95],
color='blue', bins=80, smooth=1.0);
In [38]:
corner.corner(schain, labels=['m','b','q'], range=[mlimits,blimits,(qpars[0]-3*qpars[1],qpars[0]+3*qpars[1])],quantiles=[0.16,0.5,0.84],
weights=unnormalized_likelihood_quadratic,
show_titles=True, title_args={"fontsize": 12},
plot_datapoints=False, fill_contours=True, levels=[0.68, 0.95],
color='green', bins=80, smooth=1.0);
Now let's compute the evidence - we'll need a special function that stably calculates the average $x$ given an array of $\log x$...
In [40]:
def logaverage(x):
    mx = x.max()
    return np.log(np.sum(np.exp(x - mx))) + mx - np.log(len(x))

log_evidence_straight_line = logaverage(log_likelihood_straight_line)
log_evidence_quadratic = logaverage(log_likelihood_quadratic)

print('log Evidence for Straight Line Model:', log_evidence_straight_line)
print('log Evidence for Quadratic Model:', log_evidence_quadratic)
print('Evidence ratio in favour of the Quadratic Model:', int(np.exp(log_evidence_quadratic - log_evidence_straight_line)), "to 1")
Play around with the Simple Monte Carlo sampler, and see if you can break it.
Try running it a few times, with the same N: how precise are these evidence estimates from 50000 samples?
We said we believed a priori that $q$ should be small, and set the prior width accordingly. What would happen if we started to doubt ourselves, and re-assigned a broader prior? Try running with some larger values for the Gaussian prior width qpars[1], and note what happens to the Bayes factor.
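Here is a minimal sketch of that last experiment, reusing the data, prior limits, likelihood functions and logaverage from the cells above: redraw the prior samples for a few assumed values of the q prior width, and recompute the evidence ratio each time.
In [ ]:
# Sensitivity of the Bayes factor to the (assumed) Gaussian prior width on q.
# Reuses x, y, sigmay, mlimits, blimits, the two log likelihood functions,
# and logaverage() defined above. The widths tried here are illustrative.
N = 50000
for qwidth in [0.003, 0.01, 0.03, 0.1]:
    mm = np.random.uniform(mlimits[0], mlimits[1], size=N)
    bb = np.random.uniform(blimits[0], blimits[1], size=N)
    qq = qwidth * np.random.randn(N)

    logL_line = np.array([straight_line_log_likelihood((mm[i], bb[i]), x, y, sigmay)
                          for i in range(N)])
    logL_quad = np.array([quadratic_log_likelihood((mm[i], bb[i], qq[i]), x, y, sigmay)
                          for i in range(N)])

    logR = logaverage(logL_quad) - logaverage(logL_line)
    print('q prior width =', qwidth, ': evidence ratio (quadratic/straight line) =', np.exp(logR))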
Good things to keep in mind when using, or reading about, the evidence for a model:
Garbage in, garbage out: if you don't believe your prior, why should you or anyone else believe your evidence?
The evidence is only linearly sensitive to prior volume, but exponentially sensitive to goodness of fit. If you want a clearer sense of which model makes the data more probable, get more data.
If you don't believe your priors, the evidence can be a distraction from things you care about more - like accuracy, or cost.
If you do believe your priors, the evidence is very useful: it appears in second-level inferences, when parts of your model that you previously considered to be constant now need to be varied and inferred. The FML (the fully marginalized likelihood, i.e. the evidence) is only one parameter liberation away from becoming a likelihood.
An alternative approach to model comparison is cross-validation: split the data into a training set $d_t$ and held-out test data $d_k$, and ask how well the model trained on $d_t$ predicts $d_k$:
${\rm Pr}(d_k\,|\,d_t,H) = \int {\rm Pr}(d_k\,|\,\theta,H)\;{\rm Pr}(\theta\,|\,d_t,H)\;d\theta$
Approximating this integral over samples $\theta_{t,s}$ drawn from the "trained" posterior PDF ${\rm Pr}(\theta\,|\,d_t,H)$, we get
${\rm Pr}(d_k\,|\,d_t,H) \approx \frac{1}{N_s}\;\sum_s\;{\rm Pr}(d_k\,|\,\theta_{t,s},H)$
$\prod_k\;{\rm Pr}(d_k\,|\,d_t,H) = \prod_k\; \int {\rm Pr}(d_k\,|\,\theta,H)\; \frac{1}{Z_t}\;{\rm Pr}(d_t\,|\,\theta,H)\;{\rm Pr}(\theta\,|\,H) \;d\theta$
where $Z_t = {\rm Pr}(d_t\,|\,H)$, the evidence for $H$ given the training data. Integrating the numerator of the integrand over $\theta$ gives just the overall evidence $Z = {\rm Pr}(d\,|\,H)$, provided the datapoints are independent so that the joint likelihood simply factorizes.
$\prod_k\;{\rm Pr}(d_k\,|\,d_t,H) = \prod_k \frac{Z}{Z_t}$
This is like the evidence, but not the same as it.
$S = \prod_k\;{\rm Pr}(d_k\,|\,d_t,H)$
and the individual score as
$S_k = \frac{1}{N_s}\;\sum_s\;{\rm Pr}(d_k\,|\,\theta_{t,s},H)$
where the samples $\theta_{t,s}$ are drawn from ${\rm Pr}(\theta\,|\,d_t,H)$. This way, $S\approx \frac{Z^N}{\prod_k Z_t}$.
Here's some pseudocode:
split the data into a training set dt and one held-out test datapoint dk
infer Pr(theta|dt,H), drawing posterior samples theta_s
compute the score S_k = mean(likelihood(theta_s, dk))
repeat for each split, and combine the scores: log S = sum_k log(S_k)
Here, the combined score is not a quantity that has minimum 0.0 and maximum 1.0. The value of the score only makes sense when compared to another model's score: the difference in log scores is something similar to the log Bayes factor, $\log\left[{\rm Pr}(d_h,d_t\,|\,H_2)/{\rm Pr}(d_h,d_t\,|\,H_1)\right]$.
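Here is a minimal runnable version of that pseudocode for the straight line model, using leave-one-out splits. It reuses x, y, sigmay, mlimits, blimits and straight_line_log_posterior from above; the chain length, burn-in and step sizes are arbitrary choices, so treat this as a sketch rather than a converged calculation.
In [ ]:
# Leave-one-out cross-validation score for the straight line model.
# For each split: run a short Metropolis chain on the training data,
# then average the held-out point's likelihood over the posterior samples.
def loo_log_score(x, y, sigmay, mlimits, blimits, nsteps=5000, nburn=1000):
    limits = (mlimits, blimits)
    step = np.array([0.02*(mlimits[1]-mlimits[0]), 0.05*(blimits[1]-blimits[0])])
    log_scores = np.zeros(len(y))
    for k in range(len(y)):
        # Training set: everything except datapoint k.
        keep = np.arange(len(y)) != k
        xt, yt, st = x[keep], y[keep], sigmay[keep]
        # Short Metropolis run on the training data:
        theta = np.array([0.5*(mlimits[0]+mlimits[1]), 0.5*(blimits[0]+blimits[1])])
        logp = straight_line_log_posterior(theta, xt, yt, st, limits)
        samples = np.zeros((nsteps, 2))
        for i in range(nsteps):
            theta_new = theta + step * np.random.randn(2)
            logp_new = straight_line_log_posterior(theta_new, xt, yt, st, limits)
            if np.log(np.random.rand()) < (logp_new - logp):
                theta, logp = theta_new, logp_new
            samples[i] = theta
        # Score S_k: mean likelihood of the held-out point over the samples.
        m_s, b_s = samples[nburn:,0], samples[nburn:,1]
        like_k = np.exp(-0.5*(y[k] - (m_s*x[k] + b_s))**2 / sigmay[k]**2) / \
                 (np.sqrt(2.0*np.pi) * sigmay[k])
        log_scores[k] = np.log(np.mean(like_k))
    return np.sum(log_scores)

print('Leave-one-out log score, straight line model:', loo_log_score(x, y, sigmay, mlimits, blimits))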
scikit-learn already has a nice API, with (I guess) a Model class and some very useful cross-validation machinery. Maybe we could write a BayesianModel class that inherits from Model, and give it a sampler, priors, etc., as well as fit and predict methods (which could return ensembles of samples instead of point estimates). And then we would define H1 = Model(assumptions1) and feed it to the GridSearchCV.
The analysis in this notebook could be wrapped up in def functions for much easier re-use, and the plots could be combined to allow much easier comparison. Also, who does linear algebra these days? Surely scikit-learn has a linear model we could use. Pull requests are most welcome!
You could also write your own MCMC sampler (perhaps using emcee), package it up, and compete for business. Can you automate the calculation of the evidence to a desired precision? If you can, I think that in many cases that will probably guarantee that you have sampled the posterior well.
Copyright (C) Phil Marshall, Daniel Foreman-Mackey & Dustin Lang
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
In [ ]: