In [ ]:
%load_ext autoreload
%autoreload 2
from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (6.0, 6.0)
plt.rcParams['savefig.dpi'] = 100
In [ ]:
from straightline_utils import *
In [ ]:
(x,y,sigmay) = generate_data()
plot_yerr(x, y, sigmay)
In [ ]:
def straight_line_log_likelihood(theta, x, y, sigmay):
'''
Returns the log-likelihood of drawing data values *y* at
known values *x* given Gaussian measurement noise with standard
deviation with known *sigmay*, where the "true" y values are
*y_t = m * x + b*
x: list of x coordinates
y: list of y coordinates
sigmay: list of y uncertainties
m: scalar slope
b: scalar line intercept
Returns: scalar log likelihood
'''
m,b = theta
return (np.sum(np.log(1./(np.sqrt(2.*np.pi) * sigmay))) +
np.sum(-0.5 * (y - (m*x + b))**2 / sigmay**2))
In [ ]:
def straight_line_log_prior(theta, theta_limits):
m, b = theta
mlimits, blimits = theta_limits
# Uniform in m:
if (m < mlimits[0]) | (m > mlimits[1]):
log_m_prior = -np.inf
else:
log_m_prior = np.log(1.0/(mlimits[1] - mlimits[0]))
# Uniform in b:
if (b < blimits[0]) | (b > blimits[1]):
log_b_prior = -np.inf
else:
log_b_prior = np.log(1.0/(blimits[1] - blimits[0]))
return log_m_prior + log_b_prior
def straight_line_log_posterior(theta, x, y, sigmay, theta_limits):
return (straight_line_log_likelihood(theta, x, y, sigmay) +
straight_line_log_prior(theta, theta_limits))
In [ ]:
# Evaluate log P(m,b | x,y,sigmay) on a grid.
# Define uniform prior limits, enforcing positivity in both parameters:
mlimits = [0.0, 2.0]
blimits = [0.0, 200.0]
theta_limits = (mlimits, blimits)
# Set up grid:
mgrid = np.linspace(mlimits[0], mlimits[1], 101)
bgrid = np.linspace(blimits[0], blimits[1], 101)
log_posterior = np.zeros((len(mgrid),len(bgrid)))
# Evaluate log posterior PDF:
for im,m in enumerate(mgrid):
for ib,b in enumerate(bgrid):
theta = (m,b)
log_posterior[im,ib] = straight_line_log_posterior(theta, x, y, sigmay, theta_limits)
# Convert to probability density and plot, taking care with very small values:
posterior = np.exp(log_posterior - log_posterior.max())
plt.imshow(posterior, extent=[blimits[0],blimits[1],mlimits[0],mlimits[1]],cmap='Blues',
interpolation='none', origin='lower', aspect=(blimits[1]-blimits[0])/(mlimits[1]-mlimits[0]),
vmin=0, vmax=1)
plt.contour(bgrid, mgrid, posterior, pdf_contour_levels(posterior), colors='k')
i = np.argmax(posterior)
i,j = np.unravel_index(i, posterior.shape)
print('Grid maximum posterior values (m,b) =', mgrid[i], bgrid[j])
plt.title('Straight line: posterior PDF for parameters');
plot_mb_setup(*theta_limits);
In [ ]:
def metropolis(log_posterior, theta, theta_limits, stepsize, nsteps=10000):
'''
log_posterior: function of theta
theta_limits: uniform prior ranges
stepsize: scalar or vector proposal distribution width
nsteps: desired number of samples
'''
log_prob = log_posterior(theta, x, y, sigmay, theta_limits)
# Store Markov chain as an array of samples:
chain = np.empty((nsteps, len(theta)))
log_probs = np.empty(nsteps)
# Count our accepted proposals:
naccept = 0
for i in range(nsteps):
theta_new = theta + stepsize * np.random.randn(len(theta))
log_prob_new = log_posterior(theta_new, x, y, sigmay, theta_limits)
if np.log(np.random.rand()) < (log_prob_new - log_prob):
# accept, and move to the proposed position:
theta = theta_new
log_prob = log_prob_new
naccept += 1
else:
# reject, and store the same sample as before:
pass
chain[i] = theta
log_probs[i] = log_prob
acceptance_rate = naccept/float(nsteps)
return chain,log_probs,acceptance_rate
In [ ]:
# Initialize m, b at the center of prior:
m = 0.5*(mlimits[0]+mlimits[1])
b = 0.5*(blimits[0]+blimits[1])
theta = np.array([m,b])
# Step sizes, 2% or 5% of the prior
mstep = 0.02*(mlimits[1]-mlimits[0])
bstep = 0.05*(blimits[1]-blimits[0])
stepsize = np.array([mstep,bstep])
# How many steps?
nsteps = 10000
print('Running Metropolis Sampler for', nsteps, 'steps...')
chain, log_probs, acceptance_rate = metropolis(
straight_line_log_posterior, theta, theta_limits, stepsize, nsteps=nsteps
)
print('Acceptance fraction:', acceptance_rate)
In [ ]:
# Pull m and b arrays out of the Markov chain and plot them:
mm = [m for m,b in chain]
bb = [b for m,b in chain]
# Traces, for convergence inspection:
plt.figure(figsize=(8,5))
plt.subplot(2,1,1)
plt.plot(mm, 'k-')
plt.ylim(mlimits)
plt.ylabel('m')
plt.subplot(2,1,2)
plt.plot(bb, 'k-')
plt.ylabel('Intercept b')
plt.ylim(blimits)
In [ ]:
!pip install --upgrade --no-deps corner
In [ ]:
import corner
corner.corner(chain[5000:], labels=['m','b'], range=[mlimits,blimits],quantiles=[0.16,0.5,0.84],
show_titles=True, title_args={"fontsize": 12},
plot_datapoints=True, fill_contours=True, levels=[0.68, 0.95],
color='b', bins=80, smooth=1.0);
In [ ]:
# Generate a straight line model for each parameter combination, and plot:
X = np.linspace(xlimits[0],xlimits[1],50)
for i in (np.random.rand(100)*len(chain)).astype(int):
m,b = chain[i]
plt.plot(X, b+X*m, 'b-', alpha=0.1)
# Overlay the data, for comparison:
plot_yerr(x, y, sigmay);
${\rm Pr}(T(d^{\rm rep})>T(d)\,|\,d) = \int I(T(d^{\rm rep})>T(d))\,{\rm Pr}(d^{\rm rep}\,|\,\theta)\,{\rm Pr}(\theta\,|\,d)\;d\theta\,dd^{\rm rep}$
In [ ]:
# Test statistics: functions of the data, not the parameters.
# 1) Weighted mean y:
# def test_statistic(x,y,sigmay):
# return np.sum(y/sigmay**2.0)/np.sum(1.0/sigmay**2.0)
# 2) Variance of y:
# def test_statistic(x,y,sigmay):
# return np.var(y)
# 3) Pearson r correlation coefficient:
import scipy.stats
def test_statistic(x,y,dummy):
'''
Pearson r correlation coefficient:
r12 = \sum [(xi - xbar)*(yi - ybar)] / [\sum (xi - xbar)^2 * \sum (yi - ybar)^2]^1/2
'''
r12 = np.sum((x - np.mean(x))*(y - np.mean(y))) / \
np.sqrt(np.sum((x - np.mean(x))**2) * np.sum((y - np.mean(y))**2))
return r12
# Approximate the posterior predictive distribution for T,
# by drawing a replica dataset for each sample (m,b) and computing its T:
T = np.zeros(len(chain))
for k,(m,b) in enumerate(chain):
yrep = b + m*x + np.random.randn(len(x)) * sigmay
T[k] = test_statistic(x,yrep,sigmay)
# Compare with the test statistic of the data, on a plot:
Td = test_statistic(x, y, sigmay)
plt.hist(T, 100, histtype='step', color='blue', lw=2, range=(0.0,np.percentile(T,99.0)))
plt.axvline(Td, color='black', linestyle='--', lw=2)
plt.xlabel('Test statistic')
plt.ylabel('Posterior predictive distribution')
# What is Pr(T>T(d)|d)?
greater = (T > Td)
P = 100*len(T[greater])/(1.0*len(T))
print("Pr(T>T(d)|d) = ",P,"%")
${\rm Pr}(T(d^{\rm rep},\theta)>T(d,\theta)\,|\,d) = \int I(T(d^{\rm rep},\theta)>T(d,\theta))\,{\rm Pr}(d^{\rm rep}\,|\,\theta)\,{\rm Pr}(\theta\,|\,d)\;d\theta\,dd^{\rm rep}$
In [ ]:
# Discrepancy: functions of the data AND parameters.
# 1) Reduced chisq for the model:
def discrepancy(x,y,sigmay,b,m):
return np.sum((y - m*x - b)**2.0/sigmay**2.0)/(len(y)-2)
# Approximate the posterior predictive distribution for T,
# by drawing a replica dataset for each sample (m,b) and computing its T,
# AND ALSO its Td (which now depends on the parameters, too):
T = np.zeros(len(chain))
Td = np.zeros(len(chain))
for k,(m,b) in enumerate(chain):
yrep = b + m*x + np.random.randn(len(x)) * sigmay
T[k] = discrepancy(x,yrep,sigmay,b,m)
Td[k] = discrepancy(x,y,sigmay,b,m)
# Compare T with Td, on a scatter plot - how often is T>Td?
plt.scatter(Td, T, color='blue',alpha=0.1)
plt.plot([0.0, 100.0], [0.0, 100.], color='k', linestyle='--', linewidth=2)
plt.xlabel('Observed discrepancy $T(d,\\theta)$')
plt.ylabel('Replicated discrepancy $T(d^{\\rm rep},\\theta)$')
plt.ylim([0.0,np.percentile(Td,99.0)])
plt.xlim([0.0,np.percentile(Td,99.0)])
In [ ]:
# Histogram of differences:
diff = T-Td
plt.hist(diff, 100, histtype='step', color='blue', lw=2, range=(np.percentile(diff,1.0),np.percentile(diff,99.0)))
plt.axvline(0.0, color='black', linestyle='--', lw=2)
plt.xlabel('Difference $T(d^{\\rm rep},\\theta) - T(d,\\theta)$')
plt.ylabel('Posterior predictive distribution')
# What is Pr(T>T(d)|d)?
greater = (T > Td)
Pline = 100*len(T[greater])/(1.0*len(T))
print("Pr(T>T(d)|d) = ",Pline,"%")
In some sense, the reduced chi-squared is actually not such an interesting test statistic, because it's very similar to the log likelihood!
Still more powerful discrepancy measures might stress-test different aspects of the model. Talk to your neighbor about where and how this model might be failing, and see if you can design a better discrepancy measure than reduced chi-squared.
$y = m x + b + q x^2$
In [ ]:
def quadratic_log_likelihood(theta, x, y, sigmay):
'''
Returns the log-likelihood of drawing data values y at
known values x given Gaussian measurement noise with standard
deviation with known sigmay, where the "true" y values are
y_t = m*x + b + q**2
x: list of x coordinates
y: list of y coordinates
sigmay: list of y uncertainties
m: scalar slope
b: scalar line intercept
q: quadratic term coefficient
where theta = (m, b, q)
Returns: scalar log likelihood
'''
m, b, q = theta
return (np.sum(np.log(1./(np.sqrt(2.*np.pi) * sigmay))) +
np.sum(-0.5 * (y - (m*x + b + q*x**2))**2 / sigmay**2))
def quadratic_log_prior(theta, theta_limits):
m, b, q = theta
mlimits, blimits, qpars = theta_limits
# m and b:
log_mb_prior = straight_line_log_prior(np.array([m,b]), np.array([mlimits, blimits]))
# q:
log_q_prior = np.log(1./(np.sqrt(2.*np.pi) * qpars[1])) - \
0.5 * (q - qpars[0])**2 / qpars[1]**2
return log_mb_prior + log_q_prior
def quadratic_log_posterior(theta, x, y, sigmay, theta_limits):
return (quadratic_log_likelihood(theta, x, y, sigmay) +
quadratic_log_prior(theta, theta_limits))
In [ ]:
# Define uniform prior limits, enforcing positivity in m and b:
mlimits = [0.0, 2.0]
blimits = [0.0, 200.0]
# Define Gaussian prior centered on zero for q:
qpars = [0.0,0.003]
# Initialize m, b at the center of prior:
m = 0.5*(mlimits[0]+mlimits[1])
b = 0.5*(blimits[0]+blimits[1])
q = qpars[0]
# Arrays to pass to the sampler:
qtheta = np.array([m,b,q])
qtheta_limits = (mlimits, blimits, qpars)
# Step sizes, small fractions of the prior width:
mstep = 0.01*(mlimits[1]-mlimits[0])
bstep = 0.04*(blimits[1]-blimits[0])
qstep = 0.02*qpars[1]
stepsize = np.array([mstep,bstep,qstep])
# How many steps?
nsteps = 10000
print('Running Metropolis Sampler for', nsteps, 'steps...')
qchain, log_probs, acceptance_rate = metropolis(
quadratic_log_posterior, qtheta, qtheta_limits, stepsize, nsteps=nsteps
)
print('Acceptance fraction:', acceptance_rate)
In [ ]:
# Pull m, b and q arrays out of the Markov chain and plot them:
mm = [m for m,b,q in qchain]
bb = [b for m,b,q in qchain]
qq = [q for m,b,q in qchain]
# Traces, for convergence inspection:
plt.figure(figsize=(8,5))
plt.subplot(3,1,1)
plt.plot(mm, 'k-')
plt.ylim(mlimits)
plt.ylabel('Slope m')
plt.subplot(3,1,2)
plt.plot(bb, 'k-')
plt.ylim(blimits)
plt.ylabel('Intercept b')
plt.subplot(3,1,3)
plt.plot(qq, 'k-')
plt.ylim([qpars[0]-3*qpars[1],qpars[0]+3*qpars[1]])
plt.ylabel('Quadratic coefficient q')
In [ ]:
corner.corner(qchain, labels=['m','b','q'], range=[mlimits,blimits,(qpars[0]-3*qpars[1],qpars[0]+3*qpars[1])],quantiles=[0.16,0.5,0.84],
show_titles=True, title_args={"fontsize": 12},
plot_datapoints=True, fill_contours=True, levels=[0.68, 0.95],
color='green', bins=80, smooth=1.0);
In [ ]:
# Posterior visual check, in data space:
X = np.linspace(xlimits[0],xlimits[1],100)
for i in (np.random.rand(100)*len(chain)).astype(int):
m,b,q = qchain[i]
plt.plot(X, b + X*m + q*X**2, 'g-', alpha=0.1)
plot_yerr(x, y, sigmay)
In [ ]:
# Discrepancy: functions of the data AND parameters.
# 1) Reduced chisq for the model:
def discrepancy(x,y,sigmay,m,b,q):
return np.sum((y - m*x - b - q*x**2)**2.0/sigmay**2.0)/(len(y)-3)
# Approximate the posterior predictive distribution for T,
# by drawing a replica dataset for each sample (m,b) and computing its T,
# AND ALSO its Td:
T = np.zeros(len(qchain))
Td = np.zeros(len(qchain))
for k,(m,b,q) in enumerate(qchain):
yp = b + m*x + q*x**2 + sigmay*np.random.randn(len(x))
T[k] = discrepancy(x,yp,sigmay,m,b,q)
Td[k] = discrepancy(x,y,sigmay,m,b,q)
# Histogram of differences:
diff = T - Td
plt.hist(diff, 100, histtype='step', color='green', lw=2, range=(np.percentile(diff,1.0),np.percentile(diff,99.0)))
plt.axvline(0.0, color='black', linestyle='--', lw=2)
plt.xlabel('Difference $T(d^{\\rm rep},\\theta) - T(d,\\theta)$')
plt.ylabel('Posterior predictive distribution')
# What is Pr(T>T(d)|d)?
greater = (T > Td)
Pquad = 100*len(T[greater])/(1.0*len(T))
print("Pr(T>T(d)|d,quadratic) = ",Pquad,"%, cf. Pr(T>T(d)|d,straightline) = ",Pline,"%")
A number of sampling algorithms have been developed that do calculate the evidence, during the process of sampling. These include:
${\rm Pr}(d\,|\,H) \approx \frac{1}{N_s} \sum_k\;{\rm Pr}(d\,|\,\theta_s,H)$
Simple Monte Carlo:
Consider the simplest possbile posterior inference, the mean of parameter $\theta$: $\bar{\theta} = \int\,\theta\,{\rm Pr}(\theta\,|\,d,H)\,d\theta$
This is approximated by an average over posterior samples:
$\bar{\theta} \approx \frac{1}{N_s}\,\sum_s\,\theta_s$
Now replace the posterior PDF with Bayes' Theorem:
$\bar{\theta} = \frac{1}{Z} \int\,\theta\,{\rm Pr}(d\,|\,\theta,H)\,{\rm Pr}(\theta\,|\,H)\,d\theta$
and the posterior samples with prior samples:
$\bar{\theta} \approx \frac{1}{(Z N_s)}\,\sum_s\,\theta_s\,{\rm Pr}(d\,|\,\theta_s,H)$
This is a weighted mean - if we don't want $Z$, we just define weights $w_s = {\rm Pr}(d\,|\,\theta_s,H)$ compute the approximate posterior mean as the likelihood-weighted prior mean:
$\bar{\theta} \approx \frac{\sum_s\,w_s\,\theta_s}{\sum_s\,w_s}$
To compute Z, we multiply both sides by $Z$ and replace $\theta$ with 1:
$Z \approx \frac{1}{N_s} \sum_s\,\,w_s$
$R = \frac{{\rm Pr}(d\,|\,{\rm quadratic})}{{\rm Pr}(d\,|\,{\rm straight line})}$
In [ ]:
# Draw a large number of prior samples and calculate the log likelihood for each one:
N = 50000
# Set the priors:
mlimits = [0.0, 2.0]
blimits = [0.0, 200.0]
qpars = [0.0,0.003]
# Sample from the prior:
mm = np.random.uniform(mlimits[0],mlimits[1], size=N)
bb = np.random.uniform(blimits[0],blimits[1], size=N)
qq = qpars[0] + qpars[1]*np.random.randn(N)
# We'll store the posterior samples as a "chain" again
schain = []
log_likelihood_straight_line = np.zeros(N)
log_likelihood_quadratic = np.zeros(N)
for i in range(N):
theta = np.array([mm[i], bb[i]])
log_likelihood_straight_line[i] = straight_line_log_likelihood(theta, x, y, sigmay)
qtheta = np.array([mm[i], bb[i], qq[i]])
log_likelihood_quadratic[i] = quadratic_log_likelihood(qtheta, x, y, sigmay)
schain.append((mm[i],bb[i],qq[i]))
Now that we have the log likelihood for each sample, let's check that we did actually sample the posterior well. Here are the corner plots (note that for plotting, the weights don't need to be correctly normalized - and also that we do not want to plot the points as well as the contours, since the points are prior samples not posterior ones!):
In [ ]:
# Unnormalized likelihoods for plotting:
unnormalized_likelihood_straight_line = np.exp(log_likelihood_straight_line - log_likelihood_straight_line.max())
unnormalized_likelihood_quadratic = np.exp(log_likelihood_quadratic - log_likelihood_quadratic.max())
In [ ]:
corner.corner(schain, labels=['m','b','q'], range=[mlimits,blimits,(qpars[0]-3*qpars[1],qpars[0]+3*qpars[1])],quantiles=[0.16,0.5,0.84],
weights=unnormalized_likelihood_straight_line,
show_titles=True, title_args={"fontsize": 12},
plot_datapoints=False, fill_contours=True, levels=[0.68, 0.95],
color='blue', bins=80, smooth=1.0);
In [ ]:
corner.corner(schain, labels=['m','b','q'], range=[mlimits,blimits,(qpars[0]-3*qpars[1],qpars[0]+3*qpars[1])],quantiles=[0.16,0.5,0.84],
weights=unnormalized_likelihood_quadratic,
show_titles=True, title_args={"fontsize": 12},
plot_datapoints=False, fill_contours=True, levels=[0.68, 0.95],
color='green', bins=80, smooth=1.0);
Now let's compute the evidence - we'll need a special function that stably calculates the average $x$ given an array of $\log x$...
In [ ]:
def logaverage(x):
mx = x.max()
return np.log(np.sum(np.exp(x - mx))) + mx - np.log(len(x))
log_evidence_straight_line = logaverage(log_likelihood_straight_line)
log_evidence_quadratic = logaverage(log_likelihood_quadratic)
print('log Evidence for Straight Line Model:', log_evidence_straight_line)
print('log Evidence for Quadratic Model:', log_evidence_quadratic)
print('Evidence ratio in favour of the Quadratic Model:', np.int(np.exp(log_evidence_quadratic - log_evidence_straight_line)),"to 1")
Play around with the Simple Monte Carlo evidence estimate, and see if you can break it.
Try running it a few times, with the same N
: how precise are these evidence estimates from 50000 samples?
We said we believed a priori that $q$ should be small, and set the prior width accordingly. What would happen if we started to doubt ourselves, and re-assigned a broader prior? Try running with some larger values for the Gaussian prior width $q[1]$, and note what happens to the Bayes Factor.
Good things to keep in mind when using, or reading about, the evidence for a model:
Garbage in, garbage out: if you don't believe your prior, why should you or anyone else believe your evidence?
The evidence is only linearly sensitive to prior volume, but exponentially sensitive to goodness of fit. If you want a clearer sense of which model makes the data more probable, get more data.
If you don't believe your priors, the evidence can be a distraction from things you care about more - like accuracy, or cost.
If you do believe your priors, the evidence is very useful: it appears in second-level inferences, when parts of your model that you previously considered to be constant now need to be varied and inferred. The FML is only one parameter liberation away from becoming a likelihood.
.