In [1]:
%load_ext autoreload
%autoreload 2
from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (6.0, 6.0)
plt.rcParams['savefig.dpi'] = 100
In [2]:
from straightline_utils import *
In [3]:
(x,y,sigmay) = generate_data()
plot_yerr(x, y, sigmay)
numpy.linalg
.numpy
arrays). We might say that we want to find $m$ and $b$ that minimize$W^2 = \sum_k \frac{(y_k - m x_k - b)^2}{\sigma_k^2}$
np.linalg.lstsq
is doing.
In [4]:
# Linear algebra to implement weighted least squares.
N = len(x)
A = np.zeros((N,2))
A[:,0] = 1. / sigmay
A[:,1] = x / sigmay
yy = y / sigmay
theta,nil,nil,nil = np.linalg.lstsq(A,yy)
b_ls,m_ls = theta
print('Weighted least-squares estimated b,m:', b_ls,m_ls)
plot_yerr(x, y, sigmay)
plot_line(m_ls, b_ls);
In [5]:
# import straightline_pgm
# straightline_pgm.draw();
from IPython.display import Image
Image(filename="straightline_pgm.png",width=600)
Out[5]:
${\rm Pr}(m,b,\{y_k\},\{y^{\rm true}_k\}\,|\,\{x_k\},\{\sigma_k\},H_1) = \prod_k {\rm Pr}(y_k\,|\,y^{\rm true}_k,\sigma_k,H_1) \;{\rm Pr}(y^{\rm true}_k\,|\,x_k,m,b,H_1) \;\;\;{\rm Pr}(m,b\,|\,H_1)$
Where did that product come from? What assumption does it encode?
Why are the $\{x_k\}$ and the $\{\sigma_k\}$ always on the right hand side of the bar, but $\{y_k\}$ is sometimes on the left?
What is the meaning of "$H_1$"? Hint: notice that it too is always on the right hand side of the bar.
What functional form does the conditional PDF ${\rm Pr}(y^{\rm true}_k\,|\,x_k,m,b,H_1)$ have?
What functional form do you think we should assume for the sampling distribution, ${\rm Pr}(y_k\,|\,y^{\rm true}_k,x_k,\sigma_k,H_1)$? Hint: imagine you were generating mock data.
What functional form should we assume for ${\rm Pr}(m,b\,|\,H_1)$?
Talk to your neighbor(s) for 5 minutes about these, note down some brief answers, and be prepared to provide them when called upon.
In [6]:
# %load straightline_answers.md
${\rm Pr}(m,b,\{y_k\}|H_1) = \int\, \prod_k {\rm Pr}(y_k\,|\,y^{\rm true}_k,H_1) \;{\rm Pr}(y^{\rm true}_k\,|\,m,b,H_1) \;\;\;{\rm Pr}(m,b\,|\,H_1) \; d y^{\rm true}_k $
$\longrightarrow {\rm Pr}(m,b,\{y_k\}|H_1) = \prod_k {\rm Pr}(y_k\,|\,(m x_k + b),H_1) \;\;\;{\rm Pr}(m,b\,|\,H_1) $
${\rm Pr}(m,b\,|\,\{y_k\},H_1) = \frac{1}{Z} \prod_k {\rm Pr}(y_k\,|\,(m x_k + b),H_1) \;\;\;{\rm Pr}(m,b\,|\,H_1) $
$Z = {\rm Pr}(\{y_k\}\,|\,H_1) = \int \prod_k {\rm Pr}(y_k\,|\,(m x_k + b),H_1) \;{\rm Pr}(m,b\,|\,H_1)\;dm\,db$
and $\prod_k {\rm Pr}(y_k\,|\,(m x_k + b),H_1)$ is the joint likelihood for the parameters. ${\rm Pr}(m,b\,|\,H_1)$ is the prior PDF for the parameters.
In [7]:
def straight_line_log_likelihood(theta, x, y, sigmay):
'''
Returns the log-likelihood of drawing data values *y* at
known values *x* given Gaussian measurement noise with standard
deviation with known *sigmay*, where the "true" y values are
*y_t = m * x + b*
x: list of x coordinates
y: list of y coordinates
sigmay: list of y uncertainties
m: scalar slope
b: scalar line intercept
and the parameter vector theta = (m,b).
Returns: scalar log likelihood
'''
m,b = theta
L = 1.0
for k in range(len(y)):
L *= np.exp(-(y[k] - m*x[k] - b)**2/(2.0*sigmay[k]**2))
L /= np.sqrt(2.0 * np.pi * sigmay[k]**2)
return np.log(L)
In [9]:
# %load straightline_log_likelihood.py
def straight_line_log_likelihood(theta, x, y, sigmay):
'''
Returns the log-likelihood of drawing data values *y* at
known values *x* given Gaussian measurement noise with standard
deviation with known *sigmay*, where the "true" y values are
*y_t = m * x + b*
x: list of x coordinates
y: list of y coordinates
sigmay: list of y uncertainties
m: scalar slope
b: scalar line intercept
Returns: scalar log likelihood
'''
m,b = theta
return (np.sum(np.log(1./(np.sqrt(2.*np.pi) * sigmay))) +
np.sum(-0.5 * (y - (m*x + b))**2 / sigmay**2))
In [10]:
def straight_line_log_prior(theta, theta_limits):
m, b = theta
mlimits, blimits = theta_limits
# Uniform in m:
if (m < mlimits[0]) | (m > mlimits[1]):
log_m_prior = -np.inf
else:
log_m_prior = np.log(1.0/(mlimits[1] - mlimits[0]))
# Uniform in b:
if (b < blimits[0]) | (b > blimits[1]):
log_b_prior = -np.inf
else:
log_b_prior = np.log(1.0/(blimits[1] - blimits[0]))
return log_m_prior + log_b_prior
def straight_line_log_posterior(theta, x, y, sigmay, theta_limits):
return (straight_line_log_likelihood(theta, x, y, sigmay) +
straight_line_log_prior(theta, theta_limits))
In [11]:
# Evaluate log P(m,b | x,y,sigmay) on a grid.
# Define uniform prior limits, enforcing positivity in both parameters:
mlimits = [0.0, 2.0]
blimits = [0.0, 200.0]
theta_limits = (mlimits, blimits)
# Set up grid:
mgrid = np.linspace(mlimits[0], mlimits[1], 101)
bgrid = np.linspace(blimits[0], blimits[1], 101)
log_posterior = np.zeros((len(mgrid),len(bgrid)))
# Evaluate log posterior PDF:
for im,m in enumerate(mgrid):
for ib,b in enumerate(bgrid):
theta = (m,b)
log_posterior[im,ib] = straight_line_log_posterior(theta, x, y, sigmay, theta_limits)
# Convert to probability density and plot, taking care with very small values:
posterior = np.exp(log_posterior - log_posterior.max())
plt.imshow(posterior, extent=[blimits[0],blimits[1],mlimits[0],mlimits[1]],cmap='Blues',
interpolation='none', origin='lower', aspect=(blimits[1]-blimits[0])/(mlimits[1]-mlimits[0]),
vmin=0, vmax=1)
plt.contour(bgrid, mgrid, posterior, pdf_contour_levels(posterior), colors='k')
i = np.argmax(posterior)
i,j = np.unravel_index(i, posterior.shape)
print('Grid maximum posterior values (m,b) =', mgrid[i], bgrid[j])
plt.title('Straight line: posterior PDF for parameters');
plt.plot(b_ls, m_ls, 'r+', ms=12, mew=4);
plot_mb_setup(*theta_limits);
In [12]:
def metropolis(log_posterior, theta, theta_limits, stepsize, nsteps=10000):
'''
log_posterior: function of theta
theta_limits: uniform prior ranges
stepsize: scalar or vector proposal distribution width
nsteps: desired number of samples
'''
log_prob = log_posterior(theta, x, y, sigmay, theta_limits)
# Store Markov chain as an array of samples:
chain = np.empty((nsteps, len(theta)))
log_probs = np.empty(nsteps)
# Count our accepted proposals:
naccept = 0
for i in range(nsteps):
theta_new = theta + stepsize * np.random.randn(len(theta))
log_prob_new = log_posterior(theta_new, x, y, sigmay, theta_limits)
if np.log(np.random.rand()) < (log_prob_new - log_prob):
# accept, and move to the proposed position:
theta = theta_new
log_prob = log_prob_new
naccept += 1
else:
# reject, and store the same sample as before:
pass
chain[i] = theta
log_probs[i] = log_prob
acceptance_rate = naccept/float(nsteps)
return chain,log_probs,acceptance_rate
In [17]:
# Initialize m, b at the center of prior:
m = 0.5*(mlimits[0]+mlimits[1])
b = 0.5*(blimits[0]+blimits[1])
theta = np.array([m,b])
# Step sizes, 2% or 5% of the prior
mstep = 0.02*(mlimits[1]-mlimits[0])
bstep = 0.05*(blimits[1]-blimits[0])
stepsize = np.array([mstep,bstep])
# How many steps?
nsteps = 10000
print('Running Metropolis Sampler for', nsteps, 'steps...')
chain, log_probs, acceptance_rate = metropolis(
straight_line_log_posterior, theta, theta_limits, stepsize, nsteps=nsteps
)
print('Acceptance fraction:', acceptance_rate)
In [18]:
# Pull m and b arrays out of the Markov chain and plot them:
mm = [m for m,b in chain]
bb = [b for m,b in chain]
# Traces, for convergence inspection:
plt.figure(figsize=(8,5))
plt.subplot(2,1,1)
plt.plot(mm, 'k-')
plt.ylim(mlimits)
plt.ylabel('m')
plt.subplot(2,1,2)
plt.plot(bb, 'k-')
plt.ylabel('Intercept b')
plt.ylim(blimits)
Out[18]:
In [19]:
# First show contours from gridding calculation:
plt.contour(bgrid, mgrid, posterior, pdf_contour_levels(posterior), colors='k')
plt.gca().set_aspect((blimits[1]-blimits[0])/(mlimits[1]-mlimits[0]))
# Scatterplot of m,b posterior samples, overlaid:
plt.plot(bb, mm, 'b.', alpha=0.01)
plot_mb_setup(mlimits,blimits)
In [20]:
!pip install --upgrade --no-deps corner
In [23]:
import corner
corner.corner(chain[5000:], labels=['m','b'], range=[mlimits,blimits],quantiles=[0.16,0.5,0.84],
show_titles=True, title_args={"fontsize": 12},
plot_datapoints=True, fill_contours=True, levels=[0.68, 0.95],
color='b', bins=80, smooth=1.0);
In [24]:
# Generate a straight line model for each parameter combination, and plot:
X = np.linspace(xlimits[0],xlimits[1],50)
for i in (np.random.rand(100)*len(chain)).astype(int):
m,b = chain[i]
plt.plot(X, b+X*m, 'b-', alpha=0.1)
plot_line(m_ls, b_ls)
# Overlay the data, for comparison:
plot_yerr(x, y, sigmay);
${\rm Pr}(T(d^{\rm rep})>T(d)\,|\,d) = \int I(T(d^{\rm rep})>T(d))\,{\rm Pr}(d^{\rm rep}\,|\,\theta)\,{\rm Pr}(\theta\,|\,d)\;d\theta\,dd^{\rm rep}$
In [25]:
# Test statistics: functions of the data, not the parameters.
# 1) Reduced chisq for the best fit model:
# def test_statistic(x,y,sigmay,b_ls,m_ls):
# return np.sum((y - m_ls*x - b_ls)**2.0/sigmay**2.0)/(len(y)-2)
# 2) Reduced chisq for the best fit m=0 model:
# def test_statistic(x,y,sigmay,dummy1,dummy2):
# return np.sum((y - np.mean(y))**2.0/sigmay**2.0)/(len(y)-1)
# 3) Weighted mean y:
# def test_statistic(x,y,sigmay,dummy1,dummy2):
# return np.sum(y/sigmay**2.0)/np.sum(1.0/sigmay**2.0)
# 4) Variance of y:
# def test_statistic(x,y,sigmay,dummy1,dummy2):
# return np.var(y)
# 5) Correlation coefficient:
import scipy.stats
def test_statistic(x,y,dummy1,dummy2,dummy3):
'''
Pearson r correlation coefficient:
r12 = \sum [(xi - xbar)*(yi - ybar)] / [\sum (xi - xbar)^2 * \sum (yi - ybar)^2]^1/2
'''
r12 = np.sum((x - np.mean(x))*(y - np.mean(y))) / \
np.sqrt(np.sum((x - np.mean(x))**2) * np.sum((y - np.mean(y))**2))
return r12
# Approximate the posterior predictive distribution for T,
# by drawing a replica dataset for each sample (m,b) and computing its T:
T = np.zeros(len(chain))
for k,(m,b) in enumerate(chain):
yrep = b + m*x + np.random.randn(len(x)) * sigmay
T[k] = test_statistic(x,yrep,sigmay,b_ls,m_ls)
# Compare with the test statistic of the data, on a plot:
Td = test_statistic(x, y, sigmay, b_ls, m_ls)
plt.hist(T, 100, histtype='step', color='blue', lw=2, range=(0.0,np.percentile(T,99.0)))
plt.axvline(Td, color='black', linestyle='--', lw=2)
plt.xlabel('Test statistic')
plt.ylabel('Posterior predictive distribution')
# What is Pr(T>T(d)|d)?
greater = (T > Td)
P = 100*len(T[greater])/(1.0*len(T))
print("Pr(T>T(d)|d) = ",P,"%")
${\rm Pr}(T(d^{\rm rep},\theta)>T(d,\theta)\,|\,d) = \int I(T(d^{\rm rep},\theta)>T(d,\theta))\,{\rm Pr}(d^{\rm rep}\,|\,\theta)\,{\rm Pr}(\theta\,|\,d)\;d\theta\,dd^{\rm rep}$
In [26]:
# Discrepancy: functions of the data AND parameters.
# 1) Reduced chisq for the model:
def test_statistic(x,y,sigmay,b,m):
return np.sum((y - m*x - b)**2.0/sigmay**2.0)/(len(y)-2)
# Approximate the posterior predictive distribution for T,
# by drawing a replica dataset for each sample (m,b) and computing its T,
# AND ALSO its Td (which now depends on the parameters, too):
T = np.zeros(len(chain))
Td = np.zeros(len(chain))
for k,(m,b) in enumerate(chain):
yrep = b + m*x + np.random.randn(len(x)) * sigmay
T[k] = test_statistic(x,yrep,sigmay,b,m)
Td[k] = test_statistic(x,y,sigmay,b,m)
# Compare T with Td, on a scatter plot - how often is T>Td?
plt.scatter(Td, T, color='blue',alpha=0.1)
plt.plot([0.0, 100.0], [0.0, 100.], color='k', linestyle='--', linewidth=2)
plt.xlabel('Observed discrepancy $T(d,\\theta)$')
plt.ylabel('Replicated discrepancy $T(d^{\\rm rep},\\theta)$')
plt.ylim([0.0,np.percentile(Td,99.0)])
plt.xlim([0.0,np.percentile(Td,99.0)])
Out[26]:
In [27]:
# Histogram of differences:
diff = T-Td
plt.hist(diff, 100, histtype='step', color='blue', lw=2, range=(np.percentile(diff,1.0),np.percentile(diff,99.0)))
plt.axvline(0.0, color='black', linestyle='--', lw=2)
plt.xlabel('Difference $T(d^{\\rm rep},\\theta) - T(d,\\theta)$')
plt.ylabel('Posterior predictive distribution')
# What is Pr(T>T(d)|d)?
greater = (T > Td)
Pline = 100*len(T[greater])/(1.0*len(T))
print("Pr(T>T(d)|d) = ",Pline,"%")
$y = m x + b + q x^2$
In [28]:
def quadratic_log_likelihood(theta, x, y, sigmay):
'''
Returns the log-likelihood of drawing data values y at
known values x given Gaussian measurement noise with standard
deviation with known sigmay, where the "true" y values are
y_t = m*x + b + q**2
x: list of x coordinates
y: list of y coordinates
sigmay: list of y uncertainties
m: scalar slope
b: scalar line intercept
q: quadratic term coefficient
where theta = (m, b, q)
Returns: scalar log likelihood
'''
m, b, q = theta
return (np.sum(np.log(1./(np.sqrt(2.*np.pi) * sigmay))) +
np.sum(-0.5 * (y - (m*x + b + q*x**2))**2 / sigmay**2))
def quadratic_log_prior(theta, theta_limits):
m, b, q = theta
mlimits, blimits, qpars = theta_limits
# m and b:
log_mb_prior = straight_line_log_prior(np.array([m,b]), np.array([mlimits, blimits]))
# q:
log_q_prior = np.log(1./(np.sqrt(2.*np.pi) * qpars[1])) - \
0.5 * (q - qpars[0])**2 / qpars[1]**2
return log_mb_prior + log_q_prior
def quadratic_log_posterior(theta, x, y, sigmay, theta_limits):
return (quadratic_log_likelihood(theta, x, y, sigmay) +
quadratic_log_prior(theta, theta_limits))
In [29]:
# Define uniform prior limits, enforcing positivity in m and b:
mlimits = [0.0, 2.0]
blimits = [0.0, 200.0]
# Define Gaussian prior centered on zero for q:
qpars = [0.0,0.003]
# Initialize m, b at the center of prior:
m = 0.5*(mlimits[0]+mlimits[1])
b = 0.5*(blimits[0]+blimits[1])
q = qpars[0]
# Arrays to pass to the sampler:
qtheta = np.array([m,b,q])
qtheta_limits = (mlimits, blimits, qpars)
# Step sizes, small fractions of the prior width:
mstep = 0.02*(mlimits[1]-mlimits[0])
bstep = 0.05*(blimits[1]-blimits[0])
qstep = 0.02*qpars[1]
stepsize = np.array([mstep,bstep,qstep])
# How many steps?
nsteps = 10000
print('Running Metropolis Sampler for', nsteps, 'steps...')
qchain, log_probs, acceptance_rate = metropolis(
quadratic_log_posterior, qtheta, qtheta_limits, stepsize, nsteps=nsteps
)
print('Acceptance fraction:', acceptance_rate)
In [30]:
# Pull m, b and q arrays out of the Markov chain and plot them:
mm = [m for m,b,q in qchain]
bb = [b for m,b,q in qchain]
qq = [q for m,b,q in qchain]
# Traces, for convergence inspection:
plt.figure(figsize=(8,5))
plt.subplot(3,1,1)
plt.plot(mm, 'k-')
plt.ylim(mlimits)
plt.ylabel('Slope m')
plt.subplot(3,1,2)
plt.plot(bb, 'k-')
plt.ylim(blimits)
plt.ylabel('Intercept b')
plt.subplot(3,1,3)
plt.plot(qq, 'k-')
plt.ylim([qpars[0]-3*qpars[1],qpars[0]+3*qpars[1]])
plt.ylabel('Quadratic coefficient q')
Out[30]:
In [31]:
corner.corner(qchain, labels=['m','b','q'], range=[mlimits,blimits,(qpars[0]-3*qpars[1],qpars[0]+3*qpars[1])],quantiles=[0.16,0.5,0.84],
show_titles=True, title_args={"fontsize": 12},
plot_datapoints=True, fill_contours=True, levels=[0.68, 0.95],
color='green', bins=80, smooth=1.0);
In [32]:
# Posterior visual check, in data space:
X = np.linspace(xlimits[0],xlimits[1],100)
for i in (np.random.rand(100)*len(chain)).astype(int):
m,b,q = qchain[i]
plt.plot(X, b + X*m + q*X**2, 'g-', alpha=0.1)
plot_line(m_ls, b_ls);
plot_yerr(x, y, sigmay)
In [33]:
# Discrepancy: functions of the data AND parameters.
# 1) Reduced chisq for the model:
def test_statistic(x,y,sigmay,m,b,q):
return np.sum((y - m*x - b - q*x**2)**2.0/sigmay**2.0)/(len(y)-3)
# 2) Some other discrepancy measure:
# Approximate the posterior predictive distribution for T,
# by drawing a replica dataset for each sample (m,b) and computing its T,
# AND ALSO its Td:
T = np.zeros(len(qchain))
Td = np.zeros(len(qchain))
for k,(m,b,q) in enumerate(qchain):
yp = b + m*x + q*x**2 + sigmay*np.random.randn(len(x))
T[k] = test_statistic(x,yp,sigmay,m,b,q)
Td[k] = test_statistic(x,y,sigmay,m,b,q)
# Histogram of differences:
diff = T - Td
plt.hist(diff, 100, histtype='step', color='green', lw=2, range=(np.percentile(diff,1.0),np.percentile(diff,99.0)))
plt.axvline(0.0, color='black', linestyle='--', lw=2)
plt.xlabel('Difference $T(d^{\\rm rep},\\theta) - T(d,\\theta)$')
plt.ylabel('Posterior predictive distribution')
# What is Pr(T>T(d)|d)?
greater = (T > Td)
Pquad = 100*len(T[greater])/(1.0*len(T))
print("Pr(T>T(d)|d,quadratic) = ",Pquad,"%, cf. Pr(T>T(d)|d,straightline) = ",Pline,"%")
All of the above discussion was about model checking - assessment of the ability of a model to make accurate predictions.
Frequentist: assume the null hypothesis $H_0$, and calculate the probability of getting the test statistic (designed according to the alternative hypothesis $H_1$) larger than the observed one, by studying the distribution of a test statistic over a large ensemble of replica datasets (which under certain assumptions is a standard distribution).
Bayesian: assume the alternative hypothesis $H_1$, and calculate the posterior predictive probability distribution for the test statistic given the data, relative to the one test statistic we observed, marginalizing over the model parameters. In general this distribution will be non-standard, but we can often draw samples from the posterior PDF anyway.
$R = \frac{{\rm Pr}(d\,|\,H_1)}{{\rm Pr}(d\,|\,H_0)}$
${\rm Pr}(d\,|\,H) = \int\;{\rm Pr}(d\,|\,\theta,H)\;{\rm Pr}(\theta\,|\,H)\;d\theta$
The consequence of this is that the evidence depends on both model accuracy (goodness of fit, ability to predict data) and efficiency (not over-fitting by having implausible flexibility).
The following figure might help illustrate how the evidence depends on both goodness of fit (through the likelihood) and the complexity of the model (via the prior). In this 1D case, a Gaussian likelihood (red) is integrated over a uniform prior (blue): the evidence can be shown to be given by $E = f \times L_{\rm max}$, where $L_{\rm max}$ is the maximum possible likelihood, and $f$ is the fraction of the blue dashed area that is shaded red. $f$ is 0.31, 0.98, and 0.07 in each case.
In [34]:
from IPython.display import Image
Image('evidence.png')
Out[34]:
The illustration above shows us a few things:
1) The evidence can be made arbitrarily small by increasing the prior volume: the evidence is more conservative than focusing on the goodness of fit ($L_{\rm max}$) alone. Of course if you assign a prior you don't believe, then you should not expect to get out a meaningful answer for ${\rm Pr}(d\,|\,H)$.
2) The evidence is linearly sensitive to prior volume ($f$), but exponentially sensitive to goodness of fit ($L_{\rm max}$). It's still a likelihood, after all.
$\frac{{\rm Pr}(H_1|d)}{{\rm Pr}(H_0|d)} = \frac{{\rm Pr}(d|H_1)}{{\rm Pr}(d|H_0)} \; \frac{{\rm Pr}(H_1)}{{\rm Pr}(H_0)}$
Prior probabilities are very difficult to assign in most practical problems (notice that no theorist ever provides them). So, one way to interpret the evidence ratio is to note that:
A number of sampling algorithms have been developed that do calculate the evidence, during the process of sampling. These include:
${\rm Pr}(d\,|\,H) \approx \frac{1}{N_s} \sum_k\;{\rm Pr}(d\,|\,\theta_s,H)$
Simple Monte Carlo:
Consider the simplest possbile posterior inference, the mean of parameter $\theta$: $\bar{\theta} = \int\,\theta\,{\rm Pr}(\theta\,|\,d,H)\,d\theta$
This is approximated by an average over posterior samples:
$\bar{\theta} \approx \frac{1}{N_s}\,\sum_s\,\theta_s$
Now replace the posterior PDF with Bayes' Theorem:
$\bar{\theta} = \frac{1}{Z} \int\,\theta\,{\rm Pr}(d\,|\,\theta,H)\,{\rm Pr}(\theta\,|\,H)\,d\theta$
and the posterior samples with prior samples:
$\bar{\theta} \approx \frac{1}{(Z N_s)}\,\sum_s\,\theta_s\,{\rm Pr}(d\,|\,\theta_s,H)$
This is a weighted mean - if we don't want $Z$, we just define weights $w_s = {\rm Pr}(d\,|\,\theta_s,H)$ compute the approximate posterior mean as the likelihood-weighted prior mean:
$\bar{\theta} \approx \frac{\sum_s\,w_s\,\theta_s}{\sum_s\,w_s}$
To compute Z, we multiply both sides by $Z$ and replace $\theta$ with 1:
$Z \approx \frac{1}{N_s} \sum_s\,\,w_s$
$R = \frac{{\rm Pr}(d\,|\,{\rm quadratic})}{{\rm Pr}(d\,|\,{\rm straight line})}$
In [35]:
# Draw a large number of prior samples and calculate the log likelihood for each one:
N = 50000
# Set the priors:
mlimits = [0.0, 2.0]
blimits = [0.0, 200.0]
qpars = [0.0,0.003]
# Sample from the prior:
mm = np.random.uniform(mlimits[0],mlimits[1], size=N)
bb = np.random.uniform(blimits[0],blimits[1], size=N)
qq = qpars[0] + qpars[1]*np.random.randn(N)
# We'll store the posterior samples as a "chain" again
schain = []
log_likelihood_straight_line = np.zeros(N)
log_likelihood_quadratic = np.zeros(N)
for i in range(N):
theta = np.array([mm[i], bb[i]])
log_likelihood_straight_line[i] = straight_line_log_likelihood(theta, x, y, sigmay)
qtheta = np.array([mm[i], bb[i], qq[i]])
log_likelihood_quadratic[i] = quadratic_log_likelihood(qtheta, x, y, sigmay)
schain.append((mm[i],bb[i],qq[i]))
Now that we have the log likelihood for each sample, let's check that we did actually sample the posterior well. Here are the corner plots (note that for plotting, the weights don't need to be correctly normalized - and also that we do not want to plot the points as well as the contours, since the points are prior samples not posterior ones!):
In [36]:
# Unnormalized likelihoods for plotting:
unnormalized_likelihood_straight_line = np.exp(log_likelihood_straight_line - log_likelihood_straight_line.max())
unnormalized_likelihood_quadratic = np.exp(log_likelihood_quadratic - log_likelihood_quadratic.max())
In [37]:
corner.corner(schain, labels=['m','b','q'], range=[mlimits,blimits,(qpars[0]-3*qpars[1],qpars[0]+3*qpars[1])],quantiles=[0.16,0.5,0.84],
weights=unnormalized_likelihood_straight_line,
show_titles=True, title_args={"fontsize": 12},
plot_datapoints=False, fill_contours=True, levels=[0.68, 0.95],
color='blue', bins=80, smooth=1.0);