This notebook presents PyMC implementations of the Bayesian simple linear regression models described in Kruschke, Ch. 16. We'll start with a no-frills linear regression of a single metric response on a single metric predictor, then modify that for robustness, and finally add hierarchy to deal with repeated measures among multiple subjects.
In [225]:
# Set up the environment for successive computational cells.
# This stuff will not be repeated below, be sure to run this cell first.
# NOTE: the plotting helpers below assume pylab mode (hist, scatter, gca, Polygon, etc.).
%pylab inline
import pymc as mc
import numpy as np
import pandas as pd
from IPython.display import Image


class Struct:
    # convenience 'bag of attributes' class
    def __init__(self, **entries):
        self.__dict__.update(entries)


def show_graph(model):
    """display a graphical representation of the model"""
    graph = mc.graph.graph(model)
    return Image(graph.create_png())


def posterior_hist(trace, label="", HDI=.95, alpha=1.0):
    """Render a histogram of the posterior sampling distribution of a model variable."""
    t = sorted(trace)
    hist(t, normed=True, alpha=alpha, label="%s:%.2f" % (label, mean(t)))
    axvline(mean(t), c='red', linestyle='-.')
    # mark the central HDI-wide interval with a horizontal bar along the x axis
    hdidx = int(len(t)*(1 - HDI)/2.)
    annotate("", xy=(t[hdidx], 0), xytext=(t[-hdidx], 0),
             arrowprops=dict(arrowstyle="|-|", connectionstyle="arc3", color='k', lw=2))


def ci_patch(x, ci, **kwds):
    """create a bounding polygon for the confidence intervals vs x"""
    x, ci = zip(*sorted(zip(x, ci)))  # sort x and ci by x
    cit = zip(*ci)                    # transpose to (lower bounds, upper bounds)
    diff = np.array(cit[1]) - np.array(cit[0])
    print "CI: min=%f, max=%f" % (min(diff), max(diff))
    return Polygon(zip(x, cit[0]) + zip(x, cit[1])[::-1], **kwds)


def plot_regression(x, y, mu, tau, y_sim=None, titlestr="", HDI=.95, color='b'):
    """plot the regression results. x and y are vectors of floats,
    mu is a list of regression mean stochastics for y[i] on x[i],
    tau is the precision stochastic,
    y_sim is a list of unobserved posterior y stochastics, used for confidence intervals"""
    ax = gca()
    ax.scatter(x, y, color=color)
    if hasattr(x, 'name'):
        xlabel(x.name)
    if hasattr(y, 'name'):
        ylabel(y.name)
    if titlestr:
        title(titlestr)
    if hasattr(mu[0], 'trace'):
        # use the trace summary statistics for the regression means
        means = [mui.stats()['mean'] for mui in mu]
    else:
        # use the current values for the regression means
        means = [mui.value for mui in mu]
    sx, smeans = zip(*sorted(zip(x, means)))  # sort x, mean pairs by x and return as lists
    ax.plot(sx, smeans, color=color)
    legend()
    if hasattr(mu[0], 'trace'):
        # 95% HDI band for the mean
        ci_mu = [tuple(mui.stats()['95% HPD interval']) for mui in mu]
        ax.add_patch(ci_patch(x, ci_mu, color=color, alpha=.2))
        draw_if_interactive()
    if y_sim is not None and hasattr(y_sim[0], 'trace'):
        # 95% HDI band for the predicted y's
        ci_y = [tuple(y_i.stats()['95% HPD interval']) for y_i in y_sim]
        ax.add_patch(ci_patch(x, ci_y, color='k', alpha=.2))
        draw_if_interactive()
    elif tau is not None:
        # 95% band for predicted y's assuming a normal centered at the current mu
        ci_y = [(mui.value - 1.96/sqrt(tau.value), mui.value + 1.96/sqrt(tau.value)) for mui in mu]
        ax.add_patch(ci_patch(x, ci_y, color='r', alpha=.2))
        draw_if_interactive()
The first data set is height and weight pairs randomly generated using Kruschke's HtWtDataGenerator.R and saved as htwt.csv for loading here:
In [ ]:
# Note: when reading, do not let column 0 be used as the index.
# It starts at 1, and you will later be in a world of pain
# wondering whether you are referring to the item at position
# 1 or the item named 1 (which is in position 0).
htwt = pd.read_csv("data/htwt.csv", index_col=None)
htwt.describe()
In [ ]:
scatter(htwt.height, htwt.weight);
We will fit a simple linear model for weight (y) predicted by height (x), written with the precision $\tau$ as the second parameter of the normal, matching PyMC's parameterization:
$y_i \sim N(\mu_i, \tau)$
$\mu_i = \beta_0 + \beta_1x_i$
$\tau = 1/\sigma^2$
In [87]:
def make_normal_model(x, y):
    """
    build a normal linear regression model of y on x and return it as a pymc Model
    """
    # priors for the regression coefficients and variance/precision.
    # initialize beta to suit the data, or the MAP initializer may not converge.
    beta = mc.Container((mc.Normal("beta_0", mu=mean(y), tau=1e-12, value=mean(y)),  # population intercept
                         mc.Normal("beta_1", mu=0, tau=1e-12, value=0)))             # population slope
    tau = mc.Gamma("tau", alpha=.001, beta=.001, value=.001)  # population precision
    # the distribution of each y_i has a mean that is a deterministic function of x_i
    mu = mc.Container([mc.LinearCombination("mu_%d" % i, (1, x[i]), beta)
                       for i in x.index])
    # likelihood for the data
    y_obs = mc.Container([mc.Normal("y_%d" % i, mu=mu[i], tau=tau, value=y[i], observed=True)
                          for i in y.index])
    # posterior predictions, for generating credible intervals
    y_sim = mc.Container([mc.Normal("y_sim_%d" % i, mu=mu[i], tau=tau) for i in y.index])
    return mc.Model(input=locals())

M = make_normal_model(htwt.height, htwt.weight)
print M.beta[0].value, M.beta[1].value
figure()
plot_regression(htwt.height, htwt.weight, M.mu, M.tau, titlestr="pre MAP estimate")
show()
mc.MAP(M).fit()
plot_regression(htwt.height, htwt.weight, M.mu, M.tau, titlestr="initial MAP estimate")
show()
S = mc.MCMC(input=M, db='ram')
In [88]:
S.sample(10000, burn=1000, thin=10)
On to the results:
In [89]:
plot_regression(htwt.height, htwt.weight, M.mu, M.tau, M.y_sim, titlestr="MC posterior estimate")
figure()
posterior_hist(M.beta[0].trace(), "beta0")
legend()
show()
posterior_hist(M.beta[1].trace(), "beta1")
legend()
show()
The blue band is the 95% HPD (highest posterior density) interval around the estimated mean for each y. The gray band is the 95% HPD interval of predicted y's, simulated from the posterior distributions of $\mu$ and $\tau$.
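For a numerical summary to go with the plots, each traced node's stats() method (already used inside plot_regression) reports, among other things, the posterior mean and 95% HPD interval; for example, for the slope:
In [ ]:
# numerical summary of the slope posterior
print M.beta[1].stats()['mean'], M.beta[1].stats()['95% HPD interval']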
Although the point estimates above look reasonable, there is a lot of variation in the shapes of the posteriors over repeated runs, and occasionally the MAP estimate blows up and the sampling run never recovers. This is because credible values of $\beta_0$ and $\beta_1$ are highly negatively correlated (a higher y-intercept implies a lower slope, so that the line still passes through the center of the data mass). The MCMC sampler has trouble traversing the sample space because it cannot change $\beta_0$ and $\beta_1$ independently without falling out of the credible region.
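The correlation is easy to see by plotting the sampled $(\beta_0, \beta_1)$ pairs against each other; a quick diagnostic, assuming the traces from the run above are still in scope:
In [ ]:
# joint posterior samples of intercept and slope; the strong negative
# correlation is what hampers the sampler
b0, b1 = M.beta[0].trace(), M.beta[1].trace()
scatter(b0, b1, alpha=.3)
xlabel("beta_0"); ylabel("beta_1")
print "correlation: %.2f" % np.corrcoef(b0, b1)[0, 1]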
The standard cure for this is to first center the data:
In [96]:
h = pd.Series(htwt.height - htwt.height.mean(), name="height - %d" % int(htwt.height.mean()))
w = pd.Series(htwt.weight - htwt.weight.mean(), name="weight - %d" % int(htwt.weight.mean()))
M = make_normal_model(h,w)
mc.MAP(M).fit()
plot_regression(h, w, M.mu, M.tau, titlestr="initial MAP estimate")
show()
In [97]:
S_c = mc.MCMC(input=M, db='ram')
S_c.sample(10000, burn=1000, thin=10)
In [98]:
plot_regression(h, w, M.mu, M.tau, M.y_sim, titlestr="MC posterior estimate")
show()
posterior_hist(M.beta[0].trace(), "beta0")
legend()
show()
posterior_hist(M.beta[1].trace(), "beta1")
legend()
show()
The $\beta_1$ estimate is nearly the same as before. But this yields much tidier-looking $\beta$ posteriors, with a much tighter interval for $\beta_0$ around 0.
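Centering leaves the slope unchanged, so the original-scale intercept can be recovered by shifting back by the sample means; a small sketch using the posterior means from the centered fit:
In [ ]:
# original-scale fit from the centered fit:
#   weight = (mean_weight + beta0_centered - beta1*mean_height) + beta1 * height
b0_c = M.beta[0].stats()['mean']
b1 = M.beta[1].stats()['mean']
b0_orig = htwt.weight.mean() + b0_c - b1 * htwt.height.mean()
print "original-scale intercept %.2f, slope %.2f" % (b0_orig, b1)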
The normal likelihood for the $y_i$ is sensitive to outliers. The example in section 16.2 demonstrates this with data for 25 brands of cigarettes (McIntyre, 1994), attempting to predict tar content from weight:
In [55]:
cig = pd.read_csv("data/McIntyre1994data.csv")
print cig
In [102]:
#w = pd.Series(cig.Wt - cig.Wt.mean(), name="Weight - %d" % cig.Wt.mean())
#t = pd.Series(cig.Tar - cig.Tar.mean(), name="Tar - %d" % cig.Tar.mean())
w = cig.Wt
t = cig.Tar
figure()
M = make_normal_model(w, t)
mc.MAP(M).fit()
plot_regression(w, t, M.mu, M.tau, titlestr="initial MAP estimate")
show()
In [105]:
S = mc.MCMC(input=M, db='ram')
S.sample(50000, burn=1000, thin=10)
In [106]:
plot_regression(w, t, M.mu, M.tau, M.y_sim, titlestr="Normal regression with outlier")
show()
posterior_hist(M.beta[0].trace(), "beta0")
legend()
show()
posterior_hist(M.beta[1].trace(), "beta1")
legend()
show()
The point at the upper right exerts enough influence on the slope (via the normal likelihood) that the value 0 is not in the 95% HDI. Kruschke suggests replacing the normal likelihood with a t-distributed likelihood, whose fatter tails accommodate observations far from the regression line. The degrees-of-freedom parameter $\nu$ is treated as a continuous adjustment of the fatness of the tails and is estimated along with the other model parameters, under a prior biased towards smaller values:
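In the model below, $\nu$ is obtained by transforming a Uniform(0,1) variable as $\nu = 1 - \log(1 - u)$, i.e. a shifted exponential on $[1, \infty)$ that favors small (heavy-tailed) values. A quick sketch of that implied prior, sampled directly:
In [ ]:
# the implied prior on nu is 1 + Exponential(1): biased towards small values
u = np.random.uniform(size=10000)
hist(1 - np.log(1 - u), bins=50, normed=True);
xlabel("nu")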
In [107]:
def make_t_model(x, y):
    """
    build a robust linear regression model of y on x, with a t-distributed
    likelihood, and return it as a pymc Model
    """
    # priors for the regression coefficients and variance/precision
    beta = mc.Container((mc.Normal("beta_0", mu=0, tau=1e-12, value=mean(y)),  # population intercept
                         mc.Normal("beta_1", mu=0, tau=1e-12, value=0)))       # population slope
    tau = mc.Gamma("tau", alpha=.001, beta=.001, value=.001)  # population precision
    sigma = mc.Lambda("sigma", lambda tau=tau: tau**-0.5)
    # degrees of freedom: a shifted-exponential prior on [1, infty),
    # built by transforming a bounded uniform variable
    nuGain = 1
    udf = mc.Uniform("udf", 0, 1.0, value=0)
    nu = mc.Lambda("nu", lambda udf=udf: 1 - nuGain * np.log(1 - udf))  # nu range [1, infty)
    # the distribution of each y_i has a mean that is a deterministic function of x_i
    mu = mc.Container([mc.LinearCombination("mu_%d" % i, (1, x[i]), beta)
                       for i in x.index])
    # likelihood for the data
    z_like = mc.Container([mc.NoncentralT("z_like_%d" % i, mu=mu[i], lam=sigma, nu=nu, value=y[i], observed=True)
                           for i in y.index])
    # simulate posterior predictions for credible intervals
    z_sim = mc.Container([mc.NoncentralT("z_sim_%d" % i, mu=mu[i], lam=sigma, nu=nu) for i in y.index])
    return mc.Model(input=locals())
In [108]:
M = make_t_model(cig.Wt, cig.Tar)
plot_regression(cig.Wt, cig.Tar, M.mu, M.tau, titlestr="pre MAP estimate")
show()
mc.MAP(M).fit()
plot_regression(cig.Wt, cig.Tar, M.mu, M.tau, titlestr="MAP estimate")
show()
In [110]:
S = mc.MCMC(input=M, db='ram')
S.sample(50000, burn=1000, thin=10)
In [111]:
plot_regression(cig.Wt, cig.Tar, M.mu, M.tau, M.z_sim, titlestr="Noncentral T regression with outlier")
show()
posterior_hist(M.beta[0].trace(), "beta0")
legend()
show()
posterior_hist(M.beta[1].trace(), "beta1")
legend()
show()
posterior_hist(M.nu.trace(), "nu")
legend()
show()
This final example uses the lung toxin clearance dataset from H. A. Feldman. Repeated measurements per subject are modeled via a hierarchical prior on the regression coefficients; in this implementation each subject's observations get a normal likelihood with its own precision (the t likelihood from the previous model could be substituted to accommodate outliers). There's a small bit of data cleaning to do on the Feldman table, to extract the group 1 data and exclude missing values. Let's wrangle this with a pandas DataFrame.
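In the notation of the first model, the hierarchy implemented below can be read off the code as (with $s$ indexing subjects):
$y_{s,i} \sim N(\mu_{s,i}, \tau_s)$
$\mu_{s,i} = \beta_{0,s} + \beta_{1,s} x_{s,i}$
$\beta_{0,s} \sim N(\mu_{\beta_0}, \tau_{\beta_0}), \quad \beta_{1,s} \sim N(\mu_{\beta_1}, \tau_{\beta_1})$
$\tau_s \sim \mathrm{Gamma}(m^2/d^2,\ m/d^2)$
with vague hyperpriors on the population-level means and precisions, and with the Gamma prior on the per-subject precisions parameterized by its mean $m$ and standard deviation $d$.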
In [250]:
feldman = pd.read_csv("data/feldman.csv", index_col=0)
group1 = feldman.ix[feldman.group == 1]
group1 = group1.ix[group1.retention.notnull()]
group1.subjID -= 1 # life will be simpler if we can index 0-based on subjID
group1.index = range(len(group1.index)) # get a contiguous index
group1[:15]
In [289]:
def make_subj_model(df, subj, x, y):
    """
    build a hierarchical linear regression model and return it as a pymc Model
    df: dataframe of observations, each with an associated subject id
    subj: name of the subject id column in df
    x: name of the predictor column
    y: name of the observed column
    """
    # population regression parameters beta_0, beta_1
    beta_pop_mu = mc.Normal("pop_mu", mu=0, tau=1e-12, value=(0, 0), size=2)
    beta_pop_tau = mc.Gamma("pop_tau", alpha=.001, beta=.001, value=(.001, .001), size=2)
    # subject regression betas are normally distributed around the population betas
    nsubj = len(df[subj].unique())
    beta_subj = mc.Container([
        mc.Normal("beta_0", mu=beta_pop_mu[0], tau=beta_pop_tau[0], value=np.zeros(nsubj), size=nsubj),
        mc.Normal("beta_1", mu=beta_pop_mu[1], tau=beta_pop_tau[1], value=np.zeros(nsubj), size=nsubj)])
    # the distribution of each y_i has a mean that is a deterministic function of x_i
    # and that subject's regression parameters
    mu = mc.Container(
        [mc.LinearCombination("mu_%d" % idx, (1, row[x]), (beta_subj[0][row[subj]],
                                                           beta_subj[1][row[subj]]))
         for (idx, row) in df.iterrows()])
    # each subject has its own constant-variance prior; the Gamma on the precisions
    # is parameterized by its mean m and standard deviation d (shape s = m^2/d^2, rate r = m/d^2)
    m = mc.Gamma("m", alpha=1, beta=.1, value=1)
    d = mc.Gamma("d", alpha=1, beta=.1, value=1)
    s = mc.Lambda("s", lambda m=m, d=d: m**2 / d**2)
    r = mc.Lambda("r", lambda m=m, d=d: m / d**2)
    tau_subj = mc.Gamma("tau_subj", alpha=s, beta=r, value=np.ones(nsubj), size=nsubj)
    # likelihood for the data, normally distributed around mu_i with the subject's precision
    y_obs = mc.Container(
        [mc.Normal("y_obs_%d" % idx, mu=mu[idx], tau=tau_subj[row[subj]],
                   value=row[y], observed=True) for (idx, row) in df.iterrows()])
    # posterior predictions, for generating credible intervals
    y_sim = mc.Container(
        [mc.Normal("y_sim_%d" % idx, mu=mu[idx], tau=tau_subj[row[subj]])
         for (idx, row) in df.iterrows()])
    return mc.Model(input=locals())

def plot_group_regressions(group, title):
    colors = ['b', 'k', 'g', 'r']
    for id in group.subjID.unique():
        idxs = (group.subjID == id)  # boolean vector selects this subject's rows
        plot_regression(group.time[idxs], group.retention[idxs], list(group.mu[idxs]), group.tau[idxs].iloc[0],
                        y_sim=list(group.y_sim[idxs]), titlestr=title, color=colors[id])
    legend()
In [253]:
M = make_subj_model(group1, 'subjID', 'time', 'retention')
In [278]:
group1['mu'] = list(M.mu) # cast container to list
group1['tau'] = [M.tau_subj[row['subjID']] for (idx, row) in group1.iterrows()]
group1['y_sim'] = list(M.y_sim) # cast container to list
print group1
In [270]:
mc.MAP(M).fit()
In [290]:
plot_group_regressions(group1, "MAP estimate")
show()
In [293]:
S = mc.MCMC(input=M, db='ram')
S.sample(50000, burn=2000, thin=10)
In [294]:
plot_group_regressions(group1, "Subject regressions")
show()
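The population-level parameters are also worth inspecting; a quick sketch of the posterior of the population slope, assuming the model attributes keep their local names (as with M.tau_subj above):
In [ ]:
# posterior of the population-level slope (second component of pop_mu)
posterior_hist(M.beta_pop_mu.trace()[:, 1], "pop slope")
legend()
show()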