Simple Linear Regression

This notebook presents PyMC implementations of the Bayesian simple linear regression models described in Kruschke, Ch. 16. We'll start with a no-frills linear regression of a single metric response on a single metric predictor, then modify that for robustness, and finally add hierarchy to deal with repeated measures among multiple subjects.


In [225]:
# Set up the environment for successive computational cells. 
# This stuff will not be repeated below, be sure to run this cell first.

import pymc as mc
import numpy as np
import pandas as pd
from IPython.display import Image

class Struct:
    # convenience 'bag of attributes' class
    def __init__(self, **entries): self.__dict__.update(entries)

def show_graph(model):
    """display a graphical representation of the model"""
    graph = mc.graph.graph(model)
    Image(graph.create_png())

def posterior_hist(trace, label="", HDI=.95, alpha=1.0):   
    """Render a histogram of the posterior sampling distribution of a model variable."""
    t = sorted(trace)
    h = hist(t, normed=True, alpha=alpha, label="%s:%.2f" % (label, mean(t)))
    ymax = max(h[0])
    axvline(mean(t), c='red', linestyle='-.')
    hdidx = int(len(t)*(1 - HDI)/2.) 
    annotate("", xy=(t[hdidx],0), xytext=(t[-hdidx],0),
             arrowprops=dict(arrowstyle="|-|", connectionstyle="arc3", color='k',lw=2))
    
def ci_patch(x, ci, **kwds):
    """create a bounding polygon for the confidence intervals vs x"""
    x, ci = zip(*sorted(zip(x,ci))) # sort x and ci by x
    cit = zip(*ci) 
    diff = np.array(cit[1]) - np.array(cit[0])
    print "CI: min=%f, max=%f" % (min(diff), max(diff))
    return Polygon(zip(x,cit[0]) + zip(x, cit[1])[::-1], **kwds)

def plot_regression(x, y, mu, tau,  y_sim=None, titlestr="", HDI=.95, color='b'):
    """plot the regression results. X and Y are vectors of floats, 
       mu is a list of regression mean stochastics for y[i] on x[i],
       tau is the precision stochastic
       y_sim is a list of unobserved posterior Y stochastics, used for confidence intervals"""
    ax = gca()
    ax.scatter(x,y,color=color)
    if hasattr(x,'name'):
        xlabel(x.name)
    if hasattr(y,'name'):
        ylabel(y.name)
    if titlestr:
        title(titlestr)
    if hasattr(mu[0], 'trace'):
        # use the trace summary statistics for mean and slope
        means = [mui.stats()['mean'] for mui in mu]
        #slope = mu[0].parents['y'][1].stats()['mean']
    else:
        # use the current values for mean and slope
        means = [mui.value for mui in mu]
        #slope = mu[0].parents['y'][1].value
    sx, smeans = zip(*sorted(zip(x, means))) # sort x,mean pairs by x and return as lists
    ax.plot(sx, smeans, color=color)
    legend()
    # 95% HDI ci for the mean
    if hasattr(mu[0], 'trace'):
        ci_mu = [tuple(mui.stats()['95% HPD interval']) for mui in mu]
        ax.add_patch(ci_patch(x, ci_mu, color=color, alpha=.2))
        draw_if_interactive()
    if y_sim is not None and hasattr(y_sim[0], 'trace'):
        # 95% HDI for the predicted y's
        ci_y = [tuple(y_i.stats()['95% HPD interval']) for y_i in y_sim]
        ax.add_patch(ci_patch(x, ci_y, color='k', alpha=.2))
        draw_if_interactive()
    elif tau is not None:
        # 95% ci for predicted y's assuming normal centered at current mu
        ci_y = [(mui.value - 1.96/sqrt(tau.value), mui.value + 1.96/sqrt(tau.value)) for mui in mu] 
        ax.add_patch(ci_patch(x, ci_y, color='r', alpha=.2))
        draw_if_interactive()

The first data set is height and weight pairs randomly generated using Kruschke's HtWtDataGenerator.R and saved as htwt.csv for loading here:


In [ ]:
# Note: when reading, do not let column 0 be used as the index. 
# It starts at 1, and you will later be in a world of pain
# wondering whether you are referring to the item at position
# 1 or the item named 1 (which is in position 0). 
htwt = pd.read_csv("data/htwt.csv", index_col=None)
htwt.describe()

In [ ]:
scatter(htwt.height, htwt.weight);

We will fit a simple generalized linear model for weight (y) predicted by height (x):

$y_i \sim N(\mu_i, \tau)$

$\mu_i = \beta_0 + \beta_1x_i$

$\tau = 1/\sigma^2$


In [87]:
import pymc as pm

def make_normal_model(x, y):
    """
    return a dict of model variables for the model constructor
    """
    # priors for the regression coefficients and variance/precision.
    # initialize beta to suit the data, or the MAP initializer may not converge.
    beta = mc.Container((mc.Normal("beta_0", mu=mean(y), tau=1e-12, value=mean(y)), # population intercept
                         mc.Normal("beta_1", mu=0, tau=1e-12, value=0))) # population slope
    tau = mc.Gamma("tau", alpha=.001, beta=.001, value=.001)   # population precision

    # The distribution of each y_i has mean that is a deterministic function of the x_i
    mu = mc.Container([mc.LinearCombination("mu_%d" % i, (1, x[i]), beta)
                       for i in x.index])

    # likelihood for the data
    y_obs = mc.Container([mc.Normal("y_%d" % i, mu=mu[i], tau=tau, value=y[i], observed=True) 
                          for i in y.index])
    
    # posterior predictions, for generating confidence intervals
    y_sim = mc.Container([mc.Normal("y_%d" % i, mu=mu[i], tau=tau) for i in y.index])
    
    return mc.Model(input=locals())

M = make_normal_model(htwt.height, htwt.weight)
print M.beta[0].value, M.beta[1].value
figure()
plot_regression(htwt.height, htwt.weight, M.mu, M.tau, titlestr="pre MAP estimate")
show()
mc.MAP(M).fit()
plot_regression(htwt.height, htwt.weight, M.mu, M.tau, titlestr="initial MAP estimate")
show()
S = mc.MCMC(input=M, db='ram')


164.996666667 0.0
CI: min=123.961284, max=123.961284
CI: min=110.890607, max=110.890607

In [88]:
S.sample(10000, burn=1000, thin=10)


 [-----------------100%-----------------] 10000 of 10000 complete in 29.4 sec

On to the results:


In [89]:
plot_regression(htwt.height, htwt.weight, M.mu, M.tau, M.y_sim, titlestr="MC posterior estimate")
figure()
posterior_hist(M.beta[0].trace(), "beta0")
legend()
show()
posterior_hist(M.beta[1].trace(), "beta1")
legend()
show()


CI: min=27.815104, max=46.740338
CI: min=145.090182, max=157.002683

The blue area is the 95% confidence interval around the estimated mean for each y. The gray area is the 95% HDI of predicted y's drawn as samples from distributions of $\mu$ and $\tau$.

Centered Data

Although the point results above look reasonable, there is a lot of variation in the shapes of the posteriors over repreated runs. And occasionally the MAP estimate blows up and the sampling run never recovers. This is because credible values of $\beta_0$ and $\beta_1$ are highly negatively correlated (a higher y-intercept implies a lower slope so the line will go through the center of the data mass). The MCMC can have trouble traversing the sample space because it cannot change $\beta_0$ and $\beta_1$ independently without falling out of the credible region.

The standard cure for this is to first center the data:


In [96]:
h = pd.Series(htwt.height - htwt.height.mean(), name="height - %d" % int(htwt.height.mean()))
w = pd.Series(htwt.weight - htwt.weight.mean(), name="weight - %d" % int(htwt.weight.mean()))
M = make_normal_model(h,w)
mc.MAP(M).fit()
plot_regression(h, w, M.mu, M.tau, titlestr="initial MAP estimate")
show()


CI: min=116.726690, max=116.726690

In [97]:
S_c = mc.MCMC(input=M, db='ram')
S_c.sample(10000, burn=1000, thin=10)


 [-----------------100%-----------------] 10000 of 10000 complete in 28.3 sec

In [98]:
plot_regression(h, w, M.mu, M.tau, M.y_sim, titlestr="MC posterior estimate")
show()
posterior_hist(M.beta[0].trace(), "beta0")
legend()
show()
posterior_hist(M.beta[1].trace(), "beta1")
legend()
show()


CI: min=0.432128, max=59.366041
CI: min=140.785092, max=158.883314

The $\beta_1$ estimate is nearly the same as before. But this yields much tidier-looking $\beta$ posteriors, with a much tighter interval for $\beta_0$ around 0.

Robust Regression

The normal likelihood used for $\mu_i$ is sensitive to outliers. The example in section 16.2 demonstrates this with data for 25 brands of cigarettes (McIntyre1994), attempting to predict tar content from weight:


In [55]:
cig = pd.read_csv("data/McIntyre1994data.csv")
print cig


               Brand   Tar   Nic      Wt    CO
0             Alpine  14.1  0.86  0.9853  13.6
1    BensonAndHedges  16.0  1.06  1.0938  16.6
2         BullDurham  29.8  2.03  1.1650  23.5
3        CamelLights   8.0  0.67  0.9280  10.2
4            Carlton   4.1  0.40  0.9462   5.4
5       Chesterfield  15.0  1.04  0.8885  15.0
6       GoldenLights   8.8  0.76  1.0267   9.0
7               Kent  12.4  0.95  0.9225  12.3
8               Kool  16.6  1.12  0.9372  16.3
9              LandM  14.9  1.02  0.8858  15.4
10        LarkLights  13.7  1.01  0.9643  13.0
11          Marlboro  15.1  0.90  0.9316  14.4
12             Merit   7.8  0.57  0.9705  10.0
13       MultiFilter  11.4  0.78  1.1240  10.2
14     NewportLights   9.0  0.74  0.8517   9.5
15               Now   1.0  0.13  0.7851   1.5
16           OldGold  17.0  1.26  0.9186  18.5
17     PallMallLight  12.8  1.08  1.0395  12.6
18           Raleigh  15.8  0.96  0.9573  17.5
19        SalemUltra   4.5  0.42  0.9106   4.9
20          Tareyton  14.5  1.01  1.0070  15.9
21              True   7.3  0.61  0.9806   8.5
22  ViceroyRichLight   8.6  0.69  0.9693  10.6
23     VirginiaSlims  15.2  1.02  0.9496  13.9
24     WinstonLights  12.0  0.82  1.1184  14.9

In [102]:
#w = pd.Series(cig.Wt - cig.Wt.mean(), name="Weight - %d" % cig.Wt.mean())
#t = pd.Series(cig.Tar - cig.Tar.mean(), name="Tar - %d" % cig.Tar.mean())
w = cig.Wt
t = cig.Tar
figure()
M = make_normal_model(w, t)
mc.MAP(M).fit()
plot_regression(w, t, M.mu, M.tau, titlestr="initial MAP estimate")
show()


CI: min=54.381268, max=54.381268

In [105]:
S = mc.MCMC(input=M, db='ram')
S.sample(50000, burn=1000, thin=10)


 [-----------------100%-----------------] 50000 of 50000 complete in 128.7 sec

In [106]:
plot_regression(w, t, M.mu, M.tau, M.y_sim, titlestr="Normal regression with outlier")
show()
posterior_hist(M.beta[0].trace(), "beta0")
legend()
show()
posterior_hist(M.beta[1].trace(), "beta1")
legend()
show()


CI: min=4.116669, max=9.185046
CI: min=20.270202, max=22.487224

The point at the upper right exerts enough influence on the slope (via the Normal likelihood) that the value 0 is not in the credible HDI. Kruschke suggests replacing the Normal likelihood with a t-distributed likelihood, whose fatter tails accommodate observations outside the normal distribution. The degrees-of-freedom parameter will be treated as a continuous adjustment of the fatness of the tails, and will be estimated along with the other model parameters, biased towards smaller values:


In [107]:
def make_t_model(x, y):
    """
    return a dict of model variables for the model constructor
    """
    # priors for the regression coefficients and variance/precision
    beta = mc.Container((mc.Normal("beta_0", mu=0, tau=1e-12, value=mean(y)), # population intercept
                         mc.Normal("beta_1", mu=0, tau=1e-12, value=0))) # population slope
    tau = mc.Gamma("tau", alpha=.001, beta=.001, value=.001)   # population precision
    sigma = mc.Lambda("sigma", lambda tau=tau: tau**-0.5)
    nuGain = 1
    udf = mc.Uniform("udf", 0, 1.0, value=0)
    nu = mc.Lambda("nu", lambda udf=udf: 1 - nuGain * np.log(1 - udf)) # nu range [1,infty)

    # The distribution of each y_i has mean that is a deterministic function of the x_i
    mu = mc.Container([mc.LinearCombination("mu_%d" % i, (1, x[i]), beta)
                       for i in x.index])

    # likelihood for the data
    z_like = mc.Container([mc.NoncentralT("z_like_%d" % i, mu=mu[i], lam=sigma, nu=nu, value=y[i], observed=True)
                          for i in y.index])
    
    # simulate posterior predictions for confidence intervals
    z_sim = mc.Container([mc.NoncentralT("z_like_%d" % i, mu=mu[i], lam=sigma, nu=nu) for i in y.index])  
    return mc.Model(input=locals())

In [108]:
M = make_t_model(cig.Wt, cig.Tar)
plot_regression(cig.Wt, cig.Tar, M.mu, M.tau, titlestr="pre MAP estimate")
show()
mc.MAP(M).fit()
plot_regression(cig.Wt, cig.Tar, M.mu, M.tau, titlestr="MAP estimate")
show()


CI: min=123.961284, max=123.961284
CI: min=127.413709, max=127.413709

In [110]:
S = mc.MCMC(input=M, db='ram')
S.sample(50000, burn=1000, thin=10)


 [-----------------100%-----------------] 50000 of 50000 complete in 214.8 sec

In [111]:
plot_regression(cig.Wt, cig.Tar, M.mu, M.tau, M.z_sim, titlestr="Noncentral T regression with outlier") 
show()
posterior_hist(M.beta[0].trace(), "beta0")
legend()
show()
posterior_hist(M.beta[1].trace(), "beta1")
legend()
show()
posterior_hist(M.nu.trace(), "nu")
legend()
show()


CI: min=4.279330, max=11.527998
CI: min=27.605732, max=32.712127

Repeated Measures

This final example uses the lung toxin clearance dataset from H.A.Feldman. Repeated measurements per subject are modeled via a hierarchic prior on the regression coefficients, and outliers are accommodated via a t-distribution for the likelihoods. There's a small bit of data cleaning to do on the Feldman table, to extract group1 data and exclude missing data . Let's wrangle this with a Pandas Dataframe.


In [250]:
feldman = pd.read_csv("data/feldman.csv", index_col=0)
group1 = feldman.ix[feldman.group == 1]
group1 = group1.ix[group1.retention.notnull()]
group1.subjID -= 1 # life will be simpler if we can index 0-based on subjID
group1.index = range(len(group1.index)) # get a contiguous index
group1[:15]


Out[250]:
group subjID time retention
0 1 0 0 100
1 1 0 3 102
2 1 0 4 87
3 1 0 10 89
4 1 0 15 57
5 1 0 20 48
6 1 0 25 43
7 1 0 30 33
8 1 1 0 100
9 1 1 3 95
10 1 1 4 91
11 1 1 10 62
12 1 1 15 52
13 1 1 20 55
14 1 1 25 45

In [289]:
def make_subj_model(df, subj, x, y):
    """
    return a dict of model variables for the model constructor
    df: dataframe of observations, each with associated subject id
    subj: name of subject id column in df
    x: name of predictor column
    y: name of observed column
    """
    # population regression parameters beta_0, beta_1
    beta_pop_mu = mc.Normal("pop_mu", mu=0, tau=1e-12, value=(0,0), size=2)
    beta_pop_tau = mc.Gamma("pop_tau", alpha=.001, beta=.001, value=(.001,.001), size=2)

    # subject regression betas are normally distributed around population betas
    nsubj = len(df[subj].unique())
    beta_subj = mc.Container([
     mc.Normal("beta_0", mu=beta_pop_mu[0], tau=beta_pop_tau[0], value=np.zeros(nsubj)*.01, size=nsubj),
     mc.Normal("beta_1", mu=beta_pop_mu[1], tau=beta_pop_tau[1], value=np.zeros(nsubj)*.01, size=nsubj)])

    # The distribution of each y_i has mean that is a deterministic function of the x_i
    # and the subject regression parameters
    mu = mc.Container(
        [mc.LinearCombination("mu_%d" % idx, (1, row[x]), (beta_subj[0][row[subj]], 
          beta_subj[1][row[subj]])) for (idx,row) in df.iterrows()])

    # each subject has own constant variance prior
    m = mc.Gamma("m", alpha=1, beta=.1, value=1)
    d = mc.Gamma("d", alpha=1, beta=.1, value=1)
    s = mc.Lambda("s", lambda m=m, d=d: m**2 / d**2)
    r = mc.Lambda("r", lambda m=m, d=d: m / d**2)
    tau_subj = mc.Gamma("tau_subj", alpha=s, beta=r, value=np.ones(nsubj), size=nsubj)
    
    # likelihood for the data, normally distributed as mu_subj_i, tau_subj
    y_obs = mc.Container(
        [mc.Normal("y_obs_%d" % idx, mu=mu[idx], tau=tau_subj[row[subj]], 
         value=row[y], observed=True) for (idx,row) in df.iterrows()])
    
    # posterior predictions, for generating confidence intervals
    y_sim = mc.Container(
       [mc.Normal("y_sim_%d" % idx, mu=mu[idx], tau=tau_subj[row[subj]]) 
	    for (idx,row) in df.iterrows()])
    
    return mc.Model(input=locals())

def plot_group_regressions(group, title):
    colors = ['b', 'k', 'g', 'r']
    for id in group.subjID.unique():
        idxs = (group.subjID == id) # boolean vector selects subsets of group rows 
        plot_regression(group.time[idxs], group.retention[idxs], list(group.mu[idxs]), group.tau[idxs].iloc[0], 
                        y_sim=list(group.y_sim[idxs]), titlestr=title, color=colors[id])
    legend()

In [253]:
M = make_subj_model(group1, 'subjID', 'time', 'retention')

In [278]:
group1['mu'] = list(M.mu) # cast container to list
group1['tau'] = [M.tau_subj[row['subjID']] for (idx, row) in group1.iterrows()]
group1['y_sim'] = list(M.y_sim) # cast container to list
print group1


    group  subjID  time  retention     mu          tau     y_sim
0       1       0     0        100   mu_0  tau_subj[0]   y_sim_0
1       1       0     3        102   mu_1  tau_subj[0]   y_sim_1
2       1       0     4         87   mu_2  tau_subj[0]   y_sim_2
3       1       0    10         89   mu_3  tau_subj[0]   y_sim_3
4       1       0    15         57   mu_4  tau_subj[0]   y_sim_4
5       1       0    20         48   mu_5  tau_subj[0]   y_sim_5
6       1       0    25         43   mu_6  tau_subj[0]   y_sim_6
7       1       0    30         33   mu_7  tau_subj[0]   y_sim_7
8       1       1     0        100   mu_8  tau_subj[1]   y_sim_8
9       1       1     3         95   mu_9  tau_subj[1]   y_sim_9
10      1       1     4         91  mu_10  tau_subj[1]  y_sim_10
11      1       1    10         62  mu_11  tau_subj[1]  y_sim_11
12      1       1    15         52  mu_12  tau_subj[1]  y_sim_12
13      1       1    20         55  mu_13  tau_subj[1]  y_sim_13
14      1       1    25         45  mu_14  tau_subj[1]  y_sim_14
15      1       1    30         42  mu_15  tau_subj[1]  y_sim_15
16      1       2     0        100  mu_16  tau_subj[2]  y_sim_16
17      1       2     3        112  mu_17  tau_subj[2]  y_sim_17
18      1       2     4         72  mu_18  tau_subj[2]  y_sim_18
19      1       2    10         63  mu_19  tau_subj[2]  y_sim_19
20      1       2    15         49  mu_20  tau_subj[2]  y_sim_20
21      1       2    20         45  mu_21  tau_subj[2]  y_sim_21
22      1       2    25         29  mu_22  tau_subj[2]  y_sim_22
23      1       2    30         36  mu_23  tau_subj[2]  y_sim_23
24      1       3     0        100  mu_24  tau_subj[3]  y_sim_24
25      1       3     3        110  mu_25  tau_subj[3]  y_sim_25
26      1       3     4        129  mu_26  tau_subj[3]  y_sim_26
27      1       3    10         71  mu_27  tau_subj[3]  y_sim_27
28      1       3    15         57  mu_28  tau_subj[3]  y_sim_28
29      1       3    25         29  mu_29  tau_subj[3]  y_sim_29
30      1       3    30         32  mu_30  tau_subj[3]  y_sim_30

In [270]:
mc.MAP(M).fit()

In [290]:
plot_group_regressions(group1, "MAP estimate")
show()


CI: min=201.634374, max=201.634374
CI: min=207.652105, max=207.652105
CI: min=253.353066, max=253.353066
CI: min=240.729917, max=240.729917

In [293]:
S = mc.MCMC(input=M, db='ram')
S.sample(50000, burn=2000, thin=10)


 [-----------------100%-----------------] 50000 of 50000 complete in 307.6 sec

In [294]:
plot_group_regressions(group1, "Subject regressions")
show()


CI: min=17.199729, max=59.267751
CI: min=49.853338, max=72.021280
CI: min=20.221401, max=69.481680
CI: min=66.263943, max=93.399557
CI: min=20.185319, max=47.970193
CI: min=71.930313, max=83.799921
CI: min=24.730844, max=59.288972
CI: min=95.238683, max=107.042296