Chapter 10 - Model Comparison and Hierarchical Modeling


In [1]:
import pandas as pd
import numpy as np
import pymc3 as pm
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

from theano.tensor import eq
from IPython.display import Image
from matplotlib import gridspec

%matplotlib inline
plt.style.use('seaborn-white')

color = '#87ceeb'

In [2]:
%load_ext watermark
%watermark -p pandas,numpy,pymc3,matplotlib,seaborn,theano


pandas 0.23.4
numpy 1.15.0
pymc3 3.5
matplotlib 2.2.3
seaborn 0.9.0
theano 1.0.2

10.3.2 - Hierarchical MCMC computation of relative model probability

Model (Kruschke, 2015)


In [3]:
Image('images/fig10_2.png', width=300)


Out[3]:

Model 1 - One theta variable

Coin is flipped nine times, resulting in six heads.
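The beta priors below are parameterized through their mode $\omega$ and concentration $\kappa$, following Kruschke's reparameterization of the shape parameters:

$$a = \omega(\kappa - 2) + 1, \qquad b = (1 - \omega)(\kappa - 2) + 1$$

For example, $\omega = .75$ and $\kappa = 12$ give $a = .75 \cdot 10 + 1 = 8.5$ and $b = .25 \cdot 10 + 1 = 3.5$, i.e. a $\mathrm{Beta}(8.5, 3.5)$ prior with mode $(a-1)/(a+b-2) = .75$.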


In [4]:
with pm.Model() as hierarchical_model:
    # Model index, with prior probability .5 for each of the two models
    m = pm.Categorical('m', np.asarray([.5, .5]))

    # Concentration of the beta prior on theta
    kappa = 12

    # The mode of the beta prior depends on the model index
    omega = pm.math.switch(eq(m, 0), .25, .75)

    # Beta prior expressed through mode omega and concentration kappa
    theta = pm.Beta('theta', omega*(kappa-2)+1, (1-omega)*(kappa-2)+1)

    # Nine coin flips, six heads
    y = pm.Bernoulli('y', theta, observed=[1,1,1,1,1,1,0,0,0])

pm.model_to_graphviz(hierarchical_model)


Out[4]:
[Model graph: m ~ Categorical → theta ~ Beta → y ~ Bernoulli, plate over 9 observations]

In [5]:
with hierarchical_model:
    trace = pm.sample(5000, cores=4, nuts_kwargs={'target_accept': 0.95})


Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>BinaryGibbsMetropolis: [m]
>NUTS: [theta]
Sampling 4 chains: 100%|██████████| 22000/22000 [00:05<00:00, 4231.26draws/s]
The number of effective samples is smaller than 25% for some parameters.

In [6]:
pm.traceplot(trace);



In [7]:
trace_df = (pm.trace_to_dataframe(trace)
            .set_index('m'))
trace_df.head()


Out[7]:
theta
m
1 0.725338
1 0.601793
1 0.568132
1 0.709717
1 0.629714

Figure 10.4 (lower frame)

Note that the models are indexed starting at 0 rather than 1 as in Kruschke (2015), so the posterior mean of the parameter m is shifted down by 1.
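The posterior model probabilities can also be read directly off the chain for m. A minimal sketch, assuming the trace object sampled above:

# m is coded 0/1, so the chain's mean estimates p(m=1 | D)
p_m1 = trace['m'].mean()
print('p(m=0 | D) = {:.3f}'.format(1 - p_m1))
print('p(m=1 | D) = {:.3f}'.format(p_m1))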


In [8]:
fig = plt.figure(figsize=(10,5))

font_d = {'size':16}

# Define gridspec
gs = gridspec.GridSpec(2, 4)
ax1 = plt.subplot(gs[0:,1])
ax2 = plt.subplot(gs[0,2:])
ax3 = plt.subplot(gs[1,2:])

# Distplot m
pm.plot_posterior(np.asarray(trace_df.index), ax=ax1, color=color)
ax1.set_xlabel('m')
ax1.set_title('Model Index')

# Distplot theta for m=0 and m=1 
for model, ax in zip((0,1), (ax2, ax3)):
    pm.plot_posterior(trace_df.loc[model].values.ravel(), point_estimate='mode', ax=ax, color=color)
    ax.set_title(r'$m = {}$'.format(model), fontdict=font_d)    
    ax.set(xlim=(0,1), xlabel=r'$\theta$')

fig.suptitle('Posterior distribution', size=16, y=1.05)
    
fig.tight_layout(w_pad=2);


Model 2 - Two theta variables without pseudo priors

Coin is flipped nine times, resulting in six heads. Each theta is informed by the data only while its model is selected by m; the mixing problem this causes is examined after the trace plot below.


In [9]:
with pm.Model() as hierarchical_model2:
    m = pm.Categorical('m', np.asarray([.5, .5]))
    
    omega_0 = .25
    kappa_0 = 12
    theta_0 = pm.Beta('theta_0', omega_0*(kappa_0-2)+1, (1-omega_0)*(kappa_0-2)+1)
    
    omega_1 = .75
    kappa_1 = 12
    theta_1 = pm.Beta('theta_1', omega_1*(kappa_1-2)+1, (1-omega_1)*(kappa_1-2)+1)
    
    theta = pm.math.switch(eq(m, 0), theta_0, theta_1)
    
    y2 = pm.Bernoulli('y2', theta, observed=[1,1,1,1,1,1,0,0,0])  # six heads in nine flips

pm.model_to_graphviz(hierarchical_model2)


Out[9]:
[Model graph: m ~ Categorical, theta_0 ~ Beta, and theta_1 ~ Beta all feed into y2 ~ Bernoulli]

In [10]:
with hierarchical_model2:
    trace2 = pm.sample(5000, cores=4, nuts_kwargs={'target_accept': 0.95})


Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>BinaryGibbsMetropolis: [m]
>NUTS: [theta_1, theta_0]
Sampling 4 chains: 100%|██████████| 22000/22000 [00:06<00:00, 3157.75draws/s]

In [11]:
pm.traceplot(trace2);
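The trace plot hints at the problem with this model: whenever m selects one model, the theta of the other model is sampled from its prior alone, so the chain for m tends to get stuck. A quick check of mixing, as a sketch assuming trace2 from above (pm.effective_n is the PyMC3 3.x effective-sample-size diagnostic):

# A low effective sample size for m, relative to the 20,000 draws, signals poor mixing
print(pm.effective_n(trace2))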


Model 3 - Two theta variables with pseudo priors set equal to the true priors

Coin is flipped 30 times, resulting in 17 heads. When a model is not selected by m, its theta is sampled from a "pseudo prior" instead of the true prior; setting the pseudo priors equal to the true priors, as here, reproduces the behavior of Model 2.


In [12]:
with pm.Model() as hierarchical_model3:
    m = pm.Categorical('m', np.asarray([.5, .5]))
    
    # Theta0
    kappa_0_true_p = 20
    kappa_0_pseudo_p = 20
    kappa_0 = pm.math.switch(eq(m, 0), kappa_0_true_p, kappa_0_pseudo_p)
    omega_0_true_p = .10
    omega_0_pseudo_p = .10
    omega_0 = pm.math.switch(eq(m, 0), omega_0_true_p, omega_0_pseudo_p)
    theta_0 = pm.Beta('theta_0', omega_0*(kappa_0-2)+1, (1-omega_0)*(kappa_0-2)+1)
    
    # Theta1    
    kappa_1_true_p = 20
    kappa_1_pseudo_p = 20 
    kappa_1 = pm.math.switch(eq(m, 1), kappa_1_true_p, kappa_1_pseudo_p)
    omega_1_true_p = .90
    omega_1_pseudo_p = .90
    omega_1 = pm.math.switch(eq(m, 1), omega_1_true_p, omega_1_pseudo_p)
    theta_1 = pm.Beta('theta_1', omega_1*(kappa_1-2)+1, (1-omega_1)*(kappa_1-2)+1)
    
    theta = pm.math.switch(eq(m, 0), theta_0, theta_1)
    
    y3 = pm.Bernoulli('y3', theta, observed=np.r_[17*[1], 13*[0]])

pm.model_to_graphviz(hierarchical_model3)


Out[12]:
[Model graph: m ~ Categorical feeds theta_0 ~ Beta, theta_1 ~ Beta, and y3 ~ Bernoulli; theta_0 and theta_1 also feed y3, plate over 30 observations]

In [13]:
with hierarchical_model3:
    trace3 = pm.sample(5000, cores=4, nuts_kwargs={'target_accept': 0.95})


Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>BinaryGibbsMetropolis: [m]
>NUTS: [theta_1, theta_0]
Sampling 4 chains: 100%|██████████| 22000/22000 [00:07<00:00, 2851.08draws/s]
The number of effective samples is smaller than 10% for some parameters.

In [14]:
pm.traceplot(trace3);



In [15]:
trace3_df = (pm.trace_to_dataframe(trace3)
             .set_index('m')[['theta_0', 'theta_1']])
trace3_df.head()


Out[15]:
theta_0 theta_1
m
0 0.506201 0.716542
0 0.369498 0.696581
1 0.385291 0.680642
1 0.165792 0.605134
1 0.049298 0.725140

Figure 10.5 (lower part)


In [16]:
fig = plt.figure(figsize=(10,8))

# Define gridspec
gs = gridspec.GridSpec(3,2)
ax1 = plt.subplot(gs[0,:])
ax2 = plt.subplot(gs[1,0])
ax3 = plt.subplot(gs[1,1])
ax4 = plt.subplot(gs[2,0])
ax5 = plt.subplot(gs[2,1])

pm.plot_posterior(np.asarray(trace3_df.index), ax=ax1, color=color)
ax1.set_xlabel('m')
ax1.set_title('Model Index')

for model, theta, ax in zip((0,0,1,1), (0,1,0,1), (ax2, ax3, ax4, ax5)):
    pm.plot_posterior(trace3_df.loc[model, 'theta_{}'.format(theta)].values, ax=ax, color=color)
    ax.set_title(r'$m = {}$'.format(model), fontdict=font_d)
    ax.set_xlim(0,1)
    ax.set_xlabel(r'$\theta_{}$'.format(theta), fontdict=font_d)
    
fig.tight_layout();


Model 4 - Two theta variables with pseudo priors that mimic the posteriors

Coin is flipped 30 times, resulting in 17 heads. The pseudo priors (omega = .40, kappa = 50 for theta_0; omega = .70, kappa = 50 for theta_1) roughly mimic each parameter's marginal posterior, so the currently unselected theta stays in a region of high posterior density and the chain for m mixes much better.


In [17]:
with pm.Model() as hierarchical_model4:
    m = pm.Categorical('m', np.asarray([.5, .5]))
    
    # Theta0
    kappa_0_true_p = 20
    kappa_0_pseudo_p = 50
    kappa_0 = pm.math.switch(eq(m, 0), kappa_0_true_p, kappa_0_pseudo_p)
    omega_0_true_p = .10
    omega_0_pseudo_p = .40
    omega_0 = pm.math.switch(eq(m, 0), omega_0_true_p, omega_0_pseudo_p)
    theta_0 = pm.Beta('theta_0', omega_0*(kappa_0-2)+1, (1-omega_0)*(kappa_0-2)+1)
    
    # Theta1    
    kappa_1_true_p = 20
    kappa_1_pseudo_p = 50 
    kappa_1 = pm.math.switch(eq(m, 1), kappa_1_true_p, kappa_1_pseudo_p)
    omega_1_true_p = .90
    omega_1_pseudo_p = .70
    omega_1 = pm.math.switch(eq(m, 1), omega_1_true_p, omega_1_pseudo_p)
    theta_1 = pm.Beta('theta_1', omega_1*(kappa_1-2)+1, (1-omega_1)*(kappa_1-2)+1)
    
    theta = pm.math.switch(eq(m, 0), theta_0, theta_1)
    
    y4 = pm.Bernoulli('y4', theta, observed=np.r_[17*[1], 13*[0]])

pm.model_to_graphviz(hierarchical_model4)


Out[17]:
[Model graph: m ~ Categorical feeds theta_0 ~ Beta, theta_1 ~ Beta, and y4 ~ Bernoulli; theta_0 and theta_1 also feed y4, plate over 30 observations]

In [18]:
with hierarchical_model4:
    trace4 = pm.sample(5000, cores=4, nuts_kwargs={'target_accept': 0.95})


Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>BinaryGibbsMetropolis: [m]
>NUTS: [theta_1, theta_0]
Sampling 4 chains: 100%|██████████| 22000/22000 [00:07<00:00, 3139.99draws/s]

In [19]:
pm.traceplot(trace4);
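Comparing the effective sample size of m between Models 3 and 4 quantifies how much the posterior-mimicking pseudo priors improve mixing; a sketch, assuming trace3 and trace4 from above:

# pm.effective_n returns a dict keyed by variable name
print('Model 3, ESS of m: {:.0f}'.format(pm.effective_n(trace3)['m']))
print('Model 4, ESS of m: {:.0f}'.format(pm.effective_n(trace4)['m']))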



In [20]:
trace4_df = (pm.trace_to_dataframe(trace4)
            .set_index('m')[['theta_0', 'theta_1']])
trace4_df.head()


Out[20]:
theta_0 theta_1
m
1 0.426805 0.674804
1 0.357321 0.741863
1 0.348810 0.750423
1 0.424246 0.856375
1 0.537908 0.678404

Figure 10.6 (lower part)


In [21]:
fig = plt.figure(figsize=(10,8))

# Define gridspec
gs = gridspec.GridSpec(3,2)
ax1 = plt.subplot(gs[0,:])
ax2 = plt.subplot(gs[1,0])
ax3 = plt.subplot(gs[1,1])
ax4 = plt.subplot(gs[2,0])
ax5 = plt.subplot(gs[2,1])

pm.plot_posterior(np.asarray(trace4_df.index), ax=ax1, color=color)
ax1.set_xlabel('m')
ax1.set_title('Model Index')

for model, theta, ax in zip((0,0,1,1), (0,1,0,1), (ax2, ax3, ax4, ax5)):
    pm.plot_posterior(trace4_df.loc[model, 'theta_{}'.format(theta)].values, ax=ax, color=color)
    ax.set_title(r'$m = {}$'.format(model), fontdict=font_d)
    ax.set_xlim(0,1)
    ax.set_xlabel(r'$\theta_{}$'.format(theta), fontdict=font_d)
    
fig.tight_layout();
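Since the two models have equal prior probability, the posterior odds equal the Bayes factor, which can therefore be estimated directly from the chain for m. A minimal sketch, assuming trace4 from above:

p_m1 = (trace4['m'] == 1).mean()   # p(m=1 | D)
bf_10 = p_m1 / (1 - p_m1)          # Bayes factor of model 1 over model 0
print('p(m=1 | D) = {:.3f}, BF_10 = {:.1f}'.format(p_m1, bf_10))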