Chapter 10 - Model Comparison and Hierarchical Modeling


In [1]:
import pandas as pd
import numpy as np
import pymc3 as pm
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

from theano.tensor import eq
from IPython.display import Image
from matplotlib import gridspec

%matplotlib inline
plt.style.use('seaborn-white')

color = '#87ceeb'

In [2]:
%load_ext watermark
%watermark -p pandas,numpy,pymc3,matplotlib,seaborn,theano


pandas 0.23.4
numpy 1.15.0
pymc3 3.5
matplotlib 2.2.3
seaborn 0.9.0
theano 1.0.2

10.3.2 - Hierarchical MCMC computation of relative model probability

Model (Kruschke, 2015)


In [3]:
Image('images/fig10_2.png', width=300)


Out[3]:

Model 1 - One theta variable

Coin is flipped nine times, resulting in six heads.
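The beta priors below are parameterized through their mode $\omega$ and concentration $\kappa$, following Kruschke's reparameterization of the shape parameters:

$$a = \omega(\kappa - 2) + 1, \qquad b = (1 - \omega)(\kappa - 2) + 1$$

For example, $\omega = .75$ and $\kappa = 12$ give $a = .75 \cdot 10 + 1 = 8.5$ and $b = .25 \cdot 10 + 1 = 3.5$, i.e. a $\mathrm{Beta}(8.5, 3.5)$ prior with mode $(a-1)/(a+b-2) = .75$.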


In [4]:
with pm.Model() as hierarchical_model:
    # Model index, with prior probability .5 for each of the two models
    m = pm.Categorical('m', np.asarray([.5, .5]))

    # Concentration of the beta prior on theta
    kappa = 12

    # The mode of the beta prior depends on the model index
    omega = pm.math.switch(eq(m, 0), .25, .75)

    # Beta prior expressed through mode omega and concentration kappa
    theta = pm.Beta('theta', omega*(kappa-2)+1, (1-omega)*(kappa-2)+1)

    # Nine coin flips, six heads
    y = pm.Bernoulli('y', theta, observed=[1,1,1,1,1,1,0,0,0])

pm.model_to_graphviz(hierarchical_model)


Out[4]:
[Model graph: m ~ Categorical → theta ~ Beta → y ~ Bernoulli, plate over 9 observations]

In [5]:
with hierarchical_model:
    trace = pm.sample(5000, cores=4, nuts_kwargs={'target_accept': 0.95})


Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>BinaryGibbsMetropolis: [m]
>NUTS: [theta]
Sampling 4 chains: 100%|██████████| 22000/22000 [00:05<00:00, 4231.26draws/s]
The number of effective samples is smaller than 25% for some parameters.

In [6]:
pm.traceplot(trace);



In [7]:
trace_df = (pm.trace_to_dataframe(trace)
            .set_index('m'))
trace_df.head()


Out[7]:
theta
m
1 0.725338
1 0.601793
1 0.568132
1 0.709717
1 0.629714

Figure 10.4 (lower frame)

Note that the models are indexed starting at 0 rather than 1 as in Kruschke (2015), so the posterior mean of the parameter m is shifted down by 1.
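The posterior model probabilities can also be read directly off the chain for m. A minimal sketch, assuming the trace object sampled above:

# m is coded 0/1, so the chain's mean estimates p(m=1 | D)
p_m1 = trace['m'].mean()
print('p(m=0 | D) = {:.3f}'.format(1 - p_m1))
print('p(m=1 | D) = {:.3f}'.format(p_m1))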


In [8]:
fig = plt.figure(figsize=(10,5))

font_d = {'size':16}

# Define gridspec
gs = gridspec.GridSpec(2, 4)
ax1 = plt.subplot(gs[0:,1])
ax2 = plt.subplot(gs[0,2:])
ax3 = plt.subplot(gs[1,2:])

# Distplot m
pm.plot_posterior(np.asarray(trace_df.index), ax=ax1, color=color)
ax1.set_xlabel('m')
ax1.set_title('Model Index')

# Distplot theta for m=0 and m=1 
for model, ax in zip((0,1), (ax2, ax3)):
    pm.plot_posterior(trace_df.loc[model].values.ravel(), point_estimate='mode', ax=ax, color=color)
    ax.set_title(r'$m = {}$'.format(model), fontdict=font_d)    
    ax.set(xlim=(0,1), xlabel=r'$\theta$')

fig.suptitle('Posterior distribution', size=16, y=1.05)
    
fig.tight_layout(w_pad=2);


Model 2 - Two theta variables without pseudo priors

Coin is flipped nine times, resulting in six heads. Each theta is informed by the data only while its model is selected by m; the mixing problem this causes is examined after the trace plot below.


In [9]:
with pm.Model() as hierarchical_model2:
    m = pm.Categorical('m', np.asarray([.5, .5]))
    
    omega_0 = .25
    kappa_0 = 12
    theta_0 = pm.Beta('theta_0', omega_0*(kappa_0-2)+1, (1-omega_0)*(kappa_0-2)+1)
    
    omega_1 = .75
    kappa_1 = 12
    theta_1 = pm.Beta('theta_1', omega_1*(kappa_1-2)+1, (1-omega_1)*(kappa_1-2)+1)
    
    theta = pm.math.switch(eq(m, 0), theta_0, theta_1)
    
    y2 = pm.Bernoulli('y2', theta, observed=[1,1,1,1,1,1,0,0,0])  # six heads in nine flips

pm.model_to_graphviz(hierarchical_model2)


Out[9]:
[Model graph: m ~ Categorical, theta_0 ~ Beta, and theta_1 ~ Beta all feed into y2 ~ Bernoulli]

In [10]:
with hierarchical_model2:
    trace2 = pm.sample(5000, cores=4, nuts_kwargs={'target_accept': 0.95})


Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>BinaryGibbsMetropolis: [m]
>NUTS: [theta_1, theta_0]
Sampling 4 chains: 100%|██████████| 22000/22000 [00:06<00:00, 3157.75draws/s]

In [11]:
pm.traceplot(trace2);
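The trace plot hints at the problem with this model: whenever m selects one model, the theta of the other model is sampled from its prior alone, so the chain for m tends to get stuck. A quick check of mixing, as a sketch assuming trace2 from above (pm.effective_n is the PyMC3 3.x effective-sample-size diagnostic):

# A low effective sample size for m, relative to the 20,000 draws, signals poor mixing
print(pm.effective_n(trace2))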


Model 3 - Two theta variables with pseudo priors set equal to the true priors

Coin is flipped 30 times, resulting in 17 heads. When a model is not selected by m, its theta is sampled from a "pseudo prior" instead of the true prior; setting the pseudo priors equal to the true priors, as here, reproduces the behavior of Model 2.


In [12]:
with pm.Model() as hierarchical_model3:
    m = pm.Categorical('m', np.asarray([.5, .5]))
    
    # Theta0
    kappa_0_true_p = 20
    kappa_0_pseudo_p = 20
    kappa_0 = pm.math.switch(eq(m, 0), kappa_0_true_p, kappa_0_pseudo_p)
    omega_0_true_p = .10
    omega_0_pseudo_p = .10
    omega_0 = pm.math.switch(eq(m, 0), omega_0_true_p, omega_0_pseudo_p)
    theta_0 = pm.Beta('theta_0', omega_0*(kappa_0-2)+1, (1-omega_0)*(kappa_0-2)+1)
    
    # Theta1    
    kappa_1_true_p = 20
    kappa_1_pseudo_p = 20 
    kappa_1 = pm.math.switch(eq(m, 1), kappa_1_true_p, kappa_1_pseudo_p)
    omega_1_true_p = .90
    omega_1_pseudo_p = .90
    omega_1 = pm.math.switch(eq(m, 1), omega_1_true_p, omega_1_pseudo_p)
    theta_1 = pm.Beta('theta_1', omega_1*(kappa_1-2)+1, (1-omega_1)*(kappa_1-2)+1)
    
    theta = pm.math.switch(eq(m, 0), theta_0, theta_1)
    
    y3 = pm.Bernoulli('y3', theta, observed=np.r_[17*[1], 13*[0]])

pm.model_to_graphviz(hierarchical_model3)


Out[12]:
[Model graph: m ~ Categorical feeds theta_0 ~ Beta, theta_1 ~ Beta, and y3 ~ Bernoulli; theta_0 and theta_1 also feed y3, plate over 30 observations]

In [13]:
with hierarchical_model3:
    trace3 = pm.sample(5000, cores=4, nuts_kwargs={'target_accept': 0.95})


Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>BinaryGibbsMetropolis: [m]
>NUTS: [theta_1, theta_0]
Sampling 4 chains: 100%|██████████| 22000/22000 [00:07<00:00, 2851.08draws/s]
The number of effective samples is smaller than 10% for some parameters.

In [14]:
pm.traceplot(trace3);



In [15]:
trace3_df = (pm.trace_to_dataframe(trace3)
             .set_index('m')[['theta_0', 'theta_1']])
trace3_df.head()


Out[15]:
theta_0 theta_1
m
0 0.506201 0.716542
0 0.369498 0.696581
1 0.385291 0.680642
1 0.165792 0.605134
1 0.049298 0.725140

Figure 10.5 (lower part)


In [16]:
fig = plt.figure(figsize=(10,8))

# Define gridspec
gs = gridspec.GridSpec(3,2)
ax1 = plt.subplot(gs[0,:])
ax2 = plt.subplot(gs[1,0])
ax3 = plt.subplot(gs[1,1])
ax4 = plt.subplot(gs[2,0])
ax5 = plt.subplot(gs[2,1])

pm.plot_posterior(np.asarray(trace3_df.index), ax=ax1, color=color)
ax1.set_xlabel('m')
ax1.set_title('Model Index')

for model, theta, ax in zip((0,0,1,1), (0,1,0,1), (ax2, ax3, ax4, ax5)):
    pm.plot_posterior(trace3_df.loc[model, 'theta_{}'.format(theta)].values, ax=ax, color=color)
    ax.set_title(r'$m = {}$'.format(model), fontdict=font_d)
    ax.set_xlim(0,1)
    ax.set_xlabel(r'$\theta_{}$'.format(theta), fontdict=font_d)
    
fig.tight_layout();


Model 4 - Two theta variables with pseudo priors that mimic the posteriors

Coin is flipped 30 times, resulting in 17 heads. The pseudo priors (omega = .40, kappa = 50 for theta_0; omega = .70, kappa = 50 for theta_1) roughly mimic each parameter's marginal posterior, so the currently unselected theta stays in a region of high posterior density and the chain for m mixes much better.


In [17]:
with pm.Model() as hierarchical_model4:
    m = pm.Categorical('m', np.asarray([.5, .5]))
    
    # Theta0
    kappa_0_true_p = 20
    kappa_0_pseudo_p = 50
    kappa_0 = pm.math.switch(eq(m, 0), kappa_0_true_p, kappa_0_pseudo_p)
    omega_0_true_p = .10
    omega_0_pseudo_p = .40
    omega_0 = pm.math.switch(eq(m, 0), omega_0_true_p, omega_0_pseudo_p)
    theta_0 = pm.Beta('theta_0', omega_0*(kappa_0-2)+1, (1-omega_0)*(kappa_0-2)+1)
    
    # Theta1    
    kappa_1_true_p = 20
    kappa_1_pseudo_p = 50 
    kappa_1 = pm.math.switch(eq(m, 1), kappa_1_true_p, kappa_1_pseudo_p)
    omega_1_true_p = .90
    omega_1_pseudo_p = .70
    omega_1 = pm.math.switch(eq(m, 1), omega_1_true_p, omega_1_pseudo_p)
    theta_1 = pm.Beta('theta_1', omega_1*(kappa_1-2)+1, (1-omega_1)*(kappa_1-2)+1)
    
    theta = pm.math.switch(eq(m, 0), theta_0, theta_1)
    
    y4 = pm.Bernoulli('y4', theta, observed=np.r_[17*[1], 13*[0]])

pm.model_to_graphviz(hierarchical_model4)


Out[17]:
[Model graph: m ~ Categorical feeds theta_0 ~ Beta, theta_1 ~ Beta, and y4 ~ Bernoulli; theta_0 and theta_1 also feed y4, plate over 30 observations]

In [18]:
with hierarchical_model4:
    trace4 = pm.sample(5000, cores=4, nuts_kwargs={'target_accept': 0.95})


Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>BinaryGibbsMetropolis: [m]
>NUTS: [theta_1, theta_0]
Sampling 4 chains: 100%|██████████| 22000/22000 [00:07<00:00, 3139.99draws/s]

In [19]:
pm.traceplot(trace4);
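Comparing the effective sample size of m between Models 3 and 4 quantifies how much the posterior-mimicking pseudo priors improve mixing; a sketch, assuming trace3 and trace4 from above:

# pm.effective_n returns a dict keyed by variable name
print('Model 3, ESS of m: {:.0f}'.format(pm.effective_n(trace3)['m']))
print('Model 4, ESS of m: {:.0f}'.format(pm.effective_n(trace4)['m']))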



In [20]:
trace4_df = (pm.trace_to_dataframe(trace4)
            .set_index('m')[['theta_0', 'theta_1']])
trace4_df.head()


Out[20]:
theta_0 theta_1
m
1 0.426805 0.674804
1 0.357321 0.741863
1 0.348810 0.750423
1 0.424246 0.856375
1 0.537908 0.678404

Figure 10.6 (lower part)


In [21]:
fig = plt.figure(figsize=(10,8))

# Define gridspec
gs = gridspec.GridSpec(3,2)
ax1 = plt.subplot(gs[0,:])
ax2 = plt.subplot(gs[1,0])
ax3 = plt.subplot(gs[1,1])
ax4 = plt.subplot(gs[2,0])
ax5 = plt.subplot(gs[2,1])

pm.plot_posterior(np.asarray(trace4_df.index), ax=ax1, color=color)
ax1.set_xlabel('m')
ax1.set_title('Model Index')

for model, theta, ax in zip((0,0,1,1), (0,1,0,1), (ax2, ax3, ax4, ax5)):
    pm.plot_posterior(trace4_df.loc[model, 'theta_{}'.format(theta)].values, ax=ax, color=color)
    ax.set_title(r'$m = {}$'.format(model), fontdict=font_d)
    ax.set_xlim(0,1)
    ax.set_xlabel(r'$\theta_{}$'.format(theta), fontdict=font_d)
    
fig.tight_layout();
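Since the two models have equal prior probability, the posterior odds equal the Bayes factor, which can therefore be estimated directly from the chain for m. A minimal sketch, assuming trace4 from above:

p_m1 = (trace4['m'] == 1).mean()   # p(m=1 | D)
bf_10 = p_m1 / (1 - p_m1)          # Bayes factor of model 1 over model 0
print('p(m=1 | D) = {:.3f}, BF_10 = {:.1f}'.format(p_m1, bf_10))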