In this notebook, we discuss Bayesian statistics from a computational/ practical point of view. We borrow some code examples from chapter 2 of Bayesian Analysis with Python (2nd end) by Osvaldo Martin.
In [1]:
%matplotlib inline
import sklearn
import scipy.stats as stats
import scipy.optimize
import matplotlib.pyplot as plt
import seaborn as sns
import time
import numpy as np
import os
import pandas as pd
In [2]:
# We install various packages for approximate Bayesian inference
# To avoid installing packages the internet every time you open a colab,
# you can use this trick:
# https://stackoverflow.com/questions/55253498/how-do-i-install-a-library-permanently-in-colab
# The PyMC3 package (https://docs.pymc.io) supports HMC and variational inference
# https://docs.pymc.io/notebooks/api_quickstart.html
!pip install pymc3==3.8
import pymc3 as pm
pm.__version__
# The arviz package (https://github.com/arviz-devs/arviz) can be used to make various plots
# of posterior samples generated by any algorithm.
!pip install arviz
import arviz as az
In [0]:
# Plot the Binomial likelihood
n_params = [1, 2, 4] # Number of trials
p_params = [0.25, 0.5, 0.75] # Probability of success
x = np.arange(0, max(n_params)+1)
f,ax = plt.subplots(len(n_params), len(p_params), sharex=True, sharey=True,
figsize=(8, 7), constrained_layout=True)
for i in range(len(n_params)):
for j in range(len(p_params)):
n = n_params[i]
p = p_params[j]
y = stats.binom(n=n, p=p).pmf(x)
ax[i,j].vlines(x, 0, y, colors='C0', lw=5)
ax[i,j].set_ylim(0, 1)
ax[i,j].plot(0, 0, label="N = {:3.2f}\nθ = {:3.2f}".format(n,p), alpha=0)
ax[i,j].legend()
ax[2,1].set_xlabel('y')
ax[1,0].set_ylabel('p(y | θ, N)')
ax[0,0].set_xticks(x)
In [0]:
# Plot the beta prior
params = [0.5, 1, 2, 3]
x = np.linspace(0, 1, 100)
f, ax = plt.subplots(len(params), len(params), sharex=True, sharey=True,
figsize=(8, 7), constrained_layout=True)
for i in range(4):
for j in range(4):
a = params[i]
b = params[j]
y = stats.beta(a, b).pdf(x)
ax[i,j].plot(x, y)
ax[i,j].plot(0, 0, label="α = {:2.1f}\nβ = {:2.1f}".format(a, b), alpha=0)
ax[i,j].legend()
ax[1,0].set_yticks([])
ax[1,0].set_xticks([0, 0.5, 1])
f.text(0.5, 0.05, 'θ', ha='center')
f.text(0.07, 0.5, 'p(θ)', va='center', rotation=0)
In [0]:
# Compute and plot posterior (black vertical line = true parameter value)
plt.figure(figsize=(10, 8))
n_trials = [0, 1, 2, 3, 4, 8, 16, 32, 50, 150]
data = [0, 1, 1, 1, 1, 4, 6, 9, 13, 48]
theta_real = 0.35
beta_params = [(1, 1), (20, 20), (1, 4)]
dist = stats.beta
x = np.linspace(0, 1, 200)
for idx, N in enumerate(n_trials):
if idx == 0:
plt.subplot(4, 3, 2)
plt.xlabel('θ')
else:
plt.subplot(4, 3, idx+3)
plt.xticks([])
y = data[idx]
for (a_prior, b_prior) in beta_params:
p_theta_given_y = dist.pdf(x, a_prior + y, b_prior + N - y)
plt.fill_between(x, 0, p_theta_given_y, alpha=0.7)
plt.axvline(theta_real, ymax=0.3, color='k')
plt.plot(0, 0, label=f'{N:4d} trials\n{y:4d} heads', alpha=0)
plt.xlim(0, 1)
plt.ylim(0, 12)
plt.legend()
plt.yticks([])
plt.tight_layout()
In [3]:
# We illustrate how to compute a 95% posterior credible interval for a random variable
# with a beta distribution.
from scipy.stats import beta
np.random.seed(42)
theta_real = 0.35
ntrials = 100
data = stats.bernoulli.rvs(p=theta_real, size=ntrials)
N = ntrials; N1 = sum(data); N0 = N-N1; # Sufficient statistics
aprior = 1; bprior = 1; # prior
apost = aprior + N1; bpost = bprior + N0 # posterior
# Interval function
alpha = 0.05
CI1 = beta.interval(1-alpha, apost, bpost)
print('{:0.2f}--{:0.2f}'.format(CI1[0], CI1[1])) # (0.06:0.52)
# Use the inverse CDF (percent point function)
l = beta.ppf(alpha/2, apost, bpost)
u = beta.ppf(1-alpha/2, apost, bpost)
CI2 = (l,u)
print('{:0.2f}--{:0.2f}'.format(CI2[0], CI2[1])) # (0.06:0.52)
# Use Monte Carlo sampling
samples = beta.rvs(apost, bpost, size=10000)
samples = np.sort(samples)
CI3 = np.percentile(samples, 100*np.array([alpha/2, 1-alpha/2]))
print('{:0.2f}--{:0.2f}'.format(CI3[0], CI3[1])) # (0.06:0.51)
print(np.mean(samples))
In [4]:
# Use arviz to plot posterior.
# By default, it shows the 94\% interval, but we change it to 95%.
az.plot_posterior({'θ':samples}, credible_interval=0.95);
In [5]:
# See if the parameter is inside the region of practical equivalence centered at 0.5
az.plot_posterior(samples, rope=[0.45, .55]);
Out[5]:
In [6]:
# From the above plot, we see that the HPD does not overlap the ROPE,
#so we can confidently say the parameter is "significiantly different" from 0.5
# We can also verify this by checking if 0.5 is in the HPD
az.plot_posterior(samples, credible_interval=0.95, ref_val=0.5);
In [19]:
# Summarize posterior samples
az.summary(samples)
# We can ignore the warning about not having enough 'chains',
# since we are drawing exact iid samples from the posterior.
Out[19]:
In [0]:
grid = np.linspace(0, 1, 200)
θ_pos = samples #trace['θ']
lossf_a = [np.mean(abs(i - θ_pos)) for i in grid]
lossf_b = [np.mean((i - θ_pos)**2) for i in grid]
for lossf, c in zip([lossf_a, lossf_b], ['C0', 'C1']):
mini = np.argmin(lossf)
plt.plot(grid, lossf, c)
plt.plot(grid[mini], lossf[mini], 'o', color=c)
plt.annotate('{:.3f}'.format(grid[mini]),
(grid[mini], lossf[mini] + 0.03), color=c)
plt.yticks([])
plt.xlabel(r'$\hat \theta$')
We will use pymc3 to approximate the posterior of this simple model. Code is based on this notebook.
In [7]:
data # same as above
Out[7]:
In [9]:
with pm.Model() as model:
# a priori
θ = pm.Beta('θ', alpha=aprior, beta=bprior)
# likelihood
y = pm.Bernoulli('y', p=θ, observed=data)
pm.model_to_graphviz(model) # show the DAG
Out[9]:
In [10]:
# run MCMC (defaults to 2 chains)
with model:
trace = pm.sample(1000, random_seed=123)
In [13]:
# Standard MCMC diagonistics
pm.traceplot(trace);
Rhat = pm.rhat(trace);
print(Rhat) # should be close to 1.0
In [14]:
pm.plot_posterior(trace);
# The samples from MCMC (called "trace") should be similar to the exact
# iid samples from the posterior, plotted above
# Under the hood, pm.plot_posterior(trace) calls az.plot_posterior(trace)
In [21]:
# Summarize posterior samples
pm.summary(trace)
Out[21]:
In [20]:
# Convert posterior samples into a parametric distribution
trace_approx = pm.Empirical(trace, model=model)
# Now plot samples from this distribution
az.plot_posterior(trace_approx.sample(1000));
We use automatic differentiation VI. Details can be found at https://docs.pymc.io/notebooks/variational_api_quickstart.html
In [25]:
niter = 10000
with model:
post = pm.fit(niter, method='advi'); # mean field approximation
In [26]:
# Plot negative ELBO vs iteration to assess convergence
plt.plot(post.hist);
In [27]:
pm.plot_posterior(post.sample(1000));
In [36]:
np.random.seed(0)
N = 100
x = np.random.randn(100)
# Parameters of prior
mu_prior = 1.1
sigma_prior = 1.2 #std
Sigma_prior = sigma_prior**2 #var
# Parameters of likelihood
sigma_x = 1.3
Sigma_x = sigma_x**2
# Bayes rule for Gaussians
Sigma_post = 1/( 1/Sigma_prior + N/Sigma_x )
xbar = np.mean(x)
mu_post = Sigma_post * (1/Sigma_x * N * xbar + 1/Sigma_prior * mu_prior);
print('p(mu|D)=N(mu|{:.3f}, {:.3f})'.format(mu_post, Sigma_post))
Initially we assume the variance is known, so we can compare results to exact infernece.
In [31]:
with pm.Model() as model:
mu = pm.Normal('mu', mu=mu_prior, sd=sigma_prior)
obs = pm.Normal('obs', mu=mu, sd=sigma_x, observed=x)
mcmc_samples = pm.sample(1000, tune=500) # mcmc
pm.plot_posterior(mcmc_samples);
In [37]:
vals = mcmc_samples.get_values('mu')
mu_post_mcmc = np.mean(vals)
Sigma_post_mcmc = np.var(vals)
print('pMCMC(mu|D)=N(mu|{:.3f}, {:.3f})'.format(mu_post_mcmc, Sigma_post_mcmc))
assert np.isclose(mu_post, mu_post_mcmc, atol=1e-1)
assert np.isclose(Sigma_post, Sigma_post_mcmc, atol=1e-1)
In [0]:
# We can also evaluate the log joint at any given point in parameter space.
# The 'obs' variable is already observed (value=x)
# so the only unknown is mu. Let's clamp it to some value
# and compute log p(mu, D)
mu_clamped = -0.5
logp = model.logp({'mu': mu_clamped})
# Computed the log joint "by hand"
log_prior = scipy.stats.norm(mu_prior, sigma_prior).logpdf(mu_clamped)
log_lik = np.sum(scipy.stats.norm(mu_clamped, sigma_x).logpdf(x))
log_joint = log_prior + log_lik
assert np.isclose(logp, log_joint)
Now we consider the case where the mean and variance are both unknown. We also switch to a "real world" dataset, of "chemical shifts", that has a couple of "outliers".
In [39]:
#url = 'https://github.com/aloctavodia/BAP/blob/master/code/data/chemical_shifts.csv'
url = 'https://raw.githubusercontent.com/aloctavodia/BAP/master/code/data/chemical_shifts.csv'
df = pd.read_csv(url)
# b=df.iloc[:,1:].values
#data = df.to_numpy()
data = df.iloc[:,0].values
print(data.shape)
print(data)
In [40]:
az.plot_kde(data, rug=True)
plt.yticks([0], alpha=0)
Out[40]:
In [41]:
# We will infer a posterior for the mean and variance.
# We use a uniform prior for the mean, with support slightly larger than the data.
# We use a truncated normal for the variance, with effective support uniform 0 to 3*std.
r = np.max(data)-np.min(data)
min_mu = np.min(data)-0.1*r
max_mu = np.max(data)+0.1*r
prior_std = 3*np.std(data)
print([min_mu, max_mu, prior_std])
with pm.Model() as model_g:
μ = pm.Uniform('μ', lower=min_mu, upper=max_mu)
σ = pm.HalfNormal('σ', sd=10)
y = pm.Normal('y', mu=μ, sd=σ, observed=data)
pm.model_to_graphviz(model_g) # show the DAG
Out[41]:
In [42]:
with model_g:
trace_g = pm.sample(1000, random_seed=123)
In [43]:
az.plot_trace(trace_g);
In [44]:
az.summary(trace_g)
Out[44]:
In [46]:
az.plot_joint(trace_g, kind='kde', fill_last=False);
In [48]:
# We check how well the gaussian assumption fits our data
# by sampling from the fitted model, and plotting the samples
# and the original data.
# For details, see https://docs.pymc.io/notebooks/posterior_predictive.html
# For the Gaussian model, the mean and variance is higher than for the observed data,
# indicating poor fit.
y_pred_g = pm.sample_posterior_predictive(trace_g, 100, model_g)
print(y_pred_g.keys())
v=y_pred_g['y']
print(type(v))
print(v.shape) # 100 x 47
In [49]:
data_ppc = az.from_pymc3(trace=trace_g, posterior_predictive=y_pred_g)
ax = az.plot_ppc(data_ppc, figsize=(12, 6), mean=False);
ax[0].legend(fontsize=15)
Out[49]:
In [35]:
# We replace the above Gaussian likelihood with a Student t distribution.
# The degree of freedom parameter \nu > 0 (also called the "normality parameter")
# determines how close to Normal the distribution is.
# nu=1 corredsponds to a Cauchy, nu >> 10 corresponds to a Gaussian.
# We put an Exponential prior on nu, with a mean of 30.
with pm.Model() as model_t:
μ = pm.Uniform('μ', 40, 75)
σ = pm.HalfNormal('σ', sd=10)
ν = pm.Exponential('ν', 1/30) # PyMC3 uses inverse of the mean
y = pm.StudentT('y', mu=μ, sd=σ, nu=ν, observed=data)
pm.model_to_graphviz(model_t) # show the DAG
with model_t:
trace_t = pm.sample(1000)
az.plot_trace(trace_t)
az.summary(trace_t)
# We see that E[nu]=4.5, which is fairly far from Gaussian
# We see E[sigma]=2.1, wich is less than the 3.5 estimate from Gaussian likelihood
Out[35]:
In [36]:
# posterior predictive check
y_ppc_t = pm.sample_posterior_predictive(
trace_t, 100, model_t, random_seed=123)
y_pred_t = az.from_pymc3(trace=trace_t, posterior_predictive=y_ppc_t)
az.plot_ppc(y_pred_t, figsize=(12, 6), mean=False)
ax[0].legend(fontsize=15)
plt.xlim(40, 70)
Out[36]:
In [38]:
# Remove outliers from data and compare empirical mean and variance of cleaned data
# to posterior mean and posterior scale of a Student likelihood
# https://gist.github.com/vishalkuo/f4aec300cf6252ed28d3
def removeOutliers(x, outlierConstant=1.5):
a = np.array(x)
upper_quartile = np.percentile(a, 75)
lower_quartile = np.percentile(a, 25)
IQR = (upper_quartile - lower_quartile) * outlierConstant
quartileSet = (lower_quartile - IQR, upper_quartile + IQR)
result = a[np.where((a >= quartileSet[0]) & (a <= quartileSet[1]))]
return result.tolist()
data_clean = removeOutliers(data)
mu_mcmc = np.mean(trace_t.get_values('μ'))
sigma_mcmc = np.mean(trace_t.get_values('σ'))
print([np.mean(data), np.mean(data_clean), mu_mcmc])
print([np.std(data), np.std(data_clean), sigma_mcmc])
We often want to know if one dataset, $D_i$, has a "statistically signficant" difference in one of its parameters, such as its mean $\mu_i$, compared to some other dataset, $D_j$. We can answer this in a Bayesian way by computing $p(\delta_{ij}|D_i,D_j)$, where
$\delta_{ij}=\mu_i-\mu_j$.
To do this, we draw samples from $p(\mu_i|D_i)$ and $p(\mu_j|D_j)$.
Since the magnitude of $\delta_{ij}$ can be hard to interpret, it is common to divide it by the pooled standard deviation, to get a metric known as Cohen's d (see this website for details):
$d_{ij} = \frac{\mu_j - \mu_i}{\sqrt{\frac{\sigma_i^2 + \sigma_j^2}{2}}}$
We can compute $p(d_{ij}|D_i,D_j)$ using posterior samples of $\mu_i,\mu_j,\sigma_i,\sigma_j$.
In [48]:
#We illustrate this below using the same dataset as used in chap 2 of
#["Bayesian Analysis with Python (2nd end)"](https://github.com/aloctavodia/BAP) that records how much tips waiters made.
url = 'https://raw.githubusercontent.com/aloctavodia/BAP/master/code/data/tips.csv'
df = pd.read_csv(url)
df.tail()
Out[48]:
In [46]:
# We look at the effect of day on tip amount.
# We ignore other covariates, such as gender.
sns.violinplot(x='day', y='tip', data=df)
Out[46]:
In [51]:
# We will compute 4 groups, corresponding to Thur-Sun.
x=df['day'].values
print(type(x))
print(x)
print(np.unique(x))
In [67]:
tip_amount = df['tip'].values
days = ['Thur', 'Fri', 'Sat', 'Sun']
idx = pd.Categorical(tips['day'], categories=days).codes
ngroups = len(np.unique(idx))
print(idx)
In [69]:
with pm.Model() as model_cg:
μ = pm.Normal('μ', mu=0, sd=10, shape=ngroups)
σ = pm.HalfNormal('σ', sd=10, shape=ngroups)
y = pm.Normal('y', mu=μ[idx], sd=σ[idx], observed=tip_amount)
print(model_cg.basic_RVs)
with model_cg:
trace_cg = pm.sample(5000)
az.plot_trace(trace_cg)
az.summary(trace_cg)
Out[69]:
In [70]:
# Looking at the posterior mean for the mu_i,
# we see Thur ~ Fri < Sat < Sun.
# But to see if these differences are significant, we should take
# into account the variability. We illustrate this below.
# We see that Thursday and Friday both earn significantly less than Sunday.
# (The threshold of 0 is outside the 95% HPD).
# Other differences are less significant.
fig, ax = plt.subplots(3, 2, figsize=(14, 8), constrained_layout=True)
comparisons = [(i, j) for i in range(ngroups) for j in range(i+1, ngroups)]
pos = [(k, l) for k in range(ngroups-1) for l in (0, 1)] # position of plot
for (i, j), (k, l) in zip(comparisons, pos):
means_diff = trace_cg['μ'][:, i] - trace_cg['μ'][:, j]
d_cohen = (means_diff / np.sqrt((trace_cg['σ'][:, i]**2 + trace_cg['σ'][:, j]**2) / 2)).mean()
az.plot_posterior(means_diff, ref_val=0, ax=ax[k, l], credible_interval=0.95)
name_i = days[i]
name_j = days[j]
str = 'mean {} - mean {}'.format(name_i, name_j)
ax[k, l].set_title(str)
ax[k, l].plot(0, label=f"Cohen's d = {d_cohen:.2f}")
ax[k, l].legend()
In [0]:
We use the "cancer rates" example from the MLAPP2.0 book. For PyMC3 code for a similar rats" example from BDA3, see here.
In [13]:
#https://github.com/probml/pmtk3/blob/master/demos/cancerRatesEb.m
data_y = np.array([0, 0, 2, 0, 1, 1, 0, 2, 1, 3, 0, 1, 1, 1, 54, 0, 0, 1, 3, 0]);
data_n = np.array([1083, 855, 3461, 657, 1208, 1025, 527, 1668, 583, 582, 917, 857,
680, 917, 53637, 874, 395, 581, 588, 383]);
N = len(data_n)
# We put a prior on the mean and precision () of the Beta distribution,
# instead of on the alpha and beta parameters
with pm.Model() as model_h:
mu = pm.Beta('mu', 1., 1.)
kappa = pm.HalfNormal('kappa', 500)
alpha = pm.Deterministic('alpha', mu*kappa)
beta = pm.Deterministic('beta', (1.0-mu)*kappa)
theta = pm.Beta('theta', alpha=alpha, beta=beta, shape=N)
y = pm.Binomial('y', p=theta, observed=data_y, n=data_n)
np.random.seed(0)
with model_h:
trace_h = pm.sample(1000, chains=4)
In [14]:
az.summary(trace_h)
Out[14]:
In [15]:
az.plot_forest(trace_h, var_names=["theta"], credible_interval=0.95);
In [16]:
az.plot_forest(trace_h, var_names=["theta"], combined=True, credible_interval=0.95);
In [17]:
az.plot_forest(trace_h, var_names=["theta"], combined=True, kind='ridgeplot');
In [71]:
# https://github.com/probml/pyprobml/blob/master/scripts/schools8_pymc3.py
# Data of the Eight Schools Model
J = 8
y = np.array([28., 8., -3., 7., -1., 1., 18., 12.])
sigma = np.array([15., 10., 16., 11., 9., 11., 10., 18.])
names=[];
for t in range(8):
names.append('theta {}'.format(t));
print(names)
# Plot raw data
fig, ax = plt.subplots()
y_pos = np.arange(8)
ax.errorbar(y,y_pos, xerr=sigma, fmt='o')
ax.set_yticks(y_pos)
ax.set_yticklabels(names)
ax.invert_yaxis() # labels read top-to-bottom
plt.show()
In [72]:
# Centered model
with pm.Model() as Centered_eight:
mu_alpha = pm.Normal('mu_alpha', mu=0, sigma=5)
sigma_alpha = pm.HalfCauchy('sigma_alpha', beta=5)
alpha = pm.Normal('alpha', mu=mu_alpha, sigma=sigma_alpha, shape=J)
obs = pm.Normal('obs', mu=alpha, sigma=sigma, observed=y)
np.random.seed(0)
with Centered_eight:
trace_centered = pm.sample(1000, chains=2)
pm.summary(trace_centered).round(2)
# PyMC3 gives multiple warnings about divergences
# Also, see r_hat ~ 1.01, ESS << nchains*1000, especially for sigma_alpha
# We can solve these problems below by using a non-centered parameterization.
# In practice, for this model, the results are very similar.
Out[72]:
In [73]:
# Display the total number and percentage of divergent chains
diverging = trace_centered['diverging']
print('Number of Divergent Chains: {}'.format(diverging.nonzero()[0].size))
diverging_pct = diverging.nonzero()[0].size / len(trace_centered) * 100
print('Percentage of Divergent Chains: {:.1f}'.format(diverging_pct))
In [27]:
# We can see somewhat high auto correlation of the samples
az.plot_autocorr(trace_centered, var_names=['mu_alpha', 'sigma_alpha']);
In [38]:
az.plot_forest(trace_centered, var_names="alpha",
credible_interval=0.95, combined=True);
In [29]:
# Non-centered parameterization
with pm.Model() as NonCentered_eight:
mu_alpha = pm.Normal('mu_alpha', mu=0, sigma=5)
sigma_alpha = pm.HalfCauchy('sigma_alpha', beta=5)
alpha_offset = pm.Normal('alpha_offset', mu=0, sigma=1, shape=J)
alpha = pm.Deterministic('alpha', mu_alpha + sigma_alpha * alpha_offset)
#alpha = pm.Normal('alpha', mu=mu_alpha, sigma=sigma_alpha, shape=J)
obs = pm.Normal('obs', mu=alpha, sigma=sigma, observed=y)
np.random.seed(0)
with NonCentered_eight:
trace_noncentered = pm.sample(1000)
pm.summary(trace_noncentered).round(2)
# Samples look good: r_hat = 1, ESS ~= nchains*1000
Out[29]:
In [31]:
az.plot_autocorr(trace_noncentered, var_names=['mu_alpha', 'sigma_alpha']);
In [51]:
az.plot_forest(trace_noncentered, var_names="theta",
combined=True, credible_interval=0.95);
In [0]:
# Plot the "funnel of hell"
# Based on
# https://github.com/twiecki/WhileMyMCMCGentlySamples/blob/master/content/downloads/notebooks/GLM_hierarchical_non_centered.ipynb
group = 0
x = pd.Series(trace_centered['alpha'][:, group], name=f'alpha {group}')
y = pd.Series(trace_centered['sigma_alpha'], name='sigma_alpha')
sns.jointplot(x, y);
plt.suptitle('centered')
In [0]:
group = 0
x = pd.Series(trace_noncentered['alpha'][:, group], name=f'alpha {group}')
y = pd.Series(trace_noncentered['sigma_alpha'], name='sigma_alpha')
sns.jointplot(x, y);
plt.suptitle('noncentered')
In [40]:
for group in range(J):
fig, axs = plt.subplots(ncols=2, sharex=True, sharey=True)
x = pd.Series(trace_centered['alpha'][:, group], name=f'alpha {group}')
y = pd.Series(trace_centered['sigma_alpha'], name='sigma_alpha')
axs[0].plot(x, y, '.');
axs[0].set(title='Centered', ylabel='sigma_alpha', xlabel=f'alpha {group}')
x = pd.Series(trace_noncentered['alpha'][:, group], name=f'alpha {group}')
y = pd.Series(trace_noncentered['sigma_alpha'], name='sigma_alpha')
axs[1].plot(x, y, '.');
axs[1].set(title='Non-centered', ylabel='sigma_alpha', xlabel=f'alpha {group}')
This section is based on Bayesian Analysis with Python, ch 3 and this blog post from Thomas Wiecki.
In [57]:
N = 10 #20
M = 8 # num groups
idx = np.repeat(range(M-1), N) # N samples for groups 0-6
idx = np.append(idx, 7) # 1 sample for 7'th group
np.random.seed(123)
#alpha_real = np.random.normal(2.5, 0.5, size=M)
#beta_real = np.random.beta(6, 1, size=M)
#eps_real = np.random.normal(0, 0.5, size=len(idx))
alpha_real = np.random.normal(2.5, 0.5, size=M)
beta_real = np.random.beta(1, 1, size=M) # slope is closer to 0
eps_real = np.random.normal(0, 0.5, size=len(idx))
print(beta_real)
y_m = np.zeros(len(idx))
x_m = np.random.normal(10, 1, len(idx))
y_m = alpha_real[idx] + beta_real[idx] * x_m + eps_real
x_centered = x_m - x_m.mean()
_, ax = plt.subplots(2, 4, figsize=(10, 5), sharex=True, sharey=True)
ax = np.ravel(ax)
j, k = 0, N
for i in range(M):
ax[i].scatter(x_m[j:k], y_m[j:k])
ax[i].set_xlabel(f'x_{i}')
ax[i].set_ylabel(f'y_{i}', rotation=0, labelpad=15)
ax[i].set_xlim(6, 15)
ax[i].set_ylim(1, 18)
j += N
k += N
plt.tight_layout()
In [58]:
# Fit separarate models per group
with pm.Model() as unpooled_model:
α = pm.Normal('α', mu=0, sd=10, shape=M)
β = pm.Normal('β', mu=0, sd=10, shape=M)
ϵ = pm.HalfCauchy('ϵ', 5)
y_pred = pm.Normal('y_pred', mu=α[idx] + β[idx] * x_m,
sd=ϵ, observed=y_m)
trace_up = pm.sample(1000)
az.summary(trace_up)
Out[58]:
In [0]:
def plot_post_pred_samples(trace, nsamples=20):
_, ax = plt.subplots(2, 4, figsize=(10, 5), sharex=True, sharey=True,
constrained_layout=True)
ax = np.ravel(ax)
j, k = 0, N
x_range = np.linspace(x_m.min(), x_m.max(), 10)
X = x_range[:, np.newaxis]
for i in range(M):
ax[i].scatter(x_m[j:k], y_m[j:k])
ax[i].set_xlabel(f'x_{i}')
ax[i].set_ylabel(f'y_{i}', labelpad=17, rotation=0)
alpha_m = trace['α'][:, i].mean()
beta_m = trace['β'][:, i].mean()
ax[i].plot(x_range, alpha_m + beta_m * x_range, c='r', lw=3,
label=f'y = {alpha_m:.2f} + {beta_m:.2f} * x')
plt.xlim(x_m.min()-1, x_m.max()+1)
plt.ylim(y_m.min()-1, y_m.max()+1)
alpha_samples = trace['α'][:,i]
beta_samples = trace['β'][:,i]
ndx = np.random.choice(np.arange(len(alpha_samples)), nsamples)
alpha_samples_thinned = alpha_samples[ndx]
beta_samples_thinned = beta_samples[ndx]
ax[i].plot(x_range, alpha_samples_thinned + beta_samples_thinned * X,
c='gray', alpha=0.5)
j += N
k += N
In [60]:
plot_post_pred_samples(trace_up)
In [61]:
# Fit the centered model to the raw data
with pm.Model() as model_centered:
# hyper-priors
μ_α = pm.Normal('μ_α', mu=0, sd=10)
σ_α = pm.HalfNormal('σ_α', 10)
μ_β = pm.Normal('μ_β', mu=0, sd=10)
σ_β = pm.HalfNormal('σ_β', sd=10)
# priors
α = pm.Normal('α', mu=μ_α, sd=σ_α, shape=M)
β = pm.Normal('β', mu=μ_β, sd=σ_β, shape=M)
ϵ = pm.HalfCauchy('ϵ', 5)
y_pred = pm.Normal('y_pred', mu=α[idx] + β[idx] * x_m,
sd=ϵ, observed=y_m)
trace_centered = pm.sample(1000)
az.summary(trace_centered)
# Lots of warnings about divergence. We will fix this below
# when we switch to non-centered parameterization.
Out[61]:
In [62]:
az.plot_autocorr(trace_centered, var_names=['μ_α', 'σ_α', 'μ_β', 'σ_β']); #sigma_alpha chain is highly correlated
In [63]:
plot_post_pred_samples(trace_centered)
In [65]:
# Fit the non-centered model to the raw data
with pm.Model() as model_noncentered:
# hyper-priors
μ_α = pm.Normal('μ_α', mu=0, sd=10)
σ_α = pm.HalfNormal('σ_α', 10)
μ_β = pm.Normal('μ_β', mu=0, sd=10)
σ_β = pm.HalfNormal('σ_β', sd=10)
# priors
α_offset = pm.Normal('α_offset', mu=0, sd=1, shape=M)
α = pm.Deterministic('α', μ_α + σ_α * α_offset)
β_offset = pm.Normal('β_offset', mu=0, sd=1, shape=M)
β = pm.Deterministic('β', μ_β + σ_β * β_offset)
ϵ = pm.HalfCauchy('ϵ', 5)
y_pred = pm.Normal('y_pred', mu=α[idx] + β[idx] * x_m,
sd=ϵ, observed=y_m)
trace_noncentered = pm.sample(1000)
az.summary(trace_noncentered)
# very few divergences :)
Out[65]:
In [66]:
az.plot_autocorr(trace_noncentered, var_names=['μ_α', 'σ_α', 'μ_β', 'σ_β']);
In [67]:
plot_post_pred_samples(trace_noncentered)
In [0]: