PyMC3 is a Python package for doing MCMC using a variety of samplers, including Metropolis, Slice and Hamiltonian Monte Carlo. See Probabilistic Programming in Python using PyMC for a description. The GitHub site also has many examples and links for further exploration.
In [1]:
! pip install --quiet pymc3
! pip install --quiet daft
! pip install --quiet seaborn
In [2]:
! conda install --yes --quiet mkl-service
Other resources
Some examples are adapted from:
In [3]:
%matplotlib inline
In [4]:
import numpy as np
import numpy.random as rng
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pymc3 as pm
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler
import daft
In [5]:
import theano
theano.config.warn.round=False
In [6]:
n = 100
heads = 61
In [7]:
a, b = 10, 10
prior = stats.beta(a, b)
post = stats.beta(heads+a, n-heads+b)
ci = post.interval(0.95)
xs = np.linspace(0, 1, 100)
plt.plot(prior.pdf(xs), label='Prior')
plt.plot(post.pdf(xs), label='Posterior')
plt.axvline(100*heads/n, c='red', alpha=0.4, label='MLE')
plt.xlim([0, 100])
plt.axhline(0.3, ci[0], ci[1], c='black', linewidth=2, label='95% CI');
plt.legend()
pass
In [8]:
pgm = daft.PGM(shape=[2.5, 3.0], origin=[0, -0.5])
pgm.add_node(daft.Node("alpha", r"$\alpha$", 0.5, 2, fixed=True))
pgm.add_node(daft.Node("beta", r"$\beta$", 1.5, 2, fixed=True))
pgm.add_node(daft.Node("p", r"$p$", 1, 1))
pgm.add_node(daft.Node("n", r"$n$", 2, 0, fixed=True))
pgm.add_node(daft.Node("y", r"$y$", 1, 0, observed=True))
pgm.add_edge("alpha", "p")
pgm.add_edge("beta", "p")
pgm.add_edge("n", "y")
pgm.add_edge("p", "y")
pgm.render()
pass
In [9]:
niter = 2000
with pm.Model() as coin_context:
p = pm.Beta('p', alpha=2, beta=2)
y = pm.Binomial('y', n=n, p=p, observed=heads)
trace = pm.sample(niter)
In [10]:
with coin_context:
start = pm.find_MAP()
step = pm.Metropolis()
trace = pm.sample(niter, step=step, start=start, njobs=4, random_seed=123)
In [11]:
start
Out[11]:
In [12]:
t = trace[niter//2:]
t['p'].shape
Out[12]:
In [13]:
p = trace.get_values('p', burn=niter//2, combine=True, chains=[0,2])
p.shape
Out[13]:
In [14]:
pm.autocorrplot(t, varnames=['p'])
pass
In [15]:
pm.effective_n(t)
Out[15]:
where $W$ is the within-chain variance and $\hat{V}$ is the posterior variance estimate for the pooled traces. Values greater than one indicate that one or more chains have not yet converged.
Discrad burn-in steps for each chain. The idea is to see if the starting values of each chain come from the same distribution as the stationary state.
The idea is that $\hat{V}$ is an unbiased estimator of $W$ if the starting values of each chain come from the same distribution as the stationary state. Hence if $\hat{R}$ differs significantly from 1, there is probsbly no convergence and we need more iterations. This is done for each parameter $\theta$.
In [16]:
pm.gelman_rubin(t)
Out[16]:
In [17]:
plt.plot(pm.geweke(t['p'])[:,1], 'o')
plt.axhline(1, c='red')
plt.axhline(-1, c='red')
plt.gca().margins(0.05)
pass
In [18]:
pm.summary(t, varnames=['p'])
In [19]:
pm.traceplot(t, varnames=['p'])
pass
In [20]:
with coin_context:
ppc = pm.sample_ppc(t, samples=100)
ppc['y'].ravel()
Out[20]:
In [21]:
from pymc3.backends import Text
niter = 2000
with pm.Model() as text_save_demo:
p = pm.Beta('p', alpha=2, beta=2)
y = pm.Binomial('y', n=n, p=p, observed=heads)
db = Text('trace')
trace = pm.sample(niter, trace=db)
In [22]:
with text_save_demo:
trace = pm.backends.text.load('trace')
pm.traceplot(trace, varnames=['p'])
In [23]:
trace.varnames
Out[23]:
In [24]:
from pymc3.backends import SQLite
niter = 2000
with pm.Model() as sqlie3_save_demo:
p = pm.Beta('p', alpha=2, beta=2)
y = pm.Binomial('y', n=n, p=p, observed=heads)
db = SQLite('trace.db')
trace = pm.sample(niter, trace=db)
In [25]:
with sqlie3_save_demo:
trace = pm.backends.sqlite.load('trace.db')
pm.traceplot(trace, varnames=['p'])
In [26]:
with pm.Model() as prior_context:
sigma = pm.Gamma('sigma', alpha=2.0, beta=1.0)
mu = pm.Normal('mu', mu=0, sd=sigma)
trace = pm.sample(niter, step=pm.Metropolis())
In [27]:
pm.traceplot(trace, varnames=['mu', 'sigma'])
pass
In [28]:
xs = rng.normal(loc=5, scale=2, size=100)
In [29]:
sns.distplot(xs, rug=True)
pass
In [30]:
niter = 2000
with pm.Model() as normal_context:
mu = pm.Normal('mu', mu=0, sd=100)
sd = pm.HalfCauchy('sd', beta=2)
y = pm.Normal('y', mu=mu, sd=sd, observed=xs)
trace = pm.sample(niter)
In [31]:
hpd = pm.stats.hpd(trace, alpha=0.05)
hpd
Out[31]:
In [32]:
ax = pm.traceplot(trace, varnames=['mu', 'sd'],)
ymin, ymax = ax[0,0].get_ylim()
y = ymin + 0.05*(ymax-ymin)
ax[0, 0].plot(hpd['mu'], [y,y], c='red')
ymin, ymax = ax[1,0].get_ylim()
y = ymin + 0.05*(ymax-ymin)
ax[1, 0].plot(hpd['sd'], [y,y], c='red')
pass
In [33]:
post_mean = pm.df_summary(trace, varnames=trace.varnames)['mean'].to_dict()
post_mean
Out[33]:
In [34]:
normal_context.logp(post_mean)
Out[34]:
In [35]:
with normal_context:
print(pm.stats.dic(trace))
In [36]:
with normal_context:
print(pm.stats.waic(trace))
In [37]:
with normal_context:
print(pm.stats.bpic(trace))
We will show how to estimate regression parameters using a simple linear model
$$ y \sim ax + b $$We can restate the linear model $$y = ax + b + \epsilon$$ as sampling from a probability distribution
$$ y \sim \mathcal{N}(ax + b, \sigma^2) $$Now we can use pymc
to estimate the parameters $a$, $b$ and $\sigma$. We will assume the following priors
In [38]:
import daft
# Instantiate the PGM.
pgm = daft.PGM(shape=[4.0, 3.0], origin=[-0.3, -0.7])
# Hierarchical parameters.
pgm.add_node(daft.Node("alpha", r"$\alpha$", 0.5, 2))
pgm.add_node(daft.Node("beta", r"$\beta$", 1.5, 2))
pgm.add_node(daft.Node("sigma", r"$\sigma$", 0, 0))
# Deterministic variable.
pgm.add_node(daft.Node("mu", r"$\mu_n$", 1, 1))
# Data.
pgm.add_node(daft.Node("x", r"$x_n$", 2, 1, observed=True))
pgm.add_node(daft.Node("y", r"$y_n$", 1, 0, observed=True))
# Add in the edges.
pgm.add_edge("alpha", "mu")
pgm.add_edge("beta", "mu")
pgm.add_edge("x", "mu")
pgm.add_edge("mu", "y")
pgm.add_edge("sigma", "y")
# And a plate.
pgm.add_plate(daft.Plate([0.5, -0.5, 2, 2], label=r"$n = 1, \cdots, N$",
shift=-0.1))
# Render and save.
pgm.render()
pgm.figure.savefig("lm.pdf")
In [39]:
# observed data
np.random.seed(123)
n = 11
_a = 6
_b = 2
x = np.linspace(0, 1, n)
y = _a*x + _b + np.random.randn(n)
In [40]:
niter = 10000
with pm.Model() as linreg:
a = pm.Normal('a', mu=0, sd=100)
b = pm.Normal('b', mu=0, sd=100)
sigma = pm.HalfNormal('sigma', sd=1)
y_est = a*x + b
likelihood = pm.Normal('y', mu=y_est, sd=sigma, observed=y)
trace = pm.sample(niter, random_seed=123)
In [41]:
t = trace[niter//2:]
pm.traceplot(trace, varnames=['a', 'b'])
pass
In [42]:
plt.scatter(x, y, s=30, label='data')
for a_, b_ in zip(t['a'][-100:], t['b'][-100:]):
plt.plot(x, a_*x + b_, c='gray', alpha=0.1)
plt.plot(x, _a*x + _b, label='true regression line', lw=3., c='red')
plt.legend(loc='best')
pass
In [43]:
ppc = pm.sample_ppc(trace, samples=500, model=linreg, size=11)
In [44]:
sns.distplot([np.mean(n) for n in ppc['y']], kde=True)
plt.axvline(np.mean(y), color='red')
pass
In [45]:
df = pd.DataFrame({'x': x, 'y': y})
df.head()
Out[45]:
In [46]:
with pm.Model() as model:
pm.glm.glm('y ~ x', df)
trace = pm.sample(2000)
In [47]:
pm.traceplot(trace, varnames=['Intercept', 'x'])
pass
In [48]:
# observed data
np.random.seed(123)
n = 11
_a = 6
_b = 2
x = np.linspace(0, 1, n)
y = _a*x + _b + np.random.randn(n)
y[5] *=10
df = pd.DataFrame({'x': x, 'y': y})
df.head()
Out[48]:
In [49]:
x, y = StandardScaler().fit_transform(df[['x', 'y']]).T
In [50]:
niter = 10000
with pm.Model() as linreg:
a = pm.Normal('a', mu=0, sd=100)
b = pm.Normal('b', mu=0, sd=100)
sigma = pm.HalfNormal('sigma', sd=1)
y_est = a*x + b
y_obs = pm.Normal('y_obs', mu=y_est, sd=sigma, observed=y)
trace = pm.sample(niter, random_seed=123)
In [51]:
t = trace[niter//2:]
plt.scatter(x, y, s=30, label='data')
for a_, b_ in zip(t['a'][-100:], t['b'][-100:]):
plt.plot(x, a_*x + b_, c='gray', alpha=0.1)
plt.legend(loc='upper left')
pass
In [52]:
niter = 10000
with pm.Model() as robust_linreg:
beta = pm.Normal('beta', 0, 10, shape=2)
nu = pm.Exponential('nu', 1/len(x))
sigma = pm.HalfCauchy('sigma', beta=1)
y_est = beta[0] + beta[1]*x
y_obs = pm.StudentT('y_obs', mu=y_est, sd=sigma, nu=nu, observed=y)
trace = pm.sample(niter, random_seed=123)
In [53]:
t = trace[niter//2:]
plt.scatter(x, y, s=30, label='data')
for a_, b_ in zip(t['beta'][-100:, 1], t['beta'][-100:, 0]):
plt.plot(x, a_*x + b_, c='gray', alpha=0.1)
plt.legend(loc='upper left')
pass
In [54]:
df = pd.read_csv('HtWt.csv')
df.head()
Out[54]:
In [55]:
niter = 1000
with pm.Model() as model:
pm.glm.glm('male ~ height + weight', df, family=pm.glm.families.Binomial())
trace = pm.sample(niter, njobs=4, random_seed=123)
In [56]:
df_trace = pm.trace_to_dataframe(trace[niter//2:])
pd.scatter_matrix(df_trace.ix[-niter//2:, ['height', 'weight', 'Intercept']], diagonal='kde')
plt.tight_layout()
pass
In [57]:
niter = 1000
with pm.Model() as model:
pm.glm.glm('male ~ height + weight', df, family=pm.glm.families.Binomial())
trace = pm.sample(niter, njobs=4, random_seed=123)
In [58]:
df_trace = pm.trace_to_dataframe(trace[niter//2:])
pd.scatter_matrix(df_trace.ix[-niter//2:, ['height', 'weight', 'Intercept']], diagonal='kde')
plt.tight_layout()
pass
In [59]:
height, weight, intercept = df_trace[['height', 'weight', 'Intercept']].mean(0)
def predict(w, h, intercept=intercept, height=height, weight=weight):
"""Predict gender given weight (w) and height (h) values."""
v = intercept + height*h + weight*w
return np.exp(v)/(1+np.exp(v))
# calculate predictions on grid
xs = np.linspace(df.weight.min(), df.weight.max(), 100)
ys = np.linspace(df.height.min(), df.height.max(), 100)
X, Y = np.meshgrid(xs, ys)
Z = predict(X, Y)
plt.figure(figsize=(6,6))
# plot 0.5 contour line - classify as male if above this line
plt.contour(X, Y, Z, levels=[0.5])
# classify all subjects
colors = ['lime' if i else 'yellow' for i in df.male]
ps = predict(df.weight, df.height)
errs = ((ps < 0.5) & df.male) |((ps >= 0.5) & (1-df.male))
plt.scatter(df.weight[errs], df.height[errs], facecolors='red', s=150)
plt.scatter(df.weight, df.height, facecolors=colors, edgecolors='k', s=50, alpha=1);
plt.xlabel('Weight', fontsize=16)
plt.ylabel('Height', fontsize=16)
plt.title('Gender classification by weight and height', fontsize=16)
plt.tight_layout();
Gelman's book has an example where the dose of a drug may be affected to the number of rat deaths in an experiment.
Dose (log g/ml) | # Rats | # Deaths |
---|---|---|
-0.896 | 5 | 0 |
-0.296 | 5 | 1 |
-0.053 | 5 | 3 |
0.727 | 5 | 5 |
We will model the number of deaths as a random sample from a binomial distribution, where $n$ is the number of rats and $p$ the probability of a rat dying. We are given $n = 5$, but we believe that $p$ may be related to the drug dose $x$. As $x$ increases the number of rats dying seems to increase, and since $p$ is a probability, we use the following model:
$$ y \sim \text{Bin}(n, p) \\ \text{logit}(p) = \alpha + \beta x \\ \alpha \sim \mathcal{N}(0, 5) \\ \beta \sim \mathcal{N}(0, 10) $$where we set vague priors for $\alpha$ and $\beta$, the parameters for the logistic model.
In [60]:
n = 5 * np.ones(4)
x = np.array([-0.896, -0.296, -0.053, 0.727])
y = np.array([0, 1, 3, 5])
In [61]:
def invlogit(x):
return np.exp(x) / (1 + np.exp(x))
with pm.Model() as model:
alpha = pm.Normal('alpha', mu=0, sd=5)
beta = pm.Flat('beta')
p = invlogit(alpha + beta*x)
y_obs = pm.Binomial('y_obs', n=n, p=p, observed=y)
trace = pm.sample(niter, random_seed=123)
In [62]:
f = lambda a, b, xp: np.exp(a + b*xp)/(1 + np.exp(a + b*xp))
xp = np.linspace(-1, 1, 100)
a = trace.get_values('alpha').mean()
b = trace.get_values('beta').mean()
plt.plot(xp, f(a, b, xp), c='red')
plt.scatter(x, y/5, s=50);
plt.xlabel('Log does of drug')
plt.ylabel('Risk of death')
pass
This uses the Gelman radon data set and is based off this IPython notebook. Radon levels were measured in houses from all counties in several states. Here we want to know if the presence of a basement affects the level of radon, and if this is affected by which county the house is located in.
The data set provided is just for the state of Minnesota, which has 85 counties with 2 to 116 measurements per county. We only need 3 columns for this example county
, log_radon
, floor
, where floor=0
indicates that there is a basement.
We will perform simple linear regression on log_radon as a function of county and floor.
In [63]:
radon = pd.read_csv('radon.csv')[['county', 'floor', 'log_radon']]
radon.head()
Out[63]:
In the pooled model, we ignore the county infomraiton.
$$ y \sim \mathcal{N}(a + bx, \sigma^2) $$where $y$ is the log radon level, and $x$ an indicator variable for whether there is a basement or not.
We make up some choices for the fairly uniformative priors as usual
$$ a \sim \mathcal{N}(\mu, \sigma_a^2) \\ b \sim \mathcal{N}(\mu, \sigma_b^2) \\ \sigma \sim \text{Gamma(10, 1)} $$However, since the radon level varies by geographical location, it might make sense to include county information in the model. One way to do this is to build a separate regression model for each county, but the sample sizes for some counties may be too small for precise estimates. A compromise between the pooled and separate county models is to use a hierarchical model for patial pooling - the practical efffect of this is to shrink per county estimates towards the group mean, especially for counties with few observations.
With a hierarchical model, there is an $a_c$ and a $b_c$ for each county $c$ just as in the individual county model, but they are no longer independent but assumed to come from a common group distribution
$$ a_c \sim \mathcal{N}(\mu_a, \sigma_a^2) \\ b_c \sim \mathcal{N}(\mu_b, \sigma_b^2) $$we further assume that the hyperparameters come from the following distributions
$$ \mu_a \sim \mathcal{N}(0, 10^2) \\ \sigma_a \sim \text{|Cauchy(1)|} \\ \mu_b \sim \mathcal{N}(0, 10^2) \\ \sigma_b \sim \text{|Cauchy(1)|} \\ $$The variance for observations does not change, so the model for the radon level is
$$ y \sim \mathcal{N}(a_c + b_c x, \sigma^2) $$
In [64]:
county = pd.Categorical(radon['county']).codes
niter = 1000
with pm.Model() as hm:
# County hyperpriors
mu_a = pm.Normal('mu_a', mu=0, sd=10)
sigma_a = pm.HalfCauchy('sigma_a', beta=1)
mu_b = pm.Normal('mu_b', mu=0, sd=10)
sigma_b = pm.HalfCauchy('sigma_b', beta=1)
# County slopes and intercepts
a = pm.Normal('slope', mu=mu_a, sd=sigma_a, shape=len(set(county)))
b = pm.Normal('intercept', mu=mu_b, sd=sigma_b, shape=len(set(county)))
# Houseehold errors
sigma = pm.Gamma("sigma", alpha=10, beta=1)
# Model prediction of radon level
mu = a[county] + b[county] * radon.floor.values
# Data likelihood
y = pm.Normal('y', mu=mu, sd=sigma, observed=radon.log_radon)
hm_trace = pm.sample(niter)
In [65]:
plt.figure(figsize=(8, 60))
pm.forestplot(hm_trace, varnames=['slope', 'intercept'])
pass
In [66]:
from theano.compile.ops import as_op
import theano.tensor as T
In [67]:
np.random.seed(1)
y = np.r_[np.random.normal(-6, 2, 500),
np.random.normal(0, 1, 200),
np.random.normal(4, 1, 300)]
n = y.shape[0]
In [68]:
plt.hist(y, bins=20, normed=True, alpha=0.5)
pass
In [69]:
k = 3
niter = 1000
with pm.Model() as gmm:
p = pm.Dirichlet('p', np.ones(k), shape=k)
mus = pm.Normal('mus', mu=[0, 0, 0], sd=15, shape=k)
sds = pm.HalfNormal('sds', sd=5, shape=k)
Y_obs = pm.NormalMixture('Y_obs', p, mus, sd=sds, observed=y)
trace = pm.sample(niter)
In [70]:
burn = niter//2
trace = trace[burn:]
pm.traceplot(trace, varnames=['p', 'mus', 'sds'])
pass