https://github.com/pymc-devs/pymc3/blob/master/pymc3/examples/censored_data.py
In [15]:
import numpy as np
import matplotlib.pyplot as plt
import pymc3 as pm
import theano.tensor as tt
import seaborn as sns
sns.set()
%matplotlib inline
Data can be left, right, or interval censored. This example uses interval censoring, since it is the most general case: to handle data censored on one side only, simply drop the limit on the other side. Censoring arises in many modeling problems; two common examples are survival analysis (a study ends before every subject has experienced the event, so some survival times are only known to exceed the follow-up period) and sensor saturation (readings outside an instrument's range are reported as the range limits). Below, a baseline model fitted to the complete, uncensored data is compared with two ways of handling the censored observations: imputing them as bounded latent variables, and integrating them out through `pm.Potential` terms on the log-likelihood.
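As a toy illustration of what censoring looks like in practice (the array and limit below are made up and not used elsewhere): with right censoring, values above the limit are still recorded, but only as "at least the limit", so a clipped copy of the data together with the count of clipped points carries all the information available to a censored model.
In [ ]:
# Toy sketch of right censoring at an upper limit (illustrative values only).
toy = np.array([2., 7., 11., 18., 25.])
limit = 15.

clipped = np.clip(toy, None, limit)    # -> [ 2.,  7., 11., 15., 15.]
n_clipped = int(np.sum(toy >= limit))  # 2 points were censored at the limit
fully_observed = toy[toy < limit]      # the values actually seen in full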
In [2]:
# Helper functions: numerically stable log CDF and log complementary CDF
# of the normal distribution, used by the unimputed censored model below.
def normal_lcdf(mu, sigma, x):
    """log P(X <= x) for X ~ Normal(mu, sigma)."""
    z = (x - mu) / sigma
    return tt.switch(
        tt.lt(z, -1.0),
        tt.log(tt.erfcx(-z / tt.sqrt(2.)) / 2.) - tt.sqr(z) / 2.,
        tt.log1p(-tt.erfc(z / tt.sqrt(2.)) / 2.)
    )


def normal_lccdf(mu, sigma, x):
    """log P(X > x) for X ~ Normal(mu, sigma)."""
    z = (x - mu) / sigma
    return tt.switch(
        tt.gt(z, 1.0),
        tt.log(tt.erfcx(z / tt.sqrt(2.)) / 2.) - tt.sqr(z) / 2.,
        tt.log1p(-tt.erfc(-z / tt.sqrt(2.)) / 2.)
    )
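The `tt.switch` in each helper picks an `erfcx`-based expression deep in the tail, where naively taking the log of the CDF would underflow, and a `log1p`/`erfc` expression elsewhere. As a quick sanity check (assuming SciPy is available; it is a PyMC3 dependency), the helpers can be evaluated on a few points and compared against `scipy.stats.norm`:
In [ ]:
# The stable helpers should agree with scipy's reference implementation,
# including far out in the tails.
from scipy import stats

x_check = np.array([-30., -5., -1., 0., 1., 5., 30.])
print(np.allclose(normal_lcdf(0., 1., x_check).eval(), stats.norm.logcdf(x_check)))
print(np.allclose(normal_lccdf(0., 1., x_check).eval(), stats.norm.logsf(x_check)))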
In [3]:
# Produce normally distributed samples
np.random.seed(123)
size = 500
sigma = 5.
mu = 13.
samples = np.random.normal(mu, sigma, size)
# Set censoring limits.
high = 16.
low = -1.
# Drop the censored observations: the censored models below only see the
# values inside (low, high), plus the number of points censored on each side.
truncated = samples[(samples > low) & (samples < high)]
In [18]:
sns.distplot(truncated, label='truncated')
sns.distplot(samples, label='samples')
plt.legend(bbox_to_anchor=(1, 1))
Out[18]: [figure: overlaid density plots of the truncated data and the full samples]
In [4]:
# Omniscient model: a baseline fitted to the complete, uncensored samples.
with pm.Model() as omniscient_model:
    mu = pm.Normal('mu', mu=0., sd=(high - low) / 2.)
    sigma = pm.HalfNormal('sigma', sd=(high - low) / 2.)
    observed = pm.Normal('observed', mu=mu, sd=sigma, observed=samples)
In [5]:
# Imputed censored model
# Keep tabs on left/right censoring: only the counts of the censored points
# are known, not their values.
n_right_censored = len(samples[samples >= high])
n_left_censored = len(samples[samples <= low])
n_observed = len(samples) - n_right_censored - n_left_censored

with pm.Model() as imputed_censored_model:
    mu = pm.Normal('mu', mu=0., sd=(high - low) / 2.)
    sigma = pm.HalfNormal('sigma', sd=(high - low) / 2.)
    # Each censored observation gets its own latent value, bounded to lie
    # beyond the corresponding censoring limit.
    right_censored = pm.Bound(pm.Normal, lower=high)(
        'right_censored', mu=mu, sd=sigma, shape=n_right_censored
    )
    left_censored = pm.Bound(pm.Normal, upper=low)(
        'left_censored', mu=mu, sd=sigma, shape=n_left_censored
    )
    observed = pm.Normal(
        'observed',
        mu=mu,
        sd=sigma,
        observed=truncated,
        shape=n_observed
    )
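In the imputed model every censored point is represented by its own bounded latent variable, so the sampler explores the joint posterior over $\mu$, $\sigma$ and the imputed values. Up to an additive constant, the log-density being sampled is (a sketch, writing $y^{obs}$ for the values inside the limits and $y^{cens}$ for the latent imputations):

$$\log p(\mu, \sigma, y^{cens} \mid y^{obs}) = \log p(\mu) + \log p(\sigma) + \sum_i \log \mathcal{N}(y^{obs}_i \mid \mu, \sigma) + \sum_j \log \mathcal{N}(y^{cens}_j \mid \mu, \sigma),$$

with each right-censored $y^{cens}_j$ constrained to $[\mathit{high}, \infty)$ and each left-censored one to $(-\infty, \mathit{low}]$. Integrating the imputed values out over their allowed ranges recovers exactly the censored likelihood that the unimputed model below adds via `pm.Potential`.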
In [7]:
# Unimputed censored model
# Log-likelihood contributions of the censored data: each censored point
# contributes the log-probability of falling beyond its censoring limit.
def censored_right_likelihood(mu, sigma, n_right_censored, upper_bound):
    return n_right_censored * normal_lccdf(mu, sigma, upper_bound)


def censored_left_likelihood(mu, sigma, n_left_censored, lower_bound):
    return n_left_censored * normal_lcdf(mu, sigma, lower_bound)
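Each censored point contributes the log-probability of falling beyond its limit, so with $\Phi$ the standard normal CDF the total censored-data contribution to the log-likelihood is

$$\log L_{cens} = n_{left}\,\log \Phi\!\left(\frac{\mathit{low} - \mu}{\sigma}\right) + n_{right}\,\log\!\left(1 - \Phi\!\left(\frac{\mathit{high} - \mu}{\sigma}\right)\right),$$

which is what `censored_left_likelihood` and `censored_right_likelihood` evaluate through the numerically stable `normal_lcdf` / `normal_lccdf` helpers defined above.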
In [8]:
with pm.Model() as unimputed_censored_model:
    mu = pm.Normal('mu', mu=0., sd=(high - low) / 2.)
    sigma = pm.HalfNormal('sigma', sd=(high - low) / 2.)
    observed = pm.Normal(
        'observed',
        mu=mu,
        sd=sigma,
        observed=truncated,
    )
    # Add the censored-data terms directly to the model's log-probability.
    right_censored = pm.Potential(
        'right_censored',
        censored_right_likelihood(mu, sigma, n_right_censored, high)
    )
    left_censored = pm.Potential(
        'left_censored',
        censored_left_likelihood(mu, sigma, n_left_censored, low)
    )
In [12]:
n = 1500  # draws per model; reduce for a quick test run

print('Model with no censored data (omniscient)')
with omniscient_model:
    trace = pm.sample(n, njobs=8)
    pm.plot_posterior(trace[-1000:], varnames=['mu', 'sigma'])

print('Imputed censored model')
with imputed_censored_model:
    trace = pm.sample(n, njobs=8)
    pm.plot_posterior(trace[-1000:], varnames=['mu', 'sigma'])

print('Unimputed censored model')
with unimputed_censored_model:
    trace = pm.sample(n, njobs=8)
    pm.plot_posterior(trace[-1000:], varnames=['mu', 'sigma'])
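Beyond the posterior plots, the fits can be compared numerically against the true values used to generate the data ($\mu = 13$, $\sigma = 5$). A quick sketch using `pm.summary` on the last trace sampled above (the unimputed model):
In [ ]:
# Posterior summary for the most recently sampled model; the data were
# generated with mu = 13 and sigma = 5.
pm.summary(trace[-1000:], varnames=['mu', 'sigma'])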