Inference Data Cookbook

InferenceData is the central data format for ArviZ. An InferenceData object is itself just a container that maintains references to one or more xarray.Dataset objects. See the InferenceData structure specification for the layout of its groups, and the xarray documentation for more on xarray. Below are various ways to generate an InferenceData object.


In [1]:
import arviz as az
import numpy as np
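
Each group of an InferenceData object is an xarray.Dataset, so the results of all the recipes below support plain xarray indexing and selection. A minimal sketch (ArviZ stores an unnamed array under the default variable name x):

In [ ]:
idata = az.convert_to_inference_data(np.random.randn(4, 100))  # 4 chains, 100 draws
idata.posterior["x"].sel(chain=0).mean()  # ordinary xarray selection and reduction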

From a 1D NumPy array


In [2]:
size = 100
dataset = az.convert_to_inference_data(np.random.randn(size))
print(dataset)
dataset.posterior


Inference data with groups:
	> posterior
Out[2]:
<xarray.Dataset>
Dimensions:  (chain: 1, draw: 100)
Coordinates:
  * chain    (chain) int64 0
  * draw     (draw) int64 0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
Data variables:
    x        (chain, draw) float64 0.2265 -0.8065 -0.6446 ... 0.78 -0.9366
Attributes:
    created_at:     2020-03-02T21:01:27.898475
    arviz_version:  0.6.1
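
A one-dimensional array is interpreted as a single chain of draws, which is why the chain dimension above has length one. The converted object works directly with ArviZ's stats functions, for example (output omitted):

In [ ]:
az.summary(dataset)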

From an ND NumPy array


In [3]:
shape = (1, 2, 3, 4, 5)
dataset = az.convert_to_inference_data(np.random.randn(*shape))
print(dataset)
dataset.posterior


Inference data with groups:
	> posterior
Out[3]:
<xarray.Dataset>
Dimensions:  (chain: 1, draw: 2, x_dim_0: 3, x_dim_1: 4, x_dim_2: 5)
Coordinates:
  * chain    (chain) int64 0
  * draw     (draw) int64 0 1
  * x_dim_0  (x_dim_0) int64 0 1 2
  * x_dim_1  (x_dim_1) int64 0 1 2 3
  * x_dim_2  (x_dim_2) int64 0 1 2 3 4
Data variables:
    x        (chain, draw, x_dim_0, x_dim_1, x_dim_2) float64 -0.8216 ... 0.2341
Attributes:
    created_at:     2020-03-02T21:01:28.003339
    arviz_version:  0.6.1
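
For arrays with more than two dimensions, the first two axes are taken as (chain, draw) and the remaining axes get auto-generated dimension names such as x_dim_0. If more meaningful names are wanted, the dimensions can be renamed afterwards with plain xarray; a sketch with hypothetical names:

In [ ]:
dataset.posterior.rename({"x_dim_0": "group", "x_dim_1": "row", "x_dim_2": "col"})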

From a dictionary


In [4]:
datadict = {
    'a': np.random.randn(100),
    'b': np.random.randn(1, 100, 10),
    'c': np.random.randn(1, 100, 3, 4),
}
dataset = az.convert_to_inference_data(datadict)
print(dataset)
dataset.posterior


Inference data with groups:
	> posterior
Out[4]:
<xarray.Dataset>
Dimensions:  (b_dim_0: 10, c_dim_0: 3, c_dim_1: 4, chain: 1, draw: 100)
Coordinates:
  * chain    (chain) int64 0
  * draw     (draw) int64 0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
  * b_dim_0  (b_dim_0) int64 0 1 2 3 4 5 6 7 8 9
  * c_dim_0  (c_dim_0) int64 0 1 2
  * c_dim_1  (c_dim_1) int64 0 1 2 3
Data variables:
    a        (chain, draw) float64 -0.2659 -0.1299 -0.8517 ... 0.9115 -1.754
    b        (chain, draw, b_dim_0) float64 -1.818 0.6832 ... 0.7159 2.098
    c        (chain, draw, c_dim_0, c_dim_1) float64 1.122 -0.4645 ... 1.353
Attributes:
    created_at:     2020-03-02T21:01:28.098109
    arviz_version:  0.6.1
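
convert_to_inference_data puts a plain dictionary into the posterior group. To target other groups explicitly, az.from_dict is the more flexible counterpart; a sketch reusing the same dictionary, with hypothetical prior draws for a:

In [ ]:
idata = az.from_dict(
    posterior=datadict,
    prior={'a': np.random.randn(1, 100)},  # hypothetical prior samples
)
idata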

From a dictionary with coords and dims


In [5]:
datadict = {
    'a': np.random.randn(100),
    'b': np.random.randn(1, 100, 10),
    'c': np.random.randn(1, 100, 3, 4),
}
coords = {
    'c1' : np.arange(3), 
    'c2' : np.arange(4), 
    'b1' : np.arange(10)
}
dims = {
    'b' : ['b1'], 
    'c' : ['c1', 'c2']
}

dataset = az.convert_to_inference_data(
    datadict,
    coords=coords,
    dims=dims
)
print(dataset)
dataset.posterior


Inference data with groups:
	> posterior
Out[5]:
<xarray.Dataset>
Dimensions:  (b1: 10, c1: 3, c2: 4, chain: 1, draw: 100)
Coordinates:
  * chain    (chain) int64 0
  * draw     (draw) int64 0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
  * b1       (b1) int64 0 1 2 3 4 5 6 7 8 9
  * c1       (c1) int64 0 1 2
  * c2       (c2) int64 0 1 2 3
Data variables:
    a        (chain, draw) float64 1.07 0.4023 0.1068 ... -1.478 0.5839 -1.104
    b        (chain, draw, b1) float64 -1.361 0.02025 -1.963 ... 0.9772 1.228
    c        (chain, draw, c1, c2) float64 1.122 -0.4066 ... -1.219 -0.7475
Attributes:
    created_at:     2020-03-02T21:01:28.281693
    arviz_version:  0.6.1
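
With named dimensions and coordinate values attached, selection becomes label based rather than positional, for example:

In [ ]:
dataset.posterior['c'].sel(c1=0, c2=[0, 1])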

From PyMC3


In [6]:
import pymc3 as pm
draws = 500
chains = 2

eight_school_data = {
    'J': 8,
    'y': np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
    'sigma': np.array([15., 10., 16., 11., 9., 11., 10., 18.])
}

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0, sd=5)
    tau = pm.HalfCauchy('tau', beta=5)
    theta_tilde = pm.Normal('theta_tilde', mu=0, sd=1, shape=eight_school_data['J'])
    theta = pm.Deterministic('theta', mu + tau * theta_tilde)
    pm.Normal('obs', mu=theta, sd=eight_school_data['sigma'], observed=eight_school_data['y'])
    
    trace = pm.sample(draws, chains=chains)
    prior = pm.sample_prior_predictive()
    posterior_predictive = pm.sample_posterior_predictive(trace)
    
    pm_data = az.from_pymc3(
            trace=trace,
            prior=prior,
            posterior_predictive=posterior_predictive,
            coords={'school': np.arange(eight_school_data['J'])},
            dims={'theta': ['school'], 'theta_tilde': ['school']},
        )
pm_data


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 4 jobs)
NUTS: [theta_tilde, tau, mu]
100.00% [2000/2000 00:00<00:00 Sampling 2 chains, 2 divergences]
There was 1 divergence after tuning. Increase `target_accept` or reparameterize.
There was 1 divergence after tuning. Increase `target_accept` or reparameterize.
100.00% [1000/1000 00:00<00:00]
Out[6]:
Inference data with groups:
	> posterior
	> sample_stats
	> log_likelihood
	> posterior_predictive
	> observed_data
	> prior
	> prior_predictive
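
Any InferenceData object can be written to netCDF and restored later, which is the usual way to persist and share results between sessions (the file name below is arbitrary):

In [ ]:
pm_data.to_netcdf('eight_school_pymc3.nc')
pm_data_restored = az.from_netcdf('eight_school_pymc3.nc')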

From PyStan


In [7]:
import pystan

schools_code = """
data {
    int<lower=0> J;
    real y[J];
    real<lower=0> sigma[J];
}

parameters {
    real mu;
    real<lower=0> tau;
    real theta_tilde[J];
}

transformed parameters {
    real theta[J];
    for (j in 1:J)
        theta[j] = mu + tau * theta_tilde[j];
}

model {
    mu ~ normal(0, 5);
    tau ~ cauchy(0, 5);
    theta_tilde ~ normal(0, 1);
    y ~ normal(theta, sigma);
}

generated quantities {
    vector[J] log_lik;
    vector[J] y_hat;
    for (j in 1:J) {
        log_lik[j] = normal_lpdf(y[j] | theta[j], sigma[j]);
        y_hat[j] = normal_rng(theta[j], sigma[j]);
    }
}
"""

eight_school_data = {
    'J': 8,
    'y': np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
    'sigma': np.array([15., 10., 16., 11., 9., 11., 10., 18.])
}

stan_model = pystan.StanModel(model_code=schools_code)
fit = stan_model.sampling(data=eight_school_data, control={"adapt_delta" : 0.9})

stan_data = az.from_pystan(
    posterior=fit,
    posterior_predictive='y_hat',
    observed_data=['y'],
    log_likelihood={'y': 'log_lik'},
    coords={'school': np.arange(eight_school_data['J'])},
    dims={
        'theta': ['school'],
        'y': ['school'],
        'log_lik': ['school'],
        'y_hat': ['school'],
        'theta_tilde': ['school']
    }
)

stan_data


INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_9d743830ec4a29fb58eb4660d4b9417f NOW.
Out[7]:
Inference data with groups:
	> posterior
	> sample_stats
	> log_likelihood
	> posterior_predictive
	> observed_data
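
The log_likelihood argument maps the observed variable name to the generated quantity that holds its pointwise log likelihood; storing it this way is what ArviZ's model comparison utilities rely on, for example (output omitted):

In [ ]:
az.loo(stan_data)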

From Pyro


In [8]:
import torch 

import pyro
import pyro.distributions as dist
from pyro.infer import MCMC, NUTS, Predictive

pyro.enable_validation(True)
pyro.set_rng_seed(0)

draws = 500
chains = 2
eight_school_data = {
    'J': 8,
    'y': torch.tensor([28., 8., -3., 7., -1., 1., 18., 12.]),
    'sigma': torch.tensor([15., 10., 16., 11., 9., 11., 10., 18.])
}

def model(J, sigma, y=None):
    mu = pyro.sample('mu', dist.Normal(0, 5))
    tau = pyro.sample('tau', dist.HalfCauchy(5))
    with pyro.plate('J', J):
        theta_tilde = pyro.sample('theta_tilde', dist.Normal(0, 1))
        theta = mu + tau * theta_tilde
        return pyro.sample("obs", dist.Normal(theta, sigma), obs=y)

nuts_kernel = NUTS(model, jit_compile=True, ignore_jit_warnings=True)
mcmc = MCMC(nuts_kernel, num_samples=draws, warmup_steps=draws,
            num_chains=chains, disable_progbar=True)
mcmc.run(**eight_school_data)
posterior_samples = mcmc.get_samples()
posterior_predictive = Predictive(model, posterior_samples).get_samples(
    eight_school_data['J'], eight_school_data['sigma']
)
prior = Predictive(model, num_samples=500).get_samples(
    eight_school_data['J'], eight_school_data['sigma']
)

pyro_data = az.from_pyro(
    mcmc, 
    prior=prior, 
    posterior_predictive=posterior_predictive,
    coords={'school': np.arange(eight_school_data['J'])},
    dims={'theta': ['school']}
)
pyro_data


Out[8]:
Inference data with groups:
	> posterior
	> sample_stats
	> log_likelihood
	> posterior_predictive
	> observed_data
	> prior
	> prior_predictive
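
As with every converter, the result plugs straight into ArviZ's downstream functions, for instance (output omitted):

In [ ]:
az.summary(pyro_data, var_names=['mu', 'tau'])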

From emcee


In [9]:
import emcee

eight_school_data = {
    'J': 8,
    'y': np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
    'sigma': np.array([15., 10., 16., 11., 9., 11., 10., 18.])
}

def log_prior_8school(theta, J):
    mu = theta[0]
    tau = theta[1]
    eta = theta[2:]
    # half-Cauchy prior on tau: zero density for negative values
    if tau < 0:
        return -np.inf
    hwhm = 25
    prior_tau = -np.log(tau**2 + hwhm**2)
    prior_mu = -(mu / 10)**2  # normal prior, loc=0, scale=10
    prior_eta = -np.sum(eta**2)  # normal prior, loc=0, scale=1
    return prior_mu + prior_tau + prior_eta

def log_likelihood_8school(theta, y, sigma):
    mu = theta[0]
    tau = theta[1]
    eta = theta[2:]
    return -np.sum(((mu + tau * eta - y) / sigma)**2)

def lnprob_8school(theta, J, y, sigma):
    prior = log_prior_8school(theta, J)
    if not np.isfinite(prior):
        return -np.inf
    like = log_likelihood_8school(theta, y, sigma)
    return like + prior

nwalkers = 40
ndim = eight_school_data['J'] + 2
draws = 1500
pos = np.random.normal(size=(nwalkers, ndim))
pos[:, 1] = np.absolute(pos[:, 1])  # tau must start non-negative
sampler = emcee.EnsembleSampler(
    nwalkers, 
    ndim, 
    lnprob_8school, 
    args=(
        eight_school_data['J'], 
        eight_school_data['y'], 
        eight_school_data['sigma']
    )
)
sampler.run_mcmc(pos, draws)

# define variable names; they cannot be inferred from emcee
var_names = ['mu', 'tau'] + ['eta{}'.format(i) for i in range(eight_school_data['J'])]
emcee_data = az.from_emcee(sampler, var_names=var_names)
emcee_data


Out[9]:
Inference data with groups:
	> posterior
	> sample_stats
	> observed_data
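
from_emcee maps each walker to one chain, so the posterior here has 40 chains of 1500 draws each; a quick check:

In [ ]:
emcee_data.posterior.dims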

From CmdStanPy


In [10]:
from cmdstanpy import CmdStanModel

schools_code = """
data {
    int<lower=0> J;
    real y[J];
    real<lower=0> sigma[J];
}

parameters {
    real mu;
    real<lower=0> tau;
    real theta_tilde[J];
}

transformed parameters {
    real theta[J];
    for (j in 1:J)
        theta[j] = mu + tau * theta_tilde[j];
}

model {
    mu ~ normal(0, 5);
    tau ~ cauchy(0, 5);
    theta_tilde ~ normal(0, 1);
    y ~ normal(theta, sigma);
}

generated quantities {
    vector[J] log_lik;
    vector[J] y_hat;
    for (j in 1:J) {
        log_lik[j] = normal_lpdf(y[j] | theta[j], sigma[j]);
        y_hat[j] = normal_rng(theta[j], sigma[j]);
    }
}
"""

with open("./eight_school.stan", "w") as f:
    print(schools_code, file=f)

stan_file = "./eight_school.stan"
stan_model = CmdStanModel(stan_file=stan_file)
stan_model.compile()

eight_school_data = {'J': 8,
                     'y': np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
                     'sigma': np.array([15., 10., 16., 11., 9., 11., 10., 18.])
                    }

stan_fit = stan_model.sample(data=eight_school_data)

cmdstanpy_data = az.from_cmdstanpy(
    posterior=stan_fit,
    posterior_predictive='y_hat',
    observed_data={'y' : eight_school_data["y"]},
    log_likelihood='log_lik',
    coords={'school': np.arange(eight_school_data['J'])},
    dims={
        'theta': ['school'],
        'y': ['school'],
        'log_lik': ['school'],
        'y_hat': ['school'],
        'theta_tilde': ['school']
        }
)
cmdstanpy_data


INFO:cmdstanpy:compiling stan program, exe file: /home/oriol/Public/arviz/doc/notebooks/eight_school
INFO:cmdstanpy:compiler options: CompilerOptions(stanc_options=None, cpp_options=None)
INFO:cmdstanpy:compiled model file: /home/oriol/Public/arviz/doc/notebooks/eight_school
INFO:cmdstanpy:found newer exe file, not recompiling
INFO:cmdstanpy:compiled model file: /home/oriol/Public/arviz/doc/notebooks/eight_school
INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:start chain 2
INFO:cmdstanpy:start chain 3
INFO:cmdstanpy:start chain 4
INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:finish chain 2
INFO:cmdstanpy:finish chain 3
INFO:cmdstanpy:finish chain 4
Out[10]:
Inference data with groups:
	> posterior
	> sample_stats
	> log_likelihood
	> posterior_predictive
	> observed_data
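
The CmdStanPy result behaves like any other InferenceData object; for a quick visual check of the fit (plot omitted):

In [ ]:
az.plot_trace(cmdstanpy_data, var_names=['mu', 'tau'])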

From CmdStan

See from_cmdstan for details.

CmdStan helpers


In [11]:
# save the CSV files for the CmdStan example (requires the CmdStanPy run above)
stan_fit.save_csvfiles(dir="sample_data")

In [12]:
schools_code = """
data {
    int<lower=0> J;
    real y[J];
    real<lower=0> sigma[J];
}

parameters {
    real mu;
    real<lower=0> tau;
    real theta_tilde[J];
}

transformed parameters {
    real theta[J];
    for (j in 1:J)
        theta[j] = mu + tau * theta_tilde[j];
}

model {
    mu ~ normal(0, 5);
    tau ~ cauchy(0, 5);
    theta_tilde ~ normal(0, 1);
    y ~ normal(theta, sigma);
}

generated quantities {
    vector[J] log_lik;
    vector[J] y_hat;
    for (j in 1:J) {
        log_lik[j] = normal_lpdf(y[j] | theta[j], sigma[j]);
        y_hat[j] = normal_rng(theta[j], sigma[j]);
    }
}
"""

with open("./eight_school.stan", "w") as f:
    print(schools_code, file=f)

In [13]:
eight_school_data = {
    'J': 8,
    'y': np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
    'sigma': np.array([15., 10., 16., 11., 9., 11., 10., 18.])
}

In [14]:
import pystan
pystan.stan_rdump(eight_school_data, "./eight_school.data.R")

In [15]:
# Bash shell
#
# $ cd cmdstan
# $ make build
# $ make path/to/eight_school
# $ cd path/to
# $ for i in {1..4}
#   do
#     ./eight_school sample random seed=12345 \
#       id=$i data file=eight_school.data.R \
#       output file=sample_data/eight_school_samples-$i.csv &
#   done
# $

In [16]:
# Use the .stan and .csv files created and saved by the CmdStanPy run above

# glob string
posterior_glob = "sample_data/eight_school-*-[0-9].csv"
# list of paths
# posterior_list =  [
#     "sample_data/eight_school-*-1.csv",
#     "sample_data/eight_school-*-2.csv",
#     "sample_data/eight_school-*-3.csv",
#     "sample_data/eight_school-*-4.csv",
# ]

obs_data_path = "./eight_school.data.R"

cmdstan_data = az.from_cmdstan(
    posterior=posterior_glob,
    posterior_predictive='y_hat',
    observed_data=obs_data_path,
    observed_data_var="y",
    log_likelihood='log_lik',
    coords={'school': np.arange(eight_school_data['J'])},
    dims={'theta': ['school'],
          'y': ['school'],
          'log_lik': ['school'],
          'y_hat': ['school'],
          'theta_tilde': ['school']
         }
)
cmdstan_data


arviz.data.io_cmdstan - INFO - glob found 4 files for 'posterior':
1: sample_data/eight_school-202003022202-1.csv
2: sample_data/eight_school-202003022202-2.csv
3: sample_data/eight_school-202003022202-3.csv
4: sample_data/eight_school-202003022202-4.csv
Out[16]:
Inference data with groups:
	> posterior
	> sample_stats
	> log_likelihood
	> posterior_predictive
	> observed_data
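
Because both Stan fits above store pointwise log likelihood, they can be compared directly; a sketch assuming the PyStan fit stan_data from earlier is still in memory:

In [ ]:
az.compare({'pystan': stan_data, 'cmdstan': cmdstan_data})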

From NumPyro


In [17]:
from jax.random import PRNGKey

import numpyro
import numpyro.distributions as dist
from numpyro.distributions.transforms import AffineTransform
from numpyro.infer import MCMC, NUTS, Predictive

numpyro.set_host_device_count(4)

eight_school_data = {
    'J': 8,
    'y': np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
    'sigma': np.array([15., 10., 16., 11., 9., 11., 10., 18.])
}

def model(J, sigma, y=None):
    mu = numpyro.sample('mu', dist.Normal(0, 5))
    tau = numpyro.sample('tau', dist.HalfCauchy(5))
    # use non-centered reparameterization
    theta = numpyro.sample('theta', dist.TransformedDistribution(
        dist.Normal(np.zeros(J), 1), AffineTransform(mu, tau)))
    numpyro.sample('y', dist.Normal(theta, sigma), obs=y)

kernel = NUTS(model)
mcmc = MCMC(kernel, num_warmup=500, num_samples=500, num_chains=4, chain_method='parallel')
mcmc.run(PRNGKey(0), **eight_school_data, extra_fields=['num_steps', 'energy'])
posterior_samples = mcmc.get_samples()
posterior_predictive = Predictive(model, posterior_samples).get_samples(
    PRNGKey(1), eight_school_data['J'], eight_school_data['sigma']
)
prior = Predictive(model, num_samples=500).get_samples(
    PRNGKey(2), eight_school_data['J'], eight_school_data['sigma']
)

numpyro_data = az.from_numpyro(
    mcmc, 
    prior=prior, 
    posterior_predictive=posterior_predictive,
    coords={'school': np.arange(eight_school_data['J'])},
    dims={'theta': ['school']}
)
numpyro_data


Out[17]:
Inference data with groups:
	> posterior
	> sample_stats
	> log_likelihood
	> posterior_predictive
	> observed_data
	> prior
	> prior_predictive
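
Since every converter yields the same structure, objects from different backends can be passed to the same downstream calls; for instance, a joint forest plot (a sketch assuming the earlier PyMC3 and Pyro fits are still in memory):

In [ ]:
az.plot_forest([pm_data, pyro_data, numpyro_data], var_names=['mu', 'tau'])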