Inference Data Cookbook

InferenceData is the central data format for ArviZ. An InferenceData object is itself just a container that maintains references to one or more xarray.Dataset objects. See the InferenceData structure specification for the layout of its groups, and the xarray documentation for more on xarray. Below are various ways to generate an InferenceData object.


In [1]:
import arviz as az
import numpy as np
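
Each group of an InferenceData object is an xarray.Dataset, so the results of all the recipes below support plain xarray indexing and selection. A minimal sketch (ArviZ stores an unnamed array under the default variable name x):

In [ ]:
idata = az.convert_to_inference_data(np.random.randn(4, 100))  # 4 chains, 100 draws
idata.posterior["x"].sel(chain=0).mean()  # ordinary xarray selection and reduction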

From a 1D NumPy array


In [2]:
size = 100
dataset = az.convert_to_inference_data(np.random.randn(size))
print(dataset)
dataset.posterior


Inference data with groups:
	> posterior
Out[2]:
<xarray.Dataset>
Dimensions:  (chain: 1, draw: 100)
Coordinates:
  * chain    (chain) int64 0
  * draw     (draw) int64 0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
Data variables:
    x        (chain, draw) float64 0.2265 -0.8065 -0.6446 ... 0.78 -0.9366
Attributes:
    created_at:     2020-03-02T21:01:27.898475
    arviz_version:  0.6.1
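
A one-dimensional array is interpreted as a single chain of draws, which is why the chain dimension above has length one. The converted object works directly with ArviZ's stats functions, for example (output omitted):

In [ ]:
az.summary(dataset)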

From an ND NumPy array


In [3]:
shape = (1, 2, 3, 4, 5)
dataset = az.convert_to_inference_data(np.random.randn(*shape))
print(dataset)
dataset.posterior


Inference data with groups:
	> posterior
Out[3]:
<xarray.Dataset>
Dimensions:  (chain: 1, draw: 2, x_dim_0: 3, x_dim_1: 4, x_dim_2: 5)
Coordinates:
  * chain    (chain) int64 0
  * draw     (draw) int64 0 1
  * x_dim_0  (x_dim_0) int64 0 1 2
  * x_dim_1  (x_dim_1) int64 0 1 2 3
  * x_dim_2  (x_dim_2) int64 0 1 2 3 4
Data variables:
    x        (chain, draw, x_dim_0, x_dim_1, x_dim_2) float64 -0.8216 ... 0.2341
Attributes:
    created_at:     2020-03-02T21:01:28.003339
    arviz_version:  0.6.1
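
For arrays with more than two dimensions, the first two axes are taken as (chain, draw) and the remaining axes get auto-generated dimension names such as x_dim_0. If more meaningful names are wanted, the dimensions can be renamed afterwards with plain xarray; a sketch with hypothetical names:

In [ ]:
dataset.posterior.rename({"x_dim_0": "group", "x_dim_1": "row", "x_dim_2": "col"})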

From a dictionary


In [4]:
datadict = {
    'a': np.random.randn(100),
    'b': np.random.randn(1, 100, 10),
    'c': np.random.randn(1, 100, 3, 4),
}
dataset = az.convert_to_inference_data(datadict)
print(dataset)
dataset.posterior


Inference data with groups:
	> posterior
Out[4]:
<xarray.Dataset>
Dimensions:  (b_dim_0: 10, c_dim_0: 3, c_dim_1: 4, chain: 1, draw: 100)
Coordinates:
  * chain    (chain) int64 0
  * draw     (draw) int64 0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
  * b_dim_0  (b_dim_0) int64 0 1 2 3 4 5 6 7 8 9
  * c_dim_0  (c_dim_0) int64 0 1 2
  * c_dim_1  (c_dim_1) int64 0 1 2 3
Data variables:
    a        (chain, draw) float64 -0.2659 -0.1299 -0.8517 ... 0.9115 -1.754
    b        (chain, draw, b_dim_0) float64 -1.818 0.6832 ... 0.7159 2.098
    c        (chain, draw, c_dim_0, c_dim_1) float64 1.122 -0.4645 ... 1.353
Attributes:
    created_at:     2020-03-02T21:01:28.098109
    arviz_version:  0.6.1
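
convert_to_inference_data puts a plain dictionary into the posterior group. To target other groups explicitly, az.from_dict is the more flexible counterpart; a sketch reusing the same dictionary, with hypothetical prior draws for a:

In [ ]:
idata = az.from_dict(
    posterior=datadict,
    prior={'a': np.random.randn(1, 100)},  # hypothetical prior samples
)
idata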

From a dictionary with coords and dims


In [5]:
datadict = {
    'a': np.random.randn(100),
    'b': np.random.randn(1, 100, 10),
    'c': np.random.randn(1, 100, 3, 4),
}
coords = {
    'c1' : np.arange(3), 
    'c2' : np.arange(4), 
    'b1' : np.arange(10)
}
dims = {
    'b' : ['b1'], 
    'c' : ['c1', 'c2']
}

dataset = az.convert_to_inference_data(
    datadict,
    coords=coords,
    dims=dims
)
print(dataset)
dataset.posterior


Inference data with groups:
	> posterior
Out[5]:
<xarray.Dataset>
Dimensions:  (b1: 10, c1: 3, c2: 4, chain: 1, draw: 100)
Coordinates:
  * chain    (chain) int64 0
  * draw     (draw) int64 0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
  * b1       (b1) int64 0 1 2 3 4 5 6 7 8 9
  * c1       (c1) int64 0 1 2
  * c2       (c2) int64 0 1 2 3
Data variables:
    a        (chain, draw) float64 1.07 0.4023 0.1068 ... -1.478 0.5839 -1.104
    b        (chain, draw, b1) float64 -1.361 0.02025 -1.963 ... 0.9772 1.228
    c        (chain, draw, c1, c2) float64 1.122 -0.4066 ... -1.219 -0.7475
Attributes:
    created_at:     2020-03-02T21:01:28.281693
    arviz_version:  0.6.1
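
With named dimensions and coordinate values attached, selection becomes label based rather than positional, for example:

In [ ]:
dataset.posterior['c'].sel(c1=0, c2=[0, 1])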

From PyMC3


In [6]:
import pymc3 as pm
draws = 500
chains = 2

eight_school_data = {
    'J': 8,
    'y': np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
    'sigma': np.array([15., 10., 16., 11., 9., 11., 10., 18.])
}

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0, sd=5)
    tau = pm.HalfCauchy('tau', beta=5)
    theta_tilde = pm.Normal('theta_tilde', mu=0, sd=1, shape=eight_school_data['J'])
    theta = pm.Deterministic('theta', mu + tau * theta_tilde)
    pm.Normal('obs', mu=theta, sd=eight_school_data['sigma'], observed=eight_school_data['y'])
    
    trace = pm.sample(draws, chains=chains)
    prior = pm.sample_prior_predictive()
    posterior_predictive = pm.sample_posterior_predictive(trace)
    
    pm_data = az.from_pymc3(
            trace=trace,
            prior=prior,
            posterior_predictive=posterior_predictive,
            coords={'school': np.arange(eight_school_data['J'])},
            dims={'theta': ['school'], 'theta_tilde': ['school']},
        )
pm_data


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 4 jobs)
NUTS: [theta_tilde, tau, mu]
100.00% [2000/2000 00:00<00:00 Sampling 2 chains, 2 divergences]
There was 1 divergence after tuning. Increase `target_accept` or reparameterize.
There was 1 divergence after tuning. Increase `target_accept` or reparameterize.
100.00% [1000/1000 00:00<00:00]
Out[6]:
Inference data with groups:
	> posterior
	> sample_stats
	> log_likelihood
	> posterior_predictive
	> observed_data
	> prior
	> prior_predictive
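
Any InferenceData object can be written to netCDF and restored later, which is the usual way to persist and share results between sessions (the file name below is arbitrary):

In [ ]:
pm_data.to_netcdf('eight_school_pymc3.nc')
pm_data_restored = az.from_netcdf('eight_school_pymc3.nc')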

From PyStan


In [7]:
import pystan

schools_code = """
data {
    int<lower=0> J;
    real y[J];
    real<lower=0> sigma[J];
}

parameters {
    real mu;
    real<lower=0> tau;
    real theta_tilde[J];
}

transformed parameters {
    real theta[J];
    for (j in 1:J)
        theta[j] = mu + tau * theta_tilde[j];
}

model {
    mu ~ normal(0, 5);
    tau ~ cauchy(0, 5);
    theta_tilde ~ normal(0, 1);
    y ~ normal(theta, sigma);
}

generated quantities {
    vector[J] log_lik;
    vector[J] y_hat;
    for (j in 1:J) {
        log_lik[j] = normal_lpdf(y[j] | theta[j], sigma[j]);
        y_hat[j] = normal_rng(theta[j], sigma[j]);
    }
}
"""

eight_school_data = {
    'J': 8,
    'y': np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
    'sigma': np.array([15., 10., 16., 11., 9., 11., 10., 18.])
}

stan_model = pystan.StanModel(model_code=schools_code)
fit = stan_model.sampling(data=eight_school_data, control={"adapt_delta" : 0.9})

stan_data = az.from_pystan(
    posterior=fit,
    posterior_predictive='y_hat',
    observed_data=['y'],
    log_likelihood={'y': 'log_lik'},
    coords={'school': np.arange(eight_school_data['J'])},
    dims={
        'theta': ['school'],
        'y': ['school'],
        'log_lik': ['school'],
        'y_hat': ['school'],
        'theta_tilde': ['school']
    }
)

stan_data


INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_9d743830ec4a29fb58eb4660d4b9417f NOW.
Out[7]:
Inference data with groups:
	> posterior
	> sample_stats
	> log_likelihood
	> posterior_predictive
	> observed_data
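
The log_likelihood argument maps the observed variable name to the generated quantity that holds its pointwise log likelihood; storing it this way is what ArviZ's model comparison utilities rely on, for example (output omitted):

In [ ]:
az.loo(stan_data)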

From Pyro


In [8]:
import torch 

import pyro
import pyro.distributions as dist
from pyro.infer import MCMC, NUTS, Predictive

pyro.enable_validation(True)
pyro.set_rng_seed(0)

draws = 500
chains = 2
eight_school_data = {
    'J': 8,
    'y': torch.tensor([28., 8., -3., 7., -1., 1., 18., 12.]),
    'sigma': torch.tensor([15., 10., 16., 11., 9., 11., 10., 18.])
}

def model(J, sigma, y=None):
    mu = pyro.sample('mu', dist.Normal(0, 5))
    tau = pyro.sample('tau', dist.HalfCauchy(5))
    with pyro.plate('J', J):
        theta_tilde = pyro.sample('theta_tilde', dist.Normal(0, 1))
        theta = mu + tau * theta_tilde
        return pyro.sample("obs", dist.Normal(theta, sigma), obs=y)

nuts_kernel = NUTS(model, jit_compile=True, ignore_jit_warnings=True)
mcmc = MCMC(nuts_kernel, num_samples=draws, warmup_steps=draws,
            num_chains=chains, disable_progbar=True)
mcmc.run(**eight_school_data)
posterior_samples = mcmc.get_samples()
posterior_predictive = Predictive(model, posterior_samples).get_samples(
    eight_school_data['J'], eight_school_data['sigma']
)
prior = Predictive(model, num_samples=500).get_samples(
    eight_school_data['J'], eight_school_data['sigma']
)

pyro_data = az.from_pyro(
    mcmc, 
    prior=prior, 
    posterior_predictive=posterior_predictive,
    coords={'school': np.arange(eight_school_data['J'])},
    dims={'theta': ['school']}
)
pyro_data


Out[8]:
Inference data with groups:
	> posterior
	> sample_stats
	> log_likelihood
	> posterior_predictive
	> observed_data
	> prior
	> prior_predictive
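
As with every converter, the result plugs straight into ArviZ's downstream functions, for instance (output omitted):

In [ ]:
az.summary(pyro_data, var_names=['mu', 'tau'])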

From emcee


In [9]:
import emcee

eight_school_data = {
    'J': 8,
    'y': np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
    'sigma': np.array([15., 10., 16., 11., 9., 11., 10., 18.])
}

def log_prior_8school(theta, J):
    mu = theta[0]
    tau = theta[1]
    eta = theta[2:]
    # half-Cauchy prior on tau: zero density for negative values
    if tau < 0:
        return -np.inf
    hwhm = 25
    prior_tau = -np.log(tau**2 + hwhm**2)
    prior_mu = -(mu / 10)**2  # normal prior, loc=0, scale=10
    prior_eta = -np.sum(eta**2)  # normal prior, loc=0, scale=1
    return prior_mu + prior_tau + prior_eta

def log_likelihood_8school(theta, y, sigma):
    mu = theta[0]
    tau = theta[1]
    eta = theta[2:]
    return -np.sum(((mu + tau * eta - y) / sigma)**2)

def lnprob_8school(theta, J, y, sigma):
    prior = log_prior_8school(theta, J)
    if not np.isfinite(prior):
        return -np.inf
    like = log_likelihood_8school(theta, y, sigma)
    return like + prior

nwalkers = 40
ndim = eight_school_data['J'] + 2
draws = 1500
pos = np.random.normal(size=(nwalkers, ndim))
pos[:, 1] = np.absolute(pos[:, 1])  # tau must start non-negative
sampler = emcee.EnsembleSampler(
    nwalkers, 
    ndim, 
    lnprob_8school, 
    args=(
        eight_school_data['J'], 
        eight_school_data['y'], 
        eight_school_data['sigma']
    )
)
sampler.run_mcmc(pos, draws)

# define variable names; they cannot be inferred from emcee
var_names = ['mu', 'tau'] + ['eta{}'.format(i) for i in range(eight_school_data['J'])]
emcee_data = az.from_emcee(sampler, var_names=var_names)
emcee_data


Out[9]:
Inference data with groups:
	> posterior
	> sample_stats
	> observed_data
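
from_emcee maps each walker to one chain, so the posterior here has 40 chains of 1500 draws each; a quick check:

In [ ]:
emcee_data.posterior.dims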

From CmdStanPy


In [10]:
from cmdstanpy import CmdStanModel

schools_code = """
data {
    int<lower=0> J;
    real y[J];
    real<lower=0> sigma[J];
}

parameters {
    real mu;
    real<lower=0> tau;
    real theta_tilde[J];
}

transformed parameters {
    real theta[J];
    for (j in 1:J)
        theta[j] = mu + tau * theta_tilde[j];
}

model {
    mu ~ normal(0, 5);
    tau ~ cauchy(0, 5);
    theta_tilde ~ normal(0, 1);
    y ~ normal(theta, sigma);
}

generated quantities {
    vector[J] log_lik;
    vector[J] y_hat;
    for (j in 1:J) {
        log_lik[j] = normal_lpdf(y[j] | theta[j], sigma[j]);
        y_hat[j] = normal_rng(theta[j], sigma[j]);
    }
}
"""

with open("./eight_school.stan", "w") as f:
    print(schools_code, file=f)

stan_file = "./eight_school.stan"
stan_model = CmdStanModel(stan_file=stan_file)
stan_model.compile()

eight_school_data = {'J': 8,
                     'y': np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
                     'sigma': np.array([15., 10., 16., 11., 9., 11., 10., 18.])
                    }

stan_fit = stan_model.sample(data=eight_school_data)

cmdstanpy_data = az.from_cmdstanpy(
    posterior=stan_fit,
    posterior_predictive='y_hat',
    observed_data={'y' : eight_school_data["y"]},
    log_likelihood='log_lik',
    coords={'school': np.arange(eight_school_data['J'])},
    dims={
        'theta': ['school'],
        'y': ['school'],
        'log_lik': ['school'],
        'y_hat': ['school'],
        'theta_tilde': ['school']
        }
)
cmdstanpy_data


INFO:cmdstanpy:compiling stan program, exe file: /home/oriol/Public/arviz/doc/notebooks/eight_school
INFO:cmdstanpy:compiler options: CompilerOptions(stanc_options=None, cpp_options=None)
INFO:cmdstanpy:compiled model file: /home/oriol/Public/arviz/doc/notebooks/eight_school
INFO:cmdstanpy:found newer exe file, not recompiling
INFO:cmdstanpy:compiled model file: /home/oriol/Public/arviz/doc/notebooks/eight_school
INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:start chain 2
INFO:cmdstanpy:start chain 3
INFO:cmdstanpy:start chain 4
INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:finish chain 2
INFO:cmdstanpy:finish chain 3
INFO:cmdstanpy:finish chain 4
Out[10]:
Inference data with groups:
	> posterior
	> sample_stats
	> log_likelihood
	> posterior_predictive
	> observed_data
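
The CmdStanPy result behaves like any other InferenceData object; for a quick visual check of the fit (plot omitted):

In [ ]:
az.plot_trace(cmdstanpy_data, var_names=['mu', 'tau'])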

From CmdStan

See from_cmdstan for details.

CmdStan helpers


In [11]:
# save the CSV files for the CmdStan example (requires the CmdStanPy run above)
stan_fit.save_csvfiles(dir="sample_data")

In [12]:
schools_code = """
data {
    int<lower=0> J;
    real y[J];
    real<lower=0> sigma[J];
}

parameters {
    real mu;
    real<lower=0> tau;
    real theta_tilde[J];
}

transformed parameters {
    real theta[J];
    for (j in 1:J)
        theta[j] = mu + tau * theta_tilde[j];
}

model {
    mu ~ normal(0, 5);
    tau ~ cauchy(0, 5);
    theta_tilde ~ normal(0, 1);
    y ~ normal(theta, sigma);
}

generated quantities {
    vector[J] log_lik;
    vector[J] y_hat;
    for (j in 1:J) {
        log_lik[j] = normal_lpdf(y[j] | theta[j], sigma[j]);
        y_hat[j] = normal_rng(theta[j], sigma[j]);
    }
}
"""

with open("./eight_school.stan", "w") as f:
    print(schools_code, file=f)

In [13]:
eight_school_data = {
    'J': 8,
    'y': np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
    'sigma': np.array([15., 10., 16., 11., 9., 11., 10., 18.])
}

In [14]:
import pystan
pystan.stan_rdump(eight_school_data, "./eight_school.data.R")

In [15]:
# Bash shell
#
# $ cd cmdstan
# $ make build
# $ make path/to/eight_school
# $ cd path/to
# $ for i in {1..4}
#   do
#     ./eight_school sample random seed=12345 \
#       id=$i data file=eight_school.data.R \
#       output file=sample_data/eight_school_samples-$i.csv &
#   done
# $

In [16]:
# Use the .stan and .csv files created and saved by the CmdStanPy run above

# glob string
posterior_glob = "sample_data/eight_school-*-[0-9].csv"
# list of paths
# posterior_list =  [
#     "sample_data/eight_school-*-1.csv",
#     "sample_data/eight_school-*-2.csv",
#     "sample_data/eight_school-*-3.csv",
#     "sample_data/eight_school-*-4.csv",
# ]

obs_data_path = "./eight_school.data.R"

cmdstan_data = az.from_cmdstan(
    posterior=posterior_glob,
    posterior_predictive='y_hat',
    observed_data=obs_data_path,
    observed_data_var="y",
    log_likelihood='log_lik',
    coords={'school': np.arange(eight_school_data['J'])},
    dims={'theta': ['school'],
          'y': ['school'],
          'log_lik': ['school'],
          'y_hat': ['school'],
          'theta_tilde': ['school']
         }
)
cmdstan_data


arviz.data.io_cmdstan - INFO - glob found 4 files for 'posterior':
1: sample_data/eight_school-202003022202-1.csv
2: sample_data/eight_school-202003022202-2.csv
3: sample_data/eight_school-202003022202-3.csv
4: sample_data/eight_school-202003022202-4.csv
Out[16]:
Inference data with groups:
	> posterior
	> sample_stats
	> log_likelihood
	> posterior_predictive
	> observed_data
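
Because both Stan fits above store pointwise log likelihood, they can be compared directly; a sketch assuming the PyStan fit stan_data from earlier is still in memory:

In [ ]:
az.compare({'pystan': stan_data, 'cmdstan': cmdstan_data})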

From NumPyro


In [17]:
from jax.random import PRNGKey

import numpyro
import numpyro.distributions as dist
from numpyro.distributions.transforms import AffineTransform
from numpyro.infer import MCMC, NUTS, Predictive

numpyro.set_host_device_count(4)

eight_school_data = {
    'J': 8,
    'y': np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
    'sigma': np.array([15., 10., 16., 11., 9., 11., 10., 18.])
}

def model(J, sigma, y=None):
    mu = numpyro.sample('mu', dist.Normal(0, 5))
    tau = numpyro.sample('tau', dist.HalfCauchy(5))
    # use non-centered reparameterization
    theta = numpyro.sample('theta', dist.TransformedDistribution(
        dist.Normal(np.zeros(J), 1), AffineTransform(mu, tau)))
    numpyro.sample('y', dist.Normal(theta, sigma), obs=y)

kernel = NUTS(model)
mcmc = MCMC(kernel, num_warmup=500, num_samples=500, num_chains=4, chain_method='parallel')
mcmc.run(PRNGKey(0), **eight_school_data, extra_fields=['num_steps', 'energy'])
posterior_samples = mcmc.get_samples()
posterior_predictive = Predictive(model, posterior_samples).get_samples(
    PRNGKey(1), eight_school_data['J'], eight_school_data['sigma']
)
prior = Predictive(model, num_samples=500).get_samples(
    PRNGKey(2), eight_school_data['J'], eight_school_data['sigma']
)

numpyro_data = az.from_numpyro(
    mcmc, 
    prior=prior, 
    posterior_predictive=posterior_predictive,
    coords={'school': np.arange(eight_school_data['J'])},
    dims={'theta': ['school']}
)
numpyro_data


Out[17]:
Inference data with groups:
	> posterior
	> sample_stats
	> log_likelihood
	> posterior_predictive
	> observed_data
	> prior
	> prior_predictive
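
Since every converter yields the same structure, objects from different backends can be passed to the same downstream calls; for instance, a joint forest plot (a sketch assuming the earlier PyMC3 and Pyro fits are still in memory):

In [ ]:
az.plot_forest([pm_data, pyro_data, numpyro_data], var_names=['mu', 'tau'])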