In [0]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
This notebook is a companion to the NIPS 2018 paper, "Simple, Distributed, and Accelerated Probabilistic Programming" (Tran et al., 2018). See the README.md for details on how to interact with data, models, probabilistic inference, and more. It assumes the following dependencies:
In [0]:
!pip install scipy==1.0.0
!pip install tensor2tensor==1.9.0
!pip install tensorflow==1.12.0rc2 # alternatively, tensorflow-gpu==1.12.0rc2
In [0]:
import numpy as np
import tensorflow as tf
from contextlib import contextmanager
from scipy import stats
from tensor2tensor.layers import common_attention
from tensor2tensor.layers import common_image_attention as cia
from tensor2tensor.models import image_transformer as imgtransformer
from tensor2tensor.models import transformer
tfb = tf.contrib.distributions.bijectors
tfe = tf.contrib.eager
This notebook also requires importing files from this GitHub repository:
In [0]:
import edward2 as ed
import no_u_turn_sampler # local file import
Certain snippets require eager execution, which is enabled with the command below.
In [0]:
tf.enable_eager_execution()
Figure 1. Beta-Bernoulli program. In eager mode, calling model() generates a binary vector of 50 elements.
In [0]:
def model():
p = ed.Beta(1., 1., name="p")
x = ed.Bernoulli(probs=p,
sample_shape=50,
name="x")
return x
In [0]:
x = model()
print(x)
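The return value behaves like a Tensor but also carries its underlying distribution. A brief illustration (not one of the paper's figures) of this random-variable abstraction:
In [0]:
print(x.distribution)  # the underlying Bernoulli distribution object
print(x.shape)  # sample shape of (50,)
print(tf.reduce_mean(tf.cast(x, tf.float32)).numpy())  # fraction of ones in the draw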
Figure 2. Variational program (Ranganath et al., 2016), available in eager mode. Python control flow is applicable to generative processes: given a coin flip, the program generates from one of two neural nets. Their outputs can have differing shape (and structure).
In [0]:
def neural_net_negative(noise, inputs):
net = noise + inputs
net = tf.layers.dense(net, 512, activation=tf.nn.relu)
net = tf.layers.dense(net, 64, activation=None)
return net
def neural_net_positive(noise, inputs):
del noise, inputs # unused
return "Hello. I'm a different output type."
def variational(x):
eps = ed.Normal(0., 1., sample_shape=2)
if eps[0] > 0:
return neural_net_positive(eps[1], x)
else:
return neural_net_negative(eps[1], x)
In [0]:
if not tf.executing_eagerly():
raise ValueError("This code snippet requires eager execution.")
x = tf.random_normal([4, 64, 64, 3]) # batch of, e.g., 64x64x3 images
z = variational(x)
if isinstance(z, tf.Tensor):
print(type(z), z.shape) # to avoid printing a huge Tensor
else:
print(z)
Figure 3. Distributed autoregressive flows. The default length is 8, each with 4 independent flows (Papamakarios et al., 2017). Each flow transforms inputs via layers respecting autoregressive ordering. Flows are partitioned across a virtual topology of 4x4 cores (rectangles); each core computes 2 flows and is locally connected; a final core aggregates. The virtual topology aligns with the physical TPU topology: for 4x4 TPUs, it is exact; for 16x16 TPUs, it is duplicated for data parallelism.
In [0]:
# `SplitAutoregressiveFlow` and `masked_network` are assumed defined elsewhere:
# each split flow applies `num_splits` independent flows built from masked
# networks that respect an autoregressive ordering.
class DistributedAutoregressiveFlow(tfb.Bijector):
  def __init__(self, flow_size=[4]*8):
    self.flows = []
    for num_splits in flow_size:
      flow = SplitAutoregressiveFlow(masked_network, num_splits)
      self.flows.append(flow)
    self.flows.append(SplitAutoregressiveFlow(masked_network, 1))
    super(DistributedAutoregressiveFlow, self).__init__()

  def _forward(self, x):
    for l, flow in enumerate(self.flows):
      with tf.device(tf.contrib.tpu.core(l % 4)):
        x = flow.forward(x)
    return x

  def _inverse_and_log_det_jacobian(self, y):
    ldj = 0.
    for l, flow in enumerate(self.flows[::-1]):
      with tf.device(tf.contrib.tpu.core(l % 4)):
        y, new_ldj = flow.inverse_and_log_det_jacobian(y)
        ldj += new_ldj
    return y, ldj
Figure 4. Model-parallel VAE with TPUs, generating 16-bit audio from 8-bit latents. The prior and decoder split computation according to distributed autoregressive flows. The encoder may split computation according to compressor; we omit it for space.
In [0]:
def prior():
"""Uniform noise to 8-bit latent, [u1,...,u(T/2)] -> [z1,...,z(T/2)]"""
dist = ed.Independent(ed.Uniform(low=tf.zeros([batch_size, T/2])))
return ed.TransformedDistribution(dist, DistributedAutoregressiveFlow(flow_size))
def decoder(z):
"""Uniform noise + latent to 16-bit audio, [u1,...,uT], [z1,...,z(T/2)] -> [x1,...,xT]"""
dist = ed.Independent(ed.Uniform(low=tf.zeros([batch_size, T])))
dist = ed.TransformedDistribution(dist, tfb.Affine(shift=decompressor(z)))
return ed.TransformedDistribution(dist, DistributedAutoregressiveFlow(flow_size))
def encoder(x):
"""16-bit audio to 8-bit latent, [x1,...,xT] -> [z1,...,z(T/2)]"""
loc, log_scale = tf.split(compressor(x), 2, axis=-1)
  return ed.MultivariateNormalDiag(loc=loc, scale_diag=tf.exp(log_scale))
Figure 5. Edward2's core. trace defines a context; any traceable ops executed during it are replaced by calls to tracer. traceable registers these ops; we register Edward random variables.
In [0]:
# Bottom of the stack: the default tracer simply applies the traced op.
STACK = [lambda f, *a, **k: f(*a, **k)]

@contextmanager
def trace(tracer):
  """Pushes tracer onto the stack for the duration of the context."""
  STACK.append(tracer)
  yield
  STACK.pop()

def traceable(f):
  """Registers f so that calls to it dispatch to the active tracer."""
  def f_wrapped(*a, **k):
    return STACK[-1](f, *a, **k)
  return f_wrapped
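A toy illustration of this tracing mechanism (noisy_add and doubling_tracer are hypothetical, not part of Edward2): a traceable op dispatches to whichever tracer sits on top of the stack.
In [0]:
@traceable
def noisy_add(a, b):
  return a + b + np.random.normal(scale=0.1)

def doubling_tracer(f, *args, **kwargs):
  # Executes the traced op, then doubles its output.
  return 2. * f(*args, **kwargs)

print(noisy_add(1., 2.))  # default tracer: applies the op directly
with trace(doubling_tracer):
  print(noisy_add(1., 2.))  # traced: output is doubled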
Figure 7. A higher-order function which takes a model program as input and returns its log-joint density function.
In [0]:
def make_log_joint_fn(model):
def log_joint_fn(**model_kwargs):
def tracer(rv_call, *args, **kwargs):
name = kwargs.get("name")
kwargs["value"] = model_kwargs.get(name)
rv = rv_call(*args, **kwargs)
log_probs.append(tf.reduce_sum(rv.distribution.log_prob(rv)))
return rv
log_probs = []
with ed.trace(tracer):
model()
return sum(log_probs)
return log_joint_fn
In [0]:
try:
model
except NameError:
raise NameError("This code snippet requires `model` from above.")
log_joint = make_log_joint_fn(model)
p = np.random.uniform()
x = np.random.binomial(1, 0.5, size=50).astype(np.int32)  # valid {0, 1} outcomes for the Bernoulli
out = log_joint(p=p, x=x)
print(out)
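As a sanity check (not in the paper), the traced log-joint should agree, up to float32 precision, with a manual computation using scipy.stats.
In [0]:
manual = (stats.beta.logpdf(p, 1., 1.) +
          stats.bernoulli.logpmf(x, p).sum())
print(out.numpy(), manual)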
Figure 8. A higher-order function which takes a model program as input and returns its causally intervened program. Intervention differs from conditioning: conditioning fixes a random variable to an observed value while keeping its distribution for scoring, whereas intervention replaces the variable's generative mechanism altogether.
In [0]:
def mutilate(model, **do_kwargs):
def mutilated_model(*args, **kwargs):
def tracer(rv_call, *args, **kwargs):
name = kwargs.get("name")
if name in do_kwargs:
return do_kwargs[name]
return rv_call(*args, **kwargs)
with ed.trace(tracer):
return model(*args, **kwargs)
return mutilated_model
In [0]:
try:
model
except NameError:
raise NameError("This code snippet requires `model` from above.")
mutilated_model = mutilate(model, p=0.999)
x = mutilated_model()
print(x)
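A quick check (not in the paper) that the intervention changed the downstream sampling distribution: with p fixed to 0.999, repeated draws of x are almost all ones.
In [0]:
samples = tf.stack([mutilated_model() for _ in range(100)])
print(tf.reduce_mean(tf.cast(samples, tf.float32)).numpy())  # close to 0.999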
Figure 9. Data-parallel Image Transformer with TPUs (Parmar et al., 2018). It is a neural autoregressive model which computes the log-probability of a batch of images with self-attention. Edward2 enables representing and training the model as a log-probability function; this is more efficient than the typical representation of programs as a generative process.
In [0]:
get_channel_embeddings = cia.get_channel_embeddings
add_positional_embedding = common_attention.add_positional_embedding
local_attention_1d = cia.local_attention_1d
def image_transformer(inputs, hparams):
x = get_channel_embeddings(3, inputs, hparams.hidden_size)
x = tf.reshape(x, [-1, 32*32*3, hparams.hidden_size])
x = tf.pad(x, [[0, 0], [1, 0], [0, 0]])[:, :-1, :] # shift pixels right
x = add_positional_embedding(x, max_length=32*32*3+3, name="pos_embed")
x = tf.nn.dropout(x, keep_prob=0.7)
for _ in range(hparams.num_decoder_layers):
with tf.variable_scope(None, default_name="decoder_layer"):
y = local_attention_1d(x, hparams, attention_type="local_mask_right",
q_padding="LEFT", kv_padding="LEFT")
x = tf.contrib.layers.layer_norm(tf.nn.dropout(y, keep_prob=0.7) + x, begin_norm_axis=-1)
y = tf.layers.dense(x, hparams.filter_size, activation=tf.nn.relu)
y = tf.layers.dense(y, hparams.hidden_size, activation=None)
x = tf.contrib.layers.layer_norm(tf.nn.dropout(y, keep_prob=0.7) + x, begin_norm_axis=-1)
x = tf.reshape(x, [-1, 32, 32, 3, hparams.hidden_size])
logits = tf.layers.dense(x, 256, activation=None)
return ed.Categorical(logits=logits).distribution.log_prob(inputs)
In [0]:
if tf.executing_eagerly():
raise ValueError("This code snippet does not support eager execution.")
batch_size = 4
inputs = tf.random_uniform([batch_size, 32, 32, 3], minval=0, maxval=256, dtype=tf.int32)
hparams = imgtransformer.imagetransformer_cifar10_base()
loss = -tf.reduce_sum(image_transformer(inputs, hparams))
train_op = tf.contrib.tpu.CrossShardOptimizer(tf.train.AdamOptimizer()).minimize(loss)
print(loss)
Figure 10. Core logic in No-U-Turn Sampler (Hoffman and Gelman, 2014). This algorithm has data-dependent non-tail recursion.
See no_u_turn_sampler/ in the GitHub repository for its full implementation.
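As a schematic sketch of that control flow (leapfrog_step, stop_criterion, and build_tree below are illustrative stand-ins, not the repository's implementation): the trajectory keeps doubling via recursion whose depth depends on the sampled values themselves, which is straightforward to express in eager mode.
In [0]:
def leapfrog_step(state):
  # Stand-in for one leapfrog integrator step.
  return state + np.random.normal(scale=0.1)

def stop_criterion(state):
  # Stand-in for the U-turn termination check.
  return abs(state) > 1.

def build_tree(state, depth):
  # Data-dependent, non-tail recursion: whether the second subtree is built
  # depends on values computed from the first.
  if depth == 0:
    return leapfrog_step(state)
  state = build_tree(state, depth - 1)
  if not stop_criterion(state):
    state = build_tree(state, depth - 1)
  return state

print(build_tree(0., depth=3))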
Figure 11. Variational inference with preconditioned gradient descent. Edward2 lets one write the probabilistic program and then perform arbitrary TensorFlow computation for learning.
In [0]:
try:
model
make_log_joint_fn
except NameError:
  raise NameError("This code snippet requires `model` and "
                  "`make_log_joint_fn` from above.")
class Variational(object):
def __init__(self):
self.parameters = tf.random_normal([2])
def __call__(self, x):
del x # unused; it is a non-amortized approximation
return ed.Deterministic(loc=tf.sigmoid(self.parameters[0]), name="qp")
variational = Variational()
x = tf.random_uniform([50], minval=0, maxval=2, dtype=tf.int32)
alignment = {"qp": "p"}
def loss(x):
qz = variational(x)
log_joint_fn = make_log_joint_fn(model)
kwargs = {alignment[rv.distribution.name]: rv.value
for rv in [qz]}
energy = log_joint_fn(x=x, **kwargs)
entropy = sum([rv.distribution.entropy() for rv in [qz]])
return -energy - entropy
def grad():
with tf.GradientTape() as tape:
tape.watch(variational.parameters)
loss_value = loss(x)
return tape.gradient(loss_value, variational.parameters)
def train(precond):
for _ in range(5):
grads = tf.tensordot(precond, grad(), [[1], [0]])
variational.parameters -= 0.1 * grads
return loss(x)
In [0]:
if not tf.executing_eagerly():
raise ValueError("This code snippet requires eager execution.")
precond = tf.eye(2)
loss_value = train(precond)
print(loss_value)
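For interpretation (not in the paper): the Deterministic approximation places all mass at sigmoid(parameters[0]), which with further training approaches the empirical mean of x, the MAP estimate under the uniform Beta(1, 1) prior.
In [0]:
print(tf.sigmoid(variational.parameters[0]).numpy(),
      tf.reduce_mean(tf.cast(x, tf.float32)).numpy())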
Figure 12. Learning-to-learn. It finds the optimal preconditioner for train (Figure 11) by differentiating the entire learning algorithm with respect to the preconditioner.
In [0]:
if not tf.executing_eagerly():
raise ValueError("This code snippet requires eager execution.")
precond = tfe.Variable(tf.random_normal([2, 2]))
optimizer = tf.train.AdamOptimizer(1.)
for _ in range(10):
with tf.GradientTape() as tape:
loss_value = train(precond)
grads = tape.gradient(loss_value, [precond])
optimizer.apply_gradients(zip(grads, [precond]))
print(loss_value.numpy(), precond.numpy())
In [0]:
class FakeEdward2ScipyNamespace(object):
pass
for _name in sorted(dir(stats)):
_candidate = getattr(stats, _name)
if isinstance(_candidate, (stats._multivariate.multi_rv_generic,
stats.rv_continuous,
stats.rv_discrete,
stats.rv_histogram)):
_candidate.rvs = ed.traceable(_candidate.rvs)
setattr(FakeEdward2ScipyNamespace, _name, _candidate)
del _candidate
scipy_stats = FakeEdward2ScipyNamespace()
print([name for name in dir(scipy_stats) if not name.startswith("__")])
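A small demonstration (logging_tracer is illustrative, not part of Edward2) that SciPy draws are now traceable: a tracer can observe each intercepted rvs call before executing it.
In [0]:
def logging_tracer(f, *args, **kwargs):
  print("traced {} with name={}".format(f.__name__, kwargs.get("name")))
  kwargs.pop("name", None)  # the underlying SciPy call does not accept `name`
  return f(*args, **kwargs)

with ed.trace(logging_tracer):
  sample = scipy_stats.norm.rvs(loc=0., scale=1., size=3, name="noise")
print(sample)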
Below is an Edward2 linear regression program on SciPy.
In [0]:
def make_log_joint_fn(model):
def log_joint_fn(*model_args, **model_kwargs):
def tracer(rv_call, *args, **kwargs):
name = kwargs.pop("name", None)
kwargs.pop("size", None)
kwargs.pop("random_state", None)
value = model_kwargs.get(name)
      # Map, e.g., the `norm_gen` instance back to `scipy_stats.norm.logpdf`.
      dist_name = type(rv_call.__self__).__name__[:-4]
      log_prob_fn = getattr(scipy_stats, dist_name).logpdf
log_prob = np.sum(log_prob_fn(value, *args, **kwargs))
log_probs.append(log_prob)
return value
log_probs = []
with ed.trace(tracer):
model(*model_args)
return sum(log_probs)
return log_joint_fn
def linear_regression(X):
beta = scipy_stats.norm.rvs(loc=0.0, scale=0.1, size=X.shape[1], name="beta")
loc = np.einsum('ij,j->i', X, beta)
y = scipy_stats.norm.rvs(loc=loc, scale=1., size=1, name="y")
return y
In [0]:
log_joint = make_log_joint_fn(linear_regression)
X = np.random.normal(size=[3, 2])
beta = np.random.normal(size=[2])
y = np.random.normal(size=[3])
out = log_joint(X, beta=beta, y=y)
print(out)
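As a sanity check (not in the paper), the traced SciPy log-joint should match a manual computation with scipy.stats.
In [0]:
manual = (stats.norm.logpdf(beta, loc=0., scale=0.1).sum() +
          stats.norm.logpdf(y, loc=np.einsum('ij,j->i', X, beta), scale=1.).sum())
print(out, manual)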
The grammar variational auto-encoder (VAE) (Kusner et al., 2017) posits a generative model over productions from a context-free grammar, and it posits an amortized variational approximation for efficient posterior inference. We train the grammar VAE on synthetic data using the grammar from Kusner et al. (2017; Figure 1).
This example uses eager execution in order to train the model on data points with a variable number of time steps; note that this requires a batch size of 1. We assume data points arrive in a stream, one at a time, so the maximum sequence length is unbounded and each example's time steps must be handled dynamically.
In [0]:
class SmilesGrammar(object):
"""Context-free grammar for SMILES strings."""
nonterminal_symbols = {"smiles", "chain", "branched atom", "atom", "ringbond",
"aromatic organic", "aliphatic organic", "digit"}
alphabet = {"c", "C", "N", "1", "2"}
production_rules = [
("smiles", ["chain"]),
("chain", ["chain", "branched atom"]),
("chain", ["branched atom"]),
("branched atom", ["atom", "ringbond"]),
("branched atom", ["atom"]),
("atom", ["aromatic organic"]),
("atom", ["aliphatic organic"]),
("ringbond", ["digit"]),
("aromatic organic", ["c"]),
("aliphatic organic", ["C"]),
("aliphatic organic", ["N"]),
("digit", ["1"]),
("digit", ["2"]),
]
start_symbol = "smiles"
def mask(self, symbol, on_value=0., off_value=-1e9):
"""Produces a masking tensor for (in)valid production rules."""
mask_values = []
for lhs, _ in self.production_rules:
      if lhs == symbol:
mask_value = on_value
else:
mask_value = off_value
mask_values.append(mask_value)
mask_values = tf.reshape(mask_values, [1, len(self.production_rules)])
return mask_values
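
  def convert_to_string(self, productions):
    # Added helper (not in the original snippet): a minimal sketch that
    # concatenates the terminal symbols emitted by each production, in
    # production order. Assumes `productions` is a one-hot tensor of shape
    # [1, num_productions, num_production_rules] and eager execution.
    symbols = []
    for production in tf.unstack(tf.squeeze(productions, axis=0), axis=0):
      _, rhs = self.production_rules[int(tf.argmax(production, axis=-1))]
      symbols += [symbol for symbol in rhs if symbol in self.alphabet]
    return "".join(symbols)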
class ProbabilisticGrammar(tf.keras.Model):
"""Deep generative model over productions which follow a grammar."""
def __init__(self, grammar, latent_size, num_units):
"""Constructs a probabilistic grammar."""
super(ProbabilisticGrammar, self).__init__()
self.grammar = grammar
self.latent_size = latent_size
self.lstm = tf.nn.rnn_cell.LSTMCell(num_units)
self.output_layer = tf.keras.layers.Dense(len(grammar.production_rules))
def call(self, inputs):
"""Runs the model forward to generate a sequence of productions."""
del inputs # unused
latent_code = ed.MultivariateNormalDiag(loc=tf.zeros(self.latent_size),
sample_shape=1,
name="latent_code")
state = self.lstm.zero_state(1, dtype=tf.float32)
t = 0
productions = []
stack = [self.grammar.start_symbol]
while stack:
symbol = stack.pop()
net, state = self.lstm(latent_code, state)
logits = self.output_layer(net) + self.grammar.mask(symbol)
production = ed.OneHotCategorical(logits=logits,
name="production_" + str(t))
      _, rhs = self.grammar.production_rules[int(tf.argmax(production[0], axis=-1))]
for symbol in rhs:
if symbol in self.grammar.nonterminal_symbols:
stack.append(symbol)
productions.append(production)
t += 1
return tf.stack(productions, axis=1)
class ProbabilisticGrammarVariational(tf.keras.Model):
"""Amortized variational posterior for a probabilistic grammar."""
def __init__(self, latent_size):
"""Constructs a variational posterior for a probabilistic grammar."""
super(ProbabilisticGrammarVariational, self).__init__()
self.latent_size = latent_size
self.encoder_net = tf.keras.Sequential([
tf.keras.layers.Conv1D(64, 3, padding="SAME"),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.Activation(tf.nn.elu),
tf.keras.layers.Conv1D(128, 3, padding="SAME"),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.Activation(tf.nn.elu),
tf.keras.layers.Dropout(0.1),
tf.keras.layers.GlobalAveragePooling1D(),
tf.keras.layers.Dense(latent_size * 2, activation=None),
])
def call(self, inputs):
"""Runs the model forward to return a stochastic encoding."""
net = self.encoder_net(tf.cast(inputs, tf.float32))
return ed.MultivariateNormalDiag(
loc=net[..., :self.latent_size],
scale_diag=tf.nn.softplus(net[..., self.latent_size:]),
name="latent_code_posterior")
In [0]:
if not tf.executing_eagerly():
raise ValueError("This code snippet requires eager execution.")
grammar = SmilesGrammar()
probabilistic_grammar = ProbabilisticGrammar(
grammar=grammar, latent_size=8, num_units=128)
probabilistic_grammar_variational = ProbabilisticGrammarVariational(
latent_size=8)
for _ in range(5):
productions = probabilistic_grammar(_)
print("Production Shape: {}".format(productions.shape))
string = grammar.convert_to_string(productions)
print("String: {}".format(string))
encoded_production = probabilistic_grammar_variational(productions)
print("Encoded Productions: {}".format(encoded_production.numpy()))
See tensorflow_probability/examples/grammar_vae.py for the full example.
We demonstrate another level of composability: inference within a probabilistic program. Namely, we apply MCMC to construct a flexible family of distributions for variational inference (Salimans et al., 2015; Hoffman, 2017). We apply a chain of transition kernels specified by the No-U-Turn Sampler (no_u_turn_sampler.kernel; Section 3.3 of the paper) and the variational inference algorithm specified by train in Figure 11.
In [0]:
class DeepLatentGaussianModel(tf.keras.Model):
"""Deep generative model."""
def __init__(self, latent_size, data_shape, batch_size):
super(DeepLatentGaussianModel, self).__init__()
self.latent_size = latent_size
self.data_shape = data_shape
self.batch_size = batch_size
self.decoder_net = tf.keras.Sequential([
tf.keras.layers.Dense(512, activation=tf.nn.relu),
tf.keras.layers.Dense(np.prod(data_shape), activation=None),
tf.keras.layers.Reshape(data_shape),
])
def call(self, inputs):
del inputs # unused
latent_code = ed.MultivariateNormalDiag(
loc=tf.zeros([self.batch_size, self.latent_size]),
scale_diag=tf.ones([self.batch_size, self.latent_size]),
name="latent_code")
data = ed.Categorical(logits=self.decoder_net(latent_code), name="data")
return data
class DeepLatentGaussianModelVariational(tf.keras.Model):
"""Amortized variational posterior."""
def __init__(self,
latent_size,
data_shape,
num_transitions,
target_log_prob_fn,
step_size):
super(DeepLatentGaussianModelVariational, self).__init__()
self.latent_size = latent_size
self.data_shape = data_shape
self.num_transitions = num_transitions
self.target_log_prob_fn = target_log_prob_fn
self.step_size = step_size
self.encoder_net = tf.keras.Sequential([
        tf.keras.layers.Flatten(),
tf.keras.layers.Dense(512, activation=tf.nn.relu),
tf.keras.layers.Dense(latent_size * 2, activation=None),
])
  def call(self, inputs):
    net = self.encoder_net(tf.cast(inputs, tf.float32))
qz = ed.MultivariateNormalDiag(
loc=net[..., :self.latent_size],
scale_diag=tf.nn.softplus(net[..., self.latent_size:]),
name="latent_code_posterior")
current_target_log_prob = None
current_grads_target_log_prob = None
for _ in range(self.num_transitions):
[
[qz],
current_target_log_prob,
current_grads_target_log_prob,
] = self._kernel(
current_state=[qz],
current_target_log_prob=current_target_log_prob,
current_grads_target_log_prob=current_grads_target_log_prob)
return qz
def _kernel(self, current_state, current_target_log_prob,
current_grads_target_log_prob):
return no_u_turn_sampler.kernel(
current_state=current_state,
target_log_prob_fn=self.target_log_prob_fn,
step_size=self.step_size,
current_target_log_prob=current_target_log_prob,
current_grads_target_log_prob=current_grads_target_log_prob)
In [0]:
latent_size = 50
data_shape = [32, 32, 3, 256]  # 32x32x3 pixels, each a categorical over 256 values
batch_size = 4
# Toy data: a batch of 8-bit images (the trailing 256 axis of data_shape is the class dimension).
features = tf.random_uniform(
    [batch_size] + data_shape[:-1], minval=0, maxval=256, dtype=tf.int32)
model = DeepLatentGaussianModel(
latent_size=latent_size,
data_shape=data_shape,
batch_size=batch_size)
variational = DeepLatentGaussianModelVariational(
latent_size=latent_size,
data_shape=data_shape,
step_size=[0.1],
    target_log_prob_fn=lambda z: ed.make_log_joint_fn(
        lambda: model(features))(data=features, latent_code=z),
num_transitions=10)
alignment = {"latent_code_posterior": "latent_code"}
optimizer = tf.train.AdamOptimizer(1e-2)
for step in range(10):
with tf.GradientTape() as tape:
    with ed.tape() as variational_tape:
_ = variational(features)
    log_joint_fn = ed.make_log_joint_fn(lambda: model(features))
    kwargs = {alignment[rv.distribution.name]: rv.value
              for rv in variational_tape.values()}
    energy = log_joint_fn(data=features, **kwargs)
    entropy = sum([tf.reduce_sum(rv.distribution.entropy())
                   for rv in variational_tape.values()])
loss_value = -energy - entropy
grads = tape.gradient(loss_value, variational.variables)
optimizer.apply_gradients(zip(grads, variational.variables))
print("Step: {:>3d} Loss: {:.3f}".format(step, loss_value))
In [0]:
def logistic_regression(features):
"""Bayesian logistic regression, which returns labels given features."""
coeffs = ed.MultivariateNormalDiag(
loc=tf.zeros(features.shape[1]), name="coeffs")
labels = ed.Bernoulli(
logits=tf.tensordot(features, coeffs, [[1], [0]]), name="labels")
return labels
features = tf.random_uniform([500, 55])
true_coeffs = 5. * tf.random_normal([55])
labels = tf.cast(tf.tensordot(features, true_coeffs, [[1], [0]]) > 0,
dtype=tf.int32)
log_joint = ed.make_log_joint_fn(logistic_regression)
def target_log_prob_fn(coeffs):
return log_joint(features=features, coeffs=coeffs, labels=labels)
In [0]:
if not tf.executing_eagerly():
raise ValueError("This code snippet requires eager execution.")
import matplotlib.pyplot as plt  # used below to plot the sampled coefficients

coeffs = tf.random_normal([55])  # initial state of the Markov chain
coeffs_samples = []
target_log_prob = None
grads_target_log_prob = None
for step in range(500):
[
[coeffs],
target_log_prob,
grads_target_log_prob,
  ] = no_u_turn_sampler.kernel(
      target_log_prob_fn=target_log_prob_fn,
      current_state=[coeffs],
      step_size=[0.1],
      current_target_log_prob=target_log_prob,
      current_grads_target_log_prob=grads_target_log_prob)
coeffs_samples.append(coeffs)
for coeffs_sample in coeffs_samples:
plt.plot(coeffs_sample.numpy())
plt.show()
See no_u_turn_sampler/logistic_regression.py for the full example.