Preamble


In [1]:
%matplotlib notebook

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import norm

from keras import backend as K
from keras.layers import (Input, Activation, Dense, Lambda, Layer, 
                          add, multiply)
from keras.models import Model, Sequential
from keras.callbacks import TerminateOnNaN
from keras.datasets import mnist

from tqdm import tnrange

import tensorflow as tf


Using TensorFlow backend.

Notebook Environment


In [3]:
plt.style.use('seaborn-notebook')
sns.set_context('notebook')
np.set_printoptions(precision=2,
                    edgeitems=3,
                    linewidth=80,
                    suppress=True)

In [4]:
'TensorFlow version: ' + K.tf.__version__


Out[4]:
'TensorFlow version: 1.4.0'

In [5]:
sess = tf.InteractiveSession()

Example: Diagonal Gaussians
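
In this example, q and p are two-dimensional diagonal Gaussians,

$$
q(\mathbf{z}) = \mathcal{N}\left(\mathbf{z} \mid \begin{bmatrix} 1 \\ 4 \end{bmatrix}, \mathbf{I}_2\right),
\qquad
p(\mathbf{z}) = \mathcal{N}\left(\mathbf{z} \mid \begin{bmatrix} -3 \\ 2 \end{bmatrix}, 2.5^2 \, \mathbf{I}_2\right),
$$

corresponding to the parameters defined in the cell below.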


In [6]:
D = 2
q_mu = np.float32([ 1., 4.])
p_mu = np.float32([-3., 2.])
q_sigma = np.ones(D).astype('float32')
p_sigma = 2.5*np.ones(D).astype('float32')

In [7]:
q = tf.distributions.Normal(loc=q_mu, scale=q_sigma)
p = tf.distributions.Normal(loc=p_mu, scale=p_sigma)

In [8]:
fig, ax = plt.subplots(figsize=(5, 5))

ax.scatter(*p.sample(sample_shape=(500,)).eval().T, 
           s=8., alpha=.8, label='p samples')

ax.scatter(*q.sample(sample_shape=(500,)).eval().T, 
           s=8., alpha=.8, label='q samples')

ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')

ax.legend()

plt.show()


Closed-form KL divergence between diagonal Gaussians
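
For two $D$-dimensional diagonal Gaussians $q = \mathcal{N}(\boldsymbol{\mu}_q, \operatorname{diag}(\boldsymbol{\sigma}_q^2))$ and $p = \mathcal{N}(\boldsymbol{\mu}_p, \operatorname{diag}(\boldsymbol{\sigma}_p^2))$, the KL divergence has the standard closed form

$$
\mathrm{KL}[q \, \| \, p]
= \sum_{d=1}^{D} \left[
\log \frac{\sigma_{p,d}}{\sigma_{q,d}}
+ \frac{\sigma_{q,d}^2 + (\mu_{q,d} - \mu_{p,d})^2}{2 \sigma_{p,d}^2}
- \frac{1}{2}
\right],
$$

which the function below implements directly (up to a rearrangement of terms).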


In [9]:
def kl_divergence_gaussians(q_mu, q_sigma, p_mu, p_sigma):
    # closed-form KL(q || p) between diagonal Gaussians,
    # summed over the last (dimension) axis
    r = q_mu - p_mu

    return np.sum(np.log(p_sigma) - np.log(q_sigma)
                  - .5 * (1. - (q_sigma**2 + r**2) / p_sigma**2),
                  axis=-1)

In [10]:
kl_true = kl_divergence_gaussians(q_mu, q_sigma, p_mu, p_sigma)
kl_true


Out[10]:
2.5925815

In [11]:
tf.reduce_sum(tf.distributions.kl_divergence(
    tf.distributions.Normal(loc=q_mu, scale=q_sigma),
    tf.distributions.Normal(loc=p_mu, scale=p_sigma)), axis=-1).eval()


Out[11]:
2.5925813

Monte Carlo estimation

The KL divergence between q and p is the expectation of the log density ratio log q(z) - log p(z) under the distribution q. We can therefore approximate it by averaging this log ratio over Monte Carlo samples drawn from q.
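
Concretely, drawing samples $\mathbf{z}^{(s)} \sim q(\mathbf{z})$ gives the estimator

$$
\mathrm{KL}[q \, \| \, p]
= \mathbb{E}_{q(\mathbf{z})} \left[ \log \frac{q(\mathbf{z})}{p(\mathbf{z})} \right]
\approx \frac{1}{S} \sum_{s=1}^{S} \log \frac{q(\mathbf{z}^{(s)})}{p(\mathbf{z}^{(s)})},
$$

which the cells below evaluate with log_density_ratio_gaussians and a cumulative (expanding) mean over the samples.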


In [12]:
mc_samples = 10000

In [13]:
def log_density_ratio_gaussians(z, q_mu, q_sigma, p_mu, p_sigma):
    # log q(z) - log p(z) for diagonal Gaussians,
    # summed over the last (dimension) axis
    r_p = (z - p_mu) / p_sigma
    r_q = (z - q_mu) / q_sigma

    return np.sum(np.log(p_sigma) - np.log(q_sigma) +
                  .5 * (r_p**2 - r_q**2), axis=-1)

In [14]:
mc_estimates = pd.Series(
    log_density_ratio_gaussians(
        q.sample(sample_shape=(mc_samples,)).eval(), 
        q_mu, q_sigma, 
        p_mu, p_sigma
    )
)

In [15]:
# cumulative mean of the MC estimates
mean = mc_estimates.expanding().mean()

In [16]:
# figure size with the given width and golden-ratio height
golden_size = lambda width: (width, 2. * width / (1 + np.sqrt(5)))

In [17]:
fig, ax = plt.subplots(figsize=golden_size(6))

mean.plot(ax=ax, label='Estimated')

ax.axhline(y=kl_true, color='r', linewidth=2., label='True')

ax.set_xlabel('Monte Carlo samples')
ax.set_ylabel('KL Divergence')

ax.set_ylim(.95*kl_true, 1.05*kl_true)

ax.legend()

plt.show()



In [18]:
fig, ax = plt.subplots(figsize=golden_size(6))

ax.plot(np.square(mean - kl_true))

ax.set_xlabel('Monte Carlo samples')
ax.set_ylabel('Squared error')

ax.set_ylim(-0.01, 1.)

plt.show()



In [19]:
mean.index.name = 'samples'
mean_df = pd.DataFrame(mean.rename('mc_estimate'))

In [20]:
g = sns.JointGrid(x='samples', y='mc_estimate', 
                  data=mean_df.reset_index())

g = g.plot_joint(plt.plot)
g = g.plot_marginals(sns.kdeplot, shade=True)

g.ax_marg_x.clear()
g.ax_marg_x.set_xticks([])
g.ax_marg_x.set_yticks([])

g.set_axis_labels('Monte Carlo samples', 'KL Divergence')


Out[20]:
<seaborn.axisgrid.JointGrid at 0x7f9a6802be80>