In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
from scipy.stats import beta, multivariate_normal, uniform, norm
from scipy.misc import derivative
import numpy as np
import matplotlib.pyplot as plt
import copy
import pandas as pd

Use the (local) shape of the target distribution to make smarter proposals.

Hamiltonian: a quantity that is conserved along a trajectory, regardless of the current position in space.

Metaphor: hockey puck sliding on a (non-flat) surface. Want to be able to describe the state of the puck. The state has two quantities:

  1. Current position, $q$
  2. Momentum, $p$

Hamiltonian: $H(q, p) = U(q) + K(p)$

  • $U(q)$ -- potential energy
  • $K(p)$ -- kinetic energy
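
As a minimal sketch (using a 1-D standard-normal target purely for illustration), the two energies and the Hamiltonian are:

U = lambda q: 0.5*q**2             # potential energy: -log N(0,1) density, up to a constant
K = lambda p, m=1.: p**2/(2.*m)    # kinetic energy of the puck
H = lambda q, p: U(q) + K(p)       # conserved along exact trajectories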

In [3]:
dtarget = lambda x: multivariate_normal.pdf(x, mean=(3, 10), cov=[[1, 0], [0, 1]])
x1 = np.linspace(-6, 12, 101)
x2 = np.linspace(-11, 31, 101)
X, Y = np.meshgrid(x1, x2)
Z = np.array([dtarget(xy) for xy in zip(X.flat, Y.flat)]).reshape(101, 101)

In [4]:
plt.figure(figsize=(10,7))
plt.contour(X, Y, Z)
plt.xlim(0, 6)
plt.ylim(7, 13)
plt.show()


The surface of interest will be $U(q) = -\log{f(q)}$, where $f$ is the target density.

$K(p) = \frac{p^T p}{2m}$, where $m$ = mass of the puck.

Position over time is a function of momentum: $\frac{dq_i}{dt} = \frac{p_i}{m}$

Change in momentum over time is a function of the surface gradient: $\frac{dp_i}{dt} = -\frac{\partial U}{\partial q_i}$

Leap-frog algorithm

$p_i(t + \frac{\epsilon}{2}) = p_i(t) - \frac{\epsilon}{2} \frac{\partial U}{\partial q_i}(q(t))$

$q_i(t + \epsilon) = q_i(t) + \frac{\epsilon}{m} p_i(t + \frac{\epsilon}{2})$

$p_i(t + \epsilon) = p_i(t + \frac{\epsilon}{2}) - \frac{\epsilon}{2} \frac{\partial U}{\partial q_i}(q(t + \epsilon))$

$\epsilon$ -- step size
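
As a quick sanity check -- a sketch only, using an illustrative 1-D standard-normal target with a hand-coded gradient -- a single leapfrog step should approximately conserve $H$:

import numpy as np

U  = lambda q: 0.5*q**2    # potential: -log N(0,1) density, up to a constant
dU = lambda q: q           # exact gradient for this toy target
H  = lambda q, p, m=1.: U(q) + p**2/(2.*m)

eps, m = 0.1, 1.
q, p = 1.5, np.random.randn()
H0 = H(q, p, m)

# Half-step in momentum, full step in position, half-step in momentum.
p -= (eps/2.)*dU(q)
q += (eps/m)*p
p -= (eps/2.)*dU(q)

print(abs(H(q, p, m) - H0))    # small: energy is nearly conserved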


In [5]:
def HMC_one_step(U, current_q, Eps, L, m=1):
    """
    One step of Hamiltonian Monte Carlo.
    
    Parameters
    ----------
    U : callable
        The potential energy; takes a single argument, the position.
    current_q : array-like
        Current position.
    Eps : float
        The step size, epsilon.
    L : int
        Number of leapfrog steps.
    m : float
        Mass of the particle.
        
    Returns
    -------
    out : dict
        ``out['q']`` and ``out['p']`` hold the leapfrog path of positions
        and momenta; ``out['value']`` is the accepted position.
    """

    q = copy.copy(current_q)
    Nq = len(q)
    # Draw a fresh momentum from a standard normal.
    p = multivariate_normal.rvs(np.zeros(Nq))
    current_p = copy.copy(p)

    out = {}
    
    out['p'] = np.zeros((L, Nq))
    out['p'][0, :] = copy.copy(p)
    out['q'] = np.zeros((L, Nq))
    out['q'][0, :] = copy.copy(q)
    
    # Leapfrog: half-step in momentum, full step in position, half-step in momentum.
    # ``derivative`` is a crude finite-difference stand-in for the gradient of U.
    for i in range(1, L):
        p -= Eps*derivative(U, q, 0.01)/2.
        q += (Eps/m)*p
        out['q'][i, :] = copy.copy(q)
        p -= Eps*derivative(U, q, 0.01)/2.
        out['p'][i, :] = copy.copy(p)
    
    # Metropolis acceptance based on the change in total energy H = U + K.
    current_U = U(current_q)
    current_K = (current_p**2).sum()/2.
    proposed_U = U(q)
    proposed_K = (p**2).sum()/2.
    
    if uniform.rvs() < np.exp(current_U - proposed_U + current_K - proposed_K):
        out['value'] = q
    else:
        out['value'] = current_q
    
    return out

In [6]:
plt.figure(figsize=(10,7))
plt.contour(X, Y, Z)
U = lambda x: -1.*np.log(dtarget(x))
chain = HMC_one_step(U, np.array([4., 10.]), Eps=0.2, L=10, m=2)['q']
plt.plot(chain[:, 0], chain[:, 1], 'ro')
plt.plot(chain[:, 0], chain[:, 1], 'r-')
plt.plot(chain[0, 0], chain[0,1], 'bo')
plt.xlim(0, 6)
plt.ylim(7, 13)
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()



In [7]:
def HMC(dtarget, start, Eps=0.2, L=10, m=2, N=1000, num_chains=4):
    """
    Run an HMC simulation with several independent chains.
    
    Parameters
    ----------
    dtarget : callable
        Target PDF.
    start : array
        Starting position(s); either a single position or one row per chain.
    Eps, L, m : float, int, float
        Leapfrog step size, number of leapfrog steps, and particle mass.
    N : int
        Number of iterations per chain.
    num_chains : int
        Number of independent chains.
    """
    
    # Invert the target PDF into a bowl-shaped potential-energy surface.
    neg_log_dtarget = lambda x: -1.*np.log(dtarget(x))
    
    # If only one starting position is provided, use it for all chains.
    if len(start.shape) == 1:
        start = np.array([np.array(start) for i in range(num_chains)])
        
    chains = []
    for j in range(num_chains):
        chain = [start[j, :]]
        for i in range(N):
            proposal = HMC_one_step(neg_log_dtarget, 
                                    copy.copy(chain[-1]), 
                                    Eps, L, m)['value']
            chain.append(proposal)
        chains.append(np.array(chain))
    return np.array(chains)

Tuning parameters: step size, number of leapfrog steps, and "mass".

HMC does not work with discrete parameters. Stan is built around HMC.

The Gelman-Rubin diagnostic still applies -- we just have a better way of proposing values. An $\hat{R}$ close to 1 indicates that the chains agree.
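
For reference, the quantities computed by the Gelman function below, for $m$ chains of length $n$ with per-chain means $\bar{x}_i$ and variances $s_i^2$, are:

$W = \frac{1}{m}\sum_i s_i^2$ (within-chain variance), $\qquad B = \frac{n}{m-1}\sum_i (\bar{x}_i - \bar{x})^2$ (between-chain variance),

$\hat{\sigma}^2 = \frac{n-1}{n}W + \frac{1}{n}B, \qquad \hat{R} = \sqrt{\hat{\sigma}^2/W}, \qquad n_{\mathrm{eff}} = \frac{mn\,\hat{\sigma}^2}{B}.$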


In [8]:
def Gelman(chains):
    """Gelman-Rubin diagnostic (R-hat) and effective sample size."""
    if len(chains.shape) == 3:
        N_p = chains.shape[2]
    else:
        N_p = 1
    # Build label arrays that align with the order of ``chains.flat``:
    # chain index slowest, then iteration, then parameter.
    generate = lambda ptn: np.array([[ptn(p, i, c)
                                      for i in range(chains.shape[1])
                                      for p in range(N_p)]
                                     for c in range(chains.shape[0])])
    params = generate(lambda p, i, c: 'x{0}'.format(p))
    iters = generate(lambda p, i, c: i)
    labels = generate(lambda p, i, c: c)
                      
    data = list(zip(chains.flat, params.flat, iters.flat, labels.flat))
    dataframe = pd.DataFrame(data, columns=('Value', 'Parameter', 'Iteration', 'Chain'))

    xbar = dataframe.groupby('Parameter').Value.mean()
    m = chains.shape[0]
    xbar_i = dataframe.groupby(['Parameter', 'Chain']).Value.mean()
    s2_i = dataframe.groupby(['Parameter', 'Chain']).Value.var()
    n = dataframe.groupby(['Parameter', 'Chain']).Value.count().mean()

    W = s2_i.mean()                                           # within-chain variance
    B = (n/(m-1.)) * (xbar_i.sub(xbar, level='Parameter')**2).sum()  # between-chain variance
    sigma2_hat = W*(n-1.)/n + B/n                             # pooled variance estimate
    R_hat = np.sqrt(sigma2_hat/W)
    n_eff = m*n*sigma2_hat/B    # effective number of independent draws
                      
    return R_hat, n_eff

In [9]:
chains = HMC(dtarget, np.array([4., 10.]), Eps=0.2, L=5, N=1000)

In [10]:
plt.figure(figsize=(10,7))
plt.contour(X, Y, Z)
plt.plot(chains[0][:, 0], chains[0][:, 1], alpha=0.5)
plt.plot(chains[1][:, 0], chains[1][:, 1], alpha=0.5)
plt.plot(chains[2][:, 0], chains[2][:, 1], alpha=0.5)
plt.plot(chains[3][:, 0], chains[3][:, 1], alpha=0.5)
plt.xlim(0, 6)
plt.ylim(7, 13)
plt.show()



In [11]:
plt.subplot(211)
for i in range(chains.shape[0]):
    plt.plot(chains[i,:,0])
plt.ylabel('x1')

plt.subplot(212)
for i in range(chains.shape[0]):
    plt.plot(chains[i,:,1])
plt.ylabel('x2')


Out[11]:
<matplotlib.text.Text at 0x109e10090>

In [12]:
Gelman(chains)


Out[12]:
(1.0019118486413818, 831.95510935785444)

Banana-shaped target distribution


In [13]:
# Warped ("banana") Gaussian: x2 is shifted by a quadratic function of x1.
dtarget = lambda x: np.exp((-x[0]**2)/200. - 0.5*(x[1] + (0.05*x[0]**2) - 100.*0.05)**2)

In [14]:
x1 = np.linspace(-20, 20, 101)
x2 = np.linspace(-15, 10, 101)
X, Y = np.meshgrid(x1, x2)
Z = np.array([dtarget(xy) for xy in zip(X.flat, Y.flat)]).reshape(101, 101)

In [15]:
plt.figure(figsize=(10,7))
plt.contour(X, Y, Z)
plt.show()



In [16]:
start = np.array([[uniform.rvs(loc=-10., scale=15.), 
                   uniform.rvs(loc=0., scale=10.)]
                  for i in range(4)])
chains = HMC(dtarget, start, Eps=0.7, L=12, m=2, N=10000)

In [17]:
plt.figure(figsize=(10,7))
plt.contour(X, Y, Z)

plt.plot(chains[0][:, 0], chains[0][:, 1], alpha=0.5)
plt.plot(chains[1][:, 0], chains[1][:, 1], alpha=0.5)
plt.plot(chains[2][:, 0], chains[2][:, 1], alpha=0.5)
plt.plot(chains[3][:, 0], chains[3][:, 1], alpha=0.5)
plt.show()



In [18]:
plt.subplot(211)
plt.title(Gelman(chains)[0])
for i in range(chains.shape[0]):
    plt.plot(chains[i,:,0])
plt.ylabel('x1')

plt.subplot(212)
for i in range(chains.shape[0]):
    plt.plot(chains[i,:,1])
plt.ylabel('x2')

plt.tight_layout()
plt.show()


NUTS Sampler

A toy implementation of the No-U-Turn Sampler described by Hoffman and Gelman (2011), Algorithm 3, page 14.
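
Two ingredients from the paper appear in the code below. Each iteration draws a slice variable

$u \sim \mathrm{Uniform}\left(0, \exp\{\mathcal{L}(\theta) - \tfrac{1}{2} r \cdot r\}\right)$

where $\mathcal{L}$ is the log target density, and the doubling stops once the trajectory starts to turn back on itself (the "U-turn" criterion):

$(\theta^{+} - \theta^{-}) \cdot r^{-} < 0 \quad \text{or} \quad (\theta^{+} - \theta^{-}) \cdot r^{+} < 0$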


In [19]:
def Leapfrog(U, theta, r, Eps, m=1.):
    """
    One leapfrog step. Note the signs: here ``U`` is the log target
    density itself (not its negative), so the gradient terms are added.
    """
    # Copy so that the caller's arrays are not mutated in place.
    theta = np.array(theta, dtype=float)
    r = np.array(r, dtype=float)
    gradient = lambda U, theta: derivative(U, theta, 0.01)
    r += (Eps/2.)*gradient(U, theta)
    theta += (Eps/m)*r
    r += (Eps/2.)*gradient(U, theta)
    return theta, r

In [20]:
def BuildTree(U, theta, r, u, v, j, Eps, m=1., delta_max=1000):
    """
    Recursive tree-building (Algorithm 3 of Hoffman & Gelman). Each call
    at depth ``j`` implicitly builds a balanced binary tree of 2**j
    leapfrog steps.
    """
    if j == 0:
        # Base case: take one leapfrog step in the direction v.
        theta_p, r_p = Leapfrog(U, theta, r, v*Eps, m=m)
        n_p = float(u <= np.exp(U(theta_p) - 0.5*np.dot(r_p, r_p)))
        s_p = float(u < np.exp(delta_max + U(theta_p) - 0.5*np.dot(r_p, r_p)))
        return theta_p, r_p, theta_p, r_p, theta_p, n_p, s_p
    else:
        # Recursion -- implicitly build the left and right subtrees.
        rargs = (u, v, j-1, Eps)
        rkwargs = {'m': m}
        theta_n, r_n, theta_f, r_f, theta_p, n_p, s_p = BuildTree(U, theta, r, *rargs, **rkwargs)
        if s_p == 1:
            if v == -1:
                theta_n, r_n, _, _, theta_dp, n_dp, s_dp = BuildTree(U, theta_n, r_n, *rargs, **rkwargs)
            else:
                _, _, theta_f, r_f, theta_dp, n_dp, s_dp = BuildTree(U, theta_f, r_f, *rargs, **rkwargs)
            try:
                if uniform.rvs() <= (n_dp/(n_p + n_dp)):
                    theta_p = copy.copy(theta_dp)
            except ZeroDivisionError:
                pass
            # U-turn check across the combined subtrees.
            s_p = s_p*s_dp*int(np.dot(theta_f - theta_n, r_n) >= 0)*int(np.dot(theta_f - theta_n, r_f) >= 0)
            n_p += n_dp
        return theta_n, r_n, theta_f, r_f, theta_p, n_p, s_p

In [21]:
def NUTS_one_step(U, theta_last, Eps, m=1.):
    """
    One iteration of Algorithm 3. ``U`` is the log target density.
    """
    r0 = norm.rvs(0., 1., size=len(theta_last))
    # Slice variable: u ~ Uniform(0, exp(L(theta) - r.r/2)).
    u = uniform.rvs(0, np.exp(U(theta_last) - 0.5*np.dot(r0, r0)))
    
    # Initialize.
    theta_m = copy.copy(theta_last)
    theta_n, theta_f = copy.copy(theta_last), copy.copy(theta_last)
    r_n, r_f = copy.copy(r0), copy.copy(r0)
    j = 0
    s = 1.
    n = 1.

    while s == 1.:
        v_j = np.random.choice(np.array([-1., 1.]))    # Choose a direction.
        if v_j == -1:
            theta_n, r_n, _, _, theta_p, n_p, s_p = BuildTree(U, theta_n, r_n, u, v_j, j, Eps, m=m)
        else:
            _, _, theta_f, r_f, theta_p, n_p, s_p = BuildTree(U, theta_f, r_f, u, v_j, j, Eps, m=m)

        if s_p == 1:
            try:
                if uniform.rvs() <= min(1., (n_p/n)):
                    theta_m = copy.copy(theta_p)
            except ZeroDivisionError:
                pass
        n += n_p    # n <- n + n' (Algorithm 3)
        s = s_p*int(np.dot(theta_f - theta_n, r_n) >= 0)*int(np.dot(theta_f - theta_n, r_f) >= 0)
        j += 1

    return theta_m

In [22]:
NUTS_one_step(lambda x: np.log(dtarget(x)), np.array([3.2, 9.1]), 0.02)


Out[22]:
array([ 3.2124508 ,  9.13756381])

In [23]:
def NUTS(dtarget, theta0, Eps, num_iters=1000, delta_max=1000, m=1.):
    # Note: despite the name, ``U`` here is the log target density.
    U = lambda x: np.log(dtarget(x))
    
    theta = [theta0]
    for i in range(num_iters):
        theta_i = NUTS_one_step(U, theta[-1], Eps, m=m)
        theta.append(theta_i)
    return theta

Testing on the banana


In [24]:
start = np.array([[uniform.rvs(loc=-10., scale=15.), 
                   uniform.rvs(loc=0., scale=10.)]
                  for i in range(4)])

In [34]:
chains = np.array([np.array(NUTS(dtarget, start[i, :], Eps=0.55, m=1.5, num_iters=10000))
                   for i in range(start.shape[0])])

In [35]:
plt.figure(figsize=(10,7))
plt.contour(X, Y, Z)

for i in range(chains.shape[0]):
    plt.scatter(chains[i, :, 0], chains[i, :, 1], alpha=0.5, s=0.02)
plt.show()



In [36]:
plt.subplot(211)
plt.title(Gelman(chains)[0])
for i in range(chains.shape[0]):
    plt.plot(chains[i, :, 0])
plt.ylabel('x1')

plt.subplot(212)
for i in range(chains.shape[0]):
    plt.plot(chains[i, :, 1])
plt.ylabel('x2')

plt.tight_layout()
plt.show()



In [ ]:
plt.hist(chains[0,:,0])
