Author: gully

Date: Jan 9, 2015

Fitting synthetic Si gap data with a Gaussian process and emcee.

Originals available at: https://github.com/dfm/gp-tutorial & https://speakerdeck.com/dfm/an-astronomers-introduction-to-gaussian-processes


In [1]:
%pylab inline
# Auto imports np and plt


Populating the interactive namespace from numpy and matplotlib

In [2]:
from matplotlib import rcParams
#rcParams["savefig.dpi"] = 150
import emcee  # http://dan.iel.fm/emcee
import triangle  # https://github.com/dfm/triangle.py
import numpy as np
import matplotlib.pyplot as plt
from astroML.plotting import hist
from astroML.stats.random import bivariate_normal
from matplotlib.patches import Ellipse
import timeit
#from IPython.display import display, Math, Latex

In [3]:
import seaborn as sns
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
cmap = sns.cubehelix_palette(light=1, as_cmap=True)
sns.palplot(sns.cubehelix_palette(light=1))


Non-linear physical optics example


In [4]:
import pandas as pd

In [5]:
# Load the per-point (heteroscedastic) measurement uncertainties
df = pd.read_csv("hetroscedastic.txt", names=['yerr', 'junk'], skiprows=1)
#df.head()

In [6]:
from etalon import *

np.random.seed(123456)

# Set up the true model: a gap of size d_True with fill factor f_True,
# normalized by the zero-gap transmission T_DSP.
x = np.arange(1260.0, 2501.0, 10.0)
N = len(x)

d_True = 4120.0
f_True = 0.045
T_DSP = T_gap_Si(x, 0.0)
y_True = T_gap_Si_withFF(x, d_True, f_True)/T_DSP

# Add noise with both per-point measurement uncertainties
# and non-trivial correlated errors.

#yerr = 0.0001 + 0.0003*np.random.rand(N)
yerr = df.yerr
yerr_hom = 0.0002*np.ones(N)
hom_cov = np.diag(yerr_hom ** 2)  # homoscedastic: one variance for all points
iid_cov = np.diag(yerr ** 2)      # heteroscedastic, but still independent

# Squared-exponential correlated term: amplitude a_True, length scale s_True
a_True = 0.0016
s_True = 200.0
true_cov = a_True**2.0 * np.exp(-0.5 * (x[:, None]-x[None, :])**2 / s_True**2) + iid_cov
y = np.random.multivariate_normal(y_True, true_cov)
#y = np.random.multivariate_normal(y, iid_cov)
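
As a quick sanity check, the draw above is only well defined if true_cov is symmetric and positive definite (a minimal sketch):

assert np.allclose(true_cov, true_cov.T)
sgn, logdet = np.linalg.slogdet(true_cov)
print(sgn)  # prints 1.0 for a positive-definite matrix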

Plot I: The noisy data


In [7]:
plt.errorbar(x, y, yerr=yerr)
plt.ylim(0.9, 1.01)


Out[7]:
(0.9, 1.01)

Plot II: Make a heatmap of the covariance matrix

The key idea about the covariance matrix is that the on-diagonal terms are the variances (i.e. $\sigma^2$) of the individual data points, whereas the off-diagonal terms encode the extent to which neighboring values are correlated.
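
As a minimal illustration (the toy values x_demo, a_demo, and s_demo here are invented, not from the analysis above):

# Toy squared-exponential covariance: variances on the diagonal,
# correlations that decay with separation off the diagonal.
x_demo = np.linspace(0.0, 10.0, 5)
a_demo, s_demo = 1.0, 2.0
K = a_demo**2 * np.exp(-0.5 * (x_demo[:, None] - x_demo[None, :])**2 / s_demo**2)
print(np.diag(K))         # on-diagonal: the variances, a_demo**2
print(K[0, 1], K[0, -1])  # off-diagonal: large for neighbors, ~0 for distant points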


In [8]:
plt.pcolormesh(hom_cov, cmap=cmap);
plt.colorbar();
plt.title('Homoscedastic, no covariance')


Out[8]:
<matplotlib.text.Text at 0x10a23fd10>

In [9]:
plt.pcolormesh(iid_cov, cmap=cmap);
plt.colorbar();
plt.title('Heteroscedastic, no covariance')


Out[9]:
<matplotlib.text.Text at 0x10b64bd90>

In [10]:
#Visualize the covariance
plt.pcolormesh(true_cov, cmap=cmap);
plt.colorbar();
plt.title('Heteroscedastic and covariance')


Out[10]:
<matplotlib.text.Text at 0x10cec5350>

Plot III: Data vs. 'truth'

Plot the data with the observational uncertainties; the true model is overplotted in red.


In [11]:
plt.errorbar(x, y, yerr=yerr, fmt=".k", capsize=0)
plt.plot(x, y_True, "-r", lw=2, alpha=0.3)
plt.xlabel('$\lambda$');
plt.ylabel('$T_{gap}$');
plt.title('Synthetic Si Transmission');
plt.ylim(0.9, 1.01);


We can't solve this with linear regression in matrix form, because the model is non-linear in its parameters $d$ and $f$.
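
For a model that is linear in its parameters, $y = A\theta + \mathrm{noise}$ with noise covariance $C$, generalized least squares has the closed-form solution $\hat{\theta} = (A^{\mathsf T} C^{-1} A)^{-1} A^{\mathsf T} C^{-1} y$. The etalon transmission depends non-linearly on $d$ and $f$, so no such design matrix $A$ exists, and we sample the posterior with MCMC instead.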

Constructing a guess at the covariance matrix

The off-diagonal terms are characterized by two parameters, $a$ and $s$: $a$ controls the strength of the correlation, and $s$ controls its range (the length scale).

First we define the log-likelihood function. You hand it $d$, $f$, $\ln a$, and $\ln s$, and it returns the log-likelihood of the data: $\ln{p}=\ln{p(y|d,f,a,s)}$

Note: x is not passed to this function. Python looks up names that aren't local (here x, y, hom_cov, n1, and T_DSP) in the enclosing and global namespaces after the local one.
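
Written out, the quantity the function evaluates is $\ln p(y|d,f,a,s) = -\frac{1}{2} r^T C^{-1} r - \frac{1}{2}\ln\det C - \frac{N}{2}\ln 2\pi$, where $r$ is the residual vector $y - T_{gap}(x; d, f)/T_{DSP}$ and $C_{ij} = C^{hom}_{ij} + a^2 \exp\left(-\frac{(x_i - x_j)^2}{2 s^2}\right)$. The constant $-\frac{N}{2}\ln 2\pi$ is dropped in the code below, which is harmless for MCMC since it does not depend on the parameters.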


In [12]:
n1 = sellmeier_Si(x)  # precompute the Si refractive index from the Sellmeier relation

In [13]:
def lnlike(d, f, lna, lns):
    a, s = np.exp(lna), np.exp(lns)
    # Squared-exponential kernel for the correlated component
    off_diag_terms = a**2 * np.exp(-0.5 * (x[:, None] - x[None, :])**2 / s**2)
    C = hom_cov + off_diag_terms
    sgn, logdet = np.linalg.slogdet(C)
    if sgn <= 0:
        return -np.inf  # C is not positive definite
    r = y - T_gap_Si_withFF_fast(x, d, f, n1)/T_DSP
    # Gaussian log-likelihood, dropping the constant -N/2 ln(2 pi)
    return -0.5 * (np.dot(r, np.linalg.solve(C, r)) + logdet)
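
An equivalent formulation, sketched here but not used in the runs below: a single Cholesky factorization supplies the linear solve, the log-determinant, and the positive-definiteness check all at once.

from scipy.linalg import cho_factor, cho_solve

def lnlike_chol(d, f, lna, lns):
    # Same log-likelihood as lnlike above, via one Cholesky factorization of C
    a, s = np.exp(lna), np.exp(lns)
    C = hom_cov + a**2 * np.exp(-0.5 * (x[:, None] - x[None, :])**2 / s**2)
    try:
        factor = cho_factor(C)
    except np.linalg.LinAlgError:
        return -np.inf  # C is not positive definite
    r = y - T_gap_Si_withFF_fast(x, d, f, n1)/T_DSP
    logdet = 2.0 * np.sum(np.log(np.diag(factor[0])))  # det C = prod(diag(L))**2
    return -0.5 * (np.dot(r, cho_solve(factor, r)) + logdet)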

In [14]:
# Apply uniform priors over plausible ranges;
# the shape parameters a and s are uniform in log space.
def lnprior(d, f, lna, lns):
    if not (3900 < d < 4300 and 0 < f < 1 and -12 < lna < -2 and 0 < lns < 8):
        return -np.inf
    return 0.0
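
A flat prior on $\ln a$ is equivalent to $p(a) \propto 1/a$, the usual scale-invariant choice for a parameter whose order of magnitude is unknown a priori; the same reasoning applies to $s$.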

In [15]:
def lnprob(p):
    lp = lnprior(*p)
    if not np.isfinite(lp):
        return -np.inf
    return lp + lnlike(*p)

Run emcee!


In [16]:
ndim, nwalkers = 4, 32
p0 = np.array([d_True, f_True, np.log(a_True), np.log(s_True)])  # Excellent guess: the true values
# Initialize the walkers in a tight Gaussian ball around p0
pos = [p0 + 1.0e-3*p0 * np.random.randn(ndim) for i in range(nwalkers)]

sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob)

In [17]:
# This is the burn-in
pos, lp, state = sampler.run_mcmc(pos, 300)

In [18]:
# Discard the burn-in samples and run the production chain
sampler.reset()
pos, lp, state = sampler.run_mcmc(pos, 600)
chain = sampler.chain
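
A quick health check on the chains, as a sketch (acceptance_fraction is an attribute of the emcee sampler):

# Rule of thumb: the mean acceptance fraction should land roughly in 0.2-0.5
print("Mean acceptance fraction: {0:.3f}".format(np.mean(sampler.acceptance_fraction)))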

In [19]:
fig, axes = plt.subplots(4, 1, figsize=(5, 6), sharex=True)
fig.subplots_adjust(left=0.1, bottom=0.1, right=0.96, top=0.98,
                    wspace=0.0, hspace=0.05)
[a.plot(np.arange(chain.shape[1]), chain[:, :, i].T, "k", alpha=0.5)
 for i, a in enumerate(axes)]
[a.set_ylabel("${0}$".format(l)) for a, l in zip(axes, ["d", "f", "\ln a", "\ln s"])]
axes[-1].set_xlim(0, chain.shape[1])
axes[-1].set_xlabel("iteration");



In [20]:
samples = sampler.flatchain
samples_lin = samples.copy()
samples_lin[:, 2:] = np.exp(samples_lin[:, 2:])  # convert ln a, ln s back to a and s

In [21]:
fig = triangle.corner(samples_lin,
                      labels=map("${0}$".format, ["d", "f", "a", "s"]),
                      truths=[d_True, f_True, a_True, s_True])



In [22]:
for d, f, lna, lns in samples[np.random.randint(len(samples), size=100)]:
    plt.plot(x, T_gap_Si_withFF_fast(x, d, f, n1)/T_DSP, color="b", alpha=0.03)

plt.errorbar(x, y, yerr=yerr, fmt=".k", capsize=0)
plt.plot(x, y_True, "--r", lw=2, alpha=1.0)
plt.xlabel('$\lambda$');
plt.ylabel('$T_{gap}$');
plt.title('100 samples from posterior');
plt.ylim(0.9, 1.01);



In [23]:
# Median and +/-1 sigma widths from the 16th/50th/84th percentiles
d_mcmc, f_mcmc, a_mcmc, s_mcmc = map(lambda v: (v[1], v[2]-v[1], v[1]-v[0]),
                                     zip(*np.percentile(samples_lin, [16, 50, 84],
                                                        axis=0)))
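
The one-liner above is compact but cryptic; an equivalent, more explicit version (a sketch producing the same numbers):

p16, p50, p84 = np.percentile(samples_lin, [16, 50, 84], axis=0)
summaries = [(mid, hi - mid, mid - lo) for lo, mid, hi in zip(p16, p50, p84)]
d_alt, f_alt, a_alt, s_alt = summaries  # matches (d_mcmc, f_mcmc, a_mcmc, s_mcmc)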

In [24]:
d_mcmc, f_mcmc, a_mcmc, s_mcmc


Out[24]:
((4118.3515183645122, 2.697620569362698, 2.7009011659447424),
 (0.045076605796915123, 0.0004033787867607011, 0.00040457398087738461),
 (0.0013253316059352953, 0.00011055360588542863, 9.4914502203741679e-05),
 (8.9781530856494651, 0.41436658111164881, 0.4367277443449602))

In [25]:
# Posterior predictive check: for a few posterior samples, draw a noisy
# realization of the model from the corresponding multivariate normal.
for d, f, a, s in samples_lin[np.random.randint(len(samples), size=3)]:
    off_diag_terms = a**2 * np.exp(-0.5 * (x[:, None] - x[None, :])**2 / s**2)
    C = iid_cov + off_diag_terms
    fit = T_gap_Si_withFF_fast(x, d, f, n1)/T_DSP
    vec = np.random.multivariate_normal(fit, C)
    plt.plot(x, vec, color="g", alpha=0.3)

plt.errorbar(x, y, fmt=".k")
plt.xlabel('$\lambda$');
plt.ylabel('$T_{gap}$');
plt.title('Noisy draws from the posterior predictive');
plt.ylim(0.9, 1.01);


