Author: gully

Date: Jan 9, 2015

Fitting Si gaps data with Gaussian Process and emcee.

Originals available at: https://github.com/dfm/gp-tutorial & https://speakerdeck.com/dfm/an-astronomers-introduction-to-gaussian-processes


In [16]:
%pylab inline
# Auto imports np and plt


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['ndim', 'hist', 'bivariate_normal']
`%matplotlib` prevents importing * from pylab and numpy

In [17]:
from matplotlib import rcParams
#rcParams["savefig.dpi"] = 150
import emcee  # http://dan.iel.fm/emcee
import triangle  # https://github.com/dfm/triangle.py
import numpy as np
import matplotlib.pyplot as plt
from astroML.plotting import hist
from astroML.stats.random import bivariate_normal
from matplotlib.patches import Ellipse
import timeit
#from IPython.display import display, Math, Latex
from astroML.decorators import pickle_results

In [18]:
import seaborn as sns
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
cmap = sns.cubehelix_palette(light=1, as_cmap=True)
sns.palplot(sns.cubehelix_palette(light=1))


Non-linear physical optics example

Now with real data.


In [19]:
import os

In [20]:
import pandas as pd
AO_dir = os.getenv("AO_bonding_paper")
df = pd.read_csv(AO_dir+"/data/cln_20130218_cary5000.csv")
df.set_index('wavelength', inplace=True)
df.head()


Out[20]:
Baseline 100%T Baseline 0%T VG05 VG01 VG06 VG02 VG02_pos2 VG02_pos3 VG02_pos4 VG03_pos1 VG03_pos2 VG04_pos1 VG04_pos2 VG06_post Baseline 100%T.1 Baseline 0%T.1 VG05_post
wavelength
2500 7.427119 -0.001234 53.442204 53.466736 53.262798 36.737545 52.972309 53.064991 33.512653 38.421932 52.000980 25.861622 54.211266 53.938526 7.585160 0.000067 53.521866
2490 7.814320 -0.001347 53.430767 53.484589 53.288322 36.678280 53.001614 53.104568 33.588028 37.254417 52.001225 25.784100 54.293777 53.990936 7.991349 0.000207 53.393871
2480 8.378716 -0.000838 53.504936 53.481739 53.299366 36.473946 53.025131 53.119553 33.557705 36.035366 51.969082 25.835318 54.253708 53.963161 8.566405 0.000034 53.493465
2470 8.825397 -0.001670 53.377266 53.474129 53.299530 36.204262 53.002007 53.103672 33.459511 34.838486 51.941597 25.706764 54.266666 53.970634 9.022456 -0.000127 53.406708
2460 9.592031 -0.001384 53.376835 53.469803 53.290367 35.955139 53.008461 53.092026 33.415165 33.569817 51.898754 25.675592 54.245071 53.950542 9.799633 -0.000737 53.352882

Make an estimate for point-to-point measurement uncertainties (heteroscedasticity)

The approach here is to make a running variance window function or something.


In [21]:
mask = (df.index > 1250.0)
df = df[mask]

In [22]:
hdf = pd.read_csv("hetroscedastic.txt",names=['yerr', 'junk'], skiprows=1)

Read in the data and make an estimate for the covariance.


In [23]:
from etalon import *

np.random.seed(123456)

# Introduce the Real data
x = df.index.values
y_True = df.VG03_pos2.values/df.VG06.values

#Truncate to just 1200 nm
wl_mask = (x > 1200.0)
x = x[wl_mask]
y_True = y_True[wl_mask]

N = len(x)

# Define T_DSP for the model
T_DSP = T_gap_Si(x, 0.0)

#Introduce some noise with both measurement uncertainties 
#   and non-trivial correlated errors.

#yerr = 0.0002*np.ones(N)
yerr = hdf.yerr
iid_cov = np.diag(yerr ** 2)

a_True = 0.002
s_True = 100.0
y = y_True

Plot III: Data vs 'truth'

And plot the data with the observational uncertainties. The true line is plotted in black.


In [24]:
plt.errorbar(x, y, yerr=yerr, fmt=".k", capsize=0)
plt.xlabel('$\lambda$');
plt.ylabel('$T_{gap}$');
plt.title('Measured Si Transmission');
plt.ylim(0.9, 1.01);


We can't do linear regression in Matrix form, because this is a non-linear problem.

Constructing a guess at the covariance matrix

The off-diagonal terms are characterized by parameters $a$ and $s$.

The term $a$ controls the strength of the interaction. The term $s$ controls the range of correlation.

First we define a natural log likelihood function. You hand it a $d$, a $f$, an $a$, and an $s$ and it gives you the likelihood of the data: $\ln{p}=\ln{p(y|d,f,a,s)}$

Note: x is not passed to this function. The way Python works, x, y, and iid_cov are searched for in the local namespace, then global namespace.


In [25]:
n1 = sellmeier_Si(x)

In [26]:
def lnlike(d, f, lna, lns):
    a, s = np.exp(lna), np.exp(lns)
    off_diag_terms = a**2 * np.exp(-0.5 * (x[:, None] - x[None, :])**2 / s**2)
    C = iid_cov + off_diag_terms
    sgn, logdet = np.linalg.slogdet(C)
    if sgn <= 0:
        return -np.inf
    r = y - T_gap_Si_withFF_fast(x, d, f, n1)/T_DSP
    return -0.5 * (np.dot(r, np.linalg.solve(C, r)) + logdet)

In [27]:
# Apply a uniform prior over some range.
# Shape params are uniform in log space 
def lnprior(d, f, lna, lns):
    if not (3800 < d < 4300 and 0 < f < 0.5 and -12 < lna < -2 and 0 < lns < 8):
        return -np.inf
    return 0.0

In [28]:
def lnprob(p):
    lp = lnprior(*p)
    if not np.isfinite(lp):
        return -np.inf
    return lp + lnlike(*p)

Run emcee!


In [29]:
ndim, nwalkers = 4, 32
d_Guess = 4120.0
f_Guess = 0.045
p0 = np.array([d_Guess, f_Guess, np.log(a_True), np.log(s_True)]) # Decent guess
pos = [p0 + 1.0e-2*p0 * np.random.randn(ndim) for i in range(nwalkers)]

sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob)

In [30]:
@pickle_results('SiGaps_03_data-sampler.pkl')
def hammer_time(ndim, nwalkers, d_Guess, f_Guess, a_Guess, s_Guess, nburnins, ntrials):
    
    # Initialize the walkers
    p0 = np.array([d_Guess, f_Guess, np.log(a_Guess), np.log(s_Guess)])
    pos = [p0 + 1.0e-2*p0 * np.random.randn(ndim) for i in range(nwalkers)]
    
    sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob)
    
    pos, lp, state = sampler.run_mcmc(pos, nburnins)
    sampler.reset()
    pos, lp, state = sampler.run_mcmc(pos, ntrials)
    return sampler

In [31]:
np.random.seed(78704)
ndim, nwalkers = 4, 32
d_Guess = 4120.0
f_Guess = 0.045
a_Guess = 0.0016
s_Guess = 100.0
nburnins = 200
ntrials = 600

In [32]:
sampler = hammer_time(ndim, nwalkers, d_Guess, f_Guess, a_Guess, s_Guess, nburnins, ntrials)


@pickle_results: computing results and saving to 'SiGaps_03_data-sampler.pkl'

In [33]:
fig, axes = plt.subplots(4, 1, figsize=(5, 6), sharex=True)
fig.subplots_adjust(left=0.1, bottom=0.1, right=0.96, top=0.98,
                    wspace=0.0, hspace=0.05)
[a.plot(np.arange(chain.shape[1]), chain[:, :, i].T, "k", alpha=0.5)
 for i, a in enumerate(axes)]
[a.set_ylabel("${0}$".format(l)) for a, l in zip(axes, ["d", "f", "\ln a", "\ln s"])]
axes[-1].set_xlim(0, chain.shape[1])
axes[-1].set_xlabel("iteration");


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-33-a62f5ee81a23> in <module>()
      3                     wspace=0.0, hspace=0.05)
      4 [a.plot(np.arange(chain.shape[1]), chain[:, :, i].T, "k", alpha=0.5)
----> 5  for i, a in enumerate(axes)]
      6 [a.set_ylabel("${0}$".format(l)) for a, l in zip(axes, ["d", "f", "\ln a", "\ln s"])]
      7 axes[-1].set_xlim(0, chain.shape[1])

NameError: name 'chain' is not defined

In [ ]:
samples = sampler.flatchain
samples_lin = copy(samples)
samples_lin[:, 2:] = np.exp(samples_lin[:, 2:])

In [ ]:
fig = triangle.corner(samples_lin, 
                      labels=map("${0}$".format, ["d", "f", "a", "s"]), 
                      quantiles=[0.16, 0.84])

In [ ]:
off_diag_terms = 0.0018**2 * np.exp(-0.5 * (x[:, None] - x[None, :])**2 / 12.0**2)
C = iid_cov + off_diag_terms
fit = T_gap_Si_withFF_fast(x, 4093.5, 0.04572, n1)/T_DSP

vec = np.random.multivariate_normal(fit, C)

In [ ]:
for d, f, a, s in samples_lin[np.random.randint(len(samples), size=3)]:
    #plt.plot(x, T_gap_Si_withFF_fast(x, d, f, n1)/T_DSP, color="r", alpha=0.03)
    off_diag_terms = a**2 * np.exp(-0.5 * (x[:, None] - x[None, :])**2 / s**2)
    C = iid_cov + off_diag_terms
    fit = T_gap_Si_withFF_fast(x, 4093.5, 0.04572, n1)/T_DSP
    vec = np.random.multivariate_normal(fit, C)
    plt.plot(x, vec, color="g", alpha=0.3)
    
plt.errorbar(x, y, yerr, fmt=".k")
plt.xlabel('$\lambda$');
plt.ylabel('$T_{gap}$');
plt.title('Observed Si Transmission');
plt.ylim(0.9, 1.01);

In [ ]:
d_mcmc, f_mcmc, a_mcmc, s_mcmc = map(lambda v: (v[1], v[2]-v[1], v[1]-v[0]),
                             zip(*np.percentile(samples_lin, [16, 50, 84],
                                                axis=0)))

In [ ]:
d_mcmc, f_mcmc, a_mcmc, s_mcmc

In [ ]: