I did this a bit flippantly before, but I want to fomalize the process by which we estimate the uncertainty on emulator predictions.


In [63]:
from pearce.emulator import SpicyBuffalo, LemonPepperWet
from pearce.mocks import cat_dict
import numpy as np
from os import path

In [64]:
import matplotlib
#matplotlib.use('Agg')
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

In [65]:
#xi gg
training_file = '/scratch/users/swmclau2/xi_zheng07_cosmo_lowmsat/PearceRedMagicXiCosmoFixedNd.hdf5'
#test_file= '/scratch/users/swmclau2/xi_zheng07_cosmo_test_lowmsat2/'
test_file =  '/scratch/users/swmclau2/xi_zheng07_cosmo_test_lowmsat2/PearceRedMagicXiCosmoFixedNd_Test.hdf5'
#xi gm training_file = '/scratch/users/swmclau2/xi_gm_cosmo/PearceRedMagicXiGMCosmoFixedNd.hdf5' test_file = '/scratch/users/swmclau2/xi_gm_cosmo_test2/PearceRedMagicXiGMCosmoFixedNdTest.hdf5'

In [66]:
em_method = 'gp'
split_method = 'random'

In [67]:
a = 1.0
z = 1.0/a - 1.0

In [68]:
fixed_params = {'z':z}#, 'cosmo': 0}#, 'r':24.06822623}
hp = np.loadtxt('/home/users/swmclau2/Git/pearce/bin/optimization/sloppy_joes_result_indiv_bins.npy')

In [69]:
from glob import glob

In [70]:
hp = []
for fname in sorted(glob('/home/users/swmclau2/Git/pearce/bin/optimization/sloppy_joes_indiv_bins/sloppy*.npy')):
    #print fname, len(hp)
    hp.append(np.loadtxt(fname))

In [71]:
len(hp)


Out[71]:
18

In [72]:
param_names = ['ombh2', 'omch2', 'w0', 'ns', 'ln10As', 'H0', 'Neff', 'logM0', 'sigma_logM', 'logM1', 'alpha']

In [73]:
pnames = ['bias', 'amp']
pnames.extend(param_names)
pnames.append('amp')
pnames.extend(param_names)
from collections import defaultdict metric = defaultdict(list) for val, pname in zip(hp, pnames): metric[pname].append(val)

In [74]:
from collections import defaultdict
metric = []

for _hp in hp:
    metric.append(defaultdict(list))
    for val, pname in zip(_hp, pnames):
        metric[-1][pname].append(val)
np.random.seed(0) emu = SpicyBuffalo(training_file, method = em_method, fixed_params=fixed_params, custom_mean_function = 'linear', downsample_factor = 0.05, hyperparams = {'metric':metric})

In [75]:
np.random.seed(0)
emu = LemonPepperWet(training_file, method = em_method, fixed_params=fixed_params,
                 custom_mean_function = None, downsample_factor = 0.1)#, hyperparams = {'metric':metric})

In [76]:
emu.get_param_names()


Out[76]:
['ombh2',
 'omch2',
 'w0',
 'ns',
 'ln10As',
 'H0',
 'Neff',
 'logM0',
 'sigma_logM',
 'logM1',
 'alpha']
test_x, test_y, test_ycov, info = emu.get_data(test_file, emu.fixed_params, attach_params = False)
gof = emu.goodness_of_fit(test_file, statistic = 'frac') #print gof.mean(), np.median(gof) for g in gof: print g.mean(), np.median(g)

In [77]:
pred_y, data_y = emu.goodness_of_fit(test_file, statistic = None)

In [78]:
print len(pred_y[0])


35000

In [79]:
test_x, _, test_err, _ = emu.get_data(test_file, emu.fixed_params)

t, old_idxs  = emu._whiten(test_x)

In [80]:
train_x, train_y, train_err, info = emu.get_data(training_file, emu.fixed_params)


/home/users/swmclau2/.local/lib/python2.7/site-packages/pearce/emulator/emu.py:294: UserWarning: WARNING: NaN detected. Skipped 21 points in training data.
  warnings.warn('WARNING: NaN detected. Skipped %d points in training data.' % (num_skipped))

In [ ]:


In [81]:
len(train_err)


Out[81]:
40000

In [82]:
mean_func_at_params = emu.mean_function(t)

In [83]:
mean_func_at_params.shape


Out[83]:
(18,)
mean_func_at_params = self.mean_function(t) if self._downsample_factor == 1.0: y = self.y else: y = self.downsample_y mu = [] err = [] for bin_no, (t_in_bin, mfc,_y, emulator) in enumerate(izip(t, mean_func_at_params, y, self._emulators)): #print '_y', _y if self.method == 'gp': if gp_errs: local_mu, local_err = emulator.predict(_y, t_in_bin, return_cov = False,return_var=True) else: #print _y #print t_in_bin local_mu = emulator.predict(_y, t_in_bin, return_cov = False,return_var=False) #print local_mu local_err = np.ones_like(local_mu) else: local_mu = emulator.predict(t_in_bin) local_err = np.ones_like(local_mu) # weight with this instead of the errors. #print 'local_mu, mfc', local_mu, mfc mu.append(self._y_std[bin_no]*(local_mu + mfc) + self._y_mean[bin_no]) err.append(local_err*self._y_std[bin_no]) #now, figure out how to return to the shape of t combined_mu = np.zeros((t_size,)) combined_err = np.zeros((t_size,)) for r_idx, bin_idxs in enumerate(old_idxs): combined_mu[bin_idxs] = mu[r_idx] combined_err[bin_idxs] = err[r_idx] # Reshape to be consistent with my other implementation if not gp_errs: return combined_mu return combined_mu, combined_err

In [84]:
plt_idx = 4249
print t[0][plt_idx]
print pred_y[0][plt_idx]
print data_y[0][plt_idx]
#print mean_func_at_params[0][plt_idx]


[ 1.33619946 -1.41073566  1.2937706   1.09007699 -0.99425337 -1.29157473
 -1.01915118 -0.62179221 -0.20608562  0.74996107 -0.50398343]
3.86065096638
3.99002394681
emu._emulators[0].predict(np.array([t[0][plt_idx]]), emu.downsample_y[0])
for plt_idx in np.random.choice(len(pred_y[0]), 100): print plt_idx pred_plot = np.array([pred_y[bi][plt_idx] for bi in xrange(len(pred_y))]) data_plot = np.array([data_y[bi][plt_idx] for bi in xrange(len(pred_y))]) mean_plot = np.array([mean_func_at_params[bi][plt_idx] for bi in xrange(len(pred_y))]) #mean_plot = np.array([mean_func_at_params[bi] for bi in xrange(len(pred_y))]) plt.plot(emu.scale_bin_centers, pred_plot, label = "Emu") plt.plot(emu.scale_bin_centers, data_plot, label = "Data") plt.plot(emu.scale_bin_centers, mean_plot, label = "Mean") plt.xscale('log') plt.legend(loc = 'best') plt.show()
hod_no = 10 cosmo_no = 2 #for cosmo_no in xrange(7): for realization_no in xrange(5): for idx in xrange(len(pred_y)): py = pred_y[idx][cosmo_no*5+realization_no+hod_no::1000] dy = data_y[idx][cosmo_no*5+realization_no+hod_no::1000] diffmat = np.abs(10**py-10**dy) denommat = 10**dy acc = diffmat/denommat mean_acc = np.mean(acc) print mean_acc, #plt.vlines(mean_acc, 0, 000) #plt.hist(acc, bins = np.logspace(-3, 3, 30)) #plt.xscale('log') #plt.show() print
resmat = np.zeros( (len(pred_y), pred_y[-1].shape[0])) for bin_no in xrange(len(pred_y)): py, dy = pred_y[bin_no], data_y[bin_no] resmat[bin_no, :py.shape[0]] = 10**py - 10**dy

In [85]:
resmat = np.zeros(( 7, 5, 1000, 18))
resmat_log = np.zeros_like(resmat)
predmat = np.zeros(( 7, 5, 1000, 18))

datamat = np.zeros(( 7, 5, 1000, 18))

for bin_no in xrange(len(pred_y)):
    py, dy = pred_y[bin_no], data_y[bin_no]
    for cosmo_no in xrange(7):
        cpy, cdy = py[cosmo_no*5000:(cosmo_no+1)*5000], dy[cosmo_no*5000:(cosmo_no+1)*5000]
        #for realization_no in xrange(5):
        for hod_no in xrange(1000):
            crpy, crdy = cpy[hod_no::1000], cdy[hod_no::1000]

            #for hod_no in xrange(1000):
            #print crpy.std()
            predmat[cosmo_no, :,  hod_no , bin_no] = crpy
            datamat[cosmo_no, :,  hod_no , bin_no] = crdy
            resmat[cosmo_no, :,  hod_no , bin_no] = 10**crpy - 10**crdy
            resmat_log[cosmo_no, :,  hod_no , bin_no] = crpy - crdy

In [86]:
resmat.shape


Out[86]:
(7, 5, 1000, 18)

In [87]:
N = 5
for cosmo_no in xrange(7):
    for hod_no in xrange(1000):
        r = resmat[cosmo_no, :, hod_no].reshape((-1, 18), order = 'C').T
        print r.shape
        scovmat = r.dot(r.T)/r.shape[1]
        #fig = plt.figure(figsize = (10, 6))
        #plt.subplot(121)
        im = plt.imshow(np.log10(np.abs(scovmat) ))
        plt.colorbar(im)
        #plt.subplot(122)
        #pred = 10**predmat[cosmo_no, :, hod_no].reshape((-1, 18), order = 'C').T
        #print pred.shape
        #data = datamat[cosmo_no, :, hod_no].reshape((-1, 18), order = 'C').T
        #scovmat2 = np.cov(pred)
        #im = plt.imshow(np.log10(np.abs(scovmat2) ), cmap = 'viridis', vmin = -5,vmax = 9)
        #plt.colorbar(im)
        
        plt.show()
        N-=1
        
        if N == 0:
            break
    if N==0:
        break


(18, 5)
(18, 5)
(18, 5)
(18, 5)
(18, 5)

In [88]:
N = 50
for cosmo_no in xrange(7):
    for hod_no in xrange(1000):
        
        pred = 10**predmat[cosmo_no, :, hod_no].reshape((-1, 18), order = 'C')
        data = 10**datamat[cosmo_no, :, hod_no].reshape((-1, 18), order = 'C')
        for p,d in zip(pred,data):
            plt.plot(emu.scale_bin_centers, np.abs(p-d)/d)
        #for d in data:
        #    plt.plot(emu.scale_bin_centers, d)
        
        plt.xscale('log')
        plt.yscale('log')
        plt.show()
        N-=1
        
        if N == 0:
            break
    if N==0:
        break



In [89]:
resmat_flat = resmat.reshape((-1, 18)).T
datamat_flat = datamat.reshape((-1, 18)).T
#resmat_hodrealav = np.mean(resmat_realav, axis = 1)

In [90]:
r_idx = 10
t_bin = t[r_idx]
acc_bin = np.abs(resmat_flat[r_idx])/datamat_flat[r_idx]

In [91]:
t_bin.shape, acc_bin.shape


Out[91]:
((35000, 11), (35000,))

In [92]:
percentiles = np.percentile(acc_bin, range(101))
norm_acc_bin = np.digitize(acc_bin, percentiles)
#norm_acc_bin = 100*((acc_bin - acc_bin.min())/acc_bin.max()).astype(int)

In [93]:
palette = sns.diverging_palette(220, 20, n=len(percentiles)-1, as_cmap=True)
#sns.set_palette(palette)

In [94]:
pnames = emu.get_param_names()

In [95]:
for axes1 in xrange(7,11):
    for axes2 in xrange(axes1+1, 11):
        cbar = plt.scatter(t_bin[:,axes1 ], t_bin[:,axes2], c = norm_acc_bin,cmap = palette, alpha = 0.2)
        plt.colorbar(cbar)
        plt.xlabel(pnames[axes1])
        plt.ylabel(pnames[axes2])
        #plt.gray()
        plt.show()



In [96]:
test_err_bin = test_err[:, r_idx, r_idx]

In [97]:
plt.hist(np.log10(test_err_bin) )


Out[97]:
(array([   21.,   136.,   635.,  2505.,  6071.,  9794.,  9807.,  4954.,
          997.,    80.]),
 array([-6.4880701 , -6.23495877, -5.98184744, -5.72873611, -5.47562478,
        -5.22251345, -4.96940212, -4.71629079, -4.46317947, -4.21006814,
        -3.95695681]),
 <a list of 10 Patch objects>)

Why are these different?


In [98]:
plt.hist(np.log10(np.sqrt(emu.yerr[r_idx])) )


Out[98]:
(array([   164.,   2085.,   8315.,  10257.,   8919.,   5685.,   2963.,
          1287.,    289.,     36.]),
 array([-1.41924799, -1.30072147, -1.18219495, -1.06366842, -0.9451419 ,
        -0.82661538, -0.70808885, -0.58956233, -0.47103581, -0.35250928,
        -0.23398276]),
 <a list of 10 Patch objects>)

In [99]:
len(emu.yerr)


Out[99]:
18

In [100]:
percentiles = np.percentile(np.log10(test_err_bin), [0, 10,20,30,40,50,60,70,80,90,100])
norm_err_bin = np.digitize(np.log10(test_err_bin), percentiles)

In [101]:
#relevant stat is uncertainty in training error, not test error
for axes1 in xrange(7,11):
    for axes2 in xrange(axes1+1, 11):
        cbar = plt.scatter(t_bin[:,axes1 ], t_bin[:,axes2], c = norm_err_bin,cmap = palette, alpha = 0.2)
        plt.colorbar(cbar)
        plt.xlabel(pnames[axes1])
        plt.ylabel(pnames[axes2])
        #plt.gray()
        plt.show()



In [102]:
test_err_diag= np.array([test_err[:, r_idx, r_idx] for r_idx in xrange(emu.n_bins)] )

In [103]:
test_err_diag.mean(axis = 1)


Out[103]:
array([  6.66416262e-04,   3.90266113e-04,   2.40298112e-04,
         1.59608324e-04,   1.11817891e-04,   8.52685284e-05,
         6.85463332e-05,   3.27251532e-05,   1.61159551e-05,
         1.23840494e-05,   1.22300077e-05,   1.20386761e-05,
         8.81174492e-06,   6.19478076e-06,   5.05835371e-06,
         4.66060676e-06,   5.20829565e-06,   7.42500106e-06])

In [112]:
np.sqrt(np.mean(np.square(np.abs(resmat_flat)/(10**datamat_flat) )))


Out[112]:
0.084350297620042369

In [104]:
plt.plot(emu.scale_bin_centers, np.mean(np.abs(resmat_flat)/(10**datamat_flat), axis = 1) )
plt.plot(emu.scale_bin_centers, np.mean(np.abs(test_err_diag), axis = 1) )

plt.xscale('log')



In [105]:
resmat_log_flat = resmat_log.reshape((-1, 18)).T
emu.yerr[0].mean(), emu.y[0]

In [106]:
np.mean(np.array([ye/np.abs(y+1e-9) for ye, y in zip(emu.yerr, emu.y)]) , axis = 1)


Out[106]:
array([  6.88971478e+02,   6.13597662e-03,   6.64106546e-03,
         7.28736319e-03,   8.16241541e-03,   9.42955257e-03,
         7.25344587e+01,   1.10902183e-02,   1.14167158e-02,
         1.31384973e-02,   1.63399965e-02,   2.17118471e-02,
         3.10878552e-02,   5.23228689e-02,   2.06445287e-01,
         8.96476256e-01,   1.11314614e-01,   3.06048094e-02])

In [107]:
plt.plot(emu.scale_bin_centers, np.mean(np.abs(resmat_log_flat)/np.abs(datamat_flat), axis = 1), label = 'Pred acc' )
#plt.plot(emu.scale_bin_centers, np.mean(np.array([ye/np.abs(y+1e-9) for ye, y in zip(emu.yerr, emu.y)]), axis = 1), label = 'Training Err' )
plt.yscale('log')
plt.legend(loc ='best')
plt.xscale('log')



In [108]:
for res, dat in zip(resmat_flat, 10**datamat_flat):
    plt.hist(res/dat, bins = 30)#, bins = np.linspace(-1, 1, 30))
    plt.yscale('log')
    plt.show()



In [109]:
scovmat = resmat_flat.dot(resmat_flat.T)/(len(pred_y[0])-1)

In [110]:
im = plt.imshow(np.log10(scovmat) )
plt.colorbar(im)
plt.show()



In [111]:
print np.sqrt(np.diag(scovmat))


[  6.82852886e+03   4.22204695e+03   2.60080324e+03   1.57332998e+03
   9.29063406e+02   5.35138494e+02   2.99363194e+02   1.61047184e+02
   8.45135299e+01   3.97037362e+01   1.44902002e+01   3.71410566e+00
   8.29729209e-01   2.73203089e-01   1.34804020e-01   6.63547077e-02
   3.28807034e-02   2.44989037e-02]
# xi_gg [ 5.24570257e+03 3.23374673e+03 1.87386389e+03 1.21527448e+03 6.75245722e+02 3.90243992e+02 2.29159542e+02 1.35977239e+02 7.92194628e+01 3.78496692e+01 1.44829661e+01 4.31545126e+00 8.90656761e-01 2.21231511e-01 1.58627067e-01 5.99630335e-02 3.33024301e-02 2.38020596e-02]
# xi_gm [ 6.49230284e+02 2.93763883e+02 1.38709271e+02 6.46778563e+01 3.78016132e+01 2.51962597e+01 1.64736353e+01 1.11197889e+01 6.22022662e+00 2.74484762e+00 1.04167114e+00 3.10671790e-01 9.63501742e-02 5.49056973e-02 3.15483462e-02 1.63669320e-02 8.80627565e-03 5.35716884e-03]

In [ ]:


In [ ]:


In [ ]: