Author: James Bourbeau
In [1]:
    
%load_ext watermark
%watermark -u -d -v -p numpy,matplotlib,scipy,pandas,sklearn,mlxtend
    
    
In [2]:
    
%matplotlib inline
from __future__ import division, print_function
import os
from collections import defaultdict
import numpy as np
from scipy.sparse import block_diag
import pandas as pd
import matplotlib.pyplot as plt
import seaborn.apionly as sns
import json
from scipy.interpolate import UnivariateSpline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, KFold, StratifiedKFold
import comptools as comp
import comptools.analysis.plotting as plotting
color_dict = comp.analysis.get_color_dict()
    
    
    
    
Choose whether to train on the combined 'light'/'heavy' composition classes or on the individual compositions ('P', 'He', 'O', 'Fe')
In [3]:
    
comp_class = True
comp_list = ['light', 'heavy'] if comp_class else ['P', 'He', 'O', 'Fe']
    
Define energy binning for this analysis
In [4]:
    
energybins = comp.analysis.get_energybins()
    
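comp.analysis.get_energybins is project-specific; a minimal, hypothetical stand-in (assuming 0.1-wide log10(E/GeV) bins over the analysis range, with the attribute names used below) would look something like:
In [ ]:
    
# Hypothetical sketch of the energy binning object -- not the actual comptools API
class EnergyBins(object):
    def __init__(self, log_energy_min=6.1, log_energy_max=8.0, bin_width=0.1):
        self.log_energy_min = log_energy_min
        self.log_energy_max = log_energy_max
        self.log_energy_bins = np.arange(log_energy_min, log_energy_max + bin_width, bin_width)
        self.log_energy_midpoints = (self.log_energy_bins[1:] + self.log_energy_bins[:-1]) / 2
        self.energy_midpoints = 10**self.log_energy_midpoints
    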
In [5]:
    
df_sim_train, df_sim_test = comp.load_sim(config='IC86.2012', log_energy_min=6.0, log_energy_max=8.3)
    
In [6]:
    
log_reco_energy_sim_test = df_sim_test['lap_log_energy']
log_true_energy_sim_test = df_sim_test['MC_log_energy']
    
In [7]:
    
feature_list, feature_labels = comp.analysis.get_training_features()
    
Get composition classifier pipeline
In [8]:
    
pipeline_str = 'BDT'
pipeline = comp.get_pipeline(pipeline_str)
    
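comp.get_pipeline is likewise project-specific; a hypothetical equivalent for the 'BDT' key (assuming a scikit-learn boosted decision tree wrapped in a Pipeline) might be:
In [ ]:
    
# Hypothetical stand-in for comp.get_pipeline('BDT') -- not the actual comptools implementation
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
example_pipeline = Pipeline([('BDT', GradientBoostingClassifier(max_depth=3, n_estimators=100))])
    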
In [9]:
    
pipeline
    
    Out[9]:
In [10]:
    
pipeline.fit(df_sim_train[feature_list], df_sim_train['target'])
    
    Out[10]:
In [13]:
    
efficiencies = {}
efficiencies_err = {}
for composition in comp_list:
    eff_path = os.path.join(comp.paths.comp_data_dir, 'unfolding',
                            'efficiency-{}.npy'.format(composition))
    efficiencies[composition] = np.load(eff_path)
    eff_err_path = os.path.join(comp.paths.comp_data_dir, 'unfolding',
                                'efficiency-err-{}.npy'.format(composition))
    efficiencies_err[composition] = np.load(eff_err_path)
    
Plot efficiencies
In [16]:
    
fig, ax = plt.subplots()
for composition in comp_list:
    ax.errorbar(energybins.log_energy_midpoints, efficiencies[composition],
                yerr=efficiencies_err[composition], color=color_dict[composition],
                label=composition, marker='.')
ax.set_xlabel('$\mathrm{\log_{10}(E_{true}/GeV)}$')
ax.set_ylabel('Detection efficiency \n($\mathrm{N_{passed}/N_{thrown}}$)')
ax.grid()
ax.ticklabel_format(style='sci',axis='y')
ax.yaxis.major.formatter.set_powerlimits((0,0))
plt.show()
    
    
In [43]:
    
# Interleave the per-composition efficiencies to match the
# [light, heavy, light, heavy, ...] cause ordering used for unfolding
det_efficiencies = np.empty(len(energybins.log_energy_midpoints)*2)
det_efficiencies[::2] = efficiencies['light']
det_efficiencies[1::2] = efficiencies['heavy']
    
In [44]:
    
det_efficiencies_err = np.empty_like(det_efficiencies)
det_efficiencies_err[::2] = efficiencies_err['light']
det_efficiencies_err[1::2] = efficiencies_err['heavy']
    
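The interleaving can also be done (and checked) in one step: a column stack followed by ravel alternates light/heavy within each energy bin, matching the cause ordering of the block response matrix built below.
In [ ]:
    
# Sanity check: column_stack + ravel reproduces the interleaved layout
interleaved = np.column_stack((efficiencies['light'], efficiencies['heavy'])).ravel()
np.testing.assert_allclose(interleaved, det_efficiencies)
    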
In [11]:
    
df_sim = comp.load_sim(config='IC86.2012', test_size=0)
    
In [12]:
    
df_sim.lap_cos_zenith.min(), df_sim.lap_cos_zenith.max()
    
    Out[12]:
In [13]:
    
(df_sim.lap_cos_zenith.max() + df_sim.lap_cos_zenith.min())/2
    
    Out[13]:
In [14]:
    
counts_MC_energy_bins = np.histogram(df_sim.MC_log_energy, bins=energybins.log_energy_bins)[0]
counts_MC_energy_bins
    
    Out[14]:
In [15]:
    
counts_reco_energy_bins = np.histogram(df_sim.lap_log_energy, bins=energybins.log_energy_bins)[0]
counts_reco_energy_bins
    
    Out[15]:
In [16]:
    
counts_reco_energy_bins / counts_MC_energy_bins
    
    Out[16]:
In [17]:
    
fig, ax = plt.subplots()
plotting.plot_steps(energybins.log_energy_bins, counts_MC_energy_bins, ax=ax)
ax.set_xlabel('$\mathrm{\log_{10}(E_{MC}/GeV)}$')
ax.set_ylabel('\# true events')
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.grid()
plt.show()
    
    
    
In [18]:
    
fig, ax = plt.subplots()
plotting.plot_steps(energybins.log_energy_bins, counts_reco_energy_bins, ax=ax)
ax.set_xlabel('$\mathrm{\log_{10}(E_{reco}/GeV)}$')
ax.set_ylabel('\# reco events')
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.grid()
plt.show()
    
    
In [19]:
    
fig, ax = plt.subplots()
plotting.plot_steps(energybins.log_energy_bins, counts_reco_energy_bins / counts_MC_energy_bins, ax=ax)
ax.axhline(1.0, marker='None', ls='-.', color='k')
ax.set_xlabel('$\mathrm{\log_{10}(E/GeV)}$')
ax.set_ylabel('\# reco events / \# true events')
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.grid()
plt.show()
    
    
In [20]:
    
def run_to_energy_bin(run):
    """Map a simulation run number to its thrown log10(E/GeV) energy bin."""
    ebin_first = 5.0
    ebin_last = 7.9
    # Taken from http://simprod.icecube.wisc.edu/cgi-bin/simulation/cgi/cfg?dataset=12360
    # Runs cycle through the 30 thrown bins: run 1 -> 5.0, run 30 -> 7.9, run 31 -> 5.0, ...
    return (ebin_first*10 + (run-1) % (ebin_last*10 - ebin_first*10 + 1)) / 10
    
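A quick check of the cyclic mapping: runs 1 through 30 cover the thrown bins 5.0 through 7.9, after which the pattern repeats.
In [ ]:
    
for run in [1, 2, 30, 31]:
    print(run, run_to_energy_bin(run))  # -> 5.0, 5.1, 7.9, 5.0
    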
In [21]:
    
def thrown_showers_per_ebin(sim_list, log_energy_bins=None):
    # Determine the thrown energy bin of every simulation file from its run number
    e_bins = []
    for sim in sim_list:
        print(sim)
        for f in comp.simfunctions.get_level3_sim_files_iterator(sim):
            start_idx = f.find('Run')
            run = int(f[start_idx+3: start_idx+9])
            e_bins.append(run_to_energy_bin(run))
    if log_energy_bins is None:
        log_energy_bins = np.arange(5, 8.1, 0.1)
    vals = np.histogram(e_bins, bins=log_energy_bins)[0]
    # Each simulation file contains 100 resampled showers
    n_showers_per_file = 100
    thrown_showers = vals * n_showers_per_file
    return thrown_showers
    
In [22]:
    
sim_list = [12360, 12362, 12630, 12631]
thrown_showers = thrown_showers_per_ebin(sim_list, log_energy_bins=energybins.log_energy_bins)
    
    
In [23]:
    
fig, ax = plt.subplots()
plotting.plot_steps(energybins.log_energy_bins, counts_reco_energy_bins / thrown_showers, ax=ax, 
                    label='Reco energy bins', color='C0', alpha=0.75)
plotting.plot_steps(energybins.log_energy_bins, counts_MC_energy_bins / thrown_showers, ax=ax, 
                    label='MC energy bins', color='C1', alpha=0.75)
# ax.axhline(1.0, marker='None', ls='-.', color='k')
ax.set_xlabel('$\mathrm{\log_{10}(E/GeV)}$')
ax.set_ylabel('\# passed events / \# thrown events')
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.set_ylim(0)
ax.grid()
ax.legend()
plt.show()
    
    
In [47]:
    
sim_dict = {'light': [12360, 12630],
            'heavy': [12362, 12631]}
efficiencies = np.empty(len(energybins.log_energy_midpoints)*2)
efficiencies_err = np.empty_like(efficiencies)
fig, ax = plt.subplots()
for composition, sim_list in sim_dict.items():
    comp_mask = df_sim.MC_comp_class == composition
    # Get number of thrown showers in each energy bin
    thrown_showers = thrown_showers_per_ebin(sim_list, log_energy_bins=energybins.log_energy_bins)
    # Get number of showers in each energy bin that pass cuts
    counts_MC_energy_bins = np.histogram(df_sim.MC_log_energy[comp_mask],
                                         bins=energybins.log_energy_bins)[0].astype(float)
    
    # Energy bins above log10(E/GeV) = 7 use the larger 1700 m thrown radius
    large_thrown_radius = energybins.log_energy_midpoints > 7
    # (Optional) scale the larger thrown radius up by the area ratio:
    # counts_MC_energy_bins[large_thrown_radius] *= (1700/1100)**2
    # Calculate detection efficiency and add them to normalizations array
    eff, eff_err = comp.ratio_error(counts_MC_energy_bins, np.sqrt(counts_MC_energy_bins),
                                    thrown_showers, np.sqrt(thrown_showers))
    
    start_idx = 0 if composition == 'light' else 1
    efficiencies[start_idx::2] = eff
    efficiencies_err[start_idx::2] = eff_err
    # Plot detection efficiencies
    plotting.plot_steps(energybins.log_energy_bins, eff, yerr=eff_err, ax=ax, 
                        label=composition, color=color_dict[composition], alpha=0.75)
    
    spl = UnivariateSpline(energybins.log_energy_midpoints[~large_thrown_radius],
                           eff[~large_thrown_radius], s=5)
    ax.plot(energybins.log_energy_midpoints[~large_thrown_radius],
            spl(energybins.log_energy_midpoints[~large_thrown_radius]), color=color_dict[composition])
    
    spl = UnivariateSpline(energybins.log_energy_midpoints[large_thrown_radius],
                           eff[large_thrown_radius], s=5)
    ax.plot(energybins.log_energy_midpoints[large_thrown_radius],
            spl(energybins.log_energy_midpoints[large_thrown_radius]), color=color_dict[composition])
# ax.axhline(1.0, marker='None', ls='-.', color='k')
ax.set_xlabel('$\mathrm{\log_{10}(E_{MC}/GeV)}$')
ax.set_ylabel('Detection efficiency \n($\mathrm{N_{passed}/N_{thrown}}$)')
# ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.set_ylim(0)
plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
ax.grid()
ax.legend()
efficiency_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'iter-bayesian',
                                  'detection-efficiency.png')
comp.check_output_dir(efficiency_outfile)
plt.savefig(efficiency_outfile)
plt.show()
    
    
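Note that the splines are fit separately below and above log10(E/GeV) = 7 because the thrown radius increases from 1100 m to 1700 m there, which produces a step in the raw efficiency.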
    
In [43]:
    
efficiencies, efficiencies_err
    
    Out[43]:
In [26]:
    
@np.vectorize
def get_sim_thrown_radius(log_energy):
    """Return the simulation thrown radius (in meters) for a given log10(E/GeV)."""
    if log_energy <= 6:
        thrown_radius = 800.0
    elif (log_energy > 6) & (log_energy <= 7):
        thrown_radius = 1100.0
    elif (log_energy > 7) & (log_energy <= 8):
        thrown_radius = 1700.0
    else:
        raise ValueError('Invalid energy entered')
    return thrown_radius
    
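A quick check of the radius steps (800 m up to 10^6 GeV, 1100 m up to 10^7 GeV, and 1700 m up to 10^8 GeV):
In [ ]:
    
get_sim_thrown_radius(np.array([5.5, 6.5, 7.5]))  # -> array([ 800., 1100., 1700.])
    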
In [27]:
    
thrown_radii = get_sim_thrown_radius(energybins.log_energy_midpoints)
thrown_area = np.pi * (thrown_radii**2)
geom_factor = (df_sim.lap_cos_zenith.max() + df_sim.lap_cos_zenith.min()) / 2
print(geom_factor)
fig, ax = plt.subplots()
plotting.plot_steps(energybins.log_energy_bins, thrown_area * geom_factor)
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.set_xlabel('$\mathrm{\log_{10}(E_{true}/GeV)}$')
ax.set_ylabel('Thrown area [$\mathrm{m^2}$]')
plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
ax.grid()
thrown_area_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'iter-bayesian',
                                  'thrown-area.png')
comp.check_output_dir(thrown_area_outfile)
plt.savefig(thrown_area_outfile)
plt.show()
    
    
    
In [29]:
    
sim_list = [12360, 12362, 12630, 12631]
thrown_showers = thrown_showers_per_ebin(sim_list, log_energy_bins=energybins.log_energy_bins)
    
    
In [31]:
    
passed_showers = np.histogram(df_sim.loc[:, 'MC_log_energy'], bins=energybins.log_energy_bins)[0]
    
In [32]:
    
efficiency, efficiency_err = comp.ratio_error(passed_showers, np.sqrt(passed_showers),
                                              thrown_showers, np.sqrt(thrown_showers))
    
In [33]:
    
geom_factor = (df_sim.lap_cos_zenith.max() + df_sim.lap_cos_zenith.min()) / 2
    
In [35]:
    
thrown_radii = get_sim_thrown_radius(energybins.log_energy_midpoints)
thrown_areas = np.pi * thrown_radii**2
    
In [36]:
    
eff_area = efficiency * thrown_areas * geom_factor
    
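That is, the effective area is the thrown area projected onto the mean zenith and scaled by the per-bin detection efficiency: A_eff(E) = efficiency(E) * pi * R_thrown(E)^2 * <cos(zenith)>.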
In [41]:
    
plotting.plot_steps(energybins.log_energy_bins, eff_area)
plt.xlabel('$\mathrm{\log_{10}(E/GeV)}$')
plt.ylabel('Effective area [$\mathrm{m^2}$]')
plt.ylim(0)
plt.show()
    
    
In [18]:
    
df_data = comp.load_data(config='IC86.2012', log_energy_min=6.0, log_energy_max=8.3)
    
In [19]:
    
X_data = comp.dataframe_functions.dataframe_to_array(df_data, feature_list + ['lap_log_energy'])
log_energy_data = X_data[:,-1]
X_data = X_data[:,:-1]
    
In [20]:
    
# is_finite_mask = np.isfinite(data.X)
# not_finite_mask = np.logical_not(is_finite_mask)
# finite_data_mask = np.logical_not(np.any(not_finite_mask, axis=1))
# data = data[finite_data_mask]
    
In [21]:
    
data_predictions = pipeline.predict(X_data)
    
In [22]:
    
# Get composition masks
data_labels = np.array([comp.dataframe_functions.label_to_comp(pred) for pred in data_predictions])
data_light_mask = data_labels == 'light'
data_heavy_mask = data_labels == 'heavy'
    
In [23]:
    
# Get number of identified comp in each energy bin
df_flux = {}
comp_list = ['light', 'heavy']
for composition in comp_list:
    comp_mask = data_labels == composition
    df_flux['counts_' + composition] = np.histogram(log_energy_data[comp_mask],
                                            bins=energybins.log_energy_bins)[0]
    df_flux['counts_' + composition + '_err'] = np.sqrt(df_flux['counts_' + composition])
df_flux['counts_total'] = np.histogram(log_energy_data, bins=energybins.log_energy_bins)[0]
df_flux['counts_total_err'] = np.sqrt(df_flux['counts_total'])
    
In [24]:
    
# Collect the per-composition counts into a DataFrame used for the unfolding
unfolding_df = pd.DataFrame(df_flux)
    
In [25]:
    
unfolding_df.index.rename('log_energy_bin_idx', inplace=True)
    
In [26]:
    
unfolding_df
    
    Out[26]:
In [53]:
    
# num_particles, num_particles_err = comp.analysis.get_num_particles(sim_train, data, pipeline, comp_list)
    
In [54]:
    
# unfolding_df['counts_light'] = num_particles['light']
# unfolding_df['counts_heavy'] = num_particles['heavy']
# unfolding_df['counts_err_light'] = num_particles_err['light']
# unfolding_df['counts_err_heavy'] = num_particles_err['heavy']
    
In [46]:
    
# pipeline.fit(sim_train.X, sim_train.y)
test_predictions = pipeline.predict(df_sim_test[feature_list])
true_comp = df_sim_test['target'].apply(comp.dataframe_functions.label_to_comp)
pred_comp = pd.Series([comp.dataframe_functions.label_to_comp(i) for i in test_predictions])
    
In [47]:
    
# response_list = []
# response_err_list = []
# sim_bin_idxs = np.digitize(log_energy_sim_test, energybins.log_energy_bins) - 1
# energy_bin_idx = np.unique(sim_bin_idxs)
# # energy_bin_idx = energy_bin_idx[1:]
# print(energy_bin_idx)
# # print(energybins.energy_midpoints.shape)
# for bin_idx in energy_bin_idx:
#     if (bin_idx == -1) or (bin_idx == energybins.energy_midpoints.shape[0]):
#         continue
#     sim_bin_mask = sim_bin_idxs == bin_idx
#     response_mat = confusion_matrix(true_comp[sim_bin_mask], pred_comp[sim_bin_mask],
#                                     labels=comp_list)
#     # Transpose response matrix to get MC comp on x-axis and reco comp on y-axis
#     response_mat = response_mat.T
#     # Get response matrix statistical error
#     response_mat_err = np.sqrt(response_mat)
#     response_err_list.append(response_mat_err)
#     response_list.append(response_mat)
# block_response = block_diag(response_list).toarray()
# # block_response = np.flipud(block_response)
# # Normalize along MC comp axis to go from counts to probabilities
# block_response = block_response / block_response.sum(axis=0)
# print('block_response = \n{}'.format(block_response))
# block_response_err = block_diag(response_err_list).toarray()
# # block_response_err = np.flipud(block_response_err)
# block_response_err = block_response_err / block_response_err.sum(axis=0)
# print('block_response_err = \n{}'.format(block_response_err))
    
In [48]:
    
# res_mat_outfile = os.path.join(comp.paths.comp_data_dir, 'unfolding', 'block_response.txt')
# res_mat_err_outfile = os.path.join(comp.paths.comp_data_dir, 'unfolding', 'block_response_err.txt')
# comp.check_output_dir(res_mat_outfile)
# comp.check_output_dir(res_mat_err_outfile)
# np.savetxt(res_mat_outfile, block_response)
# np.savetxt(res_mat_err_outfile, block_response_err)
    
In [49]:
    
true_ebin_idxs = np.digitize(log_true_energy_sim_test, energybins.log_energy_bins) - 1
reco_ebin_idxs = np.digitize(log_reco_energy_sim_test, energybins.log_energy_bins) - 1
energy_bin_idx = np.unique(true_ebin_idxs)
print(energy_bin_idx)
hstack_list = []
for true_ebin_idx in energy_bin_idx:
    if (true_ebin_idx == -1) or (true_ebin_idx == energybins.energy_midpoints.shape[0]):
        continue
    true_ebin_mask = true_ebin_idxs == true_ebin_idx
    
    vstack_list = []
    for reco_ebin_idx in energy_bin_idx:
        if (reco_ebin_idx == -1) or (reco_ebin_idx == energybins.energy_midpoints.shape[0]):
            continue
        reco_ebin_mask = reco_ebin_idxs == reco_ebin_idx
        
        combined_mask = true_ebin_mask & reco_ebin_mask
        if combined_mask.sum() == 0:
            response_mat = np.zeros((2, 2), dtype=int)
        else:
            response_mat = confusion_matrix(true_comp[combined_mask],
                                            pred_comp[combined_mask],
                                            labels=comp_list)
        # Transpose response matrix to get MC comp on x-axis and reco comp on y-axis
        response_mat = response_mat.T
        vstack_list.append(response_mat)
    hstack_list.append(np.vstack(vstack_list))
    
res = np.hstack(hstack_list)
res_err = np.sqrt(res)
    
    
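The result is a block matrix: each 2x2 composition block sits at (reconstructed energy bin, true energy bin), so entry [2*reco_bin + reco_comp, 2*true_bin + true_comp], with 0 = light and 1 = heavy, counts events thrown in one (energy, composition) cause bin and observed in another. For example:
In [ ]:
    
# Events thrown as heavy in true energy bin 10 but classified as light
# in reconstructed energy bin 10 (bin indices here are only illustrative)
true_bin, reco_bin = 10, 10
res[2*reco_bin + 0, 2*true_bin + 1]
    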
In [51]:
    
res_col_sum = res.sum(axis=0)
res_col_sum_err = np.array([np.sqrt(np.sum(res_err[:, i]**2)) for i in range(res_err.shape[1])])
normalizations, normalizations_err = comp.analysis.ratio_error(res_col_sum, res_col_sum_err,
                                                               det_efficiencies, det_efficiencies_err)
res_normalized, res_normalized_err = comp.analysis.ratio_error(res, res_err,
                                                               normalizations, normalizations_err,
                                                               nan_to_num=True)
    
    
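Dividing each column by (column sum / detection efficiency) makes the columns of the normalized response matrix sum to the per-bin efficiencies rather than to 1, so events lost to the quality cuts are propagated through the unfolding (verified below).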
In [52]:
    
np.testing.assert_allclose(res_normalized.sum(axis=0), det_efficiencies)
    
In [53]:
    
res_normalized
    
    Out[53]:
In [54]:
    
res_normalized_err
    
    Out[54]:
In [55]:
    
fig, ax = plt.subplots()
sns.heatmap(res[24:26, 24:26], annot=True, fmt='d', ax=ax, square=True,
           xticklabels=['light', 'heavy'], yticklabels=['light', 'heavy'],
           cbar_kws={'label': 'Counts'}, vmin=0, cmap='viridis')
ax.invert_yaxis()
plt.xlabel('True composition')
plt.ylabel('Pred composition')
plt.title('$\mathrm{7.6 < \log_{10}(E_{true}/GeV) < 7.7}$' + '\n$\mathrm{7.6 < \log_{10}(E_{reco}/GeV) < 7.7}$')
res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-matrix-single-energy-bin.png')
comp.check_output_dir(res_mat_outfile)
plt.savefig(res_mat_outfile)
plt.show()
    
    
In [56]:
    
plt.imshow(res, origin='lower', cmap='viridis')
# Diagonal guide: perfect energy reconstruction would concentrate counts here
plt.plot([0, res.shape[0]-1], [0, res.shape[1]-1], marker='None', ls=':', color='C1')
# Gray lines delineate the 2x2 composition blocks in each energy bin
for i in np.arange(0, res.shape[0], 2):
    plt.axvline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
    plt.axhline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')
plt.colorbar(label='Counts')
res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-statistics.png')
comp.check_output_dir(res_mat_outfile)
plt.savefig(res_mat_outfile)
plt.show()
    
    
In [57]:
    
plt.imshow(np.sqrt(res), origin='lower', cmap='viridis')
plt.plot([0, res.shape[0]-1], [0, res.shape[1]-1], marker='None', ls=':', color='C1')
for i in np.arange(0, res.shape[0], 2):
    plt.axvline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
    plt.axhline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
    
plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')
plt.colorbar(label='Count errors', format='%d')
res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-statistics-err.png')
comp.check_output_dir(res_mat_outfile)
plt.savefig(res_mat_outfile)
plt.show()
    
    
In [58]:
    
plt.imshow(res_normalized, origin='lower', cmap='viridis')
plt.plot([0, res.shape[0]-1], [0, res.shape[1]-1], marker='None', ls=':', color='C1')
for i in np.arange(0, res.shape[0], 2):
    plt.axvline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
    plt.axhline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
    
plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')
plt.title('Response matrix')
plt.colorbar(label='$\mathrm{P(E_i|C_{\mu})}$')
res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-matrix.png')
comp.check_output_dir(res_mat_outfile)
plt.savefig(res_mat_outfile)
plt.show()
    
    
In [59]:
    
plt.imshow(res_normalized_err, origin='lower', cmap='viridis')
plt.plot([0, res.shape[0]-1], [0, res.shape[1]-1], marker='None', ls=':', color='C1')
for i in np.arange(0, res.shape[0], 2):
    plt.axvline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
    plt.axhline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
    
plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')
plt.title('Response matrix error')
plt.colorbar(label='$\mathrm{\delta P(E_i|C_{\mu})}$')
res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-matrix-err.png')
comp.check_output_dir(res_mat_outfile)
plt.savefig(res_mat_outfile)
plt.show()
    
    
In [61]:
    
res_mat_outfile = os.path.join(comp.paths.comp_data_dir, 'unfolding', 'response.txt')
res_mat_err_outfile = os.path.join(comp.paths.comp_data_dir, 'unfolding', 'response_err.txt')
comp.check_output_dir(res_mat_outfile)
comp.check_output_dir(res_mat_err_outfile)
np.savetxt(res_mat_outfile, res_normalized)
np.savetxt(res_mat_err_outfile, res_normalized_err)
    
In [60]:
    
from icecube.weighting.weighting import from_simprod
from icecube.weighting.fluxes import GaisserH3a, GaisserH4a, Hoerandel5
    
In [61]:
    
df_sim = comp.load_sim(config='IC86.2012', test_size=0)
df_sim.head()
    
    Out[61]:
In [62]:
    
flux = GaisserH3a()
model_flux = {}
# PDG codes: 2212 = p, 1000020040 = He, 1000070140 = N, 1000130270 = Al, 1000260560 = Fe
for ptype in [2212, 1000020040, 1000070140, 1000130270, 1000260560]:
    model_flux[ptype] = flux(energybins.energy_midpoints, ptype)
model_flux_df = pd.DataFrame(model_flux)
model_flux_df.index = energybins.energy_midpoints
model_flux_df
    
    Out[62]:
In [63]:
    
fig, ax = plt.subplots()
for key in model_flux_df.columns:
    ax.plot(np.log10(model_flux_df.index), model_flux_df.index**2.7*model_flux_df[key], label=key)
ax.plot(np.log10(model_flux_df.index), model_flux_df.index**2.7*model_flux_df.sum(axis=1), label='All particle')
ax.set_yscale("log", nonposy='clip')
ax.set_ylim(1e3, 1e5)
ax.set_xlabel('$\mathrm{\log_{10}(E/GeV)}$')
ax.set_ylabel('$\mathrm{E^{2.7} \, J(E)}$')
ax.grid()
ax.legend()
plt.show()
    
    
In [64]:
    
model_flux_df.sum(axis=1)
    
    Out[64]:
In [65]:
    
simlist = np.unique(df_sim['sim'])
for i, sim in enumerate(simlist):
    gcd_file, sim_files = comp.simfunctions.get_level3_sim_files(sim)
    num_files = len(sim_files)
    # Accumulate the generation spectrum over all datasets, weighted by the number of files in each
    if i == 0:
        generator = num_files*from_simprod(int(sim))
    else:
        generator += num_files*from_simprod(int(sim))
    
In [66]:
    
priors_list = ['h3a', 'h4a', 'antih3a', 'Hoerandel5', 'antiHoerandel5']
# priors_list = ['h3a', 'h4a', 'antih3a', 'Hoerandel5', 'antiHoerandel5', 'uniform', 'alllight', 'allheavy']
model_ptypes = {}
model_ptypes['h3a'] = {'light': [2212, 1000020040], 'heavy': [1000070140, 1000130270, 1000260560]}
model_ptypes['h4a'] = {'light': [2212, 1000020040], 'heavy': [1000070140, 1000130270, 1000260560]}
model_ptypes['Hoerandel5'] = {'light': [2212, 1000020040], 'heavy': [1000070140, 1000130270, 1000260560]}
    
In [67]:
    
priors = defaultdict(list)
for flux, name in zip([GaisserH3a(), GaisserH3a(), GaisserH4a(), Hoerandel5(), Hoerandel5()],
                      ['h3a', 'antih3a', 'h4a', 'Hoerandel5', 'antiHoerandel5']):
    priors_raw = defaultdict(list)
    for energy_mid in energybins.energy_midpoints:
        energy = [energy_mid]*5
        ptype = [2212, 1000020040, 1000070140, 1000130270, 1000260560]
        weights = flux(energy, ptype)
        light_prior = weights[:2].sum()
        heavy_prior = weights[2:].sum()
        # The 'anti' models swap the light and heavy fluxes
        if 'anti' in name:
            light_prior, heavy_prior = heavy_prior, light_prior
        priors_raw['light'].append(light_prior)
        priors_raw['heavy'].append(heavy_prior)
        priors[name].extend([light_prior, heavy_prior])
    unfolding_df['{}_flux_light'.format(name)] = priors_raw['light']
    unfolding_df['{}_flux_heavy'.format(name)] = priors_raw['heavy']
    
# unfolding_df['uniform_flux_light'] = [0.5]*len(priors_raw['light'])
# unfolding_df['uniform_flux_heavy'] = [0.5]*len(priors_raw['heavy'])
# unfolding_df['alllight_flux_light'] = [0.9]*len(priors_raw['light'])
# unfolding_df['alllight_flux_heavy'] = [0.1]*len(priors_raw['heavy'])
# unfolding_df['allheavy_flux_light'] = [0.1]*len(priors_raw['light'])
# unfolding_df['allheavy_flux_heavy'] = [0.9]*len(priors_raw['heavy'])
    
In [68]:
    
unfolding_df
    
    Out[68]:
In [69]:
    
unfolding_df_outfile = os.path.join(comp.paths.comp_data_dir, 'unfolding', 'unfolding-dataframe.csv')
comp.check_output_dir(unfolding_df_outfile)
unfolding_df.to_csv(unfolding_df_outfile)
    
In [70]:
    
formatted_df = pd.DataFrame()
    
In [71]:
    
counts_formatted = []
priors_formatted = defaultdict(list)
for index, row in unfolding_df.iterrows():
    counts_formatted.extend([row['counts_light'], row['counts_heavy']])
    for priors_name in priors_list:
        priors_formatted[priors_name].extend([row[priors_name+'_flux_light'], row[priors_name+'_flux_heavy']])
        
formatted_df['counts'] = counts_formatted
formatted_df['counts_err'] = np.sqrt(counts_formatted)
for key, value in priors_formatted.items():
    formatted_df[key+'_flux'] = value
    formatted_df[key+'_priors'] = formatted_df[key+'_flux'] / formatted_df[key+'_flux'].sum()
formatted_df.index.rename('log_energy_bin_idx', inplace=True)
    
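Each model's priors column is its flux normalized over all (energy bin, composition) causes, so every priors column should sum to 1:
In [ ]:
    
formatted_df[[name + '_priors' for name in priors_list]].sum()
    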
In [72]:
    
formatted_df.head()
    
    Out[72]:
Save formatted DataFrame to disk
In [73]:
    
formatted_df_outfile = os.path.join(comp.paths.comp_data_dir, 'unfolding',
                                    'unfolding-dataframe-PyUnfold-formatted.csv')
comp.check_output_dir(formatted_df_outfile)
formatted_df.to_csv(formatted_df_outfile)
    
In [77]:
    
model_to_ls = {'h3a': '-.', 'h4a': ':', 'Hoerandel5': '-', 'antih3a': '--'}
model_to_marker = {'h3a': '.', 'h4a': '^', 'Hoerandel5': '*', 'antih3a': '.'}
fig, ax = plt.subplots()
for model in ['h3a', 'h4a', 'Hoerandel5', 'antih3a']:
    key = '{}_priors'.format(model)
    light_priors = formatted_df[key][::2]
    heavy_priors = formatted_df[key][1::2]
    ax.plot(energybins.log_energy_midpoints, light_priors,
            color=color_dict['light'], ls=model_to_ls[model], marker=model_to_marker[model],
            label='{} light'.format(model), alpha=0.75)
    ax.plot(energybins.log_energy_midpoints, heavy_priors,
            color=color_dict['heavy'], ls=model_to_ls[model], marker=model_to_marker[model],
            label='{} heavy'.format(model), alpha=0.75)
ax.set_xlabel('$\mathrm{\log_{10}(E_{reco}/GeV)}$')
ax.set_ylabel('Prior')
ax.set_xlim([energybins.log_energy_min, energybins.log_energy_max])
# ax.set_ylim([0, 1])
ax.set_yscale("log", nonposy='clip')
ax.grid()
leg = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5),
                 frameon=False, fancybox=False, numpoints=1)
priors_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'iter-bayesian', 'priors.png')
comp.check_output_dir(priors_outfile)
plt.savefig(priors_outfile)
plt.show()
    
    
In [130]:
    
# Sanity check: read the formatted DataFrame back from disk
formatted_df_check = pd.read_csv(formatted_df_outfile, index_col='log_energy_bin_idx')
    
In [131]:
    
formatted_df_check
    
    Out[131]:
In [59]:
    
# with open('pyunfold_dict.json', 'w') as outfile:
#     data = {'counts': counts_observed,
#             'block_response': block_response.tolist(),
#             'block_response_err': block_response_err.tolist()}
#     for model in ['h3a', 'h4a', 'Hoerandel5']:
#         data['priors_{}'.format(model)] = priors[model]
#     json.dump(data, outfile)
    
    