Author: James Bourbeau
In [1]:
%load_ext watermark
%watermark -u -d -v -p numpy,matplotlib,scipy,pandas,sklearn,mlxtend
In [2]:
from __future__ import division, print_function
import os
from collections import defaultdict
import numpy as np
from scipy.sparse import block_diag
import pandas as pd
import matplotlib.pyplot as plt
import seaborn.apionly as sns
import json
from scipy.interpolate import UnivariateSpline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, KFold, StratifiedKFold
import comptools as comp
import comptools.analysis.plotting as plotting
color_dict = comp.analysis.get_color_dict()
%matplotlib inline
Define whether to train on the 'light' and 'heavy' composition classes or on the individual compositions
In [14]:
# config = 'IC79.2010'
config = 'IC86.2012'
num_groups = 4
comp_list = comp.get_comp_list(num_groups=num_groups)
In [15]:
comp_list
Out[15]:
Get composition classifier pipeline
Define energy binning for this analysis
In [16]:
energybins = comp.analysis.get_energybins(config=config)
In [17]:
log_energy_min = energybins.log_energy_min
log_energy_max = energybins.log_energy_max
In [18]:
df_sim_train, df_sim_test = comp.load_sim(config=config, log_energy_min=log_energy_min, log_energy_max=log_energy_max)
In [19]:
df_sim_train.reco_log_energy.min(), df_sim_train.reco_log_energy.max()
Out[19]:
In [20]:
log_reco_energy_sim_test = df_sim_test['reco_log_energy']
log_true_energy_sim_test = df_sim_test['MC_log_energy']
In [21]:
feature_list, feature_labels = comp.analysis.get_training_features()
In [22]:
pipeline_str = 'BDT_comp_{}_{}-groups'.format(config, num_groups)
pipeline = comp.get_pipeline(pipeline_str)
In [23]:
pipeline = pipeline.fit(df_sim_train[feature_list], df_sim_train['comp_target_{}'.format(num_groups)])
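For readers without comptools installed: a rough, hypothetical sketch of the kind of object `comp.get_pipeline` presumably returns for a 'BDT' string (the actual estimator and hyperparameters are defined inside comptools; everything below is an illustration, not the project's code).

# Hypothetical stand-in for comp.get_pipeline(pipeline_str) -- illustration only
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline

def get_bdt_pipeline():
    # 'BDT' = boosted decision tree; presumably a gradient-boosted
    # classifier wrapped in a sklearn Pipeline, roughly like this
    return Pipeline([
        ('classifier', GradientBoostingClassifier(n_estimators=100, max_depth=3)),
    ])

# Used the same way as the comptools pipeline:
# pipeline = get_bdt_pipeline()
# pipeline.fit(X_train, y_train); pipeline.predict(X_test)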
In [24]:
eff_path = os.path.join(comp.paths.comp_data_dir, config, 'efficiencies',
                        'efficiency_fit_num_groups_{}.hdf'.format(num_groups))
df_eff = pd.read_hdf(eff_path)
In [25]:
df_eff.head()
Out[25]:
In [26]:
fig, ax = plt.subplots()
for composition in comp_list:
    ax.errorbar(energybins.log_energy_midpoints, df_eff['eff_median_{}'.format(composition)],
                yerr=[df_eff['eff_err_low_{}'.format(composition)],
                      df_eff['eff_err_high_{}'.format(composition)]],
                color=color_dict[composition], label=composition, marker='.')
ax.axvline(6.4, marker='None', ls='-.', color='k')
ax.axvline(7.8, marker='None', ls='-.', color='k')
ax.set_xlabel(r'$\mathrm{\log_{10}(E_{true}/GeV)}$')
ax.set_ylabel('Detection efficiencies')
ax.grid()
ax.legend()
ax.ticklabel_format(style='sci', axis='y')
ax.yaxis.major.formatter.set_powerlimits((0, 0))
plt.show()
Format efficiencies for use in the PyUnfold response matrix
In [27]:
# Flatten efficiencies: compositions are interleaved within each energy bin
efficiencies, efficiencies_err = [], []
for idx, row in df_eff.iterrows():
    for composition in comp_list:
        efficiencies.append(row['eff_median_{}'.format(composition)])
        efficiencies_err.append(row['eff_err_low_{}'.format(composition)])
efficiencies = np.asarray(efficiencies)
efficiencies_err = np.asarray(efficiencies_err)
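The efficiencies (and, later, the counts and priors) are flattened into a single 1-D array in which the composition index varies fastest within each energy bin. A minimal sketch of the resulting ordering (the labels here are illustrative, not the actual comp_list values):

# Sketch: flattened (energy bin, composition) ordering used throughout.
# With num_groups compositions, flat entry k corresponds to
#   energy bin  k // num_groups
#   composition k %  num_groups
# e.g. for two energy bins and four illustrative compositions:
flat_labels = ['{}_bin{}'.format(c, b)
               for b in range(2)
               for c in ['p', 'He', 'O', 'Fe']]
print(flat_labels)
# ['p_bin0', 'He_bin0', 'O_bin0', 'Fe_bin0', 'p_bin1', 'He_bin1', 'O_bin1', 'Fe_bin1']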
In [28]:
efficiencies
Out[28]:
In [29]:
df_data = comp.load_data(config=config, columns=feature_list,
                         log_energy_min=log_energy_min, log_energy_max=log_energy_max,
                         n_jobs=20, verbose=True)
In [30]:
df_data.shape
Out[30]:
In [31]:
X_data = comp.dataframe_functions.dataframe_to_array(df_data, feature_list + ['reco_log_energy'])
log_energy_data = X_data[:, -1]
X_data = X_data[:, :-1]
In [32]:
log_energy_data.min(), log_energy_data.max()
Out[32]:
In [33]:
data_predictions = pipeline.predict(X_data)
In [34]:
# Decode predicted targets into composition group labels
data_labels = np.array(comp.composition_encoding.decode_composition_groups(data_predictions, num_groups=num_groups))
In [35]:
# Get number of identified comp in each energy bin
unfolding_df = pd.DataFrame()
for composition in comp_list:
    comp_mask = data_labels == composition
    unfolding_df['counts_' + composition] = np.histogram(log_energy_data[comp_mask],
                                                         bins=energybins.log_energy_bins)[0]
    unfolding_df['counts_' + composition + '_err'] = np.sqrt(unfolding_df['counts_' + composition])
unfolding_df['counts_total'] = np.histogram(log_energy_data, bins=energybins.log_energy_bins)[0]
unfolding_df['counts_total_err'] = np.sqrt(unfolding_df['counts_total'])
In [36]:
unfolding_df.index.rename('log_energy_bin_idx', inplace=True)
In [37]:
unfolding_df.head()
Out[37]:
In [38]:
fig, ax = plt.subplots()
for composition in comp_list:
    ax.plot(unfolding_df['counts_{}'.format(composition)], color=color_dict[composition])
ax.set_yscale("log", nonposy='clip')
ax.grid()
plt.show()
In [39]:
test_predictions = pipeline.predict(df_sim_test[feature_list])
true_comp = df_sim_test['comp_group_{}'.format(num_groups)].values
pred_comp = np.array(comp.composition_encoding.decode_composition_groups(test_predictions,
                                                                         num_groups=num_groups))
In [40]:
true_comp
Out[40]:
In [52]:
true_ebin_idxs = np.digitize(log_true_energy_sim_test, energybins.log_energy_bins) - 1
reco_ebin_idxs = np.digitize(log_reco_energy_sim_test, energybins.log_energy_bins) - 1

hstack_list = []
for true_ebin_idx in range(-1, len(energybins.log_energy_midpoints) + 1):
    # After the -1 shift, np.digitize maps underflow to -1 and overflow to
    # the number of bins; skip both
    if (true_ebin_idx == -1) or (true_ebin_idx == energybins.energy_midpoints.shape[0]):
        continue
    true_ebin_mask = true_ebin_idxs == true_ebin_idx
    vstack_list = []
    for reco_ebin_idx in range(-1, len(energybins.log_energy_midpoints) + 1):
        if (reco_ebin_idx == -1) or (reco_ebin_idx == energybins.energy_midpoints.shape[0]):
            continue
        reco_ebin_mask = reco_ebin_idxs == reco_ebin_idx
        combined_mask = true_ebin_mask & reco_ebin_mask
        if combined_mask.sum() == 0:
            response_mat = np.zeros((num_groups, num_groups), dtype=int)
        else:
            response_mat = confusion_matrix(true_comp[combined_mask],
                                            pred_comp[combined_mask],
                                            labels=comp_list)
        # Transpose response matrix to get MC comp on x-axis and reco comp on y-axis
        response_mat = response_mat.T
        vstack_list.append(response_mat)
    hstack_list.append(np.vstack(vstack_list))
res = np.hstack(hstack_list)
res_err = np.sqrt(res)
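Recall sklearn's convention: confusion_matrix(y_true, y_pred) places true labels on rows and predicted labels on columns, which is why each block above is transposed before stacking. A quick self-contained check:

from sklearn.metrics import confusion_matrix

y_true = ['p', 'p', 'Fe']
y_pred = ['p', 'Fe', 'Fe']
cm = confusion_matrix(y_true, y_pred, labels=['p', 'Fe'])
print(cm)
# [[1 1]    row 0: true 'p'  -> predicted 'p' once, 'Fe' once
#  [0 1]]   row 1: true 'Fe' -> predicted 'Fe' once
# cm.T puts true labels on columns (x-axis) and predictions on rows (y-axis)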
In [53]:
res.shape
Out[53]:
In [54]:
plt.imshow(res, origin='lower')
Out[54]:
In [60]:
from itertools import product

num_groups = len(comp_list)
num_ebins = len(energybins.log_energy_midpoints)

e_bin_iter = product(range(num_ebins), range(num_ebins))
res2 = np.zeros((num_ebins * num_groups, num_ebins * num_groups), dtype=int)
for true_ebin_idx, reco_ebin_idx in e_bin_iter:
    true_ebin_mask = true_ebin_idxs == true_ebin_idx
    reco_ebin_mask = reco_ebin_idxs == reco_ebin_idx
    ebin_mask = true_ebin_mask & reco_ebin_mask
    if ebin_mask.sum() == 0:
        continue
    response_mat = confusion_matrix(true_comp[ebin_mask],
                                    pred_comp[ebin_mask],
                                    labels=comp_list)
    # Transpose response matrix to get MC comp on x-axis
    # and reco comp on y-axis
    response_mat = response_mat.T
    res2[num_groups * reco_ebin_idx: num_groups * (reco_ebin_idx + 1),
         num_groups * true_ebin_idx: num_groups * (true_ebin_idx + 1)] = response_mat
In [61]:
plt.imshow(res2, origin='lower')
Out[61]:
In [62]:
np.testing.assert_array_equal(res2, res)
In [68]:
reco_ebin_idx = 4
true_ebin_idx = 4
plt.imshow(res2[num_groups * reco_ebin_idx: num_groups * (reco_ebin_idx + 1),
                num_groups * true_ebin_idx: num_groups * (true_ebin_idx + 1)],
           origin='lower')
Out[68]:
In [131]:
res_col_sum = res.sum(axis=0)
res_col_sum_err = np.array([np.sqrt(np.nansum(res_err[:, i]**2)) for i in range(res_err.shape[1])])
normalizations, normalizations_err = comp.analysis.ratio_error(res_col_sum, res_col_sum_err,
                                                               efficiencies, efficiencies_err,
                                                               nan_to_num=True)
res_normalized, res_normalized_err = comp.analysis.ratio_error(res, res_err,
                                                               normalizations, normalizations_err,
                                                               nan_to_num=True)
In [132]:
res_normalized = np.nan_to_num(res_normalized)
res_normalized_err = np.nan_to_num(res_normalized_err)
In [133]:
np.testing.assert_allclose(res_normalized.sum(axis=0), efficiencies)
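The assert above encodes the normalization convention for the response matrix: each column (a true composition and energy bin, the cause $C_j$) is scaled so that the probabilities of being reconstructed anywhere sum to the detection efficiency of that true bin,

$$\sum_i P(E_i \mid C_j) = \epsilon_j,$$

where $i$ runs over the reconstructed (energy, composition) bins and $\epsilon_j$ are the efficiencies loaded earlier.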
In [134]:
res
Out[134]:
In [135]:
fig, ax = plt.subplots()
idx = 4 * num_groups
sns.heatmap(res[idx:idx + num_groups, idx:idx + num_groups], annot=True, fmt='d', ax=ax, square=True,
            xticklabels=comp_list, yticklabels=comp_list,
            cbar_kws={'label': 'Counts'}, vmin=0, cmap='viridis')
ax.invert_yaxis()
plt.xlabel('True composition')
plt.ylabel('Pred composition')
plt.title(r'$\mathrm{7.6 < \log_{10}(E_{true}/GeV) < 7.7}$' + '\n' + r'$\mathrm{7.6 < \log_{10}(E_{reco}/GeV) < 7.7}$')
plt.show()
In [136]:
plt.imshow(res, origin='lower', cmap='viridis')
plt.plot([0, res.shape[0] - 1], [0, res.shape[1] - 1], marker='None', ls=':', color='C1')
# Gray lines mark the energy-bin block boundaries
for i in np.arange(0, res.shape[0], 2):
    plt.axvline(i - 0.5, marker='None', ls='-', lw=0.5, color='gray')
    plt.axhline(i - 0.5, marker='None', ls='-', lw=0.5, color='gray')
plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')
plt.colorbar(label='Counts')
res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-statistics.png')
comp.check_output_dir(res_mat_outfile)
# plt.savefig(res_mat_outfile)
plt.show()
In [137]:
plt.imshow(np.sqrt(res), origin='lower', cmap='viridis')
plt.plot([0, res.shape[0] - 1], [0, res.shape[1] - 1], marker='None', ls=':', color='C1')
for i in np.arange(0, res.shape[0], 2):
    plt.axvline(i - 0.5, marker='None', ls='-', lw=0.5, color='gray')
    plt.axhline(i - 0.5, marker='None', ls='-', lw=0.5, color='gray')
plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')
plt.colorbar(label='Count errors', format='%d')
res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-statistics-err.png')
comp.check_output_dir(res_mat_outfile)
# plt.savefig(res_mat_outfile)
plt.show()
In [138]:
plt.imshow(res_normalized, origin='lower', cmap='viridis')
plt.plot([0, res.shape[0] - 1], [0, res.shape[1] - 1], marker='None', ls=':', color='C1')
plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')
plt.title('Response matrix')
plt.colorbar(label=r'$\mathrm{P(E_i|C_{\mu})}$')
res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', config, 'response_matrix',
                               'response-matrix_{}-groups.png'.format(num_groups))
comp.check_output_dir(res_mat_outfile)
plt.savefig(res_mat_outfile)
plt.show()
In [139]:
plt.imshow(res_normalized_err, origin='lower', cmap='viridis')
plt.plot([0, res.shape[0] - 1], [0, res.shape[1] - 1], marker='None', ls=':', color='C1')
plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')
plt.title('Response matrix error')
plt.colorbar(label=r'$\mathrm{\delta P(E_i|C_{\mu})}$')
res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-matrix-err.png')
comp.check_output_dir(res_mat_outfile)
# plt.savefig(res_mat_outfile)
plt.show()
In [140]:
res_mat_outfile = os.path.join(comp.paths.comp_data_dir, config, 'unfolding',
                               'response_{}-groups.txt'.format(num_groups))
res_mat_err_outfile = os.path.join(comp.paths.comp_data_dir, config, 'unfolding',
                                   'response_err_{}-groups.txt'.format(num_groups))
comp.check_output_dir(res_mat_outfile)
comp.check_output_dir(res_mat_err_outfile)
np.savetxt(res_mat_outfile, res_normalized)
np.savetxt(res_mat_err_outfile, res_normalized_err)
In [141]:
from icecube.weighting.weighting import from_simprod, PDGCode, ParticleType
from icecube.weighting.fluxes import GaisserH3a, GaisserH4a, Hoerandel5, Hoerandel_IT, CompiledFlux
In [142]:
df_sim = comp.load_sim(config=config, test_size=0, log_energy_min=6.0, log_energy_max=8.3)
df_sim.head()
Out[142]:
In [143]:
p = PDGCode().values
pdg_codes = np.array([2212, 1000020040, 1000080160, 1000260560])
particle_names = [p[pdg_code].name for pdg_code in pdg_codes]
In [144]:
particle_names
Out[144]:
In [145]:
group_names = np.array(comp.composition_encoding.composition_group_labels(particle_names, num_groups=num_groups))
group_names
Out[145]:
In [146]:
comp_to_pdg_list = {composition: pdg_codes[group_names == composition] for composition in comp_list}
In [147]:
comp_to_pdg_list
Out[147]:
In [148]:
# Replace O16Nucleus with N14Nucleus + Al27Nucleus, since the flux
# parametrizations used below are defined for the p, He, N, Al, Fe mass groups
for composition, pdg_list in comp_to_pdg_list.items():
    if 1000080160 in pdg_list:
        pdg_list = pdg_list[pdg_list != 1000080160]
        comp_to_pdg_list[composition] = np.append(pdg_list, [1000070140, 1000130270])
In [149]:
comp_to_pdg_list
Out[149]:
In [150]:
priors_list = ['H3a', 'H4a', 'Polygonato']
In [152]:
fig, ax = plt.subplots()
for flux, name, marker in zip([GaisserH3a(), GaisserH4a(), Hoerandel5()],
                              priors_list,
                              '.^*o'):
    for composition in comp_list:
        comp_flux = []
        for energy_mid in energybins.energy_midpoints:
            flux_energy_mid = flux(energy_mid, comp_to_pdg_list[composition]).sum()
            comp_flux.append(flux_energy_mid)
        comp_flux = np.asarray(comp_flux)
        # Store model flux; it is normalized to a prior probability later on
        prior_key = '{}_flux_{}'.format(name, composition)
        unfolding_df[prior_key] = comp_flux
        # Plot result
        ax.plot(energybins.log_energy_midpoints, energybins.energy_midpoints**2.7 * comp_flux,
                color=color_dict[composition], alpha=0.75, marker=marker, ls=':',
                label='{} ({})'.format(name, composition))
ax.set_yscale("log", nonposy='clip')
ax.set_xlabel(r'$\mathrm{\log_{10}(E/GeV)}$')
ax.set_ylabel(r'$\mathrm{E^{2.7} \ J(E) \ [GeV^{1.7} m^{-2} sr^{-1} s^{-1}]}$')
ax.grid()
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)
priors_outfile = os.path.join(comp.paths.figures_dir, 'unfolding',
                              'priors_flux_{}-groups.png'.format(num_groups))
comp.check_output_dir(priors_outfile)
plt.savefig(priors_outfile)
plt.show()

unfolding_df.head()
Out[152]:
In [154]:
formatted_df = pd.DataFrame()
In [155]:
counts_formatted = []
priors_formatted = defaultdict(list)
for index, row in unfolding_df.iterrows():
    for composition in comp_list:
        counts_formatted.append(row['counts_{}'.format(composition)])
        for priors_name in priors_list:
            priors_formatted[priors_name].append(row['{}_flux_{}'.format(priors_name, composition)])

formatted_df['counts'] = counts_formatted
formatted_df['counts_err'] = np.sqrt(counts_formatted)

formatted_df['efficiencies'] = efficiencies
formatted_df['efficiencies_err'] = efficiencies_err

for key, value in priors_formatted.items():
    formatted_df[key + '_flux'] = value
    # Normalize each model flux to a probability so it can serve as a prior
    formatted_df[key + '_prior'] = formatted_df[key + '_flux'] / formatted_df[key + '_flux'].sum()

formatted_df.index.rename('log_energy_bin_idx', inplace=True)
In [156]:
formatted_df.head()
Out[156]:
In [157]:
prior_sums = formatted_df[[col for col in formatted_df.columns if 'prior' in col]].sum()
np.testing.assert_allclose(prior_sums, np.ones_like(prior_sums))
Save formatted DataFrame to disk
In [158]:
formatted_df_outfile = os.path.join(comp.paths.comp_data_dir, config, 'unfolding',
                                    'unfolding-df_{}-groups.hdf'.format(num_groups))
comp.check_output_dir(formatted_df_outfile)
formatted_df.to_hdf(formatted_df_outfile, 'dataframe', format='table')
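For reference, a minimal sketch of how these saved inputs could be handed to PyUnfold, assuming the pyunfold package's iterative_unfold interface (this is not part of the notebook's workflow; the file paths reuse the variables defined in the cells above):

import numpy as np
import pandas as pd
from pyunfold import iterative_unfold

# Load the formatted inputs saved above
df = pd.read_hdf(formatted_df_outfile)
response = np.loadtxt(res_mat_outfile)
response_err = np.loadtxt(res_mat_err_outfile)

# Iterative Bayesian unfolding with the H3a flux as the initial prior
unfolded = iterative_unfold(data=df['counts'].values,
                            data_err=df['counts_err'].values,
                            response=response,
                            response_err=response_err,
                            efficiencies=df['efficiencies'].values,
                            efficiencies_err=df['efficiencies_err'].values,
                            prior=df['H3a_prior'].values,
                            ts='ks', ts_stopping=0.01)
print(unfolded['unfolded'])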