Author: James Bourbeau
In [1]:
%load_ext watermark
%watermark -u -d -v -p numpy,matplotlib,scipy,pandas,sklearn,mlxtend
In [2]:
%matplotlib inline
from __future__ import division, print_function
import os
from collections import defaultdict
import numpy as np
from scipy.sparse import block_diag
import pandas as pd
import matplotlib.pyplot as plt
import seaborn.apionly as sns
import json
from scipy.interpolate import UnivariateSpline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, KFold, StratifiedKFold
import comptools as comp
import comptools.analysis.plotting as plotting
color_dict = comp.analysis.get_color_dict()
Whether to train on the 'light' and 'heavy' composition classes or on the individual primary compositions ('P', 'He', 'O', 'Fe')
In [3]:
comp_class = True
comp_list = ['light', 'heavy'] if comp_class else ['P', 'He', 'O', 'Fe']
Define energy binning for this analysis
In [4]:
energybins = comp.analysis.get_energybins()
In [5]:
df_sim_train, df_sim_test = comp.load_sim(config='IC86.2012', log_energy_min=6.0, log_energy_max=8.3)
In [6]:
# The Laputop-reconstructed energy serves as the reco energy
log_energy_sim_test = df_sim_test['lap_log_energy']
log_reco_energy_sim_test = df_sim_test['lap_log_energy']
log_true_energy_sim_test = df_sim_test['MC_log_energy']
In [7]:
feature_list, feature_labels = comp.analysis.get_training_features()
Get composition classifier pipeline
In [8]:
pipeline_str = 'BDT'
pipeline = comp.get_pipeline(pipeline_str)
In [9]:
pipeline
Out[9]:
In [10]:
pipeline.fit(df_sim_train[feature_list], df_sim_train['target'])
Out[10]:
In [13]:
efficiencies = {}
efficiencies_err = {}
for composition in comp_list:
    eff_path = os.path.join(comp.paths.comp_data_dir, 'unfolding',
                            'efficiency-{}.npy'.format(composition))
    efficiencies[composition] = np.load(eff_path)
    eff_err_path = os.path.join(comp.paths.comp_data_dir, 'unfolding',
                                'efficiency-err-{}.npy'.format(composition))
    efficiencies_err[composition] = np.load(eff_err_path)
Plot efficiencies
In [16]:
fig, ax = plt.subplots()
for composition in comp_list:
    ax.errorbar(energybins.log_energy_midpoints, efficiencies[composition],
                yerr=efficiencies_err[composition], color=color_dict[composition],
                label=composition, marker='.')
ax.set_xlabel('$\mathrm{\log_{10}(E_{true}/GeV)}$')
ax.set_ylabel('Detection efficiency \n($\mathrm{N_{passed}/N_{thrown}}$)')
ax.grid()
ax.ticklabel_format(style='sci',axis='y')
ax.yaxis.major.formatter.set_powerlimits((0,0))
plt.show()
In [43]:
det_efficiencies = np.empty(len(energybins.log_energy_midpoints)*2)
det_efficiencies[::2] = efficiencies['light']
det_efficiencies[1::2] = efficiencies['heavy']
In [44]:
det_efficiencies_err = np.empty_like(det_efficiencies)
det_efficiencies_err[::2] = efficiencies_err['light']
det_efficiencies_err[1::2] = efficiencies_err['heavy']
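The per-composition efficiencies are interleaved into a single array whose layout, one (light, heavy) pair per energy bin, matches the block structure of the response matrix constructed below. A minimal sketch of the interleaving (illustrative values, not real efficiencies):
# Illustrative interleaving: bin i's light efficiency sits at index 2*i,
# its heavy efficiency at index 2*i + 1
light = np.array([0.1, 0.2])
heavy = np.array([0.3, 0.4])
interleaved = np.empty(2*len(light))
interleaved[::2] = light
interleaved[1::2] = heavy
print(interleaved)  # [ 0.1  0.3  0.2  0.4]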
In [11]:
df_sim = comp.load_sim(config='IC86.2012', test_size=0)
In [12]:
df_sim.lap_cos_zenith.min(), df_sim.lap_cos_zenith.max()
Out[12]:
In [13]:
(df_sim.lap_cos_zenith.max() + df_sim.lap_cos_zenith.min())/2
Out[13]:
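For showers generated uniformly in $\cos\theta$ (assumed here), the mean projection factor is the midpoint of the $\cos\theta$ range,
$\langle\cos\theta\rangle = \frac{1}{\cos\theta_{max}-\cos\theta_{min}}\int_{\cos\theta_{min}}^{\cos\theta_{max}}\cos\theta\,d(\cos\theta) = \frac{\cos\theta_{min}+\cos\theta_{max}}{2}$,
which is the quantity computed above. It is used later as the geometry factor that converts the thrown area $\pi R^2$ into a projected (effective) area.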
In [14]:
counts_MC_energy_bins = np.histogram(df_sim.MC_log_energy, bins=energybins.log_energy_bins)[0]
counts_MC_energy_bins
Out[14]:
In [15]:
counts_reco_energy_bins = np.histogram(df_sim.lap_log_energy, bins=energybins.log_energy_bins)[0]
counts_reco_energy_bins
Out[15]:
In [16]:
counts_reco_energy_bins / counts_MC_energy_bins
Out[16]:
In [17]:
fig, ax = plt.subplots()
plotting.plot_steps(energybins.log_energy_bins, counts_MC_energy_bins, ax=ax)
ax.set_xlabel('$\mathrm{\log_{10}(E_{MC}/GeV)}$')
ax.set_ylabel('\# true events')
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.grid()
plt.show()
In [18]:
fig, ax = plt.subplots()
plotting.plot_steps(energybins.log_energy_bins, counts_reco_energy_bins, ax=ax)
ax.set_xlabel('$\mathrm{\log_{10}(E_{reco}/GeV)}$')
ax.set_ylabel('\# reco events')
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.grid()
plt.show()
In [19]:
fig, ax = plt.subplots()
plotting.plot_steps(energybins.log_energy_bins, counts_reco_energy_bins / counts_MC_energy_bins, ax=ax)
ax.axhline(1.0, marker='None', ls='-.', color='k')
ax.set_xlabel('$\mathrm{\log_{10}(E/GeV)}$')
ax.set_ylabel('\# reco events / \# true events')
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.grid()
plt.show()
In [20]:
def run_to_energy_bin(run):
    """Map a simulation run number to its thrown log10(E/GeV) bin.

    Mapping taken from http://simprod.icecube.wisc.edu/cgi-bin/simulation/cgi/cfg?dataset=12360
    """
    ebin_first = 5.0
    ebin_last = 7.9
    return (ebin_first*10 + (run - 1) % (ebin_last*10 - ebin_first*10 + 1)) / 10
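A quick sanity check of the mapping (assuming the 30-bin cycle, 5.0 through 7.9 in steps of 0.1, from the simprod page referenced above): run numbers cycle through the bins and wrap around.
# Sanity check of run_to_energy_bin (bins cycle 5.0, 5.1, ..., 7.9)
for run in (1, 2, 30, 31):
    print(run, run_to_energy_bin(run))
# Expected: 1 -> 5.0, 2 -> 5.1, 30 -> 7.9, 31 -> 5.0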
In [21]:
def thrown_showers_per_ebin(sim_list, log_energy_bins=None):
    """Count the number of thrown showers per log-energy bin for a list of simulation sets."""
    e_bins = []
    for sim in sim_list:
        print(sim)
        for f in comp.simfunctions.get_level3_sim_files_iterator(sim):
            # Parse the run number out of the file name
            start_idx = f.find('Run')
            run = int(f[start_idx+3: start_idx+9])
            e_bins.append(run_to_energy_bin(run))
    if log_energy_bins is None:
        log_energy_bins = np.arange(5, 8.1, 0.1)
    vals = np.histogram(e_bins, bins=log_energy_bins)[0]
    # Each simulation file contains a single shower resampled 100 times
    n_resamples = 100
    n_showers_per_file = n_resamples
    thrown_showers = vals * n_showers_per_file
    return thrown_showers
In [22]:
sim_list = [12360, 12362, 12630, 12631]
thrown_showers = thrown_showers_per_ebin(sim_list, log_energy_bins=energybins.log_energy_bins)
In [23]:
fig, ax = plt.subplots()
plotting.plot_steps(energybins.log_energy_bins, counts_reco_energy_bins / thrown_showers, ax=ax,
                    label='Reco energy bins', color='C0', alpha=0.75)
plotting.plot_steps(energybins.log_energy_bins, counts_MC_energy_bins / thrown_showers, ax=ax,
                    label='MC energy bins', color='C1', alpha=0.75)
# ax.axhline(1.0, marker='None', ls='-.', color='k')
ax.set_xlabel('$\mathrm{\log_{10}(E/GeV)}$')
ax.set_ylabel('\# passed events / \# thrown events')
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.set_ylim(0)
ax.grid()
ax.legend()
plt.show()
In [47]:
sim_dict = {'light': [12360, 12630],
            'heavy': [12362, 12631]}
efficiencies = np.empty(len(energybins.log_energy_midpoints)*2)
efficiencies_err = np.empty_like(efficiencies)
fig, ax = plt.subplots()
for composition, sim_list in sim_dict.items():
    comp_mask = df_sim.MC_comp_class == composition
    # Get number of thrown showers in each energy bin
    thrown_showers = thrown_showers_per_ebin(sim_list, log_energy_bins=energybins.log_energy_bins)
    # Get number of showers in each energy bin that pass cuts
    counts_MC_energy_bins = np.histogram(df_sim.MC_log_energy[comp_mask],
                                         bins=energybins.log_energy_bins)[0].astype(float)
    # (Maybe??) Scale larger thrown radius up by area ratio
    large_thrown_radius = energybins.log_energy_midpoints > 7
    # counts_MC_energy_bins[large_thrown_radius] *= (1700/1100)**2
    # Calculate detection efficiencies and add them to the normalizations array
    eff, eff_err = comp.ratio_error(counts_MC_energy_bins, np.sqrt(counts_MC_energy_bins),
                                    thrown_showers, np.sqrt(thrown_showers))
    start_idx = 0 if composition == 'light' else 1
    efficiencies[start_idx::2] = eff
    efficiencies_err[start_idx::2] = eff_err
    # Plot detection efficiencies, with separate spline fits below and above
    # the thrown-radius transition at log10(E/GeV) = 7
    plotting.plot_steps(energybins.log_energy_bins, eff, yerr=eff_err, ax=ax,
                        label=composition, color=color_dict[composition], alpha=0.75)
    spl = UnivariateSpline(energybins.log_energy_midpoints[~large_thrown_radius],
                           eff[~large_thrown_radius], s=5)
    ax.plot(energybins.log_energy_midpoints[~large_thrown_radius],
            spl(energybins.log_energy_midpoints[~large_thrown_radius]), color=color_dict[composition])
    spl = UnivariateSpline(energybins.log_energy_midpoints[large_thrown_radius],
                           eff[large_thrown_radius], s=5)
    ax.plot(energybins.log_energy_midpoints[large_thrown_radius],
            spl(energybins.log_energy_midpoints[large_thrown_radius]), color=color_dict[composition])
# ax.axhline(1.0, marker='None', ls='-.', color='k')
ax.set_xlabel('$\mathrm{\log_{10}(E_{MC}/GeV)}$')
ax.set_ylabel('Detection efficiency \n($\mathrm{N_{passed}/N_{thrown}}$)')
# ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.set_ylim(0)
plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
ax.grid()
ax.legend()
efficiency_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'iter-bayesian',
                                  'detection-efficiency.png')
comp.check_output_dir(efficiency_outfile)
plt.savefig(efficiency_outfile)
plt.show()
In [44]:
# sim_dict = {'light': [12360, 12630],
# 'heavy': [12362, 12631]}
# ebins = np.arange(5.0, 8.1, 0.1)
# emidpoints = (ebins[1:] + ebins[:-1]) / 2
# efficiencies = np.empty((len(ebins)-1)*2)
# efficiencies_err = np.empty((len(ebins)-1)*2)
# # efficiencies_err = np.empty(len(energybins.log_energy_midpoints)*2)
# fig, ax = plt.subplots()
# for composition, sim_list in sim_dict.items():
# comp_mask = df_sim.MC_comp_class == composition
# # Get number of thrown showers in each energy bin
# thrown_showers = thrown_showers_per_ebin(sim_list, log_energy_bins=ebins)
# # thrown_showers = thrown_showers_per_ebin(sim_list, log_energy_bins=energybins.log_energy_bins)
# # Get number of showers in each energy bin that pass cuts
# counts_MC_energy_bins = np.histogram(df_sim.MC_log_energy[comp_mask],
# bins=ebins)[0].astype(float)
# # # (Maybe??) Scale larger thrown radius up by area ratio
# large_thrown_radius = emidpoints > 7
# # counts_MC_energy_bins[large_thrown_radius] *= (1700/1100)**2
# # Calculate detection efficiency and add them to normalizations array
# eff, eff_err = comp.ratio_error(counts_MC_energy_bins, np.sqrt(counts_MC_energy_bins),
# thrown_showers, np.sqrt(thrown_showers))
# start_idx = 0 if composition == 'light' else 1
# efficiencies[start_idx::2] = eff
# efficiencies_err[start_idx::2] = eff_err
# # Plot detection efficiencies
# plotting.plot_steps(ebins, eff, yerr=eff_err, ax=ax,
# label=composition, color=color_dict[composition], alpha=0.75)
# spl = UnivariateSpline(emidpoints[~large_thrown_radius], eff[~large_thrown_radius], s=5)
# ax.plot(emidpoints[~large_thrown_radius], spl(emidpoints[[~large_thrown_radius]]), color=color_dict[composition])
# # ax.axhline(1.0, marker='None', ls='-.', color='k')
# ax.set_xlabel('$\mathrm{\log_{10}(E_{MC}/GeV)}$')
# ax.set_ylabel('Detection efficiency \n($\mathrm{N_{passed}/N_{thrown}}$)')
# # ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
# ax.set_ylim(0)
# plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
# ax.grid()
# ax.legend()
# efficiency_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'iter-bayesian',
# 'detection-efficiency.png')
# comp.check_output_dir(efficiency_outfile)
# plt.savefig(efficiency_outfile)
# plt.show()
In [43]:
efficiencies, efficiencies_err
Out[43]:
In [26]:
@np.vectorize
def get_sim_thrown_radius(log_energy):
    """Return the simulation thrown radius (in meters) for a given log10(E/GeV)."""
    if log_energy <= 6:
        thrown_radius = 800.0
    elif (log_energy > 6) & (log_energy <= 7):
        thrown_radius = 1100.0
    elif (log_energy > 7) & (log_energy <= 8):
        thrown_radius = 1700.0
    else:
        raise ValueError('Invalid energy entered')
    return thrown_radius
In [27]:
thrown_radii = get_sim_thrown_radius(energybins.log_energy_midpoints)
thrown_area = np.pi * (thrown_radii**2)
geom_factor = (df_sim.lap_cos_zenith.max() + df_sim.lap_cos_zenith.min()) / 2
print(geom_factor)
fig, ax = plt.subplots()
plotting.plot_steps(energybins.log_energy_bins, thrown_area * geom_factor)
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.set_xlabel('$\mathrm{\log_{10}(E_{reco}/GeV)}$')
ax.set_ylabel('Thrown area [$\mathrm{m^2}$]')
plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
ax.grid()
thrown_area_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'iter-bayesian',
                                   'thrown-area.png')
comp.check_output_dir(thrown_area_outfile)
plt.savefig(thrown_area_outfile)
plt.show()
In [29]:
sim_list = [12360, 12362, 12630, 12631]
thrown_showers = thrown_showers_per_ebin(sim_list, log_energy_bins=energybins.log_energy_bins)
In [31]:
passed_showers = np.histogram(df_sim.loc[:, 'MC_log_energy'], bins=energybins.log_energy_bins)[0]
In [32]:
efficiency, efficiency_err = comp.ratio_error(passed_showers, np.sqrt(passed_showers),
                                              thrown_showers, np.sqrt(thrown_showers))
In [33]:
geom_factor = (df_sim.lap_cos_zenith.max() + df_sim.lap_cos_zenith.min()) / 2
In [35]:
thrown_radii = get_sim_thrown_radius(energybins.log_energy_midpoints)
thrown_areas = np.pi * thrown_radii**2
In [36]:
eff_area = efficiency * thrown_areas * geom_factor
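This is the usual effective-area construction, $A_{eff}(E) = \varepsilon(E)\,\pi R_{thrown}^2(E)\,\langle\cos\theta\rangle$, combining the per-bin detection efficiency with the thrown area and the zenith projection factor computed above.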
In [41]:
plotting.plot_steps(energybins.log_energy_bins, eff_area)
plt.ylim(0)
plt.show()
In [18]:
df_data = comp.load_data(config='IC86.2012', log_energy_min=6.0, log_energy_max=8.3)
In [19]:
X_data = comp.dataframe_functions.dataframe_to_array(df_data, feature_list + ['lap_log_energy'])
log_energy_data = X_data[:,-1]
X_data = X_data[:,:-1]
In [20]:
# is_finite_mask = np.isfinite(data.X)
# not_finite_mask = np.logical_not(is_finite_mask)
# finite_data_mask = np.logical_not(np.any(not_finite_mask, axis=1))
# data = data[finite_data_mask]
In [21]:
data_predictions = pipeline.predict(X_data)
In [22]:
# Get composition masks
data_labels = np.array([comp.dataframe_functions.label_to_comp(pred) for pred in data_predictions])
data_light_mask = data_labels == 'light'
data_heavy_mask = data_labels == 'heavy'
In [23]:
# Get number of identified comp in each energy bin
df_flux = {}
comp_list = ['light', 'heavy']
for composition in comp_list:
    comp_mask = data_labels == composition
    df_flux['counts_' + composition] = np.histogram(log_energy_data[comp_mask],
                                                    bins=energybins.log_energy_bins)[0]
    df_flux['counts_' + composition + '_err'] = np.sqrt(df_flux['counts_' + composition])
df_flux['counts_total'] = np.histogram(log_energy_data, bins=energybins.log_energy_bins)[0]
df_flux['counts_total_err'] = np.sqrt(df_flux['counts_total'])
In [24]:
# Get number of identified comp in each energy bin
unfolding_df = pd.DataFrame(df_flux)
comp_list = ['light', 'heavy']
for composition in comp_list:
    comp_mask = data_labels == composition
    unfolding_df['counts_' + composition] = np.histogram(log_energy_data[comp_mask],
                                                         bins=energybins.log_energy_bins)[0]
    unfolding_df['counts_' + composition + '_err'] = np.sqrt(unfolding_df['counts_' + composition])
unfolding_df['counts_total'] = np.histogram(log_energy_data, bins=energybins.log_energy_bins)[0]
unfolding_df['counts_total_err'] = np.sqrt(unfolding_df['counts_total'])
In [25]:
unfolding_df.index.rename('log_energy_bin_idx', inplace=True)
In [26]:
unfolding_df
Out[26]:
In [53]:
# num_particles, num_particles_err = comp.analysis.get_num_particles(sim_train, data, pipeline, comp_list)
In [54]:
# unfolding_df['counts_light'] = num_particles['light']
# unfolding_df['counts_heavy'] = num_particles['heavy']
# unfolding_df['counts_err_light'] = num_particles_err['light']
# unfolding_df['counts_err_heavy'] = num_particles_err['heavy']
In [46]:
# pipeline.fit(sim_train.X, sim_train.y)
test_predictions = pipeline.predict(df_sim_test[feature_list])
true_comp = df_sim_test['target'].apply(comp.dataframe_functions.label_to_comp)
pred_comp = pd.Series([comp.dataframe_functions.label_to_comp(i) for i in test_predictions])
In [47]:
# response_list = []
# response_err_list = []
# sim_bin_idxs = np.digitize(log_energy_sim_test, energybins.log_energy_bins) - 1
# energy_bin_idx = np.unique(sim_bin_idxs)
# # energy_bin_idx = energy_bin_idx[1:]
# print(energy_bin_idx)
# # print(energybins.energy_midpoints.shape)
# for bin_idx in energy_bin_idx:
# if (bin_idx == -1) or (bin_idx == energybins.energy_midpoints.shape[0]):
# continue
# sim_bin_mask = sim_bin_idxs == bin_idx
# response_mat = confusion_matrix(true_comp[sim_bin_mask], pred_comp[sim_bin_mask],
# labels=comp_list)
# # Transpose response matrix to get MC comp on x-axis and reco comp on y-axis
# response_mat = response_mat.T
# # Get response matrix statistical error
# response_mat_err = np.sqrt(response_mat)
# response_err_list.append(response_mat_err)
# response_list.append(response_mat)
# block_response = block_diag(response_list).toarray()
# # block_response = np.flipud(block_response)
# # Normalize along MC comp axis to go from counts to probabilities
# block_response = block_response / block_response.sum(axis=0)
# print('block_response = \n{}'.format(block_response))
# block_response_err = block_diag(response_err_list).toarray()
# # block_response_err = np.flipud(block_response_err)
# block_response_err = block_response_err / block_response_err.sum(axis=0)
# print('block_response_err = \n{}'.format(block_response_err))
In [48]:
# res_mat_outfile = os.path.join(comp.paths.comp_data_dir, 'unfolding', 'block_response.txt')
# res_mat_err_outfile = os.path.join(comp.paths.comp_data_dir, 'unfolding', 'block_response_err.txt')
# comp.check_output_dir(res_mat_outfile)
# comp.check_output_dir(res_mat_err_outfile)
# np.savetxt(res_mat_outfile, block_response)
# np.savetxt(res_mat_err_outfile, block_response_err)
In [49]:
true_ebin_idxs = np.digitize(log_true_energy_sim_test, energybins.log_energy_bins) - 1
reco_ebin_idxs = np.digitize(log_reco_energy_sim_test, energybins.log_energy_bins) - 1
energy_bin_idx = np.unique(true_ebin_idxs)
print(energy_bin_idx)
hstack_list = []
for true_ebin_idx in energy_bin_idx:
    # Skip under/overflow energy bins
    if (true_ebin_idx == -1) or (true_ebin_idx == energybins.energy_midpoints.shape[0]):
        continue
    true_ebin_mask = true_ebin_idxs == true_ebin_idx
    vstack_list = []
    for reco_ebin_idx in energy_bin_idx:
        if (reco_ebin_idx == -1) or (reco_ebin_idx == energybins.energy_midpoints.shape[0]):
            continue
        reco_ebin_mask = reco_ebin_idxs == reco_ebin_idx
        combined_mask = true_ebin_mask & reco_ebin_mask
        if combined_mask.sum() == 0:
            response_mat = np.zeros((2, 2), dtype=int)
        else:
            response_mat = confusion_matrix(true_comp[combined_mask],
                                            pred_comp[combined_mask],
                                            labels=comp_list)
        # Transpose response matrix to get MC comp on x-axis and reco comp on y-axis
        response_mat = response_mat.T
        vstack_list.append(response_mat)
    hstack_list.append(np.vstack(vstack_list))
res = np.hstack(hstack_list)
res_err = np.sqrt(res)
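The result is a $2 n_{ebins} \times 2 n_{ebins}$ matrix built from $2\times2$ composition blocks: columns index the true (energy, composition) bin and rows the reconstructed one, with light and heavy interleaved in the same order as the efficiency array above. A small helper sketch (hypothetical, not part of comptools) for reading off a single element:
# Hypothetical helper for indexing the block response matrix
def response_element(res, reco_ebin, reco_comp, true_ebin, true_comp):
    # composition index: 0 = light, 1 = heavy (order of comp_list)
    return res[2*reco_ebin + reco_comp, 2*true_ebin + true_comp]

# e.g. events thrown as light in energy bin 12 but reconstructed as heavy in bin 12
print(response_element(res, reco_ebin=12, reco_comp=1, true_ebin=12, true_comp=0))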
In [51]:
# res_normalized = res / res.sum(axis=0)
# # Error propagate the response matrix error
# res_err_normalized = res_err / res.sum(axis=0)
# # res_err = res_err / res_err.sum(axis=0)
# res_col_sum_err = np.array([np.sqrt(np.sum(res_err[:, i]**2)) for i in range(res_err.shape[1])])
# res_normalized, res_normalized_err = comp.analysis.ratio_error(res, res_err,
#                                                                res.sum(axis=0), res_col_sum_err,
#                                                                nan_to_num=True)
res_col_sum = res.sum(axis=0)
res_col_sum_err = np.array([np.sqrt(np.sum(res_err[:, i]**2)) for i in range(res_err.shape[1])])
normalizations, normalizations_err = comp.analysis.ratio_error(res_col_sum, res_col_sum_err,
                                                               det_efficiencies, det_efficiencies_err)
res_normalized, res_normalized_err = comp.analysis.ratio_error(res, res_err,
                                                               normalizations, normalizations_err,
                                                               nan_to_num=True)
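With this normalization each column of the response matrix sums to the detection efficiency of its (energy, composition) cause bin, i.e. $\sum_i P(E_i|C_j) = \varepsilon_j$: an event thrown in cause bin j is reconstructed somewhere with total probability equal to that bin's detection efficiency. The assertion below checks this.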
In [52]:
np.testing.assert_allclose(res_normalized.sum(axis=0), det_efficiencies)
In [53]:
res_normalized
Out[53]:
In [54]:
res_normalized_err
Out[54]:
In [108]:
# res_col_sum_err = np.array([np.sqrt(np.sum(res_err[:, i]**2)) for i in range(res_err.shape[1])])
# res_col_sum_err
In [109]:
# res.sum(axis=0)
In [110]:
# res1, res1_err = comp.analysis.ratio_error(res, res_err, res.sum(axis=0), res_col_sum_err)
In [111]:
# np.testing.assert_array_equal(res_normalized, res1)
In [112]:
# plt.plot(res.sum(axis=0))
In [55]:
fig, ax = plt.subplots()
# h = np.flipud(block_response)
sns.heatmap(res[24:26, 24:26], annot=True, fmt='d', ax=ax, square=True,
            xticklabels=['light', 'heavy'], yticklabels=['light', 'heavy'],
            cbar_kws={'label': 'Counts'}, vmin=0, cmap='viridis')
ax.invert_yaxis()
plt.xlabel('True composition')
plt.ylabel('Pred composition')
plt.title('$\mathrm{7.6 < \log_{10}(E_{true}/GeV) < 7.7}$' + '\n$\mathrm{7.6 < \log_{10}(E_{reco}/GeV) < 7.7}$')
res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-matrix-single-energy-bin.png')
comp.check_output_dir(res_mat_outfile)
plt.savefig(res_mat_outfile)
plt.show()
In [56]:
plt.imshow(res, origin='lower', cmap='viridis')
# ax = sns.heatmap(res, square=True, mask=res==0, xticklabels=2, yticklabels=2,
#                  cbar_kws={'label': 'Counts'})
plt.plot([0, res.shape[0]-1], [0, res.shape[1]-1], marker='None', ls=':', color='C1')
# Outline the 2x2 composition blocks
for i in np.arange(0, res.shape[0], 2):
    plt.axvline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
    plt.axhline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')
plt.colorbar(label='Counts')
res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-statistics.png')
comp.check_output_dir(res_mat_outfile)
plt.savefig(res_mat_outfile)
plt.show()
In [57]:
plt.imshow(np.sqrt(res), origin='lower', cmap='viridis')
plt.plot([0, res.shape[0]-1], [0, res.shape[1]-1], marker='None', ls=':', color='C1')
for i in np.arange(0, res.shape[0], 2):
    plt.axvline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
    plt.axhline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')
plt.colorbar(label='Count errors', format='%d')
res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-statistics-err.png')
comp.check_output_dir(res_mat_outfile)
plt.savefig(res_mat_outfile)
plt.show()
In [58]:
plt.imshow(res_normalized, origin='lower', cmap='viridis')
plt.plot([0, res.shape[0]-1], [0, res.shape[1]-1], marker='None', ls=':', color='C1')
for i in np.arange(0, res.shape[0], 2):
    plt.axvline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
    plt.axhline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')
plt.title('Response matrix')
plt.colorbar(label='$\mathrm{P(E_i|C_{\mu})}$')
res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-matrix.png')
comp.check_output_dir(res_mat_outfile)
plt.savefig(res_mat_outfile)
plt.show()
In [59]:
plt.imshow(res_normalized_err, origin='lower', cmap='viridis')
plt.plot([0, res.shape[0]-1], [0, res.shape[1]-1], marker='None', ls=':', color='C1')
for i in np.arange(0, res.shape[0], 2):
    plt.axvline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
    plt.axhline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')
plt.title('Response matrix error')
plt.colorbar(label='$\mathrm{\delta P(E_i|C_{\mu})}$')
res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-matrix-err.png')
comp.check_output_dir(res_mat_outfile)
plt.savefig(res_mat_outfile)
plt.show()
In [61]:
res_mat_outfile = os.path.join(comp.paths.comp_data_dir, 'unfolding', 'response.txt')
res_mat_err_outfile = os.path.join(comp.paths.comp_data_dir, 'unfolding', 'response_err.txt')
comp.check_output_dir(res_mat_outfile)
comp.check_output_dir(res_mat_err_outfile)
np.savetxt(res_mat_outfile, res_normalized)
np.savetxt(res_mat_err_outfile, res_normalized_err)
In [60]:
from icecube.weighting.weighting import from_simprod
from icecube.weighting.fluxes import GaisserH3a, GaisserH4a, Hoerandel5
In [61]:
df_sim = comp.load_sim(config='IC86.2012', test_size=0)
df_sim.head()
Out[61]:
In [62]:
flux = GaisserH3a()
model_flux = {}
for ptype in [2212, 1000020040, 1000070140, 1000130270, 1000260560]:
    model_flux[ptype] = flux(energybins.energy_midpoints, ptype)
model_flux_df = pd.DataFrame.from_records(model_flux)
model_flux_df.index = energybins.energy_midpoints
model_flux_df
Out[62]:
In [63]:
fig, ax = plt.subplots()
for key in model_flux_df.columns:
    ax.plot(np.log10(model_flux_df.index), model_flux_df.index**2.7*model_flux_df[key], label=key)
ax.plot(np.log10(model_flux_df.index), model_flux_df.index**2.7*model_flux_df.sum(axis=1), label='All particle')
ax.set_yscale("log", nonposy='clip')
ax.set_ylim(1e3, 1e5)
ax.grid()
ax.legend()
plt.show()
In [64]:
model_flux_df.sum(axis=1)
Out[64]:
In [65]:
simlist = np.unique(df_sim['sim'])
for i, sim in enumerate(simlist):
    gcd_file, sim_files = comp.simfunctions.get_level3_sim_files(sim)
    num_files = len(sim_files)
    # Accumulate the generation spectrum over all simulation sets
    if i == 0:
        generator = num_files*from_simprod(int(sim))
    else:
        generator += num_files*from_simprod(int(sim))
In [66]:
priors_list = ['h3a', 'h4a', 'antih3a', 'Hoerandel5', 'antiHoerandel5']
# priors_list = ['h3a', 'h4a', 'antih3a', 'Hoerandel5', 'antiHoerandel5', 'uniform', 'alllight', 'allheavy']
model_ptypes = {}
model_ptypes['h3a'] = {'light': [2212, 1000020040], 'heavy': [1000070140, 1000130270, 1000260560]}
model_ptypes['h4a'] = {'light': [2212, 1000020040], 'heavy': [1000070140, 1000130270, 1000260560]}
model_ptypes['Hoerandel5'] = {'light': [2212, 1000020040], 'heavy': [1000070140, 1000130270, 1000260560]}
In [67]:
priors = defaultdict(list)
for flux, name in zip([GaisserH3a(), GaisserH3a(), GaisserH4a(), Hoerandel5(), Hoerandel5()],
                      ['h3a', 'antih3a', 'h4a', 'Hoerandel5', 'antiHoerandel5']):
    priors_raw = defaultdict(list)
    for energy_mid in energybins.energy_midpoints:
        energy = [energy_mid]*5
        ptype = [2212, 1000020040, 1000070140, 1000130270, 1000260560]
        weights = flux(energy, ptype)
        # light_prior = weights[:2].sum()/weights.sum()
        # heavy_prior = weights[2:].sum()/weights.sum()
        light_prior = weights[:2].sum()
        heavy_prior = weights[2:].sum()
        # The 'anti' models swap light and heavy to test sensitivity to the prior
        if 'anti' in name:
            priors_raw['light'].append(heavy_prior)
            priors_raw['heavy'].append(light_prior)
        else:
            priors_raw['light'].append(light_prior)
            priors_raw['heavy'].append(heavy_prior)
        priors[name].extend([light_prior, heavy_prior])
    unfolding_df['{}_flux_light'.format(name)] = priors_raw['light']
    unfolding_df['{}_flux_heavy'.format(name)] = priors_raw['heavy']
# unfolding_df['uniform_flux_light'] = [0.5]*len(priors_raw['light'])
# unfolding_df['uniform_flux_heavy'] = [0.5]*len(priors_raw['heavy'])
# unfolding_df['alllight_flux_light'] = [0.9]*len(priors_raw['light'])
# unfolding_df['alllight_flux_heavy'] = [0.1]*len(priors_raw['heavy'])
# unfolding_df['allheavy_flux_light'] = [0.1]*len(priors_raw['light'])
# unfolding_df['allheavy_flux_heavy'] = [0.9]*len(priors_raw['heavy'])
In [68]:
unfolding_df
Out[68]:
In [69]:
unfolding_df_outfile = os.path.join(comp.paths.comp_data_dir, 'unfolding', 'unfolding-dataframe.csv')
comp.check_output_dir(unfolding_df_outfile)
unfolding_df.to_csv(unfolding_df_outfile)
In [70]:
formatted_df = pd.DataFrame()
In [71]:
counts_formatted = []
priors_formatted = defaultdict(list)
for index, row in unfolding_df.iterrows():
    # Flatten to the interleaved (light, heavy) per-bin ordering
    counts_formatted.extend([row['counts_light'], row['counts_heavy']])
    for priors_name in priors_list:
        priors_formatted[priors_name].extend([row[priors_name+'_flux_light'],
                                              row[priors_name+'_flux_heavy']])
formatted_df['counts'] = counts_formatted
formatted_df['counts_err'] = np.sqrt(counts_formatted)
for key, value in priors_formatted.items():
    formatted_df[key+'_flux'] = value
    formatted_df[key+'_priors'] = formatted_df[key+'_flux'] / formatted_df[key+'_flux'].sum()
formatted_df.index.rename('log_energy_bin_idx', inplace=True)
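Each model's flux is flattened into the same interleaved (light, heavy)-per-bin ordering as the counts, then normalized over all entries so that each *_priors column is a proper probability distribution over the $2 n_{ebins}$ causes. A quick check (using the priors_list defined above):
# Each priors column should sum to 1 (up to floating-point rounding)
for name in priors_list:
    print(name, formatted_df[name + '_priors'].sum())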
In [72]:
formatted_df.head()
Out[72]:
Save formatted DataFrame to disk
In [73]:
formatted_df_outfile = os.path.join(comp.paths.comp_data_dir, 'unfolding',
                                    'unfolding-dataframe-PyUnfold-formatted.csv')
comp.check_output_dir(formatted_df_outfile)
formatted_df.to_csv(formatted_df_outfile)
In [77]:
model_to_ls = {'h3a': '-.', 'h4a': ':', 'Hoerandel5': '-', 'antih3a': '--'}
model_to_marker = {'h3a': '.', 'h4a': '^', 'Hoerandel5': '*', 'antih3a': '.'}
fig, ax = plt.subplots()
for model in ['h3a', 'h4a', 'Hoerandel5', 'antih3a']:
    key = '{}_priors'.format(model)
    light_priors = formatted_df[key][::2]
    heavy_priors = formatted_df[key][1::2]
    ax.plot(energybins.log_energy_midpoints, light_priors,
            color=color_dict['light'], ls=model_to_ls[model], marker=model_to_marker[model],
            label='{} light'.format(model), alpha=0.75)
    ax.plot(energybins.log_energy_midpoints, heavy_priors,
            color=color_dict['heavy'], ls=model_to_ls[model], marker=model_to_marker[model],
            label='{} heavy'.format(model), alpha=0.75)
ax.set_xlabel('$\mathrm{\log_{10}(E_{reco}/GeV)}$')
ax.set_ylabel('Prior')
ax.set_xlim([energybins.log_energy_min, energybins.log_energy_max])
# ax.set_ylim([0, 1])
ax.set_yscale("log", nonposy='clip')
ax.grid()
leg = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5),
                frameon=False, fancybox=False, numpoints=1)
priors_outfile = os.path.join(comp.paths.figures_dir, 'unfolding/iter-bayesian', 'priors.png')
comp.check_output_dir(priors_outfile)
plt.savefig(priors_outfile)
plt.show()
In [130]:
woo = pd.read_csv(formatted_df_outfile, index_col='log_energy_bin_idx')
In [131]:
woo
Out[131]:
In [59]:
# with open('pyunfold_dict.json', 'w') as outfile:
# data = {'counts': counts_observed,
# 'block_response': block_response.tolist(),
# 'block_response_err': block_response_err.tolist()}
# for model in ['h3a', 'h4a', 'Hoerandel5']:
# data['priors_{}'.format(model)] = priors[model]
# json.dump(data, outfile)
In [116]:
df = pd.read_csv(formatted_df_outfile, index_col='log_energy_bin_idx')
In [117]:
df
Out[117]: