Author: James Bourbeau
In [1]:
%load_ext watermark
%watermark -u -d -v -p numpy,matplotlib,scipy,pandas,sklearn,mlxtend
In [2]:
from __future__ import division, print_function
import os
from collections import defaultdict
import numpy as np
from scipy.sparse import block_diag
import pandas as pd
import matplotlib.pyplot as plt
import seaborn.apionly as sns
import json
from scipy.interpolate import UnivariateSpline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, KFold, StratifiedKFold
import comptools as comp
import comptools.analysis.plotting as plotting
color_dict = comp.analysis.get_color_dict()
%matplotlib inline
Define whether to train on the 'light' and 'heavy' composition classes or on the individual compositions
In [14]:
# config = 'IC79.2010'
config = 'IC86.2012'
num_groups = 4
comp_list = comp.get_comp_list(num_groups=num_groups)
In [15]:
comp_list
Out[15]:
Get composition classifier pipeline
Define energy binning for this analysis
In [16]:
energybins = comp.analysis.get_energybins(config=config)
In [17]:
log_energy_min = energybins.log_energy_min
log_energy_max = energybins.log_energy_max
In [18]:
df_sim_train, df_sim_test = comp.load_sim(config=config, log_energy_min=log_energy_min, log_energy_max=log_energy_max)
In [19]:
df_sim_train.reco_log_energy.min(), df_sim_train.reco_log_energy.max()
Out[19]:
In [20]:
log_reco_energy_sim_test = df_sim_test['reco_log_energy']
log_true_energy_sim_test = df_sim_test['MC_log_energy']
In [21]:
feature_list, feature_labels = comp.analysis.get_training_features()
In [22]:
pipeline_str = 'BDT_comp_{}_{}-groups'.format(config, num_groups)
pipeline = comp.get_pipeline(pipeline_str)
In [23]:
pipeline = pipeline.fit(df_sim_train[feature_list], df_sim_train['comp_target_{}'.format(num_groups)])
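For readers without comptools installed: a rough, hypothetical sketch of the kind of object `comp.get_pipeline` presumably returns for a 'BDT' string (the actual estimator and hyperparameters are defined inside comptools; everything below is an illustration, not the project's code).

# Hypothetical stand-in for comp.get_pipeline(pipeline_str) -- illustration only
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline

def get_bdt_pipeline():
    # 'BDT' = boosted decision tree; presumably a gradient-boosted
    # classifier wrapped in a sklearn Pipeline, roughly like this
    return Pipeline([
        ('classifier', GradientBoostingClassifier(n_estimators=100, max_depth=3)),
    ])

# Used the same way as the comptools pipeline:
# pipeline = get_bdt_pipeline()
# pipeline.fit(X_train, y_train); pipeline.predict(X_test)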
In [24]:
eff_path = os.path.join(comp.paths.comp_data_dir, config, 'efficiencies',
                        'efficiency_fit_num_groups_{}.hdf'.format(num_groups))
df_eff = pd.read_hdf(eff_path)
In [25]:
df_eff.head()
Out[25]:
In [26]:
fig, ax = plt.subplots()
for composition in comp_list:
    ax.errorbar(energybins.log_energy_midpoints, df_eff['eff_median_{}'.format(composition)],
                yerr=[df_eff['eff_err_low_{}'.format(composition)],
                      df_eff['eff_err_high_{}'.format(composition)]],
                color=color_dict[composition], label=composition, marker='.')
ax.axvline(6.4, marker='None', ls='-.', color='k')
ax.axvline(7.8, marker='None', ls='-.', color='k')
ax.set_xlabel(r'$\mathrm{\log_{10}(E_{true}/GeV)}$')
ax.set_ylabel('Detection efficiencies')
ax.grid()
ax.legend()
ax.ticklabel_format(style='sci', axis='y')
ax.yaxis.major.formatter.set_powerlimits((0, 0))
plt.show()
Format efficiencies for use in the PyUnfold response matrix
In [27]:
# Flatten efficiencies: compositions are interleaved within each energy bin
efficiencies, efficiencies_err = [], []
for idx, row in df_eff.iterrows():
    for composition in comp_list:
        efficiencies.append(row['eff_median_{}'.format(composition)])
        efficiencies_err.append(row['eff_err_low_{}'.format(composition)])
efficiencies = np.asarray(efficiencies)
efficiencies_err = np.asarray(efficiencies_err)
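The efficiencies (and, later, the counts and priors) are flattened into a single 1-D array in which the composition index varies fastest within each energy bin. A minimal sketch of the resulting ordering (the labels here are illustrative, not the actual comp_list values):

# Sketch: flattened (energy bin, composition) ordering used throughout.
# With num_groups compositions, flat entry k corresponds to
#   energy bin  k // num_groups
#   composition k %  num_groups
# e.g. for two energy bins and four illustrative compositions:
flat_labels = ['{}_bin{}'.format(c, b)
               for b in range(2)
               for c in ['p', 'He', 'O', 'Fe']]
print(flat_labels)
# ['p_bin0', 'He_bin0', 'O_bin0', 'Fe_bin0', 'p_bin1', 'He_bin1', 'O_bin1', 'Fe_bin1']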
In [28]:
efficiencies
Out[28]:
In [29]:
df_data = comp.load_data(config=config, columns=feature_list,
                         log_energy_min=log_energy_min, log_energy_max=log_energy_max,
                         n_jobs=20, verbose=True)
In [30]:
df_data.shape
Out[30]:
In [31]:
X_data = comp.dataframe_functions.dataframe_to_array(df_data, feature_list + ['reco_log_energy'])
log_energy_data = X_data[:, -1]
X_data = X_data[:, :-1]
In [32]:
log_energy_data.min(), log_energy_data.max()
Out[32]:
In [33]:
data_predictions = pipeline.predict(X_data)
In [34]:
# Decode predicted targets into composition group labels
data_labels = np.array(comp.composition_encoding.decode_composition_groups(data_predictions, num_groups=num_groups))
In [35]:
# Get number of identified comp in each energy bin
unfolding_df = pd.DataFrame()
for composition in comp_list:
    comp_mask = data_labels == composition
    unfolding_df['counts_' + composition] = np.histogram(log_energy_data[comp_mask],
                                                         bins=energybins.log_energy_bins)[0]
    unfolding_df['counts_' + composition + '_err'] = np.sqrt(unfolding_df['counts_' + composition])
unfolding_df['counts_total'] = np.histogram(log_energy_data, bins=energybins.log_energy_bins)[0]
unfolding_df['counts_total_err'] = np.sqrt(unfolding_df['counts_total'])
In [36]:
unfolding_df.index.rename('log_energy_bin_idx', inplace=True)
In [37]:
unfolding_df.head()
Out[37]:
In [38]:
fig, ax = plt.subplots()
for composition in comp_list:
    ax.plot(unfolding_df['counts_{}'.format(composition)], color=color_dict[composition])
ax.set_yscale("log", nonposy='clip')
ax.grid()
plt.show()
In [39]:
test_predictions = pipeline.predict(df_sim_test[feature_list])
true_comp = df_sim_test['comp_group_{}'.format(num_groups)].values
pred_comp = np.array(comp.composition_encoding.decode_composition_groups(test_predictions,
                                                                         num_groups=num_groups))
In [40]:
true_comp
Out[40]:
In [52]:
true_ebin_idxs = np.digitize(log_true_energy_sim_test, energybins.log_energy_bins) - 1
reco_ebin_idxs = np.digitize(log_reco_energy_sim_test, energybins.log_energy_bins) - 1

hstack_list = []
for true_ebin_idx in range(-1, len(energybins.log_energy_midpoints) + 1):
    # After the -1 shift, np.digitize maps underflow to -1 and overflow to
    # the number of bins; skip both
    if (true_ebin_idx == -1) or (true_ebin_idx == energybins.energy_midpoints.shape[0]):
        continue
    true_ebin_mask = true_ebin_idxs == true_ebin_idx
    vstack_list = []
    for reco_ebin_idx in range(-1, len(energybins.log_energy_midpoints) + 1):
        if (reco_ebin_idx == -1) or (reco_ebin_idx == energybins.energy_midpoints.shape[0]):
            continue
        reco_ebin_mask = reco_ebin_idxs == reco_ebin_idx
        combined_mask = true_ebin_mask & reco_ebin_mask
        if combined_mask.sum() == 0:
            response_mat = np.zeros((num_groups, num_groups), dtype=int)
        else:
            response_mat = confusion_matrix(true_comp[combined_mask],
                                            pred_comp[combined_mask],
                                            labels=comp_list)
        # Transpose response matrix to get MC comp on x-axis and reco comp on y-axis
        response_mat = response_mat.T
        vstack_list.append(response_mat)
    hstack_list.append(np.vstack(vstack_list))
res = np.hstack(hstack_list)
res_err = np.sqrt(res)
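Recall sklearn's convention: confusion_matrix(y_true, y_pred) places true labels on rows and predicted labels on columns, which is why each block above is transposed before stacking. A quick self-contained check:

from sklearn.metrics import confusion_matrix

y_true = ['p', 'p', 'Fe']
y_pred = ['p', 'Fe', 'Fe']
cm = confusion_matrix(y_true, y_pred, labels=['p', 'Fe'])
print(cm)
# [[1 1]    row 0: true 'p'  -> predicted 'p' once, 'Fe' once
#  [0 1]]   row 1: true 'Fe' -> predicted 'Fe' once
# cm.T puts true labels on columns (x-axis) and predictions on rows (y-axis)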
In [53]:
res.shape
Out[53]:
In [54]:
plt.imshow(res, origin='lower')
Out[54]:
In [60]:
from itertools import product

num_groups = len(comp_list)
num_ebins = len(energybins.log_energy_midpoints)

e_bin_iter = product(range(num_ebins), range(num_ebins))
res2 = np.zeros((num_ebins * num_groups, num_ebins * num_groups), dtype=int)
for true_ebin_idx, reco_ebin_idx in e_bin_iter:
    true_ebin_mask = true_ebin_idxs == true_ebin_idx
    reco_ebin_mask = reco_ebin_idxs == reco_ebin_idx
    ebin_mask = true_ebin_mask & reco_ebin_mask
    if ebin_mask.sum() == 0:
        continue
    response_mat = confusion_matrix(true_comp[ebin_mask],
                                    pred_comp[ebin_mask],
                                    labels=comp_list)
    # Transpose response matrix to get MC comp on x-axis
    # and reco comp on y-axis
    response_mat = response_mat.T
    res2[num_groups * reco_ebin_idx: num_groups * (reco_ebin_idx + 1),
         num_groups * true_ebin_idx: num_groups * (true_ebin_idx + 1)] = response_mat
In [61]:
plt.imshow(res2, origin='lower')
Out[61]:
In [62]:
np.testing.assert_array_equal(res2, res)
In [68]:
reco_ebin_idx = 4
true_ebin_idx = 4
plt.imshow(res2[num_groups * reco_ebin_idx: num_groups * (reco_ebin_idx + 1),
                num_groups * true_ebin_idx: num_groups * (true_ebin_idx + 1)],
           origin='lower')
Out[68]:
In [131]:
res_col_sum = res.sum(axis=0)
res_col_sum_err = np.array([np.sqrt(np.nansum(res_err[:, i]**2)) for i in range(res_err.shape[1])])
normalizations, normalizations_err = comp.analysis.ratio_error(res_col_sum, res_col_sum_err,
                                                               efficiencies, efficiencies_err,
                                                               nan_to_num=True)
res_normalized, res_normalized_err = comp.analysis.ratio_error(res, res_err,
                                                               normalizations, normalizations_err,
                                                               nan_to_num=True)
In [132]:
res_normalized = np.nan_to_num(res_normalized)
res_normalized_err = np.nan_to_num(res_normalized_err)
In [133]:
np.testing.assert_allclose(res_normalized.sum(axis=0), efficiencies)
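The assert above encodes the normalization convention for the response matrix: each column (a true composition and energy bin, the cause $C_j$) is scaled so that the probabilities of being reconstructed anywhere sum to the detection efficiency of that true bin,

$$\sum_i P(E_i \mid C_j) = \epsilon_j,$$

where $i$ runs over the reconstructed (energy, composition) bins and $\epsilon_j$ are the efficiencies loaded earlier.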
In [134]:
res
Out[134]:
In [135]:
fig, ax = plt.subplots()
idx = 4 * num_groups
sns.heatmap(res[idx:idx + num_groups, idx:idx + num_groups], annot=True, fmt='d', ax=ax, square=True,
            xticklabels=comp_list, yticklabels=comp_list,
            cbar_kws={'label': 'Counts'}, vmin=0, cmap='viridis')
ax.invert_yaxis()
plt.xlabel('True composition')
plt.ylabel('Pred composition')
plt.title(r'$\mathrm{7.6 < \log_{10}(E_{true}/GeV) < 7.7}$' + '\n' + r'$\mathrm{7.6 < \log_{10}(E_{reco}/GeV) < 7.7}$')
plt.show()
In [136]:
plt.imshow(res, origin='lower', cmap='viridis')
plt.plot([0, res.shape[0] - 1], [0, res.shape[1] - 1], marker='None', ls=':', color='C1')
# Gray lines mark the energy-bin block boundaries
for i in np.arange(0, res.shape[0], 2):
    plt.axvline(i - 0.5, marker='None', ls='-', lw=0.5, color='gray')
    plt.axhline(i - 0.5, marker='None', ls='-', lw=0.5, color='gray')
plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')
plt.colorbar(label='Counts')
res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-statistics.png')
comp.check_output_dir(res_mat_outfile)
# plt.savefig(res_mat_outfile)
plt.show()
In [137]:
plt.imshow(np.sqrt(res), origin='lower', cmap='viridis')
plt.plot([0, res.shape[0] - 1], [0, res.shape[1] - 1], marker='None', ls=':', color='C1')
for i in np.arange(0, res.shape[0], 2):
    plt.axvline(i - 0.5, marker='None', ls='-', lw=0.5, color='gray')
    plt.axhline(i - 0.5, marker='None', ls='-', lw=0.5, color='gray')
plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')
plt.colorbar(label='Count errors', format='%d')
res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-statistics-err.png')
comp.check_output_dir(res_mat_outfile)
# plt.savefig(res_mat_outfile)
plt.show()
In [138]:
plt.imshow(res_normalized, origin='lower', cmap='viridis')
plt.plot([0, res.shape[0] - 1], [0, res.shape[1] - 1], marker='None', ls=':', color='C1')
plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')
plt.title('Response matrix')
plt.colorbar(label=r'$\mathrm{P(E_i|C_{\mu})}$')
res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', config, 'response_matrix',
                               'response-matrix_{}-groups.png'.format(num_groups))
comp.check_output_dir(res_mat_outfile)
plt.savefig(res_mat_outfile)
plt.show()
In [139]:
plt.imshow(res_normalized_err, origin='lower', cmap='viridis')
plt.plot([0, res.shape[0] - 1], [0, res.shape[1] - 1], marker='None', ls=':', color='C1')
plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')
plt.title('Response matrix error')
plt.colorbar(label=r'$\mathrm{\delta P(E_i|C_{\mu})}$')
res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-matrix-err.png')
comp.check_output_dir(res_mat_outfile)
# plt.savefig(res_mat_outfile)
plt.show()
In [140]:
res_mat_outfile = os.path.join(comp.paths.comp_data_dir, config, 'unfolding',
                               'response_{}-groups.txt'.format(num_groups))
res_mat_err_outfile = os.path.join(comp.paths.comp_data_dir, config, 'unfolding',
                                   'response_err_{}-groups.txt'.format(num_groups))
comp.check_output_dir(res_mat_outfile)
comp.check_output_dir(res_mat_err_outfile)
np.savetxt(res_mat_outfile, res_normalized)
np.savetxt(res_mat_err_outfile, res_normalized_err)
In [141]:
from icecube.weighting.weighting import from_simprod, PDGCode, ParticleType
from icecube.weighting.fluxes import GaisserH3a, GaisserH4a, Hoerandel5, Hoerandel_IT, CompiledFlux
In [142]:
df_sim = comp.load_sim(config=config, test_size=0, log_energy_min=6.0, log_energy_max=8.3)
df_sim.head()
Out[142]:
In [143]:
p = PDGCode().values
pdg_codes = np.array([2212, 1000020040, 1000080160, 1000260560])
particle_names = [p[pdg_code].name for pdg_code in pdg_codes]
In [144]:
particle_names
Out[144]:
In [145]:
group_names = np.array(comp.composition_encoding.composition_group_labels(particle_names, num_groups=num_groups))
group_names
Out[145]:
In [146]:
comp_to_pdg_list = {composition: pdg_codes[group_names == composition] for composition in comp_list}
In [147]:
comp_to_pdg_list
Out[147]:
In [148]:
# Replace O16Nucleus with N14Nucleus + Al27Nucleus, since the flux
# parametrizations used below are defined for the p, He, N, Al, Fe mass groups
for composition, pdg_list in comp_to_pdg_list.items():
    if 1000080160 in pdg_list:
        pdg_list = pdg_list[pdg_list != 1000080160]
        comp_to_pdg_list[composition] = np.append(pdg_list, [1000070140, 1000130270])
In [149]:
comp_to_pdg_list
Out[149]:
In [150]:
priors_list = ['H3a', 'H4a', 'Polygonato']
In [152]:
fig, ax = plt.subplots()
for flux, name, marker in zip([GaisserH3a(), GaisserH4a(), Hoerandel5()],
                              priors_list,
                              '.^*o'):
    for composition in comp_list:
        comp_flux = []
        for energy_mid in energybins.energy_midpoints:
            flux_energy_mid = flux(energy_mid, comp_to_pdg_list[composition]).sum()
            comp_flux.append(flux_energy_mid)
        comp_flux = np.asarray(comp_flux)
        # Store model flux; it is normalized to a prior probability later on
        prior_key = '{}_flux_{}'.format(name, composition)
        unfolding_df[prior_key] = comp_flux
        # Plot result
        ax.plot(energybins.log_energy_midpoints, energybins.energy_midpoints**2.7 * comp_flux,
                color=color_dict[composition], alpha=0.75, marker=marker, ls=':',
                label='{} ({})'.format(name, composition))
ax.set_yscale("log", nonposy='clip')
ax.set_xlabel(r'$\mathrm{\log_{10}(E/GeV)}$')
ax.set_ylabel(r'$\mathrm{E^{2.7} \ J(E) \ [GeV^{1.7} m^{-2} sr^{-1} s^{-1}]}$')
ax.grid()
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)
priors_outfile = os.path.join(comp.paths.figures_dir, 'unfolding',
                              'priors_flux_{}-groups.png'.format(num_groups))
comp.check_output_dir(priors_outfile)
plt.savefig(priors_outfile)
plt.show()

unfolding_df.head()
Out[152]:
In [154]:
formatted_df = pd.DataFrame()
In [155]:
counts_formatted = []
priors_formatted = defaultdict(list)
for index, row in unfolding_df.iterrows():
    for composition in comp_list:
        counts_formatted.append(row['counts_{}'.format(composition)])
        for priors_name in priors_list:
            priors_formatted[priors_name].append(row['{}_flux_{}'.format(priors_name, composition)])

formatted_df['counts'] = counts_formatted
formatted_df['counts_err'] = np.sqrt(counts_formatted)

formatted_df['efficiencies'] = efficiencies
formatted_df['efficiencies_err'] = efficiencies_err

for key, value in priors_formatted.items():
    formatted_df[key + '_flux'] = value
    # Normalize each model flux to a probability so it can serve as a prior
    formatted_df[key + '_prior'] = formatted_df[key + '_flux'] / formatted_df[key + '_flux'].sum()

formatted_df.index.rename('log_energy_bin_idx', inplace=True)
In [156]:
formatted_df.head()
Out[156]:
In [157]:
prior_sums = formatted_df[[col for col in formatted_df.columns if 'prior' in col]].sum()
np.testing.assert_allclose(prior_sums, np.ones_like(prior_sums))
Save formatted DataFrame to disk
In [158]:
formatted_df_outfile = os.path.join(comp.paths.comp_data_dir, config, 'unfolding',
                                    'unfolding-df_{}-groups.hdf'.format(num_groups))
comp.check_output_dir(formatted_df_outfile)
formatted_df.to_hdf(formatted_df_outfile, 'dataframe', format='table')
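For reference, a minimal sketch of how these saved inputs could be handed to PyUnfold, assuming the pyunfold package's iterative_unfold interface (this is not part of the notebook's workflow; the file paths reuse the variables defined in the cells above):

import numpy as np
import pandas as pd
from pyunfold import iterative_unfold

# Load the formatted inputs saved above
df = pd.read_hdf(formatted_df_outfile)
response = np.loadtxt(res_mat_outfile)
response_err = np.loadtxt(res_mat_err_outfile)

# Iterative Bayesian unfolding with the H3a flux as the initial prior
unfolded = iterative_unfold(data=df['counts'].values,
                            data_err=df['counts_err'].values,
                            response=response,
                            response_err=response_err,
                            efficiencies=df['efficiencies'].values,
                            efficiencies_err=df['efficiencies_err'].values,
                            prior=df['H3a_prior'].values,
                            ts='ks', ts_stopping=0.01)
print(unfolded['unfolded'])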