In [1]:
%load_ext watermark
%watermark -u -d -v -p numpy,matplotlib,scipy,pandas,sklearn,mlxtend


last updated: 2017-12-18 

CPython 2.7.13
IPython 5.3.0

numpy 1.13.3
matplotlib 2.1.1
scipy 0.19.0
pandas 0.20.1
sklearn 0.19.0
mlxtend 0.7.0

In [2]:
from __future__ import division, print_function
import os
from collections import defaultdict
import numpy as np
from scipy.sparse import block_diag
import pandas as pd
import matplotlib.pyplot as plt
import seaborn.apionly as sns
import json
from scipy.interpolate import UnivariateSpline

from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, KFold, StratifiedKFold

import comptools as comp
import comptools.analysis.plotting as plotting
color_dict = comp.analysis.get_color_dict()

%matplotlib inline

Define analysis free parameters

[ back to top ]

Whether to train on grouped 'light'/'heavy' composition classes or on the individual primary compositions; this choice is controlled by num_groups below.
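For reference, a minimal sketch of the two choices (the four-group return value is shown in Out[15] below; the two-group labels are assumed to follow the 'light'/'heavy' naming above):

comp.get_comp_list(num_groups=2)  # ['light', 'heavy'] (assumed)
comp.get_comp_list(num_groups=4)  # ['PPlus', 'He4Nucleus', 'O16Nucleus', 'Fe56Nucleus']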


In [14]:
# config = 'IC79.2010'
config = 'IC86.2012'
num_groups = 4
comp_list = comp.get_comp_list(num_groups=num_groups)

In [15]:
comp_list


Out[15]:
['PPlus', 'He4Nucleus', 'O16Nucleus', 'Fe56Nucleus']

Get composition classifier pipeline

Define energy binning for this analysis


In [16]:
energybins = comp.analysis.get_energybins(config=config)
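The returned energybins object bundles the binning used throughout this notebook. A quick inspection sketch (the attribute names are taken from their usage in later cells, not from the comptools documentation):

print(energybins.log_energy_min, energybins.log_energy_max)  # analysis range in log10(E/GeV)
print(len(energybins.log_energy_midpoints))                  # number of energy bins
print(energybins.log_energy_bins)                            # bin edges in log10(E/GeV)
print(energybins.energy_midpoints)                           # bin midpoints in GeV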

Data preprocessing

[ back to top ]

  1. Load simulation/data dataframe and apply specified quality cuts
  2. Extract desired features from dataframe
  3. Get separate testing and training datasets
  4. Feature transformation

In [17]:
log_energy_min = energybins.log_energy_min
log_energy_max = energybins.log_energy_max

In [18]:
df_sim_train, df_sim_test = comp.load_sim(config=config, log_energy_min=log_energy_min, log_energy_max=log_energy_max)

In [19]:
df_sim_train.reco_log_energy.min(), df_sim_train.reco_log_energy.max()


Out[19]:
(6.1000010333148031, 7.9790607442006838)

In [20]:
log_reco_energy_sim_test = df_sim_test['reco_log_energy']
log_true_energy_sim_test = df_sim_test['MC_log_energy']

In [21]:
feature_list, feature_labels = comp.analysis.get_training_features()

In [22]:
pipeline_str = 'BDT_comp_{}_{}-groups'.format(config, num_groups)
pipeline = comp.get_pipeline(pipeline_str)

In [23]:
pipeline = pipeline.fit(df_sim_train[feature_list], df_sim_train['comp_target_{}'.format(num_groups)])

Load fitted effective area


In [24]:
eff_path = os.path.join(comp.paths.comp_data_dir, config, 'efficiencies', 
                        'efficiency_fit_num_groups_{}.hdf'.format(num_groups))
df_eff = pd.read_hdf(eff_path)

In [25]:
df_eff.head()


Out[25]:
eff_median_PPlus eff_err_low_PPlus eff_err_high_PPlus eff_median_He4Nucleus eff_err_low_He4Nucleus eff_err_high_He4Nucleus eff_median_O16Nucleus eff_err_low_O16Nucleus eff_err_high_O16Nucleus eff_median_Fe56Nucleus eff_err_low_Fe56Nucleus eff_err_high_Fe56Nucleus
0 0.006948 0.000107 0.000114 0.007265 0.000114 0.000122 0.006650 0.000111 0.000106 0.006148 0.000120 0.000123
1 0.007616 0.000090 0.000093 0.008219 0.000094 0.000098 0.007516 0.000085 0.000087 0.007408 0.000098 0.000098
2 0.008009 0.000088 0.000088 0.008716 0.000092 0.000081 0.007946 0.000088 0.000092 0.008033 0.000097 0.000089
3 0.008248 0.000095 0.000090 0.008936 0.000087 0.000086 0.008174 0.000091 0.000092 0.008316 0.000098 0.000093
4 0.008406 0.000096 0.000091 0.009025 0.000086 0.000088 0.008328 0.000086 0.000085 0.008461 0.000092 0.000094

In [26]:
fig, ax = plt.subplots()
for composition in comp_list:
    ax.errorbar(energybins.log_energy_midpoints, df_eff['eff_median_{}'.format(composition)],
                yerr=[df_eff['eff_err_low_{}'.format(composition)],
                      df_eff['eff_err_high_{}'.format(composition)]], 
                color=color_dict[composition], label=composition, marker='.')
ax.axvline(6.4, marker='None', ls='-.', color='k')
ax.axvline(7.8, marker='None', ls='-.', color='k')
ax.set_xlabel('$\mathrm{\log_{10}(E_{true}/GeV)}$')
ax.set_ylabel('Detection efficiencies')
ax.grid()
ax.legend()
ax.ticklabel_format(style='sci',axis='y')
ax.yaxis.major.formatter.set_powerlimits((0,0))
plt.show()



Format for PyUnfold response matrix use
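PyUnfold takes one-dimensional cause and effect arrays, so the two-dimensional (energy bin, composition group) structure is flattened with compositions varying fastest: flat index = num_groups * energy_bin_idx + comp_idx. A minimal sketch of the resulting ordering (the label strings are illustrative only):

num_groups = 4
comp_list = ['PPlus', 'He4Nucleus', 'O16Nucleus', 'Fe56Nucleus']
flat_labels = ['{}_ebin{}'.format(composition, ebin)
               for ebin in range(2)
               for composition in comp_list]
# ['PPlus_ebin0', 'He4Nucleus_ebin0', 'O16Nucleus_ebin0', 'Fe56Nucleus_ebin0',
#  'PPlus_ebin1', 'He4Nucleus_ebin1', 'O16Nucleus_ebin1', 'Fe56Nucleus_ebin1']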


In [27]:
# Flatten the fitted efficiencies to the PyUnfold ordering: compositions
# vary fastest within each energy bin. Note that the lower fit uncertainty
# is used as a symmetric error on the efficiency.
efficiencies, efficiencies_err = [], []
for idx, row in df_eff.iterrows():
    for composition in comp_list:
        efficiencies.append(row['eff_median_{}'.format(composition)])
        efficiencies_err.append(row['eff_err_low_{}'.format(composition)])
efficiencies = np.asarray(efficiencies)
efficiencies_err = np.asarray(efficiencies_err)

In [28]:
efficiencies


Out[28]:
array([ 0.00694769,  0.00726502,  0.0066502 ,  0.006148  ,  0.00761559,
        0.00821899,  0.00751581,  0.00740767,  0.00800872,  0.00871612,
        0.00794553,  0.00803339,  0.00824815,  0.00893584,  0.00817404,
        0.00831638,  0.00840596,  0.00902503,  0.00832766,  0.00846139,
        0.00852371,  0.0090508 ,  0.00845108,  0.00855262,  0.00862318,
        0.00904661,  0.00856244,  0.00862511,  0.00871585,  0.00903417,
        0.00867256,  0.00869195,  0.00880034,  0.00901436,  0.00878016,
        0.00875509,  0.00888107,  0.00899264,  0.00888727,  0.00881485,
        0.00896387,  0.00896808,  0.00899263,  0.0088754 ,  0.00904676,
        0.00894244,  0.00909869,  0.00893557,  0.0091264 ,  0.00891583,
        0.00920647,  0.0089947 ,  0.00920763,  0.00888895,  0.00931702,
        0.00905797,  0.00929032,  0.00886503,  0.00942443,  0.00912067,
        0.00937247,  0.00883978,  0.00953276,  0.00918312,  0.00945559,
        0.00881503,  0.0096383 ,  0.00924188,  0.00953546,  0.00878829,
        0.00974728,  0.009301  ,  0.00961673,  0.00876297,  0.00985172,
        0.0093614 ])

In [29]:
df_data = comp.load_data(config=config, columns=feature_list,
                         log_energy_min=log_energy_min, log_energy_max=log_energy_max,
                         n_jobs=20, verbose=True)


[########################################] | 100% Completed |  3min 35.1s

In [30]:
df_data.shape


Out[30]:
(6741064, 4)

In [31]:
X_data = comp.dataframe_functions.dataframe_to_array(df_data, feature_list + ['reco_log_energy'])
log_energy_data = X_data[:, -1]
X_data = X_data[:, :-1]

In [32]:
log_energy_data.min(), log_energy_data.max()


Out[32]:
(6.100000935606773, 7.9790607442006829)

In [33]:
data_predictions = pipeline.predict(X_data)

In [34]:
# Decode predicted targets into composition labels
data_labels = np.array(comp.composition_encoding.decode_composition_groups(data_predictions, num_groups=num_groups))

In [35]:
# Get number of identified comp in each energy bin
unfolding_df = pd.DataFrame()
for composition in comp_list:
    comp_mask = data_labels == composition
    unfolding_df['counts_' + composition] = np.histogram(log_energy_data[comp_mask],
                                                         bins=energybins.log_energy_bins)[0]
    unfolding_df['counts_' + composition + '_err'] = np.sqrt(unfolding_df['counts_' + composition])

unfolding_df['counts_total'] = np.histogram(log_energy_data, bins=energybins.log_energy_bins)[0]
unfolding_df['counts_total_err'] = np.sqrt(unfolding_df['counts_total'])

In [36]:
unfolding_df.index.rename('log_energy_bin_idx', inplace=True)

In [37]:
unfolding_df.head()


Out[37]:
counts_PPlus counts_PPlus_err counts_He4Nucleus counts_He4Nucleus_err counts_O16Nucleus counts_O16Nucleus_err counts_Fe56Nucleus counts_Fe56Nucleus_err counts_total counts_total_err
log_energy_bin_idx
0 754922 868.862475 570328 755.200636 193868 440.304440 288581 537.197357 1807699 1344.506973
1 498171 705.812298 517090 719.089702 138570 372.249916 447026 668.600030 1600857 1265.249778
2 316327 562.429551 389327 623.960736 115550 339.926463 317950 563.870553 1139154 1067.311576
3 194517 441.040814 275513 524.893322 104070 322.598822 193827 440.257879 767927 876.314441
4 186787 432.188616 161261 401.573156 33523 183.092873 161779 402.217603 543350 737.122785

In [38]:
fig, ax = plt.subplots()
for composition in comp_list:
    ax.plot(unfolding_df['counts_{}'.format(composition)], color=color_dict[composition])
ax.set_yscale("log", nonposy='clip')
ax.grid()
plt.show()


Spectrum

[ back to top ]

Response matrix


In [39]:
test_predictions = pipeline.predict(df_sim_test[feature_list])
true_comp = df_sim_test['comp_group_{}'.format(num_groups)].values
pred_comp = np.array(comp.composition_encoding.decode_composition_groups(test_predictions,
                                                                         num_groups=num_groups))

In [40]:
true_comp


Out[40]:
array(['O16Nucleus', 'Fe56Nucleus', 'PPlus', ..., 'He4Nucleus',
       'He4Nucleus', 'Fe56Nucleus'], dtype=object)

In [52]:
true_ebin_idxs = np.digitize(log_true_energy_sim_test, energybins.log_energy_bins) - 1
reco_ebin_idxs = np.digitize(log_reco_energy_sim_test, energybins.log_energy_bins) - 1
num_ebins = len(energybins.log_energy_midpoints)

# Build the response matrix one energy-bin pair at a time: for each
# (true, reco) energy bin pair, the (num_groups x num_groups) composition
# confusion matrix forms one block. Underflow (-1) and overflow (num_ebins)
# digitize indices fall outside range(num_ebins) and are excluded.
hstack_list = []
for true_ebin_idx in range(num_ebins):
    true_ebin_mask = true_ebin_idxs == true_ebin_idx

    vstack_list = []
    for reco_ebin_idx in range(num_ebins):
        reco_ebin_mask = reco_ebin_idxs == reco_ebin_idx

        combined_mask = true_ebin_mask & reco_ebin_mask
        if combined_mask.sum() == 0:
            response_mat = np.zeros((num_groups, num_groups), dtype=int)
        else:
            response_mat = confusion_matrix(true_comp[combined_mask],
                                            pred_comp[combined_mask],
                                            labels=comp_list)
        # Transpose response matrix to get MC comp on x-axis and reco comp on y-axis
        response_mat = response_mat.T
        vstack_list.append(response_mat)
    hstack_list.append(np.vstack(vstack_list))

# Resulting shape: (num_ebins * num_groups, num_ebins * num_groups) = (76, 76)
res = np.hstack(hstack_list)
res_err = np.sqrt(res)

In [53]:
res.shape


Out[53]:
(76, 76)

In [54]:
plt.imshow(res, origin='lower')


Out[54]:
<matplotlib.image.AxesImage at 0x7fbfbca31e90>

In [60]:
from itertools import product
num_groups = len(comp_list)
num_ebins = len(energybins.log_energy_midpoints)

e_bin_iter = product(range(num_ebins), range(num_ebins))
res2 = np.zeros((num_ebins * num_groups, num_ebins * num_groups), dtype=int)
for true_ebin_idx, reco_ebin_idx in e_bin_iter:
    true_ebin_mask = true_ebin_idxs == true_ebin_idx
    reco_ebin_mask = reco_ebin_idxs == reco_ebin_idx
    ebin_mask = true_ebin_mask & reco_ebin_mask
    if ebin_mask.sum() == 0:
        continue
    response_mat = confusion_matrix(true_comp[ebin_mask],
                                    pred_comp[ebin_mask],
                                    labels=comp_list)
    # Transpose response matrix to get MC comp on x-axis
    # and reco comp on y-axis
    response_mat = response_mat.T

    res2[num_groups * reco_ebin_idx : num_groups * (reco_ebin_idx + 1),
         num_groups * true_ebin_idx : num_groups * (true_ebin_idx + 1)] = response_mat

In [61]:
plt.imshow(res2, origin='lower')


Out[61]:
<matplotlib.image.AxesImage at 0x7fbfbca8aa90>

In [62]:
np.testing.assert_array_equal(res2, res)

In [68]:
reco_ebin_idx = 4
true_ebin_idx = 4
plt.imshow(res2[num_groups * reco_ebin_idx : num_groups * (reco_ebin_idx + 1),
                num_groups * true_ebin_idx : num_groups * (true_ebin_idx + 1)],
           origin='lower')


Out[68]:
<matplotlib.image.AxesImage at 0x7fbfbcb43c10>

Normalize response matrix column-wise (i.e. $P(E|C)$)
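Each column of the raw response matrix is scaled so that it sums to the corresponding detection efficiency: with column sums $N_j = \sum_i R_{ij}$, the normalization factor is $N_j / \epsilon_j$, so that $\sum_i R^{norm}_{ij} = \epsilon_j$ (this is exactly what the assert a few cells below checks). A minimal numpy sketch of the same operation, ignoring error propagation and empty columns:

col_sums = res.sum(axis=0)
res_normalized_sketch = res * (efficiencies / col_sums)  # scales each column j by eff_j / N_j
np.testing.assert_allclose(res_normalized_sketch.sum(axis=0), efficiencies)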


In [131]:
res_col_sum = res.sum(axis=0)
res_col_sum_err = np.array([np.sqrt(np.nansum(res_err[:, i]**2)) for i in range(res_err.shape[1])])

normalizations, normalizations_err = comp.analysis.ratio_error(res_col_sum, res_col_sum_err,
                                                               efficiencies, efficiencies_err,
                                                               nan_to_num=True)

res_normalized, res_normalized_err = comp.analysis.ratio_error(res, res_err,
                                                               normalizations, normalizations_err,
                                                               nan_to_num=True)

In [132]:
res_normalized = np.nan_to_num(res_normalized)
res_normalized_err = np.nan_to_num(res_normalized_err)

In [133]:
np.testing.assert_allclose(res_normalized.sum(axis=0), efficiencies)

In [134]:
res


Out[134]:
array([[228,  84,  85, ...,   0,   0,   0],
       [  5,   2,   4, ...,   0,   0,   0],
       [ 10,  31,  44, ...,   0,   0,   0],
       ..., 
       [  0,   0,   0, ..., 213,  42,   9],
       [  0,   0,   0, ...,  43,  50,  29],
       [  0,   0,   0, ...,  17,  53, 110]])

In [135]:
fig, ax = plt.subplots()
# Composition confusion matrix for a single (true, reco) energy bin;
# derive the title bounds from energybins so they match the selected bin
ebin_idx = 4
idx = ebin_idx * num_groups
ebin_min = energybins.log_energy_bins[ebin_idx]
ebin_max = energybins.log_energy_bins[ebin_idx + 1]
sns.heatmap(res[idx:idx+num_groups, idx:idx+num_groups], annot=True, fmt='d', ax=ax, square=True,
            xticklabels=comp_list, yticklabels=comp_list,
            cbar_kws={'label': 'Counts'}, vmin=0, cmap='viridis')
ax.invert_yaxis()
plt.xlabel('True composition')
plt.ylabel('Pred composition')
plt.title('$\mathrm{{{:0.1f} < \log_{{10}}(E_{{true}}/GeV) < {:0.1f}}}$'.format(ebin_min, ebin_max)
          + '\n$\mathrm{{{:0.1f} < \log_{{10}}(E_{{reco}}/GeV) < {:0.1f}}}$'.format(ebin_min, ebin_max))
# res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-matrix-single-energy-bin.png')
# comp.check_output_dir(res_mat_outfile)
# plt.savefig(res_mat_outfile)
plt.show()



In [136]:
plt.imshow(res, origin='lower', cmap='viridis')
# Diagonal guide line
plt.plot([0, res.shape[0]-1], [0, res.shape[1]-1], marker='None', ls=':', color='C1')

# Grid lines separating the composition blocks
# (step by num_groups so each block spans one energy bin)
for i in np.arange(0, res.shape[0], num_groups):
    plt.axvline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
    plt.axhline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')

plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')

plt.colorbar(label='Counts')

res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-statistics.png')
comp.check_output_dir(res_mat_outfile)
# plt.savefig(res_mat_outfile)
plt.show()



In [137]:
plt.imshow(np.sqrt(res), origin='lower', cmap='viridis')
plt.plot([0, res.shape[0]-1], [0, res.shape[1]-1], marker='None', ls=':', color='C1')

# Grid lines separating the composition blocks
for i in np.arange(0, res.shape[0], num_groups):
    plt.axvline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
    plt.axhline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
    
plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')

plt.colorbar(label='Count errors', format='%d')

res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-statistics-err.png')
comp.check_output_dir(res_mat_outfile)
# plt.savefig(res_mat_outfile)
plt.show()



In [138]:
plt.imshow(res_normalized, origin='lower', cmap='viridis')
plt.plot([0, res.shape[0]-1], [0, res.shape[1]-1], marker='None', ls=':', color='C1')

plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')
plt.title('Response matrix')

plt.colorbar(label='$\mathrm{P(E_i|C_{\mu})}$')

res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', config, 'response_matrix',
                               'response-matrix_{}-groups.png'.format(num_groups))
comp.check_output_dir(res_mat_outfile)
plt.savefig(res_mat_outfile)
plt.show()



In [139]:
plt.imshow(res_normalized_err, origin='lower', cmap='viridis')
plt.plot([0, res.shape[0]-1], [0, res.shape[1]-1], marker='None', ls=':', color='C1')

plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')
plt.title('Response matrix error')

plt.colorbar(label='$\mathrm{\delta P(E_i|C_{\mu})}$')

res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-matrix-err.png')
comp.check_output_dir(res_mat_outfile)
# plt.savefig(res_mat_outfile)
plt.show()



In [140]:
res_mat_outfile = os.path.join(comp.paths.comp_data_dir, config, 'unfolding', 
                               'response_{}-groups.txt'.format(num_groups))
res_mat_err_outfile = os.path.join(comp.paths.comp_data_dir, config, 'unfolding', 
                                   'response_err_{}-groups.txt'.format(num_groups))

comp.check_output_dir(res_mat_outfile)
comp.check_output_dir(res_mat_err_outfile)

np.savetxt(res_mat_outfile, res_normalized)
np.savetxt(res_mat_err_outfile, res_normalized_err)

Priors array


In [141]:
from icecube.weighting.weighting import from_simprod, PDGCode, ParticleType
from icecube.weighting.fluxes import GaisserH3a, GaisserH4a, Hoerandel5, Hoerandel_IT, CompiledFlux

In [142]:
df_sim = comp.load_sim(config=config, test_size=0, log_energy_min=6.0, log_energy_max=8.3)
df_sim.head()


Out[142]:
FractionContainment_Laputop_IceTop FractionContainment_Laputop_InIce FractionContainment_MCPrimary_IceTop FractionContainment_MCPrimary_InIce IceTopMaxSignal IceTopMaxSignalInEdge IceTopMaxSignalString IceTopNeighbourMaxSignal InIce_charge_1_60 MC_azimuth ... lap_log_energy lap_cos_zenith log_s50 log_s80 log_s125 log_s180 log_s250 log_s500 log_dEdX reco_log_energy
12360_9_58_0 0.599457 0.825440 0.603860 0.834725 915.565613 0 64 266.637909 72.200000 2.912543 ... 5.993534 0.970588 0.965718 0.472972 -0.018253 -0.436491 -0.826276 -1.689150 0.669241 6.017261
12360_10_14_0 0.672916 0.596048 0.675481 0.649017 28.609421 0 29 12.178488 180.050000 5.673565 ... 6.033214 0.932002 1.058991 0.515847 -0.023232 -0.480570 -0.905580 -1.842779 0.733548 6.047088
12360_10_22_0 0.537961 0.760892 0.543832 0.771945 101.481819 0 38 68.172859 244.275001 5.673565 ... 6.041893 0.925519 1.140504 0.560367 -0.013838 -0.499876 -0.950742 -1.942496 0.758869 6.054328
12360_10_39_0 0.676321 0.597996 0.674722 0.655864 20.867756 0 20 10.946673 165.950000 5.673565 ... 6.054139 0.930699 1.007560 0.502415 -0.000583 -0.428441 -0.826892 -1.708051 0.363861 6.040946
12360_10_55_0 0.255498 0.933506 0.248732 0.951277 37.628693 0 80 21.158876 27.025000 5.673565 ... 6.001900 0.926979 1.077422 0.507438 -0.057127 -0.535289 -0.979059 -1.955840 0.663036 6.029730

5 rows × 75 columns


In [143]:
p = PDGCode().values
pdg_codes = np.array([2212, 1000020040, 1000080160, 1000260560])
particle_names = [p[pdg_code].name for pdg_code in pdg_codes]

In [144]:
particle_names


Out[144]:
['PPlus', 'He4Nucleus', 'O16Nucleus', 'Fe56Nucleus']

In [145]:
group_names = np.array(comp.composition_encoding.composition_group_labels(particle_names, num_groups=num_groups))
group_names


Out[145]:
array(['light', 'light', 'intermediate', 'heavy'], 
      dtype='|S12')

In [146]:
comp_to_pdg_list = {composition: pdg_codes[group_names == composition] for composition in comp_list}

In [147]:
comp_to_pdg_list


Out[147]:
{'heavy': array([1000260560]),
 'intermediate': array([1000080160]),
 'light': array([      2212, 1000020040])}

In [148]:
# Replace O16Nucleus with N14Nucleus + Al27Nucleus, since the flux models
# below are parameterized in terms of N14 (CNO) and Al27 (MgSi) mass groups
# rather than oxygen
for composition, pdg_list in comp_to_pdg_list.iteritems():
    if 1000080160 in pdg_list:
        pdg_list = pdg_list[pdg_list != 1000080160]
        comp_to_pdg_list[composition] = np.append(pdg_list, [1000070140, 1000130270])

In [149]:
comp_to_pdg_list


Out[149]:
{'heavy': array([1000260560]),
 'intermediate': array([1000070140, 1000130270]),
 'light': array([      2212, 1000020040])}

In [150]:
priors_list = ['H3a', 'H4a', 'Polygonato']


In [152]:
fig, ax = plt.subplots()
for flux, name, marker in zip([GaisserH3a(), GaisserH4a(), Hoerandel5()],
                              priors_list,
                              '.^*o'):
    for composition in comp_list:
        comp_flux = []
        for energy_mid in energybins.energy_midpoints:
            flux_energy_mid = flux(energy_mid, comp_to_pdg_list[composition]).sum()
            comp_flux.append(flux_energy_mid)
        # Model flux for this composition group; converted to a prior
        # probability later, in the PyUnfold formatting step
        comp_flux = np.asarray(comp_flux)
        prior_key = '{}_flux_{}'.format(name, composition)
        unfolding_df[prior_key] = comp_flux
        
        # Plot result
        ax.plot(energybins.log_energy_midpoints, energybins.energy_midpoints**2.7*comp_flux,
                color=color_dict[composition], alpha=0.75, marker=marker, ls=':',
                label='{} ({})'.format(name, composition))
ax.set_yscale("log", nonposy='clip')
ax.set_xlabel('$\mathrm{\log_{10}(E/GeV)}$')
ax.set_ylabel('$\mathrm{ E^{2.7} \ J(E) \ [GeV^{1.7} m^{-2} sr^{-1} s^{-1}]}$')
ax.grid()
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)
priors_outfile = os.path.join(comp.paths.figures_dir, 'unfolding',
                              'priors_flux_{}-groups.png'.format(num_groups))
comp.check_output_dir(priors_outfile)
plt.savefig(priors_outfile)
plt.show()

unfolding_df.head()


Out[152]:
counts_light counts_light_err counts_intermediate counts_intermediate_err counts_heavy counts_heavy_err counts_total counts_total_err H3a_flux_light H3a_flux_intermediate H3a_flux_heavy H4a_flux_light H4a_flux_intermediate H4a_flux_heavy Polygonato_flux_light Polygonato_flux_intermediate Polygonato_flux_heavy
log_energy_bin_idx
0 1457721 1207.361172 43116 207.643926 294313 542.506221 1795150 1339.832079 7.187420e-13 2.431070e-13 1.651639e-13 7.333020e-13 2.391451e-13 1.631829e-13 4.929408e-13 2.394229e-13 1.626849e-13
1 1132154 1064.027255 24671 157.070048 425020 651.935580 1581845 1257.714196 3.743101e-13 1.326787e-13 9.061387e-14 3.821589e-13 1.303989e-13 8.947391e-14 2.569923e-13 1.291849e-13 8.980226e-14
2 808442 899.134028 21007 144.937918 296194 544.237081 1125643 1060.963242 1.925426e-13 7.229419e-14 4.969588e-14 1.967681e-13 7.098235e-14 4.903990e-14 1.318928e-13 6.966331e-14 4.956720e-14
3 566089 752.388862 15537 124.647503 175464 418.884232 757090 870.109189 9.758423e-14 3.930950e-14 2.724088e-14 9.985590e-14 3.855464e-14 2.686341e-14 6.629980e-14 3.753236e-14 2.735588e-14
4 391163 625.430252 13319 115.407972 130735 361.572953 535217 731.585265 4.859649e-14 2.131730e-14 1.492133e-14 4.981589e-14 2.088294e-14 1.470413e-14 3.247099e-14 2.019331e-14 1.509482e-14


Formatting for PyUnfold use
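PyUnfold expects the prior as a probability distribution over the flattened cause bins, so each model flux is normalized to unit sum over all (energy bin, composition) cells; that is what the '*_prior' columns below contain. Schematically, with toy numbers:

flux_flat = np.array([7.2e-13, 2.4e-13, 1.7e-13, 3.7e-13])  # toy flux values, one per flattened bin
prior = flux_flat / flux_flat.sum()
np.testing.assert_allclose(prior.sum(), 1.0)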


In [154]:
formatted_df = pd.DataFrame()

In [155]:
# Flatten counts and priors to the same (energy bin, composition) ordering
# used for the efficiencies above
counts_formatted = []
priors_formatted = defaultdict(list)
for index, row in unfolding_df.iterrows():
    for composition in comp_list:
        counts_formatted.append(row['counts_{}'.format(composition)])
        for priors_name in priors_list:
            priors_formatted[priors_name].append(row['{}_flux_{}'.format(priors_name, composition)])
        
formatted_df['counts'] = counts_formatted
formatted_df['counts_err'] = np.sqrt(counts_formatted)

formatted_df['efficiencies'] = efficiencies
formatted_df['efficiencies_err'] = efficiencies_err


for key, value in priors_formatted.iteritems():
    formatted_df[key+'_flux'] = value
    formatted_df[key+'_prior'] = formatted_df[key+'_flux'] / formatted_df[key+'_flux'].sum()

formatted_df.index.rename('log_energy_bin_idx', inplace=True)

In [156]:
formatted_df.head()


Out[156]:
counts counts_err efficiencies efficiencies_err H3a_flux H3a_prior Polygonato_flux Polygonato_prior H4a_flux H4a_prior
log_energy_bin_idx
0 1457721.0 1207.361172 0.007109 0.000073 7.187420e-13 0.302586 4.929408e-13 0.260752 7.333020e-13 0.306455
1 43116.0 207.643926 0.006655 0.000100 2.431070e-13 0.102347 2.394229e-13 0.126648 2.391451e-13 0.099941
2 294313.0 542.506221 0.006151 0.000119 1.651639e-13 0.069533 1.626849e-13 0.086056 1.631829e-13 0.068196
3 1132154.0 1064.027255 0.007920 0.000066 3.743101e-13 0.157582 2.569923e-13 0.135942 3.821589e-13 0.159708
4 24671.0 157.070048 0.007525 0.000089 1.326787e-13 0.055857 1.291849e-13 0.068335 1.303989e-13 0.054495

In [157]:
prior_sums = formatted_df[[col for col in formatted_df.columns if 'prior' in col]].sum()
np.testing.assert_allclose(prior_sums, np.ones_like(prior_sums))

Save formatted DataFrame to disk


In [158]:
formatted_df_outfile = os.path.join(comp.paths.comp_data_dir, config, 'unfolding', 
                                    'unfolding-df_{}-groups.hdf'.format(num_groups))
comp.check_output_dir(formatted_df_outfile)
formatted_df.to_hdf(formatted_df_outfile, 'dataframe', format='table')
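The saved table can be read back with pandas when setting up the PyUnfold run, e.g.:

df_unfolding = pd.read_hdf(formatted_df_outfile)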
