In [1]:
%load_ext watermark
%watermark -u -d -v -p numpy,matplotlib,scipy,pandas,sklearn,mlxtend


last updated: 2017-09-25 

CPython 2.7.13
IPython 5.3.0

numpy 1.12.1
matplotlib 2.0.2
scipy 0.19.0
pandas 0.20.1
sklearn 0.18.1
mlxtend 0.7.0

In [2]:
%matplotlib inline
from __future__ import division, print_function
import os
from collections import defaultdict
import numpy as np
from scipy.sparse import block_diag
import pandas as pd
import matplotlib.pyplot as plt
import seaborn.apionly as sns
import json
from scipy.interpolate import UnivariateSpline

from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, KFold, StratifiedKFold

import comptools as comp
import comptools.analysis.plotting as plotting

color_dict = comp.analysis.get_color_dict()


/home/jbourbeau/.virtualenvs/composition/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
Welcome to JupyROOT 6.09/02

Define analysis free parameters


Whether to train on the grouped 'light'/'heavy' composition classes or on the four individual compositions ('P', 'He', 'O', 'Fe')


In [3]:
comp_class = True
comp_list = ['light', 'heavy'] if comp_class else ['P', 'He', 'O', 'Fe']

Define energy binning for this analysis


In [4]:
energybins = comp.analysis.get_energybins()
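
A minimal sketch of the binning object assumed throughout, inferred from the outputs later in this notebook (16 bins of width 0.1 spanning 6.4 to 8.0 in $\mathrm{\log_{10}(E/GeV)}$); the real implementation lives in comp.analysis:

# Hypothetical sketch of the attributes used below, not the actual implementation
log_energy_min, log_energy_max = 6.4, 8.0
log_energy_bins = np.arange(log_energy_min, log_energy_max + 0.05, 0.1)  # 17 edges
log_energy_midpoints = (log_energy_bins[1:] + log_energy_bins[:-1]) / 2
energy_midpoints = 10**log_energy_midpoints  # e.g. 10**6.45 ~ 2.818e+06 GeV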

Data preprocessing


  1. Load simulation/data dataframe and apply specified quality cuts
  2. Extract desired features from dataframe
  3. Get separate testing and training datasets
  4. Feature transformation

In [5]:
df_sim_train, df_sim_test = comp.load_sim(config='IC86.2012', log_energy_min=6.0, log_energy_max=8.3)

In [6]:
# 'lap_log_energy' is the Laputop-reconstructed energy estimate;
# 'MC_log_energy' is the true (thrown) energy
log_energy_sim_test = df_sim_test['lap_log_energy']
log_reco_energy_sim_test = df_sim_test['lap_log_energy']
log_true_energy_sim_test = df_sim_test['MC_log_energy']

In [7]:
feature_list, feature_labels = comp.analysis.get_training_features()

Get composition classifier pipeline

In [8]:
pipeline_str = 'BDT'
pipeline = comp.get_pipeline(pipeline_str)

In [9]:
pipeline


Out[9]:
Pipeline(steps=[('classifier', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=2,
              subsample=1.0, verbose=0, warm_start=False))])

In [10]:
pipeline.fit(df_sim_train[feature_list], df_sim_train['target'])


Out[10]:
Pipeline(steps=[('classifier', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=2,
              subsample=1.0, verbose=0, warm_start=False))])

Load detection efficiency arrays


In [13]:
efficiencies = {}
efficiencies_err = {}
for composition in comp_list:
    eff_path = os.path.join(comp.paths.comp_data_dir, 'unfolding',
                            'efficiency-{}.npy'.format(composition))
    efficiencies[composition] = np.load(eff_path)
    eff_err_path = os.path.join(comp.paths.comp_data_dir, 'unfolding',
                                'efficiency-err-{}.npy'.format(composition))
    efficiencies_err[composition] = np.load(eff_err_path)

Plot efficiencies


In [16]:
fig, ax = plt.subplots()
for composition in comp_list:
    ax.errorbar(energybins.log_energy_midpoints, efficiencies[composition],
                yerr=efficiencies_err[composition], color=color_dict[composition],
                label=composition, marker='.')
ax.set_xlabel('$\mathrm{\log_{10}(E_{true}/GeV)}$')
ax.set_ylabel('Detection efficiency \n($\mathrm{N_{passed}/N_{thrown}}$)')
ax.grid()
ax.ticklabel_format(style='sci',axis='y')
ax.yaxis.major.formatter.set_powerlimits((0,0))
plt.show()



In [43]:
det_efficiencies = np.empty(len(energybins.log_energy_midpoints)*2)
det_efficiencies[::2] = efficiencies['light']
det_efficiencies[1::2] = efficiencies['heavy']

In [44]:
det_efficiencies_err = np.empty_like(det_efficiencies)
det_efficiencies_err[::2] = efficiencies_err['light']
det_efficiencies_err[1::2] = efficiencies_err['heavy']
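
The unfolding inputs use a single flat array with the two compositions interleaved within each energy bin, i.e. [light bin 0, heavy bin 0, light bin 1, heavy bin 1, ...]. A toy illustration of the slice assignment with hypothetical values:

light = np.array([0.9, 0.8])  # hypothetical per-bin efficiencies
heavy = np.array([0.5, 0.4])
flat = np.empty(2 * len(light))
flat[::2], flat[1::2] = light, heavy
print(flat)  # [ 0.9  0.5  0.8  0.4] -> compositions alternate within each bin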

In [11]:
df_sim = comp.load_sim(config='IC86.2012', test_size=0)

In [12]:
df_sim.lap_cos_zenith.min(), df_sim.lap_cos_zenith.max()


Out[12]:
(0.8071662437908339, 0.99999988925917194)

In [13]:
(df_sim.lap_cos_zenith.max() + df_sim.lap_cos_zenith.min())/2


Out[13]:
0.90358306652500286

In [14]:
counts_MC_energy_bins = np.histogram(df_sim.MC_log_energy, bins=energybins.log_energy_bins)[0]
counts_MC_energy_bins


Out[14]:
array([5316, 5202, 5595, 5897, 5508, 5518, 2353, 2319, 2287, 2385, 2261,
       2490, 2469, 2423, 2677, 2396])

In [15]:
counts_reco_energy_bins = np.histogram(df_sim.lap_log_energy, bins=energybins.log_energy_bins)[0]
counts_reco_energy_bins


Out[15]:
array([4839, 4995, 5488, 5481, 5135, 4197, 2447, 2253, 2317, 2389, 2316,
       2458, 2475, 2535, 2443, 1729])

In [16]:
counts_reco_energy_bins / counts_MC_energy_bins


Out[16]:
array([ 0.91027088,  0.96020761,  0.98087578,  0.92945566,  0.93228032,
        0.76060167,  1.039949  ,  0.97153946,  1.01311762,  1.00167715,
        1.02432552,  0.98714859,  1.00243013,  1.04622369,  0.91258872,
        0.72161937])

In [17]:
fig, ax = plt.subplots()
plotting.plot_steps(energybins.log_energy_bins, counts_MC_energy_bins, ax=ax)
ax.set_xlabel('$\mathrm{\log_{10}(E_{MC}/GeV)}$')
ax.set_ylabel('\# true events')
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.grid()
plt.show()


/home/jbourbeau/.virtualenvs/composition/lib/python2.7/site-packages/matplotlib/figure.py:1743: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "

In [18]:
fig, ax = plt.subplots()
plotting.plot_steps(energybins.log_energy_bins, counts_reco_energy_bins, ax=ax)
ax.set_xlabel('$\mathrm{\log_{10}(E_{reco}/GeV)}$')
ax.set_ylabel('\# reco events')
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.grid()
plt.show()



In [19]:
fig, ax = plt.subplots()
plotting.plot_steps(energybins.log_energy_bins, counts_reco_energy_bins / counts_MC_energy_bins, ax=ax)
ax.axhline(1.0, marker='None', ls='-.', color='k')
ax.set_xlabel('$\mathrm{\log_{10}(E/GeV)}$')
ax.set_ylabel('\# reco events / \# true events')
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.grid()
plt.show()



In [20]:
def run_to_energy_bin(run):
    """Map a simulation run number to its thrown log10(E/GeV) bin."""
    ebin_first = 5.0
    ebin_last = 7.9
    # Runs cycle through the 30 thrown-energy bins between ebin_first and
    # ebin_last (steps of 0.1 in log10(E/GeV)).
    # Taken from http://simprod.icecube.wisc.edu/cgi-bin/simulation/cgi/cfg?dataset=12360
    return (ebin_first*10 + (run-1) % (ebin_last*10 - ebin_first*10 + 1)) / 10
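
The formula cycles runs through the 30 thrown-energy bins, wrapping around every 30 runs:

for run in [1, 2, 30, 31]:
    print(run, run_to_energy_bin(run))
# 1 5.0
# 2 5.1
# 30 7.9
# 31 5.0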

In [21]:
def thrown_showers_per_ebin(sim_list, log_energy_bins=None):
    # Determine the thrown-energy bin of every file in each dataset
    e_bins = []
    for sim in sim_list:
        print(sim)
        for f in comp.simfunctions.get_level3_sim_files_iterator(sim):
            # Parse the run number out of the file name (digits after 'Run')
            start_idx = f.find('Run')
            run = int(f[start_idx+3: start_idx+9])
            e_bin = run_to_energy_bin(run)
            e_bins.append(e_bin)

    if log_energy_bins is None:
        log_energy_bins = np.arange(5, 8.1, 0.1)
    # Number of files per thrown-energy bin
    vals = np.histogram(e_bins, bins=log_energy_bins)[0]

    # Each file corresponds to 100 thrown-shower resamples
    n_resamples = 100
    n_showers_per_file = n_resamples
    thrown_showers = vals * n_showers_per_file

    return thrown_showers

In [22]:
sim_list = [12360, 12362, 12630, 12631]
thrown_showers = thrown_showers_per_ebin(sim_list, log_energy_bins=energybins.log_energy_bins)


12360
12362
12630
12631

In [23]:
fig, ax = plt.subplots()
plotting.plot_steps(energybins.log_energy_bins, counts_reco_energy_bins / thrown_showers, ax=ax, 
                    label='Reco energy bins', color='C0', alpha=0.75)
plotting.plot_steps(energybins.log_energy_bins, counts_MC_energy_bins / thrown_showers, ax=ax, 
                    label='MC energy bins', color='C1', alpha=0.75)
# ax.axhline(1.0, marker='None', ls='-.', color='k')
ax.set_xlabel('$\mathrm{\log_{10}(E/GeV)}$')
ax.set_ylabel('\# passed events / \# thrown events')
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.set_ylim(0)
ax.grid()
ax.legend()
plt.show()



In [47]:
sim_dict = {'light': [12360, 12630],
            'heavy': [12362, 12631]}
efficiencies = np.empty(len(energybins.log_energy_midpoints)*2)
efficiencies_err = np.empty_like(efficiencies)
fig, ax = plt.subplots()
for composition, sim_list in sim_dict.items():
    comp_mask = df_sim.MC_comp_class == composition
    # Get number of thrown showers in each energy bin
    thrown_showers = thrown_showers_per_ebin(sim_list, log_energy_bins=energybins.log_energy_bins)
    # Get number of showers in each energy bin that pass cuts
    counts_MC_energy_bins = np.histogram(df_sim.MC_log_energy[comp_mask],
                                         bins=energybins.log_energy_bins)[0].astype(float)
    
#     # (Maybe??) Scale larger thrown radius up by area ratio
    large_thrown_radius = energybins.log_energy_midpoints > 7
#     counts_MC_energy_bins[large_thrown_radius] *= (1700/1100)**2 

    # Calculate detection efficiency and add them to normalizations array
    eff, eff_err = comp.ratio_error(counts_MC_energy_bins, np.sqrt(counts_MC_energy_bins),
                                    thrown_showers, np.sqrt(thrown_showers))
    
    start_idx = 0 if composition == 'light' else 1
    efficiencies[start_idx::2] = eff
    efficiencies_err[start_idx::2] = eff_err
    # Plot detection efficiencies
    plotting.plot_steps(energybins.log_energy_bins, eff, yerr=eff_err, ax=ax, 
                        label=composition, color=color_dict[composition], alpha=0.75)
    
    spl = UnivariateSpline(energybins.log_energy_midpoints[~large_thrown_radius],
                           eff[~large_thrown_radius], s=5)
    ax.plot(energybins.log_energy_midpoints[~large_thrown_radius],
            spl(energybins.log_energy_midpoints[~large_thrown_radius]), color=color_dict[composition])
    
    spl = UnivariateSpline(energybins.log_energy_midpoints[large_thrown_radius],
                           eff[large_thrown_radius], s=5)
    ax.plot(energybins.log_energy_midpoints[large_thrown_radius],
            spl(energybins.log_energy_midpoints[large_thrown_radius]), color=color_dict[composition])

# ax.axhline(1.0, marker='None', ls='-.', color='k')
ax.set_xlabel('$\mathrm{\log_{10}(E_{MC}/GeV)}$')
ax.set_ylabel('Detection efficiency \n($\mathrm{N_{passed}/N_{thrown}}$)')
# ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.set_ylim(0)
plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
ax.grid()
ax.legend()
efficiency_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'iter-bayesian',
                                  'detection-efficiency.png')
comp.check_output_dir(efficiency_outfile)
plt.savefig(efficiency_outfile)
plt.show()


12362
12631
12360
12630

In [44]:
# sim_dict = {'light': [12360, 12630],
#             'heavy': [12362, 12631]}
# ebins = np.arange(5.0, 8.1, 0.1)
# emidpoints = (ebins[1:] + ebins[:-1]) / 2
# efficiencies = np.empty((len(ebins)-1)*2)
# efficiencies_err = np.empty((len(ebins)-1)*2)
# # efficiencies_err = np.empty(len(energybins.log_energy_midpoints)*2)
# fig, ax = plt.subplots()
# for composition, sim_list in sim_dict.items():
#     comp_mask = df_sim.MC_comp_class == composition
#     # Get number of thrown showers in each energy bin
#     thrown_showers = thrown_showers_per_ebin(sim_list, log_energy_bins=ebins)
# #     thrown_showers = thrown_showers_per_ebin(sim_list, log_energy_bins=energybins.log_energy_bins)
#     # Get number of showers in each energy bin that pass cuts
#     counts_MC_energy_bins = np.histogram(df_sim.MC_log_energy[comp_mask],
#                                          bins=ebins)[0].astype(float)
    
# #     # (Maybe??) Scale larger thrown radius up by area ratio
#     large_thrown_radius = emidpoints > 7
# #     counts_MC_energy_bins[large_thrown_radius] *= (1700/1100)**2 

#     # Calculate detection efficiency and add them to normalizations array
#     eff, eff_err = comp.ratio_error(counts_MC_energy_bins, np.sqrt(counts_MC_energy_bins),
#                                     thrown_showers, np.sqrt(thrown_showers))
    
#     start_idx = 0 if composition == 'light' else 1
#     efficiencies[start_idx::2] = eff
#     efficiencies_err[start_idx::2] = eff_err
#     # Plot detection efficiencies
#     plotting.plot_steps(ebins, eff, yerr=eff_err, ax=ax, 
#                         label=composition, color=color_dict[composition], alpha=0.75)
    
#     spl = UnivariateSpline(emidpoints[~large_thrown_radius], eff[~large_thrown_radius], s=5)
#     ax.plot(emidpoints[~large_thrown_radius], spl(emidpoints[[~large_thrown_radius]]), color=color_dict[composition])

# # ax.axhline(1.0, marker='None', ls='-.', color='k')
# ax.set_xlabel('$\mathrm{\log_{10}(E_{MC}/GeV)}$')
# ax.set_ylabel('Detection efficiency \n($\mathrm{N_{passed}/N_{thrown}}$)')
# # ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
# ax.set_ylim(0)
# plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
# ax.grid()
# ax.legend()
# efficiency_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'iter-bayesian',
#                                   'detection-efficiency.png')
# comp.check_output_dir(efficiency_outfile)
# plt.savefig(efficiency_outfile)
# plt.show()

In [43]:
efficiencies, efficiencies_err


Out[43]:
(array([  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.51400454e-05,   7.55287009e-06,   2.25733634e-04,
          3.01659125e-05,   5.26711813e-04,   1.43072289e-04,
          1.86274510e-03,   5.65184627e-04,   4.79150872e-03,
          1.89006024e-03,   1.11640212e-02,   5.20030234e-03,
          2.00228137e-02,   1.26797880e-02,   1.30990174e-02,
          1.04147813e-02,   1.71493213e-02,   1.54731264e-02,
          1.89291101e-02,   1.74333587e-02,   2.02950076e-02,
          1.93030303e-02,   1.88855193e-02,   2.13368580e-02,
          2.06914894e-02,   1.88804265e-02,   2.28495034e-02,
          1.96974281e-02,   2.24408848e-02,   2.23863636e-02,
          2.17236143e-02,   2.00226929e-02,   2.07908745e-02,
          2.10909091e-02,   8.57580398e-03,   9.34090909e-03,
          9.33789954e-03,   8.25396825e-03,   8.27534039e-03,
          9.03787879e-03,   9.28679818e-03,   8.83561644e-03,
          8.10790274e-03,   9.07294833e-03,   9.47488584e-03,
          9.42467827e-03,   8.74147081e-03,   9.96969697e-03,
          9.00230238e-03,   9.54198473e-03,   1.06778370e-02,
          9.65177896e-03,   8.95579268e-03,   9.30640244e-03]),
 array([             nan,              nan,              nan,
                     nan,              nan,              nan,
          1.07057098e-05,   7.55289861e-06,   4.12177858e-05,
          1.50831838e-05,   6.29706800e-05,   3.28253822e-05,
          1.18633971e-04,   6.52803392e-05,   1.91052059e-04,
          1.19412233e-04,   2.92106491e-04,   1.98774505e-04,
          3.94098353e-04,   3.11774549e-04,   3.16712631e-04,
          2.81710582e-04,   3.62697083e-04,   3.44883046e-04,
          3.81386505e-04,   3.67545540e-04,   3.95768989e-04,
          3.86080119e-04,   3.81948460e-04,   4.05700687e-04,
          4.00603905e-04,   3.82767580e-04,   4.22546423e-04,
          3.89784618e-04,   4.18348047e-04,   4.16401548e-04,
          4.10525245e-04,   3.93052382e-04,   4.01737055e-04,
          4.03917704e-04,   2.57347591e-04,   2.67255434e-04,
          2.67821357e-04,   2.50805147e-04,   2.51227270e-04,
          2.62845185e-04,   2.66675179e-04,   2.60453950e-04,
          2.49218243e-04,   2.63759163e-04,   2.69796971e-04,
          2.68360639e-04,   2.58559081e-04,   2.76190150e-04,
          2.64028415e-04,   2.71172661e-04,   2.86692051e-04,
          2.71605201e-04,   2.62434645e-04,   2.67568825e-04]))

In [26]:
@np.vectorize
def get_sim_thrown_radius(log_energy):
    """Thrown (generation) radius in meters for a given log10(E/GeV)."""
    if log_energy <= 6:
        thrown_radius = 800.0
    elif (log_energy > 6) & (log_energy <= 7):
        thrown_radius = 1100.0
    elif (log_energy > 7) & (log_energy <= 8):
        thrown_radius = 1700.0
    else:
        raise ValueError('Invalid energy entered')
    return thrown_radius
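
Because of the @np.vectorize decorator, the function maps elementwise over arrays of log-energies:

print(get_sim_thrown_radius(np.array([5.5, 6.5, 7.5])))
# -> [  800.  1100.  1700.]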

In [27]:
thrown_radii = get_sim_thrown_radius(energybins.log_energy_midpoints)
thrown_area = np.pi * (thrown_radii**2)
geom_factor = (df_sim.lap_cos_zenith.max() + df_sim.lap_cos_zenith.min()) / 2  # midpoint of the cos(zenith) range
print(geom_factor)
fig, ax = plt.subplots()
plotting.plot_steps(energybins.log_energy_bins, thrown_area * geom_factor)
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.set_xlabel('$\mathrm{\log_{10}(E_{reco}/GeV)}$')
ax.set_ylabel('Thrown area [$\mathrm{m^2}$]')
plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
ax.grid()
thrown_area_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'iter-bayesian',
                                  'thrown-area.png')
comp.check_output_dir(thrown_area_outfile)
plt.savefig(thrown_area_outfile)
plt.show()


0.903583066525

In [29]:
sim_list = [12360, 12362, 12630, 12631]
thrown_showers = thrown_showers_per_ebin(sim_list, log_energy_bins=energybins.log_energy_bins)


12360
12362
12630
12631

In [31]:
passed_showers = np.histogram(df_sim.loc[:, 'MC_log_energy'], bins=energybins.log_energy_bins)[0]

In [32]:
efficiency, efficiency_err = comp.ratio_error(passed_showers, np.sqrt(passed_showers),
                                              thrown_showers, np.sqrt(thrown_showers))

In [33]:
geom_factor = (df_sim.lap_cos_zenith.max() + df_sim.lap_cos_zenith.min()) / 2

In [35]:
thrown_radii = get_sim_thrown_radius(energybins.log_energy_midpoints)
thrown_areas = np.pi * thrown_radii**2

In [36]:
eff_area = efficiency * thrown_areas * geom_factor
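
That is, the effective area is the detection efficiency times the thrown area, scaled by the average cos(zenith) projection factor computed above.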

In [41]:
plotting.plot_steps(energybins.log_energy_bins, eff_area)
plt.ylim(0)
plt.show()



In [18]:
df_data = comp.load_data(config='IC86.2012', log_energy_min=6.0, log_energy_max=8.3)

In [19]:
X_data = comp.dataframe_functions.dataframe_to_array(df_data, feature_list + ['lap_log_energy'])
log_energy_data = X_data[:,-1]
X_data = X_data[:,:-1]

In [20]:
# is_finite_mask = np.isfinite(data.X)
# not_finite_mask = np.logical_not(is_finite_mask)
# finite_data_mask = np.logical_not(np.any(not_finite_mask, axis=1))
# data = data[finite_data_mask]

In [21]:
data_predictions = pipeline.predict(X_data)

In [22]:
# Get composition masks
data_labels = np.array([comp.dataframe_functions.label_to_comp(pred) for pred in data_predictions])
data_light_mask = data_labels == 'light'
data_heavy_mask = data_labels == 'heavy'

In [23]:
# Get number of identified comp in each energy bin
df_flux = {}
comp_list = ['light', 'heavy']
for composition in comp_list:
    comp_mask = data_labels == composition
    df_flux['counts_' + composition] = np.histogram(log_energy_data[comp_mask],
                                            bins=energybins.log_energy_bins)[0]
    df_flux['counts_' + composition + '_err'] = np.sqrt(df_flux['counts_' + composition])

df_flux['counts_total'] = np.histogram(log_energy_data, bins=energybins.log_energy_bins)[0]
df_flux['counts_total_err'] = np.sqrt(df_flux['counts_total'])
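
Since every event is classified as either light or heavy, the per-bin counts should sum to the totals. A quick sanity check (hypothetical, not in the original run):

np.testing.assert_array_equal(df_flux['counts_light'] + df_flux['counts_heavy'],
                              df_flux['counts_total'])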

In [24]:
# Consolidate the counts (and errors) into a DataFrame for unfolding
unfolding_df = pd.DataFrame(df_flux)
comp_list = ['light', 'heavy']
for composition in comp_list:
    comp_mask = data_labels == composition
    unfolding_df['counts_' + composition] = np.histogram(log_energy_data[comp_mask],
                                            bins=energybins.log_energy_bins)[0]
    unfolding_df['counts_' + composition + '_err'] = np.sqrt(df_flux['counts_' + composition])

unfolding_df['counts_total'] = np.histogram(log_energy_data, bins=energybins.log_energy_bins)[0]
unfolding_df['counts_total_err'] = np.sqrt(unfolding_df['counts_total'])

In [25]:
unfolding_df.index.rename('log_energy_bin_idx', inplace=True)

In [26]:
unfolding_df


Out[26]:
counts_heavy counts_heavy_err counts_light counts_light_err counts_total counts_total_err
log_energy_bin_idx
0 225390 474.752567 368260 606.844296 593650 770.486859
1 143534 378.858813 258636 508.562681 402170 634.168747
2 98418 313.716432 165013 406.217922 263431 513.255297
3 69488 263.605766 99269 315.069834 168757 410.800438
4 40943 202.343767 64699 254.359981 105642 325.026153
5 30707 175.234129 34266 185.110778 64973 254.898019
6 19891 141.035457 20045 141.580366 39936 199.839936
7 12010 109.590146 12740 112.871608 24750 157.321327
8 8434 91.836812 7049 83.958323 15483 124.430704
9 5534 74.390860 4372 66.121101 9906 99.528890
10 3604 60.033324 2732 52.268537 6336 79.598995
11 2753 52.469038 1392 37.309516 4145 64.381674
12 1880 43.358967 721 26.851443 2601 51.000000
13 1163 34.102786 464 21.540659 1627 40.336088
14 758 27.531800 298 17.262677 1056 32.496154
15 558 23.622024 145 12.041595 703 26.514147

Spectrum


Number of events observed


In [53]:
# num_particles, num_particles_err = comp.analysis.get_num_particles(sim_train, data, pipeline, comp_list)

In [54]:
# unfolding_df['counts_light'] = num_particles['light']
# unfolding_df['counts_heavy'] = num_particles['heavy']
# unfolding_df['counts_err_light'] = num_particles_err['light']
# unfolding_df['counts_err_heavy'] = num_particles_err['heavy']

Block diagonal response matrix and error


In [46]:
# pipeline.fit(sim_train.X, sim_train.y)
test_predictions = pipeline.predict(df_sim_test[feature_list])
true_comp = df_sim_test['target'].apply(comp.dataframe_functions.label_to_comp)
pred_comp = pd.Series([comp.dataframe_functions.label_to_comp(i) for i in test_predictions])

In [47]:
# response_list = []
# response_err_list = []
# sim_bin_idxs = np.digitize(log_energy_sim_test, energybins.log_energy_bins) - 1
# energy_bin_idx = np.unique(sim_bin_idxs)
# # energy_bin_idx = energy_bin_idx[1:]
# print(energy_bin_idx)
# # print(energybins.energy_midpoints.shape)
# for bin_idx in energy_bin_idx:
#     if (bin_idx == -1) or (bin_idx == energybins.energy_midpoints.shape[0]):
#         continue
#     sim_bin_mask = sim_bin_idxs == bin_idx
#     response_mat = confusion_matrix(true_comp[sim_bin_mask], pred_comp[sim_bin_mask],
#                                     labels=comp_list)
#     # Transpose response matrix to get MC comp on x-axis and reco comp on y-axis
#     response_mat = response_mat.T
#     # Get response matrix statistical error
#     response_mat_err = np.sqrt(response_mat)
#     response_err_list.append(response_mat_err)
#     response_list.append(response_mat)
# block_response = block_diag(response_list).toarray()
# # block_response = np.flipud(block_response)

# # Normalize along MC comp axis to go from counts to probabilities
# block_response = block_response / block_response.sum(axis=0)
# print('block_response = \n{}'.format(block_response))
# block_response_err = block_diag(response_err_list).toarray()
# # block_response_err = np.flipud(block_response_err)
# block_response_err = block_response_err / block_response_err.sum(axis=0)
# print('block_response_err = \n{}'.format(block_response_err))

In [48]:
# res_mat_outfile = os.path.join(comp.paths.comp_data_dir, 'unfolding', 'block_response.txt')
# res_mat_err_outfile = os.path.join(comp.paths.comp_data_dir, 'unfolding', 'block_response_err.txt')

# comp.check_output_dir(res_mat_outfile)
# comp.check_output_dir(res_mat_err_outfile)

# np.savetxt(res_mat_outfile, block_response)
# np.savetxt(res_mat_err_outfile, block_response_err)

In [49]:
true_ebin_idxs = np.digitize(log_true_energy_sim_test, energybins.log_energy_bins) - 1
reco_ebin_idxs = np.digitize(log_reco_energy_sim_test, energybins.log_energy_bins) - 1
energy_bin_idx = np.unique(true_ebin_idxs)
print(energy_bin_idx)

hstack_list = []
for true_ebin_idx in energy_bin_idx:
    if (true_ebin_idx == -1) or (true_ebin_idx == energybins.energy_midpoints.shape[0]):
        continue
    true_ebin_mask = true_ebin_idxs == true_ebin_idx
    
    vstack_list = []
    for reco_ebin_idx in energy_bin_idx:
        if (reco_ebin_idx == -1) or (reco_ebin_idx == energybins.energy_midpoints.shape[0]):
            continue
        reco_ebin_mask = reco_ebin_idxs == reco_ebin_idx
        
        combined_mask = true_ebin_mask & reco_ebin_mask
        if combined_mask.sum() == 0:
            response_mat = np.zeros((2, 2), dtype=int)
        else:
            response_mat = confusion_matrix(true_comp[true_ebin_mask & reco_ebin_mask],
                                            pred_comp[true_ebin_mask & reco_ebin_mask],
                                            labels=comp_list)
        # Transpose response matrix to get MC comp on x-axis and reco comp on y-axis
        response_mat = response_mat.T
        vstack_list.append(response_mat)
    hstack_list.append(np.vstack(vstack_list))
    
res = np.hstack(hstack_list)
res_err = np.sqrt(res)


[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
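
Each 2×2 block above is a composition confusion matrix for one (true, reco) energy-bin pair; after the transpose, rows index the reconstructed composition and columns the true composition. Vertically stacking the reco bins and horizontally stacking the true bins yields a 32×32 matrix indexed as res[2*reco_ebin + reco_comp, 2*true_ebin + true_comp]. A toy version of the stacking order, with two energy bins and hypothetical blocks:

blocks = {(t, r): np.full((2, 2), 10*t + r, dtype=int)
          for t in range(2) for r in range(2)}
toy = np.hstack([np.vstack([blocks[(t, r)] for r in range(2)])
                 for t in range(2)])
print(toy[2:4, 0:2])  # (true bin 0, reco bin 1) block -> all 1s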

Normalize response matrix column-wise (i.e. $P(E|C)$)
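
Here comp.analysis.ratio_error propagates the statistical uncertainty of a ratio $r = a/b$ as $\delta r = |r|\sqrt{(\delta a/a)^2 + (\delta b/b)^2}$ (the expression visible in the RuntimeWarning below); nan_to_num=True presumably replaces the NaNs from empty bins with zeros.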


In [51]:
# res_normalized = res / res.sum(axis=0)
# # Error propagate the repsonse matrix error
# res_err_normalized = res_err / res.sum(axis=0)
# # res_err = res_err / res_err.sum(axis=0)

# res_col_sum_err = np.array([np.sqrt(np.sum(res_err[:, i]**2)) for i in range(res_err.shape[1])])
# res_normalized, res_normalized_err = comp.analysis.ratio_error(res, res_err,
#                                                                res.sum(axis=0), res_col_sum_err,
#                                                                nan_to_num=True)

res_col_sum = res.sum(axis=0)
res_col_sum_err = np.array([np.sqrt(np.sum(res_err[:, i]**2)) for i in range(res_err.shape[1])])

normalizations, normalizations_err = comp.analysis.ratio_error(res_col_sum, res_col_sum_err,
                                                               det_efficiencies, det_efficiencies_err)

res_normalized, res_normalized_err = comp.analysis.ratio_error(res, res_err,
                                                               normalizations, normalizations_err,
                                                               nan_to_num=True)


/home/jbourbeau/cr-composition/comptools/analysis/data_functions.py:27: RuntimeWarning: invalid value encountered in true_divide
  ratio_err = np.abs(ratio) * np.sqrt((num_err / num)**2 + (den_err / den)**2)

In [52]:
np.testing.assert_allclose(res_normalized.sum(axis=0), det_efficiencies)
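
That is, each column of the normalized response sums to the corresponding detection efficiency rather than to 1, so the efficiency correction is folded directly into the response matrix.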

In [53]:
res_normalized


Out[53]:
array([[  1.18431017e-02,   5.62066958e-03,   5.56357050e-03, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  4.02218547e-03,   1.20219877e-02,   3.11439001e-03, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  3.51941229e-03,   1.56129711e-03,   8.76867090e-03, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       ..., 
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          5.45735108e-03,   5.81493559e-04,   1.45389678e-03],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          5.12427331e-05,   3.75327661e-03,   1.05003656e-03],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          1.53728199e-04,   2.00879593e-03,   6.86562369e-03]])

In [54]:
res_normalized_err


Out[54]:
array([[  1.02490428e-03,   7.52242735e-04,   4.63488680e-04, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  5.19175382e-04,   1.23285146e-03,   3.29804059e-04, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  4.80567203e-04,   3.62889712e-04,   6.17159880e-04, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       ..., 
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          4.72975305e-04,   1.28074960e-04,   2.13050072e-04],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          3.63359916e-05,   3.77172195e-04,   1.77563884e-04],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          6.32873245e-05,   2.55789791e-04,   5.69318071e-04]])

In [108]:
# res_col_sum_err = np.array([np.sqrt(np.sum(res_err[:, i]**2)) for i in range(res_err.shape[1])])
# res_col_sum_err

In [109]:
# res.sum(axis=0)

In [110]:
# res1, res1_err = comp.analysis.ratio_error(res, res_err, res.sum(axis=0), res_col_sum_err)

In [111]:
# np.testing.assert_array_equal(res_normalized, res1)

In [112]:
# plt.plot(res.sum(axis=0))

In [55]:
fig, ax = plt.subplots()
# h = np.flipud(block_response)
sns.heatmap(res[24:26, 24:26], annot=True, fmt='d', ax=ax, square=True,
           xticklabels=['light', 'heavy'], yticklabels=['light', 'heavy'],
           cbar_kws={'label': 'Counts'}, vmin=0, cmap='viridis')
ax.invert_yaxis()
plt.xlabel('True composition')
plt.ylabel('Pred composition')
plt.title('$\mathrm{7.6 < \log_{10}(E_{true}/GeV) < 7.7}$' + '\n$\mathrm{7.6 < \log_{10}(E_{reco}/GeV) < 7.7}$')
res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-matrix-single-energy-bin.png')
comp.check_output_dir(res_mat_outfile)
plt.savefig(res_mat_outfile)
plt.show()



In [56]:
plt.imshow(res, origin='lower', cmap='viridis')

# ax = sns.heatmap(res, square=True, xticklabels=2, yticklabels=2, 
# ax = sns.heatmap(res, square=True, mask=res==0, xticklabels=2, yticklabels=2, 
#             cbar_kws={'label': 'Counts'})

plt.plot([0, res.shape[0]-1], [0, res.shape[1]-1], marker='None', ls=':', color='C1')

# ax.invert_yaxis()



for i in np.arange(0, res.shape[0], 2):
    plt.axvline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
# for i in np.arange(0, res.shape[0], 2):
#     plt.axvline(i+0.5, marker='None', ls=':', color='gray')
for i in np.arange(0, res.shape[0], 2):
    plt.axhline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
# for i in np.arange(0, res.shape[0], 2):
#     plt.axhline(i+0.5, marker='None', ls=':', color='gray')
    
plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')
# plt.grid()

# plt.xticks(np.arange(0.5, res.shape[0], 2),
#            ['{}'.format(i+1) for i in range(res.shape[0])], 
#            rotation='vertical')
# plt.yticks(np.arange(0.5, res.shape[0], 2),
#            ['{}'.format(i+1) for i in range(res.shape[0])])

plt.colorbar(label='Counts')

res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-statistics.png')
comp.check_output_dir(res_mat_outfile)
plt.savefig(res_mat_outfile)
plt.show()



In [57]:
plt.imshow(np.sqrt(res), origin='lower', cmap='viridis')
plt.plot([0, res.shape[0]-1], [0, res.shape[1]-1], marker='None', ls=':', color='C1')

for i in np.arange(0, res.shape[0], 2):
    plt.axvline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
for i in np.arange(0, res.shape[0], 2):
    plt.axhline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
    
plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')

plt.colorbar(label='Count errors', format='%d')

res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-statistics-err.png')
comp.check_output_dir(res_mat_outfile)
plt.savefig(res_mat_outfile)
plt.show()



In [58]:
plt.imshow(res_normalized, origin='lower', cmap='viridis')
plt.plot([0, res.shape[0]-1], [0, res.shape[1]-1], marker='None', ls=':', color='C1')

for i in np.arange(0, res.shape[0], 2):
    plt.axvline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
for i in np.arange(0, res.shape[0], 2):
    plt.axhline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
    
plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')
plt.title('Response matrix')

plt.colorbar(label='$\mathrm{P(E_i|C_{\mu})}$')

res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-matrix.png')
comp.check_output_dir(res_mat_outfile)
plt.savefig(res_mat_outfile)
plt.show()



In [59]:
plt.imshow(res_normalized_err, origin='lower', cmap='viridis')
plt.plot([0, res.shape[0]-1], [0, res.shape[1]-1], marker='None', ls=':', color='C1')

for i in np.arange(0, res.shape[0], 2):
    plt.axvline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
for i in np.arange(0, res.shape[0], 2):
    plt.axhline(i-0.5, marker='None', ls='-', lw=0.5, color='gray')
    
plt.xlabel('True bin')
plt.ylabel('Reconstructed bin')
plt.title('Response matrix error')

plt.colorbar(label='$\mathrm{\delta P(E_i|C_{\mu})}$')

res_mat_outfile = os.path.join(comp.paths.figures_dir, 'unfolding', 'response-matrix-err.png')
comp.check_output_dir(res_mat_outfile)
plt.savefig(res_mat_outfile)
plt.show()



In [61]:
res_mat_outfile = os.path.join(comp.paths.comp_data_dir, 'unfolding', 'response.txt')
res_mat_err_outfile = os.path.join(comp.paths.comp_data_dir, 'unfolding', 'response_err.txt')

comp.check_output_dir(res_mat_outfile)
comp.check_output_dir(res_mat_err_outfile)

np.savetxt(res_mat_outfile, res_normalized)
np.savetxt(res_mat_err_outfile, res_normalized_err)

Priors array


In [60]:
from icecube.weighting.weighting import from_simprod
from icecube.weighting.fluxes import GaisserH3a, GaisserH4a, Hoerandel5

In [61]:
df_sim = comp.load_sim(config='IC86.2012', test_size=0)
df_sim.head()


Out[61]:
FractionContainment_Laputop_IceTop FractionContainment_Laputop_InIce FractionContainment_MCPrimary_IceTop FractionContainment_MCPrimary_InIce IceTopMaxSignal IceTopMaxSignalInEdge IceTopMaxSignalString IceTopNeighbourMaxSignal InIce_charge_1_60 MC_azimuth ... log_s80 log_s125 log_s180 log_s250 log_s500 log_dEdX log_d4r_peak_energy log_d4r_peak_sigma target is_thinned
0 0.672916 0.596048 0.675481 0.649017 28.609421 0 29 12.178488 180.050000 5.673565 ... 0.515847 -0.023232 -0.480570 -0.905580 -1.842779 0.733548 1.251190 0.905364 0 False
1 0.537961 0.760892 0.543832 0.771945 101.481819 0 38 68.172859 244.275001 5.673565 ... 0.560367 -0.013838 -0.499876 -0.950742 -1.942496 0.758869 1.096959 0.790880 0 False
2 0.676321 0.597996 0.674722 0.655864 20.867756 0 20 10.946673 165.950000 5.673565 ... 0.502415 -0.000583 -0.428441 -0.826892 -1.708051 0.363861 1.511975 1.486399 0 False
3 0.255498 0.933506 0.248732 0.951277 37.628693 0 80 21.158876 27.025000 5.673565 ... 0.507438 -0.057127 -0.535289 -0.979059 -1.955840 0.663036 0.807850 0.452333 0 False
4 0.389801 0.887318 0.378421 0.896916 19.827286 0 27 6.790532 59.850000 5.673565 ... 0.533604 -0.027082 -0.502074 -0.942989 -1.913746 0.510647 0.917567 0.767052 0 False

5 rows × 83 columns


In [62]:
flux = GaisserH3a()
model_flux = {}
# PDG codes: 2212 = p, 1000020040 = He-4, 1000070140 = N-14,
# 1000130270 = Al-27, 1000260560 = Fe-56
for ptype in [2212, 1000020040, 1000070140, 1000130270, 1000260560]:
    model_flux[ptype] = flux(energybins.energy_midpoints, ptype)
model_flux_df = pd.DataFrame.from_records(model_flux)
model_flux_df.index = energybins.energy_midpoints
model_flux_df


Out[62]:
2212 1000020040 1000070140 1000130270 1000260560
2.818383e+06 3.365669e-14 6.392754e-14 2.639297e-14 1.291653e-14 2.724088e-14
3.548134e+06 1.593250e-14 3.266399e-14 1.423278e-14 7.084518e-15 1.492133e-14
4.466836e+06 7.341097e-15 1.637122e-14 7.639624e-15 3.881777e-15 8.165225e-15
5.623413e+06 3.290494e-15 8.016199e-15 4.077046e-15 2.123919e-15 4.462350e-15
7.079458e+06 1.439990e-15 3.818003e-15 2.160371e-15 1.159933e-15 2.434567e-15
8.912509e+06 6.217267e-16 1.761449e-15 1.134859e-15 6.319534e-16 1.325349e-15
1.122018e+07 2.697351e-16 7.849163e-16 5.899540e-16 3.432667e-16 7.194980e-16
1.412538e+07 1.201025e-16 3.380352e-16 3.029216e-16 1.857741e-16 3.892281e-16
1.778279e+07 5.550538e-17 1.417695e-16 1.533412e-16 1.001021e-16 2.096403e-16
2.238721e+07 2.639513e-17 5.896673e-17 7.640552e-17 5.366749e-17 1.123032e-16
2.818383e+07 1.261851e-17 2.500624e-17 3.744697e-17 2.861132e-17 5.976306e-17
3.548134e+07 5.925300e-18 1.107337e-17 1.806894e-17 1.516225e-17 3.155050e-17
4.466836e+07 2.695835e-18 5.125129e-18 8.613758e-18 7.986996e-18 1.649959e-17
5.623413e+07 1.186024e-18 2.420720e-18 4.084171e-18 4.184115e-18 8.534752e-18
7.079458e+07 5.101860e-19 1.132645e-18 1.943615e-18 2.182014e-18 4.361142e-18
8.912509e+07 2.205102e-19 5.150783e-19 9.359229e-19 1.134286e-18 2.199731e-18

In [63]:
fig, ax = plt.subplots()
for key in model_flux_df.columns:
    ax.plot(np.log10(model_flux_df.index), model_flux_df.index**2.7*model_flux_df[key], label=key)
ax.plot(np.log10(model_flux_df.index), model_flux_df.index**2.7*model_flux_df.sum(axis=1), label='All particle')
ax.set_yscale("log", nonposy='clip')
ax.set_ylim(1e3, 1e5)
ax.grid()
ax.legend()
plt.show()



In [64]:
model_flux_df.sum(axis=1)


Out[64]:
2.818383e+06    1.641346e-13
3.548134e+06    8.483512e-14
4.466836e+06    4.339894e-14
5.623413e+06    2.197001e-14
7.079458e+06    1.101287e-14
8.912509e+06    5.475336e-15
1.122018e+07    2.707370e-15
1.412538e+07    1.336061e-15
1.778279e+07    6.603585e-16
2.238721e+07    3.277380e-16
2.818383e+07    1.634461e-16
3.548134e+07    8.178036e-17
4.466836e+07    4.092131e-17
5.623413e+07    2.040978e-17
7.079458e+07    1.012960e-17
8.912509e+07    5.005529e-18
dtype: float64

In [65]:
simlist = np.unique(df_sim['sim'])
for i, sim in enumerate(simlist):
    gcd_file, sim_files = comp.simfunctions.get_level3_sim_files(sim)
    num_files = len(sim_files)
    if i == 0:
        generator = num_files*from_simprod(int(sim))
    else:
        generator += num_files*from_simprod(int(sim))

In [66]:
priors_list = ['h3a', 'h4a', 'antih3a', 'Hoerandel5', 'antiHoerandel5']
# priors_list = ['h3a', 'h4a', 'antih3a', 'Hoerandel5', 'antiHoerandel5', 'uniform', 'alllight', 'allheavy']
model_ptypes = {}
model_ptypes['h3a'] = {'light': [2212, 1000020040], 'heavy': [1000070140, 1000130270, 1000260560]}
model_ptypes['h4a'] = {'light': [2212, 1000020040], 'heavy': [1000070140, 1000130270, 1000260560]}
model_ptypes['Hoerandel5'] = {'light': [2212, 1000020040], 'heavy': [1000070140, 1000130270, 1000260560]}

In [67]:
priors = defaultdict(list)
for flux, name in zip([GaisserH3a(), GaisserH3a(), GaisserH4a(), Hoerandel5(), Hoerandel5()],
                      ['h3a', 'antih3a', 'h4a', 'Hoerandel5', 'antiHoerandel5']):
    priors_raw = defaultdict(list)
    for energy_mid in energybins.energy_midpoints:
        energy = [energy_mid]*5
        ptype = [2212, 1000020040, 1000070140, 1000130270, 1000260560]
        weights = flux(energy, ptype)
#         light_prior = weights[:2].sum()/weights.sum()
#         heavy_prior = weights[2:].sum()/weights.sum()
        light_prior = weights[:2].sum()
        heavy_prior = weights[2:].sum()
        if 'anti' in name:
            priors_raw['light'].append(heavy_prior)
            priors_raw['heavy'].append(light_prior)
        else:
            priors_raw['light'].append(light_prior)
            priors_raw['heavy'].append(heavy_prior)
        priors[name].extend([light_prior, heavy_prior])
    unfolding_df['{}_flux_light'.format(name)] = priors_raw['light']
    unfolding_df['{}_flux_heavy'.format(name)] = priors_raw['heavy']
    
# unfolding_df['uniform_flux_light'] = [0.5]*len(priors_raw['light'])
# unfolding_df['uniform_flux_heavy'] = [0.5]*len(priors_raw['heavy'])

# unfolding_df['alllight_flux_light'] = [0.9]*len(priors_raw['light'])
# unfolding_df['alllight_flux_heavy'] = [0.1]*len(priors_raw['heavy'])

# unfolding_df['allheavy_flux_light'] = [0.1]*len(priors_raw['light'])
# unfolding_df['allheavy_flux_heavy'] = [0.9]*len(priors_raw['heavy'])

In [68]:
unfolding_df


Out[68]:
counts_heavy counts_heavy_err counts_light counts_light_err counts_total counts_total_err h3a_flux_light h3a_flux_heavy antih3a_flux_light antih3a_flux_heavy h4a_flux_light h4a_flux_heavy Hoerandel5_flux_light Hoerandel5_flux_heavy antiHoerandel5_flux_light antiHoerandel5_flux_heavy
log_energy_bin_idx
0 233037 482.739060 360613 600.510616 593650 770.486859 9.758423e-14 6.655038e-14 6.655038e-14 9.758423e-14 9.985590e-14 6.541805e-14 6.629980e-14 6.488824e-14 6.488824e-14 6.629980e-14
1 145728 381.743369 256442 506.401027 402170 634.168747 4.859649e-14 3.623863e-14 3.623863e-14 4.859649e-14 4.981589e-14 3.558706e-14 3.247099e-14 3.528813e-14 3.528813e-14 3.247099e-14
2 99038 314.703035 164393 405.454066 263431 513.255297 2.371232e-14 1.968663e-14 1.968663e-14 2.371232e-14 2.436579e-14 1.931170e-14 1.541614e-14 1.916844e-14 1.916844e-14 1.541614e-14
3 66484 257.844915 102273 319.801501 168757 410.800438 1.130669e-14 1.066331e-14 1.066331e-14 1.130669e-14 1.165624e-14 1.044758e-14 7.062637e-15 1.039346e-14 1.039346e-14 7.062637e-15
4 40849 202.111355 64793 254.544692 105642 325.026153 5.257994e-15 5.754872e-15 5.754872e-15 5.257994e-15 5.444601e-15 5.630737e-15 3.109625e-15 5.620064e-15 5.620064e-15 3.109625e-15
5 29647 172.183042 35326 187.952122 64973 254.898019 2.383175e-15 3.092161e-15 3.092161e-15 2.383175e-15 2.482578e-15 3.020735e-15 1.311693e-15 3.026594e-15 3.026594e-15 1.311693e-15
6 19869 140.957440 20067 141.658039 39936 199.839936 1.054651e-15 1.652719e-15 1.652719e-15 1.054651e-15 1.107476e-15 1.611621e-15 5.294563e-16 1.620381e-15 1.620381e-15 5.294563e-16
7 12063 109.831689 12687 112.636584 24750 157.321327 4.581377e-16 8.779237e-16 8.779237e-16 4.581377e-16 4.861355e-16 8.542774e-16 2.048821e-16 8.604621e-16 8.604621e-16 2.048821e-16
8 7940 89.106678 7543 86.850446 15483 124.430704 1.972749e-16 4.630835e-16 4.630835e-16 1.972749e-16 2.120716e-16 4.494787e-16 7.637870e-17 4.519852e-16 4.519852e-16 7.637870e-17
9 5442 73.769913 4464 66.813172 9906 99.528890 8.536186e-17 2.423762e-16 2.423762e-16 8.536186e-17 9.315715e-17 2.345489e-16 2.761680e-17 2.341804e-16 2.341804e-16 2.761680e-17
10 3558 59.648973 2778 52.706736 6336 79.598995 3.762475e-17 1.258213e-16 1.258213e-16 3.762475e-17 4.171736e-17 1.213184e-16 9.754514e-18 1.193495e-16 1.193495e-16 9.754514e-18
11 2722 52.172790 1423 37.722672 4145 64.381674 1.699867e-17 6.478170e-17 6.478170e-17 1.699867e-17 1.913925e-17 6.219132e-17 3.387166e-18 5.968111e-17 5.968111e-17 3.387166e-18
12 1886 43.428102 715 26.739484 2601 51.000000 7.820965e-18 3.310034e-17 3.310034e-17 7.820965e-18 8.936023e-18 3.161033e-17 1.162203e-18 2.920519e-17 2.920519e-17 1.162203e-18
13 1134 33.674916 493 22.203603 1627 40.336088 3.606744e-18 1.680304e-17 1.680304e-17 3.606744e-18 4.185068e-18 1.594605e-17 3.955419e-19 1.394010e-17 1.394010e-17 3.955419e-19
14 758 27.531800 298 17.262677 1056 32.496154 1.642831e-18 8.486771e-18 8.486771e-18 1.642831e-18 1.941410e-18 7.993934e-18 1.338868e-19 6.461990e-18 6.461990e-18 1.338868e-19
15 559 23.643181 144 12.000000 703 26.514147 7.355884e-19 4.269940e-18 4.269940e-18 7.355884e-19 8.890250e-19 3.986566e-18 4.515669e-20 2.893904e-18 2.893904e-18 4.515669e-20

In [69]:
unfolding_df_outfile = os.path.join(comp.paths.comp_data_dir, 'unfolding', 'unfolding-dataframe.csv')
comp.check_output_dir(unfolding_df_outfile)
unfolding_df.to_csv(unfolding_df_outfile)

Formatting for PyUnfold use


In [70]:
formatted_df = pd.DataFrame()

In [71]:
counts_formatted = []
priors_formatted = defaultdict(list)
for index, row in unfolding_df.iterrows():
    counts_formatted.extend([row['counts_light'], row['counts_heavy']])
    for priors_name in priors_list:
        priors_formatted[priors_name].extend([row[priors_name+'_flux_light'], row[priors_name+'_flux_heavy']])
        
formatted_df['counts'] = counts_formatted
formatted_df['counts_err'] = np.sqrt(counts_formatted)


for key, value in priors_formatted.iteritems():
    formatted_df[key+'_flux'] = value
    formatted_df[key+'_priors'] = formatted_df[key+'_flux'] / formatted_df[key+'_flux'].sum()

formatted_df.index.rename('log_energy_bin_idx', inplace=True)
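
This loop interleaves light and heavy within each energy bin, matching the ordering of the detection-efficiency arrays and the 32×32 response matrix built above, and each model's flux column is normalized into a prior distribution over all 32 (energy, composition) bins. A hypothetical check that the loop agrees with the slice-based interleave used earlier:

counts_check = np.empty(2 * len(unfolding_df))
counts_check[::2] = unfolding_df['counts_light']
counts_check[1::2] = unfolding_df['counts_heavy']
np.testing.assert_allclose(counts_check, formatted_df['counts'])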

In [72]:
formatted_df.head()


Out[72]:
counts counts_err Hoerandel5_flux Hoerandel5_priors h3a_flux h3a_priors antiHoerandel5_flux antiHoerandel5_priors h4a_flux h4a_priors antih3a_flux antih3a_priors
log_energy_bin_idx
0 360613.0 600.510616 6.629980e-14 0.247104 9.758423e-14 0.290274 6.488824e-14 0.241843 9.985590e-14 0.295078 6.655038e-14 0.197960
1 233037.0 482.739060 6.488824e-14 0.241843 6.655038e-14 0.197960 6.629980e-14 0.247104 6.541805e-14 0.193313 9.758423e-14 0.290274
2 256442.0 506.401027 3.247099e-14 0.121022 4.859649e-14 0.144555 3.528813e-14 0.131521 4.981589e-14 0.147208 3.623863e-14 0.107795
3 145728.0 381.743369 3.528813e-14 0.131521 3.623863e-14 0.107795 3.247099e-14 0.121022 3.558706e-14 0.105161 4.859649e-14 0.144555
4 164393.0 405.454066 1.541614e-14 0.057457 2.371232e-14 0.070535 1.916844e-14 0.071442 2.436579e-14 0.072002 1.968663e-14 0.058560

Save formatted DataFrame to disk


In [73]:
formatted_df_outfile = os.path.join(comp.paths.comp_data_dir, 'unfolding',
                                    'unfolding-dataframe-PyUnfold-formatted.csv')
comp.check_output_dir(formatted_df_outfile)
formatted_df.to_csv(formatted_df_outfile)

In [77]:
model_to_ls = {'h3a': '-.', 'h4a': ':', 'Hoerandel5': '-', 'antih3a': '--'}
model_to_marker = {'h3a': '.', 'h4a': '^', 'Hoerandel5': '*', 'antih3a': '.'}

fig, ax = plt.subplots()
for model in ['h3a', 'h4a', 'Hoerandel5', 'antih3a']:
    key = '{}_priors'.format(model)
    light_priors = formatted_df[key][::2]
    heavy_priors = formatted_df[key][1::2]
    ax.plot(energybins.log_energy_midpoints, light_priors,
            color=color_dict['light'], ls=model_to_ls[model], marker=model_to_marker[model],
            label='{} light'.format(model), alpha=0.75)
    ax.plot(energybins.log_energy_midpoints, heavy_priors,
            color=color_dict['heavy'], ls=model_to_ls[model], marker=model_to_marker[model],
            label='{} heavy'.format(model), alpha=0.75)

ax.set_xlabel('$\mathrm{\log_{10}(E_{reco}/GeV)}$')
ax.set_ylabel('Prior')
ax.set_xlim([energybins.log_energy_min, energybins.log_energy_max])
# ax.set_ylim([0, 1])
ax.set_yscale("log", nonposy='clip')
ax.grid()
leg = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5),
                 frameon=False, fancybox=False, numpoints=1)
priors_outfile = os.path.join(comp.paths.figures_dir, 'unfolding/iter-bayesian', 'priors.png')
comp.check_output_dir(priors_outfile)
plt.savefig(priors_outfile)
plt.show()



In [130]:
woo = pd.read_csv(formatted_df_outfile, index_col='log_energy_bin_idx')

In [131]:
woo


Out[131]:
counts counts_err Hoerandel5_flux Hoerandel5_priors h3a_flux h3a_priors antiHoerandel5_flux antiHoerandel5_priors h4a_flux h4a_priors antih3a_flux antih3a_priors
log_energy_bin_idx
0 360613.0 600.510616 6.629980e-14 2.471037e-01 9.758423e-14 0.290274 6.488824e-14 2.418428e-01 9.985590e-14 0.295078 6.655038e-14 0.197960
1 233037.0 482.739060 6.488824e-14 2.418428e-01 6.655038e-14 0.197960 6.629980e-14 2.471037e-01 6.541805e-14 0.193313 9.758423e-14 0.290274
2 256442.0 506.401027 3.247099e-14 1.210215e-01 4.859649e-14 0.144555 3.528813e-14 1.315212e-01 4.981589e-14 0.147208 3.623863e-14 0.107795
3 145728.0 381.743369 3.528813e-14 1.315212e-01 3.623863e-14 0.107795 3.247099e-14 1.210215e-01 3.558706e-14 0.105161 4.859649e-14 0.144555
4 164393.0 405.454066 1.541614e-14 5.745698e-02 2.371232e-14 0.070535 1.916844e-14 7.144202e-02 2.436579e-14 0.072002 1.968663e-14 0.058560
5 99038.0 314.703035 1.916844e-14 7.144202e-02 1.968663e-14 0.058560 1.541614e-14 5.745698e-02 1.931170e-14 0.057067 2.371232e-14 0.070535
6 102273.0 319.801501 7.062637e-15 2.632291e-02 1.130669e-14 0.033633 1.039346e-14 3.873710e-02 1.165624e-14 0.034445 1.066331e-14 0.031719
7 66484.0 257.844915 1.039346e-14 3.873710e-02 1.066331e-14 0.031719 7.062637e-15 2.632291e-02 1.044758e-14 0.030873 1.130669e-14 0.033633
8 64793.0 254.544692 3.109625e-15 1.158978e-02 5.257994e-15 0.015640 5.620064e-15 2.094635e-02 5.444601e-15 0.016089 5.754872e-15 0.017118
9 40849.0 202.111355 5.620064e-15 2.094635e-02 5.754872e-15 0.017118 3.109625e-15 1.158978e-02 5.630737e-15 0.016639 5.257994e-15 0.015640
10 35326.0 187.952122 1.311693e-15 4.888767e-03 2.383175e-15 0.007089 3.026594e-15 1.128032e-02 2.482578e-15 0.007336 3.092161e-15 0.009198
11 29647.0 172.183042 3.026594e-15 1.128032e-02 3.092161e-15 0.009198 1.311693e-15 4.888767e-03 3.020735e-15 0.008926 2.383175e-15 0.007089
12 20067.0 141.658039 5.294563e-16 1.973319e-03 1.054651e-15 0.003137 1.620381e-15 6.039267e-03 1.107476e-15 0.003273 1.652719e-15 0.004916
13 19869.0 140.957440 1.620381e-15 6.039267e-03 1.652719e-15 0.004916 5.294563e-16 1.973319e-03 1.611621e-15 0.004762 1.054651e-15 0.003137
14 12687.0 112.636584 2.048821e-16 7.636090e-04 4.581377e-16 0.001363 8.604621e-16 3.206999e-03 4.861355e-16 0.001437 8.779237e-16 0.002611
15 12063.0 109.831689 8.604621e-16 3.206999e-03 8.779237e-16 0.002611 2.048821e-16 7.636090e-04 8.542774e-16 0.002524 4.581377e-16 0.001363
16 7543.0 86.850446 7.637870e-17 2.846685e-04 1.972749e-16 0.000587 4.519852e-16 1.684579e-03 2.120716e-16 0.000627 4.630835e-16 0.001377
17 7940.0 89.106678 4.519852e-16 1.684579e-03 4.630835e-16 0.001377 7.637870e-17 2.846685e-04 4.494787e-16 0.001328 1.972749e-16 0.000587
18 4464.0 66.813172 2.761680e-17 1.029296e-04 8.536186e-17 0.000254 2.341804e-16 8.728057e-04 9.315715e-17 0.000275 2.423762e-16 0.000721
19 5442.0 73.769913 2.341804e-16 8.728057e-04 2.423762e-16 0.000721 2.761680e-17 1.029296e-04 2.345489e-16 0.000693 8.536186e-17 0.000254
20 2778.0 52.706736 9.754514e-18 3.635572e-05 3.762475e-17 0.000112 1.193495e-16 4.448236e-04 4.171736e-17 0.000123 1.258213e-16 0.000374
21 3558.0 59.648973 1.193495e-16 4.448236e-04 1.258213e-16 0.000374 9.754514e-18 3.635572e-05 1.213184e-16 0.000359 3.762475e-17 0.000112
22 1423.0 37.722672 3.387166e-18 1.262419e-05 1.699867e-17 0.000051 5.968111e-17 2.224354e-04 1.913925e-17 0.000057 6.478170e-17 0.000193
23 2722.0 52.172790 5.968111e-17 2.224354e-04 6.478170e-17 0.000193 3.387166e-18 1.262419e-05 6.219132e-17 0.000184 1.699867e-17 0.000051
24 715.0 26.739484 1.162203e-18 4.331609e-06 7.820965e-18 0.000023 2.920519e-17 1.088497e-04 8.936023e-18 0.000026 3.310034e-17 0.000098
25 1886.0 43.428102 2.920519e-17 1.088497e-04 3.310034e-17 0.000098 1.162203e-18 4.331609e-06 3.161033e-17 0.000093 7.820965e-18 0.000023
26 493.0 22.203603 3.955419e-19 1.474211e-06 3.606744e-18 0.000011 1.394010e-17 5.195566e-05 4.185068e-18 0.000012 1.680304e-17 0.000050
27 1134.0 33.674916 1.394010e-17 5.195566e-05 1.680304e-17 0.000050 3.955419e-19 1.474211e-06 1.594605e-17 0.000047 3.606744e-18 0.000011
28 298.0 17.262677 1.338868e-19 4.990051e-07 1.642831e-18 0.000005 6.461990e-18 2.408426e-05 1.941410e-18 0.000006 8.486771e-18 0.000025
29 758.0 27.531800 6.461990e-18 2.408426e-05 8.486771e-18 0.000025 1.338868e-19 4.990051e-07 7.993934e-18 0.000024 1.642831e-18 0.000005
30 144.0 12.000000 4.515669e-20 1.683020e-07 7.355884e-19 0.000002 2.893904e-18 1.078577e-05 8.890250e-19 0.000003 4.269940e-18 0.000013
31 559.0 23.643181 2.893904e-18 1.078577e-05 4.269940e-18 0.000013 4.515669e-20 1.683020e-07 3.986566e-18 0.000012 7.355884e-19 0.000002

In [59]:
# with open('pyunfold_dict.json', 'w') as outfile:
#     data = {'counts': counts_observed,
#             'block_response': block_response.tolist(),
#             'block_response_err': block_response_err.tolist()}
#     for model in ['h3a', 'h4a', 'Hoerandel5']:
#         data['priors_{}'.format(model)] = priors[model]
#     json.dump(data, outfile)



NameErrorTraceback (most recent call last)
<ipython-input-59-e1f5f3c0597f> in <module>()
      1 with open('pyunfold_dict.json', 'w') as outfile:
----> 2     data = {'counts': counts_observed,
      3             'block_response': block_response.tolist(),
      4             'block_response_err': block_response_err.tolist()}
      5     for model in ['h3a', 'h4a', 'Hoerandel5']:

NameError: name 'counts_observed' is not defined

In [116]:
df = pd.read_csv(formatted_df_outfile, index_col='log_energy_bin_idx')

In [117]:
df


Out[117]:
counts allheavy_priors h3a_priors h4a_priors alllight_priors Hoerandel5_priors uniform_priors antih3a_priors antiHoerandel5_priors
log_energy_bin_idx
0 360613.0 0.1 0.594538 0.604184 0.9 0.505380 0.5 0.405462 0.494620
1 233037.0 0.9 0.405462 0.395816 0.1 0.494620 0.5 0.594538 0.505380
2 256442.0 0.1 0.572835 0.583304 0.9 0.479212 0.5 0.427165 0.520788
3 145728.0 0.9 0.427165 0.416696 0.1 0.520788 0.5 0.572835 0.479212
4 164393.0 0.1 0.546380 0.557857 0.9 0.445752 0.5 0.453620 0.554248
5 99038.0 0.9 0.453620 0.442143 0.1 0.554248 0.5 0.546380 0.445752
6 102273.0 0.1 0.514642 0.527341 0.9 0.404594 0.5 0.485358 0.595406
7 66484.0 0.9 0.485358 0.472659 0.1 0.595406 0.5 0.514642 0.404594
8 64793.0 0.1 0.477441 0.491597 0.9 0.356213 0.5 0.522559 0.643787
9 40849.0 0.9 0.522559 0.508403 0.1 0.643787 0.5 0.477441 0.356213
10 35326.0 0.1 0.435256 0.451106 0.9 0.302353 0.5 0.564744 0.697647
11 29647.0 0.9 0.564744 0.548894 0.1 0.697647 0.5 0.435256 0.302353
12 20067.0 0.1 0.389548 0.407295 0.9 0.246277 0.5 0.610452 0.753723
13 19869.0 0.9 0.610452 0.592705 0.1 0.753723 0.5 0.389548 0.246277
14 12687.0 0.1 0.342902 0.362676 0.9 0.192315 0.5 0.657098 0.807685
15 12063.0 0.9 0.657098 0.637324 0.1 0.807685 0.5 0.342902 0.192315
16 7543.0 0.1 0.298739 0.320568 0.9 0.144557 0.5 0.701261 0.855443
17 7940.0 0.9 0.701261 0.679432 0.1 0.855443 0.5 0.298739 0.144557
18 4464.0 0.1 0.260458 0.284270 0.9 0.105489 0.5 0.739542 0.894511
19 5442.0 0.9 0.739542 0.715730 0.1 0.894511 0.5 0.260458 0.105489
20 2778.0 0.1 0.230197 0.255879 0.9 0.075555 0.5 0.769803 0.924445
21 3558.0 0.9 0.769803 0.744121 0.1 0.924445 0.5 0.230197 0.075555
22 1423.0 0.1 0.207858 0.235327 0.9 0.053706 0.5 0.792142 0.946294
23 2722.0 0.9 0.792142 0.764673 0.1 0.946294 0.5 0.207858 0.053706
24 715.0 0.1 0.191122 0.220390 0.9 0.038271 0.5 0.808878 0.961729
25 1886.0 0.9 0.808878 0.779610 0.1 0.961729 0.5 0.191122 0.038271
26 493.0 0.1 0.176716 0.207891 0.9 0.027592 0.5 0.823284 0.972408
27 1134.0 0.9 0.823284 0.792109 0.1 0.972408 0.5 0.176716 0.027592
28 298.0 0.1 0.162181 0.195404 0.9 0.020299 0.5 0.837819 0.979701
29 758.0 0.9 0.837819 0.804596 0.1 0.979701 0.5 0.162181 0.020299
30 144.0 0.1 0.146955 0.182342 0.9 0.015364 0.5 0.853045 0.984636
31 559.0 0.9 0.853045 0.817658 0.1 0.984636 0.5 0.146955 0.015364
