Author: James Bourbeau
In [1]:
    
%load_ext watermark
%watermark -u -d -v -p numpy,matplotlib,scipy,pandas,sklearn,mlxtend
    
    
In [21]:
    
%matplotlib inline
from __future__ import division, print_function
from collections import defaultdict
import numpy as np
from scipy.sparse import block_diag
import pandas as pd
import matplotlib.pyplot as plt
import seaborn.apionly as sns
import json
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, KFold, StratifiedKFold
import composition as comp
import composition.analysis.plotting as plotting
    
color_dict = {'light': 'C0', 'heavy': 'C1', 'total': 'C2',
             'P': 'C0', 'He': 'C1', 'O': 'C3', 'Fe':'C4'}
    
[ back to top ]
Whether or not to train on 'light' and 'heavy' composition classes, or the individual compositions
In [4]:
    
comp_class = True
comp_list = ['light', 'heavy'] if comp_class else ['P', 'He', 'O', 'Fe']
    
Get composition classifier pipeline
In [5]:
    
pipeline_str = 'GBDT'
pipeline = comp.get_pipeline(pipeline_str)
    
Define energy binning for this analysis
In [6]:
    
energybins = comp.analysis.get_energybins()
    
[ back to top ]
In [7]:
    
sim_train, sim_test = comp.preprocess_sim(comp_class=comp_class, return_energy=True)
    
    
In [8]:
    
# Compute the correlation matrix
df_sim = comp.load_dataframe(datatype='sim', config='IC79')
feature_list, feature_labels = comp.analysis.get_training_features()
corr = df_sim[feature_list].corr()
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
fig, ax = plt.subplots()
sns.heatmap(corr, mask=mask, cmap='RdBu_r', center=0,
            square=True, xticklabels=feature_labels, yticklabels=feature_labels,
            linewidths=.5, cbar_kws={'label': 'Covariance'}, annot=True, ax=ax)
# outfile = args.outdir + '/feature_covariance.png'
# plt.savefig(outfile)
plt.show()
    
    
    
    
In [9]:
    
data = comp.preprocess_data(comp_class=comp_class, return_energy=True)
    
    
    
    
In [14]:
    
is_finite_mask = np.isfinite(data.X)
not_finite_mask = np.logical_not(is_finite_mask)
finite_data_mask = np.logical_not(np.any(not_finite_mask, axis=1))
data = data[finite_data_mask]
    
Run classifier over training and testing sets to get an idea of the degree of overfitting
In [19]:
    
clf_name = pipeline.named_steps['classifier'].__class__.__name__
print('=' * 30)
print(clf_name)
pipeline.fit(sim_train.X, sim_train.y)
train_pred = pipeline.predict(sim_train.X)
train_acc = accuracy_score(sim_train.y, train_pred)
print('Training accuracy = {:.2%}'.format(train_acc))
test_pred = pipeline.predict(sim_test.X)
test_acc = accuracy_score(sim_test.y, test_pred)
print('Testing accuracy = {:.2%}'.format(test_acc))
print('=' * 30)
    
    
In [23]:
    
len(energybins.energy_midpoints)*2
    
    Out[23]:
In [9]:
    
def get_frac_correct(train, test, pipeline, comp_list):
    
    assert isinstance(train, comp.analysis.DataSet), 'train dataset must be a DataSet'
    assert isinstance(test, comp.analysis.DataSet), 'test dataset must be a DataSet'
    assert train.y is not None, 'train must have true y values'
    assert test.y is not None, 'test must have true y values'
    
    pipeline.fit(train.X, train.y)
    test_predictions = pipeline.predict(test.X)
    correctly_identified_mask = (test_predictions == test.y)
    # Construct MC composition masks
    MC_comp_mask = {}
    for composition in comp_list:
        MC_comp_mask[composition] = (test.le.inverse_transform(test.y) == composition)
    MC_comp_mask['total'] = np.array([True]*len(test))
    
    reco_frac, reco_frac_err = {}, {}
    for composition in comp_list+['total']:
        comp_mask = MC_comp_mask[composition]
        # Get number of MC comp in each reco energy bin
        num_MC_energy = np.histogram(test.log_energy[comp_mask],
                                     bins=energybins.log_energy_bins)[0]
        num_MC_energy_err = np.sqrt(num_MC_energy)
        # Get number of correctly identified comp in each reco energy bin
        num_reco_energy = np.histogram(test.log_energy[comp_mask & correctly_identified_mask],
                                       bins=energybins.log_energy_bins)[0]
        num_reco_energy_err = np.sqrt(num_reco_energy)
        # Calculate correctly identified fractions as a function of MC energy
        reco_frac[composition], reco_frac_err[composition] = comp.ratio_error(
            num_reco_energy, num_reco_energy_err,
            num_MC_energy, num_MC_energy_err)
    
    return reco_frac, reco_frac_err
    
In [10]:
    
# Split training data into CV training and testing folds
kf = KFold(n_splits=10)
frac_correct_folds = defaultdict(list)
fold_num = 0
print('Fold ', end='')
for train_index, test_index in kf.split(sim_train.X):
    fold_num += 1
    print('{}...'.format(fold_num), end='')
    
    reco_frac, reco_frac_err = get_frac_correct(sim_train[train_index],
                                                sim_train[test_index],
                                                pipeline, comp_list)
        
    for composition in comp_list:
        frac_correct_folds[composition].append(reco_frac[composition])
    frac_correct_folds['total'].append(reco_frac['total'])
frac_correct_gen_err = {key: np.std(frac_correct_folds[key], axis=0) for key in frac_correct_folds}
scores = np.array(frac_correct_folds['total'])
score = scores.mean(axis=1).mean()
score_std = scores.mean(axis=1).std()
    
    
In [11]:
    
reco_frac, reco_frac_stat_err = get_frac_correct(sim_train, sim_test, pipeline, comp_list)
    
In [12]:
    
# Plot fraction of events correctlt classified vs energy
fig, ax = plt.subplots()
for composition in comp_list + ['total']:
    err = np.sqrt(frac_correct_gen_err[composition]**2 + reco_frac_stat_err[composition]**2)
    plotting.plot_steps(energybins.log_energy_midpoints, reco_frac[composition], err, ax,
                        color_dict[composition], composition)
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('Fraction correctly identified')
ax.set_ylim([0.0, 1.0])
ax.set_xlim([energybins.log_energy_min, energybins.log_energy_max])
ax.grid()
leg = plt.legend(loc='upper center', frameon=False,
          bbox_to_anchor=(0.5,  # horizontal
                          1.1),# vertical 
          ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
cv_str = 'Accuracy: {:0.2f}\% (+/- {:0.1f}\%)'.format(score*100, score_std*100)
ax.text(7.4, 0.2, cv_str,
        ha="center", va="center", size=10,
        bbox=dict(boxstyle='round', fc="white", ec="gray", lw=0.8))
plt.savefig('/home/jbourbeau/public_html/figures/frac-correct.png')
plt.show()
    
    
    
[ back to top ]
In [10]:
    
def get_num_comp_reco(train, test, pipeline, comp_list):
    
    assert isinstance(train, comp.analysis.DataSet), 'train dataset must be a DataSet'
    assert isinstance(test, comp.analysis.DataSet), 'test dataset must be a DataSet'
    assert train.y is not None, 'train must have true y values'
    
    pipeline.fit(train.X, train.y)
    test_predictions = pipeline.predict(test.X)
    # Get number of correctly identified comp in each reco energy bin
    num_reco_energy, num_reco_energy_err = {}, {}
    for composition in comp_list:
        print('composition = {}'.format(composition))
        comp_mask = train.le.inverse_transform(test_predictions) == composition
        print('sum(comp_mask) = {}'.format(np.sum(comp_mask)))
        print(test.log_energy[comp_mask])
        num_reco_energy[composition] = np.histogram(test.log_energy[comp_mask],
            bins=energybins.log_energy_bins)[0]
        num_reco_energy_err[composition] = np.sqrt(num_reco_energy[composition])
    num_reco_energy['total'] = np.histogram(test.log_energy, bins=energybins.log_energy_bins)[0]
    num_reco_energy_err['total'] = np.sqrt(num_reco_energy['total'])
    
    return num_reco_energy, num_reco_energy_err
    
In [15]:
    
# Get number of events per energy bin
num_reco_energy, num_reco_energy_err = get_num_comp_reco(sim_train, data, pipeline, comp_list)
import pprint
pprint.pprint(num_reco_energy)
# Solid angle
solid_angle = 2*np.pi*(1-np.cos(np.arccos(0.8)))
    
    
In [16]:
    
counts_observed = []
for light_counts, heavy_counts in zip(num_reco_energy['light'], num_reco_energy['heavy']):
    counts_observed.extend([light_counts, heavy_counts])
print('counts_observed = {}'.format(counts_observed))
    
    
In [17]:
    
pipeline.fit(sim_train.X, sim_train.y)
test_predictions = pipeline.predict(sim_test.X)
true_comp = sim_train.le.inverse_transform(sim_test.y)
pred_comp = sim_train.le.inverse_transform(test_predictions)
    
In [18]:
    
response_list = []
response_err_list = []
sim_bin_idxs = np.digitize(sim_test.log_energy, energybins.log_energy_bins) - 1
data_bin_idxs = np.digitize(data.log_energy, energybins.log_energy_bins) - 1
energy_bin_idx = np.unique(sim_bin_idxs)
# energy_bin_idx = energy_bin_idx[1:]
print(energy_bin_idx)
for bin_idx in energy_bin_idx:
    sim_bin_mask = sim_bin_idxs == bin_idx
    data_bin_mask = data_bin_idxs == bin_idx
    response_mat = confusion_matrix(true_comp[sim_bin_mask], pred_comp[sim_bin_mask],
            labels=comp_list)
    # Transpose response matrix to get MC comp on x-axis and reco comp on y-axis
    response_mat = response_mat.T
    # Get response matrix statistical error
    response_mat_err = np.sqrt(response_mat)/response_mat.sum(axis=0, keepdims=True)
    response_err_list.append(response_mat_err)
    # Normalize along MC comp axis to go from counts to probabilities
    response_mat = response_mat.astype(float)/response_mat.sum(axis=0, keepdims=True)
    response_list.append(response_mat)
block_response = block_diag(response_list).toarray()
block_response = np.flipud(block_response)
print('block_response = \n{}'.format(block_response))
block_response_err = block_diag(response_err_list).toarray()
block_response_err = np.flipud(block_response_err)
print('block_response_err = \n{}'.format(block_response_err))
    
    
In [22]:
    
from icecube.weighting.weighting import from_simprod
from icecube.weighting.fluxes import GaisserH3a, GaisserH4a, Hoerandel5
    
In [23]:
    
simlist = np.unique(df_sim['sim'])
for i, sim in enumerate(simlist):
    _, sim_files = comp.simfunctions.get_level3_sim_files(sim)
    num_files = len(sim_files)
    if i == 0:
        generator = num_files*from_simprod(int(sim))
    else:
        generator += num_files*from_simprod(int(sim))
    
In [ ]:
    
flux = GaisserH3a()
    
In [97]:
    
df_sim[['MC_comp', 'MC_type']][df_sim.MC_comp == 'O']
    
    Out[97]:
In [25]:
    
priors = defaultdict(list)
for flux, name in zip([GaisserH3a(), GaisserH4a(), Hoerandel5()], ['h3a', 'h4a', 'Hoerandel5']):
    for energy_mid in energybins.energy_midpoints:
        energy = [energy_mid]*4
        ptype = [2212, 1000020040, 1000080160, 1000260560]
        weights = flux(energy, ptype)/generator(energy, ptype)
    #     print(weights)
        light_prior = weights[:2].sum()/weights.sum()
        heavy_prior = weights[2:].sum()/weights.sum()
        priors[name].extend([light_prior, heavy_prior])
print(priors['h3a'])
print(priors['h4a'])
    
    
In [28]:
    
len(counts_observed)
    
    Out[28]:
In [26]:
    
with open('pyunfold_dict.json', 'w') as outfile:
    data = {'counts': counts_observed,
            'block_response':block_response.tolist(),
            'block_response_err':block_response_err.tolist()}
    for model in ['h3a', 'h4a', 'Hoerandel5']:
        data['priors_{}'.format(model)] = priors[model]
    json.dump(data, outfile)
    
In [19]:
    
# Live-time information
goodrunlist = pd.read_table('/data/ana/CosmicRay/IceTop_GRL/IC79_2010_GoodRunInfo_4IceTop.txt', skiprows=[0, 3])
goodrunlist.head()
    
    Out[19]:
In [20]:
    
livetimes = goodrunlist['LiveTime(s)']
livetime = np.sum(livetimes[goodrunlist['Good_it_L2'] == 1])
print('livetime (seconds) = {}'.format(livetime))
print('livetime (days) = {}'.format(livetime/(24*60*60)))
    
    
In [21]:
    
fig, ax = plt.subplots()
for composition in comp_list + ['total']:
    # Calculate dN/dE
    y = num_reco_energy[composition]
    y_err = num_reco_energy_err[composition]
    # Add time duration
    y = y / livetime
    y_err = y / livetime
#     ax.errorbar(log_energy_midpoints, y, yerr=y_err,
#                 color=color_dict[composition], label=composition,
#                 marker='.', linestyle='None')
    plotting.plot_steps(energybins.log_energy_midpoints, y, y_err,
                        ax, color_dict[composition], composition)
ax.set_yscale("log", nonposy='clip')
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('Rate [s$^{-1}$]')
ax.set_xlim([6.2, 8.0])
# ax.set_ylim([10**2, 10**5])
ax.grid(linestyle=':')
leg = plt.legend(loc='upper center', frameon=False,
          bbox_to_anchor=(0.5,  # horizontal
                          1.1),# vertical 
          ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
    
plt.show()
    
    
In [22]:
    
eff_area, eff_area_error, energy_midpoints = comp.analysis.get_effective_area(df_sim,
                                                            energybins.energy_bins, energy='MC')
fix, ax = plt.subplots()
ax.errorbar(np.log10(energy_midpoints), eff_area, yerr=eff_area_error)
ax.set_ylabel('Effective area [$m^2$]')
ax.set_xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.grid()
plt.show()
    
    
    
In [23]:
    
energybins.log_energy_bins
    
    Out[23]:
In [24]:
    
# Plot fraction of events vs energy
# fig, ax = plt.subplots(figsize=(8, 6))
fig = plt.figure()
ax = plt.gca()
for composition in comp_list + ['total']:
    # Calculate dN/dE
    y = num_reco_energy[composition]/energybins.energy_bin_widths
    y_err = num_reco_energy_err[composition]/energybins.energy_bin_widths
    # Add effective area
    y, y_err = comp.analysis.ratio_error(y, y_err, eff_area, eff_area_error)
    # Add solid angle
    y = y / solid_angle
    y_err = y_err / solid_angle
    # Add time duration
    y = y / livetime
    y_err = y / livetime
    # Add energy scaling 
#     energy_err = get_energy_res(df_sim, energy_bins)
#     energy_err = np.array(energy_err)
#     print(10**energy_err)
    y = energybins.energy_midpoints**2.7 * y
    y_err = energybins.energy_midpoints**2.7 * y_err
#     print(y)
#     print(y_err)
#     ax.errorbar(log_energy_midpoints, y, yerr=y_err, label=composition, color=color_dict[composition],
#            marker='.', markersize=8)
    plotting.plot_steps(energybins.log_energy_midpoints, y, y_err, ax, color_dict[composition], composition)
ax.set_yscale("log", nonposy='clip')
# ax.set_xscale("log", nonposy='clip')
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('$\mathrm{E}^{2.7} \\frac{\mathrm{dN}}{\mathrm{dE dA d\Omega dt}} \ [\mathrm{GeV}^{1.7} \mathrm{m}^{-2} \mathrm{sr}^{-1} \mathrm{s}^{-1}]$')
ax.set_xlim([6.3, 8])
ax.set_ylim([10**3, 10**5])
ax.grid(linestyle='dotted', which="both")
    
# Add 3-year scraped flux
df_proton = pd.read_csv('3yearscraped/proton', sep='\t', header=None, names=['energy', 'flux'])
df_helium = pd.read_csv('3yearscraped/helium', sep='\t', header=None, names=['energy', 'flux'])
df_light = pd.DataFrame.from_dict({'energy': df_proton.energy, 
                                  'flux': df_proton.flux + df_helium.flux})
df_oxygen = pd.read_csv('3yearscraped/oxygen', sep='\t', header=None, names=['energy', 'flux'])
df_iron = pd.read_csv('3yearscraped/iron', sep='\t', header=None, names=['energy', 'flux'])
df_heavy = pd.DataFrame.from_dict({'energy': df_oxygen.energy, 
                                  'flux': df_oxygen.flux + df_iron.flux})
if comp_class:
    ax.plot(np.log10(df_light.energy), df_light.flux, label='3 yr light',
            marker='.', ls=':')
    ax.plot(np.log10(df_heavy.energy), df_heavy.flux, label='3 yr heavy',
            marker='.', ls=':')
    ax.plot(np.log10(df_heavy.energy), df_heavy.flux+df_light.flux, label='3 yr total',
            marker='.', ls=':')
else:
    ax.plot(np.log10(df_proton.energy), df_proton.flux, label='3 yr proton',
            marker='.', ls=':')
    ax.plot(np.log10(df_helium.energy), df_helium.flux, label='3 yr helium',
            marker='.', ls=':', color=color_dict['He'])
    ax.plot(np.log10(df_oxygen.energy), df_oxygen.flux, label='3 yr oxygen',
            marker='.', ls=':', color=color_dict['O'])
    ax.plot(np.log10(df_iron.energy), df_iron.flux, label='3 yr iron',
        marker='.', ls=':', color=color_dict['Fe'])
    ax.plot(np.log10(df_iron.energy), df_proton.flux+df_helium.flux+df_oxygen.flux+df_iron.flux, label='3 yr total',
    marker='.', ls=':', color='C2')
leg = plt.legend(loc='upper center', frameon=False,
          bbox_to_anchor=(0.5,  # horizontal
                          1.15),# vertical 
          ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
plt.savefig('/home/jbourbeau/public_html/figures/spectrum.png')
plt.show()
    
    
In [24]:
    
if not comp_class:
    # Add 3-year scraped flux
    df_proton = pd.read_csv('3yearscraped/proton', sep='\t', header=None, names=['energy', 'flux'])
    df_helium = pd.read_csv('3yearscraped/helium', sep='\t', header=None, names=['energy', 'flux'])
    df_oxygen = pd.read_csv('3yearscraped/oxygen', sep='\t', header=None, names=['energy', 'flux'])
    df_iron = pd.read_csv('3yearscraped/iron', sep='\t', header=None, names=['energy', 'flux'])
    # Plot fraction of events vs energy
    fig, axarr = plt.subplots(2, 2, figsize=(8, 6))
    for composition, ax in zip(comp_list + ['total'], axarr.flatten()):
        # Calculate dN/dE
        y = num_reco_energy[composition]/energybins.energy_bin_widths
        y_err = num_reco_energy_err[composition]/energybins.energy_bin_widths
        # Add effective area
        y, y_err = comp.analysis.ratio_error(y, y_err, eff_area, eff_area_error)
        # Add solid angle
        y = y / solid_angle
        y_err = y_err / solid_angle
        # Add time duration
        y = y / livetime
        y_err = y / livetime
        y = energybins.energy_midpoints**2.7 * y
        y_err = energybins.energy_midpoints**2.7 * y_err
        plotting.plot_steps(energybins.log_energy_midpoints, y, y_err, ax, color_dict[composition], composition)
        # Load 3-year flux
        df_3yr = pd.read_csv('3yearscraped/{}'.format(composition), sep='\t',
                             header=None, names=['energy', 'flux'])
        ax.plot(np.log10(df_3yr.energy), df_3yr.flux, label='3 yr {}'.format(composition),
                        marker='.', ls=':', color=color_dict[composition])
        ax.set_yscale("log", nonposy='clip')
        # ax.set_xscale("log", nonposy='clip')
        ax.set_xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
        ax.set_ylabel('$\mathrm{E}^{2.7} \\frac{\mathrm{dN}}{\mathrm{dE dA d\Omega dt}} \ [\mathrm{GeV}^{1.7} \mathrm{m}^{-2} \mathrm{sr}^{-1} \mathrm{s}^{-1}]$')
        ax.set_xlim([6.3, 8])
        ax.set_ylim([10**3, 10**5])
        ax.grid(linestyle='dotted', which="both")
        ax.legend()
    plt.savefig('/home/jbourbeau/public_html/figures/spectrum.png')
    plt.show()
    
[ back to top ]
In [20]:
    
reco_frac['light']
    
    Out[20]:
In [21]:
    
reco_frac['heavy']
    
    Out[21]:
In [22]:
    
num_reco_energy['light']
    
    Out[22]:
In [23]:
    
num_reco_energy['heavy']
    
    Out[23]:
In [9]:
    
pipeline.fit(sim_train.X, sim_train.y)
test_predictions = pipeline.predict(sim_test.X)
true_comp = sim_train.le.inverse_transform(sim_test.y)
pred_comp = sim_train.le.inverse_transform(test_predictions)
print(true_comp)
print(pred_comp)
    
    
In [10]:
    
data_pred = pipeline.predict(data.X)
observed_comp = sim_train.le.inverse_transform(data_pred)
print(observed_comp)
    
    
In [11]:
    
comp_list
    
    Out[11]:
In [16]:
    
sim_bin_idxs = np.digitize(sim_test.log_energy, energybins.log_energy_bins) - 1
data_bin_idxs = np.digitize(data.log_energy, energybins.log_energy_bins) - 1
energy_bin_idx = np.unique(sim_bin_idxs)
energy_bin_idx = energy_bin_idx[1:]
print(energy_bin_idx)
num_reco_energy_unfolded = defaultdict(list)
for bin_idx in energy_bin_idx:
    sim_bin_mask = sim_bin_idxs == bin_idx
    data_bin_mask = data_bin_idxs == bin_idx
    unfolder = comp.analysis.Unfolder()
    unfolded_events = unfolder.unfold(true_MC_comp=true_comp[sim_bin_mask],
                                        reco_MC_comp=pred_comp[sim_bin_mask],
                                        observed_comp=observed_comp[data_bin_mask],
                                        priors=[0.5, 0.5],
                                        labels=comp_list)
    print(unfolded_events)
    for i, composition in enumerate(comp_list):
        num_reco_energy_unfolded[composition].append(unfolded_events[i])
for composition in comp_list:
    num_reco_energy_unfolded[composition] = np.array(num_reco_energy_unfolded[composition])
num_reco_energy_unfolded['total'] = np.sum([num_reco_energy_unfolded[composition] for composition in comp_list], axis=0)
print(num_reco_energy_unfolded)
    
    
In [27]:
    
fig, ax = plt.subplots()
for composition in comp_list + ['total']:
    # Calculate dN/dE
    y = num_reco_energy_unfolded[composition]/energy_bin_widths
    y_err = np.sqrt(y)/energy_bin_widths
    # Add effective area
    y, y_err = comp.analysis.ratio_error(y, y_err, eff_area, eff_area_error)
    # Add solid angle
    y = y / solid_angle
    y_err = y_err / solid_angle
    # Add time duration
    y = y / livetime
    y_err = y / livetime
    # Add energy scaling 
#     energy_err = get_energy_res(df_sim, energy_bins)
#     energy_err = np.array(energy_err)
#     print(10**energy_err)
    y = energy_midpoints**2.7 * y
    y_err = energy_midpoints**2.7 * y_err
    print(y)
    print(y_err)
#     ax.errorbar(log_energy_midpoints, y, yerr=y_err, label=composition, color=color_dict[composition],
#            marker='.', markersize=8)
    plotting.plot_steps(log_energy_midpoints, y, y_err, ax, color_dict[composition], composition)
ax.set_yscale("log", nonposy='clip')
# ax.set_xscale("log", nonposy='clip')
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('$\mathrm{E}^{2.7} \\frac{\mathrm{dN}}{\mathrm{dE dA d\Omega dt}} \ [\mathrm{GeV}^{1.7} \mathrm{m}^{-2} \mathrm{sr}^{-1} \mathrm{s}^{-1}]$')
ax.set_xlim([6.3, 8])
ax.set_ylim([10**3, 10**5])
ax.grid(linestyle='dotted', which="both")
leg = plt.legend(loc='upper center', frameon=False,
          bbox_to_anchor=(0.5,  # horizontal
                          1.1),# vertical 
          ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
    
# plt.savefig('/home/jbourbeau/public_html/figures/spectrum.png')
plt.show()
    
    
    
    
Get confusion matrix for each energy bin
In [99]:
    
bin_idxs = np.digitize(energy_test_sim, log_energy_bins) - 1
energy_bin_idx = np.unique(bin_idxs)
energy_bin_idx = energy_bin_idx[1:]
print(energy_bin_idx)
num_reco_energy_unfolded = defaultdict(list)
response_mat = []
for bin_idx in energy_bin_idx:
    energy_bin_mask = bin_idxs == bin_idx
    confmat = confusion_matrix(true_comp[energy_bin_mask], pred_comp[energy_bin_mask], labels=comp_list)
    confmat = np.divide(confmat.T, confmat.sum(axis=1, dtype=float)).T
    response_mat.append(confmat)
    
    
In [100]:
    
response_mat
    
    Out[100]:
In [134]:
    
r = np.dstack((np.copy(num_reco_energy['light']), np.copy(num_reco_energy['heavy'])))[0]
for unfold_iter in range(50):
    print('Unfolding iteration {}...'.format(unfold_iter))
    if unfold_iter == 0:
        u = r
    fs = []
    for bin_idx in energy_bin_idx:
#         print(u)
        f = np.dot(response_mat[bin_idx], u[bin_idx])
        f[f < 0] = 0
        fs.append(f)
#         print(f)
    u = u + (r - fs)
#     u[u < 0] = 0
#     print(u)
unfolded_counts_iter = {}
unfolded_counts_iter['light'] = u[:,0]
unfolded_counts_iter['heavy'] = u[:,1]
unfolded_counts_iter['total'] = u.sum(axis=1)
print(unfolded_counts_iter)
    
    
In [135]:
    
fig, ax = plt.subplots()
for composition in comp_list + ['total']:
    # Calculate dN/dE
    y = unfolded_counts_iter[composition]/energy_bin_widths
    y_err = np.sqrt(y)/energy_bin_widths
    # Add effective area
    y, y_err = comp.analysis.ratio_error(y, y_err, eff_area, eff_area_error)
    # Add solid angle
    y = y / solid_angle
    y_err = y_err / solid_angle
    # Add time duration
    y = y / livetime
    y_err = y / livetime
    # Add energy scaling 
#     energy_err = get_energy_res(df_sim, energy_bins)
#     energy_err = np.array(energy_err)
#     print(10**energy_err)
    y = energy_midpoints**2.7 * y
    y_err = energy_midpoints**2.7 * y_err
    print(y)
    print(y_err)
#     ax.errorbar(log_energy_midpoints, y, yerr=y_err, label=composition, color=color_dict[composition],
#            marker='.', markersize=8)
    plotting.plot_steps(log_energy_midpoints, y, y_err, ax, color_dict[composition], composition)
ax.set_yscale("log", nonposy='clip')
# ax.set_xscale("log", nonposy='clip')
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('$\mathrm{E}^{2.7} \\frac{\mathrm{dN}}{\mathrm{dE dA d\Omega dt}} \ [\mathrm{GeV}^{1.7} \mathrm{m}^{-2} \mathrm{sr}^{-1} \mathrm{s}^{-1}]$')
ax.set_xlim([6.3, 8])
ax.set_ylim([10**3, 10**5])
ax.grid(linestyle='dotted', which="both")
leg = plt.legend(loc='upper center', frameon=False,
          bbox_to_anchor=(0.5,  # horizontal
                          1.1),# vertical 
          ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
    
# plt.savefig('/home/jbourbeau/public_html/figures/spectrum.png')
plt.show()
    
    
    
    
In [106]:
    
print(num_reco_energy)
    
    
In [107]:
    
comp_list = ['light', 'heavy']
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train_sim, y_train_sim)
test_predictions = pipeline.predict(X_test_sim)
# correctly_identified_mask = (test_predictions == y_test)
# confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)/len(y_pred)
true_comp = le.inverse_transform(y_test_sim)
pred_comp = le.inverse_transform(test_predictions)
confmat = confusion_matrix(true_comp, pred_comp, labels=comp_list)
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Greens):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    plt.imshow(cm, interpolation='None', cmap=cmap,
               vmin=0, vmax=1.0)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, '{:0.3f}'.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True composition')
    plt.xlabel('Predicted composition')
fig, ax = plt.subplots()
plot_confusion_matrix(confmat, classes=['light', 'heavy'], normalize=True,
                      title='Confusion matrix, without normalization')
# # Plot normalized confusion matrix
# plt.figure()
# plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
#                       title='Normalized confusion matrix')
plt.show()
    
    
    
In [63]:
    
comp_list = ['light', 'heavy']
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train_sim, y_train_sim)
test_predictions = pipeline.predict(X_test_sim)
# correctly_identified_mask = (test_predictions == y_test)
# confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)/len(y_pred)
true_comp = le.inverse_transform(y_test_sim)
pred_comp = le.inverse_transform(test_predictions)
confmat = confusion_matrix(true_comp, pred_comp, labels=comp_list)
inverse = np.linalg.inv(confmat)
inverse
    
    Out[63]:
In [64]:
    
confmat
    
    Out[64]:
In [66]:
    
comp_list = ['light', 'heavy']
# Get number of events per energy bin
num_reco_energy, num_reco_energy_err = get_num_comp_reco(X_train_sim, y_train_sim, X_test_data, comp_list)
# Energy-related variables
energy_bin_width = 0.1
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
energy_midpoints = (energy_bins[1:] + energy_bins[:-1]) / 2
energy_bin_widths = 10**energy_bins[1:] - 10**energy_bins[:-1]
def get_energy_res(df_sim, energy_bins):
    reco_log_energy = df_sim['lap_log_energy'].values 
    MC_log_energy = df_sim['MC_log_energy'].values
    energy_res = reco_log_energy - MC_log_energy
    bin_centers, bin_medians, energy_err = comp.analysis.data_functions.get_medians(reco_log_energy,
                                                                               energy_res,
                                                                               energy_bins)
    return np.abs(bin_medians)
# Solid angle
solid_angle = 2*np.pi*(1-np.cos(np.arccos(0.85)))
# solid_angle = 2*np.pi*(1-np.cos(40*(np.pi/180)))
print(solid_angle)
print(2*np.pi*(1-np.cos(40*(np.pi/180))))
# Live-time information
start_time = np.amin(df_data['start_time_mjd'].values)
end_time = np.amax(df_data['end_time_mjd'].values)
day_to_sec = 24 * 60 * 60.
dt = day_to_sec * (end_time - start_time)
print(dt)
# Plot fraction of events vs energy
fig, ax = plt.subplots()
for i, composition in enumerate(comp_list):
    num_reco_bin = np.array([[i, j] for i, j in zip(num_reco_energy['light'], num_reco_energy['heavy'])])
#     print(num_reco_bin)
    num_reco = np.array([np.dot(inverse, i) for i in num_reco_bin])
    print(num_reco)
    num_reco_2 = {'light': num_reco[:, 0], 'heavy': num_reco[:, 1]}
    # Calculate dN/dE
    y = num_reco_2[composition]/energy_bin_widths
    y_err = num_reco_energy_err[composition]/energy_bin_widths
    # Add effective area
    y, y_err = comp.analysis.ratio_error(y, y_err, eff_area, eff_area_error)
    # Add solid angle
    y = y / solid_angle
    y_err = y_err / solid_angle
    # Add time duration
    y = y / dt
    y_err = y / dt
    # Add energy scaling 
    energy_err = get_energy_res(df_sim, energy_bins)
    energy_err = np.array(energy_err)
#     print(10**energy_err)
    y = (10**energy_midpoints)**2.7 * y
    y_err = (10**energy_midpoints)**2.7 * y_err
    plotting.plot_steps(energy_midpoints, y, y_err, ax, color_dict[composition], composition)
ax.set_yscale("log", nonposy='clip')
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('$\mathrm{E}^{2.7} \\frac{\mathrm{dN}}{\mathrm{dE dA d\Omega dt}} \ [\mathrm{GeV}^{1.7} \mathrm{m}^{-2} \mathrm{sr}^{-1} \mathrm{s}^{-1}]$')
ax.set_xlim([6.2, 8.0])
# ax.set_ylim([10**2, 10**5])
ax.grid()
leg = plt.legend(loc='upper center', 
          bbox_to_anchor=(0.5,  # horizontal
                          1.1),# vertical 
          ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
    
plt.show()
    
    
    
In [44]:
    
pipeline.get_params()['classifier__max_depth']
    
    Out[44]:
In [47]:
    
energy_bin_width = 0.1
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
fig, axarr = plt.subplots(1, 2)
for composition, ax in zip(comp_list, axarr.flatten()):
    MC_comp_mask = (df_sim['MC_comp_class'] == composition)
    MC_log_energy = df_sim['MC_log_energy'][MC_comp_mask].values
    reco_log_energy = df_sim['lap_log_energy'][MC_comp_mask].values
    plotting.histogram_2D(MC_log_energy, reco_log_energy, energy_bins, log_counts=True, ax=ax)
    ax.plot([0,10], [0,10], marker='None', linestyle='-.')
    ax.set_xlim([6.2, 8])
    ax.set_ylim([6.2, 8])
    ax.set_xlabel('$\log_{10}(E_{\mathrm{MC}}/\mathrm{GeV})$')
    ax.set_ylabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
    ax.set_title('{} response matrix'.format(composition))
plt.tight_layout()
plt.show()
    
    
In [10]:
    
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
10**energy_bins[1:] - 10**energy_bins[:-1]
    
    Out[10]:
In [ ]:
    
probs = pipeline.named_steps['classifier'].predict_proba(X_test)
prob_1 = probs[:, 0][MC_iron_mask]
prob_2 = probs[:, 1][MC_iron_mask]
# print(min(prob_1-prob_2))
# print(max(prob_1-prob_2))
# plt.hist(prob_1-prob_2, bins=30, log=True)
plt.hist(prob_1, bins=np.linspace(0, 1, 50), log=True)
plt.hist(prob_2, bins=np.linspace(0, 1, 50), log=True)
    
In [ ]:
    
probs = pipeline.named_steps['classifier'].predict_proba(X_test)
dp1 = (probs[:, 0]-probs[:, 1])[MC_proton_mask]
print(min(dp1))
print(max(dp1))
dp2 = (probs[:, 0]-probs[:, 1])[MC_iron_mask]
print(min(dp2))
print(max(dp2))
fig, ax = plt.subplots()
# plt.hist(prob_1-prob_2, bins=30, log=True)
counts, edges, pathes = plt.hist(dp1, bins=np.linspace(-1, 1, 100), log=True, label='Proton', alpha=0.75)
counts, edges, pathes = plt.hist(dp2, bins=np.linspace(-1, 1, 100), log=True, label='Iron', alpha=0.75)
plt.legend(loc=2)
plt.show()
pipeline.named_steps['classifier'].classes_
    
In [ ]:
    
print(pipeline.named_steps['classifier'].classes_)
le.inverse_transform(pipeline.named_steps['classifier'].classes_)
    
In [ ]:
    
pipeline.named_steps['classifier'].decision_path(X_test)
    
In [48]:
    
comp_list = ['light', 'heavy']
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train_sim, y_train_sim)
# test_probs = defaultdict(list)
fig, ax = plt.subplots()
test_predictions = pipeline.predict(X_test_data)
test_probs = pipeline.predict_proba(X_test_data)
for class_ in pipeline.classes_:
    test_predictions == le.inverse_transform(class_)
    plt.hist(test_probs[:, class_], bins=np.linspace(0, 1, 50),
             histtype='step', label=composition,
             color=color_dict[composition], alpha=0.8, log=True)
plt.ylabel('Counts')
plt.xlabel('Testing set class probabilities')
plt.legend()
plt.grid()
plt.show()
    
    
In [5]:
    
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train, y_train)
test_predictions = pipeline.predict(X_test)
comp_list = ['P', 'He', 'O', 'Fe']
fig, ax = plt.subplots()
test_probs = pipeline.predict_proba(X_test)
fig, axarr = plt.subplots(2, 2, sharex=True, sharey=True)
for composition, ax in zip(comp_list, axarr.flatten()):
    comp_mask = (le.inverse_transform(y_test) == composition)
    probs = np.copy(test_probs[comp_mask])
    print('probs = {}'.format(probs.shape))
    weighted_mass = np.zeros(len(probs))
    for class_ in pipeline.classes_:
        c = le.inverse_transform(class_)
        weighted_mass += comp.simfunctions.comp2mass(c) * probs[:, class_]
    print('min = {}'.format(min(weighted_mass)))
    print('max = {}'.format(max(weighted_mass)))
    ax.hist(weighted_mass, bins=np.linspace(0, 5, 100),
             histtype='step', label=None, color='darkgray',
             alpha=1.0, log=False)
    for c in comp_list:
        ax.axvline(comp.simfunctions.comp2mass(c), color=color_dict[c],
                   marker='None', linestyle='-')
    ax.set_ylabel('Counts')
    ax.set_xlabel('Weighted atomic number')
    ax.set_title('MC {}'.format(composition))
    ax.grid()
plt.tight_layout()
plt.show()
    
    
    
    
In [15]:
    
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train, y_train)
test_predictions = pipeline.predict(X_test)
comp_list = ['P', 'He', 'O', 'Fe']
fig, ax = plt.subplots()
test_probs = pipeline.predict_proba(X_test)
fig, axarr = plt.subplots(2, 2, sharex=True, sharey=True)
for composition, ax in zip(comp_list, axarr.flatten()):
    comp_mask = (le.inverse_transform(y_test) == composition)
    probs = np.copy(test_probs[comp_mask])
    weighted_mass = np.zeros(len(probs))
    for class_ in pipeline.classes_:
        c = le.inverse_transform(class_)
        ax.hist(probs[:, class_], bins=np.linspace(0, 1, 50),
                 histtype='step', label=c, color=color_dict[c],
                 alpha=1.0, log=True)
    ax.legend(title='Reco comp', framealpha=0.5)
    ax.set_ylabel('Counts')
    ax.set_xlabel('Testing set class probabilities')
    ax.set_title('MC {}'.format(composition))
    ax.grid()
plt.tight_layout()
plt.show()
    
    
    
In [25]:
    
comp_list = ['light', 'heavy']
test_probs = defaultdict(list)
fig, ax = plt.subplots()
# test_probs = pipeline.predict_proba(X_test)
for event in pipeline.predict_proba(X_test_data):
    composition = le.inverse_transform(np.argmax(event))
    test_probs[composition].append(np.amax(event))
for composition in comp_list:
    plt.hist(test_probs[composition], bins=np.linspace(0, 1, 100),
             histtype='step', label=composition,
             color=color_dict[composition], alpha=0.8, log=False)
plt.ylabel('Counts')
plt.xlabel('Testing set class probabilities')
plt.legend(title='Reco comp')
plt.grid()
plt.show()
    
    
    
In [ ]:
    
    
In [ ]: