Author: James Bourbeau
In [1]:
    
%load_ext watermark
%watermark -u -d -v -p numpy,matplotlib,scipy,pandas,sklearn,mlxtend
    
    
In [2]:
    
import sys
sys.path.append('/home/jbourbeau/cr-composition')
print('Added to PYTHONPATH')
    
    
In [3]:
    
%matplotlib inline
from __future__ import division, print_function
from collections import defaultdict
import itertools
import numpy as np
from scipy import interp
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, KFold, StratifiedKFold
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import composition as comp
import composition.analysis.plotting as plotting
    
color_dict = {'light': 'C0', 'heavy': 'C1', 'total': 'C2',
             'P': 'C0', 'He': 'C1', 'O': 'C3', 'Fe':'C4'}
    
    
In [4]:
    
true = [1]*300 + [2]*100
reco = [1]*200 + [2]*100 + [1]*50 + [2]*50
observed = [1]*240 + [2]*158
    
In [5]:
    
unfolder = comp.analysis.Unfolder()
unfolder.unfold(true, reco, priors=[0.75, 0.25],
               labels=[1, 2], observed_comp=observed)
    
    
    Out[5]:
[ back to top ]
Whether or not to train on 'light' and 'heavy' composition classes, or the individual compositions
In [4]:
    
comp_class = True
comp_list = ['light', 'heavy'] if comp_class else ['P', 'He', 'O', 'Fe']
    
Get composition classifier pipeline
In [5]:
    
pipeline_str = 'xgboost'
pipeline = comp.get_pipeline(pipeline_str)
    
Define energy binning for this analysis
In [6]:
    
energybins = comp.analysis.get_energybins()
    
[ back to top ]
In [7]:
    
sim_train, sim_test = comp.preprocess_sim(comp_class=comp_class, return_energy=True)
    
    
    
    
In [8]:
    
data = comp.preprocess_data(comp_class=comp_class, return_energy=True)
    
    
Run classifier over training and testing sets to get an idea of the degree of overfitting
In [11]:
    
clf_name = pipeline.named_steps['classifier'].__class__.__name__
print('=' * 30)
print(clf_name)
pipeline.fit(sim_train.X, sim_train.y)
train_pred = pipeline.predict(sim_train.X)
train_acc = accuracy_score(sim_train.y, train_pred)
print('Training accuracy = {:.2%}'.format(train_acc))
test_pred = pipeline.predict(sim_test.X)
test_acc = accuracy_score(sim_test.y, test_pred)
print('Testing accuracy = {:.2%}'.format(test_acc))
print('=' * 30)
    
    
In [ ]:
    
print('=' * 30)
print(clf_name)
scores_acc = cross_val_score(estimator=pipeline, X=sim_train.X, y=sim_train.y,
                         scoring='accuracy', cv=3, n_jobs=10)
print('CV accuracy: {:.2%} (+/- {:.2%})'.format(scores_acc.mean(), scores_acc.std()))
print('=' * 30)
    
In [11]:
    
print('=' * 30)
print(clf_name)
pipeline.fit(sim_train.X, sim_train.y)
y_pred = pipeline.predict(sim_train.X)
print(classification_report(sim_train.y, y_pred, digits=3))
print('=' * 30)
    
    
In [ ]:
    
print('=' * 30)
print(clf_name)
if comp_class:
    scores_f1 = cross_val_score(estimator=pipeline, X=sim_train.X, y=sim_train.y,
                             scoring='f1', cv=2, n_jobs=10)
    print('CV f1: {:.2%} (+/- {:.2%})'.format(scores_f1.mean(), scores_f1.std()))
else:
    scores_f1 = cross_val_score(estimator=pipeline, X=sim_train.X, y=sim_train.y,
                             scoring='f1', cv=2, n_jobs=10)
    print('CV f1: {:.2%} (+/- {:.2%})'.format(scores_f1.mean(), scores_f1.std()))
print('=' * 30)
    
    
    
[ back to top ]
In [9]:
    
def get_frac_correct(train, test, pipeline, comp_list):
    
    assert isinstance(train, comp.analysis.DataSet), 'train dataset must be a DataSet'
    assert isinstance(test, comp.analysis.DataSet), 'test dataset must be a DataSet'
    assert train.y is not None, 'train must have true y values'
    assert test.y is not None, 'test must have true y values'
    
    pipeline.fit(train.X, train.y)
    test_predictions = pipeline.predict(test.X)
    correctly_identified_mask = (test_predictions == test.y)
    # Construct MC composition masks
    MC_comp_mask = {}
    for composition in comp_list:
        MC_comp_mask[composition] = (test.le.inverse_transform(test.y) == composition)
    MC_comp_mask['total'] = np.array([True]*len(test))
    
    reco_frac, reco_frac_err = {}, {}
    for composition in comp_list+['total']:
        comp_mask = MC_comp_mask[composition]
        # Get number of MC comp in each reco energy bin
        num_MC_energy = np.histogram(test.log_energy[comp_mask],
                                     bins=energybins.log_energy_bins)[0]
        num_MC_energy_err = np.sqrt(num_MC_energy)
        # Get number of correctly identified comp in each reco energy bin
        num_reco_energy = np.histogram(test.log_energy[comp_mask & correctly_identified_mask],
                                       bins=energybins.log_energy_bins)[0]
        num_reco_energy_err = np.sqrt(num_reco_energy)
        # Calculate correctly identified fractions as a function of MC energy
        reco_frac[composition], reco_frac_err[composition] = comp.ratio_error(
            num_reco_energy, num_reco_energy_err,
            num_MC_energy, num_MC_energy_err)
    
    return reco_frac, reco_frac_err
    
In [10]:
    
# Split training data into CV training and testing folds
kf = KFold(n_splits=10)
frac_correct_folds = defaultdict(list)
fold_num = 0
print('Fold ', end='')
for train_index, test_index in kf.split(sim_train.X):
    fold_num += 1
    print('{}...'.format(fold_num), end='')
    
    reco_frac, reco_frac_err = get_frac_correct(sim_train[train_index],
                                                sim_train[test_index],
                                                pipeline, comp_list)
        
    for composition in comp_list:
        frac_correct_folds[composition].append(reco_frac[composition])
    frac_correct_folds['total'].append(reco_frac['total'])
frac_correct_gen_err = {key: np.std(frac_correct_folds[key], axis=0) for key in frac_correct_folds}
scores = np.array(frac_correct_folds['total'])
score = scores.mean(axis=1).mean()
score_std = scores.mean(axis=1).std()
    
    
In [11]:
    
reco_frac, reco_frac_stat_err = get_frac_correct(sim_train, sim_test, pipeline, comp_list)
# step_x = energybins.log_energy_midpoints
# step_x = np.append(step_x[0]-energybins.log_energy_bin_width/2, step_x)
# step_x = np.append(step_x, step_x[-1]+energybins.log_energy_bin_width/2)
    
In [12]:
    
# Plot fraction of events correctlt classified vs energy
fig, ax = plt.subplots()
for composition in comp_list + ['total']:
    err = np.sqrt(frac_correct_gen_err[composition]**2 + reco_frac_stat_err[composition]**2)
    plotting.plot_steps(energybins.log_energy_midpoints, reco_frac[composition], err, ax,
                        color_dict[composition], composition)
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('Fraction correctly identified')
ax.set_ylim([0.0, 1.0])
ax.set_xlim([energybins.log_energy_min, energybins.log_energy_max])
ax.grid()
leg = plt.legend(loc='upper center', frameon=False,
          bbox_to_anchor=(0.5,  # horizontal
                          1.1),# vertical 
          ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
cv_str = 'Accuracy: {:0.2f}\% (+/- {:0.1f}\%)'.format(score*100, score_std*100)
ax.text(7.4, 0.2, cv_str,
        ha="center", va="center", size=10,
        bbox=dict(boxstyle='round', fc="white", ec="gray", lw=0.8))
plt.savefig('/home/jbourbeau/public_html/figures/frac-correct.png')
plt.show()
    
    
    
[ back to top ]
In [22]:
    
def get_num_comp_reco(train, test, pipeline, comp_list):
    
    assert isinstance(train, comp.analysis.DataSet), 'train dataset must be a DataSet'
    assert isinstance(test, comp.analysis.DataSet), 'test dataset must be a DataSet'
    assert train.y is not None, 'train must have true y values'
    
    pipeline.fit(train.X, train.y)
    test_predictions = pipeline.predict(test.X)
    # Get number of correctly identified comp in each reco energy bin
    num_reco_energy, num_reco_energy_err = {}, {}
    for composition in comp_list:
        print('composition = {}'.format(composition))
        comp_mask = train.le.inverse_transform(test_predictions) == composition
        print('sum(comp_mask) = {}'.format(np.sum(comp_mask)))
        print(test.log_energy[comp_mask])
        num_reco_energy[composition] = np.histogram(test.log_energy[comp_mask],
            bins=energybins.log_energy_bins)[0]
        num_reco_energy_err[composition] = np.sqrt(num_reco_energy[composition])
    num_reco_energy['total'] = np.histogram(test.log_energy, bins=energybins.log_energy_bins)[0]
    num_reco_energy_err['total'] = np.sqrt(num_reco_energy['total'])
    
    return num_reco_energy, num_reco_energy_err
    
In [23]:
    
df_sim = comp.load_dataframe(datatype='sim', config='IC79')
    
    
In [24]:
    
# Get number of events per energy bin
num_reco_energy, num_reco_energy_err = get_num_comp_reco(sim_train, data, pipeline, comp_list)
import pprint
pprint.pprint(num_reco_energy)
# Solid angle
solid_angle = 2*np.pi*(1-np.cos(np.arccos(0.8)))
    
    
In [25]:
    
print(num_reco_energy['light'].sum())
print(num_reco_energy['heavy'].sum())
frac_light = num_reco_energy['light'].sum()/num_reco_energy['total'].sum()
print(frac_light)
    
    
In [27]:
    
# Live-time information
goodrunlist = pd.read_table('/data/ana/CosmicRay/IceTop_GRL/IC79_2010_GoodRunInfo_4IceTop.txt', skiprows=[0, 3])
goodrunlist.head()
    
    Out[27]:
In [28]:
    
livetimes = goodrunlist['LiveTime(s)']
livetime = np.sum(livetimes[goodrunlist['Good_it_L2'] == 1])
print('livetime (seconds) = {}'.format(livetime))
print('livetime (days) = {}'.format(livetime/(24*60*60)))
    
    
In [30]:
    
fig, ax = plt.subplots()
for composition in comp_list + ['total']:
    # Calculate dN/dE
    y = num_reco_energy[composition]
    y_err = num_reco_energy_err[composition]
    # Add time duration
    y = y / livetime
    y_err = y / livetime
#     ax.errorbar(log_energy_midpoints, y, yerr=y_err,
#                 color=color_dict[composition], label=composition,
#                 marker='.', linestyle='None')
    plotting.plot_steps(energybins.log_energy_midpoints, y, y_err,
                        ax, color_dict[composition], composition)
ax.set_yscale("log", nonposy='clip')
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('Rate [s$^{-1}$]')
ax.set_xlim([6.2, 8.0])
# ax.set_ylim([10**2, 10**5])
ax.grid(linestyle=':')
leg = plt.legend(loc='upper center', frameon=False,
          bbox_to_anchor=(0.5,  # horizontal
                          1.1),# vertical 
          ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
    
plt.show()
    
    
In [29]:
    
eff_area, eff_area_error, energy_midpoints = comp.analysis.get_effective_area(df_sim, energybins.energy_bins)
    
    
In [40]:
    
# Plot fraction of events vs energy
# fig, ax = plt.subplots(figsize=(8, 6))
fig = plt.figure()
ax = plt.gca()
for composition in comp_list + ['total']:
    # Calculate dN/dE
    y = num_reco_energy[composition]/energybins.energy_bin_widths
    y_err = num_reco_energy_err[composition]/energybins.energy_bin_widths
    # Add effective area
    y, y_err = comp.analysis.ratio_error(y, y_err, eff_area, eff_area_error)
    # Add solid angle
    y = y / solid_angle
    y_err = y_err / solid_angle
    # Add time duration
    y = y / livetime
    y_err = y / livetime
    # Add energy scaling 
#     energy_err = get_energy_res(df_sim, energy_bins)
#     energy_err = np.array(energy_err)
#     print(10**energy_err)
    y = energybins.energy_midpoints**2.7 * y
    y_err = energybins.energy_midpoints**2.7 * y_err
#     print(y)
#     print(y_err)
#     ax.errorbar(log_energy_midpoints, y, yerr=y_err, label=composition, color=color_dict[composition],
#            marker='.', markersize=8)
    plotting.plot_steps(energybins.log_energy_midpoints, y, y_err, ax, color_dict[composition], composition)
ax.set_yscale("log", nonposy='clip')
# ax.set_xscale("log", nonposy='clip')
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('$\mathrm{E}^{2.7} \\frac{\mathrm{dN}}{\mathrm{dE dA d\Omega dt}} \ [\mathrm{GeV}^{1.7} \mathrm{m}^{-2} \mathrm{sr}^{-1} \mathrm{s}^{-1}]$')
ax.set_xlim([6.3, 8])
ax.set_ylim([10**3, 10**5])
ax.grid(linestyle='dotted', which="both")
    
# Add 3-year scraped flux
df_proton = pd.read_csv('3yearscraped/proton', sep='\t', header=None, names=['energy', 'flux'])
df_helium = pd.read_csv('3yearscraped/helium', sep='\t', header=None, names=['energy', 'flux'])
df_light = pd.DataFrame.from_dict({'energy': df_proton.energy, 
                                  'flux': df_proton.flux + df_helium.flux})
df_oxygen = pd.read_csv('3yearscraped/oxygen', sep='\t', header=None, names=['energy', 'flux'])
df_iron = pd.read_csv('3yearscraped/iron', sep='\t', header=None, names=['energy', 'flux'])
df_heavy = pd.DataFrame.from_dict({'energy': df_oxygen.energy, 
                                  'flux': df_oxygen.flux + df_iron.flux})
if comp_class:
    ax.plot(np.log10(df_light.energy), df_light.flux, label='3 yr light',
            marker='.', ls=':')
    ax.plot(np.log10(df_heavy.energy), df_heavy.flux, label='3 yr heavy',
            marker='.', ls=':')
    ax.plot(np.log10(df_heavy.energy), df_heavy.flux+df_light.flux, label='3 yr total',
            marker='.', ls=':')
else:
    ax.plot(np.log10(df_proton.energy), df_proton.flux, label='3 yr proton',
            marker='.', ls=':')
    ax.plot(np.log10(df_helium.energy), df_helium.flux, label='3 yr helium',
            marker='.', ls=':', color=color_dict['He'])
    ax.plot(np.log10(df_oxygen.energy), df_oxygen.flux, label='3 yr oxygen',
            marker='.', ls=':', color=color_dict['O'])
    ax.plot(np.log10(df_iron.energy), df_iron.flux, label='3 yr iron',
        marker='.', ls=':', color=color_dict['Fe'])
    ax.plot(np.log10(df_iron.energy), df_proton.flux+df_helium.flux+df_oxygen.flux+df_iron.flux, label='3 yr total',
    marker='.', ls=':', color='C2')
leg = plt.legend(loc='upper center', frameon=False,
          bbox_to_anchor=(0.5,  # horizontal
                          1.15),# vertical 
          ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
plt.savefig('/home/jbourbeau/public_html/figures/spectrum.png')
plt.show()
    
    
In [31]:
    
if not comp_class:
    # Add 3-year scraped flux
    df_proton = pd.read_csv('3yearscraped/proton', sep='\t', header=None, names=['energy', 'flux'])
    df_helium = pd.read_csv('3yearscraped/helium', sep='\t', header=None, names=['energy', 'flux'])
    df_oxygen = pd.read_csv('3yearscraped/oxygen', sep='\t', header=None, names=['energy', 'flux'])
    df_iron = pd.read_csv('3yearscraped/iron', sep='\t', header=None, names=['energy', 'flux'])
    # Plot fraction of events vs energy
    fig, axarr = plt.subplots(2, 2, figsize=(8, 6))
    for composition, ax in zip(comp_list + ['total'], axarr.flatten()):
        # Calculate dN/dE
        y = num_reco_energy[composition]/energybins.energy_bin_widths
        y_err = num_reco_energy_err[composition]/energybins.energy_bin_widths
        # Add effective area
        y, y_err = comp.analysis.ratio_error(y, y_err, eff_area, eff_area_error)
        # Add solid angle
        y = y / solid_angle
        y_err = y_err / solid_angle
        # Add time duration
        y = y / livetime
        y_err = y / livetime
        y = energybins.energy_midpoints**2.7 * y
        y_err = energybins.energy_midpoints**2.7 * y_err
        plotting.plot_steps(energybins.log_energy_midpoints, y, y_err, ax, color_dict[composition], composition)
        # Load 3-year flux
        df_3yr = pd.read_csv('3yearscraped/{}'.format(composition), sep='\t',
                             header=None, names=['energy', 'flux'])
        ax.plot(np.log10(df_3yr.energy), df_3yr.flux, label='3 yr {}'.format(composition),
                        marker='.', ls=':', color=color_dict[composition])
        ax.set_yscale("log", nonposy='clip')
        # ax.set_xscale("log", nonposy='clip')
        ax.set_xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
        ax.set_ylabel('$\mathrm{E}^{2.7} \\frac{\mathrm{dN}}{\mathrm{dE dA d\Omega dt}} \ [\mathrm{GeV}^{1.7} \mathrm{m}^{-2} \mathrm{sr}^{-1} \mathrm{s}^{-1}]$')
        ax.set_xlim([6.3, 8])
        ax.set_ylim([10**3, 10**5])
        ax.grid(linestyle='dotted', which="both")
        ax.legend()
    plt.savefig('/home/jbourbeau/public_html/figures/spectrum.png')
    plt.show()
    
    
[ back to top ]
In [20]:
    
reco_frac['light']
    
    Out[20]:
In [21]:
    
reco_frac['heavy']
    
    Out[21]:
In [22]:
    
num_reco_energy['light']
    
    Out[22]:
In [23]:
    
num_reco_energy['heavy']
    
    Out[23]:
In [9]:
    
pipeline.fit(sim_train.X, sim_train.y)
test_predictions = pipeline.predict(sim_test.X)
true_comp = sim_train.le.inverse_transform(sim_train.y)
pred_comp = sim_train.le.inverse_transform(test_predictions)
print(true_comp)
print(pred_comp)
    
    
In [10]:
    
unfolder = comp.analysis.Unfolder()
unfolder.unfold(sim_test.y, test_predictions, priors=[0.5, 0.5],
               labels=sim_train.le.transform(comp_list))
    
    Out[10]:
In [25]:
    
bin_idxs = np.digitize(sim_test.log_energy, log_energy_bins) - 1
energy_bin_idx = np.unique(bin_idxs)
energy_bin_idx = energy_bin_idx[1:]
print(energy_bin_idx)
num_reco_energy_unfolded = defaultdict(list)
for bin_idx in energy_bin_idx:
    energy_bin_mask = bin_idxs == bin_idx
    unfolder.unfold()
    confmat = confusion_matrix(true_comp[energy_bin_mask], pred_comp[energy_bin_mask], labels=comp_list)
    confmat = np.divide(confmat.T, confmat.sum(axis=1, dtype=float)).T
    inv_confmat = np.linalg.inv(confmat)
    counts = np.array([num_reco_energy[composition][bin_idx] for composition in comp_list])
    unfolded_counts = np.dot(inv_confmat, counts)
#     unfolded_counts[unfolded_counts < 0] = 0
    num_reco_energy_unfolded['light'].append(unfolded_counts[0])
    num_reco_energy_unfolded['heavy'].append(unfolded_counts[1])
    num_reco_energy_unfolded['total'].append(unfolded_counts.sum())
print(num_reco_energy_unfolded)
    
    
In [26]:
    
unfolded_counts.sum()
    
    Out[26]:
In [27]:
    
fig, ax = plt.subplots()
for composition in comp_list + ['total']:
    # Calculate dN/dE
    y = num_reco_energy_unfolded[composition]/energy_bin_widths
    y_err = np.sqrt(y)/energy_bin_widths
    # Add effective area
    y, y_err = comp.analysis.ratio_error(y, y_err, eff_area, eff_area_error)
    # Add solid angle
    y = y / solid_angle
    y_err = y_err / solid_angle
    # Add time duration
    y = y / livetime
    y_err = y / livetime
    # Add energy scaling 
#     energy_err = get_energy_res(df_sim, energy_bins)
#     energy_err = np.array(energy_err)
#     print(10**energy_err)
    y = energy_midpoints**2.7 * y
    y_err = energy_midpoints**2.7 * y_err
    print(y)
    print(y_err)
#     ax.errorbar(log_energy_midpoints, y, yerr=y_err, label=composition, color=color_dict[composition],
#            marker='.', markersize=8)
    plotting.plot_steps(log_energy_midpoints, y, y_err, ax, color_dict[composition], composition)
ax.set_yscale("log", nonposy='clip')
# ax.set_xscale("log", nonposy='clip')
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('$\mathrm{E}^{2.7} \\frac{\mathrm{dN}}{\mathrm{dE dA d\Omega dt}} \ [\mathrm{GeV}^{1.7} \mathrm{m}^{-2} \mathrm{sr}^{-1} \mathrm{s}^{-1}]$')
ax.set_xlim([6.3, 8])
ax.set_ylim([10**3, 10**5])
ax.grid(linestyle='dotted', which="both")
leg = plt.legend(loc='upper center', frameon=False,
          bbox_to_anchor=(0.5,  # horizontal
                          1.1),# vertical 
          ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
    
# plt.savefig('/home/jbourbeau/public_html/figures/spectrum.png')
plt.show()
    
    
    
    
Get confusion matrix for each energy bin
In [99]:
    
bin_idxs = np.digitize(energy_test_sim, log_energy_bins) - 1
energy_bin_idx = np.unique(bin_idxs)
energy_bin_idx = energy_bin_idx[1:]
print(energy_bin_idx)
num_reco_energy_unfolded = defaultdict(list)
response_mat = []
for bin_idx in energy_bin_idx:
    energy_bin_mask = bin_idxs == bin_idx
    confmat = confusion_matrix(true_comp[energy_bin_mask], pred_comp[energy_bin_mask], labels=comp_list)
    confmat = np.divide(confmat.T, confmat.sum(axis=1, dtype=float)).T
    response_mat.append(confmat)
    
    
In [100]:
    
response_mat
    
    Out[100]:
In [134]:
    
r = np.dstack((np.copy(num_reco_energy['light']), np.copy(num_reco_energy['heavy'])))[0]
for unfold_iter in range(50):
    print('Unfolding iteration {}...'.format(unfold_iter))
    if unfold_iter == 0:
        u = r
    fs = []
    for bin_idx in energy_bin_idx:
#         print(u)
        f = np.dot(response_mat[bin_idx], u[bin_idx])
        f[f < 0] = 0
        fs.append(f)
#         print(f)
    u = u + (r - fs)
#     u[u < 0] = 0
#     print(u)
unfolded_counts_iter = {}
unfolded_counts_iter['light'] = u[:,0]
unfolded_counts_iter['heavy'] = u[:,1]
unfolded_counts_iter['total'] = u.sum(axis=1)
print(unfolded_counts_iter)
    
    
In [135]:
    
fig, ax = plt.subplots()
for composition in comp_list + ['total']:
    # Calculate dN/dE
    y = unfolded_counts_iter[composition]/energy_bin_widths
    y_err = np.sqrt(y)/energy_bin_widths
    # Add effective area
    y, y_err = comp.analysis.ratio_error(y, y_err, eff_area, eff_area_error)
    # Add solid angle
    y = y / solid_angle
    y_err = y_err / solid_angle
    # Add time duration
    y = y / livetime
    y_err = y / livetime
    # Add energy scaling 
#     energy_err = get_energy_res(df_sim, energy_bins)
#     energy_err = np.array(energy_err)
#     print(10**energy_err)
    y = energy_midpoints**2.7 * y
    y_err = energy_midpoints**2.7 * y_err
    print(y)
    print(y_err)
#     ax.errorbar(log_energy_midpoints, y, yerr=y_err, label=composition, color=color_dict[composition],
#            marker='.', markersize=8)
    plotting.plot_steps(log_energy_midpoints, y, y_err, ax, color_dict[composition], composition)
ax.set_yscale("log", nonposy='clip')
# ax.set_xscale("log", nonposy='clip')
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('$\mathrm{E}^{2.7} \\frac{\mathrm{dN}}{\mathrm{dE dA d\Omega dt}} \ [\mathrm{GeV}^{1.7} \mathrm{m}^{-2} \mathrm{sr}^{-1} \mathrm{s}^{-1}]$')
ax.set_xlim([6.3, 8])
ax.set_ylim([10**3, 10**5])
ax.grid(linestyle='dotted', which="both")
leg = plt.legend(loc='upper center', frameon=False,
          bbox_to_anchor=(0.5,  # horizontal
                          1.1),# vertical 
          ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
    
# plt.savefig('/home/jbourbeau/public_html/figures/spectrum.png')
plt.show()
    
    
    
    
In [106]:
    
print(num_reco_energy)
    
    
In [107]:
    
comp_list = ['light', 'heavy']
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train_sim, y_train_sim)
test_predictions = pipeline.predict(X_test_sim)
# correctly_identified_mask = (test_predictions == y_test)
# confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)/len(y_pred)
true_comp = le.inverse_transform(y_test_sim)
pred_comp = le.inverse_transform(test_predictions)
confmat = confusion_matrix(true_comp, pred_comp, labels=comp_list)
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Greens):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    plt.imshow(cm, interpolation='None', cmap=cmap,
               vmin=0, vmax=1.0)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, '{:0.3f}'.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True composition')
    plt.xlabel('Predicted composition')
fig, ax = plt.subplots()
plot_confusion_matrix(confmat, classes=['light', 'heavy'], normalize=True,
                      title='Confusion matrix, without normalization')
# # Plot normalized confusion matrix
# plt.figure()
# plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
#                       title='Normalized confusion matrix')
plt.show()
    
    
    
In [63]:
    
comp_list = ['light', 'heavy']
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train_sim, y_train_sim)
test_predictions = pipeline.predict(X_test_sim)
# correctly_identified_mask = (test_predictions == y_test)
# confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)/len(y_pred)
true_comp = le.inverse_transform(y_test_sim)
pred_comp = le.inverse_transform(test_predictions)
confmat = confusion_matrix(true_comp, pred_comp, labels=comp_list)
inverse = np.linalg.inv(confmat)
inverse
    
    Out[63]:
In [64]:
    
confmat
    
    Out[64]:
In [66]:
    
comp_list = ['light', 'heavy']
# Get number of events per energy bin
num_reco_energy, num_reco_energy_err = get_num_comp_reco(X_train_sim, y_train_sim, X_test_data, comp_list)
# Energy-related variables
energy_bin_width = 0.1
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
energy_midpoints = (energy_bins[1:] + energy_bins[:-1]) / 2
energy_bin_widths = 10**energy_bins[1:] - 10**energy_bins[:-1]
def get_energy_res(df_sim, energy_bins):
    reco_log_energy = df_sim['lap_log_energy'].values 
    MC_log_energy = df_sim['MC_log_energy'].values
    energy_res = reco_log_energy - MC_log_energy
    bin_centers, bin_medians, energy_err = comp.analysis.data_functions.get_medians(reco_log_energy,
                                                                               energy_res,
                                                                               energy_bins)
    return np.abs(bin_medians)
# Solid angle
solid_angle = 2*np.pi*(1-np.cos(np.arccos(0.85)))
# solid_angle = 2*np.pi*(1-np.cos(40*(np.pi/180)))
print(solid_angle)
print(2*np.pi*(1-np.cos(40*(np.pi/180))))
# Live-time information
start_time = np.amin(df_data['start_time_mjd'].values)
end_time = np.amax(df_data['end_time_mjd'].values)
day_to_sec = 24 * 60 * 60.
dt = day_to_sec * (end_time - start_time)
print(dt)
# Plot fraction of events vs energy
fig, ax = plt.subplots()
for i, composition in enumerate(comp_list):
    num_reco_bin = np.array([[i, j] for i, j in zip(num_reco_energy['light'], num_reco_energy['heavy'])])
#     print(num_reco_bin)
    num_reco = np.array([np.dot(inverse, i) for i in num_reco_bin])
    print(num_reco)
    num_reco_2 = {'light': num_reco[:, 0], 'heavy': num_reco[:, 1]}
    # Calculate dN/dE
    y = num_reco_2[composition]/energy_bin_widths
    y_err = num_reco_energy_err[composition]/energy_bin_widths
    # Add effective area
    y, y_err = comp.analysis.ratio_error(y, y_err, eff_area, eff_area_error)
    # Add solid angle
    y = y / solid_angle
    y_err = y_err / solid_angle
    # Add time duration
    y = y / dt
    y_err = y / dt
    # Add energy scaling 
    energy_err = get_energy_res(df_sim, energy_bins)
    energy_err = np.array(energy_err)
#     print(10**energy_err)
    y = (10**energy_midpoints)**2.7 * y
    y_err = (10**energy_midpoints)**2.7 * y_err
    plotting.plot_steps(energy_midpoints, y, y_err, ax, color_dict[composition], composition)
ax.set_yscale("log", nonposy='clip')
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('$\mathrm{E}^{2.7} \\frac{\mathrm{dN}}{\mathrm{dE dA d\Omega dt}} \ [\mathrm{GeV}^{1.7} \mathrm{m}^{-2} \mathrm{sr}^{-1} \mathrm{s}^{-1}]$')
ax.set_xlim([6.2, 8.0])
# ax.set_ylim([10**2, 10**5])
ax.grid()
leg = plt.legend(loc='upper center', 
          bbox_to_anchor=(0.5,  # horizontal
                          1.1),# vertical 
          ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
    
plt.show()
    
    
    
In [44]:
    
pipeline.get_params()['classifier__max_depth']
    
    Out[44]:
In [47]:
    
energy_bin_width = 0.1
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
fig, axarr = plt.subplots(1, 2)
for composition, ax in zip(comp_list, axarr.flatten()):
    MC_comp_mask = (df_sim['MC_comp_class'] == composition)
    MC_log_energy = df_sim['MC_log_energy'][MC_comp_mask].values
    reco_log_energy = df_sim['lap_log_energy'][MC_comp_mask].values
    plotting.histogram_2D(MC_log_energy, reco_log_energy, energy_bins, log_counts=True, ax=ax)
    ax.plot([0,10], [0,10], marker='None', linestyle='-.')
    ax.set_xlim([6.2, 8])
    ax.set_ylim([6.2, 8])
    ax.set_xlabel('$\log_{10}(E_{\mathrm{MC}}/\mathrm{GeV})$')
    ax.set_ylabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
    ax.set_title('{} response matrix'.format(composition))
plt.tight_layout()
plt.show()
    
    
In [10]:
    
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
10**energy_bins[1:] - 10**energy_bins[:-1]
    
    Out[10]:
In [ ]:
    
probs = pipeline.named_steps['classifier'].predict_proba(X_test)
prob_1 = probs[:, 0][MC_iron_mask]
prob_2 = probs[:, 1][MC_iron_mask]
# print(min(prob_1-prob_2))
# print(max(prob_1-prob_2))
# plt.hist(prob_1-prob_2, bins=30, log=True)
plt.hist(prob_1, bins=np.linspace(0, 1, 50), log=True)
plt.hist(prob_2, bins=np.linspace(0, 1, 50), log=True)
    
In [ ]:
    
probs = pipeline.named_steps['classifier'].predict_proba(X_test)
dp1 = (probs[:, 0]-probs[:, 1])[MC_proton_mask]
print(min(dp1))
print(max(dp1))
dp2 = (probs[:, 0]-probs[:, 1])[MC_iron_mask]
print(min(dp2))
print(max(dp2))
fig, ax = plt.subplots()
# plt.hist(prob_1-prob_2, bins=30, log=True)
counts, edges, pathes = plt.hist(dp1, bins=np.linspace(-1, 1, 100), log=True, label='Proton', alpha=0.75)
counts, edges, pathes = plt.hist(dp2, bins=np.linspace(-1, 1, 100), log=True, label='Iron', alpha=0.75)
plt.legend(loc=2)
plt.show()
pipeline.named_steps['classifier'].classes_
    
In [ ]:
    
print(pipeline.named_steps['classifier'].classes_)
le.inverse_transform(pipeline.named_steps['classifier'].classes_)
    
In [ ]:
    
pipeline.named_steps['classifier'].decision_path(X_test)
    
In [48]:
    
comp_list = ['light', 'heavy']
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train_sim, y_train_sim)
# test_probs = defaultdict(list)
fig, ax = plt.subplots()
test_predictions = pipeline.predict(X_test_data)
test_probs = pipeline.predict_proba(X_test_data)
for class_ in pipeline.classes_:
    test_predictions == le.inverse_transform(class_)
    plt.hist(test_probs[:, class_], bins=np.linspace(0, 1, 50),
             histtype='step', label=composition,
             color=color_dict[composition], alpha=0.8, log=True)
plt.ylabel('Counts')
plt.xlabel('Testing set class probabilities')
plt.legend()
plt.grid()
plt.show()
    
    
In [5]:
    
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train, y_train)
test_predictions = pipeline.predict(X_test)
comp_list = ['P', 'He', 'O', 'Fe']
fig, ax = plt.subplots()
test_probs = pipeline.predict_proba(X_test)
fig, axarr = plt.subplots(2, 2, sharex=True, sharey=True)
for composition, ax in zip(comp_list, axarr.flatten()):
    comp_mask = (le.inverse_transform(y_test) == composition)
    probs = np.copy(test_probs[comp_mask])
    print('probs = {}'.format(probs.shape))
    weighted_mass = np.zeros(len(probs))
    for class_ in pipeline.classes_:
        c = le.inverse_transform(class_)
        weighted_mass += comp.simfunctions.comp2mass(c) * probs[:, class_]
    print('min = {}'.format(min(weighted_mass)))
    print('max = {}'.format(max(weighted_mass)))
    ax.hist(weighted_mass, bins=np.linspace(0, 5, 100),
             histtype='step', label=None, color='darkgray',
             alpha=1.0, log=False)
    for c in comp_list:
        ax.axvline(comp.simfunctions.comp2mass(c), color=color_dict[c],
                   marker='None', linestyle='-')
    ax.set_ylabel('Counts')
    ax.set_xlabel('Weighted atomic number')
    ax.set_title('MC {}'.format(composition))
    ax.grid()
plt.tight_layout()
plt.show()
    
    
    
    
In [15]:
    
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train, y_train)
test_predictions = pipeline.predict(X_test)
comp_list = ['P', 'He', 'O', 'Fe']
fig, ax = plt.subplots()
test_probs = pipeline.predict_proba(X_test)
fig, axarr = plt.subplots(2, 2, sharex=True, sharey=True)
for composition, ax in zip(comp_list, axarr.flatten()):
    comp_mask = (le.inverse_transform(y_test) == composition)
    probs = np.copy(test_probs[comp_mask])
    weighted_mass = np.zeros(len(probs))
    for class_ in pipeline.classes_:
        c = le.inverse_transform(class_)
        ax.hist(probs[:, class_], bins=np.linspace(0, 1, 50),
                 histtype='step', label=c, color=color_dict[c],
                 alpha=1.0, log=True)
    ax.legend(title='Reco comp', framealpha=0.5)
    ax.set_ylabel('Counts')
    ax.set_xlabel('Testing set class probabilities')
    ax.set_title('MC {}'.format(composition))
    ax.grid()
plt.tight_layout()
plt.show()
    
    
    
In [25]:
    
comp_list = ['light', 'heavy']
test_probs = defaultdict(list)
fig, ax = plt.subplots()
# test_probs = pipeline.predict_proba(X_test)
for event in pipeline.predict_proba(X_test_data):
    composition = le.inverse_transform(np.argmax(event))
    test_probs[composition].append(np.amax(event))
for composition in comp_list:
    plt.hist(test_probs[composition], bins=np.linspace(0, 1, 100),
             histtype='step', label=composition,
             color=color_dict[composition], alpha=0.8, log=False)
plt.ylabel('Counts')
plt.xlabel('Testing set class probabilities')
plt.legend(title='Reco comp')
plt.grid()
plt.show()
    
    
    
In [ ]:
    
    
In [ ]: