In [1]:
from __future__ import division, print_function
import os
from collections import defaultdict
import numpy as np
from scipy import interp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn.apionly as sns
from mlxtend.plotting import plot_decision_regions
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
import comptools as comp
import comptools.analysis.plotting as plotting
# color_dict allows for a consistent color-coding for each composition
color_dict = comp.analysis.get_color_dict()
%matplotlib inline
In [2]:
df_sim_train, df_sim_test = comp.load_sim(config='IC86.2012')
feature_list, feature_labels = comp.analysis.get_training_features()
In [3]:
comp_class = True
target = 'MC_comp_class' if comp_class else 'MC_comp'
comp_list = ['light', 'heavy'] if comp_class else ['P', 'He', 'O', 'Fe']
In [4]:
pipeline_str = 'BDT'
pipeline = comp.get_pipeline(pipeline_str)
In [5]:
df_sim_train.MC_comp.value_counts()
Out[5]:
In [6]:
energybins = comp.analysis.get_energybins()
In [11]:
cos_zenith_min = 0.85
cos_zenith_max = 0.9
cos_zenith_mask = np.logical_and(df_sim_train['lap_cos_zenith'] <= cos_zenith_max,
df_sim_train['lap_cos_zenith'] >= cos_zenith_min)
cos_zenith_mask.sum()
Out[11]:
In [115]:
training_comps = ['PPlus', 'Fe56Nucleus']
comp_to_label_dict = {'PPlus': 0, 'Fe56Nucleus': 1}
cmaps = ['Blues', 'Oranges']
def comp_to_label(composition):
try:
return comp_to_label_dict[composition]
except KeyError:
raise KeyError('Incorrect composition ({}) entered'.format(composition))
# Get restricted training DataFrame
comps_mask = df_sim_train['MC_comp'].isin(training_comps)
df_train = df_sim_train.loc[comps_mask].copy()
df_train['target'] = df_train['MC_comp'].apply(comp_to_label)
# Fit pipeline with training data for this fold
pipeline = comp.get_pipeline(pipeline_str)
# pipeline.named_steps['classifier'].set_params(loss='deviance')
pipeline.fit(df_train[feature_list], df_train['target'])
fig, ax = plt.subplots()
for MC_comp, cmap in zip(training_comps, cmaps):
comp_mask = df_train['MC_comp'] == MC_comp
sns.kdeplot(df_train.loc[cos_zenith_mask & comp_mask, 'log_s125'],
df_train.loc[cos_zenith_mask & comp_mask, 'log_dEdX'],
n_levels=10, cmap=cmap, label=MC_comp, alpha=0.5, ax=ax)
# Plot decision region using mlxtend
ax = plot_decision_regions(df_train.loc[cos_zenith_mask, feature_list].values,
df_train.loc[cos_zenith_mask, 'target'].values,
clf=pipeline, feature_index=[1, 2],
filler_feature_values={0: np.mean([cos_zenith_min, cos_zenith_max])},
res=0.02, legend=1, colors='C0,C1', hide_spines=False,
ax=ax)
ax.set_ylabel('$\mathrm{\\log_{10}(dE/dX)}$')
ax.set_xlabel('$\mathrm{\\log_{10}(S_{125})}$')
zenith_range_str = '{:0.2f} '.format(cos_zenith_min) + \
'$\mathrm{ \leq \cos(\\theta_{reco}) \leq }$' + \
' {:0.2f}'.format(cos_zenith_max)
ax.text(-0.3, 3.05, zenith_range_str, fontsize=14)
ax.set_xlim(-0.5, 2.5)
ax.set_ylim(0, 3.5)
ax.grid()
ax.legend(loc='upper center')
decision_region_outfile = os.path.join(comp.paths.figures_dir, 'decision_regions', 'P-Fe-only.png')
comp.check_output_dir(decision_region_outfile)
plt.savefig(decision_region_outfile)
plt.show()
In [114]:
training_comps = ['O16Nucleus', 'Fe56Nucleus']
comp_to_label_dict = {'O16Nucleus': 0, 'Fe56Nucleus': 1}
cmaps = ['Reds', 'Oranges']
def comp_to_label(composition):
try:
return comp_to_label_dict[composition]
except KeyError:
raise KeyError('Incorrect composition ({}) entered'.format(composition))
# Get restricted training DataFrame
comps_mask = df_sim_train['MC_comp'].isin(training_comps)
df_train = df_sim_train.loc[comps_mask].copy()
df_train['target'] = df_train['MC_comp'].apply(comp_to_label)
# Fit pipeline with training data for this fold
pipeline = comp.get_pipeline(pipeline_str)
# pipeline.named_steps['classifier'].set_params(loss='deviance')
pipeline.fit(df_train[feature_list], df_train['target'])
fig, ax = plt.subplots()
for MC_comp, cmap in zip(training_comps, cmaps):
comp_mask = df_train['MC_comp'] == MC_comp
sns.kdeplot(df_train.loc[cos_zenith_mask & comp_mask, 'log_s125'],
df_train.loc[cos_zenith_mask & comp_mask, 'log_dEdX'],
n_levels=10, cmap=cmap, label=MC_comp, alpha=0.5, ax=ax)
# Plot decision region using mlxtend
ax = plot_decision_regions(df_train.loc[cos_zenith_mask, feature_list].values,
df_train.loc[cos_zenith_mask, 'target'].values,
clf=pipeline, feature_index=[1, 2],
filler_feature_values={0: np.mean([cos_zenith_min, cos_zenith_max])},
res=0.02, legend=1, colors='C3,C1', hide_spines=False,
ax=ax)
ax.set_ylabel('$\mathrm{\\log_{10}(dE/dX)}$')
ax.set_xlabel('$\mathrm{\\log_{10}(S_{125})}$')
zenith_range_str = '{:0.2f} '.format(cos_zenith_min) + \
'$\mathrm{ \leq \cos(\\theta_{reco}) \leq }$' + \
' {:0.2f}'.format(cos_zenith_max)
ax.text(-0.3, 3.05, zenith_range_str, fontsize=14)
ax.set_xlim(-0.5, 2.5)
ax.set_ylim(0, 3.5)
ax.grid()
ax.legend(loc='upper center')
decision_region_outfile = os.path.join(comp.paths.figures_dir, 'decision_regions', 'O-Fe-only.png')
comp.check_output_dir(decision_region_outfile)
plt.savefig(decision_region_outfile)
plt.show()
In [116]:
training_comps = ['PPlus', 'Fe56Nucleus', 'O16Nucleus']
comp_to_label_dict = {'PPlus': 0, 'Fe56Nucleus': 1, 'O16Nucleus':2}
cmaps = ['Blues', 'Oranges', 'Reds']
def comp_to_label(composition):
try:
return comp_to_label_dict[composition]
except KeyError:
raise KeyError('Incorrect composition ({}) entered'.format(composition))
# Get restricted training DataFrame
comps_mask = df_sim_train['MC_comp'].isin(training_comps)
df_train = df_sim_train.loc[comps_mask].copy()
df_train['target'] = df_train['MC_comp'].apply(comp_to_label)
# Fit pipeline with training data for this fold
pipeline = comp.get_pipeline(pipeline_str)
pipeline.named_steps['classifier'].set_params(loss='deviance')
pipeline.fit(df_train[feature_list], df_train['target'])
fig, ax = plt.subplots()
for MC_comp, cmap in zip(training_comps, cmaps):
comp_mask = df_train['MC_comp'] == MC_comp
sns.kdeplot(df_train.loc[cos_zenith_mask & comp_mask, 'log_s125'],
df_train.loc[cos_zenith_mask & comp_mask, 'log_dEdX'],
n_levels=10, cmap=cmap, label=MC_comp, alpha=0.5, ax=ax)
# # Plot decision region using mlxtend
# ax = plot_decision_regions(df_train.loc[cos_zenith_mask, feature_list].values,
# df_train.loc[cos_zenith_mask, 'target'].values,
# clf=pipeline, feature_index=[1, 2],
# filler_feature_values={0: np.mean([cos_zenith_min, cos_zenith_max])},
# res=0.02, legend=1, colors='C0,C1,C3', hide_spines=False,
# ax=ax)
ax.set_ylabel('$\mathrm{\\log_{10}(dE/dX)}$')
ax.set_xlabel('$\mathrm{\\log_{10}(S_{125})}$')
zenith_range_str = '{:0.2f} '.format(cos_zenith_min) + \
'$\mathrm{ \leq \cos(\\theta_{reco}) \leq }$' + \
' {:0.2f}'.format(cos_zenith_max)
ax.text(-0.3, 3.05, zenith_range_str, fontsize=14)
ax.set_xlim(-0.5, 2.5)
ax.set_ylim(0, 3.5)
ax.grid()
ax.legend(loc='upper center')
decision_region_outfile = os.path.join(comp.paths.figures_dir, 'decision_regions', 'P-O-Fe-only.png')
comp.check_output_dir(decision_region_outfile)
plt.savefig(decision_region_outfile)
plt.show()
In [119]:
training_comps = ['PPlus', 'Fe56Nucleus', 'O16Nucleus', 'He4Nucleus']
comp_to_label_dict = {'PPlus': 0, 'Fe56Nucleus': 1, 'O16Nucleus':2, 'He4Nucleus': 3}
cmaps = ['Blues', 'Oranges', 'Reds', 'Purples']
def comp_to_label(composition):
try:
return comp_to_label_dict[composition]
except KeyError:
raise KeyError('Incorrect composition ({}) entered'.format(composition))
# Get restricted training DataFrame
comps_mask = df_sim_train['MC_comp'].isin(training_comps)
df_train = df_sim_train.loc[comps_mask].copy()
df_train['target'] = df_train['MC_comp'].apply(comp_to_label)
# Fit pipeline with training data for this fold
pipeline = comp.get_pipeline(pipeline_str)
pipeline.named_steps['classifier'].set_params(loss='deviance')
pipeline.fit(df_train[feature_list], df_train['target'])
fig, ax = plt.subplots()
for MC_comp, cmap in zip(training_comps, cmaps):
comp_mask = df_train['MC_comp'] == MC_comp
sns.kdeplot(df_train.loc[cos_zenith_mask & comp_mask, 'log_s125'],
df_train.loc[cos_zenith_mask & comp_mask, 'log_dEdX'],
n_levels=10, cmap=cmap, label=MC_comp, alpha=0.5, ax=ax)
# Plot decision region using mlxtend
ax = plot_decision_regions(df_train.loc[cos_zenith_mask, feature_list].values,
df_train.loc[cos_zenith_mask, 'target'].values,
clf=pipeline, feature_index=[1, 2],
filler_feature_values={0: np.mean([cos_zenith_min, cos_zenith_max])},
res=0.02, legend=1, colors='C0,C1,C3,C4', hide_spines=False,
ax=ax)
ax.set_ylabel('$\mathrm{\\log_{10}(dE/dX)}$')
ax.set_xlabel('$\mathrm{\\log_{10}(S_{125})}$')
zenith_range_str = '{:0.2f} '.format(cos_zenith_min) + \
'$\mathrm{ \leq \cos(\\theta_{reco}) \leq }$' + \
' {:0.2f}'.format(cos_zenith_max)
ax.text(-0.3, 3.05, zenith_range_str, fontsize=14)
ax.set_xlim(-0.5, 2.5)
ax.set_ylim(0, 3.5)
ax.grid()
ax.legend(loc='upper center')
decision_region_outfile = os.path.join(comp.paths.figures_dir, 'decision_regions', 'P-He-O-Fe.png')
comp.check_output_dir(decision_region_outfile)
plt.savefig(decision_region_outfile)
plt.show()
In [ ]:
In [ ]:
In [83]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
cv_accuracy = {MC_comp: [] for MC_comp in df_sim_train.MC_comp.unique()}
for train_index, test_index in skf.split(df_sim_train[feature_list], df_sim_train['target']):
# Get training/testing data for this fold
df_sim_train_fold = df_sim_train.iloc[train_index]
df_sim_test_fold = df_sim_train.iloc[test_index]
# Fit pipeline with training data for this fold
pipeline.fit(df_sim_train_fold[feature_list], df_sim_train_fold['target'])
# Get fraction of events classified as light for each MC composition
for MC_comp in df_sim_train_fold.MC_comp.unique():
comp_mask = df_sim_test_fold['MC_comp'] == MC_comp
pred_label = pipeline.predict(df_sim_test_fold[feature_list][comp_mask])
pred_comp = np.array([comp.label_to_comp(i) for i in pred_label])
log_enegy = df_sim_test_fold['lap_log_energy'][comp_mask].values
light_mask = pred_comp == 'light'
counts_light = np.histogram(log_enegy[light_mask], bins=energybins.log_energy_bins)[0]
counts_total = np.histogram(log_enegy, bins=energybins.log_energy_bins)[0]
frac_light = counts_light/counts_total
cv_accuracy[MC_comp].append(frac_light)
In [84]:
df_frac_light = pd.DataFrame.from_dict(cv_accuracy)
df_frac_light
Out[84]:
In [85]:
fig, ax = plt.subplots()
for MC_comp in df_frac_light.columns:
frac_light_mean = df_frac_light[MC_comp].mean(axis=0)
frac_light_std = df_frac_light[MC_comp].values.std(axis=0)
plotting.plot_steps(energybins.log_energy_bins, frac_light_mean, yerr=frac_light_std,
color=color_dict[MC_comp], label=MC_comp, ax=ax)
ax.axhline(0.5, marker='None', ls='-.', c='k', label='Guessing')
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.set_ylim(0, 1)
ax.set_ylabel('Fraction classified as light')
ax.set_xlabel('$\mathrm{\log_{10}(E_{reco}/GeV)}$')
ax.set_title(r'\underline{Class definitions}'+'\nLight: P, He | \t Heavy: O, Fe')
ax.grid()
# leg = plt.legend(loc='upper center', frameon=False,
# bbox_to_anchor=(0.5, # horizontal
# 1.25),# vertical
# ncol=3, fancybox=False)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)
outfile = os.path.join(comp.paths.figures_dir, 'fraction-classified-light_4-comp-training.png')
plt.savefig(outfile)
plt.show()
In [86]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
cv_accuracy = {MC_comp: [] for MC_comp in df_sim_train.MC_comp.unique()}
for train_index, test_index in skf.split(df_sim_train[feature_list], df_sim_train['target']):
# Get training/testing data for this fold
df_sim_train_fold = df_sim_train.iloc[train_index]
df_sim_test_fold = df_sim_train.iloc[test_index]
# Restrict to only training on Fe56Nucleus and PPlus simulation
iron_proton_mask = df_sim_train_fold['MC_comp'].isin(['Fe56Nucleus', 'PPlus']).values
# Fit pipeline with training data for this fold
pipeline.fit(df_sim_train_fold[feature_list][iron_proton_mask],
df_sim_train_fold['target'][iron_proton_mask])
# Get fraction of events classified as light for each MC composition
for MC_comp in df_sim_train_fold.MC_comp.unique():
comp_mask = df_sim_test_fold['MC_comp'] == MC_comp
pred_label = pipeline.predict(df_sim_test_fold[feature_list][comp_mask])
pred_comp = np.array([comp.label_to_comp(i) for i in pred_label])
log_enegy = df_sim_test_fold['lap_log_energy'][comp_mask].values
light_mask = pred_comp == 'light'
counts_light = np.histogram(log_enegy[light_mask], bins=energybins.log_energy_bins)[0]
counts_total = np.histogram(log_enegy, bins=energybins.log_energy_bins)[0]
frac_light = counts_light/counts_total
cv_accuracy[MC_comp].append(frac_light)
In [87]:
df_frac_light = pd.DataFrame.from_dict(cv_accuracy)
df_frac_light
Out[87]:
In [88]:
fig, ax = plt.subplots()
for MC_comp in df_frac_light.columns:
frac_light_mean = df_frac_light[MC_comp].mean(axis=0)
frac_light_std = df_frac_light[MC_comp].values.std(axis=0)
plotting.plot_steps(energybins.log_energy_bins, frac_light_mean, yerr=frac_light_std,
color=color_dict[MC_comp], label=MC_comp, ax=ax)
ax.axhline(0.5, marker='None', ls='-.', c='k', label='Guessing')
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.set_ylim(0, 1)
ax.set_ylabel('Fraction classified as light')
ax.set_xlabel('$\mathrm{\log_{10}(E_{reco}/GeV)}$')
ax.set_title(r'\underline{Class definitions}'+'\nLight: P | \t Heavy: Fe')
ax.grid()
# leg = plt.legend(loc='upper center', frameon=False,
# bbox_to_anchor=(0.5, # horizontal
# 1.25),# vertical
# ncol=3, fancybox=False)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)
outfile = os.path.join(comp.paths.figures_dir, 'fraction-classified-light_P-Fe-training-only.png')
plt.savefig(outfile)
plt.show()
In [13]:
comp_to_label_dict = {'PPlus': 0, 'Fe56Nucleus': 1}
def comp_to_label(composition):
try:
return comp_to_label_dict[composition]
except KeyError:
raise KeyError('Incorrect composition ({}) entered'.format(composition))
In [14]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
cv_correct = {MC_comp: [] for MC_comp in df_sim_train.MC_comp.unique()}
# Restrict to only training on PPlus and He4Nucleus simulation
proton_iron_mask = df_sim_train['MC_comp'].isin(['PPlus', 'Fe56Nucleus']).values
for train_index, test_index in skf.split(df_sim_train.loc[proton_iron_mask, feature_list],
df_sim_train.loc[proton_iron_mask, 'target']):
# Get training/testing data for this fold
df_sim_train_fold = df_sim_train[proton_iron_mask].iloc[train_index].copy()
df_sim_train_fold['target'] = df_sim_train_fold['MC_comp'].apply(comp_to_label)
df_sim_test_fold = df_sim_train[proton_iron_mask].iloc[test_index].copy()
df_sim_test_fold['target'] = df_sim_test_fold['MC_comp'].apply(comp_to_label)
# Fit pipeline with training data for this fold
pipeline.fit(df_sim_train_fold[feature_list], df_sim_train_fold['target'])
# Get fraction of events classified as light for each MC composition
for MC_comp in df_sim_train_fold.MC_comp.unique():
comp_mask = df_sim_test_fold['MC_comp'] == MC_comp
pred_comp = pipeline.predict(df_sim_test_fold[feature_list][comp_mask])
correctly_identified = pred_comp == df_sim_test_fold['target'][comp_mask]
log_enegy = df_sim_test_fold['lap_log_energy'][comp_mask].values
counts_correct = np.histogram(log_enegy[correctly_identified], bins=energybins.log_energy_bins)[0]
counts_total = np.histogram(log_enegy, bins=energybins.log_energy_bins)[0]
frac_correct = counts_correct/counts_total
cv_correct[MC_comp].append(frac_correct)
In [15]:
df_frac_correct = pd.DataFrame.from_dict({key: value for key, value in cv_correct.items() if len(value) != 0})
df_frac_correct
Out[15]:
In [17]:
fig, ax = plt.subplots()
for MC_comp in df_frac_correct.columns:
frac_light_mean = df_frac_correct[MC_comp].mean(axis=0)
frac_light_std = df_frac_correct[MC_comp].values.std(axis=0)
plotting.plot_steps(energybins.log_energy_bins, frac_light_mean, yerr=frac_light_std,
color=color_dict[MC_comp], label=MC_comp, ax=ax)
ax.axhline(0.5, marker='None', ls='-.', c='k', label='Guessing')
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.set_ylim(0, 1)
ax.set_ylabel('Fraction correctly classified')
ax.set_xlabel('$\mathrm{\log_{10}(E_{reco}/GeV)}$')
ax.set_title(r'\underline{Class definitions}'+'\nClass 1: P | \t Class 2: Fe')
ax.grid()
# leg = plt.legend(loc='upper center', frameon=False,
# bbox_to_anchor=(0.5, # horizontal
# 1.25),# vertical
# ncol=3, fancybox=False)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)
outfile = os.path.join(comp.paths.figures_dir, 'fraction-correct_P-light-Fe-heavy-training.png')
plt.savefig(outfile)
plt.show()
In [7]:
comp_to_label_dict = {'PPlus': 0, 'He4Nucleus': 1}
def comp_to_label(composition):
try:
return comp_to_label_dict[composition]
except KeyError:
raise KeyError('Incorrect composition ({}) entered'.format(composition))
In [ ]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
cv_accuracy = {MC_comp: [] for MC_comp in df_sim_train.MC_comp.unique()}
# Restrict to only training on PPlus and He4Nucleus simulation
proton_helium_mask = df_sim_train['MC_comp'].isin(['PPlus', 'He4Nucleus']).values
for train_index, test_index in skf.split(df_sim_train.loc[proton_helium_mask, feature_list],
df_sim_train.loc[proton_helium_mask, 'target']):
# Get training/testing data for this fold
df_sim_train_fold = df_sim_train[proton_helium_mask].iloc[train_index].copy()
df_sim_train_fold['target'] = df_sim_train_fold['MC_comp'].apply(comp_to_label)
df_sim_test_fold = df_sim_train[proton_helium_mask].iloc[test_index].copy()
df_sim_test_fold['target'] = df_sim_test_fold['MC_comp'].apply(comp_to_label)
# Fit pipeline with training data for this fold
pipeline.fit(df_sim_train_fold[feature_list], df_sim_train_fold['target'])
# Get fraction of events classified as light for each MC composition
for MC_comp in df_sim_train_fold.MC_comp.unique():
comp_mask = df_sim_test_fold['MC_comp'] == MC_comp
pred_comp = pipeline.predict(df_sim_test_fold[feature_list][comp_mask])
log_enegy = df_sim_test_fold['lap_log_energy'][comp_mask].values
light_mask = pred_comp == 0
counts_light = np.histogram(log_enegy[light_mask], bins=energybins.log_energy_bins)[0]
counts_total = np.histogram(log_enegy, bins=energybins.log_energy_bins)[0]
frac_light = counts_light/counts_total
cv_accuracy[MC_comp].append(frac_light)
In [91]:
df_frac_light = pd.DataFrame.from_dict({key: value for key, value in cv_accuracy.items() if len(value) != 0})
df_frac_light
Out[91]:
In [92]:
fig, ax = plt.subplots()
for MC_comp in df_frac_light.columns:
frac_light_mean = df_frac_light[MC_comp].mean(axis=0)
frac_light_std = df_frac_light[MC_comp].values.std(axis=0)
plotting.plot_steps(energybins.log_energy_bins, frac_light_mean, yerr=frac_light_std,
color=color_dict[MC_comp], label=MC_comp, ax=ax)
ax.axhline(0.5, marker='None', ls='-.', c='k', label='Guessing')
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.set_ylim(0, 1)
ax.set_ylabel('Fraction classified as light')
ax.set_xlabel('$\mathrm{\log_{10}(E_{reco}/GeV)}$')
ax.set_title(r'\underline{Class definitions}'+'\nLight: P | \t Heavy: He')
ax.grid()
# leg = plt.legend(loc='upper center', frameon=False,
# bbox_to_anchor=(0.5, # horizontal
# 1.25),# vertical
# ncol=3, fancybox=False)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)
outfile = os.path.join(comp.paths.figures_dir, 'fraction-classified-light_P-light-He-heavy-training.png')
plt.savefig(outfile)
plt.show()
In [10]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
cv_correct = {MC_comp: [] for MC_comp in df_sim_train.MC_comp.unique()}
# Restrict to only training on PPlus and He4Nucleus simulation
proton_helium_mask = df_sim_train['MC_comp'].isin(['PPlus', 'He4Nucleus']).values
for train_index, test_index in skf.split(df_sim_train.loc[proton_helium_mask, feature_list],
df_sim_train.loc[proton_helium_mask, 'target']):
# Get training/testing data for this fold
df_sim_train_fold = df_sim_train[proton_helium_mask].iloc[train_index].copy()
df_sim_train_fold['target'] = df_sim_train_fold['MC_comp'].apply(comp_to_label)
df_sim_test_fold = df_sim_train[proton_helium_mask].iloc[test_index].copy()
df_sim_test_fold['target'] = df_sim_test_fold['MC_comp'].apply(comp_to_label)
# Fit pipeline with training data for this fold
pipeline.fit(df_sim_train_fold[feature_list], df_sim_train_fold['target'])
# Get fraction of events classified as light for each MC composition
for MC_comp in df_sim_train_fold.MC_comp.unique():
comp_mask = df_sim_test_fold['MC_comp'] == MC_comp
pred_comp = pipeline.predict(df_sim_test_fold[feature_list][comp_mask])
correctly_identified = pred_comp == df_sim_test_fold['target'][comp_mask]
log_enegy = df_sim_test_fold['lap_log_energy'][comp_mask].values
counts_correct = np.histogram(log_enegy[correctly_identified], bins=energybins.log_energy_bins)[0]
counts_total = np.histogram(log_enegy, bins=energybins.log_energy_bins)[0]
frac_correct = counts_correct/counts_total
cv_correct[MC_comp].append(frac_correct)
In [11]:
df_frac_correct = pd.DataFrame.from_dict({key: value for key, value in cv_correct.items() if len(value) != 0})
df_frac_correct
Out[11]:
In [12]:
fig, ax = plt.subplots()
for MC_comp in df_frac_correct.columns:
frac_light_mean = df_frac_correct[MC_comp].mean(axis=0)
frac_light_std = df_frac_correct[MC_comp].values.std(axis=0)
plotting.plot_steps(energybins.log_energy_bins, frac_light_mean, yerr=frac_light_std,
color=color_dict[MC_comp], label=MC_comp, ax=ax)
ax.axhline(0.5, marker='None', ls='-.', c='k', label='Guessing')
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.set_ylim(0, 1)
ax.set_ylabel('Fraction correctly classified')
ax.set_xlabel('$\mathrm{\log_{10}(E_{reco}/GeV)}$')
ax.set_title(r'\underline{Class definitions}'+'\nClass 1: P | \t Class 2: He')
ax.grid()
# leg = plt.legend(loc='upper center', frameon=False,
# bbox_to_anchor=(0.5, # horizontal
# 1.25),# vertical
# ncol=3, fancybox=False)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)
outfile = os.path.join(comp.paths.figures_dir, 'fraction-correct_P-light-He-heavy-training.png')
plt.savefig(outfile)
plt.show()
In [93]:
comp_to_label_dict = {'PPlus': 0, 'He4Nucleus': 1, 'O16Nucleus': 1, 'Fe56Nucleus': 1}
def comp_to_label(composition):
try:
return comp_to_label_dict[composition]
except KeyError:
raise KeyError('Incorrect composition ({}) entered'.format(composition))
In [94]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
cv_accuracy = {MC_comp: [] for MC_comp in df_sim_train.MC_comp.unique()}
for train_index, test_index in skf.split(df_sim_train[feature_list], df_sim_train['target']):
# Get training/testing data for this fold
df_sim_train_fold = df_sim_train.iloc[train_index].copy()
df_sim_train_fold['target'] = df_sim_train_fold['MC_comp'].apply(comp_to_label)
df_sim_test_fold = df_sim_train.iloc[test_index].copy()
df_sim_test_fold['target'] = df_sim_test_fold['MC_comp'].apply(comp_to_label)
# Fit pipeline with training data for this fold
pipeline.fit(df_sim_train_fold[feature_list], df_sim_train_fold['target'])
# Get fraction of events classified as light for each MC composition
for MC_comp in df_sim_train_fold.MC_comp.unique():
comp_mask = df_sim_test_fold['MC_comp'] == MC_comp
pred_comp = pipeline.predict(df_sim_test_fold[feature_list][comp_mask])
log_enegy = df_sim_test_fold['lap_log_energy'][comp_mask].values
light_mask = pred_comp == 0
counts_light = np.histogram(log_enegy[light_mask], bins=energybins.log_energy_bins)[0]
counts_total = np.histogram(log_enegy, bins=energybins.log_energy_bins)[0]
frac_light = counts_light/counts_total
cv_accuracy[MC_comp].append(frac_light)
In [95]:
df_frac_light = pd.DataFrame.from_dict(cv_accuracy)
df_frac_light
Out[95]:
In [96]:
fig, ax = plt.subplots()
for MC_comp in df_frac_light.columns:
frac_light_mean = df_frac_light[MC_comp].mean(axis=0)
frac_light_std = df_frac_light[MC_comp].values.std(axis=0)
plotting.plot_steps(energybins.log_energy_bins, frac_light_mean, yerr=frac_light_std,
color=color_dict[MC_comp], label=MC_comp, ax=ax)
ax.axhline(0.5, marker='None', ls='-.', c='k', label='Guessing')
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.set_ylim(0, 1)
ax.set_ylabel('Fraction classified as light')
ax.set_xlabel('$\mathrm{\log_{10}(E_{reco}/GeV)}$')
ax.set_title(r'\underline{Class definitions}'+'\nLight: P | \t Heavy: He, O, Fe')
ax.grid()
# leg = plt.legend(loc='upper center', frameon=False,
# bbox_to_anchor=(0.5, # horizontal
# 1.25),# vertical
# ncol=3, fancybox=False)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)
outfile = os.path.join(comp.paths.figures_dir, 'fraction-classified-light_P-light-else-heavy-training.png')
plt.savefig(outfile)
plt.show()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [47]:
from itertools import combinations
In [48]:
MC_comp_list = ['PPlus', 'He4Nucleus', 'O16Nucleus', 'Fe56Nucleus']
get_MC_comp_name = {'PPlus':'P', 'He4Nucleus':'He', 'O16Nucleus':'O', 'Fe56Nucleus':'Fe'}
In [49]:
a = list(combinations(MC_comp_list, 2))
In [50]:
comp1, comp2 = a[0]
In [51]:
comp1
Out[51]:
In [90]:
for comp1, comp2 in list(combinations(MC_comp_list, 2)):
comp1_name = get_MC_comp_name[comp1]
comp2_name = get_MC_comp_name[comp2]
print(comp1_name, comp2_name)
comp_to_label_dict = {comp1: 0, comp2: 1}
def comp_to_label(composition):
try:
return comp_to_label_dict[composition]
except KeyError:
raise KeyError('Incorrect composition ({}) entered'.format(composition))
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
cv_correct = {MC_comp: [] for MC_comp in df_sim_train.MC_comp.unique()}
# Restrict to only training on PPlus and He4Nucleus simulation
comps_mask = df_sim_train['MC_comp'].isin([comp1, comp2]).values
for train_index, test_index in skf.split(df_sim_train.loc[comps_mask, feature_list],
df_sim_train.loc[comps_mask, 'target']):
# Get training/testing data for this fold
df_sim_train_fold = df_sim_train[comps_mask].iloc[train_index].copy()
df_sim_train_fold['target'] = df_sim_train_fold['MC_comp'].apply(comp_to_label)
df_sim_test_fold = df_sim_train[comps_mask].iloc[test_index].copy()
df_sim_test_fold['target'] = df_sim_test_fold['MC_comp'].apply(comp_to_label)
# Fit pipeline with training data for this fold
pipeline = comp.get_pipeline(pipeline_str)
pipeline.fit(df_sim_train_fold[feature_list], df_sim_train_fold['target'])
# Get fraction of events classified as light for each MC composition
for MC_comp in df_sim_train_fold.MC_comp.unique():
comp_mask = df_sim_test_fold['MC_comp'] == MC_comp
pred_comp = pipeline.predict(df_sim_test_fold[feature_list][comp_mask])
correctly_identified = pred_comp == df_sim_test_fold['target'][comp_mask]
log_enegy = df_sim_test_fold['lap_log_energy'][comp_mask].values
counts_correct = np.histogram(log_enegy[correctly_identified], bins=energybins.log_energy_bins)[0]
counts_total = np.histogram(log_enegy, bins=energybins.log_energy_bins)[0]
frac_correct = counts_correct/counts_total
cv_correct[MC_comp].append(frac_correct)
df_frac_correct = pd.DataFrame.from_dict({key: value for key, value in cv_correct.items() if len(value) != 0})
fig, ax = plt.subplots()
for MC_comp in df_frac_correct.columns:
frac_mean = df_frac_correct[MC_comp].mean(axis=0)
frac_std = df_frac_correct[MC_comp].values.std(axis=0)
plotting.plot_steps(energybins.log_energy_bins, frac_mean, yerr=frac_std,
color=color_dict[MC_comp], label=MC_comp, ax=ax)
for composition, name, y_coord in zip([comp1, comp2], [comp1_name, comp2_name], [0.25, 0.05]):
acc_mean = np.concatenate(df_frac_correct[composition]).mean()
acc_std = np.concatenate(df_frac_correct[composition]).std()
ax.text(6.6, y_coord, '{} accuracy:\n {:0.2f} +/- {:0.2f}'.format(name, acc_mean, acc_std),
fontsize=16)
ax.axhline(0.5, marker='None', ls='-.', c='k', label='Guessing')
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.set_ylim(0, 1)
ax.set_ylabel('Fraction correctly classified')
ax.set_xlabel('$\mathrm{\log_{10}(E_{reco}/GeV)}$')
# ax.set_title(r'\underline{Class definitions}'+'\nClass 1: {} | \t Class 2: {}'.format(comp1_name, comp2_name))
ax.grid()
# leg = plt.legend(loc='upper center', frameon=False,
# bbox_to_anchor=(0.5, # horizontal
# 1.25),# vertical
# ncol=3, fancybox=False)
# ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)
ax.legend(loc='lower right')
outfile = os.path.join(comp.paths.figures_dir,
'fraction-correct_{}-light-{}-heavy-training.png'.format(comp1_name, comp2_name))
plt.savefig(outfile)
plt.show()
In [84]:
comp_to_label_dict = {'PPlus': 0, 'He4Nucleus': 1, 'O16Nucleus': 2, 'Fe56Nucleus': 3}
def comp_to_label(composition):
try:
return comp_to_label_dict[composition]
except KeyError:
raise KeyError('Incorrect composition ({}) entered'.format(composition))
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
cv_correct = {MC_comp: [] for MC_comp in df_sim_train.MC_comp.unique()}
# Restrict to only training on PPlus and He4Nucleus simulation
for train_index, test_index in skf.split(df_sim_train[feature_list], df_sim_train['MC_comp']):
# Get training/testing data for this fold
df_sim_train_fold = df_sim_train.iloc[train_index].copy()
df_sim_train_fold['target'] = df_sim_train_fold['MC_comp'].apply(comp_to_label)
df_sim_test_fold = df_sim_train.iloc[test_index].copy()
df_sim_test_fold['target'] = df_sim_test_fold['MC_comp'].apply(comp_to_label)
# Fit pipeline with training data for this fold
pipeline = comp.get_pipeline(pipeline_str)
pipeline.named_steps['classifier'].set_params(loss='deviance')
pipeline.fit(df_sim_train_fold[feature_list], df_sim_train_fold['target'])
# Get fraction of events classified as light for each MC composition
for MC_comp in df_sim_train_fold.MC_comp.unique():
comp_mask = df_sim_test_fold['MC_comp'] == MC_comp
pred_comp = pipeline.predict(df_sim_test_fold[feature_list][comp_mask])
correctly_identified = pred_comp == df_sim_test_fold['target'][comp_mask]
log_enegy = df_sim_test_fold['lap_log_energy'][comp_mask].values
counts_correct = np.histogram(log_enegy[correctly_identified], bins=energybins.log_energy_bins)[0]
counts_total = np.histogram(log_enegy, bins=energybins.log_energy_bins)[0]
frac_correct = counts_correct/counts_total
cv_correct[MC_comp].append(frac_correct)
df_frac_correct = pd.DataFrame.from_dict({key: value for key, value in cv_correct.items() if len(value) != 0})
fig, ax = plt.subplots()
for MC_comp in df_frac_correct.columns:
frac_mean = df_frac_correct[MC_comp].mean(axis=0)
frac_std = df_frac_correct[MC_comp].values.std(axis=0)
plotting.plot_steps(energybins.log_energy_bins, frac_mean, yerr=frac_std,
color=color_dict[MC_comp], label=MC_comp, ax=ax)
np.concatenate(df_frac_correct.Fe56Nucleus).mean()
np.concatenate(df_frac_correct.Fe56Nucleus).std()
ax.axhline(0.25, marker='None', ls='-.', c='k', label='Guessing')
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.set_ylim(0, 1)
ax.set_ylabel('Fraction correctly classified')
ax.set_xlabel('$\mathrm{\log_{10}(E_{reco}/GeV)}$')
ax.set_title('4-composition classification')
ax.grid()
# leg = plt.legend(loc='upper center', frameon=False,
# bbox_to_anchor=(0.5, # horizontal
# 1.25),# vertical
# ncol=3, fancybox=False)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)
outfile = os.path.join(comp.paths.figures_dir, 'fraction-correct_4-component-training.png')
plt.savefig(outfile)
plt.show()
In [10]:
comp_to_label_dict = {'PPlus': 0, 'O16Nucleus': 1, 'Fe56Nucleus': 2}
def comp_to_label(composition):
try:
return comp_to_label_dict[composition]
except KeyError:
raise KeyError('Incorrect composition ({}) entered'.format(composition))
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
cv_correct = {MC_comp: [] for MC_comp in df_sim_train.MC_comp.unique()}
# Restrict to only training on PPlus and He4Nucleus simulation
comps_mask = df_sim_train['MC_comp'].isin(['PPlus', 'O16Nucleus', 'Fe56Nucleus']).values
for train_index, test_index in skf.split(df_sim_train.loc[comps_mask, feature_list],
df_sim_train.loc[comps_mask, 'MC_comp']):
# Get training/testing data for this fold
df_sim_train_fold = df_sim_train[comps_mask].iloc[train_index].copy()
df_sim_train_fold['target'] = df_sim_train_fold['MC_comp'].apply(comp_to_label)
df_sim_test_fold = df_sim_train[comps_mask].iloc[test_index].copy()
df_sim_test_fold['target'] = df_sim_test_fold['MC_comp'].apply(comp_to_label)
# Fit pipeline with training data for this fold
pipeline = comp.get_pipeline(pipeline_str)
pipeline.named_steps['classifier'].set_params(loss='deviance')
pipeline.fit(df_sim_train_fold[feature_list], df_sim_train_fold['target'])
# Get fraction of events classified as light for each MC composition
for MC_comp in df_sim_train_fold.MC_comp.unique():
comp_mask = df_sim_test_fold['MC_comp'] == MC_comp
pred_comp = pipeline.predict(df_sim_test_fold[feature_list][comp_mask])
correctly_identified = pred_comp == df_sim_test_fold['target'][comp_mask]
log_enegy = df_sim_test_fold['lap_log_energy'][comp_mask].values
counts_correct = np.histogram(log_enegy[correctly_identified], bins=energybins.log_energy_bins)[0]
counts_total = np.histogram(log_enegy, bins=energybins.log_energy_bins)[0]
frac_correct = counts_correct/counts_total
cv_correct[MC_comp].append(frac_correct)
df_frac_correct = pd.DataFrame.from_dict({key: value for key, value in cv_correct.items() if len(value) != 0})
fig, ax = plt.subplots()
for MC_comp in df_frac_correct.columns:
frac_mean = df_frac_correct[MC_comp].mean(axis=0)
frac_std = df_frac_correct[MC_comp].values.std(axis=0)
plotting.plot_steps(energybins.log_energy_bins, frac_mean, yerr=frac_std,
color=color_dict[MC_comp], label=MC_comp, ax=ax)
np.concatenate(df_frac_correct.Fe56Nucleus).mean()
np.concatenate(df_frac_correct.Fe56Nucleus).std()
ax.axhline(1/3, marker='None', ls='-.', c='k', label='Guessing')
ax.set_xlim(energybins.log_energy_min, energybins.log_energy_max)
ax.set_ylim(0, 1)
ax.set_ylabel('Fraction correctly classified')
ax.set_xlabel('$\mathrm{\log_{10}(E_{reco}/GeV)}$')
ax.set_title('4-composition classification')
ax.grid()
# leg = plt.legend(loc='upper center', frameon=False,
# bbox_to_anchor=(0.5, # horizontal
# 1.25),# vertical
# ncol=3, fancybox=False)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)
outfile = os.path.join(comp.paths.figures_dir, 'fraction-correct_3-component-training.png')
plt.savefig(outfile)
plt.show()
In [ ]: