In [1]:
%load_ext watermark
%watermark -a 'Author: James Bourbeau' -u -d -v -p numpy,matplotlib,scipy,pandas,sklearn,mlxtend
In [2]:
from __future__ import division, print_function
from collections import defaultdict
import itertools
import numpy as np
from scipy import interp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn.apionly as sns
import pyprind
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, KFold, StratifiedKFold
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import comptools as comp
import comptools.analysis.plotting as plotting
# color_dict allows for a consistent color-coding for each composition
color_dict = comp.analysis.get_color_dict()
%matplotlib inline
[ back to top ]
Whether or not to train on 'light' and 'heavy' composition classes, or the individual compositions
In [7]:
config = 'IC86.2012'
# config = 'IC79'
comp_class = True
comp_key = 'MC_comp_class' if comp_class else 'MC_comp'
comp_list = ['light', 'heavy'] if comp_class else ['P', 'He', 'O', 'Fe']
Get composition classifier pipeline
In [8]:
pipeline_str = 'GBDT'
pipeline = comp.get_pipeline(pipeline_str)
Define energy binning for this analysis
In [9]:
energybins = comp.analysis.get_energybins()
[ back to top ]
In [10]:
sim_train, sim_test = comp.load_dataframe(datatype='sim', config=config, comp_key=comp_key)
In [18]:
len(sim_train) + len(sim_test)
Out[18]:
In [12]:
# feature_list = ['lap_cos_zenith', 'log_s125', 'log_dEdX', 'invqweighted_inice_radius_1_60']
# feature_list, feature_labels = comp.analysis.get_training_features(feature_list)
feature_list, feature_labels = comp.analysis.get_training_features()
[ back to top ]
Calculate classifier performance via 10-fold CV
In [14]:
frac_correct_folds = comp.analysis.get_CV_frac_correct(sim_train, feature_list, pipeline_str, comp_list)
frac_correct_gen_err = {key: np.std(frac_correct_folds[key], axis=0) for key in frac_correct_folds}
In [16]:
fig, ax = plt.subplots()
for composition in comp_list:
# for composition in comp_list + ['total']:
print(composition)
performance_mean = np.mean(frac_correct_folds[composition], axis=0)
performance_std = np.std(frac_correct_folds[composition], axis=0)
# err = np.sqrt(frac_correct_gen_err[composition]**2 + reco_frac_stat_err[composition]**2)
plotting.plot_steps(energybins.log_energy_bins, performance_mean, yerr=performance_std,
ax=ax, color=color_dict[composition], label=composition)
plt.xlabel('$\mathrm{\log_{10}(E_{reco}/GeV)}$')
ax.set_ylabel('Classification accuracy')
# ax.set_ylabel('Classification accuracy \n (statistical + 10-fold CV error)')
ax.set_ylim([0.0, 1.0])
ax.set_xlim([energybins.log_energy_min, energybins.log_energy_max])
ax.grid()
leg = plt.legend(loc='upper center', frameon=False,
bbox_to_anchor=(0.5, # horizontal
1.15),# vertical
ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
legobj.set_linewidth(3.0)
cv_str = 'Avg. accuracy:\n{:0.2f}\% (+/- {:0.1f}\%)'.format(np.mean(frac_correct_folds['total'])*100,
np.std(frac_correct_folds['total'])*100)
ax.text(7.4, 0.2, cv_str,
ha="center", va="center", size=10,
bbox=dict(boxstyle='round', fc="white", ec="gray", lw=0.8))
# plt.savefig('/home/jbourbeau/public_html/figures/frac-correct-{}.png'.format(pipeline_str))
plt.show()
Determine the mean and standard deviation of the fraction correctly classified for each energy bin
In [10]:
avg_frac_correct_data = {'values': np.mean(frac_correct_folds['total'], axis=0), 'errors': np.std(frac_correct_folds['total'], axis=0)}
avg_frac_correct, avg_frac_correct_err = comp.analysis.averaging_error(**avg_frac_correct_data)
In [11]:
reco_frac, reco_frac_stat_err = comp.analysis.get_frac_correct(sim_train, sim_test, pipeline, comp_list)
In [13]:
# Plot fraction of events correctlt classified vs energy
fig, ax = plt.subplots()
for composition in comp_list + ['total']:
err = np.sqrt(frac_correct_gen_err[composition]**2 + reco_frac_stat_err[composition]**2)
plotting.plot_steps(energybins.log_energy_bins, reco_frac[composition], err, ax,
color_dict[composition], composition)
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('Fraction correctly identified')
ax.set_ylim([0.0, 1.0])
ax.set_xlim([energybins.log_energy_min, energybins.log_energy_max])
ax.grid()
leg = plt.legend(loc='upper center', frameon=False,
bbox_to_anchor=(0.5, # horizontal
1.1),# vertical
ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
legobj.set_linewidth(3.0)
cv_str = 'Accuracy: {:0.2f}\% (+/- {:0.1f}\%)'.format(avg_frac_correct*100, avg_frac_correct_err*100)
ax.text(7.4, 0.2, cv_str,
ha="center", va="center", size=10,
bbox=dict(boxstyle='round', fc="white", ec="gray", lw=0.8))
plt.savefig('/home/jbourbeau/public_html/figures/frac-correct-{}.png'.format(pipeline_str))
plt.show()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: