In [1]:
import sys
sys.path.append('/home/jbourbeau/cr-composition')
sys.path
Out[1]:
In [2]:
import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn.apionly as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import validation_curve, GridSearchCV, cross_val_score, ParameterGrid
import composition as comp
%matplotlib inline
In [3]:
sns.set_palette('muted')
sns.set_color_codes()
In [4]:
df, cut_dict = comp.load_sim(return_cut_dict=True)
selection_mask = np.array([True] * len(df))
standard_cut_keys = ['lap_reco_success', 'lap_zenith', 'num_hits_1_30', 'IT_signal',
'StationDensity', 'max_qfrac_1_30', 'lap_containment', 'energy_range_lap']
# standard_cut_keys = ['LLHLF_reco_exists', 'LLHLF_zenith', 'num_hits_1_30', 'IT_signal',
# 'StationDensity', 'max_qfrac_1_30', 'LLHLF_containment', 'energy_range']
# standard_cut_keys = ['LLHlap_reco_exists', 'LLHlap_zenith', 'num_hits_1_30', 'IT_signal',
# 'StationDensity', 'max_qfrac_1_30', 'LLHlap_containment', 'energy_range']
# standard_cut_keys = ['reco_exists', 'reco_zenith', 'num_hits_1_60', 'IT_signal',
# 'StationDensity', 'max_qfrac_1_60', 'reco_containment', 'energy_range']
for key in standard_cut_keys:
selection_mask *= cut_dict[key]
df = df[selection_mask]
feature_list, feature_labels = comp.get_training_features()
print('training features = {}'.format(feature_list))
X_train, X_test, y_train, y_test, le = comp.get_train_test_sets(
df, feature_list)
print('number training events = ' + str(y_train.shape[0]))
Get error in charge vs. energy distribution
In [ ]:
In [5]:
pipeline = comp.get_pipeline('KN')
param_range = np.arange(3, 200, 25)
param_grid = {'classifier__n_neighbors': param_range}
gs = GridSearchCV(estimator=pipeline,
param_grid=param_grid,
scoring='accuracy',
cv=5,
verbose=1,
n_jobs=10)
gs = gs.fit(X_train, y_train)
print('best GS CV score = {}'.format(gs.best_score_))
print('best GS CV depths = {}'.format(gs.best_params_))
print('Grid scores on development set:')
means = gs.cv_results_['mean_test_score']
stds = gs.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, gs.cv_results_['params']):
print("%0.3f (+/-%0.03f) for %r"
% (mean, std * 2, params))
pipeline.set_params(**gs.best_params_)
pipeline.fit(X_train, y_train)
scaler = pipeline.named_steps['scaler']
clf = pipeline.named_steps['classifier']
In [6]:
clf_name = clf.__class__.__name__
print('=' * 30)
print(clf_name)
test_predictions = pipeline.predict(X_test)
test_acc = accuracy_score(y_test, test_predictions)
print('Test accuracy: {:.4%}'.format(test_acc))
train_predictions = pipeline.predict(X_train)
train_acc = accuracy_score(y_train, train_predictions)
print('Train accuracy: {:.4%}'.format(train_acc))
scores = cross_val_score(
estimator=pipeline, X=X_test, y=y_test, cv=10, n_jobs=10)
print('CV score: {:.2%} (+/- {:.2%})'.format(scores.mean(), scores.std()))
print('=' * 30)
In [ ]:
In [7]:
comp_list = ['P', 'He', 'Fe']
# comp_list = ['P', 'Fe']
# comp_list = le.inverse_transform(np.unique(y_test))
correctly_identified_mask = (test_predictions == y_test)
# Energy-related variables
energy_bin_width = 0.1
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
# energy_bins = np.arange(6.2, 9.51, energy_bin_width)
energy_midpoints = (energy_bins[1:] + energy_bins[:-1]) / 2
log_energy = X_test[:, 0]
# Construct MC composition masks
MC_comp_mask = {}
for composition in comp_list:
MC_comp_mask[composition] = (le.inverse_transform(y_test) == composition)
# Get number of MC comp in each reco energy bin
num_MC_energy, num_MC_energy_err = {}, {}
for composition in comp_list:
num_MC_energy[composition] = np.histogram(log_energy[MC_comp_mask[composition]],
bins=energy_bins)[0]
num_MC_energy_err[composition] = np.sqrt(num_MC_energy[composition])
num_MC_energy['total'] = np.histogram(log_energy, bins=energy_bins)[0]
num_MC_energy_err['total'] = np.sqrt(num_MC_energy['total'])
# Get number of correctly identified comp in each reco energy bin
num_reco_energy, num_reco_energy_err = {}, {}
for composition in comp_list:
num_reco_energy[composition] = np.histogram(
log_energy[MC_comp_mask[composition] & correctly_identified_mask],
bins=energy_bins)[0]
num_reco_energy_err[composition] = np.sqrt(num_reco_energy[composition])
num_reco_energy['total'] = np.histogram(log_energy[correctly_identified_mask],
bins=energy_bins)[0]
num_reco_energy_err['total'] = np.sqrt(num_reco_energy['total'])
# Calculate correctly identified fractions as a function of MC energy
reco_frac, reco_frac_err = {}, {}
for composition in comp_list:
print(composition)
reco_frac[composition], reco_frac_err[composition] = comp.ratio_error(
num_reco_energy[composition], num_reco_energy_err[composition],
num_MC_energy[composition], num_MC_energy_err[composition])
reco_frac['total'], reco_frac_err['total'] = comp.ratio_error(
num_reco_energy['total'], num_reco_energy_err['total'],
num_MC_energy['total'], num_MC_energy_err['total'])
In [8]:
# Plot fraction of events vs energy
fig, ax = plt.subplots()
for composition in comp_list:
ax.errorbar(energy_midpoints, reco_frac[composition],
yerr=reco_frac_err[composition],
# xerr=energy_bin_width / 2,
marker='.', markersize=10,
label=composition)
ax.errorbar(energy_midpoints, reco_frac['total'],
yerr=reco_frac_err['total'],
# xerr=energy_bin_width / 2,
marker='.', markersize=10,
label='Total')
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('Fraction correctly identified')
ax.set_ylim([0.0, 1.0])
ax.set_xlim([6.2, 8.0])
# ax.set_xlim([6.2, 9.5])
plt.grid()
plt.legend(loc=3)
# place a text box in upper left in axes coords
textstr = 'Training features: \n'
for i, label in enumerate(feature_labels):
if (i == len(feature_labels)-1):
textstr += '{}) '.format(i+1) + label
else:
textstr += '{}) '.format(i+1) + label + '\n'
print('textstr = \n' + textstr)
props = dict(facecolor='white')
ax.text(0.7, 0.3, textstr, transform=ax.transAxes, fontsize=8,
verticalalignment='top', bbox=props)
outfile = '/home/jbourbeau/public_html/figures/composition' + \
'/fraction-reco-correct_vs_reco-energy_RF.png'
plt.savefig(outfile)
plt.show()
In [11]:
num_features = len(feature_list)
importances = pipeline.named_steps['classifier'].feature_importances_
indices = np.argsort(importances)[::-1]
fig, ax = plt.subplots()
# feature_labels = np.array(['$\\log_{10}({\mathrm{E/GeV})$', 'InIce charge',
# '$\cos(\\theta)$', '$\mathrm{Laputop}\ \chi^2/\mathrm{n.d.f.}$', 'NChannels'])
for f in range(num_features):
print('{}) {}'.format(f + 1, importances[indices[f]]))
plt.ylabel('Feature Importances')
plt.bar(range(num_features),
importances[indices],
align='center')
plt.xticks(range(num_features),
feature_labels[indices], rotation=90)
plt.xlim([-1, len(feature_list)])
plt.ylim([0, .40])
plt.show()
In [10]:
probs = pipeline.named_steps['classifier'].predict_proba(X_test)
prob_1 = probs[:, 0][MC_iron_mask]
prob_2 = probs[:, 1][MC_iron_mask]
# print(min(prob_1-prob_2))
# print(max(prob_1-prob_2))
# plt.hist(prob_1-prob_2, bins=30, log=True)
plt.hist(prob_1, bins=np.linspace(0, 1, 50), log=True)
plt.hist(prob_2, bins=np.linspace(0, 1, 50), log=True)
Out[10]:
In [11]:
probs = pipeline.named_steps['classifier'].predict_proba(X_test)
dp1 = (probs[:, 0]-probs[:, 1])[MC_proton_mask]
print(min(dp1))
print(max(dp1))
dp2 = (probs[:, 0]-probs[:, 1])[MC_iron_mask]
print(min(dp2))
print(max(dp2))
fig, ax = plt.subplots()
# plt.hist(prob_1-prob_2, bins=30, log=True)
counts, edges, pathes = plt.hist(dp1, bins=np.linspace(-1, 1, 100), log=True, label='Proton', alpha=0.75)
counts, edges, pathes = plt.hist(dp2, bins=np.linspace(-1, 1, 100), log=True, label='Iron', alpha=0.75)
plt.legend(loc=2)
plt.show()
pipeline.named_steps['classifier'].classes_
Out[11]:
In [12]:
print(pipeline.named_steps['classifier'].classes_)
le.inverse_transform(pipeline.named_steps['classifier'].classes_)
Out[12]:
In [13]:
pipeline.named_steps['classifier'].decision_path(X_test)
Out[13]:
In [ ]: