In [1]:
import sys
sys.path.append('/home/jbourbeau/cr-composition')
print('Added to PYTHONPATH')
In [2]:
from __future__ import division, print_function
import argparse
from collections import defaultdict
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn.apionly as sns
from scipy.stats import binned_statistic
from sklearn.metrics import accuracy_score, confusion_matrix, brier_score_loss, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, KFold
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.linear_model import LogisticRegression
import composition as comp
import composition.analysis.plotting as plotting
# Plotting-related
sns.set_palette('muted')
sns.set_color_codes()
color_dict = defaultdict()
for i, composition in enumerate(['light', 'heavy', 'total']):
color_dict[composition] = sns.color_palette('muted').as_hex()[i]
%matplotlib inline
In [3]:
df_sim, cut_dict_sim = comp.load_dataframe(type_='sim', config='IT73', return_cut_dict=True)
selection_mask = np.array([True] * len(df_sim))
standard_cut_keys = ['lap_reco_success', 'lap_zenith', 'num_hits_1_30', 'IT_signal',
'_qfrac_1_30', 'lap_containment', 'energy_range_lap']
for key in standard_cut_keys:
selection_mask *= cut_dict_sim[key]
df_sim = df_sim[selection_mask]
feature_list, feature_labels = comp.get_training_features()
print('training features = {}'.format(feature_list))
X_train_sim, X_test_sim, y_train_sim, y_test_sim, le = comp.get_train_test_sets(
df_sim, feature_list, comp_class=True, train_he=True, test_he=True)
print('number training events = ' + str(y_train_sim.shape[0]))
print('number testing events = ' + str(y_test_sim.shape[0]))
In [84]:
comp_list = ['light', 'heavy']
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train_sim, y_train_sim)
test_predictions = pipeline.predict(X_test_sim)
test_probs = pipeline.predict_proba(X_test_sim)
correctly_identified_mask = (test_predictions == y_test_sim)
prob_bins = np.linspace(0, 1, 100)
fig, ax = plt.subplots()
for composition in comp_list:
c = le.transform([composition])
mask = correctly_identified_mask
ax.hist(test_probs[mask, c[0]], prob_bins, histtype='step', label=composition, color=color_dict[composition])
ax.axvline(0.5, linestyle=':', marker='None')
plt.legend()
plt.show()
In [85]:
comp_list = ['light', 'heavy']
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train_sim, y_train_sim)
test_predictions = pipeline.predict(X_test_sim)
test_probs = pipeline.predict_proba(X_test_sim)
correctly_identified_mask = (test_predictions == y_test_sim)
prob_bins = np.linspace(0, 1, 100)
fig, ax = plt.subplots()
for composition in comp_list:
c = le.transform([composition])
mask = np.logical_not(correctly_identified_mask)
ax.hist(test_probs[mask, c[0]], prob_bins, histtype='step', label=composition, color=color_dict[composition])
ax.axvline(0.5, linestyle=':', marker='None')
plt.legend()
plt.show()
In [62]:
comp_list = ['light', 'heavy']
def frac_correct(array):
num_correct = np.sum(array)
num_in_bin = len(array)
frac = num_correct/num_in_bin
return frac
# Split training data into CV training and testing folds
kf = KFold(n_splits=10)
frac_correct_folds = defaultdict(list)
fold_num = 0
print('Fold ', end='')
for train_index, test_index in kf.split(X_train_sim):
fold_num += 1
print('{}...'.format(fold_num), end='')
X_train_fold, X_test_fold = X_train_sim[train_index], X_train_sim[test_index]
y_train_fold, y_test_fold = y_train_sim[train_index], y_train_sim[test_index]
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train_fold, y_train_fold)
test_predictions = pipeline.predict(X_test_fold)
test_probs = pipeline.predict_proba(X_test_fold)
correctly_identified_mask = (test_predictions == y_test_fold)
prob_bins = np.linspace(0, 1, 25)
for composition in comp_list:
MC_comp_mask = (le.inverse_transform(y_test_fold) == composition)
frac, bin_edges, binnumber = binned_statistic(np.a(test_probs, axis=1)[MC_comp_mask],
correctly_identified_mask[MC_comp_mask],
frac_correct,
prob_bins)
frac_correct_folds[composition].append(frac)
frac, bin_edges, binnumber = binned_statistic(np.amax(test_probs, axis=1),
correctly_identified_mask,
frac_correct,
prob_bins)
frac_correct_folds['total'].append(frac)
frac_correct_gen_err = {key: np.std(frac_correct_folds[key], axis=0) for key in frac_correct_folds}
In [63]:
comp_list = ['light', 'heavy']
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train_sim, y_train_sim)
test_predictions = pipeline.predict(X_test_sim)
test_probs = pipeline.predict_proba(X_test_sim)
correctly_identified_mask = (test_predictions == y_test_sim)
print(test_predictions)
print(test_probs)
prob_bins = np.linspace(0, 1, 25)
prob_midpoints = (prob_bins[1:] + prob_bins[:-1]) / 2
def frac_correct(array):
num_correct = np.sum(array)
num_in_bin = len(array)
frac = num_correct/num_in_bin
return frac
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
for composition in comp_list:
MC_comp_mask = (le.inverse_transform(y_test_sim) == composition)
frac, bin_edges, binnumber = binned_statistic(np.a(test_probs, axis=1)[MC_comp_mask],
correctly_identified_mask[MC_comp_mask],
frac_correct,
prob_bins)
ax.errorbar(prob_midpoints, frac, yerr=frac_correct_gen_err[composition],
marker='.', label=composition, alpha=0.75)
frac, bin_edges, binnumber = binned_statistic(np.a(test_probs, axis=1),
correctly_identified_mask,
frac_correct,
prob_bins)
ax.errorbar(prob_midpoints, frac, yerr=frac_correct_gen_err['total'],
marker='.', label='total', alpha=0.75)
ax.set_xlabel('Prediction probability')
ax.set_ylabel('Fraction correctly identified')
plt.legend(loc=2)
plt.show()
In [66]:
def plot_calibration_curve(est, name, fig_index):
"""Plot calibration curve for est w/o and with calibration. """
# Calibrated with isotonic calibration
isotonic = CalibratedClassifierCV(est, cv=2, method='isotonic')
# Calibrated with sigmoid calibration
sigmoid = CalibratedClassifierCV(est, cv=2, method='sigmoid')
# Logistic regression with no calibration as baseline
lr = LogisticRegression(C=1., solver='lbfgs')
fig = plt.figure(fig_index, figsize=(10, 10))
ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
ax2 = plt.subplot2grid((3, 1), (2, 0))
ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
for clf, name in [(lr, 'Logistic'),
(est, name),
(isotonic, name + ' + Isotonic'),
(sigmoid, name + ' + Sigmoid')]:
clf.fit(X_train_sim, y_train_sim)
y_pred = clf.predict(X_test_sim)
if hasattr(clf, "predict_proba"):
prob_pos = clf.predict_proba(X_test_sim)[:, 1]
else: # use decision function
prob_pos = clf.decision_function(X_test_sim)
prob_pos = \
(prob_pos - prob_pos.min()) / (prob_pos.() - prob_pos.min())
clf_score = brier_score_loss(y_test_sim, prob_pos, pos_label=y_test_sim.())
print("%s:" % name)
print("\tBrier: %1.3f" % (clf_score))
print("\tPrecision: %1.3f" % precision_score(y_test_sim, y_pred))
print("\tRecall: %1.3f" % recall_score(y_test_sim, y_pred))
print("\tF1: %1.3f\n" % f1_score(y_test_sim, y_pred))
fraction_of_positives, mean_predicted_value = \
calibration_curve(y_test_sim, prob_pos, n_bins=20)
ax1.plot(mean_predicted_value, fraction_of_positives, ".:",
label="%s (%1.3f)" % (name, clf_score))
ax2.hist(prob_pos, range=(0, 1), bins=20, label=name,
histtype="step", lw=2)
ax1.set_ylabel("Fraction of positives")
ax1.set_ylim([-0.05, 1.05])
ax1.legend(loc="lower right")
ax1.set_title('Calibration plots (reliability curve)')
ax2.set_xlabel("Mean predicted value")
ax2.set_ylabel("Count")
ax2.legend(loc="upper center", ncol=2)
plt.tight_layout()
# Plot calibration curve for Gaussian Naive Bayes
plot_calibration_curve(pipeline, "RF", 1)
plt.show()
In [6]:
def get_frac_correct(X_train, X_test, y_train, y_test, comp_list):
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train, y_train)
test_predictions = pipeline.predict(X_test)
correctly_identified_mask = (test_predictions == y_test)
# Energy-related variables
energy_bin_width = 0.1
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
energy_midpoints = (energy_bins[1:] + energy_bins[:-1]) / 2
log_energy = X_test[:, 0]
# Construct MC composition masks
MC_comp_mask = {}
for composition in comp_list:
MC_comp_mask[composition] = (le.inverse_transform(y_test) == composition)
# Get number of MC comp in each reco energy bin
num_MC_energy, num_MC_energy_err = {}, {}
for composition in comp_list:
num_MC_energy[composition] = np.histogram(log_energy[MC_comp_mask[composition]],
bins=energy_bins)[0]
num_MC_energy_err[composition] = np.sqrt(num_MC_energy[composition])
num_MC_energy['total'] = np.histogram(log_energy, bins=energy_bins)[0]
num_MC_energy_err['total'] = np.sqrt(num_MC_energy['total'])
# Get number of correctly identified comp in each reco energy bin
num_reco_energy, num_reco_energy_err = {}, {}
for composition in comp_list:
num_reco_energy[composition] = np.histogram(
log_energy[MC_comp_mask[composition] & correctly_identified_mask],
bins=energy_bins)[0]
num_reco_energy_err[composition] = np.sqrt(num_reco_energy[composition])
num_reco_energy['total'] = np.histogram(log_energy[correctly_identified_mask], bins=energy_bins)[0]
num_reco_energy_err['total'] = np.sqrt(num_reco_energy['total'])
# Calculate correctly identified fractions as a function of MC energy
reco_frac, reco_frac_err = {}, {}
for composition in comp_list:
reco_frac[composition], reco_frac_err[composition] = comp.ratio_error(
num_reco_energy[composition], num_reco_energy_err[composition],
num_MC_energy[composition], num_MC_energy_err[composition])
frac_correct_folds[composition].append(reco_frac[composition])
reco_frac['total'], reco_frac_err['total'] = comp.ratio_error(
num_reco_energy['total'], num_reco_energy_err['total'],
num_MC_energy['total'], num_MC_energy_err['total'])
return reco_frac, reco_frac_err
In [7]:
comp_list = ['light', 'heavy']
# Split training data into CV training and testing folds
kf = KFold(n_splits=10)
frac_correct_folds = defaultdict(list)
fold_num = 0
print('Fold ', end='')
for train_index, test_index in kf.split(X_train_sim):
fold_num += 1
print('{}...'.format(fold_num), end='')
X_train_fold, X_test_fold = X_train_sim[train_index], X_train_sim[test_index]
y_train_fold, y_test_fold = y_train_sim[train_index], y_train_sim[test_index]
reco_frac, reco_frac_err = get_frac_correct(X_train_fold, X_test_fold,
y_train_fold, y_test_fold,
comp_list)
for composition in comp_list:
frac_correct_folds[composition].append(reco_frac[composition])
frac_correct_folds['total'].append(reco_frac['total'])
frac_correct_gen_err = {key: np.std(frac_correct_folds[key], axis=0) for key in frac_correct_folds}
In [8]:
comp_list = ['light', 'heavy']
reco_frac, reco_frac_stat_err = get_frac_correct(X_train_sim, X_test_sim,
y_train_sim, y_test_sim,
comp_list)
# Energy-related variables
energy_bin_width = 0.1
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
energy_midpoints = (energy_bins[1:] + energy_bins[:-1]) / 2
step_x = energy_midpoints
step_x = np.append(step_x[0]-energy_bin_width/2, step_x)
step_x = np.append(step_x, step_x[-1]+energy_bin_width/2)
# Plot fraction of events correctlt classified vs energy
fig, ax = plt.subplots()
for composition in comp_list + ['total']:
err = np.sqrt(frac_correct_gen_err[composition]**2 + reco_frac_stat_err[composition]**2)
plotting.plot_steps(energy_midpoints, reco_frac[composition], err, ax, color_dict[composition], composition)
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('Fraction correctly identified')
ax.set_ylim([0.0, 1.0])
ax.set_xlim([6.2, 8.0])
ax.grid()
leg = plt.legend(loc='upper center',
bbox_to_anchor=(0.5, # horizontal
1.1),# vertical
ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
legobj.set_linewidth(3.0)
# place a text box in upper left in axes coords
textstr = '$\mathrm{\underline{Training \ features}}$: \n'
for i, label in enumerate(feature_labels):
if (i == len(feature_labels)-1):
textstr += '{}) '.format(i+1) + label
else:
textstr += '{}) '.format(i+1) + label + '\n'
props = dict(facecolor='white', linewidth=0)
ax.text(1.025, 0.855, textstr, transform=ax.transAxes, fontsize=8,
verticalalignment='top', bbox=props)
cvstr = '$\mathrm{\underline{CV \ score}}$:\n' + '{:0.2f}\% (+/- {:.2}\%)'.format(scores.mean()*100, scores.std()*100)
print(cvstr)
props = dict(facecolor='white', linewidth=0)
ax.text(1.025, 0.9825, cvstr, transform=ax.transAxes, fontsize=8,
verticalalignment='top', bbox=props)
plt.show()
In [9]:
def get_num_comp_reco(X_train, y_train, X_test, comp_list):
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train, y_train)
test_predictions = pipeline.predict(X_test)
# Energy-related variables
energy_bin_width = 0.1
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
energy_midpoints = (energy_bins[1:] + energy_bins[:-1]) / 2
log_energy = X_test[:, 0]
# Get number of correctly identified comp in each reco energy bin
num_reco_energy, num_reco_energy_err = {}, {}
for composition in comp_list:
num_reco_energy[composition] = np.histogram(
log_energy[le.inverse_transform(test_predictions) == composition],
bins=energy_bins)[0]
num_reco_energy_err[composition] = np.sqrt(num_reco_energy[composition])
num_reco_energy['total'] = np.histogram(log_energy, bins=energy_bins)[0]
num_reco_energy_err['total'] = np.sqrt(num_reco_energy['total'])
return num_reco_energy, num_reco_energy_err
In [10]:
eff_area, eff_area_error, energy_midpoints = comp.analysis.get_effective_area(df_sim)
In [13]:
comp_list = ['light', 'heavy']
# Get number of events per energy bin
num_reco_energy, num_reco_energy_err = get_num_comp_reco(X_train_sim, y_train_sim, X_test_data, comp_list)
# Energy-related variables
energy_bin_width = 0.1
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
energy_midpoints = (energy_bins[1:] + energy_bins[:-1]) / 2
energy_bin_widths = 10**energy_bins[1:] - 10**energy_bins[:-1]
def get_energy_res(df_sim, energy_bins):
reco_log_energy = df_sim['lap_log_energy'].values
MC_log_energy = df_sim['MC_log_energy'].values
energy_res = reco_log_energy - MC_log_energy
bin_centers, bin_medians, energy_err = comp.analysis.data_functions.get_medians(reco_log_energy,
energy_res,
energy_bins)
return np.abs(bin_medians)
# Solid angle
solid_angle = 2*np.pi*(1-np.cos(np.arccos(0.85)))
# solid_angle = 2*np.pi*(1-np.cos(40*(np.pi/180)))
print(solid_angle)
print(2*np.pi*(1-np.cos(40*(np.pi/180))))
# Live-time information
start_time = np.amin(df_data['start_time_mjd'].values)
end_time = np.a(df_data['end_time_mjd'].values)
day_to_sec = 24 * 60 * 60.
dt = day_to_sec * (end_time - start_time)
print(dt)
# Plot fraction of events vs energy
fig, ax = plt.subplots()
for composition in comp_list + ['total']:
# Calculate dN/dE
y = num_reco_energy[composition]/energy_bin_widths
y_err = num_reco_energy_err[composition]/energy_bin_widths
# Add effective area
y, y_err = comp.analysis.ratio_error(y, y_err, eff_area, eff_area_error)
# Add solid angle
y = y / solid_angle
y_err = y_err / solid_angle
# Add time duration
y = y / dt
y_err = y / dt
# Add energy scaling
energy_err = get_energy_res(df_sim, energy_bins)
energy_err = np.array(energy_err)
print(10**energy_err)
y = (10**energy_midpoints)**2.7 * y
y_err = (10**energy_midpoints)**2.7 * y_err
plotting.plot_steps(energy_midpoints, y, y_err, ax, color_dict[composition], composition)
ax.set_yscale("log", nonposy='clip')
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('$\mathrm{E}^{2.7} \\frac{\mathrm{dN}}{\mathrm{dE dA d\Omega dt}} \ [\mathrm{GeV}^{1.7} \mathrm{m}^{-2} \mathrm{sr}^{-1} \mathrm{s}^{-1}]$')
ax.set_xlim([6.2, 8.0])
ax.set_ylim([10**2, 10**5])
ax.grid()
leg = plt.legend(loc='upper center',
bbox_to_anchor=(0.5, # horizontal
1.1),# vertical
ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
legobj.set_linewidth(3.0)
plt.show()
In [12]:
comp_list = ['light', 'heavy']
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train_sim, y_train_sim)
test_predictions = pipeline.predict(X_test_sim)
# correctly_identified_mask = (test_predictions == y_test)
# confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)/len(y_pred)
true_comp = le.inverse_transform(y_test_sim)
pred_comp = le.inverse_transform(test_predictions)
confmat = confusion_matrix(true_comp, pred_comp, labels=comp_list)
def plot_confusion_matrix(cm, classes,
normalize=False,
title='Confusion matrix',
cmap=plt.cm.Greens):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
else:
print('Confusion matrix, without normalization')
print(cm)
plt.imshow(cm, interpolation='None', cmap=cmap,
vmin=0, v=1.0)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
thresh = cm.() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, '{:0.3f}'.format(cm[i, j]),
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True composition')
plt.xlabel('Predicted composition')
fig, ax = plt.subplots()
plot_confusion_matrix(confmat, classes=['light', 'heavy'], normalize=True,
title='Confusion matrix, without normalization')
# # Plot normalized confusion matrix
# plt.figure()
# plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
# title='Normalized confusion matrix')
plt.show()
In [44]:
pipeline.get_params()['classifier___depth']
Out[44]:
In [47]:
energy_bin_width = 0.1
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
fig, axarr = plt.subplots(1, 2)
for composition, ax in zip(comp_list, axarr.flatten()):
MC_comp_mask = (df_sim['MC_comp_class'] == composition)
MC_log_energy = df_sim['MC_log_energy'][MC_comp_mask].values
reco_log_energy = df_sim['lap_log_energy'][MC_comp_mask].values
plotting.histogram_2D(MC_log_energy, reco_log_energy, energy_bins, log_counts=True, ax=ax)
ax.plot([0,10], [0,10], marker='None', linestyle='-.')
ax.set_xlim([6.2, 8])
ax.set_ylim([6.2, 8])
ax.set_xlabel('$\log_{10}(E_{\mathrm{MC}}/\mathrm{GeV})$')
ax.set_ylabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_title('{} response matrix'.format(composition))
plt.tight_layout()
plt.show()
In [10]:
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
10**energy_bins[1:] - 10**energy_bins[:-1]
Out[10]:
In [ ]:
probs = pipeline.named_steps['classifier'].predict_proba(X_test)
prob_1 = probs[:, 0][MC_iron_mask]
prob_2 = probs[:, 1][MC_iron_mask]
# print(min(prob_1-prob_2))
# print((prob_1-prob_2))
# plt.hist(prob_1-prob_2, bins=30, log=True)
plt.hist(prob_1, bins=np.linspace(0, 1, 50), log=True)
plt.hist(prob_2, bins=np.linspace(0, 1, 50), log=True)
In [ ]:
probs = pipeline.named_steps['classifier'].predict_proba(X_test)
dp1 = (probs[:, 0]-probs[:, 1])[MC_proton_mask]
print(min(dp1))
print((dp1))
dp2 = (probs[:, 0]-probs[:, 1])[MC_iron_mask]
print(min(dp2))
print((dp2))
fig, ax = plt.subplots()
# plt.hist(prob_1-prob_2, bins=30, log=True)
counts, edges, pathes = plt.hist(dp1, bins=np.linspace(-1, 1, 100), log=True, label='Proton', alpha=0.75)
counts, edges, pathes = plt.hist(dp2, bins=np.linspace(-1, 1, 100), log=True, label='Iron', alpha=0.75)
plt.legend(loc=2)
plt.show()
pipeline.named_steps['classifier'].classes_
In [ ]:
print(pipeline.named_steps['classifier'].classes_)
le.inverse_transform(pipeline.named_steps['classifier'].classes_)
In [ ]:
pipeline.named_steps['classifier'].decision_path(X_test)
In [48]:
comp_list = ['light', 'heavy']
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train_sim, y_train_sim)
# test_probs = defaultdict(list)
fig, ax = plt.subplots()
test_predictions = pipeline.predict(X_test_data)
test_probs = pipeline.predict_proba(X_test_data)
for class_ in pipeline.classes_:
test_predictions == le.inverse_transform(class_)
plt.hist(test_probs[:, class_], bins=np.linspace(0, 1, 50),
histtype='step', label=composition,
color=color_dict[composition], alpha=0.8, log=True)
plt.ylabel('Counts')
plt.xlabel('Testing set class probabilities')
plt.legend()
plt.grid()
plt.show()
In [5]:
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train, y_train)
test_predictions = pipeline.predict(X_test)
comp_list = ['P', 'He', 'O', 'Fe']
fig, ax = plt.subplots()
test_probs = pipeline.predict_proba(X_test)
fig, axarr = plt.subplots(2, 2, sharex=True, sharey=True)
for composition, ax in zip(comp_list, axarr.flatten()):
comp_mask = (le.inverse_transform(y_test) == composition)
probs = np.copy(test_probs[comp_mask])
print('probs = {}'.format(probs.shape))
weighted_mass = np.zeros(len(probs))
for class_ in pipeline.classes_:
c = le.inverse_transform(class_)
weighted_mass += comp.simfunctions.comp2mass(c) * probs[:, class_]
print('min = {}'.format(min(weighted_mass)))
print(' = {}'.format((weighted_mass)))
ax.hist(weighted_mass, bins=np.linspace(0, 5, 100),
histtype='step', label=None, color='darkgray',
alpha=1.0, log=False)
for c in comp_list:
ax.axvline(comp.simfunctions.comp2mass(c), color=color_dict[c],
marker='None', linestyle='-')
ax.set_ylabel('Counts')
ax.set_xlabel('Weighted atomic number')
ax.set_title('MC {}'.format(composition))
ax.grid()
plt.tight_layout()
plt.show()
In [15]:
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train, y_train)
test_predictions = pipeline.predict(X_test)
comp_list = ['P', 'He', 'O', 'Fe']
fig, ax = plt.subplots()
test_probs = pipeline.predict_proba(X_test)
fig, axarr = plt.subplots(2, 2, sharex=True, sharey=True)
for composition, ax in zip(comp_list, axarr.flatten()):
comp_mask = (le.inverse_transform(y_test) == composition)
probs = np.copy(test_probs[comp_mask])
weighted_mass = np.zeros(len(probs))
for class_ in pipeline.classes_:
c = le.inverse_transform(class_)
ax.hist(probs[:, class_], bins=np.linspace(0, 1, 50),
histtype='step', label=c, color=color_dict[c],
alpha=1.0, log=True)
ax.legend(title='Reco comp', framealpha=0.5)
ax.set_ylabel('Counts')
ax.set_xlabel('Testing set class probabilities')
ax.set_title('MC {}'.format(composition))
ax.grid()
plt.tight_layout()
plt.show()
In [25]:
comp_list = ['light', 'heavy']
test_probs = defaultdict(list)
fig, ax = plt.subplots()
# test_probs = pipeline.predict_proba(X_test)
for event in pipeline.predict_proba(X_test_data):
composition = le.inverse_transform(np.arg(event))
test_probs[composition].append(np.a(event))
for composition in comp_list:
plt.hist(test_probs[composition], bins=np.linspace(0, 1, 100),
histtype='step', label=composition,
color=color_dict[composition], alpha=0.8, log=False)
plt.ylabel('Counts')
plt.xlabel('Testing set class probabilities')
plt.legend(title='Reco comp')
plt.grid()
plt.show()
In [ ]:
In [ ]: