Author: James Bourbeau
In [1]:
%load_ext watermark
%watermark -u -d -v -p numpy,matplotlib,scipy,pandas,sklearn,mlxtend
In [21]:
%matplotlib inline
from __future__ import division, print_function
from collections import defaultdict
import numpy as np
from scipy.sparse import block_diag
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  # seaborn.apionly is deprecated; the plain import works here
import itertools  # needed by plot_confusion_matrix below
import json
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, KFold, StratifiedKFold
import composition as comp
import composition.analysis.plotting as plotting
color_dict = {'light': 'C0', 'heavy': 'C1', 'total': 'C2',
              'P': 'C0', 'He': 'C1', 'O': 'C3', 'Fe': 'C4'}
Whether to train on the grouped 'light' and 'heavy' composition classes or on the individual compositions ('P', 'He', 'O', 'Fe')
In [4]:
comp_class = True
comp_list = ['light', 'heavy'] if comp_class else ['P', 'He', 'O', 'Fe']
Get composition classifier pipeline
In [5]:
pipeline_str = 'GBDT'
pipeline = comp.get_pipeline(pipeline_str)
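comp.get_pipeline returns a scikit-learn Pipeline from the author's composition package. For readers without that package, a rough stand-in for the 'GBDT' option might look like the sketch below; the scaler choice and hyperparameters are assumptions, not the package's actual settings. Only the 'classifier' step name is relied on later in this notebook.
In [ ]:
# Hypothetical stand-in for comp.get_pipeline('GBDT') -- NOT the package's actual configuration
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

gbdt_pipeline = Pipeline([
    ('scaler', StandardScaler()),                  # assumed preprocessing step
    ('classifier', GradientBoostingClassifier()),  # step name matches usage below
])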
Define energy binning for this analysis
In [6]:
energybins = comp.analysis.get_energybins()
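The energybins object comes from the composition package; the sketch below reconstructs only the attributes this notebook actually uses (log_energy_bins, log_energy_midpoints, energy_midpoints, energy_bin_widths, log_energy_min/max). The bin edges shown are assumptions inferred from the plot limits below, not the package's exact values.
In [ ]:
# Minimal sketch of the binning attributes used below
# (assumed edges: 10^6.3 to 10^8 GeV in 0.1-decade steps)
class EnergyBinsSketch(object):
    def __init__(self, log_energy_min=6.3, log_energy_max=8.0, width=0.1):
        self.log_energy_min = log_energy_min
        self.log_energy_max = log_energy_max
        self.log_energy_bins = np.arange(log_energy_min, log_energy_max + width, width)
        self.log_energy_midpoints = (self.log_energy_bins[1:] + self.log_energy_bins[:-1]) / 2
        self.energy_bins = 10**self.log_energy_bins
        self.energy_midpoints = 10**self.log_energy_midpoints
        self.energy_bin_widths = self.energy_bins[1:] - self.energy_bins[:-1]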
In [7]:
sim_train, sim_test = comp.preprocess_sim(comp_class=comp_class, return_energy=True)
In [8]:
# Compute the correlation matrix
df_sim = comp.load_dataframe(datatype='sim', config='IC79')
feature_list, feature_labels = comp.analysis.get_training_features()
corr = df_sim[feature_list].corr()
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=bool)  # np.bool is deprecated; use the builtin
mask[np.triu_indices_from(mask)] = True
fig, ax = plt.subplots()
sns.heatmap(corr, mask=mask, cmap='RdBu_r', center=0,
            square=True, xticklabels=feature_labels, yticklabels=feature_labels,
            linewidths=.5, cbar_kws={'label': 'Correlation'}, annot=True, ax=ax)
# outfile = args.outdir + '/feature_covariance.png'
# plt.savefig(outfile)
plt.show()
In [9]:
data = comp.preprocess_data(comp_class=comp_class, return_energy=True)
In [14]:
# Keep only events whose feature values are all finite
finite_data_mask = np.isfinite(data.X).all(axis=1)
data = data[finite_data_mask]
Run the classifier over the training and testing sets to gauge the degree of overfitting
In [19]:
clf_name = pipeline.named_steps['classifier'].__class__.__name__
print('=' * 30)
print(clf_name)
pipeline.fit(sim_train.X, sim_train.y)
train_pred = pipeline.predict(sim_train.X)
train_acc = accuracy_score(sim_train.y, train_pred)
print('Training accuracy = {:.2%}'.format(train_acc))
test_pred = pipeline.predict(sim_test.X)
test_acc = accuracy_score(sim_test.y, test_pred)
print('Testing accuracy = {:.2%}'.format(test_acc))
print('=' * 30)
In [23]:
len(energybins.energy_midpoints)*2
Out[23]:
In [9]:
def get_frac_correct(train, test, pipeline, comp_list):
    assert isinstance(train, comp.analysis.DataSet), 'train dataset must be a DataSet'
    assert isinstance(test, comp.analysis.DataSet), 'test dataset must be a DataSet'
    assert train.y is not None, 'train must have true y values'
    assert test.y is not None, 'test must have true y values'
    pipeline.fit(train.X, train.y)
    test_predictions = pipeline.predict(test.X)
    correctly_identified_mask = (test_predictions == test.y)
    # Construct MC composition masks
    MC_comp_mask = {}
    for composition in comp_list:
        MC_comp_mask[composition] = (test.le.inverse_transform(test.y) == composition)
    MC_comp_mask['total'] = np.array([True]*len(test))
    reco_frac, reco_frac_err = {}, {}
    for composition in comp_list + ['total']:
        comp_mask = MC_comp_mask[composition]
        # Get number of MC comp in each reco energy bin
        num_MC_energy = np.histogram(test.log_energy[comp_mask],
                                     bins=energybins.log_energy_bins)[0]
        num_MC_energy_err = np.sqrt(num_MC_energy)
        # Get number of correctly identified comp in each reco energy bin
        num_reco_energy = np.histogram(test.log_energy[comp_mask & correctly_identified_mask],
                                       bins=energybins.log_energy_bins)[0]
        num_reco_energy_err = np.sqrt(num_reco_energy)
        # Calculate correctly identified fractions as a function of MC energy
        reco_frac[composition], reco_frac_err[composition] = comp.ratio_error(
            num_reco_energy, num_reco_energy_err,
            num_MC_energy, num_MC_energy_err)
    return reco_frac, reco_frac_err
In [10]:
# Split training data into CV training and testing folds
kf = KFold(n_splits=10)
frac_correct_folds = defaultdict(list)
fold_num = 0
print('Fold ', end='')
for train_index, test_index in kf.split(sim_train.X):
    fold_num += 1
    print('{}...'.format(fold_num), end='')
    reco_frac, reco_frac_err = get_frac_correct(sim_train[train_index],
                                                sim_train[test_index],
                                                pipeline, comp_list)
    for composition in comp_list:
        frac_correct_folds[composition].append(reco_frac[composition])
    frac_correct_folds['total'].append(reco_frac['total'])
frac_correct_gen_err = {key: np.std(frac_correct_folds[key], axis=0) for key in frac_correct_folds}
scores = np.array(frac_correct_folds['total'])
score = scores.mean(axis=1).mean()
score_std = scores.mean(axis=1).std()
In [11]:
reco_frac, reco_frac_stat_err = get_frac_correct(sim_train, sim_test, pipeline, comp_list)
In [12]:
# Plot fraction of events correctly classified vs energy
fig, ax = plt.subplots()
for composition in comp_list + ['total']:
    err = np.sqrt(frac_correct_gen_err[composition]**2 + reco_frac_stat_err[composition]**2)
    plotting.plot_steps(energybins.log_energy_midpoints, reco_frac[composition], err, ax,
                        color_dict[composition], composition)
plt.xlabel(r'$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('Fraction correctly identified')
ax.set_ylim([0.0, 1.0])
ax.set_xlim([energybins.log_energy_min, energybins.log_energy_max])
ax.grid()
leg = plt.legend(loc='upper center', frameon=False,
                 bbox_to_anchor=(0.5,  # horizontal
                                 1.1),  # vertical
                 ncol=len(comp_list)+1, fancybox=False)
# Set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
cv_str = 'Accuracy: {:0.2f}% (+/- {:0.1f}%)'.format(score*100, score_std*100)
ax.text(7.4, 0.2, cv_str,
        ha='center', va='center', size=10,
        bbox=dict(boxstyle='round', fc='white', ec='gray', lw=0.8))
plt.savefig('/home/jbourbeau/public_html/figures/frac-correct.png')
plt.show()
In [10]:
def get_num_comp_reco(train, test, pipeline, comp_list):
    assert isinstance(train, comp.analysis.DataSet), 'train dataset must be a DataSet'
    assert isinstance(test, comp.analysis.DataSet), 'test dataset must be a DataSet'
    assert train.y is not None, 'train must have true y values'
    pipeline.fit(train.X, train.y)
    test_predictions = pipeline.predict(test.X)
    # Get number of correctly identified comp in each reco energy bin
    num_reco_energy, num_reco_energy_err = {}, {}
    for composition in comp_list:
        print('composition = {}'.format(composition))
        comp_mask = train.le.inverse_transform(test_predictions) == composition
        print('sum(comp_mask) = {}'.format(np.sum(comp_mask)))
        print(test.log_energy[comp_mask])
        num_reco_energy[composition] = np.histogram(test.log_energy[comp_mask],
                                                    bins=energybins.log_energy_bins)[0]
        num_reco_energy_err[composition] = np.sqrt(num_reco_energy[composition])
    num_reco_energy['total'] = np.histogram(test.log_energy, bins=energybins.log_energy_bins)[0]
    num_reco_energy_err['total'] = np.sqrt(num_reco_energy['total'])
    return num_reco_energy, num_reco_energy_err
In [15]:
# Get number of events per energy bin
num_reco_energy, num_reco_energy_err = get_num_comp_reco(sim_train, data, pipeline, comp_list)
import pprint
pprint.pprint(num_reco_energy)
# Solid angle for the zenith cut cos(theta) >= 0.8: Omega = 2*pi*(1 - cos(theta_max))
solid_angle = 2*np.pi*(1 - 0.8)
In [16]:
counts_observed = []
for light_counts, heavy_counts in zip(num_reco_energy['light'], num_reco_energy['heavy']):
    counts_observed.extend([light_counts, heavy_counts])
print('counts_observed = {}'.format(counts_observed))
In [17]:
pipeline.fit(sim_train.X, sim_train.y)
test_predictions = pipeline.predict(sim_test.X)
true_comp = sim_train.le.inverse_transform(sim_test.y)
pred_comp = sim_train.le.inverse_transform(test_predictions)
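Here le is the fitted sklearn LabelEncoder stored on the training DataSet; inverse_transform maps integer class labels back to composition strings. A generic reminder of the encoding (standard sklearn behavior, not anything package-specific):
In [ ]:
# Generic LabelEncoder demo; classes_ are sorted alphabetically: ['heavy', 'light']
from sklearn.preprocessing import LabelEncoder

le_demo = LabelEncoder().fit(['light', 'heavy'])
le_demo.transform(['light', 'heavy'])        # -> array([1, 0])
le_demo.inverse_transform(np.array([1, 0]))  # -> array(['light', 'heavy'], ...)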
In [18]:
response_list = []
response_err_list = []
sim_bin_idxs = np.digitize(sim_test.log_energy, energybins.log_energy_bins) - 1
data_bin_idxs = np.digitize(data.log_energy, energybins.log_energy_bins) - 1
energy_bin_idx = np.unique(sim_bin_idxs)
# energy_bin_idx = energy_bin_idx[1:]
print(energy_bin_idx)
for bin_idx in energy_bin_idx:
    sim_bin_mask = sim_bin_idxs == bin_idx
    data_bin_mask = data_bin_idxs == bin_idx
    response_mat = confusion_matrix(true_comp[sim_bin_mask], pred_comp[sim_bin_mask],
                                    labels=comp_list)
    # Transpose response matrix to get MC comp on x-axis and reco comp on y-axis
    response_mat = response_mat.T
    # Get response matrix statistical error
    response_mat_err = np.sqrt(response_mat)/response_mat.sum(axis=0, keepdims=True)
    response_err_list.append(response_mat_err)
    # Normalize along MC comp axis to go from counts to probabilities
    response_mat = response_mat.astype(float)/response_mat.sum(axis=0, keepdims=True)
    response_list.append(response_mat)
block_response = block_diag(response_list).toarray()
block_response = np.flipud(block_response)
print('block_response = \n{}'.format(block_response))
block_response_err = block_diag(response_err_list).toarray()
block_response_err = np.flipud(block_response_err)
print('block_response_err = \n{}'.format(block_response_err))
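As a sanity check on the stacking convention: the block-diagonal response acts independently on each energy bin's (light, heavy) pair, matching the interleaved counts_observed vector built above. A toy example with hypothetical numbers (the np.flipud reordering used above is omitted here):
In [ ]:
# Toy check of the block-diagonal structure (hypothetical 2-bin responses)
R1 = np.array([[0.8, 0.3],
               [0.2, 0.7]])  # bin 1: columns are true comp, rows are reco comp
R2 = np.array([[0.9, 0.4],
               [0.1, 0.6]])  # bin 2
R_block = block_diag([R1, R2]).toarray()
true_counts = np.array([100, 50, 80, 40])  # interleaved: [light_1, heavy_1, light_2, heavy_2]
reco_counts = R_block.dot(true_counts)
# Equivalent to applying each bin's block separately:
assert np.allclose(reco_counts[:2], R1.dot(true_counts[:2]))
assert np.allclose(reco_counts[2:], R2.dot(true_counts[2:]))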
In [22]:
from icecube.weighting.weighting import from_simprod
from icecube.weighting.fluxes import GaisserH3a, GaisserH4a, Hoerandel5
In [23]:
simlist = np.unique(df_sim['sim'])
for i, sim in enumerate(simlist):
    _, sim_files = comp.simfunctions.get_level3_sim_files(sim)
    num_files = len(sim_files)
    if i == 0:
        generator = num_files*from_simprod(int(sim))
    else:
        generator += num_files*from_simprod(int(sim))
In [ ]:
flux = GaisserH3a()
In [97]:
df_sim[['MC_comp', 'MC_type']][df_sim.MC_comp == 'O']
Out[97]:
In [25]:
priors = defaultdict(list)
for flux, name in zip([GaisserH3a(), GaisserH4a(), Hoerandel5()], ['h3a', 'h4a', 'Hoerandel5']):
    for energy_mid in energybins.energy_midpoints:
        energy = [energy_mid]*4
        ptype = [2212, 1000020040, 1000080160, 1000260560]
        weights = flux(energy, ptype)/generator(energy, ptype)
        light_prior = weights[:2].sum()/weights.sum()
        heavy_prior = weights[2:].sum()/weights.sum()
        priors[name].extend([light_prior, heavy_prior])
print(priors['h3a'])
print(priors['h4a'])
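Since each energy bin contributes a (light, heavy) pair, the priors for each flux model should sum to one within every bin. A quick check of that invariant:
In [ ]:
# Sanity check: light + heavy priors sum to 1 in each energy bin
for name in ['h3a', 'h4a', 'Hoerandel5']:
    prior_pairs = np.asarray(priors[name]).reshape(-1, 2)
    assert np.allclose(prior_pairs.sum(axis=1), 1.0)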
In [28]:
len(counts_observed)
Out[28]:
In [26]:
with open('pyunfold_dict.json', 'w') as outfile:
    # Use a separate name so we don't clobber the `data` DataSet loaded above
    unfolding_dict = {'counts': counts_observed,
                      'block_response': block_response.tolist(),
                      'block_response_err': block_response_err.tolist()}
    for model in ['h3a', 'h4a', 'Hoerandel5']:
        unfolding_dict['priors_{}'.format(model)] = priors[model]
    json.dump(unfolding_dict, outfile)
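For reference, the unfolding inputs can be read back in the same interleaved layout they were written (a minimal sketch assuming the file written above):
In [ ]:
# Round-trip sketch: load the unfolding inputs written above
with open('pyunfold_dict.json') as infile:
    unfolding_inputs = json.load(infile)
counts = np.asarray(unfolding_inputs['counts'])
# Square response: one row/column per (energy bin x composition),
# assuming every energy bin is populated
response = np.asarray(unfolding_inputs['block_response'])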
In [19]:
# Live-time information
goodrunlist = pd.read_table('/data/ana/CosmicRay/IceTop_GRL/IC79_2010_GoodRunInfo_4IceTop.txt', skiprows=[0, 3])
goodrunlist.head()
Out[19]:
In [20]:
livetimes = goodrunlist['LiveTime(s)']
livetime = np.sum(livetimes[goodrunlist['Good_it_L2'] == 1])
print('livetime (seconds) = {}'.format(livetime))
print('livetime (days) = {}'.format(livetime/(24*60*60)))
In [21]:
fig, ax = plt.subplots()
for composition in comp_list + ['total']:
    # Calculate event rate per energy bin
    y = num_reco_energy[composition]
    y_err = num_reco_energy_err[composition]
    # Scale by the detector live time
    y = y / livetime
    y_err = y_err / livetime
    # ax.errorbar(log_energy_midpoints, y, yerr=y_err,
    #             color=color_dict[composition], label=composition,
    #             marker='.', linestyle='None')
    plotting.plot_steps(energybins.log_energy_midpoints, y, y_err,
                        ax, color_dict[composition], composition)
ax.set_yscale("log", nonposy='clip')
plt.xlabel(r'$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('Rate [s$^{-1}$]')
ax.set_xlim([6.2, 8.0])
# ax.set_ylim([10**2, 10**5])
ax.grid(linestyle=':')
leg = plt.legend(loc='upper center', frameon=False,
                 bbox_to_anchor=(0.5,  # horizontal
                                 1.1),  # vertical
                 ncol=len(comp_list)+1, fancybox=False)
# Set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
plt.show()
In [22]:
eff_area, eff_area_error, energy_midpoints = comp.analysis.get_effective_area(df_sim,
                                                                              energybins.energy_bins, energy='MC')
fig, ax = plt.subplots()
ax.errorbar(np.log10(energy_midpoints), eff_area, yerr=eff_area_error)
ax.set_ylabel('Effective area [$m^2$]')
ax.set_xlabel(r'$\log_{10}(E_{\mathrm{MC}}/\mathrm{GeV})$')
ax.grid()
plt.show()
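For context, the effective area in each energy bin is conventionally the fraction of thrown simulation events that survive all cuts, scaled by the generation area. The package's get_effective_area handles the details; a generic sketch of the idea with hypothetical numbers:
In [ ]:
# Generic effective-area sketch (hypothetical counts and generation area,
# not the package implementation): A_eff(E) = (N_passed / N_thrown) * A_thrown
n_passed = np.array([120., 95., 60.])       # events passing cuts per energy bin
n_thrown = np.array([1000., 1000., 1000.])  # events generated per energy bin
throw_area = np.pi * 1000.0**2              # assumed circular generation area [m^2]
eff_area_sketch = (n_passed / n_thrown) * throw_area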
In [23]:
energybins.log_energy_bins
Out[23]:
In [24]:
# Plot the measured flux vs energy
fig = plt.figure()
ax = plt.gca()
for composition in comp_list + ['total']:
    # Calculate dN/dE
    y = num_reco_energy[composition]/energybins.energy_bin_widths
    y_err = num_reco_energy_err[composition]/energybins.energy_bin_widths
    # Scale by effective area
    y, y_err = comp.analysis.ratio_error(y, y_err, eff_area, eff_area_error)
    # Scale by solid angle
    y = y / solid_angle
    y_err = y_err / solid_angle
    # Scale by live time
    y = y / livetime
    y_err = y_err / livetime
    # Apply E^2.7 energy scaling
    y = energybins.energy_midpoints**2.7 * y
    y_err = energybins.energy_midpoints**2.7 * y_err
    plotting.plot_steps(energybins.log_energy_midpoints, y, y_err, ax, color_dict[composition], composition)
ax.set_yscale("log", nonposy='clip')
plt.xlabel(r'$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel(r'$\mathrm{E}^{2.7} \frac{\mathrm{dN}}{\mathrm{dE dA d\Omega dt}} \ [\mathrm{GeV}^{1.7} \mathrm{m}^{-2} \mathrm{sr}^{-1} \mathrm{s}^{-1}]$')
ax.set_xlim([6.3, 8])
ax.set_ylim([10**3, 10**5])
ax.grid(linestyle='dotted', which="both")
# Add the published 3-year flux (scraped points)
df_proton = pd.read_csv('3yearscraped/proton', sep='\t', header=None, names=['energy', 'flux'])
df_helium = pd.read_csv('3yearscraped/helium', sep='\t', header=None, names=['energy', 'flux'])
df_light = pd.DataFrame.from_dict({'energy': df_proton.energy,
                                   'flux': df_proton.flux + df_helium.flux})
df_oxygen = pd.read_csv('3yearscraped/oxygen', sep='\t', header=None, names=['energy', 'flux'])
df_iron = pd.read_csv('3yearscraped/iron', sep='\t', header=None, names=['energy', 'flux'])
df_heavy = pd.DataFrame.from_dict({'energy': df_oxygen.energy,
                                   'flux': df_oxygen.flux + df_iron.flux})
if comp_class:
    ax.plot(np.log10(df_light.energy), df_light.flux, label='3 yr light',
            marker='.', ls=':')
    ax.plot(np.log10(df_heavy.energy), df_heavy.flux, label='3 yr heavy',
            marker='.', ls=':')
    ax.plot(np.log10(df_heavy.energy), df_heavy.flux + df_light.flux, label='3 yr total',
            marker='.', ls=':')
else:
    ax.plot(np.log10(df_proton.energy), df_proton.flux, label='3 yr proton',
            marker='.', ls=':')
    ax.plot(np.log10(df_helium.energy), df_helium.flux, label='3 yr helium',
            marker='.', ls=':', color=color_dict['He'])
    ax.plot(np.log10(df_oxygen.energy), df_oxygen.flux, label='3 yr oxygen',
            marker='.', ls=':', color=color_dict['O'])
    ax.plot(np.log10(df_iron.energy), df_iron.flux, label='3 yr iron',
            marker='.', ls=':', color=color_dict['Fe'])
    ax.plot(np.log10(df_iron.energy), df_proton.flux + df_helium.flux + df_oxygen.flux + df_iron.flux,
            label='3 yr total', marker='.', ls=':', color='C2')
leg = plt.legend(loc='upper center', frameon=False,
                 bbox_to_anchor=(0.5,  # horizontal
                                 1.15),  # vertical
                 ncol=len(comp_list)+1, fancybox=False)
# Set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
plt.savefig('/home/jbourbeau/public_html/figures/spectrum.png')
plt.show()
In [24]:
if not comp_class:
    # Load the published 3-year scraped fluxes
    df_proton = pd.read_csv('3yearscraped/proton', sep='\t', header=None, names=['energy', 'flux'])
    df_helium = pd.read_csv('3yearscraped/helium', sep='\t', header=None, names=['energy', 'flux'])
    df_oxygen = pd.read_csv('3yearscraped/oxygen', sep='\t', header=None, names=['energy', 'flux'])
    df_iron = pd.read_csv('3yearscraped/iron', sep='\t', header=None, names=['energy', 'flux'])
    # Plot the flux vs energy, one panel per composition
    fig, axarr = plt.subplots(2, 2, figsize=(8, 6))
    for composition, ax in zip(comp_list + ['total'], axarr.flatten()):
        # Calculate dN/dE
        y = num_reco_energy[composition]/energybins.energy_bin_widths
        y_err = num_reco_energy_err[composition]/energybins.energy_bin_widths
        # Scale by effective area
        y, y_err = comp.analysis.ratio_error(y, y_err, eff_area, eff_area_error)
        # Scale by solid angle
        y = y / solid_angle
        y_err = y_err / solid_angle
        # Scale by live time
        y = y / livetime
        y_err = y_err / livetime
        # Apply E^2.7 energy scaling
        y = energybins.energy_midpoints**2.7 * y
        y_err = energybins.energy_midpoints**2.7 * y_err
        plotting.plot_steps(energybins.log_energy_midpoints, y, y_err, ax, color_dict[composition], composition)
        # Load 3-year flux
        df_3yr = pd.read_csv('3yearscraped/{}'.format(composition), sep='\t',
                             header=None, names=['energy', 'flux'])
        ax.plot(np.log10(df_3yr.energy), df_3yr.flux, label='3 yr {}'.format(composition),
                marker='.', ls=':', color=color_dict[composition])
        ax.set_yscale("log", nonposy='clip')
        ax.set_xlabel(r'$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
        ax.set_ylabel(r'$\mathrm{E}^{2.7} \frac{\mathrm{dN}}{\mathrm{dE dA d\Omega dt}} \ [\mathrm{GeV}^{1.7} \mathrm{m}^{-2} \mathrm{sr}^{-1} \mathrm{s}^{-1}]$')
        ax.set_xlim([6.3, 8])
        ax.set_ylim([10**3, 10**5])
        ax.grid(linestyle='dotted', which="both")
        ax.legend()
    plt.savefig('/home/jbourbeau/public_html/figures/spectrum.png')
    plt.show()
In [20]:
reco_frac['light']
Out[20]:
In [21]:
reco_frac['heavy']
Out[21]:
In [22]:
num_reco_energy['light']
Out[22]:
In [23]:
num_reco_energy['heavy']
Out[23]:
In [9]:
pipeline.fit(sim_train.X, sim_train.y)
test_predictions = pipeline.predict(sim_test.X)
true_comp = sim_train.le.inverse_transform(sim_test.y)
pred_comp = sim_train.le.inverse_transform(test_predictions)
print(true_comp)
print(pred_comp)
In [10]:
data_pred = pipeline.predict(data.X)
observed_comp = sim_train.le.inverse_transform(data_pred)
print(observed_comp)
In [11]:
comp_list
Out[11]:
In [16]:
sim_bin_idxs = np.digitize(sim_test.log_energy, energybins.log_energy_bins) - 1
data_bin_idxs = np.digitize(data.log_energy, energybins.log_energy_bins) - 1
energy_bin_idx = np.unique(sim_bin_idxs)
energy_bin_idx = energy_bin_idx[1:]
print(energy_bin_idx)
num_reco_energy_unfolded = defaultdict(list)
for bin_idx in energy_bin_idx:
    sim_bin_mask = sim_bin_idxs == bin_idx
    data_bin_mask = data_bin_idxs == bin_idx
    unfolder = comp.analysis.Unfolder()
    unfolded_events = unfolder.unfold(true_MC_comp=true_comp[sim_bin_mask],
                                      reco_MC_comp=pred_comp[sim_bin_mask],
                                      observed_comp=observed_comp[data_bin_mask],
                                      priors=[0.5, 0.5],
                                      labels=comp_list)
    print(unfolded_events)
    for i, composition in enumerate(comp_list):
        num_reco_energy_unfolded[composition].append(unfolded_events[i])
for composition in comp_list:
    num_reco_energy_unfolded[composition] = np.array(num_reco_energy_unfolded[composition])
num_reco_energy_unfolded['total'] = np.sum([num_reco_energy_unfolded[composition] for composition in comp_list], axis=0)
print(num_reco_energy_unfolded)
In [27]:
fig, ax = plt.subplots()
for composition in comp_list + ['total']:
    # Calculate dN/dE, with Poisson errors on the unfolded counts
    y = num_reco_energy_unfolded[composition]/energybins.energy_bin_widths
    y_err = np.sqrt(num_reco_energy_unfolded[composition])/energybins.energy_bin_widths
    # Scale by effective area
    y, y_err = comp.analysis.ratio_error(y, y_err, eff_area, eff_area_error)
    # Scale by solid angle
    y = y / solid_angle
    y_err = y_err / solid_angle
    # Scale by live time
    y = y / livetime
    y_err = y_err / livetime
    # Apply E^2.7 energy scaling
    y = energybins.energy_midpoints**2.7 * y
    y_err = energybins.energy_midpoints**2.7 * y_err
    print(y)
    print(y_err)
    plotting.plot_steps(energybins.log_energy_midpoints, y, y_err, ax, color_dict[composition], composition)
ax.set_yscale("log", nonposy='clip')
plt.xlabel(r'$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel(r'$\mathrm{E}^{2.7} \frac{\mathrm{dN}}{\mathrm{dE dA d\Omega dt}} \ [\mathrm{GeV}^{1.7} \mathrm{m}^{-2} \mathrm{sr}^{-1} \mathrm{s}^{-1}]$')
ax.set_xlim([6.3, 8])
ax.set_ylim([10**3, 10**5])
ax.grid(linestyle='dotted', which="both")
leg = plt.legend(loc='upper center', frameon=False,
                 bbox_to_anchor=(0.5,  # horizontal
                                 1.1),  # vertical
                 ncol=len(comp_list)+1, fancybox=False)
# Set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
# plt.savefig('/home/jbourbeau/public_html/figures/spectrum.png')
plt.show()
Get confusion matrix for each energy bin
In [99]:
bin_idxs = np.digitize(energy_test_sim, log_energy_bins) - 1
energy_bin_idx = np.unique(bin_idxs)
energy_bin_idx = energy_bin_idx[1:]
print(energy_bin_idx)
num_reco_energy_unfolded = defaultdict(list)
response_mat = []
for bin_idx in energy_bin_idx:
    energy_bin_mask = bin_idxs == bin_idx
    confmat = confusion_matrix(true_comp[energy_bin_mask], pred_comp[energy_bin_mask], labels=comp_list)
    confmat = np.divide(confmat.T, confmat.sum(axis=1, dtype=float)).T
    response_mat.append(confmat)
In [100]:
response_mat
Out[100]:
In [134]:
r = np.dstack((np.copy(num_reco_energy['light']), np.copy(num_reco_energy['heavy'])))[0]
for unfold_iter in range(50):
    print('Unfolding iteration {}...'.format(unfold_iter))
    if unfold_iter == 0:
        u = r
    fs = []
    for bin_idx in energy_bin_idx:
        # NB: bin_idx is used both as the histogram bin index and as the position
        # in response_mat; this assumes energy_bin_idx == range(len(response_mat))
        f = np.dot(response_mat[bin_idx], u[bin_idx])
        f[f < 0] = 0
        fs.append(f)
    u = u + (r - fs)
    # u[u < 0] = 0
unfolded_counts_iter = {}
unfolded_counts_iter['light'] = u[:, 0]
unfolded_counts_iter['heavy'] = u[:, 1]
unfolded_counts_iter['total'] = u.sum(axis=1)
print(unfolded_counts_iter)
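The update above is a fixed-point iteration: with per-bin response R, observed counts r, and current estimate u, each pass sets u ← u + (r − R·u), so a fixed point satisfies R·u = r. A minimal sketch on a single hypothetical energy bin, showing the iterate approaching the direct solve:
In [ ]:
# Minimal sketch of the fixed-point iteration on one hypothetical bin
R = np.array([[0.8, 0.3],
              [0.2, 0.7]])      # toy response (columns sum to 1)
r = np.array([100.0, 100.0])    # toy observed (light, heavy) counts
u = r.copy()                    # start from the observed counts
for _ in range(50):
    u = u + (r - R.dot(u))      # same update rule as above
print(u)                        # -> approx [80., 120.]
print(np.linalg.solve(R, r))    # direct solution of R u = r, for comparison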
In [135]:
fig, ax = plt.subplots()
for composition in comp_list + ['total']:
    # Calculate dN/dE, with Poisson errors on the unfolded counts
    y = unfolded_counts_iter[composition]/energy_bin_widths
    y_err = np.sqrt(unfolded_counts_iter[composition])/energy_bin_widths
    # Scale by effective area
    y, y_err = comp.analysis.ratio_error(y, y_err, eff_area, eff_area_error)
    # Scale by solid angle
    y = y / solid_angle
    y_err = y_err / solid_angle
    # Scale by live time
    y = y / livetime
    y_err = y_err / livetime
    # Apply E^2.7 energy scaling
    y = energy_midpoints**2.7 * y
    y_err = energy_midpoints**2.7 * y_err
    print(y)
    print(y_err)
    plotting.plot_steps(log_energy_midpoints, y, y_err, ax, color_dict[composition], composition)
ax.set_yscale("log", nonposy='clip')
plt.xlabel(r'$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel(r'$\mathrm{E}^{2.7} \frac{\mathrm{dN}}{\mathrm{dE dA d\Omega dt}} \ [\mathrm{GeV}^{1.7} \mathrm{m}^{-2} \mathrm{sr}^{-1} \mathrm{s}^{-1}]$')
ax.set_xlim([6.3, 8])
ax.set_ylim([10**3, 10**5])
ax.grid(linestyle='dotted', which="both")
leg = plt.legend(loc='upper center', frameon=False,
                 bbox_to_anchor=(0.5,  # horizontal
                                 1.1),  # vertical
                 ncol=len(comp_list)+1, fancybox=False)
# Set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
# plt.savefig('/home/jbourbeau/public_html/figures/spectrum.png')
plt.show()
In [106]:
print(num_reco_energy)
In [107]:
comp_list = ['light', 'heavy']
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train_sim, y_train_sim)
test_predictions = pipeline.predict(X_test_sim)
true_comp = le.inverse_transform(y_test_sim)
pred_comp = le.inverse_transform(test_predictions)
confmat = confusion_matrix(true_comp, pred_comp, labels=comp_list)

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Greens):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    plt.imshow(cm, interpolation='None', cmap=cmap,
               vmin=0, vmax=1.0)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, '{:0.3f}'.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True composition')
    plt.xlabel('Predicted composition')

fig, ax = plt.subplots()
plot_confusion_matrix(confmat, classes=['light', 'heavy'], normalize=True,
                      title='Normalized confusion matrix')
plt.show()
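As a quick numeric check of the row normalization used in plot_confusion_matrix (hypothetical counts): each row of the normalized matrix is the fraction of true-class events assigned to each predicted class, so rows sum to one.
In [ ]:
# Row-normalization check with hypothetical counts
# (rows: true class, columns: predicted class)
cm = np.array([[90, 10],
               [30, 70]])
cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print(cm_norm)              # [[0.9, 0.1], [0.3, 0.7]]
print(cm_norm.sum(axis=1))  # [1., 1.]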
In [63]:
comp_list = ['light', 'heavy']
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train_sim, y_train_sim)
test_predictions = pipeline.predict(X_test_sim)
# correctly_identified_mask = (test_predictions == y_test)
# confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)/len(y_pred)
true_comp = le.inverse_transform(y_test_sim)
pred_comp = le.inverse_transform(test_predictions)
confmat = confusion_matrix(true_comp, pred_comp, labels=comp_list)
inverse = np.linalg.inv(confmat)
inverse
Out[63]:
In [64]:
confmat
Out[64]:
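The idea behind the next cell: if a normalized confusion matrix C maps true counts to predicted counts, then applying C⁻¹ to the observed counts gives a (noise-sensitive) estimate of the true counts. A minimal sketch with a hypothetical normalized matrix; note the cell above inverts the raw-count confusion matrix instead:
In [ ]:
# Sketch of confusion-matrix inversion with a hypothetical normalized matrix
C = np.array([[0.9, 0.2],
              [0.1, 0.8]])          # C[i, j]: P(predicted i | true j); columns sum to 1
observed = np.array([500.0, 300.0])
estimated_true = np.linalg.inv(C).dot(observed)
print(estimated_true)               # -> approx [485.7, 314.3]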
In [66]:
comp_list = ['light', 'heavy']
# Get number of events per energy bin
num_reco_energy, num_reco_energy_err = get_num_comp_reco(X_train_sim, y_train_sim, X_test_data, comp_list)
# Energy-related variables
energy_bin_width = 0.1
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
energy_midpoints = (energy_bins[1:] + energy_bins[:-1]) / 2
energy_bin_widths = 10**energy_bins[1:] - 10**energy_bins[:-1]

def get_energy_res(df_sim, energy_bins):
    reco_log_energy = df_sim['lap_log_energy'].values
    MC_log_energy = df_sim['MC_log_energy'].values
    energy_res = reco_log_energy - MC_log_energy
    bin_centers, bin_medians, energy_err = comp.analysis.data_functions.get_medians(reco_log_energy,
                                                                                    energy_res,
                                                                                    energy_bins)
    return np.abs(bin_medians)

# Solid angle for the zenith cut cos(theta) >= 0.85: Omega = 2*pi*(1 - cos(theta_max))
solid_angle = 2*np.pi*(1 - 0.85)
print(solid_angle)
print(2*np.pi*(1 - np.cos(40*(np.pi/180))))
# Live-time information
start_time = np.amin(df_data['start_time_mjd'].values)
end_time = np.amax(df_data['end_time_mjd'].values)
day_to_sec = 24 * 60 * 60.
dt = day_to_sec * (end_time - start_time)
print(dt)
# Correct the observed counts with the inverted confusion matrix
# (loop-invariant, so computed once before plotting)
num_reco_bin = np.array([[i, j] for i, j in zip(num_reco_energy['light'], num_reco_energy['heavy'])])
num_reco = np.array([np.dot(inverse, i) for i in num_reco_bin])
print(num_reco)
num_reco_2 = {'light': num_reco[:, 0], 'heavy': num_reco[:, 1]}
# Plot the corrected flux vs energy
fig, ax = plt.subplots()
for composition in comp_list:
    # Calculate dN/dE
    y = num_reco_2[composition]/energy_bin_widths
    y_err = num_reco_energy_err[composition]/energy_bin_widths
    # Scale by effective area
    y, y_err = comp.analysis.ratio_error(y, y_err, eff_area, eff_area_error)
    # Scale by solid angle
    y = y / solid_angle
    y_err = y_err / solid_angle
    # Scale by live time
    y = y / dt
    y_err = y_err / dt
    # Apply E^2.7 energy scaling (energy_midpoints are log10 values here)
    energy_err = get_energy_res(df_sim, energy_bins)
    energy_err = np.array(energy_err)
    y = (10**energy_midpoints)**2.7 * y
    y_err = (10**energy_midpoints)**2.7 * y_err
    plotting.plot_steps(energy_midpoints, y, y_err, ax, color_dict[composition], composition)
ax.set_yscale("log", nonposy='clip')
plt.xlabel(r'$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel(r'$\mathrm{E}^{2.7} \frac{\mathrm{dN}}{\mathrm{dE dA d\Omega dt}} \ [\mathrm{GeV}^{1.7} \mathrm{m}^{-2} \mathrm{sr}^{-1} \mathrm{s}^{-1}]$')
ax.set_xlim([6.2, 8.0])
# ax.set_ylim([10**2, 10**5])
ax.grid()
leg = plt.legend(loc='upper center',
                 bbox_to_anchor=(0.5,  # horizontal
                                 1.1),  # vertical
                 ncol=len(comp_list)+1, fancybox=False)
# Set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
plt.show()
In [44]:
pipeline.get_params()['classifier__max_depth']
Out[44]:
In [47]:
energy_bin_width = 0.1
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
fig, axarr = plt.subplots(1, 2)
for composition, ax in zip(comp_list, axarr.flatten()):
    MC_comp_mask = (df_sim['MC_comp_class'] == composition)
    MC_log_energy = df_sim['MC_log_energy'][MC_comp_mask].values
    reco_log_energy = df_sim['lap_log_energy'][MC_comp_mask].values
    plotting.histogram_2D(MC_log_energy, reco_log_energy, energy_bins, log_counts=True, ax=ax)
    ax.plot([0, 10], [0, 10], marker='None', linestyle='-.')
    ax.set_xlim([6.2, 8])
    ax.set_ylim([6.2, 8])
    ax.set_xlabel(r'$\log_{10}(E_{\mathrm{MC}}/\mathrm{GeV})$')
    ax.set_ylabel(r'$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
    ax.set_title('{} response matrix'.format(composition))
plt.tight_layout()
plt.show()
In [10]:
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
10**energy_bins[1:] - 10**energy_bins[:-1]
Out[10]:
In [ ]:
probs = pipeline.named_steps['classifier'].predict_proba(X_test)
prob_1 = probs[:, 0][MC_iron_mask]
prob_2 = probs[:, 1][MC_iron_mask]
# print(min(prob_1-prob_2))
# print(max(prob_1-prob_2))
# plt.hist(prob_1-prob_2, bins=30, log=True)
plt.hist(prob_1, bins=np.linspace(0, 1, 50), log=True)
plt.hist(prob_2, bins=np.linspace(0, 1, 50), log=True)
In [ ]:
probs = pipeline.named_steps['classifier'].predict_proba(X_test)
dp1 = (probs[:, 0]-probs[:, 1])[MC_proton_mask]
print(min(dp1))
print(max(dp1))
dp2 = (probs[:, 0]-probs[:, 1])[MC_iron_mask]
print(min(dp2))
print(max(dp2))
fig, ax = plt.subplots()
# plt.hist(prob_1-prob_2, bins=30, log=True)
counts, edges, pathes = plt.hist(dp1, bins=np.linspace(-1, 1, 100), log=True, label='Proton', alpha=0.75)
counts, edges, pathes = plt.hist(dp2, bins=np.linspace(-1, 1, 100), log=True, label='Iron', alpha=0.75)
plt.legend(loc=2)
plt.show()
pipeline.named_steps['classifier'].classes_
In [ ]:
print(pipeline.named_steps['classifier'].classes_)
le.inverse_transform(pipeline.named_steps['classifier'].classes_)
In [ ]:
pipeline.named_steps['classifier'].decision_path(X_test)
In [48]:
comp_list = ['light', 'heavy']
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train_sim, y_train_sim)
fig, ax = plt.subplots()
test_predictions = pipeline.predict(X_test_data)
test_probs = pipeline.predict_proba(X_test_data)
for class_ in pipeline.classes_:
    composition = le.inverse_transform(class_)
    plt.hist(test_probs[:, class_], bins=np.linspace(0, 1, 50),
             histtype='step', label=composition,
             color=color_dict[composition], alpha=0.8, log=True)
plt.ylabel('Counts')
plt.xlabel('Testing set class probabilities')
plt.legend()
plt.grid()
plt.show()
In [5]:
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train, y_train)
test_predictions = pipeline.predict(X_test)
comp_list = ['P', 'He', 'O', 'Fe']
test_probs = pipeline.predict_proba(X_test)
fig, axarr = plt.subplots(2, 2, sharex=True, sharey=True)
for composition, ax in zip(comp_list, axarr.flatten()):
    comp_mask = (le.inverse_transform(y_test) == composition)
    probs = np.copy(test_probs[comp_mask])
    print('probs = {}'.format(probs.shape))
    # Mass-weighted average over the predicted class probabilities
    weighted_mass = np.zeros(len(probs))
    for class_ in pipeline.classes_:
        c = le.inverse_transform(class_)
        weighted_mass += comp.simfunctions.comp2mass(c) * probs[:, class_]
    print('min = {}'.format(min(weighted_mass)))
    print('max = {}'.format(max(weighted_mass)))
    ax.hist(weighted_mass, bins=np.linspace(0, 5, 100),
            histtype='step', label=None, color='darkgray',
            alpha=1.0, log=False)
    for c in comp_list:
        ax.axvline(comp.simfunctions.comp2mass(c), color=color_dict[c],
                   marker='None', linestyle='-')
    ax.set_ylabel('Counts')
    ax.set_xlabel('Weighted atomic number')
    ax.set_title('MC {}'.format(composition))
    ax.grid()
plt.tight_layout()
plt.show()
In [15]:
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train, y_train)
test_predictions = pipeline.predict(X_test)
comp_list = ['P', 'He', 'O', 'Fe']
test_probs = pipeline.predict_proba(X_test)
fig, axarr = plt.subplots(2, 2, sharex=True, sharey=True)
for composition, ax in zip(comp_list, axarr.flatten()):
    comp_mask = (le.inverse_transform(y_test) == composition)
    probs = np.copy(test_probs[comp_mask])
    for class_ in pipeline.classes_:
        c = le.inverse_transform(class_)
        ax.hist(probs[:, class_], bins=np.linspace(0, 1, 50),
                histtype='step', label=c, color=color_dict[c],
                alpha=1.0, log=True)
    ax.legend(title='Reco comp', framealpha=0.5)
    ax.set_ylabel('Counts')
    ax.set_xlabel('Testing set class probabilities')
    ax.set_title('MC {}'.format(composition))
    ax.grid()
plt.tight_layout()
plt.show()
In [25]:
comp_list = ['light', 'heavy']
test_probs = defaultdict(list)
fig, ax = plt.subplots()
for event in pipeline.predict_proba(X_test_data):
    composition = le.inverse_transform(np.argmax(event))
    test_probs[composition].append(np.amax(event))
for composition in comp_list:
    plt.hist(test_probs[composition], bins=np.linspace(0, 1, 100),
             histtype='step', label=composition,
             color=color_dict[composition], alpha=0.8, log=False)
plt.ylabel('Counts')
plt.xlabel('Testing set class probabilities')
plt.legend(title='Reco comp')
plt.grid()
plt.show()