In [63]:
from __future__ import division
import os
from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
import comptools as comp
color_dict = comp.color_dict
%matplotlib inline
In [8]:
config = 'IC86.2012'
num_groups = 4
In [18]:
comp_list = comp.get_comp_list(num_groups=num_groups)
energybins = comp.get_energybins(config=config)
log_energy_min = energybins.log_energy_min
log_energy_max = energybins.log_energy_max
# Load simulation training/testing DataFrames
print('Loading simulation training/testing DataFrames...')
df_sim_train, df_sim_test = comp.load_sim(config=config,
log_energy_min=log_energy_min,
log_energy_max=log_energy_max,
test_size=0.5,
verbose=True)
In [59]:
label_dict = {'reco_log_energy': '$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$',
'lap_log_energy': '$\log_{10}(E_{\mathrm{Lap}}/\mathrm{GeV})$',
'log_s50': '$\log_{10}(S_{\mathrm{50}})$',
'log_s80': '$\log_{10}(S_{\mathrm{80}})$',
'log_s125': '$\log_{10}(S_{\mathrm{125}})$',
'log_s180': '$\log_{10}(S_{\mathrm{180}})$',
'log_s250': '$\log_{10}(S_{\mathrm{250}})$',
'log_s500': '$\log_{10}(S_{\mathrm{500}})$',
'lap_rlogl': '$r\log_{10}(l)$',
'lap_beta': 'lap beta',
'InIce_log_charge_1_60': 'InIce charge',
'InIce_log_charge_1_45': 'InIce charge (top 75\%)',
'InIce_charge_1_30': 'InIce charge (top 50\%)',
'InIce_log_charge_1_30': '$\log_{10}(InIce charge (top 50))$',
'InIce_log_charge_1_15': 'InIce charge (top 25\%)',
'InIce_log_charge_1_6': 'InIce charge (top 10\%)',
'reco_cos_zenith': '$\cos(\\theta_{\mathrm{reco}})$',
'lap_cos_zenith': '$\cos(\\theta_{\mathrm{Lap}})$',
'LLHlap_cos_zenith': '$\cos(\\theta_{\mathrm{Lap}})$',
'LLHLF_cos_zenith': '$\cos(\\theta_{\mathrm{LLH+COG}})$',
'lap_chi2': '$\chi^2_{\mathrm{Lap}}/\mathrm{n.d.f}$',
'NChannels_1_60': 'NChannels',
'NChannels_1_45': 'NChannels (top 75\%)',
'NChannels_1_30': 'NChannels (top 50\%)',
'NChannels_1_15': 'NChannels (top 25\%)',
'NChannels_1_6': 'NChannels (top 10\%)',
'log_NChannels_1_30': '$\log_{10}$(NChannels (top 50\%))',
'StationDensity': 'StationDensity',
'charge_nchannels_ratio': 'Charge/NChannels',
'stationdensity_charge_ratio': 'StationDensity/Charge',
'NHits_1_30': 'NHits',
'log_NHits_1_30': '$\log_{10}$(NHits (top 50\%))',
'charge_nhits_ratio': 'Charge/NHits',
'nhits_nchannels_ratio': 'NHits/NChannels',
'stationdensity_nchannels_ratio': 'StationDensity/NChannels',
'stationdensity_nhits_ratio': 'StationDensity/NHits',
'llhratio': 'llhratio',
'n_he_stoch_standard': 'Num HE stochastics (standard)',
'n_he_stoch_strong': 'Num HE stochastics (strong)',
'eloss_1500_standard': 'dE/dX (standard)',
'log_dEdX': '$\mathrm{\log_{10}(dE/dX)}$',
'eloss_1500_strong': 'dE/dX (strong)',
'num_millipede_particles': '$N_{\mathrm{mil}}$',
'avg_inice_radius': '$\mathrm{\langle R_{\mu} \\rangle }$',
'invqweighted_inice_radius_1_60': '$\mathrm{R_{\mu \ bundle}}$',
'avg_inice_radius_1_60': '$\mathrm{R_{\mu \ bundle}}$',
'avg_inice_radius_Laputop': '$R_{\mathrm{core, Lap}}$',
'FractionContainment_Laputop_InIce': '$C_{\mathrm{IC}}$',
'Laputop_IceTop_FractionContainment': '$C_{\mathrm{IT}}$',
'max_inice_radius': '$R_{\mathrm{max}}$',
'invcharge_inice_radius': '$R_{\mathrm{q,core}}$',
'lap_zenith': 'zenith',
'NStations': 'NStations',
'IceTop_charge': 'IT charge',
'IceTop_charge_175m': 'Signal greater 175m',
'log_IceTop_charge_175m': '$\log_{10}(Q_{IT, 175})$',
'IT_charge_ratio': 'IT charge ratio',
'refit_beta': '$\mathrm{\\beta_{refit}}$',
'log_d4r_peak_energy': '$\mathrm{\log_{10}(E_{D4R})}$',
'log_d4r_peak_sigma': '$\mathrm{\log_{10}(\sigma E_{D4R})}$',
'd4r_N': 'D4R N',
'median_inice_radius': 'Median InIce'
}
In [60]:
feature_list = ['lap_cos_zenith', 'log_s125', 'log_dEdX', 'avg_inice_radius']
# feature_list = ['lap_cos_zenith', 'log_s125', 'log_dEdX', 'avg_inice_radius', 'StationDensity']
feature_labels = [label_dict[feature] for feature in feature_list]
In [64]:
d = pd.DataFrame(df_sim_train, columns=feature_list)
# Compute the correlation matrix
corr = d.corr()
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
fig, ax = plt.subplots()
sns.heatmap(corr, mask=mask, cmap='RdBu_r', center=0, vmin=-1, vmax=1,
square=True, xticklabels=feature_labels, yticklabels=feature_labels,
linewidths=.5, cbar_kws={'label': 'Covariance'}, annot=True, ax=ax)
corr_outfile = os.path.join(comp.paths.figures_dir, 'feature_distributions',
'correlation-matrix-{}.png'.format(config))
comp.check_output_dir(corr_outfile)
plt.savefig(corr_outfile)
plt.show()
In [65]:
with sns.axes_style(rc={'axes.grid': True}) as _:
# with mpl.rc_context(rc={'text.usetex': False}) as _, sns.axes_style(rc={'axes.grid': True}) as _:
s125_mask = (df_sim_train['log_s125'] < 1.0) & (df_sim_train['log_s125'] > 0.9)
g = sns.pairplot(df_sim_train[s125_mask], vars=feature_list,
hue='comp_group_{}'.format(num_groups),
hue_order=comp_list,
palette=color_dict,
diag_kws={'bins': 20, 'alpha': 0.7},
plot_kws={'lw': 0, 'alpha': 0.2})
# Make PairPlot lower diagonal
for i, j in zip(*np.triu_indices_from(g.axes, 1)):
g.axes[i, j].set_visible(False)
# Set axes labels to use formatted feature_labels
for idx in range(len(g.axes)):
g.axes[idx, 0].set_ylabel(feature_labels[idx])
g.axes[-1, idx].set_xlabel(feature_labels[idx])
# Modify legend title
g.fig.get_children()[-1].set_title('MC composition')
# Save figure
pairplot_outfile = os.path.join(comp.paths.figures_dir, 'feature_distributions',
'pairplot-{}.png'.format(config))
comp.check_output_dir(pairplot_outfile)
plt.savefig(pairplot_outfile)
In [ ]:
In [ ]:
In [14]:
with mpl.rc_context(rc={'text.usetex': False}):
for feature in feature_list:
fig, ax = plt.subplots()
df_sim_train[feature].plot(kind='hist', bins=100, ax=ax)
ax.set_xlabel(feature)
plt.show()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [3]:
df, cut_dict = comp.load_sim(type_='sim', config='IT73', return_cut_dict=True)
selection_mask = np.array([True] * len(df))
standard_cut_keys = ['lap_reco_success', 'lap_zenith', 'num_hits_1_30', 'IT_signal',
'max_qfrac_1_30', 'lap_containment', 'energy_range_lap']
for key in standard_cut_keys:
selection_mask *= cut_dict[key]
df = df[selection_mask]
# feature_list, feature_labels = comp.get_training_features()
feature_list = np.array(['lap_log_energy', 'InIce_log_charge_1_30', 'lap_cos_zenith',
'log_NChannels_1_30', 'log_s125', 'StationDensity',
'charge_nchannels_ratio', 'charge_nhits_ratio',
'stationdensity_charge_ratio', 'nchannels_nhits_ratio',
'stationdensity_nchannels_ratio', 'stationdensity_nhits_ratio',
'lap_likelihood', 'log_NHits_1_30', 'log_s50', 'log_s80',
'log_s180', 'log_s250', 'log_s500'])
label_dict = {'reco_log_energy': '$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$',
'lap_log_energy': '$\log_{10}(E_{\mathrm{Lap}}/\mathrm{GeV})$',
'log_s50': '$\log_{10}(S_{\mathrm{50}})$',
'log_s80': '$\log_{10}(S_{\mathrm{80}})$',
'log_s125': '$\log_{10}(S_{\mathrm{125}})$',
'log_s180': '$\log_{10}(S_{\mathrm{180}})$',
'log_s250': '$\log_{10}(S_{\mathrm{250}})$',
'log_s500': '$\log_{10}(S_{\mathrm{500}})$',
'lap_likelihood': '$r\log_{10}(l)$',
'InIce_charge_1_30': 'InIce charge (top 50\%)',
'InIce_log_charge_1_30': '$\log_{10}$(InIce charge (top 50\%))',
'lap_cos_zenith': '$\cos(\\theta_{\mathrm{Lap}})$',
'LLHlap_cos_zenith': '$\cos(\\theta_{\mathrm{Lap}})$',
'LLHLF_cos_zenith': '$\cos(\\theta_{\mathrm{LLH+COG}})$',
'lap_chi2': '$\chi^2_{\mathrm{Lap}}/\mathrm{n.d.f}$',
'NChannels_1_30': 'NChannels (top 50\%)',
'log_NChannels_1_30' : '$\log_{10}$(NChannels (top 50\%))',
'StationDensity': 'StationDensity',
'charge_nchannels_ratio': 'Charge/NChannels',
'stationdensity_charge_ratio': 'StationDensity/Charge',
'NHits_1_30': 'NHits',
'log_NHits_1_30': '$\log_{10}$(NHits (top 50\%))',
'charge_nhits_ratio': 'Charge/NHits',
'nchannels_nhits_ratio': 'NChannels/NHits',
'stationdensity_nchannels_ratio': 'StationDensity/NChannels',
'stationdensity_nhits_ratio': 'StationDensity/NHits'
}
feature_labels = np.array([label_dict[feature] for feature in feature_list])
print('training features = {}'.format(feature_list))
X_train, X_test, y_train, y_test, le = comp.get_train_test_sets(
df, feature_list, comp_class=True)
print('number training events = ' + str(y_train.shape[0]))
print('number testing events = ' + str(y_test.shape[0]))
In [4]:
d = pd.DataFrame(df, columns=feature_list)
# Compute the correlation matrix
corr = d.corr()
# corr = d.corr().values
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
fig, ax = plt.subplots()
sns.heatmap(corr, mask=mask, cmap='RdBu_r', center=0,
square=True, xticklabels=feature_labels, yticklabels=feature_labels,
linewidths=.5, cbar_kws={'label': 'Covariance'}, annot=False, ax=ax)
plt.show()
In [5]:
d.var()
Out[5]:
In [6]:
feature_list, feature_labels = comp.get_training_features()
print('training features = {}'.format(feature_list))
num_features = len(feature_list)
X_train, X_test, y_train, y_test, le = comp.get_train_test_sets(df, feature_list)
In [6]:
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train, y_train)
scaler = pipeline.named_steps['scaler']
clf = pipeline.named_steps['classifier']
clf_name = clf.__class__.__name__
In [7]:
test_predictions = pipeline.predict(X_test)
test_acc = accuracy_score(y_test, test_predictions)
print('Test accuracy: {:.4%}'.format(test_acc))
train_predictions = pipeline.predict(X_train)
train_acc = accuracy_score(y_train, train_predictions)
print('Train accuracy: {:.4%}'.format(train_acc))
In [8]:
importances = pipeline.named_steps['classifier'].feature_importances_
indices = np.argsort(importances)[::-1]
fig, ax = plt.subplots()
# feature_labels = np.array(['$\\log_{10}({\mathrm{E/GeV})$', 'InIce charge',
# '$\cos(\\theta)$', '$\mathrm{Laputop}\ \chi^2/\mathrm{n.d.f.}$', 'NChannels'])
for f in range(num_features):
print('{}) {}'.format(f + 1, importances[indices[f]]))
plt.ylabel('Feature Importances')
plt.bar(range(num_features),
importances[indices],
align='center')
plt.xticks(range(num_features),
feature_labels[indices], rotation=90)
plt.xlim([-1, len(feature_list)])
plt.show()
In [15]:
corr = d.corr()
In [16]:
df.MC_log_energy.plot(kind='hist', xlim=[6.2, 8], bins=np.arange(6.2, 8, 0.05))
plt.xlabel('$\\log_{10}({\mathrm{E/GeV})$')
Out[16]:
In [17]:
df.reco_log_energy.plot(kind='hist', xlim=[6.2, 8], bins=np.arange(6.2, 8, 0.05))
plt.xlabel('$\\log_{10}({\mathrm{E/GeV})$')
Out[17]:
In [18]:
fig, ax = plt.subplots()
df.NChannels[df.MC_comp == 'P'].plot(kind='hist', bins=np.linspace(0, 1200, 50), alpha=0.5, label='P')
df.NChannels[df.MC_comp == 'Fe'].plot(kind='hist', bins=np.linspace(0, 1200, 50), alpha=0.5, label='Fe')
plt.legend()
Out[18]:
In [13]:
fig, ax = plt.subplots()
df.lap_likelihood[df.MC_comp == 'P'].plot(kind='hist', bins=np.linspace(-30, 0, 75), alpha=0.5, label='P', log=True)
df.lap_likelihood[df.MC_comp == 'He'].plot(kind='hist', bins=np.linspace(-30, 0, 75), alpha=0.5, label='He')
df.lap_likelihood[df.MC_comp == 'Fe'].plot(kind='hist', bins=np.linspace(-30, 0, 75), alpha=0.5, label='Fe')
plt.legend()
Out[13]:
In [10]:
fig, ax = plt.subplots()
df.lap_ndf[df.MC_comp == 'P'].plot(kind='hist', bins=np.linspace(0, 300, 100), alpha=0.5, label='P', log=True)
df.lap_ndf[df.MC_comp == 'He'].plot(kind='hist', bins=np.linspace(0, 300, 100), alpha=0.5, label='He')
df.lap_ndf[df.MC_comp == 'Fe'].plot(kind='hist', bins=np.linspace(0, 300, 100), alpha=0.5, label='Fe')
plt.legend()
Out[10]:
In [ ]: