In [1]:
import sys
sys.path.append('/home/jbourbeau/cr-composition')
print('Added to PYTHONPATH')
In [2]:
from __future__ import division
from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn.apionly as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
import composition as comp
# Plotting-related
sns.set_palette('muted')
sns.set_color_codes()
color_dict = defaultdict()
for i, composition in enumerate(['light', 'heavy', 'total']):
color_dict[composition] = sns.color_palette('muted').as_hex()[i]
%matplotlib inline
In [3]:
df, cut_dict = comp.load_dataframe(type_='sim', config='IT73', return_cut_dict=True)
selection_mask = np.array([True] * len(df))
standard_cut_keys = ['lap_reco_success', 'lap_zenith', 'num_hits_1_30', 'IT_signal',
'max_qfrac_1_30', 'lap_containment', 'energy_range_lap']
for key in standard_cut_keys:
selection_mask *= cut_dict[key]
df = df[selection_mask]
# feature_list, feature_labels = comp.get_training_features()
feature_list = np.array(['lap_log_energy', 'InIce_log_charge_1_30', 'lap_cos_zenith',
'log_NChannels_1_30', 'log_s125', 'StationDensity',
'charge_nchannels_ratio', 'charge_nhits_ratio',
'stationdensity_charge_ratio', 'nchannels_nhits_ratio',
'stationdensity_nchannels_ratio', 'stationdensity_nhits_ratio',
'lap_likelihood', 'log_NHits_1_30', 'log_s50', 'log_s80',
'log_s180', 'log_s250', 'log_s500'])
label_dict = {'reco_log_energy': '$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$',
'lap_log_energy': '$\log_{10}(E_{\mathrm{Lap}}/\mathrm{GeV})$',
'log_s50': '$\log_{10}(S_{\mathrm{50}})$',
'log_s80': '$\log_{10}(S_{\mathrm{80}})$',
'log_s125': '$\log_{10}(S_{\mathrm{125}})$',
'log_s180': '$\log_{10}(S_{\mathrm{180}})$',
'log_s250': '$\log_{10}(S_{\mathrm{250}})$',
'log_s500': '$\log_{10}(S_{\mathrm{500}})$',
'lap_likelihood': '$r\log_{10}(l)$',
'InIce_charge_1_30': 'InIce charge (top 50\%)',
'InIce_log_charge_1_30': '$\log_{10}$(InIce charge (top 50\%))',
'lap_cos_zenith': '$\cos(\\theta_{\mathrm{Lap}})$',
'LLHlap_cos_zenith': '$\cos(\\theta_{\mathrm{Lap}})$',
'LLHLF_cos_zenith': '$\cos(\\theta_{\mathrm{LLH+COG}})$',
'lap_chi2': '$\chi^2_{\mathrm{Lap}}/\mathrm{n.d.f}$',
'NChannels_1_30': 'NChannels (top 50\%)',
'log_NChannels_1_30' : '$\log_{10}$(NChannels (top 50\%))',
'StationDensity': 'StationDensity',
'charge_nchannels_ratio': 'Charge/NChannels',
'stationdensity_charge_ratio': 'StationDensity/Charge',
'NHits_1_30': 'NHits',
'log_NHits_1_30': '$\log_{10}$(NHits (top 50\%))',
'charge_nhits_ratio': 'Charge/NHits',
'nchannels_nhits_ratio': 'NChannels/NHits',
'stationdensity_nchannels_ratio': 'StationDensity/NChannels',
'stationdensity_nhits_ratio': 'StationDensity/NHits'
}
feature_labels = np.array([label_dict[feature] for feature in feature_list])
print('training features = {}'.format(feature_list))
X_train, X_test, y_train, y_test, le = comp.get_train_test_sets(
df, feature_list, comp_class=True)
print('number training events = ' + str(y_train.shape[0]))
print('number testing events = ' + str(y_test.shape[0]))
In [4]:
d = pd.DataFrame(df, columns=feature_list)
# Compute the correlation matrix
corr = d.corr()
# corr = d.corr().values
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
fig, ax = plt.subplots()
sns.heatmap(corr, mask=mask, cmap='RdBu_r', center=0,
square=True, xticklabels=feature_labels, yticklabels=feature_labels,
linewidths=.5, cbar_kws={'label': 'Covariance'}, annot=False, ax=ax)
plt.show()
In [5]:
d.var()
Out[5]:
In [6]:
feature_list, feature_labels = comp.get_training_features()
print('training features = {}'.format(feature_list))
num_features = len(feature_list)
X_train, X_test, y_train, y_test, le = comp.get_train_test_sets(df, feature_list)
In [6]:
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train, y_train)
scaler = pipeline.named_steps['scaler']
clf = pipeline.named_steps['classifier']
clf_name = clf.__class__.__name__
In [7]:
test_predictions = pipeline.predict(X_test)
test_acc = accuracy_score(y_test, test_predictions)
print('Test accuracy: {:.4%}'.format(test_acc))
train_predictions = pipeline.predict(X_train)
train_acc = accuracy_score(y_train, train_predictions)
print('Train accuracy: {:.4%}'.format(train_acc))
In [8]:
importances = pipeline.named_steps['classifier'].feature_importances_
indices = np.argsort(importances)[::-1]
fig, ax = plt.subplots()
# feature_labels = np.array(['$\\log_{10}({\mathrm{E/GeV})$', 'InIce charge',
# '$\cos(\\theta)$', '$\mathrm{Laputop}\ \chi^2/\mathrm{n.d.f.}$', 'NChannels'])
for f in range(num_features):
print('{}) {}'.format(f + 1, importances[indices[f]]))
plt.ylabel('Feature Importances')
plt.bar(range(num_features),
importances[indices],
align='center')
plt.xticks(range(num_features),
feature_labels[indices], rotation=90)
plt.xlim([-1, len(feature_list)])
plt.show()
In [15]:
corr = d.corr()
In [16]:
df.MC_log_energy.plot(kind='hist', xlim=[6.2, 8], bins=np.arange(6.2, 8, 0.05))
plt.xlabel('$\\log_{10}({\mathrm{E/GeV})$')
Out[16]:
In [17]:
df.reco_log_energy.plot(kind='hist', xlim=[6.2, 8], bins=np.arange(6.2, 8, 0.05))
plt.xlabel('$\\log_{10}({\mathrm{E/GeV})$')
Out[17]:
In [18]:
fig, ax = plt.subplots()
df.NChannels[df.MC_comp == 'P'].plot(kind='hist', bins=np.linspace(0, 1200, 50), alpha=0.5, label='P')
df.NChannels[df.MC_comp == 'Fe'].plot(kind='hist', bins=np.linspace(0, 1200, 50), alpha=0.5, label='Fe')
plt.legend()
Out[18]:
In [13]:
fig, ax = plt.subplots()
df.lap_likelihood[df.MC_comp == 'P'].plot(kind='hist', bins=np.linspace(-30, 0, 75), alpha=0.5, label='P', log=True)
df.lap_likelihood[df.MC_comp == 'He'].plot(kind='hist', bins=np.linspace(-30, 0, 75), alpha=0.5, label='He')
df.lap_likelihood[df.MC_comp == 'Fe'].plot(kind='hist', bins=np.linspace(-30, 0, 75), alpha=0.5, label='Fe')
plt.legend()
Out[13]:
In [10]:
fig, ax = plt.subplots()
df.lap_ndf[df.MC_comp == 'P'].plot(kind='hist', bins=np.linspace(0, 300, 100), alpha=0.5, label='P', log=True)
df.lap_ndf[df.MC_comp == 'He'].plot(kind='hist', bins=np.linspace(0, 300, 100), alpha=0.5, label='He')
df.lap_ndf[df.MC_comp == 'Fe'].plot(kind='hist', bins=np.linspace(0, 300, 100), alpha=0.5, label='Fe')
plt.legend()
Out[10]:
In [ ]: