In [6]:
import sys
sys.path.append('/home/jbourbeau/cr-composition')
sys.path
Out[6]:
In [7]:
import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn.apionly as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import validation_curve, GridSearchCV, cross_val_score, ParameterGrid
import composition as comp
%matplotlib inline
In [8]:
sns.set_palette('muted')
sns.set_color_codes()
In [9]:
layers = ['_1_60', '_1_45', '_1_30', '_1_15', '_1_6']
# feature_lists = [['reco_log_energy', 'InIce_log_charge_1_60', 'LLHlap_cos_zenith', 'lap_chi2', 'NChannels_1_60'],
#                  ['reco_log_energy', 'InIce_log_charge_1_45', 'LLHlap_cos_zenith', 'lap_chi2', 'NChannels_1_45'],
#                  ['reco_log_energy', 'InIce_log_charge_1_30', 'LLHlap_cos_zenith', 'lap_chi2', 'NChannels_1_30'],
#                  ['reco_log_energy', 'InIce_log_charge_1_15', 'LLHlap_cos_zenith', 'lap_chi2', 'NChannels_1_15'],
#                  ['reco_log_energy', 'InIce_log_charge_1_6', 'LLHlap_cos_zenith', 'lap_chi2', 'NChannels_1_6']]
feature_lists = [['reco_log_energy', 'InIce_log_charge_1_60', 'LLHlap_cos_zenith', 'lap_chi2', 'NChannels_1_60', 'log_s125'],
                 ['reco_log_energy', 'InIce_log_charge_1_45', 'LLHlap_cos_zenith', 'lap_chi2', 'NChannels_1_45', 'log_s125'],
                 ['reco_log_energy', 'InIce_log_charge_1_30', 'LLHlap_cos_zenith', 'lap_chi2', 'NChannels_1_30', 'log_s125'],
                 ['reco_log_energy', 'InIce_log_charge_1_15', 'LLHlap_cos_zenith', 'lap_chi2', 'NChannels_1_15', 'log_s125'],
                 ['reco_log_energy', 'InIce_log_charge_1_6', 'LLHlap_cos_zenith', 'lap_chi2', 'NChannels_1_6', 'log_s125']]
feature_labels = np.array([r'$\log_{10}(\mathrm{E/GeV})$', 'InIce charge', r'$\cos(\theta)$',
                           r'$\mathrm{Laputop}\ \chi^2/\mathrm{n.d.f.}$', 'NChannels', 's125'])
label_list = ['Full detector', 'Top 75%', 'Top 50%', 'Top 25%', 'Top 10%']
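The feature lists and the label array above are kept aligned by hand. A quick consistency check like the cell below (purely illustrative, not part of the original analysis) can catch a mismatch before the training loop runs.
In [ ]:
# Sanity check (illustrative): each feature list should line up one-to-one
# with feature_labels, and should contain features for its layer suffix
for feature_list, layer in zip(feature_lists, layers):
    assert len(feature_list) == len(feature_labels)
    assert any(f.endswith(layer) for f in feature_list)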
In [10]:
fig, ax = plt.subplots()
for feature_list, label, layer in zip(feature_lists, label_list, layers):
    # Load simulation and apply quality cuts for this containment layer
    df, cut_dict = comp.load_sim(return_cut_dict=True)
    selection_mask = np.array([True] * len(df))
    standard_cut_keys = ['LLHlap_reco_exists', 'LLHlap_zenith', 'NStations', 'IT_signal',
                         'StationDensity', 'LLHlap_containment',
                         'energy_range']
    standard_cut_keys += ['NChannels' + layer, 'max_qfrac' + layer]
    for key in standard_cut_keys:
        selection_mask *= cut_dict[key]
    df = df[selection_mask]

    X_train, X_test, y_train, y_test, le = comp.get_train_test_sets(df, feature_list)
    print('number training events = ' + str(y_train.shape[0]))

    # Grid search over the random forest max_depth
    pipeline = comp.get_pipeline('RF')
    param_range = np.arange(1, 20)
    param_grid = {'classifier__max_depth': param_range}
    gs = GridSearchCV(estimator=pipeline,
                      param_grid=param_grid,
                      scoring='accuracy',
                      cv=5,
                      verbose=1,
                      n_jobs=10)
    gs = gs.fit(X_train, y_train)
    print('best GS CV score = {}'.format(gs.best_score_))
    print('best GS CV depths = {}'.format(gs.best_params_))
    print('Grid scores on development set:')
    means = gs.cv_results_['mean_test_score']
    stds = gs.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, gs.cv_results_['params']):
        print('%0.3f (+/-%0.03f) for %r' % (mean, std * 2, params))

    # Refit the pipeline using the best max_depth found above
    pipeline.set_params(**gs.best_params_)
    pipeline.fit(X_train, y_train)
    scaler = pipeline.named_steps['scaler']
    clf = pipeline.named_steps['classifier']

    # Plot feature importances for this containment layer
    num_features = len(feature_list)
    importances = clf.feature_importances_
    # indices = np.argsort(importances)[::-1]
    # for f in range(num_features):
    #     print('{}) {}'.format(f + 1, importances[indices[f]]))
    ax.set_ylabel('Feature Importances')
    ax.plot(range(num_features),
            importances, linestyle='-', label=label, alpha=0.75)

plt.xticks(range(num_features),
           feature_labels, rotation=90)
ax.set_xlim([-1, len(feature_list)])
ax.set_ylim([0, .40])
plt.legend()
plt.grid()
plt.show()
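GridSearchCV only reports the per-depth means and spreads printed above. Since validation_curve is already imported, a follow-up cell along the lines of the sketch below (hypothetical, reusing the pipeline, X_train, y_train, and param_range from the last loop iteration) would compare training and cross-validation accuracy as max_depth grows, to check where the forest starts to overfit.
In [ ]:
# Sketch (assumes pipeline, X_train, y_train, param_range from the cell above):
# training vs. cross-validation accuracy as a function of max_depth
train_scores, test_scores = validation_curve(pipeline, X_train, y_train,
                                             param_name='classifier__max_depth',
                                             param_range=param_range,
                                             cv=5, scoring='accuracy', n_jobs=10)
fig, ax = plt.subplots()
ax.plot(param_range, train_scores.mean(axis=1), label='training')
ax.plot(param_range, test_scores.mean(axis=1), label='cross-validation')
ax.set_xlabel('max_depth')
ax.set_ylabel('Accuracy')
plt.legend()
plt.grid()
plt.show()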
In [ ]: