In [1]:
from __future__ import division
import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn.apionly as sns

from sklearn.metrics import accuracy_score
from sklearn.model_selection import validation_curve, GridSearchCV, cross_val_score, ParameterGrid
from sklearn.neighbors import KNeighborsClassifier

from load_sim import load_sim
from preprocessing import get_train_test_sets
from features import get_training_features
from pipelines import get_pipeline
import plotting_functions as plotting
import data_functions as data_functions

%matplotlib inline

In [2]:
sns.set_palette('muted')
sns.set_color_codes()

In [3]:
df, cut_dict = load_sim(return_cut_dict=True)
selection_mask = np.array([True] * len(df))
standard_cut_keys = ['reco_exists', 'reco_zenith', 'num_hits', 'IT_signal',
                     'StationDensity', 'max_charge_frac', 'reco_containment',
                     'energy_range']
for key in standard_cut_keys:
    selection_mask *= cut_dict[key]

df = df[selection_mask]

feature_list, feature_labels = get_training_features()
print(feature_list)
X_train, X_test, y_train, y_test, le = get_train_test_sets(
    df, feature_list)

print('number training events = ' + str(y_train.shape[0]))


load_sim.py:70: RuntimeWarning: divide by zero encountered in log10
  df['reco_log_energy'] = np.nan_to_num(np.log10(df['reco_energy']))
['reco_log_energy', 'InIce_log_charge_half', 'reco_cos_zenith', 'lap_chi2', 'NChannels_half']
number training events = 36125

In [4]:
pipeline = get_pipeline('RF')
param_range = np.arange(1, 20)
param_grid = {'classifier__max_depth': param_range}
gs = GridSearchCV(estimator=pipeline,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=5,
                  verbose=1,
                  n_jobs=10)
gs = gs.fit(X_train, y_train)
print('best GS CV score = {}'.format(gs.best_score_))
print('best GS CV depths = {}'.format(gs.best_params_))
print('Grid scores on development set:')
means = gs.cv_results_['mean_test_score']
stds = gs.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, gs.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
pipeline.set_params(**gs.best_params_)
pipeline.fit(X_train, y_train)
scaler = pipeline.named_steps['scaler']
clf = pipeline.named_steps['classifier']


Fitting 5 folds for each of 19 candidates, totalling 95 fits
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    9.3s
[Parallel(n_jobs=10)]: Done  95 out of  95 | elapsed:   43.4s finished
best GS CV score = 0.801467128028
best GS CV depths = {'classifier__max_depth': 12}
Grid scores on development set:
0.700 (+/-0.038) for {'classifier__max_depth': 1}
0.722 (+/-0.007) for {'classifier__max_depth': 2}
0.732 (+/-0.007) for {'classifier__max_depth': 3}
0.742 (+/-0.007) for {'classifier__max_depth': 4}
0.760 (+/-0.007) for {'classifier__max_depth': 5}
0.772 (+/-0.008) for {'classifier__max_depth': 6}
0.786 (+/-0.007) for {'classifier__max_depth': 7}
0.792 (+/-0.006) for {'classifier__max_depth': 8}
0.795 (+/-0.006) for {'classifier__max_depth': 9}
0.797 (+/-0.012) for {'classifier__max_depth': 10}
0.800 (+/-0.011) for {'classifier__max_depth': 11}
0.801 (+/-0.010) for {'classifier__max_depth': 12}
0.800 (+/-0.008) for {'classifier__max_depth': 13}
0.800 (+/-0.012) for {'classifier__max_depth': 14}
0.801 (+/-0.011) for {'classifier__max_depth': 15}
0.800 (+/-0.013) for {'classifier__max_depth': 16}
0.800 (+/-0.011) for {'classifier__max_depth': 17}
0.798 (+/-0.012) for {'classifier__max_depth': 18}
0.798 (+/-0.012) for {'classifier__max_depth': 19}

In [5]:
clf_name = clf.__class__.__name__
print('=' * 30)
print(clf_name)
test_predictions = pipeline.predict(X_test)
test_acc = accuracy_score(y_test, test_predictions)
print('Test accuracy: {:.4%}'.format(test_acc))
train_predictions = pipeline.predict(X_train)
train_acc = accuracy_score(y_train, train_predictions)
print('Train accuracy: {:.4%}'.format(train_acc))
scores = cross_val_score(
    estimator=pipeline, X=X_test, y=y_test, cv=10, n_jobs=10)
print('CV score: {:.2%} (+/- {:.2%})'.format(scores.mean(), scores.std()))
print('=' * 30)


==============================
RandomForestClassifier
Test accuracy: 80.4495%
Train accuracy: 87.2415%
CV score: 79.95% (+/- 0.84%)
==============================

In [6]:
correctly_identified_mask = (test_predictions == y_test)

energy_bin_width = 0.1
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
# energy_bins = np.arange(6.2, 9.51, energy_bin_width)
energy_midpoints = (energy_bins[1:] + energy_bins[:-1]) / 2
log_energy = X_test[:, 0]

MC_proton_mask = (le.inverse_transform(y_test) == 'P')
MC_iron_mask = (le.inverse_transform(y_test) == 'Fe')
# Get number of MC proton and iron as a function of MC energy
num_MC_protons_energy = np.histogram(log_energy[MC_proton_mask],
                                     bins=energy_bins)[0]
num_MC_protons_energy_err = np.sqrt(num_MC_protons_energy)
num_MC_irons_energy = np.histogram(log_energy[MC_iron_mask],
                                   bins=energy_bins)[0]
num_MC_irons_energy_err = np.sqrt(num_MC_irons_energy)
num_MC_total_energy = np.histogram(log_energy, bins=energy_bins)[0]
num_MC_total_energy_err = np.sqrt(num_MC_total_energy)

# Get number of reco proton and iron as a function of MC energy
num_reco_proton_energy = np.histogram(
    log_energy[MC_proton_mask & correctly_identified_mask],
    bins=energy_bins)[0]
num_reco_proton_energy_err = np.sqrt(num_reco_proton_energy)
num_reco_iron_energy = np.histogram(
    log_energy[MC_iron_mask & correctly_identified_mask],
    bins=energy_bins)[0]
num_reco_iron_energy_err = np.sqrt(num_reco_iron_energy)
num_reco_total_energy = np.histogram(
    log_energy[correctly_identified_mask],
    bins=energy_bins)[0]
num_reco_total_energy_err = np.sqrt(num_reco_total_energy)

# Calculate reco proton and iron fractions as a function of MC energy
reco_proton_frac, reco_proton_frac_err = data_functions.ratio_error(
    num_reco_proton_energy, num_reco_proton_energy_err,
    num_MC_protons_energy, num_MC_protons_energy_err)

reco_iron_frac, reco_iron_frac_err = data_functions.ratio_error(
    num_reco_iron_energy, num_reco_iron_energy_err,
    num_MC_irons_energy, num_MC_irons_energy_err)

reco_total_frac, reco_total_frac_err = data_functions.ratio_error(
    num_reco_total_energy, num_reco_total_energy_err,
    num_MC_total_energy, num_MC_total_energy_err)

In [7]:
# Plot fraction of events vs energy
fig, ax = plt.subplots()
ax.errorbar(energy_midpoints, reco_proton_frac,
            yerr=reco_proton_frac_err,
            # xerr=energy_bin_width / 2,
            marker='.', markersize=10,
            label='Proton')
ax.errorbar(energy_midpoints, reco_iron_frac,
            yerr=reco_iron_frac_err,
            # xerr=energy_bin_width / 2,
            marker='.', markersize=10,
            label='Iron')
ax.errorbar(energy_midpoints, reco_total_frac,
            yerr=reco_total_frac_err,
            # xerr=energy_bin_width / 2,
            marker='.', markersize=10,
            label='Total')
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('Fraction correctly identified')
ax.set_ylim([0.0, 1.0])
ax.set_xlim([6.2, 8.0])
# ax.set_xlim([6.2, 9.5])
plt.grid()
plt.legend(loc=3)
# place a text box in upper left in axes coords
textstr = 'Training features: \n'
for i, label in enumerate(feature_labels):
    if (i == len(feature_labels)-1):
        textstr += '{}) '.format(i+1) + label
    else:
        textstr += '{}) '.format(i+1) + label + '\n'
print('textstr = \n' + textstr)
props = dict(facecolor='white')
ax.text(0.65, 0.3, textstr, transform=ax.transAxes, fontsize=8,
        verticalalignment='top', bbox=props)
outfile = '/home/jbourbeau/public_html/figures/composition' + \
          '/fraction-reco-correct_vs_reco-energy_RF.png'
plt.savefig(outfile)
plt.show()


textstr = 
Training features: 
1) $\log_{10}({\mathrm{E/GeV})$
2) InIce charge half
3) $\cos(\theta)$
4) $\mathrm{Laputop}\ \chi^2/\mathrm{n.d.f.}$
5) NChannels half

In [8]:
num_features = len(feature_list)
importances = pipeline.named_steps['classifier'].feature_importances_
indices = np.argsort(importances)[::-1]

fig, ax = plt.subplots()
# feature_labels = np.array(['$\\log_{10}({\mathrm{E/GeV})$', 'InIce charge',
#                            '$\cos(\\theta)$', '$\mathrm{Laputop}\ \chi^2/\mathrm{n.d.f.}$', 'NChannels'])
for f in range(num_features):
    print('{}) {}'.format(f + 1, importances[indices[f]]))

plt.ylabel('Feature Importances')
plt.bar(range(num_features),
        importances[indices],
        align='center')

plt.xticks(range(num_features),
           feature_labels[indices], rotation=90)
plt.xlim([-1, len(feature_list)])
plt.ylim([0, .35])
plt.show()


1) 0.332684432476
2) 0.236612651633
3) 0.161498583041
4) 0.159862629573
5) 0.109341703277

In [ ]: