Training set systematics

Table of contents

  1. Data preprocessing
  2. Fitting random forest
  3. Feature importance

In [1]:
import sys
sys.path.append('/home/jbourbeau/cr-composition')
print('Added to PYTHONPATH')


Added to PYTHONPATH

In [2]:
import argparse
from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn.apionly as sns

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, train_test_split

import composition as comp
import composition.analysis.plotting as plotting

# Plotting-related
sns.set_palette('muted')
sns.set_color_codes()
color_dict = {'P': 'b', 'He': 'g', 'Fe': 'm', 'O': 'r'}
%matplotlib inline


/home/jbourbeau/.local/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

Data preprocessing

  1. Load simulation dataframe and apply specified quality cuts
  2. Extract desired features from dataframe
  3. Get separate testing and training datasets

In [3]:
df, cut_dict = comp.load_sim(return_cut_dict=True)
selection_mask = np.array([True] * len(df))
standard_cut_keys = ['lap_reco_success', 'lap_zenith', 'num_hits_1_30', 'IT_signal',
                     'max_qfrac_1_30', 'lap_containment', 'energy_range_lap']
for key in standard_cut_keys:
    selection_mask *= cut_dict[key]

df = df[selection_mask]

feature_list, feature_labels = comp.get_training_features()
print('training features = {}'.format(feature_list))
X_train, X_test, y_train, y_test, le = comp.get_train_test_sets(
    df, feature_list, train_he=True, test_he=True)

print('number training events = ' + str(y_train.shape[0]))
print('number testing events = ' + str(y_test.shape[0]))


/home/jbourbeau/cr-composition/composition/load_sim.py:109: RuntimeWarning: divide by zero encountered in log10
  df['log_NChannels_1_30'] = np.nan_to_num(np.log10(df['NChannels_1_30']))
training features = ['lap_log_energy', 'InIce_log_charge_1_30', 'lap_cos_zenith', 'NChannels_1_30', 'log_s125', 'StationDensity']
number training events = 145932
number testing events = 62543

In [12]:
# Split data into training and test samples
training_subset_size = np.arange(0.05, 1.0, 0.05)
pipeline = comp.get_pipeline('RF')
clf_name = pipeline.named_steps['classifier'].__class__.__name__
print(clf_name)
train_acc = []
test_acc = []
for subset_size in training_subset_size:
    X_train_subset, X_test_subset, y_train_subset, y_test_subset = train_test_split(X_train, y_train,
                                                        train_size=subset_size,
                                                        random_state=42)
    print('=' * 30)
    print('Training subset fraction = {}'.format(subset_size))
    print('len(training) = {}'.format(len(X_train_subset)))
    pipeline = comp.get_pipeline('RF')
    pipeline.fit(X_train_subset, y_train_subset)
    train_predictions = pipeline.predict(X_train_subset)
    train_accuracy = accuracy_score(y_train_subset, train_predictions)
    train_acc.append(train_accuracy)
    print('Train accuracy: {:.4%}'.format(train_accuracy))
    
    print('len(testing) = {}'.format(len(X_test)))
    test_predictions = pipeline.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_predictions)
    test_acc.append(test_accuracy)
    print('Test accuracy: {:.4%}'.format(test_accuracy))
    
    print('=' * 30)


RandomForestClassifier
==============================
Training subset size = 0.05
len(training) = 7296
Train accuracy: 54.0844%
len(testing) = 62543
Test accuracy: 42.1902%
==============================
==============================
Training subset size = 0.1
len(training) = 14593
Train accuracy: 49.1400%
len(testing) = 62543
Test accuracy: 42.1598%
==============================
==============================
Training subset size = 0.15
len(training) = 21889
Train accuracy: 47.3526%
len(testing) = 62543
Test accuracy: 42.0990%
==============================
==============================
Training subset size = 0.2
len(training) = 29186
Train accuracy: 46.6696%
len(testing) = 62543
Test accuracy: 42.4748%
==============================
==============================
Training subset size = 0.25
len(training) = 36483
Train accuracy: 46.0132%
len(testing) = 62543
Test accuracy: 42.3485%
==============================
==============================
Training subset size = 0.3
len(training) = 43779
Train accuracy: 45.6292%
len(testing) = 62543
Test accuracy: 42.5163%
==============================
==============================
Training subset size = 0.35
len(training) = 51076
Train accuracy: 45.3618%
len(testing) = 62543
Test accuracy: 42.5403%
==============================
==============================
Training subset size = 0.4
len(training) = 58372
Train accuracy: 45.1963%
len(testing) = 62543
Test accuracy: 42.2973%
==============================
==============================
Training subset size = 0.45
len(training) = 65669
Train accuracy: 44.9771%
len(testing) = 62543
Test accuracy: 42.4124%
==============================
==============================
Training subset size = 0.5
len(training) = 72966
Train accuracy: 44.9031%
len(testing) = 62543
Test accuracy: 42.4252%
==============================
==============================
Training subset size = 0.55
len(training) = 80262
Train accuracy: 44.5977%
len(testing) = 62543
Test accuracy: 42.4300%
==============================
==============================
Training subset size = 0.6
len(training) = 87559
Train accuracy: 44.6442%
len(testing) = 62543
Test accuracy: 42.5931%
==============================
==============================
Training subset size = 0.65
len(training) = 94855
Train accuracy: 44.7567%
len(testing) = 62543
Test accuracy: 42.6778%
==============================
==============================
Training subset size = 0.7
len(training) = 102152
Train accuracy: 44.5816%
len(testing) = 62543
Test accuracy: 42.5947%
==============================
==============================
Training subset size = 0.75
len(training) = 109449
Train accuracy: 44.4134%
len(testing) = 62543
Test accuracy: 42.6315%
==============================
==============================
Training subset size = 0.8
len(training) = 116745
Train accuracy: 44.4002%
len(testing) = 62543
Test accuracy: 42.6539%
==============================
==============================
Training subset size = 0.85
len(training) = 124042
Train accuracy: 44.2689%
len(testing) = 62543
Test accuracy: 42.4540%
==============================
==============================
Training subset size = 0.9
len(training) = 131338
Train accuracy: 44.2515%
len(testing) = 62543
Test accuracy: 42.5995%
==============================
==============================
Training subset size = 0.95
len(training) = 138635
Train accuracy: 44.1014%
len(testing) = 62543
Test accuracy: 42.5451%
==============================

In [21]:
fig, ax = plt.subplots()
ax.plot(training_subset_size, train_acc, label='Training accuracy', linestyle=':')
ax.plot(training_subset_size, test_acc, label='Testing accuracy', linestyle=':')
ax.set_xlabel('Training set subsample fraction')
ax.set_ylabel('Accuracy')
ax.set_ylim([0.4, 0.6])
ax.legend(numpoints=1)
ax.grid()
plt.show()



In [25]:
# Split data into training and test samples
training_subset_size = np.arange(0.0005, 0.05, 0.005)
pipeline = comp.get_pipeline('RF')
clf_name = pipeline.named_steps['classifier'].__class__.__name__
print(clf_name)
train_acc = []
test_acc = []
for subset_size in training_subset_size:
    X_train_subset, X_test_subset, y_train_subset, y_test_subset = train_test_split(X_train, y_train,
                                                        train_size=subset_size,
                                                        random_state=42)
    print('=' * 30)
    print('Training subset fraction = {}'.format(subset_size))
    print('len(training) = {}'.format(len(X_train_subset)))
    pipeline = comp.get_pipeline('RF')
    pipeline.fit(X_train_subset, y_train_subset)
    train_predictions = pipeline.predict(X_train_subset)
    train_accuracy = accuracy_score(y_train_subset, train_predictions)
    train_acc.append(train_accuracy)
    print('Train accuracy: {:.4%}'.format(train_accuracy))
    
    print('len(testing) = {}'.format(len(X_test)))
    test_predictions = pipeline.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_predictions)
    test_acc.append(test_accuracy)
    print('Test accuracy: {:.4%}'.format(test_accuracy))
    
    print('=' * 30)


RandomForestClassifier
==============================
Training subset fraction = 0.0005
len(training) = 72
Train accuracy: 100.0000%
len(testing) = 62543
Test accuracy: 34.1557%
==============================
==============================
Training subset fraction = 0.0055
len(training) = 802
Train accuracy: 80.6733%
len(testing) = 62543
Test accuracy: 38.7989%
==============================
==============================
Training subset fraction = 0.0105
len(training) = 1532
Train accuracy: 70.1044%
len(testing) = 62543
Test accuracy: 39.5344%
==============================
==============================
Training subset fraction = 0.0155
len(training) = 2261
Train accuracy: 63.5559%
len(testing) = 62543
Test accuracy: 40.1452%
==============================
==============================
Training subset fraction = 0.0205
len(training) = 2991
Train accuracy: 60.8826%
len(testing) = 62543
Test accuracy: 40.4266%
==============================
==============================
Training subset fraction = 0.0255
len(training) = 3721
Train accuracy: 60.4676%
len(testing) = 62543
Test accuracy: 41.0757%
==============================
==============================
Training subset fraction = 0.0305
len(training) = 4450
Train accuracy: 58.1798%
len(testing) = 62543
Test accuracy: 41.3651%
==============================
==============================
Training subset fraction = 0.0355
len(training) = 5180
Train accuracy: 57.4324%
len(testing) = 62543
Test accuracy: 41.4099%
==============================
==============================
Training subset fraction = 0.0405
len(training) = 5910
Train accuracy: 55.3807%
len(testing) = 62543
Test accuracy: 41.7505%
==============================
==============================
Training subset fraction = 0.0455
len(training) = 6639
Train accuracy: 54.7070%
len(testing) = 62543
Test accuracy: 41.9312%
==============================

In [27]:
fig, ax = plt.subplots()
ax.plot(training_subset_size, train_acc, label='Training accuracy', linestyle=':')
ax.plot(training_subset_size, test_acc, label='Testing accuracy', linestyle=':')
ax.set_xlabel('Training set subsample fraction')
ax.set_ylabel('Accuracy')
# ax.set_ylim([0.4, 0.6])
ax.legend(numpoints=1)
ax.grid()
plt.show()


Fit random forest and run 10-fold CV validation


In [12]:
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train, y_train)
clf_name = pipeline.named_steps['classifier'].__class__.__name__
print('=' * 30)
print(clf_name)
test_predictions = pipeline.predict(X_test)
test_acc = accuracy_score(y_test, test_predictions)
print('Test accuracy: {:.4%}'.format(test_acc))
train_predictions = pipeline.predict(X_train)
train_acc = accuracy_score(y_train, train_predictions)
print('Train accuracy: {:.4%}'.format(train_acc))
scores = cross_val_score(
    estimator=pipeline, X=X_test, y=y_test, cv=10, n_jobs=20)
print('CV score: {:.2%} (+/- {:.2%})'.format(scores.mean(), scores.std()))
print('=' * 30)


==============================
RandomForestClassifier
Test accuracy: 42.8373%
Train accuracy: 44.0700%
CV score: 42.93% (+/- 0.47%)
==============================

In [7]:
clf = pipeline.named_steps['classifier']
decisiontree = clf.estimators_[0]
tree =  decisiontree.tree_
leaf_samples = []
for tree in clf.estimators_:
    leaf_samples.extend(tree.tree_.n_node_samples[tree.apply(X_test.astype(np.float32))])
print(np.min(leaf_samples))
print(np.max(leaf_samples))


1
19511

In [8]:
counts, bins, pathches = plt.hist(leaf_samples, bins=np.linspace(0, 1000, 75), log=True)
plt.xlim([0, 1000])


Out[8]:
(0, 1000)

In [9]:
counts, bins, pathches = plt.hist(leaf_samples, bins=np.linspace(0, 200, 100), log=True)



In [13]:
comp_list = ['P', 'He', 'O', 'Fe']

correctly_identified_mask = (test_predictions == y_test)

# Energy-related variables
energy_bin_width = 0.1
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
# energy_bins = np.arange(6.2, 9.51, energy_bin_width)
energy_midpoints = (energy_bins[1:] + energy_bins[:-1]) / 2
log_energy = X_test[:, 0]

# Construct MC composition masks
MC_comp_mask = {}
for composition in comp_list:
    MC_comp_mask[composition] = (le.inverse_transform(y_test) == composition)

# Get number of MC comp in each reco energy bin
num_MC_energy, num_MC_energy_err = {}, {}
for composition in comp_list:
    num_MC_energy[composition] = np.histogram(log_energy[MC_comp_mask[composition]],
                                     bins=energy_bins)[0]
    num_MC_energy_err[composition] = np.sqrt(num_MC_energy[composition])

num_MC_energy['total'] = np.histogram(log_energy, bins=energy_bins)[0]
num_MC_energy_err['total'] = np.sqrt(num_MC_energy['total'])


# Get number of correctly identified comp in each reco energy bin
num_reco_energy, num_reco_energy_err = {}, {}
for composition in comp_list:
    num_reco_energy[composition] = np.histogram(
        log_energy[MC_comp_mask[composition] & correctly_identified_mask],
        bins=energy_bins)[0]
    num_reco_energy_err[composition] = np.sqrt(num_reco_energy[composition])

num_reco_energy['total'] = np.histogram(log_energy[correctly_identified_mask], bins=energy_bins)[0]
num_reco_energy_err['total'] = np.sqrt(num_reco_energy['total'])



# Calculate correctly identified fractions as a function of MC energy
reco_frac, reco_frac_err = {}, {}
for composition in comp_list:
    print(composition)
    reco_frac[composition], reco_frac_err[composition] = comp.ratio_error(
        num_reco_energy[composition], num_reco_energy_err[composition],
        num_MC_energy[composition], num_MC_energy_err[composition])
    
reco_frac['total'], reco_frac_err['total'] = comp.ratio_error(
        num_reco_energy['total'], num_reco_energy_err['total'],
        num_MC_energy['total'], num_MC_energy_err['total'])


P
He
O
Fe

In [14]:
# Plot fraction of events vs energy
fig, ax = plt.subplots()
for composition in comp_list:
    ebar = ax.errorbar(energy_midpoints, reco_frac[composition],
                yerr=reco_frac_err[composition],
                # xerr=energy_bin_width / 2,
                marker=None, markersize=5,
#                 alpha=0.8, color=next(color_gen))
                alpha=0.8)
    step_x = energy_midpoints
    step_x = np.append(step_x[0]-energy_bin_width/2, step_x)
    step_x = np.append(step_x, step_x[-1]+energy_bin_width/2)
    step_y = reco_frac[composition]
    step_y = np.append(step_y[0], step_y)
    step_y = np.append(step_y, step_y[-1])
    ax.step(step_x, step_y, where='mid',
            marker=None, color=ebar[0].get_color(), linewidth=1.5,
            linestyle='-', label=composition, alpha=0.8)
ebar = ax.errorbar(energy_midpoints, reco_frac['total'],
                yerr=reco_frac_err['total'],
                # xerr=energy_bin_width / 2,
                marker=None, markersize=5,
                alpha=0.8)
step_y = reco_frac['total']
step_y = np.append(step_y[0], step_y)
step_y = np.append(step_y, step_y[-1])
ax.step(step_x, step_y, where='mid', marker=None,
        color=ebar[0].get_color(), linewidth=1.5, label='Total', alpha=0.8)
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('Fraction correctly identified')
ax.set_ylim([0.0, 1.0])
ax.set_xlim([6.2, 8.0])
# ax.set_xlim([6.2, 9.5])
plt.grid()
leg = plt.legend(loc='upper center', 
          bbox_to_anchor=(0.5,  # horizontal
                          1.1),# vertical 
          ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
    
# place a text box in upper left in axes coords
textstr = '$\mathrm{\underline{Training \ features}}$: \n'
for i, label in enumerate(feature_labels):
    if (i == len(feature_labels)-1):
        textstr += '{}) '.format(i+1) + label
    else:
        textstr += '{}) '.format(i+1) + label + '\n'
props = dict(facecolor='white', linewidth=0)
ax.text(1.025, 0.855, textstr, transform=ax.transAxes, fontsize=8,
        verticalalignment='top', bbox=props)
cvstr = '$\mathrm{\underline{CV \ score}}$:\n' + '{:0.2f}\% (+/- {:.2}\%)'.format(scores.mean()*100, scores.std()*100)
print(cvstr)
props = dict(facecolor='white', linewidth=0)
ax.text(1.025, 0.9825, cvstr, transform=ax.transAxes, fontsize=8,
        verticalalignment='top', bbox=props)
outfile = '/home/jbourbeau/public_html/figures/composition' + \
          '/fraction-reco-correct_vs_reco-energy_RF.png'
plt.savefig(outfile)
plt.show()


$\mathrm{\underline{CV \ score}}$:
42.93\% (+/- 0.47\%)

In [7]:
a = pd.DataFrame([np.sum(df.MC_comp == composition) for composition in comp_list],
                 index=comp_list, columns=['MC Compositions'])
print(a)
a.plot.pie(subplots=True, figsize=(4,4), legend=False, autopct='%.2f')
a = pd.DataFrame([np.sum(le.inverse_transform(test_predictions) == composition) for composition in comp_list], index=comp_list, columns=['after'])
print(a)
a.plot.pie(subplots=True, figsize=(2,2))


    MC Compositions
P             53093
He            53210
O             51829
Fe            50343
    after
P   27219
He  16993
O   12985
Fe  26193
Out[7]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f9144f4f250>], dtype=object)

Feature importance


In [8]:
num_features = len(feature_list)
importances = pipeline.named_steps['classifier'].feature_importances_
indices = np.argsort(importances)[::-1]

fig, ax = plt.subplots()
for f in range(num_features):
    print('{}) {}'.format(f + 1, importances[indices[f]]))

plt.ylabel('Feature Importances')
plt.bar(range(num_features),
        importances[indices],
        align='center')

plt.xticks(range(num_features),
           feature_labels[indices], rotation=90)
plt.xlim([-1, len(feature_list)])
# plt.ylim([0, .40])
plt.show()


1) 0.387783867673
2) 0.282746986276
3) 0.162070349702
4) 0.124910448661
5) 0.0424883476872

In [ ]:
probs = pipeline.named_steps['classifier'].predict_proba(X_test)
prob_1 = probs[:, 0][MC_iron_mask]
prob_2 = probs[:, 1][MC_iron_mask]
# print(min(prob_1-prob_2))
# print(max(prob_1-prob_2))
# plt.hist(prob_1-prob_2, bins=30, log=True)
plt.hist(prob_1, bins=np.linspace(0, 1, 50), log=True)
plt.hist(prob_2, bins=np.linspace(0, 1, 50), log=True)

In [ ]:
probs = pipeline.named_steps['classifier'].predict_proba(X_test)
dp1 = (probs[:, 0]-probs[:, 1])[MC_proton_mask]
print(min(dp1))
print(max(dp1))
dp2 = (probs[:, 0]-probs[:, 1])[MC_iron_mask]
print(min(dp2))
print(max(dp2))
fig, ax = plt.subplots()
# plt.hist(prob_1-prob_2, bins=30, log=True)
counts, edges, pathes = plt.hist(dp1, bins=np.linspace(-1, 1, 100), log=True, label='Proton', alpha=0.75)
counts, edges, pathes = plt.hist(dp2, bins=np.linspace(-1, 1, 100), log=True, label='Iron', alpha=0.75)
plt.legend(loc=2)
plt.show()
pipeline.named_steps['classifier'].classes_

In [ ]:
print(pipeline.named_steps['classifier'].classes_)
le.inverse_transform(pipeline.named_steps['classifier'].classes_)

In [ ]:
pipeline.named_steps['classifier'].decision_path(X_test)

In [ ]:
comp_list = np.unique(df['MC_comp'])
# test_probs = defaultdict(list)
fig, ax = plt.subplots()
test_probs = pipeline.predict_proba(X_test)
for class_ in pipeline.classes_:
    composition = le.inverse_transform(class_)
    plt.hist(test_probs[:, class_], bins=np.linspace(0, 1, 50),
             histtype='step', label=composition,
             color=color_dict[composition], alpha=0.8, log=True)
plt.ylabel('Counts')
plt.xlabel('Testing set class probabilities')
plt.legend()
plt.grid()
plt.show()

In [10]:
comp_list = np.unique(df['MC_comp'])
test_probs = defaultdict(list)
fig, ax = plt.subplots()
# test_probs = pipeline.predict_proba(X_test)
for event in pipeline.predict_proba(X_test):
    composition = le.inverse_transform(np.argmax(event))
#     print(composition)
    test_probs[composition].append(np.amax(event))
for composition in comp_list:
    plt.hist(test_probs[composition], bins=np.linspace(0, 1, 75),
             histtype='step', label=composition,
             color=color_dict[composition], alpha=0.8, log=True)
plt.ylabel('Counts')
plt.xlabel('Testing set class probabilities')
plt.legend(title='Reco comp')
plt.grid()
plt.show()



In [ ]: