SGD fraction correct analysis

Table of contents

  1. Data preprocessing
  2. Fitting classifier

In [1]:
import sys
sys.path.append('/home/jbourbeau/cr-composition')
print('Added to PYTHONPATH')


Added to PYTHONPATH

In [2]:
import argparse
from collections import defaultdict
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn.apionly as sns

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, KFold, GridSearchCV

import composition as comp
import composition.analysis.plotting as plotting

# Plotting-related
sns.set_palette('muted')
sns.set_color_codes()
color_dict = defaultdict()
for i, composition in enumerate(['P', 'He', 'O', 'Fe', 'total']):
    color_dict[composition] = sns.color_palette('muted').as_hex()[i]
%matplotlib inline


/home/jbourbeau/.local/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

Data preprocessing

  1. Load simulation dataframe and apply specified quality cuts
  2. Extract desired features from dataframe
  3. Get separate testing and training datasets

In [3]:
df, cut_dict = comp.load_sim(return_cut_dict=True)
selection_mask = np.array([True] * len(df))
standard_cut_keys = ['lap_reco_success', 'lap_zenith', 'num_hits_1_30', 'IT_signal',
                     'max_qfrac_1_30', 'lap_containment', 'energy_range_lap']
for key in standard_cut_keys:
    selection_mask *= cut_dict[key]

df = df[selection_mask]

feature_list, feature_labels = comp.get_training_features()
print('training features = {}'.format(feature_list))
X_train, X_test, y_train, y_test, le = comp.get_train_test_sets(
    df, feature_list, train_he=True, test_he=True)

print('number training events = ' + str(y_train.shape[0]))
print('number testing events = ' + str(y_test.shape[0]))


/home/jbourbeau/cr-composition/composition/load_sim.py:109: RuntimeWarning: divide by zero encountered in log10
  df['log_NChannels_1_30'] = np.nan_to_num(np.log10(df['NChannels_1_30']))
training features = ['lap_log_energy', 'InIce_log_charge_1_30', 'lap_cos_zenith', 'NChannels_1_30', 'log_s125', 'StationDensity']
number training events = 145543
number testing events = 62376

Grid search to find optimal hyperparameters


In [23]:
pipeline = comp.get_pipeline('SGD')
param_grid = {'classifier__loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
              'classifier__penalty': ['none', 'l2', 'l1', 'elasticnet']}
gs = GridSearchCV(estimator=pipeline,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10,
                  verbose=1,
                  n_jobs=20)
gs = gs.fit(X_train, y_train)
print('best GS CV score = {}'.format(gs.best_score_))
print('best GS CV depths = {}'.format(gs.best_params_))
print('Grid scores on development set:')
means = gs.cv_results_['mean_test_score']
stds = gs.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, gs.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
pipeline.set_params(**gs.best_params_)
pipeline.fit(X_train, y_train)
scaler = pipeline.named_steps['scaler']
clf = pipeline.named_steps['classifier']


Fitting 10 folds for each of 20 candidates, totalling 200 fits
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:   11.8s
[Parallel(n_jobs=20)]: Done 200 out of 200 | elapsed:   13.0s finished
best GS CV score = 0.400410875137
best GS CV depths = {'classifier__loss': 'log', 'classifier__penalty': 'none'}
Grid scores on development set:
0.391 (+/-0.015) for {'classifier__loss': 'hinge', 'classifier__penalty': 'none'}
0.392 (+/-0.014) for {'classifier__loss': 'hinge', 'classifier__penalty': 'l2'}
0.393 (+/-0.014) for {'classifier__loss': 'hinge', 'classifier__penalty': 'l1'}
0.394 (+/-0.016) for {'classifier__loss': 'hinge', 'classifier__penalty': 'elasticnet'}
0.400 (+/-0.012) for {'classifier__loss': 'log', 'classifier__penalty': 'none'}
0.400 (+/-0.010) for {'classifier__loss': 'log', 'classifier__penalty': 'l2'}
0.400 (+/-0.008) for {'classifier__loss': 'log', 'classifier__penalty': 'l1'}
0.400 (+/-0.008) for {'classifier__loss': 'log', 'classifier__penalty': 'elasticnet'}
0.374 (+/-0.021) for {'classifier__loss': 'modified_huber', 'classifier__penalty': 'none'}
0.374 (+/-0.020) for {'classifier__loss': 'modified_huber', 'classifier__penalty': 'l2'}
0.383 (+/-0.023) for {'classifier__loss': 'modified_huber', 'classifier__penalty': 'l1'}
0.379 (+/-0.027) for {'classifier__loss': 'modified_huber', 'classifier__penalty': 'elasticnet'}
0.312 (+/-0.034) for {'classifier__loss': 'squared_hinge', 'classifier__penalty': 'none'}
0.300 (+/-0.020) for {'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l2'}
0.307 (+/-0.031) for {'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
0.298 (+/-0.020) for {'classifier__loss': 'squared_hinge', 'classifier__penalty': 'elasticnet'}
0.320 (+/-0.035) for {'classifier__loss': 'perceptron', 'classifier__penalty': 'none'}
0.318 (+/-0.043) for {'classifier__loss': 'perceptron', 'classifier__penalty': 'l2'}
0.324 (+/-0.054) for {'classifier__loss': 'perceptron', 'classifier__penalty': 'l1'}
0.321 (+/-0.031) for {'classifier__loss': 'perceptron', 'classifier__penalty': 'elasticnet'}

Fit random forest and run 10-fold CV validation


In [4]:
pipeline = comp.get_pipeline('SGD')
clf_name = pipeline.named_steps['classifier'].__class__.__name__
print('=' * 30)
print(clf_name)
scores = cross_val_score(
    estimator=pipeline, X=X_train, y=y_train, cv=10, n_jobs=20)
print('CV score: {:.2%} (+/- {:.2%})'.format(scores.mean(), scores.std()))
print('=' * 30)


==============================
SGDClassifier
CV score: 40.04% (+/- 0.58%)
==============================

In [5]:
def get_frac_correct(X_train, X_test, y_train, y_test, comp_list):
    
    pipeline = comp.get_pipeline('SGD')
    pipeline.fit(X_train, y_train)
    test_predictions = pipeline.predict(X_test)
    correctly_identified_mask = (test_predictions == y_test)

    # Energy-related variables
    energy_bin_width = 0.1
    energy_bins = np.arange(6.2, 8.1, energy_bin_width)
    energy_midpoints = (energy_bins[1:] + energy_bins[:-1]) / 2
    log_energy = X_test[:, 0]

    # Construct MC composition masks
    MC_comp_mask = {}
    for composition in comp_list:
        MC_comp_mask[composition] = (le.inverse_transform(y_test) == composition)

    # Get number of MC comp in each reco energy bin
    num_MC_energy, num_MC_energy_err = {}, {}
    for composition in comp_list:
        num_MC_energy[composition] = np.histogram(log_energy[MC_comp_mask[composition]],
                                         bins=energy_bins)[0]
        num_MC_energy_err[composition] = np.sqrt(num_MC_energy[composition])

    num_MC_energy['total'] = np.histogram(log_energy, bins=energy_bins)[0]
    num_MC_energy_err['total'] = np.sqrt(num_MC_energy['total'])


    # Get number of correctly identified comp in each reco energy bin
    num_reco_energy, num_reco_energy_err = {}, {}
    for composition in comp_list:
        num_reco_energy[composition] = np.histogram(
            log_energy[MC_comp_mask[composition] & correctly_identified_mask],
            bins=energy_bins)[0]
        num_reco_energy_err[composition] = np.sqrt(num_reco_energy[composition])

    num_reco_energy['total'] = np.histogram(log_energy[correctly_identified_mask], bins=energy_bins)[0]
    num_reco_energy_err['total'] = np.sqrt(num_reco_energy['total'])

    # Calculate correctly identified fractions as a function of MC energy
    reco_frac, reco_frac_err = {}, {}
    for composition in comp_list:
#         print(composition)
        reco_frac[composition], reco_frac_err[composition] = comp.ratio_error(
            num_reco_energy[composition], num_reco_energy_err[composition],
            num_MC_energy[composition], num_MC_energy_err[composition])
        frac_correct_folds[composition].append(reco_frac[composition])

    reco_frac['total'], reco_frac_err['total'] = comp.ratio_error(
            num_reco_energy['total'], num_reco_energy_err['total'],
            num_MC_energy['total'], num_MC_energy_err['total'])
    
    return reco_frac, reco_frac_err

Compute systematic in fraction correct via CV


In [6]:
comp_list = ['P', 'He', 'O', 'Fe']
# Split data into training and test samples
kf = KFold(n_splits=10)
frac_correct_folds = defaultdict(list)
for train_index, test_index in kf.split(X_train):
    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
    y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]
    
    reco_frac, reco_frac_err = get_frac_correct(X_train_fold, X_test_fold,
                                                y_train_fold, y_test_fold,
                                                comp_list)
    for composition in comp_list:
        frac_correct_folds[composition].append(reco_frac[composition])
    frac_correct_folds['total'].append(reco_frac['total'])
frac_correct_sys_err = {key: np.std(frac_correct_folds[key], axis=0) for key in frac_correct_folds}


/home/jbourbeau/cr-composition/composition/analysis/data_functions.py:11: RuntimeWarning: invalid value encountered in true_divide
  ratio_err = ratio * np.sqrt((num_err / num)**2 + (den_err / den)**2)

In [7]:
reco_frac, reco_frac_sterr = get_frac_correct(X_train, X_test,
                                              y_train, y_test,
                                              comp_list)
# Energy-related variables
energy_bin_width = 0.1
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
energy_midpoints = (energy_bins[1:] + energy_bins[:-1]) / 2
step_x = energy_midpoints
step_x = np.append(step_x[0]-energy_bin_width/2, step_x)
step_x = np.append(step_x, step_x[-1]+energy_bin_width/2)
# Plot fraction of events vs energy
def plot_steps(x, y, y_err, ax, color, label):
    step_x = x
    x_widths = x[1:]-x[:-1]
    if len(np.unique(x_widths)) != 1:
        raise('Unequal bins...')
    x_width = np.unique(x_widths)[0]
    step_x = np.append(step_x[0]-x_width/2, step_x)
    step_x = np.append(step_x, step_x[-1]+x_width/2)
    
    step_y = y
    step_y = np.append(step_y[0], step_y)
    step_y = np.append(step_y, step_y[-1])
    
    err_upper = y + y_err
    err_upper = np.append(err_upper[0], err_upper)
    err_upper = np.append(err_upper, err_upper[-1])
    err_lower = y - y_err
    err_lower = np.append(err_lower[0], err_lower)
    err_lower = np.append(err_lower, err_lower[-1])
    
    ax.step(step_x, step_y, where='mid',
            marker=None, color=color, linewidth=1,
            linestyle='-', label=label, alpha=0.8)
    ax.fill_between(step_x, err_upper, err_lower,
                    alpha=0.15, color=color,
                    step='mid', linewidth=1)
    
    return step_x, step_y

fig, ax = plt.subplots()
for composition in comp_list + ['total']:
    err = np.sqrt(frac_correct_sys_err[composition]**2+reco_frac_sterr[composition]**2)
    plot_steps(energy_midpoints, reco_frac[composition], err, ax, color_dict[composition], composition)
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('Fraction correctly identified')
ax.set_ylim([0.0, 1.0])
ax.set_xlim([6.2, 8.0])
ax.grid()
leg = plt.legend(loc='upper center', 
          bbox_to_anchor=(0.5,  # horizontal
                          1.1),# vertical 
          ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)

# place a text box in upper left in axes coords
textstr = '$\mathrm{\underline{Training \ features}}$: \n'
for i, label in enumerate(feature_labels):
    if (i == len(feature_labels)-1):
        textstr += '{}) '.format(i+1) + label
    else:
        textstr += '{}) '.format(i+1) + label + '\n'
props = dict(facecolor='white', linewidth=0)
ax.text(1.025, 0.855, textstr, transform=ax.transAxes, fontsize=8,
        verticalalignment='top', bbox=props)
cvstr = '$\mathrm{\underline{CV \ score}}$:\n' + '{:0.2f}\% (+/- {:.2}\%)'.format(scores.mean()*100, scores.std()*100)
print(cvstr)
props = dict(facecolor='white', linewidth=0)
ax.text(1.025, 0.9825, cvstr, transform=ax.transAxes, fontsize=8,
        verticalalignment='top', bbox=props)
plt.show()


$\mathrm{\underline{CV \ score}}$:
40.04\% (+/- 0.58\%)

In [15]:
a = pd.DataFrame([np.sum(df.MC_comp == composition) for composition in comp_list],
                 index=comp_list, columns=['MC Compositions'])
print(a)
a.plot.pie(subplots=True, figsize=(4,4), legend=False, autopct='%.2f')
a = pd.DataFrame([np.sum(le.inverse_transform(test_predictions) == composition) for composition in comp_list], index=comp_list, columns=['after'])
print(a)
a.plot.pie(subplots=True, figsize=(2,2))


    MC Compositions
P             52941
He            53066
O             51691
Fe            50221

NameErrorTraceback (most recent call last)
<ipython-input-15-c2e202b3b33e> in <module>()
      3 print(a)
      4 a.plot.pie(subplots=True, figsize=(4,4), legend=False, autopct='%.2f')
----> 5 a = pd.DataFrame([np.sum(le.inverse_transform(test_predictions) == composition) for composition in comp_list], index=comp_list, columns=['after'])
      6 print(a)
      7 a.plot.pie(subplots=True, figsize=(2,2))

NameError: name 'test_predictions' is not defined

Feature importance


In [13]:
num_features = len(feature_list)
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train, y_train)
importances = pipeline.named_steps['classifier'].feature_importances_
indices = np.argsort(importances)[::-1]

fig, ax = plt.subplots()
for f in range(num_features):
    print('{}) {}'.format(f + 1, importances[indices[f]]))

plt.ylabel('Feature Importances')
plt.bar(range(num_features),
        importances[indices],
        align='center')

plt.xticks(range(num_features),
           feature_labels[indices], rotation=90)
plt.xlim([-1, len(feature_list)])
# plt.ylim([0, .40])
plt.show()


1) 0.406473947902
2) 0.283826242805
3) 0.141256788312
4) 0.125846138943
5) 0.0309333112274
6) 0.0116635708116

In [ ]:
probs = pipeline.named_steps['classifier'].predict_proba(X_test)
prob_1 = probs[:, 0][MC_iron_mask]
prob_2 = probs[:, 1][MC_iron_mask]
# print(min(prob_1-prob_2))
# print(max(prob_1-prob_2))
# plt.hist(prob_1-prob_2, bins=30, log=True)
plt.hist(prob_1, bins=np.linspace(0, 1, 50), log=True)
plt.hist(prob_2, bins=np.linspace(0, 1, 50), log=True)

In [ ]:
probs = pipeline.named_steps['classifier'].predict_proba(X_test)
dp1 = (probs[:, 0]-probs[:, 1])[MC_proton_mask]
print(min(dp1))
print(max(dp1))
dp2 = (probs[:, 0]-probs[:, 1])[MC_iron_mask]
print(min(dp2))
print(max(dp2))
fig, ax = plt.subplots()
# plt.hist(prob_1-prob_2, bins=30, log=True)
counts, edges, pathes = plt.hist(dp1, bins=np.linspace(-1, 1, 100), log=True, label='Proton', alpha=0.75)
counts, edges, pathes = plt.hist(dp2, bins=np.linspace(-1, 1, 100), log=True, label='Iron', alpha=0.75)
plt.legend(loc=2)
plt.show()
pipeline.named_steps['classifier'].classes_

In [ ]:
print(pipeline.named_steps['classifier'].classes_)
le.inverse_transform(pipeline.named_steps['classifier'].classes_)

In [ ]:
pipeline.named_steps['classifier'].decision_path(X_test)

In [ ]:
comp_list = np.unique(df['MC_comp'])
# test_probs = defaultdict(list)
fig, ax = plt.subplots()
test_probs = pipeline.predict_proba(X_test)
for class_ in pipeline.classes_:
    composition = le.inverse_transform(class_)
    plt.hist(test_probs[:, class_], bins=np.linspace(0, 1, 50),
             histtype='step', label=composition,
             color=color_dict[composition], alpha=0.8, log=True)
plt.ylabel('Counts')
plt.xlabel('Testing set class probabilities')
plt.legend()
plt.grid()
plt.show()

In [10]:
comp_list = np.unique(df['MC_comp'])
test_probs = defaultdict(list)
fig, ax = plt.subplots()
# test_probs = pipeline.predict_proba(X_test)
for event in pipeline.predict_proba(X_test):
    composition = le.inverse_transform(np.argmax(event))
#     print(composition)
    test_probs[composition].append(np.amax(event))
for composition in comp_list:
    plt.hist(test_probs[composition], bins=np.linspace(0, 1, 75),
             histtype='step', label=composition,
             color=color_dict[composition], alpha=0.8, log=True)
plt.ylabel('Counts')
plt.xlabel('Testing set class probabilities')
plt.legend(title='Reco comp')
plt.grid()
plt.show()



In [ ]: