In [1]:
import argparse
from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn.apionly as sns

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, learning_curve

import comptools as comp
import comptools.analysis.plotting as plotting

# Plotting-related
# Colour lookup supplied by comptools; not referenced again in the cells shown
# in this notebook.
color_dict = comp.analysis.get_color_dict()
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

Data preprocessing

  1. Load simulation dataframe and apply specified quality cuts
  2. Extract desired features from dataframe
  3. Get separate testing and training datasets

In [3]:
# Configuration identifier handed to comptools; the commented-out line is the
# alternative configuration.
# config = 'IC79.2010'
config = 'IC86.2012'
# Number of composition groups. Later cells use it to pick the
# 'comp_target_<num_groups>' label column and to build the pipeline name.
num_groups = 4
comp_list = comp.get_comp_list(num_groups=num_groups)
# Energy binning for this configuration; its log_energy_min / log_energy_max
# attributes are passed to comp.load_sim in the next cell.
energybins = comp.analysis.get_energybins(config)

In [4]:
# Load the simulation as separate training and testing frames, passing the
# energy bounds taken from `energybins`.
# NOTE(review): presumably load_sim also applies the quality cuts mentioned in
# the "Data preprocessing" notes -- confirm in comptools.
df_sim_train, df_sim_test = comp.load_sim(config=config,
                                          log_energy_min=energybins.log_energy_min,
                                          log_energy_max=energybins.log_energy_max)

In [6]:
# feature_list selects the training columns of df_sim_train in later cells;
# feature_labels (presumably display labels for those features) is not used in
# the cells shown here.
feature_list, feature_labels = comp.analysis.get_training_features()
# Name of the pipeline later requested from comp.get_pipeline; it encodes the
# configuration and the number of composition groups.
pipeline_str = 'BDT_comp_{}_{}-groups'.format(config, num_groups)

Feature selection


In [6]:
# NOTE(review): disabled experiment -- sequential forward feature selection with
# mlxtend. It refers to X_train_sim / y_train_sim, which no visible cell in this
# notebook defines, so it cannot be re-enabled by uncommenting alone.
# from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# pipeline = comp.get_pipeline('RF')
# sfs = SFS(pipeline, 
#           k_features=6, 
#           forward=True, 
#           floating=False, 
#           scoring='accuracy',
#           print_progress=True,
#           cv=3,
#           n_jobs=10)
# sfs = sfs.fit(X_train_sim, y_train_sim)


Features: 6/6

Feature transform


In [7]:
# NOTE(review): disabled -- needs the `sfs` selector from the commented-out
# feature-selection cell and the X_train_sim / X_test_sim arrays, none of which
# are defined by the visible cells.
# X_train_sim = sfs.transform(X_train_sim)
# X_test_sim = sfs.transform(X_test_sim)

Produce 10-fold CV learning curve


In [18]:
# Fetch the pipeline registered under `pipeline_str` (the BDT name built earlier).
pipeline = comp.get_pipeline(pipeline_str)

# 10-fold cross-validated learning curve, evaluated at ten evenly spaced
# fractions (10% ... 100%) of the available training data.
train_sizes, train_scores, test_scores = learning_curve(
    pipeline,
    df_sim_train[feature_list],
    df_sim_train['comp_target_{}'.format(num_groups)],
    cv=10,
    train_sizes=np.linspace(0.1, 1.0, 10),
    n_jobs=20,
    verbose=3
)


[learning_curve] Training set sizes: [ 4472  8945 13417 17890 22363 26835 31308 35780 40253 44726]
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ........................... , score=0.441158720579, total=   1.4s
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ........................... , score=0.451418225709, total=   2.3s
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ........................... , score=0.455130784708, total=   2.1s
[CV]  ................................................................
[CV] ........................... , score=0.452021726011, total=   3.7s
[CV]  ................................................................
[CV] ........................... , score=0.456941649899, total=   2.0s
[CV]  ................................................................
[CV] ........................... , score=0.464386317907, total=   4.5s
[CV]  ................................................................
[CV] ............................ , score=0.46338028169, total=   5.2s
[CV] ........................... , score=0.459154929577, total=   3.5s
[CV]  ................................................................
[CV]  ................................................................
[CV] ........................... , score=0.458257895796, total=   7.3s
[CV]  ................................................................
[CV] ........................... , score=0.457252061959, total=   7.9s
[CV]  ................................................................
[CV] ........................... , score=0.457947686117, total=   5.2s
[CV]  ................................................................
[CV] ........................... , score=0.468209255533, total=   7.4s
[CV]  ................................................................
[CV] ........................... , score=0.466197183099, total=   7.7s
[CV]  ................................................................
[CV] ........................... , score=0.455441561054, total=   9.7s
[CV]  ................................................................
[CV] ........................... , score=0.440442655936, total=   1.8s
[CV]  ................................................................
[CV] ........................... , score=0.460160965795, total=   7.1s
[CV]  ................................................................
[CV] ........................... , score=0.456849728425, total=  11.2s
[CV]  ................................................................
[CV] ........................... , score=0.455734406439, total=   2.7s
[CV]  ................................................................
[CV] ........................... , score=0.460362173038, total=   7.7s
[CV]  ................................................................
[CV] ........................... , score=0.466800804829, total=  10.8s
[CV]  ................................................................
[CV] ........................... , score=0.469818913481, total=  11.2s
[CV]  ................................................................
[CV] ........................... , score=0.467605633803, total=   8.2s
[CV]  ................................................................
[CV] ........................... , score=0.449698189135, total=   4.5s
[CV]  ................................................................
[CV] ........................... , score=0.451710261569, total=   1.4s
[CV]  ................................................................
[CV] ............................ , score=0.45282639308, total=  14.5s
[CV]  ................................................................
[CV] ........................... , score=0.454124748491, total=   5.6s
[CV]  ................................................................
[CV] ............................ , score=0.45814889336, total=   2.5s
[CV]  ................................................................
[CV] ........................... , score=0.464788732394, total=  14.7s
[CV] ........................... , score=0.465392354125, total=  10.6s
[CV]  ................................................................
[CV] ........................... , score=0.463581488934, total=  15.0s
[CV]  ................................................................
[CV]  ................................................................
[CV] ........................... , score=0.455935613682, total=   7.7s
[CV]  ................................................................
[CV] ........................... , score=0.451217058942, total=  17.7s
[CV]  ................................................................
[CV] ........................... , score=0.460563380282, total=   4.3s
[CV]  ................................................................
[Parallel(n_jobs=20)]: Done  33 out of 100 | elapsed:   19.6s remaining:   39.9s
[CV] ............................ , score=0.44245472837, total=   2.0s
[CV]  ................................................................
[CV] ........................... , score=0.458551307847, total=   9.8s
[CV]  ................................................................
[CV] ........................... , score=0.462374245473, total=  14.0s
[CV]  ................................................................
[CV] ........................... , score=0.442052313883, total=   2.7s
[CV]  ................................................................
[CV] ........................... , score=0.462173038229, total=  18.6s
[CV]  ................................................................
[CV] ........................... , score=0.451820559244, total=  20.9s
[CV]  ................................................................
[CV] ........................... , score=0.456941649899, total=   6.9s
[CV]  ................................................................
[CV] ........................... , score=0.458551307847, total=   7.0s
[CV]  ................................................................
[CV] ........................... , score=0.444265593561, total=   3.9s
[CV]  ................................................................
[CV] ........................... , score=0.454728370221, total=  11.6s
[CV]  ................................................................
[CV] ........................... , score=0.466197183099, total=  15.7s
[CV]  ................................................................
[CV] ........................... , score=0.460362173038, total=  12.0s
[CV] ........................... , score=0.447484909457, total=   1.6s
[CV]  ................................................................
[CV]  ................................................................
[CV] ........................... , score=0.462575452716, total=   8.6s
[CV]  ................................................................
[CV] ........................... , score=0.462575452716, total=  17.2s
[CV]  ................................................................
[CV] ........................... , score=0.454124748491, total=   2.7s
[CV]  ................................................................
[CV] ............................ , score=0.45291750503, total=   6.6s
[CV]  ................................................................
[CV] ........................... , score=0.462374245473, total=  14.2s
[CV] ........................... , score=0.459758551308, total=  10.1s
[CV]  ................................................................
[CV]  ................................................................
[CV] ........................... , score=0.454124748491, total=   4.4s
[CV]  ................................................................
[CV] ........................... , score=0.451710261569, total=   7.6s
[CV]  ................................................................
[CV] ........................... , score=0.436217303823, total=   1.4s
[CV]  ................................................................
[CV] ........................... , score=0.460160965795, total=  15.8s
[CV]  ................................................................
[CV] ........................... , score=0.458953722334, total=   6.0s
[CV]  ................................................................
[CV] ........................... , score=0.453521126761, total=   9.1s
[CV]  ................................................................
[CV] ........................... , score=0.444466800805, total=   2.9s
[CV]  ................................................................
[CV] ........................... , score=0.460160965795, total=  13.6s
[CV]  ................................................................
[CV] ........................... , score=0.464989939638, total=  14.0s
[CV]  ................................................................
[CV] ........................... , score=0.451911468813, total=   4.9s
[CV]  ................................................................
[CV] ........................... , score=0.456136820926, total=   9.1s
[CV]  ................................................................
[CV] ........................... , score=0.453118712274, total=  12.3s
[CV]  ................................................................
[CV] ........................... , score=0.452898550725, total=   2.3s
[CV]  ................................................................
[CV] ............................ , score=0.45814889336, total=  10.1s
[CV]  ................................................................
[CV] ........................... , score=0.464788732394, total=  17.1s
[CV]  ................................................................
[Parallel(n_jobs=20)]: Done  67 out of 100 | elapsed:   36.2s remaining:   17.8s
[CV] ........................... , score=0.453722334004, total=   6.7s
[CV]  ................................................................
[CV] ............................ , score=0.45754527163, total=  11.7s
[CV]  ................................................................
[CV] ........................... , score=0.446256038647, total=   3.9s
[CV]  ................................................................
[CV] ........................... , score=0.454929577465, total=  15.4s
[CV]  ................................................................
[CV] ........................... , score=0.456740442656, total=  15.9s
[CV] ........................... , score=0.457746478873, total=  12.7s
[CV]  ................................................................
[CV]  ................................................................
[CV] ........................... , score=0.453118712274, total=  10.0s
[CV]  ................................................................
[CV] ........................... , score=0.445048309179, total=   1.9s
[CV]  ................................................................
[CV] ........................... , score=0.454911433172, total=   6.4s
[CV]  ................................................................
[CV] ........................... , score=0.456740442656, total=  17.7s
[CV]  ................................................................
[CV] ........................... , score=0.451106639839, total=  11.2s
[CV]  ................................................................
[CV] ........................... , score=0.455331991952, total=  10.8s
[CV]  ................................................................
[CV] ........................... , score=0.461770623742, total=  15.8s
[CV]  ................................................................
[CV] ........................... , score=0.453703703704, total=   7.9s
[CV] ........................... , score=0.454911433172, total=   4.0s
[CV] ........................... , score=0.450080515298, total=   5.1s
[CV] ........................... , score=0.450281803543, total=   9.2s
[CV] ........................... , score=0.464587525151, total=  17.4s
[CV] ........................... , score=0.456723027375, total=   9.8s
[CV] ........................... , score=0.453722334004, total=  14.5s
[CV] ........................... , score=0.452093397746, total=   6.9s
[CV] ........................... , score=0.456740442656, total=  15.5s
[CV] ........................... , score=0.454911433172, total=   7.1s
[CV] ........................... , score=0.453301127214, total=  13.4s
[CV] ........................... , score=0.455917874396, total=  13.1s
[CV] ........................... , score=0.453099838969, total=   9.9s
[CV] ........................... , score=0.456941649899, total=  19.7s
[CV] ............................ , score=0.45692431562, total=  11.6s
[CV] ........................... , score=0.461352657005, total=  15.6s
[CV] ........................... , score=0.455314009662, total=  13.3s
[CV] ............................ , score=0.45692431562, total=  16.9s
[CV] ........................... , score=0.458132045089, total=  13.9s
[CV] ........................... , score=0.459138486312, total=  15.3s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:   58.1s finished

In [19]:
# Collapse the per-fold scores (axis 1) into a mean and a standard deviation
# for every training-set size.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

In [26]:
fig, ax = plt.subplots()

# One entry per curve: (mean, std, colour, line style, marker, legend label).
curve_specs = [
    (train_mean, train_std, 'C0', '-', 'o', 'Training set'),
    (test_mean, test_std, 'C1', '--', '^', 'Validation set'),
]

for curve_mean, curve_std, curve_color, curve_ls, curve_marker, curve_label in curve_specs:
    # Mean score versus training-set size ...
    ax.plot(train_sizes, curve_mean,
            color=curve_color, linestyle=curve_ls,
            marker=curve_marker, markersize=5,
            label=curve_label)
    # ... with a +/- one standard deviation band across the CV folds.
    ax.fill_between(train_sizes,
                    curve_mean + curve_std,
                    curve_mean - curve_std,
                    alpha=0.25, color=curve_color)

ax.set_xlabel('Number of training samples')
ax.set_ylabel('Accuracy')
ax.grid()
ax.legend()
plt.tight_layout()
plt.show()



In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [10]:
# Learning curves for the 'RF' pipeline at several values of the classifier's
# max_depth, one panel per depth.
#
# The panels share a y-axis so the depths can be compared on a common accuracy
# scale. The previous fixed limits of [0.6, 0.9] were dropped because they can
# hide curves that fall outside that window.
fig, axarr = plt.subplots(2, 2, sharey=True)

# Train on the same frame, feature columns and target column as the BDT
# learning-curve cell. The X_train_sim / y_train_sim arrays used here before
# are not defined by any cell in this notebook, so the cell failed with a
# NameError on a fresh kernel.
rf_features = df_sim_train[feature_list]
rf_target = df_sim_train['comp_target_{}'.format(num_groups)]

for max_depth, ax in zip([2, 5, 6, 10], axarr.flatten()):
    print('max_depth = {}'.format(max_depth))

    # rf_-prefixed names keep this scan from overwriting `pipeline`,
    # `train_sizes`, `train_scores`, ... produced by the BDT cells.
    rf_pipeline = comp.get_pipeline('RF')
    rf_pipeline.set_params(classifier__max_depth=max_depth)

    # 10-fold CV at ten evenly spaced fractions of the training data.
    rf_sizes, rf_train_scores, rf_test_scores = learning_curve(
        estimator=rf_pipeline,
        X=rf_features,
        y=rf_target,
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=10,
        n_jobs=20,
        verbose=0)

    # Mean and spread over the CV folds (axis 1) for each training-set size.
    rf_train_mean = np.mean(rf_train_scores, axis=1)
    rf_train_std = np.std(rf_train_scores, axis=1)
    rf_test_mean = np.mean(rf_test_scores, axis=1)
    rf_test_std = np.std(rf_test_scores, axis=1)

    ax.plot(rf_sizes, rf_train_mean,
            color='b', linestyle='-',
            marker='o', markersize=5,
            label='training accuracy')

    ax.fill_between(rf_sizes,
                    rf_train_mean + rf_train_std,
                    rf_train_mean - rf_train_std,
                    alpha=0.15, color='b')

    ax.plot(rf_sizes, rf_test_mean,
            color='g', linestyle='--',
            marker='s', markersize=5,
            label='validation accuracy')

    ax.fill_between(rf_sizes,
                    rf_test_mean + rf_test_std,
                    rf_test_mean - rf_test_std,
                    alpha=0.15, color='g')

    ax.grid()
    ax.set_xlabel('Number of training samples')
    ax.set_ylabel('Accuracy')
    ax.set_title('max depth = {}'.format(max_depth))
    ax.legend()
plt.tight_layout()
plt.show()


max_depth = 2
max_depth = 5
max_depth = 6
max_depth = 10

In [ ]: