Random forest parameter-tuning

Table of contents

  1. Data preprocessing
  2. Validation curves
  3. KS-test tuning

In [7]:
import sys
sys.path.append('/home/jbourbeau/cr-composition')
print('Added to PYTHONPATH')


Added to PYTHONPATH

In [8]:
from __future__ import division, print_function
from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn.apionly as sns
import scipy.stats as stats

from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import validation_curve, GridSearchCV, cross_val_score, ParameterGrid, KFold, ShuffleSplit

import composition as comp

# Plotting-related
sns.set_palette('muted')
sns.set_color_codes()
color_dict = defaultdict()
for i, composition in enumerate(['light', 'heavy', 'total']):
    color_dict[composition] = sns.color_palette('muted').as_hex()[i]
%matplotlib inline

Data preprocessing

  1. Load simulation dataframe and apply specified quality cuts
  2. Extract desired features from dataframe
  3. Get separate testing and training datasets

In [9]:
X_train_sim, X_test_sim, y_train_sim, y_test_sim, le, energy_train_sim, energy_test_sim = comp.preprocess_sim(return_energy=True)


Cut event flow:
              lap_reco_success:  0.612  0.612
                    lap_zenith:  0.718  0.578
                 num_hits_1_30:  0.624  0.563
                     IT_signal:  0.231  0.221
                max_qfrac_1_30:  0.775  0.221
               lap_containment:  0.589  0.172
              energy_range_lap:  0.415  0.117


(0, 1, 2, 3, 4, 5, 9, 10)

Features selected = ['lap_log_energy', 'lap_cos_zenith', 'log_NChannels_1_30', 'nchannels_nhits_ratio', 'lap_rlogl', 'log_NHits_1_30', 'log_s50', 'log_s125']

Number training events = 145932
Number testing events = 62543

In [10]:
X_test_data, energy_test_data = comp.preprocess_data(return_energy=True)


Features selected = ['lap_log_energy', 'lap_cos_zenith', 'log_NChannels_1_30', 'nchannels_nhits_ratio', 'lap_rlogl', 'log_NHits_1_30', 'log_s50', 'log_s125']

Number testing events = 3253341

Validation curves

(10-fold CV)

Maximum depth


In [5]:
pipeline = comp.get_pipeline('RF')
param_range = np.arange(1, 16)
train_scores, test_scores = validation_curve(
                estimator=pipeline, 
                X=X_train_sim, 
                y=y_train_sim, 
                param_name='classifier__max_depth', 
                param_range=param_range,
                cv=5,
                scoring='accuracy',
                verbose=2,
                n_jobs=20)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)


[CV] classifier__max_depth=1 .........................................
[CV] classifier__max_depth=2 .........................................
[CV] classifier__max_depth=3 .........................................
[CV] classifier__max_depth=4 .........................................
[CV] classifier__max_depth=5 .........................................
[CV] classifier__max_depth=6 .........................................
[CV] classifier__max_depth=7 .........................................
[CV] classifier__max_depth=8 .........................................
[CV] classifier__max_depth=9 .........................................
[CV] classifier__max_depth=10 ........................................
[CV] classifier__max_depth=11 ........................................
[CV] classifier__max_depth=12 ........................................
[CV] classifier__max_depth=13 ........................................
[CV] classifier__max_depth=14 ........................................
[CV] classifier__max_depth=15 ........................................
[CV] classifier__max_depth=1 .........................................
[CV] classifier__max_depth=2 .........................................
[CV] classifier__max_depth=3 .........................................
[CV] classifier__max_depth=4 .........................................
[CV] classifier__max_depth=5 .........................................
[CV] ................................ classifier__max_depth=1 -   3.3s
[CV] classifier__max_depth=6 .........................................
[CV] ................................ classifier__max_depth=2 -   3.7s
[CV] classifier__max_depth=7 .........................................
[CV] ................................ classifier__max_depth=1 -   3.4s
[CV] classifier__max_depth=8 .........................................
[CV] ................................ classifier__max_depth=3 -   3.6s
[CV] classifier__max_depth=9 .........................................
[CV] ................................ classifier__max_depth=2 -   3.4s
[CV] classifier__max_depth=10 ........................................
[CV] ................................ classifier__max_depth=5 -   2.5s
[CV] classifier__max_depth=11 ........................................
[CV] ................................ classifier__max_depth=4 -   2.2s
[CV] classifier__max_depth=12 ........................................
[CV] ................................ classifier__max_depth=3 -   2.4s
[CV] classifier__max_depth=13 ........................................
[CV] ................................ classifier__max_depth=4 -   2.6s
[CV] classifier__max_depth=14 ........................................
[CV] ................................ classifier__max_depth=5 -   1.6s
[CV] classifier__max_depth=15 ........................................
[CV] ................................ classifier__max_depth=6 -   1.8s
[CV] classifier__max_depth=1 .........................................
[CV] ................................ classifier__max_depth=7 -   1.8s
[CV] classifier__max_depth=2 .........................................
[CV] ............................... classifier__max_depth=10 -   1.7s
[CV] classifier__max_depth=3 .........................................
[CV] ................................ classifier__max_depth=8 -   1.5s
[CV] classifier__max_depth=4 .........................................
[CV] ................................ classifier__max_depth=9 -   1.4s
[CV] classifier__max_depth=5 .........................................
[CV] ............................... classifier__max_depth=11 -   1.4s
[CV] classifier__max_depth=6 .........................................
[CV] ............................... classifier__max_depth=13 -   1.0s
[CV] classifier__max_depth=7 .........................................
[CV] ............................... classifier__max_depth=14 -   1.5s
[CV] classifier__max_depth=8 .........................................
[CV] ............................... classifier__max_depth=12 -   1.6s
[CV] classifier__max_depth=9 .........................................
[CV] ............................... classifier__max_depth=15 -   2.4s
[CV] classifier__max_depth=10 ........................................
[CV] ................................ classifier__max_depth=1 -   2.3s
[CV] classifier__max_depth=11 ........................................
[CV] ................................ classifier__max_depth=6 -   2.5s
[CV] classifier__max_depth=12 ........................................
[CV] ................................ classifier__max_depth=2 -   2.8s
[CV] classifier__max_depth=13 ........................................
[CV] ................................ classifier__max_depth=7 -   3.1s
[CV] classifier__max_depth=14 ........................................
[CV] ................................ classifier__max_depth=3 -   2.4s
[CV] classifier__max_depth=15 ........................................
[CV] ................................ classifier__max_depth=8 -   2.5s
[CV] classifier__max_depth=1 .........................................
[CV] ................................ classifier__max_depth=9 -   2.2s
[CV] classifier__max_depth=2 .........................................
[CV] ............................... classifier__max_depth=10 -   2.0s
[CV] classifier__max_depth=3 .........................................
[CV] ................................ classifier__max_depth=4 -   2.3s
[CV] classifier__max_depth=4 .........................................
[CV] ................................ classifier__max_depth=5 -   2.0s
[CV] classifier__max_depth=5 .........................................
[CV] ............................... classifier__max_depth=11 -   1.9s
[CV] classifier__max_depth=6 .........................................
[CV] ............................... classifier__max_depth=12 -   2.2s
[CV] classifier__max_depth=7 .........................................
[CV] ............................... classifier__max_depth=13 -   2.1s
[CV] classifier__max_depth=8 .........................................
[CV] ................................ classifier__max_depth=6 -   2.2s
[CV] classifier__max_depth=9 .........................................
[CV] ............................... classifier__max_depth=14 -   1.4s
[CV] classifier__max_depth=10 ........................................
[CV] ................................ classifier__max_depth=8 -   1.5s
[CV] classifier__max_depth=11 ........................................
[CV] ................................ classifier__max_depth=7 -   2.5s
[CV] classifier__max_depth=12 ........................................
[CV] ................................ classifier__max_depth=1 -   2.3s
[CV] classifier__max_depth=13 ........................................
[Parallel(n_jobs=20)]: Done  38 out of  75 | elapsed:  1.2min remaining:  1.2min
[CV] ............................... classifier__max_depth=15 -   2.4s
[CV] classifier__max_depth=14 ........................................
[CV] ................................ classifier__max_depth=9 -   1.7s
[CV] classifier__max_depth=15 ........................................
[CV] ............................... classifier__max_depth=10 -   1.9s
[CV] classifier__max_depth=1 .........................................
[CV] ................................ classifier__max_depth=3 -   2.1s
[CV] classifier__max_depth=2 .........................................
[CV] ............................... classifier__max_depth=13 -   0.4s
[CV] classifier__max_depth=3 .........................................
[CV] ................................ classifier__max_depth=4 -   0.4s
[CV] classifier__max_depth=4 .........................................
[CV] ................................ classifier__max_depth=2 -   2.3s
[CV] ................................ classifier__max_depth=3 -   0.3s
[CV] classifier__max_depth=6 .........................................
[CV] classifier__max_depth=5 .........................................
[CV] ................................ classifier__max_depth=5 -   1.1s
[CV] classifier__max_depth=7 .........................................
[CV] ................................ classifier__max_depth=6 -   0.3s
[CV] classifier__max_depth=8 .........................................
[CV] ............................... classifier__max_depth=10 -   1.6s
[CV] classifier__max_depth=9 .........................................
[CV] ................................ classifier__max_depth=8 -   6.2s
[CV] classifier__max_depth=10 ........................................
[CV] ................................ classifier__max_depth=9 -   7.1s
[CV] ................................ classifier__max_depth=7 -   8.5s
[CV] classifier__max_depth=12 ........................................
[CV] classifier__max_depth=11 ........................................
[CV] ................................ classifier__max_depth=8 -   0.4s
[CV] classifier__max_depth=13 ........................................
[CV] ................................ classifier__max_depth=6 -   1.2s
[CV] classifier__max_depth=14 ........................................
[CV] ............................... classifier__max_depth=13 -   0.3s
[CV] classifier__max_depth=15 ........................................
[CV] ................................ classifier__max_depth=1 -   0.4s
[CV] ............................... classifier__max_depth=15 -   0.4s
[CV] ............................... classifier__max_depth=12 -   1.1s
[CV] ................................ classifier__max_depth=2 -   0.4s
[CV] ............................... classifier__max_depth=11 -   1.4s
[CV] ............................... classifier__max_depth=11 -   1.4s
[CV] ............................... classifier__max_depth=13 -   0.7s
[CV] ............................... classifier__max_depth=14 -   2.0s
[CV] ................................ classifier__max_depth=5 -   1.3s
[CV] ................................ classifier__max_depth=7 -   1.1s
[CV] ............................... classifier__max_depth=15 -   1.2s
[CV] ............................... classifier__max_depth=14 -   0.4s
[CV] ................................ classifier__max_depth=4 -   0.3s
[CV] ............................... classifier__max_depth=12 -   0.2s
[CV] ............................... classifier__max_depth=11 -   0.3s
[CV] ............................... classifier__max_depth=15 -   0.6s
[CV] ............................... classifier__max_depth=10 -   0.6s
[CV] ............................... classifier__max_depth=12 -   0.7s
[CV] ................................ classifier__max_depth=9 -   0.2s
[CV] ............................... classifier__max_depth=14 -   0.2s
[Parallel(n_jobs=20)]: Done  75 out of  75 | elapsed:  2.2min finished

In [6]:
plt.plot(param_range, train_mean, 
         color='b', marker='o', 
         markersize=5, label='training accuracy')

plt.fill_between(param_range,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='b')

plt.plot(param_range, test_mean, 
         color='g', linestyle='None', 
         marker='s', markersize=5, 
         label='validation accuracy')

plt.fill_between(param_range, 
                 test_mean + test_std,
                 test_mean - test_std, 
                 alpha=0.15, color='g')

plt.grid()
plt.legend(loc='lower right')
plt.xlabel('Maximum depth')
plt.ylabel('Accuracy')
# plt.ylim([0.7, 0.8])
# plt.savefig('/home/jbourbeau/public_html/figures/composition/parameter-tuning/RF-validation_curve_min_samples_leaf.png', dpi=300)
plt.show()


Max features


In [10]:
pipeline = comp.get_pipeline('RF')
param_range = np.arange(1, X_train.shape[1])
train_scores, test_scores = validation_curve(
                estimator=pipeline, 
                X=X_train, 
                y=y_train, 
                param_name='classifier__max_features', 
                param_range=param_range,
                cv=10,
                verbose=2,
                n_jobs=20)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(param_range, train_mean, 
         color='b', marker='o', 
         markersize=5, label='training accuracy')

plt.fill_between(param_range, train_mean + train_std,
                 train_mean - train_std, alpha=0.15,
                 color='b')

plt.plot(param_range, test_mean, 
         color='g', linestyle='None', 
         marker='s', markersize=5, 
         label='validation accuracy')

plt.fill_between(param_range, 
                 test_mean + test_std,
                 test_mean - test_std, 
                 alpha=0.15, color='g')

plt.grid()
# plt.xscale('log')
plt.legend(loc='lower right')
plt.xlabel('Maximum features')
plt.ylabel('Accuracy')
# plt.ylim([0.8, 1.0])
# plt.savefig('/home/jbourbeau/public_html/figures/composition/parameter-tuning/RF-validation_curve_min_samples_leaf.png', dpi=300)
plt.show()


[CV] classifier__max_features=1 ......................................
[CV] classifier__max_features=2 ......................................
[CV] classifier__max_features=3 ......................................
[CV] classifier__max_features=4 ......................................
[CV] classifier__max_features=5 ......................................
[CV] classifier__max_features=6 ......................................
[CV] classifier__max_features=7 ......................................
[CV] classifier__max_features=8 ......................................
[CV] classifier__max_features=9 ......................................
[CV] classifier__max_features=1 ......................................
[CV] classifier__max_features=2 ......................................
[CV] classifier__max_features=3 ......................................
[CV] classifier__max_features=4 ......................................
[CV] classifier__max_features=5 ......................................
[CV] classifier__max_features=6 ......................................
[CV] classifier__max_features=7 ......................................
[CV] classifier__max_features=8 ......................................
[CV] classifier__max_features=9 ......................................
[CV] classifier__max_features=1 ......................................
[CV] classifier__max_features=2 ......................................
[CV] ............................. classifier__max_features=1 -   2.6s
[CV] classifier__max_features=3 ......................................
[CV] ............................. classifier__max_features=1 -   2.9s
[CV] classifier__max_features=4 ......................................
[CV] ............................. classifier__max_features=1 -   2.9s
[CV] classifier__max_features=5 ......................................
[CV] ............................. classifier__max_features=2 -   2.3s
[CV] ............................. classifier__max_features=2 -   2.5s
[CV] classifier__max_features=6 ......................................
[CV] classifier__max_features=7 ......................................
[CV] ............................. classifier__max_features=2 -   2.8s
[CV] classifier__max_features=8 ......................................
[CV] ............................. classifier__max_features=3 -   2.0s
[CV] classifier__max_features=9 ......................................
[CV] ............................. classifier__max_features=3 -   2.1s
[CV] classifier__max_features=1 ......................................
[CV] ............................. classifier__max_features=4 -   2.5s
[CV] classifier__max_features=2 ......................................
[CV] ............................. classifier__max_features=4 -   2.4s
[CV] classifier__max_features=3 ......................................
[CV] ............................. classifier__max_features=5 -   3.0s
[CV] classifier__max_features=4 ......................................
[CV] ............................. classifier__max_features=5 -   1.9s
[CV] classifier__max_features=5 ......................................
[CV] ............................. classifier__max_features=6 -   2.6s
[CV] classifier__max_features=6 ......................................
[CV] ............................. classifier__max_features=6 -   2.7s
[CV] classifier__max_features=7 ......................................
[CV] ............................. classifier__max_features=3 -   1.7s
[CV] classifier__max_features=8 ......................................
[CV] ............................. classifier__max_features=7 -   0.4s
[CV] classifier__max_features=9 ......................................
[CV] ............................. classifier__max_features=4 -   0.8s
[CV] classifier__max_features=1 ......................................
[CV] ............................. classifier__max_features=3 -   2.2s
[CV] classifier__max_features=2 ......................................
[CV] ............................. classifier__max_features=7 -   0.9s
[CV] classifier__max_features=3 ......................................
[CV] ............................. classifier__max_features=1 -   3.9s
[CV] classifier__max_features=4 ......................................
[CV] ............................. classifier__max_features=2 -   1.5s
[CV] classifier__max_features=5 ......................................
[CV] ............................. classifier__max_features=7 -   0.5s
[CV] classifier__max_features=6 ......................................
[CV] ............................. classifier__max_features=1 -   0.4s
[CV] classifier__max_features=7 ......................................
[CV] ............................. classifier__max_features=6 -   0.4s
[CV] classifier__max_features=8 ......................................
[CV] ............................. classifier__max_features=5 -   0.3s
[CV] ............................. classifier__max_features=8 -   0.6s
[CV] classifier__max_features=9 ......................................
[CV] ............................. classifier__max_features=2 -   0.8s
[CV] classifier__max_features=2 ......................................
[CV] classifier__max_features=1 ......................................
[CV] ............................. classifier__max_features=4 -   0.5s
[CV] classifier__max_features=3 ......................................
[CV] ............................. classifier__max_features=5 -   0.3s
[CV] classifier__max_features=4 ......................................
[CV] ............................. classifier__max_features=8 -   0.7s
[CV] classifier__max_features=5 ......................................
[CV] ............................. classifier__max_features=3 -   0.5s
[CV] classifier__max_features=6 ......................................
[CV] ............................. classifier__max_features=9 -   0.2s
[CV] classifier__max_features=7 ......................................
[CV] ............................. classifier__max_features=8 -   0.7s
[CV] classifier__max_features=8 ......................................
[CV] ............................. classifier__max_features=9 -   0.3s
[CV] classifier__max_features=9 ......................................
[CV] ............................. classifier__max_features=9 -   0.4s
[CV] classifier__max_features=1 ......................................
[CV] ............................. classifier__max_features=7 -   1.0s
[CV] classifier__max_features=2 ......................................
[CV] ............................. classifier__max_features=2 -   2.7s
[CV] classifier__max_features=3 ......................................
[CV] ............................. classifier__max_features=5 -   0.5s
[CV] classifier__max_features=4 ......................................
[CV] ............................. classifier__max_features=1 -   0.3s
[CV] classifier__max_features=5 ......................................
[CV] ............................. classifier__max_features=8 -   0.8s
[CV] classifier__max_features=6 ......................................
[CV] ............................. classifier__max_features=1 -   1.1s
[CV] classifier__max_features=7 ......................................
[CV] ............................. classifier__max_features=7 -   0.3s
[CV] classifier__max_features=8 ......................................
[CV] ............................. classifier__max_features=9 -   0.6s
[CV] classifier__max_features=9 ......................................
[CV] ............................. classifier__max_features=2 -   1.9s
[CV] classifier__max_features=1 ......................................
[CV] ............................. classifier__max_features=5 -   1.4s
[CV] classifier__max_features=2 ......................................
[Parallel(n_jobs=20)]: Done  45 out of  90 | elapsed:  3.0min remaining:  3.0min
[CV] ............................. classifier__max_features=9 -   2.0s
[CV] classifier__max_features=3 ......................................
[CV] ............................. classifier__max_features=4 -   2.2s
[CV] classifier__max_features=4 ......................................
[CV] ............................. classifier__max_features=5 -   0.9s
[CV] classifier__max_features=5 ......................................
[CV] ............................. classifier__max_features=7 -   1.0s
[CV] classifier__max_features=6 ......................................
[CV] ............................. classifier__max_features=8 -   2.9s
[CV] classifier__max_features=7 ......................................
[CV] ............................. classifier__max_features=6 -   2.8s
[CV] classifier__max_features=8 ......................................
[CV] ............................. classifier__max_features=1 -   0.5s
[CV] classifier__max_features=9 ......................................
[CV] ............................. classifier__max_features=3 -   0.3s
[CV] classifier__max_features=1 ......................................
[CV] ............................. classifier__max_features=3 -   0.4s
[CV] classifier__max_features=2 ......................................
[CV] ............................. classifier__max_features=1 -   0.5s
[CV] classifier__max_features=3 ......................................
[CV] ............................. classifier__max_features=8 -   0.5s
[CV] classifier__max_features=4 ......................................
[CV] ............................. classifier__max_features=7 -   0.3s
[CV] ............................. classifier__max_features=6 -   1.7s
[CV] classifier__max_features=6 ......................................
[CV] classifier__max_features=5 ......................................
[CV] ............................. classifier__max_features=8 -   1.2s
[CV] classifier__max_features=7 ......................................
[CV] ............................. classifier__max_features=3 -   5.7s
[CV] classifier__max_features=8 ......................................
[CV] ............................. classifier__max_features=5 -   2.6s
[CV] classifier__max_features=9 ......................................
[CV] ............................. classifier__max_features=6 -   1.3s
[CV] classifier__max_features=1 ......................................
[CV] ............................. classifier__max_features=4 -   0.3s
[CV] classifier__max_features=2 ......................................
[CV] ............................. classifier__max_features=4 -   4.6s
[CV] classifier__max_features=3 ......................................
[CV] ............................. classifier__max_features=9 -   0.3s
[CV] classifier__max_features=4 ......................................
[CV] ............................. classifier__max_features=6 -   1.4s
[CV] classifier__max_features=5 ......................................
[CV] ............................. classifier__max_features=3 -   4.4s
[CV] classifier__max_features=6 ......................................
[CV] ............................. classifier__max_features=2 -   0.3s
[CV] classifier__max_features=7 ......................................
[CV] ............................. classifier__max_features=4 -   1.3s
[CV] classifier__max_features=8 ......................................
[CV] ............................. classifier__max_features=9 -   0.3s
[CV] classifier__max_features=9 ......................................
[CV] ............................. classifier__max_features=5 -   0.2s
[CV] ............................. classifier__max_features=7 -   1.0s
[CV] ............................. classifier__max_features=4 -   1.3s
[CV] ............................. classifier__max_features=8 -   1.5s
[CV] ............................. classifier__max_features=6 -   1.5s
[CV] ............................. classifier__max_features=2 -   1.2s
[CV] ............................. classifier__max_features=9 -   4.2s
[CV] ............................. classifier__max_features=3 -   3.1s
[CV] ............................. classifier__max_features=4 -   0.7s
[CV] ............................. classifier__max_features=2 -   0.2s
[CV] ............................. classifier__max_features=9 -   0.4s
[CV] ............................. classifier__max_features=6 -   0.9s
[CV] ............................. classifier__max_features=1 -   6.3s
[CV] ............................. classifier__max_features=6 -   0.6s
[CV] ............................. classifier__max_features=8 -   0.3s
[CV] ............................. classifier__max_features=5 -   0.2s
[CV] ............................. classifier__max_features=7 -   0.2s
[CV] ............................. classifier__max_features=8 -   0.2s
[CV] ............................. classifier__max_features=7 -   0.2s
[CV] ............................. classifier__max_features=9 -   0.4s
[Parallel(n_jobs=20)]: Done  90 out of  90 | elapsed:  5.2min finished

Minimum samples in leaf node


In [7]:
pipeline = comp.get_pipeline('RF')
param_range = np.arange(1, 400, 25)
train_scores, test_scores = validation_curve(
                estimator=pipeline, 
                X=X_train, 
                y=y_train, 
                param_name='classifier__min_samples_leaf', 
                param_range=param_range,
                cv=10,
                verbose=2,
                n_jobs=20)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(param_range, train_mean, 
         color='b', marker='o', 
         markersize=5, label='training accuracy')

plt.fill_between(param_range, train_mean + train_std,
                 train_mean - train_std, alpha=0.15,
                 color='b')

plt.plot(param_range, test_mean, 
         color='g', linestyle='None', 
         marker='s', markersize=5, 
         label='validation accuracy')

plt.fill_between(param_range, 
                 test_mean + test_std,
                 test_mean - test_std, 
                 alpha=0.15, color='g')

plt.grid()
# plt.xscale('log')
plt.legend()
plt.xlabel('Minimum samples in leaf node')
plt.ylabel('Accuracy')
# plt.ylim([0.8, 1.0])
# plt.savefig('/home/jbourbeau/public_html/figures/composition/parameter-tuning/RF-validation_curve_min_samples_leaf.png', dpi=300)
plt.show()


[CV] classifier__min_samples_leaf=1 ..................................
[CV] classifier__min_samples_leaf=26 .................................
[CV] classifier__min_samples_leaf=51 .................................
[CV] classifier__min_samples_leaf=76 .................................
[CV] classifier__min_samples_leaf=101 ................................
[CV] classifier__min_samples_leaf=126 ................................
[CV] classifier__min_samples_leaf=151 ................................
[CV] classifier__min_samples_leaf=176 ................................
[CV] classifier__min_samples_leaf=201 ................................
[CV] classifier__min_samples_leaf=226 ................................
[CV] classifier__min_samples_leaf=251 ................................
[CV] classifier__min_samples_leaf=276 ................................
[CV] classifier__min_samples_leaf=301 ................................
[CV] classifier__min_samples_leaf=326 ................................
[CV] classifier__min_samples_leaf=351 ................................
[CV] classifier__min_samples_leaf=376 ................................
[CV] classifier__min_samples_leaf=1 ..................................
[CV] classifier__min_samples_leaf=26 .................................
[CV] classifier__min_samples_leaf=51 .................................
[CV] classifier__min_samples_leaf=76 .................................
[CV] ....................... classifier__min_samples_leaf=176 -   0.9s
[CV] classifier__min_samples_leaf=101 ................................
[CV] ....................... classifier__min_samples_leaf=226 -   0.5s
[CV] ........................ classifier__min_samples_leaf=76 -   1.0s
[CV] classifier__min_samples_leaf=126 ................................
[CV] ........................ classifier__min_samples_leaf=51 -   0.4s
[CV] ....................... classifier__min_samples_leaf=201 -   0.4s
[CV] ........................ classifier__min_samples_leaf=26 -   0.6s
[CV] ....................... classifier__min_samples_leaf=376 -   0.6s
[CV] classifier__min_samples_leaf=151 ................................
[CV] ....................... classifier__min_samples_leaf=251 -   0.6s
[CV] classifier__min_samples_leaf=176 ................................
[CV] classifier__min_samples_leaf=201 ................................
[CV] ......................... classifier__min_samples_leaf=1 -   0.5s
[CV] ........................ classifier__min_samples_leaf=26 -   0.4s
[CV] classifier__min_samples_leaf=226 ................................
[CV] ....................... classifier__min_samples_leaf=301 -   0.6s
[CV] classifier__min_samples_leaf=251 ................................
[CV] classifier__min_samples_leaf=276 ................................
[CV] ......................... classifier__min_samples_leaf=1 -   0.5s
[CV] ....................... classifier__min_samples_leaf=351 -   0.5s
[CV] classifier__min_samples_leaf=301 ................................
[CV] classifier__min_samples_leaf=326 ................................
[CV] ....................... classifier__min_samples_leaf=276 -   0.5s
[CV] ....................... classifier__min_samples_leaf=151 -   0.7s
[CV] classifier__min_samples_leaf=351 ................................
[CV] ........................ classifier__min_samples_leaf=76 -   0.6s
[CV] ....................... classifier__min_samples_leaf=126 -   0.7s
[CV] ........................ classifier__min_samples_leaf=51 -   0.9s
[CV] classifier__min_samples_leaf=376 ................................
[CV] classifier__min_samples_leaf=1 ..................................
[CV] ....................... classifier__min_samples_leaf=101 -   1.0s
[CV] classifier__min_samples_leaf=26 .................................
[CV] ....................... classifier__min_samples_leaf=326 -   0.9s
[CV] classifier__min_samples_leaf=51 .................................
[CV] classifier__min_samples_leaf=76 .................................
[CV] classifier__min_samples_leaf=101 ................................
[CV] classifier__min_samples_leaf=126 ................................
[CV] classifier__min_samples_leaf=151 ................................
[CV] classifier__min_samples_leaf=176 ................................
[CV] ....................... classifier__min_samples_leaf=101 -   2.9s
[CV] classifier__min_samples_leaf=201 ................................
[CV] ....................... classifier__min_samples_leaf=126 -   2.1s
[CV] classifier__min_samples_leaf=226 ................................
[CV] ....................... classifier__min_samples_leaf=251 -   2.0s
[CV] classifier__min_samples_leaf=251 ................................
[CV] ....................... classifier__min_samples_leaf=301 -   1.1s
[CV] classifier__min_samples_leaf=276 ................................
[CV] ....................... classifier__min_samples_leaf=101 -   0.6s
[CV] ....................... classifier__min_samples_leaf=151 -   0.8s
[CV] ....................... classifier__min_samples_leaf=351 -   0.7s
[CV] classifier__min_samples_leaf=301 ................................
[CV] ........................ classifier__min_samples_leaf=76 -   0.5s
[CV] ......................... classifier__min_samples_leaf=1 -   0.7s
[CV] ....................... classifier__min_samples_leaf=176 -   0.8s
[CV] classifier__min_samples_leaf=326 ................................
[CV] ....................... classifier__min_samples_leaf=276 -   0.9s
[CV] classifier__min_samples_leaf=351 ................................
[CV] classifier__min_samples_leaf=376 ................................
[CV] ....................... classifier__min_samples_leaf=226 -   0.7s
[CV] ....................... classifier__min_samples_leaf=376 -   1.1s
[CV] classifier__min_samples_leaf=1 ..................................
[CV] ....................... classifier__min_samples_leaf=201 -   0.9s
[CV] classifier__min_samples_leaf=26 .................................
[CV] classifier__min_samples_leaf=51 .................................
[CV] ....................... classifier__min_samples_leaf=326 -   0.8s
[CV] classifier__min_samples_leaf=76 .................................
[CV] ........................ classifier__min_samples_leaf=51 -   1.0s
[CV] classifier__min_samples_leaf=101 ................................
[CV] classifier__min_samples_leaf=126 ................................
[CV] ........................ classifier__min_samples_leaf=26 -   1.0s
[CV] classifier__min_samples_leaf=151 ................................
[CV] classifier__min_samples_leaf=176 ................................
[CV] classifier__min_samples_leaf=201 ................................
[CV] ....................... classifier__min_samples_leaf=151 -   0.6s
[CV] classifier__min_samples_leaf=226 ................................
[CV] ....................... classifier__min_samples_leaf=126 -   0.7s
[CV] classifier__min_samples_leaf=251 ................................
[CV] ....................... classifier__min_samples_leaf=176 -   0.7s
[CV] classifier__min_samples_leaf=276 ................................
[CV] ....................... classifier__min_samples_leaf=101 -   0.9s
[CV] classifier__min_samples_leaf=301 ................................
[CV] ....................... classifier__min_samples_leaf=126 -   0.6s
[CV] classifier__min_samples_leaf=326 ................................
[CV] ....................... classifier__min_samples_leaf=226 -   6.7s
[CV] ....................... classifier__min_samples_leaf=151 -   0.3s
[CV] classifier__min_samples_leaf=376 ................................
[CV] classifier__min_samples_leaf=351 ................................
[CV] ........................ classifier__min_samples_leaf=26 -   0.2s
[CV] classifier__min_samples_leaf=1 ..................................
[CV] ........................ classifier__min_samples_leaf=51 -   0.4s
[CV] ....................... classifier__min_samples_leaf=201 -   0.4s
[CV] classifier__min_samples_leaf=26 .................................
[CV] classifier__min_samples_leaf=51 .................................
[CV] ........................ classifier__min_samples_leaf=76 -   0.9s
[CV] ....................... classifier__min_samples_leaf=251 -   1.4s
[CV] ......................... classifier__min_samples_leaf=1 -   0.8s
[CV] classifier__min_samples_leaf=76 .................................
[CV] classifier__min_samples_leaf=101 ................................
[CV] ....................... classifier__min_samples_leaf=376 -   1.6s
[CV] classifier__min_samples_leaf=126 ................................
[CV] ....................... classifier__min_samples_leaf=251 -   5.6s
[CV] classifier__min_samples_leaf=151 ................................
[CV] classifier__min_samples_leaf=176 ................................
[CV] ....................... classifier__min_samples_leaf=201 -  12.7s
[CV] ....................... classifier__min_samples_leaf=276 -   2.4s
[CV] classifier__min_samples_leaf=201 ................................
[CV] classifier__min_samples_leaf=226 ................................
[CV] ....................... classifier__min_samples_leaf=176 -   4.1s
[CV] classifier__min_samples_leaf=251 ................................
[CV] ....................... classifier__min_samples_leaf=326 -   3.4s
[CV] classifier__min_samples_leaf=276 ................................
[CV] ....................... classifier__min_samples_leaf=226 -   0.7s
[CV] classifier__min_samples_leaf=301 ................................
[CV] ....................... classifier__min_samples_leaf=351 -   1.0s
[CV] ....................... classifier__min_samples_leaf=301 -   7.0s
[CV] classifier__min_samples_leaf=351 ................................
[CV] classifier__min_samples_leaf=326 ................................
[CV] ....................... classifier__min_samples_leaf=276 -   2.9s
[CV] classifier__min_samples_leaf=376 ................................
[CV] ........................ classifier__min_samples_leaf=76 -   0.3s
[CV] classifier__min_samples_leaf=1 ..................................
[CV] ....................... classifier__min_samples_leaf=351 -   0.2s
[CV] classifier__min_samples_leaf=26 .................................
[CV] ....................... classifier__min_samples_leaf=176 -   0.3s
[CV] classifier__min_samples_leaf=51 .................................
[CV] ....................... classifier__min_samples_leaf=301 -   0.5s
[CV] classifier__min_samples_leaf=76 .................................
[CV] ....................... classifier__min_samples_leaf=376 -   0.3s
[CV] classifier__min_samples_leaf=101 ................................
[CV] ....................... classifier__min_samples_leaf=251 -   0.5s
[CV] ....................... classifier__min_samples_leaf=351 -   0.7s
[CV] classifier__min_samples_leaf=126 ................................
[CV] classifier__min_samples_leaf=151 ................................
[CV] ....................... classifier__min_samples_leaf=226 -  11.2s
[CV] classifier__min_samples_leaf=176 ................................
[CV] ....................... classifier__min_samples_leaf=201 -   0.4s
[CV] classifier__min_samples_leaf=201 ................................
[CV] ....................... classifier__min_samples_leaf=326 -   0.4s
[CV] classifier__min_samples_leaf=226 ................................
[CV] ......................... classifier__min_samples_leaf=1 -   1.4s
[CV] classifier__min_samples_leaf=251 ................................
[CV] ......................... classifier__min_samples_leaf=1 -   2.5s
[CV] classifier__min_samples_leaf=276 ................................
[CV] ....................... classifier__min_samples_leaf=276 -   0.6s
[CV] classifier__min_samples_leaf=301 ................................
[CV] ....................... classifier__min_samples_leaf=301 -   0.6s
[CV] classifier__min_samples_leaf=326 ................................
[CV] ....................... classifier__min_samples_leaf=151 -   2.2s
[CV] classifier__min_samples_leaf=351 ................................
[CV] ........................ classifier__min_samples_leaf=51 -   0.9s
[CV] classifier__min_samples_leaf=376 ................................
[CV] ........................ classifier__min_samples_leaf=26 -   0.3s
[CV] classifier__min_samples_leaf=1 ..................................
[CV] ........................ classifier__min_samples_leaf=76 -   0.6s
[CV] classifier__min_samples_leaf=26 .................................
[CV] ....................... classifier__min_samples_leaf=326 -   0.3s
[CV] classifier__min_samples_leaf=51 .................................
[CV] ....................... classifier__min_samples_leaf=151 -   0.5s
[CV] classifier__min_samples_leaf=76 .................................
[Parallel(n_jobs=20)]: Done  80 out of 160 | elapsed:  2.5min remaining:  2.5min
[CV] ....................... classifier__min_samples_leaf=276 -   0.5s
[CV] classifier__min_samples_leaf=101 ................................
[CV] ....................... classifier__min_samples_leaf=226 -   0.4s
[CV] classifier__min_samples_leaf=126 ................................
[CV] ....................... classifier__min_samples_leaf=126 -   0.3s
[CV] classifier__min_samples_leaf=151 ................................
[CV] ....................... classifier__min_samples_leaf=101 -   3.8s
[CV] classifier__min_samples_leaf=176 ................................
[CV] ....................... classifier__min_samples_leaf=101 -   0.4s
[CV] classifier__min_samples_leaf=201 ................................
[CV] ....................... classifier__min_samples_leaf=126 -   0.4s
[CV] classifier__min_samples_leaf=226 ................................
[CV] ....................... classifier__min_samples_leaf=176 -   1.2s
[CV] classifier__min_samples_leaf=251 ................................
[CV] ....................... classifier__min_samples_leaf=376 -   7.3s
[CV] ......................... classifier__min_samples_leaf=1 -   0.6s
[CV] classifier__min_samples_leaf=276 ................................
[CV] classifier__min_samples_leaf=301 ................................
[CV] ....................... classifier__min_samples_leaf=301 -   0.3s
[CV] classifier__min_samples_leaf=326 ................................
[CV] ....................... classifier__min_samples_leaf=101 -   0.4s
[CV] ....................... classifier__min_samples_leaf=376 -   1.0s
[CV] ....................... classifier__min_samples_leaf=351 -   0.6s
[CV] classifier__min_samples_leaf=376 ................................
[CV] classifier__min_samples_leaf=351 ................................
[CV] classifier__min_samples_leaf=1 ..................................
[CV] ....................... classifier__min_samples_leaf=326 -   0.6s
[CV] classifier__min_samples_leaf=26 .................................
[CV] ....................... classifier__min_samples_leaf=176 -   0.7s
[CV] classifier__min_samples_leaf=51 .................................
[CV] ........................ classifier__min_samples_leaf=26 -   0.7s
[CV] classifier__min_samples_leaf=76 .................................
[CV] ....................... classifier__min_samples_leaf=376 -   0.6s
[CV] classifier__min_samples_leaf=101 ................................
[CV] ....................... classifier__min_samples_leaf=301 -   0.3s
[CV] classifier__min_samples_leaf=126 ................................
[CV] ....................... classifier__min_samples_leaf=151 -   0.6s
[CV] classifier__min_samples_leaf=151 ................................
[CV] ....................... classifier__min_samples_leaf=351 -   0.3s
[CV] classifier__min_samples_leaf=176 ................................
[CV] ....................... classifier__min_samples_leaf=276 -   0.5s
[CV] classifier__min_samples_leaf=201 ................................
[CV] ....................... classifier__min_samples_leaf=126 -   0.6s
[CV] classifier__min_samples_leaf=226 ................................
[CV] ........................ classifier__min_samples_leaf=76 -   0.7s
[CV] classifier__min_samples_leaf=251 ................................
[CV] ........................ classifier__min_samples_leaf=51 -   0.8s
[CV] classifier__min_samples_leaf=276 ................................
[CV] ........................ classifier__min_samples_leaf=76 -   0.8s
[CV] classifier__min_samples_leaf=301 ................................
[CV] ....................... classifier__min_samples_leaf=151 -   0.4s
[CV] classifier__min_samples_leaf=326 ................................
[CV] ........................ classifier__min_samples_leaf=26 -   0.3s
[CV] classifier__min_samples_leaf=351 ................................
[CV] ....................... classifier__min_samples_leaf=101 -   0.7s
[CV] classifier__min_samples_leaf=376 ................................
[CV] ........................ classifier__min_samples_leaf=51 -   0.5s
[CV] classifier__min_samples_leaf=1 ..................................
[CV] ......................... classifier__min_samples_leaf=1 -   0.5s
[CV] ....................... classifier__min_samples_leaf=126 -   0.4s
[CV] classifier__min_samples_leaf=26 .................................
[CV] classifier__min_samples_leaf=51 .................................
[CV] ....................... classifier__min_samples_leaf=251 -   0.6s
[CV] classifier__min_samples_leaf=76 .................................
[CV] ....................... classifier__min_samples_leaf=201 -   0.3s
[CV] classifier__min_samples_leaf=101 ................................
[CV] ....................... classifier__min_samples_leaf=351 -   0.3s
[CV] classifier__min_samples_leaf=126 ................................
[CV] ....................... classifier__min_samples_leaf=251 -   0.4s
[CV] classifier__min_samples_leaf=151 ................................
[CV] ....................... classifier__min_samples_leaf=201 -   1.3s
[CV] classifier__min_samples_leaf=176 ................................
[CV] ........................ classifier__min_samples_leaf=51 -   0.8s
[CV] classifier__min_samples_leaf=201 ................................
[CV] ........................ classifier__min_samples_leaf=26 -   0.6s
[CV] ....................... classifier__min_samples_leaf=301 -   0.4s
[CV] classifier__min_samples_leaf=251 ................................
[CV] classifier__min_samples_leaf=226 ................................
[CV] ....................... classifier__min_samples_leaf=276 -   0.4s
[CV] classifier__min_samples_leaf=276 ................................
[CV] ....................... classifier__min_samples_leaf=326 -   0.5s
[CV] classifier__min_samples_leaf=301 ................................
[CV] ....................... classifier__min_samples_leaf=226 -   0.7s
[CV] classifier__min_samples_leaf=326 ................................
[CV] ....................... classifier__min_samples_leaf=326 -   0.5s
[CV] classifier__min_samples_leaf=351 ................................
[CV] ....................... classifier__min_samples_leaf=201 -   0.8s
[CV] classifier__min_samples_leaf=376 ................................
[CV] ......................... classifier__min_samples_leaf=1 -   0.5s
[CV] classifier__min_samples_leaf=1 ..................................
[CV] ....................... classifier__min_samples_leaf=226 -   0.7s
[CV] classifier__min_samples_leaf=26 .................................
[CV] ........................ classifier__min_samples_leaf=76 -   0.6s
[CV] classifier__min_samples_leaf=51 .................................
[CV] ....................... classifier__min_samples_leaf=126 -   0.3s
[CV] ....................... classifier__min_samples_leaf=376 -   1.3s
[CV] classifier__min_samples_leaf=76 .................................
[CV] classifier__min_samples_leaf=101 ................................
[CV] ........................ classifier__min_samples_leaf=51 -   2.9s
[CV] classifier__min_samples_leaf=126 ................................
[CV] ....................... classifier__min_samples_leaf=351 -   0.3s
[CV] classifier__min_samples_leaf=151 ................................
[CV] ........................ classifier__min_samples_leaf=26 -   0.4s
[CV] classifier__min_samples_leaf=176 ................................
[CV] ....................... classifier__min_samples_leaf=376 -   1.7s
[CV] classifier__min_samples_leaf=201 ................................
[CV] ....................... classifier__min_samples_leaf=326 -   0.4s
[CV] classifier__min_samples_leaf=226 ................................
[CV] ....................... classifier__min_samples_leaf=151 -   0.8s
[CV] classifier__min_samples_leaf=251 ................................
[CV] ....................... classifier__min_samples_leaf=101 -   0.3s
[CV] classifier__min_samples_leaf=276 ................................
[CV] ....................... classifier__min_samples_leaf=176 -   0.8s
[CV] classifier__min_samples_leaf=301 ................................
[CV] ....................... classifier__min_samples_leaf=176 -   0.6s
[CV] classifier__min_samples_leaf=326 ................................
[CV] ........................ classifier__min_samples_leaf=26 -   0.4s
[CV] classifier__min_samples_leaf=351 ................................
[CV] ....................... classifier__min_samples_leaf=276 -   2.0s
[CV] classifier__min_samples_leaf=376 ................................
[CV] ....................... classifier__min_samples_leaf=251 -   0.7s
[CV] ....................... classifier__min_samples_leaf=226 -   3.9s
[CV] ......................... classifier__min_samples_leaf=1 -   3.2s
[CV] ....................... classifier__min_samples_leaf=251 -   0.5s
[CV] ....................... classifier__min_samples_leaf=201 -   0.4s
[CV] ........................ classifier__min_samples_leaf=51 -   0.4s
[CV] ....................... classifier__min_samples_leaf=201 -   0.3s
[CV] ....................... classifier__min_samples_leaf=101 -   0.4s
[CV] ....................... classifier__min_samples_leaf=226 -   1.4s
[CV] ....................... classifier__min_samples_leaf=151 -   0.5s
[CV] ........................ classifier__min_samples_leaf=76 -   0.4s
[CV] ....................... classifier__min_samples_leaf=126 -   0.3s
[CV] ....................... classifier__min_samples_leaf=376 -   0.3s
[CV] ....................... classifier__min_samples_leaf=176 -   0.3s
[CV] ....................... classifier__min_samples_leaf=351 -   0.3s
[CV] ....................... classifier__min_samples_leaf=301 -   0.4s
[CV] ....................... classifier__min_samples_leaf=301 -   0.4s
[CV] ....................... classifier__min_samples_leaf=251 -   0.3s
[CV] ....................... classifier__min_samples_leaf=276 -   0.4s
[CV] ....................... classifier__min_samples_leaf=326 -   0.2s
[Parallel(n_jobs=20)]: Done 160 out of 160 | elapsed:  4.7min finished

KS-test tuning

Maximum depth


In [7]:
comp_list = ['light', 'heavy']
max_depth_list = np.arange(1, 16)
pval_comp = defaultdict(list)
ks_stat = defaultdict(list)
kf = KFold(n_splits=10)
fold_num = 0
for train_index, test_index in kf.split(X_train):
    fold_num += 1
    print('\r')
    print('Fold {}: '.format(fold_num), end='')
    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
    y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]
    pval_maxdepth = defaultdict(list)
    print('max_depth = ', end='')
    for max_depth in max_depth_list:
        print('{}...'.format(max_depth), end='')
        pipeline = comp.get_pipeline('RF')
        pipeline.named_steps['classifier'].set_params(max_depth=max_depth)
        pipeline.fit(X_train_fold, y_train_fold)
        test_probs = pipeline.predict_proba(X_test_fold)
        train_probs = pipeline.predict_proba(X_train_fold)
        for class_ in pipeline.classes_:
            pval_maxdepth[le.inverse_transform(class_)].append(stats.ks_2samp(test_probs[:, class_], train_probs[:, class_])[1])
    for composition in comp_list:
        pval_comp[composition].append(pval_maxdepth[composition])
pval_sys_err = {key: np.std(pval_comp[key], axis=0) for key in pval_comp}
pval = {key: np.mean(pval_comp[key], axis=0) for key in pval_comp}


Fold 1: max_depth = 1...2...3...4...5...6...7...8...9...10...11...12...13...14...15...
Fold 2: max_depth = 1...2...3...4...5...6...7...8...9...10...11...12...13...14...15...
Fold 3: max_depth = 1...2...3...4...5...6...7...8...9...10...11...12...13...14...15...
Fold 4: max_depth = 1...2...3...4...5...6...7...8...9...10...11...12...13...14...15...
Fold 5: max_depth = 1...2...3...4...5...6...7...8...9...10...11...12...13...14...15...
Fold 6: max_depth = 1...2...3...4...5...6...7...8...9...10...11...12...13...14...15...
Fold 7: max_depth = 1...2...3...4...5...6...7...8...9...10...11...12...13...14...15...
Fold 8: max_depth = 1...2...3...4...5...6...7...8...9...10...11...12...13...14...15...
Fold 9: max_depth = 1...2...3...4...5...6...7...8...9...10...11...12...13...14...15...
Fold 10: max_depth = 1...2...3...4...5...6...7...8...9...10...11...12...13...14...15...

In [8]:
comp_list = ['light']
fig, ax = plt.subplots()
for composition in comp_list:
    upper_err = np.copy(pval_sys_err[composition])
    upper_err = [val if ((pval[composition][i] + val) < 1) else 1-pval[composition][i] for i, val in enumerate(upper_err)]
    lower_err = np.copy(pval_sys_err[composition])
    lower_err = [val if ((pval[composition][i] - val) > 0) else pval[composition][i] for i, val in enumerate(lower_err)]
    if composition == 'light':
        ax.errorbar(max_depth_list -0.25/2, pval[composition],
                    yerr=[lower_err, upper_err],
                    marker='.', linestyle=':',
                    label=composition, alpha=0.75)
    if composition == 'heavy':
        ax.errorbar(max_depth_list + 0.25/2, pval[composition],
                    yerr=[lower_err, upper_err],
                    marker='.', linestyle=':',
                    label=composition, alpha=0.75)
plt.ylabel('KS-test p-value')
plt.xlabel('Maximum depth')
plt.ylim([-0.1, 1.1])
# plt.legend()
plt.grid()
plt.show()



In [20]:
pval


Out[20]:
{'heavy': array([  7.16894934e-01,   6.76231102e-01,   6.75417490e-01,
          5.46730739e-01,   5.68780852e-01,   4.40231335e-01,
          4.82516082e-01,   5.14720962e-01,   5.16080561e-01,
          5.00066822e-01,   3.96053875e-01,   2.12820818e-01,
          3.65621330e-02,   2.58837771e-04,   1.52490663e-07]),
 'light': array([  7.16894934e-01,   6.76231102e-01,   6.75417490e-01,
          5.46730739e-01,   5.68780852e-01,   4.40231335e-01,
          4.82516082e-01,   5.14720962e-01,   5.16080561e-01,
          5.00066822e-01,   3.96053875e-01,   2.12820818e-01,
          3.65621330e-02,   2.58837771e-04,   1.52490663e-07])}

Minimum samples in leaf node


In [ ]:
comp_list = np.unique(df['MC_comp_class'])
min_samples_list = np.arange(1, 400, 25)
pval = defaultdict(list)
ks_stat = defaultdict(list)
print('min_samples_leaf = ', end='')
for min_samples_leaf in min_samples_list:
    print('{}...'.format(min_samples_leaf), end='')
    pipeline = comp.get_pipeline('RF')
    params = {'max_depth': 4, 'min_samples_leaf': min_samples_leaf}
    pipeline.named_steps['classifier'].set_params(**params)
    pipeline.fit(X_train, y_train)
    test_probs = pipeline.predict_proba(X_test)
    train_probs = pipeline.predict_proba(X_train)
    for class_ in pipeline.classes_:
        pval[le.inverse_transform(class_)].append(stats.ks_2samp(test_probs[:, class_], train_probs[:, class_])[1])
fig, ax = plt.subplots()
for composition in pval:
    ax.plot(min_samples_list, pval[composition], linestyle='-.', label=composition)
plt.ylabel('KS-test p-value')
plt.xlabel('Minimum samples leaf node')
plt.legend()
plt.grid()
plt.show()

Maximum depth for various minimum samples in leaf node


In [11]:
# comp_list = np.unique(df['MC_comp_class'])
comp_list = ['light']
min_samples_list = [1, 25, 50, 75]
min_samples_list = [1, 100, 200, 300]
fig, axarr = plt.subplots(2, 2, sharex=True, sharey=True)
print('min_samples_leaf = ', end='')
for min_samples_leaf, ax in zip(min_samples_list, axarr.flatten()):
    print('{}...'.format(min_samples_leaf), end='')
    max_depth_list = np.arange(1, 16)
    pval = defaultdict(list)
    ks_stat = defaultdict(list)
    for max_depth in max_depth_list:
        pipeline = comp.get_pipeline('RF')
        params = {'max_depth': max_depth, 'min_samples_leaf': min_samples_leaf}
        pipeline.named_steps['classifier'].set_params(**params)
        pipeline.fit(X_train, y_train)
        test_probs = pipeline.predict_proba(X_test)
        train_probs = pipeline.predict_proba(X_train)
        for class_ in pipeline.classes_:
            pval[le.inverse_transform(class_)].append(stats.ks_2samp(test_probs[:, class_], train_probs[:, class_])[1])
    for composition in pval:
        ax.plot(max_depth_list, pval[composition], linestyle='-.', label=composition)
    ax.set_ylabel('KS-test p-value')
    ax.set_xlabel('Maximum depth')
    ax.set_title('min samples = {}'.format(min_samples_leaf))
    ax.set_ylim([0, 0.5])
    ax.legend()
    ax.grid()
plt.tight_layout()
plt.show()


min_samples_leaf = 1...100...200...300...

In [ ]: