In [1]:
import sys
sys.path.append('/home/jbourbeau/cr-composition')
print('Added to PYTHONPATH')
In [2]:
from __future__ import division, print_function
import argparse
from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn.apionly as sns
import scipy.stats as stats
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import validation_curve, GridSearchCV, cross_val_score, ParameterGrid, KFold
import composition as comp
# Plotting-related
sns.set_palette('muted')
sns.set_color_codes()
color_dict = {}  # a plain dict suffices; defaultdict() with no factory adds nothing
for i, composition in enumerate(['light', 'heavy', 'total']):
    color_dict[composition] = sns.color_palette('muted').as_hex()[i]
%matplotlib inline
In [3]:
df_sim, cut_dict_sim = comp.load_dataframe(type_='sim', config='IT73', return_cut_dict=True)
selection_mask = np.array([True] * len(df_sim))
standard_cut_keys = ['lap_reco_success', 'lap_zenith', 'num_hits_1_30', 'IT_signal',
                     'max_qfrac_1_30', 'lap_containment', 'energy_range_lap']
for key in standard_cut_keys:
    selection_mask *= cut_dict_sim[key]
df_sim = df_sim[selection_mask]
feature_list, feature_labels = comp.get_training_features()
print('training features = {}'.format(feature_list))
X_train, X_test, y_train, y_test, le = comp.get_train_test_sets(
    df_sim, feature_list, comp_class=True, train_he=True, test_he=True)
print('number training events = ' + str(y_train.shape[0]))
print('number testing events = ' + str(y_test.shape[0]))
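As a quick sanity check (not in the original notebook), the class balance of the encoded labels can be inspected; this assumes `le` is a fitted scikit-learn LabelEncoder returned by comp.get_train_test_sets, so that np.bincount aligns with le.classes_.
In [ ]:
# Added sketch: count training events per class. Assumes y_train holds
# integer-encoded labels and le.classes_ gives the matching class names.
for class_name, count in zip(le.classes_, np.bincount(y_train)):
    print('{}: {} training events'.format(class_name, count))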
In [4]:
pipeline = comp.get_pipeline('AB')
param_range = np.arange(0.1, 1.0, 0.25)
train_scores, test_scores = validation_curve(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    param_name='classifier__learning_rate',
    param_range=param_range,
    cv=10,
    verbose=2,
    n_jobs=20)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(param_range, train_mean,
         color='b', marker='o',
         markersize=5, label='training accuracy')
plt.fill_between(param_range,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='b')
plt.plot(param_range, test_mean,
         color='g', linestyle='None',
         marker='s', markersize=5,
         label='validation accuracy')
plt.fill_between(param_range,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='g')
plt.grid()
# plt.xscale('log')
plt.legend(loc='lower right')
plt.xlabel('Learning rate')
plt.ylabel('Accuracy')
# plt.ylim([0.8, 1.0])
# plt.savefig('/home/jbourbeau/public_html/figures/composition/parameter-tuning/RF-validation_curve_min_samples_leaf.png', dpi=300)
plt.show()
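GridSearchCV is imported above but never used; a minimal sketch of how the same learning-rate scan could be turned into an automated search is shown below (assuming, as in the validation curve above, that the 'AB' pipeline exposes a classifier__learning_rate parameter).
In [ ]:
# Added sketch: pick the best AdaBoost learning rate by 10-fold cross-validation.
pipeline = comp.get_pipeline('AB')
param_grid = {'classifier__learning_rate': np.arange(0.1, 1.0, 0.25)}
grid = GridSearchCV(pipeline, param_grid=param_grid, cv=10, n_jobs=20)
grid.fit(X_train, y_train)
print('best learning rate = {}'.format(grid.best_params_['classifier__learning_rate']))
print('best CV accuracy = {:.4f}'.format(grid.best_score_))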
In [10]:
pipeline = comp.get_pipeline('RF')
param_range = np.arange(1, X_train.shape[1] + 1)  # scan 1..n_features inclusive
train_scores, test_scores = validation_curve(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    param_name='classifier__max_features',
    param_range=param_range,
    cv=10,
    verbose=2,
    n_jobs=20)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(param_range, train_mean,
color='b', marker='o',
markersize=5, label='training accuracy')
plt.fill_between(param_range, train_mean + train_std,
train_mean - train_std, alpha=0.15,
color='b')
plt.plot(param_range, test_mean,
color='g', linestyle='None',
marker='s', markersize=5,
label='validation accuracy')
plt.fill_between(param_range,
test_mean + test_std,
test_mean - test_std,
alpha=0.15, color='g')
plt.grid()
# plt.xscale('log')
plt.legend(loc='lower right')
plt.xlabel('Maximum features')
plt.ylabel('Accuracy')
# plt.ylim([0.8, 1.0])
# plt.savefig('/home/jbourbeau/public_html/figures/composition/parameter-tuning/RF-validation_curve_min_samples_leaf.png', dpi=300)
plt.show()
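Once a promising max_features value is read off the curve, a single setting can be checked directly with the (imported but so far unused) cross_val_score helper. A minimal sketch follows; max_features=4 is a placeholder value, not the notebook's final choice.
In [ ]:
# Added sketch: 10-fold CV accuracy of the RF pipeline at one fixed max_features.
pipeline = comp.get_pipeline('RF')
pipeline.named_steps['classifier'].set_params(max_features=4)  # placeholder value
scores = cross_val_score(pipeline, X_train, y_train, cv=10, n_jobs=20)
print('CV accuracy = {:.4f} +/- {:.4f}'.format(scores.mean(), scores.std()))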
In [7]:
pipeline = comp.get_pipeline('RF')
param_range = np.arange(1, 400, 25)
train_scores, test_scores = validation_curve(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    param_name='classifier__min_samples_leaf',
    param_range=param_range,
    cv=10,
    verbose=2,
    n_jobs=20)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(param_range, train_mean,
         color='b', marker='o',
         markersize=5, label='training accuracy')
plt.fill_between(param_range, train_mean + train_std,
                 train_mean - train_std, alpha=0.15,
                 color='b')
plt.plot(param_range, test_mean,
         color='g', linestyle='None',
         marker='s', markersize=5,
         label='validation accuracy')
plt.fill_between(param_range,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='g')
plt.grid()
# plt.xscale('log')
plt.legend()
plt.xlabel('Minimum samples in leaf node')
plt.ylabel('Accuracy')
# plt.ylim([0.8, 1.0])
# plt.savefig('/home/jbourbeau/public_html/figures/composition/parameter-tuning/RF-validation_curve_min_samples_leaf.png', dpi=300)
plt.show()
In [7]:
comp_list = ['light', 'heavy']
max_depth_list = np.arange(1, 16)
pval_comp = defaultdict(list)
ks_stat = defaultdict(list)
kf = KFold(n_splits=10)
fold_num = 0
for train_index, test_index in kf.split(X_train):
    fold_num += 1
    print('\r')
    print('Fold {}: '.format(fold_num), end='')
    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
    y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]
    pval_maxdepth = defaultdict(list)
    print('max_depth = ', end='')
    for max_depth in max_depth_list:
        print('{}...'.format(max_depth), end='')
        pipeline = comp.get_pipeline('RF')
        pipeline.named_steps['classifier'].set_params(max_depth=max_depth)
        pipeline.fit(X_train_fold, y_train_fold)
        test_probs = pipeline.predict_proba(X_test_fold)
        train_probs = pipeline.predict_proba(X_train_fold)
        for class_ in pipeline.classes_:
            pval_maxdepth[le.inverse_transform(class_)].append(
                stats.ks_2samp(test_probs[:, class_], train_probs[:, class_])[1])
    for composition in comp_list:
        pval_comp[composition].append(pval_maxdepth[composition])
pval_sys_err = {key: np.std(pval_comp[key], axis=0) for key in pval_comp}
pval = {key: np.mean(pval_comp[key], axis=0) for key in pval_comp}
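The cell above uses a two-sample KS test between the train and test predicted-probability distributions as an overfitting diagnostic: a small p-value means the two distributions differ, i.e. the classifier behaves differently on data it was fit to. A small self-contained illustration of stats.ks_2samp (added here, with synthetic normal samples) shows the expected behavior.
In [ ]:
# Added illustration: stats.ks_2samp returns (KS statistic, p-value).
# Identically distributed samples should give a large p-value; shifted
# samples should give a small one.
rng = np.random.RandomState(2)
same = stats.ks_2samp(rng.normal(size=1000), rng.normal(size=1000))
shifted = stats.ks_2samp(rng.normal(size=1000), rng.normal(0.5, size=1000))
print('same distribution: p = {:.3f}'.format(same[1]))
print('shifted distribution: p = {:.3g}'.format(shifted[1]))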
In [8]:
comp_list = ['light']
fig, ax = plt.subplots()
for composition in comp_list:
    # Clip the error bars so that p-value +/- error stays within [0, 1]
    upper_err = np.copy(pval_sys_err[composition])
    upper_err = [val if (pval[composition][i] + val) < 1 else 1 - pval[composition][i]
                 for i, val in enumerate(upper_err)]
    lower_err = np.copy(pval_sys_err[composition])
    lower_err = [val if (pval[composition][i] - val) > 0 else pval[composition][i]
                 for i, val in enumerate(lower_err)]
    # Offset the light/heavy points horizontally so the error bars don't overlap
    if composition == 'light':
        ax.errorbar(max_depth_list - 0.25/2, pval[composition],
                    yerr=[lower_err, upper_err],
                    marker='.', linestyle=':',
                    label=composition, alpha=0.75)
    if composition == 'heavy':
        ax.errorbar(max_depth_list + 0.25/2, pval[composition],
                    yerr=[lower_err, upper_err],
                    marker='.', linestyle=':',
                    label=composition, alpha=0.75)
plt.ylabel('KS-test p-value')
plt.xlabel('Maximum depth')
plt.ylim([-0.1, 1.1])
# plt.legend()
plt.grid()
plt.show()
In [20]:
pval
In [ ]:
comp_list = np.unique(df_sim['MC_comp_class'])
min_samples_list = np.arange(1, 400, 25)
pval = defaultdict(list)
ks_stat = defaultdict(list)
print('min_samples_leaf = ', end='')
for min_samples_leaf in min_samples_list:
    print('{}...'.format(min_samples_leaf), end='')
    pipeline = comp.get_pipeline('RF')
    params = {'max_depth': 4, 'min_samples_leaf': min_samples_leaf}
    pipeline.named_steps['classifier'].set_params(**params)
    pipeline.fit(X_train, y_train)
    test_probs = pipeline.predict_proba(X_test)
    train_probs = pipeline.predict_proba(X_train)
    for class_ in pipeline.classes_:
        pval[le.inverse_transform(class_)].append(
            stats.ks_2samp(test_probs[:, class_], train_probs[:, class_])[1])
fig, ax = plt.subplots()
for composition in pval:
    ax.plot(min_samples_list, pval[composition], linestyle='-.', label=composition)
plt.ylabel('KS-test p-value')
plt.xlabel('Minimum samples leaf node')
plt.legend()
plt.grid()
plt.show()
In [11]:
# comp_list = np.unique(df_sim['MC_comp_class'])
comp_list = ['light']
# min_samples_list = [1, 25, 50, 75]
min_samples_list = [1, 100, 200, 300]
fig, axarr = plt.subplots(2, 2, sharex=True, sharey=True)
print('min_samples_leaf = ', end='')
for min_samples_leaf, ax in zip(min_samples_list, axarr.flatten()):
    print('{}...'.format(min_samples_leaf), end='')
    max_depth_list = np.arange(1, 16)
    pval = defaultdict(list)
    ks_stat = defaultdict(list)
    for max_depth in max_depth_list:
        pipeline = comp.get_pipeline('RF')
        params = {'max_depth': max_depth, 'min_samples_leaf': min_samples_leaf}
        pipeline.named_steps['classifier'].set_params(**params)
        pipeline.fit(X_train, y_train)
        test_probs = pipeline.predict_proba(X_test)
        train_probs = pipeline.predict_proba(X_train)
        for class_ in pipeline.classes_:
            pval[le.inverse_transform(class_)].append(
                stats.ks_2samp(test_probs[:, class_], train_probs[:, class_])[1])
    for composition in pval:
        ax.plot(max_depth_list, pval[composition], linestyle='-.', label=composition)
    ax.set_ylabel('KS-test p-value')
    ax.set_xlabel('Maximum depth')
    ax.set_title('min samples = {}'.format(min_samples_leaf))
    ax.set_ylim([0, 0.5])
    ax.legend()
    ax.grid()
plt.tight_layout()
plt.show()
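accuracy_score is imported at the top of the notebook but never used; a closing sketch of how the tuned pipeline could be scored on the held-out test set is shown below. The max_depth and min_samples_leaf values are placeholders, not the notebook's final selection.
In [ ]:
# Added sketch: fit the RF pipeline with example hyperparameters and score it
# on the held-out test set. max_depth=6 / min_samples_leaf=100 are placeholders.
pipeline = comp.get_pipeline('RF')
pipeline.named_steps['classifier'].set_params(max_depth=6, min_samples_leaf=100)
pipeline.fit(X_train, y_train)
test_acc = accuracy_score(y_test, pipeline.predict(X_test))
print('test accuracy = {:.4f}'.format(test_acc))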
In [ ]: