In [1]:
import sys

# Make the local cr-composition checkout importable (presumably this is where
# the `composition` package used by this notebook lives -- TODO confirm).
# NOTE(review): absolute, user-specific path; adjust on other machines.
CR_COMPOSITION_DIR = '/home/jbourbeau/cr-composition'
# Guard the append so re-running this cell does not pile up duplicate entries.
if CR_COMPOSITION_DIR not in sys.path:
    sys.path.append(CR_COMPOSITION_DIR)
print('Added to PYTHONPATH')
In [17]:
from __future__ import division
import argparse
from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn.apionly as sns
import scipy.stats as stats
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import validation_curve, GridSearchCV, cross_val_score
import composition as comp
# Global plot styling: select seaborn's 'muted' palette and enable seaborn's
# color codes.
sns.set_palette('muted')
sns.set_color_codes()
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
In [18]:
# Load the simulation events together with the dictionary of selection cuts.
df, cut_dict = comp.load_sim(return_cut_dict=True)

# Standard cuts; an event is kept only if it passes every one of them.
standard_cut_keys = [
    'lap_reco_success',
    'lap_zenith',
    'num_hits_1_30',
    'IT_signal',
    'StationDensity',
    'max_qfrac_1_30',
    'lap_containment',
    'energy_range_lap',
]
selection_mask = np.ones(len(df), dtype=bool)
for key in standard_cut_keys:
    # In-place product with each cut (presumably a boolean per-event mask)
    # acts as a running logical AND.
    selection_mask *= cut_dict[key]
df = df[selection_mask]

# Build the training and testing sets from the selected events.
feature_list, feature_labels = comp.get_training_features()
print('training features = {}'.format(feature_list))
X_train, X_test, y_train, y_test, le = comp.get_train_test_sets(
    df, feature_list, train_he=True, test_he=True)
print('number training events = ' + str(len(y_train)))
print('number testing events = ' + str(len(y_test)))
In [28]:
# Scan the classifier's max_depth. For each value, fit the 'RF' pipeline and,
# per class, compare the predicted-probability distributions of the testing
# and training sets with a two-sample KS test.
comp_list = np.unique(df['MC_comp'])
max_depth_list = np.arange(1, 16)
pval = defaultdict(list)     # composition label -> KS p-value per max_depth
ks_stat = defaultdict(list)  # composition label -> KS statistic per max_depth
for max_depth in max_depth_list:
    print('max_depth = {}'.format(max_depth))
    pipeline = comp.get_pipeline('RF')
    pipeline.named_steps['classifier'].set_params(max_depth=max_depth)
    pipeline.fit(X_train, y_train)
    test_probs = pipeline.predict_proba(X_test)
    train_probs = pipeline.predict_proba(X_train)
    # predict_proba columns follow the order of pipeline.classes_, so index by
    # position rather than assuming the class labels are exactly 0..n-1.
    for col, class_ in enumerate(pipeline.classes_):
        # inverse_transform is given a 1-d sequence: recent scikit-learn
        # releases reject a bare scalar (assumes `le` is a LabelEncoder-like
        # object -- TODO confirm).
        label = le.inverse_transform([class_])[0]
        ks_result = stats.ks_2samp(test_probs[:, col], train_probs[:, col])
        ks_stat[label].append(ks_result[0])
        pval[label].append(ks_result[1])

# One curve per composition: KS p-value as a function of max_depth.
fig, ax = plt.subplots()
for composition in pval:
    ax.plot(max_depth_list, pval[composition], linestyle='-.', label=composition)
ax.set_ylabel('KS-test p-value')
ax.set_xlabel('Maximum depth')
ax.legend()
ax.grid()
plt.show()
In [34]:
# Scan the classifier's min_samples_leaf at a fixed max_depth of 4. For each
# value, fit the 'RF' pipeline and, per class, compare the predicted-probability
# distributions of the testing and training sets with a two-sample KS test.
comp_list = np.unique(df['MC_comp'])
min_samples_list = np.arange(1, 400, 25)
pval = defaultdict(list)     # composition label -> KS p-value per min_samples_leaf
ks_stat = defaultdict(list)  # composition label -> KS statistic per min_samples_leaf
for min_samples_leaf in min_samples_list:
    print('min_samples_leaf = {}'.format(min_samples_leaf))
    pipeline = comp.get_pipeline('RF')
    params = {'max_depth': 4, 'min_samples_leaf': min_samples_leaf}
    pipeline.named_steps['classifier'].set_params(**params)
    pipeline.fit(X_train, y_train)
    test_probs = pipeline.predict_proba(X_test)
    train_probs = pipeline.predict_proba(X_train)
    # predict_proba columns follow the order of pipeline.classes_, so index by
    # position rather than assuming the class labels are exactly 0..n-1.
    for col, class_ in enumerate(pipeline.classes_):
        # inverse_transform is given a 1-d sequence: recent scikit-learn
        # releases reject a bare scalar (assumes `le` is a LabelEncoder-like
        # object -- TODO confirm).
        label = le.inverse_transform([class_])[0]
        ks_result = stats.ks_2samp(test_probs[:, col], train_probs[:, col])
        ks_stat[label].append(ks_result[0])
        pval[label].append(ks_result[1])

# One curve per composition: KS p-value as a function of min_samples_leaf.
fig, ax = plt.subplots()
for composition in pval:
    ax.plot(min_samples_list, pval[composition], linestyle='-.', label=composition)
ax.set_ylabel('KS-test p-value')
ax.set_xlabel('Minimum samples leaf')
ax.legend()
ax.grid()
plt.show()
In [1]:
# 2x2 grid of max_depth scans, one panel per min_samples_leaf value. Each panel
# shows, per class, the two-sample KS-test p-value between the predicted
# probabilities of the testing and training sets as a function of max_depth.
comp_list = np.unique(df['MC_comp'])
min_samples_list = [1, 25, 50, 75]
# min_samples_list = [1, 100, 200, 300]
# The scanned depths are the same for every panel, so build them once.
max_depth_list = np.arange(1, 16)
fig, axarr = plt.subplots(2, 2, sharex=True, sharey=True)
for min_samples_leaf, ax in zip(min_samples_list, axarr.flatten()):
    print('min_samples_leaf = {}'.format(min_samples_leaf))
    pval = defaultdict(list)     # composition label -> KS p-value per max_depth
    ks_stat = defaultdict(list)  # composition label -> KS statistic per max_depth
    for max_depth in max_depth_list:
        pipeline = comp.get_pipeline('RF')
        params = {'max_depth': max_depth, 'min_samples_leaf': min_samples_leaf}
        pipeline.named_steps['classifier'].set_params(**params)
        pipeline.fit(X_train, y_train)
        test_probs = pipeline.predict_proba(X_test)
        train_probs = pipeline.predict_proba(X_train)
        # predict_proba columns follow the order of pipeline.classes_, so index
        # by position rather than assuming the class labels are exactly 0..n-1.
        for col, class_ in enumerate(pipeline.classes_):
            # inverse_transform is given a 1-d sequence: recent scikit-learn
            # releases reject a bare scalar (assumes `le` is a
            # LabelEncoder-like object -- TODO confirm).
            label = le.inverse_transform([class_])[0]
            ks_result = stats.ks_2samp(test_probs[:, col], train_probs[:, col])
            ks_stat[label].append(ks_result[0])
            pval[label].append(ks_result[1])
    # Draw this panel: one curve per composition.
    for composition in pval:
        ax.plot(max_depth_list, pval[composition], linestyle='-.', label=composition)
    ax.set_ylabel('KS-test p-value')
    ax.set_xlabel('Maximum depth')
    ax.set_title('min samples = {}'.format(min_samples_leaf))
    ax.legend()
    ax.grid()
plt.tight_layout()
plt.show()
In [ ]: