In [1]:
from __future__ import division
import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn.apionly as sns

from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import validation_curve, GridSearchCV, cross_val_score, ParameterGrid

# from mlxtend.evaluate import plot_decision_regions

from composition.analysis.load_sim import load_sim
from composition.analysis.preprocessing import get_train_test_sets, LabelEncoder
from composition.analysis.features import get_training_features
from composition.analysis.pipelines import get_pipeline
from composition.analysis.plotting_functions import plot_decision_regions
import composition.analysis.data_functions as data_functions
from composition.support_functions.checkdir import checkdir

%matplotlib inline


/home/jbourbeau/.local/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [2]:
sns.set_palette('muted')
sns.set_color_codes()

In [3]:
df, cut_dict = load_sim(return_cut_dict=True)
selection_mask = np.array([True] * len(df))
standard_cut_keys = ['reco_exists', 'reco_zenith', 'num_hits', 'IT_signal',
                     'StationDensity', 'max_charge_frac', 'reco_containment', 'energy_range']
for key in standard_cut_keys:
    selection_mask *= cut_dict[key]

df = df[selection_mask]

feature_list = get_training_features()
X_train, X_test, y_train, y_test, le = get_train_test_sets(df, feature_list)

print('events = ' + str(y_train.shape[0]))


/home/jbourbeau/composition/analysis/load_sim.py:67: RuntimeWarning: divide by zero encountered in log10
  df['reco_log_energy'] = np.nan_to_num(np.log10(df['reco_energy']))
/home/jbourbeau/composition/analysis/load_sim.py:68: RuntimeWarning: invalid value encountered in log10
  df['InIce_log_charge'] = np.nan_to_num(np.log10(df['InIce_charge']))
events = 72644

In [4]:
pipeline = get_pipeline('KN')
# param_range = np.arange(70, 90, 1)
param_range = np.arange(1, 100, 5)
train_scores, test_scores = validation_curve(
                estimator=pipeline, 
                X=X_train, 
                y=y_train, 
                param_name='classifier__n_neighbors', 
                param_range=param_range,
                cv=5,
                verbose=3,
                n_jobs=10)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(param_range, train_mean, 
         color='b', marker='o', 
         markersize=5, label='training accuracy')

plt.fill_between(param_range, train_mean + train_std,
                 train_mean - train_std, alpha=0.15,
                 color='b')

plt.plot(param_range, test_mean, 
         color='g', linestyle='None', 
         marker='s', markersize=5, 
         label='validation accuracy')

plt.fill_between(param_range, 
                 test_mean + test_std,
                 test_mean - test_std, 
                 alpha=0.15, color='g')

plt.grid()
# plt.xscale('log')
plt.legend(loc='lower right')
plt.xlabel('Number neighbors')
plt.ylabel('Accuracy [\%]')
# plt.ylim([0.8, 1.0])
# plt.tight_layout()
plt.savefig('/home/jbourbeau/public_html/figures/composition/parameter-tuning/KN-validation_curve.png', dpi=300)
# plt.show()


[CV] classifier__n_neighbors=1 .......................................
[CV] classifier__n_neighbors=6 .......................................
[CV] classifier__n_neighbors=11 ......................................
[CV] classifier__n_neighbors=16 ......................................
[CV] classifier__n_neighbors=21 ......................................
[CV] classifier__n_neighbors=26 ......................................
[CV] classifier__n_neighbors=31 ......................................
[CV] classifier__n_neighbors=36 ......................................
[CV] classifier__n_neighbors=41 ......................................
[CV] classifier__n_neighbors=46 ......................................
[CV] .............. classifier__n_neighbors=1, score=0.707826 -   0.1s
[CV] classifier__n_neighbors=51 ......................................
[CV] ............. classifier__n_neighbors=11, score=0.776378 -   0.4s
[CV] ............. classifier__n_neighbors=16, score=0.779131 -   0.4s
[CV] classifier__n_neighbors=56 ......................................
[CV] classifier__n_neighbors=61 ......................................
[CV] .............. classifier__n_neighbors=6, score=0.760686 -   0.3s
[CV] classifier__n_neighbors=66 ......................................
[CV] ............. classifier__n_neighbors=31, score=0.782091 -   0.7s
[CV] classifier__n_neighbors=71 ......................................
[CV] ............. classifier__n_neighbors=21, score=0.780783 -   0.5s
[CV] classifier__n_neighbors=76 ......................................
[CV] ............. classifier__n_neighbors=36, score=0.784913 -   0.6s
[CV] classifier__n_neighbors=81 ......................................
[CV] ............. classifier__n_neighbors=46, score=0.784500 -   1.1s
[CV] classifier__n_neighbors=86 ......................................
[CV] ............. classifier__n_neighbors=41, score=0.785670 -   1.0s
[CV] classifier__n_neighbors=91 ......................................
[CV] ............. classifier__n_neighbors=26, score=0.782710 -   0.8s
[CV] classifier__n_neighbors=96 ......................................
[CV] ............. classifier__n_neighbors=51, score=0.784087 -   0.8s
[CV] classifier__n_neighbors=1 .......................................
[CV] ............. classifier__n_neighbors=56, score=0.784018 -   0.7s
[CV] classifier__n_neighbors=6 .......................................
[CV] .............. classifier__n_neighbors=1, score=0.717324 -   0.3s
[CV] classifier__n_neighbors=11 ......................................
[CV] ............. classifier__n_neighbors=66, score=0.783330 -   0.7s
[CV] classifier__n_neighbors=16 ......................................
[CV] .............. classifier__n_neighbors=6, score=0.763439 -   0.4s
[CV] classifier__n_neighbors=21 ......................................
[CV] ............. classifier__n_neighbors=11, score=0.779544 -   0.3s
[CV] classifier__n_neighbors=26 ......................................
[CV] ............. classifier__n_neighbors=71, score=0.785051 -   1.0s
[CV] classifier__n_neighbors=31 ......................................
[CV] ............. classifier__n_neighbors=61, score=0.783743 -   0.8s
[CV] classifier__n_neighbors=36 ......................................
[CV] ............. classifier__n_neighbors=16, score=0.783192 -   0.8s
[CV] classifier__n_neighbors=41 ......................................
[CV] ............. classifier__n_neighbors=91, score=0.785601 -   0.7s
[CV] ............. classifier__n_neighbors=81, score=0.786427 -   1.3s
[CV] classifier__n_neighbors=46 ......................................
[CV] classifier__n_neighbors=51 ......................................
[CV] ............. classifier__n_neighbors=21, score=0.787322 -   0.9s
[CV] classifier__n_neighbors=56 ......................................
[CV] ............. classifier__n_neighbors=76, score=0.785395 -   1.4s
[CV] classifier__n_neighbors=61 ......................................
[CV] ............. classifier__n_neighbors=26, score=0.786358 -   0.9s
[CV] classifier__n_neighbors=66 ......................................
[CV] ............. classifier__n_neighbors=96, score=0.785532 -   1.0s
[CV] classifier__n_neighbors=71 ......................................
[CV] ............. classifier__n_neighbors=31, score=0.787941 -   1.0s
[CV] classifier__n_neighbors=76 ......................................
[CV] ............. classifier__n_neighbors=36, score=0.787391 -   0.5s
[CV] classifier__n_neighbors=81 ......................................
[CV] ............. classifier__n_neighbors=86, score=0.786496 -   1.1s
[CV] classifier__n_neighbors=86 ......................................
[CV] ............. classifier__n_neighbors=51, score=0.790144 -   0.8s
[CV] classifier__n_neighbors=91 ......................................
[CV] ............. classifier__n_neighbors=56, score=0.789869 -   0.5s
[CV] classifier__n_neighbors=96 ......................................
[CV] ............. classifier__n_neighbors=41, score=0.788561 -   0.9s
[CV] classifier__n_neighbors=1 .......................................
[CV] .............. classifier__n_neighbors=1, score=0.713538 -   0.2s
[CV] classifier__n_neighbors=6 .......................................
[CV] ............. classifier__n_neighbors=61, score=0.790006 -   0.9s
[CV] classifier__n_neighbors=11 ......................................
[Parallel(n_jobs=10)]: Done  33 out of 100 | elapsed:   12.8s remaining:   26.0s
[CV] ............. classifier__n_neighbors=46, score=0.787528 -   0.9s
[CV] classifier__n_neighbors=16 ......................................
[CV] ............. classifier__n_neighbors=66, score=0.789318 -   0.9s
[CV] classifier__n_neighbors=21 ......................................
[CV] ............. classifier__n_neighbors=81, score=0.789593 -   0.9s
[CV] classifier__n_neighbors=26 ......................................
[CV] ............. classifier__n_neighbors=76, score=0.789456 -   0.9s
[CV] classifier__n_neighbors=31 ......................................
[CV] ............. classifier__n_neighbors=71, score=0.790006 -   0.9s
[CV] classifier__n_neighbors=36 ......................................
[CV] ............. classifier__n_neighbors=11, score=0.772317 -   0.3s
[CV] classifier__n_neighbors=41 ......................................
[CV] .............. classifier__n_neighbors=6, score=0.759378 -   0.4s
[CV] classifier__n_neighbors=46 ......................................
[CV] ............. classifier__n_neighbors=86, score=0.789111 -   0.8s
[CV] classifier__n_neighbors=51 ......................................
[CV] ............. classifier__n_neighbors=16, score=0.775208 -   0.3s
[CV] classifier__n_neighbors=56 ......................................
[CV] ............. classifier__n_neighbors=91, score=0.789180 -   0.8s
[CV] classifier__n_neighbors=61 ......................................
[CV] ............. classifier__n_neighbors=21, score=0.778581 -   0.5s
[CV] classifier__n_neighbors=66 ......................................
[CV] ............. classifier__n_neighbors=96, score=0.789731 -   1.1s
[CV] classifier__n_neighbors=71 ......................................
[CV] ............. classifier__n_neighbors=36, score=0.783330 -   0.6s
[CV] ............. classifier__n_neighbors=26, score=0.779751 -   0.5s
[CV] classifier__n_neighbors=76 ......................................
[CV] classifier__n_neighbors=81 ......................................
[CV] ............. classifier__n_neighbors=41, score=0.783743 -   0.6s
[CV] classifier__n_neighbors=86 ......................................
[CV] ............. classifier__n_neighbors=56, score=0.782986 -   0.9s
[CV] classifier__n_neighbors=91 ......................................
[CV] ............. classifier__n_neighbors=46, score=0.783261 -   1.0s
[CV] classifier__n_neighbors=96 ......................................
[CV] ............. classifier__n_neighbors=31, score=0.782986 -   0.6s
[CV] classifier__n_neighbors=1 .......................................
[CV] ............. classifier__n_neighbors=51, score=0.782091 -   1.0s
[CV] classifier__n_neighbors=6 .......................................
[CV] .............. classifier__n_neighbors=1, score=0.719389 -   0.3s
[CV] classifier__n_neighbors=11 ......................................
[CV] ............. classifier__n_neighbors=61, score=0.782779 -   0.8s
[CV] classifier__n_neighbors=16 ......................................
[CV] .............. classifier__n_neighbors=6, score=0.768532 -   0.4s
[CV] ............. classifier__n_neighbors=66, score=0.784913 -   1.1s
[CV] classifier__n_neighbors=21 ......................................
[CV] classifier__n_neighbors=26 ......................................
[CV] ............. classifier__n_neighbors=11, score=0.781265 -   0.5s
[CV] classifier__n_neighbors=31 ......................................
[CV] ............. classifier__n_neighbors=71, score=0.784156 -   1.0s
[CV] classifier__n_neighbors=36 ......................................
[CV] ............. classifier__n_neighbors=16, score=0.785464 -   0.5s
[CV] classifier__n_neighbors=41 ......................................
[CV] ............. classifier__n_neighbors=81, score=0.783468 -   0.7s
[CV] classifier__n_neighbors=46 ......................................
[CV] ............. classifier__n_neighbors=86, score=0.783881 -   1.2s
[CV] classifier__n_neighbors=51 ......................................
[CV] ............. classifier__n_neighbors=21, score=0.787460 -   0.5s
[CV] classifier__n_neighbors=56 ......................................
[CV] ............. classifier__n_neighbors=76, score=0.782642 -   0.9s
[CV] classifier__n_neighbors=61 ......................................
[CV] ............. classifier__n_neighbors=26, score=0.790213 -   0.6s
[CV] ............. classifier__n_neighbors=91, score=0.785051 -   1.5s
[CV] classifier__n_neighbors=66 ......................................
[CV] classifier__n_neighbors=71 ......................................
[CV] ............. classifier__n_neighbors=31, score=0.792415 -   0.5s
[CV] classifier__n_neighbors=76 ......................................
[CV] ............. classifier__n_neighbors=96, score=0.785119 -   1.1s
[CV] classifier__n_neighbors=81 ......................................
[Parallel(n_jobs=10)]: Done  67 out of 100 | elapsed:   23.9s remaining:   11.7s
[CV] ............. classifier__n_neighbors=36, score=0.792071 -   0.9s
[CV] ............. classifier__n_neighbors=41, score=0.792622 -   0.7s
[CV] classifier__n_neighbors=86 ......................................
[CV] classifier__n_neighbors=91 ......................................
[CV] ............. classifier__n_neighbors=46, score=0.791107 -   0.9s
[CV] classifier__n_neighbors=96 ......................................
[CV] ............. classifier__n_neighbors=51, score=0.793861 -   0.6s
[CV] classifier__n_neighbors=1 .......................................
[CV] ............. classifier__n_neighbors=56, score=0.793241 -   0.9s
[CV] classifier__n_neighbors=6 .......................................
[CV] ............. classifier__n_neighbors=71, score=0.793792 -   0.8s
[CV] .............. classifier__n_neighbors=1, score=0.722811 -   0.3s
[CV] classifier__n_neighbors=11 ......................................
[CV] classifier__n_neighbors=16 ......................................
[CV] ............. classifier__n_neighbors=66, score=0.793516 -   1.0s
[CV] classifier__n_neighbors=21 ......................................
[CV] ............. classifier__n_neighbors=61, score=0.793654 -   0.9s
[CV] classifier__n_neighbors=26 ......................................
[CV] .............. classifier__n_neighbors=6, score=0.761977 -   0.3s
[CV] classifier__n_neighbors=31 ......................................
[CV] ............. classifier__n_neighbors=81, score=0.792897 -   1.1s
[CV] classifier__n_neighbors=36 ......................................
[CV] ............. classifier__n_neighbors=11, score=0.777602 -   0.6s
[CV] classifier__n_neighbors=41 ......................................
[CV] ............. classifier__n_neighbors=76, score=0.793998 -   1.6s
[CV] ............. classifier__n_neighbors=21, score=0.786068 -   0.5s
[CV] classifier__n_neighbors=46 ......................................
[CV] classifier__n_neighbors=51 ......................................
[CV] ............. classifier__n_neighbors=86, score=0.792140 -   0.8s
[CV] classifier__n_neighbors=56 ......................................
[CV] ............. classifier__n_neighbors=16, score=0.780080 -   0.7s
[CV] classifier__n_neighbors=61 ......................................
[CV] ............. classifier__n_neighbors=91, score=0.792071 -   1.4s
[CV] ............. classifier__n_neighbors=26, score=0.783384 -   0.5s
[CV] classifier__n_neighbors=66 ......................................
[CV] classifier__n_neighbors=71 ......................................
[CV] ............. classifier__n_neighbors=96, score=0.791796 -   1.0s
[CV] classifier__n_neighbors=76 ......................................
[CV] ............. classifier__n_neighbors=31, score=0.786963 -   0.5s
[CV] classifier__n_neighbors=81 ......................................
[CV] ............. classifier__n_neighbors=36, score=0.785518 -   0.5s
[CV] classifier__n_neighbors=86 ......................................
[CV] ............. classifier__n_neighbors=51, score=0.784967 -   0.7s
[CV] classifier__n_neighbors=91 ......................................
[CV] ............. classifier__n_neighbors=66, score=0.785036 -   0.8s
[CV] classifier__n_neighbors=96 ......................................
[CV] ............. classifier__n_neighbors=46, score=0.785105 -   0.7s
[CV] ............. classifier__n_neighbors=41, score=0.786619 -   1.2s
[CV] ............. classifier__n_neighbors=61, score=0.784279 -   1.2s
[CV] ............. classifier__n_neighbors=56, score=0.784141 -   0.8s
[CV] ............. classifier__n_neighbors=76, score=0.785999 -   0.9s
[CV] ............. classifier__n_neighbors=81, score=0.787376 -   0.8s
[CV] ............. classifier__n_neighbors=71, score=0.787307 -   1.0s
[CV] ............. classifier__n_neighbors=86, score=0.786412 -   1.4s
[CV] ............. classifier__n_neighbors=91, score=0.786068 -   1.1s
[CV] ............. classifier__n_neighbors=96, score=0.786619 -   1.0s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:   38.9s finished

In [6]:
n_neighbors_list = [3, 50, 100, 500]

fig, axarr = plt.subplots(2,2)
for num, ax in zip(n_neighbors_list, axarr.flatten()):
    pipeline = get_pipeline('KN')
    pipeline.named_steps['classifier'].set_params(n_neighbors=num)
    pipeline.fit(X_train, y_train)
    scaler = pipeline.named_steps['scaler']
    clf = pipeline.named_steps['classifier']
    X_test_std = scaler.transform(X_test)
#     plot_decision_regions(X_test_std, y_test, clf, ax=ax)
    plot_decision_regions(X_test_std, y_test, clf, scatter_fraction=None, ax=ax)
    # Adding axes annotations
    ax.set_xlabel('Scaled energy')
    ax.set_ylabel('Scaled charge')
    ax.set_title('Number neighbors = {}'.format(num))
    ax.legend()
plt.tight_layout()
# plt.show()
plt.savefig('/home/jbourbeau/public_html/figures/composition/parameter-tuning/KN-decision-regions.png')



In [15]:
# pipeline = get_pipeline('KN')
param_range = np.arange(1, 100, 1)
param_grid = {'classifier__n_neighbors': param_range}
gs = GridSearchCV(estimator=pipeline, 
                  param_grid=param_grid, 
                  scoring='accuracy', 
                  cv=10,
                  n_jobs=10)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)


0.747054126975
{'classifier__n_neighbors': 83}

In [12]:
param_range


Out[12]:
array([  1,   6,  11,  16,  21,  26,  31,  36,  41,  46,  51,  56,  61,
        66,  71,  76,  81,  86,  91,  96, 101, 106, 111, 116])

In [ ]: