notebook.community

Edit and run



In [1]:

    
from __future__ import division
import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn.apionly as sns

from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import validation_curve, GridSearchCV, cross_val_score, ParameterGrid

# from mlxtend.evaluate import plot_decision_regions

from composition.analysis.load_sim import load_sim
from composition.analysis.preprocessing import get_train_test_sets, LabelEncoder
from composition.analysis.features import get_training_features
from composition.analysis.pipelines import get_pipeline
from composition.analysis.plotting_functions import plot_decision_regions
import composition.analysis.data_functions as data_functions
from composition.support_functions.checkdir import checkdir

%matplotlib inline









    



/home/jbourbeau/.local/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')



In [2]:

    
sns.set_palette('muted')
sns.set_color_codes()



In [3]:

    
df, cut_dict = load_sim(return_cut_dict=True)
selection_mask = np.array([True] * len(df))
standard_cut_keys = ['reco_exists', 'reco_zenith', 'num_hits', 'IT_signal',
                     'StationDensity', 'max_charge_frac', 'reco_containment', 'energy_range']
for key in standard_cut_keys:
    selection_mask *= cut_dict[key]

df = df[selection_mask]

feature_list = get_training_features()
X_train, X_test, y_train, y_test, le = get_train_test_sets(df, feature_list)

print('events = ' + str(y_train.shape[0]))









    



/home/jbourbeau/composition/analysis/load_sim.py:67: RuntimeWarning: divide by zero encountered in log10
  df['reco_log_energy'] = np.nan_to_num(np.log10(df['reco_energy']))
/home/jbourbeau/composition/analysis/load_sim.py:68: RuntimeWarning: invalid value encountered in log10
  df['InIce_log_charge'] = np.nan_to_num(np.log10(df['InIce_charge']))






    



events = 72644



In [5]:

    
pipeline = get_pipeline('GBC')
param_range = np.arange(1, 20)
train_scores, test_scores = validation_curve(
                estimator=pipeline, 
                X=X_train, 
                y=y_train, 
                param_name='classifier__max_depth', 
                param_range=param_range,
                cv=5,
                verbose=3,
                n_jobs=10)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(param_range, train_mean, 
         color='b', marker='o', 
         markersize=5, label='training accuracy')

plt.fill_between(param_range, train_mean + train_std,
                 train_mean - train_std, alpha=0.15,
                 color='b')

plt.plot(param_range, test_mean, 
         color='g', linestyle='None', 
         marker='s', markersize=5, 
         label='validation accuracy')

plt.fill_between(param_range, 
                 test_mean + test_std,
                 test_mean - test_std, 
                 alpha=0.15, color='g')

plt.grid()
# plt.xscale('log')
plt.legend(loc='lower right')
plt.xlabel('Number neighbors')
plt.ylabel('Accuracy [\%]')
# plt.ylim([0.8, 1.0])
# plt.tight_layout()
plt.savefig('/home/jbourbeau/public_html/figures/composition/parameter-tuning/GBC-validation_curve.png', dpi=300)
# plt.show()









    



[CV] classifier__max_depth=1 .........................................
[CV] classifier__max_depth=2 .........................................
[CV] classifier__max_depth=3 .........................................
[CV] classifier__max_depth=4 .........................................
[CV] classifier__max_depth=5 .........................................
[CV] classifier__max_depth=6 .........................................
[CV] classifier__max_depth=7 .........................................
[CV] classifier__max_depth=8 .........................................
[CV] classifier__max_depth=9 .........................................
[CV] classifier__max_depth=10 ........................................
[CV] ................ classifier__max_depth=1, score=0.746989 -   0.0s
[CV] classifier__max_depth=11 ........................................
[CV] ................ classifier__max_depth=2, score=0.779338 -   0.0s
[CV] classifier__max_depth=12 ........................................
[CV] ................ classifier__max_depth=3, score=0.784706 -   0.0s
[CV] classifier__max_depth=13 ........................................
[CV] ................ classifier__max_depth=4, score=0.785326 -   0.0s
[CV] classifier__max_depth=14 ........................................
[CV] ................ classifier__max_depth=5, score=0.787735 -   0.0s
[CV] classifier__max_depth=15 ........................................
[CV] ................ classifier__max_depth=6, score=0.786771 -   0.1s
[CV] classifier__max_depth=16 ........................................
[CV] ................ classifier__max_depth=7, score=0.785877 -   0.1s
[CV] classifier__max_depth=17 ........................................
[CV] ................ classifier__max_depth=8, score=0.784018 -   0.1s
[CV] classifier__max_depth=18 ........................................
[CV] ................ classifier__max_depth=9, score=0.784362 -   0.1s
[CV] classifier__max_depth=19 ........................................
[CV] ............... classifier__max_depth=10, score=0.782297 -   0.1s
[CV] classifier__max_depth=1 .........................................
[CV] ................ classifier__max_depth=1, score=0.753527 -   0.0s
[CV] classifier__max_depth=2 .........................................
[CV] ................ classifier__max_depth=2, score=0.780852 -   0.0s
[CV] classifier__max_depth=3 .........................................
[CV] ................ classifier__max_depth=3, score=0.787047 -   0.0s
[CV] classifier__max_depth=4 .........................................
[CV] ................ classifier__max_depth=4, score=0.790832 -   0.0s
[CV] classifier__max_depth=5 .........................................
[CV] ............... classifier__max_depth=11, score=0.778718 -   0.1s
[CV] classifier__max_depth=6 .........................................
[CV] ................ classifier__max_depth=5, score=0.789249 -   0.0s
[CV] classifier__max_depth=7 .........................................
[CV] ................ classifier__max_depth=6, score=0.790213 -   0.0s
[CV] classifier__max_depth=8 .........................................
[CV] ............... classifier__max_depth=12, score=0.776378 -   0.1s
[CV] classifier__max_depth=9 .........................................
[CV] ................ classifier__max_depth=7, score=0.789249 -   0.1s
[CV] classifier__max_depth=10 ........................................
[CV] ................ classifier__max_depth=8, score=0.790970 -   0.1s
[CV] classifier__max_depth=11 ........................................
[CV] ................ classifier__max_depth=9, score=0.786427 -   0.1s
[CV] classifier__max_depth=12 ........................................
[CV] ............... classifier__max_depth=13, score=0.774658 -   0.1s
[CV] classifier__max_depth=13 ........................................
[CV] ............... classifier__max_depth=10, score=0.786634 -   0.1s
[CV] classifier__max_depth=14 ........................................
[CV] ............... classifier__max_depth=14, score=0.774589 -   0.1s
[CV] classifier__max_depth=15 ........................................
[CV] ............... classifier__max_depth=11, score=0.783812 -   0.1s
[CV] classifier__max_depth=16 ........................................
[CV] ............... classifier__max_depth=12, score=0.782504 -   0.1s
[CV] classifier__max_depth=17 ........................................
[CV] ............... classifier__max_depth=13, score=0.781127 -   0.1s
[CV] classifier__max_depth=18 ........................................
[CV] ............... classifier__max_depth=15, score=0.770734 -   0.2s
[CV] classifier__max_depth=19 ........................................
[CV] ............... classifier__max_depth=14, score=0.781196 -   0.1s
[CV] classifier__max_depth=1 .........................................
[CV] ................ classifier__max_depth=1, score=0.746369 -   0.0s
[CV] classifier__max_depth=2 .........................................
[CV] ................ classifier__max_depth=2, score=0.778787 -   0.0s
[CV] classifier__max_depth=3 .........................................
[CV] ................ classifier__max_depth=3, score=0.784913 -   0.0s
[CV] classifier__max_depth=4 .........................................






    



[Parallel(n_jobs=10)]: Done  32 out of  95 | elapsed:  4.5min remaining:  8.8min






    



[CV] ................ classifier__max_depth=4, score=0.784087 -   0.1s
[CV] classifier__max_depth=5 .........................................
[CV] ............... classifier__max_depth=16, score=0.767912 -   0.3s
[CV] classifier__max_depth=6 .........................................
[CV] ................ classifier__max_depth=5, score=0.784018 -   0.1s
[CV] classifier__max_depth=7 .........................................
[CV] ................ classifier__max_depth=6, score=0.784844 -   0.1s
[CV] classifier__max_depth=8 .........................................
[CV] ................ classifier__max_depth=8, score=0.782848 -   0.1s
[CV] classifier__max_depth=9 .........................................
[CV] ................ classifier__max_depth=7, score=0.785119 -   0.1s
[CV] classifier__max_depth=10 ........................................
[CV] ................ classifier__max_depth=9, score=0.786014 -   0.1s
[CV] classifier__max_depth=11 ........................................
[CV] ............... classifier__max_depth=15, score=0.775621 -   0.1s
[CV] classifier__max_depth=12 ........................................
[CV] ............... classifier__max_depth=10, score=0.781609 -   0.1s
[CV] classifier__max_depth=13 ........................................
[CV] ............... classifier__max_depth=11, score=0.781059 -   0.1s
[CV] classifier__max_depth=14 ........................................
[CV] ............... classifier__max_depth=17, score=0.764402 -   0.2s
[CV] classifier__max_depth=15 ........................................
[CV] ............... classifier__max_depth=12, score=0.776585 -   0.1s
[CV] classifier__max_depth=16 ........................................
[CV] ............... classifier__max_depth=16, score=0.773969 -   0.2s
[CV] classifier__max_depth=17 ........................................
[CV] ............... classifier__max_depth=13, score=0.776172 -   0.1s
[CV] classifier__max_depth=18 ........................................
[CV] ............... classifier__max_depth=18, score=0.763576 -   0.2s
[CV] classifier__max_depth=19 ........................................
[CV] ............... classifier__max_depth=14, score=0.772042 -   0.1s
[CV] classifier__max_depth=1 .........................................
[CV] ................ classifier__max_depth=1, score=0.757382 -   0.0s
[CV] classifier__max_depth=2 .........................................
[CV] ............... classifier__max_depth=17, score=0.773006 -   0.2s
[CV] classifier__max_depth=3 .........................................
[CV] ................ classifier__max_depth=2, score=0.785257 -   0.0s
[CV] classifier__max_depth=4 .........................................
[CV] ................ classifier__max_depth=3, score=0.789937 -   0.0s
[CV] classifier__max_depth=5 .........................................
[CV] ................ classifier__max_depth=4, score=0.792346 -   0.0s
[CV] classifier__max_depth=6 .........................................
[CV] ................ classifier__max_depth=5, score=0.794274 -   0.0s
[CV] classifier__max_depth=7 .........................................
[CV] ................ classifier__max_depth=6, score=0.793172 -   0.0s
[CV] classifier__max_depth=8 .........................................
[CV] ................ classifier__max_depth=7, score=0.792966 -   0.1s
[CV] classifier__max_depth=9 .........................................
[CV] ................ classifier__max_depth=8, score=0.793998 -   0.1s
[CV] classifier__max_depth=10 ........................................
[CV] ................ classifier__max_depth=9, score=0.790006 -   0.1s
[CV] classifier__max_depth=11 ........................................
[CV] ............... classifier__max_depth=15, score=0.766949 -   0.1s
[CV] classifier__max_depth=12 ........................................
[CV] ............... classifier__max_depth=19, score=0.758208 -   0.2s
[CV] classifier__max_depth=13 ........................................
[CV] ............... classifier__max_depth=10, score=0.787460 -   0.1s
[CV] classifier__max_depth=14 ........................................
[CV] ............... classifier__max_depth=11, score=0.786427 -   0.1s
[CV] classifier__max_depth=15 ........................................
[CV] ............... classifier__max_depth=12, score=0.784706 -   0.1s
[CV] classifier__max_depth=16 ........................................
[CV] ............... classifier__max_depth=18, score=0.771973 -   0.2s
[CV] classifier__max_depth=17 ........................................






    



[Parallel(n_jobs=10)]: Done  64 out of  95 | elapsed: 10.4min remaining:  5.1min






    



[CV] ............... classifier__max_depth=16, score=0.763026 -   0.2s
[CV] classifier__max_depth=18 ........................................
[CV] ............... classifier__max_depth=13, score=0.782642 -   0.1s
[CV] classifier__max_depth=19 ........................................
[CV] ............... classifier__max_depth=14, score=0.781059 -   0.1s
[CV] classifier__max_depth=1 .........................................
[CV] ................ classifier__max_depth=1, score=0.749036 -   0.0s
[CV] classifier__max_depth=2 .........................................
[CV] ................ classifier__max_depth=2, score=0.780699 -   0.0s
[CV] classifier__max_depth=3 .........................................
[CV] ................ classifier__max_depth=3, score=0.785173 -   0.0s
[CV] classifier__max_depth=4 .........................................
[CV] ................ classifier__max_depth=4, score=0.785380 -   0.0s
[CV] classifier__max_depth=5 .........................................
[CV] ................ classifier__max_depth=5, score=0.788546 -   0.1s
[CV] classifier__max_depth=6 .........................................
[CV] ............... classifier__max_depth=17, score=0.766605 -   0.2s
[CV] classifier__max_depth=7 .........................................
[CV] ................ classifier__max_depth=6, score=0.786550 -   0.1s
[CV] classifier__max_depth=8 .........................................
[CV] ............... classifier__max_depth=19, score=0.767293 -   0.4s
[CV] classifier__max_depth=9 .........................................
[CV] ................ classifier__max_depth=7, score=0.788271 -   0.1s
[CV] classifier__max_depth=10 ........................................
[CV] ................ classifier__max_depth=8, score=0.784760 -   0.1s
[CV] classifier__max_depth=11 ........................................
[CV] ................ classifier__max_depth=9, score=0.785586 -   0.1s
[CV] classifier__max_depth=12 ........................................
[CV] ............... classifier__max_depth=10, score=0.782764 -   0.1s
[CV] classifier__max_depth=13 ........................................
[CV] ............... classifier__max_depth=15, score=0.776654 -   0.1s
[CV] classifier__max_depth=14 ........................................
[CV] ............... classifier__max_depth=11, score=0.783866 -   0.1s
[CV] classifier__max_depth=15 ........................................
[CV] ............... classifier__max_depth=12, score=0.780493 -   0.1s
[CV] classifier__max_depth=16 ........................................
[CV] ............... classifier__max_depth=18, score=0.762131 -   0.2s
[CV] classifier__max_depth=17 ........................................
[CV] ............... classifier__max_depth=16, score=0.776654 -   0.2s
[CV] classifier__max_depth=18 ........................................
[CV] ............... classifier__max_depth=13, score=0.779873 -   0.1s
[CV] classifier__max_depth=19 ........................................
[CV] ............... classifier__max_depth=14, score=0.775812 -   0.1s
[CV] ............... classifier__max_depth=17, score=0.773969 -   0.2s
[CV] ............... classifier__max_depth=15, score=0.774780 -   0.2s
[CV] ............... classifier__max_depth=19, score=0.758483 -   0.3s
[CV] ............... classifier__max_depth=18, score=0.771836 -   0.2s
[CV] ............... classifier__max_depth=16, score=0.768378 -   0.2s
[CV] ............... classifier__max_depth=19, score=0.769633 -   0.2s
[CV] ............... classifier__max_depth=17, score=0.769342 -   0.2s
[CV] ............... classifier__max_depth=18, score=0.766520 -   0.2s
[CV] ............... classifier__max_depth=19, score=0.763147 -   0.2s






    



[Parallel(n_jobs=10)]: Done  95 out of  95 | elapsed: 21.7min finished



In [7]:

    
max_depth_list = [3, 5, 8, 10]

fig, axarr = plt.subplots(2,2)
for num, ax in zip(max_depth_list, axarr.flatten()):
    pipeline = get_pipeline('GBC')
    pipeline.named_steps['classifier'].set_params(max_depth=num)
    pipeline.fit(X_train, y_train)
    scaler = pipeline.named_steps['scaler']
    clf = pipeline.named_steps['classifier']
    X_test_std = scaler.transform(X_test)
#     plot_decision_regions(X_test_std, y_test, clf, ax=ax)
    plot_decision_regions(X_test_std, y_test, clf, scatter_fraction=None, ax=ax)
    # Adding axes annotations
    ax.set_xlabel('Scaled energy')
    ax.set_ylabel('Scaled charge')
    ax.set_title('Max depth = {}'.format(num))
    ax.legend()
plt.tight_layout()
# plt.show()
plt.savefig('/home/jbourbeau/public_html/figures/composition/parameter-tuning/GBC-decision-regions.png')









    




ValueErrorTraceback (most recent call last)
<ipython-input-7-68bf3d486767> in <module>()
     10     X_test_std = scaler.transform(X_test)
     11 #     plot_decision_regions(X_test_std, y_test, clf, ax=ax)
---> 12     plot_decision_regions(X_test_std, y_test, clf, scatter_fraction=None, ax=ax)
     13     # Adding axes annotations
     14     ax.set_xlabel('Scaled energy')

/home/jbourbeau/composition/analysis/plotting_functions.pyc in plot_decision_regions(X, y, classifier, resolution, scatter_fraction, ax)
     31     xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
     32                            np.arange(x2_min, x2_max, resolution))
---> 33     Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
     34     Z = Z.reshape(xx1.shape)
     35     if ax is None:

/home/jbourbeau/.local/lib/python2.7/site-packages/sklearn/ensemble/gradient_boosting.pyc in predict(self, X)
   1532             The predicted values.
   1533         """
-> 1534         score = self.decision_function(X)
   1535         decisions = self.loss_._score_to_decision(score)
   1536         return self.classes_.take(decisions, axis=0)

/home/jbourbeau/.local/lib/python2.7/site-packages/sklearn/ensemble/gradient_boosting.pyc in decision_function(self, X)
   1491         """
   1492         X = check_array(X, dtype=DTYPE, order="C")
-> 1493         score = self._decision_function(X)
   1494         if score.shape[1] == 1:
   1495             return score.ravel()

/home/jbourbeau/.local/lib/python2.7/site-packages/sklearn/ensemble/gradient_boosting.pyc in _decision_function(self, X)
   1122         # for use in inner loop, not raveling the output in single-class case,
   1123         # not doing input validation.
-> 1124         score = self._init_decision_function(X)
   1125         predict_stages(self.estimators_, X, self.learning_rate, score)
   1126         return score

/home/jbourbeau/.local/lib/python2.7/site-packages/sklearn/ensemble/gradient_boosting.pyc in _init_decision_function(self, X)
   1112         """Check input and compute prediction of ``init``. """
   1113         self._check_initialized()
-> 1114         X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True)
   1115         if X.shape[1] != self.n_features:
   1116             raise ValueError("X.shape[1] should be {0:d}, not {1:d}.".format(

/home/jbourbeau/.local/lib/python2.7/site-packages/sklearn/tree/tree.pyc in _validate_X_predict(self, X, check_input)
    401                              "match the input. Model n_features is %s and "
    402                              "input n_features is %s "
--> 403                              % (self.n_features_, n_features))
    404 
    405         return X

ValueError: Number of features of the model must match the input. Model n_features is 4 and input n_features is 2



In [15]:

    
# pipeline = get_pipeline('KN')
param_range = np.arange(1, 100, 1)
param_grid = {'classifier__n_neighbors': param_range}
gs = GridSearchCV(estimator=pipeline, 
                  param_grid=param_grid, 
                  scoring='accuracy', 
                  cv=10,
                  n_jobs=10)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)









    



0.747054126975
{'classifier__n_neighbors': 83}



In [12]:

    
param_range









    Out[12]:





array([  1,   6,  11,  16,  21,  26,  31,  36,  41,  46,  51,  56,  61,
        66,  71,  76,  81,  86,  91,  96, 101, 106, 111, 116])



In [ ]: