In [1]:
%load_ext watermark
%watermark -u -d -v -p numpy,matplotlib,scipy,pandas,sklearn,mlxtend


last updated: 2017-04-28 

CPython 2.7.10
IPython 5.3.0

numpy 1.12.1
matplotlib 2.0.0
scipy 0.19.0
pandas 0.19.2
sklearn 0.18.1
mlxtend 0.6.0

Model parameter tuning

Table of contents

  1. Validation curves
  2. KS-test tuning

In [2]:
from __future__ import division, print_function
from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score

import comptools as comp
import comptools.analysis.plotting as plotting

color_dict = comp.analysis.get_color_dict()

%matplotlib inline


/home/jbourbeau/.local/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

Define analysis free parameters

[ back to top ]


In [3]:
comp_class = True
comp_list = ['light', 'heavy'] if comp_class else ['P', 'He', 'O', 'Fe']

In [4]:
pipeline_str = 'xgboost'
# pipeline_str = 'GBDT'
pipeline = comp.analysis.get_pipeline(pipeline_str)

In [5]:
scoring = 'accuracy'
cv = 10

Data preprocessing

  1. Load simulation dataframe and apply specified quality cuts
  2. Extract desired features from dataframe
  3. Get separate testing and training datasets (a rough sketch of these steps is given below)

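For orientation, here is a rough, hypothetical sketch of what those three steps amount to when written out directly with pandas and scikit-learn. The analysis itself delegates this to comp.preprocess_sim; names such as quality_cut_mask and feature_columns below are placeholders, not part of the comptools API.

from sklearn.model_selection import train_test_split

def preprocess_sketch(df, feature_columns, target_column, quality_cut_mask, test_size=0.3):
    # 1. Apply the specified quality cuts to the simulation dataframe
    df = df[quality_cut_mask]
    # 2. Extract the desired features (and target) from the dataframe
    X = df[feature_columns].values
    y = df[target_column].values
    # 3. Split into separate training and testing datasets
    return train_test_split(X, y, test_size=test_size, random_state=2)
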
In [6]:
sim_train, sim_test = comp.preprocess_sim(comp_class=comp_class, return_energy=True)



TypeError                                 Traceback (most recent call last)
<ipython-input-6-1421c8e2455f> in <module>()
----> 1 sim_train, sim_test = comp.preprocess_sim(comp_class=comp_class, return_energy=True)

TypeError: preprocess_sim() got an unexpected keyword argument 'comp_class'

In [7]:
pipeline.fit(sim_train.X, sim_train.y)


Out[7]:
Pipeline(steps=[('classifier', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=2,
              subsample=1.0, verbose=0, warm_start=False))])

In [8]:
# Compute test-set accuracy at each boosting iteration (staged predictions)
n_estimators = pipeline.named_steps['classifier'].get_params()['n_estimators']
test_score = np.zeros(n_estimators, dtype=np.float64)

for i, y_pred in enumerate(pipeline.named_steps['classifier'].staged_predict(sim_test.X)):
    test_score[i] = accuracy_score(sim_test.y, y_pred)

In [9]:
# Compute training-set accuracy at each boosting iteration (staged predictions)
n_estimators = pipeline.named_steps['classifier'].get_params()['n_estimators']
train_score = np.zeros(n_estimators, dtype=np.float64)

for i, y_pred in enumerate(pipeline.named_steps['classifier'].staged_predict(sim_train.X)):
    train_score[i] = accuracy_score(sim_train.y, y_pred)

In [10]:
fig, ax = plt.subplots()
ax.plot(range(1, n_estimators+1), train_score, ls='-', lw=2, marker='None', label='Training')
ax.plot(range(1, n_estimators+1), test_score, ls='-', lw=2, marker='None', label='Test')
ax.set_ylabel('Classification accuracy')
ax.set_xlabel('Boosting iteration')
ax.grid()
ax.legend()
plt.savefig('/home/jbourbeau/public_html/figures/accuracy-vs-boosting-iteration.png')
plt.show()


/home/jbourbeau/.local/lib/python2.7/site-packages/matplotlib/figure.py:1742: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "

Parameter tuning

[ back to top ]


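The validation-curve results plotted in this section are read from CSV files in data/ that were produced outside this notebook. As a point of reference, the sketch below shows one way such a file could be generated with the columns the plotting functions expect (param_value, train_mean_total, train_std_total, validation_mean_total, validation_std_total); it is an assumption about that workflow, not the script that actually produced the files. The per-composition columns (e.g. train_mean_light) and the KS columns would be filled in the same loop by restricting to events of each true composition.

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

def make_validation_df(pipeline, X, y, param_name, param_values, cv=10):
    # Assumes X and y are NumPy arrays and that the tuned estimator is the
    # 'classifier' step of the pipeline (as in Out[7] above)
    rows = []
    for value in param_values:
        clf = clone(pipeline).set_params(**{'classifier__' + param_name: value})
        train_scores, val_scores = [], []
        for train_idx, val_idx in KFold(n_splits=cv, shuffle=True, random_state=2).split(X):
            clf.fit(X[train_idx], y[train_idx])
            train_scores.append(accuracy_score(y[train_idx], clf.predict(X[train_idx])))
            val_scores.append(accuracy_score(y[val_idx], clf.predict(X[val_idx])))
        rows.append({'param_value': value,
                     'train_mean_total': np.mean(train_scores),
                     'train_std_total': np.std(train_scores),
                     'validation_mean_total': np.mean(val_scores),
                     'validation_std_total': np.std(val_scores)})
    return pd.DataFrame(rows)

# e.g. make_validation_df(pipeline, sim_train.X, sim_train.y,
#                         'max_depth', [2, 3, 4, 5, 6], cv=cv)
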
In [6]:
def plot_validation_curve(df, xlabel, key='total', ylabel='Classification accuracy', ylim=None, outfile=None):
   
    assert key in ['light', 'heavy', 'total']
    
    plt.plot(df['param_value'], df['train_mean_{}'.format(key)],
             color='C0', linestyle='-',
             marker='o', markersize=5,
             label='training')

    plt.fill_between(df['param_value'],
                     df['train_mean_{}'.format(key)] + df['train_std_{}'.format(key)],
                     df['train_mean_{}'.format(key)] - df['train_std_{}'.format(key)],
                     alpha=0.15, color='C0')

    plt.plot(df['param_value'], df['validation_mean_{}'.format(key)], 
             color='C1', linestyle='-', 
             marker='^', markersize=5, 
             label='validation')

    plt.fill_between(df['param_value'],
                     df['validation_mean_{}'.format(key)] + df['validation_std_{}'.format(key)],
                     df['validation_mean_{}'.format(key)] - df['validation_std_{}'.format(key)],
                     alpha=0.15, color='C1')

    plt.grid()
    plt.legend()
    plt.xlim([df['param_value'].min(), df['param_value'].max()])
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    if ylim: plt.ylim(ylim)
    if outfile: plt.savefig(outfile)

In [7]:
def plot_validation_curve_comp(df, xlabel, ylabel='Classification accuracy', ylim=None, outfile=None):
   
    for key in ['light', 'heavy']:
#     for key in ['light', 'heavy', 'total']:
        plt.plot(df['param_value'], df['train_mean_{}'.format(key)],
                 color=color_dict[key], linestyle='-',
                 marker='o', markersize=5,
                 label='{} training set'.format(key))

        plt.fill_between(df['param_value'],
                         df['train_mean_{}'.format(key)] + df['train_std_{}'.format(key)],
                         df['train_mean_{}'.format(key)] - df['train_std_{}'.format(key)],
                         alpha=0.15, color=color_dict[key])

        plt.plot(df['param_value'], df['validation_mean_{}'.format(key)], 
                 color=color_dict[key], linestyle=':', 
                 marker='^', markersize=5, 
                 label='{} validation set'.format(key))

        plt.fill_between(df['param_value'],
                         df['validation_mean_{}'.format(key)] + df['validation_std_{}'.format(key)],
                         df['validation_mean_{}'.format(key)] - df['validation_std_{}'.format(key)],
                         alpha=0.15, color=color_dict[key])

    plt.grid()
    plt.legend()
    plt.xlim([df['param_value'].min(), df['param_value'].max()])
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    if ylim: plt.ylim(ylim)
    if outfile: plt.savefig(outfile)

In [8]:
def plot_ks_curve_comp(df, xlabel, ylabel='KS p-value', ylim=None, outfile=None):
   
    for key in ['light', 'heavy']:
        plt.plot(df['param_value'], df['ks_mean_{}'.format(key)],
                 color=color_dict[key], linestyle=':',
                 marker='o', markersize=5,
                 label=key)
        
        # Clip the shaded band to the valid p-value range [0, 1]
        fill_lower = df['ks_mean_{}'.format(key)] - df['ks_std_{}'.format(key)]
        fill_lower[fill_lower < 0] = 0
        fill_upper = df['ks_mean_{}'.format(key)] + df['ks_std_{}'.format(key)]
        fill_upper[fill_upper > 1] = 1
        plt.fill_between(df['param_value'],
                         fill_upper,
                         fill_lower,
                         alpha=0.15, color=color_dict[key])

    plt.grid()
    plt.legend()
    plt.xlim([df['param_value'].min(), df['param_value'].max()])
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    if ylim: plt.ylim(ylim)
    if outfile: plt.savefig(outfile)

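For reference on the ks_mean_* / ks_std_* columns used above: this kind of overtraining check typically compares, for each composition, the distribution of classifier scores on the training sample with that on the validation sample using a two-sample Kolmogorov-Smirnov test, where a p-value near 1 means the two distributions are consistent. A minimal sketch, assuming the fitted pipeline exposes predict_proba and that label matches the encoding used in sim_train.y:

from scipy.stats import ks_2samp

def ks_pvalue(clf, X_train, y_train, X_val, y_val, label):
    # Classifier scores for events of one composition in each sample
    # (label is whatever encoding sim_train.y uses for 'light' or 'heavy')
    train_scores = clf.predict_proba(X_train[y_train == label])[:, 1]
    val_scores = clf.predict_proba(X_val[y_val == label])[:, 1]
    # ks_2samp returns (statistic, p-value); keep the p-value
    return ks_2samp(train_scores, val_scores)[1]
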
Maximum depth


In [10]:
df_max_depth = pd.read_csv('data/validation-{}-max_depth-{}-cv{}.csv'.format(pipeline_str, scoring, cv),
                           index_col=0)
df_max_depth.sort_values('param_value', inplace=True)

In [11]:
outfile = '/home/jbourbeau/public_html/figures/{}-validation_curve_max_depth.png'.format(pipeline_str)
plot_validation_curve(df_max_depth, xlabel='Maximum depth', outfile=outfile)



In [12]:
outfile = '/home/jbourbeau/public_html/figures/{}-validation_curve_max_depth_comp.png'.format(pipeline_str)
plot_validation_curve_comp(df_max_depth, xlabel='Maximum depth', outfile=outfile)



In [13]:
outfile = '/home/jbourbeau/public_html/figures/{}-ks_curve_max_depth_comp.png'.format(pipeline_str)
plot_ks_curve_comp(df_max_depth, xlabel='Maximum depth', outfile=outfile)


Learning rate


In [14]:
df_learning_rate = pd.read_csv('data/validation-{}-learning_rate-{}-cv{}.csv'.format(pipeline_str, scoring, cv),
                               index_col=0)
df_learning_rate.sort_values('param_value', inplace=True)

In [15]:
outfile = '/home/jbourbeau/public_html/figures/{}-validation_curve_learning_rate.png'.format(pipeline_str)
plot_validation_curve(df_learning_rate, xlabel='Learning rate', ylim=[0.7, 0.8], outfile=outfile)



In [16]:
outfile = '/home/jbourbeau/public_html/figures/{}-validation_curve_learning_rate_comp.png'.format(pipeline_str)
plot_validation_curve_comp(df_learning_rate, xlabel='Learning rate', outfile=outfile)



In [17]:
outfile = '/home/jbourbeau/public_html/figures/{}-ks_curve_learning_rate.png'.format(pipeline_str)
plot_ks_curve_comp(df_learning_rate, xlabel='Learning rate', outfile=outfile)


Number of estimators


In [9]:
df_n_estimators = pd.read_csv('data/validation-{}-n_estimators-{}-cv{}.csv'.format(pipeline_str, scoring, cv),
                              index_col=0)
df_n_estimators.sort_values('param_value', inplace=True)

In [10]:
outfile = '/home/jbourbeau/public_html/figures/{}-validation_curve_n_estimators.png'.format(pipeline_str)
plot_validation_curve(df_n_estimators, xlabel='Number of estimators', outfile=outfile)



In [11]:
outfile = '/home/jbourbeau/public_html/figures/{}-validation_curve_n_estimators_comp.png'.format(pipeline_str)
plot_validation_curve_comp(df_n_estimators, xlabel='Number of estimators', outfile=outfile)



In [12]:
outfile = '/home/jbourbeau/public_html/figures/{}-ks_curve_n_estimators_comp.png'.format(pipeline_str)
plot_ks_curve_comp(df_n_estimators, xlabel='Number of estimators', outfile=outfile)


Minimum number of samples to split


In [22]:
df_min_samples_split = pd.read_csv('data/validation-{}-min_samples_split-cv10.csv'.format(pipeline_str), index_col=0)
df_min_samples_split.sort_values('param_value', inplace=True)



IOError                                   Traceback (most recent call last)
<ipython-input-22-a2a982bb0e34> in <module>()
----> 1 df_min_samples_split = pd.read_csv('data/validation-{}-min_samples_split-cv10.csv'.format(pipeline_str), index_col=0)
      2 df_min_samples_split.sort_values('param_value', inplace=True)

/home/jbourbeau/.local/lib/python2.7/site-packages/pandas/io/parsers.pyc in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    644                     skip_blank_lines=skip_blank_lines)
    645 
--> 646         return _read(filepath_or_buffer, kwds)
    647 
    648     parser_f.__name__ = name

/home/jbourbeau/.local/lib/python2.7/site-packages/pandas/io/parsers.pyc in _read(filepath_or_buffer, kwds)
    387 
    388     # Create the parser.
--> 389     parser = TextFileReader(filepath_or_buffer, **kwds)
    390 
    391     if (nrows is not None) and (chunksize is not None):

/home/jbourbeau/.local/lib/python2.7/site-packages/pandas/io/parsers.pyc in __init__(self, f, engine, **kwds)
    728             self.options['has_index_names'] = kwds['has_index_names']
    729 
--> 730         self._make_engine(self.engine)
    731 
    732     def close(self):

/home/jbourbeau/.local/lib/python2.7/site-packages/pandas/io/parsers.pyc in _make_engine(self, engine)
    921     def _make_engine(self, engine='c'):
    922         if engine == 'c':
--> 923             self._engine = CParserWrapper(self.f, **self.options)
    924         else:
    925             if engine == 'python':

/home/jbourbeau/.local/lib/python2.7/site-packages/pandas/io/parsers.pyc in __init__(self, src, **kwds)
   1388         kwds['allow_leading_cols'] = self.index_col is not False
   1389 
-> 1390         self._reader = _parser.TextReader(src, **kwds)
   1391 
   1392         # XXX

pandas/parser.pyx in pandas.parser.TextReader.__cinit__ (pandas/parser.c:4184)()

pandas/parser.pyx in pandas.parser.TextReader._setup_parser_source (pandas/parser.c:8449)()

IOError: File data/validation-GBDT-min_samples_split-cv10.csv does not exist

In [23]:
outfile = '/home/jbourbeau/public_html/figures/{}-validation_curve_min_samples_split.png'.format(pipeline_str)
plot_validation_curve(df_min_samples_split, xlabel='Minimum samples to split', outfile=outfile)



NameError                                 Traceback (most recent call last)
<ipython-input-23-b806614bd861> in <module>()
      1 outfile = '/home/jbourbeau/public_html/figures/{}-validation_curve_min_samples_split.png'.format(pipeline_str)
----> 2 plot_validation_curve(df_min_samples_split, xlabel='Minimum samples to split', outfile=outfile)

NameError: name 'df_min_samples_split' is not defined

Minimum number of samples in leaf


In [24]:
df_min_samples_leaf = pd.read_csv('data/validation-{}-min_samples_leaf-{}-cv{}.csv'.format(pipeline_str, scoring, cv), index_col=0)
df_min_samples_leaf.sort_values('param_value', inplace=True)

In [25]:
outfile = '/home/jbourbeau/public_html/figures/{}-validation_curve_min_samples_leaf.png'.format(pipeline_str)
plot_validation_curve(df_min_samples_leaf, xlabel='Minimum samples in leaf', ylim=[0.7, 0.8], outfile=outfile)



In [26]:
outfile = '/home/jbourbeau/public_html/figures/{}-validation_curve_min_samples_leaf_comp.png'.format(pipeline_str)
plot_validation_curve_comp(df_min_samples_leaf, xlabel='Minimum samples in leaf', ylim=[0.7, 0.8], outfile=outfile)



In [27]:
outfile = '/home/jbourbeau/public_html/figures/{}-ks_curve_min_samples_leaf.png'.format(pipeline_str)
plot_ks_curve_comp(df_min_samples_leaf, xlabel='Minimum samples in leaf', outfile=outfile)


