Author: James Bourbeau
In [5]:
    
# Record the date, Python version, and key package versions for reproducibility.
%load_ext watermark
%watermark -u -d -v -p numpy,scipy,pandas,sklearn,mlxtend
    
    
In [6]:
    
# Make the local `composition` project package importable from this notebook.
# NOTE(review): hardcoded absolute path — breaks on any other machine/user;
# consider installing the package (`pip install -e .`) or a relative path.
import sys
sys.path.append('/home/jbourbeau/cr-composition')
print('Added to PYTHONPATH')
    
    
In [7]:
    
# Imports (Python 2 kernel: cPickle and __future__ division/print_function).
from __future__ import division, print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cPickle
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import composition as comp
import composition.analysis.plotting as plotting
# # Plotting-related
# sns.set_palette('muted')
# sns.set_color_codes()
# color_dict = {}
# for i, composition in enumerate(['light', 'heavy', 'total']):
#     color_dict[composition] = sns.color_palette('muted').as_hex()[i]
%matplotlib inline
    
[ back to top ]
In [8]:
    
# Load the IC79 simulation DataFrame and split it into train/test sets.
# `comp_class=True` presumably collapses compositions into two classes
# (light/heavy) — TODO confirm against composition.get_train_test_sets.
df_sim = comp.load_dataframe(datatype='sim', config='IC79')
feature_list, feature_labels = comp.get_training_features()
print('training features = {}'.format(feature_list))
X_train_sim, X_test_sim, y_train_sim, y_test_sim, le = comp.get_train_test_sets(
    df_sim, feature_list, comp_class=True)
print('number training events = ' + str(y_train_sim.shape[0]))
print('number testing events = ' + str(y_test_sim.shape[0]))
    
    
[ back to top ]
In [9]:
    
# Build the XGBoost classification pipeline and set the search bounds:
# forward selection can grow up to all available features,
# backward selection can shrink down to a single feature.
pipeline = comp.get_pipeline('xgboost')
# pipeline = comp.get_pipeline('RF')
print('max_depth = {}'.format(pipeline.get_params()['classifier__max_depth']))
n_features_forward = X_train_sim.shape[1]
n_features_backward = 1
    
    
In [10]:
    
# Sequential Forward Selection: add one feature at a time, keeping the
# candidate that maximizes 3-fold CV accuracy, up to 2 features.
sfs_options = dict(
    k_features=2,
    forward=True,
    floating=False,
    scoring='accuracy',
    print_progress=True,
    cv=3,
    n_jobs=15,
)
sfs = SFS(pipeline, **sfs_options)
sfs = sfs.fit(X_train_sim, y_train_sim)
    
    
In [ ]:
    
# Sequential Backward Selection: start from the full feature set and
# remove one feature at a time (3-fold CV accuracy as the criterion)
# until only `n_features_backward` remain.
sbs_options = dict(
    k_features=n_features_backward,
    forward=False,
    floating=False,
    scoring='accuracy',
    print_progress=True,
    cv=3,
    n_jobs=20,
)
sbs = SFS(pipeline, **sbs_options)
sbs = sbs.fit(X_train_sim, y_train_sim)
    
    
In [ ]:
    
# Sequential Floating Forward Selection: like SFS, but after each
# inclusion it may conditionally drop previously-added features.
sffs_options = dict(
    k_features=n_features_forward,
    forward=True,
    floating=True,
    scoring='accuracy',
    print_progress=True,
    cv=3,
    n_jobs=15,
)
sffs = SFS(pipeline, **sffs_options)
sffs = sffs.fit(X_train_sim, y_train_sim)
    
In [10]:
    
# Sequential Floating Backward Selection: like SBS, but after each
# exclusion it may conditionally re-add previously-removed features.
sfbs_options = dict(
    k_features=n_features_backward,
    forward=False,
    floating=True,
    scoring='accuracy',
    print_progress=True,
    cv=3,
    n_jobs=20,
)
sfbs = SFS(pipeline, **sfbs_options)
sfbs = sfbs.fit(X_train_sim, y_train_sim)
    
    
In [7]:
    
# Plot cross-validated accuracy vs. feature-subset size for each
# fitted selection algorithm.
selector_list = [sfs]
# selector_list = [sfs, sffs, sbs, sfbs]
selector_label = {sfs: 'SFS'}
# selector_label = {sfs: 'SFS', sffs: 'SFFS', sbs: 'SBS', sfbs: 'SFBS'}
fig, ax = plt.subplots()
for selector in selector_list:
    info = selector.get_metric_dict()
    # Sort subset sizes explicitly: dict key order is arbitrary in
    # Python 2, so the original `info.keys()` x-values could zig-zag.
    sizes = sorted(info.keys())
    scores = [info[size]['avg_score'] for size in sizes]
    errs = [info[size]['ci_bound'] for size in sizes]
    ax.errorbar(sizes, scores, yerr=errs,
                marker='.', linestyle='-', lw=1, alpha=0.75,
                label=selector_label[selector])
ax.set_xlabel('Feature subset size')
ax.set_ylabel('Accuracy')  # fixed typo: was 'Accurary'
ax.set_title('XGBoost')
# ax.set_title('Max depth = {}'.format(pipeline.get_params()['classifier__max_depth']))
ax.grid()
ax.legend(title='Selection algorithm', loc='lower right')
plt.show()
    
    
    
In [11]:
    
# Print the feature indices and (sorted) feature labels chosen by each
# selector for a subset of size 4.
# NOTE(review): the subset size 4 is hardcoded — this KeyErrors unless the
# selector's search actually visited a 4-feature subset; confirm it matches
# the k_features range used when fitting.
for selector in selector_list:
    idx = selector.get_metric_dict()[4]['feature_idx']
    print(idx)
    print(selector_label[selector]+':')
    print(np.sort(feature_labels[list(idx)]))
    
    
In [22]:
    
sfs.get_metric_dict()
    
    Out[22]:
In [23]:
    
sfs.subsets_
    
    Out[23]:
In [7]:
    
# Report the feature names chosen by sequential forward selection and the
# cross-validated score of the final subset.
print('\nSequential Forward Selection:')
# Join with ', ' so there is no dangling separator (the original appended
# ', ' after every feature, including the last one).
selected_features_str = 'Selected features:\n' + ', '.join(
    '{}'.format(feature_labels[idx]) for idx in sfs.k_feature_idx_)
print(selected_features_str)
print('CV Score:')
print(sfs.k_score_)
    
    
[ back to top ]
In [35]:
    
# Rebuild the pipeline for the serialization runs below; forward selection
# here targets a fixed 7-feature subset.
# NOTE(review): `max_depth` is assigned but not used in the visible cells —
# possibly leftover from the commented-out title above.
pipeline = comp.get_pipeline('xgboost')
# pipeline = comp.get_pipeline('RF')
max_depth = pipeline.get_params()['classifier__max_depth']
n_features_forward = 7
    
In [36]:
    
# Fit a forward selector up to `n_features_forward` features; this fitted
# object is serialized in the next cell.
sfs_pickle_options = dict(
    k_features=n_features_forward,
    forward=True,
    floating=False,
    scoring='accuracy',
    print_progress=True,
    cv=3,
    n_jobs=15,
)
sfs_pickle = SFS(pipeline, **sfs_pickle_options)
sfs_pickle = sfs_pickle.fit(X_train_sim, y_train_sim)
    
    
Serialize sfs for later use (fitting procedure takes quite a while)
In [37]:
    
# Serialize the fitted forward-selection object (fitting takes a while).
with open('feature-selection/sfs_nfeatures_{}_xgboost.pkl'.format(n_features_forward), 'wb') as f_obj:
    cPickle.dump(sfs_pickle, f_obj, protocol=cPickle.HIGHEST_PROTOCOL)
    
In [ ]:
    
# Fit a backward selector down to 5 features; this fitted object is
# intended to be serialized in the next cell.
sbs_pickle_options = dict(
    k_features=5,
    forward=False,
    floating=False,
    scoring='accuracy',
    print_progress=True,
    cv=3,
    n_jobs=15,
)
sbs_pickle = SFS(pipeline, **sbs_pickle_options)
sbs_pickle = sbs_pickle.fit(X_train_sim, y_train_sim)
    
    
Serialize sbs for later use (fitting procedure takes quite a while)
In [37]:
    
# Bug fix: this cell previously re-dumped `sfs_pickle` to the same SFS
# filename as above (a copy-paste of the earlier cell), so the
# backward-selection result `sbs_pickle` was never serialized.
# Serialize `sbs_pickle` under its own filename instead; 5 matches the
# k_features used when fitting sbs_pickle.
with open('feature-selection/sbs_nfeatures_{}_xgboost.pkl'.format(5), 'wb') as f_obj:
    cPickle.dump(sbs_pickle, f_obj, protocol=cPickle.HIGHEST_PROTOCOL)
    
In [23]:
    
# Fit a floating forward selector up to `n_features_forward` features;
# this fitted object is serialized in the next cell.
# NOTE: this rebinds `sffs` from the earlier comparison section.
sffs_options = dict(
    k_features=n_features_forward,
    forward=True,
    floating=True,
    scoring='accuracy',
    print_progress=True,
    cv=3,
    n_jobs=20,
)
sffs = SFS(pipeline, **sffs_options)
sffs = sffs.fit(X_train_sim, y_train_sim)
    
    
Serialize sffs for later use (fitting procedure takes quite a while)
In [24]:
    
# Serialize the fitted floating-forward-selection object (fitting is slow).
with open('feature-selection/sffs_nfeatures_{}.pkl'.format(n_features_forward), 'wb') as f_obj:
    cPickle.dump(sffs, f_obj, protocol=cPickle.HIGHEST_PROTOCOL)
    
In [ ]: