Author: James Bourbeau
In [5]:
# Record date, Python version, and key package versions for reproducibility
# (watermark IPython extension).
%load_ext watermark
%watermark -u -d -v -p numpy,scipy,pandas,sklearn,mlxtend
In [6]:
# Make the cr-composition project importable as `composition`.
# NOTE(review): hardcoded absolute path — breaks on any other machine;
# TODO prefer an installed package or a configurable environment variable.
import sys
sys.path.append('/home/jbourbeau/cr-composition')
print('Added to PYTHONPATH')
In [7]:
from __future__ import division, print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cPickle
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import composition as comp
import composition.analysis.plotting as plotting
# # Plotting-related
# sns.set_palette('muted')
# sns.set_color_codes()
# color_dict = {}
# for i, composition in enumerate(['light', 'heavy', 'total']):
# color_dict[composition] = sns.color_palette('muted').as_hex()[i]
%matplotlib inline
[ back to top ]
In [8]:
# Load the IC79 simulation DataFrame and split it into train/test sets
# over the configured training features (binary light/heavy classes).
df_sim = comp.load_dataframe(datatype='sim', config='IC79')
feature_list, feature_labels = comp.get_training_features()
print('training features = {}'.format(feature_list))
X_train_sim, X_test_sim, y_train_sim, y_test_sim, le = comp.get_train_test_sets(
    df_sim, feature_list, comp_class=True)
# Use str.format consistently (was mixed with '+ str(...)' concatenation).
print('number training events = {}'.format(y_train_sim.shape[0]))
print('number testing events = {}'.format(y_test_sim.shape[0]))
[ back to top ]
In [9]:
# Build the XGBoost classification pipeline and record the feature-count
# bounds used by the forward/backward selectors in the cells below.
pipeline = comp.get_pipeline('xgboost')
# pipeline = comp.get_pipeline('RF')
print('max_depth = {}'.format(pipeline.get_params()['classifier__max_depth']))
# Forward selection can grow up to all available features; backward
# elimination can shrink down to a single feature.
n_features_forward = X_train_sim.shape[1]
n_features_backward = 1
In [10]:
# Sequential Forward Selection: add one feature at a time, keeping the
# subset with the best 3-fold CV accuracy, up to k_features features.
forward_opts = dict(
    k_features=2,
    forward=True,
    floating=False,
    scoring='accuracy',
    print_progress=True,
    cv=3,
    n_jobs=15,
)
sfs = SFS(pipeline, **forward_opts).fit(X_train_sim, y_train_sim)
In [ ]:
# Sequential Backward Selection: start from the full feature set and
# remove one feature at a time down to n_features_backward.
backward_opts = dict(
    k_features=n_features_backward,
    forward=False,
    floating=False,
    scoring='accuracy',
    print_progress=True,
    cv=3,
    n_jobs=20,
)
sbs = SFS(pipeline, **backward_opts).fit(X_train_sim, y_train_sim)
In [ ]:
# Sequential Floating Forward Selection: like SFS, but with conditional
# exclusion of previously added features at each step.
# NOTE(review): `sffs` is re-defined later in the notebook (In [23]);
# the later fit shadows this one on a full re-run.
floating_forward_opts = dict(
    k_features=n_features_forward,
    forward=True,
    floating=True,
    scoring='accuracy',
    print_progress=True,
    cv=3,
    n_jobs=15,
)
sffs = SFS(pipeline, **floating_forward_opts).fit(X_train_sim, y_train_sim)
In [10]:
# Sequential Floating Backward Selection: backward elimination with
# conditional re-inclusion of previously removed features.
floating_backward_opts = dict(
    k_features=n_features_backward,
    forward=False,
    floating=True,
    scoring='accuracy',
    print_progress=True,
    cv=3,
    n_jobs=20,
)
sfbs = SFS(pipeline, **floating_backward_opts).fit(X_train_sim, y_train_sim)
In [7]:
# Compare CV accuracy vs. feature-subset size for each selection algorithm.
selector_list = [sfs]
# selector_list = [sfs, sffs, sbs, sfbs]
selector_label = {sfs: 'SFS'}
# selector_label = {sfs: 'SFS', sffs: 'SFFS', sbs: 'SBS', sfbs: 'SFBS'}
fig, ax = plt.subplots()
for selector in selector_list:
    info = selector.get_metric_dict()
    # Materialize the subset sizes once (sorted) so the x-values, scores,
    # and error bars are guaranteed to share one iteration order — the
    # original iterated info.keys() and the dict separately.
    sizes = sorted(info.keys())
    scores = [info[size]['avg_score'] for size in sizes]
    errs = [info[size]['ci_bound'] for size in sizes]
    ax.errorbar(sizes, scores, yerr=errs,
                marker='.', linestyle='-', lw=1, alpha=0.75,
                label=selector_label[selector])
# Use the Axes API consistently (was mixed fig/ax + pyplot state machine).
ax.set_xlabel('Feature subset size')
ax.set_ylabel('Accuracy')  # fixed typo: was 'Accurary'
ax.set_title('XGBoost')
# ax.set_title('Max depth = {}'.format(pipeline.get_params()['classifier__max_depth']))
ax.grid()
ax.legend(title='Selection algorithm', loc='lower right')
plt.show()
In [11]:
# Print the 4-feature subset chosen by each selector.
# assumes feature_labels is a numpy array (fancy-indexed below) — TODO confirm
for selector in selector_list:
    metric_dict = selector.get_metric_dict()
    subset_idx = metric_dict[4]['feature_idx']
    print(subset_idx)
    print(selector_label[selector] + ':')
    print(np.sort(feature_labels[list(subset_idx)]))
In [22]:
# Inspect the per-subset-size CV metrics (avg_score, ci_bound, feature_idx, ...).
sfs.get_metric_dict()
Out[22]:
In [23]:
# Raw record of every feature subset evaluated during the selection run.
sfs.subsets_
Out[23]:
In [7]:
# Summarize the final SFS result: chosen feature labels and CV score.
print('\nSequential Forward Selection:')
# Output is byte-identical to the original loop (including the trailing ', ').
selected_features_str = 'Selected features:\n' + ''.join(
    '{}, '.format(feature_labels[idx]) for idx in sfs.k_feature_idx_)
print(selected_features_str)
print('CV Score:')
print(sfs.k_score_)
[ back to top ]
In [35]:
# Rebuild the pipeline for the serialization runs below.
pipeline = comp.get_pipeline('xgboost')
# pipeline = comp.get_pipeline('RF')
max_depth = pipeline.get_params()['classifier__max_depth']  # NOTE(review): not used below — kept for reference
n_features_forward = 7
In [36]:
# Fit a forward selector up to n_features_forward features, intended for
# serialization in the next cell.
pickle_forward_opts = dict(
    k_features=n_features_forward,
    forward=True,
    floating=False,
    scoring='accuracy',
    print_progress=True,
    cv=3,
    n_jobs=15,
)
sfs_pickle = SFS(pipeline, **pickle_forward_opts).fit(X_train_sim, y_train_sim)
Serialize sfs
for later use (fitting procedure takes quite a while)
In [37]:
# Persist the fitted forward selector so the slow fit need not be repeated.
with open('feature-selection/sfs_nfeatures_{}_xgboost.pkl'.format(n_features_forward), 'wb') as f_obj:
    cPickle.dump(sfs_pickle, f_obj, protocol=cPickle.HIGHEST_PROTOCOL)
In [ ]:
# Fit a backward selector (down to 5 features), intended for
# serialization in the next cell.
pickle_backward_opts = dict(
    k_features=5,
    forward=False,
    floating=False,
    scoring='accuracy',
    print_progress=True,
    cv=3,
    n_jobs=15,
)
sbs_pickle = SFS(pipeline, **pickle_backward_opts).fit(X_train_sim, y_train_sim)
Serialize sbs
for later use (fitting procedure takes quite a while)
In [37]:
# BUG FIX: this cell previously re-pickled sfs_pickle (the forward
# selector) under the same forward-selection filename, clobbering the
# file written earlier — and the fitted sbs_pickle was never saved.
# Serialize the backward selector under its own name (k_features=5
# matches the fit in the previous cell).
with open('feature-selection/sbs_nfeatures_5_xgboost.pkl', 'wb') as f_obj:
    cPickle.dump(sbs_pickle, f_obj, protocol=cPickle.HIGHEST_PROTOCOL)
In [23]:
# Floating forward selection up to n_features_forward, for serialization
# in the next cell.
# NOTE(review): this re-fits and shadows the `sffs` defined earlier in
# the notebook.
refit_floating_opts = dict(
    k_features=n_features_forward,
    forward=True,
    floating=True,
    scoring='accuracy',
    print_progress=True,
    cv=3,
    n_jobs=20,
)
sffs = SFS(pipeline, **refit_floating_opts).fit(X_train_sim, y_train_sim)
Serialize sffs
for later use (fitting procedure takes quite a while)
In [24]:
# Persist the fitted floating-forward selector for later reuse.
with open('feature-selection/sffs_nfeatures_{}.pkl'.format(n_features_forward), 'wb') as f_obj:
    cPickle.dump(sffs, f_obj, protocol=cPickle.HIGHEST_PROTOCOL)
In [ ]: