<span xmlns:dct="http://purl.org/dc/terms/" property="dct:title">The code and ideas in this notebook,</span> by <span xmlns:cc="http://creativecommons.org/ns#" property="cc:attributionName">Matteo Niccoli and Mark Dahl,</span> are licensed under a Creative Commons Attribution 4.0 International License.
The mlxtend library used for the sequential feature selection is by Sebastian Raschka.
In [28]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import f1_score, accuracy_score, make_scorer
In [29]:
# Read the engineered-feature training table from the working directory.
filename = 'train_engineered_features.csv'
train_data = pd.read_csv(filename)

# Cell output: summary statistics of the loaded table.
train_data.describe()
Out[29]:
In [30]:
# Store the two label-like columns as pandas categoricals.
for col in ('Well Name', 'Formation'):
    train_data[col] = train_data[col].astype('category')

# Cell output: the distinct well names present in the training data.
train_data['Well Name'].unique()
Out[30]:
In [31]:
# Target vector: the 'Facies' column as a NumPy array.
y = train_data['Facies'].values

# Parenthesised single-argument print produces the same output under
# Python 2 and Python 3 (the bare print statement is a SyntaxError on 3).
print(y[25:40])
print(np.shape(y))
In [32]:
# Feature matrix: everything except the two categorical columns and the target.
X = train_data.drop(['Formation', 'Well Name','Facies'], axis=1)

# Parenthesised single-argument print works identically on Python 2 and 3.
print(np.shape(X))

# Cell output: summary statistics including the 5th and 95th percentiles.
X.describe(percentiles=[.05, .25, .50, .75, .95])
Out[32]:
In [33]:
# Fit a StandardScaler on the feature matrix, then rebind X to the scaled
# output. NOTE: X is overwritten, so re-running this cell alone re-fits the
# scaler on already-scaled data.
stdscaler = preprocessing.StandardScaler()
stdscaler.fit(X)
X = stdscaler.transform(X)
In [34]:
# Scorer object wrapping f1_score with micro averaging; passed as the
# `scoring` argument of the sequential feature selector below.
Fscorer = make_scorer(
    f1_score,
    average='micro',
)
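Sequential feature selection is performed with mlxtend's SequentialFeatureSelector; see the user guide: http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/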
In [35]:
from sklearn.ensemble import RandomForestClassifier
In [10]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# Sequential forward selection (forward=True, floating=False) driven by a
# random forest with a fixed seed. Subsets of up to k_features=100 features
# are scored with Fscorer using cv=8 cross-validation; n_jobs=-1 requests
# all available cores. This cell is expensive to run.
clf = RandomForestClassifier(random_state=49)
sfs = SFS(clf,
          k_features=100,
          forward=True,
          floating=False,
          scoring=Fscorer,
          cv=8,
          n_jobs=-1)

# Fit on the standardised feature matrix X and target y built in the cells
# above. The previous X_train / y_train names are not defined anywhere in
# this notebook and raised NameError on a fresh kernel.
sfs = sfs.fit(X, y)
In [11]:
# Persist the selector's per-step metrics so the expensive fit above does
# not have to be repeated; np.save stores the dict as a pickled object array.
sfs_metric_dict = sfs.get_metric_dict()
np.save('sfs_RF_metric_dict.npy', sfs_metric_dict)
In [36]:
# load previously saved dictionary
# The file holds a pickled 0-d object array wrapping the dict. np.load
# refuses to unpickle by default since NumPy 1.16.3, so allow_pickle=True is
# required; .item() then unwraps the dict.
# SECURITY: unpickling can execute arbitrary code -- only load a file that
# you produced yourself with the np.save cell above.
read_dictionary = np.load('sfs_RF_metric_dict.npy', allow_pickle=True).item()
In [37]:
# plot results
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
In [39]:
# Plot the cross-validated score against the number of selected features.
# The default figure size is configured BEFORE any figure is created, so the
# plot has the intended size on the first run (it used to be set after
# plotting, which is why the cell had to be run twice).
plt.rcParams["figure.figsize"] = [20, 18]

# NOTE(review): plot_sfs appears to create its own figure, in which case
# this one stays empty -- confirm against the installed mlxtend version.
fig = plt.figure()
ax = plot_sfs(read_dictionary, kind='std_err')

# Title matches the error bars actually drawn (kind='std_err').
plt.title('Sequential Forward Selection (w. StdErr)')
plt.grid()
plt.xticks( rotation='vertical')
# Re-apply the current tick locations and labels.
locs, labels = plt.xticks()
plt.xticks( locs, labels)
plt.show()
In [40]:
# Turn the metric dictionary into a table with one row per dictionary key,
# and copy the row labels into an explicit 'index' column so they survive a
# CSV export written without the index.
selected_summary = pd.DataFrame.from_dict(read_dictionary).transpose()
selected_summary = selected_summary.assign(index=selected_summary.index)

# Cell output: rows ordered from best to worst average score.
selected_summary.sort_values(by='avg_score', ascending=False)
Out[40]:
In [41]:
# Write the summary table to CSV. The DataFrame index is not written; the
# 'index' column carries the row labels instead.
selected_summary.to_csv(
    'SFS_RF_selected_features_summary.csv',
    sep=',',
    header=True,
    index=False,
)
In [44]:
# Reload the saved summary and restore the 'index' column as the row index.
filename = 'SFS_RF_selected_features_summary.csv'
selected_summary = pd.read_csv(filename).set_index(['index'])

# Cell output: the five rows with the highest average score.
selected_summary.sort_values(by='avg_score', ascending=False).head()
Out[44]:
In [45]:
# Feature-index subset stored in the row at 0-based position 44 of the
# summary table.
# NOTE(review): presumably the 45-feature step, chosen from the sorted table
# above -- confirm that rows are ordered by number of features.
selected_summary['feature_idx'].iloc[44]
Out[45]:
In [46]:
# Positional column indices of the 45 selected features, hard-coded here.
# They are applied with .iloc to the feature table in the cells below.
top45_feature_idx = [
    257, 3, 4, 6, 7, 8, 10, 12, 16, 273,
    146, 19, 26, 27, 284, 285, 30, 34, 163, 1,
    42, 179, 155, 181, 184, 58, 315, 190, 320, 193,
    194, 203, 290, 80, 210, 35, 84, 90, 97, 18,
    241, 372, 119, 120, 126,
]
slct = np.array(top45_feature_idx)
slct
Out[46]:
In [52]:
# Isolate the selected features of the TRAINING set and save them.
# Relies on `slct` (positional column indices) from the cell above.
filename = 'train_engineered_features.csv'
train_data = pd.read_csv(filename)

# Same column drop as used to build the feature matrix, so positions in
# `slct` refer to the same columns.
trainX = train_data.drop(['Formation', 'Well Name','Facies'], axis=1)
trainXs = trainX.iloc[:, slct]

# Prepend the identifying columns and the target to the selected features.
trainXs = pd.concat([train_data[['Depth', 'Well Name', 'Formation', 'Facies']], trainXs], axis = 1)

# Formatted single-argument print gives the same "shape columns" line on
# Python 2 and 3 (the two-argument print statement is a SyntaxError on 3).
print('{} {}'.format(np.shape(trainXs), list(trainXs)))
trainXs.to_csv('train_SFS_top45_engfeat.csv', sep=',', index=False)
In [53]:
# Isolate the selected features of the TEST set and save them.
# Relies on `slct` (positional column indices) from the cell above.
filename = 'test_engineered_features.csv'
test_data = pd.read_csv(filename)

# No 'Facies' column is dropped here.
# NOTE(review): positional selection assumes the remaining test columns are
# in the same order as the training feature columns -- TODO confirm.
testX = test_data.drop(['Formation', 'Well Name'], axis=1)
testXs = testX.iloc[:, slct]

# Prepend the identifying columns to the selected features.
testXs = pd.concat([test_data[['Depth', 'Well Name', 'Formation']], testXs], axis = 1)

# Formatted single-argument print gives the same "shape columns" line on
# Python 2 and 3 (the two-argument print statement is a SyntaxError on 3).
print('{} {}'.format(np.shape(testXs), list(testXs)))
testXs.to_csv('test_SFS_top45_engfeat.csv', sep=',', index=False)
In [54]:
# Feature-index subset stored in the row at 0-based position 74 of the
# summary table.
# NOTE(review): presumably the 75-feature step; whether this is the
# highest-scoring subset should be checked against the sorted table.
selected_summary['feature_idx'].iloc[74]
Out[54]:
In [55]:
# Positional column indices of the 75 selected features, hard-coded here.
# They are applied with .iloc to the feature table in the cells below.
top75_feature_idx = [
    257, 3, 4, 5, 6, 7, 8, 265, 10, 12,
    13, 16, 273, 18, 19, 26, 27, 284, 285, 30,
    34, 35, 1, 42, 304, 309, 313, 58, 315, 319,
    320, 75, 80, 338, 84, 341, 89, 90, 92, 97,
    101, 102, 110, 372, 119, 120, 122, 124, 126, 127,
    138, 139, 146, 155, 163, 165, 167, 171, 177, 179,
    180, 181, 184, 190, 193, 194, 198, 203, 290, 210,
    211, 225, 241, 249, 253,
]
slct = np.array(top75_feature_idx)
slct
Out[55]:
In [56]:
# Isolate the selected features of the TRAINING set and save them.
# Relies on `slct` (positional column indices) from the cell above.
filename = 'train_engineered_features.csv'
train_data = pd.read_csv(filename)

# Same column drop as used to build the feature matrix, so positions in
# `slct` refer to the same columns.
trainX = train_data.drop(['Formation', 'Well Name','Facies'], axis=1)
trainXs = trainX.iloc[:, slct]

# Prepend the identifying columns and the target to the selected features.
trainXs = pd.concat([train_data[['Depth', 'Well Name', 'Formation', 'Facies']], trainXs], axis = 1)

# Formatted single-argument print gives the same "shape columns" line on
# Python 2 and 3 (the two-argument print statement is a SyntaxError on 3).
print('{} {}'.format(np.shape(trainXs), list(trainXs)))
trainXs.to_csv('train_SFS_top75_engfeat.csv', sep=',', index=False)
In [57]:
# Isolate the selected features of the TEST set and save them.
# Relies on `slct` (positional column indices) from the cell above.
filename = 'test_engineered_features.csv'
test_data = pd.read_csv(filename)

# No 'Facies' column is dropped here.
# NOTE(review): positional selection assumes the remaining test columns are
# in the same order as the training feature columns -- TODO confirm.
testX = test_data.drop(['Formation', 'Well Name'], axis=1)
testXs = testX.iloc[:, slct]

# Prepend the identifying columns to the selected features.
testXs = pd.concat([test_data[['Depth', 'Well Name', 'Formation']], testXs], axis = 1)

# Formatted single-argument print gives the same "shape columns" line on
# Python 2 and 3 (the two-argument print statement is a SyntaxError on 3).
print('{} {}'.format(np.shape(testXs), list(testXs)))
testXs.to_csv('test_SFS_top75_engfeat.csv', sep=',', index=False)
In [ ]: