Facies classification - Sequential Feature Selection


The code and ideas in this notebook, by Matteo Niccoli and Mark Dahl, are licensed under a Creative Commons Attribution 4.0 International License.

The mlxtend library, used here for the sequential feature selection, is by Sebastian Raschka.


In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.metrics import f1_score, accuracy_score, make_scorer

In [2]:
filename = 'engineered_features.csv'
training_data = pd.read_csv(filename)
training_data.describe()


Out[2]:
Facies Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS GR_d1 ... PE_GLCM_energy_asym_wsize=129 PE_GLCM_energy_asym_wsize=ave PE_GLCM_correlation_asym_wsize=3 PE_GLCM_correlation_asym_wsize=5 PE_GLCM_correlation_asym_wsize=9 PE_GLCM_correlation_asym_wsize=17 PE_GLCM_correlation_asym_wsize=33 PE_GLCM_correlation_asym_wsize=65 PE_GLCM_correlation_asym_wsize=129 PE_GLCM_correlation_asym_wsize=ave
count 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 ... 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000
mean 4.503254 2906.867438 64.933985 0.659566 4.402484 13.201066 3.725014 1.518438 0.521852 -0.000250 ... 0.362744 0.508851 0.759942 0.603343 0.719862 0.824668 0.882453 0.914078 0.929500 0.804835
std 2.474324 133.300164 30.302530 0.252703 5.274947 7.132846 0.790917 0.499720 0.286644 0.322450 ... 0.349133 0.264859 0.650069 0.480311 0.288003 0.176777 0.120571 0.081636 0.055219 0.190367
min 1.000000 2573.500000 10.149000 -0.025949 -21.832000 0.550000 0.200000 1.000000 0.000000 -4.054936 ... 0.096319 0.316176 -1.000000 -1.000000 -0.585463 -0.374228 0.006510 0.172521 0.535185 0.057670
25% 2.000000 2821.500000 44.730000 0.498000 1.600000 8.500000 3.200000 1.000000 0.277000 -0.094888 ... 0.126938 0.335702 1.000000 0.333333 0.563576 0.744050 0.839252 0.886120 0.900920 0.716265
50% 4.000000 2932.500000 64.990000 0.639000 4.300000 12.020000 3.725014 2.000000 0.528000 0.009984 ... 0.155859 0.366067 1.000000 0.812079 0.812063 0.865431 0.904105 0.922402 0.930043 0.856678
75% 6.000000 3007.000000 79.438000 0.822000 7.500000 16.050000 4.000000 2.000000 0.769000 0.103486 ... 0.459545 0.517911 1.000000 1.000000 0.952660 0.957511 0.966665 0.967000 0.962272 0.946690
max 9.000000 3138.000000 361.150000 1.800000 19.312000 84.400000 8.094000 2.000000 1.000000 3.835455 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 379 columns


In [3]:
training_data['Well Name'] = training_data['Well Name'].astype('category')
training_data['Formation'] = training_data['Formation'].astype('category')
training_data['Well Name'].unique()


Out[3]:
[SHRIMPLIN, ALEXANDER D, SHANKLE, LUKE G U, KIMZEY A, CROSS H CATTLE, NOLAN, Recruit F9, NEWBY, CHURCHMAN BIBLE]
Categories (10, object): [SHRIMPLIN, ALEXANDER D, SHANKLE, LUKE G U, ..., NOLAN, Recruit F9, NEWBY, CHURCHMAN BIBLE]

In [4]:
y = training_data['Facies'].values
print(y[25:40])
print(np.shape(y))


[3 3 2 2 2 2 2 2 3 3 3 3 3 3 3]
(4149,)

In [5]:
X = training_data.drop(['Formation', 'Well Name','Facies'], axis=1)
print(np.shape(X))
X.describe(percentiles=[.05, .25, .50, .75, .95])


(4149, 378)
Out[5]:
Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS GR_d1 GR_d2 ... PE_GLCM_energy_asym_wsize=129 PE_GLCM_energy_asym_wsize=ave PE_GLCM_correlation_asym_wsize=3 PE_GLCM_correlation_asym_wsize=5 PE_GLCM_correlation_asym_wsize=9 PE_GLCM_correlation_asym_wsize=17 PE_GLCM_correlation_asym_wsize=33 PE_GLCM_correlation_asym_wsize=65 PE_GLCM_correlation_asym_wsize=129 PE_GLCM_correlation_asym_wsize=ave
count 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 ... 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000
mean 2906.867438 64.933985 0.659566 4.402484 13.201066 3.725014 1.518438 0.521852 -0.000250 0.001948 ... 0.362744 0.508851 0.759942 0.603343 0.719862 0.824668 0.882453 0.914078 0.929500 0.804835
std 133.300164 30.302530 0.252703 5.274947 7.132846 0.790917 0.499720 0.286644 0.322450 0.204689 ... 0.349133 0.264859 0.650069 0.480311 0.288003 0.176777 0.120571 0.081636 0.055219 0.190367
min 2573.500000 10.149000 -0.025949 -21.832000 0.550000 0.200000 1.000000 0.000000 -4.054936 -2.996563 ... 0.096319 0.316176 -1.000000 -1.000000 -0.585463 -0.374228 0.006510 0.172521 0.535185 0.057670
5% 2644.200000 22.313200 0.255012 -4.000000 4.927800 2.600000 1.000000 0.070000 -0.445132 -0.246826 ... 0.105974 0.321458 -1.000000 -0.333333 0.132645 0.480292 0.643018 0.782681 0.841858 0.398231
25% 2821.500000 44.730000 0.498000 1.600000 8.500000 3.200000 1.000000 0.277000 -0.094888 -0.054821 ... 0.126938 0.335702 1.000000 0.333333 0.563576 0.744050 0.839252 0.886120 0.900920 0.716265
50% 2932.500000 64.990000 0.639000 4.300000 12.020000 3.725014 2.000000 0.528000 0.009984 0.005999 ... 0.155859 0.366067 1.000000 0.812079 0.812063 0.865431 0.904105 0.922402 0.930043 0.856678
75% 3007.000000 79.438000 0.822000 7.500000 16.050000 4.000000 2.000000 0.769000 0.103486 0.065753 ... 0.459545 0.517911 1.000000 1.000000 0.952660 0.957511 0.966665 0.967000 0.962272 0.946690
95% 3094.000000 105.841000 1.079600 12.800000 25.380000 5.200000 2.000000 0.964000 0.374634 0.232004 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
max 3138.000000 361.150000 1.800000 19.312000 84.400000 8.094000 2.000000 1.000000 3.835455 1.835877 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

10 rows × 378 columns


In [6]:
# standardize all features to zero mean and unit variance
scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)

Make the performance scorer


In [7]:
Fscorer = make_scorer(f1_score, average = 'micro')
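With micro averaging, F1 is computed from the pooled true/false positive counts over all classes, which for a single-label multiclass problem like this one reduces to plain accuracy. A quick sanity check with made-up labels (illustrative only, not from the dataset):

y_true_demo = np.array([1, 2, 2, 3, 3, 3])   # hypothetical facies labels
y_pred_demo = np.array([1, 2, 3, 3, 3, 1])   # hypothetical predictions
print(f1_score(y_true_demo, y_pred_demo, average='micro'))  # 0.666...
print(accuracy_score(y_true_demo, y_pred_demo))             # 0.666...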

In [8]:
from sklearn.ensemble import RandomForestClassifier

The next cell will take many hours to run; skip it and restart from the saved results further down.


In [ ]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
clf = RandomForestClassifier(random_state=49)

sfs = SFS(clf,
          k_features=100,    # grow the subset up to 100 features
          forward=True,      # sequential forward selection
          floating=False,    # plain SFS, no conditional exclusion step
          scoring=Fscorer,
          cv=8,
          n_jobs=-1)

sfs = sfs.fit(X, y)
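For readers who only want to verify that the pipeline runs, a scaled-down sketch completes in minutes rather than hours (the smaller forest, 3-fold CV, 5 target features, and 20-column slice are illustrative choices, not the settings used above):

clf_small = RandomForestClassifier(n_estimators=20, random_state=49)
sfs_small = SFS(clf_small,
                k_features=5,            # stop after 5 features instead of 100
                forward=True,
                floating=False,
                scoring=Fscorer,
                cv=3,                    # 3 folds instead of 8
                n_jobs=-1)
sfs_small = sfs_small.fit(X[:, :20], y)  # search only the first 20 columns
print(sfs_small.k_feature_idx_, sfs_small.k_score_)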

In [ ]:
# save the metric dictionary so the long run need not be repeated
np.save('sfs_RF_metric_dict.npy', sfs.get_metric_dict())

Restart from here


In [9]:
# load the previously saved dictionary
# (on NumPy >= 1.16.3, allow_pickle=True is required to load pickled objects)
read_dictionary = np.load('sfs_RF_metric_dict.npy', allow_pickle=True).item()

In [10]:
# plot results
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

In [12]:
# run this cell twice: figure.figsize is modified after the figure is
# created, so the new size only takes effect on the second run
fig = plt.figure()                                                               
ax = plot_sfs(read_dictionary, kind='std_err')
fig_size = plt.rcParams["figure.figsize"] 
fig_size[0] = 22
fig_size[1] = 18

plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.xticks(rotation='vertical')
locs, labels = plt.xticks()
plt.xticks(locs, labels)
plt.show()


[Figure: Sequential Forward Selection (w. StdDev), cross-validated score vs. number of selected features, with standard-error band]
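As an aside, the double run is only needed because the figure size is set after plt.figure() is called; a sketch that sets the size up front renders at full size in a single pass (same data, same plot):

plt.rcParams["figure.figsize"] = (22, 18)  # set the size *before* creating the figure
fig = plt.figure()
ax = plot_sfs(read_dictionary, kind='std_err')
plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.xticks(rotation='vertical')
plt.show()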
It looks like the score stabilizes after about 6 features, reaches its maximum at around 40 features (the summary table below peaks at 39), then begins to taper off after about 70 features. We will save the top 40 and the top 70.
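Rather than eyeballing the curve, the best subset size can be read straight from the saved metric dictionary, which is keyed by the number of selected features; a minimal sketch:

best_k = max(read_dictionary, key=lambda k: read_dictionary[k]['avg_score'])
print(best_k, read_dictionary[best_k]['avg_score'])  # -> 39 0.536445...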

In [18]:
# save results to dataframe
selected_summary = pd.DataFrame.from_dict(read_dictionary).T
selected_summary['index'] = selected_summary.index
selected_summary.sort_values(by='avg_score', ascending=False)


Out[18]:
avg_score ci_bound cv_scores feature_idx std_dev std_err index
39 0.536445 0.0314884 [0.570881226054, 0.556621880998, 0.54702495201... (256, 257, 3, 6, 1, 264, 137, 23, 280, 281, 28... 0.0361276 0.013655 39
35 0.534828 0.0351144 [0.536398467433, 0.562380038388, 0.54318618042... (256, 257, 3, 6, 1, 264, 137, 23, 280, 281, 28... 0.0402879 0.0152274 35
40 0.534512 0.0453272 [0.616858237548, 0.500959692898, 0.53550863723... (256, 257, 3, 6, 1, 264, 137, 23, 280, 281, 28... 0.0520053 0.0196561 40
36 0.533108 0.0521373 [0.547892720307, 0.579654510557, 0.53550863723... (256, 257, 3, 6, 1, 264, 137, 23, 280, 281, 28... 0.0598187 0.0226094 36
37 0.532347 0.0432363 [0.553639846743, 0.575815738964, 0.50095969289... (256, 257, 3, 6, 1, 264, 137, 23, 280, 281, 28... 0.0496063 0.0187494 37
34 0.531847 0.0580695 [0.547892720307, 0.560460652591, 0.52783109405... (256, 257, 3, 6, 1, 264, 137, 23, 280, 281, 28... 0.066625 0.0251819 34
50 0.531364 0.0388602 [0.609195402299, 0.53358925144, 0.53358925144,... (256, 257, 3, 6, 1, 264, 137, 128, 277, 23, 28... 0.0445856 0.0168518 50
32 0.530205 0.0400123 [0.590038314176, 0.548944337812, 0.49520153550... (256, 257, 6, 1, 264, 137, 81, 80, 352, 23, 28... 0.0459074 0.0173514 32
54 0.53011 0.052203 [0.607279693487, 0.508637236084, 0.54510556621... (256, 257, 3, 4, 6, 1, 264, 137, 128, 277, 23,... 0.0598942 0.0226379 54
33 0.529715 0.0410833 [0.540229885057, 0.564299424184, 0.52207293666... (256, 257, 3, 6, 1, 264, 137, 23, 280, 281, 28... 0.0471362 0.0178158 33
41 0.528651 0.0512986 [0.584291187739, 0.562380038388, 0.52591170825... (256, 257, 3, 6, 1, 264, 137, 23, 280, 281, 28... 0.0588565 0.0222457 41
68 0.527753 0.0529404 [0.609195402299, 0.508637236084, 0.55854126679... (256, 257, 3, 4, 6, 1, 264, 9, 17, 277, 23, 28... 0.0607402 0.0229576 68
38 0.527266 0.0496563 [0.555555555556, 0.545105566219, 0.53934740882... (256, 257, 3, 6, 1, 264, 137, 23, 280, 281, 28... 0.0569723 0.0215335 38
29 0.527089 0.0443396 [0.565134099617, 0.564299424184, 0.48560460652... (256, 257, 6, 1, 264, 137, 80, 352, 23, 280, 2... 0.0508722 0.0192279 29
64 0.525101 0.0399105 [0.582375478927, 0.52207293666, 0.568138195777... (256, 257, 3, 4, 6, 1, 264, 137, 128, 23, 17, ... 0.0457906 0.0173072 64
48 0.525098 0.0386203 [0.611111111111, 0.500959692898, 0.54510556621... (256, 257, 3, 6, 1, 264, 137, 128, 277, 23, 28... 0.0443103 0.0167477 48
53 0.525065 0.0405463 [0.588122605364, 0.556621880998, 0.55086372360... (256, 257, 3, 4, 6, 1, 264, 137, 128, 277, 23,... 0.0465201 0.0175829 53
26 0.52482 0.0555374 [0.622605363985, 0.550863723608, 0.49712092130... (256, 257, 6, 1, 264, 329, 80, 352, 23, 280, 2... 0.0637198 0.0240838 26
52 0.524801 0.0544376 [0.632183908046, 0.535508637236, 0.53934740882... (256, 257, 3, 4, 6, 1, 264, 137, 128, 277, 23,... 0.062458 0.0236069 52
27 0.52464 0.050661 [0.563218390805, 0.58349328215, 0.46641074856,... (256, 257, 6, 1, 264, 329, 80, 352, 23, 280, 2... 0.058125 0.0219692 27
46 0.523933 0.0460559 [0.561302681992, 0.591170825336, 0.50863723608... (256, 257, 3, 6, 1, 264, 137, 128, 149, 23, 28... 0.0528414 0.0199722 46
43 0.523925 0.0483159 [0.580459770115, 0.550863723608, 0.51247600767... (256, 257, 3, 6, 1, 264, 137, 128, 23, 280, 28... 0.0554343 0.0209522 43
45 0.523684 0.0461786 [0.565134099617, 0.537428023033, 0.52207293666... (256, 257, 3, 6, 1, 264, 137, 128, 23, 280, 28... 0.0529822 0.0200254 45
58 0.523644 0.0365109 [0.611111111111, 0.510556621881, 0.52591170825... (256, 257, 3, 4, 6, 1, 264, 137, 128, 23, 277,... 0.0418901 0.015833 58
44 0.523242 0.0469907 [0.595785440613, 0.506717850288, 0.49520153550... (256, 257, 3, 6, 1, 264, 137, 128, 23, 280, 28... 0.0539139 0.0203775 44
42 0.523172 0.0389101 [0.578544061303, 0.539347408829, 0.51631477927... (256, 257, 3, 6, 1, 264, 137, 23, 280, 281, 28... 0.0446428 0.0168734 42
67 0.523138 0.0410689 [0.599616858238, 0.518234165067, 0.54318618042... (256, 257, 3, 4, 6, 1, 264, 9, 17, 277, 23, 28... 0.0471197 0.0178096 67
31 0.522945 0.0396312 [0.580459770115, 0.554702495202, 0.50095969289... (256, 257, 6, 1, 264, 137, 208, 352, 23, 280, ... 0.0454702 0.0171861 31
49 0.522927 0.0477906 [0.599616858238, 0.520153550864, 0.51055662188... (256, 257, 3, 6, 1, 264, 137, 128, 277, 23, 28... 0.0548317 0.0207244 49
65 0.522755 0.0465524 [0.580459770115, 0.506717850288, 0.51439539347... (256, 257, 3, 4, 6, 1, 264, 9, 17, 277, 23, 28... 0.053411 0.0201875 65
... ... ... ... ... ... ... ...
92 0.50193 0.0338528 [0.567049808429, 0.506717850288, 0.52783109405... (256, 1, 3, 4, 6, 7, 264, 9, 272, 17, 18, 277,... 0.0388404 0.0146803 92
96 0.501699 0.0558453 [0.582375478927, 0.514395393474, 0.55278310940... (256, 1, 3, 4, 6, 7, 264, 9, 272, 17, 18, 277,... 0.0640731 0.0242173 96
97 0.501225 0.0599946 [0.603448275862, 0.510556621881, 0.52207293666... (256, 1, 3, 4, 6, 7, 264, 9, 272, 17, 18, 277,... 0.0688337 0.0260167 97
90 0.497919 0.0476234 [0.551724137931, 0.495201535509, 0.50863723608... (256, 1, 3, 4, 6, 7, 264, 9, 272, 17, 18, 277,... 0.0546399 0.0206519 90
100 0.497567 0.0495772 [0.574712643678, 0.523992322457, 0.54510556621... (256, 1, 3, 4, 6, 7, 264, 9, 272, 17, 18, 277,... 0.0568815 0.0214992 100
99 0.495095 0.0523133 [0.599616858238, 0.491362763916, 0.51823416506... (256, 1, 3, 4, 6, 7, 264, 9, 272, 17, 18, 277,... 0.0600207 0.0226857 99
98 0.490839 0.0481685 [0.578544061303, 0.47792706334, 0.516314779271... (256, 1, 3, 4, 6, 7, 264, 9, 272, 17, 18, 277,... 0.0552653 0.0208883 98
23 0.489535 0.0482821 [0.519157088123, 0.408829174664, 0.48752399232... (256, 257, 6, 1, 264, 329, 80, 352, 280, 281, ... 0.0553955 0.0209375 23
22 0.462783 0.0365904 [0.48275862069, 0.391554702495, 0.493282149712... (256, 257, 6, 1, 328, 329, 208, 352, 280, 281,... 0.0419813 0.0158674 22
21 0.416009 0.0272579 [0.404214559387, 0.399232245681, 0.44337811900... (256, 257, 6, 1, 328, 329, 208, 352, 280, 281,... 0.0312739 0.0118204 21
7 0.401555 0.021565 [0.379310344828, 0.391554702495, 0.38003838771... (208, 257, 6, 305, 233, 281, 329) 0.0247422 0.00935169 7
5 0.401315 0.0217532 [0.379310344828, 0.391554702495, 0.37811900191... (208, 233, 281, 6, 257) 0.0249581 0.00943327 5
8 0.401315 0.0217532 [0.379310344828, 0.391554702495, 0.37811900191... (208, 257, 6, 305, 233, 281, 329, 353) 0.0249581 0.00943327 8
6 0.401312 0.0214118 [0.379310344828, 0.391554702495, 0.38003838771... (208, 257, 6, 281, 233, 305) 0.0245664 0.00928522 6
4 0.401072 0.0218208 [0.379310344828, 0.391554702495, 0.37811900191... (208, 233, 6, 257) 0.0250357 0.00946259 4
3 0.401072 0.0215992 [0.379310344828, 0.391554702495, 0.37811900191... (208, 233, 6) 0.0247814 0.00936649 3
11 0.400599 0.0213276 [0.373563218391, 0.391554702495, 0.37619961612... (256, 257, 6, 353, 233, 304, 208, 305, 329, 28... 0.0244698 0.0092487 11
9 0.400599 0.0213276 [0.373563218391, 0.391554702495, 0.37619961612... (257, 6, 353, 233, 208, 305, 329, 280, 281) 0.0244698 0.0092487 9
12 0.400599 0.0213276 [0.373563218391, 0.391554702495, 0.37619961612... (256, 257, 6, 353, 328, 233, 304, 208, 305, 32... 0.0244698 0.0092487 12
13 0.400596 0.0218403 [0.373563218391, 0.391554702495, 0.37619961612... (256, 257, 6, 353, 328, 233, 304, 208, 305, 35... 0.0250581 0.00947108 13
10 0.400356 0.0213568 [0.373563218391, 0.391554702495, 0.37619961612... (257, 6, 353, 233, 304, 208, 305, 329, 280, 281) 0.0245034 0.0092614 10
19 0.400286 0.0271673 [0.377394636015, 0.404990403071, 0.39347408829... (256, 257, 6, 328, 329, 208, 352, 280, 281, 36... 0.0311699 0.0117811 19
15 0.39957 0.0263516 [0.375478927203, 0.404990403071, 0.39155470249... (256, 257, 6, 353, 328, 233, 288, 304, 208, 30... 0.030234 0.0114274 15
17 0.399318 0.027856 [0.377394636015, 0.404990403071, 0.38963531669... (256, 257, 6, 328, 329, 208, 352, 280, 281, 36... 0.0319601 0.0120798 17
16 0.399078 0.027368 [0.377394636015, 0.404990403071, 0.38963531669... (256, 257, 6, 353, 328, 233, 304, 288, 208, 30... 0.0314002 0.0118682 16
18 0.398601 0.0252617 [0.379310344828, 0.404990403071, 0.38963531669... (256, 257, 6, 328, 329, 208, 352, 280, 281, 36... 0.0289835 0.0109548 18
14 0.396473 0.0288102 [0.362068965517, 0.401151631478, 0.37428023032... (256, 257, 6, 353, 328, 233, 304, 208, 305, 35... 0.0330548 0.0124935 14
2 0.3963 0.0268422 [0.333333333333, 0.37236084453, 0.424184261036... (208, 6) 0.0307969 0.0116401 2
1 0.387546 0.00375038 [0.390804597701, 0.391554702495, 0.39155470249... (6,) 0.00430293 0.00162635 1
20 0.387283 0.0433681 [0.36398467433, 0.406909788868, 0.368522072937... (256, 257, 6, 328, 329, 208, 352, 280, 281, 36... 0.0497576 0.0188066 20

100 rows × 7 columns


In [19]:
# save dataframe
selected_summary.to_csv('SFS_RF_selected_features_summary.csv', sep=',', header=True, index = False)

In [20]:
# reload the saved dataframe and sort by score
filename = 'SFS_RF_selected_features_summary.csv'
selected_summary = pd.read_csv(filename)
selected_summary = selected_summary.set_index(['index'])
selected_summary.sort_values(by='avg_score', ascending=False).head()


Out[20]:
avg_score ci_bound cv_scores feature_idx std_dev std_err
index
39 0.536445 0.031488 [ 0.57088123 0.55662188 0.54702495 0.568138... (256, 257, 3, 6, 1, 264, 137, 23, 280, 281, 28... 0.036128 0.013655
35 0.534828 0.035114 [ 0.53639847 0.56238004 0.54318618 0.560460... (256, 257, 3, 6, 1, 264, 137, 23, 280, 281, 28... 0.040288 0.015227
40 0.534512 0.045327 [ 0.61685824 0.50095969 0.53550864 0.585412... (256, 257, 3, 6, 1, 264, 137, 23, 280, 281, 28... 0.052005 0.019656
36 0.533108 0.052137 [ 0.54789272 0.57965451 0.53550864 0.573896... (256, 257, 3, 6, 1, 264, 137, 23, 280, 281, 28... 0.059819 0.022609
37 0.532347 0.043236 [ 0.55363985 0.57581574 0.50095969 0.604606... (256, 257, 3, 6, 1, 264, 137, 23, 280, 281, 28... 0.049606 0.018749

In [21]:
# the 40-feature subset (near the peak score)
selected_summary.iloc[39]['feature_idx']


Out[21]:
'(256, 257, 3, 6, 1, 264, 137, 23, 280, 281, 288, 289, 113, 168, 7, 304, 305, 312, 193, 328, 329, 224, 80, 81, 83, 122, 95, 352, 353, 232, 233, 295, 208, 109, 336, 360, 118, 248, 250, 255)'
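The feature indices are stored in the CSV as a string, so instead of retyping the tuple by hand as in the next cell, it could also be parsed directly; a minimal sketch using the standard library:

import ast
slct = np.array(ast.literal_eval(selected_summary.iloc[39]['feature_idx']))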

In [22]:
slct = np.array([256, 257, 3, 6, 1, 264, 137, 23, 280, 281, 288, 289, 113, 168, 7, 304, 305, 312, 193, 328, 
                 329, 224, 80, 81, 83, 122, 95, 352, 353, 232, 233, 295, 208, 109, 336, 360, 118, 248, 250, 255])
slct


Out[22]:
array([256, 257,   3,   6,   1, 264, 137,  23, 280, 281, 288, 289, 113,
       168,   7, 304, 305, 312, 193, 328, 329, 224,  80,  81,  83, 122,
        95, 352, 353, 232, 233, 295, 208, 109, 336, 360, 118, 248, 250, 255])

In [23]:
# isolate and save selected features
filename = 'engineered_features.csv'
training_data = pd.read_csv(filename)
X = training_data.drop(['Formation', 'Well Name','Facies'], axis=1)
Xs = X.iloc[:, slct]
Xs = pd.concat([training_data[['Depth', 'Well Name', 'Formation', 'Facies']], Xs], axis = 1)
print(np.shape(Xs), list(Xs))


(4149, 44) ['Depth', 'Well Name', 'Formation', 'Facies', 'GR_GLCM_energy_asym_wsize=3', 'GR_GLCM_energy_asym_wsize=5', 'DeltaPHI', 'NM_M', 'GR', 'GR_GLCM_correlation_asym_wsize=3', 'GR_GLCM_energy_wsize=5', 'GR_std_wsize=ave', 'ILD_log10_GLCM_energy_asym_wsize=3', 'ILD_log10_GLCM_energy_asym_wsize=5', 'ILD_log10_GLCM_correlation_asym_wsize=3', 'ILD_log10_GLCM_correlation_asym_wsize=5', 'PE_std_wsize=5', 'ILD_log10_GLCM_correlation_wsize=3', 'RELPOS', 'DeltaPHI_GLCM_energy_asym_wsize=3', 'DeltaPHI_GLCM_energy_asym_wsize=5', 'DeltaPHI_GLCM_correlation_asym_wsize=3', 'DeltaPHI_GLCM_correlation_wsize=5', 'PHIND_GLCM_energy_asym_wsize=3', 'PHIND_GLCM_energy_asym_wsize=5', 'PE_GLCM_dissimilarity_wsize=3', 'PHIND_mean_wsize=3', 'PHIND_mean_wsize=5', 'PHIND_mean_wsize=17', 'PE_skew_wsize=9', 'PHIND_std_wsize=ave', 'PE_GLCM_energy_asym_wsize=3', 'PE_GLCM_energy_asym_wsize=5', 'PE_GLCM_energy_wsize=3', 'PE_GLCM_energy_wsize=5', 'ILD_log10_GLCM_correlation_asym_wsize=ave', 'PHIND_GLCM_energy_wsize=3', 'PE_mean_wsize=65', 'PHIND_GLCM_correlation_asym_wsize=3', 'PE_GLCM_correlation_asym_wsize=3', 'PE_std_wsize=129', 'GR_GLCM_dissimilarity_asym_wsize=3', 'GR_GLCM_dissimilarity_asym_wsize=9', 'GR_GLCM_dissimilarity_asym_wsize=ave']

In [24]:
Xs.to_csv('SFS_top40_selected_engineered_features.csv', sep=',',  index=False)

In [25]:
# the 70-feature subset
selected_summary.iloc[69]['feature_idx']


Out[25]:
'(256, 257, 3, 4, 6, 1, 264, 9, 17, 277, 23, 280, 281, 283, 288, 289, 295, 40, 7, 304, 305, 308, 265, 312, 317, 360, 97, 328, 329, 331, 79, 80, 81, 83, 89, 350, 95, 352, 353, 99, 104, 364, 109, 113, 118, 120, 122, 128, 137, 149, 151, 153, 168, 169, 171, 174, 193, 196, 207, 208, 224, 336, 226, 227, 232, 233, 25, 248, 250, 255)'

In [26]:
slct = np.array([256, 257, 3, 4, 6, 1, 264, 9, 17, 277, 23, 280, 281, 283, 288, 289, 295, 40, 7, 304, 305, 308, 265, 
                 312, 317, 360, 97, 328, 329, 331, 79, 80, 81, 83, 89, 350, 95, 352, 353, 99, 104, 364, 109, 113, 
                 118, 120, 122, 128, 137, 149, 151, 153, 168, 169, 171, 174, 193, 196, 207, 208, 224, 336, 226, 
                 227, 232, 233, 25, 248, 250, 255])
slct


Out[26]:
array([256, 257,   3,   4,   6,   1, 264,   9,  17, 277,  23, 280, 281,
       283, 288, 289, 295,  40,   7, 304, 305, 308, 265, 312, 317, 360,
        97, 328, 329, 331,  79,  80,  81,  83,  89, 350,  95, 352, 353,
        99, 104, 364, 109, 113, 118, 120, 122, 128, 137, 149, 151, 153,
       168, 169, 171, 174, 193, 196, 207, 208, 224, 336, 226, 227, 232,
       233,  25, 248, 250, 255])

In [27]:
# isolate and save selected features
filename = 'engineered_features.csv'
training_data = pd.read_csv(filename)
X = training_data.drop(['Formation', 'Well Name','Facies'], axis=1)
Xs = X.iloc[:, slct]
Xs = pd.concat([training_data[['Depth', 'Well Name', 'Formation', 'Facies']], Xs], axis = 1)
print(np.shape(Xs), list(Xs))


(4149, 74) ['Depth', 'Well Name', 'Formation', 'Facies', 'GR_GLCM_energy_asym_wsize=3', 'GR_GLCM_energy_asym_wsize=5', 'DeltaPHI', 'PHIND', 'NM_M', 'GR', 'GR_GLCM_correlation_asym_wsize=3', 'GR_mean_wsize=5', 'GR_std_wsize=5', 'ILD_log10_GLCM_dissimilarity_asym_wsize=65', 'GR_std_wsize=ave', 'ILD_log10_GLCM_energy_asym_wsize=3', 'ILD_log10_GLCM_energy_asym_wsize=5', 'ILD_log10_GLCM_energy_asym_wsize=17', 'ILD_log10_GLCM_correlation_asym_wsize=3', 'ILD_log10_GLCM_correlation_asym_wsize=5', 'ILD_log10_GLCM_correlation_asym_wsize=ave', 'ILD_log10_std_wsize=3', 'RELPOS', 'DeltaPHI_GLCM_energy_asym_wsize=3', 'DeltaPHI_GLCM_energy_asym_wsize=5', 'DeltaPHI_GLCM_energy_asym_wsize=33', 'GR_GLCM_correlation_asym_wsize=5', 'DeltaPHI_GLCM_correlation_asym_wsize=3', 'DeltaPHI_GLCM_correlation_asym_wsize=65', 'PE_GLCM_correlation_asym_wsize=3', 'PHIND_skew_wsize=5', 'PHIND_GLCM_energy_asym_wsize=3', 'PHIND_GLCM_energy_asym_wsize=5', 'PHIND_GLCM_energy_asym_wsize=17', 'DeltaPHI_skew_wsize=ave', 'PHIND_mean_wsize=3', 'PHIND_mean_wsize=5', 'PHIND_mean_wsize=17', 'PHIND_std_wsize=5', 'PE_GLCM_dissimilarity_asym_wsize=129', 'PHIND_std_wsize=ave', 'PE_GLCM_energy_asym_wsize=3', 'PE_GLCM_energy_asym_wsize=5', 'PHIND_skew_wsize=17', 'PE_mean_wsize=3', 'PE_GLCM_correlation_asym_wsize=33', 'PE_mean_wsize=65', 'PE_std_wsize=5', 'PE_std_wsize=129', 'PE_skew_wsize=3', 'PE_skew_wsize=9', 'GR_GLCM_dissimilarity_wsize=3', 'GR_GLCM_energy_wsize=5', 'GR_GLCM_correlation_wsize=65', 'GR_GLCM_correlation_wsize=ave', 'ILD_log10_GLCM_dissimilarity_wsize=5', 'ILD_log10_GLCM_correlation_wsize=3', 'ILD_log10_GLCM_correlation_wsize=5', 'ILD_log10_GLCM_correlation_wsize=17', 'ILD_log10_GLCM_correlation_wsize=129', 'DeltaPHI_GLCM_correlation_wsize=5', 'DeltaPHI_GLCM_correlation_wsize=33', 'PHIND_GLCM_dissimilarity_wsize=ave', 'PHIND_GLCM_energy_wsize=3', 'PE_GLCM_dissimilarity_wsize=3', 'PHIND_GLCM_correlation_asym_wsize=3', 'PE_GLCM_dissimilarity_wsize=9', 'PE_GLCM_dissimilarity_wsize=17', 'PE_GLCM_energy_wsize=3', 'PE_GLCM_energy_wsize=5', 'GR_skew_wsize=5', 'GR_GLCM_dissimilarity_asym_wsize=3', 'GR_GLCM_dissimilarity_asym_wsize=9', 'GR_GLCM_dissimilarity_asym_wsize=ave']

In [28]:
Xs.to_csv('SFS_top70_selected_engineered_features.csv', sep=',',  index=False)

In [ ]: