Facies classification using XGBoost and engineered features

Contest entry by Matteo Niccoli and Mark Dahl, with a contribution by Daniel Kittridge.

Original contest notebook by Brendon Hall, Enthought


The code and ideas in this notebook, by Matteo Niccoli and Mark Dahl, are licensed under a Creative Commons Attribution 4.0 International License.

Load the training dataset with the selected set of the top 45 engineered features

  • We first created a large set of features from derivatives, moments, and GLCMs. The workflow is described in this notebook (a huge thank-you to Daniel Kittridge for his critically needed Pandas magic and useful suggestions).
  • We then selected the 45 best-performing features using a Sequential (Forward) Feature Selector from Sebastian Raschka's mlxtend library. Details are in this notebook; a minimal sketch of the selection step is shown below.
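
The sketch below illustrates how the Sequential Forward Selection step can be set up with mlxtend. The estimator, scoring metric, and cross-validation shown here are assumptions for illustration, not necessarily the exact settings used in the selection notebook.

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from xgboost.sklearn import XGBClassifier

# hypothetical inputs: X_all is the full engineered-feature matrix, y the facies labels
sfs = SFS(XGBClassifier(objective='multi:softmax'),
          k_features=45,       # keep the 45 best-performing features
          forward=True,        # sequential forward selection
          floating=False,
          scoring='f1_micro',  # assumed scoring metric
          cv=3)                # assumed cross-validation scheme
# sfs = sfs.fit(X_all, y)
# selected_columns = list(sfs.k_feature_idx_)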

In [57]:
%matplotlib inline
from __future__ import division

import numpy as np
import pandas as pd

import scipy as sp
from scipy.signal import medfilt

from sklearn import preprocessing
from sklearn.metrics import f1_score
from sklearn.model_selection import LeaveOneGroupOut

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable

mpl.rcParams['figure.figsize'] = (20.0, 10.0)
inline_rc = dict(mpl.rcParams)

Load data


In [58]:
filename = 'train_SFS_top45_engfeat.csv'
train_data = pd.read_csv(filename)
train_data.describe()


Out[58]:
Depth Facies PE_GLCM_correlation_wsize=ave DeltaPHI PHIND NM_M RELPOS GR_d1 ILD_log10_d1 DeltaPHI_d1 ... GR_skew_wsize=5 DeltaPHI_skew_wsize=9 PHIND_mean_wsize=3 PHIND_mean_wsize=ave GR_mean_wsize=3 PE_GLCM_dissimilarity_wsize=ave PE_GLCM_correlation_asym_wsize=9 PE_mean_wsize=65 PE_mean_wsize=129 PE_std_wsize=33
count 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 ... 4149.000000 4149.000000 4.149000e+03 4149.000000 4.149000e+03 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000
mean 2906.867438 4.503254 0.669151 4.402484 13.201066 1.518438 0.521852 -0.000250 0.001440 0.000091 ... 0.064016 0.044582 -5.252433e-14 -0.005024 2.932937e-14 2.906766 0.719862 0.007491 0.015386 0.584426
std 133.300164 2.474324 0.207747 5.274947 7.132846 0.499720 0.286644 0.322450 0.185401 0.379750 ... 0.633118 0.682239 9.483332e-01 0.573990 9.615825e-01 2.365441 0.288003 0.575868 0.507093 0.448961
min 2573.500000 1.000000 0.080903 -21.832000 0.550000 1.000000 0.000000 -4.054936 -1.228861 -3.242129 ... -1.491707 -2.463226 -1.592039e+00 -1.121189 -1.774598e+00 0.000000 -0.585463 -1.301974 -1.090288 0.000000
25% 2821.500000 2.000000 0.531397 1.600000 8.500000 1.000000 0.277000 -0.094888 -0.079154 -0.145611 ... -0.382517 -0.368445 -6.488701e-01 -0.392760 -6.317280e-01 1.059152 0.563576 -0.294259 -0.247351 0.202617
50% 2932.500000 4.000000 0.639246 4.300000 12.020000 2.000000 0.528000 0.009984 -0.001979 0.000000 ... 0.059526 0.062414 -1.499441e-01 -0.078404 3.564997e-03 2.643973 0.812063 0.000000 0.000000 0.580953
75% 3007.000000 6.000000 0.744366 7.500000 16.050000 2.000000 0.769000 0.103486 0.079154 0.151679 ... 0.515196 0.464019 4.083389e-01 0.289299 4.655733e-01 4.343750 0.952660 0.219326 0.201666 0.923657
max 3138.000000 9.000000 1.000000 19.312000 84.400000 2.000000 1.000000 3.835455 1.980821 2.569056 ... 1.493900 2.327561 9.260946e+00 3.802650 8.791173e+00 12.801339 1.000000 2.193903 2.042554 1.788092

8 rows × 47 columns


In [59]:
filename = 'test_SFS_top45_engfeat.csv'
test_data = pd.read_csv(filename)
test_data.describe()


Out[59]:
Depth PE_GLCM_correlation_wsize=ave DeltaPHI PHIND NM_M RELPOS GR_d1 ILD_log10_d1 DeltaPHI_d1 PE_d1 ... GR_skew_wsize=5 DeltaPHI_skew_wsize=9 PHIND_mean_wsize=3 PHIND_mean_wsize=ave GR_mean_wsize=3 PE_GLCM_dissimilarity_wsize=ave PE_GLCM_correlation_asym_wsize=9 PE_mean_wsize=65 PE_mean_wsize=129 PE_std_wsize=33
count 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 ... 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000
mean 2987.070482 0.637547 2.851964 11.655277 1.678313 0.535807 0.001562 0.000319 0.001091 -0.000170 ... 0.060851 0.073473 -0.216740 -0.212150 -0.241668 3.766397 0.756560 -0.092549 -0.095933 0.665289
std 94.391925 0.114968 3.442074 5.190236 0.467405 0.283062 0.240581 0.221083 0.195123 0.235226 ... 0.609957 0.675670 0.695730 0.412185 0.886397 1.628802 0.253031 0.282968 0.211606 0.273686
min 2808.000000 0.193154 -8.900000 1.855000 1.000000 0.013000 -1.305395 -1.385190 -0.843712 -0.840265 ... -1.500000 -2.070045 -1.531981 -1.053343 -1.707180 0.589286 -0.452267 -0.716332 -0.613657 0.084940
25% 2911.625000 0.575441 0.411250 7.700000 1.000000 0.300000 -0.093819 -0.100426 -0.093496 -0.118706 ... -0.389511 -0.407239 -0.751167 -0.515234 -0.924519 2.599051 0.661402 -0.311043 -0.224941 0.512704
50% 2993.750000 0.667789 2.397500 10.950000 2.000000 0.547500 0.009901 -0.015831 0.001896 -0.013277 ... 0.064914 0.132321 -0.312241 -0.277171 -0.222753 3.511161 0.849161 -0.072464 -0.106932 0.673868
75% 3055.375000 0.725687 4.600000 14.793750 2.000000 0.778000 0.096332 0.096963 0.104279 0.080928 ... 0.528001 0.540932 0.198486 0.052659 0.251511 4.652344 0.934892 0.097174 0.106181 0.830130
max 3160.500000 0.893054 16.500000 31.335000 2.000000 1.000000 1.340677 1.234798 1.080710 1.324572 ... 1.479472 1.970021 2.075710 1.150128 4.839740 10.217634 0.996858 0.514209 0.341174 1.365075

8 rows × 46 columns


In [60]:
train_data['Well Name'] = train_data['Well Name'].astype('category')
train_data['Formation'] = train_data['Formation'].astype('category')
train_data['Well Name'].unique()


Out[60]:
[SHRIMPLIN, ALEXANDER D, SHANKLE, LUKE G U, KIMZEY A, CROSS H CATTLE, NOLAN, Recruit F9, NEWBY, CHURCHMAN BIBLE]
Categories (10, object): [SHRIMPLIN, ALEXANDER D, SHANKLE, LUKE G U, ..., NOLAN, Recruit F9, NEWBY, CHURCHMAN BIBLE]

In [61]:
test_data['Well Name'] = test_data['Well Name'].astype('category')
test_data['Formation'] = test_data['Formation'].astype('category')
test_data['Well Name'].unique()


Out[61]:
[STUART, CRAWFORD]
Categories (2, object): [STUART, CRAWFORD]

Now we extract the feature variables needed to perform the classification: depth, the original wireline log and geologic constraining variables, and the selected engineered features. We also get a vector of the facies labels that correspond to each feature vector.

Preprocessing and definitions


In [62]:
y_train = train_data['Facies'].values
print(y_train[25:40])
print(np.shape(y_train))


[3 3 2 2 2 2 2 2 3 3 3 3 3 3 3]
(4149,)

In [63]:
X_train = train_data.drop(['Formation', 'Well Name','Facies'], axis=1)
print(np.shape(X_train))
X_train.describe(percentiles=[.05, .25, .50, .75, .95])


(4149, 46)
Out[63]:
Depth PE_GLCM_correlation_wsize=ave DeltaPHI PHIND NM_M RELPOS GR_d1 ILD_log10_d1 DeltaPHI_d1 PE_d1 ... GR_skew_wsize=5 DeltaPHI_skew_wsize=9 PHIND_mean_wsize=3 PHIND_mean_wsize=ave GR_mean_wsize=3 PE_GLCM_dissimilarity_wsize=ave PE_GLCM_correlation_asym_wsize=9 PE_mean_wsize=65 PE_mean_wsize=129 PE_std_wsize=33
count 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 ... 4149.000000 4149.000000 4.149000e+03 4149.000000 4.149000e+03 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000
mean 2906.867438 0.669151 4.402484 13.201066 1.518438 0.521852 -0.000250 0.001440 0.000091 0.000965 ... 0.064016 0.044582 -5.252433e-14 -0.005024 2.932937e-14 2.906766 0.719862 0.007491 0.015386 0.584426
std 133.300164 0.207747 5.274947 7.132846 0.499720 0.286644 0.322450 0.185401 0.379750 0.293177 ... 0.633118 0.682239 9.483332e-01 0.573990 9.615825e-01 2.365441 0.288003 0.575868 0.507093 0.448961
min 2573.500000 0.080903 -21.832000 0.550000 1.000000 0.000000 -4.054936 -1.228861 -3.242129 -1.613512 ... -1.491707 -2.463226 -1.592039e+00 -1.121189 -1.774598e+00 0.000000 -0.585463 -1.301974 -1.090288 0.000000
5% 2644.200000 0.363184 -4.000000 4.927800 1.000000 0.070000 -0.445132 -0.259228 -0.557836 -0.442578 ... -1.025811 -1.097455 -1.124194e+00 -0.778963 -1.376926e+00 0.000000 0.132645 -0.893441 -0.795401 0.000000
25% 2821.500000 0.531397 1.600000 8.500000 1.000000 0.277000 -0.094888 -0.079154 -0.145611 -0.070180 ... -0.382517 -0.368445 -6.488701e-01 -0.392760 -6.317280e-01 1.059152 0.563576 -0.294259 -0.247351 0.202617
50% 2932.500000 0.639246 4.300000 12.020000 2.000000 0.528000 0.009984 -0.001979 0.000000 0.000000 ... 0.059526 0.062414 -1.499441e-01 -0.078404 3.564997e-03 2.643973 0.812063 0.000000 0.000000 0.580953
75% 3007.000000 0.744366 7.500000 16.050000 2.000000 0.769000 0.103486 0.079154 0.151679 0.063225 ... 0.515196 0.464019 4.083389e-01 0.289299 4.655733e-01 4.343750 0.952660 0.219326 0.201666 0.923657
95% 3094.000000 1.000000 12.800000 25.380000 2.000000 0.964000 0.374634 0.275059 0.549835 0.442578 ... 1.134798 1.156450 1.617539e+00 0.945527 1.342206e+00 7.320089 1.000000 1.081930 0.763096 1.341845
max 3138.000000 1.000000 19.312000 84.400000 2.000000 1.000000 3.835455 1.980821 2.569056 2.781917 ... 1.493900 2.327561 9.260946e+00 3.802650 8.791173e+00 12.801339 1.000000 2.193903 2.042554 1.788092

10 rows × 46 columns


In [64]:
# load blind test data
X_test = test_data.drop(['Formation', 'Well Name'], axis=1)
print(np.shape(X_test))
X_test.describe(percentiles=[.05, .25, .50, .75, .95])


(830, 46)
Out[64]:
Depth PE_GLCM_correlation_wsize=ave DeltaPHI PHIND NM_M RELPOS GR_d1 ILD_log10_d1 DeltaPHI_d1 PE_d1 ... GR_skew_wsize=5 DeltaPHI_skew_wsize=9 PHIND_mean_wsize=3 PHIND_mean_wsize=ave GR_mean_wsize=3 PE_GLCM_dissimilarity_wsize=ave PE_GLCM_correlation_asym_wsize=9 PE_mean_wsize=65 PE_mean_wsize=129 PE_std_wsize=33
count 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 ... 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000
mean 2987.070482 0.637547 2.851964 11.655277 1.678313 0.535807 0.001562 0.000319 0.001091 -0.000170 ... 0.060851 0.073473 -0.216740 -0.212150 -0.241668 3.766397 0.756560 -0.092549 -0.095933 0.665289
std 94.391925 0.114968 3.442074 5.190236 0.467405 0.283062 0.240581 0.221083 0.195123 0.235226 ... 0.609957 0.675670 0.695730 0.412185 0.886397 1.628802 0.253031 0.282968 0.211606 0.273686
min 2808.000000 0.193154 -8.900000 1.855000 1.000000 0.013000 -1.305395 -1.385190 -0.843712 -0.840265 ... -1.500000 -2.070045 -1.531981 -1.053343 -1.707180 0.589286 -0.452267 -0.716332 -0.613657 0.084940
5% 2828.725000 0.420325 -1.600000 4.500000 1.000000 0.073450 -0.390014 -0.354411 -0.340803 -0.340121 ... -0.949965 -1.049720 -1.168594 -0.764508 -1.445715 1.454688 0.259285 -0.554296 -0.510150 0.181840
25% 2911.625000 0.575441 0.411250 7.700000 1.000000 0.300000 -0.093819 -0.100426 -0.093496 -0.118706 ... -0.389511 -0.407239 -0.751167 -0.515234 -0.924519 2.599051 0.661402 -0.311043 -0.224941 0.512704
50% 2993.750000 0.667789 2.397500 10.950000 2.000000 0.547500 0.009901 -0.015831 0.001896 -0.013277 ... 0.064914 0.132321 -0.312241 -0.277171 -0.222753 3.511161 0.849161 -0.072464 -0.106932 0.673868
75% 3055.375000 0.725687 4.600000 14.793750 2.000000 0.778000 0.096332 0.096963 0.104279 0.080928 ... 0.528001 0.540932 0.198486 0.052659 0.251511 4.652344 0.934892 0.097174 0.106181 0.830130
95% 3139.775000 0.764531 9.400000 21.341500 2.000000 0.967100 0.347309 0.381224 0.311344 0.415739 ... 1.024280 1.159384 1.078898 0.555657 0.901060 6.638616 0.977025 0.360418 0.201518 1.134933
max 3160.500000 0.893054 16.500000 31.335000 2.000000 1.000000 1.340677 1.234798 1.080710 1.324572 ... 1.479472 1.970021 2.075710 1.150128 4.839740 10.217634 0.996858 0.514209 0.341174 1.365075

10 rows × 46 columns


In [65]:
stdscaler = preprocessing.StandardScaler().fit(X_train)
X_train = stdscaler.transform(X_train)
X_test = stdscaler.transform(X_test)

In [66]:
# Average test F1 score with leave one well out

well = train_data["Well Name"].values
logo = LeaveOneGroupOut()

def fit_and_test(X, y, param, cal, val):

    # Classifier
    model = XGBClassifier(objective='multi:softmax',
                          colsample_bylevel=param['CBL'],
                          colsample_bytree=param['CBT'],
                          gamma=param['GAM'],
                          learning_rate=param['LRN'],
                          n_estimators=param['EST'],
                          max_depth=param['DEP'],
                          min_child_weight=param['MCW'],
                          subsample=1)

    # rescale the validation (held-out) well using statistics from the calibration wells
    robscaler1 = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(X[cal])
    X_cal = robscaler1.transform(X[cal])
    X_val = robscaler1.transform(X[val])
    y_cal = y[cal]
    y_val = y[val]

    # set early stopping rounds
    esr = 10

    # fit the model, monitoring multiclass log loss on the held-out well
    truth = [(X_val, y_val)]
    model.fit(X_cal, y_cal,
              early_stopping_rounds=esr,
              eval_metric='mlogloss',
              eval_set=truth,
              verbose=False)

    # predict the held-out well
    pred = model.predict(X_val)

    # clean isolated facies with a median filter
    pred_mf = medfilt(pred, kernel_size=5)

    # score
    return f1_score(y_val, pred_mf, labels=np.arange(10), average='micro')

XGBoost parameter tuning

Initial tune of n_estimators and learning_rate


In [67]:
# grid search
colsample_bylevel_grid = [0.85]
colsample_bytree_grid = [0.85]
gamma_grid = [0]
learning_rate_grid = [0.08, 0.013, 0.021]  
n_estimators_grid = [55, 89, 144, 233]
max_depth_grid = [5]  
min_child_weight_grid = [1]

param_grid = []
for level in colsample_bylevel_grid:
    for tree in colsample_bytree_grid:
        for gamma in gamma_grid:
            for learn in learning_rate_grid: 
                for n in n_estimators_grid:
                    for depth in max_depth_grid:
                        for child in min_child_weight_grid:
                            param_grid.append({'CBL':level,
                                               'CBT':tree, 
                                               'GAM':gamma,
                                               'LRN':learn,
                                               'EST':n,
                                               'DEP':depth,
                                               'MCW':child
                                               })

In [16]:
# For each set of parameters
score_param = []
for param in param_grid:
    
    # split data into train and test with LOGO
    split_score = []
    for cal, val in logo.split(X_train, y_train, groups=well):
        well_name = well[val[0]]
        split = fit_and_test(X_train, y_train, param, cal, val)
        print("{:>20s}  {:.3f}".format(well_name, split))
        split_score.append(split)
    
    # Average score for this param
    score_param.append(np.mean(split_score))    
    print('F1 score = %.3f %s' % (score_param[-1], param))

print "    -------------------------------------------    "

# Best set of parameters
best_idx = np.argmax(score_param)
param_best = param_grid[best_idx]
score_best = score_param[best_idx]
print('\nBest F1 score = %.3f %s' % (score_best, param_best))


         ALEXANDER D  0.564
     CHURCHMAN BIBLE  0.485
      CROSS H CATTLE  0.419
            KIMZEY A  0.444
            LUKE G U  0.627
               NEWBY  0.553
               NOLAN  0.566
          Recruit F9  0.738
             SHANKLE  0.604
           SHRIMPLIN  0.567
F1 score = 0.557 {'EST': 55, 'DEP': 5, 'CBL': 0.85, 'LRN': 0.08, 'GAM': 0, 'CBT': 0.85, 'MCW': 1}
         ALEXANDER D  0.605
     CHURCHMAN BIBLE  0.485
      CROSS H CATTLE  0.433
            KIMZEY A  0.469
            LUKE G U  0.601
               NEWBY  0.557
               NOLAN  0.557
          Recruit F9  0.738
             SHANKLE  0.612
           SHRIMPLIN  0.571
F1 score = 0.563 {'EST': 89, 'DEP': 5, 'CBL': 0.85, 'LRN': 0.08, 'GAM': 0, 'CBT': 0.85, 'MCW': 1}
         ALEXANDER D  0.607
     CHURCHMAN BIBLE  0.485
      CROSS H CATTLE  0.433
            KIMZEY A  0.469
            LUKE G U  0.601
               NEWBY  0.557
               NOLAN  0.557
          Recruit F9  0.738
             SHANKLE  0.619
           SHRIMPLIN  0.573
F1 score = 0.564 {'EST': 144, 'DEP': 5, 'CBL': 0.85, 'LRN': 0.08, 'GAM': 0, 'CBT': 0.85, 'MCW': 1}
         ALEXANDER D  0.607
     CHURCHMAN BIBLE  0.485
      CROSS H CATTLE  0.433
            KIMZEY A  0.469
            LUKE G U  0.601
               NEWBY  0.557
               NOLAN  0.557
          Recruit F9  0.738
             SHANKLE  0.619
           SHRIMPLIN  0.573
F1 score = 0.564 {'EST': 233, 'DEP': 5, 'CBL': 0.85, 'LRN': 0.08, 'GAM': 0, 'CBT': 0.85, 'MCW': 1}
         ALEXANDER D  0.579
     CHURCHMAN BIBLE  0.436
      CROSS H CATTLE  0.429
            KIMZEY A  0.465
            LUKE G U  0.666
               NEWBY  0.508
               NOLAN  0.564
          Recruit F9  0.762
             SHANKLE  0.559
           SHRIMPLIN  0.594
F1 score = 0.556 {'EST': 55, 'DEP': 5, 'CBL': 0.85, 'LRN': 0.013, 'GAM': 0, 'CBT': 0.85, 'MCW': 1}
         ALEXANDER D  0.584
     CHURCHMAN BIBLE  0.448
      CROSS H CATTLE  0.439
            KIMZEY A  0.460
            LUKE G U  0.657
               NEWBY  0.525
               NOLAN  0.576
          Recruit F9  0.762
             SHANKLE  0.570
           SHRIMPLIN  0.590
F1 score = 0.561 {'EST': 89, 'DEP': 5, 'CBL': 0.85, 'LRN': 0.013, 'GAM': 0, 'CBT': 0.85, 'MCW': 1}
         ALEXANDER D  0.586
     CHURCHMAN BIBLE  0.458
      CROSS H CATTLE  0.415
            KIMZEY A  0.465
            LUKE G U  0.638
               NEWBY  0.540
               NOLAN  0.576
          Recruit F9  0.762
             SHANKLE  0.566
           SHRIMPLIN  0.588
F1 score = 0.559 {'EST': 144, 'DEP': 5, 'CBL': 0.85, 'LRN': 0.013, 'GAM': 0, 'CBT': 0.85, 'MCW': 1}
         ALEXANDER D  0.577
     CHURCHMAN BIBLE  0.460
      CROSS H CATTLE  0.417
            KIMZEY A  0.472
            LUKE G U  0.620
               NEWBY  0.553
               NOLAN  0.561
          Recruit F9  0.762
             SHANKLE  0.546
           SHRIMPLIN  0.575
F1 score = 0.554 {'EST': 233, 'DEP': 5, 'CBL': 0.85, 'LRN': 0.013, 'GAM': 0, 'CBT': 0.85, 'MCW': 1}
         ALEXANDER D  0.573
     CHURCHMAN BIBLE  0.446
      CROSS H CATTLE  0.433
            KIMZEY A  0.451
            LUKE G U  0.659
               NEWBY  0.531
               NOLAN  0.557
          Recruit F9  0.762
             SHANKLE  0.548
           SHRIMPLIN  0.599
F1 score = 0.556 {'EST': 55, 'DEP': 5, 'CBL': 0.85, 'LRN': 0.021, 'GAM': 0, 'CBT': 0.85, 'MCW': 1}
         ALEXANDER D  0.584
     CHURCHMAN BIBLE  0.458
      CROSS H CATTLE  0.405
            KIMZEY A  0.469
            LUKE G U  0.636
               NEWBY  0.525
               NOLAN  0.545
          Recruit F9  0.762
             SHANKLE  0.546
           SHRIMPLIN  0.599
F1 score = 0.553 {'EST': 89, 'DEP': 5, 'CBL': 0.85, 'LRN': 0.021, 'GAM': 0, 'CBT': 0.85, 'MCW': 1}
         ALEXANDER D  0.579
     CHURCHMAN BIBLE  0.468
      CROSS H CATTLE  0.413
            KIMZEY A  0.469
            LUKE G U  0.642
               NEWBY  0.551
               NOLAN  0.566
          Recruit F9  0.762
             SHANKLE  0.541
           SHRIMPLIN  0.582
F1 score = 0.557 {'EST': 144, 'DEP': 5, 'CBL': 0.85, 'LRN': 0.021, 'GAM': 0, 'CBT': 0.85, 'MCW': 1}
         ALEXANDER D  0.590
     CHURCHMAN BIBLE  0.468
      CROSS H CATTLE  0.419
            KIMZEY A  0.453
            LUKE G U  0.629
               NEWBY  0.564
               NOLAN  0.552
          Recruit F9  0.762
             SHANKLE  0.606
           SHRIMPLIN  0.584
F1 score = 0.563 {'EST': 233, 'DEP': 5, 'CBL': 0.85, 'LRN': 0.021, 'GAM': 0, 'CBT': 0.85, 'MCW': 1}
    -------------------------------------------    

Best F1 score = 0.564 {'EST': 144, 'DEP': 5, 'CBL': 0.85, 'LRN': 0.08, 'GAM': 0, 'CBT': 0.85, 'MCW': 1}

Grid search stopped here: high colsample_bylevel and colsample_bytree values do not seem to work well.

Best result:

Best F1 score = 0.593 {'EST': 144, 'DEP': 13, 'CBL': 0.75, 'LRN': 0.008, 'GAM': 0.1, 'CBT': 0.65, 'MCW': 3}


Fine tune learning_rate and n_estimators

Tune max_depth, min_child_weight

Tune of colsample_bylevel and colsample_bytree
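
The cells for these later tuning stages are not reproduced here; each stage reuses the same grid-building and leave-one-well-out scoring loop shown above, holding the previously tuned parameters fixed. A minimal sketch of the max_depth / min_child_weight stage follows, with illustrative (assumed) grid values:

colsample_bylevel_grid = [0.85]
colsample_bytree_grid = [0.65]
gamma_grid = [0]
learning_rate_grid = [0.01]
n_estimators_grid = [130]
max_depth_grid = [8, 10, 12, 14]   # assumed candidate values
min_child_weight_grid = [1, 3, 5]  # assumed candidate values

param_grid = []
for level in colsample_bylevel_grid:
    for tree in colsample_bytree_grid:
        for gamma in gamma_grid:
            for learn in learning_rate_grid:
                for n in n_estimators_grid:
                    for depth in max_depth_grid:
                        for child in min_child_weight_grid:
                            param_grid.append({'CBL': level, 'CBT': tree,
                                               'GAM': gamma, 'LRN': learn,
                                               'EST': n, 'DEP': depth,
                                               'MCW': child})
# each entry of param_grid is then scored with the LeaveOneGroupOut loop shown above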

Final model test


In [68]:
# define and test the final model

# F1 score = 0.593 {'EST': 130, 'DEP': 12, 'CBL': 0.85, 'LRN': 0.01, 'GAM': 0, 'CBT': 0.65, 'MCW': 3}

colsample_bylevel_grid = [0.85]
colsample_bytree_grid = [0.65]
gamma_grid = [0]
learning_rate_grid = [0.01]  
n_estimators_grid = [130]
max_depth_grid = [12]  
min_child_weight_grid = [3]

param_grid = []
for level in colsample_bylevel_grid:
    for tree in colsample_bytree_grid:
        for gamma in gamma_grid:
            for learn in learning_rate_grid: 
                for n in n_estimators_grid:
                    for depth in max_depth_grid:
                        for child in min_child_weight_grid:
                            param_grid.append({'CBL':level,
                                               'CBT':tree, 
                                               'GAM':gamma,
                                               'LRN':learn,
                                               'EST':n,
                                               'DEP':depth,
                                               'MCW':child
                                               })

In [69]:
# For each set of parameters
score_param = []
for param in param_grid:
    
    # split data into train and test with LOGO
    split_score = []
    for cal, val in logo.split(X_train, y_train, groups=well):
        well_name = well[val[0]]
        split = fit_and_test(X_train, y_train, param, cal, val)
        print("{:>20s}  {:.3f}".format(well_name, split))
        split_score.append(split)
    
    # Average score for this param
    score_param.append(np.mean(split_score))    
    print('F1 score = %.3f %s' % (score_param[-1], param))

print "    -------------------------------------------    "

# Best set of parameters
best_idx = np.argmax(score_param)
param_best = param_grid[best_idx]
score_best = score_param[best_idx]
print('\nBest F1 score = %.3f %s' % (score_best, param_best))


         ALEXANDER D  0.609
     CHURCHMAN BIBLE  0.567
      CROSS H CATTLE  0.451
            KIMZEY A  0.515
            LUKE G U  0.668
               NEWBY  0.568
               NOLAN  0.583
          Recruit F9  0.800
             SHANKLE  0.546
           SHRIMPLIN  0.622
F1 score = 0.593 {'EST': 130, 'DEP': 12, 'CBL': 0.85, 'LRN': 0.01, 'GAM': 0, 'CBT': 0.65, 'MCW': 3}
    -------------------------------------------    

Best F1 score = 0.593 {'EST': 130, 'DEP': 12, 'CBL': 0.85, 'LRN': 0.01, 'GAM': 0, 'CBT': 0.65, 'MCW': 3}

Definitions for facies prediction in blind wells


In [70]:
# define the final model with the best parameter set found above
model = XGBClassifier(objective='multi:softmax',
                      colsample_bylevel=param_best['CBL'],
                      colsample_bytree=param_best['CBT'],
                      gamma=param_best['GAM'],
                      learning_rate=param_best['LRN'],
                      n_estimators=param_best['EST'],
                      max_depth=param_best['DEP'],
                      min_child_weight=param_best['MCW'],
                      subsample=1)

In [71]:
# Parameters
facies_names = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS', 'WS', 'D', 'PS', 'BS']
facies_colors = ['#F4D03F', '#F5B041','#DC7633','#6E2C00', 
                 '#1B4F72','#2E86C1', '#AED6F1', '#A569BD', '#196F3D']
depth = train_data['Depth'].values

def make_facies_log_plot(logs, facies_colors):
    #make sure logs are sorted by depth
    logs = logs.sort_values(by='Depth')
    cmap_facies = colors.ListedColormap(
            facies_colors[0:len(facies_colors)], 'indexed')
    
    ztop=logs.Depth.min(); zbot=logs.Depth.max()
    
    cluster=np.repeat(np.expand_dims(logs['Facies'].values,1), 100, 1)
    
    f, ax = plt.subplots(nrows=1, ncols=6, figsize=(8, 12))
    ax[0].plot(logs.GR, logs.Depth, '-g')
    ax[1].plot(logs.ILD_log10, logs.Depth, '-')
    ax[2].plot(logs.DeltaPHI, logs.Depth, '-', color='0.5')
    ax[3].plot(logs.PHIND, logs.Depth, '-', color='r')
    ax[4].plot(logs.PE, logs.Depth, '-', color='black')
    im=ax[5].imshow(cluster, interpolation='none', aspect='auto',
                    cmap=cmap_facies,vmin=1,vmax=9)
    
    divider = make_axes_locatable(ax[5])
    cax = divider.append_axes("right", size="20%", pad=0.05)
    cbar=plt.colorbar(im, cax=cax)
    cbar.set_label((17*' ').join([' SS ', 'CSiS', 'FSiS', 'SiSh', 
                                  ' MS ', ' WS ', ' D  ', ' PS ', ' BS ']))
    cbar.set_ticks(range(0,1)); cbar.set_ticklabels('')
    
    for i in range(len(ax)-1):
        ax[i].set_ylim(ztop,zbot)
        ax[i].invert_yaxis()
        ax[i].grid()
        ax[i].locator_params(axis='x', nbins=3)
    
    ax[0].set_xlabel("GR")
    ax[0].set_xlim(logs.GR.min(),logs.GR.max())
    ax[1].set_xlabel("ILD_log10")
    ax[1].set_xlim(logs.ILD_log10.min(),logs.ILD_log10.max())
    ax[2].set_xlabel("DeltaPHI")
    ax[2].set_xlim(logs.DeltaPHI.min(),logs.DeltaPHI.max())
    ax[3].set_xlabel("PHIND")
    ax[3].set_xlim(logs.PHIND.min(),logs.PHIND.max())
    ax[4].set_xlabel("PE")
    ax[4].set_xlim(logs.PE.min(),logs.PE.max())
    ax[5].set_xlabel('Facies')
    
    ax[1].set_yticklabels([]); ax[2].set_yticklabels([]); ax[3].set_yticklabels([])
    ax[4].set_yticklabels([]); ax[5].set_yticklabels([])
    ax[5].set_xticklabels([])
    f.suptitle('Well: %s'%logs.iloc[0]['Well Name'], fontsize=14,y=0.94)

Predicting facies for blind wells


In [73]:
# robust-scale the training and blind test data using statistics from the full training set
robscaler2 = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(X_train)
X_train = robscaler2.transform(X_train)
X_test = robscaler2.transform(X_test)
y_train = train_data['Facies'].values
        
# Predict facies
y_pred = model.fit(X_train, y_train).predict(X_test)
    
# Clean isolated facies for each well
y_pred = medfilt(y_pred, kernel_size=5)

Output of predicted facies


In [74]:
blind = pd.read_csv('nofacies_data.csv')
blind['Facies'] = y_pred
blind.describe()
#blind.dtypes


Out[74]:
Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS Facies
count 830.000000 830.00000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000
mean 2987.070482 57.61173 0.666312 2.851964 11.655277 3.654178 1.678313 0.535807 5.432530
std 94.391925 27.52774 0.288367 3.442074 5.190236 0.649793 0.467405 0.283062 2.394133
min 2808.000000 12.03600 -0.468000 -8.900000 1.855000 2.113000 1.000000 0.013000 2.000000
25% 2911.625000 36.77325 0.541000 0.411250 7.700000 3.171500 1.000000 0.300000 3.000000
50% 2993.750000 58.34450 0.675000 2.397500 10.950000 3.515500 2.000000 0.547500 6.000000
75% 3055.375000 73.05150 0.850750 4.600000 14.793750 4.191500 2.000000 0.778000 8.000000
max 3160.500000 220.41300 1.507000 16.500000 31.335000 6.321000 2.000000 1.000000 8.000000

In [75]:
blind['Formation'] = blind['Formation'].astype('category')
blind['Well Name'] = blind['Well Name'].astype('category')
blind['Depth'] = blind['Depth'].astype('float64')
blind['GR'] = blind['GR'].astype('float64')
blind['ILD_log10'] = blind['ILD_log10'].astype('float64')
blind['DeltaPHI'] = blind['DeltaPHI'].astype('float64')
blind['PHIND'] = blind['PHIND'].astype('float64')
blind['PE'] = blind['PE'].astype('float64')
blind['NM_M'] = blind['NM_M'].astype('int64')
blind['Facies'] = blind['Facies'].astype('int64')

blind.describe()


Out[75]:
Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS Facies
count 830.000000 830.00000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000
mean 2987.070482 57.61173 0.666312 2.851964 11.655277 3.654178 1.678313 0.535807 5.432530
std 94.391925 27.52774 0.288367 3.442074 5.190236 0.649793 0.467405 0.283062 2.394133
min 2808.000000 12.03600 -0.468000 -8.900000 1.855000 2.113000 1.000000 0.013000 2.000000
25% 2911.625000 36.77325 0.541000 0.411250 7.700000 3.171500 1.000000 0.300000 3.000000
50% 2993.750000 58.34450 0.675000 2.397500 10.950000 3.515500 2.000000 0.547500 6.000000
75% 3055.375000 73.05150 0.850750 4.600000 14.793750 4.191500 2.000000 0.778000 8.000000
max 3160.500000 220.41300 1.507000 16.500000 31.335000 6.321000 2.000000 1.000000 8.000000

In [76]:
# Save predicted labels
np.save('04_ypred_XGB_SFS_VC_001.npy', y_pred)
blind.to_csv('04_ypred_XGB_SFS_VC_001.csv')

# Plot predicted labels
make_facies_log_plot(
    blind[blind['Well Name'] == 'STUART'],
    facies_colors=facies_colors)

make_facies_log_plot(
    blind[blind['Well Name'] == 'CRAWFORD'],
    facies_colors=facies_colors)
mpl.rcParams.update(inline_rc)


