Facies classification utilizing an Adaptive Boosted Random Forest

Ryan Thielke

In the following, we provide a possible solution to the facies classification problem described in https://github.com/seg/2016-ml-contest.

Exploring the data



In [1]:

    
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

import sys
sys.path.append("..")

#Import standard pydata libs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns



In [2]:

    
filename = '../facies_vectors.csv'
training_data = pd.read_csv(filename)
training_data['Well Name'] = training_data['Well Name'].astype('category')
training_data['Formation'] = training_data['Formation'].astype('category')                            
training_data.describe()









    Out[2]:






  
    
      
      Facies
      Depth
      GR
      ILD_log10
      DeltaPHI
      PHIND
      PE
      NM_M
      RELPOS
    
  
  
    
      count
      4149.000000
      4149.000000
      4149.000000
      4149.000000
      4149.000000
      4149.000000
      3232.000000
      4149.000000
      4149.000000
    
    
      mean
      4.503254
      2906.867438
      64.933985
      0.659566
      4.402484
      13.201066
      3.725014
      1.518438
      0.521852
    
    
      std
      2.474324
      133.300164
      30.302530
      0.252703
      5.274947
      7.132846
      0.896152
      0.499720
      0.286644
    
    
      min
      1.000000
      2573.500000
      10.149000
      -0.025949
      -21.832000
      0.550000
      0.200000
      1.000000
      0.000000
    
    
      25%
      2.000000
      2821.500000
      44.730000
      0.498000
      1.600000
      8.500000
      NaN
      1.000000
      0.277000
    
    
      50%
      4.000000
      2932.500000
      64.990000
      0.639000
      4.300000
      12.020000
      NaN
      2.000000
      0.528000
    
    
      75%
      6.000000
      3007.000000
      79.438000
      0.822000
      7.500000
      16.050000
      NaN
      2.000000
      0.769000
    
    
      max
      9.000000
      3138.000000
      361.150000
      1.800000
      19.312000
      84.400000
      8.094000
      2.000000
      1.000000



In [3]:

    
#Visualize the distribution of facies for each well
wells = training_data['Well Name'].unique()

fig, ax = plt.subplots(5,2, figsize=(20,20))
for i, well in enumerate(wells):
    row = i % ax.shape[0]
    column = i // ax.shape[0]
    counts = training_data[training_data['Well Name']==well].Facies.value_counts()
    data_for_well = [counts[j] if j in counts.index else 0 for j in range(1,10)]
    ax[row, column].bar(range(1,10), data_for_well, align='center')
    ax[row, column].set_title("{well}".format(well=well))
    ax[row, column].set_ylabel("Counts")
    ax[row, column].set_xticks(range(1,10))

plt.show()



In [4]:

    
plt.figure(figsize=(10,10))
sns.heatmap(training_data.drop(['Formation', 'Well Name'], axis=1).corr())









    Out[4]:





<matplotlib.axes._subplots.AxesSubplot at 0x10ba542e8>

Feature Engineering

Here we will do a couple things to clean the data and attempt to create new features for our model to consume.

First, we will smooth the PE and GR features. Second, we replace missing PE values with the mean of the entire dataset (might want to investigate other methods) Last, we will encode the formations into integer values



In [5]:

    
dfs = []
for well in training_data['Well Name'].unique():
    df = training_data[training_data['Well Name']==well].copy(deep=True)
    df.sort_values('Depth', inplace=True)
    for col in ['PE', 'GR']:
        smooth_col = 'smooth_'+col
        df[smooth_col] = pd.rolling_mean(df[col], window=25)
        df[smooth_col].fillna(method='ffill', inplace=True)
        df[smooth_col].fillna(method='bfill', inplace=True)
    dfs.append(df)
training_data = pd.concat(dfs)
pe_mean = training_data.PE.mean()
sm_pe_mean = training_data.smooth_PE.mean()
training_data['PE'] = training_data.PE.replace({np.nan:pe_mean})
training_data['smooth_PE'] = training_data['smooth_PE'].replace({np.nan:sm_pe_mean})
formation_encoder = dict(zip(training_data.Formation.unique(), range(len(training_data.Formation.unique()))))
training_data['enc_formation'] = training_data.Formation.map(formation_encoder)



In [6]:

    
training_data.describe()









    Out[6]:






  
    
      
      Facies
      Depth
      GR
      ILD_log10
      DeltaPHI
      PHIND
      PE
      NM_M
      RELPOS
      smooth_PE
      smooth_GR
      enc_formation
    
  
  
    
      count
      4149.000000
      4149.000000
      4149.000000
      4149.000000
      4149.000000
      4149.000000
      4149.000000
      4149.000000
      4149.000000
      4149.000000
      4149.000000
      4149.000000
    
    
      mean
      4.503254
      2906.867438
      64.933985
      0.659566
      4.402484
      13.201066
      3.725014
      1.518438
      0.521852
      3.707428
      65.006645
      6.476982
    
    
      std
      2.474324
      133.300164
      30.302530
      0.252703
      5.274947
      7.132846
      0.790917
      0.499720
      0.286644
      0.577399
      17.544534
      4.819176
    
    
      min
      1.000000
      2573.500000
      10.149000
      -0.025949
      -21.832000
      0.550000
      0.200000
      1.000000
      0.000000
      2.552000
      22.672680
      0.000000
    
    
      25%
      2.000000
      2821.500000
      44.730000
      0.498000
      1.600000
      8.500000
      3.200000
      1.000000
      0.277000
      3.329280
      53.656400
      2.000000
    
    
      50%
      4.000000
      2932.500000
      64.990000
      0.639000
      4.300000
      12.020000
      3.725014
      2.000000
      0.528000
      3.707428
      65.104800
      6.000000
    
    
      75%
      6.000000
      3007.000000
      79.438000
      0.822000
      7.500000
      16.050000
      4.000000
      2.000000
      0.769000
      3.916000
      75.716000
      12.000000
    
    
      max
      9.000000
      3138.000000
      361.150000
      1.800000
      19.312000
      84.400000
      8.094000
      2.000000
      1.000000
      6.086440
      134.212600
      13.000000

Building the model and parameter tuning

In the section below we will create a Adaptive Boosted Random Forest Classifier from the Scikit-Learn ML Library



In [7]:

    
#Let's build a model
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics, cross_validation 
from classification_utilities import display_cm



In [9]:

    
#We will take a look at an F1 score for each well
n_estimators=100
learning_rate=.01
random_state=0
facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS',
                 'WS', 'D','PS', 'BS']

title_length = 20 

wells = training_data['Well Name'].unique()
for well in wells:
    blind = training_data[training_data['Well Name']==well]
    train = training_data[(training_data['Well Name']!=well)]
    
    train_X = train.drop(['Formation', 'Well Name', 'Depth', 'Facies'], axis=1)
    train_Y = train.Facies.values
    test_X = blind.drop(['Formation', 'Well Name', 'Facies', 'Depth'], axis=1)
    test_Y = blind.Facies.values
    
    clf = AdaBoostClassifier(RandomForestClassifier(), n_estimators=200, learning_rate=learning_rate, random_state=random_state, algorithm='SAMME.R')
    clf.fit(X=train_X, y=train_Y)
    pred_Y = clf.predict(test_X)
    f1 = metrics.f1_score(test_Y, pred_Y, average='micro')
    print("*"*title_length)
    print("{well}={f1:.4f}".format(well=well,f1=f1))
    print("*"*title_length)









    



********************
SHRIMPLIN=0.6306
********************
********************
ALEXANDER D=0.6738
********************
********************
SHANKLE=0.4477
********************
********************
LUKE G U=0.6334
********************
********************
KIMZEY A=0.5148
********************
********************
CROSS H CATTLE=0.4331
********************
********************
NOLAN=0.4892
********************
********************
Recruit F9=0.6750
********************
********************
NEWBY=0.4320
********************
********************
CHURCHMAN BIBLE=0.5248
********************



In [10]:

    
train_X, test_X, train_Y, test_Y = cross_validation.train_test_split(training_data.drop(['Formation', 'Well Name','Facies', 'Depth'], axis=1), training_data.Facies.values, test_size=.2)



In [11]:

    
print(train_X.shape)
print(train_Y.shape)
print(test_X.shape)
print(test_Y.shape)









    



(3319, 10)
(3319,)
(830, 10)
(830,)



In [12]:

    
clf = AdaBoostClassifier(RandomForestClassifier(), n_estimators=200, learning_rate=learning_rate, random_state=0, algorithm='SAMME.R')

clf.fit(train_X, train_Y)
pred_Y = clf.predict(test_X)
cm = metrics.confusion_matrix(y_true=test_Y, y_pred=pred_Y)
display_cm(cm, facies_labels, display_metrics=True)









    



     Pred    SS  CSiS  FSiS  SiSh    MS    WS     D    PS    BS Total
     True
       SS    50    16     1     0     0     0     0     0     0    67
     CSiS     2   164    25     0     0     0     0     1     0   192
     FSiS     0    18   134     0     0     2     0     1     0   155
     SiSh     0     0     1    53     0     4     0     2     0    60
       MS     0     2     2     4    31    11     1     9     0    60
       WS     0     0     2     5     3    81     1    19     1   112
        D     0     0     1     0     1     0    22     6     0    30
       PS     0     0     3     1     1     9     0    98     0   112
       BS     0     0     0     0     0     1     0     2    39    42

Precision  0.96  0.82  0.79  0.84  0.86  0.75  0.92  0.71  0.97  0.82
   Recall  0.75  0.85  0.86  0.88  0.52  0.72  0.73  0.88  0.93  0.81
       F1  0.84  0.84  0.83  0.86  0.65  0.74  0.81  0.78  0.95  0.81



In [13]:

    
validation_data = pd.read_csv("../validation_data_nofacies.csv")



In [14]:

    
dfs = []
for well in validation_data['Well Name'].unique():
    df = validation_data[validation_data['Well Name']==well].copy(deep=True)
    df.sort_values('Depth', inplace=True)
    for col in ['PE', 'GR']:
        smooth_col = 'smooth_'+col
        df[smooth_col] = pd.rolling_mean(df[col], window=25)
        df[smooth_col].fillna(method='ffill', inplace=True)
        df[smooth_col].fillna(method='bfill', inplace=True)
    dfs.append(df)
validation_data = pd.concat(dfs)

validation_data['enc_formation'] = validation_data.Formation.map(formation_encoder)
validation_data.describe()









    Out[14]:






  
    
      
      Depth
      GR
      ILD_log10
      DeltaPHI
      PHIND
      PE
      NM_M
      RELPOS
      smooth_PE
      smooth_GR
      enc_formation
    
  
  
    
      count
      830.000000
      830.00000
      830.000000
      830.000000
      830.000000
      830.000000
      830.000000
      830.000000
      830.000000
      830.000000
      830.000000
    
    
      mean
      2987.070482
      57.61173
      0.666312
      2.851964
      11.655277
      3.654178
      1.678313
      0.535807
      3.640066
      57.535834
      6.214458
    
    
      std
      94.391925
      27.52774
      0.288367
      3.442074
      5.190236
      0.649793
      0.467405
      0.283062
      0.373720
      16.020556
      4.599762
    
    
      min
      2808.000000
      12.03600
      -0.468000
      -8.900000
      1.855000
      2.113000
      1.000000
      0.013000
      2.783080
      23.462960
      0.000000
    
    
      25%
      2911.625000
      36.77325
      0.541000
      0.411250
      7.700000
      3.171500
      1.000000
      0.300000
      3.386970
      46.641210
      1.000000
    
    
      50%
      2993.750000
      58.34450
      0.675000
      2.397500
      10.950000
      3.515500
      2.000000
      0.547500
      3.631880
      56.626940
      5.000000
    
    
      75%
      3055.375000
      73.05150
      0.850750
      4.600000
      14.793750
      4.191500
      2.000000
      0.778000
      3.846550
      68.118700
      11.000000
    
    
      max
      3160.500000
      220.41300
      1.507000
      16.500000
      31.335000
      6.321000
      2.000000
      1.000000
      4.696200
      95.138680
      13.000000



In [15]:

    
X = training_data.drop(['Formation', 'Well Name', 'Depth','Facies'], axis=1)
Y = training_data.Facies.values
test_X = validation_data.drop(['Formation', 'Well Name', 'Depth'], axis=1)

clf = AdaBoostClassifier(RandomForestClassifier(), n_estimators=200, learning_rate=learning_rate, random_state=0)
clf.fit(X,Y)
predicted_facies = clf.predict(test_X)
validation_data['Facies'] = predicted_facies



In [16]:

    
validation_data.to_csv("Kr1m_SEG_ML_Attempt1.csv")



In [ ]:

	Facies	Depth	GR	ILD_log10	DeltaPHI	PHIND	PE	NM_M	RELPOS
count	4149.000000	4149.000000	4149.000000	4149.000000	4149.000000	4149.000000	3232.000000	4149.000000	4149.000000
mean	4.503254	2906.867438	64.933985	0.659566	4.402484	13.201066	3.725014	1.518438	0.521852
std	2.474324	133.300164	30.302530	0.252703	5.274947	7.132846	0.896152	0.499720	0.286644
min	1.000000	2573.500000	10.149000	-0.025949	-21.832000	0.550000	0.200000	1.000000	0.000000
25%	2.000000	2821.500000	44.730000	0.498000	1.600000	8.500000	NaN	1.000000	0.277000
50%	4.000000	2932.500000	64.990000	0.639000	4.300000	12.020000	NaN	2.000000	0.528000
75%	6.000000	3007.000000	79.438000	0.822000	7.500000	16.050000	NaN	2.000000	0.769000
max	9.000000	3138.000000	361.150000	1.800000	19.312000	84.400000	8.094000	2.000000	1.000000

	Depth	GR	ILD_log10	DeltaPHI	PHIND	PE	NM_M	RELPOS	smooth_PE	smooth_GR	enc_formation
count	830.000000	830.00000	830.000000	830.000000	830.000000	830.000000	830.000000	830.000000	830.000000	830.000000	830.000000
mean	2987.070482	57.61173	0.666312	2.851964	11.655277	3.654178	1.678313	0.535807	3.640066	57.535834	6.214458
std	94.391925	27.52774	0.288367	3.442074	5.190236	0.649793	0.467405	0.283062	0.373720	16.020556	4.599762
min	2808.000000	12.03600	-0.468000	-8.900000	1.855000	2.113000	1.000000	0.013000	2.783080	23.462960	0.000000
25%	2911.625000	36.77325	0.541000	0.411250	7.700000	3.171500	1.000000	0.300000	3.386970	46.641210	1.000000
50%	2993.750000	58.34450	0.675000	2.397500	10.950000	3.515500	2.000000	0.547500	3.631880	56.626940	5.000000
75%	3055.375000	73.05150	0.850750	4.600000	14.793750	4.191500	2.000000	0.778000	3.846550	68.118700	11.000000
max	3160.500000	220.41300	1.507000	16.500000	31.335000	6.321000	2.000000	1.000000	4.696200	95.138680	13.000000