Facies classification using an AdaBoosted Random Forest

Ryan Thielke

In the following, we provide a possible solution to the facies classification problem described in https://github.com/seg/2016-ml-contest.

Exploring the data


In [1]:
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

#Make the contest's classification_utilities module importable
import sys
sys.path.append("..")

#Import standard pydata libs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
filename = '../facies_vectors.csv'
training_data = pd.read_csv(filename)
#training_data['Well Name'] = training_data['Well Name'].astype('category')
#training_data['Formation'] = training_data['Formation'].astype('category')
#Flag the rows so train and validation data can be separated after combining
training_data['train'] = 1
training_data.describe()


Out[2]:
Facies Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS train
count 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 3232.000000 4149.000000 4149.000000 4149.0
mean 4.503254 2906.867438 64.933985 0.659566 4.402484 13.201066 3.725014 1.518438 0.521852 1.0
std 2.474324 133.300164 30.302530 0.252703 5.274947 7.132846 0.896152 0.499720 0.286644 0.0
min 1.000000 2573.500000 10.149000 -0.025949 -21.832000 0.550000 0.200000 1.000000 0.000000 1.0
25% 2.000000 2821.500000 44.730000 0.498000 1.600000 8.500000 NaN 1.000000 0.277000 1.0
50% 4.000000 2932.500000 64.990000 0.639000 4.300000 12.020000 NaN 2.000000 0.528000 1.0
75% 6.000000 3007.000000 79.438000 0.822000 7.500000 16.050000 NaN 2.000000 0.769000 1.0
max 9.000000 3138.000000 361.150000 1.800000 19.312000 84.400000 8.094000 2.000000 1.000000 1.0

In [3]:
validation_data = pd.read_csv("../validation_data_nofacies.csv")
#validation_data['Well Name'] = validation_data['Well Name'].astype('category')
#validation_data['Formation'] = validation_data['Formation'].astype('category')
validation_data['train'] = 0
validation_data.describe()


Out[3]:
Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS train
count 830.000000 830.00000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.0
mean 2987.070482 57.61173 0.666312 2.851964 11.655277 3.654178 1.678313 0.535807 0.0
std 94.391925 27.52774 0.288367 3.442074 5.190236 0.649793 0.467405 0.283062 0.0
min 2808.000000 12.03600 -0.468000 -8.900000 1.855000 2.113000 1.000000 0.013000 0.0
25% 2911.625000 36.77325 0.541000 0.411250 7.700000 3.171500 1.000000 0.300000 0.0
50% 2993.750000 58.34450 0.675000 2.397500 10.950000 3.515500 2.000000 0.547500 0.0
75% 3055.375000 73.05150 0.850750 4.600000 14.793750 4.191500 2.000000 0.778000 0.0
max 3160.500000 220.41300 1.507000 16.500000 31.335000 6.321000 2.000000 1.000000 0.0

In [4]:
#Combine training and validation data so the feature engineering below is applied to both
all_data = pd.concat([training_data, validation_data])
all_data.describe()


Out[4]:
DeltaPHI Depth Facies GR ILD_log10 NM_M PE PHIND RELPOS train
count 4979.000000 4979.000000 4149.000000 4979.000000 4979.000000 4979.000000 4062.000000 4979.000000 4979.000000 4979.000000
mean 4.144012 2920.237297 4.503254 63.713364 0.660690 1.545089 3.710540 12.943383 0.524178 0.833300
std 5.049037 131.086857 2.474324 29.979746 0.258971 0.498013 0.852033 6.871145 0.286069 0.372745
min -21.832000 2573.500000 1.000000 10.149000 -0.468000 1.000000 0.200000 0.550000 0.000000 0.000000
25% 1.330000 2837.750000 NaN 43.197000 0.503623 1.000000 NaN 8.350000 0.281000 1.000000
50% 3.900000 2943.000000 NaN 64.085000 0.645226 2.000000 NaN 11.800000 0.530000 1.000000
75% 7.000000 3016.000000 NaN 78.555000 0.829000 2.000000 NaN 15.826000 0.771500 1.000000
max 19.312000 3160.500000 9.000000 361.150000 1.800000 2.000000 8.094000 84.400000 1.000000 1.000000

In [5]:
#Visualize the distribution of facies for each well
wells = training_data['Well Name'].unique()

fig, ax = plt.subplots(5,2, figsize=(20,20))
for i, well in enumerate(wells):
    row = i % ax.shape[0]
    column = i // ax.shape[0]
    counts = training_data[training_data['Well Name']==well].Facies.value_counts()
    data_for_well = [counts[j] if j in counts.index else 0 for j in range(1,10)]
    ax[row, column].bar(range(1,10), data_for_well, align='center')
    ax[row, column].set_title("{well}".format(well=well))
    ax[row, column].set_ylabel("Counts")
    ax[row, column].set_xticks(range(1,10))

plt.show()



In [6]:
plt.figure(figsize=(10,10))
sns.heatmap(training_data.drop(['Formation', 'Well Name'], axis=1).corr())


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x10ad18080>

Feature Engineering

Here we do a few things to clean the data and create new features for our model to consume.

First, we smooth the PE and GR logs with a rolling mean. Second, we replace missing PE values with the mean PE of the corresponding facies (other imputation methods might be worth investigating). Last, we encode the formation names as integer values.


In [7]:
#Mean PE per facies, used below to impute missing PE values
avg_PE_facies = training_data[['Facies', 'PE']].groupby('Facies').mean()
avg_PE_facies = avg_PE_facies.to_dict()
all_data['PE2'] = all_data.Facies.map(avg_PE_facies['PE'])

In [9]:
dfs = []
for well in all_data['Well Name'].unique():
    df = all_data[all_data['Well Name']==well].copy(deep=True)
    df.sort_values('Depth', inplace=True)
    #Smooth PE and GR with a 10-sample rolling mean, computed per well
    for col in ['PE', 'GR']:
        smooth_col = 'smooth_'+col
        df[smooth_col] = df[col].rolling(window=10).mean()
        df[smooth_col] = df[smooth_col].ffill().bfill()
    dfs.append(df)
all_data = pd.concat(dfs)
#Impute missing PE (raw and smoothed) with the per-facies mean computed above
all_data['PE'] = all_data.PE.fillna(all_data.PE2)
all_data['smooth_PE'] = all_data.smooth_PE.fillna(all_data.PE2)
#Encode formation names as integers, in order of first appearance
formation_encoder = dict(zip(all_data.Formation.unique(), range(len(all_data.Formation.unique()))))
all_data['enc_formation'] = all_data.Formation.map(formation_encoder)
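As an aside, scikit-learn's LabelEncoder gives an equivalent integer encoding, though it assigns codes in sorted-label order rather than order of first appearance. A minimal sketch, not used below (enc_alt is a throwaway name):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
#Alternative encoding; codes follow sorted label order
enc_alt = le.fit_transform(all_data.Formation)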

In [10]:
def to_binary_vec(value, vec_length):
    #One-hot encode an integer as a binary indicator vector
    vec = np.zeros(vec_length)
    vec[value] = 1
    return vec
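This helper is never called in the rest of the notebook, but as a hypothetical illustration it could expand the encoded formations into one-hot features (formation_one_hot is a throwaway name):

n_formations = len(formation_encoder)
#Hypothetical usage: one row of 0/1 indicators per sample
formation_one_hot = np.array([to_binary_vec(int(v), n_formations)
                              for v in all_data['enc_formation']])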

In [12]:
dfs = list()
for well in all_data['Well Name'].unique():
    tmp_df = all_data[all_data['Well Name'] == well].copy(deep=True)
    tmp_df.sort_values('Depth', inplace=True)
    #Ratio of each log to its value 4 samples up-hole (the '3prev_' prefix is historical)
    for feature in ['Depth', 'ILD_log10', 'DeltaPHI', 'PHIND', 'smooth_PE', 'smooth_GR']:
        tmp_df['3prev_'+feature] = tmp_df[feature] / tmp_df[feature].shift(4)
        #tmp_df['2prev_'+feature] = tmp_df[feature] / tmp_df[feature].shift(-1)

        tmp_df['3prev_'+feature] = tmp_df['3prev_'+feature].bfill()
        #tmp_df['2prev_'+feature].fillna(method='ffill', inplace=True)

        tmp_df['3prev_'+feature] = tmp_df['3prev_'+feature].replace([np.inf, -np.inf], 0)
        #tmp_df['2prev_'+feature].replace([np.inf, -np.inf], 0, inplace=True)

    #Formation codes from 3 and 2 samples up-hole
    tmp_df['3prev_enc'] = tmp_df['enc_formation'].shift(3).bfill()
    tmp_df['2prev_enc'] = tmp_df['enc_formation'].shift(2).bfill()
    dfs.append(tmp_df)
all_data = pd.concat(dfs)

In [13]:
all_data.columns


Out[13]:
Index(['DeltaPHI', 'Depth', 'Facies', 'Formation', 'GR', 'ILD_log10', 'NM_M',
       'PE', 'PHIND', 'RELPOS', 'Well Name', 'train', 'PE2', 'smooth_PE',
       'smooth_GR', 'enc_formation', '3prev_Depth', '3prev_ILD_log10',
       '3prev_DeltaPHI', '3prev_PHIND', '3prev_smooth_PE', '3prev_smooth_GR',
       '3prev_enc', '2prev_enc'],
      dtype='object')

In [14]:
#Let's build a model
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics, model_selection
from classification_utilities import display_cm

In [25]:
#Leave-one-well-out cross-validation: hold out each well in turn and report its F1 score
estimators=200
learning_rate=.01
random_state=0
facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS',
                 'WS', 'D','PS', 'BS']

title_length = 20 

training_data = all_data[all_data.train==1]
scores = list()

wells = training_data['Well Name'].unique()
for well in wells:
    blind = training_data[training_data['Well Name']==well]
    train = training_data[(training_data['Well Name']!=well)]
    
    train_X = train.drop(['Formation', 'Well Name', 'Facies', 'Depth', 'PE2', 'train'], axis=1)
    train_Y = train.Facies.values
    test_X = blind.drop(['Formation', 'Well Name', 'Facies', 'Depth', 'PE2', 'train'], axis=1)
    test_Y = blind.Facies.values
    
    clf = AdaBoostClassifier(RandomForestClassifier(n_estimators=estimators),
                             n_estimators=estimators, learning_rate=learning_rate,
                             random_state=random_state, algorithm='SAMME.R')
    
    clf.fit(train_X,train_Y)
    print(clf.feature_importances_)
    pred_Y = clf.predict(test_X)
    f1 = metrics.f1_score(test_Y, pred_Y, average='micro')
    scores.append(f1)
    print("*"*title_length)
    print("{well}={f1:.4f}".format(well=well,f1=f1))
    print("*"*title_length)
print("Avg F1: {score}".format(score=sum(scores)/len(scores)))


[ 0.04412707  0.06924824  0.06219999  0.09407778  0.1291804   0.06575857
  0.05279185  0.11209138  0.05450196  0.02639228  0.06552353  0.03295724
  0.03070323  0.03120985  0.04852163  0.03647824  0.02310799  0.02112877]
********************
SHRIMPLIN=0.5520
********************
[ 0.04817865  0.07717444  0.07172197  0.10003113  0.08962734  0.07149685
  0.05979758  0.07565908  0.05747789  0.03187165  0.07222296  0.03694864
  0.032208    0.03469485  0.04778764  0.03600791  0.02877287  0.02832056]
********************
ALEXANDER D=0.7918
********************
[ 0.04604163  0.0698863   0.06681954  0.10304146  0.11700358  0.06573775
  0.05777647  0.10074734  0.0525314   0.02354127  0.07073205  0.03330401
  0.03128147  0.03142841  0.0465587   0.03633147  0.02317155  0.0240656 ]
********************
SHANKLE=0.4699
********************
[ 0.04490513  0.07091263  0.06105166  0.09318855  0.12289318  0.06532766
  0.05501392  0.10429908  0.05518794  0.02680151  0.079507    0.03280856
  0.02858424  0.03063302  0.04630504  0.03387799  0.02533666  0.02336624]
********************
LUKE G U=0.4967
********************
[ 0.04497652  0.08311817  0.07326976  0.09812093  0.08805989  0.0697546
  0.0605411   0.07646661  0.05962708  0.02874062  0.07097299  0.0393633
  0.0316203   0.03586572  0.04630896  0.03888944  0.02714416  0.02715986]
********************
KIMZEY A=0.7130
********************
[ 0.04650795  0.07647087  0.05729825  0.10904619  0.11778947  0.07194021
  0.05659809  0.10387643  0.04956465  0.02630625  0.06507165  0.03260924
  0.02850273  0.03228639  0.04394322  0.03361187  0.02398189  0.02459464]
********************
CROSS H CATTLE=0.3752
********************
[ 0.04666394  0.07242038  0.06785402  0.09962659  0.11739978  0.0668865
  0.05107436  0.10661038  0.04944     0.02676269  0.07006726  0.03343632
  0.0297034   0.03210379  0.04791647  0.03332361  0.0240332   0.02467731]
********************
NOLAN=0.5133
********************
[ 0.04431871  0.07636515  0.06450561  0.1016622   0.11117008  0.07040393
  0.0559545   0.0944977   0.05241118  0.02692014  0.06855894  0.03515734
  0.03169158  0.03292116  0.04706353  0.03532533  0.02552328  0.02554964]
********************
Recruit F9=0.7125
********************
[ 0.0428206   0.07503308  0.06217271  0.09773576  0.11733915  0.06756176
  0.05278412  0.10422784  0.05162514  0.0300889   0.0741217   0.03247683
  0.02910392  0.03172109  0.04613974  0.03358564  0.0253562   0.02610582]
********************
NEWBY=0.4471
********************
[ 0.04264951  0.07529773  0.06488272  0.10207635  0.11507261  0.06754021
  0.05160586  0.10135837  0.05296214  0.0266645   0.07246982  0.03422763
  0.03034796  0.03430729  0.04415651  0.03549216  0.02376133  0.02512732]
********************
CHURCHMAN BIBLE=0.5792
********************
Avg F1: 0.5650820589229345

In [26]:
train_X, test_X, train_Y, test_Y = model_selection.train_test_split(
    training_data.drop(['Formation', 'Well Name', 'Facies', 'Depth', 'PE2', 'train'], axis=1),
    training_data.Facies.values, test_size=.2)

In [27]:
print(train_X.shape)
print(train_Y.shape)
print(test_X.shape)
print(test_Y.shape)


(3319, 18)
(3319,)
(830, 18)
(830,)

In [29]:
clf = AdaBoostClassifier(RandomForestClassifier(n_estimators=estimators),
                         n_estimators=estimators, random_state=random_state,
                         learning_rate=learning_rate, algorithm='SAMME.R')
clf.fit(train_X, train_Y)
pred_Y = clf.predict(test_X)
cm = metrics.confusion_matrix(y_true=test_Y, y_pred=pred_Y)
display_cm(cm, facies_labels, display_metrics=True)


     Pred    SS  CSiS  FSiS  SiSh    MS    WS     D    PS    BS Total
     True
       SS    49    12     2     0     0     0     0     0     0    63
     CSiS     3   178    13     0     0     0     0     0     0   194
     FSiS     0    14   136     0     0     0     0     1     0   151
     SiSh     0     0     0    48     3     0     0     0     0    51
       MS     0     0     0     0    49    18     1     5     0    73
       WS     0     0     1     2     3   102     0     9     0   117
        D     0     0     0     0     0     0    19     2     2    23
       PS     0     2     1     1     0    12     1   102     0   119
       BS     0     0     0     0     0     1     0     4    34    39

Precision  0.94  0.86  0.89  0.94  0.89  0.77  0.90  0.83  0.94  0.87
   Recall  0.78  0.92  0.90  0.94  0.67  0.87  0.83  0.86  0.87  0.86
       F1  0.85  0.89  0.89  0.94  0.77  0.82  0.86  0.84  0.91  0.86
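As a quick cross-check of the table above, scikit-learn can compute the same per-class precision, recall, and F1 directly from the held-out predictions:

print(metrics.classification_report(test_Y, pred_Y, target_names=facies_labels))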

In [23]:
#Copy so the Facies assignment below doesn't trigger a chained-assignment warning
validation_data = all_data[all_data.train==0].copy()

In [32]:
validation_data.describe()


Out[32]:
DeltaPHI Depth Facies GR ILD_log10 NM_M PE PHIND RELPOS train ... smooth_GR enc_formation 3prev_Depth 3prev_ILD_log10 3prev_DeltaPHI 3prev_PHIND 3prev_smooth_PE 3prev_smooth_GR 3prev_enc 2prev_enc
count 830.000000 830.000000 0.0 830.00000 830.000000 830.000000 830.000000 830.000000 830.000000 830.0 ... 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000
mean 2.851964 2987.070482 NaN 57.61173 0.666312 1.678313 3.654178 11.655277 0.535807 0.0 ... 57.335679 6.214458 1.000687 0.938413 -0.224295 1.144035 1.003713 1.034686 6.127711 6.156627
std 3.442074 94.391925 NaN 27.52774 0.288367 0.467405 0.649793 5.190236 0.283062 0.0 ... 22.822765 4.599762 0.000209 3.727498 18.569470 0.707483 0.091570 0.252184 4.593548 4.595802
min -8.900000 2808.000000 NaN 12.03600 -0.468000 1.000000 2.113000 1.855000 0.013000 0.0 ... 16.646700 0.000000 1.000633 -103.000000 -338.000000 0.205227 0.804704 0.494273 0.000000 0.000000
25% 0.411250 2911.625000 NaN 36.77325 0.541000 1.000000 3.171500 7.700000 0.300000 0.0 ... 38.718950 1.000000 1.000655 0.858326 0.105263 0.773897 0.943903 0.900490 1.000000 1.000000
50% 2.397500 2993.750000 NaN 58.34450 0.675000 2.000000 3.515500 10.950000 0.547500 0.0 ... 57.712300 5.000000 1.000669 0.979612 0.769673 1.000000 0.997295 1.009033 5.000000 5.000000
75% 4.600000 3055.375000 NaN 73.05150 0.850750 2.000000 4.191500 14.793750 0.778000 0.0 ... 71.845425 11.000000 1.000688 1.143782 1.445772 1.263789 1.044156 1.155654 11.000000 11.000000
max 16.500000 3160.500000 NaN 220.41300 1.507000 2.000000 6.321000 31.335000 1.000000 0.0 ... 151.488400 13.000000 1.003641 13.100000 73.000000 7.212500 1.294929 2.004543 13.000000 13.000000

8 rows × 22 columns


In [33]:
#Train on all labeled wells and predict facies for the blind wells
X = training_data.drop(['Formation', 'Well Name', 'Depth', 'Facies', 'train', 'PE2'], axis=1)
Y = training_data.Facies.values
test_X = validation_data.drop(['Formation', 'Well Name', 'Depth', 'train', 'PE2', 'Facies'], axis=1)

clf = AdaBoostClassifier(RandomForestClassifier(n_estimators=estimators),
                         n_estimators=estimators, learning_rate=learning_rate, random_state=0)
clf.fit(X, Y)
predicted_facies = clf.predict(test_X)
validation_data['Facies'] = predicted_facies

In [36]:
validation_data.to_csv("Kr1m_SEG_ML_Attempt2.csv", index=False)
