Facies classification utilizing an Adaptive Boosted Random Forest

Ryan Thielke

In the following, we provide a possible solution to the facies classification problem described at https://github.com/seg/2016-ml-contest.

Exploring the data


In [1]:
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

import sys
sys.path.append("..")

#Import standard pydata libs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
filename = '../facies_vectors.csv'
training_data = pd.read_csv(filename)
#training_data['Well Name'] = training_data['Well Name'].astype('category')
#training_data['Formation'] = training_data['Formation'].astype('category')
training_data['train'] = 1
training_data.describe()


Out[2]:
Facies Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS train
count 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 3232.000000 4149.000000 4149.000000 4149.0
mean 4.503254 2906.867438 64.933985 0.659566 4.402484 13.201066 3.725014 1.518438 0.521852 1.0
std 2.474324 133.300164 30.302530 0.252703 5.274947 7.132846 0.896152 0.499720 0.286644 0.0
min 1.000000 2573.500000 10.149000 -0.025949 -21.832000 0.550000 0.200000 1.000000 0.000000 1.0
25% 2.000000 2821.500000 44.730000 0.498000 1.600000 8.500000 NaN 1.000000 0.277000 1.0
50% 4.000000 2932.500000 64.990000 0.639000 4.300000 12.020000 NaN 2.000000 0.528000 1.0
75% 6.000000 3007.000000 79.438000 0.822000 7.500000 16.050000 NaN 2.000000 0.769000 1.0
max 9.000000 3138.000000 361.150000 1.800000 19.312000 84.400000 8.094000 2.000000 1.000000 1.0

In [3]:
validation_data = pd.read_csv("../validation_data_nofacies.csv")
#validation_data['Well Name'] = validation_data['Well Name'].astype('category')
#validation_data['Formation'] = validation_data['Formation'].astype('category')
validation_data['train'] = 0
validation_data.describe()


Out[3]:
Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS train
count 830.000000 830.00000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.0
mean 2987.070482 57.61173 0.666312 2.851964 11.655277 3.654178 1.678313 0.535807 0.0
std 94.391925 27.52774 0.288367 3.442074 5.190236 0.649793 0.467405 0.283062 0.0
min 2808.000000 12.03600 -0.468000 -8.900000 1.855000 2.113000 1.000000 0.013000 0.0
25% 2911.625000 36.77325 0.541000 0.411250 7.700000 3.171500 1.000000 0.300000 0.0
50% 2993.750000 58.34450 0.675000 2.397500 10.950000 3.515500 2.000000 0.547500 0.0
75% 3055.375000 73.05150 0.850750 4.600000 14.793750 4.191500 2.000000 0.778000 0.0
max 3160.500000 220.41300 1.507000 16.500000 31.335000 6.321000 2.000000 1.000000 0.0

In [4]:
# DataFrame.append was removed from pandas; concat replaces it (sort=True keeps the old alphabetical column order)
all_data = pd.concat([training_data, validation_data], sort=True)
all_data.describe()


Out[4]:
DeltaPHI Depth Facies GR ILD_log10 NM_M PE PHIND RELPOS train
count 4979.000000 4979.000000 4149.000000 4979.000000 4979.000000 4979.000000 4062.000000 4979.000000 4979.000000 4979.000000
mean 4.144012 2920.237297 4.503254 63.713364 0.660690 1.545089 3.710540 12.943383 0.524178 0.833300
std 5.049037 131.086857 2.474324 29.979746 0.258971 0.498013 0.852033 6.871145 0.286069 0.372745
min -21.832000 2573.500000 1.000000 10.149000 -0.468000 1.000000 0.200000 0.550000 0.000000 0.000000
25% 1.330000 2837.750000 NaN 43.197000 0.503623 1.000000 NaN 8.350000 0.281000 1.000000
50% 3.900000 2943.000000 NaN 64.085000 0.645226 2.000000 NaN 11.800000 0.530000 1.000000
75% 7.000000 3016.000000 NaN 78.555000 0.829000 2.000000 NaN 15.826000 0.771500 1.000000
max 19.312000 3160.500000 9.000000 361.150000 1.800000 2.000000 8.094000 84.400000 1.000000 1.000000

In [5]:
#Visualize the distribution of facies for each well
wells = training_data['Well Name'].unique()

fig, ax = plt.subplots(5,2, figsize=(20,20))
for i, well in enumerate(wells):
    row = i % ax.shape[0]
    column = i // ax.shape[0]
    counts = training_data[training_data['Well Name']==well].Facies.value_counts()
    data_for_well = [counts[j] if j in counts.index else 0 for j in range(1,10)]
    ax[row, column].bar(range(1,10), data_for_well, align='center')
    ax[row, column].set_title("{well}".format(well=well))
    ax[row, column].set_ylabel("Counts")
    ax[row, column].set_xticks(range(1,10))

plt.show()



In [6]:
plt.figure(figsize=(10,10))
sns.heatmap(training_data.drop(['Formation', 'Well Name'], axis=1).corr())


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a415518>

Feature Engineering

Here we will do a couple things to clean the data and attempt to create new features for our model to consume.

First, we smooth the PE log with a rolling mean. Second, we replace missing PE values with the mean of the entire dataset (other imputation methods might be worth investigating; one alternative is sketched below). Last, we encode the formations into integer values.
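As one example of an alternative worth investigating, a per-well median is robust to wells whose PE distribution differs from the rest of the field. A minimal sketch, assuming the all_data frame built above (the PE_alt column name is hypothetical and not part of the pipeline below):

pe = all_data['PE']
# median PE of the same well, broadcast back to each row
per_well_median = all_data.groupby('Well Name')['PE'].transform('median')
all_data['PE_alt'] = np.where(pe.isna(), per_well_median, pe)
# fall back to the global median for wells with no PE readings at all
all_data['PE_alt'] = all_data['PE_alt'].fillna(all_data['PE'].median())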


In [7]:
dfs = []
for well in all_data['Well Name'].unique():
    df = all_data[all_data['Well Name']==well].copy(deep=True)
    df.sort_values('Depth', inplace=True)
    for col in ['PE']:
        smooth_col = 'smooth_'+col
        # pd.rolling_mean was removed from pandas; Series.rolling is the replacement
        df[smooth_col] = df[col].rolling(window=25).mean()
        # the rolling window leaves NaNs at the top of each well; fill them in
        df[smooth_col] = df[smooth_col].ffill().bfill()
    dfs.append(df)
all_data = pd.concat(dfs)

all_data['PE'] = all_data.PE.fillna(all_data.PE.mean())
all_data['smooth_PE'] = all_data.smooth_PE.fillna(all_data.smooth_PE.mean())
formation_encoder = dict(zip(all_data.Formation.unique(), range(len(all_data.Formation.unique()))))
all_data['enc_formation'] = all_data.Formation.map(formation_encoder)
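For reference, scikit-learn's LabelEncoder performs the same Formation-to-integer mapping as the dict(zip(...)) above and can invert it later. A minimal equivalent sketch (enc_formation_alt is a hypothetical column name, not what the notebook uses):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# same idea as the dict-based mapping above, up to the ordering of the codes
all_data['enc_formation_alt'] = le.fit_transform(all_data['Formation'])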

In [8]:
all_data.columns


Out[8]:
Index(['DeltaPHI', 'Depth', 'Facies', 'Formation', 'GR', 'ILD_log10', 'NM_M',
       'PE', 'PHIND', 'RELPOS', 'Well Name', 'train', 'smooth_PE',
       'enc_formation'],
      dtype='object')

In [9]:
from sklearn import preprocessing

drop_cols = ['Well Name', 'train', 'Depth', 'Formation', 'enc_formation', 'Facies']
feature_names = all_data.drop(drop_cols, axis=1).columns
train_labels = all_data.train.tolist()
facies_labels = all_data.Facies.tolist()
well_names = all_data['Well Name'].tolist()
depths = all_data.Depth.tolist()

scaler = preprocessing.StandardScaler().fit(all_data.drop(drop_cols, axis=1))
scaled_features = scaler.transform(all_data.drop(drop_cols, axis=1))

scaled_df = pd.DataFrame(scaled_features, columns=feature_names)
scaled_df['train'] = train_labels
scaled_df['Facies'] = facies_labels
scaled_df['Well Name'] = well_names
scaled_df['Depth'] = depths

In [10]:
def to_binary_vec(value, vec_length):
    # one-hot encode an integer code into a vector of length vec_length+1
    vec = np.zeros(vec_length+1)
    vec[value] = 1
    return vec

categorical_vars = []

for i in all_data.enc_formation:
    vec = to_binary_vec(i, all_data.enc_formation.max())
    categorical_vars.append(vec)

categorical_vars = np.array(categorical_vars)

for i in range(categorical_vars.shape[1]):
    scaled_df['f'+str(i)] = categorical_vars[:,i]
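pandas can produce the same one-hot columns in a single call; a minimal sketch of the equivalent (column names chosen to match the f0..fN loop above):

# one call instead of the manual loop; yields columns f0, f1, ..., fN
dummies = pd.get_dummies(all_data.enc_formation, prefix='f', prefix_sep='')
# dummies has enc_formation.max()+1 columns, matching to_binary_vec above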

In [11]:
dfs = list()
for well in all_data['Well Name'].unique():
    tmp_df = all_data[all_data['Well Name'] == well].copy(deep=True)
    tmp_df.sort_values('Depth', inplace=True)
    for feature in ['PE', 'GR']:
        # ratio of each reading to the reading just above it ('3PE', '3GR')
        tmp_df['3'+feature] = tmp_df[feature] / tmp_df[feature].shift(1)
        tmp_df['3'+feature].fillna(0, inplace=True)
    dfs.append(tmp_df)
# Note: this rebuilds the frame from all_data, so the standard-scaled features
# and one-hot formation columns from the two previous cells are not carried
# into scaled_df; the model below trains on the unscaled logs plus the
# engineered columns.
scaled_df = pd.concat(dfs)

In [12]:
#Let's build a model
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed from scikit-learn
from classification_utilities import display_cm
import xgboost as xgb

In [13]:
#We will take a look at an F1 score for each well (leaving one well out in turn)
learning_rate=.01
facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS',
                 'WS', 'D','PS', 'BS']

title_length = 20

training_data = scaled_df[scaled_df.train==1]
scores = list()

wells = training_data['Well Name'].unique()
for well in wells:
    blind = training_data[training_data['Well Name']==well]
    train = training_data[(training_data['Well Name']!=well)]
    
    train_X = train.drop(['Well Name', 'Facies', 'Depth', 'train', 'Formation'], axis=1)
    train_Y = train.Facies.values
    test_X = blind.drop(['Well Name', 'Facies', 'Depth', 'train', 'Formation'], axis=1)
    test_Y = blind.Facies.values
    
    gcf = xgb.XGBClassifier(n_estimators=2000, learning_rate=learning_rate)
    gcf.fit(train_X,train_Y)
    pred_Y = gcf.predict(test_X)
    f1 = metrics.f1_score(test_Y, pred_Y, average='micro')
    scores.append(f1)
    print("*"*title_length)
    print("{well}={f1:.4f}".format(well=well,f1=f1))
    print("*"*title_length)
print("Avg F1: {score}".format(score=sum(scores)/len(scores)))


********************
SHRIMPLIN=0.5945
********************
********************
ALEXANDER D=0.6116
********************
********************
SHANKLE=0.4855
********************
********************
LUKE G U=0.6204
********************
********************
KIMZEY A=0.5672
********************
********************
CROSS H CATTLE=0.3952
********************
********************
NOLAN=0.4843
********************
********************
Recruit F9=0.8250
********************
********************
NEWBY=0.4600
********************
********************
CHURCHMAN BIBLE=0.5322
********************
Avg F1: 0.5575948176091869
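The loop above is a hand-rolled leave-one-well-out cross-validation. For reference, scikit-learn's LeaveOneGroupOut expresses the same split; a minimal sketch assuming the same training_data frame (illustrative only, not what the notebook actually runs):

from sklearn.model_selection import LeaveOneGroupOut

X = training_data.drop(['Well Name', 'Facies', 'Depth', 'train', 'Formation'], axis=1)
y = training_data.Facies.values
groups = training_data['Well Name'].values

# each iteration holds out exactly one well, as in the loop above
for train_idx, test_idx in LeaveOneGroupOut().split(X, y, groups=groups):
    clf = xgb.XGBClassifier(n_estimators=2000, learning_rate=learning_rate)
    clf.fit(X.iloc[train_idx], y[train_idx])
    print(metrics.f1_score(y[test_idx], clf.predict(X.iloc[test_idx]), average='micro'))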

In [14]:
train_X, test_X, train_Y, test_Y = train_test_split(
    training_data.drop(['Well Name', 'Facies', 'Depth', 'train', 'Formation'], axis=1),
    training_data.Facies.values, test_size=.2)

In [15]:
print(train_X.shape)
print(train_Y.shape)
print(test_X.shape)
print(test_Y.shape)


(3319, 11)
(3319,)
(830, 11)
(830,)

In [16]:
gcf = xgb.XGBClassifier(n_estimators=2000, learning_rate=learning_rate)
gcf.fit(train_X,train_Y)
pred_Y = gcf.predict(test_X)
cm = metrics.confusion_matrix(y_true=test_Y, y_pred=pred_Y)
display_cm(cm, facies_labels, display_metrics=True)


     Pred    SS  CSiS  FSiS  SiSh    MS    WS     D    PS    BS Total
     True
       SS    40    19     2     0     0     0     0     0     0    61
     CSiS    13   158    28     0     0     0     0     1     0   200
     FSiS     0    30   108     0     1     1     0     0     0   140
     SiSh     0     0     2    48     4    11     2     2     0    69
       MS     0     4     0     4    15    14     3    11     0    51
       WS     0     0     0     6    16    62     3    23     0   110
        D     0     0     0     1     0     4    16     8     0    29
       PS     1     1     2     3     2    19     0   103     3   134
       BS     0     0     0     0     0     3     1     4    28    36

Precision  0.74  0.75  0.76  0.77  0.39  0.54  0.64  0.68  0.90  0.69
   Recall  0.66  0.79  0.77  0.70  0.29  0.56  0.55  0.77  0.78  0.70
       F1  0.70  0.77  0.77  0.73  0.34  0.55  0.59  0.72  0.84  0.69
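As a cross-check on the table above, metrics.classification_report prints the same per-class precision/recall/F1 in one call (this assumes all nine facies appear in the hold-out split, as the matrix shows they do):

# per-class precision/recall/F1 for the same hold-out predictions
print(metrics.classification_report(test_Y, pred_Y, target_names=facies_labels))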

In [17]:
validation_data = scaled_df[scaled_df.train==0].copy()  # copy so the Facies assignment below does not hit a view

In [ ]:
validation_data.describe()


Out[ ]:
DeltaPHI Depth Facies GR ILD_log10 NM_M PE PHIND RELPOS train smooth_PE enc_formation 3PE 3GR
count 830.000000 830.000000 0.0 830.00000 830.000000 830.000000 830.000000 830.000000 830.000000 830.0 830.000000 830.000000 830.000000 830.000000
mean 2.851964 2987.070482 NaN 57.61173 0.666312 1.678313 3.654178 11.655277 0.535807 0.0 3.640066 6.214458 0.999123 1.006279
std 3.442074 94.391925 NaN 27.52774 0.288367 0.467405 0.649793 5.190236 0.283062 0.0 0.373720 4.599762 0.076272 0.134799
min -8.900000 2808.000000 NaN 12.03600 -0.468000 1.000000 2.113000 1.855000 0.013000 0.0 2.783080 0.000000 0.000000 0.000000
25% 0.411250 2911.625000 NaN 36.77325 0.541000 1.000000 3.171500 7.700000 0.300000 0.0 3.386970 1.000000 0.973670 0.949549
50% 2.397500 2993.750000 NaN 58.34450 0.675000 2.000000 3.515500 10.950000 0.547500 0.0 3.631880 5.000000 0.996059 1.007291
75% 4.600000 3055.375000 NaN 73.05150 0.850750 2.000000 4.191500 14.793750 0.778000 0.0 3.846550 11.000000 1.020540 1.062497
max 16.500000 3160.500000 NaN 220.41300 1.507000 2.000000 6.321000 31.335000 1.000000 0.0 4.696200 13.000000 1.446762 1.563297

In [ ]:
X = training_data.drop(['Well Name', 'Facies', 'Depth', 'train', 'Formation'], axis=1)
Y = training_data.Facies.values
test_X = validation_data.drop(['Well Name', 'Facies', 'Depth', 'train', 'Formation'], axis=1)

gcf = xgb.XGBClassifier(n_estimators=2000, learning_rate=learning_rate)
gcf.fit(X,Y)
pred_Y = gcf.predict(test_X)

validation_data['Facies'] = pred_Y

In [ ]:
validation_data.to_csv("Kr1m_SEG_ML_Attempt3.csv", index=False)
