Facies classification utilizing an Adaptive Boosted Random Forest

Ryan Thielke

In the following, we provide a possible solution to the facies classification problem described at https://github.com/seg/2016-ml-contest.

Exploring the data


In [1]:
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

import sys
sys.path.append("..")

#Import standard pydata libs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
filename = '../facies_vectors.csv'
training_data = pd.read_csv(filename)
#training_data['Well Name'] = training_data['Well Name'].astype('category')
#training_data['Formation'] = training_data['Formation'].astype('category')
training_data['train'] = 1
training_data.describe()


Out[2]:
Facies Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS train
count 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 3232.000000 4149.000000 4149.000000 4149.0
mean 4.503254 2906.867438 64.933985 0.659566 4.402484 13.201066 3.725014 1.518438 0.521852 1.0
std 2.474324 133.300164 30.302530 0.252703 5.274947 7.132846 0.896152 0.499720 0.286644 0.0
min 1.000000 2573.500000 10.149000 -0.025949 -21.832000 0.550000 0.200000 1.000000 0.000000 1.0
25% 2.000000 2821.500000 44.730000 0.498000 1.600000 8.500000 NaN 1.000000 0.277000 1.0
50% 4.000000 2932.500000 64.990000 0.639000 4.300000 12.020000 NaN 2.000000 0.528000 1.0
75% 6.000000 3007.000000 79.438000 0.822000 7.500000 16.050000 NaN 2.000000 0.769000 1.0
max 9.000000 3138.000000 361.150000 1.800000 19.312000 84.400000 8.094000 2.000000 1.000000 1.0

In [3]:
validation_data = pd.read_csv("../validation_data_nofacies.csv")
#validation_data['Well Name'] = validation_data['Well Name'].astype('category')
#validation_data['Formation'] = validation_data['Formation'].astype('category')
validation_data['train'] = 0
validation_data.describe()


Out[3]:
Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS train
count 830.000000 830.00000 830.000000 830.000000 830.000000 830.000000 830.000000 830.000000 830.0
mean 2987.070482 57.61173 0.666312 2.851964 11.655277 3.654178 1.678313 0.535807 0.0
std 94.391925 27.52774 0.288367 3.442074 5.190236 0.649793 0.467405 0.283062 0.0
min 2808.000000 12.03600 -0.468000 -8.900000 1.855000 2.113000 1.000000 0.013000 0.0
25% 2911.625000 36.77325 0.541000 0.411250 7.700000 3.171500 1.000000 0.300000 0.0
50% 2993.750000 58.34450 0.675000 2.397500 10.950000 3.515500 2.000000 0.547500 0.0
75% 3055.375000 73.05150 0.850750 4.600000 14.793750 4.191500 2.000000 0.778000 0.0
max 3160.500000 220.41300 1.507000 16.500000 31.335000 6.321000 2.000000 1.000000 0.0

In [4]:
# DataFrame.append was removed from pandas; concat replaces it (sort=True keeps the old alphabetical column order)
all_data = pd.concat([training_data, validation_data], sort=True)
all_data.describe()


Out[4]:
DeltaPHI Depth Facies GR ILD_log10 NM_M PE PHIND RELPOS train
count 4979.000000 4979.000000 4149.000000 4979.000000 4979.000000 4979.000000 4062.000000 4979.000000 4979.000000 4979.000000
mean 4.144012 2920.237297 4.503254 63.713364 0.660690 1.545089 3.710540 12.943383 0.524178 0.833300
std 5.049037 131.086857 2.474324 29.979746 0.258971 0.498013 0.852033 6.871145 0.286069 0.372745
min -21.832000 2573.500000 1.000000 10.149000 -0.468000 1.000000 0.200000 0.550000 0.000000 0.000000
25% 1.330000 2837.750000 NaN 43.197000 0.503623 1.000000 NaN 8.350000 0.281000 1.000000
50% 3.900000 2943.000000 NaN 64.085000 0.645226 2.000000 NaN 11.800000 0.530000 1.000000
75% 7.000000 3016.000000 NaN 78.555000 0.829000 2.000000 NaN 15.826000 0.771500 1.000000
max 19.312000 3160.500000 9.000000 361.150000 1.800000 2.000000 8.094000 84.400000 1.000000 1.000000

In [5]:
#Visualize the distribution of facies for each well
wells = training_data['Well Name'].unique()

fig, ax = plt.subplots(5,2, figsize=(20,20))
for i, well in enumerate(wells):
    row = i % ax.shape[0]
    column = i // ax.shape[0]
    counts = training_data[training_data['Well Name']==well].Facies.value_counts()
    data_for_well = [counts[j] if j in counts.index else 0 for j in range(1,10)]
    ax[row, column].bar(range(1,10), data_for_well, align='center')
    ax[row, column].set_title("{well}".format(well=well))
    ax[row, column].set_ylabel("Counts")
    ax[row, column].set_xticks(range(1,10))

plt.show()



In [6]:
plt.figure(figsize=(10,10))
sns.heatmap(training_data.drop(['Formation', 'Well Name'], axis=1).corr())


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a415518>

Feature Engineering

Here we will do a couple things to clean the data and attempt to create new features for our model to consume.

First, we smooth the PE log with a rolling mean. Second, we replace missing PE values with the mean of the entire dataset (other imputation methods might be worth investigating; one alternative is sketched below). Last, we encode the formations into integer values.
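As one example of an alternative worth investigating, a per-well median is robust to wells whose PE distribution differs from the rest of the field. A minimal sketch, assuming the all_data frame built above (the PE_alt column name is hypothetical and not part of the pipeline below):

pe = all_data['PE']
# median PE of the same well, broadcast back to each row
per_well_median = all_data.groupby('Well Name')['PE'].transform('median')
all_data['PE_alt'] = np.where(pe.isna(), per_well_median, pe)
# fall back to the global median for wells with no PE readings at all
all_data['PE_alt'] = all_data['PE_alt'].fillna(all_data['PE'].median())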


In [7]:
dfs = []
for well in all_data['Well Name'].unique():
    df = all_data[all_data['Well Name']==well].copy(deep=True)
    df.sort_values('Depth', inplace=True)
    for col in ['PE']:
        smooth_col = 'smooth_'+col
        # pd.rolling_mean was removed from pandas; Series.rolling is the replacement
        df[smooth_col] = df[col].rolling(window=25).mean()
        # the rolling window leaves NaNs at the top of each well; fill them in
        df[smooth_col] = df[smooth_col].ffill().bfill()
    dfs.append(df)
all_data = pd.concat(dfs)

all_data['PE'] = all_data.PE.fillna(all_data.PE.mean())
all_data['smooth_PE'] = all_data.smooth_PE.fillna(all_data.smooth_PE.mean())
formation_encoder = dict(zip(all_data.Formation.unique(), range(len(all_data.Formation.unique()))))
all_data['enc_formation'] = all_data.Formation.map(formation_encoder)
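For reference, scikit-learn's LabelEncoder performs the same Formation-to-integer mapping as the dict(zip(...)) above and can invert it later. A minimal equivalent sketch (enc_formation_alt is a hypothetical column name, not what the notebook uses):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# same idea as the dict-based mapping above, up to the ordering of the codes
all_data['enc_formation_alt'] = le.fit_transform(all_data['Formation'])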

In [8]:
all_data.columns


Out[8]:
Index(['DeltaPHI', 'Depth', 'Facies', 'Formation', 'GR', 'ILD_log10', 'NM_M',
       'PE', 'PHIND', 'RELPOS', 'Well Name', 'train', 'smooth_PE',
       'enc_formation'],
      dtype='object')

In [9]:
from sklearn import preprocessing

drop_cols = ['Well Name', 'train', 'Depth', 'Formation', 'enc_formation', 'Facies']
feature_names = all_data.drop(drop_cols, axis=1).columns
train_labels = all_data.train.tolist()
facies_labels = all_data.Facies.tolist()
well_names = all_data['Well Name'].tolist()
depths = all_data.Depth.tolist()

scaler = preprocessing.StandardScaler().fit(all_data.drop(drop_cols, axis=1))
scaled_features = scaler.transform(all_data.drop(drop_cols, axis=1))

scaled_df = pd.DataFrame(scaled_features, columns=feature_names)
scaled_df['train'] = train_labels
scaled_df['Facies'] = facies_labels
scaled_df['Well Name'] = well_names
scaled_df['Depth'] = depths

In [10]:
def to_binary_vec(value, vec_length):
    # one-hot encode an integer code into a vector of length vec_length+1
    vec = np.zeros(vec_length+1)
    vec[value] = 1
    return vec

categorical_vars = []

for i in all_data.enc_formation:
    vec = to_binary_vec(i, all_data.enc_formation.max())
    categorical_vars.append(vec)

categorical_vars = np.array(categorical_vars)

for i in range(categorical_vars.shape[1]):
    scaled_df['f'+str(i)] = categorical_vars[:,i]
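pandas can produce the same one-hot columns in a single call; a minimal sketch of the equivalent (column names chosen to match the f0..fN loop above):

# one call instead of the manual loop; yields columns f0, f1, ..., fN
dummies = pd.get_dummies(all_data.enc_formation, prefix='f', prefix_sep='')
# dummies has enc_formation.max()+1 columns, matching to_binary_vec above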

In [11]:
dfs = list()
for well in all_data['Well Name'].unique():
    tmp_df = all_data[all_data['Well Name'] == well].copy(deep=True)
    tmp_df.sort_values('Depth', inplace=True)
    for feature in ['PE', 'GR']:
        # ratio of each reading to the reading just above it ('3PE', '3GR')
        tmp_df['3'+feature] = tmp_df[feature] / tmp_df[feature].shift(1)
        tmp_df['3'+feature].fillna(0, inplace=True)
    dfs.append(tmp_df)
# Note: this rebuilds the frame from all_data, so the standard-scaled features
# and one-hot formation columns from the two previous cells are not carried
# into scaled_df; the model below trains on the unscaled logs plus the
# engineered columns.
scaled_df = pd.concat(dfs)

In [12]:
#Let's build a model
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed from scikit-learn
from classification_utilities import display_cm
import xgboost as xgb

In [13]:
#We will take a look at an F1 score for each well (leaving one well out in turn)
learning_rate=.01
facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS',
                 'WS', 'D','PS', 'BS']

title_length = 20

training_data = scaled_df[scaled_df.train==1]
scores = list()

wells = training_data['Well Name'].unique()
for well in wells:
    blind = training_data[training_data['Well Name']==well]
    train = training_data[(training_data['Well Name']!=well)]
    
    train_X = train.drop(['Well Name', 'Facies', 'Depth', 'train', 'Formation'], axis=1)
    train_Y = train.Facies.values
    test_X = blind.drop(['Well Name', 'Facies', 'Depth', 'train', 'Formation'], axis=1)
    test_Y = blind.Facies.values
    
    gcf = xgb.XGBClassifier(n_estimators=2000, learning_rate=learning_rate)
    gcf.fit(train_X,train_Y)
    pred_Y = gcf.predict(test_X)
    f1 = metrics.f1_score(test_Y, pred_Y, average='micro')
    scores.append(f1)
    print("*"*title_length)
    print("{well}={f1:.4f}".format(well=well,f1=f1))
    print("*"*title_length)
print("Avg F1: {score}".format(score=sum(scores)/len(scores)))


********************
SHRIMPLIN=0.5945
********************
********************
ALEXANDER D=0.6116
********************
********************
SHANKLE=0.4855
********************
********************
LUKE G U=0.6204
********************
********************
KIMZEY A=0.5672
********************
********************
CROSS H CATTLE=0.3952
********************
********************
NOLAN=0.4843
********************
********************
Recruit F9=0.8250
********************
********************
NEWBY=0.4600
********************
********************
CHURCHMAN BIBLE=0.5322
********************
Avg F1: 0.5575948176091869
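The loop above is a hand-rolled leave-one-well-out cross-validation. For reference, scikit-learn's LeaveOneGroupOut expresses the same split; a minimal sketch assuming the same training_data frame (illustrative only, not what the notebook actually runs):

from sklearn.model_selection import LeaveOneGroupOut

X = training_data.drop(['Well Name', 'Facies', 'Depth', 'train', 'Formation'], axis=1)
y = training_data.Facies.values
groups = training_data['Well Name'].values

# each iteration holds out exactly one well, as in the loop above
for train_idx, test_idx in LeaveOneGroupOut().split(X, y, groups=groups):
    clf = xgb.XGBClassifier(n_estimators=2000, learning_rate=learning_rate)
    clf.fit(X.iloc[train_idx], y[train_idx])
    print(metrics.f1_score(y[test_idx], clf.predict(X.iloc[test_idx]), average='micro'))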

In [14]:
train_X, test_X, train_Y, test_Y = train_test_split(
    training_data.drop(['Well Name', 'Facies', 'Depth', 'train', 'Formation'], axis=1),
    training_data.Facies.values, test_size=.2)

In [15]:
print(train_X.shape)
print(train_Y.shape)
print(test_X.shape)
print(test_Y.shape)


(3319, 11)
(3319,)
(830, 11)
(830,)

In [16]:
gcf = xgb.XGBClassifier(n_estimators=2000, learning_rate=learning_rate)
gcf.fit(train_X,train_Y)
pred_Y = gcf.predict(test_X)
cm = metrics.confusion_matrix(y_true=test_Y, y_pred=pred_Y)
display_cm(cm, facies_labels, display_metrics=True)


     Pred    SS  CSiS  FSiS  SiSh    MS    WS     D    PS    BS Total
     True
       SS    40    19     2     0     0     0     0     0     0    61
     CSiS    13   158    28     0     0     0     0     1     0   200
     FSiS     0    30   108     0     1     1     0     0     0   140
     SiSh     0     0     2    48     4    11     2     2     0    69
       MS     0     4     0     4    15    14     3    11     0    51
       WS     0     0     0     6    16    62     3    23     0   110
        D     0     0     0     1     0     4    16     8     0    29
       PS     1     1     2     3     2    19     0   103     3   134
       BS     0     0     0     0     0     3     1     4    28    36

Precision  0.74  0.75  0.76  0.77  0.39  0.54  0.64  0.68  0.90  0.69
   Recall  0.66  0.79  0.77  0.70  0.29  0.56  0.55  0.77  0.78  0.70
       F1  0.70  0.77  0.77  0.73  0.34  0.55  0.59  0.72  0.84  0.69
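As a cross-check on the table above, metrics.classification_report prints the same per-class precision/recall/F1 in one call (this assumes all nine facies appear in the hold-out split, as the matrix shows they do):

# per-class precision/recall/F1 for the same hold-out predictions
print(metrics.classification_report(test_Y, pred_Y, target_names=facies_labels))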

In [17]:
validation_data = scaled_df[scaled_df.train==0].copy()  # copy so the Facies assignment below does not hit a view

In [ ]:
validation_data.describe()


Out[ ]:
DeltaPHI Depth Facies GR ILD_log10 NM_M PE PHIND RELPOS train smooth_PE enc_formation 3PE 3GR
count 830.000000 830.000000 0.0 830.00000 830.000000 830.000000 830.000000 830.000000 830.000000 830.0 830.000000 830.000000 830.000000 830.000000
mean 2.851964 2987.070482 NaN 57.61173 0.666312 1.678313 3.654178 11.655277 0.535807 0.0 3.640066 6.214458 0.999123 1.006279
std 3.442074 94.391925 NaN 27.52774 0.288367 0.467405 0.649793 5.190236 0.283062 0.0 0.373720 4.599762 0.076272 0.134799
min -8.900000 2808.000000 NaN 12.03600 -0.468000 1.000000 2.113000 1.855000 0.013000 0.0 2.783080 0.000000 0.000000 0.000000
25% 0.411250 2911.625000 NaN 36.77325 0.541000 1.000000 3.171500 7.700000 0.300000 0.0 3.386970 1.000000 0.973670 0.949549
50% 2.397500 2993.750000 NaN 58.34450 0.675000 2.000000 3.515500 10.950000 0.547500 0.0 3.631880 5.000000 0.996059 1.007291
75% 4.600000 3055.375000 NaN 73.05150 0.850750 2.000000 4.191500 14.793750 0.778000 0.0 3.846550 11.000000 1.020540 1.062497
max 16.500000 3160.500000 NaN 220.41300 1.507000 2.000000 6.321000 31.335000 1.000000 0.0 4.696200 13.000000 1.446762 1.563297

In [ ]:
X = training_data.drop(['Well Name', 'Facies', 'Depth', 'train', 'Formation'], axis=1)
Y = training_data.Facies.values
test_X = validation_data.drop(['Well Name', 'Facies', 'Depth', 'train', 'Formation'], axis=1)

gcf = xgb.XGBClassifier(n_estimators=2000, learning_rate=learning_rate)
gcf.fit(X,Y)
pred_Y = gcf.predict(test_X)

validation_data['Facies'] = pred_Y

In [ ]:
validation_data.to_csv("Kr1m_SEG_ML_Attempt3.csv", index=False)
