In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from pandas import set_option
set_option("display.max_rows", 10)

Load and pre-process data


In [2]:
from sklearn import preprocessing

filename = '../facies_vectors.csv'
train = pd.read_csv(filename)

# encode well name and formation features
le = preprocessing.LabelEncoder()
train["Well Name"] = le.fit_transform(train["Well Name"])
train["Formation"] = le.fit_transform(train["Formation"])

data_loaded = train.copy()

# cleanup memory
del train

data_loaded


Out[2]:
Facies Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS
0 3 1 9 2793.0 77.450 0.664 9.900 11.915 4.600 1 1.000
1 3 1 9 2793.5 78.260 0.661 14.200 12.565 4.100 1 0.979
2 3 1 9 2794.0 79.050 0.658 14.800 13.050 3.600 1 0.957
3 3 1 9 2794.5 86.100 0.655 13.900 13.115 3.500 1 0.936
4 3 1 9 2795.0 74.580 0.647 13.500 13.300 3.400 1 0.915
... ... ... ... ... ... ... ... ... ... ... ...
4144 5 12 1 3120.5 46.719 0.947 1.828 7.254 3.617 2 0.685
4145 5 12 1 3121.0 44.563 0.953 2.241 8.013 3.344 2 0.677
4146 5 12 1 3121.5 49.719 0.964 2.925 8.013 3.190 2 0.669
4147 5 12 1 3122.0 51.469 0.965 3.083 7.708 3.152 2 0.661
4148 5 12 1 3122.5 50.031 0.970 2.609 6.668 3.295 2 0.653

4149 rows × 11 columns
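One caveat: the same LabelEncoder instance is refit on Formation, so afterwards its classes_ only describe Formation. That does not matter for this workflow, but if the original names ever needed to be recovered, a sketch with one encoder per column (re-reading the same CSV, reusing the imports above) would look like this:

# sketch: separate encoders keep both mappings recoverable via inverse_transform
raw = pd.read_csv('../facies_vectors.csv')
well_le = preprocessing.LabelEncoder()
form_le = preprocessing.LabelEncoder()
raw["Well Name"] = well_le.fit_transform(raw["Well Name"])
raw["Formation"] = form_le.fit_transform(raw["Formation"])
print(well_le.inverse_transform(raw["Well Name"].unique()[:3]))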

Impute PE

First, I will impute PE by replacing missing values with the mean PE. Second, I will impute PE using a random forest regressor. To compare the two approaches, I will compute the average RMSE obtained when applying each method across all wells that have PE data, leaving each well out in turn as a test set.

Impute PE through mean substitution

To evaluate this approach, I will build a model for each well, holding that well's data out as the test set. I'll then compute the RMSE against the known PE values for that well, which gives an idea of how well the method performs.
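For reference, scikit-learn's LeaveOneGroupOut expresses the same well-by-well splitting; a minimal sketch (assuming scikit-learn >= 0.18, and not used in the cells below, which do the splitting by hand):

# sketch: leave-one-well-out splits via LeaveOneGroupOut
from sklearn.model_selection import LeaveOneGroupOut

d = data_loaded[data_loaded["PE"].notnull()]
groups = d["Well Name"].values

logo = LeaveOneGroupOut()
for train_idx, test_idx in logo.split(d, d["PE"], groups=groups):
    pass  # fit on d.iloc[train_idx], evaluate RMSE on d.iloc[test_idx]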


In [3]:
from sklearn import preprocessing

data = data_loaded.copy()

impPE_features = ['Facies', 'Formation', 'Well Name', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'NM_M', 'RELPOS']
rmse = []

for w in data["Well Name"].unique():
    wTrain = data[(data["PE"].notnull()) & (data["Well Name"] != w)]
    wTest = data[(data["PE"].notnull()) & (data["Well Name"] == w)]
    
    if wTest.shape[0] > 0:
        yTest = wTest["PE"].values
        
        # predict the training wells' mean PE for every test observation
        meanPE = wTrain["PE"].mean()
        
        rmse.append((((yTest - meanPE)**2).mean())**0.5)
        
print(rmse)
print("Average RMSE:" + str(sum(rmse)/len(rmse)))

# cleanup memory
del data


[1.0719311870589785, 0.9340796790153048, 0.730314894811232, 0.8247927443986321, 0.8897884058023374, 1.9594336281055948, 0.5272582045276141, 1.093879133032094]
Average RMSE:1.0039347345939735

Impute PE through random forest regression

Mean substitution for imputing PE has an expected RMSE of just over 1.00. Let's see if I can do better using a random forest regressor.


In [4]:
from sklearn.ensemble import RandomForestRegressor

data = data_loaded.copy()

impPE_features = ['Facies', 'Formation', 'Well Name', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'NM_M', 'RELPOS']
rf = RandomForestRegressor(max_features='sqrt', n_estimators=100, random_state=1)
rmse = []

for w in data["Well Name"].unique():
    wTrain = data[(data["PE"].notnull()) & (data["Well Name"] != w)]
    wTest = data[(data["PE"].notnull()) & (data["Well Name"] == w)]
    
    if wTest.shape[0] > 0:
        XTrain = wTrain[impPE_features].values
        yTrain = wTrain["PE"].values
        XTest = wTest[impPE_features].values
        yTest = wTest["PE"].values
        
        w_rf = rf.fit(XTrain, yTrain)
        
        predictedPE = w_rf.predict(XTest)
        rmse.append((((yTest - predictedPE)**2).mean())**0.5)
    
print(rmse)
print("Average RMSE:" + str(sum(rmse)/len(rmse)))

# cleanup memory
del data


[0.83396662046021119, 0.71399579328004692, 0.44573364937069315, 0.30723136728938832, 0.58276695902483289, 0.74381423836572802, 0.37445083087759934, 0.59868248927661949]
Average RMSE:0.575080243493

The random forest regressor gives an expected RMSE of about 0.575, so let's use it to impute the missing PE values!


In [5]:
data = data_loaded.copy()

rf_train = data[data['PE'].notnull()]
rf_test = data[data['PE'].isnull()]

xTrain = rf_train[impPE_features].values
yTrain = rf_train["PE"].values
xTest = rf_test[impPE_features].values

rf_fit = rf.fit(xTrain, yTrain)
predictedPE = rf_fit.predict(xTest)
data["PE"][data["PE"].isnull()] = predictedPE

data_imputed = data.copy()

# cleanup memory
del data

# output
data_imputed


Out[5]:
Facies Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS
0 3 1 9 2793.0 77.450 0.664 9.900 11.915 4.600 1 1.000
1 3 1 9 2793.5 78.260 0.661 14.200 12.565 4.100 1 0.979
2 3 1 9 2794.0 79.050 0.658 14.800 13.050 3.600 1 0.957
3 3 1 9 2794.5 86.100 0.655 13.900 13.115 3.500 1 0.936
4 3 1 9 2795.0 74.580 0.647 13.500 13.300 3.400 1 0.915
... ... ... ... ... ... ... ... ... ... ... ...
4144 5 12 1 3120.5 46.719 0.947 1.828 7.254 3.617 2 0.685
4145 5 12 1 3121.0 44.563 0.953 2.241 8.013 3.344 2 0.677
4146 5 12 1 3121.5 49.719 0.964 2.925 8.013 3.190 2 0.669
4147 5 12 1 3122.0 51.469 0.965 3.083 7.708 3.152 2 0.661
4148 5 12 1 3122.5 50.031 0.970 2.609 6.668 3.295 2 0.653

4149 rows × 11 columns

Now we have a full data set with no missing values!
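A quick sanity check (a sketch, not part of the original run) confirms the imputation left no gaps:

# sanity check (sketch): every PE value should now be populated
assert data_imputed["PE"].isnull().sum() == 0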

Feature engineering

Now I'm going to calculate the average value of each log feature on a per-facies basis and, for each observation, its deviation from each of those averages. For instance, I will calculate how far an observation's GR reading is from the average GR of the MS facies, the idea being that true MS samples will lie close to that average. I square the deviations so each new feature acts as a distance-like measure.
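For example, the GR_MS_SqDev column for a sample is just its squared GR distance from the mean GR of all MS samples; a one-off sketch (assuming facies code 5 corresponds to 'MS', consistent with the facies_labels list defined below):

# sketch of one engineered column, spelled out for a single feature/facies pair
ms_gr_mean = data_imputed.loc[data_imputed["Facies"] == 5, "GR"].mean()
gr_ms_sqdev = (data_imputed["GR"] - ms_gr_mean)**2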


In [6]:
facies_labels = ['SS','CSiS','FSiS','SiSh','MS','WS','D','PS','BS']

data = data_imputed.copy()

features = ["GR", "ILD_log10", "DeltaPHI", "PHIND", "PE"]
for f in features:
    # mean of the log feature within each facies (facies codes 1-9 follow the facies_labels order)
    facies_mean = data[f].groupby(data["Facies"]).mean()
    
    # squared deviation of every observation from each facies mean
    for i in range(0, len(facies_mean)):
        data[f + "_" + facies_labels[i] + "_SqDev"] = (data[f] - facies_mean.values[i])**2

data_fe = data.copy()

del data
data_fe


Out[6]:
Facies Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M ... PHIND_BS_SqDev PE_SS_SqDev PE_CSiS_SqDev PE_FSiS_SqDev PE_SiSh_SqDev PE_MS_SqDev PE_WS_SqDev PE_D_SqDev PE_PS_SqDev PE_BS_SqDev
0 3 1 9 2793.0 77.450 0.664 9.900 11.915 4.600 1 ... 0.789139 2.815165 1.837350 2.088203 0.707583 0.396523 0.177255 0.837727 0.020820 0.408791
1 3 1 9 2793.5 78.260 0.661 14.200 12.565 4.100 1 ... 0.056804 1.387319 0.731861 0.893141 0.116403 0.016822 0.006238 0.172453 0.126529 1.298159
2 3 1 9 2794.0 79.050 0.658 14.800 13.050 3.600 1 ... 0.060844 0.459474 0.126372 0.198080 0.025224 0.137121 0.335221 0.007178 0.732238 2.687527
3 3 1 9 2794.5 86.100 0.655 13.900 13.115 3.500 1 ... 0.097135 0.333905 0.065275 0.119067 0.066988 0.221181 0.461018 0.034124 0.913380 3.025400
4 3 1 9 2795.0 74.580 0.647 13.500 13.300 3.400 1 ... 0.246676 0.228336 0.024177 0.060055 0.128752 0.325241 0.606815 0.081069 1.114522 3.383274
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4144 5 12 1 3120.5 46.719 0.947 1.828 7.254 3.617 2 ... 30.795120 0.482810 0.138748 0.213501 0.020113 0.124820 0.315825 0.004587 0.703433 2.632077
4145 5 12 1 3121.0 44.563 0.953 2.241 8.013 3.344 2 ... 22.947311 0.177953 0.009898 0.035744 0.172076 0.392251 0.697197 0.116094 1.235897 3.592419
4146 5 12 1 3121.5 49.719 0.964 2.925 8.013 3.190 2 ... 22.947311 0.071741 0.002971 0.001229 0.323557 0.608867 0.978087 0.244753 1.602020 4.199908
4147 5 12 1 3122.0 51.469 0.965 3.083 7.708 3.152 2 ... 25.962440 0.052829 0.008558 0.000009 0.368231 0.669614 1.054694 0.283797 1.699658 4.357104
4148 5 12 1 3122.5 50.031 0.970 2.609 6.668 3.295 2 ... 37.642337 0.139014 0.002549 0.019617 0.215130 0.456029 0.781426 0.151886 1.347246 3.780566

4149 rows × 56 columns

Next I bring in Paolo Bestagini's routines, which include a small window of neighbouring samples to account for the spatial component in the log analysis, as well as gradient information with respect to depth.


In [7]:
# Feature windows concatenation function
def augment_features_window(X, N_neig):
    
    # Parameters
    N_row = X.shape[0]
    N_feat = X.shape[1]

    # Zero padding
    X = np.vstack((np.zeros((N_neig, N_feat)), X, (np.zeros((N_neig, N_feat)))))

    # Loop over windows
    X_aug = np.zeros((N_row, N_feat*(2*N_neig+1)))
    for r in np.arange(N_row)+N_neig:
        this_row = []
        for c in np.arange(-N_neig,N_neig+1):
            this_row = np.hstack((this_row, X[r+c]))
        X_aug[r-N_neig] = this_row

    return X_aug

# Feature gradient computation function
def augment_features_gradient(X, depth):
    
    # Compute features gradient
    d_diff = np.diff(depth).reshape((-1, 1))
    d_diff[d_diff==0] = 0.001
    X_diff = np.diff(X, axis=0)
    X_grad = X_diff / d_diff
        
    # Compensate for last missing value
    X_grad = np.concatenate((X_grad, np.zeros((1, X_grad.shape[1]))))
    
    return X_grad

# Feature augmentation function
def augment_features(X, well, depth, N_neig=1):
    
    # Augment features
    X_aug = np.zeros((X.shape[0], X.shape[1]*(N_neig*2+2)))
    for w in np.unique(well):
        w_idx = np.where(well == w)[0]
        X_aug_win = augment_features_window(X[w_idx, :], N_neig)
        X_aug_grad = augment_features_gradient(X[w_idx, :], depth[w_idx])
        X_aug[w_idx, :] = np.concatenate((X_aug_win, X_aug_grad), axis=1)
    
    # Find padded rows
    padded_rows = np.unique(np.where(X_aug[:, 0:7] == np.zeros((1, 7)))[0])
    
    return X_aug, padded_rows

Now I'll apply these routines to the data, augmenting the features.


In [8]:
data = data_fe.copy()

remFeatures = ["Facies", "Well Name", "Depth"]
x = list(data)
features = [f for f in x if f not in remFeatures]

X = data[features].values
y = data["Facies"].values

# Store well labels and depths
well = data['Well Name']
depth = data['Depth'].values

X_aug, padded_rows = augment_features(X, well.values, depth)
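Since the three dropped columns leave 53 input features and N_neig defaults to 1, the augmented matrix should have 53 * (2*1 + 2) = 212 columns; a quick check (sketch):

# quick check (sketch): 53 features x (window of 3 + gradient) = 212 columns
print(X_aug.shape)   # expect (4149, 212)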

Tuning and Cross-Validation


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
#from classification_utilities import display_cm, display_adj_cm

# 1) loops through wells - splitting data (current well held out as CV/test)
# 2) trains model (using all wells excluding current)
# 3) evaluates predictions against known values and adds f1-score to array
# 4) returns average f1-score (expected f1-score)
def cvTrain(X, y, well, params):
    rf = RandomForestClassifier(max_features=params['M'], n_estimators=params['N'], criterion='entropy', 
                                min_samples_split=params['S'], min_samples_leaf=params['L'], random_state=1)
    f1 = []
    
    for w in well.unique():
        Xtrain_w = X[well.values != w]
        ytrain_w = y[well.values != w]
        Xtest_w = X[well.values == w]
        ytest_w = y[well.values == w]
        
        w_rf = rf.fit(Xtrain_w, ytrain_w)
        predictedFacies = w_rf.predict(Xtest_w)
        f1.append(f1_score(ytest_w, predictedFacies, average='micro'))
        
    f1 = (sum(f1)/len(f1))
    return f1

Apply tuning to search for optimal hyperparameters.


In [10]:
# parameter search grid (the commented-out values give the full grid search that
# produced the results below - uncomment them to rerun it, but it takes a long time)
N_grid = [250]    #[50, 250, 500]        # n_estimators
M_grid = [75]     #[25, 50, 75]          # max_features
S_grid = [5]      #[5, 10]               # min_samples_split
L_grid = [2]      #[2, 3, 5]             # min_samples_leaf

# build grid of hyperparameters
param_grid = []
for N in N_grid:
    for M in M_grid:
        for S in S_grid:
            for L in L_grid:
                param_grid.append({'N':N, 'M':M, 'S':S, 'L':L})
                
# loop through parameters and cross-validate models for each
for params in param_grid:
    print(str(params) + ' Average F1-score: ' + str(cvTrain(X_aug, y, well, params)))


{'S': 5, 'M': 25, 'N': 50, 'L': 2} Average F1-score: 0.553127254948
{'S': 5, 'M': 25, 'N': 50, 'L': 3} Average F1-score: 0.561371416298
{'S': 5, 'M': 25, 'N': 50, 'L': 5} Average F1-score: 0.560944584925
{'S': 10, 'M': 25, 'N': 50, 'L': 2} Average F1-score: 0.558187011905
{'S': 10, 'M': 25, 'N': 50, 'L': 3} Average F1-score: 0.560214847478
{'S': 10, 'M': 25, 'N': 50, 'L': 5} Average F1-score: 0.560944584925
{'S': 5, 'M': 50, 'N': 50, 'L': 2} Average F1-score: 0.573159852049
{'S': 5, 'M': 50, 'N': 50, 'L': 3} Average F1-score: 0.566370422787
{'S': 5, 'M': 50, 'N': 50, 'L': 5} Average F1-score: 0.564303469716
{'S': 10, 'M': 50, 'N': 50, 'L': 2} Average F1-score: 0.570106723385
{'S': 10, 'M': 50, 'N': 50, 'L': 3} Average F1-score: 0.568068170232
{'S': 10, 'M': 50, 'N': 50, 'L': 5} Average F1-score: 0.564303469716
{'S': 5, 'M': 75, 'N': 50, 'L': 2} Average F1-score: 0.57405434535
{'S': 5, 'M': 75, 'N': 50, 'L': 3} Average F1-score: 0.576474505679
{'S': 5, 'M': 75, 'N': 50, 'L': 5} Average F1-score: 0.568675990853
{'S': 10, 'M': 75, 'N': 50, 'L': 2} Average F1-score: 0.575214236822
{'S': 10, 'M': 75, 'N': 50, 'L': 3} Average F1-score: 0.573102874115
{'S': 10, 'M': 75, 'N': 50, 'L': 5} Average F1-score: 0.568675990853
{'S': 5, 'M': 25, 'N': 250, 'L': 2} Average F1-score: 0.56914526206
{'S': 5, 'M': 25, 'N': 250, 'L': 3} Average F1-score: 0.567298603455
{'S': 5, 'M': 25, 'N': 250, 'L': 5} Average F1-score: 0.568047062693
{'S': 10, 'M': 25, 'N': 250, 'L': 2} Average F1-score: 0.565791091833
{'S': 10, 'M': 25, 'N': 250, 'L': 3} Average F1-score: 0.567127820202
{'S': 10, 'M': 25, 'N': 250, 'L': 5} Average F1-score: 0.568047062693
{'S': 5, 'M': 50, 'N': 250, 'L': 2} Average F1-score: 0.575469439239
{'S': 5, 'M': 50, 'N': 250, 'L': 3} Average F1-score: 0.573914450231
{'S': 5, 'M': 50, 'N': 250, 'L': 5} Average F1-score: 0.571761595073
{'S': 10, 'M': 50, 'N': 250, 'L': 2} Average F1-score: 0.570531237319
{'S': 10, 'M': 50, 'N': 250, 'L': 3} Average F1-score: 0.575244766814
{'S': 10, 'M': 50, 'N': 250, 'L': 5} Average F1-score: 0.571761595073
{'S': 5, 'M': 75, 'N': 250, 'L': 2} Average F1-score: 0.583859308012
{'S': 5, 'M': 75, 'N': 250, 'L': 3} Average F1-score: 0.579123509095
{'S': 5, 'M': 75, 'N': 250, 'L': 5} Average F1-score: 0.578517468739
{'S': 10, 'M': 75, 'N': 250, 'L': 2} Average F1-score: 0.581960313612
{'S': 10, 'M': 75, 'N': 250, 'L': 3} Average F1-score: 0.5754079813
{'S': 10, 'M': 75, 'N': 250, 'L': 5} Average F1-score: 0.578517468739
{'S': 5, 'M': 25, 'N': 500, 'L': 2} Average F1-score: 0.570675337259
{'S': 5, 'M': 25, 'N': 500, 'L': 3} Average F1-score: 0.566302874757
{'S': 5, 'M': 25, 'N': 500, 'L': 5} Average F1-score: 0.570030400449
{'S': 10, 'M': 25, 'N': 500, 'L': 2} Average F1-score: 0.571601517518
{'S': 10, 'M': 25, 'N': 500, 'L': 3} Average F1-score: 0.572730316004
{'S': 10, 'M': 25, 'N': 500, 'L': 5} Average F1-score: 0.570030400449
{'S': 5, 'M': 50, 'N': 500, 'L': 2} Average F1-score: 0.573685848763
{'S': 5, 'M': 50, 'N': 500, 'L': 3} Average F1-score: 0.575460529152
{'S': 5, 'M': 50, 'N': 500, 'L': 5} Average F1-score: 0.568780037329
{'S': 10, 'M': 50, 'N': 500, 'L': 2} Average F1-score: 0.573161714672
{'S': 10, 'M': 50, 'N': 500, 'L': 3} Average F1-score: 0.576025990934
{'S': 10, 'M': 50, 'N': 500, 'L': 5} Average F1-score: 0.568780037329
{'S': 5, 'M': 75, 'N': 500, 'L': 2} Average F1-score: 0.581869811496
{'S': 5, 'M': 75, 'N': 500, 'L': 3} Average F1-score: 0.578926260779
{'S': 5, 'M': 75, 'N': 500, 'L': 5} Average F1-score: 0.576863648329
{'S': 10, 'M': 75, 'N': 500, 'L': 2} Average F1-score: 0.583031794359
{'S': 10, 'M': 75, 'N': 500, 'L': 3} Average F1-score: 0.576126986391
{'S': 10, 'M': 75, 'N': 500, 'L': 5} Average F1-score: 0.576863648329

Through tuning we observe the optimal hyperparameters to be 250 estimators, a maximum of 75 features considered at each split, a minimum of 5 samples required to split a node, and a minimum of 2 samples per leaf. These values yielded an average cross-validated F1-score of 0.584.
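If the full grid were rerun, a small sketch like this (not part of the original run) would collect the scores and pick the winning combination programmatically rather than by eye:

# sketch: keep every (params, score) pair and select the best
results = [(params, cvTrain(X_aug, y, well, params)) for params in param_grid]
best_params, best_f1 = max(results, key=lambda r: r[1])
print(best_params, best_f1)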

Prediction

Before applying our algorithm to the test data, I must apply the same feature engineering to it: calculating the squared deviations from the training facies averages and applying Paolo Bestagini's routines.


In [11]:
from sklearn import preprocessing

filename = '../validation_data_nofacies.csv'
test = pd.read_csv(filename)

# encode well name and formation features
le = preprocessing.LabelEncoder()
test["Well Name"] = le.fit_transform(test["Well Name"])
test["Formation"] = le.fit_transform(test["Formation"])
test_loaded = test.copy()

facies_labels = ['SS','CSiS','FSiS','SiSh','MS','WS','D','PS','BS']

train = data_imputed.copy()

features = ["GR", "ILD_log10", "DeltaPHI", "PHIND", "PE"]
for f in features:
    # use the training-set facies means to build the same squared-deviation features for the test data
    facies_mean = train[f].groupby(train["Facies"]).mean()
    
    for i in range(0, len(facies_mean)):
        test[f + "_" + facies_labels[i] + "_SqDev"] = (test[f] - facies_mean.values[i])**2

test_fe = test.copy()

del test

test_fe


Out[11]:
Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS ... PHIND_BS_SqDev PE_SS_SqDev PE_CSiS_SqDev PE_FSiS_SqDev PE_SiSh_SqDev PE_MS_SqDev PE_WS_SqDev PE_D_SqDev PE_PS_SqDev PE_BS_SqDev
0 1 1 2808.0 66.276 0.630 3.300 10.650 3.591 1 1.000 ... 4.636852 0.447354 0.120054 1.901497e-01 0.028164 0.143868 0.345724 0.008784 0.747722 2.717116
1 1 1 2808.5 77.252 0.585 6.500 11.950 3.341 1 0.978 ... 0.728181 0.175431 0.009310 3.461890e-02 0.174574 0.396017 0.702216 0.118147 1.242577 3.603800
2 1 1 2809.0 82.899 0.566 9.400 13.600 3.064 1 0.956 ... 0.634675 0.020120 0.032584 8.269804e-03 0.482776 0.821378 1.243187 0.385300 1.936855 4.732225
3 1 1 2809.5 80.671 0.593 9.500 13.250 2.977 1 0.933 ... 0.199510 0.003008 0.071562 3.166210e-02 0.611244 0.986643 1.444763 0.500876 2.186581 5.118308
4 1 1 2810.0 75.971 0.638 8.700 12.350 3.020 1 0.911 ... 0.205513 0.009574 0.050405 1.820839e-02 0.545856 0.903069 1.343242 0.441860 2.061261 4.925593
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
825 13 0 3158.5 86.078 0.554 5.040 16.150 3.161 1 0.639 ... 11.200166 0.057047 0.006974 3.674225e-05 0.357390 0.654965 1.036289 0.274289 1.676272 4.319612
826 13 0 3159.0 88.855 0.539 5.560 16.750 3.118 1 0.611 ... 15.576164 0.038355 0.016005 1.364450e-03 0.410651 0.726414 1.125685 0.321178 1.789466 4.500201
827 13 0 3159.5 90.490 0.530 6.360 16.780 3.168 1 0.583 ... 15.813863 0.060440 0.005854 1.706038e-04 0.349069 0.643684 1.022087 0.267005 1.658195 4.290564
828 13 0 3160.0 90.975 0.522 7.035 16.995 3.154 1 0.556 ... 17.570054 0.053752 0.008192 8.807101e-07 0.365808 0.666344 1.050590 0.281670 1.694447 4.348759
829 13 0 3160.5 90.108 0.513 7.505 17.595 3.125 1 0.528 ... 22.960052 0.041146 0.014283 8.963115e-04 0.401729 0.714531 1.110880 0.313293 1.770787 4.470551

830 rows × 55 columns

Now I will apply Paolo Bestagini's routines.


In [12]:
test = test_fe.copy()

remFeatures = ["Well Name", "Depth"]
x = list(test)
features = [f for f in x if f not in remFeatures]

Xtest = test[features].values

# Store well labels and depths
welltest = test['Well Name']
depthtest = test['Depth'].values

Xtest_aug, test_padded_rows = augment_features(Xtest, welltest.values, depthtest)

In [13]:
from sklearn.ensemble import RandomForestClassifier

test = test_loaded.copy()

rf = RandomForestClassifier(max_features=75, n_estimators=250, criterion='entropy', 
                                min_samples_split=5, min_samples_leaf=2, random_state=1)
fit = rf.fit(X_aug, y)
predictedFacies = fit.predict(Xtest_aug)

test["Facies"] = predictedFacies
test.to_csv('jpoirier011_submission001.csv')
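As an optional last check (a sketch, not part of the submission), the predicted facies codes can be mapped back to their labels to eyeball the class distribution:

# optional check (sketch): predicted class distribution by facies label
label_map = dict(zip(range(1, 10), facies_labels))
print(test["Facies"].map(label_map).value_counts())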