In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pandas import set_option
set_option("display.max_rows", 10)
In [2]:
from sklearn import preprocessing
filename = '../facies_vectors.csv'
train = pd.read_csv(filename)
# encode well name and formation features
le = preprocessing.LabelEncoder()
train["Well Name"] = le.fit_transform(train["Well Name"])
train["Formation"] = le.fit_transform(train["Formation"])
data_loaded = train.copy()
# cleanup memory
del train
data_loaded
Out[2]:
First, I will impute PE by replacing missing values with the mean PE. Second, I will impute PE using a random forest regressor. I will compare the two methods by their average RMSEs, computed by applying each method across all wells that have PE data (leaving each well out in turn as a test set).
To evaluate, I will build a model for each well (with that well's data held out as the test set), then compute the RMSE against the known PE values to get an idea of how good each method is.
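For each held-out well with $n$ observations, the error reported is the root-mean-square error $\mathrm{RMSE} = \sqrt{\tfrac{1}{n}\sum_{i=1}^{n}(\mathrm{PE}_i - \widehat{\mathrm{PE}}_i)^2}$, and the average of these per-well RMSEs is what I use to compare the two imputation methods.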
In [3]:
from sklearn import preprocessing
data = data_loaded.copy()
impPE_features = ['Facies', 'Formation', 'Well Name', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'NM_M', 'RELPOS']
rmse = []
for w in data["Well Name"].unique():
    wTrain = data[(data["PE"].notnull()) & (data["Well Name"] != w)]
    wTest = data[(data["PE"].notnull()) & (data["Well Name"] == w)]
    if wTest.shape[0] > 0:
        yTest = wTest["PE"].values
        meanPE = wTrain["PE"].mean()
        wTest["predictedPE"] = meanPE
        rmse.append((((yTest - wTest["predictedPE"])**2).mean())**0.5)
print(rmse)
print("Average RMSE:" + str(sum(rmse)/len(rmse)))
# cleanup memory
del data
In [4]:
from sklearn.ensemble import RandomForestRegressor
data = data_loaded.copy()
impPE_features = ['Facies', 'Formation', 'Well Name', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'NM_M', 'RELPOS']
rf = RandomForestRegressor(max_features='sqrt', n_estimators=100, random_state=1)
rmse = []
for w in data["Well Name"].unique():
    wTrain = data[(data["PE"].notnull()) & (data["Well Name"] != w)]
    wTest = data[(data["PE"].notnull()) & (data["Well Name"] == w)]
    if wTest.shape[0] > 0:
        XTrain = wTrain[impPE_features].values
        yTrain = wTrain["PE"].values
        XTest = wTest[impPE_features].values
        yTest = wTest["PE"].values
        w_rf = rf.fit(XTrain, yTrain)
        predictedPE = w_rf.predict(XTest)
        rmse.append((((yTest - predictedPE)**2).mean())**0.5)
print(rmse)
print("Average RMSE:" + str(sum(rmse)/len(rmse)))
# cleanup memory
del data
This approach gives us an expected RMSE of about 0.575. Now let's impute the missing data using it!
In [5]:
data = data_loaded.copy()
rf_train = data[data['PE'].notnull()]
rf_test = data[data['PE'].isnull()]
xTrain = rf_train[impPE_features].values
yTrain = rf_train["PE"].values
xTest = rf_test[impPE_features].values
rf_fit = rf.fit(xTrain, yTrain)
predictedPE = rf_fit.predict(xTest)
data["PE"][data["PE"].isnull()] = predictedPE
data_imputed = data.copy()
# cleanup memory
del data
# output
data_imputed
Out[5]:
Now we have a full data set with no missing values!
I'm now going to calculate the average value of each log feature on a per-facies basis. For instance, I will calculate the distance of an observation's GR reading from the mean GR of the MS facies; the idea is that true MS observations will lie close to that average. I square the deviations from these averages so each new feature acts more like a distance-from-class proxy.
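Concretely, for each log feature $f$ and facies class $c$ the new column is the squared deviation from the class mean, $\mathrm{SqDev}_{f,c} = (x_f - \bar{x}_{f,c})^2$, where $\bar{x}_{f,c}$ is the mean of $f$ over the training observations of facies $c$.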
In [6]:
facies_labels = ['SS','CSiS','FSiS','SiSh','MS','WS','D','PS','BS']
data = data_imputed.copy()
features = ["GR", "ILD_log10", "DeltaPHI", "PHIND", "PE"]
for f in features:
    facies_mean = data[f].groupby(data["Facies"]).mean()
    for i in range(0, len(facies_mean)):
        data[f + "_" + facies_labels[i] + "_SqDev"] = (data[f] - facies_mean.values[i])**2
data_fe = data.copy()
del data
data_fe
Out[6]:
I proceed to run Paolo Bestagini's routines to include a small window of values to account for the spatial component in the log analysis, as well as gradient information with respect to depth.
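For sample $i$ the gradient feature is $g_i = (x_{i+1} - x_i)/(d_{i+1} - d_i)$, with zero depth differences replaced by 0.001 and a row of zeros appended so the last sample in each well still gets a value.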
In [7]:
# Feature windows concatenation function
def augment_features_window(X, N_neig):
    # Parameters
    N_row = X.shape[0]
    N_feat = X.shape[1]
    # Zero padding
    X = np.vstack((np.zeros((N_neig, N_feat)), X, np.zeros((N_neig, N_feat))))
    # Loop over windows
    X_aug = np.zeros((N_row, N_feat*(2*N_neig+1)))
    for r in np.arange(N_row)+N_neig:
        this_row = []
        for c in np.arange(-N_neig, N_neig+1):
            this_row = np.hstack((this_row, X[r+c]))
        X_aug[r-N_neig] = this_row
    return X_aug

# Feature gradient computation function
def augment_features_gradient(X, depth):
    # Compute features gradient
    d_diff = np.diff(depth).reshape((-1, 1))
    d_diff[d_diff == 0] = 0.001
    X_diff = np.diff(X, axis=0)
    X_grad = X_diff / d_diff
    # Compensate for last missing value
    X_grad = np.concatenate((X_grad, np.zeros((1, X_grad.shape[1]))))
    return X_grad

# Feature augmentation function
def augment_features(X, well, depth, N_neig=1):
    # Augment features well by well: windowed values plus depth gradients
    X_aug = np.zeros((X.shape[0], X.shape[1]*(N_neig*2+2)))
    for w in np.unique(well):
        w_idx = np.where(well == w)[0]
        X_aug_win = augment_features_window(X[w_idx, :], N_neig)
        X_aug_grad = augment_features_gradient(X[w_idx, :], depth[w_idx])
        X_aug[w_idx, :] = np.concatenate((X_aug_win, X_aug_grad), axis=1)
    # Find padded rows
    padded_rows = np.unique(np.where(X_aug[:, 0:7] == np.zeros((1, 7)))[0])
    return X_aug, padded_rows
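As a quick illustration (a minimal sketch, not part of the original workflow; the array values are made up), augment_features can be exercised on a tiny synthetic input. With N_neig=1, each original feature yields three windowed copies plus one depth gradient, i.e. 2 * (2*1 + 2) = 8 augmented columns here.
# Minimal sketch: exercise augment_features on synthetic data (values are made up)
demo_X = np.arange(1, 11, dtype=float).reshape(5, 2)       # 5 samples, 2 features
demo_well = np.array([0, 0, 0, 1, 1])                      # two synthetic wells
demo_depth = np.array([100.0, 100.5, 101.0, 200.0, 200.5])
demo_aug, demo_padded = augment_features(demo_X, demo_well, demo_depth)
print(demo_aug.shape)   # (5, 8): 2 features * (2*N_neig + 2)
print(demo_padded)      # rows whose augmented values touch the zero padding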
Now I'll apply Paolo's routines to the data, augmenting the features!
In [8]:
data = data_fe.copy()
remFeatures = ["Facies", "Well Name", "Depth"]
x = list(data)
features = [f for f in x if f not in remFeatures]
X = data[features].values
y = data["Facies"].values
# Store well labels and depths
well = data['Well Name']
depth = data['Depth'].values
X_aug, padded_rows = augment_features(X, well.values, depth)
In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
#from classification_utilities import display_cm, display_adj_cm
# 1) loops through wells - splitting data (current well held out as CV/test)
# 2) trains model (using all wells excluding current)
# 3) evaluates predictions against known values and adds f1-score to array
# 4) returns average f1-score (expected f1-score)
def cvTrain(X, y, well, params):
    rf = RandomForestClassifier(max_features=params['M'], n_estimators=params['N'], criterion='entropy',
                                min_samples_split=params['S'], min_samples_leaf=params['L'], random_state=1)
    f1 = []
    for w in well.unique():
        Xtrain_w = X[well.values != w]
        ytrain_w = y[well.values != w]
        Xtest_w = X[well.values == w]
        ytest_w = y[well.values == w]
        w_rf = rf.fit(Xtrain_w, ytrain_w)
        predictedFacies = w_rf.predict(Xtest_w)
        f1.append(f1_score(ytest_w, predictedFacies, average='micro'))
    f1 = sum(f1)/len(f1)
    return f1
Apply tuning to search for optimal hyperparameters.
In [10]:
# parameters search grid (uncomment for full grid search - will take a long time)
N_grid = [250] #[50, 250, 500] # n_estimators
M_grid = [75] #[25, 50, 75] # max_features
S_grid = [5] #[5, 10] # min_samples_split
L_grid = [2] #[2, 3, 5] # min_samples_leaf
# build grid of hyperparameters
param_grid = []
for N in N_grid:
    for M in M_grid:
        for S in S_grid:
            for L in L_grid:
                param_grid.append({'N':N, 'M':M, 'S':S, 'L':L})
# loop through parameters and cross-validate models for each
for params in param_grid:
    print(str(params) + ' Average F1-score: ' + str(cvTrain(X_aug, y, well, params)))
Through tuning we observe optimal hyperparameters to be 250 (number of estimators), 2 (minimum number of samples per leaf), 75 (maximum number of features to consider when looking for the optimal split), and 5 (minimum number of samples required to split a node). These values yielded an average F1-score of 0.584 through cross-validation.
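For reference, the same leave-one-well-out search could be expressed with scikit-learn's built-in utilities; this is a sketch rather than the notebook's original approach, and the grid below simply echoes the values used above.
# Sketch: leave-one-well-out grid search via GridSearchCV + LeaveOneGroupOut
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut
search = GridSearchCV(
    RandomForestClassifier(criterion='entropy', random_state=1),
    param_grid={'n_estimators': [250], 'max_features': [75],
                'min_samples_split': [5], 'min_samples_leaf': [2]},
    scoring='f1_micro',
    cv=LeaveOneGroupOut())
search.fit(X_aug, y, groups=well.values)
print(search.best_params_, search.best_score_)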
Before applying our algorithm to the test data, I must apply the same feature engineering to it. This involves calculating the data deviations from the facies averages and applying Paolo Bestagini's routines.
In [11]:
from sklearn import preprocessing
filename = '../validation_data_nofacies.csv'
test = pd.read_csv(filename)
# encode well name and formation features
le = preprocessing.LabelEncoder()
test["Well Name"] = le.fit_transform(test["Well Name"])
test["Formation"] = le.fit_transform(test["Formation"])
test_loaded = test.copy()
facies_labels = ['SS','CSiS','FSiS','SiSh','MS','WS','D','PS','BS']
train = data_imputed.copy()
features = ["GR", "ILD_log10", "DeltaPHI", "PHIND", "PE"]
for f in features:
    facies_mean = train[f].groupby(train["Facies"]).mean()
    for i in range(0, len(facies_mean)):
        test[f + "_" + facies_labels[i] + "_SqDev"] = (test[f] - facies_mean.values[i])**2
test_fe = test.copy()
del test
test_fe
Out[11]:
Now I will apply Paolo Bestagini's routines.
In [12]:
test = test_fe.copy()
remFeatures = ["Well Name", "Depth"]
x = list(test)
features = [f for f in x if f not in remFeatures]
Xtest = test[features].values
# Store well labels and depths
welltest = test['Well Name']
depthtest = test['Depth'].values
Xtest_aug, test_padded_rows = augment_features(Xtest, welltest.values, depthtest)
In [13]:
from sklearn.ensemble import RandomForestClassifier
test = test_loaded.copy()
rf = RandomForestClassifier(max_features=75, n_estimators=250, criterion='entropy',
                            min_samples_split=5, min_samples_leaf=2, random_state=1)
fit = rf.fit(X_aug, y)
predictedFacies = fit.predict(Xtest_aug)
test["Facies"] = predictedFacies
test.to_csv('jpoirier011_submission001.csv')
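As a final sanity check (not part of the submission itself), the distribution of predicted facies can be inspected; this assumes the facies codes 1-9 correspond, in order, to the facies_labels list defined earlier.
# Count predictions per facies class (assumes codes 1-9 map onto facies_labels in order)
print(pd.Series(predictedFacies).map(dict(zip(range(1, 10), facies_labels))).value_counts())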