In [1]:
import warnings
warnings.filterwarnings("ignore")

import time as tm

import numpy as np
import pandas as pd

np.random.seed(0)

from tpot import TPOTRegressor, TPOTClassifier
from sklearn.model_selection import train_test_split, LeaveOneGroupOut
from sklearn.metrics import f1_score, recall_score, accuracy_score, confusion_matrix
from sklearn import preprocessing

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable

from scipy.signal import medfilt

%matplotlib inline

Load dataset


In [2]:
pe_fv = pd.read_csv('../facies_vectors.csv')   # labeled training wells (includes Facies)
pe_nf = pd.read_csv('../nofacies_data.csv')    # blind wells (no Facies column)

In [3]:
pe_fv.columns


Out[3]:
Index(['Facies', 'Formation', 'Well Name', 'Depth', 'GR', 'ILD_log10',
       'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS'],
      dtype='object')

In [4]:
# Build a PE regression set from every row (labeled or blind) where PE was
# measured; the remaining logs serve as predictors.
Xfv = pe_fv[pe_fv["PE"].notnull()].drop(['Formation', 'Well Name', 'Depth', 'Facies', 'PE'], axis=1).values
Xnf = pe_nf[pe_nf["PE"].notnull()].drop(['Formation', 'Well Name', 'Depth', 'PE'], axis=1).values
Xpe = np.concatenate((Xfv, Xnf))

Yfv = pe_fv[pe_fv["PE"].notnull()]["PE"].values
Ynf = pe_nf[pe_nf["PE"].notnull()]["PE"].values
Ype = np.concatenate((Yfv, Ynf))

In [5]:
Xpetr, Xpete, Ypetr, Ypete = train_test_split(Xpe, Ype, train_size=0.7, test_size=0.3, random_state=0)

In [6]:
# TPOT search used to find the PE imputation pipeline (commented out; the
# exported result is hard-coded in the next cell).
# peReg = TPOTRegressor(generations=10, population_size=5, max_eval_time_mins=0.5, max_time_mins=1, verbosity=3)
# peReg = TPOTRegressor(generations=50, population_size=10, max_time_mins=60, verbosity=3)
# peReg.fit(Xpetr, Ypetr)

# print(peReg.score(Xpete, Ypete))
# peReg.export('pe_imputer_pipeline0.py')

In [7]:
from sklearn.decomposition import FastICA
from sklearn.ensemble import ExtraTreesRegressor, VotingClassifier
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# An earlier, simpler TPOT export, superseded by the pipeline below:
# pe_imputer = make_pipeline(ExtraTreesRegressor(max_features=0.74, n_estimators=500))

pe_imputer = make_pipeline(
    FastICA(tol=2.0),
    make_union(VotingClassifier([("est", ElasticNet(alpha=0.02, l1_ratio=0.96))]), FunctionTransformer(lambda X: X)),
    ExtraTreesRegressor(max_features=0.44, n_estimators=500)
)

pe_imputer.fit(Xpe, Ype)
# results = exported_pipeline.predict(testing_features)


Out[7]:
Pipeline(steps=[('fastica', FastICA(algorithm='parallel', fun='logcosh', fun_args=None, max_iter=200,
    n_components=None, random_state=None, tol=2.0, w_init=None,
    whiten=True)), ('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('votingclassifier', VotingClassifier(estimators=[('est', E...timators=500, n_jobs=1,
          oob_score=False, random_state=None, verbose=0, warm_start=False))])
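The make_union(VotingClassifier(...), FunctionTransformer(lambda X: X)) construct is TPOT's exported stacking idiom: the FunctionTransformer branch passes the input features through unchanged, while the VotingClassifier branch contributes its estimator's predictions as an extra column, so the downstream ExtraTreesRegressor sees the original features plus a first-stage prediction. A minimal sketch of the idiom on toy classification data (the estimator and data here are illustrative, not from this notebook):

import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_union
from sklearn.preprocessing import FunctionTransformer

X_toy = np.random.rand(100, 3)
y_toy = (X_toy[:, 0] > 0.5).astype(int)

# Prediction branch first, identity passthrough second.
union = make_union(
    VotingClassifier([("est", LogisticRegression())]),
    FunctionTransformer(lambda X: X),
)
union.fit(X_toy, y_toy)
print(union.transform(X_toy).shape)  # (100, 4): 1 prediction column + the 3 original features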

In [8]:
training_data = pd.read_csv("../facies_vectors.csv")

In [9]:
XimpPE = training_data[training_data["PE"].isnull()].drop(['Formation', 'Well Name', 'Depth', 'Facies', 'PE'], axis=1).values

In [10]:
training_data["PE"][training_data["PE"].isnull()] = pe_imputer.predict(XimpPE)

In [11]:
training_data["PE"][training_data["PE"].isnull()].head()


Out[11]:
Series([], Name: PE, dtype: float64)

Utility functions


In [12]:
def accuracy(conf):
    # Fraction of predictions on the confusion-matrix diagonal.
    total_correct = 0.
    nb_classes = conf.shape[0]
    for i in np.arange(0, nb_classes):
        total_correct += conf[i][i]
    acc = total_correct / sum(sum(conf))
    return acc

# For each facies (0-8), the indices of facies considered geologically adjacent.
adjacent_facies = np.array([[1], [0, 2], [1], [4], [3, 5], [4, 6, 7], [5, 7], [5, 6, 8], [6, 7]])


def accuracy_adjacent(conf, adjacent_facies):
    # Like accuracy(), but confusions with an adjacent facies also count as correct.
    nb_classes = conf.shape[0]
    total_correct = 0.
    for i in np.arange(0, nb_classes):
        total_correct += conf[i][i]
        for j in adjacent_facies[i]:
            total_correct += conf[i][j]
    return total_correct / sum(sum(conf))

def mad_based_outlier(points, thresh=4.5):
    # Modified z-score based on the median absolute deviation (MAD); the
    # 0.6745 factor makes the MAD consistent with the standard deviation
    # under normality. Returns the scores and a boolean outlier mask.
    median = np.median(points, axis=0)
    diff = np.sqrt((points - median)**2)  # absolute deviation from the median
    med_abs_deviation = np.median(diff)

    modified_z_score = 0.6745 * diff / med_abs_deviation

    return abs(modified_z_score), abs(modified_z_score) > thresh
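A quick sanity check of these helpers on toy inputs (illustrative values, not part of the original run): with a 3-class confusion matrix where classes 0 and 1 are treated as adjacent, the adjacent variant credits their mutual confusions, and the MAD score flags the single extreme sample.

toy_conf = np.array([[5, 3, 0],
                     [1, 6, 0],
                     [0, 0, 5]])
toy_adjacent = [[1], [0], []]
print(accuracy(toy_conf))                         # 16/20 = 0.80
print(accuracy_adjacent(toy_conf, toy_adjacent))  # (16 + 3 + 1)/20 = 1.00

scores, flags = mad_based_outlier(np.array([1.0, 1.1, 0.9, 1.0, 12.0]), thresh=3.5)
print(flags)  # [False False False False  True]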

In [13]:
# 1=sandstone  2=c_siltstone   3=f_siltstone
# 4=marine_silt_shale 5=mudstone 6=wackestone 7=dolomite
# 8=packstone 9=bafflestone
facies_colors = ['#F4D03F', '#F5B041', '#DC7633', '#6E2C00',
                 '#1B4F72', '#2E86C1', '#AED6F1', '#A569BD', '#196F3D']

facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS',
                 'WS', 'D', 'PS', 'BS']
#facies_color_map is a dictionary that maps facies labels
#to their respective colors
facies_color_map = {}
for ind, label in enumerate(facies_labels):
    facies_color_map[label] = facies_colors[ind]

def label_facies(row, labels):
    return labels[row['Facies'] - 1]

training_data.loc[:,'FaciesLabels'] = training_data.apply(lambda row: label_facies(row, facies_labels), axis=1)

In [14]:
def make_facies_log_plot(logs, facies_colors):
    #make sure logs are sorted by depth
    logs = logs.sort_values(by='Depth')
    cmap_facies = colors.ListedColormap(
            facies_colors[0:len(facies_colors)], 'indexed')
    
    ztop=logs.Depth.min(); zbot=logs.Depth.max()
    
    cluster=np.repeat(np.expand_dims(logs['Facies'].values,1), 100, 1)
    
    f, ax = plt.subplots(nrows=1, ncols=6, figsize=(8, 12))
    ax[0].plot(logs.GR, logs.Depth, '.g')
    ax[1].plot(logs.ILD_log10, logs.Depth, '.')
    ax[2].plot(logs.DeltaPHI, logs.Depth, '.', color='0.5')
    ax[3].plot(logs.PHIND, logs.Depth, '.', color='r')
    ax[4].plot(logs.PE, logs.Depth, '.', color='black')
    im=ax[5].imshow(cluster, interpolation='none', aspect='auto',
                    cmap=cmap_facies,vmin=1,vmax=9)
    
    divider = make_axes_locatable(ax[5])
    cax = divider.append_axes("right", size="20%", pad=0.05)
    cbar=plt.colorbar(im, cax=cax)
    cbar.set_label((17*' ').join([' SS ', 'CSiS', 'FSiS', 
                                'SiSh', ' MS ', ' WS ', ' D  ', 
                                ' PS ', ' BS ']))
    cbar.set_ticks(range(0,1)); cbar.set_ticklabels('')
    
    for i in range(len(ax)-1):
        ax[i].set_ylim(ztop,zbot)
        ax[i].invert_yaxis()
        ax[i].grid()
        ax[i].locator_params(axis='x', nbins=3)
    
    ax[0].set_xlabel("GR")
    ax[0].set_xlim(logs.GR.min(),logs.GR.max())
    ax[1].set_xlabel("ILD_log10")
    ax[1].set_xlim(logs.ILD_log10.min(),logs.ILD_log10.max())
    ax[2].set_xlabel("DeltaPHI")
    ax[2].set_xlim(logs.DeltaPHI.min(),logs.DeltaPHI.max())
    ax[3].set_xlabel("PHIND")
    ax[3].set_xlim(logs.PHIND.min(),logs.PHIND.max())
    ax[4].set_xlabel("PE")
    ax[4].set_xlim(logs.PE.min(),logs.PE.max())
    ax[5].set_xlabel('Facies')
    
    ax[1].set_yticklabels([]); ax[2].set_yticklabels([]); ax[3].set_yticklabels([])
    ax[4].set_yticklabels([]); ax[5].set_yticklabels([])
    ax[5].set_xticklabels([])
    f.suptitle('Well: %s'%logs.iloc[0]['Well Name'], fontsize=14,y=0.94)

Remove outliers


In [15]:
# Comment out this block to skip outlier removal.
# Rows whose modified z-score exceeds 3.5 for any of these logs are dropped,
# one log at a time, so each pass works on the already-reduced frame.
for log in ['GR', 'ILD_log10', 'DeltaPHI']:
    scores, is_outlier = mad_based_outlier(training_data[log].values, 3.5)
    training_data.drop(training_data.index[np.where(is_outlier)[0]], inplace=True)

Extract data


In [16]:
X = training_data.drop(['Formation', 'Well Name', 'Depth', 'Facies', 'FaciesLabels'], axis=1).values
y = training_data['Facies'].values - 1  # shift labels to 0-8; shifted back (+1) for the submission
wells = training_data["Well Name"].values

Feature Augmentation method from Bestagini


In [17]:
# Feature windows concatenation function
def augment_features_window(X, N_neig):
    
    # Parameters
    N_row = X.shape[0]
    N_feat = X.shape[1]

    # Zero padding
    X = np.vstack((np.zeros((N_neig, N_feat)), X, (np.zeros((N_neig, N_feat)))))

    # Loop over windows
    X_aug = np.zeros((N_row, N_feat*(2*N_neig+1)))
    for r in np.arange(N_row)+N_neig:
        this_row = []
        for c in np.arange(-N_neig,N_neig+1):
            this_row = np.hstack((this_row, X[r+c]))
        X_aug[r-N_neig] = this_row

    return X_aug
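A shape-only sketch of the windowing (illustrative): with N_neig=1 each output row stacks the previous, current, and next samples, so the feature count triples, and the first and last rows pick up zero padding.

X_toy = np.arange(8).reshape(4, 2)               # 4 depth samples, 2 features
print(augment_features_window(X_toy, 1).shape)   # (4, 6)
print(augment_features_window(X_toy, 1)[0])      # [0. 0. 0. 1. 2. 3.]: zero pad, sample 0, sample 1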

In [18]:
# Feature gradient computation function
def augment_features_gradient(X, depth):
    
    # Compute features gradient
    d_diff = np.diff(depth).reshape((-1, 1))
    d_diff[d_diff==0] = 0.001
    X_diff = np.diff(X, axis=0)
    X_grad = X_diff / d_diff
        
    # Compensate for last missing value
    X_grad = np.concatenate((X_grad, np.zeros((1, X_grad.shape[1]))))
    
    return X_grad
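A toy check (illustrative): the gradient has one row per input sample, with the last row zero-filled so lengths stay aligned.

X_toy = np.array([[1.0], [2.0], [4.0]])
d_toy = np.array([100.0, 100.5, 101.0])
print(augment_features_gradient(X_toy, d_toy).ravel())  # [2. 4. 0.]: (2-1)/0.5, (4-2)/0.5, zero pad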

In [19]:
# Feature augmentation function
def augment_features(X, well, depth, N_neig=1):
    
    # Augment features
    X_aug = np.zeros((X.shape[0], X.shape[1]*(N_neig*2+2)))
    for w in np.unique(well):
        w_idx = np.where(well == w)[0]
        X_aug_win = augment_features_window(X[w_idx, :], N_neig)
        X_aug_grad = augment_features_gradient(X[w_idx, :], depth[w_idx])
        X_aug[w_idx, :] = np.concatenate((X_aug_win, X_aug_grad), axis=1)
    
    # Find padded rows
    padded_rows = np.unique(np.where(X_aug[:, 0:7] == np.zeros((1, 7)))[0])
    
    return X_aug, padded_rows
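With this notebook's 7 input logs and N_neig=1, the augmented matrix is 7*(2*1+2) = 28 columns: 21 from the three-sample window plus 7 gradients. Note that the padded-row detection hard-codes the 0:7 slice, i.e. it assumes exactly 7 raw features. A shape check on toy data (illustrative names):

X_toy = np.random.rand(10, 7)
well_toy = np.array(['A'] * 10)
depth_toy = np.arange(10, dtype=float)
X_aug_toy, pad_toy = augment_features(X_toy, well_toy, depth_toy)
print(X_aug_toy.shape)  # (10, 28)
print(pad_toy)          # [0]: the first sample of the well has an all-zero leading window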

In [20]:
well = training_data['Well Name'].values
depth = training_data['Depth'].values

In [21]:
X, padded_rows = augment_features(X, well, depth, N_neig=1)

In [22]:
# RobustScaler centers each feature on its median and scales by the IQR,
# so extreme log readings have limited influence; the same fitted scaler
# is reused on the blind wells below.
scaler = preprocessing.RobustScaler().fit(X)
X = scaler.transform(X)

In [23]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=0)

In [24]:
# tpot = TPOTClassifier(scoring='f1_micro', random_state=0, max_eval_time_mins=1, max_time_mins=5, verbosity=1, num_cv_folds=2)
# tpot.fit(Xtrain, Ytrain)
# print(tpot.score(Xtest, Ytest))
# tpot.export('clf_pipeline0.py')

In [25]:
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier, RandomForestClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

Validation with Leave One Well Out on Training Dataset


In [26]:
logo = LeaveOneGroupOut()
t0 = tm.time()

In [27]:
f1s_ls = []
acc_ls = []
adj_ls = []

for train, test in logo.split(X, y, groups=wells):
    well_name = wells[test[0]]
    X_tr = X[train]
    X_te = X[test]
    Y_tr = y[train]

    exported_pipeline = make_pipeline(
        make_union(
            VotingClassifier([("est", RandomForestClassifier(n_estimators=250, n_jobs=4, random_state=42,
                                                             min_samples_split=10, max_depth=None,
                                                             criterion='entropy', class_weight='balanced',
                                                             min_samples_leaf=5, max_features=15))]),
            FunctionTransformer(lambda X: X)
        ),
        ExtraTreesClassifier(criterion="entropy", max_features=1.0, n_estimators=500)
    )

    exported_pipeline.fit(X_tr, Y_tr)

    y_hat = exported_pipeline.predict(X_te)
    # y_hat = medfilt(y_hat, kernel_size=5)

    try:
        f1s = f1_score(y[test], y_hat, average="weighted", labels=[0, 1, 2, 3, 4, 5, 6, 7, 8])
    except Exception:
        f1s = 0

    try:
        conf = confusion_matrix(y[test], y_hat, labels=[0, 1, 2, 3, 4, 5, 6, 7, 8])
        # Micro-averaged F1 over all classes equals plain accuracy.
        acc = f1_score(y[test], y_hat, average="micro", labels=[0, 1, 2, 3, 4, 5, 6, 7, 8])
    except Exception:
        acc = 0

    try:
        acc_adj = accuracy_adjacent(conf, adjacent_facies)
    except Exception:
        acc_adj = 0

    f1s_ls += [f1s]
    acc_ls += [acc]
    adj_ls += [acc_adj]
    print("{:>20s} f1w:{:.3f} | f1m:{:.3f} | acc_adj:{:.3f}".format(well_name, f1s, acc, acc_adj))

t1 = tm.time()
print("Avg F1w", np.average(f1s_ls)*100, "Avg F1m", np.average(acc_ls)*100, "Avg Adj", np.average(adj_ls)*100)
print((t1-t0), "seconds")


         ALEXANDER D f1w:0.497 | f1m:0.524 | acc_adj:0.877
     CHURCHMAN BIBLE f1w:0.596 | f1m:0.601 | acc_adj:0.889
      CROSS H CATTLE f1w:0.447 | f1m:0.424 | acc_adj:0.883
            KIMZEY A f1w:0.506 | f1m:0.544 | acc_adj:0.885
            LUKE G U f1w:0.667 | f1m:0.631 | acc_adj:0.926
               NEWBY f1w:0.559 | f1m:0.581 | acc_adj:0.909
               NOLAN f1w:0.564 | f1m:0.532 | acc_adj:0.873
          Recruit F9 f1w:0.884 | f1m:0.792 | acc_adj:0.987
             SHANKLE f1w:0.582 | f1m:0.579 | acc_adj:0.976
           SHRIMPLIN f1w:0.494 | f1m:0.511 | acc_adj:0.917
Avg F1w 57.9715388705 Avg F1m 57.1871387926 Avg Adj 91.2131704144
76.77267646789551 seconds

Applying to Test Dataset


In [28]:
blind_data = pd.read_csv('../nofacies_data.csv')

X_blind = blind_data.drop(['Formation', 'Well Name', 'Depth'], axis=1).values
well_blind = blind_data['Well Name'].values
depth_blind = blind_data['Depth'].values

# Remove padded rows (all-zero window rows) from the training data before the final fit
X = np.delete(X, padded_rows, axis=0)
y = np.delete(y, padded_rows, axis=0)

X_blind, padded_rows = augment_features(X_blind, well_blind, depth_blind, N_neig=1)

In [29]:
# X is already scaled; apply the same fitted scaler to the blind wells
X_train = X
X_blind = scaler.transform(X_blind)

In [30]:
# Method initialization
exported_pipeline = make_pipeline(
    make_union(
        VotingClassifier([("est", RandomForestClassifier(n_estimators=250, n_jobs=4, random_state=42,
                                                         min_samples_split=10, max_depth=None,
                                                         criterion='entropy', class_weight='balanced',
                                                         min_samples_leaf=5, max_features=15))]),
        FunctionTransformer(lambda X: X)
    ),
    ExtraTreesClassifier(criterion="entropy", max_features=1.0, n_estimators=500)
)


exported_pipeline.fit(X_train, y) 

# Predict
y_blind = exported_pipeline.predict(X_blind)
y_blind = medfilt(y_blind, kernel_size=5)

blind_data["Facies"] = y_blind + 1  # return the original value (1-9)

In [31]:
blind_data.to_csv("PA_Team_Submission_7_RF_01.csv")

In [32]:
make_facies_log_plot(
    blind_data[blind_data['Well Name'] == 'STUART'],
    facies_colors)



In [33]:
make_facies_log_plot(
    blind_data[blind_data['Well Name'] == 'CRAWFORD'],
    facies_colors)


