In [1]:
import numpy as np
np.random.seed(0)
import warnings
warnings.filterwarnings("ignore")
import time as tm
import pandas as pd
import xgboost as xgb
from tpot import TPOTRegressor, TPOTClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, accuracy_score, confusion_matrix
from sklearn.model_selection import LeaveOneGroupOut
from sklearn import preprocessing
from sklearn.multiclass import OneVsOneClassifier
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy.signal import medfilt
%matplotlib inline
In [2]:
pe_fv = pd.read_csv('../facies_vectors.csv')
pe_nf = pd.read_csv('../nofacies_data.csv')
In [3]:
Xfv = pe_fv[pe_fv["PE"].notnull()].drop(['Formation', 'Well Name', 'Depth', 'Facies', 'PE'], axis=1).values
Xnf = pe_nf[pe_nf["PE"].notnull()].drop(['Formation', 'Well Name', 'Depth', 'PE'], axis=1).values
Xpe = np.concatenate((Xfv, Xnf))
Yfv = pe_fv[pe_fv["PE"].notnull()]["PE"].values
Ynf = pe_nf[pe_nf["PE"].notnull()]["PE"].values
Ype = np.concatenate((Yfv, Ynf))
In [4]:
Xpetr, Xpete, Ypetr, Ypete = train_test_split(Xpe, Ype, train_size=0.7, test_size=0.3, random_state=7)
In [5]:
from sklearn.decomposition import FastICA
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
pe_imputer = make_pipeline(
    make_union(
        FastICA(tol=0.96),
        FunctionTransformer(lambda X: X)  # identity branch keeps the raw logs alongside the ICA components
    ),
    ExtraTreesRegressor(max_features=0.64, n_estimators=500, n_jobs=4, random_state=7)
)
pe_imputer.fit(Xpe, Ype)
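The 70/30 split from cell 4 is never actually scored, so there is no direct read on how well the PE imputer generalizes. A minimal sanity check (not part of the original run) is to refit an unfitted clone of the same pipeline on the training split and report the held-out R²; `r2_score` is just one reasonable choice of metric here.

In [ ]:
# Hedged sanity check: clone() returns a fresh, unfitted copy of the pipeline,
# so the held-out 30% stays unseen during fitting.
from sklearn.base import clone
from sklearn.metrics import r2_score
pe_imputer_cv = clone(pe_imputer)
pe_imputer_cv.fit(Xpetr, Ypetr)
print("Held-out PE R^2: %.3f" % r2_score(Ypete, pe_imputer_cv.predict(Xpete)))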
In [6]:
training_data = pd.read_csv("../facies_vectors.csv")
In [7]:
XimpPE = training_data[training_data["PE"].isnull()].drop(['Formation', 'Well Name', 'Depth', 'Facies', 'PE'], axis=1).values
In [8]:
training_data["PE"][training_data["PE"].isnull()] = pe_imputer.predict(XimpPE)
In [9]:
def accuracy(conf):
    # Overall accuracy from a confusion matrix: trace / total count
    total_correct = 0.
    nb_classes = conf.shape[0]
    for i in np.arange(0, nb_classes):
        total_correct += conf[i][i]
    acc = total_correct / sum(sum(conf))
    return acc

# Neighbouring facies for each of the 9 classes (0-based); dtype=object is
# needed because the sub-lists have different lengths
adjacent_facies = np.array([[1], [0, 2], [1], [4], [3, 5], [4, 6, 7], [5, 7], [5, 6, 8], [6, 7]], dtype=object)

def accuracy_adjacent(conf, adjacent_facies):
    # Accuracy that also credits predictions landing in an adjacent facies
    nb_classes = conf.shape[0]
    total_correct = 0.
    for i in np.arange(0, nb_classes):
        total_correct += conf[i][i]
        for j in adjacent_facies[i]:
            total_correct += conf[i][j]
    return total_correct / sum(sum(conf))

def mad_based_outlier(points, thresh=4.5):
    # Modified z-score based on the median absolute deviation (MAD);
    # returns the scores and a boolean outlier mask
    median = np.median(points, axis=0)
    diff = np.abs(points - median)
    med_abs_deviation = np.median(diff)
    modified_z_score = 0.6745 * diff / med_abs_deviation
    return np.abs(modified_z_score), np.abs(modified_z_score) > thresh
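A toy confusion matrix (values made up purely for illustration) shows how `accuracy_adjacent` credits near-misses, so it is always at least as large as plain `accuracy`.

In [ ]:
# 8 SS samples classified correctly, 2 classified as the adjacent CSiS facies
toy_conf = np.zeros((9, 9), dtype=int)
toy_conf[0, 0] = 8
toy_conf[0, 1] = 2
print(accuracy(toy_conf))                            # 0.8
print(accuracy_adjacent(toy_conf, adjacent_facies))  # 1.0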
In [10]:
# 1=sandstone 2=c_siltstone 3=f_siltstone
# 4=marine_silt_shale 5=mudstone 6=wackestone 7=dolomite
# 8=packstone 9=bafflestone
facies_colors = ['#F4D03F', '#F5B041','#DC7633','#6E2C00',
'#1B4F72','#2E86C1', '#AED6F1', '#A569BD', '#196F3D']
facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS',
'WS', 'D','PS', 'BS']
# facies_color_map is a dictionary that maps facies labels to their respective colors
facies_color_map = {}
for ind, label in enumerate(facies_labels):
    facies_color_map[label] = facies_colors[ind]

def label_facies(row, labels):
    return labels[row['Facies'] - 1]

training_data.loc[:, 'FaciesLabels'] = training_data.apply(lambda row: label_facies(row, facies_labels), axis=1)
In [11]:
def make_facies_log_plot(logs, facies_colors):
    # make sure logs are sorted by depth
    logs = logs.sort_values(by='Depth')
    cmap_facies = colors.ListedColormap(
        facies_colors[0:len(facies_colors)], 'indexed')

    ztop = logs.Depth.min(); zbot = logs.Depth.max()

    cluster = np.repeat(np.expand_dims(logs['Facies'].values, 1), 100, 1)

    f, ax = plt.subplots(nrows=1, ncols=6, figsize=(8, 12))
    ax[0].plot(logs.GR, logs.Depth, '.g')
    ax[1].plot(logs.ILD_log10, logs.Depth, '.')
    ax[2].plot(logs.DeltaPHI, logs.Depth, '.', color='0.5')
    ax[3].plot(logs.PHIND, logs.Depth, '.', color='r')
    ax[4].plot(logs.PE, logs.Depth, '.', color='black')
    im = ax[5].imshow(cluster, interpolation='none', aspect='auto',
                      cmap=cmap_facies, vmin=1, vmax=9)

    divider = make_axes_locatable(ax[5])
    cax = divider.append_axes("right", size="20%", pad=0.05)
    cbar = plt.colorbar(im, cax=cax)
    cbar.set_label((17*' ').join([' SS ', 'CSiS', 'FSiS',
                                  'SiSh', ' MS ', ' WS ', ' D ',
                                  ' PS ', ' BS ']))
    cbar.set_ticks(range(0, 1)); cbar.set_ticklabels('')

    for i in range(len(ax)-1):
        ax[i].set_ylim(ztop, zbot)
        ax[i].invert_yaxis()
        ax[i].grid()
        ax[i].locator_params(axis='x', nbins=3)

    ax[0].set_xlabel("GR")
    ax[0].set_xlim(logs.GR.min(), logs.GR.max())
    ax[1].set_xlabel("ILD_log10")
    ax[1].set_xlim(logs.ILD_log10.min(), logs.ILD_log10.max())
    ax[2].set_xlabel("DeltaPHI")
    ax[2].set_xlim(logs.DeltaPHI.min(), logs.DeltaPHI.max())
    ax[3].set_xlabel("PHIND")
    ax[3].set_xlim(logs.PHIND.min(), logs.PHIND.max())
    ax[4].set_xlabel("PE")
    ax[4].set_xlim(logs.PE.min(), logs.PE.max())
    ax[5].set_xlabel('Facies')

    ax[1].set_yticklabels([]); ax[2].set_yticklabels([]); ax[3].set_yticklabels([])
    ax[4].set_yticklabels([]); ax[5].set_yticklabels([])
    ax[5].set_xticklabels([])
    f.suptitle('Well: %s' % logs.iloc[0]['Well Name'], fontsize=14, y=0.94)
In [12]:
X = training_data.drop(['Formation', 'Well Name', 'Depth', 'Facies', 'FaciesLabels'], axis=1).values
y = training_data['Facies'].values - 1
wells = training_data["Well Name"].values
In [13]:
# Feature windows concatenation function
def augment_features_window(X, N_neig):

    # Parameters
    N_row = X.shape[0]
    N_feat = X.shape[1]

    # Zero padding
    X = np.vstack((np.zeros((N_neig, N_feat)), X, np.zeros((N_neig, N_feat))))

    # Loop over windows
    X_aug = np.zeros((N_row, N_feat*(2*N_neig+1)))
    for r in np.arange(N_row)+N_neig:
        this_row = []
        for c in np.arange(-N_neig, N_neig+1):
            this_row = np.hstack((this_row, X[r+c]))
        X_aug[r-N_neig] = this_row

    return X_aug
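A quick shape check on hypothetical data: with 2 features and `N_neig=1`, each output row stacks the previous, current and next samples, giving 2*(2*1+1) = 6 columns, with zero padding at the boundaries.

In [ ]:
X_demo = np.arange(8.0).reshape(4, 2)   # 4 samples, 2 features
print(augment_features_window(X_demo, N_neig=1).shape)  # (4, 6)
print(augment_features_window(X_demo, N_neig=1)[0])     # [0. 0. 0. 1. 2. 3.]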
In [14]:
# Feature gradient computation function
def augment_features_gradient(X, depth):

    # Compute features gradient
    d_diff = np.diff(depth).reshape((-1, 1))
    d_diff[d_diff == 0] = 0.001
    X_diff = np.diff(X, axis=0)
    X_grad = X_diff / d_diff

    # Compensate for last missing value
    X_grad = np.concatenate((X_grad, np.zeros((1, X_grad.shape[1]))))

    return X_grad
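A similar toy check for the gradient (again, hypothetical values): with a constant depth step of 0.5, each row is the feature difference divided by 0.5, and a zero row is appended for the final sample.

In [ ]:
X_demo = np.array([[1.], [2.], [4.]])
depth_demo = np.array([0.0, 0.5, 1.0])
print(augment_features_gradient(X_demo, depth_demo))  # [[2.] [4.] [0.]]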
In [15]:
# Feature augmentation function
def augment_features(X, well, depth, N_neig=1):

    # Augment features well by well: windowed logs plus depth gradients
    X_aug = np.zeros((X.shape[0], X.shape[1]*(N_neig*2+2)))
    for w in np.unique(well):
        w_idx = np.where(well == w)[0]
        X_aug_win = augment_features_window(X[w_idx, :], N_neig)
        X_aug_grad = augment_features_gradient(X[w_idx, :], depth[w_idx])
        X_aug[w_idx, :] = np.concatenate((X_aug_win, X_aug_grad), axis=1)

    # Find padded rows: the first sample of each well, whose leading window
    # columns are the zero padding
    padded_rows = np.unique(np.where(X_aug[:, 0:7] == np.zeros((1, 7)))[0])

    return X_aug, padded_rows
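Putting the two together on hypothetical data: with the 7 raw logs and `N_neig=1`, each sample ends up with 7*(2*1+1) = 21 windowed columns plus 7 gradient columns, i.e. 28 features, and the first sample of each well is flagged as padded.

In [ ]:
X_demo = np.random.rand(6, 7)
well_demo = np.array(['A', 'A', 'A', 'B', 'B', 'B'])
depth_demo = np.array([1., 2., 3., 1., 2., 3.])
X_aug_demo, pad_demo = augment_features(X_demo, well_demo, depth_demo, N_neig=1)
print(X_aug_demo.shape, pad_demo)  # (6, 28) [0 3]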
In [16]:
well = training_data['Well Name'].values
depth = training_data['Depth'].values
In [17]:
X, padded_rows = augment_features(X, well, depth, N_neig=1)
In [18]:
scaler = preprocessing.RobustScaler().fit(X)
X = scaler.transform(X)
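Since RobustScaler centers each feature on its median, a cheap post-scaling check is that the per-feature medians of X are now numerically zero.

In [ ]:
print(np.abs(np.median(X, axis=0)).max())  # expect ~0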
In [19]:
logo = LeaveOneGroupOut()
t0 = tm.time()
In [20]:
f1s_ls = []
acc_ls = []
adj_ls = []
for train, test in logo.split(X, y, groups=wells):
    well_name = wells[test[0]]
    X_tr = X[train]
    X_te = X[test]
    Y_tr = y[train]

    # XGBoost wrapped in a one-vs-one scheme
    exported_pipeline = OneVsOneClassifier(xgb.XGBClassifier(n_estimators=200, max_depth=3,
                                                             gamma=0.5, reg_alpha=0.5,
                                                             learning_rate=0.05,
                                                             min_child_weight=1,
                                                             subsample=0.9,
                                                             colsample_bytree=0.9,
                                                             seed=7, nthread=4),
                                           n_jobs=4)
    exported_pipeline.fit(X_tr, Y_tr)

    # Predict and smooth the facies sequence along depth with a median filter
    y_hat = exported_pipeline.predict(X_te)
    y_hat = medfilt(y_hat, kernel_size=5).astype(int)

    try:
        f1s = f1_score(y[test], y_hat, average="weighted", labels=[0, 1, 2, 3, 4, 5, 6, 7, 8])
    except Exception:
        f1s = 0
    try:
        conf = confusion_matrix(y[test], y_hat, labels=[0, 1, 2, 3, 4, 5, 6, 7, 8])
        # micro-averaged F1 equals plain accuracy for single-label multiclass
        acc = f1_score(y[test], y_hat, average="micro", labels=[0, 1, 2, 3, 4, 5, 6, 7, 8])
    except Exception:
        acc = 0
    try:
        acc_adj = accuracy_adjacent(conf, adjacent_facies)
    except Exception:
        acc_adj = 0

    f1s_ls += [f1s]
    acc_ls += [acc]
    adj_ls += [acc_adj]
    print("{:>20s} f1w:{:.3f} | f1m:{:.3f} | acc_adj:{:.3f}".format(well_name, f1s, acc, acc_adj))

t1 = tm.time()
print("Avg F1w", np.average(f1s_ls)*100, "Avg F1m", np.average(acc_ls)*100, "Avg Adj", np.average(adj_ls)*100)
print((t1-t0), "seconds")
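For easier comparison across wells, the per-fold scores can be tabulated; this assumes the lists populated by the loop above, and relies on `logo.split` yielding folds in the same deterministic order.

In [ ]:
cv_wells = [wells[test[0]] for _, test in logo.split(X, y, groups=wells)]
pd.DataFrame({"Well": cv_wells, "F1w": f1s_ls, "F1m": acc_ls, "AdjAcc": adj_ls})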
In [21]:
blind_data = pd.read_csv('../nofacies_data.csv')
X_blind = blind_data.drop(['Formation', 'Well Name', 'Depth'], axis=1).values
well_blind = blind_data['Well Name'].values
depth_blind = blind_data['Depth'].values

# Drop the zero-padded boundary rows from the training set before the final fit;
# the blind wells keep theirs so that every depth receives a prediction
X = np.delete(X, padded_rows, axis=0)
y = np.delete(y, padded_rows, axis=0)
X_blind, padded_rows = augment_features(X_blind, well_blind, depth_blind, N_neig=1)
In [22]:
# Scale the blind data with the scaler already fitted on the training set
X_train = X
X_blind = scaler.transform(X_blind)
in_dim = len(X_train[0])
In [23]:
model = OneVsOneClassifier(xgb.XGBClassifier(n_estimators=200, max_depth=3,
                                             gamma=0.5, reg_alpha=0.5,
                                             learning_rate=0.05,
                                             min_child_weight=1,
                                             subsample=0.9,
                                             colsample_bytree=0.9,
                                             seed=7, nthread=4),
                           n_jobs=4)
model.fit(X_train, y)

# Predict, median-filter the facies sequence, and map back to the original 1-9 codes
y_blind = model.predict(X_blind)
y_blind = medfilt(y_blind, kernel_size=5).astype(int)
blind_data["Facies"] = y_blind + 1
In [24]:
blind_data.to_csv("PA_Team_Submission_8_XGB.csv")
In [25]:
make_facies_log_plot(
blind_data[blind_data['Well Name'] == 'STUART'],
facies_colors)
In [26]:
make_facies_log_plot(
blind_data[blind_data['Well Name'] == 'CRAWFORD'],
facies_colors)
In [ ]: