The code and ideas in this notebook, by Matteo Niccoli and Mark Dahl, are licensed under a Creative Commons Attribution 4.0 International License.
In [57]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy as sp
from scipy.signal import medfilt
from sklearn import preprocessing
from sklearn.metrics import f1_score
from sklearn.model_selection import LeaveOneGroupOut
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
mpl.rcParams['figure.figsize'] = (20.0, 10.0)
inline_rc = dict(mpl.rcParams)
from mpl_toolkits.axes_grid1 import make_axes_locatable
In [58]:
filename = 'train_SFS_top45_engfeat.csv'
train_data = pd.read_csv(filename)
train_data.describe()
Out[58]:
In [59]:
filename = 'test_SFS_top45_engfeat.csv'
test_data = pd.read_csv(filename)
test_data.describe()
Out[59]:
In [60]:
train_data['Well Name'] = train_data['Well Name'].astype('category')
train_data['Formation'] = train_data['Formation'].astype('category')
train_data['Well Name'].unique()
Out[60]:
In [61]:
test_data['Well Name'] = test_data['Well Name'].astype('category')
test_data['Formation'] = test_data['Formation'].astype('category')
test_data['Well Name'].unique()
Out[61]:
Next we extract the feature variables needed for the classification: the five wireline log measurements, the two geologic constraining variables, depth, and the engineered features selected earlier by sequential feature selection (the top-45 set loaded above). We also extract the vector of facies labels that corresponds to each feature vector.
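For reference, the facies labels are the integers 1-9. A minimal sketch of the label-to-name mapping, in the same order as the facies_names list used for plotting at the end of this notebook:
facies_labels = {1: 'SS', 2: 'CSiS', 3: 'FSiS', 4: 'SiSh', 5: 'MS',
                 6: 'WS', 7: 'D', 8: 'PS', 9: 'BS'}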
In [62]:
y_train = train_data['Facies'].values
print(y_train[25:40])
print(np.shape(y_train))
In [63]:
X_train = train_data.drop(['Formation', 'Well Name','Facies'], axis=1)
print(np.shape(X_train))
X_train.describe(percentiles=[.05, .25, .50, .75, .95])
Out[63]:
In [64]:
# extract feature vectors from the blind test data
X_test = test_data.drop(['Formation', 'Well Name'], axis=1)
print(np.shape(X_test))
X_test.describe(percentiles=[.05, .25, .50, .75, .95])
Out[64]:
In [65]:
stdscaler = preprocessing.StandardScaler().fit(X_train)
X_train = stdscaler.transform(X_train)
X_test = stdscaler.transform(X_test)
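As a quick sanity check (added here, not part of the original run), the standardized training features should now have approximately zero mean and unit variance in every column:
print(X_train.mean(axis=0).round(2))  # ~0.0 for each feature
print(X_train.std(axis=0).round(2))   # ~1.0 for each feature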
In [66]:
# Average test F1 score with leave one well out
well = train_data["Well Name"].values
logo = LeaveOneGroupOut()
def fit_and_test(X, y, param, cal, val):
    # cal/val: integer indices of the calibration samples and of the
    # validation (left-out well) samples for this LOGO split
    # Classifier
    model = XGBClassifier(objective='multi:softmax',
                          colsample_bylevel=param['CBL'],
                          colsample_bytree=param['CBT'],
                          gamma=param['GAM'],
                          learning_rate=param['LRN'],
                          n_estimators=param['EST'],
                          max_depth=param['DEP'],
                          min_child_weight=param['MCW'],
                          subsample=1)
    # rescale validation data with a scaler fit on the calibration data
    robscaler1 = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(X[cal])
    X_cal = robscaler1.transform(X[cal])
    X_val = robscaler1.transform(X[val])
    y_cal = y[cal]
    y_val = y[val]
    # set early stopping rounds
    esr = 10
    # fit model, monitoring multiclass log-loss on the left-out well
    truth = [(X_val, y_val)]
    model.fit(X_cal, y_cal,
              early_stopping_rounds=esr,
              eval_metric='mlogloss',
              eval_set=truth,
              verbose=False)
    # predict the left-out well
    pred = model.predict(X_val)
    # Clean isolated facies for each well
    pred_mf = medfilt(pred, kernel_size=5).astype(int)
    # score (facies labels are 1-9)
    return f1_score(y_val, pred_mf, labels=np.arange(1, 10), average='micro')
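A toy illustration (added, not part of the original run) of what the median-filter cleanup does: a single-sample facies spike inside an otherwise homogeneous interval is removed by medfilt with kernel_size=5.
demo = np.array([3, 3, 3, 7, 3, 3, 3])
print(medfilt(demo, kernel_size=5))  # [ 3.  3.  3.  3.  3.  3.  3.] - the isolated spike is gone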
In [67]:
# grid search
colsample_bylevel_grid = [0.85]
colsample_bytree_grid = [0.85]
gamma_grid = [0]
learning_rate_grid = [0.008, 0.013, 0.021]
n_estimators_grid = [55, 89, 144, 233]
max_depth_grid = [5]
min_child_weight_grid = [1]
param_grid = []
for level in colsample_bylevel_grid:
    for tree in colsample_bytree_grid:
        for gamma in gamma_grid:
            for learn in learning_rate_grid:
                for n in n_estimators_grid:
                    for depth in max_depth_grid:
                        for child in min_child_weight_grid:
                            param_grid.append({'CBL': level,
                                               'CBT': tree,
                                               'GAM': gamma,
                                               'LRN': learn,
                                               'EST': n,
                                               'DEP': depth,
                                               'MCW': child})
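The same grid can be built more compactly with itertools.product; a sketch equivalent to the nested loops above (the loops are kept for readability):
from itertools import product
keys = ['CBL', 'CBT', 'GAM', 'LRN', 'EST', 'DEP', 'MCW']
param_grid_alt = [dict(zip(keys, combo))
                  for combo in product(colsample_bylevel_grid, colsample_bytree_grid,
                                       gamma_grid, learning_rate_grid, n_estimators_grid,
                                       max_depth_grid, min_child_weight_grid)]
assert param_grid_alt == param_grid  # identical grid, same order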
In [16]:
# For each set of parameters
score_param = []
for param in param_grid:
    # split data into calibration and validation wells with LOGO
    split_score = []
    for cal, val in logo.split(X_train, y_train, groups=well):
        well_name = well[val[0]]
        split = fit_and_test(X_train, y_train, param, cal, val)
        print("{:>20s} {:.3f}".format(well_name, split))
        split_score.append(split)
    # Average score for this param
    score_param.append(np.mean(split_score))
    print('F1 score = %.3f %s' % (score_param[-1], param))
    print(" ------------------------------------------- ")
# Best set of parameters
best_idx = np.argmax(score_param)
param_best = param_grid[best_idx]
score_best = score_param[best_idx]
print('\nBest F1 score = %.3f %s' % (score_best, param_best))
We stopped the grid search early: high colsample_bylevel and colsample_bytree values do not seem to help. Best result so far:
Best F1 score = 0.593 {'EST': 144, 'DEP': 13, 'CBL': 0.75, 'LRN': 0.008, 'GAM': 0.1, 'CBT': 0.65, 'MCW': 3}
In [68]:
# define and test final model
#F1 score = 0.593 {'EST': 130, 'DEP': 12, 'CBL': 0.85, 'LRN': 0.01, 'GAM': 0, 'CBT': 0.65, 'MCW': 3}
colsample_bylevel_grid = [0.85]
colsample_bytree_grid = [0.65]
gamma_grid = [0]
learning_rate_grid = [0.01]
n_estimators_grid = [130]
max_depth_grid = [12]
min_child_weight_grid = [3]
param_grid = []
for level in colsample_bylevel_grid:
    for tree in colsample_bytree_grid:
        for gamma in gamma_grid:
            for learn in learning_rate_grid:
                for n in n_estimators_grid:
                    for depth in max_depth_grid:
                        for child in min_child_weight_grid:
                            param_grid.append({'CBL': level,
                                               'CBT': tree,
                                               'GAM': gamma,
                                               'LRN': learn,
                                               'EST': n,
                                               'DEP': depth,
                                               'MCW': child})
In [69]:
# For each set of parameters
score_param = []
for param in param_grid:
    # split data into calibration and validation wells with LOGO
    split_score = []
    for cal, val in logo.split(X_train, y_train, groups=well):
        well_name = well[val[0]]
        split = fit_and_test(X_train, y_train, param, cal, val)
        print("{:>20s} {:.3f}".format(well_name, split))
        split_score.append(split)
    # Average score for this param
    score_param.append(np.mean(split_score))
    print('F1 score = %.3f %s' % (score_param[-1], param))
    print(" ------------------------------------------- ")
# Best set of parameters
best_idx = np.argmax(score_param)
param_best = param_grid[best_idx]
score_best = score_param[best_idx]
print('\nBest F1 score = %.3f %s' % (score_best, param_best))
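Since the final grid holds a single parameter set, this run simply re-evaluates the chosen model well by well. The spread of the per-well scores can be summarized with a one-liner (an added convenience, assuming the loop above has just run):
print('mean F1 = %.3f, std = %.3f' % (np.mean(split_score), np.std(split_score)))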
In [70]:
model = XGBClassifier(objective='multi:softmax',
                      colsample_bylevel=param_best['CBL'],
                      colsample_bytree=param_best['CBT'],
                      gamma=param_best['GAM'],
                      learning_rate=param_best['LRN'],
                      n_estimators=param_best['EST'],
                      max_depth=param_best['DEP'],
                      min_child_weight=param_best['MCW'],
                      subsample=1)
In [71]:
# Parameters
facies_names = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS', 'WS', 'D', 'PS', 'BS']
facies_colors = ['#F4D03F', '#F5B041','#DC7633','#6E2C00',
'#1B4F72','#2E86C1', '#AED6F1', '#A569BD', '#196F3D']
depth = train_data['Depth'].values
def make_facies_log_plot(logs, facies_colors):
    # make sure logs are sorted by depth
    logs = logs.sort_values(by='Depth')
    cmap_facies = colors.ListedColormap(
        facies_colors[0:len(facies_colors)], 'indexed')
    ztop = logs.Depth.min(); zbot = logs.Depth.max()
    cluster = np.repeat(np.expand_dims(logs['Facies'].values, 1), 100, 1)
    f, ax = plt.subplots(nrows=1, ncols=6, figsize=(8, 12))
    ax[0].plot(logs.GR, logs.Depth, '-g')
    ax[1].plot(logs.ILD_log10, logs.Depth, '-')
    ax[2].plot(logs.DeltaPHI, logs.Depth, '-', color='0.5')
    ax[3].plot(logs.PHIND, logs.Depth, '-', color='r')
    ax[4].plot(logs.PE, logs.Depth, '-', color='black')
    im = ax[5].imshow(cluster, interpolation='none', aspect='auto',
                      cmap=cmap_facies, vmin=1, vmax=9)
    divider = make_axes_locatable(ax[5])
    cax = divider.append_axes("right", size="20%", pad=0.05)
    cbar = plt.colorbar(im, cax=cax)
    cbar.set_label((17*' ').join([' SS ', 'CSiS', 'FSiS', 'SiSh',
                                  ' MS ', ' WS ', ' D ', ' PS ', ' BS ']))
    cbar.set_ticks(range(0, 1)); cbar.set_ticklabels('')
    for i in range(len(ax)-1):
        ax[i].set_ylim(ztop, zbot)
        ax[i].invert_yaxis()
        ax[i].grid()
        ax[i].locator_params(axis='x', nbins=3)
    ax[0].set_xlabel("GR")
    ax[0].set_xlim(logs.GR.min(), logs.GR.max())
    ax[1].set_xlabel("ILD_log10")
    ax[1].set_xlim(logs.ILD_log10.min(), logs.ILD_log10.max())
    ax[2].set_xlabel("DeltaPHI")
    ax[2].set_xlim(logs.DeltaPHI.min(), logs.DeltaPHI.max())
    ax[3].set_xlabel("PHIND")
    ax[3].set_xlim(logs.PHIND.min(), logs.PHIND.max())
    ax[4].set_xlabel("PE")
    ax[4].set_xlim(logs.PE.min(), logs.PE.max())
    ax[5].set_xlabel('Facies')
    ax[1].set_yticklabels([]); ax[2].set_yticklabels([]); ax[3].set_yticklabels([])
    ax[4].set_yticklabels([]); ax[5].set_yticklabels([])
    ax[5].set_xticklabels([])
    f.suptitle('Well: %s' % logs.iloc[0]['Well Name'], fontsize=14, y=0.94)
In [73]:
# scale blind data to full training data set
robscaler2 = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(X_train)
X_train = robscaler2.transform(X_train)
X_test = robscaler2.transform(X_test)
y_train = train_data['Facies'].values
# Predict facies
y_pred = model.fit(X_train, y_train).predict(X_test)
# Clean isolated facies separately for each test well, so the median
# filter does not smooth across the boundary between the two wells
for w in test_data['Well Name'].unique():
    mask = (test_data['Well Name'] == w).values
    y_pred[mask] = medfilt(y_pred[mask], kernel_size=5)
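A quick look at the predicted facies distribution over the blind wells (an added check, not part of the original run):
labels, counts = np.unique(y_pred, return_counts=True)
print(dict(zip(labels.astype(int), counts.tolist())))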
In [74]:
blind = pd.read_csv('nofacies_data.csv')
blind['Facies'] = y_pred
blind.describe()
#blind.dtypes
Out[74]:
In [75]:
blind['Formation'] = blind['Formation'].astype('category')
blind['Well Name'] = blind['Well Name'].astype('category')
blind['Depth'] = blind['Depth'].astype('float64')
blind['GR'] = blind['GR'].astype('float64')
blind['ILD_log10'] = blind['ILD_log10'].astype('float64')
blind['DeltaPHI'] = blind['DeltaPHI'].astype('float64')
blind['PHIND'] = blind['PHIND'].astype('float64')
blind['PE'] = blind['PE'].astype('float64')
blind['NM_M'] = blind['NM_M'].astype('int64')
blind['Facies'] = blind['Facies'].astype('int64')
blind.describe()
Out[75]:
In [76]:
# Save predicted labels
np.save('04_ypred_XGB_SFS_VC_001.npy', y_pred)
blind.to_csv('04_ypred_XGB_SFS_VC_001.csv')
# Plot predicted labels
make_facies_log_plot(
blind[blind['Well Name'] == 'STUART'],
facies_colors=facies_colors)
make_facies_log_plot(
blind[blind['Well Name'] == 'CRAWFORD'],
facies_colors=facies_colors)
mpl.rcParams.update(inline_rc)
In [ ]: