Based on the ideas of Paolo Bestagini's "Try 2", Alan Richardson's "Try 2", and Dalide's "Try 6", augmented by Dimitrios Oikonomou and Eirik Larsen (ESA AS).
In the following, we provide a possible solution to the facies classification problem described at https://github.com/seg/2016-ml-contest.
The proposed algorithm combines random forest, XGBoost, or gradient boosting classifiers in a one-vs-one multiclass strategy. In particular, we study the effect of augmenting the features (depth windows and spatial gradients) and of the choice of classifier.
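As a minimal, self-contained sketch of the one-vs-one multiclass strategy (toy random data and hypothetical variable names only; the actual feature engineering and classifiers are defined below):
In [ ]:
# Minimal sketch: wrap a binary-capable classifier in a one-vs-one multiclass scheme.
# X_demo / y_demo are illustrative placeholders, not the contest logs.
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsOneClassifier
X_demo = np.random.rand(200, 7)          # 200 samples, 7 hypothetical log features
y_demo = np.random.randint(1, 10, 200)   # facies labels 1-9
ovo = OneVsOneClassifier(GradientBoostingClassifier(n_estimators=20, max_depth=3))
ovo.fit(X_demo, y_demo)
print(ovo.predict(X_demo[:5]))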
In [1]:
# Import
from __future__ import division, print_function  # print_function needed for print(..., end='') used below
get_ipython().magic(u'matplotlib inline')
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams['figure.figsize'] = (20.0, 10.0)
inline_rc = dict(mpl.rcParams)
from classification_utilities import make_facies_log_plot
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import LeavePGroupsOut
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsOneClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from scipy.signal import medfilt
In [2]:
import sys, scipy, sklearn
print('Python: ' + sys.version.split('\n')[0])
print('         ' + sys.version.split('\n')[-1])
print('Pandas: ' + pd.__version__)
print('Numpy: ' + np.__version__)
print('Scipy: ' + scipy.__version__)
print('Sklearn: ' + sklearn.__version__)
print('Xgboost: ' + xgb.__version__)
In [4]:
feature_names = ['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
facies_names = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS', 'WS', 'D', 'PS', 'BS']
facies_colors = ['#F4D03F', '#F5B041','#DC7633','#6E2C00', '#1B4F72','#2E86C1', '#AED6F1', '#A569BD', '#196F3D']
#Select classifier type
#clfType='GB' #Gradient Boosting Classifier
clfType='XBA' #XGB Classifier
# Define window length
N_neig=2
#Seed
seed = 24
np.random.seed(seed)
In [5]:
# Load data from file
data = pd.read_csv('../facies_vectors.csv')
# Load Test data from file
test_data = pd.read_csv('../validation_data_nofacies.csv')
test_data.insert(0,'Facies',np.ones(test_data.shape[0])*(-1))
#Create dataset for PE prediction from both datasets
all_data=pd.concat([data,test_data])
In [6]:
# Store features and labels
X = data[feature_names].values # features
y = data['Facies'].values # labels
# Store well labels and depths
well = data['Well Name'].values
depth = data['Depth'].values
Let us inspect the features we are working with. This step is useful to understand how to normalize them and how to devise a correct cross-validation strategy. Specifically, the plots below show which facies occur in each well and which logs are available; note that the PE log is missing in some wells and is imputed further down with a random forest regressor.
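For example, a quick check of which wells lack the PE log can be done directly on the DataFrame (a small illustrative snippet; it only assumes the data frame loaded above):
In [ ]:
# Count missing PE samples per well (in this dataset, missing values occur in the PE log).
print(data.groupby('Well Name')['PE'].apply(lambda s: s.isnull().sum()))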
In [7]:
# Define function for plotting feature statistics
def plot_feature_stats(X, y, feature_names, facies_colors, facies_names):
# Remove NaN
nan_idx = np.any(np.isnan(X), axis=1)
X = X[np.logical_not(nan_idx), :]
y = y[np.logical_not(nan_idx)]
# Merge features and labels into a single DataFrame
features = pd.DataFrame(X, columns=feature_names)
labels = pd.DataFrame(y, columns=['Facies'])
for f_idx, facies in enumerate(facies_names):
labels[labels[:] == f_idx + 1] = facies  # facies labels in the data are 1-based
data = pd.concat((labels, features), axis=1)
# Plot features statistics
facies_color_map = {}
for ind, label in enumerate(facies_names):
facies_color_map[label] = facies_colors[ind]
sns.pairplot(data, hue='Facies', palette=facies_color_map, hue_order=list(reversed(facies_names)))
In [8]:
# Facies per well
for w_idx, w in enumerate(np.unique(well)):
ax = plt.subplot(3, 4, w_idx+1)
hist = np.histogram(y[well == w], bins=np.arange(len(facies_names)+1)+.5)
plt.bar(np.arange(len(hist[0])), hist[0], color=facies_colors, align='center')
ax.set_xticks(np.arange(len(hist[0])))
ax.set_xticklabels(facies_names)
ax.set_title(w)
# Features per well
for w_idx, w in enumerate(np.unique(well)):
ax = plt.subplot(3, 4, w_idx+1)
hist = np.logical_not(np.any(np.isnan(X[well == w, :]), axis=0))
plt.bar(np.arange(len(hist)), hist, color=facies_colors, align='center')
ax.set_xticks(np.arange(len(hist)))
ax.set_xticklabels(feature_names)
ax.set_yticks([0, 1])
ax.set_yticklabels(['miss', 'hit'])
ax.set_title(w)
In [9]:
def make_pe(X, seed):
reg = RandomForestRegressor(max_features='sqrt', n_estimators=50, random_state=seed)
DataImpAll = all_data[feature_names].copy()
DataImp = DataImpAll.dropna(axis = 0, inplace=False)
Ximp=DataImp.loc[:, DataImp.columns != 'PE']
Yimp=DataImp.loc[:, 'PE']
reg.fit(Ximp, Yimp)
X[np.array(data.PE.isnull()),feature_names.index('PE')] = reg.predict(data.loc[data.PE.isnull(),:][['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'NM_M', 'RELPOS']])
return X
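The regressor is trained on all complete rows from both the labeled and the blind wells, and only the rows of the labeled data with a missing PE value are overwritten. A quick sanity check of how much is actually imputed (illustrative only):
In [ ]:
# How many PE values are missing in the labeled data and will be imputed?
print('Missing PE values: %d of %d' % (data.PE.isnull().sum(), len(data)))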
In [10]:
# ## Feature augmentation
# Our guess is that facies do not abruptly change from a given depth layer to the next one. Therefore, we consider features at neighboring layers to be somehow correlated. To possibly exploit this fact, let us perform feature augmentation by:
# - Selecting the features to augment.
# - Aggregating the selected features over a window of neighboring depths.
# - Computing the spatial gradient of the selected features.
# - Computing the spatial gradient of that gradient.
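With the settings used below (7 base features, 6 augmented logs, N_neig=2) this yields 7 + 6*(2*2 + 2) = 43 features per sample. A self-contained toy sketch of the windowing and gradient idea (hypothetical GR values and depths, N_neig=1):
In [ ]:
# Toy illustration: the window concatenates each sample with its neighbours
# (zero-padded at the well boundaries); the gradient is the first difference
# divided by the depth step, with the last value padded with zero.
import numpy as np
gr = np.array([60., 65., 80., 75.])        # hypothetical GR values
dpt = np.array([0.5, 1.0, 1.5, 2.0])       # hypothetical depths
window = np.column_stack((np.r_[0., gr[:-1]], gr, np.r_[gr[1:], 0.]))
grad = np.r_[np.diff(gr) / np.diff(dpt), 0.]
print(window)
print(grad)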
In [11]:
# Feature windows concatenation function
def augment_features_window(X, N_neig, features=-1):
# Parameters
N_row = X.shape[0]
if features==-1:
N_feat = X.shape[1]
features=np.arange(0,X.shape[1])
else:
N_feat = len(features)
# Zero padding
X = np.vstack((np.zeros((N_neig, X.shape[1])), X, (np.zeros((N_neig, X.shape[1])))))
# Loop over windows
X_aug = np.zeros((N_row, N_feat*(2*N_neig)+X.shape[1]))
for r in np.arange(N_row)+N_neig:
this_row = []
for c in np.arange(-N_neig,N_neig+1):
if (c==0):
this_row = np.hstack((this_row, X[r+c,:]))
else:
this_row = np.hstack((this_row, X[r+c,features]))
X_aug[r-N_neig] = this_row
return X_aug
# Feature gradient computation function
def augment_features_gradient(X, depth, features=-1):
if features==-1:
features=np.arange(0,X.shape[1])
# Compute features gradient
d_diff = np.diff(depth).reshape((-1, 1))
d_diff[d_diff==0] = 0.001
X_diff = np.diff(X[:,features], axis=0)
X_grad = X_diff / d_diff
# Compensate for last missing value
X_grad = np.concatenate((X_grad, np.zeros((1, X_grad.shape[1]))))
return X_grad
In [12]:
# Feature augmentation function
def augment_features(X, well, depth, N_neig=1, features=-1, seed=None, pe=True):
seed = seed or 0
if pe:
X = make_pe(X, seed)
if (features==-1):
N_Feat=X.shape[1]
else:
N_Feat=len(features)
# Augment features
X_aug = np.zeros((X.shape[0], X.shape[1] + N_Feat*(N_neig*2+2)))
for w in np.unique(well):
w_idx = np.where(well == w)[0]
X_aug_win = augment_features_window(X[w_idx, :], N_neig,features)
X_aug_grad = augment_features_gradient(X[w_idx, :], depth[w_idx],features)
X_aug_grad_grad = augment_features_gradient(X_aug_grad, depth[w_idx])
X_aug[w_idx, :] = np.concatenate((X_aug_win, X_aug_grad,X_aug_grad_grad), axis=1)
# Find padded rows
padded_rows = np.unique(np.where(X_aug[:, 0:7] == np.zeros((1, 7)))[0])
return X_aug, padded_rows
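As a quick sanity check (illustrative only; it assumes the cells above have been run and, to keep it cheap, skips the PE imputation and augments just two logs), the augmented matrix should have X.shape[1] + N_Feat*(2*N_neig + 2) columns:
In [ ]:
# Augment only GR and PHIND with a window of 1 -> 7 + 2*(2*1 + 2) = 15 columns.
X_demo_aug, _ = augment_features(X.copy(), well, depth, N_neig=1,
                                 features=[feature_names.index('GR'),
                                           feature_names.index('PHIND')],
                                 pe=False)
print(X_demo_aug.shape)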
In [13]:
# Train and test a classifier
def train_and_test(X_tr, y_tr, X_v, well_v, clf):
# Feature normalization
scaler = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(X_tr)
X_tr = scaler.transform(X_tr)
X_v = scaler.transform(X_v)
# Train classifier
clf.fit(X_tr, y_tr)
# Test classifier
y_v_hat = clf.predict(X_v)
# Clean isolated facies for each well
for w in np.unique(well_v):
y_v_hat[well_v==w] = medfilt(y_v_hat[well_v==w], kernel_size=3)
return y_v_hat
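The final median filter removes single-sample facies spikes within each well. A toy illustration with a hypothetical label sequence:
In [ ]:
# The isolated 5 surrounded by 2s is replaced by 2; longer runs are preserved.
from scipy.signal import medfilt
print(medfilt([2, 2, 2, 5, 2, 2, 3, 3, 3], kernel_size=3))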
In [14]:
# Define which features to augment by introducing window and gradients.
augm_Features=['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'RELPOS']
# Get the columns of features to be augmented
feature_indices=[feature_names.index(log) for log in augm_Features]
# # Augment features
# X_aug, padded_rows = augment_features(X, well, depth, N_neig=N_neig, features=feature_indices)
# # Remove padded rows
# data_no_pad = np.setdiff1d(np.arange(0,X_aug.shape[0]), padded_rows)
# X=X[data_no_pad ,:]
# depth=depth[data_no_pad]
# X_aug=X_aug[data_no_pad ,:]
# y=y[data_no_pad]
# data=data.iloc[data_no_pad ,:]
# well=well[data_no_pad]
The choice of training and validation data is paramount in order to avoid overfitting and to find a solution that generalizes well on new data. For this reason, we generate a set of training-validation splits so that each split holds out two wells for validation and every facies is represented in both the training and the validation set.
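LeavePGroupsOut(2) produces one candidate split per pair of held-out wells; a toy illustration with three hypothetical groups:
In [ ]:
# With 3 groups and P=2, every pair of groups is held out once -> 3 splits.
import numpy as np
from sklearn.model_selection import LeavePGroupsOut
groups = np.array(['A', 'A', 'B', 'B', 'C', 'C'])
for tr, va in LeavePGroupsOut(2).split(np.zeros((6, 1)), groups=groups):
    print(np.unique(groups[va]))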
In [15]:
lpgo = LeavePGroupsOut(2)
# Generate splits
split_list = []
for train, val in lpgo.split(X, y, groups=data['Well Name']):
hist_tr = np.histogram(y[train], bins=np.arange(len(facies_names)+1)+.5)
hist_val = np.histogram(y[val], bins=np.arange(len(facies_names)+1)+.5)
if np.all(hist_tr[0] != 0) & np.all(hist_val[0] != 0):
split_list.append({'train':train, 'val':val})
# Print splits
for s, split in enumerate(split_list):
print('Split %d' % s)
print(' training: %s' % (data.iloc[split['train']]['Well Name'].unique()))
print(' validation: %s' % (data.iloc[split['val']]['Well Name'].unique()))
For each set of parameters, let us train the classifier on the training wells of each selected split, predict the corresponding validation wells (cleaning the predictions with a per-well median filter), score the result with the micro-averaged F1 measure, and average the scores over the splits.
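A toy reminder of the scoring metric (for single-label multiclass problems, micro-averaged F1 equals the overall accuracy):
In [ ]:
# 3 of 4 toy predictions are correct -> micro-averaged F1 = 0.75.
from sklearn.metrics import f1_score
print(f1_score([1, 2, 3, 3], [1, 2, 3, 1], average='micro'))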
In [16]:
# if clfType=='XB':
# md_grid = [2,3]
# # mcw_grid = [1]
# gamma_grid = [0.2, 0.3, 0.4]
# ss_grid = [0.7, 0.9, 0.5]
# csb_grid = [0.6,0.8,0.9]
# alpha_grid =[0.2, 0.4, 0.3]
# lr_grid = [0.04, 0.06, 0.05]
# ne_grid = [100,200,300]
# param_grid = []
# for N in md_grid:
# # for M in mcw_grid:
# for S in gamma_grid:
# for L in ss_grid:
# for K in csb_grid:
# for P in alpha_grid:
# for R in lr_grid:
# for E in ne_grid:
# param_grid.append({'maxdepth':N,
# # 'minchildweight':M,
# 'gamma':S,
# 'subsample':L,
# 'colsamplebytree':K,
# 'alpha':P,
# 'learningrate':R,
# 'n_estimators':E})
# if clfType=='XBA':
# learning_rate_grid=[0.12] #[0.06, 0.10, 0.12]
# max_depth_grid=[3] #[3, 5]
# min_child_weight_grid=[6] #[6, 8, 10]
# colsample_bytree_grid = [0.9] #[0.7, 0.9]
# n_estimators_grid=[120] #[80, 120, 150] #[150]
# param_grid = []
# for max_depth in max_depth_grid:
# for min_child_weight in min_child_weight_grid:
# for colsample_bytree in colsample_bytree_grid:
# for learning_rate in learning_rate_grid:
# for n_estimators in n_estimators_grid:
# param_grid.append({'maxdepth':max_depth,
# 'minchildweight':min_child_weight,
# 'colsamplebytree':colsample_bytree,
# 'learningrate':learning_rate,
# 'n_estimators':n_estimators})
# if clfType=='RF':
# N_grid = [50, 100, 150]
# M_grid = [5, 10, 15]
# S_grid = [10, 25, 50, 75]
# L_grid = [2, 3, 4, 5, 10, 25]
# param_grid = []
# for N in N_grid:
# for M in M_grid:
# for S in S_grid:
# for L in L_grid:
# param_grid.append({'N':N, 'M':M, 'S':S, 'L':L})
# if clfType=='GB':
# N_grid = [80] #[80, 100, 120]
# MD_grid = [5] #[3, 5]
# M_grid = [10]
# LR_grid = [0.12] #[0.1, 0.08, 0.12]
# L_grid = [3] #[3, 5, 7]
# S_grid = [25] #[20, 25, 30]
# param_grid = []
# for N in N_grid:
# for M in MD_grid:
# for M1 in M_grid:
# for S in LR_grid:
# for L in L_grid:
# for S1 in S_grid:
# param_grid.append({'N':N, 'MD':M, 'MF':M1,'LR':S,'L':L,'S1':S1})
In [17]:
def getClf(clfType, param):
if clfType=='RF':
clf = OneVsOneClassifier(RandomForestClassifier(n_estimators=param['N'], criterion='entropy',
max_features=param['M'], min_samples_split=param['S'], min_samples_leaf=param['L'],
class_weight='balanced', random_state=seed), n_jobs=-1)
if clfType=='XB':
clf = OneVsOneClassifier(XGBClassifier(
learning_rate = param['learningrate'],
n_estimators=param['n_estimators'],
max_depth=param['maxdepth'],
# min_child_weight=param['minchildweight'],
gamma = param['gamma'],
subsample=param['subsample'],
colsample_bytree=param['colsamplebytree'],
reg_alpha = param['alpha'],
nthread =4,
seed = seed,
) , n_jobs=4)
if clfType=='XBA':
clf = XGBClassifier(
learning_rate = param['learningrate'],
n_estimators=param['n_estimators'],
max_depth=param['maxdepth'],
min_child_weight=param['minchildweight'],
colsample_bytree=param['colsamplebytree'],
nthread = -1,
seed = param['seed']
)
if clfType=='GB':
clf=OneVsOneClassifier(GradientBoostingClassifier(
loss='exponential',
n_estimators=param['N'],
learning_rate=param['LR'],
max_depth=param['MD'],
max_features= param['MF'],
min_samples_leaf=param['L'],
min_samples_split=param['S1'],
random_state=seed,
max_leaf_nodes=None,)
, n_jobs=-1)
return clf
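For example, the 'XBA' configuration used in the final loop can be instantiated as follows (the hyper-parameter values mirror the params dict defined further down; illustrative only, the classifier is not fitted here):
In [ ]:
# Build the plain (non one-vs-one) XGBoost classifier used by the 'XBA' branch.
demo_clf = getClf('XBA', {'learningrate': 0.12, 'maxdepth': 3, 'minchildweight': 6,
                          'colsamplebytree': 0.9, 'n_estimators': 120, 'seed': 0})
print(demo_clf)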
In [18]:
# # For each set of parameters
# score_param = []
# print('features: %d' % X_aug.shape[1])
# exportScores=[]
# for param in param_grid:
# print('features: %d' % X_aug.shape[1])
# # For each data split
# score_split = []
# split = split_list[5]
# split_train_no_pad = split['train']
# # Select training and validation data from current split
# X_tr = X_aug[split_train_no_pad, :]
# X_v = X_aug[split['val'], :]
# y_tr = y[split_train_no_pad]
# y_v = y[split['val']]
# # Select well labels for validation data
# well_v = well[split['val']]
# # Train and test
# y_v_hat = train_and_test(X_tr, y_tr, X_v, well_v, getClf(clfType,param))
# # Score
# score = f1_score(y_v, y_v_hat, average='micro')
# score_split.append(score)
# #print('Split: {0}, Score = {1:0.3f}'.format(split_list.index(split),score))
# #print('Split: , Score = {0:0.3f}'.format(score))
# # Average score for this param
# score_param.append(np.mean(score_split))
# print('Average F1 score = %.3f %s' % (score_param[-1], param))
# exportScores.append('Average F1 score = %.3f %s' % (score_param[-1], param))
In [18]:
# Best set of parameters
# best_idx = np.argmax(score_param)
# param_best = param_grid[best_idx]
# score_best = score_param[best_idx]
# print('\nBest F1 score = %.3f %s' % (score_best, param_best))
# # Store F1 scores for multiple param grids
# if len(exportScores)>1:
# exportScoresFile=open('results_{0}_{1}_sub01b.txt'.format(clfType,N_neig),'wb')
# exportScoresFile.write('features: %d' % X_aug.shape[1])
# for item in exportScores:
# exportScoresFile.write("%s\n" % item)
# exportScoresFile.write('\nBest F1 score = %.3f %s' % (score_best, param_best))
# exportScoresFile.close()
In [19]:
params = {'minchildweight': 6, 'colsamplebytree': 0.9, 'learningrate': 0.12, 'maxdepth': 3, 'n_estimators': 120}
In [20]:
# Prepare test data
well_ts = test_data['Well Name'].values
depth_ts = test_data['Depth'].values
X_ts = test_data[feature_names].values
In [21]:
y_pred = []
print('.' * 100)
for seed in range(100):
np.random.seed(seed)
# Make training data.
X_train, padded_rows = augment_features(X, well, depth, seed=seed+100, N_neig=N_neig, features=feature_indices)
y_train = y
X_train = np.delete(X_train, padded_rows, axis=0)
y_train = np.delete(y_train, padded_rows, axis=0)
params['seed'] = seed
# Train classifier
clf = getClf("XBA", params)
# Make blind data.
X_test, _ = augment_features(X_ts, well_ts, depth_ts, seed=seed+100, pe=False, N_neig=N_neig, features=feature_indices)
# Train and test.
y_ts_hat = train_and_test(X_train, y_train, X_test, well_ts, clf)
# Collect result.
y_pred.append(y_ts_hat)
print('|', end='')
np.save('esaTeam_100_realizations.npy', y_pred)
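The loop above only saves the 100 realizations. One possible way to turn them into a single prediction, not part of the original workflow and shown here only as a sketch, is a per-sample majority vote:
In [ ]:
# Hypothetical aggregation: majority vote across the 100 realizations.
from scipy.stats import mode
y_pred_arr = np.asarray(y_pred)                 # shape (100, n_test_samples)
y_final = mode(y_pred_arr, axis=0)[0].ravel()   # most frequent facies per sample
# e.g. test_data['Facies'] = y_final  # before writing a submission CSV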
In [19]:
# ## Predict labels on test data
# Let us now apply the selected classification technique to test data.
In [20]:
# Training data
# Note: this cell and the next use X_aug and param_best, which are only defined if the
# commented-out feature-augmentation and grid-search cells above are run; the active
# workflow is the 100-realization loop that saves esaTeam_100_realizations.npy.
X_tr = X_aug
y_tr = y
# Prepare test data
well_ts = test_data['Well Name'].values
depth_ts = test_data['Depth'].values
X_ts = test_data[feature_names].values
# Augment Test data features
X_ts, padded_rows = augment_features(X_ts, well_ts,depth_ts,N_neig=N_neig, features=feature_indices)
In [21]:
# Predict test labels
y_ts_hat = train_and_test(X_tr, y_tr, X_ts, well_ts, getClf(clfType,param_best))
# Save predicted labels
test_data['Facies'] = y_ts_hat
test_data.to_csv('esa_predicted_facies_{0}_{1}_sub01c.csv'.format(clfType,N_neig))
In [22]:
# Plot predicted labels
make_facies_log_plot(
test_data[test_data['Well Name'] == 'STUART'],
facies_colors=facies_colors)
make_facies_log_plot(
test_data[test_data['Well Name'] == 'CRAWFORD'],
facies_colors=facies_colors)
mpl.rcParams.update(inline_rc)