Copy of Paolo Bestagini's "Try 2", augmented, by Alan Richardson (Ausar Geophysical), with an ML estimator for PE in the wells where it is missing (rather than just using the mean).
In the following, we provide a possible solution to the facies classification problem described at https://github.com/seg/2016-ml-contest.
The proposed algorithm is based on the use of random forests combined in one-vs-one multiclass strategy. In particular, we would like to study the effect of:
Let us import the used packages and define some parameters (e.g., colors, labels, etc.).
In [2]:
# Import
from __future__ import division
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams['figure.figsize'] = (20.0, 10.0)
inline_rc = dict(mpl.rcParams)
from classification_utilities import make_facies_log_plot
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import LeavePGroupsOut
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsOneClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from scipy.signal import medfilt
In [3]:
import sys, scipy, sklearn
print('Python: ' + sys.version.split('\n')[0])
print(' ' + sys.version.split('\n')[1])
print('Pandas: ' + pd.__version__)
print('Numpy: ' + np.__version__)
print('Scipy: ' + scipy.__version__)
print('Sklearn: ' + sklearn.__version__)
In [4]:
# Parameters
feature_names = ['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
facies_names = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS', 'WS', 'D', 'PS', 'BS']
facies_colors = ['#F4D03F', '#F5B041','#DC7633','#6E2C00', '#1B4F72','#2E86C1', '#AED6F1', '#A569BD', '#196F3D']
In [5]:
# Load data from file
data = pd.read_csv('../facies_vectors.csv')
In [6]:
# Store features and labels
X = data[feature_names].values # features
y = data['Facies'].values # labels
In [7]:
# Store well labels and depths
well = data['Well Name'].values
depth = data['Depth'].values
Let us inspect the features we are working with. This step is useful to understand how to normalize them and how to devise a correct cross-validation strategy. Specifically, it is possible to observe that:
In [8]:
# Define function for plotting feature statistics
def plot_feature_stats(X, y, feature_names, facies_colors, facies_names):
# Remove NaN
nan_idx = np.any(np.isnan(X), axis=1)
X = X[np.logical_not(nan_idx), :]
y = y[np.logical_not(nan_idx)]
# Merge features and labels into a single DataFrame
features = pd.DataFrame(X, columns=feature_names)
labels = pd.DataFrame(y, columns=['Facies'])
for f_idx, facies in enumerate(facies_names):
labels[labels[:] == f_idx] = facies
data = pd.concat((labels, features), axis=1)
# Plot features statistics
facies_color_map = {}
for ind, label in enumerate(facies_names):
facies_color_map[label] = facies_colors[ind]
sns.pairplot(data, hue='Facies', palette=facies_color_map, hue_order=list(reversed(facies_names)))
In [9]:
# Feature distribution
plot_feature_stats(X, y, feature_names, facies_colors, facies_names)
mpl.rcParams.update(inline_rc)
In [10]:
# Facies per well
for w_idx, w in enumerate(np.unique(well)):
ax = plt.subplot(3, 4, w_idx+1)
hist = np.histogram(y[well == w], bins=np.arange(len(facies_names)+1)+.5)
plt.bar(np.arange(len(hist[0])), hist[0], color=facies_colors, align='center')
ax.set_xticks(np.arange(len(hist[0])))
ax.set_xticklabels(facies_names)
ax.set_title(w)
In [11]:
# Features per well
for w_idx, w in enumerate(np.unique(well)):
ax = plt.subplot(3, 4, w_idx+1)
hist = np.logical_not(np.any(np.isnan(X[well == w, :]), axis=0))
plt.bar(np.arange(len(hist)), hist, color=facies_colors, align='center')
ax.set_xticks(np.arange(len(hist)))
ax.set_xticklabels(feature_names)
ax.set_yticks([0, 1])
ax.set_yticklabels(['miss', 'hit'])
ax.set_title(w)
In [22]:
reg = RandomForestRegressor(max_features='sqrt', n_estimators=50)
DataImpAll = data[feature_names].copy()
DataImp = DataImpAll.dropna(axis = 0, inplace=False)
Ximp=DataImp.loc[:, DataImp.columns != 'PE']
Yimp=DataImp.loc[:, 'PE']
reg.fit(Ximp, Yimp)
X[np.array(DataImpAll.PE.isnull()),4] = reg.predict(DataImpAll.loc[DataImpAll.PE.isnull(),:].drop('PE',axis=1,inplace=False))
Our guess is that facies do not abrutly change from a given depth layer to the next one. Therefore, we consider features at neighboring layers to be somehow correlated. To possibly exploit this fact, let us perform feature augmentation by:
In [23]:
# Feature windows concatenation function
def augment_features_window(X, N_neig):
# Parameters
N_row = X.shape[0]
N_feat = X.shape[1]
# Zero padding
X = np.vstack((np.zeros((N_neig, N_feat)), X, (np.zeros((N_neig, N_feat)))))
# Loop over windows
X_aug = np.zeros((N_row, N_feat*(2*N_neig+1)))
for r in np.arange(N_row)+N_neig:
this_row = []
for c in np.arange(-N_neig,N_neig+1):
this_row = np.hstack((this_row, X[r+c]))
X_aug[r-N_neig] = this_row
return X_aug
In [24]:
# Feature gradient computation function
def augment_features_gradient(X, depth):
# Compute features gradient
d_diff = np.diff(depth).reshape((-1, 1))
d_diff[d_diff==0] = 0.001
X_diff = np.diff(X, axis=0)
X_grad = X_diff / d_diff
# Compensate for last missing value
X_grad = np.concatenate((X_grad, np.zeros((1, X_grad.shape[1]))))
return X_grad
In [25]:
# Feature augmentation function
def augment_features(X, well, depth, N_neig=1):
# Augment features
X_aug = np.zeros((X.shape[0], X.shape[1]*(N_neig*2+2)))
for w in np.unique(well):
w_idx = np.where(well == w)[0]
X_aug_win = augment_features_window(X[w_idx, :], N_neig)
X_aug_grad = augment_features_gradient(X[w_idx, :], depth[w_idx])
X_aug[w_idx, :] = np.concatenate((X_aug_win, X_aug_grad), axis=1)
# Find padded rows
padded_rows = np.unique(np.where(X_aug[:, 0:7] == np.zeros((1, 7)))[0])
return X_aug, padded_rows
In [26]:
# Augment features
X_aug, padded_rows = augment_features(X, well, depth)
The choice of training and validation data is paramount in order to avoid overfitting and find a solution that generalizes well on new data. For this reason, we generate a set of training-validation splits so that:
In [27]:
# Initialize model selection methods
lpgo = LeavePGroupsOut(2)
# Generate splits
split_list = []
for train, val in lpgo.split(X, y, groups=data['Well Name']):
hist_tr = np.histogram(y[train], bins=np.arange(len(facies_names)+1)+.5)
hist_val = np.histogram(y[val], bins=np.arange(len(facies_names)+1)+.5)
if np.all(hist_tr[0] != 0) & np.all(hist_val[0] != 0):
split_list.append({'train':train, 'val':val})
# Print splits
for s, split in enumerate(split_list):
print('Split %d' % s)
print(' training: %s' % (data['Well Name'][split['train']].unique()))
print(' validation: %s' % (data['Well Name'][split['val']].unique()))
Let us perform the following steps for each set of parameters:
At the end of the loop, we select the classifier that maximizes the average F1 score on the validation set. Hopefully, this classifier should be able to generalize well on new data.
In [28]:
# Parameters search grid (uncomment parameters for full grid search... may take a lot of time)
N_grid = [100] # [50, 100, 150]
M_grid = [10] # [5, 10, 15]
S_grid = [25] # [10, 25, 50, 75]
L_grid = [5] # [2, 3, 4, 5, 10, 25]
param_grid = []
for N in N_grid:
for M in M_grid:
for S in S_grid:
for L in L_grid:
param_grid.append({'N':N, 'M':M, 'S':S, 'L':L})
In [29]:
# Train and test a classifier
def train_and_test(X_tr, y_tr, X_v, well_v, param):
# Feature normalization
scaler = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(X_tr)
X_tr = scaler.transform(X_tr)
X_v = scaler.transform(X_v)
# Train classifier
clf = OneVsOneClassifier(RandomForestClassifier(n_estimators=param['N'], criterion='entropy',
max_features=param['M'], min_samples_split=param['S'], min_samples_leaf=param['L'],
class_weight='balanced', random_state=0), n_jobs=-1)
clf.fit(X_tr, y_tr)
# Test classifier
y_v_hat = clf.predict(X_v)
# Clean isolated facies for each well
for w in np.unique(well_v):
y_v_hat[well_v==w] = medfilt(y_v_hat[well_v==w], kernel_size=5)
return y_v_hat
In [30]:
# For each set of parameters
score_param = []
for param in param_grid:
# For each data split
score_split = []
for split in split_list:
# Remove padded rows
split_train_no_pad = np.setdiff1d(split['train'], padded_rows)
# Select training and validation data from current split
X_tr = X_aug[split_train_no_pad, :]
X_v = X_aug[split['val'], :]
y_tr = y[split_train_no_pad]
y_v = y[split['val']]
# Select well labels for validation data
well_v = well[split['val']]
# Train and test
y_v_hat = train_and_test(X_tr, y_tr, X_v, well_v, param)
# Score
score = f1_score(y_v, y_v_hat, average='micro')
score_split.append(score)
# Average score for this param
score_param.append(np.mean(score_split))
print('F1 score = %.3f %s' % (score_param[-1], param))
# Best set of parameters
best_idx = np.argmax(score_param)
param_best = param_grid[best_idx]
score_best = score_param[best_idx]
print('\nBest F1 score = %.3f %s' % (score_best, param_best))
In [31]:
# Load data from file
test_data = pd.read_csv('../validation_data_nofacies.csv')
In [32]:
# Prepare training data
X_tr = X
y_tr = y
# Augment features
X_tr, padded_rows = augment_features(X_tr, well, depth)
# Removed padded rows
X_tr = np.delete(X_tr, padded_rows, axis=0)
y_tr = np.delete(y_tr, padded_rows, axis=0)
In [33]:
# Prepare test data
well_ts = test_data['Well Name'].values
depth_ts = test_data['Depth'].values
X_ts = test_data[feature_names].values
# Augment features
X_ts, padded_rows = augment_features(X_ts, well_ts, depth_ts)
In [34]:
# Predict test labels
y_ts_hat = train_and_test(X_tr, y_tr, X_ts, well_ts, param_best)
In [35]:
# Save predicted labels
test_data['Facies'] = y_ts_hat
test_data.to_csv('ar4_predicted_facies_submission002.csv')
In [36]:
# Plot predicted labels
make_facies_log_plot(
test_data[test_data['Well Name'] == 'STUART'],
facies_colors=facies_colors)
make_facies_log_plot(
test_data[test_data['Well Name'] == 'CRAWFORD'],
facies_colors=facies_colors)
mpl.rcParams.update(inline_rc)