In this approach to the facies classification problem (https://github.com/seg/2016-ml-contest) we explore the following strategies: augmenting the well-log features with a depth window and depth gradients (Paolo Bestagini's routine), searching for a pipeline with TPOT, and averaging an XGBoost classifier over many random seeds.
In [1]:
# %%sh
# pip install pandas
# pip install scikit-learn
# pip install tpot
# pip install xgboost
In [2]:
from __future__ import print_function
import numpy as np
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from classification_utilities import display_cm, display_adj_cm
from sklearn.metrics import confusion_matrix, f1_score
from sklearn import preprocessing
from sklearn.model_selection import LeavePGroupsOut
from sklearn.multiclass import OneVsOneClassifier
from sklearn.ensemble import RandomForestClassifier
from scipy.signal import medfilt
In [4]:
#Load Data
data = pd.read_csv('../facies_vectors.csv')
# Parameters
feature_names = ['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
facies_names = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS', 'WS', 'D', 'PS', 'BS']
facies_colors = ['#F4D03F', '#F5B041','#DC7633','#6E2C00', '#1B4F72','#2E86C1', '#AED6F1', '#A569BD', '#196F3D']
# Store features and labels
X = data[feature_names].values
y = data['Facies'].values
# Store well labels and depths
well = data['Well Name'].values
depth = data['Depth'].values
# Fill 'PE' missing values with mean
imp = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(X)
X = imp.transform(X)
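Note that preprocessing.Imputer was removed in scikit-learn 0.22; on newer versions the equivalent step is sklearn.impute.SimpleImputer. A minimal sketch of the same mean imputation, assuming a recent scikit-learn:

from sklearn.impute import SimpleImputer

# Replace NaNs (here only present in the 'PE' column) with the column mean
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imp.fit_transform(X)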
We proceed to run Paolo Bestagini's routine to include a small window of values, to account for the spatial component in the log analysis, as well as the gradient information with respect to depth. This will be our prepared training dataset.
In [5]:
# Feature windows concatenation function
def augment_features_window(X, N_neig):
    # Parameters
    N_row = X.shape[0]
    N_feat = X.shape[1]
    # Zero padding
    X = np.vstack((np.zeros((N_neig, N_feat)), X, np.zeros((N_neig, N_feat))))
    # Loop over windows
    X_aug = np.zeros((N_row, N_feat * (2 * N_neig + 1)))
    for r in np.arange(N_row) + N_neig:
        this_row = []
        for c in np.arange(-N_neig, N_neig + 1):
            this_row = np.hstack((this_row, X[r + c]))
        X_aug[r - N_neig] = this_row
    return X_aug
# Feature gradient computation function
def augment_features_gradient(X, depth):
    # Compute features gradient
    d_diff = np.diff(depth).reshape((-1, 1))
    d_diff[d_diff == 0] = 0.001
    X_diff = np.diff(X, axis=0)
    X_grad = X_diff / d_diff
    # Compensate for last missing value
    X_grad = np.concatenate((X_grad, np.zeros((1, X_grad.shape[1]))))
    return X_grad
# Feature augmentation function
def augment_features(X, well, depth, N_neig=1):
    # Augment features well by well: window features plus depth gradients
    X_aug = np.zeros((X.shape[0], X.shape[1] * (N_neig * 2 + 2)))
    for w in np.unique(well):
        w_idx = np.where(well == w)[0]
        X_aug_win = augment_features_window(X[w_idx, :], N_neig)
        X_aug_grad = augment_features_gradient(X[w_idx, :], depth[w_idx])
        X_aug[w_idx, :] = np.concatenate((X_aug_win, X_aug_grad), axis=1)
    # Find padded rows (rows whose window includes the zero padding)
    padded_rows = np.unique(np.where(X_aug[:, 0:7] == np.zeros((1, 7)))[0])
    return X_aug, padded_rows
In [6]:
X_aug, padded_rows = augment_features(X, well, depth)
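With the default N_neig=1, each sample ends up with a three-sample window of the 7 base features plus their 7 depth gradients, i.e. 7 * (2*1 + 2) = 28 augmented features. A quick sanity check (illustrative only):

# Each row: 7 features x 3-sample window (21 values) + 7 gradients = 28 columns
assert X_aug.shape == (X.shape[0], X.shape[1] * 4)
print(X.shape, '->', X_aug.shape)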
In [7]:
# # Initialize model selection methods
# lpgo = LeavePGroupsOut(2)
# # Generate splits, keeping only those where every facies appears in both sets
# split_list = []
# for train, val in lpgo.split(X, y, groups=data['Well Name']):
#     hist_tr = np.histogram(y[train], bins=np.arange(len(facies_names)+1)+.5)
#     hist_val = np.histogram(y[val], bins=np.arange(len(facies_names)+1)+.5)
#     if np.all(hist_tr[0] != 0) & np.all(hist_val[0] != 0):
#         split_list.append({'train': train, 'val': val})
In [8]:
def preprocess():
    # Preprocess data to use in model.
    # Note: requires split_list from the (commented-out) LeavePGroupsOut cell above;
    # a single split (index 5) is used for training and validation.
    X_train_aux = []
    X_test_aux = []
    y_train_aux = []
    y_test_aux = []
    # Select one data split
    split = split_list[5]
    # Remove padded rows
    split_train_no_pad = np.setdiff1d(split['train'], padded_rows)
    # Select training and validation data from current split
    X_tr = X_aug[split_train_no_pad, :]
    X_v = X_aug[split['val'], :]
    y_tr = y[split_train_no_pad]
    y_v = y[split['val']]
    # Select well labels for validation data
    well_v = well[split['val']]
    # Feature normalization
    scaler = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(X_tr)
    X_tr = scaler.transform(X_tr)
    X_v = scaler.transform(X_v)
    X_train_aux.append(X_tr)
    X_test_aux.append(X_v)
    y_train_aux.append(y_tr)
    y_test_aux.append(y_v)
    X_train = np.concatenate(X_train_aux)
    X_test = np.concatenate(X_test_aux)
    y_train = np.concatenate(y_train_aux)
    y_test = np.concatenate(y_test_aux)
    return X_train, X_test, y_train, y_test
In [9]:
# from tpot import TPOTClassifier
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = preprocess()
# tpot = TPOTClassifier(generations=5, population_size=20,
# verbosity=2,max_eval_time_mins=20,
# max_time_mins=100,scoring='f1_micro',
# random_state = 17)
# tpot.fit(X_train, y_train)
# print(tpot.score(X_test, y_test))
# tpot.export('FinalPipeline.py')
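For reference, the contest metric (micro-averaged F1) can be computed on the held-out wells using the f1_score import above. A minimal sketch, assuming split_list has been generated by uncommenting the LeavePGroupsOut cell, and using a plain RandomForestClassifier as a stand-in model:

# Hypothetical evaluation on the validation wells returned by preprocess()
X_train, X_test, y_train, y_test = preprocess()
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
print(f1_score(y_test, clf.predict(X_test), average='micro'))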
In [10]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
In [12]:
# Train and test a classifier.
# The classifier is passed in so we can iterate over many seeds later.
def train_and_test(X_tr, y_tr, X_v, well_v, clf):
    # Feature normalization
    scaler = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(X_tr)
    X_tr = scaler.transform(X_tr)
    X_v = scaler.transform(X_v)
    # Train classifier
    clf.fit(X_tr, y_tr)
    # Test classifier
    y_v_hat = clf.predict(X_v)
    # Clean isolated facies for each well with a median filter
    for w in np.unique(well_v):
        y_v_hat[well_v == w] = medfilt(y_v_hat[well_v == w], kernel_size=5)
    return y_v_hat
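The kernel-size-5 median filter removes isolated, single-sample facies predictions within a well. A minimal illustration with a made-up label sequence:

# Hypothetical predictions for one well: a single isolated facies 7
# surrounded by facies 2 is smoothed back to 2.
print(medfilt(np.array([2, 2, 2, 7, 2, 2, 2]), kernel_size=5))  # all 2s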
In [14]:
#Load testing data
test_data = pd.read_csv('../validation_data_nofacies.csv')
# Train classifier
#clf = make_pipeline(make_union(VotingClassifier([("est", ExtraTreesClassifier(criterion="gini", max_features=1.0, n_estimators=500))]), FunctionTransformer(lambda X: X)), XGBClassifier(learning_rate=0.73, max_depth=10, min_child_weight=10, n_estimators=500, subsample=0.27))
#clf = make_pipeline( KNeighborsClassifier(n_neighbors=5, weights="distance") )
#clf = make_pipeline(MaxAbsScaler(),make_union(VotingClassifier([("est", RandomForestClassifier(n_estimators=500))]), FunctionTransformer(lambda X: X)),ExtraTreesClassifier(criterion="entropy", max_features=0.0001, n_estimators=500))
# * clf = make_pipeline( make_union(VotingClassifier([("est", BernoulliNB(alpha=60.0, binarize=0.26, fit_prior=True))]), FunctionTransformer(lambda X: X)),RandomForestClassifier(n_estimators=500))
# # Prepare training data
# X_tr = X
# y_tr = y
# # Augment features
# X_tr, padded_rows = augment_features(X_tr, well, depth)
# # Removed padded rows
# X_tr = np.delete(X_tr, padded_rows, axis=0)
# y_tr = np.delete(y_tr, padded_rows, axis=0)
# Prepare test data
well_ts = test_data['Well Name'].values
depth_ts = test_data['Depth'].values
X_ts = test_data[feature_names].values

y_pred = []
print('.' * 100)
for seed in range(100):
    np.random.seed(seed)

    # Make training data.
    X_train, padded_rows = augment_features(X, well, depth)
    y_train = y
    X_train = np.delete(X_train, padded_rows, axis=0)
    y_train = np.delete(y_train, padded_rows, axis=0)

    # Train classifier
    clf = make_pipeline(XGBClassifier(learning_rate=0.12,
                                      max_depth=3,
                                      min_child_weight=10,
                                      n_estimators=150,
                                      seed=seed,
                                      colsample_bytree=0.9))

    # Make blind data.
    X_test, _ = augment_features(X_ts, well_ts, depth_ts)

    # Train and test.
    y_ts_hat = train_and_test(X_train, y_train, X_test, well_ts, clf)

    # Collect result.
    y_pred.append(y_ts_hat)
    print('|', end='')

np.save('LA_Team_100_realizations.npy', y_pred)
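One possible way to collapse the 100 saved realizations into a single label per test sample is a per-sample majority vote. A minimal sketch; this combination step is an assumption, not necessarily how the final submission was produced:

from scipy import stats

y_pred_arr = np.array(y_pred)                          # shape: (100, n_test_samples)
y_final = stats.mode(y_pred_arr, axis=0)[0].ravel()    # most frequent facies per sample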
In [ ]:
# # Augment features
# X_ts, padded_rows = augment_features(X_ts, well_ts, depth_ts)
# # Predict test labels
# y_ts_hat = train_and_test(X_tr, y_tr, X_ts, well_ts)
# # Save predicted labels
# test_data['Facies'] = y_ts_hat
# test_data.to_csv('Prediction_XX_Final.csv')