I've experimented with the data here using something of a brute-force approach: engineering a large number of additional features and then letting the TPOT library search for a model and refine its parameters. I'll be interested to see whether the result has over-fitted, since the selected Extra Trees Classifier is prone to doing so.
In [1]:
# Initial imports for reading data and first observations
import pandas as pd
import bokeh.plotting as bk
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier
bk.output_notebook()
In [89]:
# Input file paths
train_path = r'training_data.csv'
test_path = r'validation_data_nofacies.csv'
# Read training data to dataframe
train = pd.read_csv(train_path)
# TPOT library requires that the target class is renamed to 'class'
train.rename(columns={'Facies': 'class'}, inplace=True)
In [90]:
train.head()
Out[90]:
In [91]:
train.describe()
Out[91]:
In [92]:
# Integer-encode the categorical Formation and Well Name columns
formations = {}
for i, value in enumerate(train['Formation'].unique()):
    formations[value] = i
    train.loc[train['Formation'] == value, 'Formation'] = i

wells = {}
for i, value in enumerate(train['Well Name'].unique()):
    wells[value] = i
    train.loc[train['Well Name'] == value, 'Well Name'] = i
In [7]:
def log_plot(well_frame):
    # One narrow panel per log curve, all sharing the first panel's depth axis
    plots = []
    for index, (name, column) in enumerate(well_frame.items()):
        if index == 0:
            plots.append(bk.figure(height=800, width=150))
        else:
            plots.append(bk.figure(height=800, width=75, y_range=plots[0].y_range))
        plots[-1].line(column, well_frame['Depth'])
        plots[-1].yaxis.visible = False
        plots[-1].title.text = name
    plots[0].yaxis.visible = True
    grid = bk.gridplot([plots])
    bk.show(grid)

for i, group in train.groupby('Well Name'):
    well_frame = group.sort_values('Depth')
    log_plot(well_frame)
In [93]:
facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS',
                 'WS', 'D', 'PS', 'BS']
Feature construction and data clean-up:
1. Z-score normalisation of the data.
2. Bin each of the measurement parameters into quartiles, encoded as one-hot indicator columns. Most of the classification methods handle discretised features like these more easily.
3. Create a series of 'adjacent' features by finding, for each well, the sample immediately above and below each depth, and deriving a set of features from those neighbouring values (see the sketch after this list).
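Before applying these steps to the real frame, here is a minimal sketch of steps 2 and 3 on a made-up two-well frame (the `toy`, `edges`, and `neighbours` names are hypothetical; only `Well Name`, `Depth`, and `GR` mirror the real columns):

import numpy as np
import pandas as pd

# Invented values, purely for illustration
toy = pd.DataFrame({
    'Well Name': [0, 0, 0, 1, 1, 1],
    'Depth': [100.0, 100.5, 101.0, 200.0, 200.5, 201.0],
    'GR': [55.0, 80.0, 62.0, 90.0, 70.0, 65.0],
})

# Step 2: one-hot quartile membership for GR; the bins are half-open, so the
# column maximum falls outside the last bin, matching the in_range helper below
edges = [toy['GR'].quantile(q) for q in np.linspace(0, 1.0, 5)]
for j in range(4):
    toy['GR_{0}'.format(j)] = ((toy['GR'] >= edges[j]) &
                               (toy['GR'] < edges[j + 1])).astype(int)

# Step 3: neighbouring samples within each well; end points repeat themselves
neighbours = toy.groupby('Well Name', group_keys=False)[['Depth', 'GR']]
above = neighbours.apply(lambda df: df.shift(-1).ffill())
below = neighbours.apply(lambda df: df.shift(1).bfill())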
In [94]:
train_columns = train.columns[1:]

# Z-score normalise every column except the 'class' label
std_scaler = preprocessing.StandardScaler().fit(train[train_columns])
train_std = std_scaler.transform(train[train_columns])
for i, column in enumerate(train_columns):
    train.loc[:, column] = train_std[:, i]
master_columns = train.columns[4:]

def in_range(row, vmin, vmax, variable):
    # Indicator: 1 if the value falls inside the half-open [vmin, vmax) bin
    if vmin <= row[variable] < vmax:
        return 1
    else:
        return 0

# One-hot encode quartile membership for each measurement
for i, column in train[master_columns].items():
    ds = np.linspace(0, 1.0, 5)
    quantiles = [column.quantile(n) for n in ds]
    for j in range(len(quantiles) - 1):
        train[i + '_{0}'.format(j)] = train.apply(
            lambda row: in_range(row, quantiles[j], quantiles[j + 1], i), axis=1)
master_columns = train.columns[4:]
# For each well, capture the sample immediately above and below each depth
above = []
below = []
for i, group in train.groupby('Well Name'):
    df = group.sort_values('Depth')
    u = df.shift(-1).ffill()
    b = df.shift(1).bfill()
    above.append(u[master_columns])
    below.append(b[master_columns])

above_frame = pd.concat(above)
above_frame.columns = ['above_' + column for column in above_frame.columns]
below_frame = pd.concat(below)
below_frame.columns = ['below_' + column for column in below_frame.columns]
frame = pd.concat((train, above_frame, below_frame), axis=1)
In [96]:
train_columns = frame.columns[4:]
train_f, test_f = train_test_split(frame, test_size=0.1, random_state=7)
TPOT uses a genetic algorithm to search over model pipelines and tune their parameters for the most effective fit. Be warned: this can take quite a while if you want to re-run this part!
In [26]:
tpot = TPOTClassifier(verbosity = 2, generations = 5, max_eval_time_mins = 30)
tpot.fit(train_f[train_columns], train_f['class'])
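For quicker experimentation the search budget can be cut right down (a minimal sketch; `population_size`, `max_time_mins`, and `random_state` are standard TPOTClassifier parameters, but the small values here are illustrative and not the settings used for the reported results):

# Hypothetical reduced search budget for a quick sanity-check run,
# not the configuration used for the results in this notebook
quick_tpot = TPOTClassifier(
    verbosity=2,
    generations=2,          # fewer GA generations
    population_size=20,     # smaller pipeline population per generation
    max_time_mins=15,       # hard cap on total search time
    random_state=7,         # reproducible search
)
quick_tpot.fit(train_f[train_columns], train_f['class'])
print(quick_tpot.score(test_f[train_columns], test_f['class']))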
In [61]:
tpot.score(test_f[train_columns], test_f['class'])
Out[61]:
In [28]:
tpot.export('contest_export.py')
In [62]:
# Predict back over the full training frame to inspect the fit
result = tpot.predict(frame[train_columns])

from sklearn.metrics import confusion_matrix
from classification_utilities import display_cm, display_adj_cm
conf = confusion_matrix(frame['class'], result)
display_cm(conf, facies_labels, hide_zeros=True, display_metrics=True)
def accuracy(conf):
    # Fraction of samples on the confusion-matrix diagonal
    total_correct = 0.
    nb_classes = conf.shape[0]
    for i in np.arange(0, nb_classes):
        total_correct += conf[i][i]
    acc = total_correct / sum(sum(conf))
    return acc

print(accuracy(conf))
adjacent_facies = np.array([[1], [0, 2], [1], [4], [3, 5], [4, 6, 7], [5, 7],
                            [5, 6, 8], [6, 7]], dtype=object)

def accuracy_adjacent(conf, adjacent_facies):
    # Count a prediction as correct if it is the true facies or one adjacent to it
    nb_classes = conf.shape[0]
    total_correct = 0.
    for i in np.arange(0, nb_classes):
        total_correct += conf[i][i]
        for j in adjacent_facies[i]:
            total_correct += conf[i][j]
    return total_correct / sum(sum(conf))

print(accuracy_adjacent(conf, adjacent_facies))
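To make the adjacent-facies metric concrete, here is a hand-checkable toy case (a minimal sketch with three made-up classes, where classes 0 and 1 are adjacent and class 2 stands alone):

# Toy 3-class confusion matrix: rows are true classes, columns predictions
toy_conf = np.array([[4, 2, 0],
                     [1, 3, 0],
                     [0, 0, 5]])
toy_adjacent = np.array([[1], [0], []], dtype=object)

print(accuracy(toy_conf))                         # (4 + 3 + 5) / 15 = 0.8
print(accuracy_adjacent(toy_conf, toy_adjacent))  # (4+2 + 3+1 + 5) / 15 = 1.0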
Run this to generate predictions from the trained model on the validation data.
In [102]:
test_path = r'validation_data_nofacies.csv'
# Read the validation data (no facies labels) to a dataframe
test = pd.read_csv(test_path)
test_columns = test.columns

# Integer-encode the categorical Formation and Well Name columns
formations = {}
for i, value in enumerate(test['Formation'].unique()):
    formations[value] = i
    test.loc[test['Formation'] == value, 'Formation'] = i

wells = {}
for i, value in enumerate(test['Well Name'].unique()):
    wells[value] = i
    test.loc[test['Well Name'] == value, 'Well Name'] = i
# Z-score normalise (note: this scaler is fit on the validation data itself)
std_scaler = preprocessing.StandardScaler().fit(test[test_columns])
test_std = std_scaler.transform(test[test_columns])
for i, column in enumerate(test_columns):
    test.loc[:, column] = test_std[:, i]
master_columns = test.columns[3:]

def in_range(row, vmin, vmax, variable):
    # Indicator: 1 if the value falls inside the half-open [vmin, vmax) bin
    if vmin <= row[variable] < vmax:
        return 1
    else:
        return 0

# One-hot encode quartile membership for each measurement
for i, column in test[master_columns].items():
    ds = np.linspace(0, 1.0, 5)
    quantiles = [column.quantile(n) for n in ds]
    for j in range(len(quantiles) - 1):
        test[i + '_{0}'.format(j)] = test.apply(
            lambda row: in_range(row, quantiles[j], quantiles[j + 1], i), axis=1)
master_columns = test.columns[3:]

# For each well, capture the sample immediately above and below each depth
above = []
below = []
for i, group in test.groupby('Well Name'):
    df = group.sort_values('Depth')
    u = df.shift(-1).ffill()
    b = df.shift(1).bfill()
    above.append(u[master_columns])
    below.append(b[master_columns])

above_frame = pd.concat(above)
above_frame.columns = ['above_' + column for column in above_frame.columns]
below_frame = pd.concat(below)
below_frame.columns = ['below_' + column for column in below_frame.columns]
frame = pd.concat((test, above_frame, below_frame), axis=1)
test_columns = frame.columns[3:]
result = tpot.predict(frame[test_columns])
In [107]:
result
Out[107]:
In [113]:
output_frame = pd.read_csv(test_path)
output_frame['Facies'] = result
output_frame.to_csv('Well Facies Prediction - Test Data Set.csv')