I've had a play with some of the data here and used something of a brute force approach, by creating a large number of additional features and then using the TPOT library to train a model and refine the model parameters. I will be interested to see whether this has over-fitted, as the selected Extra Trees Classifier can do that.
In [1]:
# Initial imports for reading data and first observations
import pandas as pd
import bokeh.plotting as bk
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier
bk.output_notebook()
In [2]:
# Locations of the contest CSV files, relative to this notebook.
train_path = r'../training_data.csv'
test_path = r'../validation_data_nofacies.csv'

# Load the training set. The target column 'Facies' is renamed to 'class'
# (per the original author's note, the TPOT library expects that name).
train = pd.read_csv(train_path)
train = train.rename(columns={'Facies': 'class'})
In [3]:
# Encode the categorical 'Formation' and 'Well Name' columns as integer codes,
# numbered in order of first appearance. The lookup tables are kept in
# `formations` and `wells` (label -> code).
#
# Each mapping is built in full *before* the column is rewritten, and is then
# applied in a single pass. Overwriting the column label-by-label while still
# comparing against raw labels would re-encode rows whenever a numeric label
# happened to equal an already-assigned code.
formations = {
    value: code for code, value in enumerate(train['Formation'].unique())
}
train['Formation'] = train['Formation'].map(formations)

wells = {
    value: code for code, value in enumerate(train['Well Name'].unique())
}
train['Well Name'] = train['Well Name'].map(wells)
In [4]:
# Short text labels for the facies classes, in list order.
facies_labels = [
    'SS',
    'CSiS',
    'FSiS',
    'SiSh',
    'MS',
    'WS',
    'D',
    'PS',
    'BS',
]
Feature construction and data clean-up.
1. Z-score normalisation of data.
2. Group each of the measurement parameters into quartiles. Most of the classification methods find data like this easier to work with.
3. Create a series of 'adjacent' parameters by looking up, within each well, the samples immediately above and below each depth sample. Add these neighbouring values as extra features.
In [5]:
# --- 1. Z-score normalisation ------------------------------------------------
# Standardise every column except the first one (presumably the 'class'
# target -- confirm against the CSV column order).
train_columns = train.columns[1:]
std_scaler = preprocessing.StandardScaler().fit(train[train_columns])
train_std = std_scaler.transform(train[train_columns])
# `train_std_frame` is an alias of `train`, so the raw values are overwritten.
train_std_frame = train
for position, column in enumerate(train_columns):
    # Replace the whole column rather than writing through `.loc[:, column]`:
    # writing floats into an integer/object column in place is deprecated in
    # recent pandas releases.
    train_std_frame[column] = train_std[:, position]
train = train_std_frame

# Measurement columns: everything from the fifth column onward.
master_columns = train.columns[4:]


def in_range(row, vmin, vmax, variable):
    """Return 1 if ``vmin <= row[variable] < vmax``, otherwise 0.

    Kept for any code that still calls it; the quartile loop below uses
    vectorised comparisons instead of a row-wise ``apply``.
    """
    return 1 if vmin <= row[variable] < vmax else 0


# --- 2. Quartile indicator features ------------------------------------------
# For each measurement column add four 0/1 columns '<name>_0' .. '<name>_3'
# flagging which quartile of that column the sample falls in.
ds = np.linspace(0, 1.0, 5)
for name, values in train[master_columns].items():
    quantiles = [values.quantile(n) for n in ds]
    bin_edges = list(zip(quantiles[:-1], quantiles[1:]))
    for j, (lower, upper) in enumerate(bin_edges):
        # Compare against the quantile *values* (not the fractions in `ds`).
        # The top bin is closed on the right so the column maximum is
        # assigned to a quartile as well.
        if j == len(bin_edges) - 1:
            member = (values >= lower) & (values <= upper)
        else:
            member = (values >= lower) & (values < upper)
        train['{0}_{1}'.format(name, j)] = member.astype(int)

# --- 3. Adjacent-sample features ---------------------------------------------
# Refresh the feature list so it includes the indicator columns added above.
master_columns = train.columns[4:]
above = []
below = []
for _, group in train.groupby('Well Name'):
    df = group.sort_values('Depth')
    # shift(-1) gives each row the values of the next row in depth order and
    # shift(1) those of the previous row. The end row with no neighbour is
    # filled from the adjacent row of the shifted frame.
    # NOTE(review): with ascending depth, shift(-1) is the deeper sample, so
    # the 'above_'/'below_' prefixes look swapped relative to physical
    # position. The names are kept so the feature columns stay stable.
    u = df.shift(-1).ffill()
    b = df.shift(1).bfill()
    above.append(u[master_columns])
    below.append(b[master_columns])
above_frame = pd.concat(above)
above_frame.columns = ['above_' + label for label in above_frame.columns]
below_frame = pd.concat(below)
below_frame.columns = ['below_' + label for label in below_frame.columns]
# Column-wise concat aligns the neighbour features back onto `train` by index.
frame = pd.concat((train, above_frame, below_frame), axis=1)
In [6]:
# Feature columns fed to the model: everything in `frame` from the fifth
# column onward.
train_columns = frame.columns[4:]
# Name of the target column, kept as a one-element list.
train_vector = ['class']
# Optional hold-out split used by the (commented-out) TPOT search cells:
# train_f, test_f = train_test_split(frame, test_size = 0.1, random_state = 7)
TPOT uses a genetic algorithm to tune model parameters for the most effective fit. This can take quite a while to process if you want to re-run this part!
In [7]:
# tpot = TPOTClassifier(verbosity=2, generations=5, max_eval_time_mins=30)
# tpot.fit(train_f[train_columns], train_f['class'])
In [8]:
# tpot.score(test_f[train_columns], test_f['class'])
In [9]:
# tpot.export('contest_export.py')
In [10]:
!cat contest_export.py
In [11]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import ExtraTreesClassifier
# Two-step model: drop low-variance features, then classify with an
# extra-trees ensemble. Presumably this mirrors the pipeline TPOT exported to
# contest_export.py -- confirm against that file.
clf = make_pipeline(
    VarianceThreshold(threshold=0.37),
    ExtraTreesClassifier(
        n_estimators=500,
        criterion="entropy",
        max_features=0.71,
        random_state=49,
    ),
)
In [12]:
# Fit the pipeline on all selected feature columns against the 'class' target.
clf.fit(X=frame[train_columns], y=frame['class'])
Out[12]:
Run this cell to generate predictions for the validation data from the fitted model.
In [13]:
test_path = r'../validation_data_nofacies.csv'
# Read the validation (test) data to a dataframe
test = pd.read_csv(test_path)
# Same rename as used for the training data; it has no effect when the file
# has no 'Facies' column.
test.rename(columns={'Facies': 'class'}, inplace=True)
test_columns = test.columns

# Encode 'Formation' and 'Well Name' as integer codes in order of first
# appearance. Each mapping is built before the column is rewritten, so a
# numeric label can never collide with an already-assigned code.
formations = {
    value: code for code, value in enumerate(test['Formation'].unique())
}
test['Formation'] = test['Formation'].map(formations)
wells = {
    value: code for code, value in enumerate(test['Well Name'].unique())
}
test['Well Name'] = test['Well Name'].map(wells)

# --- 1. Z-score normalisation ------------------------------------------------
# NOTE(review): a fresh scaler is fitted on the test data here instead of
# reusing a scaler fitted on the training data -- confirm this is intended.
std_scaler = preprocessing.StandardScaler().fit(test[test_columns])
test_std = std_scaler.transform(test[test_columns])
# `test_std_frame` is an alias of `test`, so the raw values are overwritten.
test_std_frame = test
for position, column in enumerate(test_columns):
    # Replace the whole column rather than writing through `.loc[:, column]`:
    # writing floats into an integer/object column in place is deprecated in
    # recent pandas releases.
    test_std_frame[column] = test_std[:, position]
test = test_std_frame

# Measurement columns: everything from the fourth column onward.
master_columns = test.columns[3:]


def in_range(row, vmin, vmax, variable):
    """Return 1 if ``vmin <= row[variable] < vmax``, otherwise 0.

    Kept for any code that still calls it; the quartile loop below uses
    vectorised comparisons instead of a row-wise ``apply``.
    """
    return 1 if vmin <= row[variable] < vmax else 0


# --- 2. Quartile indicator features ------------------------------------------
# For each measurement column add four 0/1 columns '<name>_0' .. '<name>_3'
# flagging which quartile of that column the sample falls in.
ds = np.linspace(0, 1.0, 5)
for name, values in test[master_columns].items():
    quantiles = [values.quantile(n) for n in ds]
    bin_edges = list(zip(quantiles[:-1], quantiles[1:]))
    for j, (lower, upper) in enumerate(bin_edges):
        # Compare against the quantile *values* (not the fractions in `ds`).
        # The top bin is closed on the right so the column maximum is
        # assigned to a quartile as well.
        if j == len(bin_edges) - 1:
            member = (values >= lower) & (values <= upper)
        else:
            member = (values >= lower) & (values < upper)
        test['{0}_{1}'.format(name, j)] = member.astype(int)

# --- 3. Adjacent-sample features ---------------------------------------------
# Refresh the feature list so it includes the indicator columns added above.
master_columns = test.columns[3:]
above = []
below = []
for _, group in test.groupby('Well Name'):
    df = group.sort_values('Depth')
    # shift(-1) gives each row the values of the next row in depth order and
    # shift(1) those of the previous row. The end row with no neighbour is
    # filled from the adjacent row of the shifted frame.
    u = df.shift(-1).ffill()
    b = df.shift(1).bfill()
    above.append(u[master_columns])
    below.append(b[master_columns])
above_frame = pd.concat(above)
above_frame.columns = ['above_' + label for label in above_frame.columns]
below_frame = pd.concat(below)
below_frame.columns = ['below_' + label for label in below_frame.columns]
# Column-wise concat aligns the neighbour features back onto `test` by index.
frame = pd.concat((test, above_frame, below_frame), axis=1)

# Predict with the fitted pipeline on every column from the fourth onward.
test_columns = frame.columns[3:]
result = clf.predict(frame[test_columns])
In [14]:
# Display the array of class predictions returned by clf.predict.
result
Out[14]:
In [15]:
# Re-read the untouched validation file, attach the predictions as a 'Facies'
# column and write the result out as CSV.
output_path = 'Well Facies Prediction - Test Data Set__MATT3.csv'
output_frame = pd.read_csv(test_path)
output_frame['Facies'] = result
output_frame.to_csv(output_path)
In [ ]:
In [ ]: