In this second attempt, I've updated some of the feature engineering before re-training an extra trees classifier on the data.
In [1]:
# Initial imports for reading data and first observations
import pandas as pd
import bokeh.plotting as bk
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from tpot import TPOTClassifier
import sys
sys.path.append(r'C:\Users\george.crowther\Documents\Python\Projects\2016-ml-contest-master')
from classification_utilities import display_cm, display_adj_cm
bk.output_notebook()
In [2]:
# Input file paths
train_path = r'..\training_data.csv'
test_path = r'.\validation_data_nofacies.csv'
# Read training data to dataframe
train = pd.read_csv(train_path)
# TPOT library requires that the target class is renamed to 'class'
train.rename(columns={'Facies': 'class'}, inplace=True)
In [6]:
train.head()
Out[6]:
In [7]:
train.describe()
Out[7]:
Again, as with the previous result, the method here is somewhat brute force: for each sample it takes the difference from its formation mean and median, from the bottom sample of the formation above, and from the top sample of the formation below. There could definitely be more metrics, and undoubtedly better-informed ones, to pull in this manner; these are arguably somewhat naive.
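To make the idea concrete, here is a minimal sketch on a toy frame (made-up values, illustration only; the full implementation in the next cell also handles the above/below formation deltas):

import pandas as pd
toy = pd.DataFrame({'Formation': ['A1 SH', 'A1 SH', 'B1 SH'],
                    'GR': [60.0, 80.0, 100.0]})
# Difference of each sample from its formation mean
toy['formation_delta_GR'] = toy['GR'] - toy.groupby('Formation')['GR'].transform('mean')
# GR deltas become -10.0 and 10.0 (around the A1 SH mean of 70), and 0.0 for B1 SH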
In [24]:
def feature_extraction(train):
    #------------------------------------
    # Split each formation name into its two parts and one-hot encode each part
    for i, value in enumerate(train.Formation.unique()):
        name_a = value.split(' ')[0]
        name_b = value.split(' ')[1]
        if name_a not in train.columns:
            train[name_a] = 0
        if name_b not in train.columns:
            train[name_b] = 0
        train.loc[train.Formation == value, name_a] = 1
        train.loc[train.Formation == value, name_b] = 1
    #------------------------------------
    # Replace formation names with integer codes
    for i, value in enumerate(train['Formation'].unique()):
        train.loc[train['Formation'] == value, 'Formation'] = i
    #------------------------------------
    # Take the difference of each sample from the formation mean and median,
    # for each well and each measured parameter.
    # First add a zero-valued column for each new feature.
    columns = ['Formation', 'Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
    above_columns = ['above_delta_' + col for col in columns]
    below_columns = ['below_delta_' + col for col in columns]
    formation_columns = ['formation_delta_' + col for col in columns]
    formation_med_columns = ['formation_delta_med_' + col for col in columns]

    def add_empty_columns(df, column_list):
        for column in column_list:
            df[column] = 0

    for column_list in [above_columns, below_columns, formation_columns, formation_med_columns]:
        add_empty_columns(train, column_list)
    #-------------------------------------------
    # Group data by well, sort by depth, then group by formation.
    # Take the mean, median, top and bottom (by depth) values for each sub-group.
    # Add features which are the difference of each sample from the mean of its
    # formation and from the adjacent samples of the formations above and below.
    # TBD - un-log 'ILD_log10' prior to taking the mean, then re-log.
    for i, group in train.groupby('Well Name'):
        iteration = 0
        sorted_group = group.sort_values('Depth')
        for j, sub_group in sorted_group.groupby('Formation'):
            means = sub_group[columns].mean()
            medians = sub_group[columns].median()
            top = sub_group.iloc[0][columns]
            if iteration == 0:
                above_group = sub_group
            else:
                above_bottom = above_group.iloc[-1][columns]
                train.loc[sub_group.index, above_columns] = (train.loc[sub_group.index, columns] - above_bottom).values
                train.loc[above_group.index, below_columns] = (train.loc[above_group.index, columns] - top).values
            train.loc[sub_group.index, formation_columns] = (train.loc[sub_group.index, columns] - means).values
            train.loc[sub_group.index, formation_med_columns] = (train.loc[sub_group.index, columns] - medians).values
            above_group = sub_group
            iteration += 1
    return train
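The TBD above could be handled along these lines, averaging resistivity in linear space before converting back to log10 (a sketch only, not applied in this notebook):

import numpy as np
def mean_ild_log10(sub_group):
    # Un-log the log10 resistivity, take the mean, then re-log
    linear_mean = np.power(10.0, sub_group['ILD_log10']).mean()
    return np.log10(linear_mean)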
In [15]:
facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS',
                 'WS', 'D', 'PS', 'BS']
model_columns = train.columns[11:]
TPOT uses a genetic algorithm to evolve a model pipeline and tune its parameters for the most effective fit. This can take quite a while to run if you want to re-execute this part!
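If you only want a quick look, the search budget can be capped; a sketch with assumed settings (smaller population and a hard time limit, not the configuration used below):

quick_tpot = TPOTClassifier(generations=2, population_size=20,
                            max_time_mins=10, random_state=68, verbosity=2)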
In [18]:
# Input file paths
train_path = r'..\training_data.csv'
# Read training data to dataframe
train = pd.read_csv(train_path)
# TPOT library requires that the target class is renamed to 'class'
train.rename(columns={'Facies': 'class'}, inplace=True)
train = feature_extraction(train)
In [8]:
alt_model_columns = ['GR', 'ILD_log10',
'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS', 'A1', 'SH', 'LM', 'B1',
'B2', 'B3', 'B4', 'B5', 'C', 'above_delta_Formation',
'above_delta_Depth', 'above_delta_GR', 'above_delta_ILD_log10',
'above_delta_DeltaPHI', 'above_delta_PHIND', 'above_delta_PE',
'above_delta_NM_M', 'above_delta_RELPOS', 'below_delta_Formation',
'below_delta_Depth', 'below_delta_GR', 'below_delta_ILD_log10',
'below_delta_DeltaPHI', 'below_delta_PHIND', 'below_delta_PE',
'below_delta_NM_M', 'below_delta_RELPOS', 'formation_delta_Formation',
'formation_delta_Depth', 'formation_delta_GR',
'formation_delta_ILD_log10', 'formation_delta_DeltaPHI',
'formation_delta_PHIND', 'formation_delta_PE', 'formation_delta_NM_M',
'formation_delta_RELPOS', 'formation_delta_med_Formation',
'formation_delta_med_Depth', 'formation_delta_med_GR',
'formation_delta_med_ILD_log10', 'formation_delta_med_DeltaPHI',
'formation_delta_med_PHIND', 'formation_delta_med_PE',
'formation_delta_med_NM_M', 'formation_delta_med_RELPOS']
In [9]:
#-------------------------------
# Z-score normalisation of features.
# The boolean (one-hot) features should probably be excluded from
# normalisation, though it should make only a nominal difference.
std_scaler = preprocessing.StandardScaler().fit(train[alt_model_columns])
norm = std_scaler.transform(train[alt_model_columns])
norm_frame = train
for i, column in enumerate(alt_model_columns):
    norm_frame.loc[:, column] = norm[:, i]
train = norm_frame
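As the comment above notes, the one-hot formation flags could be left out of the scaling. A sketch of that alternative (not applied here; bool_columns are the one-hot names created in feature_extraction):

bool_columns = ['A1', 'SH', 'LM', 'B1', 'B2', 'B3', 'B4', 'B5', 'C']
continuous_columns = [c for c in alt_model_columns if c not in bool_columns]
scaler = preprocessing.StandardScaler().fit(train[continuous_columns])
train.loc[:, continuous_columns] = scaler.transform(train[continuous_columns])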
In [155]:
train[alt_model_columns].describe()
Out[155]:
In [10]:
#------------------------------------
# Train test split
alt_train_f, alt_test_f = train_test_split(train, test_size = 0.1,
                                           random_state = 68)
In [12]:
# Setup TPOT classifier and train
alt_tpot = TPOTClassifier(verbosity = 2, generations = 5, max_eval_time_mins = 60)
alt_tpot.fit(alt_train_f[alt_model_columns], alt_train_f['class'])
In [22]:
print(alt_tpot.score(alt_test_f[alt_model_columns], alt_test_f['class']))
alt_tpot.export('02 contest_export.py')
In [49]:
result = alt_tpot.predict(train[alt_model_columns])
conf = confusion_matrix(train['class'], result)
display_cm(conf, facies_labels, hide_zeros=True, display_metrics = True)
def accuracy(conf):
    total_correct = 0.
    nb_classes = conf.shape[0]
    for i in np.arange(0, nb_classes):
        total_correct += conf[i][i]
    acc = total_correct / sum(sum(conf))
    return acc
print(accuracy(conf))
adjacent_facies = np.array([[1], [0,2], [1], [4], [3,5], [4,6,7], [5,7], [5,6,8], [6,7]])
def accuracy_adjacent(conf, adjacent_facies):
    nb_classes = conf.shape[0]
    total_correct = 0.
    for i in np.arange(0, nb_classes):
        total_correct += conf[i][i]
        for j in adjacent_facies[i]:
            total_correct += conf[i][j]
    return total_correct / sum(sum(conf))
print(accuracy_adjacent(conf, adjacent_facies))
In [40]:
test_path = r'..\validation_data_nofacies.csv'
# Read validation data to dataframe
test = pd.read_csv(test_path)
# Rename 'Facies'
test.rename(columns={'Facies': 'class'}, inplace=True)
frame = feature_extraction(test)
In [41]:
frame.describe()
Out[41]:
In [42]:
alt_model_columns = ['GR', 'ILD_log10',
'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS', 'A1', 'SH', 'LM', 'B1',
'B2', 'B3', 'B4', 'B5', 'C', 'above_delta_Formation',
'above_delta_Depth', 'above_delta_GR', 'above_delta_ILD_log10',
'above_delta_DeltaPHI', 'above_delta_PHIND', 'above_delta_PE',
'above_delta_NM_M', 'above_delta_RELPOS', 'below_delta_Formation',
'below_delta_Depth', 'below_delta_GR', 'below_delta_ILD_log10',
'below_delta_DeltaPHI', 'below_delta_PHIND', 'below_delta_PE',
'below_delta_NM_M', 'below_delta_RELPOS', 'formation_delta_Formation',
'formation_delta_Depth', 'formation_delta_GR',
'formation_delta_ILD_log10', 'formation_delta_DeltaPHI',
'formation_delta_PHIND', 'formation_delta_PE', 'formation_delta_NM_M',
'formation_delta_RELPOS', 'formation_delta_med_Formation',
'formation_delta_med_Depth', 'formation_delta_med_GR',
'formation_delta_med_ILD_log10', 'formation_delta_med_DeltaPHI',
'formation_delta_med_PHIND', 'formation_delta_med_PE',
'formation_delta_med_NM_M', 'formation_delta_med_RELPOS']
std_scaler = preprocessing.StandardScaler().fit(frame[alt_model_columns])
norm = std_scaler.transform(frame[alt_model_columns])
norm_frame = frame
for i, column in enumerate(alt_model_columns):
    norm_frame.loc[:, column] = norm[:, i]
frame = norm_frame
frame.describe()
Out[42]:
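Note that the cell above re-fits the scaler on the validation wells, so the train and test features end up on slightly different scales. An alternative, sketched here with a hypothetical train_scaler (a StandardScaler fitted on the unscaled training features before they were overwritten), would reuse the training statistics for both sets:

# Hypothetical: train_scaler was fitted on the unscaled training features
frame.loc[:, alt_model_columns] = train_scaler.transform(frame[alt_model_columns])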
In [43]:
#--------------------------------------
# TPOT Exported Model
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
exported_pipeline = make_pipeline(
    ExtraTreesClassifier(criterion="entropy", max_features=0.48, n_estimators=500)
)
exported_pipeline.fit(train[alt_model_columns], train['class'])
Out[43]:
In [44]:
frame['Facies'] = exported_pipeline.predict(frame[alt_model_columns])
In [52]:
frame['Facies']
Out[52]:
In [46]:
frame.to_csv('02 - Well Facies Prediction - Test Data Set.csv')