03 - Facies Determination with Regression

As with the prior entries, this combines brute-force feature creation with an ExtraTrees regressor. The aim is to capture more of the inter-dependency between samples. I will freely admit that this stretches my ML knowledge; I've spent quite a lot of time trying to ascertain whether this is a sensible thing to be doing at all. Comments and thoughts are very welcome!
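
In outline, the idea is to treat the integer facies labels as a continuous target, fit a regressor, and round its predictions back to class labels. A minimal sketch of that regress-then-round step on synthetic data (the array shapes, label range and parameters here are purely illustrative, not the contest data or settings):

    import numpy as np
    from sklearn.ensemble import ExtraTreesRegressor

    rng = np.random.RandomState(0)
    X = rng.rand(200, 4)                      # toy feature matrix
    y = rng.randint(1, 10, size=200)          # integer facies-style labels, 1-9

    reg = ExtraTreesRegressor(n_estimators=50, random_state=0)
    reg.fit(X, y)
    # Continuous regression output -> integer class labels, clipped to the label range
    pred = np.clip(np.round(reg.predict(X)), 1, 9).astype(int)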


In [1]:
import pandas as pd
import bokeh.plotting as bk
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from tpot import TPOTClassifier, TPOTRegressor

import sys
sys.path.append(r'C:\Users\george.crowther\Documents\Python\Projects\2016-ml-contest-master')

import classification_utilities

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

bk.output_notebook()



In [2]:
# Input file paths
train_path = r'..\training_data.csv'

# Read training data to dataframe
train = pd.read_csv(train_path)

# TPOT library requires that the target class is renamed to 'class'
train.rename(columns={'Facies': 'class'}, inplace=True)

well_names = train['Well Name']

In [3]:
# Set string features to integers

for i, value in enumerate(train['Formation'].unique()):
    train.loc[train['Formation'] == value, 'Formation'] = i
    
for i, value in enumerate(train['Well Name'].unique()):
    train.loc[train['Well Name'] == value, 'Well Name'] = i

In [4]:
# The first step is to upsample and interpolate the training data. The objective is to
# provide significantly more samples to train the regressor on, and to capture more of
# the sample inter-dependency.
upsampled_arrays = []
train['orig_index'] = train.index

for well, group in train.groupby('Well Name'):
    # This is a definite, but helpful, mis-use of the pandas resample timeseries
    # functionality.
    group.index = pd.to_datetime(group['Depth'] * 10)
    # Upsample by a factor of 5 (the 0.5 ft sample spacing becomes 5 'ns', resampled at 1 ns) and interpolate
    us_group = group.resample('1ns').mean().interpolate(how='time')
    # Revert the index to depth values
    us_group.index = us_group.index.asi8 / 10
    us_group['Well Name'] = well
    
    upsampled_arrays.append(us_group)

In [5]:
upsampled_arrays[0].head()


Out[5]:
        class   Depth      GR  ILD_log10  DeltaPHI   PHIND   PE  NM_M  RELPOS  orig_index  Well Name
2793.0    3.0  2793.0  77.450     0.6640      9.90  11.915  4.6   1.0  1.0000         0.0          0
2793.1    3.0  2793.1  77.612     0.6634     10.76  12.045  4.5   1.0  0.9958         0.2          0
2793.2    3.0  2793.2  77.774     0.6628     11.62  12.175  4.4   1.0  0.9916         0.4          0
2793.3    3.0  2793.3  77.936     0.6622     12.48  12.305  4.3   1.0  0.9874         0.6          0
2793.4    3.0  2793.4  78.098     0.6616     13.34  12.435  4.2   1.0  0.9832         0.8          0
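
The depth-to-datetime trick works because the logs are sampled every 0.5 ft: multiplying depth by 10 turns that spacing into 5 'nanoseconds', so resampling at 1 ns inserts four interpolated rows between each original pair, i.e. the factor-of-5 upsample seen above. A small, self-contained illustration on made-up values (not from the contest data):

    demo = pd.DataFrame({'Depth': [2793.0, 2793.5, 2794.0], 'GR': [77.45, 78.26, 79.05]})
    demo.index = pd.to_datetime(demo['Depth'] * 10)      # 0.5 ft spacing -> 5 ns spacing
    demo_us = demo.resample('1ns').mean().interpolate(method='time')
    demo_us.index = demo_us.index.asi8 / 10              # back to depth units
    print(len(demo), '->', len(demo_us))                 # 3 original rows -> 11 upsampled rows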

In [6]:
resample_factors = [2, 5, 10, 50, 100, 200]

initial_columns = ['Formation', 'Well Name', 'Depth', 'GR', 'ILD_log10',
       'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']

upsampled_frame = pd.concat(upsampled_arrays, axis = 0)

In [7]:
# Use rolling windows through upsampled frame, grouping by well name.

# Empty list to hold frames
mean_frames = []

for well, group in upsampled_frame.groupby('Well Name'):
    # Empty list to hold rolling frames
    constructor_list = []
    for f in resample_factors:
        
        working_frame = group[['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M',
       'RELPOS', 'Well Name']]
        
        mean_frame = working_frame.rolling(window = f, center = True).mean().interpolate(method = 'index', limit_direction = 'both', limit = f)
        mean_frame.columns = ['Mean_{0}_{1}'.format(f, column) for column in mean_frame.columns]
        max_frame = working_frame.rolling(window = f, center = True).max().interpolate(method = 'index', limit_direction = 'both', limit = f)
        max_frame.columns = ['Max_{0}_{1}'.format(f, column) for column in max_frame.columns]
        min_frame = working_frame.rolling(window = f, center = True).min().interpolate(method = 'index', limit_direction = 'both', limit = f)
        min_frame.columns = ['Min_{0}_{1}'.format(f, column) for column in min_frame.columns]
        std_frame = working_frame.rolling(window = f, center = True).std().interpolate(method = 'index', limit_direction = 'both', limit = f)
        std_frame.columns = ['Std_{0}_{1}'.format(f, column) for column in std_frame.columns]
        var_frame = working_frame.rolling(window = f, center = True).var().interpolate(method = 'index', limit_direction = 'both', limit = f)
        var_frame.columns = ['Var_{0}_{1}'.format(f, column) for column in var_frame.columns]
        diff_frame = working_frame.diff(f, axis = 0).interpolate(method = 'index', limit_direction = 'both', limit = f)
        diff_frame.columns = ['Diff_{0}_{1}'.format(f, column) for column in diff_frame.columns]
        rdiff_frame = working_frame.sort_index(ascending = False).diff(f, axis = 0).interpolate(method = 'index', limit_direction = 'both', limit = f).sort_index()
        rdiff_frame.columns = ['Rdiff_{0}_{1}'.format(f, column) for column in rdiff_frame.columns]
        
        f_frame = pd.concat((mean_frame, max_frame, min_frame, std_frame, var_frame, diff_frame, rdiff_frame), axis = 1)
        
        constructor_list.append(f_frame)
        
    well_frame = pd.concat(constructor_list, axis = 1)
    well_frame['class'] = group['class']
    well_frame['Well Name'] = group['Well Name']
    # orig_index holds the original index locations, to make extracting the results trivial
    well_frame['orig_index'] = group['orig_index']
    mean_frames.append(well_frame)
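
The seven per-window statistics above could be generated a little more compactly with rolling().agg() in a recent pandas, at the cost of rebuilding the column names by hand. A sketch of that alternative, using a hypothetical helper called window_features (the edge-value interpolation with limit=f is omitted here for brevity; the notebook keeps the explicit version above):

    def window_features(frame, f):
        # Five window statistics in one pass; columns come back as a (column, stat) MultiIndex
        stats = frame.rolling(window=f, center=True).agg(['mean', 'max', 'min', 'std', 'var'])
        stats.columns = ['{0}_{1}_{2}'.format(stat.capitalize(), f, col) for col, stat in stats.columns]
        # Forward and reverse differences over the same span
        diff = frame.diff(f)
        diff.columns = ['Diff_{0}_{1}'.format(f, col) for col in diff.columns]
        rdiff = frame.sort_index(ascending=False).diff(f).sort_index()
        rdiff.columns = ['Rdiff_{0}_{1}'.format(f, col) for col in rdiff.columns]
        return pd.concat((stats, diff, rdiff), axis=1)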

In [8]:
upsampled_frame.index = upsampled_frame['orig_index']
upsampled_frame.drop(['orig_index', 'class', 'Well Name'], axis = 1, inplace = True)

for f in mean_frames:
    f.index = f['orig_index']

rolling_frame = pd.concat(mean_frames, axis = 0)
upsampled_frame = pd.concat((upsampled_frame, rolling_frame), axis = 1)

# Features is the column set used for training the model
features = upsampled_frame.columns[:-4]
print(features)


Index(['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS',
       'Mean_2_Depth', 'Mean_2_GR',
       ...
       'Diff_200_RELPOS', 'Diff_200_Well Name', 'Rdiff_200_Depth',
       'Rdiff_200_GR', 'Rdiff_200_ILD_log10', 'Rdiff_200_DeltaPHI',
       'Rdiff_200_PHIND', 'Rdiff_200_PE', 'Rdiff_200_NM_M',
       'Rdiff_200_RELPOS'],
      dtype='object', length=385)

In [9]:
# Define model

from sklearn.ensemble import ExtraTreesRegressor, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

exported_pipeline = make_pipeline(
    ExtraTreesRegressor(max_features=0.27, n_estimators=500)
)

In [10]:
# Fit model to data
exported_pipeline.fit(upsampled_frame[features], upsampled_frame['class'])


Out[10]:
Pipeline(steps=[('extratreesregressor', ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features=0.27, max_leaf_nodes=None, min_impurity_split=1e-07,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
          oob_score=False, random_state=None, verbose=0, warm_start=False))])
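
Before predicting on the blind wells it can be worth sanity-checking the regress-and-round approach on a held-out well. A rough sketch only, not part of the submitted pipeline (well code 0 is just an example from the integer encoding in cell [3]; note that the interpolated rows carry interpolated class labels, so this gives only an approximate score):

    mask = upsampled_frame['Well Name'] == 0
    check_model = make_pipeline(ExtraTreesRegressor(max_features=0.27, n_estimators=500))
    check_model.fit(upsampled_frame.loc[~mask, features], upsampled_frame.loc[~mask, 'class'])
    holdout_pred = np.round(check_model.predict(upsampled_frame.loc[mask, features]))
    accuracy = np.mean(holdout_pred == upsampled_frame.loc[mask, 'class'].round())
    print('Hold-out well accuracy: {0:.3f}'.format(accuracy))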

Now load and process the test data set, then predict using the 'exported_pipeline' model.


In [11]:
test_path = r'..\validation_data_nofacies.csv'

# Read test data to dataframe
test = pd.read_csv(test_path)

# TPOT library requires that the target class is renamed to 'class'
test.rename(columns={'Facies': 'class'}, inplace=True)

# Set string features to integers

for i, value in enumerate(test['Formation'].unique()):
    test.loc[test['Formation'] == value, 'Formation'] = i
    
for i, value in enumerate(test['Well Name'].unique()):
    test.loc[test['Well Name'] == value, 'Well Name'] = i

# Upsample and interpolate the test data in exactly the same way as the training data,
# so that the same rolling-window features can be built before prediction.
upsampled_arrays = []
test['orig_index'] = test.index

for well, group in test.groupby('Well Name'):
    # This is a definite, but helpful, mis-use of the pandas resample timeseries
    # functionality.
    group.index = pd.to_datetime(group['Depth'] * 10)
    # Upsample by a factor of 5 (the 0.5 ft sample spacing becomes 5 'ns', resampled at 1 ns) and interpolate
    us_group = group.resample('1ns').mean().interpolate(how='time')
    # Revert the index to depth values
    us_group.index = us_group.index.asi8 / 10
    us_group['Well Name'] = well
    
    upsampled_arrays.append(us_group)
    
upsampled_frame = pd.concat(upsampled_arrays, axis = 0)

# Use rolling windows through upsampled frame, grouping by well name.

# Empty list to hold frames
mean_frames = []

for well, group in upsampled_frame.groupby('Well Name'):
    # Empty list to hold rolling frames
    constructor_list = []
    for f in resample_factors:
        
        working_frame = group[['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M',
       'RELPOS', 'Well Name']]
        
        mean_frame = working_frame.rolling(window = f, center = True).mean().interpolate(method = 'index', limit_direction = 'both', limit = f)
        mean_frame.columns = ['Mean_{0}_{1}'.format(f, column) for column in mean_frame.columns]
        max_frame = working_frame.rolling(window = f, center = True).max().interpolate(method = 'index', limit_direction = 'both', limit = f)
        max_frame.columns = ['Max_{0}_{1}'.format(f, column) for column in max_frame.columns]
        min_frame = working_frame.rolling(window = f, center = True).min().interpolate(method = 'index', limit_direction = 'both', limit = f)
        min_frame.columns = ['Min_{0}_{1}'.format(f, column) for column in min_frame.columns]
        std_frame = working_frame.rolling(window = f, center = True).std().interpolate(method = 'index', limit_direction = 'both', limit = f)
        std_frame.columns = ['Std_{0}_{1}'.format(f, column) for column in std_frame.columns]
        var_frame = working_frame.rolling(window = f, center = True).var().interpolate(method = 'index', limit_direction = 'both', limit = f)
        var_frame.columns = ['Var_{0}_{1}'.format(f, column) for column in var_frame.columns]
        diff_frame = working_frame.diff(f, axis = 0).interpolate(method = 'index', limit_direction = 'both', limit = f)
        diff_frame.columns = ['Diff_{0}_{1}'.format(f, column) for column in diff_frame.columns]
        rdiff_frame = working_frame.sort_index(ascending = False).diff(f, axis = 0).interpolate(method = 'index', limit_direction = 'both', limit = f).sort_index()
        rdiff_frame.columns = ['Rdiff_{0}_{1}'.format(f, column) for column in rdiff_frame.columns]
        
        f_frame = pd.concat((mean_frame, max_frame, min_frame, std_frame, var_frame, diff_frame, rdiff_frame), axis = 1)
        
        constructor_list.append(f_frame)
        
    well_frame = pd.concat(constructor_list, axis = 1)
    well_frame['Well Name'] = group['Well Name']
    # orig_index holds the original index locations, to make extracting the results trivial
    well_frame['orig_index'] = group['orig_index']
    mean_frames.append(well_frame)
    
upsampled_frame.index = upsampled_frame['orig_index']
upsampled_frame.drop(['orig_index', 'Well Name'], axis = 1, inplace = True)

for f in mean_frames:
    f.index = f['orig_index']

rolling_frame = pd.concat(mean_frames, axis = 0)
upsampled_frame = pd.concat((upsampled_frame, rolling_frame), axis = 1)

tfeatures = upsampled_frame.columns[:-3]
print(tfeatures)


Index(['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS',
       'Mean_2_Depth', 'Mean_2_GR',
       ...
       'Diff_200_RELPOS', 'Diff_200_Well Name', 'Rdiff_200_Depth',
       'Rdiff_200_GR', 'Rdiff_200_ILD_log10', 'Rdiff_200_DeltaPHI',
       'Rdiff_200_PHIND', 'Rdiff_200_PE', 'Rdiff_200_NM_M',
       'Rdiff_200_RELPOS'],
      dtype='object', length=385)

In [12]:
# Predict on the full upsampled sample set
result = exported_pipeline.predict(upsampled_frame[tfeatures])
# Round the continuous regression output to the nearest integer facies label
upsampled_frame['Facies'] = [round(n) for n in result]
# Extract the rows corresponding to the original (non-upsampled) test samples
result_frame = upsampled_frame.loc[test.index, :]
# Output to csv
result_frame.to_csv('regressor_results.csv')
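
Two small refinements may be worth considering at this stage: the rounded regression output can in principle fall outside the valid label range (the nine facies are coded 1 to 9 in the contest data), and the CSV above carries all of the engineered feature columns. A sketch that clips the predictions and writes only the original test columns plus the predicted facies (the output filename is just an example):

    # Clip rounded predictions to the facies label range 1-9 (assumed from the contest data)
    test['Facies'] = np.clip(result_frame['Facies'].values, 1, 9).astype(int)
    test.to_csv('regressor_results_compact.csv', index=False)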

In [ ]: