Machine Learning Contest

By: Kris Darnell & David Tang

Test run with a larger sized neural network. Contest is described here.



In [3]:

    
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable

from pandas import set_option
set_option("display.max_rows", 10)
pd.options.mode.chained_assignment = None

# Loading Data
filename = 'facies_vectors.csv'                         # Read in training data
training_data = pd.read_csv(filename)
training_data.fillna(training_data.mean(),inplace=True) # Remove NaN with mean value
training_data
filename = 'validation_data_nofacies.csv'               # Read in test well
validationFull = pd.read_csv(filename)

# Converts to category
training_data['Well Name'] = training_data['Well Name'].astype('category')
training_data['Formation'] = training_data['Formation'].astype('category')
training_data['Well Name'].unique()
training_data.describe()









    Out[3]:






  
    
      
      Facies
      Depth
      GR
      ILD_log10
      DeltaPHI
      PHIND
      PE
      NM_M
      RELPOS
    
  
  
    
      count
      4149.000000
      4149.000000
      4149.000000
      4149.000000
      4149.000000
      4149.000000
      4149.000000
      4149.000000
      4149.000000
    
    
      mean
      4.503254
      2906.867438
      64.933985
      0.659566
      4.402484
      13.201066
      3.725014
      1.518438
      0.521852
    
    
      std
      2.474324
      133.300164
      30.302530
      0.252703
      5.274947
      7.132846
      0.790917
      0.499720
      0.286644
    
    
      min
      1.000000
      2573.500000
      10.149000
      -0.025949
      -21.832000
      0.550000
      0.200000
      1.000000
      0.000000
    
    
      25%
      2.000000
      2821.500000
      44.730000
      0.498000
      1.600000
      8.500000
      3.200000
      1.000000
      0.277000
    
    
      50%
      4.000000
      2932.500000
      64.990000
      0.639000
      4.300000
      12.020000
      3.725014
      2.000000
      0.528000
    
    
      75%
      6.000000
      3007.000000
      79.438000
      0.822000
      7.500000
      16.050000
      4.000000
      2.000000
      0.769000
    
    
      max
      9.000000
      3138.000000
      361.150000
      1.800000
      19.312000
      84.400000
      8.094000
      2.000000
      1.000000



In [4]:

    
facies_colors = ['#F4D03F', '#F5B041','#DC7633','#6E2C00',
       '#1B4F72','#2E86C1', '#AED6F1', '#A569BD', '#196F3D']

facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS',
                 'WS', 'D','PS', 'BS']
#facies_color_map is a dictionary that maps facies labels
#to their respective colors
facies_color_map = {}   # Dictionary # enumerate puts out ind=0, label=SS, and loops through the whole thing
for ind, label in enumerate(facies_labels):
    facies_color_map[label] = facies_colors[ind]
   
def label_facies(row, labels):
    return labels[ row['Facies'] -1]
    
training_data.loc[:,'FaciesLabels'] = training_data.apply(lambda row: label_facies(row, facies_labels), axis=1)

correct_facies_labels = training_data['Facies'].values

feature_vectors = training_data.drop(['Well Name','Facies','FaciesLabels'], axis=1)

feature_vectors.describe()

feature_vectors.insert(1,'FormationNum',0)
validationFull.insert(1,'FormationNum',0)

for ii, formation in enumerate(feature_vectors['Formation'].unique()):
    feature_vectors.FormationNum[feature_vectors.Formation == formation] = ii
    validationFull.FormationNum[validationFull.Formation == formation] = ii
    
feature_vectors = feature_vectors.drop(['Formation'], axis = 1)
validation = validationFull.drop(['Formation', 'Well Name'], axis = 1)

Normalizing and splitting data



In [5]:

    
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

scaler = preprocessing.StandardScaler().fit(feature_vectors)
scaled_features = scaler.transform(feature_vectors)

X_train, X_test, y_train, y_test = train_test_split(
        scaled_features, correct_facies_labels, test_size=0.2, random_state=42)

Training Data



In [6]:

    
from sklearn.neural_network import MLPClassifier

sizes = (200,100,100)
clfNN = MLPClassifier(solver='lbfgs', alpha=.015,
                    hidden_layer_sizes=sizes, random_state=15)
clfOne = OneVsRestClassifier(MLPClassifier(solver='lbfgs', alpha=.015,
                    hidden_layer_sizes=sizes, random_state=15), n_jobs = -1)

clfNN.fit(X_train,y_train)
clfOne.fit(X_train,y_train)

predicted_NN     = clfNN.predict(X_test)
predicted_One    = clfOne.predict(X_test)

from sklearn.metrics import confusion_matrix
from classification_utilities import display_cm, display_adj_cm

conf_NN  = confusion_matrix(y_test,predicted_NN)
conf_One = confusion_matrix(y_test,predicted_One) 
display_cm(conf_NN,facies_labels,hide_zeros=True)


def accuracy(conf):
    total_correct = 0.
    nb_classes = conf.shape[0]
    for i in np.arange(0,nb_classes):
        total_correct += conf[i][i]
    acc = total_correct/sum(sum(conf))
    return acc


adjacent_facies = np.array([[1], [0,2], [1], [4], [3,5], [4,6,7], [5,7], [5,6,8], [6,7]])

def accuracy_adjacent(conf, adjacent_facies):
    nb_classes = conf.shape[0]
    total_correct = 0.
    for i in np.arange(0,nb_classes):
        total_correct += conf[i][i]
        for j in adjacent_facies[i]:
            total_correct += conf[i][j]
    return total_correct / sum(sum(conf))

print('Facies classification accuracy (NN) = %f' % accuracy(conf_NN))
print('Facies classification accuracy (OneVsRest) = %f' % accuracy(conf_One))
print('Adjacent facies classification accuracy (NN) = %f' % accuracy_adjacent(conf_NN, adjacent_facies))
print('Adjacent facies classification accuracy (OneVsRest) = %f' % accuracy_adjacent(conf_One, adjacent_facies))









    



     Pred    SS  CSiS  FSiS  SiSh    MS    WS     D    PS    BS Total
     True
       SS    44     9     1                                        54
     CSiS     1   178    23     1           1           1         205
     FSiS     3    26   119           1                 3         152
     SiSh                      40     1     8           3          52
       MS                 1     4    36     9     1     3          54
       WS     1           1     7    13    75     1    13     1   112
        D                 1                 1    22     3     2    29
       PS           1     3     5     4    12     2   102     2   131
       BS                                                    41    41
Facies classification accuracy (NN) = 0.791566
Facies classification accuracy (OneVsRest) = 0.801205
Adjacent facies classification accuracy (NN) = 0.936145
Adjacent facies classification accuracy (OneVsRest) = 0.948193

Retrain using the full dataset.



In [7]:

    
clf_final = OneVsRestClassifier(MLPClassifier(solver='lbfgs', alpha=.015,
                    hidden_layer_sizes=sizes, random_state=1),n_jobs = -1)

clf_final.fit(scaled_features,correct_facies_labels)









    Out[7]:





OneVsRestClassifier(estimator=MLPClassifier(activation='relu', alpha=0.015, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(200, 100, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False),
          n_jobs=-1)

Apply to test well



In [ ]:

    
# Normalize data
scaled_validation = scaler.transform(validation)
validation_output = clf_final.predict(scaled_validation)



In [ ]:

    
def make_facies_log_plot(logs, facies_colors):
   #make sure logs are sorted by depth
   logs = logs.sort_values(by='Depth')
   cmap_facies = colors.ListedColormap(
           facies_colors[0:len(facies_colors)], 'indexed')
   
   ztop=logs.Depth.min(); zbot=logs.Depth.max()
   
   cluster=np.repeat(np.expand_dims(logs['Facies'].values,1), 100, 1) # Makes it a nx1, repeating values along an dimension
   
   f, ax = plt.subplots(nrows=1, ncols=6, figsize=(8, 12))
   ax[0].plot(logs.GR, logs.Depth, '-g')
   ax[1].plot(logs.ILD_log10, logs.Depth, '-')
   ax[2].plot(logs.DeltaPHI, logs.Depth, '-', color='0.5')
   ax[3].plot(logs.PHIND, logs.Depth, '-', color='r')
   ax[4].plot(logs.PE, logs.Depth, '-', color='black')
   im=ax[5].imshow(cluster, interpolation='none', aspect='auto',
                   cmap=cmap_facies,vmin=1,vmax=9)
   
   divider = make_axes_locatable(ax[5])
   cax = divider.append_axes("right", size="20%", pad=0.05)
   cbar=plt.colorbar(im, cax=cax)
   cbar.set_label((17*' ').join([' SS ', 'CSiS', 'FSiS', 
                               'SiSh', ' MS ', ' WS ', ' D  ', 
                               ' PS ', ' BS ']))
   cbar.set_ticks(range(0,1)); cbar.set_ticklabels('')
   
   for i in range(len(ax)-1):
       ax[i].set_ylim(ztop,zbot)
       ax[i].invert_yaxis()
       ax[i].grid()
       ax[i].locator_params(axis='x', nbins=3)
   
   ax[0].set_xlabel("GR")
   ax[0].set_xlim(logs.GR.min(),logs.GR.max())
   ax[1].set_xlabel("ILD_log10")
   ax[1].set_xlim(logs.ILD_log10.min(),logs.ILD_log10.max())
   ax[2].set_xlabel("DeltaPHI")
   ax[2].set_xlim(logs.DeltaPHI.min(),logs.DeltaPHI.max())
   ax[3].set_xlabel("PHIND")
   ax[3].set_xlim(logs.PHIND.min(),logs.PHIND.max())
   ax[4].set_xlabel("PE")
   ax[4].set_xlim(logs.PE.min(),logs.PE.max())
   ax[5].set_xlabel('Facies')
   
   ax[1].set_yticklabels([]); ax[2].set_yticklabels([]); ax[3].set_yticklabels([])
   ax[4].set_yticklabels([]); ax[5].set_yticklabels([])
   ax[5].set_xticklabels([])
   f.suptitle('Well: %s'%logs.iloc[0]['Well Name'], fontsize=14,y=0.94)



In [ ]:

    
%matplotlib inline
validationFull['Facies']=validation_output
make_facies_log_plot(
    validationFull[validationFull['Well Name']=='STUART'],
    facies_colors=facies_colors)
make_facies_log_plot(
    validationFull[validationFull['Well Name']=='CRAWFORD'],
    facies_colors=facies_colors)



In [ ]:

    
validationFull.to_csv('TangDarnell.csv')

	Facies	Depth	GR	ILD_log10	DeltaPHI	PHIND	PE	NM_M	RELPOS
count	4149.000000	4149.000000	4149.000000	4149.000000	4149.000000	4149.000000	4149.000000	4149.000000	4149.000000
mean	4.503254	2906.867438	64.933985	0.659566	4.402484	13.201066	3.725014	1.518438	0.521852
std	2.474324	133.300164	30.302530	0.252703	5.274947	7.132846	0.790917	0.499720	0.286644
min	1.000000	2573.500000	10.149000	-0.025949	-21.832000	0.550000	0.200000	1.000000	0.000000
25%	2.000000	2821.500000	44.730000	0.498000	1.600000	8.500000	3.200000	1.000000	0.277000
50%	4.000000	2932.500000	64.990000	0.639000	4.300000	12.020000	3.725014	2.000000	0.528000
75%	6.000000	3007.000000	79.438000	0.822000	7.500000	16.050000	4.000000	2.000000	0.769000
max	9.000000	3138.000000	361.150000	1.800000	19.312000	84.400000	8.094000	2.000000	1.000000