Machine Learning Contest

By: Kris Darnell & David Tang

01/31/17 submission with feature engineering and stochastic gradient descent for an MLP classifier. The contest is described here.


In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import f1_score

from sklearn.neural_network import MLPClassifier
from scipy.signal import medfilt

from pandas import set_option
set_option("display.max_rows", 10)
pd.options.mode.chained_assignment = None

# Load data
filename = 'facies_vectors.csv'  # Training data
training_data = pd.read_csv(filename)
training_data.fillna(training_data.mean(), inplace=True)  # Fill NaNs with the column means
filename = 'validation_data_nofacies.csv'  # Test wells (no facies labels)
validationFull = pd.read_csv(filename)

# Convert well names and formations to categorical columns
training_data['Well Name'] = training_data['Well Name'].astype('category')
training_data['Formation'] = training_data['Formation'].astype('category')
training_data.describe()


Out[1]:
Facies Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS
count 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000
mean 4.503254 2906.867438 64.933985 0.659566 4.402484 13.201066 3.725014 1.518438 0.521852
std 2.474324 133.300164 30.302530 0.252703 5.274947 7.132846 0.790917 0.499720 0.286644
min 1.000000 2573.500000 10.149000 -0.025949 -21.832000 0.550000 0.200000 1.000000 0.000000
25% 2.000000 2821.500000 44.730000 0.498000 1.600000 8.500000 3.200000 1.000000 0.277000
50% 4.000000 2932.500000 64.990000 0.639000 4.300000 12.020000 3.725014 2.000000 0.528000
75% 6.000000 3007.000000 79.438000 0.822000 7.500000 16.050000 4.000000 2.000000 0.769000
max 9.000000 3138.000000 361.150000 1.800000 19.312000 84.400000 8.094000 2.000000 1.000000
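
Note that PE shows a full count of 4149 only because its missing values were replaced with the column mean at load time; that is also why its median (3.725014) coincides exactly with its mean.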

Feature engineering


In [2]:
facies_colors = ['#F4D03F', '#F5B041','#DC7633','#6E2C00',
       '#1B4F72','#2E86C1', '#AED6F1', '#A569BD', '#196F3D']

facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS',
                 'WS', 'D','PS', 'BS']
# facies_color_map is a dictionary that maps each facies label
# to its respective color
facies_color_map = {}
for ind, label in enumerate(facies_labels):
    facies_color_map[label] = facies_colors[ind]
   
def label_facies(row, labels):
    return labels[row['Facies'] - 1]
    
training_data.loc[:,'FaciesLabels'] = training_data.apply(
        lambda row: label_facies(row, facies_labels), axis=1)

correct_facies_labels = training_data['Facies'].values
                                     
# Placeholder columns for the engineered formation features
training_data.insert(1, 'FormationNum', 0)
validationFull.insert(1, 'FormationNum', 0)
training_data.insert(1, 'FormationSH', 0)
validationFull.insert(1, 'FormationSH', 0)

# Label Formations to differentiate between 'SH' and 'LM'
def is_SH(x):
    if 'SH' not in x:
        return 0
    else:
        return 1

# Convert formations to numerical codes; formations absent from the
# training set would keep the placeholder value 0
for ii, formation in enumerate(training_data['Formation'].unique()):
    training_data.loc[training_data.Formation == formation, 'FormationNum'] = ii
    validationFull.loc[validationFull.Formation == formation, 'FormationNum'] = ii

# The SH flag is loop-invariant, so apply it once outside the loop
training_data['FormationSH'] = training_data.Formation.apply(is_SH)
validationFull['FormationSH'] = validationFull.Formation.apply(is_SH)


wndw_sz = 30  # rolling-window length in samples
dfs = []
for well in training_data['Well Name'].unique():
    df = training_data[training_data['Well Name']==well].copy(deep=True)
    df.sort_values('Depth', inplace=True)
    for col in ['PE', 'GR', 'PHIND', 'ILD_log10', 'DeltaPHI']:
        # Rolling averages
        rollavg_col = 'rollavg_' + col
        df[rollavg_col] = df[col].rolling(window=wndw_sz, center=False).mean()
        df[rollavg_col].fillna(method='ffill', inplace=True)
        df[rollavg_col].fillna(method='bfill', inplace=True)
        
        # Compute features gradient
        grad_col = 'grad_' + col
        df[grad_col] = df[col].diff()
        df[grad_col].fillna(method='ffill', inplace=True)
        df[grad_col].fillna(method='bfill', inplace=True)
    dfs.append(df)
training_data = pd.concat(dfs)
# Safety net: PE NaNs were already mean-filled at load time, so these
# replacements are no-ops unless that step is skipped
PE_mean = training_data.PE.mean()
roll_PE_mean = training_data.rollavg_PE.mean()
training_data['PE'] = training_data.PE.replace({np.nan: PE_mean})
training_data['rollavg_PE'] = training_data['rollavg_PE'].replace(
        {np.nan: roll_PE_mean})
feature_vectors = training_data.drop(['Well Name', 'Facies',
                                      'FaciesLabels', 'Formation'], axis=1)
feature_vectors.describe()

dfs = []
for well in validationFull['Well Name'].unique():
    df = validationFull[validationFull['Well Name'] == well].copy(deep=True)
    df.sort_values('Depth', inplace=True)
    for col in ['PE', 'GR', 'PHIND', 'ILD_log10', 'DeltaPHI']:
        rollavg_col = 'rollavg_' + col
        df[rollavg_col] = df[col].rolling(window=wndw_sz, center=False).mean()
        df[rollavg_col].fillna(method='ffill', inplace=True)
        df[rollavg_col].fillna(method='bfill', inplace=True)
        
        # Compute features gradient
        grad_col = 'grad_' + col
        df[grad_col] = df[col].diff()
        df[grad_col].fillna(method='ffill', inplace=True)
        df[grad_col].fillna(method='bfill', inplace=True)
    dfs.append(df)
validationFull = pd.concat(dfs)
validationFull.describe()
validation = validationFull.drop(['Formation', 'Well Name'], axis=1)

X = feature_vectors.values
y = correct_facies_labels
well = training_data['Well Name'].values
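
As a quick sanity check (purely illustrative, not part of the submission, and assuming the cell above has run), the engineered rolling-average and gradient columns can be listed directly from the feature matrix:


In [ ]:
# Illustrative check that the engineered columns made it into the features
engineered = [c for c in feature_vectors.columns
              if c.startswith('rollavg_') or c.startswith('grad_')]
print(len(engineered), 'engineered features:', engineered)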

Make a parameter grid for cross-validation


In [3]:
#alpha_grid = [1e-3, 1e-2, 2e-2, 3e-2, 1e-1]  
#solver_grid = ['sgd', 'lbfgs']  
#sizes_grid = [(100, 100), (200, 200), (200, 200, 100)]

alpha_grid = [0.1]  
solver_grid = ['sgd']  
sizes_grid = [(100, 100)]
param_grid = []
for alpha in alpha_grid:
    for solver in solver_grid:
        for sizes in sizes_grid:
            param_grid.append({'alpha': alpha,
                               'solver': solver,
                               'sizes': sizes})
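
The nested loops above simply enumerate the Cartesian product of the three lists; an equivalent sketch using itertools.product:


In [ ]:
from itertools import product

# Equivalent grid construction via the Cartesian product
param_grid = [{'alpha': a, 'solver': s, 'sizes': h}
              for a, s, h in product(alpha_grid, solver_grid, sizes_grid)]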

In [ ]:
# Initialize model selection methods
from sklearn.model_selection import LeaveOneGroupOut, LeavePGroupsOut

lpgo = LeavePGroupsOut(2)

# Generate splits, keeping only those in which every facies class
# appears in both the training and the validation wells
split_list = []
for train, val in lpgo.split(X, y, groups=training_data['Well Name']):
    hist_tr = np.histogram(y[train], bins=np.arange(len(facies_labels)+1)+.5)
    hist_val = np.histogram(y[val], bins=np.arange(len(facies_labels)+1)+.5)
    if np.all(hist_tr[0] != 0) & np.all(hist_val[0] != 0):
        split_list.append({'train': train, 'val': val})
            
# Print splits
for s, split in enumerate(split_list):
    print('Split %d' % s)
    print('    training:   %s' % (training_data['Well Name'][split['train']].unique()))
    print('    validation: %s' % (training_data['Well Name'][split['val']].unique()))
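
For intuition, here is a minimal sketch of how LeavePGroupsOut(2) behaves on made-up data (the wells 'A', 'B', 'C' are hypothetical): every pair of groups is held out in turn.


In [ ]:
import numpy as np
from sklearn.model_selection import LeavePGroupsOut

# Toy data: 6 samples from 3 hypothetical wells
X_toy = np.arange(12).reshape(6, 2)
y_toy = np.array([1, 1, 2, 2, 3, 3])
groups_toy = np.array(['A', 'A', 'B', 'B', 'C', 'C'])
for tr, va in LeavePGroupsOut(2).split(X_toy, y_toy, groups=groups_toy):
    print('train wells:', sorted(set(groups_toy[tr])),
          '| validation wells:', sorted(set(groups_toy[va])))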

Training Data


In [5]:
def train_and_test(X_tr, y_tr, X_v, well_v, param):
    
    # Feature normalization
    scaler = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(X_tr)
#    scaler = preprocessing.StandardScaler().fit(X_tr)
    X_tr = scaler.transform(X_tr)
    X_v = scaler.transform(X_v)
    
    # Train classifier ('seed' is a module-level variable set in a later cell)
    clf = MLPClassifier(solver=param['solver'],
                        alpha=param['alpha'],
                        hidden_layer_sizes=param['sizes'],
                        random_state=seed)

    clf.fit(X_tr, y_tr)
    
    # Make prediction
    y_v_hat = clf.predict(X_v)
#    plt.plot(y_v_hat)
#    plt.title('Before')
#    plt.show()

    
    # Perform median filter to smooth results
    k_size = 5
    if len(np.unique(well_v)) > 1:
        for w in np.unique(well_v):
            y_v_hat[well_v == w] = medfilt(y_v_hat[well_v == w], kernel_size=k_size)
    else:
        y_v_hat = medfilt(y_v_hat, kernel_size=k_size)
#    plt.plot(y_v_hat)
#    plt.title('After')
#    plt.show()

    
    return y_v_hat
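
The median filter at the end of train_and_test removes isolated, single-sample facies flips, which are geologically implausible; a small demonstration on a made-up label sequence (not real predictions):


In [ ]:
import numpy as np
from scipy.signal import medfilt  # already imported above; repeated for clarity

# Made-up facies sequence with a one-sample spike; kernel_size=5 as in train_and_test
demo = np.array([2, 2, 2, 2, 7, 2, 2, 2, 2])
print(medfilt(demo, kernel_size=5))  # the isolated 7 is smoothed away to 2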

In [6]:
# Do cross-validation as an average across the leave-P-groups-out splitting
seed = 200
np.random.seed(seed)

# For each set of parameters
score_param = []
for param in param_grid:
    
    # For each data split
    score_split = []
    for split in split_list:
    
        
        # Select training and validation data from current split
        X_tr = X[split['train'], :]
        X_v = X[split['val'], :]
        y_tr = y[split['train']]
        y_v = y[split['val']]
        
        # Select well labels for validation data
        well_v = well[split['val']]

        # Train and test
        y_v_hat = train_and_test(X_tr, y_tr, X_v, well_v, param)
        
        # Score
        score = f1_score(y_v, y_v_hat, average='micro')
        score_split.append(score)
        
    # Average score for this param
    score_param.append(np.mean(score_split))
    print('F1 score = %.3f %s' % (score_param[-1], param))
          
# Best set of parameters
best_idx = np.argmax(score_param)
param_best = param_grid[best_idx]
score_best = score_param[best_idx]
print('\nBest F1 score = %.3f %s' % (score_best, param_best))


/Users/kdarnell/anaconda/lib/python3.5/site-packages/sklearn/neural_network/multilayer_perceptron.py:563: ConvergenceWarning: Stochastic Optimizer: Maximum iterations reached and the optimization hasn't converged yet.
  % (), ConvergenceWarning)
F1 score = 0.489 {'alpha': 0.1, 'sizes': (100, 100), 'solver': 'sgd'}

Best F1 score = 0.489 {'alpha': 0.1, 'sizes': (100, 100), 'solver': 'sgd'}
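
Two notes on this output: with exactly one label per sample, micro-averaged F1 reduces to plain accuracy, so 0.489 is simply the fraction of correctly classified validation samples. The ConvergenceWarning means SGD hit MLPClassifier's default iteration cap (max_iter=200); a hedged variant, not the submitted configuration, would simply raise the cap:


In [ ]:
# Sketch only: allow more SGD iterations, trading run time for convergence
clf = MLPClassifier(solver='sgd', alpha=0.1, hidden_layer_sizes=(100, 100),
                    max_iter=1000, random_state=seed)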

Retrain using the full dataset.


In [7]:
X_v = validation.values
well_v = validationFull['Well Name'].values
param = param_grid[best_idx]
validation_output = train_and_test(X, y, X_v, well_v, param)
validationFull['Facies'] = validation_output


/Users/kdarnell/anaconda/lib/python3.5/site-packages/sklearn/neural_network/multilayer_perceptron.py:563: ConvergenceWarning: Stochastic Optimizer: Maximum iterations reached and the optimization hasn't converged yet.
  % (), ConvergenceWarning)

Write out predictions for the test wells


In [13]:
validationFull.to_csv('TangDarnell_sub7.csv')
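
Note that to_csv writes the DataFrame index as an extra first column by default; pass index=False if the submission format expects only the data columns.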

In [14]:
logo = LeaveOneGroupOut()        
all_scores = []
for train, test in logo.split(X, y, groups=well):
    well_name = well[test[0]]
    
    # Select training and validation data from current split
    X_tr = X[train,:]
    X_v = X[test,:]
    y_tr = y[train]
    y_v = y[test]
               
    # Train and test
    y_out = train_and_test(X_tr, y_tr, X_v, well_name, param)

    score = f1_score(y_out, y[test], average='micro')
    all_scores.append(score)
    print("{:>20s}  {:.3f}".format(well_name, score))
all_scores = np.asarray(all_scores)
print("Average F1 score is: {:.3f}".format(all_scores.mean()))


/Users/kdarnell/anaconda/lib/python3.5/site-packages/sklearn/neural_network/multilayer_perceptron.py:563: ConvergenceWarning: Stochastic Optimizer: Maximum iterations reached and the optimization hasn't converged yet.
  % (), ConvergenceWarning)
         ALEXANDER D  0.536
     CHURCHMAN BIBLE  0.547
      CROSS H CATTLE  0.353
            KIMZEY A  0.458
            LUKE G U  0.525
               NEWBY  0.471
               NOLAN  0.480
          Recruit F9  0.850
             SHANKLE  0.526
           SHRIMPLIN  0.480
Average F1 score is: 0.523

In [15]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable
def make_facies_log_plot(logs, facies_colors):
    # Make sure logs are sorted by depth
    logs = logs.sort_values(by='Depth')
    cmap_facies = colors.ListedColormap(
            facies_colors[0:len(facies_colors)], 'indexed')

    ztop = logs.Depth.min(); zbot = logs.Depth.max()

    # Repeat the facies column 100 times so it renders as a colored strip
    cluster = np.repeat(np.expand_dims(logs['Facies'].values, 1), 100, 1)

    f, ax = plt.subplots(nrows=1, ncols=6, figsize=(8, 12))
    ax[0].plot(logs.GR, logs.Depth, '-g')
    ax[1].plot(logs.ILD_log10, logs.Depth, '-')
    ax[2].plot(logs.DeltaPHI, logs.Depth, '-', color='0.5')
    ax[3].plot(logs.PHIND, logs.Depth, '-', color='r')
    ax[4].plot(logs.PE, logs.Depth, '-', color='black')
    im = ax[5].imshow(cluster, interpolation='none', aspect='auto',
                      cmap=cmap_facies, vmin=1, vmax=9)

    divider = make_axes_locatable(ax[5])
    cax = divider.append_axes("right", size="20%", pad=0.05)
    cbar = plt.colorbar(im, cax=cax)
    cbar.set_label((17*' ').join([' SS ', 'CSiS', 'FSiS',
                                  'SiSh', ' MS ', ' WS ', ' D  ',
                                  ' PS ', ' BS ']))
    cbar.set_ticks(range(0, 1)); cbar.set_ticklabels('')

    for i in range(len(ax)-1):
        ax[i].set_ylim(ztop, zbot)
        ax[i].invert_yaxis()
        ax[i].grid()
        ax[i].locator_params(axis='x', nbins=3)

    ax[0].set_xlabel("GR")
    ax[0].set_xlim(logs.GR.min(), logs.GR.max())
    ax[1].set_xlabel("ILD_log10")
    ax[1].set_xlim(logs.ILD_log10.min(), logs.ILD_log10.max())
    ax[2].set_xlabel("DeltaPHI")
    ax[2].set_xlim(logs.DeltaPHI.min(), logs.DeltaPHI.max())
    ax[3].set_xlabel("PHIND")
    ax[3].set_xlim(logs.PHIND.min(), logs.PHIND.max())
    ax[4].set_xlabel("PE")
    ax[4].set_xlim(logs.PE.min(), logs.PE.max())
    ax[5].set_xlabel('Facies')

    ax[1].set_yticklabels([]); ax[2].set_yticklabels([]); ax[3].set_yticklabels([])
    ax[4].set_yticklabels([]); ax[5].set_yticklabels([])
    ax[5].set_xticklabels([])
    f.suptitle('Well: %s' % logs.iloc[0]['Well Name'], fontsize=14, y=0.94)

In [16]:
%matplotlib inline
make_facies_log_plot(
    validationFull[validationFull['Well Name']=='STUART'],
    facies_colors=facies_colors)
make_facies_log_plot(
    validationFull[validationFull['Well Name']=='CRAWFORD'],
    facies_colors=facies_colors)