In [1]:
%matplotlib inline
# to install the watermark magic command: pip install watermark
%load_ext watermark 
%watermark -v -p numpy,scipy,pandas,matplotlib,seaborn,sklearn


CPython 3.5.2
IPython 5.1.0

numpy 1.11.1
scipy 0.18.0
pandas 0.18.1
matplotlib 1.5.1
seaborn 0.7.1
sklearn 0.18

Exploring the dataset

First, let's examine the data set that will be used to train the classifier.


In [2]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable

from pandas import set_option
set_option("display.max_rows", 10)
pd.options.mode.chained_assignment = None

filename = '../facies_vectors.csv'
training_data = pd.read_csv(filename)
training_data.describe()


/Users/lorenzoperozzi/anaconda3/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[2]:
Facies Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS
count 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 3232.000000 4149.000000 4149.000000
mean 4.503254 2906.867438 64.933985 0.659566 4.402484 13.201066 3.725014 1.518438 0.521852
std 2.474324 133.300164 30.302530 0.252703 5.274947 7.132846 0.896152 0.499720 0.286644
min 1.000000 2573.500000 10.149000 -0.025949 -21.832000 0.550000 0.200000 1.000000 0.000000
25% 2.000000 2821.500000 44.730000 0.498000 1.600000 8.500000 NaN 1.000000 0.277000
50% 4.000000 2932.500000 64.990000 0.639000 4.300000 12.020000 NaN 2.000000 0.528000
75% 6.000000 3007.000000 79.438000 0.822000 7.500000 16.050000 NaN 2.000000 0.769000
max 9.000000 3138.000000 361.150000 1.800000 19.312000 84.400000 8.094000 2.000000 1.000000

In [3]:
training_data['Well Name'] = training_data['Well Name'].astype('category')
training_data['Formation'] = training_data['Formation'].astype('category')
training_data['Well Name'].unique()


Out[3]:
[SHRIMPLIN, ALEXANDER D, SHANKLE, LUKE G U, KIMZEY A, CROSS H CATTLE, NOLAN, Recruit F9, NEWBY, CHURCHMAN BIBLE]
Categories (10, object): [SHRIMPLIN, ALEXANDER D, SHANKLE, LUKE G U, ..., NOLAN, Recruit F9, NEWBY, CHURCHMAN BIBLE]

In [4]:
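# keep only the samples that have a valid PE (photoelectric effect) measurement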
PE_mask = training_data['PE'].notnull().values
training_data = training_data[PE_mask]

K-fold cross-validation

Adapted from @LukasMosser's code to run under Python 3.5. We perform a leave-one-well-out cross-validation, holding out each well in turn to see how the model performs on it as a blind test.
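
As a rough sketch only (not the code used in the next cell), the same leave-one-well-out scheme could also be written with scikit-learn's LeaveOneGroupOut splitter and a pipeline, assuming the training_data frame prepared above:

# sketch of an equivalent leave-one-well-out CV using sklearn >= 0.18
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing, svm

X = training_data.drop(['Formation', 'Well Name', 'Depth', 'Facies'], axis=1).values
y = training_data['Facies'].values.astype(np.int64)
groups = training_data['Well Name'].astype(str).values

# the scaler is refit inside every fold, on the training wells only
clf = make_pipeline(preprocessing.StandardScaler(),
                    svm.LinearSVC(class_weight='balanced', tol=1e-03, random_state=42, C=10))
scores = cross_val_score(clf, X, y, groups=groups,
                         cv=LeaveOneGroupOut(), scoring='f1_weighted')
print('Average F1-score is {:.4%}'.format(scores.mean()))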


In [5]:
from sklearn import preprocessing
from sklearn import svm
from sklearn import metrics
#Create a set of unique well names

names = list(set(training_data["Well Name"]))

#Create a dictionary of the per-well datasets, continued from the original contest notebook,
#but perform the column dropping for each well individually (maybe not necessary).

well_datas = {}
for name in names:
    well = training_data[training_data["Well Name"]==name] 
    well_labels = well['Facies'].values.astype(np.int64)
    well = well.drop(['Formation', 'Well Name', 'Depth','Facies'], axis=1).values
    well_datas[name] = [well, well_labels]
    
    
X_data = {}
y_data = {}
for name, (data, labels) in well_datas.items():
    y_data[name] = np.array(labels, dtype=np.int64)
    X_data[name] = np.array(data, dtype=np.float32)

training_sets = []
test_sets = []

for i in range(len(names)):
    X_train = []
    y_train = []

    X_test = []
    y_test = []

    for name, data in X_data.items():
        if name != names[i]:
            for row in data:
                X_train.append(row)
        else:
            for row in data:
                X_test.append(row)

    for name, labels in y_data.items():
        if name != names[i]:
            for val in labels:
                y_train.append(val)
        else:
            for val in labels:
                y_test.append(val)

    X_train = np.array(X_train, dtype=np.float32)
    y_train = np.array(y_train, dtype=np.int64).reshape(len(y_train), 1)
    y_train = y_train.ravel()
    
    # scale features using statistics computed from the training wells only
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)


    X_test = np.array(X_test, dtype=np.float32)
    X_test = scaler.transform(X_test)

    y_test = np.array(y_test, dtype=np.int32)
    training_sets.append([X_train, y_train, X_test, y_test])
    
#Use as follows:
scores = []
for i, (X_train, y_train, X_test, y_test) in enumerate(training_sets):
    clf = svm.LinearSVC(class_weight='balanced', tol=1e-03, random_state=42, C=10)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    #Scoring
    score = metrics.f1_score(y_test, y_pred, average='weighted')
    scores.append(score)
    print('********')
    print('Blind well is {0}, F1 score : {1:.4%}\n'.format(names[i],score))
#     print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    pass
print("="*30)
print('*********** RESULT ***********')
print("="*30)
print('\nAverage  F1-score is {:.4%}'.format(np.mean(scores)))


/Users/lorenzoperozzi/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1115: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
********
Blind well is Recruit F9, F1 score : 83.7607%

********
Blind well is LUKE G U, F1 score : 55.6707%

/Users/lorenzoperozzi/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
********
Blind well is SHRIMPLIN, F1 score : 36.2894%

********
Blind well is CROSS H CATTLE, F1 score : 47.6321%

********
Blind well is NOLAN, F1 score : 44.3317%

********
Blind well is CHURCHMAN BIBLE, F1 score : 53.9936%

********
Blind well is SHANKLE, F1 score : 55.7004%

********
Blind well is NEWBY, F1 score : 43.5872%

==============================
*********** RESULT ***********
==============================

Average  F1-score is 52.6207%

K-fold cross-validation shows that the F1-score is highly variable from well to well. For example, the model fits SHANKLE reasonably well but SHRIMPLIN much less so. This is why @LukasMosser and I suggest using the average F1-score as the metric to evaluate the performance of a submission.
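
As a sketch of that convention (using a hypothetical array y_pred_all of predicted facies aligned row-for-row with training_data), a submission would be scored per well and the scores averaged:

# hypothetical evaluation of a submission's predictions, y_pred_all
per_well_scores = []
for well in training_data['Well Name'].unique():
    mask = (training_data['Well Name'] == well).values
    per_well_scores.append(metrics.f1_score(training_data['Facies'].values[mask],
                                            y_pred_all[mask], average='weighted'))
print('Average F1-score is {:.4%}'.format(np.mean(per_well_scores)))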

