In [1]:
%matplotlib inline
# to install the watermark magic command: pip install watermark
%load_ext watermark 
%watermark -v -p numpy,scipy,pandas,matplotlib,seaborn,sklearn


CPython 3.5.2
IPython 5.1.0

numpy 1.11.1
scipy 0.18.0
pandas 0.18.1
matplotlib 1.5.1
seaborn 0.7.1
sklearn 0.18

Exploring the dataset

First, let's examine the data set that will be used to train the classifier.


In [2]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable

from pandas import set_option
set_option("display.max_rows", 10)
pd.options.mode.chained_assignment = None

filename = '../facies_vectors.csv'
training_data = pd.read_csv(filename)
training_data.describe()


/Users/lorenzoperozzi/anaconda3/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[2]:
Facies Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS
count 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 4149.000000 3232.000000 4149.000000 4149.000000
mean 4.503254 2906.867438 64.933985 0.659566 4.402484 13.201066 3.725014 1.518438 0.521852
std 2.474324 133.300164 30.302530 0.252703 5.274947 7.132846 0.896152 0.499720 0.286644
min 1.000000 2573.500000 10.149000 -0.025949 -21.832000 0.550000 0.200000 1.000000 0.000000
25% 2.000000 2821.500000 44.730000 0.498000 1.600000 8.500000 NaN 1.000000 0.277000
50% 4.000000 2932.500000 64.990000 0.639000 4.300000 12.020000 NaN 2.000000 0.528000
75% 6.000000 3007.000000 79.438000 0.822000 7.500000 16.050000 NaN 2.000000 0.769000
max 9.000000 3138.000000 361.150000 1.800000 19.312000 84.400000 8.094000 2.000000 1.000000

In [3]:
training_data['Well Name'] = training_data['Well Name'].astype('category')
training_data['Formation'] = training_data['Formation'].astype('category')
training_data['Well Name'].unique()


Out[3]:
[SHRIMPLIN, ALEXANDER D, SHANKLE, LUKE G U, KIMZEY A, CROSS H CATTLE, NOLAN, Recruit F9, NEWBY, CHURCHMAN BIBLE]
Categories (10, object): [SHRIMPLIN, ALEXANDER D, SHANKLE, LUKE G U, ..., NOLAN, Recruit F9, NEWBY, CHURCHMAN BIBLE]

In [4]:
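# keep only the samples that have a valid PE (photoelectric effect) measurement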
PE_mask = training_data['PE'].notnull().values
training_data = training_data[PE_mask]

K-fold cross-validation

Adapted from @LukasMosser's code to run under Python 3.5. We perform a leave-one-well-out cross-validation, holding out each well in turn to see how the model performs on it as a blind test.
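
As a rough sketch only (not the code used in the next cell), the same leave-one-well-out scheme could also be written with scikit-learn's LeaveOneGroupOut splitter and a pipeline, assuming the training_data frame prepared above:

# sketch of an equivalent leave-one-well-out CV using sklearn >= 0.18
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing, svm

X = training_data.drop(['Formation', 'Well Name', 'Depth', 'Facies'], axis=1).values
y = training_data['Facies'].values.astype(np.int64)
groups = training_data['Well Name'].astype(str).values

# the scaler is refit inside every fold, on the training wells only
clf = make_pipeline(preprocessing.StandardScaler(),
                    svm.LinearSVC(class_weight='balanced', tol=1e-03, random_state=42, C=10))
scores = cross_val_score(clf, X, y, groups=groups,
                         cv=LeaveOneGroupOut(), scoring='f1_weighted')
print('Average F1-score is {:.4%}'.format(scores.mean()))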


In [5]:
from sklearn import preprocessing
from sklearn import svm
from sklearn import metrics
#Create a set of unique well names

names = list(set(training_data["Well Name"]))

#Create a dictionary of the per-well datasets, continued from the original contest notebook,
#but perform the column dropping for each well individually (maybe not necessary).

well_datas = {}
for name in names:
    well = training_data[training_data["Well Name"]==name] 
    well_labels = well['Facies'].values.astype(np.int64)
    well = well.drop(['Formation', 'Well Name', 'Depth','Facies'], axis=1).values
    well_datas[name] = [well, well_labels]
    
    
X_data = {}
y_data = {}
for name, (data, labels) in well_datas.items():
    y_data[name] = np.array(labels, dtype=np.int64)
    X_data[name] = np.array(data, dtype=np.float32)

training_sets = []
test_sets = []

for i in range(len(names)):
    X_train = []
    y_train = []

    X_test = []
    y_test = []

    for name, data in X_data.items():
        if name != names[i]:
            for row in data:
                X_train.append(row)
        else:
            for row in data:
                X_test.append(row)

    for name, labels in y_data.items():
        if name != names[i]:
            for val in labels:
                y_train.append(val)
        else:
            for val in labels:
                y_test.append(val)

    X_train = np.array(X_train, dtype=np.float32)
    y_train = np.array(y_train, dtype=np.int64).reshape(len(y_train), 1)
    y_train = y_train.ravel()
    
    # scale features using statistics computed from the training wells only
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)


    X_test = np.array(X_test, dtype=np.float32)
    X_test = scaler.transform(X_test)

    y_test = np.array(y_test, dtype=np.int32)
    training_sets.append([X_train, y_train, X_test, y_test])
    
#Use as follows:
scores = []
for i, (X_train, y_train, X_test, y_test) in enumerate(training_sets):
    clf = svm.LinearSVC(class_weight='balanced', tol=1e-03, random_state=42, C=10)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    #Scoring
    score = metrics.f1_score(y_test, y_pred, average='weighted')
    scores.append(score)
    print('********')
    print('Blind well is {0}, F1 score : {1:.4%}\n'.format(names[i],score))
#     print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    pass
print("="*30)
print('*********** RESULT ***********')
print("="*30)
print('\nAverage  F1-score is {:.4%}'.format(np.mean(scores)))


/Users/lorenzoperozzi/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1115: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
********
Blind well is Recruit F9, F1 score : 83.7607%

********
Blind well is LUKE G U, F1 score : 55.6707%

/Users/lorenzoperozzi/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
********
Blind well is SHRIMPLIN, F1 score : 36.2894%

********
Blind well is CROSS H CATTLE, F1 score : 47.6321%

********
Blind well is NOLAN, F1 score : 44.3317%

********
Blind well is CHURCHMAN BIBLE, F1 score : 53.9936%

********
Blind well is SHANKLE, F1 score : 55.7004%

********
Blind well is NEWBY, F1 score : 43.5872%

==============================
*********** RESULT ***********
==============================

Average  F1-score is 52.6207%

K-fold cross-validation shows that the F1-score is highly variable from well to well. For example, the model fits SHANKLE reasonably well but SHRIMPLIN much less so. This is why @LukasMosser and I suggest using the average F1-score as the metric to evaluate the performance of a submission.
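
As a sketch of that convention (using a hypothetical array y_pred_all of predicted facies aligned row-for-row with training_data), a submission would be scored per well and the scores averaged:

# hypothetical evaluation of a submission's predictions, y_pred_all
per_well_scores = []
for well in training_data['Well Name'].unique():
    mask = (training_data['Well Name'] == well).values
    per_well_scores.append(metrics.f1_score(training_data['Facies'].values[mask],
                                            y_pred_all[mask], average='weighted'))
print('Average F1-score is {:.4%}'.format(np.mean(per_well_scores)))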

