In [1]:
%matplotlib inline
# to install the watermark magic command: pip install watermark
%load_ext watermark
%watermark -v -p numpy,scipy,pandas,matplotlib,seaborn,sklearn
In [2]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable
from pandas import set_option
set_option("display.max_rows", 10)
pd.options.mode.chained_assignment = None
filename = '../facies_vectors.csv'
training_data = pd.read_csv(filename)
training_data.describe()
Out[2]:
In [3]:
training_data['Well Name'] = training_data['Well Name'].astype('category')
training_data['Formation'] = training_data['Formation'].astype('category')
training_data['Well Name'].unique()
Out[3]:
In [4]:
PE_mask = training_data['PE'].notnull().values
training_data = training_data[PE_mask]
Adapted from @LukasMosser's code to run on Python 3.5. We cross-validate on each well in turn, holding it out as a blind test to see how well the model generalises to it (a compact scikit-learn alternative is sketched after the cell below).
In [5]:
from sklearn import preprocessing
from sklearn import svm
from sklearn import metrics
#Create a set of unique well names
names = list(set(training_data["Well Name"]))
#Create a dictionary of the well datasets, continued from the original contest notebook
#But perform dropping for each well individually
#Maybe not necessary.
well_datas = {}
for name in names:
    well = training_data[training_data["Well Name"] == name]
    well_labels = well['Facies'].values.astype(np.int64)
    well = well.drop(['Formation', 'Well Name', 'Depth', 'Facies'], axis=1).values
    well_datas[name] = [well, well_labels]

X_data = {}
y_data = {}
for name, (data, labels) in well_datas.items():
    y_data[name] = np.array(labels, dtype=np.int64)
    X_data[name] = np.array(data, dtype=np.float32)
training_sets = []
test_sets = []
for i in range(len(names)):
    X_train = []
    y_train = []
    X_test = []
    y_test = []
    # Hold out well i as the blind test set; all other wells form the training set
    for name, data in X_data.items():
        if name != names[i]:
            for row in data:
                X_train.append(row)
        else:
            for row in data:
                X_test.append(row)
    for name, labels in y_data.items():
        if name != names[i]:
            for val in labels:
                y_train.append(val)
        else:
            for val in labels:
                y_test.append(val)
    X_train = np.array(X_train, dtype=np.float32)
    y_train = np.array(y_train, dtype=np.int64)
    # Fit the scaler on the training wells only, then apply it to both sets
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = np.array(X_test, dtype=np.float32)
    X_test = scaler.transform(X_test)
    y_test = np.array(y_test, dtype=np.int64)
    training_sets.append([X_train, y_train, X_test, y_test])
#Use as follows:
scores = []
for i, (X_train, y_train, X_test, y_test) in enumerate(training_sets):
    clf = svm.LinearSVC(class_weight='balanced', tol=1e-03, random_state=42, C=10)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    #Scoring
    score = metrics.f1_score(y_test, y_pred, average='weighted')
    scores.append(score)
    print('********')
    print('Blind well is {0}, F1 score : {1:.4%}\n'.format(names[i], score))
    # print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
print("="*30)
print('*********** RESULT ***********')
print("="*30)
print('\nAverage F1-score is {:.4%}'.format(np.mean(scores)))
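For reference, the same leave-one-well-out split can be written more compactly with scikit-learn's LeaveOneGroupOut, with the scaler and classifier wrapped in a pipeline so the scaler is refit on every training fold. This is only an illustrative sketch, not part of the original workflow: it reuses training_data from above, and names such as X_all, y_all, groups and logo_scores are placeholders.
In [ ]:
import numpy as np
from sklearn import preprocessing, svm, metrics
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.pipeline import make_pipeline

# Features, labels and the group (well name) of each sample
X_all = training_data.drop(['Formation', 'Well Name', 'Depth', 'Facies'], axis=1).values.astype(np.float32)
y_all = training_data['Facies'].values.astype(np.int64)
groups = training_data['Well Name'].astype(str).values

logo = LeaveOneGroupOut()
logo_scores = []
for train_idx, test_idx in logo.split(X_all, y_all, groups):
    # Scaling is fit on the training wells only, then applied to the blind well
    model = make_pipeline(preprocessing.StandardScaler(),
                          svm.LinearSVC(class_weight='balanced', tol=1e-03,
                                        random_state=42, C=10))
    model.fit(X_all[train_idx], y_all[train_idx])
    y_hat = model.predict(X_all[test_idx])
    blind_well = groups[test_idx][0]
    f1 = metrics.f1_score(y_all[test_idx], y_hat, average='weighted')
    logo_scores.append(f1)
    print('Blind well is {0}, F1 score : {1:.4%}'.format(blind_well, f1))

print('\nAverage F1-score is {:.4%}'.format(np.mean(logo_scores)))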
This leave-one-well-out cross-validation shows that the F1-score is highly variable from well to well: for example, the model fits SHANKLE well but does noticeably worse on SHRIMPLIN. This is why, as @LukasMosser and I suggest, the average F1-score across blind wells should be used as the metric to evaluate the performance of a submission.
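To make that variability easier to see, the per-well scores can be plotted against their average. This plotting cell is only an illustrative addition, reusing the names and scores lists computed above.
In [ ]:
import numpy as np
import matplotlib.pyplot as plt

# Bar chart of the leave-one-well-out F1-score for each blind well,
# with the average score drawn as a reference line
fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(range(len(names)), scores, color='steelblue')
ax.axhline(np.mean(scores), color='firebrick', linestyle='--',
           label='Average F1 = {:.2%}'.format(np.mean(scores)))
ax.set_xticks(range(len(names)))
ax.set_xticklabels(names, rotation=45, ha='right')
ax.set_ylabel('Weighted F1-score')
ax.set_title('F1-score per blind well')
ax.legend()
plt.tight_layout()
plt.show()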
In [ ]: