In [1]:
import sklearn
print(sklearn.__version__)
In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable
pd.set_option("display.max_rows", 10)
pd.options.mode.chained_assignment = None
filename = '../facies_vectors.csv'
training_data = pd.read_csv(filename)
training_data
Out[2]:
Remove a single well to use as a blind test later.
In [3]:
blind = training_data[training_data['Well Name'] == 'SHANKLE']
training_data = training_data[training_data['Well Name'] != 'SHANKLE']
In [4]:
'SHANKLE' in training_data['Well Name'].unique()
Out[4]:
In [5]:
training_data['Well Name'] = training_data['Well Name'].astype('category')
training_data['Formation'] = training_data['Formation'].astype('category')
training_data['Well Name'].unique()
Out[5]:
These are the names of the training wells remaining in the Council Grove reservoir data (the blind well SHANKLE has been removed), plus the pseudo-well 'Recruit F9', into which data has been recruited to better represent facies 9, the phylloid-algal bafflestone.
Before we plot the well data, let's define a color map so that each facies is represented by a consistent color in all the plots in this tutorial. We also create abbreviated facies labels and add them to the training_data dataframe.
In [6]:
# 1=sandstone  2=c_siltstone  3=f_siltstone  4=marine_silt_shale
# 5=mudstone   6=wackestone   7=dolomite     8=packstone  9=bafflestone
facies_colors = ['#F4D03F', '#F5B041', '#DC7633', '#A569BD',
                 '#000000', '#000080', '#2E86C1', '#AED6F1', '#196F3D']

facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS',
                 'WS', 'D', 'PS', 'BS']

# facies_color_map is a dictionary that maps facies labels
# to their respective colors
facies_color_map = {}
for ind, label in enumerate(facies_labels):
    facies_color_map[label] = facies_colors[ind]

def label_facies(row, labels):
    return labels[row['Facies'] - 1]

training_data.loc[:, 'FaciesLabels'] = training_data.apply(
    lambda row: label_facies(row, facies_labels), axis=1)
training_data.describe()
Out[6]:
This is a quick view of the statistical distribution of the input variables. Looking at the count values, most variables have 4149 valid entries, except for PE, which has 3232. In this tutorial we will drop the feature vectors that don't have a valid PE entry.
In [7]:
PE_mask = training_data['PE'].notnull().values
training_data = training_data[PE_mask]
In [8]:
training_data.describe()
Out[8]:
Let's take a look at the data from individual wells in a more familiar log plot form. We will create plots for the five well log variables, as well as a log for the facies labels. The plots are based on those described in Alessandro Amato del Monte's excellent tutorial.
In [9]:
# Colors changed to match the paper for easier comparison.
def make_facies_log_plot(logs, facies_colors):
    # make sure logs are sorted by depth
    logs = logs.sort_values(by='Depth')
    cmap_facies = colors.ListedColormap(
        facies_colors[0:len(facies_colors)], 'indexed')

    ztop = logs.Depth.min(); zbot = logs.Depth.max()

    cluster = np.repeat(np.expand_dims(logs['Facies'].values, 1), 100, 1)

    f, ax = plt.subplots(nrows=1, ncols=6, figsize=(8, 12))
    ax[0].plot(logs.GR, logs.Depth, '-g')
    ax[1].plot(logs.ILD_log10, logs.Depth, '-')
    ax[2].plot(logs.DeltaPHI, logs.Depth, '-', color='0.5')
    ax[3].plot(logs.PHIND, logs.Depth, '-', color='r')
    ax[4].plot(logs.PE, logs.Depth, '-', color='black')
    im = ax[5].imshow(cluster, interpolation='none', aspect='auto',
                      cmap=cmap_facies, vmin=1, vmax=9)

    divider = make_axes_locatable(ax[5])
    cax = divider.append_axes("right", size="20%", pad=0.05)
    cbar = plt.colorbar(im, cax=cax)
    cbar.set_label((17*' ').join([' SS ', 'CSiS', 'FSiS',
                                  'SiSh', ' MS ', ' WS ', ' D ',
                                  ' PS ', ' BS ']))
    cbar.set_ticks(range(0, 1)); cbar.set_ticklabels('')

    for i in range(len(ax)-1):
        ax[i].set_ylim(ztop, zbot)
        ax[i].invert_yaxis()
        ax[i].grid()
        ax[i].locator_params(axis='x', nbins=3)

    ax[0].set_xlabel("GR")
    ax[0].set_xlim(logs.GR.min(), logs.GR.max())
    ax[1].set_xlabel("ILD_log10")
    ax[1].set_xlim(logs.ILD_log10.min(), logs.ILD_log10.max())
    ax[2].set_xlabel("DeltaPHI")
    ax[2].set_xlim(logs.DeltaPHI.min(), logs.DeltaPHI.max())
    ax[3].set_xlabel("PHIND")
    ax[3].set_xlim(logs.PHIND.min(), logs.PHIND.max())
    ax[4].set_xlabel("PE")
    ax[4].set_xlim(logs.PE.min(), logs.PE.max())
    ax[5].set_xlabel('Facies')

    ax[1].set_yticklabels([]); ax[2].set_yticklabels([]); ax[3].set_yticklabels([])
    ax[4].set_yticklabels([]); ax[5].set_yticklabels([])
    ax[5].set_xticklabels([])
    f.suptitle('Well: %s' % logs.iloc[0]['Well Name'], fontsize=14, y=0.94)
In addition to individual wells, we can look at how the various facies are represented in the entire training set. Let's plot a histogram of the number of training examples for each facies class.
In [10]:
#count the number of unique entries for each facies, sort them by
#facies number (instead of by number of entries)
facies_counts = training_data['Facies'].value_counts().sort_index()
#use facies labels to index each count
facies_counts.index = facies_labels
facies_counts.plot(kind='bar',color=facies_colors,
title='Distribution of Training Data by Facies')
facies_counts
Out[10]:
In [11]:
correct_facies_labels = training_data['Facies'].values
feature_vectors = training_data.drop(['Formation', 'Well Name', 'Depth','Facies','FaciesLabels'], axis=1)
feature_vectors.describe()
Out[11]:
In [12]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(feature_vectors)
scaled_features = scaler.transform(feature_vectors)
In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    scaled_features, correct_facies_labels, test_size=0.3, random_state=42)
Now we use the cleaned and conditioned training set to create a facies classifier. As mentioned above, we will use a type of machine learning model known as a support vector machine (SVM). The SVM maps the feature vectors as points in a multidimensional space, arranged so that examples from different facies are divided by a gap that is as wide as possible.
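To make the concept concrete, here is a minimal sketch (not part of the original workflow) that fits a linear SVM on just two of the scaled features and inspects the resulting separating hyperplanes. It assumes the feature columns are ordered GR, ILD_log10, DeltaPHI, PHIND, PE, NM_M, RELPOS, as in the dataframe above.

# Illustrative only: fit a linear SVM on two features (GR and PHIND).
# Uses scaled_features and correct_facies_labels from the cells above.
from sklearn import svm
toy_clf = svm.LinearSVC(random_state=42)
toy_clf.fit(scaled_features[:, [0, 3]], correct_facies_labels)
# With 9 facies classes, LinearSVC fits one-vs-rest hyperplanes:
# coef_ holds each hyperplane's normal vector, intercept_ its offset.
print(toy_clf.coef_.shape)       # (9, 2)
print(toy_clf.intercept_.shape)  # (9,)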
In [14]:
from sklearn import svm
clf = svm.LinearSVC(random_state=42)
Now we can train the classifier using the training set we created above.
In [15]:
clf.fit(X_train, y_train)
Out[15]:
In [16]:
predicted_labels = clf.predict(X_test)
In [17]:
from sklearn.metrics import confusion_matrix
from classification_utilities import display_cm, display_adj_cm
conf = confusion_matrix(y_test, predicted_labels)
display_cm(conf, facies_labels, display_metrics=True, hide_zeros=True)
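display_cm and display_adj_cm come from this tutorial's classification_utilities module. If that file isn't available, scikit-learn's built-in classification_report gives comparable per-class precision, recall, and F1 scores — a minimal sketch, assuming y_test and predicted_labels from the cells above:

# Fallback for classification_utilities; all nine facies must appear in
# y_test for target_names to line up with the classes.
from sklearn.metrics import classification_report
print(classification_report(y_test, predicted_labels,
                            target_names=facies_labels))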
In [18]:
def accuracy(conf):
    # fraction of predictions on the diagonal of the confusion matrix
    total_correct = 0.
    nb_classes = conf.shape[0]
    for i in np.arange(0, nb_classes):
        total_correct += conf[i][i]
    acc = total_correct / sum(sum(conf))
    return acc
In [19]:
# adjacent_facies[i] lists the zero-based indices of the facies adjacent to
# facies i+1; dtype=object because the rows have different lengths
adjacent_facies = np.array([[1], [0, 2], [1], [4], [3, 5], [4, 6, 7],
                            [5, 7], [5, 6, 8], [6, 7]], dtype=object)

def accuracy_adjacent(conf, adjacent_facies):
    nb_classes = conf.shape[0]
    total_correct = 0.
    for i in np.arange(0, nb_classes):
        total_correct += conf[i][i]
        for j in adjacent_facies[i]:
            total_correct += conf[i][j]
    return total_correct / sum(sum(conf))
In [20]:
print('Facies classification accuracy = %f' % accuracy(conf))
print('Adjacent facies classification accuracy = %f' % accuracy_adjacent(conf, adjacent_facies))
In [21]:
# model selection takes a few minutes; change this variable
# to True to run the parameter loop
do_model_selection = False

if do_model_selection:
    C_range = np.array([.01, 1, 5, 10, 20, 50, 100, 1000, 5000, 10000])
    gamma_range = np.array([0.0001, 0.001, 0.01, 0.1, 1, 10])

    fig, axes = plt.subplots(3, 2,
                             sharex='col', sharey='row', figsize=(10, 10))
    plot_number = 0
    for outer_ind, gamma_value in enumerate(gamma_range):
        row = int(plot_number / 2)
        column = int(plot_number % 2)
        cv_errors = np.zeros(C_range.shape)
        train_errors = np.zeros(C_range.shape)
        for index, c_value in enumerate(C_range):
            clf = svm.SVC(C=c_value, gamma=gamma_value)
            clf.fit(X_train, y_train)

            train_conf = confusion_matrix(y_train, clf.predict(X_train))
            cv_conf = confusion_matrix(y_test, clf.predict(X_test))

            cv_errors[index] = accuracy(cv_conf)
            train_errors[index] = accuracy(train_conf)

        ax = axes[row, column]
        ax.set_title('Gamma = %g' % gamma_value)
        ax.semilogx(C_range, cv_errors, label='CV accuracy')
        ax.semilogx(C_range, train_errors, label='Train accuracy')
        plot_number += 1
        ax.set_ylim([0.2, 1])

    ax.legend(bbox_to_anchor=(1.05, 0), loc='lower left', borderaxespad=0.)
    fig.text(0.5, 0.03, 'C value', ha='center',
             fontsize=14)
    fig.text(0.04, 0.5, 'Classification Accuracy', va='center',
             rotation='vertical', fontsize=14)
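The same parameter search can be written more compactly with scikit-learn's GridSearchCV. This is a sketch, not the workflow used above: it scores by 5-fold cross-validation on the training split rather than against the fixed test set.

# Hypothetical alternative to the manual loop above.
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [.01, 1, 5, 10, 20, 50, 100, 1000, 5000, 10000],
              'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(svm.SVC(), param_grid, scoring='accuracy', cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)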
The best accuracy on the cross-validation curves was achieved for gamma = 1 and C = 10. We can now create and train an optimized classifier; here we use a LinearSVC with balanced class weights:
In [22]:
clf = svm.LinearSVC(class_weight='balanced', tol=1e-03, random_state=42)
clf.fit(X_train, y_train)
cv_conf = confusion_matrix(y_test, clf.predict(X_test))
print('Optimized facies classification accuracy = %.2f' % accuracy(cv_conf))
print('Optimized adjacent facies classification accuracy = %.2f' % accuracy_adjacent(cv_conf, adjacent_facies))
In [23]:
display_cm(cv_conf, facies_labels,
display_metrics=True, hide_zeros=True)
In [24]:
display_adj_cm(cv_conf, facies_labels, adjacent_facies,
display_metrics=True, hide_zeros=True)
In [25]:
blind
Out[25]:
The label vector is just the Facies column:
In [26]:
y_blind = blind['Facies'].values
We can form the feature matrix by dropping some of the columns and making a new dataframe:
In [27]:
well_features = blind.drop(['Facies', 'Formation', 'Well Name', 'Depth'], axis=1)
Now we can transform this with the scaler we made before:
In [28]:
X_blind = scaler.transform(well_features)
Now it's a simple matter of making a prediction and storing it back in the dataframe:
In [29]:
y_pred = clf.predict(X_blind)
blind['Prediction'] = y_pred
Let's see how we did with the confusion matrix:
In [30]:
cv_conf = confusion_matrix(y_blind, y_pred)
print('Optimized facies classification accuracy = %.2f' % accuracy(cv_conf))
print('Optimized adjacent facies classification accuracy = %.2f' % accuracy_adjacent(cv_conf, adjacent_facies))
We managed an accuracy of 0.75 on the held-out test data, but those samples came from the same wells as the training data. This more realistic test does not perform as well...
In [31]:
display_cm(cv_conf, facies_labels,
display_metrics=True, hide_zeros=True)
...but does remarkably well on the adjacent facies predictions.
In [32]:
display_adj_cm(cv_conf, facies_labels, adjacent_facies,
display_metrics=True, hide_zeros=True)
In [33]:
def compare_facies_plot(logs, compadre, facies_colors):
    # make sure logs are sorted by depth
    logs = logs.sort_values(by='Depth')
    cmap_facies = colors.ListedColormap(
        facies_colors[0:len(facies_colors)], 'indexed')

    ztop = logs.Depth.min(); zbot = logs.Depth.max()

    cluster1 = np.repeat(np.expand_dims(logs['Facies'].values, 1), 100, 1)
    cluster2 = np.repeat(np.expand_dims(logs[compadre].values, 1), 100, 1)

    f, ax = plt.subplots(nrows=1, ncols=7, figsize=(9, 12))
    ax[0].plot(logs.GR, logs.Depth, '-g')
    ax[1].plot(logs.ILD_log10, logs.Depth, '-')
    ax[2].plot(logs.DeltaPHI, logs.Depth, '-', color='0.5')
    ax[3].plot(logs.PHIND, logs.Depth, '-', color='r')
    ax[4].plot(logs.PE, logs.Depth, '-', color='black')
    im1 = ax[5].imshow(cluster1, interpolation='none', aspect='auto',
                       cmap=cmap_facies, vmin=1, vmax=9)
    im2 = ax[6].imshow(cluster2, interpolation='none', aspect='auto',
                       cmap=cmap_facies, vmin=1, vmax=9)

    divider = make_axes_locatable(ax[6])
    cax = divider.append_axes("right", size="20%", pad=0.05)
    cbar = plt.colorbar(im2, cax=cax)
    cbar.set_label((17*' ').join([' SS ', 'CSiS', 'FSiS',
                                  'SiSh', ' MS ', ' WS ', ' D ',
                                  ' PS ', ' BS ']))
    cbar.set_ticks(range(0, 1)); cbar.set_ticklabels('')

    for i in range(len(ax)-2):
        ax[i].set_ylim(ztop, zbot)
        ax[i].invert_yaxis()
        ax[i].grid()
        ax[i].locator_params(axis='x', nbins=3)

    ax[0].set_xlabel("GR")
    ax[0].set_xlim(logs.GR.min(), logs.GR.max())
    ax[1].set_xlabel("ILD_log10")
    ax[1].set_xlim(logs.ILD_log10.min(), logs.ILD_log10.max())
    ax[2].set_xlabel("DeltaPHI")
    ax[2].set_xlim(logs.DeltaPHI.min(), logs.DeltaPHI.max())
    ax[3].set_xlabel("PHIND")
    ax[3].set_xlim(logs.PHIND.min(), logs.PHIND.max())
    ax[4].set_xlabel("PE")
    ax[4].set_xlim(logs.PE.min(), logs.PE.max())
    ax[5].set_xlabel('Facies')
    ax[6].set_xlabel(compadre)

    ax[1].set_yticklabels([]); ax[2].set_yticklabels([]); ax[3].set_yticklabels([])
    ax[4].set_yticklabels([]); ax[5].set_yticklabels([])
    ax[5].set_xticklabels([])
    ax[6].set_xticklabels([])
    f.suptitle('Well: %s' % logs.iloc[0]['Well Name'], fontsize=14, y=0.94)
In [34]:
compare_facies_plot(blind, 'Prediction', facies_colors)
In [35]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=2)
In [36]:
print(kf)
In [ ]:
# toy example: split 4 samples into 2 folds (X and y are hypothetical here)
X = np.arange(8).reshape(4, 2)
y = np.array([1, 2, 3, 4])
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
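To obtain a cross-validated score for the classifier in a single call, scikit-learn's cross_val_score can be used — a sketch, assuming the scaled_features and correct_facies_labels arrays built earlier:

# Illustrative only: mean accuracy over 5 folds.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(svm.LinearSVC(random_state=42),
                         scaled_features, correct_facies_labels, cv=5)
print(scores.mean(), scores.std())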
Now that we have a trained facies classification model, we can use it to identify facies in wells that do not have core data. In this case, we will apply the classifier to two wells, but we could use it on any number of wells for which we have the same set of input well logs.
This dataset is similar to the training data, except it does not have facies labels. It is loaded into a dataframe called well_data.
In [35]:
well_data = pd.read_csv('../validation_data_nofacies.csv')
well_data['Well Name'] = well_data['Well Name'].astype('category')
well_features = well_data.drop(['Formation', 'Well Name', 'Depth'], axis=1)
The data needs to be scaled using the same parameters (means and standard deviations) we computed from the training data.
In [36]:
X_unknown = scaler.transform(well_features)
Finally we predict facies labels for the unknown data, and store the results in a Facies column of the well_data dataframe.
In [37]:
#predict facies of unclassified data
y_unknown = clf.predict(X_unknown)
well_data['Facies'] = y_unknown
well_data
Out[37]:
In [38]:
well_data['Well Name'].unique()
Out[38]:
We can use the well log plot to view the classification results along with the well logs.
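For example (a sketch, assuming the validation set contains the well 'STUART', as in the original contest data):

# Plot predicted facies alongside the logs for one validation well.
make_facies_log_plot(
    well_data[well_data['Well Name'] == 'STUART'],
    facies_colors)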
In [39]:
well_data.to_csv('well_data_with_facies.csv')