In [1]:
import pandas as pd
import numpy as np
from math import radians, cos, sin, asin, sqrt
import itertools
from sklearn import neighbors
from sklearn import preprocessing
from sklearn import ensemble
from sklearn.model_selection import LeaveOneGroupOut, LeavePGroupsOut
import inversion
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
df = pd.read_csv('../facies_vectors.csv')
In the real world it would be unusual to have neutron-density cross-plot porosity (i.e., PHIND) without the corresponding raw input curves, namely bulk density and neutron porosity, yet that is the situation in this contest dataset. So as part of the feature engineering process, I back-calculate estimates of those raw curves from the provided DeltaPHI and PHIND curves. One issue with this approach is that cross-plot porosity differs between vendors, toolstrings, and software packages, and it is not known exactly how the PHIND in this dataset was computed. So I assume here that PHIND is approximately the sum-of-squares porosity, which is usually an adequate approximation of neutron-density cross-plot porosity. That equation looks like this:
$$PHIND = \sqrt{\frac{NPHI^2 + DPHI^2}{2}}$$

and it is assumed here that DeltaPHI is:

$$DeltaPHI = NPHI - DPHI$$

The functions below use these two relationships (two equations, two unknowns) to estimate NPHI and DPHI (and consequently RHOB).
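For the record, substituting $NPHI = DPHI + DeltaPHI$ into the sum-of-squares equation and taking the positive root of the resulting quadratic gives:

$$DPHI = \frac{\sqrt{4\,PHIND^2 - DeltaPHI^2} - DeltaPHI}{2}$$

which is exactly what estimate_dphi computes below; NPHI then follows by adding DeltaPHI back, and RHOB follows from DPHI using a limestone matrix density of 2.71 g/cc and a fluid density of 1.0 g/cc.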
Once we have RHOB, we can use it combined with PE to estimate apparent grain density (RHOMAA) and apparent photoelectric capture cross-section (UMAA), which are useful in lithology estimations from well logs.
In [3]:
def estimate_dphi(df):
    # positive root of the quadratic implied by the two equations above
    return ((4 * (df['PHIND'] ** 2) - (df['DeltaPHI'] ** 2)) ** 0.5 - df['DeltaPHI']) / 2

def estimate_rhob(df):
    # limestone-scale density porosity (in percent) back to bulk density
    return (2.71 - (df['DPHI_EST'] / 100) * 1.71)

def estimate_nphi(df):
    return df['DPHI_EST'] + df['DeltaPHI']

def compute_rhomaa(df):
    # apparent grain density, assuming a fluid density of 1.0 g/cc
    return (df['RHOB_EST'] - (df['PHIND'] / 100)) / (1 - df['PHIND'] / 100)

def compute_umaa(df):
    # apparent volumetric photoelectric cross-section; 0.398 is the U of fresh water
    return ((df['PE'] * df['RHOB_EST']) - (df['PHIND'] / 100 * 0.398)) / (1 - df['PHIND'] / 100)
Because solving the sum-of-squares equation involves the quadratic formula, the discriminant goes negative in some cases (where porosities are negative), producing imaginary numbers; that is what the warning below is about.
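A minimal sketch of a guarded variant, assuming you want to suppress the warning; it clips the discriminant at zero instead of producing NaNs, and is not what is used below:

def estimate_dphi_clipped(df):
    # hypothetical alternative to estimate_dphi: clip the discriminant at zero
    # so rows with |DeltaPHI| > 2 * PHIND yield 0 rather than NaN
    disc = np.clip(4 * (df['PHIND'] ** 2) - df['DeltaPHI'] ** 2, 0, None)
    return (np.sqrt(disc) - df['DeltaPHI']) / 2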
In [4]:
df['DPHI_EST'] = df.apply(lambda x: estimate_dphi(x), axis=1).astype(float)
df['RHOB_EST'] = df.apply(lambda x: estimate_rhob(x), axis=1)
df['NPHI_EST'] = df.apply(lambda x: estimate_nphi(x), axis=1)
df['RHOMAA_EST'] = df.apply(lambda x: compute_rhomaa(x), axis=1)
In [5]:
pe = df.dropna()
PE = pe['PE'].values
wells = pe['Well Name'].values
drop_list_pe = ['Formation', 'Well Name', 'Facies', 'Depth', 'PE', 'RELPOS']
fv_pe = pe.drop(drop_list_pe, axis=1).values
X_pe = preprocessing.StandardScaler().fit(fv_pe).transform(fv_pe)
y_pe = PE
reg = neighbors.KNeighborsRegressor(n_neighbors=40, weights='distance')
logo = LeaveOneGroupOut()
scores_pe = []
for train, test in logo.split(X_pe, y_pe, groups=wells):
    well_name = wells[test[0]]
    reg.fit(X_pe[train], y_pe[train])
    score = reg.score(X_pe[test], y_pe[test])
    print("{:>20s} {:.3f}".format(well_name, score))
    scores_pe.append(score)
print("Average leave-one-well-out R^2 score: %.6f" % np.mean(scores_pe))
In [6]:
reg.fit(X_pe, y_pe)
fv_apply = df.drop(drop_list_pe, axis=1).values
X_apply = preprocessing.StandardScaler().fit(fv_apply).transform(fv_apply)
df['PE_EST'] = reg.predict(X_apply)
# fill missing PE values with the KNN estimates
df.PE = df.PE.combine_first(df.PE_EST)
In [7]:
df['UMAA_EST'] = df.apply(lambda x: compute_umaa(x), axis=1)
Just for fun, below is a basic Umaa-Rhomaa plot to view relative abundances of quartz, calcite, dolomite, and clay. The red triangle represents a ternary solution for QTZ, CAL, and DOL, while the green triangle represents a solution for QTZ, CAL, and CLAY (illite).
In [8]:
df[df.GR < 125].plot(kind='scatter', x='UMAA_EST', y='RHOMAA_EST', c='GR', figsize=(8,6))
plt.ylim(3.1, 2.2)
plt.xlim(0.0, 17.0)
plt.plot([4.8, 9.0, 13.8, 4.8], [2.65, 2.87, 2.71, 2.65], c='r')
plt.plot([4.8, 11.9, 13.8, 4.8], [2.65, 3.06, 2.71, 2.65], c='g')
plt.scatter([4.8], [2.65], s=50, c='r')
plt.scatter([9.0], [2.87], s=50, c='r')
plt.scatter([13.8], [2.71], s=50, c='r')
plt.scatter([11.9], [3.06], s=50, c='g')
plt.text(2.8, 2.65, 'Quartz', backgroundcolor='w')
plt.text(14.4, 2.71, 'Calcite', backgroundcolor='w')
plt.text(9.6, 2.87, 'Dolomite', backgroundcolor='w')
plt.text(12.5, 3.06, 'Illite', backgroundcolor='w')
plt.text(7.0, 2.55, "gas effect", ha="center", va="center", rotation=-55,
size=8, bbox=dict(boxstyle="larrow,pad=0.3", fc="pink", ec="red", lw=2))
plt.text(15.0, 2.78, "barite?", ha="center", va="center", rotation=0,
size=8, bbox=dict(boxstyle="rarrow,pad=0.3", fc="yellow", ec="orange", lw=2))
Out[8]:
[Umaa-Rhomaa crossplot of UMAA_EST vs RHOMAA_EST colored by GR, with the two ternary solutions and the gas-effect/barite annotations]
Here I use matrix inversion to "solve" the ternary plot for each lithologic component. Essentially each datapoint is a mix of the three components defined by the ternary diagram, with abundances of each defined by the relative distances from each endpoint. I use a GR cutoff of 40 API to determine when to use either the QTZ-CAL-DOL or QTZ-CAL-CLAY ternary solutions. In other words, it is assumed that below 40 API, there is 0% clay, and above 40 API there is 0% dolomite, and also that these four lithologic components are the only components in these rocks. Admittedly it's not a great assumption, especially since the ternary plot indicates other stuff is going on. For example the high Umaa datapoints near the Calcite endpoint may indicate some heavy minerals (e.g., pyrite) or even barite-weighted mud. The "pull" of datapoints to the northwest quadrant probably reflects some gas effect, so my lithologies in those gassy zones will be skewed.
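The inversion module is local to this repo, but the underlying idea is straightforward: the three component fractions are barycentric coordinates, recoverable by inverting a 3x3 system built from the triangle endpoints. Below is a minimal sketch using the red-triangle (QTZ-CAL-DOL) endpoints from the plot above; it illustrates the technique and is not necessarily how inversion.UmaaRhomaa implements it:

# endpoint matrix: row 1 = Umaa and row 2 = Rhomaa of QTZ, CAL, DOL;
# the row of ones enforces that the three fractions sum to 1
A = np.array([[4.80, 13.80, 9.00],
              [2.65, 2.71, 2.87],
              [1.00, 1.00, 1.00]])

def ternary_solve(umaa, rhomaa):
    # returns (qtz, cal, dol) fractions; values outside [0, 1]
    # indicate a point plotting outside the triangle
    return np.linalg.solve(A, np.array([umaa, rhomaa, 1.0]))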
In [9]:
# QTZ-CAL-CLAY: reuse the dolomite endpoint slot for illite (Umaa 11.9, Rhomaa 3.06)
ur1 = inversion.UmaaRhomaa()
ur1.set_dol_uma(11.9)
ur1.set_dol_rhoma(3.06)
# QTZ-CAL-DOL: default endpoints
ur2 = inversion.UmaaRhomaa()
df['UR_QTZ'] = np.nan
df['UR_CLY'] = np.nan
df['UR_CAL'] = np.nan
df['UR_DOL'] = np.nan
df.loc[df.GR >= 40, 'UR_QTZ'] = df.loc[df.GR >= 40].apply(lambda x: ur1.get_qtz(x.UMAA_EST, x.RHOMAA_EST), axis=1)
df.loc[df.GR >= 40, 'UR_CLY'] = df.loc[df.GR >= 40].apply(lambda x: ur1.get_dol(x.UMAA_EST, x.RHOMAA_EST), axis=1)
df.loc[df.GR >= 40, 'UR_CAL'] = df.loc[df.GR >= 40].apply(lambda x: ur1.get_cal(x.UMAA_EST, x.RHOMAA_EST), axis=1)
df.loc[df.GR >= 40, 'UR_DOL'] = 0
df.loc[df.GR < 40, 'UR_QTZ'] = df.loc[df.GR < 40].apply(lambda x: ur2.get_qtz(x.UMAA_EST, x.RHOMAA_EST), axis=1)
df.loc[df.GR < 40, 'UR_DOL'] = df.loc[df.GR < 40].apply(lambda x: ur2.get_dol(x.UMAA_EST, x.RHOMAA_EST), axis=1)
df.loc[df.GR < 40, 'UR_CAL'] = df.loc[df.GR < 40].apply(lambda x: ur2.get_cal(x.UMAA_EST, x.RHOMAA_EST), axis=1)
df.loc[df.GR < 40, 'UR_CLY'] = 0
In [10]:
facies_colors = ['#F4D03F', '#F5B041', '#DC7633', '#6E2C00', '#1B4F72',
                 '#2E86C1', '#AED6F1', '#A569BD', '#196F3D']
fms = df.Formation.unique()
fig, ax = plt.subplots(int(len(fms) / 2), 2, sharey=True, sharex=True, figsize=(5, 10))
for i, fm in enumerate(fms):
    facies_counts = df[df.Formation == fm]['Facies'].value_counts().sort_index()
    colors = [facies_colors[f - 1] for f in facies_counts.index]
    ax[int(i / 2), i % 2].bar(facies_counts.index, height=facies_counts, color=colors)
    ax[int(i / 2), i % 2].set_title(fm, size=8)
In [11]:
fm_groups = [['A1 SH', 'B1 SH', 'B2 SH', 'B3 SH', 'B4 SH'],
['B5 SH', 'C SH'],
['A1 LM', 'C LM'],
['B1 LM', 'B3 LM', 'B4 LM'],
['B2 LM', 'B5 LM']]
fm_group_dict = {fm:i for i, l in enumerate(fm_groups) for fm in l}
df['FM_GRP'] = df.Formation.map(fm_group_dict)
In [12]:
df = pd.get_dummies(df, prefix='FM_GRP', columns=['FM_GRP'])
In [13]:
def archie(df):
    # Archie water saturation with n = m = 2 and a lumped a*Rw = 0.08;
    # PHIND is used as-is here, so treat SW as a relative feature
    return np.sqrt(0.08 / ((df.PHIND ** 2) * (10 ** df.ILD_log10)))
In [14]:
df['SW'] = df.apply(lambda x: archie(x), axis=1)
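For reference, the archie function is Archie's equation,

$$S_w = \left(\frac{a\,R_w}{\phi^m\,R_t}\right)^{1/n}$$

with the common exponents $m = n = 2$, a lumped $a\,R_w = 0.08$, and $R_t = 10^{\mathrm{ILD\_log10}}$. Note that PHIND is used as-is rather than divided by 100 as in the porosity functions above, so SW here is best read as a relative feature rather than a calibrated saturation.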
In [15]:
# modified from jesper
latlong = pd.DataFrame({"SHRIMPLIN": [37.978076, -100.987305], #
"ALEXANDER D": [37.6747257, -101.1675259], #
"SHANKLE": [38.0633799, -101.3920543], #
"LUKE G U": [37.4499614, -101.6121913], #
"KIMZEY A": [37.12289, -101.39697], #
"CROSS H CATTLE": [37.9105826, -101.6464517], #
"NOLAN": [37.7866294, -101.0451641], #?
"NEWBY": [37.3172442, -101.3546995], #
"CHURCHMAN BIBLE": [37.3497658, -101.1060761], #?
"STUART": [37.4857262, -101.1391063], #
"CRAWFORD": [37.1893654, -101.1494994], #?
"Recruit F9": [0,0]})
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    c = 2 * asin(sqrt(a))
    km = 6367 * c
    return km

def get_lat(df):
    return latlong[df['Well Name']][0]

def get_long(df):
    return latlong[df['Well Name']][1]
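As a quick sanity check of haversine (each latlong column holds [lat, lon], per get_lat/get_long above):

# e.g., distance in km between the SHRIMPLIN and ALEXANDER D wells
lat1, lon1 = latlong['SHRIMPLIN']
lat2, lon2 = latlong['ALEXANDER D']
print(haversine(lon1, lat1, lon2, lat2))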
In [16]:
df['LAT'] = df.apply(lambda x: get_lat(x), axis=1)
df['LON'] = df.apply(lambda x: get_long(x), axis=1)
dist_dict = {}
for k in latlong:
    dict_name = k + '_DISTANCES'
    k_dict = {}
    lat1 = latlong[k][0]
    lon1 = latlong[k][1]
    for l in latlong:
        lat2 = latlong[l][0]
        lon2 = latlong[l][1]
        if l == 'Recruit F9' or k == 'Recruit F9':
            # pseudo-well with no real location; distance set to zero
            dist = 0.0
        else:
            dist = haversine(lon1, lat1, lon2, lat2)
        k_dict[l] = dist
    dist_dict[dict_name] = k_dict
for i in dist_dict:
    df[i] = np.nan
    for j in dist_dict[i]:
        df.loc[df['Well Name'] == j, i] = dist_dict[i][j]
In [17]:
df0 = df[(df.PHIND <= 40) & (df['Well Name'] != 'CROSS H CATTLE')]
facies = df0['Facies'].values
wells = df0['Well Name'].values
keep_list0 = ['GR', 'ILD_log10', 'PHIND', 'PE', 'NM_M', 'RELPOS', 'RHOB_EST',
              'UR_CLY', 'UR_CAL']
fv0 = df0[keep_list0].values
clf0 = neighbors.KNeighborsClassifier(n_neighbors=56, weights='distance')
X0 = preprocessing.StandardScaler().fit(fv0).transform(fv0)
y0 = facies
# first-pass facies guess from KNN, predicted for the full dataset and
# used later as an input feature for the random forest
clf0.fit(X0, y0)
X1 = preprocessing.StandardScaler().fit(df[keep_list0].values).transform(df[keep_list0].values)
knn_pred = clf0.predict(X1)
df['KNN_FACIES'] = knn_pred
There is some bad log data in this dataset, which I'd guess is due to rugose hole. PHIND gets as high as 80%, which is certainly spurious, so I'll remove data with cross-plot porosity greater than 40% from the dataset. The CROSS H CATTLE well also looks pretty different from the others, so I'm going to remove it from the training set.
In [18]:
df1 = df.dropna()
df1 = df1[(df1['Well Name'] != 'CROSS H CATTLE') & (df1.PHIND < 40.0)]
facies = df1['Facies'].values
wells = df1['Well Name'].values
drop_list = ['Formation', 'Well Name', 'Facies', 'Depth', 'DPHI_EST', 'NPHI_EST', 'DeltaPHI',
'UMAA_EST', 'UR_QTZ', 'PE_EST', 'Recruit F9_DISTANCES', 'KIMZEY A_DISTANCES',
'NEWBY_DISTANCES', 'ALEXANDER D_DISTANCES', 'NOLAN_DISTANCES', 'FM_GRP_3']
fv = df1.drop(drop_list, axis=1).values
X = preprocessing.StandardScaler().fit(fv).transform(fv)
y = facies
ne_grid = [150]
mf_grid = [10]
md_grid = [None]
msl_grid = [5]
mss_grid = [20]
keys = ['n_estimators', 'max_features', 'max_depth', 'min_samples_leaf', 'min_samples_split']
param_sets = itertools.product(ne_grid, mf_grid, md_grid, msl_grid, mss_grid)
param_grid = [dict(zip(keys, i)) for i in param_sets]
clf_list = []
for i, d in enumerate(param_grid):
    clf = ensemble.RandomForestClassifier(n_estimators=d['n_estimators'],
                                          class_weight='balanced',
                                          min_samples_leaf=d['min_samples_leaf'],
                                          min_samples_split=d['min_samples_split'],
                                          max_features=d['max_features'],
                                          max_depth=d['max_depth'],
                                          n_jobs=-1)
    lpgo = LeavePGroupsOut(n_groups=2)
    scores_rfc = []
    for train, test in lpgo.split(X, y, groups=wells):
        clf.fit(X[train], y[train])
        score = clf.score(X[test], y[test])
        scores_rfc.append(score)
    print("Average leave-two-wells-out accuracy: %.6f" % np.mean(scores_rfc))
    clf_list.append((clf, np.mean(scores_rfc)))
np.max([i[1] for i in clf_list])
Out[18]:
In [19]:
list(zip(df1.drop(drop_list, axis=1).columns, clf.feature_importances_))
Out[19]:
Load validation data (vd), build features, and use the classifier from above to predict facies. Ultimately the PE_EST curve seemed to be slightly more predictive than the PE curve proper (?). I use that instead of PE in the classifier, so I need to compute it for the validation data as well.
In [20]:
# refit model to entire training set
clf.fit(X, y)
# load validation data
vd = pd.read_csv('../validation_data_nofacies.csv')
# compute extra log data features
vd['DPHI_EST'] = vd.apply(lambda x: estimate_dphi(x), axis=1).astype(float)
vd['RHOB_EST'] = vd.apply(lambda x: estimate_rhob(x), axis=1)
vd['NPHI_EST'] = vd.apply(lambda x: estimate_nphi(x), axis=1)
vd['RHOMAA_EST'] = vd.apply(lambda x: compute_rhomaa(x), axis=1)
# predict missing PE values
drop_list_vd = ['Formation', 'Well Name', 'Depth', 'PE', 'RELPOS']
fv_vd = vd.drop(drop_list_vd, axis=1).values
X_vd = preprocessing.StandardScaler().fit(fv_vd).transform(fv_vd)
vd['PE_EST'] = reg.predict(X_vd)
vd.PE = vd.PE.combine_first(vd.PE_EST)
vd['UMAA_EST'] = vd.apply(lambda x: compute_umaa(x), axis=1)
# Estimate lithology using Umaa Rhomaa solution
vd['UR_QTZ'] = np.nan
vd['UR_CLY'] = np.nan
vd['UR_CAL'] = np.nan
vd['UR_DOL'] = np.nan
vd.loc[vd.GR >= 40, 'UR_QTZ'] = vd.loc[vd.GR >= 40].apply(lambda x: ur1.get_qtz(x.UMAA_EST, x.RHOMAA_EST), axis=1)
vd.loc[vd.GR >= 40, 'UR_CLY'] = vd.loc[vd.GR >= 40].apply(lambda x: ur1.get_dol(x.UMAA_EST, x.RHOMAA_EST), axis=1)
vd.loc[vd.GR >= 40, 'UR_CAL'] = vd.loc[vd.GR >= 40].apply(lambda x: ur1.get_cal(x.UMAA_EST, x.RHOMAA_EST), axis=1)
vd.loc[vd.GR >= 40, 'UR_DOL'] = 0
vd.loc[vd.GR < 40, 'UR_QTZ'] = vd.loc[vd.GR < 40].apply(lambda x: ur2.get_qtz(x.UMAA_EST, x.RHOMAA_EST), axis=1)
vd.loc[vd.GR < 40, 'UR_DOL'] = vd.loc[vd.GR < 40].apply(lambda x: ur2.get_dol(x.UMAA_EST, x.RHOMAA_EST), axis=1)
vd.loc[vd.GR < 40, 'UR_CAL'] = vd.loc[vd.GR < 40].apply(lambda x: ur2.get_cal(x.UMAA_EST, x.RHOMAA_EST), axis=1)
vd.loc[vd.GR < 40, 'UR_CLY'] = 0
# Formation grouping
vd['FM_GRP'] = vd.Formation.map(fm_group_dict)
vd = pd.get_dummies(vd, prefix='FM_GRP', columns=['FM_GRP'])
# Water saturation
vd['SW'] = vd.apply(lambda x: archie(x), axis=1)
# Lat-long features
vd['LAT'] = vd.apply(lambda x: get_lat(x), axis=1)
vd['LON'] = vd.apply(lambda x: get_long(x), axis=1)
for i in dist_dict:
    vd[i] = np.nan
    for j in dist_dict[i]:
        vd.loc[vd['Well Name'] == j, i] = dist_dict[i][j]
# Compute first guess at facies with KNN
X2 = preprocessing.StandardScaler().fit(vd[keep_list0].values).transform(vd[keep_list0].values)
vd['KNN_FACIES'] = clf0.predict(X2)
# Apply final model
drop_list1 = ['Formation', 'Well Name', 'Depth', 'DPHI_EST', 'NPHI_EST', 'DeltaPHI',
'UMAA_EST', 'UR_QTZ', 'PE', 'Recruit F9_DISTANCES', 'KIMZEY A_DISTANCES',
'NEWBY_DISTANCES', 'ALEXANDER D_DISTANCES', 'NOLAN_DISTANCES', 'FM_GRP_3']
fv_vd1 = vd.drop(drop_list1, axis=1).values
X_vd1 = preprocessing.StandardScaler().fit(fv_vd1).transform(fv_vd1)
vd_predicted_facies = clf.predict(X_vd1)
In [21]:
vd['Facies'] = vd_predicted_facies
vd.to_csv('RFC_submission_3_predictions.csv')
In [22]:
vd_predicted_facies
Out[22]: