Facies classification using Machine Learning

Bird Team: PG+AC


In [9]:
%matplotlib inline
# Core scientific stack plus the random-forest classifier used below.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Display/option tweaks: truncate printed frames to 10 rows, and silence
# pandas' SettingWithCopy warning (chained assignment is used later on).
from pandas import set_option
set_option("display.max_rows", 10)
pd.options.mode.chained_assignment = None

In [16]:
# Load the labeled training data and show which wells it covers.
training_path = './../training_data.csv'
training_data = pd.read_csv(training_path)
print(set(training_data["Well Name"]))
training_data.head()


set(['SHRIMPLIN', 'Recruit F9', 'SHANKLE', 'CHURCHMAN BIBLE', 'NOLAN', 'NEWBY', 'LUKE G U', 'CROSS H CATTLE'])
Out[16]:
Facies Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS
0 3 A1 SH SHRIMPLIN 2793.0 77.45 0.664 9.9 11.915 4.6 1 1.000
1 3 A1 SH SHRIMPLIN 2793.5 78.26 0.661 14.2 12.565 4.1 1 0.979
2 3 A1 SH SHRIMPLIN 2794.0 79.05 0.658 14.8 13.050 3.6 1 0.957
3 3 A1 SH SHRIMPLIN 2794.5 86.10 0.655 13.9 13.115 3.5 1 0.936
4 3 A1 SH SHRIMPLIN 2795.0 74.58 0.647 13.5 13.300 3.4 1 0.915

In [48]:
# Load the unlabeled validation wells (the facies we have to predict).
well_data = pd.read_csv('./../validation_data_nofacies.csv')
print(set(well_data["Well Name"]))
print(well_data.shape)
well_data.head()


set(['CRAWFORD', 'STUART'])
(830, 10)
Out[48]:
Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS
0 A1 SH STUART 2808.0 66.276 0.630 3.3 10.65 3.591 1 1.000
1 A1 SH STUART 2808.5 77.252 0.585 6.5 11.95 3.341 1 0.978
2 A1 SH STUART 2809.0 82.899 0.566 9.4 13.60 3.064 1 0.956
3 A1 SH STUART 2809.5 80.671 0.593 9.5 13.25 2.977 1 0.933
4 A1 SH STUART 2810.0 75.971 0.638 8.7 12.35 3.020 1 0.911

In [18]:
# Tag each row with its origin, then stack test on top of train so the
# feature engineering below is applied identically to both sets.
well_data["origin"] = 'test'
training_data["origin"] = 'train'
frames = [well_data, training_data]
df = pd.concat(frames, axis=0, ignore_index=True)[list(training_data.columns)]
df['Well Name'] = df['Well Name'].astype('category')
df.head(10)


Out[18]:
Facies Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS origin
0 NaN A1 SH STUART 2808.0 66.276 0.630 3.3 10.65 3.591 1 1.000 test
1 NaN A1 SH STUART 2808.5 77.252 0.585 6.5 11.95 3.341 1 0.978 test
2 NaN A1 SH STUART 2809.0 82.899 0.566 9.4 13.60 3.064 1 0.956 test
3 NaN A1 SH STUART 2809.5 80.671 0.593 9.5 13.25 2.977 1 0.933 test
4 NaN A1 SH STUART 2810.0 75.971 0.638 8.7 12.35 3.020 1 0.911 test
5 NaN A1 SH STUART 2810.5 73.955 0.667 6.9 12.25 3.086 1 0.889 test
6 NaN A1 SH STUART 2811.0 77.962 0.674 6.5 12.45 3.092 1 0.867 test
7 NaN A1 SH STUART 2811.5 83.894 0.667 6.3 12.65 3.123 1 0.844 test
8 NaN A1 SH STUART 2812.0 84.424 0.653 6.7 13.05 3.121 1 0.822 test
9 NaN A1 SH STUART 2812.5 83.160 0.642 7.3 12.95 3.127 1 0.800 test

In [19]:
# Engineer extra features from the well data.

# Number of samples per (well, formation): may correlate with how soft
# the soil is? -- original author's hypothesis.
print("session")
grp = df.groupby(["Well Name", 'Formation']).size().reset_index()
grp.columns = ["Well Name", 'Formation', 'formation_size']
df = pd.merge(df, grp, how='left', on=["Well Name", 'Formation'])

# Depth extent of each formation within each well.
print("depth")
grp = df.groupby(["Well Name", 'Formation'])["Depth"].min().reset_index()
grp.columns = ["Well Name", 'Formation', 'minimum_depth']
df = pd.merge(df, grp, how='left', on=["Well Name", 'Formation'])

grp = df.groupby(["Well Name", 'Formation'])["Depth"].max().reset_index()
grp.columns = ["Well Name", 'Formation', 'maximum_depth']
df = pd.merge(df, grp, how='left', on=["Well Name", 'Formation'])

df['formation_depth'] = df["maximum_depth"] - df["minimum_depth"]

# Depth span divided by sample count within the formation.
df["soft_indic"] = df['formation_depth'] / df["formation_size"]

# Per-(well, formation) aggregate statistics of each log.
print("add avgs of feat")
list_to_avg = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_avg:
    df[val + "_min"] = df.groupby(["Well Name", 'Formation'])[val].transform(np.min)
    df[val + "_max"] = df.groupby(["Well Name", 'Formation'])[val].transform(np.max)
    df[val + "_mean"] = df.groupby(["Well Name", 'Formation'])[val].transform(np.mean)
    df[val + "_var"] = df.groupby(["Well Name", 'Formation'])[val].transform(np.var)

# Distances to the group statistics: an attempt at regularization.
print("add distances feat.")
for val in list_to_avg:
    df[val + "_min_dist"] = df[val] - df[val + "_min"]
    df[val + "_max_dist"] = df[val] - df[val + "_max"]
    df[val + "_mean_dist"] = df[val] - df[val + "_mean"]

# Lag and lead differences, computed within each well so that values
# never leak across well boundaries.
print("lag lead")
list_to_lag = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_lag:
    for lag in range(1, 11):
        df[val + '_lag_' + str(lag)] = df[val] - df.groupby("Well Name")[val].shift(periods=lag)
        df[val + '_lead_' + str(lag)] = df[val] - df.groupby("Well Name")[val].shift(periods=-lag)

# Formation lag/lead columns, plus 0/1 flags marking formation changes.
for lag in range(1, 3):
    df['Formation' + '_lag_' + str(lag)] = df.groupby("Well Name")['Formation'].shift(periods=lag)
    df['Formation' + '_lead_' + str(lag)] = df.groupby("Well Name")['Formation'].shift(periods=-lag)
    df['Formation' + '_lag_' + str(lag) + 'equal'] = (df['Formation' + '_lag_' + str(lag)] == df["Formation"]).astype(int)
    df['Formation' + '_lead_' + str(lag) + 'equal'] = (df['Formation' + '_lead_' + str(lag)] == df["Formation"]).astype(int)

print("rolling")
# Rolling-window statistics per well.
# BUG FIX: the column suffix previously used str(lag) -- a stale loop
# variable left over from the Formation loop above (always 2) -- instead
# of the rolling window size. Use the window size so names are accurate.
list_to_roll = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'RELPOS']
window_size = 5
for val in list_to_roll:
    df[val + '_rollingmean_' + str(window_size)] = df.groupby("Well Name")[val].apply(lambda x: x.rolling(window=window_size).mean())
    df[val + '_rollingmax_' + str(window_size)] = df.groupby("Well Name")[val].apply(lambda x: x.rolling(window=window_size).max())
    df[val + '_rollingmin_' + str(window_size)] = df.groupby("Well Name")[val].apply(lambda x: x.rolling(window=window_size).min())
    df[val + '_rollingstd_' + str(window_size)] = df.groupby("Well Name")[val].apply(lambda x: x.rolling(window=window_size).std())

# Fill NaNs introduced by shifts/rolling within each well: backward fill
# first, then forward fill for trailing gaps.
print("filling na")
df = df.groupby(["Well Name"], as_index=False).apply(lambda group: group.bfill())
df = df.groupby(["Well Name"], as_index=False).apply(lambda group: group.ffill())


session
depth
add avgs of feat
add distances feat.
lag lead
rolling
filling na

In [22]:
# Keep only the raw logs, the Formation context columns, and the
# first-order lag/lead features for modeling.
base_cols = ['Facies', 'origin', 'Formation', 'Formation_lag_1', 'Formation_lead_1',
             'Formation_lag_2', 'Formation_lead_2',
             'Well Name', 'Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
nums = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
lag_cols = [c + '_lag_1' for c in nums]
lead_cols = [c + '_lead_1' for c in nums]
df = df[base_cols + lag_cols + lead_cols]

In [23]:
# Bag-of-words encode the Formation name columns; tokens inside a
# formation name (e.g. "A1 SH") become separate indicator columns.
from sklearn.feature_extraction.text import CountVectorizer
formation_cols = ['Formation',
 'Formation_lag_1',
 'Formation_lead_1',
 'Formation_lag_2',
 'Formation_lead_2']
for col in formation_cols:
    vectorizer = CountVectorizer()
    token_counts = vectorizer.fit_transform(df[col].values)
    new_names = [token + "_" + col for token in vectorizer.get_feature_names()]
    encoded = pd.DataFrame(token_counts.toarray(), columns=new_names)
    df = df.drop(col, axis=1)
    df = pd.concat([df, encoded], axis=1)

Cross-validation performance: hold out the CHURCHMAN BIBLE well and evaluate on it


In [37]:
# Random-forest hyper-parameters.
max_depth = 10
n_estimators = 1000
min_samples_leaf = 10
rf_params = dict(max_depth=max_depth,
                 n_estimators=n_estimators,
                 min_samples_leaf=min_samples_leaf)
clf = RandomForestClassifier(**rf_params)

In [38]:
# Hold out the CHURCHMAN BIBLE well for validation; train on all other
# labeled wells.
# FIX: the Python 2 `<>` operator (removed in Python 3) is replaced with `!=`.
is_train = df['origin'] == 'train'
is_holdout = df['Well Name'] == 'CHURCHMAN BIBLE'
ytrain = df[is_train & ~is_holdout]['Facies']
yvalid = df[is_train & is_holdout]['Facies']
xtrain = df[is_train & ~is_holdout].drop(['Well Name', 'origin', 'Facies'], axis=1)
xvalid = df[is_train & is_holdout].drop(['Well Name', 'origin', 'Facies'], axis=1)

In [39]:
clf.fit(xtrain,ytrain)


Out[39]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [40]:
preds = clf.predict(xvalid)

In [41]:
from sklearn.metrics import classification_report
# FIX: use the print() function (Python 3 compatible) -- the rest of the
# notebook already calls print() as a function.
print(classification_report(yvalid, preds))


             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00         8
        2.0       0.66      0.62      0.64        56
        3.0       0.62      0.73      0.67        51
        4.0       0.38      0.69      0.49        13
        5.0       0.56      0.67      0.61        30
        6.0       0.64      0.68      0.66        87
        7.0       0.00      0.00      0.00        34
        8.0       0.55      0.51      0.53        75
        9.0       0.67      0.94      0.78        50

avg / total       0.55      0.61      0.57       404

Apply the model to the unlabeled test wells (STUART and CRAWFORD)


In [42]:
# This time use every labeled well for training; the 'test' rows are the
# unlabeled wells whose facies we must predict.
train_mask = df['origin'] == 'train'
test_mask = df['origin'] == 'test'
drop_cols = ['Well Name', 'origin', 'Facies']
ytrain = df[train_mask]['Facies']
yvalid = df[test_mask]['Facies']
xtrain = df[train_mask].drop(drop_cols, axis=1)
xvalid = df[test_mask].drop(drop_cols, axis=1)

In [50]:
#clf.fit(xtrain,ytrain)

In [43]:
preds = clf.predict(xvalid.values)

In [49]:
# Attach the predicted facies and write the submission file.
xvalid = xvalid.assign(Facies=preds)
xvalid.to_csv('XmasPreds_2.csv')