Facies classification using Machine Learning

Bird Team: PG+AC


In [9]:
%matplotlib inline
# Core scientific stack plus the random-forest classifier used below.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Display/option tweaks: truncate printed frames to 10 rows, and silence
# pandas' SettingWithCopy warning (chained assignment is used later on).
from pandas import set_option
set_option("display.max_rows", 10)
pd.options.mode.chained_assignment = None

In [16]:
# Load the labeled training data and show which wells it covers.
training_path = './../training_data.csv'
training_data = pd.read_csv(training_path)
print(set(training_data["Well Name"]))
training_data.head()


set(['SHRIMPLIN', 'Recruit F9', 'SHANKLE', 'CHURCHMAN BIBLE', 'NOLAN', 'NEWBY', 'LUKE G U', 'CROSS H CATTLE'])
Out[16]:
Facies Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS
0 3 A1 SH SHRIMPLIN 2793.0 77.45 0.664 9.9 11.915 4.6 1 1.000
1 3 A1 SH SHRIMPLIN 2793.5 78.26 0.661 14.2 12.565 4.1 1 0.979
2 3 A1 SH SHRIMPLIN 2794.0 79.05 0.658 14.8 13.050 3.6 1 0.957
3 3 A1 SH SHRIMPLIN 2794.5 86.10 0.655 13.9 13.115 3.5 1 0.936
4 3 A1 SH SHRIMPLIN 2795.0 74.58 0.647 13.5 13.300 3.4 1 0.915

In [48]:
# Load the unlabeled validation wells (the facies we have to predict).
well_data = pd.read_csv('./../validation_data_nofacies.csv')
print(set(well_data["Well Name"]))
print(well_data.shape)
well_data.head()


set(['CRAWFORD', 'STUART'])
(830, 10)
Out[48]:
Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS
0 A1 SH STUART 2808.0 66.276 0.630 3.3 10.65 3.591 1 1.000
1 A1 SH STUART 2808.5 77.252 0.585 6.5 11.95 3.341 1 0.978
2 A1 SH STUART 2809.0 82.899 0.566 9.4 13.60 3.064 1 0.956
3 A1 SH STUART 2809.5 80.671 0.593 9.5 13.25 2.977 1 0.933
4 A1 SH STUART 2810.0 75.971 0.638 8.7 12.35 3.020 1 0.911

In [18]:
# Tag each row with its origin, then stack test on top of train so the
# feature engineering below is applied identically to both sets.
well_data["origin"] = 'test'
training_data["origin"] = 'train'
frames = [well_data, training_data]
df = pd.concat(frames, axis=0, ignore_index=True)[list(training_data.columns)]
df['Well Name'] = df['Well Name'].astype('category')
df.head(10)


Out[18]:
Facies Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS origin
0 NaN A1 SH STUART 2808.0 66.276 0.630 3.3 10.65 3.591 1 1.000 test
1 NaN A1 SH STUART 2808.5 77.252 0.585 6.5 11.95 3.341 1 0.978 test
2 NaN A1 SH STUART 2809.0 82.899 0.566 9.4 13.60 3.064 1 0.956 test
3 NaN A1 SH STUART 2809.5 80.671 0.593 9.5 13.25 2.977 1 0.933 test
4 NaN A1 SH STUART 2810.0 75.971 0.638 8.7 12.35 3.020 1 0.911 test
5 NaN A1 SH STUART 2810.5 73.955 0.667 6.9 12.25 3.086 1 0.889 test
6 NaN A1 SH STUART 2811.0 77.962 0.674 6.5 12.45 3.092 1 0.867 test
7 NaN A1 SH STUART 2811.5 83.894 0.667 6.3 12.65 3.123 1 0.844 test
8 NaN A1 SH STUART 2812.0 84.424 0.653 6.7 13.05 3.121 1 0.822 test
9 NaN A1 SH STUART 2812.5 83.160 0.642 7.3 12.95 3.127 1 0.800 test

In [19]:
# Engineer extra features from the well data.

# Number of samples per (well, formation): may correlate with how soft
# the soil is? -- original author's hypothesis.
print("session")
grp = df.groupby(["Well Name", 'Formation']).size().reset_index()
grp.columns = ["Well Name", 'Formation', 'formation_size']
df = pd.merge(df, grp, how='left', on=["Well Name", 'Formation'])

# Depth extent of each formation within each well.
print("depth")
grp = df.groupby(["Well Name", 'Formation'])["Depth"].min().reset_index()
grp.columns = ["Well Name", 'Formation', 'minimum_depth']
df = pd.merge(df, grp, how='left', on=["Well Name", 'Formation'])

grp = df.groupby(["Well Name", 'Formation'])["Depth"].max().reset_index()
grp.columns = ["Well Name", 'Formation', 'maximum_depth']
df = pd.merge(df, grp, how='left', on=["Well Name", 'Formation'])

df['formation_depth'] = df["maximum_depth"] - df["minimum_depth"]

# Depth span divided by sample count within the formation.
df["soft_indic"] = df['formation_depth'] / df["formation_size"]

# Per-(well, formation) aggregate statistics of each log.
print("add avgs of feat")
list_to_avg = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_avg:
    df[val + "_min"] = df.groupby(["Well Name", 'Formation'])[val].transform(np.min)
    df[val + "_max"] = df.groupby(["Well Name", 'Formation'])[val].transform(np.max)
    df[val + "_mean"] = df.groupby(["Well Name", 'Formation'])[val].transform(np.mean)
    df[val + "_var"] = df.groupby(["Well Name", 'Formation'])[val].transform(np.var)

# Distances to the group statistics: an attempt at regularization.
print("add distances feat.")
for val in list_to_avg:
    df[val + "_min_dist"] = df[val] - df[val + "_min"]
    df[val + "_max_dist"] = df[val] - df[val + "_max"]
    df[val + "_mean_dist"] = df[val] - df[val + "_mean"]

# Lag and lead differences, computed within each well so that values
# never leak across well boundaries.
print("lag lead")
list_to_lag = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_lag:
    for lag in range(1, 11):
        df[val + '_lag_' + str(lag)] = df[val] - df.groupby("Well Name")[val].shift(periods=lag)
        df[val + '_lead_' + str(lag)] = df[val] - df.groupby("Well Name")[val].shift(periods=-lag)

# Formation lag/lead columns, plus 0/1 flags marking formation changes.
for lag in range(1, 3):
    df['Formation' + '_lag_' + str(lag)] = df.groupby("Well Name")['Formation'].shift(periods=lag)
    df['Formation' + '_lead_' + str(lag)] = df.groupby("Well Name")['Formation'].shift(periods=-lag)
    df['Formation' + '_lag_' + str(lag) + 'equal'] = (df['Formation' + '_lag_' + str(lag)] == df["Formation"]).astype(int)
    df['Formation' + '_lead_' + str(lag) + 'equal'] = (df['Formation' + '_lead_' + str(lag)] == df["Formation"]).astype(int)

print("rolling")
# Rolling-window statistics per well.
# BUG FIX: the column suffix previously used str(lag) -- a stale loop
# variable left over from the Formation loop above (always 2) -- instead
# of the rolling window size. Use the window size so names are accurate.
list_to_roll = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'RELPOS']
window_size = 5
for val in list_to_roll:
    df[val + '_rollingmean_' + str(window_size)] = df.groupby("Well Name")[val].apply(lambda x: x.rolling(window=window_size).mean())
    df[val + '_rollingmax_' + str(window_size)] = df.groupby("Well Name")[val].apply(lambda x: x.rolling(window=window_size).max())
    df[val + '_rollingmin_' + str(window_size)] = df.groupby("Well Name")[val].apply(lambda x: x.rolling(window=window_size).min())
    df[val + '_rollingstd_' + str(window_size)] = df.groupby("Well Name")[val].apply(lambda x: x.rolling(window=window_size).std())

# Fill NaNs introduced by shifts/rolling within each well: backward fill
# first, then forward fill for trailing gaps.
print("filling na")
df = df.groupby(["Well Name"], as_index=False).apply(lambda group: group.bfill())
df = df.groupby(["Well Name"], as_index=False).apply(lambda group: group.ffill())


session
depth
add avgs of feat
add distances feat.
lag lead
rolling
filling na

In [22]:
# Keep only the raw logs, the Formation context columns, and the
# first-order lag/lead features for modeling.
base_cols = ['Facies', 'origin', 'Formation', 'Formation_lag_1', 'Formation_lead_1',
             'Formation_lag_2', 'Formation_lead_2',
             'Well Name', 'Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
nums = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
lag_cols = [c + '_lag_1' for c in nums]
lead_cols = [c + '_lead_1' for c in nums]
df = df[base_cols + lag_cols + lead_cols]

In [23]:
# Bag-of-words encode the Formation name columns; tokens inside a
# formation name (e.g. "A1 SH") become separate indicator columns.
from sklearn.feature_extraction.text import CountVectorizer
formation_cols = ['Formation',
 'Formation_lag_1',
 'Formation_lead_1',
 'Formation_lag_2',
 'Formation_lead_2']
for col in formation_cols:
    vectorizer = CountVectorizer()
    token_counts = vectorizer.fit_transform(df[col].values)
    new_names = [token + "_" + col for token in vectorizer.get_feature_names()]
    encoded = pd.DataFrame(token_counts.toarray(), columns=new_names)
    df = df.drop(col, axis=1)
    df = pd.concat([df, encoded], axis=1)

Cross-validation performance: hold out the CHURCHMAN BIBLE well and evaluate on it


In [37]:
# Random-forest hyper-parameters.
max_depth = 10
n_estimators = 1000
min_samples_leaf = 10
rf_params = dict(max_depth=max_depth,
                 n_estimators=n_estimators,
                 min_samples_leaf=min_samples_leaf)
clf = RandomForestClassifier(**rf_params)

In [38]:
# Hold out the CHURCHMAN BIBLE well for validation; train on all other
# labeled wells.
# FIX: the Python 2 `<>` operator (removed in Python 3) is replaced with `!=`.
is_train = df['origin'] == 'train'
is_holdout = df['Well Name'] == 'CHURCHMAN BIBLE'
ytrain = df[is_train & ~is_holdout]['Facies']
yvalid = df[is_train & is_holdout]['Facies']
xtrain = df[is_train & ~is_holdout].drop(['Well Name', 'origin', 'Facies'], axis=1)
xvalid = df[is_train & is_holdout].drop(['Well Name', 'origin', 'Facies'], axis=1)

In [39]:
clf.fit(xtrain,ytrain)


Out[39]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [40]:
preds = clf.predict(xvalid)

In [41]:
from sklearn.metrics import classification_report
# FIX: use the print() function (Python 3 compatible) -- the rest of the
# notebook already calls print() as a function.
print(classification_report(yvalid, preds))


             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00         8
        2.0       0.66      0.62      0.64        56
        3.0       0.62      0.73      0.67        51
        4.0       0.38      0.69      0.49        13
        5.0       0.56      0.67      0.61        30
        6.0       0.64      0.68      0.66        87
        7.0       0.00      0.00      0.00        34
        8.0       0.55      0.51      0.53        75
        9.0       0.67      0.94      0.78        50

avg / total       0.55      0.61      0.57       404

Apply the model to the unlabeled test wells (STUART and CRAWFORD)


In [42]:
# This time use every labeled well for training; the 'test' rows are the
# unlabeled wells whose facies we must predict.
train_mask = df['origin'] == 'train'
test_mask = df['origin'] == 'test'
drop_cols = ['Well Name', 'origin', 'Facies']
ytrain = df[train_mask]['Facies']
yvalid = df[test_mask]['Facies']
xtrain = df[train_mask].drop(drop_cols, axis=1)
xvalid = df[test_mask].drop(drop_cols, axis=1)

In [50]:
#clf.fit(xtrain,ytrain)

In [43]:
preds = clf.predict(xvalid.values)

In [49]:
# Attach the predicted facies and write the submission file.
xvalid = xvalid.assign(Facies=preds)
xvalid.to_csv('XmasPreds_2.csv')