Facies classification using Machine Learning

Bird Team: PG+AC



In [142]:

    
%matplotlib inline
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score,LeavePGroupsOut, LeaveOneGroupOut, cross_val_predict
from sklearn.metrics import confusion_matrix, make_scorer, f1_score, accuracy_score, recall_score, precision_score
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable
pd.options.mode.chained_assignment = None



In [143]:

    
filename = '../facies_vectors.csv'
training_data = pd.read_csv(filename)
print(set(training_data["Well Name"]))
training_data.head()









    



set(['SHRIMPLIN', 'Recruit F9', 'ALEXANDER D', 'SHANKLE', 'CHURCHMAN BIBLE', 'NOLAN', 'KIMZEY A', 'NEWBY', 'LUKE G U', 'CROSS H CATTLE'])






    Out[143]:






  
    
      
      Facies
      Formation
      Well Name
      Depth
      GR
      ILD_log10
      DeltaPHI
      PHIND
      PE
      NM_M
      RELPOS
    
  
  
    
      0
      3
      A1 SH
      SHRIMPLIN
      2793.0
      77.45
      0.664
      9.9
      11.915
      4.6
      1
      1.000
    
    
      1
      3
      A1 SH
      SHRIMPLIN
      2793.5
      78.26
      0.661
      14.2
      12.565
      4.1
      1
      0.979
    
    
      2
      3
      A1 SH
      SHRIMPLIN
      2794.0
      79.05
      0.658
      14.8
      13.050
      3.6
      1
      0.957
    
    
      3
      3
      A1 SH
      SHRIMPLIN
      2794.5
      86.10
      0.655
      13.9
      13.115
      3.5
      1
      0.936
    
    
      4
      3
      A1 SH
      SHRIMPLIN
      2795.0
      74.58
      0.647
      13.5
      13.300
      3.4
      1
      0.915



In [144]:

    
well_data = pd.read_csv('./../validation_data_nofacies.csv')
print(set(well_data["Well Name"]))
print(well_data.shape)
well_data.head()









    



set(['CRAWFORD', 'STUART'])
(830, 10)






    Out[144]:






  
    
      
      Formation
      Well Name
      Depth
      GR
      ILD_log10
      DeltaPHI
      PHIND
      PE
      NM_M
      RELPOS
    
  
  
    
      0
      A1 SH
      STUART
      2808.0
      66.276
      0.630
      3.3
      10.65
      3.591
      1
      1.000
    
    
      1
      A1 SH
      STUART
      2808.5
      77.252
      0.585
      6.5
      11.95
      3.341
      1
      0.978
    
    
      2
      A1 SH
      STUART
      2809.0
      82.899
      0.566
      9.4
      13.60
      3.064
      1
      0.956
    
    
      3
      A1 SH
      STUART
      2809.5
      80.671
      0.593
      9.5
      13.25
      2.977
      1
      0.933
    
    
      4
      A1 SH
      STUART
      2810.0
      75.971
      0.638
      8.7
      12.35
      3.020
      1
      0.911



In [145]:

    
# concat train and test for processing 
well_data["origin"] = 'test'
training_data["origin"] = 'train'
df = pd.concat([well_data,training_data],axis=0,ignore_index=True)[list(training_data.columns)]
df['Well Name'] = df['Well Name'].astype('category')
df.head(10)



In [146]:

    
# add some features based on the well data. 

# nb points : can be correlated with how soft soil is ? 
print("session")
sessionsize = df.groupby(["Well Name",'Formation']).size().reset_index()
sessionsize.columns =  ["Well Name",'Formation','formation_size']
df = pd.merge(df,sessionsize,how='left',on = ["Well Name",'Formation'])

# depth : 
print("depth")
sessionsize = df.groupby(["Well Name",'Formation'])["Depth"].min().reset_index()
sessionsize.columns =  ["Well Name",'Formation','minimum_depth']
df = pd.merge(df,sessionsize,how='left',on = ["Well Name",'Formation'])

sessionsize = df.groupby(["Well Name",'Formation'])["Depth"].max().reset_index()
sessionsize.columns =  ["Well Name",'Formation','maximum_depth']
df = pd.merge(df,sessionsize,how='left',on = ["Well Name",'Formation'])

df['formation_depth'] = df["maximum_depth"] - df["minimum_depth"]

df["soft_indic"] = df['formation_depth'] / df["formation_size"]

# add avgs of feat
print("add avgs of feat")
list_to_avg = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_avg : 
    df[val + "_min"] = df.groupby(["Well Name",'Formation'])[val].transform(np.min)
    df[val + "_max"] = df.groupby(["Well Name",'Formation'])[val].transform(np.max)
    df[val + "_mean"] = df.groupby(["Well Name",'Formation'])[val].transform(np.mean)
    df[val + "_var"] = df.groupby(["Well Name",'Formation'])[val].transform(np.var) 

# add distances feat. = an attempt at regulariation.
print("add distances feat.")
for val in list_to_avg : 
    df[val + "_min_dist"] = df[val] -df[val + "_min"]
    df[val + "_max_dist"] =  df[val] -df[val + "_max"]
    df[val + "_mean_dist"] =  df[val] -df[val + "_mean"]
    
# add lag and lead !
print("lag lead")
list_to_lag = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_lag:
    for lag in range(1,11):
        df[val+'_lag_'+str(lag)]=df[val]-df.groupby("Well Name")[val].shift(periods=lag)
        df[val+'_lead_'+str(lag)]=df[val]-df.groupby("Well Name")[val].shift(periods=-lag)

# adding some Formation lag and lead. 
for lag in range(1,3):
    df['Formation'+'_lag_'+str(lag)]=df.groupby("Well Name")['Formation'].shift(periods=lag)
    df['Formation'+'_lead_'+str(lag)]=df.groupby("Well Name")['Formation'].shift(periods=-lag)
    df['Formation'+'_lag_'+str(lag) + 'equal'] = (df['Formation'+'_lag_'+str(lag)] == df["Formation"]).astype(int)
    df['Formation'+'_lead_'+str(lag) + 'equal'] = (df['Formation'+'_lead_'+str(lag)] == df["Formation"]).astype(int) 

print("rolling")
#Add rolling features
list_to_roll = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M','RELPOS']
window_size = [5,10,15,20,50]
for w in window_size:
    for val in list_to_roll:
        df[val+'_rollingmean_'+str(w)]=df.groupby("Well Name")[val].apply(
            lambda x:x.rolling(window=w,center=True).mean())
        df[val+'_rollingmax_'+str(w)]=df.groupby("Well Name")[val].apply(
            lambda x:x.rolling(window=w,center=True).max())
        df[val+'_rollingmin_'+str(w)]=df.groupby("Well Name")[val].apply(
            lambda x:x.rolling(window=w,center=True).min())
        df[val+'_rollingstd_'+str(w)]=df.groupby("Well Name")[val].apply(
            lambda x:x.rolling(window=w,center=True).std())
        
print("special window features for NM_M")
def NM_M_distance(x,how,target):
    length = len(x)
    rank = np.empty(length)
    count = -1
    NM_M = x["NM_M"].values
    if how=="up":
        order = range(length)
    elif how=="down":
        order = range(length-1,-1,-1)
    for i in order:
        if ((NM_M[i] != target) & (count>-1)):
            count+=1
            rank[i] += count
        elif NM_M[i] == target:
            count=0
        else:
            rank[i] = count
    rank = pd.DataFrame(rank.astype(int), columns=["NM_M_Rank_Target_+"+str(target)+"_"+how], index = x.index)
    return(rank)
df["NM_M_Rank_Target_1_up"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="up",target=1)
df["NM_M_Rank_Target_2_up"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="up",target=2)
df["NM_M_Rank_Target_1_down"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="down",target=1)
df["NM_M_Rank_Target_2_down"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="down",target=2)

print("filling na")
df = df.groupby(["Well Name"], as_index=False).apply(lambda group: group.bfill())
df = df.groupby(["Well Name"], as_index=False).apply(lambda group: group.ffill())
df = df.fillna(df.mean())

print("Vectorizing Formation text data")
from sklearn.feature_extraction.text import CountVectorizer
list_formation = ['Formation',
 'Formation_lag_1',
 'Formation_lead_1',
 'Formation_lag_2',
 'Formation_lead_2']
for l in list_formation:
    cv = CountVectorizer()
    counts = cv.fit_transform(df[l].values)
    cols = [c+"_"+l for c in cv.get_feature_names()]
    counts = pd.DataFrame(counts.toarray(),columns = cols)
    df = df.drop(l,axis = 1)
    df = pd.concat([df,counts],axis=1)

print("Finished preparing data. Now ready for ML ignition!")









    



session
depth
add avgs of feat
add distances feat.
lag lead
rolling
special window features for NM_M
filling na
Vectorizing Formation text data
Finished preparing data. Now ready for ML ignition!



In [189]:

    
#tokeep =['Facies','origin','Formation','Formation_lag_1','Formation_lead_1','Formation_lag_2','Formation_lead_2',
#         'Well Name','Depth','GR','ILD_log10','DeltaPHI','PHIND','PE','NM_M','RELPOS']
#nums = ['Depth','GR','ILD_log10','DeltaPHI','PHIND','PE','NM_M','RELPOS']
#tokeep = tokeep + [x+'_lag_1' for x in nums] +[x+'_lead_1' for x in nums]
#df = df[tokeep]

CV performance



In [147]:

    
f1_micro = make_scorer(f1_score, average='micro')



In [148]:

    
# this time let's use all the training set 
groups = df[(df['origin']=='train')]["Well Name"]
ytrain = df[(df['origin']=='train')]['Facies']
yvalid = df[(df['origin']=='test')]['Facies']
xtrain = df[(df['origin']=='train')].drop(['Well Name','origin','Facies'],axis=1)
xvalid = df[(df['origin']=='test')].drop(['Well Name','origin','Facies'],axis=1)
custom_cv=LeaveOneGroupOut().split(xtrain, ytrain, groups)



In [149]:

    
clf_rfe = RandomForestClassifier(
    n_estimators=100,
    criterion="entropy",
    class_weight='balanced',
) 
clf_final = RandomForestClassifier(
    n_estimators=100,
    criterion="entropy",
    class_weight='balanced',
    min_samples_leaf=5,
    min_samples_split=25,
    max_features=10,
    n_jobs=-1
)
pipe = Pipeline([
  ('fs', RFECV(clf_rfe,
               cv=custom_cv,
               scoring=f1_micro,
               step=0.1,
               verbose=2)),
  ('cl', clf_final)
])



In [150]:

    
pipe.fit(xtrain, ytrain)









    



Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.






    Out[150]:





Pipeline(steps=[('fs', RFECV(cv=<generator object split at 0x1044c18c0>,
   estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='entropy', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1..._jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])



In [151]:

    
y_pred = pipe.predict(xtrain)
print(classification_report(ytrain, y_pred))
print(f1_score(ytrain, y_pred, average="micro"))









    



             precision    recall  f1-score   support

        1.0       0.88      1.00      0.93       268
        2.0       0.95      0.91      0.93       940
        3.0       0.94      0.93      0.93       780
        4.0       0.79      0.94      0.86       271
        5.0       0.87      0.86      0.87       296
        6.0       0.90      0.79      0.84       582
        7.0       0.76      0.98      0.85       141
        8.0       0.91      0.87      0.89       686
        9.0       0.94      0.98      0.96       185

avg / total       0.91      0.90      0.90      4149

0.902868161003



In [160]:

    
importances = pipe.named_steps["cl"].feature_importances_
support = pipe.named_steps["fs"].support_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
feature = pd.Series(xtrain.columns.values)
selected_features = list(feature[support])
for f in range(len(selected_features)):
    print("%d. feature %d %s (%f)" % (f + 1, indices[f], 
                                      selected_features[indices[f]], 
                                      importances[indices[f]]))









    



Feature ranking:
1. feature 9 NM_M_mean (0.085054)
2. feature 23 NM_M_rollingmean_5 (0.078424)
3. feature 5 NM_M (0.058173)
4. feature 42 PE_rollingmean_50 (0.031362)
5. feature 43 PE_rollingmax_50 (0.031326)
6. feature 48 sh_Formation (0.029579)
7. feature 35 NM_M_rollingmean_15 (0.025371)
8. feature 13 GR_rollingmin_5 (0.023771)
9. feature 1 GR (0.023272)
10. feature 40 RELPOS_rollingstd_20 (0.022150)
11. feature 32 RELPOS_rollingstd_10 (0.021844)
12. feature 29 PE_rollingmean_10 (0.020928)
13. feature 36 RELPOS_rollingstd_15 (0.020512)
14. feature 28 DeltaPHI_rollingmean_10 (0.019988)
15. feature 47 lm_Formation (0.019870)
16. feature 2 ILD_log10 (0.019583)
17. feature 8 PHIND_mean (0.019222)
18. feature 15 ILD_log10_rollingmin_5 (0.018798)
19. feature 14 ILD_log10_rollingmean_5 (0.018413)
20. feature 6 GR_min (0.017810)
21. feature 38 PE_rollingmax_20 (0.017757)
22. feature 11 GR_min_dist (0.017583)
23. feature 27 GR_rollingmin_10 (0.017055)
24. feature 18 PHIND_rollingmean_5 (0.016203)
25. feature 45 NM_M_rollingmean_50 (0.015900)
26. feature 44 PE_rollingmin_50 (0.015898)
27. feature 30 PE_rollingmin_10 (0.015659)
28. feature 19 PHIND_rollingmax_5 (0.015139)
29. feature 3 PHIND (0.015066)
30. feature 20 PHIND_rollingmin_5 (0.014670)
31. feature 25 RELPOS_rollingstd_5 (0.014481)
32. feature 46 NM_M_Rank_Target_1_up (0.014468)
33. feature 33 Depth_rollingmax_15 (0.014306)
34. feature 41 Depth_rollingmax_50 (0.013709)
35. feature 10 Depth_min_dist (0.013705)
36. feature 16 DeltaPHI_rollingmean_5 (0.013677)
37. feature 37 Depth_rollingmax_20 (0.013369)
38. feature 34 DeltaPHI_rollingmean_15 (0.013297)
39. feature 17 DeltaPHI_rollingmin_5 (0.013126)
40. feature 7 ILD_log10_mean (0.012856)
41. feature 21 PE_rollingmean_5 (0.012784)
42. feature 26 Depth_rollingmax_10 (0.012233)
43. feature 12 GR_rollingmean_5 (0.012187)
44. feature 24 NM_M_rollingmax_5 (0.012033)
45. feature 0 Depth (0.011084)
46. feature 22 PE_rollingmax_5 (0.010701)
47. feature 39 NM_M_rollingmean_20 (0.009658)
48. feature 4 PE (0.008100)
49. feature 31 NM_M_rollingmean_10 (0.007713)
50. feature 49 lm_Formation_lag_1 (0.000130)

Apply to test



In [163]:

    
# this time let's use all the training set 
ytrain = df[(df['origin']=='train')]['Facies']
yvalid = df[(df['origin']=='test')]['Facies']
xtrain = df[(df['origin']=='train')].drop(['Well Name','origin','Facies'],axis=1)
xvalid = df[(df['origin']=='test')].drop(['Well Name','origin','Facies'],axis=1)
well_name_valid = df.loc[(df['origin']=='test'),"Well Name"]



In [ ]:

    
#pipe.fit(xtrain,ytrain)



In [164]:

    
preds = pipe.predict(xvalid.values)



In [165]:

    
well = "CRAWFORD"
depth = xvalid.loc[well_name_valid== well ,"Depth"]
predictions = pd.Series(preds).loc[well_name_valid==well]
plt.plot(depth,predictions)
plt.axis([2950,3175, 1, 9])
plt.grid(b=True, which='major', color='r', linestyle='--')
plt.show()



In [166]:

    
well = "STUART"
depth = xvalid.loc[well_name_valid== well ,"Depth"]
predictions = pd.Series(preds).loc[well_name_valid==well]
plt.plot(depth,predictions)
plt.axis([2800,3050, 1, 9])
plt.grid(b=True, which='major', color='r', linestyle='--')
plt.show()



In [167]:

    
xvalid['Facies']=preds
xvalid.to_csv('XmasPreds_4.csv')

	Facies	Formation	Well Name	Depth	GR	ILD_log10	DeltaPHI	PHIND	PE	NM_M	RELPOS
0	3	A1 SH	SHRIMPLIN	2793.0	77.45	0.664	9.9	11.915	4.6	1	1.000
1	3	A1 SH	SHRIMPLIN	2793.5	78.26	0.661	14.2	12.565	4.1	1	0.979
2	3	A1 SH	SHRIMPLIN	2794.0	79.05	0.658	14.8	13.050	3.6	1	0.957
3	3	A1 SH	SHRIMPLIN	2794.5	86.10	0.655	13.9	13.115	3.5	1	0.936
4	3	A1 SH	SHRIMPLIN	2795.0	74.58	0.647	13.5	13.300	3.4	1	0.915

	Formation	Well Name	Depth	GR	ILD_log10	DeltaPHI	PHIND	PE	NM_M	RELPOS
0	A1 SH	STUART	2808.0	66.276	0.630	3.3	10.65	3.591	1	1.000
1	A1 SH	STUART	2808.5	77.252	0.585	6.5	11.95	3.341	1	0.978
2	A1 SH	STUART	2809.0	82.899	0.566	9.4	13.60	3.064	1	0.956
3	A1 SH	STUART	2809.5	80.671	0.593	9.5	13.25	2.977	1	0.933
4	A1 SH	STUART	2810.0	75.971	0.638	8.7	12.35	3.020	1	0.911