Facies classification using Machine Learning

Bird Team: PG+AC


In [44]:
%matplotlib inline
import pandas as pd
from pandasql import sqldf
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score,LeavePGroupsOut, LeaveOneGroupOut, cross_val_predict
from sklearn.metrics import confusion_matrix, make_scorer, f1_score, accuracy_score, recall_score, precision_score
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable
pd.options.mode.chained_assignment = None

In [45]:
filename = './../training_data.csv'
training_data = pd.read_csv(filename)
print(set(training_data["Well Name"]))
training_data.head()


set(['SHRIMPLIN', 'Recruit F9', 'SHANKLE', 'CHURCHMAN BIBLE', 'NOLAN', 'NEWBY', 'LUKE G U', 'CROSS H CATTLE'])
Out[45]:
Facies Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS
0 3 A1 SH SHRIMPLIN 2793.0 77.45 0.664 9.9 11.915 4.6 1 1.000
1 3 A1 SH SHRIMPLIN 2793.5 78.26 0.661 14.2 12.565 4.1 1 0.979
2 3 A1 SH SHRIMPLIN 2794.0 79.05 0.658 14.8 13.050 3.6 1 0.957
3 3 A1 SH SHRIMPLIN 2794.5 86.10 0.655 13.9 13.115 3.5 1 0.936
4 3 A1 SH SHRIMPLIN 2795.0 74.58 0.647 13.5 13.300 3.4 1 0.915

In [46]:
well_data = pd.read_csv('./../validation_data_nofacies.csv')
print(set(well_data["Well Name"]))
print(well_data.shape)
well_data.head()


set(['CRAWFORD', 'STUART'])
(830, 10)
Out[46]:
Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS
0 A1 SH STUART 2808.0 66.276 0.630 3.3 10.65 3.591 1 1.000
1 A1 SH STUART 2808.5 77.252 0.585 6.5 11.95 3.341 1 0.978
2 A1 SH STUART 2809.0 82.899 0.566 9.4 13.60 3.064 1 0.956
3 A1 SH STUART 2809.5 80.671 0.593 9.5 13.25 2.977 1 0.933
4 A1 SH STUART 2810.0 75.971 0.638 8.7 12.35 3.020 1 0.911

In [47]:
# concat train and test for processing 
well_data["origin"] = 'test'
training_data["origin"] = 'train'
df = pd.concat([well_data,training_data],axis=0,ignore_index=True)[list(training_data.columns)]
df['Well Name'] = df['Well Name'].astype('category')
df.head(10)


Out[47]:
Facies Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS origin
0 NaN A1 SH STUART 2808.0 66.276 0.630 3.3 10.65 3.591 1 1.000 test
1 NaN A1 SH STUART 2808.5 77.252 0.585 6.5 11.95 3.341 1 0.978 test
2 NaN A1 SH STUART 2809.0 82.899 0.566 9.4 13.60 3.064 1 0.956 test
3 NaN A1 SH STUART 2809.5 80.671 0.593 9.5 13.25 2.977 1 0.933 test
4 NaN A1 SH STUART 2810.0 75.971 0.638 8.7 12.35 3.020 1 0.911 test
5 NaN A1 SH STUART 2810.5 73.955 0.667 6.9 12.25 3.086 1 0.889 test
6 NaN A1 SH STUART 2811.0 77.962 0.674 6.5 12.45 3.092 1 0.867 test
7 NaN A1 SH STUART 2811.5 83.894 0.667 6.3 12.65 3.123 1 0.844 test
8 NaN A1 SH STUART 2812.0 84.424 0.653 6.7 13.05 3.121 1 0.822 test
9 NaN A1 SH STUART 2812.5 83.160 0.642 7.3 12.95 3.127 1 0.800 test

In [48]:
# add some features based on the well data. 

# nb points : can be correlated with how soft soil is ? 
print("session")
sessionsize = df.groupby(["Well Name",'Formation']).size().reset_index()
sessionsize.columns =  ["Well Name",'Formation','formation_size']
df = pd.merge(df,sessionsize,how='left',on = ["Well Name",'Formation'])

# depth : 
print("depth")
sessionsize = df.groupby(["Well Name",'Formation'])["Depth"].min().reset_index()
sessionsize.columns =  ["Well Name",'Formation','minimum_depth']
df = pd.merge(df,sessionsize,how='left',on = ["Well Name",'Formation'])

sessionsize = df.groupby(["Well Name",'Formation'])["Depth"].max().reset_index()
sessionsize.columns =  ["Well Name",'Formation','maximum_depth']
df = pd.merge(df,sessionsize,how='left',on = ["Well Name",'Formation'])

df['formation_depth'] = df["maximum_depth"] - df["minimum_depth"]

df["soft_indic"] = df['formation_depth'] / df["formation_size"]

# add avgs of feat
print("add avgs of feat")
list_to_avg = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_avg : 
    df[val + "_min"] = df.groupby(["Well Name",'Formation'])[val].transform(np.min)
    df[val + "_max"] = df.groupby(["Well Name",'Formation'])[val].transform(np.max)
    df[val + "_mean"] = df.groupby(["Well Name",'Formation'])[val].transform(np.mean)
    df[val + "_var"] = df.groupby(["Well Name",'Formation'])[val].transform(np.var) 

# add distances feat. = an attempt at regulariation.
print("add distances feat.")
for val in list_to_avg : 
    df[val + "_min_dist"] = df[val] -df[val + "_min"]
    df[val + "_max_dist"] =  df[val] -df[val + "_max"]
    df[val + "_mean_dist"] =  df[val] -df[val + "_mean"]
    
# add lag and lead !
print("lag lead")
list_to_lag = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_lag:
    for lag in range(1,11):
        df[val+'_lag_'+str(lag)]=df[val]-df.groupby("Well Name")[val].shift(periods=lag)
        df[val+'_lead_'+str(lag)]=df[val]-df.groupby("Well Name")[val].shift(periods=-lag)

# adding some Formation lag and lead. 
for lag in range(1,3):
    df['Formation'+'_lag_'+str(lag)]=df.groupby("Well Name")['Formation'].shift(periods=lag)
    df['Formation'+'_lead_'+str(lag)]=df.groupby("Well Name")['Formation'].shift(periods=-lag)
    df['Formation'+'_lag_'+str(lag) + 'equal'] = (df['Formation'+'_lag_'+str(lag)] == df["Formation"]).astype(int)
    df['Formation'+'_lead_'+str(lag) + 'equal'] = (df['Formation'+'_lead_'+str(lag)] == df["Formation"]).astype(int) 

print("rolling")
#Add rolling features
list_to_roll = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M','RELPOS']
window_size = [5,10,15,20,50]
for w in window_size:
    for val in list_to_roll:
        df[val+'_rollingmean_'+str(w)]=df.groupby("Well Name")[val].apply(
            lambda x:x.rolling(window=w,center=True).mean())
        df[val+'_rollingmax_'+str(w)]=df.groupby("Well Name")[val].apply(
            lambda x:x.rolling(window=w,center=True).max())
        df[val+'_rollingmin_'+str(w)]=df.groupby("Well Name")[val].apply(
            lambda x:x.rolling(window=w,center=True).min())
        df[val+'_rollingstd_'+str(w)]=df.groupby("Well Name")[val].apply(
            lambda x:x.rolling(window=w,center=True).std())
        
print("special window features for NM_M")
def NM_M_distance(x,how,target):
    length = len(x)
    rank = np.empty(length)
    count = -1
    NM_M = x["NM_M"].values
    if how=="up":
        order = range(length)
    elif how=="down":
        order = range(length-1,-1,-1)
    for i in order:
        if ((NM_M[i] != target) & (count>-1)):
            count+=1
            rank[i] += count
        elif NM_M[i] == target:
            count=0
        else:
            rank[i] = count
    rank = pd.DataFrame(rank.astype(int), columns=["NM_M_Rank_Target_+"+str(target)+"_"+how], index = x.index)
    return(rank)
df["NM_M_Rank_Target_1_up"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="up",target=1)
df["NM_M_Rank_Target_2_up"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="up",target=2)
df["NM_M_Rank_Target_1_down"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="down",target=1)
df["NM_M_Rank_Target_2_down"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="down",target=2)

print("filling na")
df = df.groupby(["Well Name"], as_index=False).apply(lambda group: group.bfill())
df = df.groupby(["Well Name"], as_index=False).apply(lambda group: group.ffill())

print("Vectorizing Formation text data")
from sklearn.feature_extraction.text import CountVectorizer
list_formation = ['Formation',
 'Formation_lag_1',
 'Formation_lead_1',
 'Formation_lag_2',
 'Formation_lead_2']
for l in list_formation:
    cv = CountVectorizer()
    counts = cv.fit_transform(df[l].values)
    cols = [c+"_"+l for c in cv.get_feature_names()]
    counts = pd.DataFrame(counts.toarray(),columns = cols)
    df = df.drop(l,axis = 1)
    df = pd.concat([df,counts],axis=1)

print("Finished preparing data. Now ready for ML ignition!")


session
depth
add avgs of feat
add distances feat.
lag lead
rolling
special window features for NM_M
filling na
Vectorizing Formation text data
Finished preparing data. Now ready for ML ignition!

In [189]:
#tokeep =['Facies','origin','Formation','Formation_lag_1','Formation_lead_1','Formation_lag_2','Formation_lead_2',
#         'Well Name','Depth','GR','ILD_log10','DeltaPHI','PHIND','PE','NM_M','RELPOS']
#nums = ['Depth','GR','ILD_log10','DeltaPHI','PHIND','PE','NM_M','RELPOS']
#tokeep = tokeep + [x+'_lag_1' for x in nums] +[x+'_lead_1' for x in nums]
#df = df[tokeep]

CV performance


In [61]:
clf = RandomForestClassifier(
    max_depth = 10,
    n_estimators = 100,
    max_features=0.1,
    min_samples_leaf=25,
    min_samples_split=50,
    class_weight='balanced',
    oob_score=True,
)

In [62]:
ytrain = df[(df['origin']=='train')&(df['Well Name']<>'NOLAN')]['Facies']
yvalid = df[(df['origin']=='train')&(df['Well Name']=='NOLAN')]['Facies']
xtrain = df[(df['origin']=='train')&(df['Well Name']<>'NOLAN')].drop(['Well Name','origin','Facies'],axis=1)
xvalid = df[(df['origin']=='train')&(df['Well Name']=='NOLAN')].drop(['Well Name','origin','Facies'],axis=1)

In [63]:
clf.fit(xtrain,ytrain)


Out[63]:
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=10, max_features=0.1,
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=25, min_samples_split=50,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [64]:
preds = clf.predict(xvalid)
from sklearn.metrics import classification_report
print(clf.oob_score_)
print classification_report(yvalid, preds)
print(f1_score(yvalid, preds,average="micro"))


0.773162939297
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00         4
        2.0       0.82      0.66      0.73       118
        3.0       0.62      0.82      0.70        68
        4.0       0.27      0.21      0.24        28
        5.0       0.47      0.49      0.48        47
        6.0       0.25      0.13      0.17        30
        7.0       0.00      0.00      0.00         4
        8.0       0.72      0.49      0.58       116
        9.0       0.00      0.00      0.00         0

avg / total       0.63      0.54      0.57       415

0.539759036145

In [65]:
# this time let's use all the training set 
groups = df[(df['origin']=='train')]["Well Name"]
ytrain = df[(df['origin']=='train')]['Facies']
yvalid = df[(df['origin']=='test')]['Facies']
xtrain = df[(df['origin']=='train')].drop(['Well Name','origin','Facies'],axis=1)
xvalid = df[(df['origin']=='test')].drop(['Well Name','origin','Facies'],axis=1)

In [66]:
cv=LeaveOneGroupOut().split(xtrain, ytrain, groups)
y_pred = cross_val_predict(clf, xtrain, ytrain, cv=cv, n_jobs=-1)

In [68]:
print(classification_report(ytrain, y_pred))
print(f1_score(ytrain, y_pred,average="micro"))


             precision    recall  f1-score   support

        1.0       0.64      0.36      0.46       259
        2.0       0.58      0.58      0.58       738
        3.0       0.56      0.67      0.61       615
        4.0       0.51      0.69      0.59       184
        5.0       0.37      0.22      0.28       217
        6.0       0.50      0.42      0.46       462
        7.0       0.31      0.45      0.37        98
        8.0       0.51      0.53      0.52       498
        9.0       0.60      0.66      0.63       161

avg / total       0.53      0.53      0.53      3232

0.532487623762

In [69]:
importances = clf.feature_importances_ 
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
feature = list(xtrain.columns.values)
for f in range(xtrain.shape[1]):
    print("%d. feature %d %s (%f)" % (f + 1, indices[f], feature[indices[f]], importances[indices[f]]))


Feature ranking:
1. feature 403 lm_Formation (0.025752)
2. feature 381 PE_rollingmean_50 (0.024209)
3. feature 289 NM_M_rollingmean_10 (0.021777)
4. feature 257 NM_M_rollingmean_5 (0.018710)
5. feature 404 sh_Formation (0.018616)
6. feature 318 PE_rollingmax_15 (0.018099)
7. feature 350 PE_rollingmax_20 (0.017107)
8. feature 2 ILD_log10 (0.015412)
9. feature 360 RELPOS_rollingstd_20 (0.014475)
10. feature 239 GR_rollingmin_5 (0.013867)
11. feature 17 GR_min (0.012606)
12. feature 382 PE_rollingmax_50 (0.012416)
13. feature 241 ILD_log10_rollingmean_5 (0.011988)
14. feature 1 GR (0.011731)
15. feature 419 lm_Formation_lead_1 (0.011285)
16. feature 21 ILD_log10_min (0.011224)
17. feature 296 RELPOS_rollingstd_10 (0.011138)
18. feature 237 GR_rollingmean_5 (0.011138)
19. feature 273 ILD_log10_rollingmean_10 (0.011086)
20. feature 317 PE_rollingmean_15 (0.011065)
21. feature 35 PE_mean (0.011052)
22. feature 6 NM_M (0.010820)
23. feature 369 ILD_log10_rollingmean_50 (0.010532)
24. feature 305 ILD_log10_rollingmean_15 (0.010221)
25. feature 39 NM_M_mean (0.010044)
26. feature 243 ILD_log10_rollingmin_5 (0.009469)
27. feature 385 NM_M_rollingmean_50 (0.009135)
28. feature 328 RELPOS_rollingstd_15 (0.009055)
29. feature 349 PE_rollingmean_20 (0.008771)
30. feature 242 ILD_log10_rollingmax_5 (0.008643)
31. feature 48 GR_min_dist (0.008514)
32. feature 31 PHIND_mean (0.008388)
33. feature 286 PE_rollingmax_10 (0.008388)
34. feature 321 NM_M_rollingmean_15 (0.008347)
35. feature 412 sh_Formation_lag_1 (0.007454)
36. feature 285 PE_rollingmean_10 (0.007174)
37. feature 337 ILD_log10_rollingmean_20 (0.007114)
38. feature 278 DeltaPHI_rollingmax_10 (0.006858)
39. feature 376 DeltaPHI_rollingstd_50 (0.006857)
40. feature 49 GR_max_dist (0.006849)
41. feature 353 NM_M_rollingmean_20 (0.006425)
42. feature 277 DeltaPHI_rollingmean_10 (0.006424)
43. feature 4 PHIND (0.006408)
44. feature 393 NM_M_Rank_Target_1_up (0.006360)
45. feature 271 GR_rollingmin_10 (0.005924)
46. feature 254 PE_rollingmax_5 (0.005879)
47. feature 253 PE_rollingmean_5 (0.005586)
48. feature 22 ILD_log10_max (0.005536)
49. feature 420 sh_Formation_lead_1 (0.005492)
50. feature 258 NM_M_rollingmax_5 (0.005457)
51. feature 411 lm_Formation_lag_1 (0.005441)
52. feature 57 PHIND_min_dist (0.005375)
53. feature 50 GR_mean_dist (0.005299)
54. feature 435 lm_Formation_lead_2 (0.005228)
55. feature 269 GR_rollingmean_10 (0.005179)
56. feature 287 PE_rollingmin_10 (0.005146)
57. feature 371 ILD_log10_rollingmin_50 (0.005132)
58. feature 310 DeltaPHI_rollingmax_15 (0.005041)
59. feature 250 PHIND_rollingmax_5 (0.005020)
60. feature 362 Depth_rollingmax_50 (0.004984)
61. feature 358 RELPOS_rollingmax_20 (0.004901)
62. feature 264 RELPOS_rollingstd_5 (0.004878)
63. feature 307 ILD_log10_rollingmin_15 (0.004688)
64. feature 339 ILD_log10_rollingmin_20 (0.004617)
65. feature 5 PE (0.004368)
66. feature 342 DeltaPHI_rollingmax_20 (0.004280)
67. feature 251 PHIND_rollingmin_5 (0.004199)
68. feature 8 formation_size (0.004191)
69. feature 361 Depth_rollingmean_50 (0.004150)
70. feature 24 ILD_log10_var (0.004093)
71. feature 53 ILD_log10_mean_dist (0.004055)
72. feature 275 ILD_log10_rollingmin_10 (0.004005)
73. feature 247 DeltaPHI_rollingmin_5 (0.004002)
74. feature 249 PHIND_rollingmean_5 (0.003885)
75. feature 23 ILD_log10_mean (0.003850)
76. feature 245 DeltaPHI_rollingmean_5 (0.003834)
77. feature 309 DeltaPHI_rollingmean_15 (0.003788)
78. feature 11 formation_depth (0.003778)
79. feature 16 Depth_var (0.003691)
80. feature 274 ILD_log10_rollingmax_10 (0.003402)
81. feature 18 GR_max (0.003364)
82. feature 238 GR_rollingmax_5 (0.003362)
83. feature 341 DeltaPHI_rollingmean_20 (0.003343)
84. feature 210 RELPOS_lead_1 (0.003259)
85. feature 28 DeltaPHI_var (0.003191)
86. feature 329 Depth_rollingmean_20 (0.003186)
87. feature 281 PHIND_rollingmean_10 (0.003101)
88. feature 301 GR_rollingmean_15 (0.003024)
89. feature 246 DeltaPHI_rollingmax_5 (0.003002)
90. feature 396 NM_M_Rank_Target_2_down (0.002976)
91. feature 211 RELPOS_lag_2 (0.002969)
92. feature 294 RELPOS_rollingmax_10 (0.002961)
93. feature 3 DeltaPHI (0.002889)
94. feature 330 Depth_rollingmax_20 (0.002885)
95. feature 43 RELPOS_mean (0.002884)
96. feature 10 maximum_depth (0.002867)
97. feature 383 PE_rollingmin_50 (0.002838)
98. feature 213 RELPOS_lag_3 (0.002801)
99. feature 52 ILD_log10_max_dist (0.002753)
100. feature 322 NM_M_rollingmax_15 (0.002738)
101. feature 380 PHIND_rollingstd_50 (0.002732)
102. feature 212 RELPOS_lead_2 (0.002719)
103. feature 209 RELPOS_lag_1 (0.002681)
104. feature 367 GR_rollingmin_50 (0.002661)
105. feature 279 DeltaPHI_rollingmin_10 (0.002652)
106. feature 348 PHIND_rollingstd_20 (0.002646)
107. feature 394 NM_M_Rank_Target_2_up (0.002599)
108. feature 104 GR_lead_8 (0.002599)
109. feature 303 GR_rollingmin_15 (0.002580)
110. feature 392 RELPOS_rollingstd_50 (0.002548)
111. feature 388 NM_M_rollingstd_50 (0.002485)
112. feature 51 ILD_log10_min_dist (0.002430)
113. feature 108 GR_lead_10 (0.002423)
114. feature 384 PE_rollingstd_50 (0.002417)
115. feature 298 Depth_rollingmax_15 (0.002328)
116. feature 215 RELPOS_lag_4 (0.002321)
117. feature 359 RELPOS_rollingmin_20 (0.002320)
118. feature 9 minimum_depth (0.002317)
119. feature 255 PE_rollingmin_5 (0.002267)
120. feature 124 ILD_log10_lead_8 (0.002243)
121. feature 235 Depth_rollingmin_5 (0.002236)
122. feature 27 DeltaPHI_mean (0.002197)
123. feature 344 DeltaPHI_rollingstd_20 (0.002181)
124. feature 378 PHIND_rollingmax_50 (0.002133)
125. feature 262 RELPOS_rollingmax_5 (0.002110)
126. feature 351 PE_rollingmin_20 (0.002093)
127. feature 326 RELPOS_rollingmax_15 (0.002089)
128. feature 335 GR_rollingmin_20 (0.002061)
129. feature 355 NM_M_rollingmin_20 (0.002056)
130. feature 218 RELPOS_lead_5 (0.002055)
131. feature 366 GR_rollingmax_50 (0.001998)
132. feature 331 Depth_rollingmin_20 (0.001980)
133. feature 389 RELPOS_rollingmean_50 (0.001967)
134. feature 368 GR_rollingstd_50 (0.001943)
135. feature 228 RELPOS_lead_10 (0.001942)
136. feature 333 GR_rollingmean_20 (0.001940)
137. feature 283 PHIND_rollingmin_10 (0.001934)
138. feature 282 PHIND_rollingmax_10 (0.001932)
139. feature 372 ILD_log10_rollingstd_50 (0.001930)
140. feature 20 GR_var (0.001928)
141. feature 68 RELPOS_mean_dist (0.001906)
142. feature 370 ILD_log10_rollingmax_50 (0.001892)
143. feature 62 PE_mean_dist (0.001890)
144. feature 220 RELPOS_lead_6 (0.001882)
145. feature 263 RELPOS_rollingmin_5 (0.001842)
146. feature 167 PHIND_lag_10 (0.001829)
147. feature 291 NM_M_rollingmin_10 (0.001815)
148. feature 352 PE_rollingstd_20 (0.001812)
149. feature 336 GR_rollingstd_20 (0.001762)
150. feature 347 PHIND_rollingmin_20 (0.001707)
151. feature 316 PHIND_rollingstd_15 (0.001703)
152. feature 225 RELPOS_lag_9 (0.001684)
153. feature 217 RELPOS_lag_5 (0.001683)
154. feature 61 PE_max_dist (0.001682)
155. feature 374 DeltaPHI_rollingmax_50 (0.001676)
156. feature 44 RELPOS_var (0.001670)
157. feature 266 Depth_rollingmax_10 (0.001668)
158. feature 436 sh_Formation_lead_2 (0.001665)
159. feature 379 PHIND_rollingmin_50 (0.001664)
160. feature 315 PHIND_rollingmin_15 (0.001627)
161. feature 128 ILD_log10_lead_10 (0.001622)
162. feature 216 RELPOS_lead_4 (0.001600)
163. feature 100 GR_lead_6 (0.001593)
164. feature 338 ILD_log10_rollingmax_20 (0.001565)
165. feature 306 ILD_log10_rollingmax_15 (0.001559)
166. feature 106 GR_lead_9 (0.001548)
167. feature 45 Depth_min_dist (0.001543)
168. feature 223 RELPOS_lag_8 (0.001543)
169. feature 46 Depth_max_dist (0.001536)
170. feature 297 Depth_rollingmean_15 (0.001527)
171. feature 373 DeltaPHI_rollingmean_50 (0.001523)
172. feature 226 RELPOS_lead_9 (0.001517)
173. feature 233 Depth_rollingmean_5 (0.001503)
174. feature 219 RELPOS_lag_6 (0.001502)
175. feature 363 Depth_rollingmin_50 (0.001502)
176. feature 33 PE_min (0.001496)
177. feature 56 DeltaPHI_mean_dist (0.001494)
178. feature 187 PE_lag_10 (0.001480)
179. feature 272 GR_rollingstd_10 (0.001475)
180. feature 25 DeltaPHI_min (0.001448)
181. feature 162 PHIND_lead_7 (0.001426)
182. feature 345 PHIND_rollingmean_20 (0.001419)
183. feature 37 NM_M_min (0.001408)
184. feature 127 ILD_log10_lag_10 (0.001406)
185. feature 346 PHIND_rollingmax_20 (0.001406)
186. feature 429 a1_Formation_lead_2 (0.001376)
187. feature 29 PHIND_min (0.001362)
188. feature 98 GR_lead_5 (0.001354)
189. feature 377 PHIND_rollingmean_50 (0.001350)
190. feature 311 DeltaPHI_rollingmin_15 (0.001341)
191. feature 107 GR_lag_10 (0.001329)
192. feature 325 RELPOS_rollingmean_15 (0.001308)
193. feature 224 RELPOS_lead_8 (0.001290)
194. feature 299 Depth_rollingmin_15 (0.001283)
195. feature 234 Depth_rollingmax_5 (0.001279)
196. feature 265 Depth_rollingmean_10 (0.001274)
197. feature 99 GR_lag_6 (0.001273)
198. feature 0 Depth (0.001271)
199. feature 47 Depth_mean_dist (0.001263)
200. feature 66 RELPOS_min_dist (0.001254)
201. feature 334 GR_rollingmax_20 (0.001241)
202. feature 30 PHIND_max (0.001220)
203. feature 125 ILD_log10_lag_9 (0.001216)
204. feature 166 PHIND_lead_9 (0.001209)
205. feature 312 DeltaPHI_rollingstd_15 (0.001179)
206. feature 54 DeltaPHI_min_dist (0.001164)
207. feature 97 GR_lag_5 (0.001158)
208. feature 105 GR_lag_9 (0.001148)
209. feature 164 PHIND_lead_8 (0.001139)
210. feature 304 GR_rollingstd_15 (0.001138)
211. feature 102 GR_lead_7 (0.001111)
212. feature 26 DeltaPHI_max (0.001107)
213. feature 410 b5_Formation_lag_1 (0.001104)
214. feature 67 RELPOS_max_dist (0.001091)
215. feature 288 PE_rollingstd_10 (0.001088)
216. feature 59 PHIND_mean_dist (0.001088)
217. feature 320 PE_rollingstd_15 (0.001081)
218. feature 14 Depth_max (0.001073)
219. feature 60 PE_min_dist (0.001053)
220. feature 101 GR_lag_7 (0.001042)
221. feature 32 PHIND_var (0.001038)
222. feature 365 GR_rollingmean_50 (0.001010)
223. feature 221 RELPOS_lag_7 (0.000995)
224. feature 144 DeltaPHI_lead_8 (0.000994)
225. feature 19 GR_mean (0.000990)
226. feature 41 RELPOS_min (0.000986)
227. feature 227 RELPOS_lag_10 (0.000976)
228. feature 36 PE_var (0.000970)
229. feature 313 PHIND_rollingmean_15 (0.000964)
230. feature 165 PHIND_lag_9 (0.000961)
231. feature 214 RELPOS_lead_3 (0.000956)
232. feature 163 PHIND_lag_8 (0.000944)
233. feature 343 DeltaPHI_rollingmin_20 (0.000937)
234. feature 364 Depth_rollingstd_50 (0.000937)
235. feature 13 Depth_min (0.000920)
236. feature 146 DeltaPHI_lead_9 (0.000908)
237. feature 308 ILD_log10_rollingstd_15 (0.000875)
238. feature 123 ILD_log10_lag_8 (0.000855)
239. feature 248 DeltaPHI_rollingstd_5 (0.000843)
240. feature 117 ILD_log10_lag_5 (0.000842)
241. feature 158 PHIND_lead_5 (0.000840)
242. feature 319 PE_rollingmin_15 (0.000840)
243. feature 295 RELPOS_rollingmin_10 (0.000839)
244. feature 160 PHIND_lead_6 (0.000832)
245. feature 119 ILD_log10_lag_6 (0.000815)
246. feature 168 PHIND_lead_10 (0.000805)
247. feature 267 Depth_rollingmin_10 (0.000803)
248. feature 120 ILD_log10_lead_6 (0.000795)
249. feature 356 NM_M_rollingstd_20 (0.000793)
250. feature 270 GR_rollingmax_10 (0.000771)
251. feature 7 RELPOS (0.000766)
252. feature 391 RELPOS_rollingmin_50 (0.000752)
253. feature 183 PE_lag_8 (0.000732)
254. feature 327 RELPOS_rollingmin_15 (0.000721)
255. feature 222 RELPOS_lead_7 (0.000704)
256. feature 15 Depth_mean (0.000700)
257. feature 113 ILD_log10_lag_3 (0.000691)
258. feature 413 a1_Formation_lead_1 (0.000675)
259. feature 397 a1_Formation (0.000669)
260. feature 185 PE_lag_9 (0.000663)
261. feature 126 ILD_log10_lead_9 (0.000657)
262. feature 357 RELPOS_rollingmean_20 (0.000645)
263. feature 34 PE_max (0.000644)
264. feature 55 DeltaPHI_max_dist (0.000626)
265. feature 340 ILD_log10_rollingstd_20 (0.000625)
266. feature 252 PHIND_rollingstd_5 (0.000590)
267. feature 122 ILD_log10_lead_7 (0.000587)
268. feature 181 PE_lag_7 (0.000576)
269. feature 421 a1_Formation_lag_2 (0.000572)
270. feature 179 PE_lag_6 (0.000570)
271. feature 395 NM_M_Rank_Target_1_down (0.000539)
272. feature 324 NM_M_rollingstd_15 (0.000527)
273. feature 58 PHIND_max_dist (0.000518)
274. feature 375 DeltaPHI_rollingmin_50 (0.000517)
275. feature 121 ILD_log10_lag_7 (0.000513)
276. feature 142 DeltaPHI_lead_7 (0.000505)
277. feature 156 PHIND_lead_4 (0.000497)
278. feature 188 PE_lead_10 (0.000488)
279. feature 111 ILD_log10_lag_2 (0.000477)
280. feature 157 PHIND_lag_5 (0.000475)
281. feature 256 PE_rollingstd_5 (0.000472)
282. feature 261 RELPOS_rollingmean_5 (0.000461)
283. feature 314 PHIND_rollingmax_15 (0.000459)
284. feature 115 ILD_log10_lag_4 (0.000432)
285. feature 112 ILD_log10_lead_2 (0.000414)
286. feature 390 RELPOS_rollingmax_50 (0.000413)
287. feature 145 DeltaPHI_lag_9 (0.000411)
288. feature 182 PE_lead_7 (0.000402)
289. feature 96 GR_lead_4 (0.000397)
290. feature 177 PE_lag_5 (0.000386)
291. feature 147 DeltaPHI_lag_10 (0.000382)
292. feature 293 RELPOS_rollingmean_10 (0.000379)
293. feature 110 ILD_log10_lead_1 (0.000372)
294. feature 186 PE_lead_9 (0.000354)
295. feature 206 NM_M_lead_9 (0.000351)
296. feature 151 PHIND_lag_2 (0.000347)
297. feature 276 ILD_log10_rollingstd_10 (0.000346)
298. feature 280 DeltaPHI_rollingstd_10 (0.000343)
299. feature 118 ILD_log10_lead_5 (0.000336)
300. feature 103 GR_lag_8 (0.000329)
301. feature 402 b5_Formation (0.000320)
302. feature 155 PHIND_lag_4 (0.000318)
303. feature 12 soft_indic (0.000318)
304. feature 92 GR_lead_2 (0.000310)
305. feature 323 NM_M_rollingmin_15 (0.000309)
306. feature 94 GR_lead_3 (0.000295)
307. feature 148 DeltaPHI_lead_10 (0.000295)
308. feature 284 PHIND_rollingstd_10 (0.000285)
309. feature 114 ILD_log10_lead_3 (0.000278)
310. feature 201 NM_M_lag_7 (0.000268)
311. feature 197 NM_M_lag_5 (0.000244)
312. feature 154 PHIND_lead_3 (0.000240)
313. feature 90 GR_lead_1 (0.000233)
314. feature 387 NM_M_rollingmin_50 (0.000229)
315. feature 244 ILD_log10_rollingstd_5 (0.000226)
316. feature 171 PE_lag_2 (0.000209)
317. feature 93 GR_lag_3 (0.000209)
318. feature 173 PE_lag_3 (0.000191)
319. feature 152 PHIND_lead_2 (0.000190)
320. feature 184 PE_lead_8 (0.000189)
321. feature 143 DeltaPHI_lag_8 (0.000185)
322. feature 302 GR_rollingmax_15 (0.000183)
323. feature 95 GR_lag_4 (0.000182)
324. feature 178 PE_lead_5 (0.000181)
325. feature 91 GR_lag_2 (0.000176)
326. feature 180 PE_lead_6 (0.000165)
327. feature 292 NM_M_rollingstd_10 (0.000149)
328. feature 161 PHIND_lag_7 (0.000148)
329. feature 139 DeltaPHI_lag_6 (0.000143)
330. feature 159 PHIND_lag_6 (0.000135)
331. feature 136 DeltaPHI_lead_4 (0.000133)
332. feature 38 NM_M_max (0.000131)
333. feature 140 DeltaPHI_lead_6 (0.000128)
334. feature 109 ILD_log10_lag_1 (0.000122)
335. feature 176 PE_lead_4 (0.000118)
336. feature 240 GR_rollingstd_5 (0.000115)
337. feature 207 NM_M_lag_10 (0.000109)
338. feature 150 PHIND_lead_1 (0.000109)
339. feature 132 DeltaPHI_lead_2 (0.000104)
340. feature 405 a1_Formation_lag_1 (0.000103)
341. feature 138 DeltaPHI_lead_5 (0.000102)
342. feature 172 PE_lead_2 (0.000098)
343. feature 137 DeltaPHI_lag_5 (0.000098)
344. feature 430 b1_Formation_lead_2 (0.000093)
345. feature 170 PE_lead_1 (0.000089)
346. feature 409 b4_Formation_lag_1 (0.000086)
347. feature 386 NM_M_rollingmax_50 (0.000083)
348. feature 141 DeltaPHI_lag_7 (0.000082)
349. feature 196 NM_M_lead_4 (0.000081)
350. feature 133 DeltaPHI_lag_3 (0.000076)
351. feature 116 ILD_log10_lead_4 (0.000072)
352. feature 65 NM_M_mean_dist (0.000069)
353. feature 169 PE_lag_1 (0.000066)
354. feature 135 DeltaPHI_lag_4 (0.000061)
355. feature 131 DeltaPHI_lag_2 (0.000059)
356. feature 134 DeltaPHI_lead_3 (0.000054)
357. feature 354 NM_M_rollingmax_20 (0.000051)
358. feature 153 PHIND_lag_3 (0.000049)
359. feature 417 b4_Formation_lead_1 (0.000039)
360. feature 174 PE_lead_3 (0.000038)
361. feature 202 NM_M_lead_7 (0.000031)
362. feature 129 DeltaPHI_lag_1 (0.000029)
363. feature 175 PE_lag_4 (0.000026)
364. feature 89 GR_lag_1 (0.000026)
365. feature 149 PHIND_lag_1 (0.000024)
366. feature 130 DeltaPHI_lead_1 (0.000015)
367. feature 236 Depth_rollingstd_5 (0.000000)
368. feature 332 Depth_rollingstd_20 (0.000000)
369. feature 232 Formation_lead_2equal (0.000000)
370. feature 231 Formation_lag_2equal (0.000000)
371. feature 414 b1_Formation_lead_1 (0.000000)
372. feature 229 Formation_lag_1equal (0.000000)
373. feature 208 NM_M_lead_10 (0.000000)
374. feature 408 b3_Formation_lag_1 (0.000000)
375. feature 230 Formation_lead_1equal (0.000000)
376. feature 190 NM_M_lead_1 (0.000000)
377. feature 415 b2_Formation_lead_1 (0.000000)
378. feature 416 b3_Formation_lead_1 (0.000000)
379. feature 434 b5_Formation_lead_2 (0.000000)
380. feature 433 b4_Formation_lead_2 (0.000000)
381. feature 432 b3_Formation_lead_2 (0.000000)
382. feature 431 b2_Formation_lead_2 (0.000000)
383. feature 290 NM_M_rollingmax_10 (0.000000)
384. feature 428 sh_Formation_lag_2 (0.000000)
385. feature 427 lm_Formation_lag_2 (0.000000)
386. feature 426 b5_Formation_lag_2 (0.000000)
387. feature 425 b4_Formation_lag_2 (0.000000)
388. feature 268 Depth_rollingstd_10 (0.000000)
389. feature 300 Depth_rollingstd_15 (0.000000)
390. feature 424 b3_Formation_lag_2 (0.000000)
391. feature 423 b2_Formation_lag_2 (0.000000)
392. feature 422 b1_Formation_lag_2 (0.000000)
393. feature 260 NM_M_rollingstd_5 (0.000000)
394. feature 40 NM_M_var (0.000000)
395. feature 418 b5_Formation_lead_1 (0.000000)
396. feature 259 NM_M_rollingmin_5 (0.000000)
397. feature 203 NM_M_lag_8 (0.000000)
398. feature 42 RELPOS_max (0.000000)
399. feature 407 b2_Formation_lag_1 (0.000000)
400. feature 72 Depth_lead_2 (0.000000)
401. feature 73 Depth_lag_3 (0.000000)
402. feature 74 Depth_lead_3 (0.000000)
403. feature 75 Depth_lag_4 (0.000000)
404. feature 76 Depth_lead_4 (0.000000)
405. feature 77 Depth_lag_5 (0.000000)
406. feature 78 Depth_lead_5 (0.000000)
407. feature 79 Depth_lag_6 (0.000000)
408. feature 80 Depth_lead_6 (0.000000)
409. feature 81 Depth_lag_7 (0.000000)
410. feature 82 Depth_lead_7 (0.000000)
411. feature 83 Depth_lag_8 (0.000000)
412. feature 84 Depth_lead_8 (0.000000)
413. feature 85 Depth_lag_9 (0.000000)
414. feature 86 Depth_lead_9 (0.000000)
415. feature 87 Depth_lag_10 (0.000000)
416. feature 88 Depth_lead_10 (0.000000)
417. feature 71 Depth_lag_2 (0.000000)
418. feature 70 Depth_lead_1 (0.000000)
419. feature 191 NM_M_lag_2 (0.000000)
420. feature 398 b1_Formation (0.000000)
421. feature 406 b1_Formation_lag_1 (0.000000)
422. feature 205 NM_M_lag_9 (0.000000)
423. feature 204 NM_M_lead_8 (0.000000)
424. feature 189 NM_M_lag_1 (0.000000)
425. feature 401 b4_Formation (0.000000)
426. feature 400 b3_Formation (0.000000)
427. feature 399 b2_Formation (0.000000)
428. feature 200 NM_M_lead_6 (0.000000)
429. feature 192 NM_M_lead_2 (0.000000)
430. feature 199 NM_M_lag_6 (0.000000)
431. feature 198 NM_M_lead_5 (0.000000)
432. feature 195 NM_M_lag_4 (0.000000)
433. feature 194 NM_M_lead_3 (0.000000)
434. feature 63 NM_M_min_dist (0.000000)
435. feature 64 NM_M_max_dist (0.000000)
436. feature 193 NM_M_lag_3 (0.000000)
437. feature 69 Depth_lag_1 (0.000000)

Apply to test


In [70]:
# this time let's use all the training set 
ytrain = df[(df['origin']=='train')]['Facies']
yvalid = df[(df['origin']=='test')]['Facies']
xtrain = df[(df['origin']=='train')].drop(['Well Name','origin','Facies'],axis=1)
xvalid = df[(df['origin']=='test')].drop(['Well Name','origin','Facies'],axis=1)
well_name_valid = df.loc[(df['origin']=='test'),"Well Name"]

In [71]:
clf.fit(xtrain,ytrain)


Out[71]:
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=10, max_features=0.1,
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=25, min_samples_split=50,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [72]:
preds = clf.predict(xvalid.values)

In [73]:
well = "CRAWFORD"
depth = xvalid.loc[well_name_valid== well ,"Depth"]
predictions = pd.Series(preds).loc[well_name_valid==well]
plt.plot(depth,predictions)
plt.axis([2950,3175, 1, 9])
plt.grid(b=True, which='major', color='r', linestyle='--')
plt.show()



In [74]:
well = "STUART"
depth = xvalid.loc[well_name_valid== well ,"Depth"]
predictions = pd.Series(preds).loc[well_name_valid==well]
plt.plot(depth,predictions)
plt.axis([2800,3050, 1, 9])
plt.grid(b=True, which='major', color='r', linestyle='--')
plt.show()



In [75]:
xvalid['Facies']=preds
xvalid.to_csv('XmasPreds_3.csv')

In [76]:
test_1 = pd.read_csv('XmasPreds_1.csv')["Facies"]
test_3 = pd.read_csv('XmasPreds_3.csv')["Facies"]

In [77]:
(test_1==test_3).describe()


Out[77]:
count      830
unique       2
top       True
freq       645
Name: Facies, dtype: object