Facies classification using Machine Learning

Bird Team: PG+AC


In [1]:
%matplotlib inline
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score,LeavePGroupsOut, LeaveOneGroupOut, cross_val_predict
from sklearn.metrics import confusion_matrix, make_scorer, f1_score, accuracy_score, recall_score, precision_score
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable
pd.options.mode.chained_assignment = None

In [2]:
filename = '../facies_vectors.csv'
training_data = pd.read_csv(filename)
print(set(training_data["Well Name"]))
training_data.head()


set(['SHRIMPLIN', 'Recruit F9', 'ALEXANDER D', 'SHANKLE', 'CHURCHMAN BIBLE', 'NOLAN', 'KIMZEY A', 'NEWBY', 'LUKE G U', 'CROSS H CATTLE'])
Out[2]:
Facies Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS
0 3 A1 SH SHRIMPLIN 2793.0 77.45 0.664 9.9 11.915 4.6 1 1.000
1 3 A1 SH SHRIMPLIN 2793.5 78.26 0.661 14.2 12.565 4.1 1 0.979
2 3 A1 SH SHRIMPLIN 2794.0 79.05 0.658 14.8 13.050 3.6 1 0.957
3 3 A1 SH SHRIMPLIN 2794.5 86.10 0.655 13.9 13.115 3.5 1 0.936
4 3 A1 SH SHRIMPLIN 2795.0 74.58 0.647 13.5 13.300 3.4 1 0.915

In [3]:
well_data = pd.read_csv('./../validation_data_nofacies.csv')
print(set(well_data["Well Name"]))
print(well_data.shape)
well_data.head()


set(['CRAWFORD', 'STUART'])
(830, 10)
Out[3]:
Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS
0 A1 SH STUART 2808.0 66.276 0.630 3.3 10.65 3.591 1 1.000
1 A1 SH STUART 2808.5 77.252 0.585 6.5 11.95 3.341 1 0.978
2 A1 SH STUART 2809.0 82.899 0.566 9.4 13.60 3.064 1 0.956
3 A1 SH STUART 2809.5 80.671 0.593 9.5 13.25 2.977 1 0.933
4 A1 SH STUART 2810.0 75.971 0.638 8.7 12.35 3.020 1 0.911

In [4]:
# concat train and test for processing 
well_data["origin"] = 'test'
training_data["origin"] = 'train'
df = pd.concat([well_data,training_data],axis=0,ignore_index=True)[list(training_data.columns)]
df['Well Name'] = df['Well Name'].astype('category')
df.head(10)


Out[4]:
Facies Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS origin
0 NaN A1 SH STUART 2808.0 66.276 0.630 3.3 10.65 3.591 1 1.000 test
1 NaN A1 SH STUART 2808.5 77.252 0.585 6.5 11.95 3.341 1 0.978 test
2 NaN A1 SH STUART 2809.0 82.899 0.566 9.4 13.60 3.064 1 0.956 test
3 NaN A1 SH STUART 2809.5 80.671 0.593 9.5 13.25 2.977 1 0.933 test
4 NaN A1 SH STUART 2810.0 75.971 0.638 8.7 12.35 3.020 1 0.911 test
5 NaN A1 SH STUART 2810.5 73.955 0.667 6.9 12.25 3.086 1 0.889 test
6 NaN A1 SH STUART 2811.0 77.962 0.674 6.5 12.45 3.092 1 0.867 test
7 NaN A1 SH STUART 2811.5 83.894 0.667 6.3 12.65 3.123 1 0.844 test
8 NaN A1 SH STUART 2812.0 84.424 0.653 6.7 13.05 3.121 1 0.822 test
9 NaN A1 SH STUART 2812.5 83.160 0.642 7.3 12.95 3.127 1 0.800 test

In [5]:
# add some features based on the well data. 

# nb points : can be correlated with how soft soil is ? 
print("session")
sessionsize = df.groupby(["Well Name",'Formation']).size().reset_index()
sessionsize.columns =  ["Well Name",'Formation','formation_size']
df = pd.merge(df,sessionsize,how='left',on = ["Well Name",'Formation'])

# depth : 
print("depth")
sessionsize = df.groupby(["Well Name",'Formation'])["Depth"].min().reset_index()
sessionsize.columns =  ["Well Name",'Formation','minimum_depth']
df = pd.merge(df,sessionsize,how='left',on = ["Well Name",'Formation'])

sessionsize = df.groupby(["Well Name",'Formation'])["Depth"].max().reset_index()
sessionsize.columns =  ["Well Name",'Formation','maximum_depth']
df = pd.merge(df,sessionsize,how='left',on = ["Well Name",'Formation'])

df['formation_depth'] = df["maximum_depth"] - df["minimum_depth"]

df["soft_indic"] = df['formation_depth'] / df["formation_size"]

# add avgs of feat
print("add avgs of feat")
list_to_avg = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_avg : 
    df[val + "_min"] = df.groupby(["Well Name",'Formation'])[val].transform(np.min)
    df[val + "_max"] = df.groupby(["Well Name",'Formation'])[val].transform(np.max)
    df[val + "_mean"] = df.groupby(["Well Name",'Formation'])[val].transform(np.mean)
    df[val + "_var"] = df.groupby(["Well Name",'Formation'])[val].transform(np.var) 

# add distances feat. = an attempt at regulariation.
print("add distances feat.")
for val in list_to_avg : 
    df[val + "_min_dist"] = df[val] -df[val + "_min"]
    df[val + "_max_dist"] =  df[val] -df[val + "_max"]
    df[val + "_mean_dist"] =  df[val] -df[val + "_mean"]
    
# add lag and lead !
print("lag lead")
list_to_lag = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_lag:
    for lag in range(1,11):
        df[val+'_lag_'+str(lag)]=df[val]-df.groupby("Well Name")[val].shift(periods=lag)
        df[val+'_lead_'+str(lag)]=df[val]-df.groupby("Well Name")[val].shift(periods=-lag)

# adding some Formation lag and lead. 
for lag in range(1,3):
    df['Formation'+'_lag_'+str(lag)]=df.groupby("Well Name")['Formation'].shift(periods=lag)
    df['Formation'+'_lead_'+str(lag)]=df.groupby("Well Name")['Formation'].shift(periods=-lag)
    df['Formation'+'_lag_'+str(lag) + 'equal'] = (df['Formation'+'_lag_'+str(lag)] == df["Formation"]).astype(int)
    df['Formation'+'_lead_'+str(lag) + 'equal'] = (df['Formation'+'_lead_'+str(lag)] == df["Formation"]).astype(int) 

print("rolling")
#Add rolling features
list_to_roll = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M','RELPOS']
window_size = [5,10,15,20,50]
for w in window_size:
    for val in list_to_roll:
        df[val+'_rollingmean_'+str(w)]=df.groupby("Well Name")[val].apply(
            lambda x:x.rolling(window=w,center=True).mean())
        df[val+'_rollingmax_'+str(w)]=df.groupby("Well Name")[val].apply(
            lambda x:x.rolling(window=w,center=True).max())
        df[val+'_rollingmin_'+str(w)]=df.groupby("Well Name")[val].apply(
            lambda x:x.rolling(window=w,center=True).min())
        df[val+'_rollingstd_'+str(w)]=df.groupby("Well Name")[val].apply(
            lambda x:x.rolling(window=w,center=True).std())
        
print("special window features for NM_M")
def NM_M_distance(x,how,target):
    length = len(x)
    rank = np.empty(length)
    count = -1
    NM_M = x["NM_M"].values
    if how=="up":
        order = range(length)
    elif how=="down":
        order = range(length-1,-1,-1)
    for i in order:
        if ((NM_M[i] != target) & (count>-1)):
            count+=1
            rank[i] += count
        elif NM_M[i] == target:
            count=0
        else:
            rank[i] = count
    rank = pd.DataFrame(rank.astype(int), columns=["NM_M_Rank_Target_+"+str(target)+"_"+how], index = x.index)
    return(rank)
df["NM_M_Rank_Target_1_up"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="up",target=1)
df["NM_M_Rank_Target_2_up"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="up",target=2)
df["NM_M_Rank_Target_1_down"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="down",target=1)
df["NM_M_Rank_Target_2_down"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="down",target=2)

print("filling na")
df = df.groupby(["Well Name"], as_index=False).apply(lambda group: group.bfill())
df = df.groupby(["Well Name"], as_index=False).apply(lambda group: group.ffill())
df = df.fillna(df.mean())

print("Vectorizing Formation text data")
from sklearn.feature_extraction.text import CountVectorizer
list_formation = ['Formation',
 'Formation_lag_1',
 'Formation_lead_1',
 'Formation_lag_2',
 'Formation_lead_2']
for l in list_formation:
    cv = CountVectorizer()
    counts = cv.fit_transform(df[l].values)
    cols = [c+"_"+l for c in cv.get_feature_names()]
    counts = pd.DataFrame(counts.toarray(),columns = cols)
    df = df.drop(l,axis = 1)
    df = pd.concat([df,counts],axis=1)

print("Finished preparing data. Now ready for ML ignition!")


session
depth
add avgs of feat
add distances feat.
lag lead
rolling
special window features for NM_M
filling na
Vectorizing Formation text data
Finished preparing data. Now ready for ML ignition!

In [189]:
#tokeep =['Facies','origin','Formation','Formation_lag_1','Formation_lead_1','Formation_lag_2','Formation_lead_2',
#         'Well Name','Depth','GR','ILD_log10','DeltaPHI','PHIND','PE','NM_M','RELPOS']
#nums = ['Depth','GR','ILD_log10','DeltaPHI','PHIND','PE','NM_M','RELPOS']
#tokeep = tokeep + [x+'_lag_1' for x in nums] +[x+'_lead_1' for x in nums]
#df = df[tokeep]

CV performance


In [6]:
f1_micro = make_scorer(f1_score, average='micro')

In [7]:
# this time let's use all the training set 
groups = df[(df['origin']=='train')]["Well Name"]
ytrain = df[(df['origin']=='train')]['Facies']
yvalid = df[(df['origin']=='test')]['Facies']
xtrain = df[(df['origin']=='train')].drop(['Well Name','origin','Facies'],axis=1)
xvalid = df[(df['origin']=='test')].drop(['Well Name','origin','Facies'],axis=1)
custom_cv=LeaveOneGroupOut().split(xtrain, ytrain, groups)

In [8]:
clf_rfe = RandomForestClassifier(
    n_estimators=100,
    criterion="entropy",
    class_weight='balanced',
) 
clf_final = RandomForestClassifier(
    n_estimators=100,
    criterion="entropy",
    class_weight='balanced',
    min_samples_leaf=5,
    min_samples_split=25,
    max_features=10,
    n_jobs=-1
)
pipe = Pipeline([
  ('fs', RFECV(clf_rfe,
               cv=custom_cv,
               scoring=f1_micro,
               step=0.1,
               verbose=2)),
  ('cl', clf_final)
])

In [9]:
pipe.fit(xtrain, ytrain)


Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Out[9]:
Pipeline(steps=[('fs', RFECV(cv=<generator object split at 0x11518e640>,
   estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='entropy', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1..._jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [151]:
y_pred = pipe.predict(xtrain)
print(classification_report(ytrain, y_pred))
print(f1_score(ytrain, y_pred, average="micro"))


             precision    recall  f1-score   support

        1.0       0.88      1.00      0.93       268
        2.0       0.95      0.91      0.93       940
        3.0       0.94      0.93      0.93       780
        4.0       0.79      0.94      0.86       271
        5.0       0.87      0.86      0.87       296
        6.0       0.90      0.79      0.84       582
        7.0       0.76      0.98      0.85       141
        8.0       0.91      0.87      0.89       686
        9.0       0.94      0.98      0.96       185

avg / total       0.91      0.90      0.90      4149

0.902868161003

In [11]:
list(feature[support])


Out[11]:
['Depth',
 'GR',
 'ILD_log10',
 'DeltaPHI',
 'PHIND',
 'PE',
 'NM_M',
 'RELPOS',
 'formation_size',
 'formation_depth',
 'soft_indic',
 'Depth_var',
 'GR_min',
 'GR_max',
 'GR_var',
 'ILD_log10_min',
 'ILD_log10_max',
 'ILD_log10_mean',
 'ILD_log10_var',
 'DeltaPHI_var',
 'PHIND_min',
 'PHIND_mean',
 'PE_min',
 'PE_max',
 'PE_mean',
 'NM_M_max',
 'NM_M_mean',
 'RELPOS_mean',
 'RELPOS_var',
 'Depth_min_dist',
 'Depth_max_dist',
 'Depth_mean_dist',
 'GR_min_dist',
 'GR_max_dist',
 'GR_mean_dist',
 'ILD_log10_min_dist',
 'ILD_log10_max_dist',
 'ILD_log10_mean_dist',
 'DeltaPHI_min_dist',
 'DeltaPHI_mean_dist',
 'PHIND_min_dist',
 'PHIND_max_dist',
 'PHIND_mean_dist',
 'RELPOS_min_dist',
 'RELPOS_max_dist',
 'RELPOS_mean_dist',
 'GR_lead_4',
 'GR_lead_5',
 'GR_lag_6',
 'GR_lead_6',
 'GR_lag_7',
 'GR_lead_7',
 'GR_lead_8',
 'GR_lead_9',
 'GR_lag_10',
 'GR_lead_10',
 'ILD_log10_lead_6',
 'ILD_log10_lead_7',
 'ILD_log10_lag_8',
 'ILD_log10_lead_8',
 'ILD_log10_lag_9',
 'ILD_log10_lead_9',
 'ILD_log10_lag_10',
 'ILD_log10_lead_10',
 'DeltaPHI_lead_9',
 'PHIND_lag_6',
 'PHIND_lead_6',
 'PHIND_lead_7',
 'PHIND_lead_8',
 'PHIND_lead_9',
 'PHIND_lag_10',
 'PHIND_lead_10',
 'RELPOS_lead_1',
 'RELPOS_lag_2',
 'RELPOS_lead_2',
 'RELPOS_lag_3',
 'RELPOS_lead_3',
 'RELPOS_lag_4',
 'RELPOS_lead_4',
 'RELPOS_lag_5',
 'RELPOS_lead_5',
 'RELPOS_lead_6',
 'RELPOS_lag_7',
 'RELPOS_lead_7',
 'RELPOS_lead_8',
 'RELPOS_lead_9',
 'RELPOS_lead_10',
 'Depth_rollingmean_5',
 'Depth_rollingmax_5',
 'Depth_rollingmin_5',
 'GR_rollingmean_5',
 'GR_rollingmax_5',
 'GR_rollingmin_5',
 'ILD_log10_rollingmean_5',
 'ILD_log10_rollingmax_5',
 'ILD_log10_rollingmin_5',
 'DeltaPHI_rollingmean_5',
 'DeltaPHI_rollingmax_5',
 'DeltaPHI_rollingmin_5',
 'PHIND_rollingmean_5',
 'PHIND_rollingmax_5',
 'PHIND_rollingmin_5',
 'PE_rollingmean_5',
 'PE_rollingmax_5',
 'PE_rollingmin_5',
 'NM_M_rollingmean_5',
 'NM_M_rollingmax_5',
 'NM_M_rollingmin_5',
 'RELPOS_rollingmean_5',
 'RELPOS_rollingmax_5',
 'RELPOS_rollingmin_5',
 'RELPOS_rollingstd_5',
 'Depth_rollingmean_10',
 'Depth_rollingmax_10',
 'Depth_rollingmin_10',
 'GR_rollingmean_10',
 'GR_rollingmax_10',
 'GR_rollingmin_10',
 'ILD_log10_rollingmean_10',
 'ILD_log10_rollingmax_10',
 'ILD_log10_rollingmin_10',
 'DeltaPHI_rollingmean_10',
 'DeltaPHI_rollingmax_10',
 'DeltaPHI_rollingmin_10',
 'DeltaPHI_rollingstd_10',
 'PHIND_rollingmean_10',
 'PHIND_rollingmax_10',
 'PHIND_rollingmin_10',
 'PE_rollingmean_10',
 'PE_rollingmax_10',
 'PE_rollingmin_10',
 'NM_M_rollingmean_10',
 'NM_M_rollingmin_10',
 'RELPOS_rollingmean_10',
 'RELPOS_rollingmax_10',
 'RELPOS_rollingmin_10',
 'RELPOS_rollingstd_10',
 'Depth_rollingmean_15',
 'Depth_rollingmax_15',
 'Depth_rollingmin_15',
 'GR_rollingmean_15',
 'GR_rollingmin_15',
 'ILD_log10_rollingmean_15',
 'ILD_log10_rollingmax_15',
 'ILD_log10_rollingmin_15',
 'ILD_log10_rollingstd_15',
 'DeltaPHI_rollingmean_15',
 'DeltaPHI_rollingmax_15',
 'DeltaPHI_rollingmin_15',
 'PHIND_rollingmean_15',
 'PHIND_rollingmax_15',
 'PHIND_rollingstd_15',
 'PE_rollingmean_15',
 'PE_rollingmax_15',
 'PE_rollingmin_15',
 'NM_M_rollingmean_15',
 'NM_M_rollingmax_15',
 'RELPOS_rollingmean_15',
 'RELPOS_rollingmax_15',
 'RELPOS_rollingmin_15',
 'RELPOS_rollingstd_15',
 'Depth_rollingmean_20',
 'Depth_rollingmax_20',
 'Depth_rollingmin_20',
 'GR_rollingmean_20',
 'GR_rollingmax_20',
 'GR_rollingmin_20',
 'ILD_log10_rollingmean_20',
 'ILD_log10_rollingmin_20',
 'ILD_log10_rollingstd_20',
 'DeltaPHI_rollingmean_20',
 'DeltaPHI_rollingmax_20',
 'DeltaPHI_rollingmin_20',
 'DeltaPHI_rollingstd_20',
 'PHIND_rollingmean_20',
 'PHIND_rollingmax_20',
 'PHIND_rollingstd_20',
 'PE_rollingmean_20',
 'PE_rollingmax_20',
 'PE_rollingmin_20',
 'PE_rollingstd_20',
 'NM_M_rollingmean_20',
 'RELPOS_rollingmean_20',
 'RELPOS_rollingmax_20',
 'RELPOS_rollingmin_20',
 'RELPOS_rollingstd_20',
 'Depth_rollingmean_50',
 'Depth_rollingmax_50',
 'Depth_rollingmin_50',
 'GR_rollingmean_50',
 'GR_rollingmin_50',
 'GR_rollingstd_50',
 'ILD_log10_rollingmean_50',
 'ILD_log10_rollingmin_50',
 'ILD_log10_rollingstd_50',
 'DeltaPHI_rollingmean_50',
 'DeltaPHI_rollingstd_50',
 'PHIND_rollingmean_50',
 'PHIND_rollingmax_50',
 'PHIND_rollingstd_50',
 'PE_rollingmean_50',
 'PE_rollingmax_50',
 'PE_rollingmin_50',
 'PE_rollingstd_50',
 'NM_M_rollingmean_50',
 'NM_M_rollingstd_50',
 'RELPOS_rollingmean_50',
 'RELPOS_rollingstd_50',
 'NM_M_Rank_Target_1_up',
 'NM_M_Rank_Target_2_up',
 'NM_M_Rank_Target_1_down',
 'NM_M_Rank_Target_2_down',
 u'lm_Formation',
 u'sh_Formation',
 u'lm_Formation_lag_1',
 u'sh_Formation_lag_1',
 u'lm_Formation_lead_1',
 u'sh_Formation_lead_1',
 u'lm_Formation_lag_2',
 u'sh_Formation_lag_2',
 u'lm_Formation_lead_2',
 u'sh_Formation_lead_2']

In [10]:
importances = pipe.named_steps["cl"].feature_importances_
support = pipe.named_steps["fs"].support_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
feature = pd.Series(xtrain.columns.values)
selected_features = list(feature[support])
for f in range(len(selected_features)):
    print("%d. feature %d %s (%f)" % (f + 1, indices[f], 
                                      selected_features[indices[f]], 
                                      importances[indices[f]]))


Feature ranking:
1. feature 26 NM_M_mean (0.027817)
2. feature 105 NM_M_rollingmean_5 (0.024449)
3. feature 211 NM_M_Rank_Target_2_down (0.022147)
4. feature 217 sh_Formation_lead_1 (0.019817)
5. feature 21 PHIND_mean (0.018843)
6. feature 214 lm_Formation_lag_1 (0.017985)
7. feature 12 GR_min (0.015142)
8. feature 131 NM_M_rollingmean_10 (0.014155)
9. feature 155 NM_M_rollingmean_15 (0.014047)
10. feature 212 lm_Formation (0.013508)
11. feature 181 NM_M_rollingmean_20 (0.012736)
12. feature 201 PE_rollingmax_50 (0.012687)
13. feature 213 sh_Formation (0.012215)
14. feature 90 GR_rollingmean_5 (0.011509)
15. feature 107 NM_M_rollingmin_5 (0.011475)
16. feature 6 NM_M (0.010877)
17. feature 200 PE_rollingmean_50 (0.010828)
18. feature 208 NM_M_Rank_Target_1_up (0.009528)
19. feature 32 GR_min_dist (0.009182)
20. feature 92 GR_rollingmin_5 (0.009155)
21. feature 221 sh_Formation_lead_2 (0.009107)
22. feature 216 lm_Formation_lead_1 (0.009057)
23. feature 220 lm_Formation_lead_2 (0.008814)
24. feature 178 PE_rollingmax_20 (0.008588)
25. feature 1 GR (0.008296)
26. feature 132 NM_M_rollingmin_10 (0.008145)
27. feature 102 PE_rollingmean_5 (0.007794)
28. feature 153 PE_rollingmax_15 (0.007584)
29. feature 215 sh_Formation_lag_1 (0.007484)
30. feature 24 PE_mean (0.007383)
31. feature 136 RELPOS_rollingstd_10 (0.007274)
32. feature 5 PE (0.007194)
33. feature 128 PE_rollingmean_10 (0.007115)
34. feature 111 RELPOS_rollingstd_5 (0.006994)
35. feature 185 RELPOS_rollingstd_20 (0.006989)
36. feature 152 PE_rollingmean_15 (0.006938)
37. feature 23 PE_max (0.006844)
38. feature 94 ILD_log10_rollingmax_5 (0.006819)
39. feature 160 RELPOS_rollingstd_15 (0.006639)
40. feature 14 GR_var (0.006637)
41. feature 115 GR_rollingmean_10 (0.006592)
42. feature 117 GR_rollingmin_10 (0.006554)
43. feature 168 ILD_log10_rollingmin_20 (0.006457)
44. feature 20 PHIND_min (0.006103)
45. feature 120 ILD_log10_rollingmin_10 (0.005956)
46. feature 104 PE_rollingmin_5 (0.005954)
47. feature 95 ILD_log10_rollingmin_5 (0.005948)
48. feature 18 ILD_log10_var (0.005905)
49. feature 209 NM_M_Rank_Target_2_up (0.005767)
50. feature 140 GR_rollingmean_15 (0.005747)
51. feature 99 PHIND_rollingmean_5 (0.005722)
52. feature 93 ILD_log10_rollingmean_5 (0.005482)
53. feature 192 ILD_log10_rollingmean_50 (0.005450)
54. feature 25 NM_M_max (0.005447)
55. feature 129 PE_rollingmax_10 (0.005330)
56. feature 121 DeltaPHI_rollingmean_10 (0.005311)
57. feature 17 ILD_log10_mean (0.005179)
58. feature 2 ILD_log10 (0.005156)
59. feature 98 DeltaPHI_rollingmin_5 (0.005139)
60. feature 177 PE_rollingmean_20 (0.005118)
61. feature 204 NM_M_rollingmean_50 (0.004964)
62. feature 100 PHIND_rollingmax_5 (0.004915)
63. feature 96 DeltaPHI_rollingmean_5 (0.004814)
64. feature 15 ILD_log10_min (0.004701)
65. feature 167 ILD_log10_rollingmean_20 (0.004688)
66. feature 187 Depth_rollingmax_50 (0.004684)
67. feature 101 PHIND_rollingmin_5 (0.004598)
68. feature 97 DeltaPHI_rollingmax_5 (0.004589)
69. feature 142 ILD_log10_rollingmean_15 (0.004486)
70. feature 193 ILD_log10_rollingmin_50 (0.004458)
71. feature 186 Depth_rollingmean_50 (0.004416)
72. feature 103 PE_rollingmax_5 (0.004386)
73. feature 22 PE_min (0.004372)
74. feature 164 GR_rollingmean_20 (0.004351)
75. feature 138 Depth_rollingmax_15 (0.004350)
76. feature 4 PHIND (0.004308)
77. feature 180 PE_rollingstd_20 (0.004193)
78. feature 125 PHIND_rollingmean_10 (0.004157)
79. feature 40 PHIND_min_dist (0.004087)
80. feature 130 PE_rollingmin_10 (0.003897)
81. feature 205 NM_M_rollingstd_50 (0.003892)
82. feature 144 ILD_log10_rollingmin_15 (0.003890)
83. feature 91 GR_rollingmax_5 (0.003880)
84. feature 191 GR_rollingstd_50 (0.003873)
85. feature 146 DeltaPHI_rollingmean_15 (0.003858)
86. feature 175 PHIND_rollingmax_20 (0.003843)
87. feature 134 RELPOS_rollingmax_10 (0.003830)
88. feature 76 RELPOS_lead_3 (0.003822)
89. feature 74 RELPOS_lead_2 (0.003723)
90. feature 202 PE_rollingmin_50 (0.003717)
91. feature 119 ILD_log10_rollingmax_10 (0.003683)
92. feature 118 ILD_log10_rollingmean_10 (0.003657)
93. feature 3 DeltaPHI (0.003643)
94. feature 29 Depth_min_dist (0.003608)
95. feature 80 RELPOS_lead_5 (0.003607)
96. feature 9 formation_depth (0.003565)
97. feature 75 RELPOS_lag_3 (0.003528)
98. feature 210 NM_M_Rank_Target_1_down (0.003514)
99. feature 38 DeltaPHI_min_dist (0.003497)
100. feature 27 RELPOS_mean (0.003483)
101. feature 141 GR_rollingmin_15 (0.003471)
102. feature 109 RELPOS_rollingmax_5 (0.003463)
103. feature 33 GR_max_dist (0.003428)
104. feature 147 DeltaPHI_rollingmax_15 (0.003340)
105. feature 161 Depth_rollingmean_20 (0.003318)
106. feature 78 RELPOS_lead_4 (0.003310)
107. feature 122 DeltaPHI_rollingmax_10 (0.003209)
108. feature 170 DeltaPHI_rollingmean_20 (0.003198)
109. feature 8 formation_size (0.003171)
110. feature 188 Depth_rollingmin_50 (0.003154)
111. feature 89 Depth_rollingmin_5 (0.003141)
112. feature 0 Depth (0.003129)
113. feature 218 lm_Formation_lag_2 (0.003039)
114. feature 207 RELPOS_rollingstd_50 (0.003035)
115. feature 11 Depth_var (0.003033)
116. feature 63 ILD_log10_lead_10 (0.003032)
117. feature 123 DeltaPHI_rollingmin_10 (0.003025)
118. feature 87 Depth_rollingmean_5 (0.002995)
119. feature 88 Depth_rollingmax_5 (0.002978)
120. feature 154 PE_rollingmin_15 (0.002944)
121. feature 114 Depth_rollingmin_10 (0.002932)
122. feature 73 RELPOS_lag_2 (0.002921)
123. feature 176 PHIND_rollingstd_20 (0.002911)
124. feature 13 GR_max (0.002856)
125. feature 55 GR_lead_10 (0.002839)
126. feature 179 PE_rollingmin_20 (0.002814)
127. feature 143 ILD_log10_rollingmax_15 (0.002810)
128. feature 43 RELPOS_min_dist (0.002788)
129. feature 112 Depth_rollingmean_10 (0.002764)
130. feature 126 PHIND_rollingmax_10 (0.002762)
131. feature 189 GR_rollingmean_50 (0.002742)
132. feature 77 RELPOS_lag_4 (0.002730)
133. feature 162 Depth_rollingmax_20 (0.002723)
134. feature 16 ILD_log10_max (0.002714)
135. feature 36 ILD_log10_max_dist (0.002667)
136. feature 110 RELPOS_rollingmin_5 (0.002655)
137. feature 81 RELPOS_lead_6 (0.002652)
138. feature 139 Depth_rollingmin_15 (0.002590)
139. feature 61 ILD_log10_lead_9 (0.002589)
140. feature 184 RELPOS_rollingmin_20 (0.002570)
141. feature 59 ILD_log10_lead_8 (0.002539)
142. feature 166 GR_rollingmin_20 (0.002496)
143. feature 31 Depth_mean_dist (0.002484)
144. feature 171 DeltaPHI_rollingmax_20 (0.002434)
145. feature 37 ILD_log10_mean_dist (0.002414)
146. feature 149 PHIND_rollingmean_15 (0.002403)
147. feature 158 RELPOS_rollingmax_15 (0.002393)
148. feature 28 RELPOS_var (0.002386)
149. feature 44 RELPOS_max_dist (0.002383)
150. feature 10 soft_indic (0.002366)
151. feature 113 Depth_rollingmax_10 (0.002365)
152. feature 163 Depth_rollingmin_20 (0.002353)
153. feature 45 RELPOS_mean_dist (0.002338)
154. feature 190 GR_rollingmin_50 (0.002334)
155. feature 85 RELPOS_lead_9 (0.002284)
156. feature 51 GR_lead_7 (0.002256)
157. feature 7 RELPOS (0.002210)
158. feature 68 PHIND_lead_8 (0.002210)
159. feature 165 GR_rollingmax_20 (0.002205)
160. feature 106 NM_M_rollingmax_5 (0.002202)
161. feature 150 PHIND_rollingmax_15 (0.002199)
162. feature 148 DeltaPHI_rollingmin_15 (0.002198)
163. feature 62 ILD_log10_lag_10 (0.002180)
164. feature 157 RELPOS_rollingmean_15 (0.002176)
165. feature 199 PHIND_rollingstd_50 (0.002176)
166. feature 42 PHIND_mean_dist (0.002158)
167. feature 203 PE_rollingstd_50 (0.002142)
168. feature 52 GR_lead_8 (0.002138)
169. feature 53 GR_lead_9 (0.002131)
170. feature 172 DeltaPHI_rollingmin_20 (0.002126)
171. feature 30 Depth_max_dist (0.002125)
172. feature 86 RELPOS_lead_10 (0.002110)
173. feature 182 RELPOS_rollingmean_20 (0.002047)
174. feature 127 PHIND_rollingmin_10 (0.002033)
175. feature 60 ILD_log10_lag_9 (0.002029)
176. feature 19 DeltaPHI_var (0.002000)
177. feature 174 PHIND_rollingmean_20 (0.001987)
178. feature 156 NM_M_rollingmax_15 (0.001985)
179. feature 183 RELPOS_rollingmax_20 (0.001945)
180. feature 34 GR_mean_dist (0.001940)
181. feature 137 Depth_rollingmean_15 (0.001922)
182. feature 67 PHIND_lead_7 (0.001920)
183. feature 194 ILD_log10_rollingstd_50 (0.001913)
184. feature 65 PHIND_lag_6 (0.001896)
185. feature 72 RELPOS_lead_1 (0.001873)
186. feature 159 RELPOS_rollingmin_15 (0.001855)
187. feature 57 ILD_log10_lead_7 (0.001850)
188. feature 197 PHIND_rollingmean_50 (0.001837)
189. feature 135 RELPOS_rollingmin_10 (0.001836)
190. feature 47 GR_lead_5 (0.001835)
191. feature 195 DeltaPHI_rollingmean_50 (0.001825)
192. feature 196 DeltaPHI_rollingstd_50 (0.001758)
193. feature 49 GR_lead_6 (0.001739)
194. feature 79 RELPOS_lag_5 (0.001737)
195. feature 124 DeltaPHI_rollingstd_10 (0.001717)
196. feature 173 DeltaPHI_rollingstd_20 (0.001698)
197. feature 206 RELPOS_rollingmean_50 (0.001695)
198. feature 66 PHIND_lead_6 (0.001656)
199. feature 35 ILD_log10_min_dist (0.001602)
200. feature 133 RELPOS_rollingmean_10 (0.001592)
201. feature 50 GR_lag_7 (0.001541)
202. feature 116 GR_rollingmax_10 (0.001540)
203. feature 84 RELPOS_lead_8 (0.001534)
204. feature 151 PHIND_rollingstd_15 (0.001507)
205. feature 71 PHIND_lead_10 (0.001486)
206. feature 54 GR_lag_10 (0.001474)
207. feature 83 RELPOS_lead_7 (0.001450)
208. feature 70 PHIND_lag_10 (0.001445)
209. feature 41 PHIND_max_dist (0.001414)
210. feature 82 RELPOS_lag_7 (0.001373)
211. feature 39 DeltaPHI_mean_dist (0.001357)
212. feature 48 GR_lag_6 (0.001336)
213. feature 69 PHIND_lead_9 (0.001303)
214. feature 108 RELPOS_rollingmean_5 (0.001284)
215. feature 64 DeltaPHI_lead_9 (0.001279)
216. feature 169 ILD_log10_rollingstd_20 (0.001271)
217. feature 56 ILD_log10_lead_6 (0.001250)
218. feature 219 sh_Formation_lag_2 (0.001219)
219. feature 198 PHIND_rollingmax_50 (0.001122)
220. feature 145 ILD_log10_rollingstd_15 (0.001117)
221. feature 46 GR_lead_4 (0.001102)
222. feature 58 ILD_log10_lag_8 (0.000869)

Apply to test


In [163]:
# this time let's use all the training set 
ytrain = df[(df['origin']=='train')]['Facies']
yvalid = df[(df['origin']=='test')]['Facies']
xtrain = df[(df['origin']=='train')].drop(['Well Name','origin','Facies'],axis=1)
xvalid = df[(df['origin']=='test')].drop(['Well Name','origin','Facies'],axis=1)
well_name_valid = df.loc[(df['origin']=='test'),"Well Name"]

In [ ]:
#pipe.fit(xtrain,ytrain)

In [164]:
preds = pipe.predict(xvalid.values)

In [165]:
well = "CRAWFORD"
depth = xvalid.loc[well_name_valid== well ,"Depth"]
predictions = pd.Series(preds).loc[well_name_valid==well]
plt.plot(depth,predictions)
plt.axis([2950,3175, 1, 9])
plt.grid(b=True, which='major', color='r', linestyle='--')
plt.show()



In [166]:
well = "STUART"
depth = xvalid.loc[well_name_valid== well ,"Depth"]
predictions = pd.Series(preds).loc[well_name_valid==well]
plt.plot(depth,predictions)
plt.axis([2800,3050, 1, 9])
plt.grid(b=True, which='major', color='r', linestyle='--')
plt.show()



In [167]:
xvalid['Facies']=preds
xvalid.to_csv('XmasPreds_4.csv')