Facies classification using Machine Learning

Bird Team: PG+AC


In [44]:
%matplotlib inline
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict, LeaveOneGroupOut, LeavePGroupsOut
from sklearn.metrics import confusion_matrix, make_scorer, f1_score, accuracy_score, recall_score, precision_score
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.base import clone
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable
pd.options.mode.chained_assignment = None

In [19]:
filename = '../facies_vectors.csv'
training_data = pd.read_csv(filename)
print(set(training_data["Well Name"]))
training_data.head()


set(['SHRIMPLIN', 'Recruit F9', 'ALEXANDER D', 'SHANKLE', 'CHURCHMAN BIBLE', 'NOLAN', 'KIMZEY A', 'NEWBY', 'LUKE G U', 'CROSS H CATTLE'])
Out[19]:
Facies Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS
0 3 A1 SH SHRIMPLIN 2793.0 77.45 0.664 9.9 11.915 4.6 1 1.000
1 3 A1 SH SHRIMPLIN 2793.5 78.26 0.661 14.2 12.565 4.1 1 0.979
2 3 A1 SH SHRIMPLIN 2794.0 79.05 0.658 14.8 13.050 3.6 1 0.957
3 3 A1 SH SHRIMPLIN 2794.5 86.10 0.655 13.9 13.115 3.5 1 0.936
4 3 A1 SH SHRIMPLIN 2795.0 74.58 0.647 13.5 13.300 3.4 1 0.915

In [20]:
well_data = pd.read_csv('./../validation_data_nofacies.csv')
print(set(well_data["Well Name"]))
print(well_data.shape)
well_data.head()


set(['CRAWFORD', 'STUART'])
(830, 10)
Out[20]:
Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS
0 A1 SH STUART 2808.0 66.276 0.630 3.3 10.65 3.591 1 1.000
1 A1 SH STUART 2808.5 77.252 0.585 6.5 11.95 3.341 1 0.978
2 A1 SH STUART 2809.0 82.899 0.566 9.4 13.60 3.064 1 0.956
3 A1 SH STUART 2809.5 80.671 0.593 9.5 13.25 2.977 1 0.933
4 A1 SH STUART 2810.0 75.971 0.638 8.7 12.35 3.020 1 0.911

In [21]:
# concat train and test for processing 
well_data["origin"] = 'test'
training_data["origin"] = 'train'
df = pd.concat([well_data,training_data],axis=0,ignore_index=True)[list(training_data.columns)]
df['Well Name'] = df['Well Name'].astype('category')
df.head(10)


Out[21]:
Facies Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS origin
0 NaN A1 SH STUART 2808.0 66.276 0.630 3.3 10.65 3.591 1 1.000 test
1 NaN A1 SH STUART 2808.5 77.252 0.585 6.5 11.95 3.341 1 0.978 test
2 NaN A1 SH STUART 2809.0 82.899 0.566 9.4 13.60 3.064 1 0.956 test
3 NaN A1 SH STUART 2809.5 80.671 0.593 9.5 13.25 2.977 1 0.933 test
4 NaN A1 SH STUART 2810.0 75.971 0.638 8.7 12.35 3.020 1 0.911 test
5 NaN A1 SH STUART 2810.5 73.955 0.667 6.9 12.25 3.086 1 0.889 test
6 NaN A1 SH STUART 2811.0 77.962 0.674 6.5 12.45 3.092 1 0.867 test
7 NaN A1 SH STUART 2811.5 83.894 0.667 6.3 12.65 3.123 1 0.844 test
8 NaN A1 SH STUART 2812.0 84.424 0.653 6.7 13.05 3.121 1 0.822 test
9 NaN A1 SH STUART 2812.5 83.160 0.642 7.3 12.95 3.127 1 0.800 test

In [22]:
# add some features based on the well data. 

# nb points : can be correlated with how soft soil is ? 
print("session")
sessionsize = df.groupby(["Well Name",'Formation']).size().reset_index()
sessionsize.columns =  ["Well Name",'Formation','formation_size']
df = pd.merge(df,sessionsize,how='left',on = ["Well Name",'Formation'])

# depth : 
print("depth")
sessionsize = df.groupby(["Well Name",'Formation'])["Depth"].min().reset_index()
sessionsize.columns =  ["Well Name",'Formation','minimum_depth']
df = pd.merge(df,sessionsize,how='left',on = ["Well Name",'Formation'])

sessionsize = df.groupby(["Well Name",'Formation'])["Depth"].max().reset_index()
sessionsize.columns =  ["Well Name",'Formation','maximum_depth']
df = pd.merge(df,sessionsize,how='left',on = ["Well Name",'Formation'])

df['formation_depth'] = df["maximum_depth"] - df["minimum_depth"]

df["soft_indic"] = df['formation_depth'] / df["formation_size"]

# add avgs of feat
print("add avgs of feat")
list_to_avg = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_avg : 
    df[val + "_min"] = df.groupby(["Well Name",'Formation'])[val].transform(np.min)
    df[val + "_max"] = df.groupby(["Well Name",'Formation'])[val].transform(np.max)
    df[val + "_mean"] = df.groupby(["Well Name",'Formation'])[val].transform(np.mean)
    df[val + "_var"] = df.groupby(["Well Name",'Formation'])[val].transform(np.var) 

# add distances feat. = an attempt at regulariation.
print("add distances feat.")
for val in list_to_avg : 
    df[val + "_min_dist"] = df[val] -df[val + "_min"]
    df[val + "_max_dist"] =  df[val] -df[val + "_max"]
    df[val + "_mean_dist"] =  df[val] -df[val + "_mean"]
    
# add lag and lead !
print("lag lead")
list_to_lag = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_lag:
    for lag in range(1,11):
        df[val+'_lag_'+str(lag)]=df[val]-df.groupby("Well Name")[val].shift(periods=lag)
        df[val+'_lead_'+str(lag)]=df[val]-df.groupby("Well Name")[val].shift(periods=-lag)

# adding some Formation lag and lead. 
for lag in range(1,3):
    df['Formation'+'_lag_'+str(lag)]=df.groupby("Well Name")['Formation'].shift(periods=lag)
    df['Formation'+'_lead_'+str(lag)]=df.groupby("Well Name")['Formation'].shift(periods=-lag)
    df['Formation'+'_lag_'+str(lag) + 'equal'] = (df['Formation'+'_lag_'+str(lag)] == df["Formation"]).astype(int)
    df['Formation'+'_lead_'+str(lag) + 'equal'] = (df['Formation'+'_lead_'+str(lag)] == df["Formation"]).astype(int) 

print("rolling")
#Add rolling features
list_to_roll = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M','RELPOS']
window_size = [5,10,15,20,50]
for w in window_size:
    for val in list_to_roll:
        df[val+'_rollingmean_'+str(w)]=df.groupby("Well Name")[val].apply(
            lambda x:x.rolling(window=w,center=True).mean())
        df[val+'_rollingmax_'+str(w)]=df.groupby("Well Name")[val].apply(
            lambda x:x.rolling(window=w,center=True).max())
        df[val+'_rollingmin_'+str(w)]=df.groupby("Well Name")[val].apply(
            lambda x:x.rolling(window=w,center=True).min())
        df[val+'_rollingstd_'+str(w)]=df.groupby("Well Name")[val].apply(
            lambda x:x.rolling(window=w,center=True).std())
        
print("special window features for NM_M")
def NM_M_distance(x,how,target):
    length = len(x)
    rank = np.empty(length)
    count = -1
    NM_M = x["NM_M"].values
    if how=="up":
        order = range(length)
    elif how=="down":
        order = range(length-1,-1,-1)
    for i in order:
        if ((NM_M[i] != target) & (count>-1)):
            count+=1
            rank[i] += count
        elif NM_M[i] == target:
            count=0
        else:
            rank[i] = count
    rank = pd.DataFrame(rank.astype(int), columns=["NM_M_Rank_Target_+"+str(target)+"_"+how], index = x.index)
    return(rank)
df["NM_M_Rank_Target_1_up"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="up",target=1)
df["NM_M_Rank_Target_2_up"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="up",target=2)
df["NM_M_Rank_Target_1_down"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="down",target=1)
df["NM_M_Rank_Target_2_down"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="down",target=2)

print("filling na")
df = df.groupby(["Well Name"], as_index=False).apply(lambda group: group.bfill())
df = df.groupby(["Well Name"], as_index=False).apply(lambda group: group.ffill())
df = df.fillna(df.mean())

print("Vectorizing Formation text data")
from sklearn.feature_extraction.text import CountVectorizer
list_formation = ['Formation',
 'Formation_lag_1',
 'Formation_lead_1',
 'Formation_lag_2',
 'Formation_lead_2']
for l in list_formation:
    cv = CountVectorizer()
    counts = cv.fit_transform(df[l].values)
    cols = [c+"_"+l for c in cv.get_feature_names()]
    counts = pd.DataFrame(counts.toarray(),columns = cols)
    df = df.drop(l,axis = 1)
    df = pd.concat([df,counts],axis=1)

print("Finished preparing data. Now ready for ML ignition!")


session
depth
add avgs of feat
add distances feat.
lag lead
rolling
special window features for NM_M
filling na
Vectorizing Formation text data
Finished preparing data. Now ready for ML ignition!

Fitting


In [69]:
# this time let's use all the training set 
groups = df[(df['origin']=='train')]["Well Name"]
ytrain = df[(df['origin']=='train')]['Facies']
yvalid = df[(df['origin']=='test')]['Facies']
xtrain = df[(df['origin']=='train')].drop(['Well Name','origin','Facies'],axis=1)
xvalid = df[(df['origin']=='test')].drop(['Well Name','origin','Facies'],axis=1)
custom_cv = LeavePGroupsOut(n_groups=2)

In [24]:
clf_rfe = RandomForestClassifier(
    n_estimators=100,
    criterion="entropy",
    class_weight='balanced',
    min_samples_leaf=5,
    min_samples_split=25,
)

In [25]:
custom_cv_1 = custom_cv.split(xtrain, ytrain, groups)
fs = RFECV(clf_rfe,cv=custom_cv_1,scoring="f1_micro",step=0.1,verbose=2,n_jobs=4)
fs.fit(xtrain, ytrain)


Fitting estimator with 437 features.
Fitting estimator with 437 features.
Fitting estimator with 437 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 394 features.
Fitting estimator with 394 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 351 features.
Fitting estimator with 351 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 308 features.
Fitting estimator with 308 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 265 features.
Fitting estimator with 265 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 222 features.
Fitting estimator with 222 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 179 features.
Fitting estimator with 179 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 136 features.
Fitting estimator with 136 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 93 features.
Fitting estimator with 93 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 50 features.
Fitting estimator with 50 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 7 features.
Fitting estimator with 7 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 437 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 394 features.
Fitting estimator with 394 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 351 features.
Fitting estimator with 351 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 308 features.
Fitting estimator with 308 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 265 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 93 features.
Fitting estimator with 136 features.
Fitting estimator with 50 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 50 features.
Fitting estimator with 93 features.
Fitting estimator with 437 features.
Fitting estimator with 7 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 50 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 7 features.
Fitting estimator with 394 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 351 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 308 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 265 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 222 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 179 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 50 features.
Fitting estimator with 93 features.
Fitting estimator with 136 features.
Fitting estimator with 7 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 50 features.
Fitting estimator with 437 features.
Fitting estimator with 93 features.
Fitting estimator with 7 features.
Fitting estimator with 50 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 394 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 351 features.
Fitting estimator with 394 features.
Fitting estimator with 308 features.
Fitting estimator with 308 features.
Fitting estimator with 351 features.
Fitting estimator with 351 features.
Fitting estimator with 265 features.
Fitting estimator with 265 features.
Fitting estimator with 308 features.
Fitting estimator with 308 features.
Fitting estimator with 222 features.
Fitting estimator with 222 features.
Fitting estimator with 265 features.
Fitting estimator with 265 features.
Fitting estimator with 179 features.
Fitting estimator with 179 features.
Fitting estimator with 222 features.
Fitting estimator with 222 features.
Fitting estimator with 136 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 93 features.
Fitting estimator with 179 features.
Fitting estimator with 179 features.
Fitting estimator with 50 features.
Fitting estimator with 50 features.
Fitting estimator with 136 features.
Fitting estimator with 7 features.
Fitting estimator with 136 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 437 features.
Fitting estimator with 93 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 50 features.
Fitting estimator with 394 features.
Fitting estimator with 394 features.
Fitting estimator with 7 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 437 features.
Fitting estimator with 351 features.
Fitting estimator with 351 features.
Fitting estimator with 394 features.
Fitting estimator with 394 features.
Fitting estimator with 308 features.
Fitting estimator with 308 features.
Fitting estimator with 351 features.
Fitting estimator with 351 features.
Fitting estimator with 265 features.
Fitting estimator with 265 features.
Fitting estimator with 308 features.
Fitting estimator with 222 features.
Fitting estimator with 308 features.
Fitting estimator with 222 features.
Fitting estimator with 265 features.
Fitting estimator with 179 features.
Fitting estimator with 265 features.
Fitting estimator with 179 features.
Fitting estimator with 222 features.
Fitting estimator with 136 features.
Fitting estimator with 136 features.
Fitting estimator with 222 features.
Fitting estimator with 93 features.
Fitting estimator with 179 features.
Fitting estimator with 93 features.
Fitting estimator with 179 features.
Fitting estimator with 50 features.
Fitting estimator with 50 features.
Fitting estimator with 136 features.
Fitting estimator with 7 features.
Fitting estimator with 136 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 93 features.
Fitting estimator with 437 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 394 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 394 features.
Fitting estimator with 437 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 351 features.
Fitting estimator with 351 features.
Fitting estimator with 394 features.
Fitting estimator with 394 features.
Fitting estimator with 308 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 351 features.
Fitting estimator with 265 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 308 features.
Fitting estimator with 222 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 136 features.
Fitting estimator with 179 features.
Fitting estimator with 222 features.
Fitting estimator with 93 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 179 features.
Fitting estimator with 50 features.
Fitting estimator with 136 features.
Fitting estimator with 7 features.
Fitting estimator with 93 features.
Fitting estimator with 136 features.
Fitting estimator with 437 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 394 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 437 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 351 features.
Fitting estimator with 394 features.
Fitting estimator with 394 features.
Fitting estimator with 394 features.
Fitting estimator with 308 features.
Fitting estimator with 351 features.
Fitting estimator with 351 features.
Fitting estimator with 351 features.
Fitting estimator with 265 features.
Fitting estimator with 308 features.
Fitting estimator with 308 features.
Fitting estimator with 222 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 265 features.
Fitting estimator with 179 features.
Fitting estimator with 222 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 136 features.
Fitting estimator with 179 features.
Fitting estimator with 179 features.
Fitting estimator with 93 features.
Fitting estimator with 222 features.
Fitting estimator with 136 features.
Fitting estimator with 50 features.
Fitting estimator with 136 features.
Fitting estimator with 179 features.
Fitting estimator with 7 features.
Fitting estimator with 93 features.
Fitting estimator with 93 features.
Fitting estimator with 437 features.
Fitting estimator with 50 features.
Fitting estimator with 136 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 7 features.
Fitting estimator with 394 features.
Fitting estimator with 437 features.
Fitting estimator with 93 features.
Fitting estimator with 437 features.
Fitting estimator with 50 features.
Fitting estimator with 351 features.
Fitting estimator with 394 features.
Fitting estimator with 7 features.
Fitting estimator with 394 features.
Fitting estimator with 437 features.
Fitting estimator with 308 features.
Fitting estimator with 351 features.
Fitting estimator with 351 features.
Fitting estimator with 394 features.
Fitting estimator with 265 features.
Fitting estimator with 308 features.
Fitting estimator with 308 features.
Fitting estimator with 351 features.
Fitting estimator with 222 features.
Fitting estimator with 265 features.
Fitting estimator with 265 features.
Fitting estimator with 308 features.
Fitting estimator with 179 features.
Fitting estimator with 222 features.
Fitting estimator with 222 features.
Fitting estimator with 136 features.
Fitting estimator with 265 features.
Fitting estimator with 179 features.
Fitting estimator with 179 features.
Fitting estimator with 93 features.
Fitting estimator with 136 features.
Fitting estimator with 222 features.
Fitting estimator with 50 features.
Fitting estimator with 136 features.
Fitting estimator with 7 features.
Fitting estimator with 93 features.
Fitting estimator with 179 features.
Fitting estimator with 93 features.
Fitting estimator with 437 features.
Fitting estimator with 50 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 136 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 394 features.
Fitting estimator with 437 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 394 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 351 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 394 features.
Fitting estimator with 308 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 351 features.
Fitting estimator with 265 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 308 features.
Fitting estimator with 222 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 265 features.
Fitting estimator with 179 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 222 features.
Fitting estimator with 136 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 93 features.
Fitting estimator with 179 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 50 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 136 features.
Fitting estimator with 437 features.
Fitting estimator with 7 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 437 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 394 features.
Fitting estimator with 394 features.
Fitting estimator with 394 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 351 features.
Fitting estimator with 351 features.
Fitting estimator with 351 features.
Fitting estimator with 394 features.
Fitting estimator with 308 features.
Fitting estimator with 308 features.
Fitting estimator with 308 features.
Fitting estimator with 351 features.
Fitting estimator with 265 features.
Fitting estimator with 265 features.
Fitting estimator with 265 features.
Fitting estimator with 308 features.
Fitting estimator with 222 features.
Fitting estimator with 222 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 265 features.
Fitting estimator with 179 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 136 features.
Fitting estimator with 222 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 93 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 179 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 437 features.
Fitting estimator with 136 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 93 features.
Fitting estimator with 394 features.
Fitting estimator with 394 features.
Fitting estimator with 50 features.
Fitting estimator with 394 features.
Fitting estimator with 351 features.
Fitting estimator with 7 features.
Fitting estimator with 351 features.
Fitting estimator with 437 features.
Fitting estimator with 351 features.
Fitting estimator with 308 features.
Fitting estimator with 308 features.
Fitting estimator with 394 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 265 features.
Fitting estimator with 351 features.
Fitting estimator with 222 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 308 features.
Fitting estimator with 179 features.
Fitting estimator with 222 features.
Fitting estimator with 136 features.
Fitting estimator with 136 features.
Fitting estimator with 265 features.
Fitting estimator with 179 features.
Fitting estimator with 93 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 136 features.
Fitting estimator with 222 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 7 features.
Fitting estimator with 437 features.
Fitting estimator with 93 features.
Fitting estimator with 179 features.
Fitting estimator with 50 features.
Fitting estimator with 394 features.
Fitting estimator with 136 features.
Fitting estimator with 7 features.
Fitting estimator with 93 features.
Fitting estimator with 351 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Fitting estimator with 308 features.
Fitting estimator with 265 features.
Fitting estimator with 222 features.
Fitting estimator with 179 features.
Fitting estimator with 136 features.
Fitting estimator with 93 features.
Fitting estimator with 50 features.
Fitting estimator with 7 features.
Out[25]:
RFECV(cv=<generator object split at 0x1157cb640>,
   estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='entropy', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=5, min_samples_split=25,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
   n_jobs=4, scoring='f1_micro', step=0.1, verbose=2)

In [70]:
support = fs.support_
feature = pd.Series(xtrain.columns.values)
selected_features = list(feature[support])
print(len(selected_features))
xtrain_fs = xtrain[selected_features]
xvalid_fs = xvalid[selected_features]


265

In [63]:
rf = RandomForestClassifier(
            n_estimators=100,
            criterion="entropy",
            class_weight='balanced',
            min_samples_leaf=5,
            min_samples_split=25,
            max_features=10,
            random_state=42
)
rf_vs = OneVsOneClassifier(clone(rf))

xtc =  ExtraTreesClassifier(
            n_estimators=100,
            criterion="entropy",
            class_weight='balanced',
            min_samples_leaf=5,
            min_samples_split=25,
            max_features=10,
            random_state=42
)
xtc_vs = OneVsOneClassifier(clone(xtc))

gbt = GradientBoostingClassifier(
            loss='deviance',
            n_estimators = 100, 
            learning_rate = 0.1, 
            max_depth = 3,
            max_features = 10,
            min_samples_leaf = 5,
            min_samples_split = 25,
            random_state = 42, 
            max_leaf_nodes = None
)
gbt_vs = OneVsOneClassifier(GradientBoostingClassifier(
            loss='exponential',
            n_estimators = 100, 
            learning_rate = 0.1, 
            max_depth = 3,
            max_features = 10,
            min_samples_leaf = 5,
            min_samples_split = 25,
            random_state = 42, 
            max_leaf_nodes = None
))

xgb = XGBClassifier(
            learning_rate = 0.1, 
            max_depth = 3, 
            min_child_weight = 10, 
            n_estimators = 150, 
            colsample_bytree = 0.9,
            seed = 42
)
xgb_vs = OneVsOneClassifier(clone(xgb))

ensemble = VotingClassifier(
    estimators=[
            ('rf', rf), ('rf_vs',rf_vs),
            ('xtc', xtc), ('xtc_vs', xtc_vs),
            ('gbt', gbt), ('gbt_vs', gbt_vs),
            ('xgb', xgb), ('xgb_vs', xgb_vs)
        ], 
    voting='hard',
    weights = [0.1,0.1,
               0.1,0.1,
              0.1,0.1,
              0.3,0.1],
    n_jobs=4
)

In [64]:
ensemble.fit(xtrain_fs, ytrain)


Out[64]:
VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='entropy', max_depth=None, max_features=10,
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=5, min_samples_split=25,
            min_weight_fraction_leaf=0..., reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=1),
          n_jobs=1))],
         n_jobs=4, voting='hard',
         weights=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.1])

Apply to test


In [65]:
well_name_valid = df.loc[(df['origin']=='test'),"Well Name"]

In [71]:
preds = ensemble.predict(xvalid_fs)

In [72]:
well = "CRAWFORD"
depth = xvalid.loc[well_name_valid== well ,"Depth"]
predictions = pd.Series(preds).loc[well_name_valid==well]
plt.plot(depth,predictions)
plt.axis([2950,3175, 1, 9])
plt.grid(b=True, which='major', color='r', linestyle='--')
plt.show()



In [73]:
well = "STUART"
depth = xvalid.loc[well_name_valid== well ,"Depth"]
predictions = pd.Series(preds).loc[well_name_valid==well]
plt.plot(depth,predictions)
plt.axis([2800,3050, 1, 9])
plt.grid(b=True, which='major', color='r', linestyle='--')
plt.show()



In [74]:
xvalid['Facies']=preds
xvalid.to_csv('XmasPreds_5.csv')