In [42]:
%matplotlib inline
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict, LeaveOneGroupOut, LeavePGroupsOut
from sklearn.metrics import confusion_matrix, make_scorer, f1_score, accuracy_score, recall_score, precision_score
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.base import clone
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from stacking_classifiers import *
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable
pd.options.mode.chained_assignment = None
In [2]:
filename = '../facies_vectors.csv'
training_data = pd.read_csv(filename)
print(set(training_data["Well Name"]))
training_data.head()
Out[2]:
In [3]:
well_data = pd.read_csv('./../validation_data_nofacies.csv')
print(set(well_data["Well Name"]))
print(well_data.shape)
well_data.head()
Out[3]:
In [4]:
# concat train and test for processing
well_data["origin"] = 'test'
training_data["origin"] = 'train'
df = pd.concat([well_data,training_data],axis=0,ignore_index=True)[list(training_data.columns)]
df['Well Name'] = df['Well Name'].astype('category')
df.head(10)
Out[4]:
In [5]:
# add engineered features based on the well data.
# number of samples per formation: may correlate with how soft the rock is?
print("session")
sessionsize = df.groupby(["Well Name",'Formation']).size().reset_index()
sessionsize.columns = ["Well Name",'Formation','formation_size']
df = pd.merge(df,sessionsize,how='left',on = ["Well Name",'Formation'])
# depth :
print("depth")
sessionsize = df.groupby(["Well Name",'Formation'])["Depth"].min().reset_index()
sessionsize.columns = ["Well Name",'Formation','minimum_depth']
df = pd.merge(df,sessionsize,how='left',on = ["Well Name",'Formation'])
sessionsize = df.groupby(["Well Name",'Formation'])["Depth"].max().reset_index()
sessionsize.columns = ["Well Name",'Formation','maximum_depth']
df = pd.merge(df,sessionsize,how='left',on = ["Well Name",'Formation'])
df['formation_depth'] = df["maximum_depth"] - df["minimum_depth"]
df["soft_indic"] = df['formation_depth'] / df["formation_size"]
# add avgs of feat
print("add avgs of feat")
list_to_avg = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_avg:
    df[val + "_min"] = df.groupby(["Well Name",'Formation'])[val].transform(np.min)
    df[val + "_max"] = df.groupby(["Well Name",'Formation'])[val].transform(np.max)
    df[val + "_mean"] = df.groupby(["Well Name",'Formation'])[val].transform(np.mean)
    df[val + "_var"] = df.groupby(["Well Name",'Formation'])[val].transform(np.var)
# add distance features (value minus the group min/max/mean) = an attempt at regularization.
print("add distances feat.")
for val in list_to_avg:
    df[val + "_min_dist"] = df[val] - df[val + "_min"]
    df[val + "_max_dist"] = df[val] - df[val + "_max"]
    df[val + "_mean_dist"] = df[val] - df[val + "_mean"]
# add lag and lead !
print("lag lead")
list_to_lag = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_lag:
    for lag in range(1,11):
        df[val+'_lag_'+str(lag)] = df[val] - df.groupby("Well Name")[val].shift(periods=lag)
        df[val+'_lead_'+str(lag)] = df[val] - df.groupby("Well Name")[val].shift(periods=-lag)
# adding some Formation lag and lead.
for lag in range(1,3):
    df['Formation'+'_lag_'+str(lag)] = df.groupby("Well Name")['Formation'].shift(periods=lag)
    df['Formation'+'_lead_'+str(lag)] = df.groupby("Well Name")['Formation'].shift(periods=-lag)
    df['Formation'+'_lag_'+str(lag) + 'equal'] = (df['Formation'+'_lag_'+str(lag)] == df["Formation"]).astype(int)
    df['Formation'+'_lead_'+str(lag) + 'equal'] = (df['Formation'+'_lead_'+str(lag)] == df["Formation"]).astype(int)
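# the *_equal flags are 1 where the neighbouring sample shares the same formation,
# 0 at formation boundaries (and at well edges, where the shifted value is NaN)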
print("rolling")
#Add rolling features
list_to_roll = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M','RELPOS']
window_size = [5,10,15,20,50]
for w in window_size:
    for val in list_to_roll:
        df[val+'_rollingmean_'+str(w)] = df.groupby("Well Name")[val].apply(
            lambda x: x.rolling(window=w, center=True).mean())
        df[val+'_rollingmax_'+str(w)] = df.groupby("Well Name")[val].apply(
            lambda x: x.rolling(window=w, center=True).max())
        df[val+'_rollingmin_'+str(w)] = df.groupby("Well Name")[val].apply(
            lambda x: x.rolling(window=w, center=True).min())
        df[val+'_rollingstd_'+str(w)] = df.groupby("Well Name")[val].apply(
            lambda x: x.rolling(window=w, center=True).std())
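# note: centered rolling windows leave NaNs in the first/last w//2 rows of each well
# (min_periods defaults to the window size); the per-well bfill/ffill step below fills them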
print("special window features for NM_M")
def NM_M_distance(x, how, target):
    # for each row, count the number of rows since (how="up") or until
    # (how="down") the nearest row where NM_M == target; -1 before any match
    length = len(x)
    rank = np.zeros(length)  # zeros, not np.empty: target rows keep distance 0
    count = -1
    NM_M = x["NM_M"].values
    if how == "up":
        order = range(length)
    elif how == "down":
        order = range(length-1, -1, -1)
    for i in order:
        if (NM_M[i] != target) and (count > -1):
            count += 1
            rank[i] = count
        elif NM_M[i] == target:
            count = 0
        else:
            rank[i] = count
    rank = pd.DataFrame(rank.astype(int),
                        columns=["NM_M_Rank_Target_"+str(target)+"_"+how],
                        index=x.index)
    return rank
df["NM_M_Rank_Target_1_up"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="up",target=1)
df["NM_M_Rank_Target_2_up"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="up",target=2)
df["NM_M_Rank_Target_1_down"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="down",target=1)
df["NM_M_Rank_Target_2_down"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="down",target=2)
print("filling na")
df = df.groupby(["Well Name"], as_index=False).apply(lambda group: group.bfill())
df = df.groupby(["Well Name"], as_index=False).apply(lambda group: group.ffill())
df = df.fillna(df.mean())
print("Vectorizing Formation text data")
from sklearn.feature_extraction.text import CountVectorizer
list_formation = ['Formation',
'Formation_lag_1',
'Formation_lead_1',
'Formation_lag_2',
'Formation_lead_2']
for l in list_formation:
    cv = CountVectorizer()
    counts = cv.fit_transform(df[l].values)
    cols = [c+"_"+l for c in cv.get_feature_names()]
    counts = pd.DataFrame(counts.toarray(), columns=cols)
    df = df.drop(l, axis=1)
    df = pd.concat([df, counts], axis=1)
print("Finished preparing data. Now ready for ML ignition!")
In [39]:
# this time let's use the whole training set
groups = df[(df['origin']=='train')]["Well Name"]
ytrain = df[(df['origin']=='train')]['Facies']
# placeholder only: the test wells are unlabeled, so these values come from the mean-fill above
yvalid = df[(df['origin']=='test')]['Facies']
xtrain = df[(df['origin']=='train')].drop(['Well Name','origin','Facies'],axis=1)
xvalid = df[(df['origin']=='test')].drop(['Well Name','origin','Facies'],axis=1)
custom_cv = LeavePGroupsOut(n_groups=2)
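LeavePGroupsOut(n_groups=2) holds out every pair of wells in turn, so with k training wells it yields k*(k-1)/2 validation folds. A quick sanity check (optional sketch; split() returns a fresh generator each call, so consuming one here costs nothing):
In [ ]:
n_wells = groups.nunique()
print(n_wells, custom_cv.get_n_splits(groups=groups))  # k and k*(k-1)/2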
In [80]:
set(yvalid.values)
Out[80]:
In [7]:
clf_rfe = RandomForestClassifier(
n_estimators=100,
criterion="entropy",
class_weight='balanced',
min_samples_leaf=5,
min_samples_split=25,
)
In [8]:
custom_cv_1 = custom_cv.split(xtrain, ytrain, groups)
fs = RFECV(clf_rfe,cv=custom_cv_1,scoring="f1_micro",step=0.1,verbose=2,n_jobs=4)
fs.fit(xtrain, ytrain)
Out[8]:
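RFECV drops 10% of the remaining features per iteration (step=0.1) and keeps the subset with the best cross-validated f1_micro. An optional inspection cell (sketch; grid_scores_ is the attribute name in the sklearn versions contemporary with this notebook, later moved to cv_results_):
In [ ]:
print(fs.n_features_)      # how many features RFECV kept
plt.plot(fs.grid_scores_)  # mean CV f1_micro at each elimination step
plt.xlabel("elimination step")
plt.ylabel("CV f1_micro")
plt.show()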
In [34]:
support = fs.support_
feature = pd.Series(xtrain.columns.values)
selected_features = list(feature[support])
print(len(selected_features))
xtrain_fs = xtrain[selected_features].copy()
xvalid_fs = xvalid[selected_features].copy()
In [67]:
rf = RandomForestClassifier(
n_estimators=100,
criterion="entropy",
class_weight='balanced',
min_samples_leaf=5,
min_samples_split=25,
max_features=10,
random_state=42
)
xtc = ExtraTreesClassifier(
n_estimators=100,
criterion="entropy",
class_weight='balanced',
min_samples_leaf=5,
min_samples_split=25,
max_features=10,
random_state=42
)
gbt = GradientBoostingClassifier(
loss='deviance',
n_estimators = 100,
learning_rate = 0.1,
max_depth = 3,
max_features = 10,
min_samples_leaf = 5,
min_samples_split = 25,
random_state = 42,
max_leaf_nodes = None
)
# named xgbc to avoid shadowing the xgboost module imported as xgb above
xgbc = XGBClassifier(
    learning_rate = 0.1,
    max_depth = 3,
    min_child_weight = 10,
    n_estimators = 150,
    colsample_bytree = 0.9,
    seed = 42
)
custom_cv_2 = list(LeavePGroupsOut(n_groups=2).split(xtrain, ytrain, groups))
stacked = StackedClassifier(clfs = [rf, xtc, gbt, xgbc],
level2_learner= LogisticRegression(),
skf = custom_cv_2
)
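StackedClassifier is star-imported from the local stacking_classifiers module, so its internals aren't shown here. Below is a minimal sketch of the out-of-fold stacking scheme it presumably implements (illustrative names only; assumes every facies class appears in every training fold so the predict_proba columns align):
In [ ]:
def stack_sketch(base_clfs, meta_clf, X, y, folds):
    # build out-of-fold probability features for each base learner,
    # then fit the level-2 learner on those features
    n, k = len(y), len(np.unique(y))
    oof = np.zeros((n, len(base_clfs) * k))
    hits = np.zeros(n)
    for tr, te in folds:
        for j, clf in enumerate(base_clfs):
            p = clone(clf).fit(X[tr], y[tr]).predict_proba(X[te])
            oof[te, j*k:(j+1)*k] += p
        hits[te] += 1
    oof /= hits[:, None]  # average: LeavePGroupsOut puts each row in several test folds
    meta_clf.fit(oof, y)
    # refit the base learners on the full data for prediction time
    return [clone(c).fit(X, y) for c in base_clfs], meta_clf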
In [68]:
stacked.fit(xtrain_fs.values, ytrain.values)
Out[68]:
In [69]:
well_name_valid = df.loc[(df['origin']=='test'),"Well Name"]
In [76]:
preds = stacked.predict_proba(xvalid_fs)
In [94]:
# np.unique gives a sorted, deterministic class order; set() iteration order is not
# guaranteed to match the predict_proba columns (assuming the stacker follows
# sklearn's sorted classes_ convention)
classes = np.unique(ytrain)
preds_hard = [classes[i] for i in np.argmax(preds, axis=1)]
In [95]:
well = "CRAWFORD"
depth = xvalid.loc[well_name_valid== well ,"Depth"]
predictions = pd.Series(preds_hard).loc[well_name_valid==well]
plt.plot(depth,predictions)
plt.axis([2950,3175, 1, 9])
plt.grid(True, which='major', color='r', linestyle='--')
plt.show()
In [96]:
well = "STUART"
depth = xvalid.loc[well_name_valid== well ,"Depth"]
predictions = pd.Series(preds_hard).loc[well_name_valid==well]
plt.plot(depth,predictions)
plt.axis([2800,3050, 1, 9])
plt.grid(True, which='major', color='r', linestyle='--')
plt.show()
In [97]:
xvalid['Facies']=preds_hard
xvalid.to_csv('XmasPreds_6.csv')