In [142]:
%matplotlib inline
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score,LeavePGroupsOut, LeaveOneGroupOut, cross_val_predict
from sklearn.metrics import confusion_matrix, make_scorer, f1_score, accuracy_score, recall_score, precision_score
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable
pd.options.mode.chained_assignment = None
In [143]:
# Read the labelled training wells and show which wells the file contains.
filename = '../facies_vectors.csv'
training_data = pd.read_csv(filename)
train_well_names = set(training_data["Well Name"])
print(train_well_names)
training_data.head()
Out[143]:
In [144]:
# Read the validation wells (file name suggests they carry no facies labels)
# and show which wells and how many rows/columns the file contains.
well_data = pd.read_csv('./../validation_data_nofacies.csv')
validation_well_names = set(well_data["Well Name"])
print(validation_well_names)
print(well_data.shape)
well_data.head()
Out[144]:
In [145]:
# Stack validation rows on top of training rows so that every feature below
# is computed in one pass; the "origin" tag lets us split them again later.
training_data["origin"] = 'train'
well_data["origin"] = 'test'
column_order = list(training_data.columns)
stacked = pd.concat([well_data, training_data], axis=0, ignore_index=True)
df = stacked[column_order]
df['Well Name'] = df['Well Name'].astype('category')
df.head(10)
Out[145]:
In [146]:
# Context features computed per (well, formation) pair.
formation_keys = ["Well Name", 'Formation']

# Number of samples in each formation of each well
# (author's hunch: may be correlated with how soft the soil is).
print("session")
sessionsize = df.groupby(formation_keys).size().reset_index(name='formation_size')
df = df.merge(sessionsize, how='left', on=formation_keys)

# Shallowest and deepest sample of each formation in each well.
print("depth")
sessionsize = df.groupby(formation_keys)["Depth"].min().reset_index(name='minimum_depth')
df = df.merge(sessionsize, how='left', on=formation_keys)
sessionsize = df.groupby(formation_keys)["Depth"].max().reset_index(name='maximum_depth')
df = df.merge(sessionsize, how='left', on=formation_keys)

# Depth extent of the formation, and that extent divided by its sample count.
df['formation_depth'] = df["maximum_depth"] - df["minimum_depth"]
df["soft_indic"] = df['formation_depth'] / df["formation_size"]
# Per-(well, formation) summary statistics of every log.
print("add avgs of feat")
list_to_avg = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_avg:
    # Build the grouped column once and reuse it for the four reductions.
    per_formation = df.groupby(["Well Name", 'Formation'])[val]
    # String aliases select pandas' own reductions. Passing np.min/np.var etc.
    # is deprecated; pandas mapped those callables to these same reductions,
    # so the values (including the ddof=1 variance) are unchanged.
    df[val + "_min"] = per_formation.transform("min")
    df[val + "_max"] = per_formation.transform("max")
    df[val + "_mean"] = per_formation.transform("mean")
    df[val + "_var"] = per_formation.transform("var")

# Offsets of each sample from its formation's min / max / mean
# (the author's attempt at regularization).
print("add distances feat.")
for val in list_to_avg:
    df[val + "_min_dist"] = df[val] - df[val + "_min"]
    df[val + "_max_dist"] = df[val] - df[val + "_max"]
    df[val + "_mean_dist"] = df[val] - df[val + "_mean"]
# Differences between each sample and the samples 1..10 rows before (lag)
# and after (lead) it within the same well.
print("lag lead")
list_to_lag = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_lag:
    per_well = df.groupby("Well Name")[val]
    for lag in range(1, 11):
        df["{}_lag_{}".format(val, lag)] = df[val] - per_well.shift(periods=lag)
        df["{}_lead_{}".format(val, lag)] = df[val] - per_well.shift(periods=-lag)

# Formation of the neighbouring rows (1 and 2 rows away within the well),
# plus a 0/1 flag telling whether it matches the current row's formation.
for lag in range(1, 3):
    lag_col = "Formation_lag_{}".format(lag)
    lead_col = "Formation_lead_{}".format(lag)
    df[lag_col] = df.groupby("Well Name")['Formation'].shift(periods=lag)
    df[lead_col] = df.groupby("Well Name")['Formation'].shift(periods=-lag)
    df[lag_col + 'equal'] = (df[lag_col] == df["Formation"]).astype(int)
    df[lead_col + 'equal'] = (df[lead_col] == df["Formation"]).astype(int)
print("rolling")
# Centered rolling mean / max / min / std of every log, computed per well.
list_to_roll = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
window_size = [5, 10, 15, 20, 50]
for w in window_size:
    for val in list_to_roll:
        per_well = df.groupby("Well Name")[val]
        for stat in ("mean", "max", "min", "std"):
            # transform() always returns a result indexed like df, so the
            # assignment aligns row by row. groupby.apply() prepends the group
            # key to the index on recent pandas, which breaks that alignment.
            df[val + '_rolling' + stat + '_' + str(w)] = per_well.transform(
                lambda x, stat=stat: getattr(x.rolling(window=w, center=True), stat)())
print("special window features for NM_M")
def NM_M_distance(x, how, target):
    """Count the rows elapsed since ``NM_M`` last equalled ``target``.

    Rows are visited first-to-last when ``how == "up"`` and last-to-first when
    ``how == "down"``. A row whose ``NM_M`` equals ``target`` gets 0, every
    following row that differs gets one more than the row visited before it,
    and rows visited before the first occurrence of ``target`` get -1.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with an ``NM_M`` column (one well's rows when called through
        ``groupby(...).apply``).
    how : {"up", "down"}
        Direction in which the rows are visited.
    target : scalar
        Value compared against ``NM_M``.

    Returns
    -------
    pandas.DataFrame
        A single integer column indexed like ``x``.

    Raises
    ------
    ValueError
        If ``how`` is neither ``"up"`` nor ``"down"``.
    """
    length = len(x)
    if how == "up":
        order = range(length)
    elif how == "down":
        order = range(length - 1, -1, -1)
    else:
        raise ValueError("how must be 'up' or 'down', got %r" % (how,))

    NM_M = x["NM_M"].values
    # Start from a fully defined array: the previous np.empty() buffer left
    # target rows unwritten and accumulated counts onto uninitialised memory.
    rank = np.full(length, -1, dtype=int)
    count = -1
    for i in order:
        if NM_M[i] == target:
            count = 0
        elif count > -1:
            count += 1
        # count stays at -1 until the first target row has been seen.
        rank[i] = count

    column = "NM_M_Rank_Target_+" + str(target) + "_" + how
    return pd.DataFrame(rank, columns=[column], index=x.index)
# For each row, the number of rows since NM_M last equalled 1 (resp. 2) within
# the same well, scanning in both directions.
# group_keys=False keeps the result indexed like df so the column assignment
# aligns by row label; only the NM_M column is handed to the helper.
nm_m_by_well = df.groupby("Well Name", group_keys=False)[["NM_M"]]
for how in ("up", "down"):
    for target in (1, 2):
        ranks = nm_m_by_well.apply(NM_M_distance, how=how, target=target)
        # The helper returns a one-column frame; store it as a plain Series.
        df["NM_M_Rank_Target_" + str(target) + "_" + how] = ranks.iloc[:, 0]
print("filling na")
# Fill gaps (e.g. the edges created by shift/rolling) from neighbouring rows of
# the same well: backward first, then forward. GroupBy.bfill/ffill keep df's
# index and row order, unlike groupby(as_index=False).apply(...) on recent
# pandas. The grouping column itself is left out of the fill.
fill_cols = [c for c in df.columns if c != "Well Name"]
df[fill_cols] = df.groupby("Well Name")[fill_cols].bfill()
df[fill_cols] = df.groupby("Well Name")[fill_cols].ffill()
# Whatever is still missing gets the column mean. numeric_only=True skips the
# text/categorical columns, which make df.mean() raise on pandas >= 2.0.
df = df.fillna(df.mean(numeric_only=True))
print("Vectorizing Formation text data")
from sklearn.feature_extraction.text import CountVectorizer
list_formation = ['Formation',
                  'Formation_lag_1',
                  'Formation_lead_1',
                  'Formation_lag_2',
                  'Formation_lead_2']
# Replace each formation text column by token-count columns named
# "<token>_<source column>".
for formation_col in list_formation:
    cv = CountVectorizer()
    counts = cv.fit_transform(df[formation_col].values)
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older installs.
    if hasattr(cv, "get_feature_names_out"):
        tokens = cv.get_feature_names_out()
    else:
        tokens = cv.get_feature_names()
    cols = [c + "_" + formation_col for c in tokens]
    # Reuse df's index so the column-wise concat below pairs rows by label
    # rather than relying on df having a default 0..n-1 index.
    counts = pd.DataFrame(counts.toarray(), columns=cols, index=df.index)
    df = df.drop(formation_col, axis=1)
    df = pd.concat([df, counts], axis=1)
print("Finished preparing data. Now ready for ML ignition!")
In [189]:
#tokeep =['Facies','origin','Formation','Formation_lag_1','Formation_lead_1','Formation_lag_2','Formation_lead_2',
# 'Well Name','Depth','GR','ILD_log10','DeltaPHI','PHIND','PE','NM_M','RELPOS']
#nums = ['Depth','GR','ILD_log10','DeltaPHI','PHIND','PE','NM_M','RELPOS']
#tokeep = tokeep + [x+'_lag_1' for x in nums] +[x+'_lead_1' for x in nums]
#df = df[tokeep]
In [147]:
# Micro-averaged F1 wrapped as a scorer object for use as a `scoring` argument.
f1_micro = make_scorer(score_func=f1_score, average='micro')
In [148]:
# this time let's use all the training set
is_train = df['origin'] == 'train'
is_test = df['origin'] == 'test'
non_feature_cols = ['Well Name', 'origin', 'Facies']

groups = df.loc[is_train, "Well Name"]
ytrain = df.loc[is_train, 'Facies']
yvalid = df.loc[is_test, 'Facies']
xtrain = df[is_train].drop(non_feature_cols, axis=1)
xvalid = df[is_test].drop(non_feature_cols, axis=1)

# Leave-one-well-out splits. split() returns a one-shot generator; storing the
# splits in a list lets the pipeline cell be re-run without silently receiving
# an exhausted (empty) cross-validation iterator.
custom_cv = list(LeaveOneGroupOut().split(xtrain, ytrain, groups))
In [149]:
# Fixed seed so that feature selection and the final forest give the same
# result on every run (both estimators were previously unseeded).
RANDOM_STATE = 42

# Settings shared by the forest that ranks features and the final classifier.
base_forest_params = dict(
    n_estimators=100,
    criterion="entropy",
    class_weight='balanced',
    random_state=RANDOM_STATE,
)

# Forest used inside recursive feature elimination to rank features.
clf_rfe = RandomForestClassifier(**base_forest_params)

# Final classifier, trained only on the features kept by the selector.
# NOTE(review): max_features=10 presumes the selector keeps at least 10
# features - confirm.
clf_final = RandomForestClassifier(
    min_samples_leaf=5,
    min_samples_split=25,
    max_features=10,
    n_jobs=-1,
    **base_forest_params
)

# Feature selection (dropping 10% of the features per step, scored with
# micro-F1 on the custom CV splits) followed by the final classifier.
pipe = Pipeline([
    ('fs', RFECV(clf_rfe,
                 cv=custom_cv,
                 scoring=f1_micro,
                 step=0.1,
                 verbose=2)),
    ('cl', clf_final)
])
In [150]:
# Run feature selection and train the final classifier on the training wells.
pipe.fit(X=xtrain, y=ytrain)
Out[150]:
In [151]:
# classification_report is used below but is not part of the notebook's import
# cell, so this cell raised NameError on a fresh kernel.
from sklearn.metrics import classification_report

# Scores on the same rows the pipeline was fitted on (in-sample figures).
y_pred = pipe.predict(xtrain)
print(classification_report(ytrain, y_pred))
print(f1_score(ytrain, y_pred, average="micro"))
In [160]:
# Importances come from the final classifier, which only saw the columns kept
# by the selector, so they are matched against the selected column names.
importances = pipe.named_steps["cl"].feature_importances_
support = pipe.named_steps["fs"].support_
indices = np.argsort(importances)[::-1]  # most important first

print("Feature ranking:")
feature = pd.Series(xtrain.columns.values)
selected_features = list(feature[support])
for position in range(len(selected_features)):
    idx = indices[position]
    print("%d. feature %d %s (%f)" % (position + 1, idx,
                                      selected_features[idx],
                                      importances[idx]))
In [163]:
# this time let's use all the training set
train_rows = df['origin'] == 'train'
test_rows = df['origin'] == 'test'
dropped_cols = ['Well Name', 'origin', 'Facies']

ytrain = df[train_rows]['Facies']
yvalid = df[test_rows]['Facies']
xtrain = df[train_rows].drop(dropped_cols, axis=1)
xvalid = df[test_rows].drop(dropped_cols, axis=1)
# Well of every validation row, kept aside for the per-well plots.
well_name_valid = df.loc[test_rows, "Well Name"]
In [ ]:
#pipe.fit(xtrain,ytrain)
In [164]:
# Predict on the DataFrame itself, as was done at fit time: passing `.values`
# drops the column names and makes recent scikit-learn warn about missing
# feature names. The predictions are the same.
preds = pipe.predict(xvalid)
In [165]:
# Predicted facies against depth for one validation well.
well = "CRAWFORD"
in_well = well_name_valid == well
depth = xvalid.loc[in_well, "Depth"]
# Give the predictions xvalid's index so the label-based boolean mask aligns
# even when the validation rows are not labelled 0..n-1.
predictions = pd.Series(preds, index=xvalid.index).loc[in_well]
plt.plot(depth, predictions)
plt.axis([2950, 3175, 1, 9])
plt.xlabel("Depth")
plt.ylabel("Predicted facies")
plt.title(well)
# Pass the on/off flag positionally: the `b=` keyword was renamed to `visible`
# in matplotlib 3.5 and later removed.
plt.grid(True, which='major', color='r', linestyle='--')
plt.show()
In [166]:
# Predicted facies against depth for one validation well.
well = "STUART"
in_well = well_name_valid == well
depth = xvalid.loc[in_well, "Depth"]
# Give the predictions xvalid's index so the label-based boolean mask aligns
# even when the validation rows are not labelled 0..n-1.
predictions = pd.Series(preds, index=xvalid.index).loc[in_well]
plt.plot(depth, predictions)
plt.axis([2800, 3050, 1, 9])
plt.xlabel("Depth")
plt.ylabel("Predicted facies")
plt.title(well)
# Pass the on/off flag positionally: the `b=` keyword was renamed to `visible`
# in matplotlib 3.5 and later removed.
plt.grid(True, which='major', color='r', linestyle='--')
plt.show()
In [167]:
# Attach the predicted facies to the validation features and write them out.
# Note: this adds a 'Facies' column to xvalid in place.
xvalid['Facies'] = preds
submission_path = 'XmasPreds_4.csv'
xvalid.to_csv(submission_path)