In [44]:
%matplotlib inline
import pandas as pd
from pandasql import sqldf
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score, LeavePGroupsOut, LeaveOneGroupOut, cross_val_predict
from sklearn.metrics import confusion_matrix, make_scorer, f1_score, accuracy_score, recall_score, precision_score
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable
pd.options.mode.chained_assignment = None
In [45]:
filename = './../training_data.csv'
training_data = pd.read_csv(filename)
print(set(training_data["Well Name"]))
training_data.head()
Out[45]:
In [46]:
well_data = pd.read_csv('./../validation_data_nofacies.csv')
print(set(well_data["Well Name"]))
print(well_data.shape)
well_data.head()
Out[46]:
In [47]:
# concat train and test for processing
well_data["origin"] = 'test'
training_data["origin"] = 'train'
df = pd.concat([well_data,training_data],axis=0,ignore_index=True)[list(training_data.columns)]
df['Well Name'] = df['Well Name'].astype('category')
df.head(10)
Out[47]:
In [48]:
# add some features based on the well data.
# number of points per formation: may correlate with how soft the rock is
print("formation size")
formation_size = df.groupby(["Well Name", 'Formation']).size().reset_index()
formation_size.columns = ["Well Name", 'Formation', 'formation_size']
df = pd.merge(df, formation_size, how='left', on=["Well Name", 'Formation'])
# depth extent of each formation within each well
print("depth")
formation_depth = df.groupby(["Well Name", 'Formation'])["Depth"].min().reset_index()
formation_depth.columns = ["Well Name", 'Formation', 'minimum_depth']
df = pd.merge(df, formation_depth, how='left', on=["Well Name", 'Formation'])
formation_depth = df.groupby(["Well Name", 'Formation'])["Depth"].max().reset_index()
formation_depth.columns = ["Well Name", 'Formation', 'maximum_depth']
df = pd.merge(df, formation_depth, how='left', on=["Well Name", 'Formation'])
df['formation_depth'] = df["maximum_depth"] - df["minimum_depth"]
df["soft_indic"] = df['formation_depth'] / df["formation_size"]
# add per-(well, formation) aggregates of each feature
print("add avgs of feat")
list_to_avg = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_avg:
    df[val + "_min"] = df.groupby(["Well Name", 'Formation'])[val].transform(np.min)
    df[val + "_max"] = df.groupby(["Well Name", 'Formation'])[val].transform(np.max)
    df[val + "_mean"] = df.groupby(["Well Name", 'Formation'])[val].transform(np.mean)
    df[val + "_var"] = df.groupby(["Well Name", 'Formation'])[val].transform(np.var)
# add distance-to-aggregate features: an attempt at regularization.
print("add distances feat.")
for val in list_to_avg:
    df[val + "_min_dist"] = df[val] - df[val + "_min"]
    df[val + "_max_dist"] = df[val] - df[val + "_max"]
    df[val + "_mean_dist"] = df[val] - df[val + "_mean"]
# add lag and lead features, computed within each well
print("lag lead")
list_to_lag = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_lag:
    for lag in range(1, 11):
        df[val + '_lag_' + str(lag)] = df[val] - df.groupby("Well Name")[val].shift(periods=lag)
        df[val + '_lead_' + str(lag)] = df[val] - df.groupby("Well Name")[val].shift(periods=-lag)
# adding some Formation lag and lead.
for lag in range(1, 3):
    df['Formation_lag_' + str(lag)] = df.groupby("Well Name")['Formation'].shift(periods=lag)
    df['Formation_lead_' + str(lag)] = df.groupby("Well Name")['Formation'].shift(periods=-lag)
    df['Formation_lag_' + str(lag) + 'equal'] = (df['Formation_lag_' + str(lag)] == df["Formation"]).astype(int)
    df['Formation_lead_' + str(lag) + 'equal'] = (df['Formation_lead_' + str(lag)] == df["Formation"]).astype(int)
print("rolling")
#Add rolling features
list_to_roll = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M','RELPOS']
window_size = [5,10,15,20,50]
for w in window_size:
for val in list_to_roll:
df[val+'_rollingmean_'+str(w)]=df.groupby("Well Name")[val].apply(
lambda x:x.rolling(window=w,center=True).mean())
df[val+'_rollingmax_'+str(w)]=df.groupby("Well Name")[val].apply(
lambda x:x.rolling(window=w,center=True).max())
df[val+'_rollingmin_'+str(w)]=df.groupby("Well Name")[val].apply(
lambda x:x.rolling(window=w,center=True).min())
df[val+'_rollingstd_'+str(w)]=df.groupby("Well Name")[val].apply(
lambda x:x.rolling(window=w,center=True).std())
print("special window features for NM_M")
def NM_M_distance(x,how,target):
length = len(x)
rank = np.empty(length)
count = -1
NM_M = x["NM_M"].values
if how=="up":
order = range(length)
elif how=="down":
order = range(length-1,-1,-1)
for i in order:
if ((NM_M[i] != target) & (count>-1)):
count+=1
rank[i] += count
elif NM_M[i] == target:
count=0
else:
rank[i] = count
rank = pd.DataFrame(rank.astype(int), columns=["NM_M_Rank_Target_+"+str(target)+"_"+how], index = x.index)
return(rank)
df["NM_M_Rank_Target_1_up"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="up",target=1)
df["NM_M_Rank_Target_2_up"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="up",target=2)
df["NM_M_Rank_Target_1_down"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="down",target=1)
df["NM_M_Rank_Target_2_down"]=df.groupby(["Well Name"]).apply(NM_M_distance,how="down",target=2)
print("filling na")
df = df.groupby(["Well Name"], as_index=False).apply(lambda group: group.bfill())
df = df.groupby(["Well Name"], as_index=False).apply(lambda group: group.ffill())
print("Vectorizing Formation text data")
from sklearn.feature_extraction.text import CountVectorizer
list_formation = ['Formation',
'Formation_lag_1',
'Formation_lead_1',
'Formation_lag_2',
'Formation_lead_2']
for l in list_formation:
cv = CountVectorizer()
counts = cv.fit_transform(df[l].values)
cols = [c+"_"+l for c in cv.get_feature_names()]
counts = pd.DataFrame(counts.toarray(),columns = cols)
df = df.drop(l,axis = 1)
df = pd.concat([df,counts],axis=1)
print("Finished preparing data. Now ready for ML ignition!")
In [189]:
#tokeep =['Facies','origin','Formation','Formation_lag_1','Formation_lead_1','Formation_lag_2','Formation_lead_2',
# 'Well Name','Depth','GR','ILD_log10','DeltaPHI','PHIND','PE','NM_M','RELPOS']
#nums = ['Depth','GR','ILD_log10','DeltaPHI','PHIND','PE','NM_M','RELPOS']
#tokeep = tokeep + [x+'_lag_1' for x in nums] +[x+'_lead_1' for x in nums]
#df = df[tokeep]
In [61]:
clf = RandomForestClassifier(
max_depth = 10,
n_estimators = 100,
max_features=0.1,
min_samples_leaf=25,
min_samples_split=50,
class_weight='balanced',
oob_score=True,
)
In [62]:
# hold out the NOLAN well for validation
ytrain = df[(df['origin']=='train') & (df['Well Name']!='NOLAN')]['Facies']
yvalid = df[(df['origin']=='train') & (df['Well Name']=='NOLAN')]['Facies']
xtrain = df[(df['origin']=='train') & (df['Well Name']!='NOLAN')].drop(['Well Name','origin','Facies'], axis=1)
xvalid = df[(df['origin']=='train') & (df['Well Name']=='NOLAN')].drop(['Well Name','origin','Facies'], axis=1)
In [63]:
clf.fit(xtrain,ytrain)
Out[63]:
In [64]:
preds = clf.predict(xvalid)
from sklearn.metrics import classification_report
print(clf.oob_score_)
print(classification_report(yvalid, preds))
print(f1_score(yvalid, preds, average="micro"))
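confusion_matrix is imported at the top but never used; as an optional extra check on the NOLAN hold-out, here is a minimal sketch (rows are true facies, columns are predicted facies):
In [ ]:
# Optional: per-facies confusion on the NOLAN hold-out.
labels = sorted(set(yvalid) | set(preds))
cm = confusion_matrix(yvalid, preds, labels=labels)
print(pd.DataFrame(cm, index=labels, columns=labels))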
In [65]:
# this time let's use all the training set
groups = df[(df['origin']=='train')]["Well Name"]
ytrain = df[(df['origin']=='train')]['Facies']
yvalid = df[(df['origin']=='test')]['Facies']  # all NaN: the test wells are unlabeled
xtrain = df[(df['origin']=='train')].drop(['Well Name','origin','Facies'],axis=1)
xvalid = df[(df['origin']=='test')].drop(['Well Name','origin','Facies'],axis=1)
In [66]:
cv=LeaveOneGroupOut().split(xtrain, ytrain, groups)
y_pred = cross_val_predict(clf, xtrain, ytrain, cv=cv, n_jobs=-1)
In [68]:
print(classification_report(ytrain, y_pred))
print(f1_score(ytrain, y_pred, average="micro"))
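LeaveOneGroupOut holds out one whole well per fold, so the score above approximates performance on a completely unseen well. A minimal sketch of the split behaviour on toy data (synthetic, for illustration only):
In [ ]:
# Toy illustration: each fold's test set is exactly one group (well).
toy_X = np.arange(6).reshape(-1, 1)
toy_y = np.array([0, 1, 0, 1, 0, 1])
toy_groups = np.array(['A', 'A', 'B', 'B', 'C', 'C'])
for train_idx, test_idx in LeaveOneGroupOut().split(toy_X, toy_y, toy_groups):
    print('held-out well:', set(toy_groups[test_idx]), '| train rows:', train_idx)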
In [69]:
importances = clf.feature_importances_  # from the fit in In [63]; cross_val_predict clones the estimator
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
feature = list(xtrain.columns.values)
for f in range(xtrain.shape[1]):
    print("%d. feature %d %s (%f)" % (f + 1, indices[f], feature[indices[f]], importances[indices[f]]))
In [70]:
# rebuild the full split for the final fit, keeping the test well names for plotting
ytrain = df[(df['origin']=='train')]['Facies']
yvalid = df[(df['origin']=='test')]['Facies']
xtrain = df[(df['origin']=='train')].drop(['Well Name','origin','Facies'],axis=1)
xvalid = df[(df['origin']=='test')].drop(['Well Name','origin','Facies'],axis=1)
well_name_valid = df.loc[(df['origin']=='test'),"Well Name"]
In [71]:
clf.fit(xtrain,ytrain)
Out[71]:
In [72]:
preds = clf.predict(xvalid)
In [73]:
well = "CRAWFORD"
depth = xvalid.loc[well_name_valid== well ,"Depth"]
predictions = pd.Series(preds).loc[well_name_valid==well]
plt.plot(depth,predictions)
plt.axis([2950,3175, 1, 9])
plt.grid(b=True, which='major', color='r', linestyle='--')
plt.show()
In [74]:
well = "STUART"
depth = xvalid.loc[well_name_valid== well ,"Depth"]
predictions = pd.Series(preds).loc[well_name_valid==well]
plt.plot(depth,predictions)
plt.axis([2800,3050, 1, 9])
plt.grid(b=True, which='major', color='r', linestyle='--')
plt.show()
In [75]:
xvalid['Facies']=preds
xvalid.to_csv('XmasPreds_3.csv')
In [76]:
# compare these predictions with an earlier submission
test_1 = pd.read_csv('XmasPreds_1.csv')["Facies"]
test_3 = pd.read_csv('XmasPreds_3.csv')["Facies"]
In [77]:
(test_1==test_3).describe()
Out[77]: