In [1]:
%matplotlib inline
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable
from pandas import set_option
# Keep DataFrame cell outputs compact.
set_option("display.max_rows", 10)
# Silence SettingWithCopyWarning globally.
# NOTE(review): this hides real chained-assignment bugs; prefer explicit
# .loc assignment / .copy() instead of suppressing the warning.
pd.options.mode.chained_assignment = None
In [34]:
# Labeled facies training data (path relative to the notebook directory).
filename = './../training_data.csv'
training_data = pd.read_csv(filename)
# Peek at the first rows to sanity-check the load.
training_data.head(10)
Out[34]:
In [3]:
# Distinct wells present in the training set.
set(training_data["Well Name"])
Out[3]:
In [35]:
# Unlabeled validation wells (no Facies column) — the final prediction target.
well_data = pd.read_csv('./../validation_data_nofacies.csv')
well_data.head(10)
Out[35]:
In [5]:
# Distinct wells present in the validation set.
set(well_data["Well Name"])
Out[5]:
In [36]:
# Tag each row with its origin so train and test can be split apart again
# after joint feature engineering, then stack the two frames.
training_data["origin"] = 'train'
well_data["origin"] = 'test'
stacked = pd.concat([well_data, training_data], axis=0, ignore_index=True)
# Keep the training-column order; validation rows get NaN for 'Facies'.
df = stacked[list(training_data.columns)]
df['Well Name'] = df['Well Name'].astype('category')
df.head(10)
Out[36]:
In [37]:
# Per-(well, formation) aggregate features.
# nb points : can be correlated with how soft soil is ?
print("session")
group_keys = ["Well Name", 'Formation']
# formation_size: number of logged samples inside each formation of a well.
# transform aligns results to df's rows directly, replacing the original
# groupby/reset_index/merge round-trips (same values, same RangeIndex).
df['formation_size'] = df.groupby(group_keys)['Depth'].transform('size')
# depth :
print("depth")
df['minimum_depth'] = df.groupby(group_keys)['Depth'].transform('min')
df['maximum_depth'] = df.groupby(group_keys)['Depth'].transform('max')
df['formation_depth'] = df["maximum_depth"] - df["minimum_depth"]
# thickness per sample: proxy for sampling density / "softness" of the soil.
df["soft_indic"] = df['formation_depth'] / df["formation_size"]
# add min/max/mean/var of every log within each (well, formation) group.
print("add avgs of feat")
list_to_avg = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_avg:
    grp = df.groupby(group_keys)[val]
    # String aggregations avoid the pandas FutureWarning raised when numpy
    # callables (np.min, ...) are passed to transform; pandas maps those
    # callables to the same named aggregations internally (np.var -> 'var').
    df[val + "_min"] = grp.transform('min')
    df[val + "_max"] = grp.transform('max')
    df[val + "_mean"] = grp.transform('mean')
    df[val + "_var"] = grp.transform('var')
# distances to the group statistics = an attempt at regularization.
print("add distances feat.")
for val in list_to_avg:
    df[val + "_min_dist"] = df[val] - df[val + "_min"]
    df[val + "_max_dist"] = df[val] - df[val + "_max"]
    df[val + "_mean_dist"] = df[val] - df[val + "_mean"]
In [38]:
# Lag / lead differences: how each log changes relative to neighbouring
# depth samples, computed within each well so wells never bleed into
# each other.
print("lag lead")
list_to_lag = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for feat in list_to_lag:
    by_well = df.groupby("Well Name")[feat]
    for step in range(1, 11):
        df[feat + '_lag_' + str(step)] = df[feat] - by_well.shift(periods=step)
        df[feat + '_lead_' + str(step)] = df[feat] - by_well.shift(periods=-step)
# Formation lag/lead: the neighbouring formation labels, plus 0/1
# indicators for whether the neighbour is in the same formation.
formation_by_well = df.groupby("Well Name")['Formation']
for step in range(1, 3):
    lag_col = 'Formation' + '_lag_' + str(step)
    lead_col = 'Formation' + '_lead_' + str(step)
    df[lag_col] = formation_by_well.shift(periods=step)
    df[lead_col] = formation_by_well.shift(periods=-step)
    df[lag_col + 'equal'] = (df[lag_col] == df["Formation"]).astype(int)
    df[lead_col + 'equal'] = (df[lead_col] == df["Formation"]).astype(int)
In [32]:
# Sanity check: confirm df is still a plain DataFrame.
type(df)
Out[32]:
In [39]:
# Fill remaining NaNs (lag/lead edges, missing PE, test-set Facies, ...)
# by propagating the nearest observation: backward fill first, then
# forward fill for any trailing gaps.
# Fix: fillna(method="bfill"/"ffill") is deprecated (removed in pandas 3.0);
# DataFrame.bfill()/ffill() are the long-standing equivalents.
# NOTE(review): this fills across well boundaries because df is not
# grouped by well here — worth confirming that is intended.
df = df.bfill()
df = df.ffill()
In [40]:
# Feature-matrix dimensions after feature engineering.
df.shape
Out[40]:
In [41]:
# List every Formation-derived column created so far.
[c for c in df.columns if "Formation" in c]
Out[41]:
In [59]:
# Reduce to the base well logs plus only the one-step lag/lead features;
# the deeper lags (2..10) are dropped to limit dimensionality.
tokeep = ['Facies', 'origin', 'Formation', 'Well Name',
          'Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
nums = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for suffix in ('_lag_1', '_lead_1'):
    tokeep = tokeep + [col + suffix for col in nums]
df = df[tokeep]
In [60]:
# count vectorizer formation
# Encode the Formation label as bag-of-words token counts; each formation
# name token becomes one column.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
counts = cv.fit_transform(df['Formation'].values)
In [61]:
# Column names for the vectorized Formation tokens.
# Fix: get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() when available, with a fallback so the cell
# still runs on old scikit-learn versions.
if hasattr(cv, 'get_feature_names_out'):
    cols = cv.get_feature_names_out()
else:
    cols = cv.get_feature_names()
In [62]:
# Densify the token counts and attach them in place of the raw Formation
# column; row alignment relies on both frames sharing a 0..n-1 RangeIndex.
counts = pd.DataFrame(counts.toarray(), columns=cols)
df = pd.concat([df.drop('Formation', axis=1), counts], axis=1)
df.shape
Out[62]:
In [44]:
# NOTE(review): stale out-of-order cell (In[44] ran before the pruning /
# vectorizing cells above it). 'Formation' has already been dropped, so
# this raises KeyError on a fresh Restart & Run All — delete or move it.
df['Formation']
In [43]:
# NOTE(review): dead out-of-order cell (In[43] predates the `tokeep`
# pruning cell). On a fresh run the Formation_lag_*/Formation_lead_*
# columns no longer exist (dropped by the tokeep selection) and
# 'Formation' itself was consumed by the earlier CountVectorizer cells,
# so df[l] raises KeyError. Kept for reference only — remove or re-order
# before the pruning step if these encodings are wanted.
from sklearn.feature_extraction.text import CountVectorizer
list_formation = ['Formation','Formation_lag_1','Formation_lead_1','Formation_lag_2','Formation_lead_2']
for l in list_formation:
    cv = CountVectorizer()
    counts = cv.fit_transform(df[l].values)
    cols = cv.get_feature_names()
    counts = pd.DataFrame(counts.toarray(),columns = cols)
    df = df.drop(l,axis = 1)
    df = pd.concat([df,counts],axis=1)
In [63]:
# params
# Random-forest hyper-parameters: shallow-ish trees, many estimators.
max_depth = 8
n_estimators = 2000
# random_state pins the stochastic tree construction so results are
# reproducible across re-runs; n_jobs=-1 parallelizes over all cores
# (2000 trees is slow single-threaded).
clf = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators,
                             random_state=42, n_jobs=-1)
In [65]:
# Hold out the CHURCHMAN BIBLE well as a local validation split.
# Fix: the original used the Python 2 `<>` operator, which is a
# SyntaxError in Python 3 — replaced with `!=`.
train_mask = (df['origin'] == 'train') & (df['Well Name'] != 'CHURCHMAN BIBLE')
valid_mask = (df['origin'] == 'train') & (df['Well Name'] == 'CHURCHMAN BIBLE')
ytrain = df[train_mask]['Facies']
yvalid = df[valid_mask]['Facies']
xtrain = df[train_mask].drop(['Well Name', 'origin', 'Facies'], axis=1)
xvalid = df[valid_mask].drop(['Well Name', 'origin', 'Facies'], axis=1)
In [66]:
# Train on all labeled wells except the held-out one.
clf.fit(xtrain,ytrain)
Out[66]:
In [68]:
# Predict facies for the held-out well.
preds = clf.predict(xvalid)
In [69]:
# Per-class precision/recall/F1 on the held-out well.
# Fix: Python 2 print statement -> print() function, consistent with the
# print() calls used throughout the rest of the notebook.
from sklearn.metrics import classification_report
print(classification_report(yvalid, preds))
In [70]:
# This time use every labeled well for training; the unlabeled
# validation wells become the prediction target.
train_rows = df['origin'] == 'train'
test_rows = df['origin'] == 'test'
drop_cols = ['Well Name', 'origin', 'Facies']
ytrain = df[train_rows]['Facies']
# NOTE: test wells carry no true Facies labels (values here come from the
# earlier bfill/ffill), so yvalid is not usable as ground truth.
yvalid = df[test_rows]['Facies']
xtrain = df[train_rows].drop(drop_cols, axis=1)
xvalid = df[test_rows].drop(drop_cols, axis=1)
In [71]:
# Refit the forest on the full labeled dataset.
clf.fit(xtrain,ytrain)
Out[71]:
In [73]:
# Predict facies for the unlabeled wells.
# Fix: pass the DataFrame itself (as in the earlier predict call) rather
# than .values, so feature names stay consistent with what fit() saw and
# scikit-learn >= 1.0 does not warn about missing feature names.
preds = clf.predict(xvalid)
In [173]:
# preds
In [74]:
# Attach the predicted facies and export the submission file.
xvalid = xvalid.assign(Facies=preds)
xvalid.to_csv('XmasPreds.csv')