In [2]:
from numpy.fft import rfft
from scipy import signal
import numpy as np
import matplotlib.pyplot as plt
import plotly.plotly as py
import pandas as pd
import timeit
from sqlalchemy.sql import text
from sklearn import tree
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
#import sherlock.filesystem as sfs
#import sherlock.database as sdb
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
First steps, reading in and exploring the data are the same as Brendon's steps:
In [26]:
# ALL preprocessing in 1 go
# Read the labeled training data and the unlabeled validation data, then build
# the engineered-feature frame via magic1.
# NOTE(review): magic1 is defined further down the notebook (cell In [5]) and
# the execution counts are non-sequential — this cell fails on a fresh
# Restart & Run All until that cell has run; confirm intended cell order.
filename = 'training_data.csv'
training_data0 = pd.read_csv(filename)
df=training_data0.copy()
training_data1=magic1(df)
filename = 'validation_data_nofacies.csv'
test_data = pd.read_csv(filename)
In [1635]:
# Cast the well and formation identifiers to pandas categoricals (cheaper,
# groupby-friendly) and display the set of wells present.
training_data0['Well Name'] = training_data0['Well Name'].astype('category')
training_data0['Formation'] = training_data0['Formation'].astype('category')
training_data0['Well Name'].unique()
Out[1635]:
In [31]:
def predict_final(test_well, training_data, test_data):
    """Train a random forest on every training well and predict facies for one
    validation well.

    Parameters: test_well -- well name to select from test_data;
    training_data -- labeled frame with a 'Facies' column;
    test_data -- unlabeled frame containing the well.
    Returns the array of predicted facies labels for the selected well.
    """
    blind = test_data[test_data['Well Name'] == test_well]
    train_labels = training_data['Facies'].values
    # Drop identifier/label columns so only log-derived features remain.
    train_features = training_data.drop(
        ['Formation', 'Well Name', 'Depth', 'Facies'], axis=1)
    rf = RandomForestClassifier(max_depth=15, n_estimators=600)
    rf.fit(train_features, train_labels)
    # The validation frame has no 'Facies' column, so only three drops here.
    blind_features = blind.drop(['Formation', 'Well Name', 'Depth'], axis=1)
    return rf.predict(blind_features)
In [1666]:
# Produce the final per-well predictions for the two validation wells.
# NOTE(review): `test_data1` is not defined anywhere on a fresh run (the
# `test_data1=magic1(df)` line near the end of the notebook is commented
# out) — this cell relies on stale kernel state; confirm the intended frame.
test_well='STUART'
predicted_stu=predict_final(test_well, training_data1, test_data1)
test_well='CRAWFORD'
predicted_craw=predict_final(test_well, training_data1, test_data1)
NEW
In [22]:
# Per-well mean statistics for a quick sanity check of the training data.
# NOTE(review): `test_data0` is never defined (the validation frame is read
# into `test_data`) — this cell fails on a fresh kernel; verify the name.
# `temp2` is also never used afterwards.
temp1=training_data0.groupby('Well Name')
temp2=test_data0.groupby('Well Name')
#temp1.describe()
temp1.mean()
Out[22]:
In [5]:
def magic1(df):
    """Feature engineering: augment a well-log frame with derived columns.

    For each of GR, DeltaPHI and PHIND, four new columns are appended:
      <name>_1 -- 9-point median-filtered log rolled forward by 3 samples
      <name>_2 -- order-2 Butterworth high-pass (cutoff 0.2) of the raw log
      <name>_3 -- gradient of the demeaned raw log
      <name>_4 -- 9-point median-filtered log
    Returns a copy of `df`; the input frame is left untouched.
    """
    out = df.copy()
    b, a = signal.butter(2, 0.2, btype='high', analog=False)
    base_feats = ['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
    derived_feats = ['GR', 'DeltaPHI', 'PHIND']
    for feat in base_feats:
        out[feat] = df[feat]
        raw = list(df[feat])
        smoothed = signal.medfilt(raw, 9)      # despiked version of the log
        shifted = np.roll(smoothed, 3)          # lagged (3-sample) copy
        demeaned = raw - np.mean(raw)
        highpassed = signal.filtfilt(b, a, raw)  # zero-phase high-pass
        slope = np.gradient(demeaned)
        if feat in derived_feats:
            out[feat + '_1'] = shifted
            out[feat + '_2'] = highpassed
            out[feat + '_3'] = slope
            out[feat + '_4'] = smoothed
    return out
In [6]:
def run_test1(remove_well, df_train, df_test):
    """Leave-one-well-out evaluation with a random forest.

    Trains on every well except `remove_well`, predicts facies for the held-out
    ("blind") well, and returns (micro-averaged F1 score, predictions).

    Fix: the original fitted a StandardScaler twice but never used either
    result (the transform lines were commented out) — that dead work has been
    removed; features are deliberately left unscaled, as before.
    """
    blind = df_test[df_test['Well Name'] == remove_well]
    training_data = df_train[df_train['Well Name'] != remove_well]

    correct_facies_labels_train = training_data['Facies'].values
    feature_vectors_train = training_data.drop(
        ['Formation', 'Well Name', 'Depth', 'Facies'], axis=1)

    rf = RandomForestClassifier(max_depth=15, n_estimators=600)
    rf.fit(feature_vectors_train, correct_facies_labels_train)

    # Evaluate on the blind well.
    correct_facies_labels = blind['Facies'].values
    feature_vectors_blind = blind.drop(
        ['Formation', 'Well Name', 'Depth', 'Facies'], axis=1)
    predicted_random_forest = rf.predict(feature_vectors_blind)
    out_f1 = metrics.f1_score(correct_facies_labels, predicted_random_forest,
                              average='micro')
    return out_f1, predicted_random_forest
Run the preprocessing well by well (more correct in terms of filtering, since the median and high-pass filters should not run across well boundaries):
In [481]:
# Per-well preprocessing: run magic1 on each training well separately so the
# median/high-pass filters never run across well boundaries.
# NOTE(review): this cell is duplicated almost verbatim at In [169] (which
# stores the result as training_data2) — consider a shared helper.
df=training_data0.copy()
wells=['CHURCHMAN BIBLE','SHANKLE','NOLAN','NEWBY','Recruit F9' ,'CROSS H CATTLE','LUKE G U','SHRIMPLIN']
appended_data=[]
for name in wells:
dff=df[df['Well Name']==name]
dfnew1=magic1(dff)
appended_data.append(dfnew1)
appended_data1 = pd.concat(appended_data, axis=0)
appended_data1.head()
training_data3=appended_data1.copy()
How well do we do beforehand:
In [484]:
training_data1.head()
Out[484]:
In [7]:
def add_previous_facies(df):
    """Return a Series giving, for each sample, the nearest *different* facies
    value that occurs above it (earlier in the Series).

    The first sample keeps its own value (it has no predecessor). Within a run
    of identical facies, every sample receives the facies of the run before it.
    The input Series is not modified.
    """
    result = df.copy()
    result.iloc[0] = df.iloc[0]
    prior_distinct = df.iloc[0]
    for pos in range(1, len(df)):
        # When a facies boundary is crossed, the sample just above becomes
        # the new "previous distinct facies"; otherwise it is carried along.
        if df.iloc[pos - 1] != df.iloc[pos]:
            prior_distinct = df.iloc[pos - 1]
        result.iloc[pos] = prior_distinct
    return result
def add_next_facies(df):
    """Return a Series giving, for each sample, the nearest *different* facies
    value that occurs below it (later in the Series).

    Mirror image of add_previous_facies: the last sample keeps its own value,
    and within a run of identical facies every sample receives the facies of
    the run after it. The input Series is not modified.
    """
    last = len(df) - 1
    result = df.copy()
    result.iloc[last] = df.iloc[last]
    next_distinct = df.iloc[last]
    for pos in range(last - 1, -1, -1):
        # Walk upward; crossing a facies boundary updates the carried value.
        if df.iloc[pos + 1] != df.iloc[pos]:
            next_distinct = df.iloc[pos + 1]
        result.iloc[pos] = next_distinct
    return result
In [70]:
def add_prediction(training_data1, predicted_random_forest, remove_well):
    """Split the data into the blind well and the remaining wells, attaching
    the latest prediction to the blind frame as a 'facies_pred' column.

    Parameters: training_data1 -- full frame with a 'Well Name' column;
    predicted_random_forest -- predictions aligned with the blind well's rows;
    remove_well -- name of the blind well.
    Returns (blind frame with 'facies_pred', training frame without the well).

    Fix: the original assigned a new column on a boolean slice of the input
    frame, which writes into a view (SettingWithCopyWarning and potentially
    lost writes). Both frames are now explicit copies, so the caller's frame
    is never touched.
    """
    blind = training_data1[training_data1['Well Name'] == remove_well].copy()
    df_train_noblind = training_data1[
        training_data1['Well Name'] != remove_well].copy()
    # Align predictions to the blind well's row index before attaching.
    blind['facies_pred'] = pd.Series(predicted_random_forest, index=blind.index)
    return blind, df_train_noblind
In [ ]:
In [8]:
def add_previous(training_data1, predicted_random_forest, remove_well):
    """Split into blind/training frames and attach both the prediction
    ('facies_pred') and the previous-distinct-facies feature ('facies_min1')
    to the blind well.

    Returns (blind frame with the two new columns, training frame without the
    blind well).

    Fix: the original assigned columns on boolean slices of the input frame
    (SettingWithCopyWarning / write-into-view hazard); both frames are now
    explicit copies and writes go through .loc.
    """
    blind = training_data1[training_data1['Well Name'] == remove_well].copy()
    df_train_noblind = training_data1[
        training_data1['Well Name'] != remove_well].copy()
    blind['facies_pred'] = pd.Series(predicted_random_forest, index=blind.index)
    # Derive the "previous distinct facies" series from the predictions.
    dfnew1 = add_previous_facies(blind['facies_pred'])
    mask = blind['Well Name'] == remove_well
    blind.loc[mask, 'facies_min1'] = dfnew1
    return blind, df_train_noblind
In [110]:
def addn_1(wells, blindwell, blind, df_train_noblind):
    """Attach the n-1 (previous-distinct-facies) feature 'facies_min1'.

    For the blind well the feature is derived from its *predicted* facies
    ('facies_pred'); for each training well it is derived from the *true*
    'Facies' column. Returns copies of both frames with the new column; the
    inputs are left untouched.
    """
    blind1 = blind.copy()
    # Initialize the column with the right dtype/index before filling it.
    blind1['facies_min1'] = 0.0 * blind1['facies_pred']
    blind_mask = blind1['Well Name'] == blindwell
    pred_series = blind1['facies_pred'][blind_mask]
    blind1.loc[blind_mask, 'facies_min1'] = add_previous_facies(pred_series)

    # Training wells: use the ground-truth facies, one well at a time so the
    # shift never crosses a well boundary.
    df_train_noblind1 = df_train_noblind.copy()
    df_train_noblind1['facies_min1'] = 0.0 * df_train_noblind1['Facies']
    for name in wells:
        well_mask = df_train_noblind1['Well Name'] == name
        true_series = df_train_noblind1['Facies'][well_mask]
        df_train_noblind1.loc[well_mask, 'facies_min1'] = \
            add_previous_facies(true_series)
    return blind1, df_train_noblind1
In [157]:
def add_surrounding0(predicted_random_forest, remove_well, blind, df_train_noblind):
    """Smooth the blind well's predicted facies using surrounding-facies
    context (backward-looking variant).

    Per sample, takes the median of: the prediction (counted twice), the
    previous-distinct-facies series, and the twice-applied previous-distinct
    series, then writes the result back into 'facies_pred'. The training
    frame is passed through unchanged.

    Fix: the original also computed the next-facies series (add_next_facies,
    twice) but never included them in the median — that dead work has been
    removed; the median inputs are unchanged.
    """
    blind['facies_pred'] = pd.Series(predicted_random_forest, index=blind.index)
    df = blind['facies_pred']
    dfnew1 = add_previous_facies(df)
    dfnew2 = add_previous_facies(dfnew1)
    # Median across columns; duplicating `df` weights the raw prediction.
    AA1 = pd.concat([df, df, dfnew1, dfnew2], axis=1)
    most_common = AA1.median(axis=1)
    mask = blind['Well Name'] == remove_well
    blind.loc[mask, 'facies_pred'] = most_common
    return blind, df_train_noblind
In [158]:
def add_surrounding(predicted_random_forest, remove_well, blind, df_train_noblind):
    """Smooth the blind well's predicted facies using surrounding-facies
    context in both directions.

    Per sample, takes a weighted median of: the prediction (x3), the
    previous-distinct-facies series (x2), its second application (x1), the
    next-distinct-facies series (x1) and its second application (x1). The
    smoothed values overwrite 'facies_pred' on the blind frame; the training
    frame is passed through unchanged.
    """
    blind['facies_pred'] = pd.Series(predicted_random_forest, index=blind.index)
    pred = blind['facies_pred']
    prev1 = add_previous_facies(pred)
    prev2 = add_previous_facies(prev1)
    nxt1 = add_next_facies(pred)
    nxt2 = add_next_facies(nxt1)
    # Repetition of a column acts as a weight in the row-wise median.
    stacked = pd.concat(
        [pred, pred, pred, prev1, prev1, prev2, nxt1, nxt2], axis=1)
    smoothed = stacked.median(axis=1)
    mask = blind['Well Name'] == remove_well
    blind.loc[mask, 'facies_pred'] = smoothed
    return blind, df_train_noblind
In [204]:
def make_predict(df_train_noblind, blind):
    """Train a random forest on the training wells (true facies plus the
    engineered context features) and score it on the blind well.

    Prints and returns the micro-averaged F1 score together with the
    prediction array: (out_f1, predicted_random_forest).

    Fix: the Python-2-only `print out_f1` statement is replaced with the
    `print(out_f1)` call form, valid under both Python 2 and 3.
    """
    correct_facies_labels_train = df_train_noblind['Facies'].values
    scaled_features_train = df_train_noblind.drop(
        ['Formation', 'Well Name', 'Depth', 'Facies'], axis=1)
    rf = RandomForestClassifier(max_depth=15, n_estimators=600)
    rf.fit(scaled_features_train, correct_facies_labels_train)

    # Score on the blind well; 'facies_pred' is dropped so the model never
    # sees its own previous prediction as a raw feature here.
    correct_facies_labels = blind['Facies'].values
    feature_vectors = blind.drop(
        ['Formation', 'Well Name', 'Depth', 'Facies', 'facies_pred'], axis=1)
    predicted_random_forest = rf.predict(feature_vectors)
    out_f1 = metrics.f1_score(correct_facies_labels, predicted_random_forest,
                              average='micro')
    print(out_f1)
    return out_f1, predicted_random_forest
In [207]:
def make_predict_test(df_train_noblind, blind):
    """Train on the labeled training wells and predict facies for a blind
    validation well that has no 'Facies' column.

    Returns the array of predicted facies labels.
    """
    train_labels = df_train_noblind['Facies'].values
    train_features = df_train_noblind.drop(
        ['Formation', 'Well Name', 'Depth', 'Facies'], axis=1)
    rf = RandomForestClassifier(max_depth=15, n_estimators=600)
    rf.fit(train_features, train_labels)
    # Validation frame: no 'Facies' to drop, but 'facies_pred' must go so the
    # feature columns line up with the training features.
    blind_features = blind.drop(
        ['Formation', 'Well Name', 'Depth', 'facies_pred'], axis=1)
    return rf.predict(blind_features)
In [168]:
# ALL preprocessing in 1 go
filename = 'training_data.csv'
training_data0 = pd.read_csv(filename)
df=training_data0.copy()
#first preprocessing:
training_data1=magic1(df)
In [169]:
# Per-well preprocessing: run magic1 on each training well separately so the
# filters never cross well boundaries; result stored as training_data2.
# NOTE(review): duplicates cell In [481] above (stored as training_data3) —
# consider factoring into a helper function.
df=training_data0.copy()
wells=['CHURCHMAN BIBLE','SHANKLE','NOLAN','NEWBY','Recruit F9' ,'CROSS H CATTLE','LUKE G U','SHRIMPLIN']
appended_data=[]
for name in wells:
dff=df[df['Well Name']==name]
dfnew1=magic1(dff)
appended_data.append(dfnew1)
appended_data1 = pd.concat(appended_data, axis=0)
appended_data1.head()
training_data2=appended_data1.copy()
First prediction:
In [183]:
# Leave-one-well-out benchmark on the per-well preprocessed data: run
# run_test1 repeatedly (the forest is stochastic) and report the F1 spread.
# Fixes: `all` renamed to `scores` (it shadowed the builtin `all`), and the
# Python-2-only `print out_f1` replaced with the call form.
#df_train=training_data1
#df_test=training_data1
df_train = training_data2
df_test = training_data2
wells = ['CHURCHMAN BIBLE']
#wells=['SHANKLE']
av_all = []
for remove_well in wells:
    scores = []
    print("well : %s, f1 for different runs:" % (remove_well))
    for ii in range(25):
        out_f1, predicted_random_forest = run_test1(remove_well, df_train, df_test)
        print(out_f1)
        scores.append(out_f1)
    av1 = np.mean(scores)
    av_all.append(av1)
    print("average f1 is %f, 2*std is %f" % (av1, 2 * np.std(scores)))
print("overall average f1 is %f" % (np.mean(av_all)))
ADDING PREDICTION INFORMATION AND SEPARATING INTO BLIND AND REST
In [184]:
# Split into the blind well and the rest, attaching the latest prediction to
# the blind frame as 'facies_pred' (uses predicted_random_forest left over
# from the benchmark loop above).
remove_well='CHURCHMAN BIBLE'
#remove_well='SHANKLE'
#blind, df_train_noblind = add_previous(training_data1, predicted_random_forest, remove_well)
blind, df_train_noblind = add_prediction(training_data2, predicted_random_forest, remove_well)
ADDING N-1 INFORMATION
In [185]:
# Add the n-1 (previous-distinct-facies) feature: derived from the predicted
# facies for the blind well and from the true facies for each training well.
#blindwell='SHANKLE'
#wells=['CHURCHMAN BIBLE','NOLAN','NEWBY','Recruit F9' ,'CROSS H CATTLE','LUKE G U','SHRIMPLIN']
blindwell='CHURCHMAN BIBLE'
wells=['SHANKLE','NOLAN','NEWBY','Recruit F9' ,'CROSS H CATTLE','LUKE G U','SHRIMPLIN']
blind1, df_train_noblind1=addn_1(wells, blindwell, blind, df_train_noblind)
Make the first prediction using the predicted facies ("pred") and the previous-facies (n-1) feature:
In [186]:
# Predict with the context features, smooth the prediction with surrounding
# facies, then predict again.
# NOTE(review): add_surrounding is passed `predicted_random_forest` (from the
# earlier benchmark loop), not the `predicted_random_forest1` computed on the
# line above — confirm this is intentional.
out_f1, predicted_random_forest1= make_predict(df_train_noblind1, blind1)
blind2, df_train_noblind2 = add_surrounding(predicted_random_forest, remove_well, blind1, df_train_noblind1)
out_f1, predicted_random_forest2= make_predict(df_train_noblind2, blind2)
The two steps above can be applied iteratively. (But does the F1 score keep improving?)
In [222]:
# Iterate the attach-prediction -> re-predict cycle and track the F1 score at
# each pass, starting from the first context-feature prediction.
# Fix: the Python-2-only `print` statement is replaced with the call form,
# valid under both Python 2 and 3.
blind = blind1
df_train_noblind = df_train_noblind1
predicted_random_forest = predicted_random_forest1
all1 = []
for ii in range(10):
    print("iteration " + str(ii + 1))
    # Refresh the facies_pred column with the latest prediction before
    # training/predicting again.
    blind['facies_pred'] = pd.Series(predicted_random_forest, index=blind.index)
    #blind, df_train_noblind=addn_1(wells, blindwell, blind, df_train_noblind)
    #blind, df_train_noblind = add_surrounding(predicted_random_forest, remove_well, blind, df_train_noblind)
    out_f1, predicted_random_forest = make_predict(df_train_noblind, blind)
    all1.append(out_f1)
print("overall average f1 is %f" % (np.mean(all1)))
In [98]:
#predicted_random_forest_first=predicted_random_forest
#for ii in range(4):
# blind, df_train_noblind = add_surrounding2(training_data2, predicted_random_forest, remove_well)
# out_f1, predicted_random_forest= make_predict(df_train_noblind, blind)
In [38]:
#fig, ax = plt.subplots(figsize=(30, 20))
#plt.plot(correct_facies_labels, color='black', label='facies')
#plt.plot(predicted_random_forest, color='red', label='predicted')
#plt.show()
In [192]:
# Read the unlabeled validation data.
# NOTE(review): the whole-frame preprocessing is commented out here; the
# per-well variant in the next cell produces test_data2 instead, so
# `test_data1` never exists on a fresh run.
filename = 'validation_data_nofacies.csv'
test_data = pd.read_csv(filename)
df=test_data.copy()
#test_data1=magic1(df)
In [193]:
# Per-well preprocessing of the two validation wells (same pattern as the
# training wells) -> test_data2.
df=test_data.copy()
wells=['STUART','CRAWFORD']
appended_data=[]
for name in wells:
dff=df[df['Well Name']==name]
dfnew1=magic1(dff)
appended_data.append(dfnew1)
appended_data1 = pd.concat(appended_data, axis=0)
appended_data1.head()
test_data2=appended_data1.copy()
In [194]:
# Initial facies prediction for each validation well.
# NOTE(review): training uses training_data1 (whole-frame preprocessing) while
# test_data2 was preprocessed per well — the columns match but the filtered
# values were computed differently; confirm this mix is intentional.
test_well='STUART'
predicted_stu=predict_final(test_well, training_data1, test_data2)
test_well='CRAWFORD'
predicted_craw=predict_final(test_well, training_data1, test_data2)
In [196]:
# Attach the initial predictions to test_data2 as a 'facies_pred' column,
# one well at a time via a boolean mask.
test_well='STUART'
mask = test_data2['Well Name']==test_well
test_data2.loc[mask, 'facies_pred'] = predicted_stu
test_well='CRAWFORD'
mask = test_data2['Well Name']==test_well
test_data2.loc[mask, 'facies_pred'] = predicted_craw
In [197]:
test_data2.head()
Out[197]:
In [217]:
# Build the n-1 feature for STUART: from its predicted facies for the blind
# frame, and from the true facies of all eight training wells.
test_well='STUART'
wells=['CHURCHMAN BIBLE','SHANKLE','NOLAN','NEWBY','Recruit F9' ,'CROSS H CATTLE','LUKE G U','SHRIMPLIN']
df_test=test_data2[test_data2['Well Name']==test_well]
blind1, df_train_noblind1=addn_1(wells, test_well, df_test, training_data2)
FINAL PREDICTION
In [218]:
predicted_stu1= make_predict_test(df_train_noblind1, blind1)
predicted_stu1
Out[218]:
In [219]:
# Same n-1 feature construction for the CRAWFORD validation well.
test_well='CRAWFORD'
wells=['CHURCHMAN BIBLE','SHANKLE','NOLAN','NEWBY','Recruit F9' ,'CROSS H CATTLE','LUKE G U','SHRIMPLIN']
df_test=test_data2[test_data2['Well Name']==test_well]
blind1, df_train_noblind1=addn_1(wells, test_well, df_test, training_data2)
In [220]:
predicted_craw1= make_predict_test(df_train_noblind1, blind1)
predicted_craw1
Out[220]:
In [223]:
# Compare the CRAWFORD facies curves across prediction stages.
# Fixes: labels were set but plt.legend() was never called, and two lines
# shared the label 'predicted'; the axes object is now used explicitly and
# the figure gets a legend and axis labels.
# NOTE(review): `predicted_craw0` is not defined anywhere in this notebook —
# confirm which earlier run produced it before re-running on a fresh kernel.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(predicted_craw0, color='black', label='baseline prediction')
ax.plot(predicted_craw, color='red', label='initial prediction')
ax.plot(predicted_craw1, color='blue', label='prediction with context features')
ax.set_xlabel('sample index')
ax.set_ylabel('facies label')
ax.legend()
plt.show()