In [218]:
from numpy.fft import rfft
from scipy import signal
import numpy as np
import matplotlib.pyplot as plt
import plotly.plotly as py
import pandas as pd
import timeit
from sqlalchemy.sql import text
from sklearn import tree
from sklearn.model_selection import LeavePGroupsOut
#from sklearn import cross_validation
#from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
#import sherlock.filesystem as sfs
#import sherlock.database as sdb
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
In [2]:
# Load the training CSV once; training_data0 stays as the untouched raw frame
# and training_data1 starts out as an independent working copy of it.
filename = 'training_data.csv'
training_data0 = pd.read_csv(filename)
training_data1 = training_data0.copy()
In [3]:
def magic(df):
    """Return a copy of ``df`` extended with four derived columns per log.

    For each of the columns ``GR``, ``DeltaPHI`` and ``PHIND`` the following
    columns are appended, in this order (``<col>`` is the source column name):

    * ``<col>_1`` -- the 9-sample median-filtered signal circularly shifted by
      3 rows with ``np.roll`` (the last 3 values wrap around to the start).
    * ``<col>_2`` -- the signal after a forward-backward (``filtfilt``)
      2nd-order digital Butterworth high-pass filter with critical
      frequency 0.2.
    * ``<col>_3`` -- ``np.gradient`` of the mean-removed signal.
    * ``<col>_4`` -- the 9-sample median-filtered signal.

    Every filter runs over the whole column in row order, so rows belonging
    to different wells are filtered as one continuous signal.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``GR``, ``DeltaPHI`` and ``PHIND``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the 12 extra columns.

    Raises
    ------
    KeyError
        If one of the three source columns is missing.
    ValueError
        Raised by ``scipy.signal.filtfilt`` when the column is too short for
        its default edge padding.
    """
    out = df.copy()
    b, a = signal.butter(2, 0.2, btype='high', analog=False)
    for col in ('GR', 'DeltaPHI', 'PHIND'):
        values = np.asarray(df[col])
        median_filtered = signal.medfilt(values, 9)
        out[col + '_1'] = np.roll(median_filtered, 3)
        out[col + '_2'] = signal.filtfilt(b, a, values)
        # The mean is removed before differentiating, exactly as the original
        # code did, so the floating-point results stay identical.
        out[col + '_3'] = np.gradient(values - np.mean(values))
        out[col + '_4'] = median_filtered
    return out
In [4]:
def run_test(remove_well, df_train):
    """Hold one well out, train a random forest on the rest, score on the well.

    Parameters
    ----------
    remove_well : str
        Value of the ``'Well Name'`` column identifying the held-out well.
    df_train : pandas.DataFrame
        Frame containing ``'Formation'``, ``'Well Name'``, ``'Depth'`` and
        ``'Facies'`` plus the feature columns. Both the training rows and the
        held-out rows are taken from this frame.

    Returns
    -------
    float
        Micro-averaged F1 score of the predictions on the held-out well.
    """
    non_feature_cols = ['Formation', 'Well Name', 'Depth', 'Facies']

    # The held-out rows come from the argument, not from a notebook global,
    # so the result depends only on what the caller passes in.
    is_blind = df_train['Well Name'] == remove_well
    blind = df_train[is_blind]
    training_data = df_train[~is_blind]

    feature_vectors = training_data.drop(non_feature_cols, axis=1)
    correct_facies_labels_train = training_data['Facies'].values

    rf = RandomForestClassifier(max_depth=5, n_estimators=600)
    rf.fit(feature_vectors, correct_facies_labels_train)

    features_blind = blind.drop(non_feature_cols, axis=1)
    correct_facies_labels = blind['Facies'].values
    predicted_random_forest = rf.predict(features_blind)
    return metrics.f1_score(correct_facies_labels, predicted_random_forest,
                            average='micro')
In [5]:
def dothererun(selected_pred_facies):
    """Retrain on the selected part of the blind well and re-predict all of it.

    Reads the notebook-level names ``blind`` (the blind-well frame) and
    ``selected_data`` (one flag per blind row); rows where
    ``selected_data == True`` form the training set.

    Parameters
    ----------
    selected_pred_facies : array-like
        Labels used as training targets for the selected rows, one per
        selected row. These are predicted labels; the true ``'Facies'``
        values of the blind well are used only for scoring.

    Returns
    -------
    float
        Micro-averaged F1 score of the new predictions over the whole blind
        well. The score is also printed.
    """
    non_feature_cols = ['Formation', 'Well Name', 'Depth', 'Facies']

    # Part of the blind well, with predicted labels, becomes the training set.
    training_data_dodgy = blind[selected_data == True]
    feature_vectors = training_data_dodgy.drop(non_feature_cols, axis=1)
    rf = RandomForestClassifier(max_depth=5, n_estimators=1000)
    rf.fit(feature_vectors, selected_pred_facies)

    # The entire blind well is predicted and scored against its true labels.
    features_blind = blind.drop(non_feature_cols, axis=1)
    correct_facies_labels = blind['Facies'].values
    predicted_random_forest_new = rf.predict(features_blind)
    out_f1 = metrics.f1_score(correct_facies_labels, predicted_random_forest_new,
                              average='micro')
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("in the rerun f1 is: ")
    print(out_f1)
    return out_f1
In [ ]:
# Rebuild the working frame: run magic() on a fresh copy of the raw training
# data so that training_data0 itself is left untouched.
df = training_data0.copy()
training_data1 = magic(df)
In [6]:
# Repeated hold-one-well-out evaluation with run_test().
df_train = training_data1

# Candidate wells; only the subset assigned to `wells` is evaluated.
all_wells = ['CHURCHMAN BIBLE', 'SHANKLE', 'NOLAN', 'NEWBY', 'Recruit F9',
             'CROSS H CATTLE', 'LUKE G U', 'SHRIMPLIN']
wells = ['NOLAN']

# The forest is not seeded, so each well is scored several times and the
# spread of the scores is reported alongside their mean.
n_runs = 4

# NOTE: `remove_well` keeps its last value after the loop and is read by the
# following cell.
for remove_well in wells:
    # Named f1_scores rather than `all`, which would shadow the builtin for
    # the rest of the kernel session.
    f1_scores = []
    print("well : %s, f1 for different runs:" % (remove_well))
    for ii in range(n_runs):
        out_f1 = run_test(remove_well, df_train)
        f1_scores.append(out_f1)
    av1 = np.mean(f1_scores)
    print("average f1 is %f, 2*std is %f" % (av1, 2*np.std(f1_scores)))
In [337]:
# Fit one forest with the well `remove_well` held out (the name is left over
# from the evaluation loop in the previous cell) and keep the blind-well
# predictions and class probabilities for the cells below.
df_test = training_data1
blind = df_test[df_test['Well Name'] == remove_well]
training_data = df_train[df_train['Well Name'] != remove_well]

correct_facies_labels_train = training_data['Facies'].values
feature_vectors = training_data.drop(['Formation', 'Well Name', 'Depth', 'Facies'], axis=1)
rf = RandomForestClassifier(max_depth=5, n_estimators=1000)
rf.fit(feature_vectors, correct_facies_labels_train)

correct_facies_labels = blind['Facies'].values
features_blind = blind.drop(['Formation', 'Well Name', 'Depth', 'Facies'], axis=1)
predicted_random_forest = rf.predict(features_blind)
out_f1 = metrics.f1_score(correct_facies_labels, predicted_random_forest, average='micro')
# Single-argument print(...) produces the same output on Python 2 and 3.
print(out_f1)

# props has one row per blind sample and one column per class known to rf;
# props_best is the largest class probability of each sample.
props = rf.predict_proba(features_blind)
props_best = props.max(axis=1)
In [342]:
# Probability stored in column 3 of props, plotted against the probability of
# the most likely class, one point per blind sample.
plt.plot(props[:,3])
plt.plot(props_best)
plt.show()
# Mean of the winning probability, then mean plus one standard deviation.
# Single-argument print(...) produces the same output on Python 2 and 3.
print(props_best.mean())
print(props_best.mean()+props_best.std())
RERUNNING PREDICTION FROM PART BLIND ONLY:
In [343]:
# Probability thresholds to evaluate. Earlier scans used grids between 0.1 and
# 0.5 (for example 0.20 to 0.44 in steps of 0.02); only the value below is
# evaluated now.
decs = [0.625]
for vals in decs:
    # Blind samples whose winning class probability exceeds the threshold.
    confident = props_best > vals
    # Later cells test `selected_data == True`, so the flags are kept as 1/0
    # in the dtype of the predictions, exactly like the element-wise loop did.
    selected_data = confident.astype(predicted_random_forest.dtype)
    new_array = list(predicted_random_forest[confident])
    count = int(confident.sum())

    selected_pred_facies = predicted_random_forest[confident]
    selected_true_labels = correct_facies_labels[confident]
    out_f1 = metrics.f1_score(selected_true_labels, selected_pred_facies, average='micro')
    print("vals of decs is %.5f " % (vals) )
    print("nr of samples %d " % (count) )
    print("total nr of facies predicted is %d" % (len(np.unique(selected_pred_facies))))
    print("total nr of facies present is %d" % (len(np.unique(selected_true_labels))))
    print("f1 is %.5f " % (out_f1) )
    #dothererun(selected_pred_facies)
    print("-------------------------------------------------")
In [ ]:
In [ ]:
In [ ]:
In [ ]:
Leave P groups out (the cell below holds out 4 wells per split)
In [314]:
# Leave-P-groups-out over the training wells: one forest per split, each
# predicting the blind well; the per-split predictions are then combined by
# taking the median label of every sample.
wells = training_data["Well Name"].values  # training_data excludes the blind well
y = training_data['Facies']
X = training_data.drop(['Formation', 'Well Name', 'Depth', 'Facies'], axis=1)
lpgo = LeavePGroupsOut(n_groups=4)

# Predictions are collected in a list and stacked once at the end. Seeding the
# stack with a row of zeros (as before) left that row in pred_array, so it
# took part in the median as an extra vote for label 0.
blind_predictions = []
for train, test in lpgo.split(X, y, groups=wells):
    xx0 = X.iloc[train]
    yy0 = y.iloc[train]
    rf = RandomForestClassifier(max_depth=7, n_estimators=2000)
    rf.fit(xx0, yy0)
    # The score on the left-out wells used to be computed here and then
    # overwritten without being used, so it is no longer computed.
    predicted_random_forest = rf.predict(features_blind)  # prediction on blind
    out_f1 = metrics.f1_score(correct_facies_labels, predicted_random_forest, average='micro')
    blind_predictions.append(predicted_random_forest)
    print(out_f1)

# Newest split first, matching the row order the original stacking produced.
pred_array = np.vstack(blind_predictions[::-1])
# A median that falls between two labels is truncated by the int cast.
med_pred = np.median(pred_array, axis=0)
med_pred = med_pred.astype(int)
out_f1 = metrics.f1_score(correct_facies_labels, med_pred, average='micro')
print("prediction from median, leave p out:")
print(out_f1)
Combining part blind + leave P groups out (the cell below holds out 5 wells per split):
In [344]:
# Augmented leave-P-groups-out: every split trains on its training wells plus
# the confidently predicted blind samples (labelled with their predicted
# facies), predicts the whole blind well, and the per-split predictions are
# combined by the median.
training_data_dodgy = blind[selected_data == True]
training_data_dodgy1 = training_data_dodgy.drop(['Formation', 'Well Name', 'Depth', 'Facies'], axis=1)
correct_facies_labels_train = selected_pred_facies
#----------------------------------------------------------------------------
wells = training_data["Well Name"].values  # training_data excludes the blind well
y = training_data['Facies']
X = training_data.drop(['Formation', 'Well Name', 'Depth', 'Facies'], axis=1)
lpgo = LeavePGroupsOut(n_groups=5)

# Predictions are collected in a list and stacked once at the end. Seeding the
# stack with a row of zeros (as before) left that row in pred_array, so it
# took part in the median as an extra vote for label 0.
blind_predictions = []
for train, test in lpgo.split(X, y, groups=wells):
    xx0 = X.iloc[train]
    yy0 = y.iloc[train]
    XTRAIN = pd.concat([xx0, training_data_dodgy1], ignore_index=True)
    YTRAIN = np.concatenate((yy0.values, correct_facies_labels_train))
    # selected_pred_facies / selected_true_labels used to be reassigned here
    # from the latest predictions. YTRAIN never saw those values (it uses
    # correct_facies_labels_train, bound before the loop), but they changed
    # what a re-run of this cell would train on, so they are no longer touched.
    rf = RandomForestClassifier(max_depth=7, n_estimators=2000)
    rf.fit(XTRAIN, YTRAIN)
    predicted_random_forest = rf.predict(features_blind)  # prediction on blind
    out_f1 = metrics.f1_score(correct_facies_labels, predicted_random_forest, average='micro')
    blind_predictions.append(predicted_random_forest)
    print(out_f1)

# Newest split first, matching the row order the original stacking produced.
pred_array = np.vstack(blind_predictions[::-1])
# A median that falls between two labels is truncated by the int cast.
med_pred = np.median(pred_array, axis=0)
med_pred = med_pred.astype(int)
out_f1 = metrics.f1_score(correct_facies_labels, med_pred, average='micro')
print("prediction from median, leave p out+augmentation:")
print(out_f1)
In [284]:
# Display the dimensions of pred_array, the array built with np.vstack in the cells above.
pred_array.shape
Out[284]:
In [292]:
# Display column 70 of pred_array, i.e. the entry at index 70 of every stacked row.
pred_array[:,70]
Out[292]:
In [ ]:
In [ ]:
In [ ]:
In [1665]:
def predict_final(test_well, training_data,test_data):
    """Train a random forest on ``training_data`` and predict one test well.

    Parameters
    ----------
    test_well : str
        Value of the ``'Well Name'`` column selecting the rows of
        ``test_data`` to predict.
    training_data : pandas.DataFrame
        Must contain ``'Facies'`` (the training target). The columns
        ``'Formation'``, ``'Well Name'``, ``'Depth'``, ``'Facies'`` and
        ``'FaciesLabels'`` are excluded from the features when present.
    test_data : pandas.DataFrame
        Must contain ``'Well Name'`` and every feature column of
        ``training_data``.

    Returns
    -------
    numpy.ndarray
        Predicted facies, one per row of the selected well.

    Raises
    ------
    KeyError
        If ``test_data`` lacks one of the training feature columns.
    """
    non_feature_cols = ['Formation', 'Well Name', 'Depth', 'Facies', 'FaciesLabels']
    blind = test_data[test_data['Well Name'] == test_well]

    correct_facies_labels_train = training_data['Facies'].values
    # errors='ignore': frames without a 'FaciesLabels' column (such as the ones
    # the other cells of this notebook build) no longer raise KeyError here.
    feature_vectors_train = training_data.drop(non_feature_cols, axis=1, errors='ignore')
    rf = RandomForestClassifier(max_depth=15, n_estimators=600)
    rf.fit(feature_vectors_train, correct_facies_labels_train)

    # Select exactly the training feature columns, in training order, so the
    # blind well is presented to the forest with the same column layout.
    feature_vectors_blind = blind[feature_vectors_train.columns]
    predicted_random_forest = rf.predict(feature_vectors_blind)
    return predicted_random_forest
In [406]:
# Reload the training CSV so the final-prediction cells start from the raw
# frame; training_data1 is reset to an independent copy of it.
filename = 'training_data.csv'
training_data0 = pd.read_csv(filename)
training_data1 = training_data0.copy()
In [407]:
# Load the validation CSV; `filename` now refers to this file.
filename = 'validation_data_nofacies.csv'
test_data = pd.read_csv(filename)
In [408]:
# Add the derived columns produced by magic() to both frames so that the
# training and validation data expose the same feature columns.
test_data1 = magic(test_data)
training_data1 = magic(training_data0)
In [409]:
# Well of the validation file to predict in the cells below.
# Alternative used for the other run: 'STUART'.
test_well = 'CRAWFORD'
In [410]:
# First pass on the validation well: train on every training well, predict
# the rows of `test_well`, and keep the class probabilities for the
# confidence-based selection in the cells below.
training_data = training_data1
blind = test_data1[test_data1['Well Name'] == test_well]

feature_vectors = training_data.drop(['Formation', 'Well Name', 'Depth', 'Facies'], axis=1)
correct_facies_labels_train = training_data['Facies'].values

rf = RandomForestClassifier(max_depth=5, n_estimators=1000)
rf.fit(feature_vectors, correct_facies_labels_train)

# `blind` comes from the validation file, so no 'Facies' column is dropped
# and nothing is scored here.
features_blind = blind.drop(['Formation', 'Well Name', 'Depth'], axis=1)
predicted_random_forest = rf.predict(features_blind)

# Largest class probability of each blind sample.
props = rf.predict_proba(features_blind)
props_best = props.max(axis=1)
In [411]:
# Probability stored in column 3 of props, plotted against the probability of
# the most likely class, one point per sample of the validation well.
plt.plot(props[:,3])
plt.plot(props_best)
plt.show()
# Mean of the winning probability, then mean plus one standard deviation.
# Single-argument print(...) produces the same output on Python 2 and 3.
print(props_best.mean())
print(props_best.mean()+props_best.std())
In [419]:
# Probability thresholds to evaluate. An earlier scan covered 0.5 to 0.74;
# only the value below is evaluated now.
decs = [0.65]
for vals in decs:
    # Samples whose winning class probability exceeds the threshold.
    confident = props_best > vals
    # Later cells test `selected_data == True`, so the flags are kept as 1/0
    # in the dtype of the predictions, exactly like the element-wise loop did.
    selected_data = confident.astype(predicted_random_forest.dtype)
    new_array = list(predicted_random_forest[confident])
    count = int(confident.sum())

    # No scoring in this cell: the true-label and f1 lines are disabled here.
    selected_pred_facies = predicted_random_forest[confident]
    print("vals of decs is %.5f " % (vals) )
    print("nr of samples %d " % (count) )
    print("total nr of facies predicted is %d" % (len(np.unique(selected_pred_facies))))
    #dothererun(selected_pred_facies)
    print("-------------------------------------------------")
In [422]:
# Final prediction for the validation well: every leave-P-groups-out split
# trains on its training wells plus the confidently predicted validation
# samples (labelled with their predicted facies) and predicts the whole well;
# the per-split predictions are combined by the median.
training_data_dodgy = blind[selected_data == True]
training_data_dodgy1 = training_data_dodgy.drop(['Formation', 'Well Name', 'Depth'], axis=1)
correct_facies_labels_train = selected_pred_facies
#----------------------------------------------------------------------------
wells = training_data["Well Name"].values  # training wells only
y = training_data['Facies']
X = training_data.drop(['Formation', 'Well Name', 'Depth', 'Facies'], axis=1)
count = 0
lpgo = LeavePGroupsOut(n_groups=5)

# Predictions are collected in a list and stacked once at the end. This
# removes the hard-coded per-well row length (356 / 474) and the all-zero
# seed row, which stayed in pred_array and took part in the median as an
# extra vote for label 0.
blind_predictions = []
for train, test in lpgo.split(X, y, groups=wells):
    xx0 = X.iloc[train]
    yy0 = y.iloc[train]
    XTRAIN = pd.concat([xx0, training_data_dodgy1], ignore_index=True)
    YTRAIN = np.concatenate((yy0.values, correct_facies_labels_train))
    # selected_pred_facies used to be reassigned here from the latest
    # predictions. YTRAIN never saw that value, but it changed what a re-run
    # of this cell would train on, so it is no longer touched.
    rf = RandomForestClassifier(max_depth=7, n_estimators=2000)
    rf.fit(XTRAIN, YTRAIN)
    predicted_random_forest = rf.predict(features_blind)  # prediction on blind
    blind_predictions.append(predicted_random_forest)
    count += 1
    print(count)

# Newest split first, matching the row order the original stacking produced.
pred_array = np.vstack(blind_predictions[::-1])
# A median that falls between two labels is truncated by the int cast.
med_pred = np.median(pred_array, axis=0)
med_pred = med_pred.astype(int)
In [423]:
# Keep the median-combined predictions under a well-specific name and display
# them (test_well is set to 'CRAWFORD' in an earlier cell).
predicted_craw=med_pred
predicted_craw
Out[423]:
In [405]:
# NOTE(review): this cell's execution count (405) is lower than that of the
# CRAWFORD cells above (406-423), so it was presumably run while med_pred
# still held the result for test_well='STUART'. On a top-to-bottom run
# med_pred holds the CRAWFORD predictions at this point. Confirm, and re-run
# the pipeline with test_well='STUART' before relying on this value.
predicted_stu=med_pred
predicted_stu
Out[405]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: