In [38]:
from numpy.fft import rfft
from scipy import signal
import numpy as np
import matplotlib.pyplot as plt
import plotly.plotly as py  # NOTE(review): deprecated module (modern plotly moved this to chart_studio.plotly); unused below — verify before upgrading
import pandas as pd
import timeit
from sqlalchemy.sql import text
from sklearn import tree
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# the same names now live in sklearn.model_selection.
from sklearn import model_selection
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
#import sherlock.filesystem as sfs
#import sherlock.database as sdb
from sklearn import preprocessing
The first steps — reading in and exploring the data — are the same as Brendon's:
In [1344]:
# Load the raw facies training table from disk and preview the first rows.
filename = 'training_data.csv'
training_data0 = pd.read_csv(filename)
training_data0.head()
Out[1344]:
In [1350]:
# Separate the target (facies class per sample) from the feature columns;
# metadata columns are excluded from the feature matrix.
non_feature_cols = ['Formation', 'Well Name', 'Depth', 'Facies', 'FaciesLabels']
correct_facies_labels = training_data0['Facies'].values
feature_vectors = training_data0.drop(non_feature_cols, axis=1)
feature_vectors.describe()
Out[1350]:
Scale the data:
In [1351]:
# Standardize each feature to zero mean / unit variance (fit on all rows).
scaler = preprocessing.StandardScaler().fit(feature_vectors)
scaled_features = scaler.transform(feature_vectors)
In [1352]:
# Hold out 20% of the samples as a test split (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(scaled_features, correct_facies_labels, test_size=0.2, random_state=0)
In [1355]:
# Random forest on the random row-wise split.
rf = RandomForestClassifier(max_depth=15, n_estimators=200, max_features=None)
rf.fit(X_train, y_train)
predicted_random_forest = rf.predict(X_test)
# print() calls: the Python-2 `print expr` statements used here were
# inconsistent with the print() calls later in the notebook and fail on
# Python 3.
print("prediction from random forest:")
print(metrics.accuracy_score(list(y_test), predicted_random_forest))
print("f1 score:")
print(metrics.f1_score(list(y_test), predicted_random_forest, average='weighted'))
In [1356]:
# Work on a copy so the raw table (training_data0) stays untouched.
training_data=training_data0.copy()
In [1357]:
# Hold out one entire well (SHANKLE) as a blind test set; keep the rest
# for training.
is_blind_well = training_data['Well Name'] == 'SHANKLE'
blind = training_data[is_blind_well]
training_data = training_data[~is_blind_well]
In [1358]:
# Rebuild target/features from the reduced (blind-well-removed) table,
# then standardize the features.
meta_cols = ['Formation', 'Well Name', 'Depth', 'Facies', 'FaciesLabels']
correct_facies_labels = training_data['Facies'].values
feature_vectors = training_data.drop(meta_cols, axis=1)
scaler = preprocessing.StandardScaler().fit(feature_vectors)
scaled_features = scaler.transform(feature_vectors)
In [1359]:
# Only the training portion of the split is used here; the held-out part
# is discarded because evaluation happens on the blind well instead.
X_train, _X_unused, y_train, _y_unused = train_test_split(
    scaled_features, correct_facies_labels, test_size=0.2, random_state=0)
rf.fit(X_train, y_train)
Out[1359]:
In [1360]:
# Build the blind-well feature matrix the same way as the training features.
# NOTE(review): the scaler is re-fit on the blind well itself instead of
# reusing the training-set scaler — confirm this is intentional.
meta_cols = ['Formation', 'Well Name', 'Depth', 'Facies', 'FaciesLabels']
correct_facies_labels = blind['Facies'].values
feature_vectors = blind.drop(meta_cols, axis=1)
scaler = preprocessing.StandardScaler().fit(feature_vectors)
scaled_features = scaler.transform(feature_vectors)
In [1361]:
# Score the model trained on the other wells against the blind well.
predicted_random_forest = rf.predict(scaled_features)
# print() calls for Python 3 compatibility / consistency with later cells.
print("All training data different from test well")
print("prediction from random forest:")
print(metrics.accuracy_score(correct_facies_labels, predicted_random_forest))
print("f1 score:")
print(metrics.f1_score(correct_facies_labels, predicted_random_forest, average='weighted'))
The prediction performs much better if all the data is included in the training, compared to blind wells. This shouldn't be much of a surprise, but doesn't it suggest that some wells are not representative of the others?
In [1362]:
from sklearn.metrics import confusion_matrix
from classification_utilities import display_cm, display_adj_cm

# FIX: `facies_labels` was only defined in a much later cell, so this cell
# raised NameError on a fresh Restart-&-Run-All; define it before use
# (class names ordered by their integer codes 1-9).
facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS', 'WS', 'D', 'PS', 'BS']
conf = confusion_matrix(correct_facies_labels, predicted_random_forest)
display_cm(conf, facies_labels, hide_zeros=True)
This is the benchmark to beat: 0.44 using the random forest (slightly higher for gradient boosting).
Basic statistics by facies:
In [1668]:
temp_1 = training_data.groupby('Formation').mean()  # per-formation averages
temp_2 = training_data.groupby('Facies').mean()     # per-facies averages
temp_2  # display the facies-level statistics
Out[1668]:
Basic statistics by well:
In [1364]:
temp_4 = training_data.groupby('Well Name')         # grouped view (not used below)
temp_5 = training_data.groupby('Well Name').mean()  # per-well averages
temp_5  # display the well-level statistics
Out[1364]:
In [1373]:
# Pull the facies sequence and the GR log out as plain Python lists.
xx0 = list(training_data0['Facies'])
xx1 = list(training_data0['GR'])
In [1374]:
# Explore signal-processing features of the GR log next to the facies
# sequence: shifted copies, high-pass filter, median filter, gradient.
x_min1 = np.roll(xx1, 1)
x_min2 = np.roll(xx1, 2)
x_min3 = np.roll(xx1, 3)
scale = 0.5
# Only the high-pass design is used below; the original cell also built a
# low-pass filter into the same (b, a) names, which was immediately
# overwritten (dead code) and has been removed.
b, a = signal.butter(2, 0.2, btype='high', analog=False)
xx1 = xx1 - np.mean(xx1)  # remove DC offset before filtering
xx_fil = signal.filtfilt(b, a, xx1)
xx_mf = signal.medfilt(xx1, 15)
xx_grad = np.gradient(xx1)
fig, ax = plt.subplots(figsize=(30, 20))
# FIX: xx1 holds GR (see the previous cell), so the legend now says GR
# rather than the incorrect 'Original Delta PHI'.
plt.plot(scale * xx1, color='black', label='Original GR')
plt.plot(scale * xx_mf, color='blue', label='median filter')
plt.plot(xx0, color='green', label='Facies')
ax.set_xlim(400, 700)
plt.legend(loc="best")
plt.show()
In [1608]:
def magic(df):
    """Augment a well-log DataFrame with signal-derived feature columns.

    For each of GR, DeltaPHI and PHIND, four extra columns are added:
      <col>_1 : median-filtered log (kernel 9) shifted down by 3 samples
      <col>_2 : high-pass (2nd-order Butterworth, cutoff 0.2) filtered log
      <col>_3 : gradient of the de-meaned log
      <col>_4 : median-filtered log (kernel 9)

    All original columns are preserved. Returns a new DataFrame; `df` is
    not modified.
    """
    df1 = df.copy()
    b, a = signal.butter(2, 0.2, btype='high', analog=False)
    # BUG FIX: the loop previously iterated over an undefined name `feats0`
    # (a leftover from a deleted cell), raising NameError on a fresh
    # kernel; it now iterates over the locally defined feature list.
    feats00 = ['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
    feats01 = ['GR', 'DeltaPHI', 'PHIND']
    for ii in feats00:
        df1[ii] = df[ii]
        name1 = ii + '_1'
        name2 = ii + '_2'
        name3 = ii + '_3'
        name4 = ii + '_4'
        xx1 = list(df[ii])
        xx_mf = signal.medfilt(xx1, 9)
        x_min3 = np.roll(xx_mf, 3)
        xx1a = xx1 - np.mean(xx1)
        xx_fil = signal.filtfilt(b, a, xx1)
        xx_grad = np.gradient(xx1a)
        if ii in feats01:
            df1[name1] = x_min3
            df1[name2] = xx_fil
            df1[name3] = xx_grad
            df1[name4] = xx_mf
    return df1
#del training_data1
# Apply the feature-engineering step (magic) to a copy of the raw table.
df=training_data0.copy()
training_data1=magic(df)
In [1412]:
# Bar chart of random-forest feature importances, keeping only the features
# whose importance exceeds a threshold; the dropped names feed later cells
# via `kols_out`.
x = rf.feature_importances_
kolummen = feature_vectors.columns.tolist()
# Single threshold (the original cell assigned `mask` twice; the first
# assignment at 0.025 was dead code).
mask = x > 0.035
x1 = x[mask]
# Partition column names into kept / dropped according to the mask.
kols = []
kols_out = []
for name, keep in zip(kolummen, mask):
    if keep:
        kols.append(name)
    else:
        kols_out.append(name)

fig, ax = plt.subplots(figsize=(30, 20))
N = len(kols)
X = x1
ind = np.arange(N)   # the x locations for the bars
width = 0.30         # the width of the bars
fsize = 16
rects1 = ax.bar(ind, X, width, color='black')
# axes and labels
ax.set_xlim(-width, len(ind) + width)
ax.set_xlabel('feature', fontsize=fsize)
ax.set_ylabel('importance', fontsize=fsize)
ax.set_title('feature importance', fontsize=fsize)
xTickMarks = kols
ax.set_xticks(ind + width)
xtickNames = ax.set_xticklabels(xTickMarks, fontsize=fsize)
plt.setp(xtickNames, rotation=45, fontsize=fsize)
# print() calls: the old Python-2 print statements fail under Python 3.
print(len(kolummen))  # total number of candidate features
print(N)              # number kept above the threshold
plt.show()
In [1413]:
# Drop the low-importance columns identified in the previous cell.
training_data1a = training_data1.drop(kols_out, axis=1)
training_data1a.head()
Out[1413]:
In [1614]:
def run_test(remove_well, df_train):
    """Train a random forest with one well held out and return blind-well F1.

    Parameters
    ----------
    remove_well : str
        Well name to hold out: its rows are taken from the global
        `training_data1` for testing and excluded from `df_train` for
        training.
    df_train : pandas.DataFrame
        Engineered feature table including 'Facies' and metadata columns.

    Returns
    -------
    float
        Micro-averaged F1 score on the held-out well.
    """
    # NOTE(review): implicit dependency on the module-level `training_data1`;
    # its feature columns must match df_train's.
    df_test = training_data1
    blind = df_test[df_test['Well Name'] == remove_well]
    train = df_train[df_train['Well Name'] != remove_well]

    meta_cols = ['Formation', 'Well Name', 'Depth', 'Facies', 'FaciesLabels']
    y_train = train['Facies'].values
    X_train = train.drop(meta_cols, axis=1)

    # The original fitted StandardScalers here and on the blind well but
    # never applied them (the transform calls were commented out); those
    # dead fits are removed and raw features are used, as before.
    rf = RandomForestClassifier(max_depth=15, n_estimators=600)
    rf.fit(X_train, y_train)

    y_blind = blind['Facies'].values
    X_blind = blind.drop(meta_cols, axis=1)
    predicted = rf.predict(X_blind)
    return metrics.f1_score(y_blind, predicted, average='micro')
In [1615]:
# Leave-one-well-out evaluation: for each well, train on the others and
# score on the held-out well; repeat 3 times to average out RF randomness.
df_train = training_data1
wells = ['CHURCHMAN BIBLE', 'SHANKLE', 'NOLAN', 'NEWBY', 'Recruit F9',
         'CROSS H CATTLE', 'LUKE G U', 'SHRIMPLIN']
av_all = []
for remove_well in wells:
    # FIX: renamed from `all`, which shadowed the built-in all().
    scores = []
    print("well : %s, f1 for different runs:" % (remove_well))
    for ii in range(3):
        scores.append(run_test(remove_well, df_train))
    av1 = np.mean(scores)
    av_all.append(av1)
    print("average f1 is %f, 2*std is %f" % (av1, 2 * np.std(scores)))
print("overall average f1 is %f" % (np.mean(av_all)))
In [1572]:
# Grid search over forest depth / size.
# FIX: GridSearchCV was never imported anywhere in the notebook, so this
# cell raised NameError on a fresh kernel; import it locally.
from sklearn.model_selection import GridSearchCV

rfc = RandomForestClassifier(max_depth=9, max_features='sqrt', n_estimators=250)
param_grid = {
    'max_depth': [5, 6, 7, 8, 9],
    'n_estimators': [150, 250, 350, 600]
}
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
# NOTE(review): scaled_features_train / correct_facies_labels_train are not
# defined at notebook scope in this file (they are locals of run_test) —
# confirm which cell is meant to produce them before re-running.
CV_rfc.fit(scaled_features_train, correct_facies_labels_train)
print(CV_rfc.best_params_)  # print(): Python 3 compatible
In [1633]:
# Re-read the full training table (fresh copy for the final predictions).
filename = 'training_data.csv'
training_data = pd.read_csv(filename)
In [1642]:
# Load the validation wells (no facies labels provided).
filename = 'validation_data_nofacies.csv'
test_data = pd.read_csv(filename)
In [1643]:
# Preview the validation set.
test_data.head()
Out[1643]:
In [1635]:
# Treat well name and formation as categoricals, then list the wells present.
for col in ('Well Name', 'Formation'):
    training_data[col] = training_data[col].astype('category')
training_data['Well Name'].unique()
Out[1635]:
In [1636]:
# Facies class names, ordered by their integer codes 1-9.
facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS', 'WS', 'D','PS', 'BS']
# NOTE(review): label_facies is not defined anywhere in this file — it is
# presumably the tutorial helper mapping a row's Facies code to its name;
# confirm the cell defining it runs before this one.
training_data.loc[:,'FaciesLabels'] = training_data.apply(lambda row: label_facies(row, facies_labels), axis=1)
In [1657]:
# Preprocessing: apply the feature-engineering step (magic) to both the
# labelled training table and the unlabelled validation table.
training_data1 = magic(training_data)
test_data1 = magic(test_data)
In [1665]:
def predict_final(test_well, training_data, test_data):
    """Fit a random forest on all labelled data and predict one blind well.

    Returns the predicted facies codes for the rows of `test_data` whose
    'Well Name' equals `test_well`.
    """
    blind_rows = test_data[test_data['Well Name'] == test_well]
    labels = training_data['Facies'].values
    features = training_data.drop(
        ['Formation', 'Well Name', 'Depth', 'Facies', 'FaciesLabels'], axis=1)
    model = RandomForestClassifier(max_depth=15, n_estimators=600)
    model.fit(features, labels)
    blind_features = blind_rows.drop(['Formation', 'Well Name', 'Depth'], axis=1)
    return model.predict(blind_features)
In [1666]:
# Final predictions for the two validation wells.
predicted_stu = predict_final('STUART', training_data1, test_data1)
predicted_craw = predict_final('CRAWFORD', training_data1, test_data1)
In [1669]:
# Display the STUART predictions.
predicted_stu
Out[1669]:
In [1670]:
# Display the CRAWFORD predictions.
predicted_craw
Out[1670]: