In [ ]:
%matplotlib notebook
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from classification_utilities import display_cm, display_adj_cm
import matplotlib.pyplot as plt
import pywt
from scipy.signal import upfirdn
from sklearn.model_selection import LeaveOneGroupOut
In [ ]:
filename = 'training_data.csv'
training_data = pd.read_csv(filename)
In [ ]:
## Create a difference vector for each feature e.g. x1-x2, x1-x3... x2-x3...
feature_vectors = training_data.drop(['Formation','Facies'], axis=1)
feature_vectors = feature_vectors[['Depth','GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE']]
def difference_vector(feature_vectors):
    # Build a (25, n_samples) array of absolute pairwise differences
    # between the five logs at every depth sample.
    length = len(feature_vectors['Depth'])
    df_temp = np.zeros((25, length))
    for i in range(length):
        vector_i = feature_vectors.iloc[i, :]
        vector_i = vector_i[['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE']]
        for j, value_j in enumerate(vector_i):
            for k, value_k in enumerate(vector_i):
                df_temp[5*j + k, i] = np.abs(value_j - value_k)
    return df_temp
def diff_vec2frame(feature_vectors, df_temp):
    # Write each pairwise-difference row back into the frame as a
    # 'diff<log_i><log_j>' column.
    heads = feature_vectors.columns[1:]
    for i in range(5):
        string_i = heads[i]
        for j in range(5):
            string_j = heads[j]
            col_head = 'diff' + string_i + string_j
            feature_vectors[col_head] = pd.Series(df_temp[5*i + j, :])
    return feature_vectors
df_diff = difference_vector(feature_vectors)
feature_vectors = diff_vec2frame(feature_vectors, df_diff)
# Drop the duplicated (symmetric) difference columns; the all-zero diagonal
# columns collapse into 'diffGRGR', which is then dropped explicitly
feature_vectors = feature_vectors.T.drop_duplicates().T
feature_vectors.drop('diffGRGR', axis=1, inplace=True)
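In [ ]:
## Sanity check (a sketch, not used in the pipeline): the same pairwise
## |x_i - x_j| features can be built with NumPy broadcasting. The sample
## values below are hypothetical, not taken from the data.
row = np.array([65.0, 0.9, 5.2, 12.7, 3.1])  # hypothetical GR, ILD_log10, DeltaPHI, PHIND, PE
pairwise_abs_diff = np.abs(row[:, None] - row[None, :])
print(pairwise_abs_diff.shape)  # (5, 5); the diagonal is all zeros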
In [ ]:
## Rank each log within its formation (percentile rank), putting each
## formation's ranks into separate columns; rows outside that formation get zero.
feature_vectors['Formation'] = training_data['Formation']
# feature_vectors['Facies'] = training_data['Facies']
def rank_formation(feature_vectors):
    # Percentile-rank each log within each formation; rows belonging to
    # other formations are filled with zero.
    formation_labels = np.sort(feature_vectors['Formation'].unique())
    feature_vec_origin = feature_vectors[['Formation', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE']]
    for i, value_i in enumerate(formation_labels):
        formation = feature_vec_origin[feature_vec_origin['Formation'] == value_i]
        form_rank = formation.rank(pct=True, numeric_only=True)
        for j, value_j in enumerate(form_rank.columns):
            col = value_j + '_' + value_i + '_rank'
            feature_vectors[col] = form_rank[value_j]
            feature_vectors[col] = feature_vectors[col].fillna(0)
    return feature_vectors
feature_vectors = rank_formation(feature_vectors)
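In [ ]:
## For reference (a sketch): pandas groupby can compute per-formation percentile
## ranks in one step. Note it yields one ranked column per log, rather than the
## sparse per-(log, formation) columns built by rank_formation above.
log_cols = ['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE']
ranks_sketch = training_data.groupby('Formation')[log_cols].rank(pct=True)
print(ranks_sketch.head())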
In [ ]:
## Create DWT approximation coefficients (levels 1-7) for the scaled logs;
## upsample and hold the coefficients back to the original signal length.
def DWT_facies(feature_vectors):
    # For each standardized log, take the level-j DWT approximation
    # coefficients and upsample-and-hold them back to the original length
    # with a boxcar FIR filter of length 2**j.
    feature_vec_origin = feature_vectors[['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE']]
    cols = feature_vec_origin.columns
    scaler = preprocessing.StandardScaler().fit(feature_vec_origin)
    scaled_features = scaler.transform(feature_vec_origin)
    scaled_features = pd.DataFrame(scaled_features, columns=cols)
    for i, value_i in enumerate(cols):
        sig = scaled_features[value_i]
        for j in np.arange(1, 8):
            fir_coeffs = np.ones((2**j,), dtype=int)
            dwt_coeffs = pywt.downcoef('a', sig, 'db1', mode='symmetric', level=j)
            dwt_coeff_up = upfirdn(fir_coeffs, dwt_coeffs, 2**j)
            feature_vectors[value_i + '_DWT_level' + str(j)] = dwt_coeff_up[0:len(sig)]
    return feature_vectors
feature_vectors = DWT_facies(feature_vectors)
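In [ ]:
## Toy illustration of the DWT step above (assumed values, not the real logs):
## pywt.downcoef returns the level-j approximation coefficients, and upfirdn
## with a boxcar of length 2**j stretches them back toward the original length.
toy_sig = np.arange(16, dtype=float)
j = 2
toy_coeffs = pywt.downcoef('a', toy_sig, 'db1', mode='symmetric', level=j)
toy_held = upfirdn(np.ones(2**j), toy_coeffs, 2**j)[:len(toy_sig)]
print(len(toy_coeffs), len(toy_held))  # 4 coefficients, held back out to 16 samples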
In [ ]:
# Scale feature vectors
feature_vectors_d = feature_vectors.drop(['Formation'], axis=1)
scaler = preprocessing.StandardScaler().fit(feature_vectors_d)
scaled_features = scaler.transform(feature_vectors_d)
scaled_features = pd.DataFrame(scaled_features, columns = feature_vectors_d.columns)
# One hot encode categoricals
scaled_features['NM_M'] = training_data['NM_M']
scaled_features['Formation'] = training_data['Formation']
scaled_features['NM_M'] = scaled_features['NM_M'].astype('category')
scaled_features = pd.get_dummies(scaled_features)
# NM_M and Formation are kept out of the scaled features, then added back and one-hot encoded
In [ ]:
predictors = scaled_features.columns
correct_facies_labels = training_data['Facies'].values
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(scaled_features, correct_facies_labels, test_size=0.2, random_state=0)
alg1 = RandomForestClassifier(random_state=1, n_estimators=315, min_samples_split=2, min_samples_leaf=1, max_features= None)
alg2 = GradientBoostingClassifier(random_state=1, n_estimators=500, max_depth=6)
alg3 = ExtraTreesClassifier(n_estimators=450, max_depth=None, min_samples_split=3, min_samples_leaf=1, random_state=1)
eclf = VotingClassifier(estimators=[('rf', alg1), ('gbc', alg2), ('etc', alg3)], voting='hard')
eclf.fit(X_train, y_train)
predicted_vote = eclf.predict(X_test)
print("prediction from tree ensemble:")
print(metrics.accuracy_score(list(y_test), predicted_vote))
print("f1 score:")
print(metrics.f1_score(list(y_test), predicted_vote, average = 'weighted'))
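In [ ]:
## Possible variant (a sketch, not the submitted model): soft voting averages
## the three classifiers' class probabilities instead of majority-voting
## hard labels; all three estimators expose predict_proba.
eclf_soft = VotingClassifier(estimators=[('rf', alg1), ('gbc', alg2), ('etc', alg3)],
                             voting='soft')
eclf_soft.fit(X_train, y_train)
print(metrics.accuracy_score(y_test, eclf_soft.predict(X_test)))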
In [ ]:
# Optional: fit the random forest alone to inspect feature importances.
# alg1.fit(X_train, y_train)
# pred_rf = alg1.predict(X_test)
# print("random forest accuracy:")
# print(metrics.accuracy_score(y_test, pred_rf))
# print("f1 score:")
# print(metrics.f1_score(y_test, pred_rf, average='weighted'))
# print("Features sorted by their score:")
# print(sorted(zip(map(lambda x: round(x, 4), alg1.feature_importances_), predictors),
#              reverse=True))
In [ ]:
f1_ens = []
wells = training_data["Well Name"].values
logo = LeaveOneGroupOut()
X = np.array(scaled_features)
y = np.array(correct_facies_labels)
# Leave-one-well-out cross-validation: hold out each well in turn
for train, test in logo.split(X, y, groups=wells):
    well_name = wells[test[0]]
    eclf.fit(X[train], y[train])
    pred_cv = eclf.predict(X[test])
    sc = metrics.f1_score(y[test], pred_cv, labels=np.arange(1, 10), average='micro')
    print("{:>20s} {:.3f}".format(well_name, sc))
    f1_ens.append(sc)
print("Average leave-one-well-out F1 score: {:.6f}".format(sum(f1_ens) / len(f1_ens)))
In [ ]:
facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS',
                 'WS', 'D', 'PS', 'BS']

def facies_confusion(result, y_true):
    conf = confusion_matrix(y_true, result)
    display_cm(conf, facies_labels, hide_zeros=True, display_metrics=True)
    return conf

def accuracy(conf):
    total_correct = 0.
    nb_classes = conf.shape[0]
    for i in np.arange(nb_classes):
        total_correct += conf[i][i]
    return total_correct / sum(sum(conf))

# Facies adjacent to each class (by index); predictions landing in an
# adjacent facies are counted as correct by accuracy_adjacent
adjacent_facies = [[1], [0, 2], [1], [4], [3, 5], [4, 6, 7], [5, 7], [5, 6, 8], [6, 7]]

def accuracy_adjacent(conf, adjacent_facies):
    nb_classes = conf.shape[0]
    total_correct = 0.
    for i in np.arange(nb_classes):
        total_correct += conf[i][i]
        for j in adjacent_facies[i]:
            total_correct += conf[i][j]
    return total_correct / sum(sum(conf))

conf = facies_confusion(predicted_vote, y_test)
print(accuracy(conf))
print(accuracy_adjacent(conf, adjacent_facies))
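In [ ]:
## Tiny worked example to make adjacent-facies accuracy concrete (a made-up
## 2-class confusion matrix where classes 0 and 1 count as adjacent).
toy_conf = np.array([[3, 1],
                     [2, 4]])
toy_adjacent = [[1], [0]]
print(accuracy(toy_conf))                         # (3 + 4) / 10 = 0.7
print(accuracy_adjacent(toy_conf, toy_adjacent))  # every prediction is adjacent-correct: 1.0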
In [ ]:
## Run on test data
# read in Test data
filename = 'validation_data_nofacies.csv'
test_data = pd.read_csv(filename)
# apply feature engineering to test_data
feature_vectors_test = test_data.drop(['Formation'], axis=1)
feature_vectors_test = feature_vectors_test[['Depth','GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE']]
df_diff_test = difference_vector(feature_vectors_test)
feature_vectors_test = diff_vec2frame(feature_vectors_test, df_diff_test)
# Drop the duplicated (symmetric) difference columns, then the zero 'diffGRGR' column
feature_vectors_test = feature_vectors_test.T.drop_duplicates().T
feature_vectors_test.drop('diffGRGR', axis=1, inplace=True)
feature_vectors_test['Formation'] = test_data['Formation']
feature_vectors_test = rank_formation(feature_vectors_test)
feature_vectors_test = DWT_facies(feature_vectors_test)
# Scale feature vectors (note: the scaler is refit on the test set here,
# mirroring the training-side preprocessing)
feature_vectors_test_d = feature_vectors_test.drop('Formation', axis=1)
scaler = preprocessing.StandardScaler().fit(feature_vectors_test_d)
scaled_features_test = scaler.transform(feature_vectors_test_d)
scaled_features_test = pd.DataFrame(scaled_features_test, columns = feature_vectors_test_d.columns)
# One hot encode categoricals
scaled_features_test['NM_M'] = test_data['NM_M']
scaled_features_test['Formation'] = test_data['Formation']
scaled_features_test['NM_M'] = scaled_features_test['NM_M'].astype('category')
scaled_features_test = pd.get_dummies(scaled_features_test)
In [ ]:
# Retrain the voting ensemble on the full training set, then predict the blind wells
eclf.fit(scaled_features, correct_facies_labels)
predicted_ensemble = eclf.predict(scaled_features_test)
In [ ]:
test_data['Facies'] = predicted_ensemble
test_data.to_csv('test_data_prediction_ver2_CE.csv')