The code and ideas in this notebook, by Matteo Niccoli and Mark Dahl, are licensed under a Creative Commons Attribution 4.0 International License.
In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
from scipy.stats import randint as sp_randint
from scipy.signal import argrelextrema
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import LeaveOneGroupOut, validation_curve
In [2]:
filename = 'SFS_top70_selected_engineered_features.csv'
training_data = pd.read_csv(filename)
training_data.describe()
Out[2]:
In [3]:
training_data['Well Name'] = training_data['Well Name'].astype('category')
training_data['Formation'] = training_data['Formation'].astype('category')
training_data['Well Name'].unique()
Out[3]:
Now we extract just the feature variables we need to perform the classification. The predictor variables are the five log values, the two geologic constraining variables, and depth. We also get a vector of the facies labels corresponding to each feature vector.
In [4]:
y = training_data['Facies'].values
print(y[25:40])
print(np.shape(y))
In [5]:
X = training_data.drop(['Formation', 'Well Name','Facies'], axis=1)
print(np.shape(X))
X.describe(percentiles=[.05, .25, .50, .75, .95])
Out[5]:
In [6]:
scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)
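As a quick sanity check (illustrative only, not part of the original workflow), the fitted scaler stores the per-feature statistics it uses, so the transform applied above is (x - mean_) / scale_ for every column:
# Per-feature mean and standard deviation learned by StandardScaler
print(scaler.mean_[:5])
print(scaler.scale_[:5])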
In [7]:
Fscorer = make_scorer(f1_score, average = 'micro')
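The scorer uses the micro-averaged F1, which pools true and false positives over all facies classes before computing precision and recall (for single-label, multi-class data this equals overall accuracy). A toy illustration of the averaging choice, not part of the workflow:
# Toy example: micro-averaging pools all classes; macro averages per-class F1 scores
y_true_toy = np.array([1, 1, 2, 3, 3, 3])
y_pred_toy = np.array([1, 2, 2, 3, 3, 1])
print(f1_score(y_true_toy, y_pred_toy, average='micro'))   # 4/6 correct = 0.666..., same as accuracy
print(f1_score(y_true_toy, y_pred_toy, average='macro'))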
In [8]:
wells = training_data["Well Name"].values
logo = LeaveOneGroupOut()
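With the well names as the groups, LeaveOneGroupOut produces one cross-validation fold per well: each fold trains on all the other wells and tests on the held-out well. A quick check (illustrative only):
# One split per unique well name in the training data
print(logo.get_n_splits(groups=wells))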
In a Random Forest classifier, several decision trees (often hundreds, a forest of trees) are created and trained on random subsets of the samples (drawn with replacement) and of the features (drawn without replacement); the decision trees then work together to make a more accurate classification (description from Randal Olson's excellent notebook).
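To make the bagging idea concrete, here is a minimal illustrative sketch (not part of the notebook's workflow) of the data a single tree in the forest effectively sees: a bootstrap sample of the rows and a random subset of the columns. The subset size of 21 is just an example value, matching the max_features setting used later.
# Illustrative sketch only: bootstrap the rows (with replacement) and pick a
# random subset of the columns (without replacement), as a single tree would.
rng = np.random.RandomState(49)
n_samples, n_features = X.shape
row_idx = rng.choice(n_samples, size=n_samples, replace=True)   # bootstrap sample of rows
col_idx = rng.choice(n_features, size=21, replace=False)        # random feature subset
X_tree, y_tree = X[row_idx][:, col_idx], y[row_idx]
# scikit-learn repeats this for every tree (feature subsetting is actually done
# at each split), then aggregates the per-tree votes into the final prediction.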
In [9]:
from sklearn.ensemble import RandomForestClassifier

param_name = "max_features"
#param_range = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
param_range = [9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51]

# Repeat the same leave-one-well-out validation curve for forests of
# increasing size (100 to 600 trees).
for n_estimators in [100, 200, 300, 400, 500, 600]:
    RF_clf = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1, random_state=49)

    plt.figure()
    plt.suptitle('n_estimators = %d' % n_estimators, fontsize=14, fontweight='bold')
    _, test_scores = validation_curve(RF_clf, X, y, cv=logo.split(X, y, groups=wells),
                                      param_name=param_name, param_range=param_range,
                                      scoring=Fscorer, n_jobs=-1)
    test_scores_mean = np.mean(test_scores, axis=1)
    plt.plot(param_range, test_scores_mean)
    plt.xlabel(param_name)
    plt.xlim(min(param_range), max(param_range))
    plt.ylabel("F1")
    plt.ylim(0.47, 0.57)
    plt.show()
    #print(max(test_scores_mean[argrelextrema(test_scores_mean, np.greater)]))
    # Best mean F1 across wells, and the max_features value that achieves it
    print(np.amax(test_scores_mean))
    print(np.array(param_range)[test_scores_mean.argmax(axis=0)])
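The settings used below, n_estimators = 600 and max_features = 21, are then evaluated with the same leave-one-well-out scheme, reporting the F1 score for each held-out well and the average across wells.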
In [10]:
RF_clf_f1 = RandomForestClassifier(n_estimators=600, max_features=21,
                                   n_jobs=-1, random_state=49)
f1_RF = []
# Leave-one-well-out evaluation: train on all wells but one, test on the held-out well.
for train, test in logo.split(X, y, groups=wells):
    well_name = wells[test[0]]
    RF_clf_f1.fit(X[train], y[train])
    pred = RF_clf_f1.predict(X[test])
    sc = f1_score(y[test], pred, labels=np.arange(10), average='micro')
    print("{:>20s} {:.3f}".format(well_name, sc))
    f1_RF.append(sc)
print("-Average leave-one-well-out F1 Score: %.6f" % (sum(f1_RF)/(1.0*len(f1_RF))))
In [11]:
RF_clf_b = RandomForestClassifier(n_estimators=600, max_features=21,
                                  n_jobs=-1, random_state=49)
In [16]:
blind = pd.read_csv('engineered_features_validation_set_top70.csv')
X_blind = np.array(blind.drop(['Formation', 'Well Name'], axis=1))
# Note: the blind data are standardized with their own scaler, fitted on the blind set.
scaler1 = preprocessing.StandardScaler().fit(X_blind)
X_blind = scaler1.transform(X_blind)
# Retrain on all training wells, then predict facies for the blind wells.
y_pred = RF_clf_b.fit(X, y).predict(X_blind)
#blind['Facies'] = y_pred
In [17]:
np.save('ypred_RF_SFS_VC.npy', y_pred)
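As a possible follow-up (sketch only: the column assignment mirrors the commented-out line above, and the output filename is hypothetical), the predictions could also be attached to the blind DataFrame and saved as a CSV for inspection:
# Hypothetical convenience step: store predictions alongside the blind well data
blind['Facies'] = y_pred
blind.to_csv('blind_wells_with_RF_SFS_VC_predictions.csv', index=False)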