In [15]:
###### Importing all used packages
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable
import seaborn as sns
from pandas import set_option
# set_option("display.max_rows", 10)
pd.options.mode.chained_assignment = None
##### import stuff from scikit learn
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score, LeavePGroupsOut, LeaveOneGroupOut, cross_val_predict
from sklearn.metrics import confusion_matrix, make_scorer, f1_score, accuracy_score, recall_score, precision_score
# Read the CSV into the working frame `df`, report its size, and preview
# the first rows as the cell output.
filename = '../facies_vectors.csv'
df = pd.read_csv(filepath_or_buffer=filename)
print(df.shape)
df.head(n=5)
Out[15]:
In [17]:
####### Split rows by whether PE is present
# Rows with a PE value form the regression training set (predictors, target,
# and the well each row belongs to); rows without one are the rows to predict.
pe_predictors = ['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'NM_M', 'RELPOS']
pe_known = df['PE'].notnull()

X_train = df.loc[pe_known, pe_predictors]
y_train = df.loc[pe_known, 'PE']
groups_train = df.loc[pe_known, 'Well Name']
X_fit = df.loc[~pe_known, pe_predictors]
In [12]:
# Cross-validate the random-forest regressor used to predict PE.
# A fixed random_state makes the forest (and therefore the imputed PE values
# computed from it in later cells) reproducible across runs.
Cl = RandomForestRegressor(n_estimators=100, random_state=42)

# Pass the splitter object plus `groups` instead of a pre-built split
# generator: a generator is exhausted after one use, the splitter is not.
# Every fold holds out two distinct 'Well Name' groups.
cv = LeavePGroupsOut(2)
validated = cross_val_score(Cl, X_train, y_train, groups=groups_train, cv=cv, n_jobs=-1)

# One score per fold; with no `scoring` argument the estimator's own
# `score` method is used.
print(validated)
In [18]:
# Train on every row that has PE, then predict PE for the rows that lack it.
# `fit` returns the fitted estimator, so the two steps chain.
y_predict = Cl.fit(X_train, y_train).predict(X_fit)
In [21]:
# Write the predicted PE values into the rows where PE is missing.
# A single .loc assignment targets `df` itself; the chained form
# df['PE'][mask] = ... may assign into a temporary object and leave `df`
# unchanged (and the warning for that is disabled at the top of the notebook).
df.loc[df['PE'].isnull(), 'PE'] = y_predict
In [23]:
# Build the derived-feature frames from the (PE-completed) well data using the
# project-local Feature_Engineering helpers. Each helper call yields one frame
# that is later concatenated column-wise with `training_data`.
training_data = df

###### Import packages needed for the make_vars functions
import Feature_Engineering as FE

# The same five logs feed every log-based helper; each call receives its own
# fresh list built from this tuple.
feature_logs = ('GR', 'ILD_log10', 'DeltaPHI', 'PE', 'PHIND')

##### Wavelet db1: detail (cD) and approximation (cA) coefficients
dwt_db1_cD_df = FE.make_dwt_vars_cD(
    wells_df=training_data,
    logs=list(feature_logs),
    levels=[1, 2, 3, 4],
    wavelet='db1',
)
dwt_db1_cA_df = FE.make_dwt_vars_cA(
    wells_df=training_data,
    logs=list(feature_logs),
    levels=[1, 2, 3, 4],
    wavelet='db1',
)

##### Wavelet db3: detail (cD) and approximation (cA) coefficients
dwt_db3_cD_df = FE.make_dwt_vars_cD(
    wells_df=training_data,
    logs=list(feature_logs),
    levels=[1, 2, 3, 4],
    wavelet='db3',
)
dwt_db3_cA_df = FE.make_dwt_vars_cA(
    wells_df=training_data,
    logs=list(feature_logs),
    levels=[1, 2, 3, 4],
    wavelet='db3',
)

##### Entropy
entropy_df = FE.make_entropy_vars(
    wells_df=training_data,
    logs=list(feature_logs),
    l_foots=[2, 3, 4, 5, 7, 10],
)

##### Gradient
gradient_df = FE.make_gradient_vars(
    wells_df=training_data,
    logs=list(feature_logs),
    dx_list=[2, 3, 4, 5, 6, 10, 20],
)

##### Rolling statistics: average, standard deviation, max, min
moving_av_df = FE.make_moving_av_vars(
    wells_df=training_data,
    logs=list(feature_logs),
    windows=[1, 2, 5, 10, 20],
)
moving_std_df = FE.make_moving_std_vars(
    wells_df=training_data,
    logs=list(feature_logs),
    windows=[3, 4, 5, 7, 10, 15, 20],
)
moving_max_df = FE.make_moving_max_vars(
    wells_df=training_data,
    logs=list(feature_logs),
    windows=[3, 4, 5, 7, 10, 15, 20],
)
moving_min_df = FE.make_moving_min_vars(
    wells_df=training_data,
    logs=list(feature_logs),
    windows=[3, 4, 5, 7, 10, 15, 20],
)

##### Rolling NM/M ratio
rolling_marine_ratio_df = FE.make_rolling_marine_ratio_vars(
    wells_df=training_data,
    windows=[5, 10, 15, 20, 30, 50, 75, 100, 200],
)

##### Distance to NM and M, up and down
dist_M_up_df = FE.make_distance_to_M_up_vars(wells_df=training_data)
dist_M_down_df = FE.make_distance_to_M_down_vars(wells_df=training_data)
dist_NM_up_df = FE.make_distance_to_NM_up_vars(wells_df=training_data)
dist_NM_down_df = FE.make_distance_to_NM_down_vars(wells_df=training_data)
In [24]:
# Assemble the modelling table: the well data followed by every engineered
# feature frame, aligned on the row index.
list_df_var = [dwt_db1_cD_df, dwt_db1_cA_df, dwt_db3_cD_df, dwt_db3_cA_df,
               entropy_df, gradient_df, moving_av_df, moving_std_df, moving_max_df, moving_min_df,
               rolling_marine_ratio_df, dist_M_up_df, dist_M_down_df, dist_NM_up_df, dist_NM_down_df]

# One column-wise concat instead of re-concatenating inside a loop, which
# copies the growing frame on every iteration.
combined_df = pd.concat([training_data] + list_df_var, axis=1)

# Fill missing values with the numeric sentinel -1. The string '-1' would
# mix text into otherwise numeric columns and change their dtype.
combined_df = combined_df.fillna(-1)

print(combined_df.shape)
combined_df.head(5)
Out[24]:
In [25]:
###### Predictor matrix, target vector, and CV grouping labels
# NOTE(review): assumes the first four columns of combined_df are not
# predictors (the CSV column order is not visible here) - confirm that the
# 'Facies' target is among the skipped columns.
n_leading_non_features = 4
X = combined_df.iloc[:, n_leading_non_features:]
y, groups = combined_df['Facies'], combined_df['Well Name']
In [26]:
######### Estimation of validation scores from this tuning:
# Score the tuned classifier with leave-two-groups-out cross-validation,
# once per metric, then compare the per-fold score distributions in a boxplot.
scoring_param = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']
Cl = RandomForestClassifier(
    n_estimators=100,
    max_features=0.1,
    min_samples_leaf=25,
    min_samples_split=50,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

# One array of fold scores per metric; the split generator is rebuilt for
# every metric because it can only be consumed once.
fold_scores_by_metric = [
    cross_val_score(Cl, X, y, scoring=metric,
                    cv=LeavePGroupsOut(2).split(X, y, groups), n_jobs=-1)
    for metric in scoring_param
]

# Rows are folds, columns are metrics.
scores = pd.DataFrame(data=np.column_stack(fold_scores_by_metric), columns=scoring_param)

ax = sns.boxplot(data=scores)
ax.set_xlabel('scoring parameters')
ax.set_ylabel('score')
ax.set_title('Classification scores for tuned parameters')
plt.show()
In [ ]: