NOTE: I was having issues with GridSearchCV on AWS, so I could not automatically optimize the hyper-parameters.
In [13]:
%matplotlib inline
import pickle
%run helper_functions.py
%run s3.py
%run show_cnf_mtx.py
pd.options.display.max_columns = 1000
plt.rcParams["figure.figsize"] = (15,10)
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.externals import joblib
import warnings
warnings.filterwarnings("ignore")
import plotly.plotly as py
import plotly.graph_objs as go
In [2]:
from IPython.display import Image
In [7]:
working_bucket = 'gabr-project-3'
In [10]:
df = unpickle_object("rf_df.pkl")
In [11]:
y = df['loan_status_Late'].values
df.drop('loan_status_Late', inplace=True, axis=1)
X = df.values
Below I created a simple for loop to test our random forest model with a range of different parameters.
A much cleaner implementation would be via GridSearchCV; however, due to some technical difficulties on AWS, I had to implement it this way!
Later in this notebook, I run a loop with the best model I could find. Intermediate testing and analysis have been omitted from this notebook for brevity.
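For reference, the manual loop below is roughly equivalent to the GridSearchCV call I would have run had it worked on AWS. This is only a sketch; the F1 scoring choice is my assumption.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# same parameter values as the manual grid below
param_grid = {
    "criterion": ["entropy", "gini"],
    "min_samples_split": [5, 6, 7, 8, 9, 10],
    "class_weight": ["balanced"],
}
grid = GridSearchCV(
    RandomForestClassifier(n_estimators=23, random_state=222, n_jobs=-1),
    param_grid=param_grid,
    scoring="f1",
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=222),
    n_jobs=-1,
)
# grid.fit(X, y)
# grid.best_params_ would then give the best hyper-parameter combination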
In [4]:
model_params = [
    {"criterion": criterion, "min_samples_split": split, "class_weight": "balanced"}
    for criterion in ("entropy", "gini")
    for split in (10, 9, 8, 7, 6, 5)
]
In [5]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=222)
In [16]:
def best_features(feature_importance_list):
    """Print the 35 most important features, with their descriptions where available."""
    all_features = []
    for index, importance_val in enumerate(feature_importance_list):
        all_features.append((index, importance_val))
    # sort by importance, descending, and keep the top 35
    all_features = sorted(all_features, key=lambda x: x[1], reverse=True)
    best_features = all_features[:35]
    for i in best_features:
        print(df.columns[i[0]])
        try:
            print(lookup_description(df.columns[i[0]]))
        except:
            print("DUMMY VARIABLE!")
        print()
In [15]:
target_names = ["PAID", "CHARGED OFF"]
In [8]:
for param_dict in model_params:
    model = RandomForestClassifier(n_estimators=23, n_jobs=-1,
                                   criterion=param_dict["criterion"],
                                   min_samples_split=param_dict["min_samples_split"],
                                   class_weight=param_dict["class_weight"],
                                   random_state=222)
    print(param_dict)
    print()
    y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
    print()
    print(classification_report(y, y_pred, target_names=target_names))
    print()
    cnf_matrix = confusion_matrix(y, y_pred)
    print("Accuracy Score: ", np.median(accuracy_score(y, y_pred)))
    print()
    print("F1 score - Binary: ", np.median(f1_score(y, y_pred, average='binary')))
    print()
    print("F1 score - Micro: ", np.median(f1_score(y, y_pred, average='micro')))
    print()
    print("F1 score - Weighted: ", np.median(f1_score(y, y_pred, average='weighted')))
    print()
    show_confusion_matrix(cnf_matrix, class_labels=target_names)
In [10]:
cv2 = StratifiedKFold(n_splits=5, shuffle=True, random_state=222)
Let's contrast our best model so far with no class weights.
In [11]:
model2 = RandomForestClassifier(n_estimators=23, n_jobs=-1, criterion='gini', min_samples_split=10, class_weight=None, random_state=222)
y_pred = cross_val_predict(model2, X, y, cv=cv2, n_jobs=-1)
print(classification_report(y, y_pred,target_names=target_names))
print()
cnf_matrix = confusion_matrix(y, y_pred)
print("Accuracy Score: ", np.median(accuracy_score(y, y_pred)))
print()
print("F1 score - Binary: ", np.median(f1_score(y, y_pred, average='binary')))
print()
print("F1 score - Micro: ", np.median(f1_score(y, y_pred, average='micro')))
print()
print("F1 score - Weighted: ", np.median(f1_score(y, y_pred, average='weighted')))
print()
show_confusion_matrix(cnf_matrix, class_labels=target_names)
From the above we can see that, with no class weights, we obtain a significantly worse binary F1 score!
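For context, class_weight='balanced' weights each class inversely to its frequency, so the rarer late/charged-off loans count more when growing the trees. A quick sketch of how those weights could be inspected with scikit-learn's compute_class_weight (not run in this notebook):
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' gives each class a weight of n_samples / (n_classes * count_of_class)
classes = np.unique(y)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y)
print(dict(zip(classes, weights)))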
What if we increase the min_samples_split parameter?
Upon doing some experimentation (omitted from this notebook), I found that the following random forest performed the best with regards to F1 score and accuracy.
I will now re-run this model with 100 estimators and 11-fold cross-validation to produce our final model! I will also pickle this model for later use!
In [17]:
final_cv = StratifiedKFold(n_splits=11, shuffle=True, random_state=222)
final_model = RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy', min_samples_split=40, class_weight='balanced', random_state=223)
y_pred = cross_val_predict(final_model, X, y, cv=final_cv, n_jobs=-1)
print(classification_report(y, y_pred,target_names=target_names))
print()
cnf_matrix = confusion_matrix(y, y_pred)
print("Accuracy Score: ", np.median(accuracy_score(y, y_pred)))
print()
print("F1 score - Binary: ", np.median(f1_score(y, y_pred, average='binary')))
print()
print("F1 score - Micro: ", np.median(f1_score(y, y_pred, average='micro')))
print()
print("F1 score - Weighted: ", np.median(f1_score(y, y_pred, average='weighted')))
print()
show_confusion_matrix(cnf_matrix, class_labels=target_names)
print()
fit_object = final_model.fit(X,y)
print()
best_features(fit_object.feature_importances_)
print()
joblib.dump(fit_object, 'rf_model_final.pkl');
proba = cross_val_predict(final_model, X, y, cv=final_cv, method='predict_proba')
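For later use, the dumped model can be reloaded with the same joblib interface. A minimal sketch; new_X is a hypothetical placeholder for data with exactly the same feature columns, in the same order, as X:
from sklearn.externals import joblib

rf_loaded = joblib.load('rf_model_final.pkl')
# predictions = rf_loaded.predict(new_X)
# probabilities = rf_loaded.predict_proba(new_X)[:, 1]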
In [3]:
proba = unpickle_object("proba_rf_model.pkl")
In [14]:
fpr = dict()
tpr = dict()
roc_auc = dict()
# compute an ROC curve and AUC per class (class 1 = Late/Charged Off)
for i in range(2):
    fpr[i], tpr[i], _ = roc_curve(y, proba[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
lw = 2
trace1 = go.Scatter(x=fpr[1], y=tpr[1],
                    mode='lines',
                    line=dict(color='darkorange', width=lw),
                    name='ROC curve (area = %0.2f)' % roc_auc[1]
                    )
trace2 = go.Scatter(x=[0, 1], y=[0, 1],
                    mode='lines',
                    line=dict(color='navy', width=lw, dash='dash'),
                    showlegend=False)
layout = go.Layout(title='Receiver operating characteristic',
                   xaxis=dict(title='False Positive Rate'),
                   yaxis=dict(title='True Positive Rate'))
fig = go.Figure(data=[trace1, trace2], layout=layout)
py.iplot(fig);
Out[14]:
In [3]:
Image("newplot.png") # static image, since the interactive Plotly chart doesn't render when the notebook is uploaded to GitHub
Out[3]:
In [19]:
# pickle_object(proba, "probabilities_array_rf_model_final")
In [20]:
# upload_to_bucket("rf_model_final.pkl", "rf_final_model", working_bucket)
In [21]:
# upload_to_bucket("probabilities_array_rf_model_final.pkl", "proba_rf_model", working_bucket)
In [15]:
# view_keys_in_bucket(working_bucket)