In [1]:
%matplotlib inline
In [2]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
from IPython.display import Image
from IPython.core.display import HTML
The default directory is the code subdirectory. Changing to the main repo directory below.
In [3]:
# os.chdir returns None; assigning it suppresses the cell's output
retval = os.chdir("..")
In [4]:
clean_data=pd.read_csv("./clean_data/modeling_data.csv")
clean_data.head()
Out[4]:
In [5]:
my_rand_state=0
In [6]:
from sklearn.model_selection import train_test_split
In [7]:
X = clean_data.iloc[:, :-1].values   # feature matrix as a NumPy array
y = clean_data.iloc[:, -1].tolist()  # target labels as a list
In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
random_state=my_rand_state)
In [9]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
In [10]:
ros = RandomOverSampler(random_state=my_rand_state)
smote = SMOTE(random_state=my_rand_state)
rus = RandomUnderSampler(random_state=my_rand_state)
tl = TomekLinks()  # Tomek link removal is deterministic, so no random_state is needed
In [11]:
from sklearn.feature_selection import VarianceThreshold
In [12]:
vt = VarianceThreshold()
threshold=[p*(1-p) for p in [0, 0.05, 0.1, 0.15]]
Note: since the variance of a binary variable is p*(1-p), where p is the proportion of times the variable equals 1, I use p to define the variance thresholds. The maximum variance, 0.25, occurs at p = 0.5.
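Spelled out, the candidate thresholds work out as follows (a threshold of 0 drops only zero-variance, i.e., constant, features):

for p in [0, 0.05, 0.1, 0.15]:
    print("p = %.2f -> variance threshold = %.4f" % (p, p * (1 - p)))
# p = 0.00 -> variance threshold = 0.0000
# p = 0.05 -> variance threshold = 0.0475
# p = 0.10 -> variance threshold = 0.0900
# p = 0.15 -> variance threshold = 0.1275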
In [13]:
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
Although tuning is not necessary for Naive Bayes, I still pass its default parameters to GridSearchCV so that I can make direct pairwise comparisons with the other models on the same cross-validation folds.
In the interest of time, I didn't use the SVM classifier.
I used a training set to tune the models' hyperparameters and a test set to evaluate the tuned models. With more time and data, I would use repeated nested cross-validation to create a more robust model tuning, selection, and performance assessment workflow.
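As a rough sketch of what that nested procedure could look like (hypothetical and not run here; the lone LogisticRegression and the small C grid are placeholders), the inner GridSearchCV handles tuning while the outer loop estimates generalization performance, and repeating the outer loop with different seeds gives repeated nested CV:

from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score

# Inner loop tunes hyperparameters; outer loop scores the tuned model
# on folds it never saw during tuning.
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=my_rand_state)
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=my_rand_state)
tuned_log = GridSearchCV(LogisticRegression(), param_grid={'C': [0.01, 1, 100]},
                         scoring='roc_auc', cv=inner_cv)
nested_scores = cross_val_score(tuned_log, X, y, scoring='roc_auc', cv=outer_cv)
print(nested_scores.mean(), nested_scores.std())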
In [14]:
nb_clf=GaussianNB()
priors=[None]
In [15]:
qda_clf=QuadraticDiscriminantAnalysis()
reg_param=[0.0, 0.25, 0.5, 0.75]
In [16]:
log_clf=LogisticRegression()
C=[0.001, 0.01, 10, 100, 1000]
In [17]:
knn_clf=KNeighborsClassifier(n_jobs=4)
n_neighbors=list(range(1,17,2))
weights=['uniform','distance']
In [18]:
rf_clf=RandomForestClassifier()
n_estimators=[100]
max_features=[.1,.3,.5]
In [19]:
class_weight=['balanced']
class_weight.extend([{1: w} for w in [1, 2, 10]])
In [20]:
from imblearn import pipeline #needed if mixing imblearn with sklearn classes
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
In [21]:
n_jobs=4
In [22]:
n_folds=10
skfold = StratifiedKFold(n_splits=n_folds, shuffle=False)  # random_state has no effect unless shuffle=True
In [23]:
nb_clf_b = pipeline.Pipeline(steps=[('vt',vt),('clf',nb_clf)])
nb_clf_est_b = GridSearchCV(estimator=nb_clf_b,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,clf__priors=priors))
In [24]:
nb_clf_ros = pipeline.Pipeline(steps=[('ros',ros),('vt',vt),
('clf',nb_clf)])
nb_clf_est_ros = GridSearchCV(estimator=nb_clf_ros,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,
clf__priors=priors))
In [25]:
nb_clf_smote = pipeline.Pipeline(steps=[('smote',smote),('vt',vt),
('clf',nb_clf)])
nb_clf_est_smote = GridSearchCV(estimator=nb_clf_smote,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,
clf__priors=priors))
In [26]:
nb_clf_rus = pipeline.Pipeline(steps=[('rus',rus),('vt',vt),
('clf',nb_clf)])
nb_clf_est_rus = GridSearchCV(estimator=nb_clf_rus,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,
clf__priors=priors))
In [27]:
nb_clf_tl = pipeline.Pipeline(steps=[('tl',tl),('vt',vt),
('clf',nb_clf)])
nb_clf_est_tl = GridSearchCV(estimator=nb_clf_tl,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,
clf__priors=priors))
In [28]:
qda_clf_b = pipeline.Pipeline(steps=[('vt',vt),('clf',qda_clf)])
qda_clf_est_b = GridSearchCV(estimator=qda_clf_b,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,clf__reg_param=reg_param))
In [29]:
qda_clf_ros = pipeline.Pipeline(steps=[('ros',ros),('vt',vt),
('clf',qda_clf)])
qda_clf_est_ros = GridSearchCV(estimator=qda_clf_ros,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,
clf__reg_param=reg_param))
In [30]:
qda_clf_smote = pipeline.Pipeline(steps=[('smote',smote),('vt',vt),
('clf',qda_clf)])
qda_clf_est_smote = GridSearchCV(estimator=qda_clf_smote,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,
clf__reg_param=reg_param))
In [31]:
qda_clf_rus = pipeline.Pipeline(steps=[('rus',rus),('vt',vt),
('clf',qda_clf)])
qda_clf_est_rus = GridSearchCV(estimator=qda_clf_rus,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,
clf__reg_param=reg_param))
In [32]:
qda_clf_tl = pipeline.Pipeline(steps=[('tl',tl),('vt',vt),
('clf',qda_clf)])
qda_clf_est_tl = GridSearchCV(estimator=qda_clf_tl,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,
clf__reg_param=reg_param))
In [33]:
log_clf_b = pipeline.Pipeline(steps=[('vt',vt),('clf',log_clf)])
log_clf_est_b = GridSearchCV(estimator=log_clf_b,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,clf__C=C,
clf__class_weight=class_weight))
In [34]:
log_clf_ros = pipeline.Pipeline(steps=[('ros',ros),('vt',vt),
('clf',log_clf)])
log_clf_est_ros = GridSearchCV(estimator=log_clf_ros,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,clf__C=C,
clf__class_weight=class_weight))
In [35]:
log_clf_smote = pipeline.Pipeline(steps=[('smote',smote),('vt',vt),
('clf',log_clf)])
log_clf_est_smote = GridSearchCV(estimator=log_clf_smote,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,clf__C=C,
clf__class_weight=class_weight))
In [36]:
log_clf_rus = pipeline.Pipeline(steps=[('rus',rus),('vt',vt),
('clf',log_clf)])
log_clf_est_rus = GridSearchCV(estimator=log_clf_rus,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,clf__C=C,
clf__class_weight=class_weight))
In [37]:
log_clf_tl = pipeline.Pipeline(steps=[('tl',tl),('vt',vt),
('clf',log_clf)])
log_clf_est_tl = GridSearchCV(estimator=log_clf_tl,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,clf__C=C,
clf__class_weight=class_weight))
In [38]:
knn_clf_b = pipeline.Pipeline(steps=[('vt',vt),('clf',knn_clf)])
knn_clf_est_b = GridSearchCV(estimator=knn_clf_b,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,
clf__n_neighbors=n_neighbors,
clf__weights=weights))
In [39]:
knn_clf_ros = pipeline.Pipeline(steps=[('ros',ros),('vt',vt),
('clf',knn_clf)])
knn_clf_est_ros = GridSearchCV(estimator=knn_clf_ros,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,
clf__n_neighbors=n_neighbors,
clf__weights=weights))
In [40]:
knn_clf_smote = pipeline.Pipeline(steps=[('smote',smote),('vt',vt),
('clf',knn_clf)])
knn_clf_est_smote = GridSearchCV(estimator=knn_clf_smote,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,
clf__n_neighbors=n_neighbors,
clf__weights=weights))
In [41]:
knn_clf_rus = pipeline.Pipeline(steps=[('rus',rus),('vt',vt),
('clf',knn_clf)])
knn_clf_est_rus = GridSearchCV(estimator=knn_clf_rus,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,
clf__n_neighbors=n_neighbors,
clf__weights=weights))
In [42]:
knn_clf_tl = pipeline.Pipeline(steps=[('tl',tl),('vt',vt),
('clf',knn_clf)])
knn_clf_est_tl = GridSearchCV(estimator=knn_clf_tl,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,
clf__n_neighbors=n_neighbors,
clf__weights=weights))
In [43]:
rf_clf_b = pipeline.Pipeline(steps=[('vt',vt),('clf',rf_clf)])
rf_clf_est_b = GridSearchCV(estimator=rf_clf_b,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,
clf__n_estimators=n_estimators,
clf__max_features=max_features,
clf__class_weight=class_weight))
In [44]:
rf_clf_ros = pipeline.Pipeline(steps=[('ros',ros),('vt',vt),
('clf',rf_clf)])
rf_clf_est_ros = GridSearchCV(estimator=rf_clf_ros,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,
clf__n_estimators=n_estimators,
clf__max_features=max_features,
clf__class_weight=class_weight))
In [45]:
rf_clf_smote = pipeline.Pipeline(steps=[('smote',smote),('vt',vt),
('clf',rf_clf)])
rf_clf_est_smote = GridSearchCV(estimator=rf_clf_smote,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,
clf__n_estimators=n_estimators,
clf__max_features=max_features,
clf__class_weight=class_weight))
In [46]:
rf_clf_rus = pipeline.Pipeline(steps=[('rus',rus),('vt',vt),
('clf',rf_clf)])
rf_clf_est_rus = GridSearchCV(estimator=rf_clf_rus,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,
clf__n_estimators=n_estimators,
clf__max_features=max_features,
clf__class_weight=class_weight))
In [47]:
rf_clf_tl = pipeline.Pipeline(steps=[('tl',tl),('vt',vt),
('clf',rf_clf)])
rf_clf_est_tl = GridSearchCV(estimator=rf_clf_tl,cv=skfold,
scoring='roc_auc',n_jobs=n_jobs,
param_grid=dict(vt__threshold=threshold,
clf__n_estimators=n_estimators,
clf__max_features=max_features,
clf__class_weight=class_weight))
In [48]:
import joblib  # for persisting the fitted grid-search objects to disk
In [49]:
nb_clf_est_b.fit(X_train,y_train)
Out[49]:
In [50]:
joblib.dump(nb_clf_est_b, './other_output/nb_clf_est_b.pkl')
Out[50]:
In [51]:
nb_clf_est_ros.fit(X_train,y_train)
Out[51]:
In [52]:
joblib.dump(nb_clf_est_ros, './other_output/nb_clf_est_ros.pkl')
Out[52]:
In [53]:
nb_clf_est_smote.fit(X_train,y_train)
Out[53]:
In [54]:
joblib.dump(nb_clf_est_smote, './other_output/nb_clf_est_smote.pkl')
Out[54]:
In [55]:
nb_clf_est_rus.fit(X_train,y_train)
Out[55]:
In [56]:
joblib.dump(nb_clf_est_rus, './other_output/nb_clf_est_rus.pkl')
Out[56]:
In [57]:
nb_clf_est_tl.fit(X_train,y_train)
Out[57]:
In [58]:
joblib.dump(nb_clf_est_tl, './other_output/nb_clf_est_tl.pkl')
Out[58]:
In [ ]:
qda_clf_est_b.fit(X_train,y_train)
In [60]:
joblib.dump(qda_clf_est_b, './other_output/qda_clf_est_b.pkl')
Out[60]:
In [ ]:
qda_clf_est_ros.fit(X_train,y_train)
In [62]:
joblib.dump(qda_clf_est_ros, './other_output/qda_clf_est_ros.pkl')
Out[62]:
In [ ]:
qda_clf_est_smote.fit(X_train,y_train)
In [64]:
joblib.dump(qda_clf_est_smote, './other_output/qda_clf_est_smote.pkl')
Out[64]:
In [ ]:
qda_clf_est_rus.fit(X_train,y_train)
In [66]:
joblib.dump(qda_clf_est_rus, './other_output/qda_clf_est_rus.pkl')
Out[66]:
In [ ]:
qda_clf_est_tl.fit(X_train,y_train)
In [68]:
joblib.dump(qda_clf_est_tl, './other_output/qda_clf_est_tl.pkl')
Out[68]:
In [69]:
log_clf_est_b.fit(X_train,y_train)
Out[69]:
In [70]:
joblib.dump(log_clf_est_b, './other_output/log_clf_est_b.pkl')
Out[70]:
In [71]:
log_clf_est_ros.fit(X_train,y_train)
Out[71]:
In [72]:
joblib.dump(log_clf_est_ros, './other_output/log_clf_est_ros.pkl')
Out[72]:
In [73]:
log_clf_est_smote.fit(X_train,y_train)
Out[73]:
In [74]:
joblib.dump(log_clf_est_smote, './other_output/log_clf_est_smote.pkl')
Out[74]:
In [75]:
log_clf_est_rus.fit(X_train,y_train)
Out[75]:
In [76]:
joblib.dump(log_clf_est_rus, './other_output/log_clf_est_rus.pkl')
Out[76]:
In [77]:
log_clf_est_tl.fit(X_train,y_train)
Out[77]:
In [78]:
joblib.dump(log_clf_est_tl, './other_output/log_clf_est_tl.pkl')
Out[78]:
In [79]:
knn_clf_est_b.fit(X_train,y_train)
Out[79]:
In [80]:
joblib.dump(knn_clf_est_b, './other_output/knn_clf_est_b.pkl')
Out[80]:
In [81]:
knn_clf_est_ros.fit(X_train,y_train)
Out[81]:
In [82]:
joblib.dump(knn_clf_est_ros, './other_output/knn_clf_est_ros.pkl')
Out[82]:
In [83]:
knn_clf_est_smote.fit(X_train,y_train)
Out[83]:
In [84]:
joblib.dump(knn_clf_est_smote, './other_output/knn_clf_est_smote.pkl')
Out[84]:
In [85]:
knn_clf_est_rus.fit(X_train,y_train)
Out[85]:
In [86]:
joblib.dump(knn_clf_est_rus, './other_output/knn_clf_est_rus.pkl')
Out[86]:
In [87]:
knn_clf_est_tl.fit(X_train,y_train)
Out[87]:
In [88]:
joblib.dump(knn_clf_est_tl, './other_output/knn_clf_est_tl.pkl')
Out[88]:
In [89]:
rf_clf_est_b.fit(X_train,y_train)
Out[89]:
In [90]:
joblib.dump(rf_clf_est_b, './other_output/rf_clf_est_b.pkl')
Out[90]:
In [91]:
rf_clf_est_ros.fit(X_train,y_train)
Out[91]:
In [92]:
joblib.dump(rf_clf_est_ros, './other_output/rf_clf_est_ros.pkl')
Out[92]:
In [93]:
rf_clf_est_smote.fit(X_train,y_train)
Out[93]:
In [94]:
joblib.dump(rf_clf_est_smote, './other_output/rf_clf_est_smote.pkl')
Out[94]:
In [95]:
rf_clf_est_rus.fit(X_train,y_train)
Out[95]:
In [96]:
joblib.dump(rf_clf_est_rus, './other_output/rf_clf_est_rus.pkl')
Out[96]:
In [97]:
rf_clf_est_tl.fit(X_train,y_train)
Out[97]:
In [98]:
joblib.dump(rf_clf_est_tl, './other_output/rf_clf_est_tl.pkl')
Out[98]:
Below I show the ROC curves for the models on the test data.
In [99]:
from sklearn.metrics import roc_curve, auc
In [100]:
nb_fpr, nb_tpr, _ = roc_curve(y_test,
nb_clf_est_b.predict_proba(X_test)[:,1])
nb_roc_auc = auc(nb_fpr, nb_tpr)
qda_fpr, qda_tpr, _ = roc_curve(y_test,
qda_clf_est_b.predict_proba(X_test)[:,1])
qda_roc_auc = auc(qda_fpr, qda_tpr)
log_fpr, log_tpr, _ = roc_curve(y_test,
log_clf_est_b.predict_proba(X_test)[:,1])
log_roc_auc = auc(log_fpr, log_tpr)
knn_fpr, knn_tpr, _ = roc_curve(y_test,
knn_clf_est_b.predict_proba(X_test)[:,1])
knn_roc_auc = auc(knn_fpr, knn_tpr)
rf_fpr, rf_tpr, _ = roc_curve(y_test,
rf_clf_est_b.predict_proba(X_test)[:,1])
rf_roc_auc = auc(rf_fpr, rf_tpr)
In [101]:
plt.plot(nb_fpr, nb_tpr, color='cyan', linestyle='--',
label='NB (area = %0.2f)' % nb_roc_auc, lw=2)
plt.plot(qda_fpr, qda_tpr, color='indigo', linestyle='--',
label='QDA (area = %0.2f)' % qda_roc_auc, lw=2)
plt.plot(log_fpr, log_tpr, color='seagreen', linestyle='--',
label='LOG (area = %0.2f)' % log_roc_auc, lw=2)
plt.plot(knn_fpr, knn_tpr, color='yellow', linestyle='--',
label='KNN (area = %0.2f)' % knn_roc_auc, lw=2)
plt.plot(rf_fpr, rf_tpr, color='blue', linestyle='--',
label='RF (area = %0.2f)' % rf_roc_auc, lw=2)
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
label='Luck')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves of Basic Models')
plt.legend(loc="lower right")
plt.savefig('./plots/ROC_Basic.png', bbox_inches='tight')
plt.show()
In [102]:
nb_fpr, nb_tpr, _ = roc_curve(y_test,
nb_clf_est_ros.predict_proba(X_test)[:,1])
nb_roc_auc = auc(nb_fpr, nb_tpr)
qda_fpr, qda_tpr, _ = roc_curve(y_test,
qda_clf_est_ros.predict_proba(X_test)[:,1])
qda_roc_auc = auc(qda_fpr, qda_tpr)
log_fpr, log_tpr, _ = roc_curve(y_test,
log_clf_est_ros.predict_proba(X_test)[:,1])
log_roc_auc = auc(log_fpr, log_tpr)
knn_fpr, knn_tpr, _ = roc_curve(y_test,
knn_clf_est_ros.predict_proba(X_test)[:,1])
knn_roc_auc = auc(knn_fpr, knn_tpr)
rf_fpr, rf_tpr, _ = roc_curve(y_test,
rf_clf_est_ros.predict_proba(X_test)[:,1])
rf_roc_auc = auc(rf_fpr, rf_tpr)
In [103]:
plt.plot(nb_fpr, nb_tpr, color='cyan', linestyle='--',
label='NB (area = %0.2f)' % nb_roc_auc, lw=2)
plt.plot(qda_fpr, qda_tpr, color='indigo', linestyle='--',
label='QDA (area = %0.2f)' % qda_roc_auc, lw=2)
plt.plot(log_fpr, log_tpr, color='seagreen', linestyle='--',
label='LOG (area = %0.2f)' % log_roc_auc, lw=2)
plt.plot(knn_fpr, knn_tpr, color='yellow', linestyle='--',
label='KNN (area = %0.2f)' % knn_roc_auc, lw=2)
plt.plot(rf_fpr, rf_tpr, color='blue', linestyle='--',
label='RF (area = %0.2f)' % rf_roc_auc, lw=2)
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
label='Luck')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves of Models with Oversampling')
plt.legend(loc="lower right")
plt.savefig('./plots/ROC_ROS.png', bbox_inches='tight')
plt.show()
Interestingly, only the simpler classifiers (Naive Bayes and QDA) improve in predictive performance with oversampling.
In [104]:
nb_fpr, nb_tpr, _ = roc_curve(y_test,
nb_clf_est_smote.predict_proba(X_test)[:,1])
nb_roc_auc = auc(nb_fpr, nb_tpr)
qda_fpr, qda_tpr, _ = roc_curve(y_test,
qda_clf_est_smote.predict_proba(X_test)[:,1])
qda_roc_auc = auc(qda_fpr, qda_tpr)
log_fpr, log_tpr, _ = roc_curve(y_test,
log_clf_est_smote.predict_proba(X_test)[:,1])
log_roc_auc = auc(log_fpr, log_tpr)
knn_fpr, knn_tpr, _ = roc_curve(y_test,
knn_clf_est_smote.predict_proba(X_test)[:,1])
knn_roc_auc = auc(knn_fpr, knn_tpr)
rf_fpr, rf_tpr, _ = roc_curve(y_test,
rf_clf_est_smote.predict_proba(X_test)[:,1])
rf_roc_auc = auc(rf_fpr, rf_tpr)
In [105]:
plt.plot(nb_fpr, nb_tpr, color='cyan', linestyle='--',
label='NB (area = %0.2f)' % nb_roc_auc, lw=2)
plt.plot(qda_fpr, qda_tpr, color='indigo', linestyle='--',
label='QDA (area = %0.2f)' % qda_roc_auc, lw=2)
plt.plot(log_fpr, log_tpr, color='seagreen', linestyle='--',
label='LOG (area = %0.2f)' % log_roc_auc, lw=2)
plt.plot(knn_fpr, knn_tpr, color='yellow', linestyle='--',
label='KNN (area = %0.2f)' % knn_roc_auc, lw=2)
plt.plot(rf_fpr, rf_tpr, color='blue', linestyle='--',
label='RF (area = %0.2f)' % rf_roc_auc, lw=2)
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
label='Luck')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves of Models with SMOTE')
plt.legend(loc="lower right")
plt.savefig('./plots/ROC_SMOTE.png', bbox_inches='tight')
plt.show()
Again, class imbalance corrections only benefit Naive Bayes and QDA.
In [106]:
nb_fpr, nb_tpr, _ = roc_curve(y_test,
nb_clf_est_rus.predict_proba(X_test)[:,1])
nb_roc_auc = auc(nb_fpr, nb_tpr)
log_fpr, log_tpr, _ = roc_curve(y_test,
log_clf_est_rus.predict_proba(X_test)[:,1])
log_roc_auc = auc(log_fpr, log_tpr)
knn_fpr, knn_tpr, _ = roc_curve(y_test,
knn_clf_est_rus.predict_proba(X_test)[:,1])
knn_roc_auc = auc(knn_fpr, knn_tpr)
rf_fpr, rf_tpr, _ = roc_curve(y_test,
rf_clf_est_rus.predict_proba(X_test)[:,1])
rf_roc_auc = auc(rf_fpr, rf_tpr)
In [107]:
plt.plot(nb_fpr, nb_tpr, color='cyan', linestyle='--',
label='NB (area = %0.2f)' % nb_roc_auc, lw=2)
plt.plot(log_fpr, log_tpr, color='seagreen', linestyle='--',
label='LOG (area = %0.2f)' % log_roc_auc, lw=2)
plt.plot(knn_fpr, knn_tpr, color='yellow', linestyle='--',
label='KNN (area = %0.2f)' % knn_roc_auc, lw=2)
plt.plot(rf_fpr, rf_tpr, color='blue', linestyle='--',
label='RF (area = %0.2f)' % rf_roc_auc, lw=2)
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
label='Luck')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves of Models with Undersampling')
plt.legend(loc="lower right")
plt.savefig('./plots/ROC_RUS.png', bbox_inches='tight')
plt.show()
Training QDA with undersampling resulted in errors, so QDA is omitted from this plot.
In [108]:
nb_fpr, nb_tpr, _ = roc_curve(y_test,
nb_clf_est_tl.predict_proba(X_test)[:,1])
nb_roc_auc = auc(nb_fpr, nb_tpr)
qda_fpr, qda_tpr, _ = roc_curve(y_test,
qda_clf_est_tl.predict_proba(X_test)[:,1])
qda_roc_auc = auc(qda_fpr, qda_tpr)
log_fpr, log_tpr, _ = roc_curve(y_test,
log_clf_est_tl.predict_proba(X_test)[:,1])
log_roc_auc = auc(log_fpr, log_tpr)
knn_fpr, knn_tpr, _ = roc_curve(y_test,
knn_clf_est_tl.predict_proba(X_test)[:,1])
knn_roc_auc = auc(knn_fpr, knn_tpr)
rf_fpr, rf_tpr, _ = roc_curve(y_test,
rf_clf_est_tl.predict_proba(X_test)[:,1])
rf_roc_auc = auc(rf_fpr, rf_tpr)
In [109]:
plt.plot(nb_fpr, nb_tpr, color='cyan', linestyle='--',
label='NB (area = %0.2f)' % nb_roc_auc, lw=2)
plt.plot(qda_fpr, qda_tpr, color='indigo', linestyle='--',
label='QDA (area = %0.2f)' % qda_roc_auc, lw=2)
plt.plot(log_fpr, log_tpr, color='seagreen', linestyle='--',
label='LOG (area = %0.2f)' % log_roc_auc, lw=2)
plt.plot(knn_fpr, knn_tpr, color='yellow', linestyle='--',
label='KNN (area = %0.2f)' % knn_roc_auc, lw=2)
plt.plot(rf_fpr, rf_tpr, color='blue', linestyle='--',
label='RF (area = %0.2f)' % rf_roc_auc, lw=2)
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
label='Luck')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves of Models with Tomek Link Removal')
plt.legend(loc="lower right")
plt.savefig('./plots/ROC_tl.png', bbox_inches='tight')
plt.show()
In [110]:
nb_fpr, nb_tpr, _ = roc_curve(y_test,
nb_clf_est_ros.predict_proba(X_test)[:,1])
nb_roc_auc = auc(nb_fpr, nb_tpr)
qda_fpr, qda_tpr, _ = roc_curve(y_test,
qda_clf_est_ros.predict_proba(X_test)[:,1])
qda_roc_auc = auc(qda_fpr, qda_tpr)
log_fpr, log_tpr, _ = roc_curve(y_test,
log_clf_est_b.predict_proba(X_test)[:,1])
log_roc_auc = auc(log_fpr, log_tpr)
knn_fpr, knn_tpr, _ = roc_curve(y_test,
knn_clf_est_rus.predict_proba(X_test)[:,1])
knn_roc_auc = auc(knn_fpr, knn_tpr)
rf_fpr, rf_tpr, _ = roc_curve(y_test,
rf_clf_est_b.predict_proba(X_test)[:,1])
rf_roc_auc = auc(rf_fpr, rf_tpr)
In [111]:
plt.plot(nb_fpr, nb_tpr, color='cyan', linestyle='--',
label='NB (area = %0.2f)' % nb_roc_auc, lw=2)
plt.plot(qda_fpr, qda_tpr, color='indigo', linestyle='--',
label='QDA (area = %0.2f)' % qda_roc_auc, lw=2)
plt.plot(log_fpr, log_tpr, color='seagreen', linestyle='--',
label='LOG (area = %0.2f)' % log_roc_auc, lw=2)
plt.plot(knn_fpr, knn_tpr, color='yellow', linestyle='--',
label='KNN (area = %0.2f)' % knn_roc_auc, lw=2)
plt.plot(rf_fpr, rf_tpr, color='blue', linestyle='--',
label='RF (area = %0.2f)' % rf_roc_auc, lw=2)
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
label='Luck')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves of Best Iteration of Each Model')
plt.legend(loc="lower right")
plt.savefig('./plots/ROC_Best.png', bbox_inches='tight')
plt.show()
While the best model performs extremely well, it is important to be aware of model characteristics like the variability of prediction quality, a key indicator of model reliability, and the most important features, which should inform data maintenance and feature engineering practices as well as model interpretation.
Normally I would use bootstrapped samples of the test data, with the model fitted on the whole training set, to obtain an empirical distribution of the model's performance (AUC ROC in this case). With limited data and time, however, I will instead use the AUC ROC on the validation folds of the CV grid search to get a sense of the variability. Validation-fold AUC ROC values are usually biased towards optimism relative to the true out-of-sample performance (on the test set), but that turns out not to be the case in the example below.
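For reference, a minimal sketch of that bootstrap procedure, assuming the fitted log_clf_est_b model and 1,000 resamples of the test set (illustrative only; not run in this notebook):

from sklearn.metrics import roc_auc_score
from sklearn.utils import resample

# Resample the test set with replacement and recompute the AUC ROC each time
# to build an empirical distribution of out-of-sample performance.
boot_aucs = []
for i in range(1000):
    X_bs, y_bs = resample(X_test, y_test, random_state=i)
    if len(set(y_bs)) < 2:  # AUC is undefined when a resample has only one class
        continue
    probs = log_clf_est_b.predict_proba(X_bs)[:, 1]
    boot_aucs.append(roc_auc_score(y_bs, probs))
print(np.percentile(boot_aucs, [2.5, 50, 97.5]))  # median and 95% interval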
In [112]:
log_cv_results=pd.DataFrame(log_clf_est_b.cv_results_)
log_cv_results.head()
Out[112]:
Isolate the best parameters
In [113]:
log_cv_results=log_cv_results[(log_cv_results.rank_test_score==1)]
log_cv_results.head()
Out[113]:
In [117]:
keep_cols=["split%d_test_score" % i for i in range(n_folds)]
log_cv_results=log_cv_results[keep_cols]
log_cv_results.head()
temp=log_cv_results.T.reset_index()
temp.columns=['Fold','Validation AUC ROC']
temp['Fold']=range(1,11)
temp
Out[117]:
In [118]:
temp['Validation AUC ROC'].describe()
Out[118]:
Fortunately, the performance is stable. With more data and time, I would use repeated cross-validation or repeated nested cross-validation to get more robust estimates of the out-of-sample error and its variability.
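A minimal sketch of the repeated variant, reusing the tuned logistic pipeline from above (hypothetical; not run in this notebook):

from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

# Each repeat reshuffles the fold assignments, yielding
# n_splits * n_repeats = 50 validation AUC ROC estimates.
rskfold = RepeatedStratifiedKFold(n_splits=10, n_repeats=5,
                                  random_state=my_rand_state)
rep_scores = cross_val_score(log_clf_b.set_params(**log_clf_est_b.best_params_),
                             X_train, y_train, scoring='roc_auc', cv=rskfold)
print(rep_scores.mean(), rep_scores.std())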
Unfortunately, even when the features are standardized, coefficient magnitude is not necessarily the right way to determine variable importance in a logistic regression. Fortunately, the random forest classifier performs about as well as the logistic regression, so I will use it to identify important features.
In [119]:
rf_clf_b.set_params(**rf_clf_est_b.best_params_).fit(X_train,y_train)
Out[119]:
In [120]:
importance=rf_clf_b.named_steps['clf'].feature_importances_
indices = np.argsort(importance)[::-1]
In [121]:
feature_importance=pd.DataFrame({'feature':clean_data.columns[:-1][indices],
'importance':importance})
In [122]:
feature_importance.sort_values(by='importance',inplace=True,ascending=False)
feature_importance[:10]
Out[122]:
It is not surprising that the two most important features are ad attributes. This also adds confidence in the model by showing that the most important features make intuitive sense, although robust models can have seemingly non-intuitive important features.
Many of the other top-ten features, like urlstatic.wired.com and ancurlwww.amazon.com, also make sense because they are likely links to URLs of the company that owns the ad.
I would not be surprised if adding random forest as a feature selection step in the pipeline brought the AUC ROC of nearly all of the classifiers to 0.99.
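A sketch of what that selection step could look like, using sklearn's SelectFromModel ahead of the logistic classifier (hypothetical; the n_estimators value is a placeholder and the pipeline was not run):

from sklearn.feature_selection import SelectFromModel

# Keep only the features whose random-forest importance exceeds the default
# (mean-importance) threshold, then fit the logistic classifier on the rest.
rf_select = SelectFromModel(RandomForestClassifier(n_estimators=100,
                                                   random_state=my_rand_state))
log_clf_fs = pipeline.Pipeline(steps=[('vt', vt), ('fs', rf_select),
                                      ('clf', log_clf)])
log_clf_fs.fit(X_train, y_train)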
The best model is clearly the logistic classifier without any sampling-based class imbalance correction. While the random forest matches its performance, it is a much more complex and computationally expensive model, so in practice the logistic classifier would be the better choice.
It is surprising that the class imbalance corrections had so little impact on the more complex classifiers. In other classification tasks, such corrections, especially Tomek link removal, have significantly improved the AUC ROC of more complex classifiers; however, every classification task is different.
If the AUC ROC were not already 0.99, I would try feature selection via random forest; ensemble methods beyond random forest, like bagging classifiers, adaptive boosting, or even extreme gradient boosting; or more aggressive feature engineering and hyperparameter tuning. At an AUC ROC of 0.99, however, that would not be a good investment of time.
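For illustration, a boosted alternative would slot into the same pipeline and grid-search pattern used above (hypothetical; the parameter grid is a placeholder and the model was not run):

from sklearn.ensemble import AdaBoostClassifier

# AdaBoost dropped into the same variance-threshold + grid-search scaffold.
ada_clf = AdaBoostClassifier(random_state=my_rand_state)
ada_clf_b = pipeline.Pipeline(steps=[('vt', vt), ('clf', ada_clf)])
ada_clf_est_b = GridSearchCV(estimator=ada_clf_b, cv=skfold,
                             scoring='roc_auc', n_jobs=n_jobs,
                             param_grid=dict(vt__threshold=threshold,
                                             clf__n_estimators=[50, 100, 200],
                                             clf__learning_rate=[0.1, 0.5, 1.0]))
ada_clf_est_b.fit(X_train, y_train)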
In [123]:
best_paras = log_clf_est_b.best_params_
best_paras
Out[123]:
In [124]:
with open('./model_para/logistic_best_paras.json', 'w') as outfile:
json.dump(best_paras, outfile)