In [2]:
import pandas as pd
import numpy as np
import pylab as pl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import pipeline.util as u
import pipeline.process as pr
import pipeline.read as r
import pipeline.explore as ex
import pipeline.evaluate as ev
%matplotlib inline
While I was working on improving my previous homework and deciding on a method for feature selection, I ran the small grid search from Magic Loops with the specifications Rayid selected. This ran overnight, but unfortunately I started it before I thought to add a timer; from observation, gradient boosting and AdaBoost appeared to be among the slower models. The results informed the smaller loop I ran on the features selected by Random Forest, which were nearly identical to Rayid's feature set except that I included monthly_income.
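For reference, a minimal sketch of the Random Forest feature-selection step. The DataFrame name df and the label column serious_dlqin2yrs are assumptions for illustration, not the pipeline's actual names:

from sklearn.ensemble import RandomForestClassifier

# assumed: df is the processed credit DataFrame; serious_dlqin2yrs is the label
features = [c for c in df.columns if c != "serious_dlqin2yrs"]
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(df[features], df["serious_dlqin2yrs"])

# rank features by importance; the top-ranked set was Rayid's plus monthly_income
importances = pd.Series(rf.feature_importances_, index=features).sort_values(ascending=False)
importances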
In [11]:
first_grid = r.read_csv("small_loop_result.csv")
fg = first_grid.sort_values(by="auc-roc")
fg.head(10)  # bottom 10 specifications by AUC-ROC (ascending sort)
Out[11]:
In [17]:
# top 10 specifications by AUC-ROC
fg.tail(10).sort_values(by="auc-roc", ascending=False)
Out[17]:
Ensemble methods performed the best overall, but with wide variance. Hyperparameter tuning made a big difference in model fit according to AUC-ROC. Of the 213 specifications that ran, the Gradient Boosting classifier produced four of the top 10 and four of the bottom 10 models. Gradient Boosting split on max depth: the top performers used a max depth of 5, while the bottom performers used a max depth of 50.
Random Forests performed very strongly and had much lower variance in their results; AUC-ROC ranged from 72 to 83 percent (compare with Gradient Boosting's range of roughly 48 to 83). Random Forests also split on max depth, with shorter trees outperforming deeper ones.
Simpler methods such as Decision Trees and KNN almost matched the ensemble methods under certain specifications. Decision Tree specifications with depth-1 trees captured 65 to 72 percent of the area under the curve, which gives us a baseline for comparison. These models also beat trees with max depths of 20, 50, and 100, which suggests that, with a limited number of features, depths four or five times greater than the length of the feature set will overfit. Linear regression performed below the 1-deep trees, with AUC-ROC ranging from 63 to 65 percent over 10 specifications, and Naive Bayes hit just above that range. SVM and SGD did not run directly from Rayid's setup.
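To illustrate the kind of hyperparameter sweep behind these results, the max-depth split looks roughly like the dictionary below. The exact values are an assumption for illustration, not Rayid's actual grid:

# illustrative grid only; the real small loop covers more models and parameters
grid = {
    'GB': {'n_estimators': [10, 100], 'learning_rate': [0.1, 0.5], 'max_depth': [5, 50]},
    'RF': {'n_estimators': [10, 100], 'max_depth': [5, 50]},
    'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1, 5, 20, 50, 100]},
    'KNN': {'n_neighbors': [5, 10, 25], 'weights': ['uniform', 'distance']},
}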
In [22]:
second_grid = r.read_csv("refined_results.csv")
sg = second_grid.sort_values(by="auc-roc", ascending=False)
sg.head(10)  # top 10 specifications from the refined loop; speed and the extra feature are discussed below
Out[22]:
Adding monthly_income increased the AUC-ROC scores for the top-performing models by roughly 6 points. The top Gradient Boosting model had an AUC-ROC of .88, the top Random Forest .87, and the top Decision Tree .86. Linear regression and Naive Bayes did about the same as in the first pass, hovering around .66; adding information did not improve their results much.
Time is a major factor in grid search, and the top GB models take significantly longer than the other models. Dividing the number of estimators by 10 yields a similar speed gain, though, so the algorithm appears to be linear in n_estimators while gaining less than 1 percent in AUC-ROC. Considering that gain relative to our baseline of about 72 percent, the improvement is bigger than it looks. n_estimators had a similar impact on RF, though the performance gain was smaller.
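The timer I wish I had added is a one-line wrapper around the fit call, something like the sketch below (clf, x_train, and y_train stand for whatever the loop defines at that point):

import time

start = time.time()
clf.fit(x_train, y_train)          # clf is the current specification, e.g. a GB or RF model
train_time = time.time() - start   # seconds; record alongside auc-roc in the results csv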
I included an RF with max_depth of 50 in this batch as well. It takes roughly twice as long as a similarly specified 5-deep RF and produces a lower AUC-ROC. However, if we shift our focus to finding delinquents, it performed the best, finding 768 of 2,547 (~30 percent). That seems low; looking at the confusion matrices in the Jupyter notebook, there are high levels of false negatives, which makes sense given that there are many more non-delinquents. The best model by AUC-ROC found 735 of 2,547 delinquents (~29 percent).
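The delinquent counts above are read off the confusion matrix; a sketch of that calculation, where y_test and y_pred are assumed to come from the loop's train/test split:

# rows are true classes, columns are predicted classes
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("delinquents found: {} of {} ({:.0%})".format(tp, tp + fn, float(tp) / (tp + fn)))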
My recommendations for future analysis would depend on the purpose and usage of the model. My analysis generally relied on AUC-ROC, which measures the model's ability to identify true positives without selecting false positives and is a good gauge of general reliability. These models do well by minimizing false positives; the benefit here would be minimizing unnecessary contact with people who are not going to be delinquent. Depending on the severity of the policy intervention, that may be desirable, as being labeled a potential delinquent can have serious negative consequences for a person, from higher finance rates to stigma. If the goal is to use resources effectively while minimizing unnecessary contact, the analyst might focus on tweaking RF and GB; since RF is fairly robust to feature selection, adding additional variables could boost the score, and max depth should stay around 5 in both models. However, if the goal is to minimize false negatives, then the analyst might consider focusing on recall.
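If recall becomes the target, a quick check is to score the same predictions on recall directly and to lower the probability threshold before labeling; a hedged sketch, where y_test, y_pred, and y_scores are assumed to come from the loop:

from sklearn.metrics import recall_score, precision_score

print("recall:", recall_score(y_test, y_pred))        # share of actual delinquents caught
print("precision:", precision_score(y_test, y_pred))  # share of flagged people who are delinquent

# lowering the threshold trades precision for recall
y_pred_low = (y_scores >= 0.3).astype(int)   # 0.3 instead of the default 0.5
print("recall at 0.3:", recall_score(y_test, y_pred_low))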