Loan Approval Model

Created with H2O Automatic Machine Learning

This notebook ingests a dataset, and trains many machine learning models intelligently searching the hyper-parameter space for optimal values. A leaderboard is maintained. Finally, an ensemble is created stacking together some of the base learners and the result is added to the leaderboard. The best model is deployed to production.



In [2]:

    
%%capture
import h2o
from h2o.automl import H2OAutoML

import os
import plotly
import cufflinks
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
plotly.offline.init_notebook_mode(connected=True)

myPlotlyKey = os.environ['SECRET_ENV_BRETTS_PLOTLY_KEY']
py.sign_in(username='bretto777',api_key=myPlotlyKey)

# Suppress unwatned warnings
import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import numpy as np



In [5]:

    
%%capture
#h2o.init(nthreads=1, max_mem_size="256M")
h2o.connect(ip="35.199.178.30")
#h2o.no_progress()



In [6]:

    
# Import some data from Amazon S3
h2oDF = h2o.import_file("https://s3-us-west-1.amazonaws.com/dsclouddata/LendingClubData/LoansGoodBad.csv")

# Stratified Split into Train/Test
stratsplit = h2oDF["Bad_Loan"].stratified_split(test_frac=0.3, seed=12349453)
train = h2oDF[stratsplit=="train"]
test = h2oDF[stratsplit=="test"]









    



Parse progress: |█████████████████████████████████████████████████████████| 100%



In [7]:

    
dfSum = h2oDF.group_by(by="State").sum().frame
dfMean = h2oDF.group_by(by="State").mean().frame
stateData = dfSum.merge(dfMean).as_data_frame(use_pandas=True, header=True)
stateData = stateData.iloc[1:]
train.head(10)









    






  RowID   Loan_Amount Term       Interest_Rate   Employment_Years Home_Ownership    Annual_Income Verification_Status     Loan_Purpose      State    Debt_to_Income   Delinquent_2yr   Revolving_Cr_Util   Total_Accounts Bad_Loan    Longest_Credit_Length


      2          2500 60 months           15.27                0.5 RENT                    30000  VERIFIED - income source car               GA                 1                  0                 9.4                4 BAD                            12
      3          2400 36 months           15.96               10  RENT                    12252  not verified            small_business    IL                 8.72                0                98.5               10 GOOD                           10
      4         10000 36 months           13.49               10  RENT                    49200  VERIFIED - income source other             CA                20                  0                21                37 GOOD                           15
      5          5000 36 months            7.9                3  RENT                    36000  VERIFIED - income source wedding           AZ                11.2                0                28.3               12 GOOD                            7
      6          3000 36 months           18.64                9  RENT                    48000  VERIFIED - income source car               CA                 5.35                0                87.5                4 GOOD                            4
      9          6500 60 months           14.65                5  OWN                     72000  not verified            debt_consolidation AZ                16.12                0                20.6               23 GOOD                           13
     12          3000 36 months            9.91                3  RENT                    15000  VERIFIED - income source credit_card       IL                12.56                0                43.1               11 GOOD                            8
     13         10000 36 months           10.65                3  RENT                   100000  VERIFIED - income source other             CA                 7.06                0                55.5               29 BAD                            20
     14          1000 36 months           16.29                0.5 RENT                    28000  not verified            debt_consolidation MO                20.31                0                81.5               23 GOOD                            4
     18          9200 36 months            6.03                6  RENT                    77385.2 not verified            debt_consolidation CA                 9.86                0                23.1               28 GOOD                           10








    Out[7]:



In [8]:

    
for col in stateData.columns:
    stateData[col] = stateData[col].astype(str)

scl = [[0.0, 'rgb(164, 182, 216)'],[0.2, 'rgb(116, 141, 188)'],[0.4, 'rgb(69, 102, 165)'],\
            [0.6, 'rgb(45, 82, 153)'],[0.8, 'rgb(26, 62, 132)'],[1.0, 'rgb(4, 37, 99)']]

stateData['text'] = 'Avg Interest_Rate '+stateData['mean_Interest_Rate']+ '<br>' +\
    'Total Loan_Amount '+stateData['sum_Loan_Amount']+'<br>'+\
    'Avg Term '+stateData['mean_Term']+ '<br>' +\
    'Avg Income ' + stateData['mean_Annual_Income']

data = [ dict(
        type='choropleth',
        colorscale = scl,
        autocolorscale = False,
        locations = stateData['State'],
        z = stateData['sum_Bad_Loan'].astype(float),
        locationmode = 'USA-states',
        text = stateData['text'],
        marker = dict(
            line = dict (
                color = 'rgb(255,255,255)',
                width = 2
            ) ),
        colorbar = dict(
            title = "# Bad Loans")
        ) ]

layout = dict(
        title = 'Bad Loans by State<br>(Hover for breakdown)',
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showlakes = True,
            lakecolor = 'rgb(255, 255, 255)'),
             )
    
fig = dict( data=data, layout=layout )
py.iplot( fig, filename='d3-cloropleth-map' )









    Out[8]:



In [9]:

    
# Identify predictors and response
x = train.columns
y = "Bad_Loan"
x.remove(y)

# For binary classification, response should be a factor
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()



In [11]:

    
# Run AutoML, building 11 models
autoModel = H2OAutoML(max_models=11)
autoModel.train(x = x, y = y,
          training_frame = train,
          leaderboard_frame = test)









    



AutoML progress: |████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%

Leaderboard

Display the best models, sorted by descending AUC



In [12]:

    
leaders = autoModel.leaderboard
leaders









    






model_id                                                  auc   logloss


StackedEnsemble_AllModels_0_AutoML_20171204_164954   0.724092  0.430484
StackedEnsemble_BestOfFamily_0_AutoML_20171204_164954 0.72219  0.431329
GBM_grid_0_AutoML_20171204_164954_model_0            0.721619  0.428126
GBM_grid_0_AutoML_20171204_164954_model_5            0.72082  0.440175
GBM_grid_0_AutoML_20171204_164954_model_1            0.720425  0.428626
GBM_grid_0_AutoML_20171204_164954_model_2            0.718393  0.429827
GBM_grid_0_AutoML_20171204_164954_model_4            0.71454  0.431989
GBM_grid_0_AutoML_20171204_164954_model_3            0.711595  0.433819
GBM_grid_0_AutoML_20171204_164954_model_6            0.707657  0.449183
DeepLearning_0_AutoML_20171204_164954                0.706245  0.437179








    Out[12]:

Variable Importance - Best Model



In [21]:

    
leaders[1, 0]









    Out[21]:





u'StackedEnsemble_BestOfFamily_0_AutoML_20171204_164954'



In [23]:

    
importances = h2o.get_model(leaders[2, 0]).varimp(use_pandas=True)
importances









    Out[23]:







  
    
      
      variable
      relative_importance
      scaled_importance
      percentage
    
  
  
    
      0
      Interest_Rate
      3238.202881
      1.000000
      0.333457
    
    
      1
      State
      1380.842285
      0.426422
      0.142194
    
    
      2
      RowID
      954.863831
      0.294875
      0.098328
    
    
      3
      Annual_Income
      765.047424
      0.236257
      0.078782
    
    
      4
      Term
      614.148438
      0.189657
      0.063243
    
    
      5
      Debt_to_Income
      599.893066
      0.185255
      0.061775
    
    
      6
      Loan_Purpose
      537.137817
      0.165875
      0.055312
    
    
      7
      Revolving_Cr_Util
      407.736725
      0.125915
      0.041987
    
    
      8
      Loan_Amount
      344.781158
      0.106473
      0.035504
    
    
      9
      Total_Accounts
      228.186920
      0.070467
      0.023498
    
    
      10
      Employment_Years
      198.352722
      0.061254
      0.020426
    
    
      11
      Longest_Credit_Length
      158.876251
      0.049063
      0.016360
    
    
      12
      Home_Ownership
      140.665573
      0.043439
      0.014485
    
    
      13
      Delinquent_2yr
      71.675285
      0.022134
      0.007381
    
    
      14
      Verification_Status
      70.584908
      0.021798
      0.007269



In [24]:

    
importances = h2o.get_model(leaders[2, 0]).varimp(use_pandas=True)
importances = importances.loc[:,['variable','relative_importance']].groupby('variable').mean()
importances.sort_values(by="relative_importance", ascending=False).iplot(kind='bar', colors='#5AC4F2', theme='white')









    Out[24]:

Leaderboard ROC Curves



In [27]:

    
Model0 = np.array(h2o.get_model(leaders[0, 0]).roc(valid=True))
Model1 = np.array(h2o.get_model(leaders[1, 0]).roc(valid=True))
Model2 = np.array(h2o.get_model(leaders[2, 0]).roc(valid=True))
Model3 = np.array(h2o.get_model(leaders[3, 0]).roc(valid=True))
Model4 = np.array(h2o.get_model(leaders[4, 0]).roc(valid=True))
Model5 = np.array(h2o.get_model(leaders[5, 0]).roc(valid=True))
Model6 = np.array(h2o.get_model(leaders[6, 0]).roc(valid=True))
Model7 = np.array(h2o.get_model(leaders[7, 0]).roc(valid=True))
Model8 = np.array(h2o.get_model(leaders[8, 0]).roc(valid=True))
Model9 = np.array(h2o.get_model(leaders[9, 0]).roc(valid=True))


layout = go.Layout(autosize=False, width=725, height=575,  xaxis=dict(title='False Positive Rate', titlefont=dict(family='Arial, sans-serif', size=15, color='grey')), 
                                                           yaxis=dict(title='True Positive Rate', titlefont=dict(family='Arial, sans-serif', size=15, color='grey')))

Model0Trace = go.Scatter(x = Model0[0], y = Model0[1], mode = 'lines', name = 'Leader', line = dict(color = ('rgb(26, 58, 126)'), width = 3))
Model1Trace = go.Scatter(x = Model1[0], y = Model1[1], mode = 'lines', name = 'Model 1', line = dict(color = ('rgb(135, 160, 216)'), width = 3))
Model2Trace = go.Scatter(x = Model2[0], y = Model2[1], mode = 'lines', name = 'Model 2', line = dict(color = ('rgb(156, 190, 241)'), width = 1))
Model3Trace = go.Scatter(x = Model3[0], y = Model3[1], mode = 'lines', name = 'Model 3', line = dict(color = ('rgb(156, 190, 241)'), width = 1))
Model4Trace = go.Scatter(x = Model4[0], y = Model4[1], mode = 'lines', name = 'Model 4', line = dict(color = ('rgb(156, 190, 241)'), width = 1))
Model5Trace = go.Scatter(x = Model5[0], y = Model5[1], mode = 'lines', name = 'Model 5', line = dict(color = ('rgb(156, 190, 241)'), width = 1))
Model6Trace = go.Scatter(x = Model6[0], y = Model6[1], mode = 'lines', name = 'Model 6', line = dict(color = ('rgb(156, 190, 241)'), width = 1))
Model7Trace = go.Scatter(x = Model7[0], y = Model7[1], mode = 'lines', name = 'Model 7', line = dict(color = ('rgb(156, 190, 241)'), width = 1))
Model8Trace = go.Scatter(x = Model8[0], y = Model8[1], mode = 'lines', name = 'Model 8', line = dict(color = ('rgb(156, 190, 241)'), width = 1))
Model9Trace = go.Scatter(x = Model9[0], y = Model9[1], mode = 'lines', name = 'Model 9', line = dict(color = ('rgb(156, 190, 241)'), width = 1))


traceChanceLine = go.Scatter(x = [0,1], y = [0,1], mode = 'lines+markers', name = 'chance', line = dict(color = ('rgb(136, 140, 150)'), width = 4, dash = 'dash'))

fig = go.Figure(data=[Model0Trace,Model1Trace,Model2Trace,Model3Trace,Model4Trace,Model5Trace,Model7Trace,Model8Trace,Model9Trace,traceChanceLine], layout=layout)


py.iplot(fig)









    Out[27]:

Confusion Matrix



In [30]:

    
cm = autoModel.leader.confusion_matrix(xval=True)
cm = cm.table.as_data_frame()
cm
confusionMatrix = ff.create_table(cm)
confusionMatrix.layout.height=300
confusionMatrix.layout.width=800
confusionMatrix.layout.font.size=17
py.iplot(confusionMatrix)









    Out[30]:

Business Impact Matrix

Weighting Predictions With a Dollar Value

Correctly predicting GOOD: +\$500
Correctly predicting BAD: +\$800
Incorrectly predicting GOOD: -\$1000
Incorrectly predicting BAD: -\$100



In [31]:

    
CorrectPredictBad = cm.loc[0,'BAD']
CorrectPredictBadImpact = 500
cm1 = CorrectPredictBad*CorrectPredictBadImpact

IncorrectPredictBad = cm.loc[1,'BAD']
IncorrectPredictBadImpact = -100
cm2 = IncorrectPredictBad*IncorrectPredictBadImpact

IncorrectPredictGood = cm.loc[0,'GOOD']
IncorrectPredictGoodImpact = -1000
cm3 = IncorrectPredictGood*IncorrectPredictGoodImpact

CorrectPredictGood = cm.loc[0,'GOOD']
CorrectPredictGoodImpact = 800
cm4 = CorrectPredictGood*CorrectPredictGoodImpact


data_matrix = [['Business Impact', '($) Predicted BAD', '($) Predicted GOOD', '($) Total'],
               ['($) Actual BAD', cm1, cm3, '' ],
               ['($) Actual GOOD', cm2, cm4, ''],
               ['($) Total', cm1+cm2, cm3+cm4, cm1+cm2+cm3+cm4]]

impactMatrix = ff.create_table(data_matrix, height_constant=20, hoverinfo='weight')
impactMatrix.layout.height=300
impactMatrix.layout.width=800
impactMatrix.layout.font.size=17
py.iplot(impactMatrix)









    Out[31]:



In [ ]:

    
h2o.save_model(model=autoModel.leader)



In [ ]:

    
def approve_loan(Loan_Amount,Term,Interest_Rate,Employment_Years,Home_Ownership,Annual_Income,Verification_Status,Loan_Purpose,State,
                 Debt_to_Income,Delinquent_2yr,Revolving_Cr_Util,Total_Accounts,Longest_Credit_Length):
    # connect to the model scoring service
    h2o.connect()

    # open the downloaded model
    ChurnPredictor = h2o.load_model(path='DRF_model_1496459915419_4') 

    # define a feature vector to evaluate with the model
    newData = pd.DataFrame({'Loan_Amount' : Loan_Amount,
                            'Term' : Term,
                            'Interest_Rate' : Interest_Rate,
                            'Employment_Years' : Employment_Years,
                            'Home_Ownership' : Home_Ownership,
                            'Annual_Income' : Annual_Income,
                            'Verification_Status' : Verification_Status,
                            'Loan_Purpose' : Loan_Purpose,
                            'State' : State,
                            'Debt_to_Income' : Debt_to_Income,
                            'Delinquent_2yr' : Delinquent_2yr,
                            'Revolving_Cr_Util' : Revolving_Cr_Util,
                            'Total_Accounts' : Total_Accounts,
                            'Longest_Credit_Length' : Longest_Credit_Length}, index=[0])
    
    # evaluate the feature vector using the model
    predictions = ChurnPredictor.predict(h2o.H2OFrame(newData))
    predictionsOut = h2o.as_list(predictions, use_pandas=False)
    prediction = predictionsOut[1][0]
    probabilityBad = predictionsOut[1][1]
    probabilityGood = predictionsOut[1][2]
    return "Prediction: " + str(prediction) + " |Probability of Bad Loan: " + str(probabilityBad) + " |Probability of Good Loan: " + str(probabilityGood)



In [ ]:

    
Loan_Amount = 5000
Term = "60 months"
Interest_Rate=13
Employment_Years=5
Home_Ownership="RENT"
Annual_Income=75000
Verification_Status="VERIFIED - income"
Loan_Purpose="credit_card"
State="CA"
Debt_to_Income="16.12"
Delinquent_2yr="0"
Revolving_Cr_Util=37
Total_Accounts=6
Longest_Credit_Length=97
approve_loan(Loan_Amount,Term,Interest_Rate,Employment_Years,Home_Ownership,Annual_Income,Verification_Status,Loan_Purpose,State,Debt_to_Income,Delinquent_2yr,Revolving_Cr_Util,Total_Accounts,Longest_Credit_Length)

RowID	Loan_Amount	Term	Interest_Rate	Employment_Years	Home_Ownership	Annual_Income	Verification_Status	Loan_Purpose	State	Debt_to_Income	Revolving_Cr_Util	Total_Accounts	Bad_Loan	Longest_Credit_Length
2	2500	60 months	15.27	0.5	RENT	30000	VERIFIED - income source	car	GA	1	9.4	4	BAD	12
3	2400	36 months	15.96	10	RENT	12252	not verified	small_business	IL	8.72	98.5	10	GOOD	10
4	10000	36 months	13.49	10	RENT	49200	VERIFIED - income source	other	CA	20	21	37	GOOD	15
5	5000	36 months	7.9	3	RENT	36000	VERIFIED - income source	wedding	AZ	11.2	28.3	12	GOOD	7
6	3000	36 months	18.64	9	RENT	48000	VERIFIED - income source	car	CA	5.35	87.5	4	GOOD	4
9	6500	60 months	14.65	5	OWN	72000	not verified	debt_consolidation	AZ	16.12	20.6	23	GOOD	13
12	3000	36 months	9.91	3	RENT	15000	VERIFIED - income source	credit_card	IL	12.56	43.1	11	GOOD	8
13	10000	36 months	10.65	3	RENT	100000	VERIFIED - income source	other	CA	7.06	55.5	29	BAD	20
14	1000	36 months	16.29	0.5	RENT	28000	not verified	debt_consolidation	MO	20.31	81.5	23	GOOD	4
18	9200	36 months	6.03	6	RENT	77385.2	not verified	debt_consolidation	CA	9.86	23.1	28	GOOD	10

model_id	auc	logloss
StackedEnsemble_AllModels_0_AutoML_20171204_164954	0.724092	0.430484
StackedEnsemble_BestOfFamily_0_AutoML_20171204_164954	0.72219	0.431329
GBM_grid_0_AutoML_20171204_164954_model_0	0.721619	0.428126
GBM_grid_0_AutoML_20171204_164954_model_5	0.72082	0.440175
GBM_grid_0_AutoML_20171204_164954_model_1	0.720425	0.428626
GBM_grid_0_AutoML_20171204_164954_model_2	0.718393	0.429827
GBM_grid_0_AutoML_20171204_164954_model_4	0.71454	0.431989
GBM_grid_0_AutoML_20171204_164954_model_3	0.711595	0.433819
GBM_grid_0_AutoML_20171204_164954_model_6	0.707657	0.449183
DeepLearning_0_AutoML_20171204_164954	0.706245	0.437179

	variable	relative_importance	scaled_importance	percentage
0	Interest_Rate	3238.202881	1.000000	0.333457
1	State	1380.842285	0.426422	0.142194
2	RowID	954.863831	0.294875	0.098328
3	Annual_Income	765.047424	0.236257	0.078782
4	Term	614.148438	0.189657	0.063243
5	Debt_to_Income	599.893066	0.185255	0.061775
6	Loan_Purpose	537.137817	0.165875	0.055312
7	Revolving_Cr_Util	407.736725	0.125915	0.041987
8	Loan_Amount	344.781158	0.106473	0.035504
9	Total_Accounts	228.186920	0.070467	0.023498
10	Employment_Years	198.352722	0.061254	0.020426
11	Longest_Credit_Length	158.876251	0.049063	0.016360
12	Home_Ownership	140.665573	0.043439	0.014485
13	Delinquent_2yr	71.675285	0.022134	0.007381
14	Verification_Status	70.584908	0.021798	0.007269