Loan Approval Model

Created with H2O Automatic Machine Learning

This notebook ingests a dataset, and trains many machine learning models intelligently searching the hyper-parameter space for optimal values. A leaderboard is maintained. Finally, an ensemble is created stacking together some of the base learners and the result is added to the leaderboard. The best model is deployed to production.


In [2]:
%%capture
import h2o
from h2o.automl import H2OAutoML

import os
import plotly
import cufflinks
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
plotly.offline.init_notebook_mode(connected=True)

myPlotlyKey = os.environ['SECRET_ENV_BRETTS_PLOTLY_KEY']
py.sign_in(username='bretto777',api_key=myPlotlyKey)

# Suppress unwatned warnings
import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import numpy as np

In [5]:
%%capture
#h2o.init(nthreads=1, max_mem_size="256M")
h2o.connect(ip="35.199.178.30")
#h2o.no_progress()

In [6]:
# Import some data from Amazon S3
h2oDF = h2o.import_file("https://s3-us-west-1.amazonaws.com/dsclouddata/LendingClubData/LoansGoodBad.csv")

# Stratified Split into Train/Test
stratsplit = h2oDF["Bad_Loan"].stratified_split(test_frac=0.3, seed=12349453)
train = h2oDF[stratsplit=="train"]
test = h2oDF[stratsplit=="test"]


Parse progress: |█████████████████████████████████████████████████████████| 100%

In [7]:
dfSum = h2oDF.group_by(by="State").sum().frame
dfMean = h2oDF.group_by(by="State").mean().frame
stateData = dfSum.merge(dfMean).as_data_frame(use_pandas=True, header=True)
stateData = stateData.iloc[1:]
train.head(10)


RowID Loan_AmountTerm Interest_Rate Employment_YearsHome_Ownership Annual_IncomeVerification_Status Loan_Purpose State Debt_to_Income Delinquent_2yr Revolving_Cr_Util Total_AccountsBad_Loan Longest_Credit_Length
2 250060 months 15.27 0.5RENT 30000 VERIFIED - income sourcecar GA 1 0 9.4 4BAD 12
3 240036 months 15.96 10 RENT 12252 not verified small_business IL 8.72 0 98.5 10GOOD 10
4 1000036 months 13.49 10 RENT 49200 VERIFIED - income sourceother CA 20 0 21 37GOOD 15
5 500036 months 7.9 3 RENT 36000 VERIFIED - income sourcewedding AZ 11.2 0 28.3 12GOOD 7
6 300036 months 18.64 9 RENT 48000 VERIFIED - income sourcecar CA 5.35 0 87.5 4GOOD 4
9 650060 months 14.65 5 OWN 72000 not verified debt_consolidationAZ 16.12 0 20.6 23GOOD 13
12 300036 months 9.91 3 RENT 15000 VERIFIED - income sourcecredit_card IL 12.56 0 43.1 11GOOD 8
13 1000036 months 10.65 3 RENT 100000 VERIFIED - income sourceother CA 7.06 0 55.5 29BAD 20
14 100036 months 16.29 0.5RENT 28000 not verified debt_consolidationMO 20.31 0 81.5 23GOOD 4
18 920036 months 6.03 6 RENT 77385.2not verified debt_consolidationCA 9.86 0 23.1 28GOOD 10
Out[7]:


In [8]:
for col in stateData.columns:
    stateData[col] = stateData[col].astype(str)

scl = [[0.0, 'rgb(164, 182, 216)'],[0.2, 'rgb(116, 141, 188)'],[0.4, 'rgb(69, 102, 165)'],\
            [0.6, 'rgb(45, 82, 153)'],[0.8, 'rgb(26, 62, 132)'],[1.0, 'rgb(4, 37, 99)']]

stateData['text'] = 'Avg Interest_Rate '+stateData['mean_Interest_Rate']+ '<br>' +\
    'Total Loan_Amount '+stateData['sum_Loan_Amount']+'<br>'+\
    'Avg Term '+stateData['mean_Term']+ '<br>' +\
    'Avg Income ' + stateData['mean_Annual_Income']

data = [ dict(
        type='choropleth',
        colorscale = scl,
        autocolorscale = False,
        locations = stateData['State'],
        z = stateData['sum_Bad_Loan'].astype(float),
        locationmode = 'USA-states',
        text = stateData['text'],
        marker = dict(
            line = dict (
                color = 'rgb(255,255,255)',
                width = 2
            ) ),
        colorbar = dict(
            title = "# Bad Loans")
        ) ]

layout = dict(
        title = 'Bad Loans by State<br>(Hover for breakdown)',
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showlakes = True,
            lakecolor = 'rgb(255, 255, 255)'),
             )
    
fig = dict( data=data, layout=layout )
py.iplot( fig, filename='d3-cloropleth-map' )


Out[8]:

In [9]:
# Identify predictors and response
x = train.columns
y = "Bad_Loan"
x.remove(y)

# For binary classification, response should be a factor
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

In [11]:
# Run AutoML, building 11 models
autoModel = H2OAutoML(max_models=11)
autoModel.train(x = x, y = y,
          training_frame = train,
          leaderboard_frame = test)


AutoML progress: |████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%

Leaderboard

Display the best models, sorted by descending AUC


In [12]:
leaders = autoModel.leaderboard
leaders


model_id auc logloss
StackedEnsemble_AllModels_0_AutoML_20171204_164954 0.724092 0.430484
StackedEnsemble_BestOfFamily_0_AutoML_20171204_1649540.72219 0.431329
GBM_grid_0_AutoML_20171204_164954_model_0 0.721619 0.428126
GBM_grid_0_AutoML_20171204_164954_model_5 0.72082 0.440175
GBM_grid_0_AutoML_20171204_164954_model_1 0.720425 0.428626
GBM_grid_0_AutoML_20171204_164954_model_2 0.718393 0.429827
GBM_grid_0_AutoML_20171204_164954_model_4 0.71454 0.431989
GBM_grid_0_AutoML_20171204_164954_model_3 0.711595 0.433819
GBM_grid_0_AutoML_20171204_164954_model_6 0.707657 0.449183
DeepLearning_0_AutoML_20171204_164954 0.706245 0.437179
Out[12]:

Variable Importance - Best Model


In [21]:
leaders[1, 0]


Out[21]:
u'StackedEnsemble_BestOfFamily_0_AutoML_20171204_164954'

In [23]:
importances = h2o.get_model(leaders[2, 0]).varimp(use_pandas=True)
importances


Out[23]:
variable relative_importance scaled_importance percentage
0 Interest_Rate 3238.202881 1.000000 0.333457
1 State 1380.842285 0.426422 0.142194
2 RowID 954.863831 0.294875 0.098328
3 Annual_Income 765.047424 0.236257 0.078782
4 Term 614.148438 0.189657 0.063243
5 Debt_to_Income 599.893066 0.185255 0.061775
6 Loan_Purpose 537.137817 0.165875 0.055312
7 Revolving_Cr_Util 407.736725 0.125915 0.041987
8 Loan_Amount 344.781158 0.106473 0.035504
9 Total_Accounts 228.186920 0.070467 0.023498
10 Employment_Years 198.352722 0.061254 0.020426
11 Longest_Credit_Length 158.876251 0.049063 0.016360
12 Home_Ownership 140.665573 0.043439 0.014485
13 Delinquent_2yr 71.675285 0.022134 0.007381
14 Verification_Status 70.584908 0.021798 0.007269

In [24]:
importances = h2o.get_model(leaders[2, 0]).varimp(use_pandas=True)
importances = importances.loc[:,['variable','relative_importance']].groupby('variable').mean()
importances.sort_values(by="relative_importance", ascending=False).iplot(kind='bar', colors='#5AC4F2', theme='white')


Out[24]:

Leaderboard ROC Curves


In [27]:
Model0 = np.array(h2o.get_model(leaders[0, 0]).roc(valid=True))
Model1 = np.array(h2o.get_model(leaders[1, 0]).roc(valid=True))
Model2 = np.array(h2o.get_model(leaders[2, 0]).roc(valid=True))
Model3 = np.array(h2o.get_model(leaders[3, 0]).roc(valid=True))
Model4 = np.array(h2o.get_model(leaders[4, 0]).roc(valid=True))
Model5 = np.array(h2o.get_model(leaders[5, 0]).roc(valid=True))
Model6 = np.array(h2o.get_model(leaders[6, 0]).roc(valid=True))
Model7 = np.array(h2o.get_model(leaders[7, 0]).roc(valid=True))
Model8 = np.array(h2o.get_model(leaders[8, 0]).roc(valid=True))
Model9 = np.array(h2o.get_model(leaders[9, 0]).roc(valid=True))


layout = go.Layout(autosize=False, width=725, height=575,  xaxis=dict(title='False Positive Rate', titlefont=dict(family='Arial, sans-serif', size=15, color='grey')), 
                                                           yaxis=dict(title='True Positive Rate', titlefont=dict(family='Arial, sans-serif', size=15, color='grey')))

Model0Trace = go.Scatter(x = Model0[0], y = Model0[1], mode = 'lines', name = 'Leader', line = dict(color = ('rgb(26, 58, 126)'), width = 3))
Model1Trace = go.Scatter(x = Model1[0], y = Model1[1], mode = 'lines', name = 'Model 1', line = dict(color = ('rgb(135, 160, 216)'), width = 3))
Model2Trace = go.Scatter(x = Model2[0], y = Model2[1], mode = 'lines', name = 'Model 2', line = dict(color = ('rgb(156, 190, 241)'), width = 1))
Model3Trace = go.Scatter(x = Model3[0], y = Model3[1], mode = 'lines', name = 'Model 3', line = dict(color = ('rgb(156, 190, 241)'), width = 1))
Model4Trace = go.Scatter(x = Model4[0], y = Model4[1], mode = 'lines', name = 'Model 4', line = dict(color = ('rgb(156, 190, 241)'), width = 1))
Model5Trace = go.Scatter(x = Model5[0], y = Model5[1], mode = 'lines', name = 'Model 5', line = dict(color = ('rgb(156, 190, 241)'), width = 1))
Model6Trace = go.Scatter(x = Model6[0], y = Model6[1], mode = 'lines', name = 'Model 6', line = dict(color = ('rgb(156, 190, 241)'), width = 1))
Model7Trace = go.Scatter(x = Model7[0], y = Model7[1], mode = 'lines', name = 'Model 7', line = dict(color = ('rgb(156, 190, 241)'), width = 1))
Model8Trace = go.Scatter(x = Model8[0], y = Model8[1], mode = 'lines', name = 'Model 8', line = dict(color = ('rgb(156, 190, 241)'), width = 1))
Model9Trace = go.Scatter(x = Model9[0], y = Model9[1], mode = 'lines', name = 'Model 9', line = dict(color = ('rgb(156, 190, 241)'), width = 1))


traceChanceLine = go.Scatter(x = [0,1], y = [0,1], mode = 'lines+markers', name = 'chance', line = dict(color = ('rgb(136, 140, 150)'), width = 4, dash = 'dash'))

fig = go.Figure(data=[Model0Trace,Model1Trace,Model2Trace,Model3Trace,Model4Trace,Model5Trace,Model7Trace,Model8Trace,Model9Trace,traceChanceLine], layout=layout)


py.iplot(fig)


Out[27]:

Confusion Matrix


In [30]:
cm = autoModel.leader.confusion_matrix(xval=True)
cm = cm.table.as_data_frame()
cm
confusionMatrix = ff.create_table(cm)
confusionMatrix.layout.height=300
confusionMatrix.layout.width=800
confusionMatrix.layout.font.size=17
py.iplot(confusionMatrix)


Out[30]:

Business Impact Matrix

Weighting Predictions With a Dollar Value

  • Correctly predicting GOOD: +\$500
  • Correctly predicting BAD: +\$800
  • Incorrectly predicting GOOD: -\$1000
  • Incorrectly predicting BAD: -\$100

In [31]:
CorrectPredictBad = cm.loc[0,'BAD']
CorrectPredictBadImpact = 500
cm1 = CorrectPredictBad*CorrectPredictBadImpact

IncorrectPredictBad = cm.loc[1,'BAD']
IncorrectPredictBadImpact = -100
cm2 = IncorrectPredictBad*IncorrectPredictBadImpact

IncorrectPredictGood = cm.loc[0,'GOOD']
IncorrectPredictGoodImpact = -1000
cm3 = IncorrectPredictGood*IncorrectPredictGoodImpact

CorrectPredictGood = cm.loc[0,'GOOD']
CorrectPredictGoodImpact = 800
cm4 = CorrectPredictGood*CorrectPredictGoodImpact


data_matrix = [['Business Impact', '($) Predicted BAD', '($) Predicted GOOD', '($) Total'],
               ['($) Actual BAD', cm1, cm3, '' ],
               ['($) Actual GOOD', cm2, cm4, ''],
               ['($) Total', cm1+cm2, cm3+cm4, cm1+cm2+cm3+cm4]]

impactMatrix = ff.create_table(data_matrix, height_constant=20, hoverinfo='weight')
impactMatrix.layout.height=300
impactMatrix.layout.width=800
impactMatrix.layout.font.size=17
py.iplot(impactMatrix)


Out[31]:

In [ ]:
h2o.save_model(model=autoModel.leader)

In [ ]:
def approve_loan(Loan_Amount,Term,Interest_Rate,Employment_Years,Home_Ownership,Annual_Income,Verification_Status,Loan_Purpose,State,
                 Debt_to_Income,Delinquent_2yr,Revolving_Cr_Util,Total_Accounts,Longest_Credit_Length):
    # connect to the model scoring service
    h2o.connect()

    # open the downloaded model
    ChurnPredictor = h2o.load_model(path='DRF_model_1496459915419_4') 

    # define a feature vector to evaluate with the model
    newData = pd.DataFrame({'Loan_Amount' : Loan_Amount,
                            'Term' : Term,
                            'Interest_Rate' : Interest_Rate,
                            'Employment_Years' : Employment_Years,
                            'Home_Ownership' : Home_Ownership,
                            'Annual_Income' : Annual_Income,
                            'Verification_Status' : Verification_Status,
                            'Loan_Purpose' : Loan_Purpose,
                            'State' : State,
                            'Debt_to_Income' : Debt_to_Income,
                            'Delinquent_2yr' : Delinquent_2yr,
                            'Revolving_Cr_Util' : Revolving_Cr_Util,
                            'Total_Accounts' : Total_Accounts,
                            'Longest_Credit_Length' : Longest_Credit_Length}, index=[0])
    
    # evaluate the feature vector using the model
    predictions = ChurnPredictor.predict(h2o.H2OFrame(newData))
    predictionsOut = h2o.as_list(predictions, use_pandas=False)
    prediction = predictionsOut[1][0]
    probabilityBad = predictionsOut[1][1]
    probabilityGood = predictionsOut[1][2]
    return "Prediction: " + str(prediction) + " |Probability of Bad Loan: " + str(probabilityBad) + " |Probability of Good Loan: " + str(probabilityGood)

In [ ]:
Loan_Amount = 5000
Term = "60 months"
Interest_Rate=13
Employment_Years=5
Home_Ownership="RENT"
Annual_Income=75000
Verification_Status="VERIFIED - income"
Loan_Purpose="credit_card"
State="CA"
Debt_to_Income="16.12"
Delinquent_2yr="0"
Revolving_Cr_Util=37
Total_Accounts=6
Longest_Credit_Length=97
approve_loan(Loan_Amount,Term,Interest_Rate,Employment_Years,Home_Ownership,Annual_Income,Verification_Status,Loan_Purpose,State,Debt_to_Income,Delinquent_2yr,Revolving_Cr_Util,Total_Accounts,Longest_Credit_Length)