This notebook ingests a dataset and trains many machine learning models, intelligently searching the hyper-parameter space for optimal values. A leaderboard of the trained models is maintained. Finally, an ensemble is created by stacking together some of the base learners, and the result is added to the leaderboard. The best model is then deployed to production.
In [2]:
%%capture
# Environment setup: H2O client + AutoML, Plotly plotting, pandas/numpy.
# The %%capture magic on the first line hides any output this cell produces.
import h2o
from h2o.automl import H2OAutoML
import os
import plotly
# NOTE(review): cufflinks is never referenced by name in this notebook; presumably it is
# imported for its side effect of providing the DataFrame `.iplot` call used further down.
import cufflinks
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
plotly.offline.init_notebook_mode(connected=True)
# The Plotly API key is read from the environment instead of being hard-coded;
# os.environ[...] raises KeyError if the variable is not set.
myPlotlyKey = os.environ['SECRET_ENV_BRETTS_PLOTLY_KEY']
py.sign_in(username='bretto777',api_key=myPlotlyKey)
# Suppress unwanted warnings
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
In [5]:
%%capture
#h2o.init(nthreads=1, max_mem_size="256M")
h2o.connect(ip="35.199.178.30")
#h2o.no_progress()
In [6]:
# Load the loans CSV from Amazon S3 into the H2O cluster.
LOANS_CSV_URL = "https://s3-us-west-1.amazonaws.com/dsclouddata/LendingClubData/LoansGoodBad.csv"
h2oDF = h2o.import_file(LOANS_CSV_URL)

# 70/30 train/test split, stratified on the response column so both parts keep
# the same share of bad loans. The seed makes the split repeatable.
stratsplit = h2oDF["Bad_Loan"].stratified_split(test_frac=0.3, seed=12349453)
isTrainRow = stratsplit == "train"
isTestRow = stratsplit == "test"
train, test = h2oDF[isTrainRow], h2oDF[isTestRow]
In [7]:
# Per-state totals and per-state averages of the loan data. Two separate
# group_by calls are used, one per aggregate, and the results are merged.
dfSum = h2oDF.group_by(by="State").sum().frame
dfMean = h2oDF.group_by(by="State").mean().frame

# Merge the two aggregates and pull the result into pandas for plotting.
# The first row is discarded.
# NOTE(review): presumably that row is a blank/invalid state -- confirm against the data.
mergedByState = dfSum.merge(dfMean)
stateData = mergedByState.as_data_frame(use_pandas=True, header=True).iloc[1:]

# Preview of the training frame.
train.head(10)
Out[7]:
In [8]:
# The hover text below is built by string concatenation, so every column is
# converted to str first (z is converted back to float where it is used).
stateData = stateData.astype(str)

# Light-to-dark blue colour ramp for the map.
scl = [
    [0.0, 'rgb(164, 182, 216)'],
    [0.2, 'rgb(116, 141, 188)'],
    [0.4, 'rgb(69, 102, 165)'],
    [0.6, 'rgb(45, 82, 153)'],
    [0.8, 'rgb(26, 62, 132)'],
    [1.0, 'rgb(4, 37, 99)'],
]

# One hover line per statistic, joined with HTML line breaks.
hoverInterest = 'Avg Interest_Rate ' + stateData['mean_Interest_Rate']
hoverAmount = 'Total Loan_Amount ' + stateData['sum_Loan_Amount']
hoverTerm = 'Avg Term ' + stateData['mean_Term']
hoverIncome = 'Avg Income ' + stateData['mean_Annual_Income']
stateData['text'] = hoverInterest + '<br>' + hoverAmount + '<br>' + hoverTerm + '<br>' + hoverIncome

# Single choropleth trace: states coloured by their total number of bad loans.
choroplethTrace = {
    'type': 'choropleth',
    'colorscale': scl,
    'autocolorscale': False,
    'locations': stateData['State'],
    'z': stateData['sum_Bad_Loan'].astype(float),
    'locationmode': 'USA-states',
    'text': stateData['text'],
    'marker': {'line': {'color': 'rgb(255,255,255)', 'width': 2}},
    'colorbar': {'title': "# Bad Loans"},
}
data = [choroplethTrace]

layout = {
    'title': 'Bad Loans by State<br>(Hover for breakdown)',
    'geo': {
        'scope': 'usa',
        'projection': {'type': 'albers usa'},
        'showlakes': True,
        'lakecolor': 'rgb(255, 255, 255)',
    },
}

fig = {'data': data, 'layout': layout}
py.iplot(fig, filename='d3-cloropleth-map')
Out[8]:
In [9]:
# Identify predictors and response
x = train.columns
y = "Bad_Loan"
x.remove(y)
# For binary classification, response should be a factor
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()
In [11]:
# Run AutoML, building 11 models
autoModel = H2OAutoML(max_models=11)
autoModel.train(x = x, y = y,
training_frame = train,
leaderboard_frame = test)
In [12]:
# Leaderboard produced by the AutoML run. Later cells read column 0 of a given
# row (leaders[i, 0]) and pass it to h2o.get_model as a model id.
leaders = autoModel.leaderboard
leaders
Out[12]:
In [21]:
# Show column 0 of leaderboard row 1 (the second entry) -- the kind of value
# the following cells hand to h2o.get_model.
leaders[1, 0]
Out[21]:
In [23]:
# Variable importances of the model in leaderboard row 2, as a pandas DataFrame.
varimpModel = h2o.get_model(leaders[2, 0])
importances = varimpModel.varimp(use_pandas=True)
importances
Out[23]:
In [24]:
# Recompute the importances of the model in leaderboard row 2, keep only the
# relative importance per variable, and average any repeated variable names.
thirdModel = h2o.get_model(leaders[2, 0])
importances = (
    thirdModel.varimp(use_pandas=True)
    .loc[:, ['variable', 'relative_importance']]
    .groupby('variable')
    .mean()
)

# Bar chart, most important variable first.
rankedImportances = importances.sort_values(by="relative_importance", ascending=False)
rankedImportances.iplot(kind='bar', colors='#5AC4F2', theme='white')
Out[24]:
In [27]:
# Number of leaderboard rows (starting at row 0) whose ROC curves are drawn.
N_ROC_MODELS = 10

# One array per model; index 0 is plotted on the x axis (false positive rate)
# and index 1 on the y axis (true positive rate).
# NOTE(review): roc(valid=True) needs validation metrics on every model -- confirm
# they exist for the H2O version in use.
rocCurves = [
    np.array(h2o.get_model(leaders[rank, 0]).roc(valid=True))
    for rank in range(N_ROC_MODELS)
]


def rocTrace(rank, curve):
    """Build the ROC line for the model at leaderboard position `rank`.

    The leader (rank 0) and the runner-up (rank 1) get thick, darker lines;
    every other model gets a thin light-blue line.
    """
    if rank == 0:
        name, color, width = 'Leader', 'rgb(26, 58, 126)', 3
    elif rank == 1:
        name, color, width = 'Model 1', 'rgb(135, 160, 216)', 3
    else:
        name, color, width = 'Model ' + str(rank), 'rgb(156, 190, 241)', 1
    return go.Scatter(x = curve[0], y = curve[1], mode = 'lines', name = name,
                      line = dict(color = color, width = width))


# Traces are generated in a loop so that every fetched curve ends up in the figure.
modelTraces = [rocTrace(rank, curve) for rank, curve in enumerate(rocCurves)]

# Diagonal reference line: the ROC of a random classifier.
traceChanceLine = go.Scatter(x = [0,1], y = [0,1], mode = 'lines+markers', name = 'chance',
                             line = dict(color = ('rgb(136, 140, 150)'), width = 4, dash = 'dash'))

layout = go.Layout(autosize=False, width=725, height=575,
                   xaxis=dict(title='False Positive Rate', titlefont=dict(family='Arial, sans-serif', size=15, color='grey')),
                   yaxis=dict(title='True Positive Rate', titlefont=dict(family='Arial, sans-serif', size=15, color='grey')))

fig = go.Figure(data=modelTraces + [traceChanceLine], layout=layout)
py.iplot(fig)
Out[27]:
In [30]:
# Cross-validated confusion matrix of the leading model, as a pandas DataFrame.
leaderCM = autoModel.leader.confusion_matrix(xval=True)
cm = leaderCM.table.as_data_frame()

# Render it as a Plotly table with a fixed size and larger font.
confusionMatrix = ff.create_table(cm)
tableLayout = confusionMatrix.layout
tableLayout.height = 300
tableLayout.width = 800
tableLayout.font.size = 17
py.iplot(confusionMatrix)
Out[30]:
In [31]:
# Business impact of the leader's predictions: each confusion-matrix cell is
# multiplied by a per-loan dollar impact.
# The table built below puts cm row 0 under "Actual BAD" and cm row 1 under
# "Actual GOOD"; the 'BAD' / 'GOOD' columns are the predicted class.

# Actual BAD, predicted BAD: a bad loan correctly caught.
CorrectPredictBad = cm.loc[0,'BAD']
CorrectPredictBadImpact = 500
cm1 = CorrectPredictBad*CorrectPredictBadImpact

# Actual GOOD, predicted BAD: a good loan wrongly flagged.
IncorrectPredictBad = cm.loc[1,'BAD']
IncorrectPredictBadImpact = -100
cm2 = IncorrectPredictBad*IncorrectPredictBadImpact

# Actual BAD, predicted GOOD: a bad loan wrongly approved.
IncorrectPredictGood = cm.loc[0,'GOOD']
IncorrectPredictGoodImpact = -1000
cm3 = IncorrectPredictGood*IncorrectPredictGoodImpact

# Actual GOOD, predicted GOOD: a good loan correctly approved. This must come
# from row 1; reading row 0 would reuse the wrongly-approved count above.
CorrectPredictGood = cm.loc[1,'GOOD']
CorrectPredictGoodImpact = 800
cm4 = CorrectPredictGood*CorrectPredictGoodImpact

data_matrix = [['Business Impact', '($) Predicted BAD', '($) Predicted GOOD', '($) Total'],
               ['($) Actual BAD', cm1, cm3, '' ],
               ['($) Actual GOOD', cm2, cm4, ''],
               ['($) Total', cm1+cm2, cm3+cm4, cm1+cm2+cm3+cm4]]

impactMatrix = ff.create_table(data_matrix, height_constant=20, hoverinfo='weight')
impactMatrix.layout.height=300
impactMatrix.layout.width=800
impactMatrix.layout.font.size=17
py.iplot(impactMatrix)
Out[31]:
In [ ]:
# Persist the leading model; the value returned by save_model is shown as the cell output.
bestModel = autoModel.leader
h2o.save_model(model=bestModel)
In [ ]:
def approve_loan(Loan_Amount,Term,Interest_Rate,Employment_Years,Home_Ownership,Annual_Income,Verification_Status,Loan_Purpose,State,
                 Debt_to_Income,Delinquent_2yr,Revolving_Cr_Util,Total_Accounts,Longest_Credit_Length,
                 model_path='DRF_model_1496459915419_4'):
    """Score a single loan application with a saved H2O model.

    The fourteen positional arguments are the feature values of one
    application; each becomes a column of the same name in a one-row frame.

    Args:
        model_path: Path of the saved model to load. Defaults to the model
            file this notebook was originally written against; pass the path
            of a newly saved model to score with that one instead.

    Returns:
        A string of the form
        "Prediction: <label> |Probability of Bad Loan: <p> |Probability of Good Loan: <p>".
    """
    # connect to the model scoring service
    h2o.connect()
    # open the saved model
    loanModel = h2o.load_model(path=model_path)
    # define a one-row feature vector to evaluate with the model
    newData = pd.DataFrame({'Loan_Amount' : Loan_Amount,
                            'Term' : Term,
                            'Interest_Rate' : Interest_Rate,
                            'Employment_Years' : Employment_Years,
                            'Home_Ownership' : Home_Ownership,
                            'Annual_Income' : Annual_Income,
                            'Verification_Status' : Verification_Status,
                            'Loan_Purpose' : Loan_Purpose,
                            'State' : State,
                            'Debt_to_Income' : Debt_to_Income,
                            'Delinquent_2yr' : Delinquent_2yr,
                            'Revolving_Cr_Util' : Revolving_Cr_Util,
                            'Total_Accounts' : Total_Accounts,
                            'Longest_Credit_Length' : Longest_Credit_Length}, index=[0])
    # evaluate the feature vector using the model
    predictions = loanModel.predict(h2o.H2OFrame(newData))
    predictionsOut = h2o.as_list(predictions, use_pandas=False)
    # Row 1 holds the scored application (row 0 is presumably the header row).
    # Its first three entries are read as: predicted label, P(bad), P(good).
    scoredRow = predictionsOut[1]
    prediction = scoredRow[0]
    probabilityBad = scoredRow[1]
    probabilityGood = scoredRow[2]
    return "Prediction: " + str(prediction) + " |Probability of Bad Loan: " + str(probabilityBad) + " |Probability of Good Loan: " + str(probabilityGood)
In [ ]:
# Example application to score.
# Loan terms
Loan_Amount = 5000
Term = "60 months"
Interest_Rate = 13
Loan_Purpose = "credit_card"

# Applicant profile
Employment_Years = 5
Home_Ownership = "RENT"
Annual_Income = 75000
Verification_Status = "VERIFIED - income"
State = "CA"

# Credit history (Debt_to_Income and Delinquent_2yr are passed as strings)
Debt_to_Income = "16.12"
Delinquent_2yr = "0"
Revolving_Cr_Util = 37
Total_Accounts = 6
Longest_Credit_Length = 97

# Keyword arguments keep each value tied to the parameter it is meant for.
approve_loan(
    Loan_Amount=Loan_Amount,
    Term=Term,
    Interest_Rate=Interest_Rate,
    Employment_Years=Employment_Years,
    Home_Ownership=Home_Ownership,
    Annual_Income=Annual_Income,
    Verification_Status=Verification_Status,
    Loan_Purpose=Loan_Purpose,
    State=State,
    Debt_to_Income=Debt_to_Income,
    Delinquent_2yr=Delinquent_2yr,
    Revolving_Cr_Util=Revolving_Cr_Util,
    Total_Accounts=Total_Accounts,
    Longest_Credit_Length=Longest_Credit_Length,
)