The busienss objective in this loan default case is to minimize the loss given default. So we need to define a loss function for our problem.
Let us take a simple assumption that we will make 10% of the amount if we make a successful loan, but we will only recover 80% of the loan amount if it defaults
In [1]:
#Load the libraries
import numpy as np
import pandas as pd
%matplotlib inline
from plotnine import *
import ipywidgets as widgets
from ipywidgets import interact, interactive,fixed, interact_manual, IntSlider
In [2]:
#Load the training dataset
df = pd.read_csv("../data/loan_unbalanced.csv")
In [3]:
df.isnull().sum()
Out[3]:
In [4]:
df.years = df.years.fillna(np.mean(df.years))
In [5]:
pd.crosstab(df.default, df.grade)
Out[5]:
In [6]:
ggplot(df) + aes('grade', 'default', color='default') + geom_col() + facet_wrap('default')
Out[6]:
In [7]:
from sklearn.model_selection import train_test_split
In [8]:
X = df.iloc[:,1:]
y = df.iloc[:,0]
In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y )
In [10]:
df_train = df[df.index.isin(X_train.index)]
df_test = df[df.index.isin(X_test.index)]
In [11]:
df_test.head()
Out[11]:
In [12]:
X_test.head()
Out[12]:
In [13]:
from sklearn.ensemble import RandomForestClassifier
In [14]:
from sklearn.externals import joblib
# read the encoders and the model
grade_encoder = joblib.load("../notebooks/le_grade.pkl")
ownership_encoder = joblib.load("../notebooks/le_ownership.pkl")
In [15]:
def train_model(X,y):
"""Trains a random forest model based on the training data
"""
X_copy = X.copy()
# encoders the features
X_copy['grade_encoded'] = grade_encoder.transform(X_copy.grade)
X_copy['ownership_encoded'] = ownership_encoder.transform(X_copy.ownership)
X_t = X_copy[['amount', 'grade_encoded', 'years', 'ownership_encoded', 'income', 'age']]
# Train the model
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators = 100, class_weight= {0: 1, 1: 1000})
clf = clf.fit(X_t,y)
return clf
In [16]:
model = train_model(X_train, y_train)
In [17]:
clf = train_model(X_train, y_train)
In [18]:
def predict_test(X, y, model):
"""Returns the probablity of default for the dataframe based on a model.
"""
X_copy = X.copy()
df_copy = df.copy()
# encoders for
X_copy['grade_encoded'] = grade_encoder.transform(X_copy.grade)
X_copy['ownership_encoded'] = ownership_encoder.transform(X_copy.ownership)
# important to pass the features in the same order as we built the model
X_t = X_copy[['amount', 'grade_encoded', 'years', 'ownership_encoded', 'income', 'age']]
# probablity for defaulting
p1 = model.predict_proba(X_t)[:, 1]
pred_df = pd.DataFrame({"default": np.array(y), "amount": np.array(X.amount), "proba": p1})
# Binning the probability & counts in each bin
pred_df['proba_bin'] = np.round(pred_df.proba*100, decimals=0)/100
pred_df['proba_bin_count'] = (pred_df.sort_values(by = ["proba_bin", "default"])
.groupby(['proba_bin'])
.cumcount() + 1)
return pred_df
In [19]:
test_out = predict_test(X_test, y_test, clf)
In [20]:
test_out.head()
Out[20]:
In [21]:
def get_test_prediction(df, threshold):
''' Code the accuracy based on threshold
'''
df1 = df.copy()
## Get class prediction
df1['pred'] = 0
df1.loc[df1.proba_bin>=threshold, 'pred'] = 1
## Make input-ouput code
df1['pred_class'] = "00"
df1.loc[(df1.default == 0) & (df1.pred == 1), 'pred_class'] = "01"
df1.loc[(df1.default == 1) & (df1.pred == 0), 'pred_class'] = "10"
df1.loc[(df1.default == 1) & (df1.pred == 1), 'pred_class'] = "11"
return df1
In [22]:
get_test_prediction(test_out, 0.4).head()
Out[22]:
In [27]:
def plot_test_prediction(df, threshold):
df1 = get_test_prediction(df,threshold)
g = (ggplot(df1) +
aes('proba_bin', 'proba_bin_count', fill='pred_class') +
geom_tile(color = "white") + scale_fill_manual(['#ca0020','#f4a582','#92c5de','#0571b0']) +
xlab("threshold") + ylab("count") )
print(g)
In [28]:
plot_test_prediction(test_out, 0.2)
In [29]:
threshold_widget = widgets.FloatSlider(min=0.0, max=1.0, step=0.01, value=0.2)
In [30]:
interactive(plot_test_prediction, df=fixed(test_out), threshold = threshold_widget)
In [31]:
def loss(df, threshold):
df1 = get_test_prediction(df, threshold)
success = df1.loc[(df1.pred == 0) & (df1.default == 0), 'amount' ].sum() * 0.1
failure = df1.loc[(df1.pred == 0) & (df1.default == 1), 'amount' ].sum() * -0.8
loss = success + failure
return loss
In [32]:
loss(test_out, 0.4)
Out[32]:
In [35]:
def plot_loss(df):
thresh = np.arange(0,1,0.01)
loss_calc = [loss(test_out, x) for x in thresh]
df_ = pd.DataFrame({"threshold": thresh, "loss": loss_calc})
return df_
In [36]:
lossdf = plot_loss(test_out)
In [37]:
ggplot(lossdf) + aes('threshold', 'loss') + geom_line()
Out[37]:
In [ ]: