# Real Life Case Example

## Frame

The busienss objective in this loan default case is to minimize the loss given default. So we need to define a loss function for our problem.

Let us take a simple assumption that we will make 10% of the amount if we make a successful loan, but we will only recover 80% of the loan amount if it defaults

• Successful loan = amount * 10%
• Default loan = - amount * 80%
``````

In [1]:

import numpy as np
import pandas as pd
%matplotlib inline
from plotnine import *
import ipywidgets as widgets
from ipywidgets import interact, interactive,fixed, interact_manual, IntSlider

``````
``````

/Users/amitkaps/miniconda3/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
from pandas.core import datetools

``````

## Acquire

``````

In [2]:

``````

## Refine

``````

In [3]:

df.isnull().sum()

``````
``````

Out[3]:

default        0
amount         0
years        809
ownership      0
income         0
age            0
dtype: int64

``````
``````

In [4]:

df.years = df.years.fillna(np.mean(df.years))

``````

# Explore

``````

In [5]:

``````
``````

Out[5]:

text-align: right;
}

text-align: left;
}

.dataframe tbody tr th {
vertical-align: top;
}

A
B
C
D
E
F
G

default

0
9084
8344
4904
2651
692
155
35

1
565
985
844
580
176
56
21

``````
``````

In [6]:

ggplot(df) + aes('grade', 'default', color='default') + geom_col() + facet_wrap('default')

``````
``````

Out[6]:

<ggplot: (299238776)>

``````

# Transform

``````

In [7]:

from sklearn.model_selection import train_test_split

``````
``````

In [8]:

X = df.iloc[:,1:]
y = df.iloc[:,0]

``````
``````

In [9]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y )

``````
``````

In [10]:

df_train = df[df.index.isin(X_train.index)]
df_test = df[df.index.isin(X_test.index)]

``````
``````

In [11]:

``````
``````

Out[11]:

text-align: right;
}

text-align: left;
}

.dataframe tbody tr th {
vertical-align: top;
}

default
amount
years
ownership
income
age

6
1
9000
C
0.0
RENT
30000.00
22

11
0
3600
A
13.0
MORTGAGE
110000.00
27

13
0
9200
A
6.0
RENT
77385.19
24

16
0
10000
B
5.0
RENT
50000.00
22

21
0
8500
B
0.0
RENT
25000.00
24

``````
``````

In [12]:

``````
``````

Out[12]:

text-align: right;
}

text-align: left;
}

.dataframe tbody tr th {
vertical-align: top;
}

amount
years
ownership
income
age

28314
9600
C
4.0
RENT
78000.0
27

5408
5000
C
3.0
RENT
32000.0
23

27277
1200
A
0.0
OTHER
40000.0
27

17595
10000
C
7.0
RENT
43080.0
27

13934
5000
A
21.0
MORTGAGE
72000.0
22

``````

## Model

``````

In [13]:

from sklearn.ensemble import RandomForestClassifier

``````
``````

In [14]:

from sklearn.externals import joblib

# read the encoders and the model

``````
``````

In [15]:

def train_model(X,y):
"""Trains a random forest model based on the training data
"""
X_copy = X.copy()

# encoders the features
X_copy['ownership_encoded'] = ownership_encoder.transform(X_copy.ownership)
X_t = X_copy[['amount', 'grade_encoded', 'years', 'ownership_encoded', 'income', 'age']]

# Train the model
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators = 100, class_weight= {0: 1, 1: 1000})
clf = clf.fit(X_t,y)

return clf

``````
``````

In [16]:

model = train_model(X_train, y_train)

``````
``````

In [17]:

clf = train_model(X_train, y_train)

``````
``````

In [18]:

def predict_test(X, y, model):
"""Returns the probablity of default for the dataframe based on a model.
"""

X_copy = X.copy()
df_copy = df.copy()

# encoders for
X_copy['ownership_encoded'] = ownership_encoder.transform(X_copy.ownership)

# important to pass the features in the same order as we built the model
X_t = X_copy[['amount', 'grade_encoded', 'years', 'ownership_encoded', 'income', 'age']]

# probablity for defaulting
p1 = model.predict_proba(X_t)[:, 1]
pred_df = pd.DataFrame({"default": np.array(y), "amount": np.array(X.amount), "proba": p1})

# Binning the probability & counts in each bin
pred_df['proba_bin'] = np.round(pred_df.proba*100, decimals=0)/100
pred_df['proba_bin_count'] = (pred_df.sort_values(by = ["proba_bin", "default"])
.groupby(['proba_bin'])
.cumcount() + 1)

return pred_df

``````
``````

In [19]:

test_out = predict_test(X_test, y_test, clf)

``````
``````

In [20]:

``````
``````

Out[20]:

text-align: right;
}

text-align: left;
}

.dataframe tbody tr th {
vertical-align: top;
}

amount
default
proba
proba_bin
proba_bin_count

0
9600
0
0.33
0.33
1

1
5000
0
0.07
0.07
1

2
1200
0
0.07
0.07
2

3
10000
0
0.15
0.15
1

4
5000
0
0.03
0.03
1

``````
``````

In [21]:

def get_test_prediction(df, threshold):
''' Code the accuracy based on threshold
'''

df1 = df.copy()

## Get class prediction
df1['pred'] = 0
df1.loc[df1.proba_bin>=threshold, 'pred'] = 1

## Make input-ouput code
df1['pred_class'] = "00"
df1.loc[(df1.default == 0) & (df1.pred == 1), 'pred_class'] = "01"
df1.loc[(df1.default == 1) & (df1.pred == 0), 'pred_class'] = "10"
df1.loc[(df1.default == 1) & (df1.pred == 1), 'pred_class'] = "11"

return df1

``````
``````

In [22]:

``````
``````

Out[22]:

text-align: right;
}

text-align: left;
}

.dataframe tbody tr th {
vertical-align: top;
}

amount
default
proba
proba_bin
proba_bin_count
pred
pred_class

0
9600
0
0.33
0.33
1
0
00

1
5000
0
0.07
0.07
1
0
00

2
1200
0
0.07
0.07
2
0
00

3
10000
0
0.15
0.15
1
0
00

4
5000
0
0.03
0.03
1
0
00

``````
``````

In [27]:

def plot_test_prediction(df, threshold):
df1 = get_test_prediction(df,threshold)
g = (ggplot(df1) +
aes('proba_bin', 'proba_bin_count', fill='pred_class') +
geom_tile(color = "white") + scale_fill_manual(['#ca0020','#f4a582','#92c5de','#0571b0']) +
xlab("threshold") + ylab("count") )
print(g)

``````
``````

In [28]:

plot_test_prediction(test_out, 0.2)

``````
``````

<ggplot: (-9223372036549969390)>

``````
``````

In [29]:

threshold_widget = widgets.FloatSlider(min=0.0, max=1.0, step=0.01, value=0.2)

``````
``````

In [30]:

interactive(plot_test_prediction, df=fixed(test_out), threshold = threshold_widget)

``````
``````

var element = \$('#2214cf04-70ce-4ca6-825c-ae96b8f7fb48');

{"model_id": "cacdb6b2dd354eef9d99953bc620fe86", "version_major": 2, "version_minor": 0}

``````

## Insight

Lets calculate the loss for the calculation

``````

In [31]:

def loss(df, threshold):
df1 = get_test_prediction(df, threshold)
success = df1.loc[(df1.pred == 0) & (df1.default == 0), 'amount' ].sum() * 0.1
failure = df1.loc[(df1.pred == 0) & (df1.default == 1), 'amount' ].sum() * -0.8
loss = success + failure

return loss

``````
``````

In [32]:

loss(test_out, 0.4)

``````
``````

Out[32]:

129432.5

``````
``````

In [35]:

def plot_loss(df):
thresh = np.arange(0,1,0.01)
loss_calc = [loss(test_out, x) for x in thresh]
df_ = pd.DataFrame({"threshold": thresh, "loss": loss_calc})
return df_

``````
``````

In [36]:

lossdf = plot_loss(test_out)

``````
``````

In [37]:

ggplot(lossdf) + aes('threshold', 'loss') + geom_line()

``````
``````

Out[37]:

<ggplot: (300742078)>

``````
``````

In [ ]:

``````