Real Life Case Example

Building Insight & Communicating

Frame

The busienss objective in this loan default case is to minimize the loss given default. So we need to define a loss function for our problem.

Let us take a simple assumption that we will make 10% of the amount if we make a successful loan, but we will only recover 80% of the loan amount if it defaults

Successful loan = amount * 10%
Default loan = - amount * 80%



In [1]:

    
#Load the libraries
import numpy as np
import pandas as pd
%matplotlib inline
from plotnine import *
import ipywidgets as widgets
from ipywidgets import interact, interactive,fixed, interact_manual, IntSlider









    



/Users/amitkaps/miniconda3/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools

Acquire



In [2]:

    
#Load the training dataset
df = pd.read_csv("../data/loan_unbalanced.csv")

Refine



In [3]:

    
df.isnull().sum()









    Out[3]:





default        0
amount         0
grade          0
years        809
ownership      0
income         0
age            0
dtype: int64



In [4]:

    
df.years = df.years.fillna(np.mean(df.years))

Explore



In [5]:

    
pd.crosstab(df.default, df.grade)



In [6]:

    
ggplot(df) + aes('grade', 'default', color='default') + geom_col() + facet_wrap('default')









    












    Out[6]:





<ggplot: (299238776)>

Transform



In [7]:

    
from sklearn.model_selection import train_test_split



In [8]:

    
X = df.iloc[:,1:]
y = df.iloc[:,0]



In [9]:

    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y )



In [10]:

    
df_train = df[df.index.isin(X_train.index)]
df_test = df[df.index.isin(X_test.index)]



In [11]:

    
df_test.head()



In [12]:

    
X_test.head()

Model



In [13]:

    
from sklearn.ensemble import RandomForestClassifier



In [14]:

    
from sklearn.externals import joblib

# read the encoders and the model
grade_encoder = joblib.load("../notebooks/le_grade.pkl")
ownership_encoder = joblib.load("../notebooks/le_ownership.pkl")



In [15]:

    
def train_model(X,y):
    """Trains a random forest model based on the training data
    """
    X_copy = X.copy()
    
     # encoders the features 
    X_copy['grade_encoded'] = grade_encoder.transform(X_copy.grade)
    X_copy['ownership_encoded'] = ownership_encoder.transform(X_copy.ownership)
    X_t = X_copy[['amount', 'grade_encoded', 'years', 'ownership_encoded', 'income', 'age']]
    
    # Train the model
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(n_estimators = 100, class_weight= {0: 1, 1: 1000})
    clf = clf.fit(X_t,y)
    
    return clf



In [16]:

    
model = train_model(X_train, y_train)



In [17]:

    
clf = train_model(X_train, y_train)



In [18]:

    
def predict_test(X, y, model):
    """Returns the probablity of default for the dataframe based on a model.
    """
    
    X_copy = X.copy()
    df_copy = df.copy()
    
    
    # encoders for 
    X_copy['grade_encoded'] = grade_encoder.transform(X_copy.grade)
    X_copy['ownership_encoded'] = ownership_encoder.transform(X_copy.ownership)
    
    # important to pass the features in the same order as we built the model
    X_t = X_copy[['amount', 'grade_encoded', 'years', 'ownership_encoded', 'income', 'age']]
    
    # probablity for defaulting
    p1 = model.predict_proba(X_t)[:, 1]
    pred_df = pd.DataFrame({"default": np.array(y), "amount": np.array(X.amount), "proba": p1})
    
    # Binning the probability & counts in each bin
    pred_df['proba_bin'] = np.round(pred_df.proba*100, decimals=0)/100
    pred_df['proba_bin_count'] = (pred_df.sort_values(by = ["proba_bin", "default"])
                                        .groupby(['proba_bin'])
                                        .cumcount() + 1)
    
    return pred_df



In [19]:

    
test_out = predict_test(X_test, y_test, clf)



In [20]:

    
test_out.head()









    Out[20]:







  
    
      
      amount
      default
      proba
      proba_bin
      proba_bin_count
    
  
  
    
      0
      9600
      0
      0.33
      0.33
      1
    
    
      1
      5000
      0
      0.07
      0.07
      1
    
    
      2
      1200
      0
      0.07
      0.07
      2
    
    
      3
      10000
      0
      0.15
      0.15
      1
    
    
      4
      5000
      0
      0.03
      0.03
      1



In [21]:

    
def get_test_prediction(df, threshold): 
    ''' Code the accuracy based on threshold
    '''
    
    df1 = df.copy()
    
    ## Get class prediction
    df1['pred'] = 0
    df1.loc[df1.proba_bin>=threshold, 'pred'] = 1
    
    ## Make input-ouput code
    df1['pred_class'] = "00"
    df1.loc[(df1.default == 0) & (df1.pred == 1), 'pred_class'] = "01"
    df1.loc[(df1.default == 1) & (df1.pred == 0), 'pred_class'] = "10"
    df1.loc[(df1.default == 1) & (df1.pred == 1), 'pred_class'] = "11"

    return df1



In [22]:

    
get_test_prediction(test_out, 0.4).head()









    Out[22]:







  
    
      
      amount
      default
      proba
      proba_bin
      proba_bin_count
      pred
      pred_class
    
  
  
    
      0
      9600
      0
      0.33
      0.33
      1
      0
      00
    
    
      1
      5000
      0
      0.07
      0.07
      1
      0
      00
    
    
      2
      1200
      0
      0.07
      0.07
      2
      0
      00
    
    
      3
      10000
      0
      0.15
      0.15
      1
      0
      00
    
    
      4
      5000
      0
      0.03
      0.03
      1
      0
      00



In [27]:

    
def plot_test_prediction(df, threshold):
    df1 = get_test_prediction(df,threshold)
    g = (ggplot(df1) + 
         aes('proba_bin', 'proba_bin_count', fill='pred_class') + 
         geom_tile(color = "white") + scale_fill_manual(['#ca0020','#f4a582','#92c5de','#0571b0']) +
         xlab("threshold") + ylab("count") )
    print(g)



In [28]:

    
plot_test_prediction(test_out, 0.2)









    












    



<ggplot: (-9223372036549969390)>



In [29]:

    
threshold_widget = widgets.FloatSlider(min=0.0, max=1.0, step=0.01, value=0.2)



In [30]:

    
interactive(plot_test_prediction, df=fixed(test_out), threshold = threshold_widget)

Insight

Lets calculate the loss for the calculation



In [31]:

    
def loss(df, threshold):
    df1 = get_test_prediction(df, threshold)
    success = df1.loc[(df1.pred == 0) & (df1.default == 0), 'amount' ].sum() * 0.1
    failure = df1.loc[(df1.pred == 0) & (df1.default == 1), 'amount' ].sum() * -0.8
    loss = success + failure
        
    return loss



In [32]:

    
loss(test_out, 0.4)









    Out[32]:





129432.5



In [35]:

    
def plot_loss(df):
    thresh = np.arange(0,1,0.01)
    loss_calc = [loss(test_out, x) for x in thresh]
    df_ = pd.DataFrame({"threshold": thresh, "loss": loss_calc})        
    return df_



In [36]:

    
lossdf = plot_loss(test_out)



In [37]:

    
ggplot(lossdf) + aes('threshold', 'loss') + geom_line()









    












    Out[37]:





<ggplot: (300742078)>



In [ ]:

grade	A	B	C	D	E	F	G
default
0	9084	8344	4904	2651	692	155	35
1	565	985	844	580	176	56	21

	default	amount	grade	years	ownership	income	age
6	1	9000	C	0.0	RENT	30000.00	22
11	0	3600	A	13.0	MORTGAGE	110000.00	27
13	0	9200	A	6.0	RENT	77385.19	24
16	0	10000	B	5.0	RENT	50000.00	22
21	0	8500	B	0.0	RENT	25000.00	24

	amount	grade	years	ownership	income	age
28314	9600	C	4.0	RENT	78000.0	27
5408	5000	C	3.0	RENT	32000.0	23
27277	1200	A	0.0	OTHER	40000.0	27
17595	10000	C	7.0	RENT	43080.0	27
13934	5000	A	21.0	MORTGAGE	72000.0	22

	amount	proba	proba_bin	proba_bin_count
0	9600	0.33	0.33	1
1	5000	0.07	0.07	1
2	1200	0.07	0.07	2
3	10000	0.15	0.15	1
4	5000	0.03	0.03	1