In [1]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)
# This line will hide code by default when the notebook is exported as HTML
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)
# This line will add a button to toggle visibility of code blocks, for use with the HTML export version
di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''', raw=True)
In [2]:
%matplotlib inline
In [3]:
# Import libraries
from __future__ import absolute_import, division, print_function
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import joblib
# Graphing Libraries
import matplotlib.pyplot as pyplt
import seaborn as sns
sns.set_style('whitegrid')
In [4]:
def plot_features_by_target(df, features, target):
    """Create a pair plot of the features by the target variable.

    Parameters
    ----------
    df: pandas DataFrame
    features: list of strings (cols of df)
    target: list of strings (single col of df)
    """
    sns.pairplot(df, x_vars=features, y_vars=target);
In [5]:
def get_outliers(feature):
    """Get the indices of outliers for a feature, if any exist.

    Uses Tukey's fences: a point is an outlier if it lies more than
    1.5 * IQR below Q1 or above Q3. Reads the global `data` frame.

    Parameters
    ----------
    feature: string, column name in `data`

    Returns the list of outlier row indices.
    """
    # For the feature, find the data points with extreme high or low values
    Q1 = data[feature].quantile(.25)
    Q3 = data[feature].quantile(.75)
    step = 1.5 * (Q3 - Q1)
    # Keep only the rows that fall outside the fences
    temp_data = data[~((data[feature] >= Q1 - step) & (data[feature] <= Q3 + step))]
    return temp_data.index.tolist()
In [6]:
def sample_data(num_sample, df, with_replacement=False):
    """Create a random sample of row positions from a table.

    Parameters
    ----------
    num_sample: int
    df: pandas DataFrame
    with_replacement: boolean

    Returns a list of randomly selected row positions.
    """
    df_index = []
    lst = np.arange(0, len(df), 1)
    for i in np.arange(0, num_sample, 1):
        # pick randomly from the whole table
        sample_index = np.random.choice(lst)
        if not with_replacement:
            # remove the choice that was selected so it cannot be drawn again
            lst = np.setdiff1d(lst, [sample_index])
        # store the chosen position
        df_index.append(sample_index)
    return df_index
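For reference, NumPy can draw the whole sample in a single call; a minimal equivalent sketch, assuming num_sample and df as above and sampling without replacement:
# one-call equivalent of sample_data(num_sample, df)
the_index = np.random.choice(len(df), size=num_sample, replace=False).tolist()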
In [7]:
# set figure size
pyplt.rcParams['figure.figsize'] = (6, 4)
In [8]:
dataPath = 'data'
df = joblib.load(dataPath+'/df_cleaned.pkl')
print ("Dataset has {} samples with {} features each.".format(*df.shape))
In [9]:
data = df
data.head(3)
Out[9]:
In [10]:
tmp = ['delinq_amnt', 'acc_now_delinq', 'chargeoff_within_12_mths', 'collections_12_mths_ex_med',
'tax_liens','policy_code']
data[tmp].describe()
Out[10]:
In [11]:
data['delinq_amnt'].plot(kind='kde')
pyplt.title('Density Estimation of Amount Delinquent')
pyplt.legend(loc='upper right', shadow=True, fontsize='medium')
pyplt.savefig('report/figures/delinq_amnt.png', dpi=200)
pyplt.close();
In [12]:
# drop these features because they are not informative
data.drop(tmp, axis=1, inplace=True)
data.shape
Out[12]:
First, I take a look at the summary statistics of some features that I think might help identify bad loans. These statistics show that almost all of the data points are zero, which the kernel density estimate confirms. There is no strong pattern here, so I am dropping these features.
In [13]:
# separate variables by type
date_vars = [x for x in data.columns if '_d' in x or '_cr_line' in x]
cat_vars = ['term','grade','sub_grade','emp_length','home_ownership','is_inc_v',
'pymnt_plan','purpose','addr_city','addr_state','initial_list_status',
'loan_rank', 'pub_rec_bankruptcies']
cat_vars_lookup = [x for x in data.columns if '_old' in x]
# separate continuous variables by their relative ranges
continuous_vars_0 = ['loan_amnt','funded_amnt','funded_amnt_inv']
continuous_vars_1 = ['installment','annual_inc','revol_bal',
'out_prncp','out_prncp_inv','total_pymnt','total_pymnt_inv','total_rec_prncp',
'total_rec_int','recoveries','collection_recovery_fee','last_pymnt_amnt']
continuous_vars_2 = ['int_rate', 'delinq_2yrs', 'dti', 'inq_last_6mths', 'open_acc', 'pub_rec',
'total_acc', 'total_rec_late_fee', 'revol_util']
In [16]:
data[continuous_vars_0].describe()
Out[16]:
In [17]:
data[continuous_vars_1].describe()
Out[17]:
Hypothesis: Clients with lower incomes are more apt to default on their loans.
Based on my hypothesis, I am starting my exploration with income. First, I check this feature for outliers. When I plot the raw feature, I can see that there are some: the distribution is extremely skewed, with a standard deviation of over \$500k.
I then trim the income data using Tukey's method (removing points more than 1.5 × IQR beyond the quartiles); this gives a better representation of the underlying data, and the resulting distribution is far less skewed. The majority of the clients in this dataset make an annual income of around \$65k.
I ultimately decided to relax the outlier trimming and remove only the top 1% of incomes, because I believe there may still be signal among the high earners that Tukey's rule would discard. As a result, the mean moves to around \$70k.
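A quick sanity check of the two trimming rules, as a sketch (it reuses get_outliers and the global data frame from above):
# Compare how many rows each outlier rule would remove
tukey_idx = get_outliers('annual_inc')                  # Tukey's fences
top = data['annual_inc'].quantile(.99)
pct_idx = data.loc[data['annual_inc'] > top].index      # top 1% only
print('Tukey removes {} rows; the top-1% rule removes {}.'.format(len(tukey_idx), len(pct_idx)))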
In [18]:
pyplt.rcParams['figure.figsize'] = (8, 8)
sns.distplot(data['annual_inc'])
pyplt.ylabel('percent per unit')
pyplt.xlabel('Annual income in dollars')
pyplt.xticks(rotation='vertical')
pyplt.title('Annual Income')
pyplt.savefig('report/figures/annual_income_raw.png', format='png', dpi=200)
pyplt.close();
#reset figure
pyplt.rcParams['figure.figsize'] = (6, 4)
data['annual_inc'].describe()
Out[18]:
In [19]:
# Clean income for outliers
mask = get_outliers('annual_inc')
df_inc = pd.DataFrame(index=data.index)
df_inc['income'] = data.annual_inc
df_inc.drop(index=mask, inplace=True)
In [20]:
pyplt.rcParams['figure.figsize'] = (8, 8)
sns.distplot(df_inc['income']);
pyplt.ylabel('percent per unit')
pyplt.xlabel('Annual income in dollars')
pyplt.xticks(rotation='vertical')
pyplt.title('Annual Income: Outliers removed (Tukey)')
pyplt.savefig('report/figures/annual_income_cleaned.png', format='png', dpi=200)
pyplt.close();
#reset figure
pyplt.rcParams['figure.figsize'] = (6, 4)
df_inc.describe()
Out[20]:
In [21]:
# Drop clients whose income is above the 99th percentile
top = data['annual_inc'].quantile(.99)
mask = data.loc[data['annual_inc'] > top].index
data.drop(mask, inplace=True)
In [22]:
pyplt.rcParams['figure.figsize'] = (8, 8)
sns.distplot(data['annual_inc'])
pyplt.ylabel('percent per unit')
pyplt.xlabel('Annual income in dollars')
pyplt.xticks(rotation='vertical')
pyplt.title('Annual Income: Top 1% removed')
pyplt.savefig('report/figures/annual_income.png', format='png', dpi=200)
pyplt.close();
#reset figure
pyplt.rcParams['figure.figsize'] = (6, 4)
data['annual_inc'].describe()
Out[22]:
Next, I plotted histograms of all the continuous variables, separated by their relative ranges. From the first set of histograms, I can see that the total amount of the loan, the amount funded, and the amount committed by investors (loan_amnt, funded_amnt, and funded_amnt_inv) are nearly identical. It comes as no surprise that these features are also highly correlated. As a result, when it comes to modeling I will have to either pick one of them, or average all three into a new feature, as sketched below.
The features for the remaining outstanding principal (those prefixed out_prncp) and for payments received on the loans (those prefixed total_) are also very similar in the shapes of their distributions. From these sets of histograms, I can tell that most loans are around \$8k to \$20k.
From the third set of histograms, the majority of clients in the Lending Club dataset have very few 30+ days past-due incidences of delinquency in the last two years (delinq_2yrs). The distribution of the debt-to-income ratio, dti, is fairly symmetric, with most clients near a dti of 15%. The distribution of interest rates, int_rate, is less uniform, with most loans carrying a rate around 15%.
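A minimal sketch of the averaging option mentioned above (the column name amnt_avg is my own, hypothetical choice):
# Average the three nearly identical amount features into one new feature
data['amnt_avg'] = data[['loan_amnt', 'funded_amnt', 'funded_amnt_inv']].mean(axis=1)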
In [23]:
g = data[continuous_vars_0].hist(xrot=90, figsize=[15,17]);
pyplt.subplots_adjust(top=0.9)
pyplt.suptitle('Feature Histograms: Part One', fontsize=16)
pyplt.savefig('report/figures/continuous_vars_0.png', format='png', dpi=200)
pyplt.close();
g = data[continuous_vars_1].hist(xrot=90, figsize=[15,17]);
pyplt.subplots_adjust(top=0.9)
pyplt.suptitle('Feature Histograms: Part Two', fontsize=16)
pyplt.savefig('report/figures/continuous_vars_1.png', format='png', dpi=200)
pyplt.close();
In [25]:
g = data[continuous_vars_2].hist(figsize=[15,15]);
pyplt.subplots_adjust(top=0.9)
pyplt.suptitle('Feature Histograms: Part Three', fontsize=16)
pyplt.savefig('report/figures/continuous_vars_2.png', format='png', dpi=200)
pyplt.close();
In [26]:
sns.distplot(data.loan_amnt);
pyplt.ylabel('percent per unit');
pyplt.title('Loan Amount');
pyplt.savefig('report/figures/loan_amnt.png', format='png', dpi=200)
pyplt.close();
In [27]:
sns.distplot(data.int_rate);
pyplt.ylabel('percent per unit');
pyplt.title('Interest Rate');
pyplt.savefig('report/figures/int_rate.png', format='png', dpi=200)
pyplt.close();
In [28]:
sns.distplot(data.dti);
pyplt.ylabel('percent per unit');
pyplt.title('Debt to Income Ratio');
pyplt.savefig('report/figures/dti.png', format='png', dpi=200)
pyplt.close();
In [30]:
pyplt.rcParams['figure.figsize'] = (8, 4)
corr = data[continuous_vars_1].corr()
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask, 1)] = True
with sns.axes_style("white"):
    ax = sns.heatmap(corr, mask=mask, annot=True, cmap='RdBu', fmt='.2f')
pyplt.xticks(rotation=70, ha='right');
pyplt.title('Feature Correlation: Part One');
pyplt.savefig('report/figures/corr_1.png', bbox_inches='tight', format='png', dpi=200)
pyplt.close();
In [31]:
pyplt.rcParams['figure.figsize'] = (12, 6)
corr = data[date_vars].corr()
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask, 1)] = True
with sns.axes_style("white"):
    ax = sns.heatmap(corr, mask=mask, annot=True, cmap='RdBu', fmt='.1f')
pyplt.xticks(rotation=70, ha='right');
pyplt.title('Feature Correlation: Part Two');
pyplt.savefig('report/figures/corr_2.png', bbox_inches='tight', format='png', dpi=200)
pyplt.close();
# reset figure size
pyplt.rcParams['figure.figsize'] = (6, 4)
In [32]:
percentage_good = data.loan_rank.value_counts()[0] / len(data)
percentage_bad = data.loan_rank.value_counts()[1] / len(data)
In [33]:
print ('{}% of the loans are good, with a ratio of {} to 1.'.format(round(percentage_good*100, 2),
round(percentage_good / percentage_bad)))
In [34]:
sns.distplot(data.loan_rank, bins=10, kde=False)
pyplt.title('Loan Status')
x = [0.05, 0.2, 0.4, 0.6, 0.8, 0.95]
labels = ["Good Loans", "", "", "", "", "Bad Loans"]
pyplt.xticks(x, labels)
pyplt.grid(False)
pyplt.savefig('report/figures/loans.png', format='png', dpi=200)
pyplt.close();
From the dataset, I can infer that most of the loans in the Lending Club dataset are good loans. The classes of good versus bad loans are highly imbalanced, with a ratio of almost 15 to 1.
To gain a deeper understanding of what separates defaulters from everyone else, I decided to uniformly sample the dataset to get balanced classes of good and bad loans (the sampling itself is done a few cells below).
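For reference, newer versions of pandas (1.1+) can draw the same kind of balanced sample in one grouped call; a minimal sketch, not the approach used in this notebook:
# 1,000 rows per class, mirroring the sample size used below
balanced = data.groupby('loan_rank', group_keys=False).sample(n=1000, random_state=42)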
To explore what separates the two groups, I plotted their incomes. Both distributions have long tails and overlap a great deal. From this graph, it appears that clients who default usually have lower incomes, and that as income increases, the rate of default decreases.
Before I can make this determination conclusively, I need to know whether the difference I am seeing is just chance variation or a real difference between the distributions in the population. To decide, I perform a hypothesis test using the Mann-Whitney U test, because the distributions are not normal.
Null hypothesis: In the population, the distribution of annual incomes is the same for clients who default and those who do not. The difference in the sample is due to chance.
Alternative hypothesis: The two distributions are different in the population.
I performed the test and got a p-value of effectively zero. As a result, I can reject the null hypothesis and conclude that, in the population, the distributions of annual incomes of defaulters and non-defaulters are different.
Since income is a feature that separates defaulters from non-defaulters, I want to see if there is a relationship between income and the amounts clients request for loans. To do this, I plotted income against loan amount and fitted a regression line. From the graph, we can see a somewhat positive relationship between the two variables: as income goes up, so does the amount requested.
Lending Club grades its loans on two scales. The first, grade, ranges from A to G, which in this dataset I have coded 0 to 6. Each grade is further broken down into several smaller bins; these are captured in the sub_grade feature, which has been coded similarly from 0 to n. The distribution of the sub grades has a long tail, with most of the loans between sub grades 3 and 15.
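As an illustration, such a coding could be produced from the raw letter grades along these lines (a hypothetical sketch: the grade_old and sub_grade_old column names follow this notebook's _old lookup convention but are my assumption, as is the A1–G5 sub-grade layout):
# Map letter grades ('A'..'G') and sub grades ('A1'..'G5') to integer codes
grade_order = list('ABCDEFG')
sub_order = [g + str(n) for g in grade_order for n in range(1, 6)]
df['grade'] = df['grade_old'].map({g: i for i, g in enumerate(grade_order)})
df['sub_grade'] = df['sub_grade_old'].map({s: i for i, s in enumerate(sub_order)})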
In this dataset, I have coded good loans as 0 and bad loans as 1. On average, good-loan clients have higher incomes, for each grade category they fall into, than clients who tend to default.
Next, consider loans by grade and by the purpose for which the loan was taken. When a loan is taken to consolidate debt, we see that as the grades progress from low risk to high risk, clients generally request larger loan amounts.
Once I had a good idea of the underlying characteristics of my data, I moved on to understanding the most important features. I trained a Random Forest classifier and plotted its feature importances. From this graph, the most important features were those derived from the dates on which the last loan payments were made. The interest rate, the sub grade of the loan, and the debt-to-income ratio of the client were all important. It turns out income is not as important in determining whether a client will default.
In [36]:
good = data['loan_rank'] == 0
bad = data['loan_rank'] == 1
In [37]:
data_good = data.loc[good, :]
data_bad = data.loc[bad, :]
data_good.reset_index(inplace=True)
data_bad.reset_index(inplace=True)
In [38]:
sample_size = 1000
In [39]:
the_index = sample_data(sample_size, data_good)
data_good_ = data_good.iloc[the_index, :]
the_index = sample_data(sample_size, data_bad)
data_bad_ = data_bad.iloc[the_index, :]
In [40]:
data_ = pd.concat([data_good_, data_bad_])
data_.reset_index(inplace=True)
In [41]:
sns.distplot(data_.loan_rank, bins=10, kde=False)
pyplt.title('Loan Status: Classes Balanced')
x = [0.05, 0.2, 0.4, 0.6, 0.8, 0.95]
labels = ["Good Loans", "", "", "", "", "Bad Loans"]
pyplt.xticks(x, labels)
pyplt.grid(False)
pyplt.savefig('report/figures/balanced_loans.png', format='png', dpi=200)
pyplt.close();
In [42]:
good = data_['loan_rank'] == 0
bad = data_['loan_rank'] == 1
df_good = pd.DataFrame()
df_bad = pd.DataFrame()
df_good['good_loans'] = data_.loc[good, 'annual_inc']
df_bad['bad_loans'] = data_.loc[bad, 'annual_inc']
In [43]:
income_bin = np.arange(2e+04, 25e+04, 2e+04)
In [44]:
df_bad['bad_loans'].plot.hist(bins=income_bin, density=True, alpha=0.8)
df_good['good_loans'].plot.hist(bins=income_bin, density=True, alpha=0.8)
pyplt.ylabel('percent per dollar')
pyplt.xlabel('Annual Income, USD')
pyplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
pyplt.suptitle('Annual Incomes of Good and Bad Loan Clients', fontsize=12)
pyplt.savefig('report/figures/good_bad.png', format='png', bbox_inches='tight', dpi=200);
pyplt.close();
In [45]:
a = df_bad['bad_loans'].values
b = df_good['good_loans'].values
In [46]:
import scipy.stats as st
# Two-sided test: are the two income distributions different?
statistic, pvalue = st.mannwhitneyu(a, b, alternative='two-sided')
print('P-value: %.2f' % pvalue)
In [47]:
g = sns.jointplot(data_['loan_amnt'], data_['annual_inc'], kind="reg", size=5, space=0)
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('Loan Amount by Income', fontsize=14)
pyplt.savefig('report/figures/loan_inc.png', bbox_inches='tight', format='png', dpi=200)
pyplt.close();
In [48]:
g = sns.jointplot(data_['annual_inc'], data_['int_rate'], kind="reg", size=5, space=0)
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('Income by Interest Rate', fontsize=14)
pyplt.savefig('report/figures/inc_int_rate.png', bbox_inches='tight', format='png', dpi=200)
pyplt.close();
In [49]:
g = sns.jointplot(data_['annual_inc'], data_['sub_grade'], kind="reg", size=5, space=0)
g.fig.suptitle('Income by Loan Sub Grade', fontsize=14)
pyplt.savefig('report/figures/inc_sub_grade.png', bbox_inches='tight', format='png', dpi=200)
pyplt.close();
In [50]:
g = sns.jointplot(data_['int_rate'], data_['sub_grade'], kind="kde", size=5, space=0)
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('Interest Rate by Loan Sub Grade', fontsize=14)
pyplt.savefig('report/figures/sub_grade_int_rate.png', bbox_inches='tight', format='png', dpi=200)
pyplt.close();
In [51]:
data_[cat_vars].describe()
Out[51]:
In [52]:
pyplt.rcParams['figure.figsize'] = (12, 6)
sns.stripplot(x='sub_grade', y='loan_amnt', hue='loan_rank', data=data_);
pyplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);
pyplt.title('Bad Loans By Amount and Sub Grade', fontsize=14)
pyplt.savefig('report/figures/bad_loan_sub_grade.png',bbox_inches='tight', format='png', dpi=200)
pyplt.close();
# reset figure size
pyplt.rcParams['figure.figsize'] = (6, 4)
In [53]:
pyplt.rcParams['figure.figsize'] = (10, 4)
sns.countplot(x='sub_grade', data=data, hue='grade');
pyplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);
pyplt.title('Loan Sub Grades', fontsize=14)
pyplt.savefig('report/figures/sub_grade.png',bbox_inches='tight', format='png', dpi=200)
pyplt.close();
# reset figure size
pyplt.rcParams['figure.figsize'] = (6, 4)
In [54]:
sns.factorplot(x='grade', y='annual_inc', data=data_, hue='loan_rank');
pyplt.title('Annual income by Grade', fontsize=14)
pyplt.savefig('report/figures/inc_grade.png', format='png', dpi=200)
pyplt.close();
In [55]:
g = sns.factorplot(x='grade', y='loan_amnt', data=data_, hue='loan_rank',
col='purpose_old', col_wrap=4, kind='box');
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('Loans By Grade and Purpose', fontsize=20)
pyplt.savefig('report/figures/loan_grade_purpose.png', bbox_inches='tight', format='png', dpi=200)
pyplt.close();
In [56]:
g = sns.factorplot(x='grade', y='loan_amnt', data=data_, hue='loan_rank',
col='home_ownership_old', col_wrap=4, kind='box');
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Loans By Home Ownership', fontsize=15)
pyplt.savefig('report/figures/loan_home.png', format='png', dpi=200)
pyplt.close();
In [57]:
pyplt.rcParams['figure.figsize'] = (12, 8)
sns.stripplot(x='addr_state_old', y='loan_amnt', hue='loan_rank', data=data_, size=4, jitter=True);
# reset figure size
pyplt.rcParams['figure.figsize'] = (6, 4)
pyplt.close();
In [62]:
y = data_['loan_rank']
print ("\nLabel values:-")
y.head()
Out[62]:
In [76]:
features = cat_vars + ['loan_amnt', 'installment','annual_inc','revol_bal',
'out_prncp','total_pymnt','recoveries','collection_recovery_fee','last_pymnt_amnt'] + ['int_rate','delinq_2yrs',
'dti','inq_last_6mths','total_acc','revol_util'] + ['accept_d_months',
'accept_d_days',
'earliest_cr_line_months',
'earliest_cr_line_days',
'last_pymnt_d_months',
'last_pymnt_d_days',
'accept_d_num_day',
'accept_d_week_of_year',
'list_d_num_day',
'list_d_week_of_year',
'exp_d_num_day',
'exp_d_week_of_year',
'issue_d_num_day',
'issue_d_week_of_year',
'last_pymnt_d_num_day',
'last_pymnt_d_week_of_year',
'last_credit_pull_d_num_day',
'last_credit_pull_d_week_of_year']
In [79]:
X = data_[features].copy()
X.drop(['loan_rank'], axis=1, inplace=True)  # loan_rank is the target, not a feature
print ("\nFeature values:-")
X.head()
Out[79]:
In [93]:
from time import time, gmtime, strftime
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.preprocessing import RobustScaler
from sklearn.calibration import CalibratedClassifierCV
In [88]:
from sklearn.model_selection import train_test_split
def shuffle_split_data(X, y):
    """Shuffle and split the data into 1,500 training and 500 testing rows
    (a 75/25 split of the 2,000-row balanced sample), then return the subsets."""
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=1500, random_state=42)
    # Return the training and testing data subsets
    return X_train, y_train, X_test, y_test
X_train, y_train, X_test, y_test = shuffle_split_data(X, y)
In [95]:
models = {
    'RandomForest': RandomForestClassifier(n_estimators=10, n_jobs=-1),
}
scaler = RobustScaler()
X_train_transform = scaler.fit_transform(X_train)
X_test_transform = scaler.transform(X_test)  # transform only: never fit the scaler on test data
print(X_train_transform.shape)
print(X_test_transform.shape)
In [96]:
print('CLASSIFICATION RESULTS OF BASELINE CLASSIFIERS\n')
print('{:20}{:^15}{:^10}{:^10}'.format('CLASSIFIER', 'MEAN SCORE %', 'STD DEV %', 'TIME'))
for clf_name, clf in models.items():
    t0 = time()
    results = cross_val_score(clf, X_train_transform, y_train, cv=5)
    t1 = time() - t0
    print('{:20}{:^15.2f}{:^10.2f}{:>10.2f}secs'.format(clf_name, results.mean()*100, results.std()*100, t1))
In [105]:
t0 = time()
clf = RandomForestClassifier(n_estimators=10, n_jobs=-1)
calibrated_clf = CalibratedClassifierCV(clf, method='sigmoid', cv=5)
calibrated_clf.fit(X_train_transform, y_train)
final_preds = calibrated_clf.predict(X_test_transform)
precision, recall, fbeta_score, support = score(y_test, final_preds)
print ("Precision:{:10.3f}\nRecall: {:^10.3f}\nF Score{:^10.3f}".format(precision.mean()*100,
recall.mean()*100, fbeta_score.mean()*100))
In [100]:
clf.fit(X_train_transform, y_train)
importances = clf.feature_importances_
In [109]:
pyplt.rcParams['figure.figsize'] = (8, 16)
importance_frame = pd.DataFrame({'Importance': importances, 'Feature': list(X.columns)})
importance_frame.sort_values(by='Importance', inplace=True)
ax = importance_frame.plot(kind='barh', x='Feature', color='deepskyblue')
pyplt.savefig('report/figures/feature_imp.png', bbox_inches='tight', format='png', dpi=200)
# reset figure size
pyplt.rcParams['figure.figsize'] = (6, 4)
pyplt.close();