In [1]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)

# This line will hide code by default when the notebook is exported as HTML
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

# This line will add a button to toggle visibility of code blocks, for use with the HTML export version
di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''', raw=True)


Lending Club Data Explorations

Data Story

My goal in this data challenge is to understand the underlying characteristics of clients who tend to default on their loans. This knowledge will guide me in building a classifier that predicts an appropriate interest rate for a loan based on the client's profile.


In [2]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [3]:
# Import libraries
from __future__ import absolute_import, division, print_function

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.externals import joblib

# Graphing Libraries
import matplotlib.pyplot as pyplt
import seaborn as sns
sns.set_style('whitegrid')

In [4]:
def plot_features_by_target(df, features, target):
    """Create a pair plot of the features against the target variable.
    
    Parameters
    ----------
    df: pandas DataFrame
    features: list of strings (columns of df)
    target: list of strings (single column of df)
    """
    
    sns.pairplot(df, x_vars=features, y_vars=target);

In [5]:
def get_outliers(feature):
    """Get the indices of outliers for a feature, if they exist,
    using Tukey's IQR rule.
    
    Note: relies on the module-level `data` DataFrame defined below.
    
    Parameters
    ----------
    feature: str, column name in `data`
    
    Returns the list of outlier indices.
    """
    # Find the data points with extreme high or low values:
    # anything beyond 1.5 * IQR from the quartiles
    Q1 = data[feature].quantile(.25)
    Q3 = data[feature].quantile(.75)

    step = 1.5 * (Q3 - Q1)

    # Select the outliers
    temp_data = data[~((data[feature] >= Q1 - step) & (data[feature] <= Q3 + step))]

    return temp_data.index.tolist()

In [6]:
def sample_data(num_sample, df, with_replacement=False):
    """
    Create a random sample of row positions from a table
    
    Parameters
    ----------
    num_sample: int
    df: DataFrame
    with_replacement: boolean
    
    Returns a list of randomly chosen row positions
    """
    # np.random.choice handles both sampling modes directly
    return np.random.choice(len(df), size=num_sample,
                            replace=with_replacement).tolist()

In [7]:
# set figure size
pyplt.rcParams['figure.figsize'] = (6, 4)

Load Data


In [8]:
dataPath = 'data'
df = joblib.load(dataPath+'/df_cleaned.pkl')
print ("Dataset has {} samples with {} features each.".format(*df.shape))


Dataset has 329408 samples with 96 features each.

In [9]:
data = df

data.head(3)


Out[9]:
loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade emp_length home_ownership ... issue_d_week_of_year earliest_cr_line_num_day earliest_cr_line_day earliest_cr_line_week_of_year last_pymnt_d_num_day last_pymnt_d_day last_pymnt_d_week_of_year last_credit_pull_d_num_day last_credit_pull_d_day last_credit_pull_d_week_of_year
0 5000.0 5000.0 4975.0 0 10.65 162.87 1 6 1 4 ... 52 1 Tuesday 5 0 Monday 33 3 Thursday 34
1 2500.0 2500.0 2500.0 1 15.27 59.83 2 13 10 4 ... 52 3 Thursday 17 1 Tuesday 18 0 Monday 39
2 2400.0 2400.0 2400.0 0 15.96 84.33 2 14 1 4 ... 52 1 Tuesday 46 1 Tuesday 24 3 Thursday 34

3 rows × 96 columns

Exploratory Data Analysis


In [10]:
tmp = ['delinq_amnt', 'acc_now_delinq', 'chargeoff_within_12_mths', 'collections_12_mths_ex_med',
 'tax_liens','policy_code']
data[tmp].describe()


Out[10]:
delinq_amnt acc_now_delinq chargeoff_within_12_mths collections_12_mths_ex_med tax_liens policy_code
count 329408.000000 329408.000000 329408.000000 329408.000000 329408.000000 329408.0
mean 7.812348 0.003218 0.006378 0.006093 0.025944 1.0
std 527.203072 0.061566 0.091562 0.082624 0.314483 0.0
min 0.000000 0.000000 0.000000 0.000000 0.000000 1.0
25% 0.000000 0.000000 0.000000 0.000000 0.000000 1.0
50% 0.000000 0.000000 0.000000 0.000000 0.000000 1.0
75% 0.000000 0.000000 0.000000 0.000000 0.000000 1.0
max 70076.000000 5.000000 7.000000 5.000000 63.000000 1.0

In [11]:
data['delinq_amnt'].plot(kind='kde')
pyplt.title('Density Estimation of Amount Delinquent')
pyplt.legend(loc='upper right', shadow=True, fontsize='medium')
pyplt.savefig('report/figures/delinq_amnt.png', dpi=200)
pyplt.close();

In [12]:
# drop these features because they are not informative
data.drop(tmp, axis=1, inplace=True)
data.shape


Out[12]:
(329408, 90)

Visualize the distributions of the dataset

First, I take a look at the summary statistics of some features that I think might help identify bad loans. These statistics show that most of the data points are zero, which the kernel density estimate confirms. Since there is no strong pattern here, I drop these features.


In [13]:
# separate variables by type
date_vars = [x for x in data.columns if '_d' in x or '_cr_line' in x]

cat_vars = ['term','grade','sub_grade','emp_length','home_ownership','is_inc_v',
            'pymnt_plan','purpose','addr_city','addr_state','initial_list_status', 
            'loan_rank', 'pub_rec_bankruptcies']
cat_vars_lookup = [x for x in data.columns if '_old' in x]

# separate continuous variables by their relative ranges

continuous_vars_0 = ['loan_amnt','funded_amnt','funded_amnt_inv']
continuous_vars_1 = ['installment','annual_inc','revol_bal',
 'out_prncp','out_prncp_inv','total_pymnt','total_pymnt_inv','total_rec_prncp',
 'total_rec_int','recoveries','collection_recovery_fee','last_pymnt_amnt']
continuous_vars_2 = ['int_rate', 'delinq_2yrs', 'dti', 'inq_last_6mths', 'open_acc',  'pub_rec', 
 'total_acc', 'total_rec_late_fee', 'revol_util']

In [16]:
data[continuous_vars_0].describe()


Out[16]:
loan_amnt funded_amnt funded_amnt_inv
count 329408.000000 329408.000000 329408.000000
mean 14134.235507 14099.446811 14039.871005
std 8200.349559 8181.723245 8187.516489
min 500.000000 500.000000 0.000000
25% 8000.000000 8000.000000 8000.000000
50% 12000.000000 12000.000000 12000.000000
75% 20000.000000 19750.000000 19600.000000
max 35000.000000 35000.000000 35000.000000

In [17]:
data[continuous_vars_1].describe()


Out[17]:
installment annual_inc revol_bal out_prncp out_prncp_inv total_pymnt total_pymnt_inv total_rec_prncp total_rec_int recoveries collection_recovery_fee last_pymnt_amnt
count 329408.000000 3.294080e+05 3.294080e+05 329408.000000 329408.000000 329408.000000 329408.000000 329408.000000 329408.000000 329408.000000 329408.000000 329408.000000
mean 430.573416 7.277881e+04 1.579633e+04 8682.942381 8677.238573 6650.448760 6591.550492 4979.809248 1652.164164 18.171490 1.096919 1774.811078
std 243.265098 5.444319e+04 1.883157e+04 8141.098320 8137.292807 6647.210635 6584.340464 5681.996003 1792.366436 242.882808 49.261435 4211.293421
min 16.080000 3.000000e+03 0.000000e+00 0.000000 0.000000 32.740000 0.000000 0.000000 5.520000 0.000000 0.000000 0.000000
25% 254.910000 4.500000e+04 6.446000e+03 1119.550000 1118.242500 2025.380000 2017.177500 1233.290000 529.287500 0.000000 0.000000 282.290000
50% 380.730000 6.205200e+04 1.175500e+04 7015.110000 7011.970000 4482.955000 4448.015000 2964.040000 1070.275000 0.000000 0.000000 446.210000
75% 564.420000 8.800000e+04 2.009300e+04 13605.700000 13597.645000 8928.420827 8857.230000 6400.000000 2056.025000 0.000000 0.000000 745.320000
max 1409.990000 7.446395e+06 2.568995e+06 34706.760000 34706.760000 53438.202180 52613.400000 35000.030000 19199.940000 29282.070000 7002.190000 36115.200000

Income

Hypothesis: Clients with lower incomes are more apt to default on their loans.

Based on this hypothesis, I begin my exploration with income, first checking the feature for outliers. The raw distribution is extremely right-skewed: the standard deviation is about \$54k against a median of \$62k, and the maximum income exceeds \$7 million, so outliers are clearly present.

I first remove outliers using Tukey's IQR rule; this gives a much better representation of the underlying data, with far less skew. The majority of the clients in this dataset make an annual income of around \$60-65k.

I ultimately decided on a milder trim, removing only clients in the top 1% of incomes, because I believe there may be some signal in the upper tail of the distribution. After this trim, the mean income sits at around \$70k.
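
To make the Tukey fences concrete, here is the arithmetic that get_outliers applies, using the quartiles from the summaries below (a sketch, not a new cell):

# Sketch of the Tukey fence arithmetic inside get_outliers()
Q1 = data['annual_inc'].quantile(.25)   # $45,000
Q3 = data['annual_inc'].quantile(.75)   # $88,000
step = 1.5 * (Q3 - Q1)                  # 1.5 * $43,000 = $64,500
# upper fence: $88,000 + $64,500 = $152,500 -- exactly the max of the
# Tukey-cleaned income distribution shown below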


In [18]:
pyplt.rcParams['figure.figsize'] = (8, 8)

sns.distplot(data['annual_inc'])
pyplt.ylabel('percent per unit')
pyplt.xlabel('Annual income in dollars')
pyplt.xticks(rotation='vertical')
pyplt.title('Annual Income')
pyplt.savefig('report/figures/annual_income_raw.png', format='png', dpi=200)
pyplt.close();

#reset figure
pyplt.rcParams['figure.figsize'] = (6, 4)

data['annual_inc'].describe()


Out[18]:
count    3.294080e+05
mean     7.277881e+04
std      5.444319e+04
min      3.000000e+03
25%      4.500000e+04
50%      6.205200e+04
75%      8.800000e+04
max      7.446395e+06
Name: annual_inc, dtype: float64

In [19]:
# Clean income for outliers
mask = get_outliers('annual_inc')
df_inc = pd.DataFrame(index=data.index)
df_inc['income'] = data.annual_inc
df_inc.drop(mask, inplace=True)

In [20]:
pyplt.rcParams['figure.figsize'] = (8, 8)

sns.distplot(df_inc['income']); 
pyplt.ylabel('percent per unit')
pyplt.xlabel('Annual income in dollars')
pyplt.xticks(rotation='vertical')
pyplt.title('Annual Income: Outliers removed (Tukey)')
pyplt.savefig('report/figures/annual_income_cleaned.png', format='png', dpi=200)
pyplt.close();

#reset figure
pyplt.rcParams['figure.figsize'] = (6, 4)

df_inc.describe()


Out[20]:
income
count 315626.000000
mean 66172.901895
std 29010.425329
min 3000.000000
25% 45000.000000
50% 60000.000000
75% 84000.000000
max 152500.000000

In [21]:
# Drop clients whose income is beyond the 99th percentile

top = data['annual_inc'].quantile(.99)
mask = data.loc[data['annual_inc'] > top].index
data.drop(mask, inplace=True)

In [22]:
pyplt.rcParams['figure.figsize'] = (8, 8)

sns.distplot(data['annual_inc'])
pyplt.ylabel('percent per unit')
pyplt.xlabel('Annual income in dollars')
pyplt.xticks(rotation='vertical')
pyplt.title('Annual Income: Top 1% removed')
pyplt.savefig('report/figures/annual_income.png', format='png', dpi=200)
pyplt.close();

#reset figure
pyplt.rcParams['figure.figsize'] = (6, 4)

data['annual_inc'].describe()


Out[22]:
count    326316.000000
mean      70019.943953
std       35614.590537
min        3000.000000
25%       45000.000000
50%       62000.000000
75%       86000.000000
max      240000.000000
Name: annual_inc, dtype: float64

Continuous data exploration

Next, I plotted histograms of all the continuous variables, separated by their relative ranges. From the first set of histograms, I can see that the loan amount, the funded amount, and the total amount committed by investors (loan_amnt, funded_amnt, and funded_amnt_inv) are nearly identical, so it comes as no surprise that these features are also highly correlated. When it comes to modeling, I will have to either pick one of them or average all three into a new feature, as sketched below.

The features for the remaining outstanding principal (prefixed out_prncp) and for payments received on the loans (prefixed total_) are also very similar in the shape of their distributions. From these sets of histograms, I can tell that most loans are around \$8k to \$20k.

From the third set of histograms, the majority of clients in the Lending Club dataset have had very few 30+ days past-due incidences of delinquency in the last two years (delinq_2yrs). The distribution of the debt-to-income ratio, dti, is fairly symmetric, with most clients having a dti around 15%. The distribution of interest rates, int_rate, is less uniform, with most loans issued at an interest rate near 15%.
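
On the collinearity point above: one option, sketched here, is to average the three loan-size columns into a single feature. The loan_amnt_avg name is my own and the sketch is illustrative only; in the modeling section below I end up using loan_amnt alone.

# Hypothetical sketch: collapse the three nearly identical loan-size
# features into one averaged column (loan_amnt_avg is my own name)
loan_cols = ['loan_amnt', 'funded_amnt', 'funded_amnt_inv']
loan_amnt_avg = data[loan_cols].mean(axis=1)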


In [23]:
g = data[continuous_vars_0].hist(xrot=90, figsize=[15,17]);
pyplt.subplots_adjust(top=0.9)
pyplt.suptitle('Feature Histograms: Part One', fontsize=16)
pyplt.savefig('report/figures/continuous_vars_0.png', format='png', dpi=200)
pyplt.close();

g = data[continuous_vars_1].hist(xrot=90, figsize=[15,17]);
pyplt.subplots_adjust(top=0.9)
pyplt.suptitle('Feature Histograms: Part Two', fontsize=16)
pyplt.savefig('report/figures/continuous_vars_1.png', format='png', dpi=200)
pyplt.close();

In [25]:
g = data[continuous_vars_2].hist(figsize=[15,15]);
pyplt.subplots_adjust(top=0.9)
pyplt.suptitle('Feature Histograms: Part Three', fontsize=16)
pyplt.savefig('report/figures/continuous_vars_2.png', format='png', dpi=200)
pyplt.close();

In [26]:
sns.distplot(data.loan_amnt);
pyplt.ylabel('percent per unit');
pyplt.title('Loan Amount');
pyplt.savefig('report/figures/loan_amnt.png', format='png', dpi=200)
pyplt.close();

In [27]:
sns.distplot(data.int_rate);
pyplt.ylabel('percent per unit');
pyplt.title('Interest Rate');
pyplt.savefig('report/figures/int_rate.png', format='png', dpi=200)
pyplt.close();

In [28]:
sns.distplot(data.dti);
pyplt.ylabel('percent per unit');
pyplt.title('Debt to Income Ratio');
pyplt.savefig('report/figures/dti.png', format='png', dpi=200)
pyplt.close();

In [30]:
pyplt.rcParams['figure.figsize'] = (8, 4)
corr = data[continuous_vars_1].corr()
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask, 1)] = True

with sns.axes_style("white"):
    ax = sns.heatmap(corr, mask=mask, annot=True, cmap='RdBu', fmt='.2f')
    pyplt.xticks(rotation=70, ha='right');
    

pyplt.title('Feature Correlation: Part One');
pyplt.savefig('report/figures/corr_1.png', bbox_inches='tight', format='png', dpi=200)
pyplt.close();

In [31]:
pyplt.rcParams['figure.figsize'] = (12, 6)
corr = data[date_vars].corr()
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask, 1)] = True

with sns.axes_style("white"):
    ax = sns.heatmap(corr, mask=mask, annot=True, cmap='RdBu', fmt='.1f')
    pyplt.xticks(rotation=70, ha='right');
    

pyplt.title('Feature Correlation: Part Two');
pyplt.savefig('report/figures/corr_2.png', bbox_inches='tight', format='png', dpi=200)
pyplt.close();

# reset figure size
pyplt.rcParams['figure.figsize'] = (6, 4)

In [32]:
percentage_good = data.loan_rank.value_counts()[0] / len(data)
percentage_bad = data.loan_rank.value_counts()[1] / len(data)

In [33]:
print ('{}% of the loans are good, with a ratio of {} to 1.'.format(round(percentage_good*100, 2),
                                                            round(percentage_good / percentage_bad)))


93.71% of the loans are good, with a ratio of 15.0 to 1.

In [34]:
sns.distplot(data.loan_rank, bins=10, kde=False)
pyplt.title('Loan Status')
x = [0.05, 0.2, 0.4, 0.6, 0.8, 0.95]
labels = ["Good Loans", "", "", "", "", "Bad Loans"]
pyplt.xticks(x, labels)
pyplt.grid(False)
pyplt.savefig('report/figures/loans.png', format='png', dpi=200)
pyplt.close();

Separate the Good loans from the Bad loans

Most of the loans in the Lending Club dataset are good loans. The classes for good versus bad loans are highly imbalanced, with a ratio of almost 15 to 1.

Create equal samples of good and bad loans in a dataset

To gain a deeper understanding of what separates defaulters from everyone else, I decided to uniformly sample the dataset to get balanced classes of good and bad loans.
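
The cells further below do this with the sample_data helper defined earlier. As an aside, newer pandas (>= 0.16) can express the same balanced draw in one line; a sketch, not what this notebook runs:

# Sketch: draw 1,000 rows without replacement from each class
balanced = data.groupby('loan_rank').apply(lambda g: g.sample(n=1000))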

Are defaulters and non-defaulters two different groups based on annual income?

To answer this question, I plotted the incomes of the two groups. Both distributions have long tails and overlap substantially. From this graph, it appears that clients who default usually have lower incomes, and that as incomes increase, default rates decrease.

Before I can conclusively make this determination, I need to know if the difference I am seeing is just chance variation or a difference in the distributions in the population. To make this determination, I will perform a hypothesis test using the Mann Whitney U test because the distributions are not normal.

Null hypothesis: In the population, the distribution of annual incomes is the same for clients who default and those who do not. The difference in the sample is due to chance.
Alternative hypothesis: The two distributions are different in the population.

I performed the test and got a P-value of 0.00 to two decimal places, well below any conventional threshold. As a result, I can reject the null hypothesis and conclude that in the population, the distributions of annual incomes of defaulters and non-defaulters are different.
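
For reference, the manual doubling of the p-value in cell In [46] below reflects older SciPy, whose mannwhitneyu returns a one-sided p-value by default. A sketch of the same test on newer SciPy (>= 0.17), where a and b are the income arrays built in In [45]:

# Sketch: newer SciPy exposes the two-sided test directly
statistic, pvalue = st.mannwhitneyu(a, b, alternative='two-sided')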

Do wealthier clients request bigger loans?

Since income is a feature that separates defaulters from non-defaulters, I want to see if there is a relationship between income and the amount clients request for loans. To do this, I plotted income against loan amount and fitted a regression line. The graph shows a somewhat positive relationship between the two variables: as income goes up, so does the amount requested.

Understanding loan grades

Lending Club grades its loans on two scales. The first scale is called grade; it ranges from A to G, which in this dataset I have coded 0 to 6. Each grade is further broken down into several smaller bins, captured in the sub_grade feature, which has been coded similarly from 0 to n. The distribution of sub grades has a long tail, with most of the loans between sub grades 3 and 15.
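
The encoding itself is a simple letter-to-integer mapping. A sketch of what I describe (raw_df is hypothetical here; the letter grades live in the original Lending Club file, and df_cleaned.pkl already contains the encoded columns):

# Sketch of the grade encoding: A -> 0, B -> 1, ..., G -> 6
grade_map = {letter: code for code, letter in enumerate('ABCDEFG')}
# raw_df['grade'] = raw_df['grade'].map(grade_map)   # hypothetical raw frame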

In this dataset, I have coded good loans as 0 and bad loans as 1. On average, good-loan clients have higher incomes than clients who tend to default, within every grade category they fall into.

Consider loans by grade and the purpose for which the loan was taken. When a loan is taken out to consolidate debt, we see that as the grades progress from low risk to high risk, clients generally tend to increase their loan amounts.

What are the most important features between good and bad loans?

Once I had a good idea of the underlying characteristics of my data, I moved on to understanding the most important features. I implemented a Random Forest classifier and plotted its feature importances. From this graph, the most important features are those derived from the dates on which the last loan payments were made. The interest rate, the sub grade of the loan, and the client's debt-to-income ratio are all important features. It turns out income is not as important in determining whether a client will default.
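
The implementation is in cells In [100] and In [109] at the end of this notebook; a compact sketch of how the ranking falls out of a fitted forest:

# Sketch: rank features by importance from the fitted RandomForest clf
ranked = pd.Series(clf.feature_importances_, index=X.columns)
print(ranked.sort_values(ascending=False).head(10))   # ten strongest features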


In [36]:
good = data['loan_rank'] == 0
bad = data['loan_rank'] == 1

In [37]:
# boolean masks select the rows directly; no need for empty frames or .ix
data_good = data[good].reset_index()
data_bad = data[bad].reset_index()

In [38]:
sample_size = 1000

In [39]:
the_index = sample_data(sample_size, data_good)
data_good_ = data_good.iloc[the_index, :]

the_index = sample_data(sample_size, data_bad)
data_bad_ = data_bad.iloc[the_index, :]

In [40]:
data_ = pd.concat([data_good_, data_bad_])
data_.reset_index(inplace=True)

In [41]:
sns.distplot(data_.loan_rank, bins=10, kde=False)
pyplt.title('Loan Status: Classes Balanced')
x = [0.05, 0.2, 0.4, 0.6, 0.8, 0.95]
labels = ["Good Loans", "", "", "", "", "Bad Loans"]
pyplt.xticks(x, labels)
pyplt.grid(False)
pyplt.savefig('report/figures/balanced_loans.png', format='png', dpi=200)
pyplt.close();

In [42]:
good = data_['loan_rank'] == 0
bad = data_['loan_rank'] == 1

df_good = pd.DataFrame()
df_bad = pd.DataFrame()

df_good['good_loans'] = data_.loc[good, 'annual_inc']
df_bad['bad_loans'] = data_.loc[bad, 'annual_inc']

In [43]:
income_bin = np.arange(2e+04, 25e+04, 2e+04)

In [44]:
df_bad['bad_loans'].plot.hist(bins=income_bin, normed=True, alpha = 0.8)
df_good['good_loans'].plot.hist(bins=income_bin, normed=True, alpha = 0.8)
pyplt.ylabel('percent per dollar')
pyplt.xlabel('Annual Income, USD')
pyplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
pyplt.suptitle('Annual Incomes of Good and Bad Loan Clients', fontsize=12)
pyplt.savefig('report/figures/good_bad.png', format='png', bbox_inches='tight', dpi=200);
pyplt.close();

In [45]:
a = df_bad['bad_loans'].values
b = df_good['good_loans'].values

In [46]:
import scipy.stats as st

# older SciPy returns a one-sided p-value by default,
# so double it for the two-sided test
statistic, p1 = st.mannwhitneyu(a, b)
pvalue = p1 * 2
print ('P-value:%.2f'%pvalue)


P-value:0.00

In [47]:
g = sns.jointplot(data_['loan_amnt'], data_['annual_inc'],  kind="reg", size=5, space=0)
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('Loan Amount by Income', fontsize=14)
pyplt.savefig('report/figures/loan_inc.png', bbox_inches='tight', format='png', dpi=200)
pyplt.close();

In [48]:
g = sns.jointplot(data_['annual_inc'], data_['int_rate'], kind="reg", size=5, space=0)
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('Income by Interest Rate', fontsize=14)
pyplt.savefig('report/figures/inc_int_rate.png', bbox_inches='tight', format='png', dpi=200)
pyplt.close();

In [49]:
g = sns.jointplot(data_['annual_inc'], data_['sub_grade'], kind="reg", size=5, space=0)
g.fig.suptitle('Income by Loan Sub Grade', fontsize=14)
pyplt.savefig('report/figures/inc_sub_grade.png', bbox_inches='tight', format='png', dpi=200)
pyplt.close();

In [50]:
g = sns.jointplot(data_['int_rate'], data_['sub_grade'], kind="kde", size=5, space=0)
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('Interest Rate by Loan Sub Grade', fontsize=14)
pyplt.savefig('report/figures/sub_grade_int_rate.png', bbox_inches='tight', format='png', dpi=200)
pyplt.close();

Categorical variables


In [51]:
data_[cat_vars].describe()


Out[51]:
term grade sub_grade emp_length home_ownership is_inc_v pymnt_plan purpose addr_city addr_state initial_list_status loan_rank pub_rec_bankruptcies
count 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.0000 2000.000000 2000.000000
mean 0.277000 2.025500 12.058000 4.198500 2.061500 1.138000 0.005000 3.020000 14221.594500 22.508000 0.2000 0.500000 0.074000
std 0.447628 1.360064 6.736492 3.443258 1.935611 0.843393 0.070551 2.857566 8590.981577 14.632498 0.4001 0.500125 0.267507
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 144.000000 0.000000 0.0000 0.000000 0.000000
25% 0.000000 1.000000 7.000000 1.000000 0.000000 0.000000 0.000000 2.000000 6801.500000 9.000000 0.0000 0.000000 0.000000
50% 0.000000 2.000000 11.000000 3.000000 3.000000 1.000000 0.000000 2.000000 13918.000000 23.000000 0.0000 0.500000 0.000000
75% 1.000000 3.000000 16.000000 7.000000 4.000000 2.000000 0.000000 2.000000 21041.000000 34.000000 0.0000 1.000000 0.000000
max 1.000000 6.000000 34.000000 11.000000 4.000000 2.000000 1.000000 13.000000 32397.000000 49.000000 1.0000 1.000000 2.000000

In [52]:
pyplt.rcParams['figure.figsize'] = (12, 6)

sns.stripplot(x='sub_grade', y='loan_amnt', hue='loan_rank', data=data_);
pyplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);
pyplt.title('Bad Loans By Amount and Sub Grade', fontsize=14)
pyplt.savefig('report/figures/bad_loan_sub_grade.png',bbox_inches='tight', format='png', dpi=200)
pyplt.close();

# reset figure size
pyplt.rcParams['figure.figsize'] = (6, 4)

In [53]:
pyplt.rcParams['figure.figsize'] = (10, 4)

sns.countplot(x='sub_grade', data=data, hue='grade');
pyplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);
pyplt.title('Loan Sub Grades', fontsize=14)
pyplt.savefig('report/figures/sub_grade.png',bbox_inches='tight', format='png', dpi=200)
pyplt.close();

# reset figure size
pyplt.rcParams['figure.figsize'] = (6, 4)

In [54]:
sns.factorplot(x='grade', y='annual_inc', data=data_, hue='loan_rank');
pyplt.title('Annual income by Grade', fontsize=14)
pyplt.savefig('report/figures/inc_grade.png', format='png', dpi=200)
pyplt.close();

In [55]:
g = sns.factorplot(x='grade', y='loan_amnt', data=data_, hue='loan_rank', 
               col='purpose_old', col_wrap=4, kind='box');
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('Loans By Grade and Purpose', fontsize=20)
pyplt.savefig('report/figures/loan_grade_purpose.png', bbox_inches='tight', format='png', dpi=200)

pyplt.close();

In [56]:
g = sns.factorplot(x='grade', y='loan_amnt', data=data_, hue='loan_rank', 
               col='home_ownership_old', col_wrap=4, kind='box');
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Loans By Home Ownership', fontsize=15)
pyplt.savefig('report/figures/loan_home.png', format='png', dpi=200)
pyplt.close();

In [57]:
pyplt.rcParams['figure.figsize'] = (12, 8)
sns.stripplot(x='addr_state_old', y='loan_amnt', hue='loan_rank', data=data_, size=4, jitter=True);

# reset figure size
pyplt.rcParams['figure.figsize'] = (6, 4)
pyplt.close();

Modeling

Extract features and labels

Extract feature (X) and target (y) columns


In [62]:
target_col = data_['loan_rank']   
y = target_col 

print ("\nLabel values:-")
y.head()


Label values:-
Out[62]:
0    0
1    0
2    0
3    0
4    0
Name: loan_rank, dtype: uint8

In [76]:
features = cat_vars + ['loan_amnt', 'installment','annual_inc','revol_bal',
 'out_prncp','total_pymnt','recoveries','collection_recovery_fee','last_pymnt_amnt'] + ['int_rate','delinq_2yrs',
 'dti','inq_last_6mths','total_acc','revol_util'] + ['accept_d_months',
 'accept_d_days',
 'earliest_cr_line_months',
 'earliest_cr_line_days',
 'last_pymnt_d_months',
 'last_pymnt_d_days',
 'accept_d_num_day',
 'accept_d_week_of_year',
 'list_d_num_day',
 'list_d_week_of_year',
 'exp_d_num_day',
 'exp_d_week_of_year',
 'issue_d_num_day',
 'issue_d_week_of_year',
 'last_pymnt_d_num_day',
 'last_pymnt_d_week_of_year',
 'last_credit_pull_d_num_day',
 'last_credit_pull_d_week_of_year']

In [79]:
# select the features, then drop the target (avoids the chained-assignment warning)
X = data_[features].drop('loan_rank', axis=1)
print ("\nFeature values:-")
X.head()


Feature values:-
Out[79]:
term grade sub_grade emp_length home_ownership is_inc_v pymnt_plan purpose addr_city addr_state ... list_d_num_day list_d_week_of_year exp_d_num_day exp_d_week_of_year issue_d_num_day issue_d_week_of_year last_pymnt_d_num_day last_pymnt_d_week_of_year last_credit_pull_d_num_day last_credit_pull_d_week_of_year
0 1 1 5 7 4 2 0 1 21061 4 ... 5 5 5 7 0 7 4 33 3 34
1 0 2 11 1 0 2 0 2 17584 36 ... 3 37 3 39 4 38 1 35 4 13
2 1 2 11 4 4 1 0 2 1668 19 ... 2 39 2 41 1 40 0 34 3 34
3 0 0 0 9 0 0 0 2 8032 44 ... 5 45 5 47 0 48 3 35 3 34
4 0 1 5 1 3 0 0 1 8058 4 ... 0 1 0 3 0 5 4 31 3 34

5 rows × 45 columns


In [93]:
from time import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.preprocessing import RobustScaler
from sklearn.calibration import CalibratedClassifierCV

In [88]:
from sklearn.model_selection import train_test_split


def shuffle_split_data(X, y):
    """ Shuffles and splits data into 75% training and 25% testing subsets
        (train_size=1500 of the 2,000 balanced samples), then returns them. """
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=1500, random_state=42)

    # Return the training and testing data subsets
    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = shuffle_split_data(X, y)

In [95]:
models = {
          'RandomForest': RandomForestClassifier(n_estimators=10, n_jobs=-1),
         }

scaler = RobustScaler()
X_train_transform = scaler.fit_transform(X_train)
# fit the scaler on the training data only, then transform the test data
X_test_transform = scaler.transform(X_test)

print(X_train_transform.shape)
print(X_test_transform.shape)


(1500, 45)
(500, 45)

In [96]:
print('CLASSIFICATION RESULTS OF BASELINE CLASSIFIERS\n')
print('{:20}{:^15}{:^10}{:^10}'.format('CLASSIFIER', 'MEAN SCORE %', 'STD DEV %', 'TIME'))


for clf_name, clf in models.items():
    t0 = time()
    results = cross_val_score(clf, X_train_transform, y_train, cv=5)
    t1 = time() - t0
    print('{:20}{:^15.2f}{:^10.2f}{:>10.2f}secs'.format(clf_name, results.mean()*100, results.std()*100, t1))


CLASSIFICATION RESULTS OF BASELINE CLASSIFIERS

CLASSIFIER           MEAN SCORE %  STD DEV %    TIME   
RandomForest             94.67        0.55         1.57secs

In [105]:
t0 = time()

clf = RandomForestClassifier(n_estimators=10, n_jobs=-1)
calibrated_clf = CalibratedClassifierCV(clf, method='sigmoid', cv=5)
calibrated_clf.fit(X_train_transform, y_train)
final_preds = calibrated_clf.predict(X_test_transform)
precision, recall, fbeta_score, support = score(y_test, final_preds)
print ("Precision:{:10.3f}\nRecall: {:^10.3f}\nF Score{:^10.3f}".format(precision.mean()*100, 
                                                 recall.mean()*100, fbeta_score.mean()*100))


Precision:    95.226
Recall:   95.174  
F Score  95.194  

In [100]:
clf.fit(X_train_transform, y_train)
importances = clf.feature_importances_

In [109]:
pyplt.rcParams['figure.figsize'] = (8, 16)

importance_frame = pd.DataFrame({'Importance': importances, 'Feature': list(X.columns)})
importance_frame.sort_values(by = 'Importance', inplace = True)
ax = importance_frame.plot(kind='barh', x='Feature', color='deepskyblue')
pyplt.savefig('report/figures/feature_imp.png', bbox_inches='tight', format='png', dpi=200)
# reset figure size
pyplt.rcParams['figure.figsize'] = (6, 4)
pyplt.close();