In [1]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)
# This line will hide code by default when the notebook is exported as HTML
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)
# This line will add a button to toggle visibility of code blocks, for use with the HTML export version
di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''', raw=True)
In [2]:
%matplotlib inline
In [3]:
# Import libraries
from __future__ import absolute_import, division, print_function
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import joblib
# Graphing Libraries
import matplotlib.pyplot as pyplt
import seaborn as sns
sns.set_style('whitegrid')
In [4]:
def plot_features_by_target(df, features, target):
    """Create a pair plot of the features by the target variable.

    Parameters
    ----------
    df: pandas DataFrame
    features: list of strings (cols of df)
    target: list of strings (single col of df)
    """
    sns.pairplot(df, x_vars=features, y_vars=target);
In [5]:
def get_outliers(feature):
    """Get the indices of outliers for a feature, if any exist.

    Uses Tukey's fences: a point is an outlier if it lies more than
    1.5 * IQR below Q1 or above Q3. Reads the global `data` frame.

    Parameters
    ----------
    feature: string, column name in `data`

    Returns the list of outlier row indices.
    """
    # For the feature, find the data points with extreme high or low values
    Q1 = data[feature].quantile(.25)
    Q3 = data[feature].quantile(.75)
    step = 1.5 * (Q3 - Q1)
    # Keep only the rows that fall outside the fences
    temp_data = data[~((data[feature] >= Q1 - step) & (data[feature] <= Q3 + step))]
    return temp_data.index.tolist()
In [6]:
def sample_data(num_sample, df, with_replacement=False):
    """Create a random sample of row positions from a table.

    Parameters
    ----------
    num_sample: int
    df: pandas DataFrame
    with_replacement: boolean

    Returns a list of randomly selected row positions.
    """
    df_index = []
    lst = np.arange(0, len(df), 1)
    for i in np.arange(0, num_sample, 1):
        # pick randomly from the whole table
        sample_index = np.random.choice(lst)
        if not with_replacement:
            # remove the choice that was selected so it cannot be drawn again
            lst = np.setdiff1d(lst, [sample_index])
        # store the chosen position
        df_index.append(sample_index)
    return df_index
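For reference, NumPy can draw the whole sample in a single call; a minimal equivalent sketch, assuming num_sample and df as above and sampling without replacement:
# one-call equivalent of sample_data(num_sample, df)
the_index = np.random.choice(len(df), size=num_sample, replace=False).tolist()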
In [7]:
# set figure size
pyplt.rcParams['figure.figsize'] = (6, 4)
In [8]:
dataPath = 'data'
df = joblib.load(dataPath+'/df_cleaned.pkl')
print ("Dataset has {} samples with {} features each.".format(*df.shape))
In [9]:
data = df
data.head(3)
Out[9]:
In [10]:
tmp = ['delinq_amnt', 'acc_now_delinq', 'chargeoff_within_12_mths', 'collections_12_mths_ex_med',
'tax_liens','policy_code']
data[tmp].describe()
Out[10]:
In [11]:
data['delinq_amnt'].plot(kind='kde')
pyplt.title('Density Estimation of Amount Delinquent')
pyplt.legend(loc='upper right', shadow=True, fontsize='medium')
pyplt.savefig('report/figures/delinq_amnt.png', dpi=200)
pyplt.close();
In [12]:
# drop these features because they are not informative
data.drop(tmp, axis=1, inplace=True)
data.shape
Out[12]:
First, I take a look at the summary statistics of some features that I think might help identify bad loans. These statistics show that almost all of the data points are zero, which the kernel density estimate confirms. There is no strong pattern here, so I am dropping these features.
In [13]:
# separate variables by type
date_vars = [x for x in data.columns if '_d' in x or '_cr_line' in x]
cat_vars = ['term','grade','sub_grade','emp_length','home_ownership','is_inc_v',
'pymnt_plan','purpose','addr_city','addr_state','initial_list_status',
'loan_rank', 'pub_rec_bankruptcies']
cat_vars_lookup = [x for x in data.columns if '_old' in x]
# separate continuous variables by their relative ranges
continuous_vars_0 = ['loan_amnt','funded_amnt','funded_amnt_inv']
continuous_vars_1 = ['installment','annual_inc','revol_bal',
'out_prncp','out_prncp_inv','total_pymnt','total_pymnt_inv','total_rec_prncp',
'total_rec_int','recoveries','collection_recovery_fee','last_pymnt_amnt']
continuous_vars_2 = ['int_rate', 'delinq_2yrs', 'dti', 'inq_last_6mths', 'open_acc', 'pub_rec',
'total_acc', 'total_rec_late_fee', 'revol_util']
In [16]:
data[continuous_vars_0].describe()
Out[16]:
In [17]:
data[continuous_vars_1].describe()
Out[17]:
Hypothesis: Clients with lower incomes are more apt to default on their loans.
Based on my hypothesis, I am starting my exploration with income. First, I check this feature for outliers. When I plot the raw feature, I can see that there are some: the distribution is extremely skewed, with a standard deviation of over \$500k.
I then trim the income data using Tukey's method (removing points more than 1.5 × IQR beyond the quartiles); this gives a better representation of the underlying data, and the resulting distribution is far less skewed. The majority of the clients in this dataset make an annual income of around \$65k.
I ultimately decided to relax the outlier trimming and remove only the top 1% of incomes, because I believe there may still be signal among the high earners that Tukey's rule would discard. As a result, the mean moves to around \$70k.
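A quick sanity check of the two trimming rules, as a sketch (it reuses get_outliers and the global data frame from above):
# Compare how many rows each outlier rule would remove
tukey_idx = get_outliers('annual_inc')                  # Tukey's fences
top = data['annual_inc'].quantile(.99)
pct_idx = data.loc[data['annual_inc'] > top].index      # top 1% only
print('Tukey removes {} rows; the top-1% rule removes {}.'.format(len(tukey_idx), len(pct_idx)))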
In [18]:
pyplt.rcParams['figure.figsize'] = (8, 8)
sns.distplot(data['annual_inc'])
pyplt.ylabel('percent per unit')
pyplt.xlabel('Annual income in dollars')
pyplt.xticks(rotation='vertical')
pyplt.title('Annual Income')
pyplt.savefig('report/figures/annual_income_raw.png', format='png', dpi=200)
pyplt.close();
#reset figure
pyplt.rcParams['figure.figsize'] = (6, 4)
data['annual_inc'].describe()
Out[18]:
In [19]:
# Clean income for outliers
mask = get_outliers('annual_inc')
df_inc = pd.DataFrame(index=data.index)
df_inc['income'] = data.annual_inc
df_inc.drop(index=mask, inplace=True)
In [20]:
pyplt.rcParams['figure.figsize'] = (8, 8)
sns.distplot(df_inc['income']);
pyplt.ylabel('percent per unit')
pyplt.xlabel('Annual income in dollars')
pyplt.xticks(rotation='vertical')
pyplt.title('Annual Income: Outliers removed (Tukey)')
pyplt.savefig('report/figures/annual_income_cleaned.png', format='png', dpi=200)
pyplt.close();
#reset figure
pyplt.rcParams['figure.figsize'] = (6, 4)
df_inc.describe()
Out[20]:
In [21]:
# Drop clients whose income is above the 99th percentile
top = data['annual_inc'].quantile(.99)
mask = data.loc[data['annual_inc'] > top].index
data.drop(mask, inplace=True)
In [22]:
pyplt.rcParams['figure.figsize'] = (8, 8)
sns.distplot(data['annual_inc'])
pyplt.ylabel('percent per unit')
pyplt.xlabel('Annual income in dollars')
pyplt.xticks(rotation='vertical')
pyplt.title('Annual Income: Top 1% removed')
pyplt.savefig('report/figures/annual_income.png', format='png', dpi=200)
pyplt.close();
#reset figure
pyplt.rcParams['figure.figsize'] = (6, 4)
data['annual_inc'].describe()
Out[22]:
Next, I plotted histograms of all the continuous variables, separated by their relative ranges. From the first set of histograms, I can see that the total amount of the loan, the amount funded, and the amount committed by investors (loan_amnt, funded_amnt, and funded_amnt_inv) are nearly identical. It comes as no surprise that these features are also highly correlated. As a result, when it comes to modeling I will have to either pick one of them, or average all three into a new feature, as sketched below.
The features for the remaining outstanding principal (those prefixed out_prncp) and for payments received on the loans (those prefixed total_) are also very similar in the shapes of their distributions. From these sets of histograms, I can tell that most loans are around \$8k to \$20k.
From the third set of histograms, the majority of clients in the Lending Club dataset have very few 30+ days past-due incidences of delinquency in the last two years (delinq_2yrs). The distribution of the debt-to-income ratio, dti, is fairly symmetric, with most clients near a dti of 15%. The distribution of interest rates, int_rate, is less uniform, with most loans carrying a rate around 15%.
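A minimal sketch of the averaging option mentioned above (the column name amnt_avg is my own, hypothetical choice):
# Average the three nearly identical amount features into one new feature
data['amnt_avg'] = data[['loan_amnt', 'funded_amnt', 'funded_amnt_inv']].mean(axis=1)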
In [23]:
g = data[continuous_vars_0].hist(xrot=90, figsize=[15,17]);
pyplt.subplots_adjust(top=0.9)
pyplt.suptitle('Feature Histograms: Part One', fontsize=16)
pyplt.savefig('report/figures/continuous_vars_0.png', format='png', dpi=200)
pyplt.close();
g = data[continuous_vars_1].hist(xrot=90, figsize=[15,17]);
pyplt.subplots_adjust(top=0.9)
pyplt.suptitle('Feature Histograms: Part Two', fontsize=16)
pyplt.savefig('report/figures/continuous_vars_1.png', format='png', dpi=200)
pyplt.close();
In [25]:
g = data[continuous_vars_2].hist(figsize=[15,15]);
pyplt.subplots_adjust(top=0.9)
pyplt.suptitle('Feature Histograms: Part Three', fontsize=16)
pyplt.savefig('report/figures/continuous_vars_2.png', format='png', dpi=200)
pyplt.close();
In [26]:
sns.distplot(data.loan_amnt);
pyplt.ylabel('percent per unit');
pyplt.title('Loan Amount');
pyplt.savefig('report/figures/loan_amnt.png', format='png', dpi=200)
pyplt.close();
In [27]:
sns.distplot(data.int_rate);
pyplt.ylabel('percent per unit');
pyplt.title('Interest Rate');
pyplt.savefig('report/figures/int_rate.png', format='png', dpi=200)
pyplt.close();
In [28]:
sns.distplot(data.dti);
pyplt.ylabel('percent per unit');
pyplt.title('Debt to Income Ratio');
pyplt.savefig('report/figures/dti.png', format='png', dpi=200)
pyplt.close();
In [30]:
pyplt.rcParams['figure.figsize'] = (8, 4)
corr = data[continuous_vars_1].corr()
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask, 1)] = True
with sns.axes_style("white"):
    ax = sns.heatmap(corr, mask=mask, annot=True, cmap='RdBu', fmt='.2f')
pyplt.xticks(rotation=70, ha='right');
pyplt.title('Feature Correlation: Part One');
pyplt.savefig('report/figures/corr_1.png', bbox_inches='tight', format='png', dpi=200)
pyplt.close();
In [31]:
pyplt.rcParams['figure.figsize'] = (12, 6)
corr = data[date_vars].corr()
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask, 1)] = True
with sns.axes_style("white"):
    ax = sns.heatmap(corr, mask=mask, annot=True, cmap='RdBu', fmt='.1f')
pyplt.xticks(rotation=70, ha='right');
pyplt.title('Feature Correlation: Part Two');
pyplt.savefig('report/figures/corr_2.png', bbox_inches='tight', format='png', dpi=200)
pyplt.close();
# reset figure size
pyplt.rcParams['figure.figsize'] = (6, 4)
In [32]:
percentage_good = data.loan_rank.value_counts()[0] / len(data)
percentage_bad = data.loan_rank.value_counts()[1] / len(data)
In [33]:
print ('{}% of the loans are good, with a ratio of {} to 1.'.format(round(percentage_good*100, 2),
round(percentage_good / percentage_bad)))
In [34]:
sns.distplot(data.loan_rank, bins=10, kde=False)
pyplt.title('Loan Status')
x = [0.05, 0.2, 0.4, 0.6, 0.8, 0.95]
labels = ["Good Loans", "", "", "", "", "Bad Loans"]
pyplt.xticks(x, labels)
pyplt.grid(False)
pyplt.savefig('report/figures/loans.png', format='png', dpi=200)
pyplt.close();
From the dataset, I can infer that most of the loans in the Lending Club dataset are good loans. The classes of good versus bad loans are highly imbalanced, with a ratio of almost 15 to 1.
To gain a deeper understanding of what separates defaulters from everyone else, I decided to uniformly sample the dataset to get balanced classes of good and bad loans (the sampling itself is done a few cells below).
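For reference, newer versions of pandas (1.1+) can draw the same kind of balanced sample in one grouped call; a minimal sketch, not the approach used in this notebook:
# 1,000 rows per class, mirroring the sample size used below
balanced = data.groupby('loan_rank', group_keys=False).sample(n=1000, random_state=42)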
To explore what separates the two groups, I plotted their incomes. Both distributions have long tails and overlap a great deal. From this graph, it appears that clients who default usually have lower incomes, and that as income increases, the rate of default decreases.
Before I can make this determination conclusively, I need to know whether the difference I am seeing is just chance variation or a real difference between the distributions in the population. To decide, I perform a hypothesis test using the Mann-Whitney U test, because the distributions are not normal.
Null hypothesis: In the population, the distribution of annual incomes is the same for clients who default and those who do not. The difference in the sample is due to chance.
Alternative hypothesis: The two distributions are different in the population.
I performed the test and got a p-value of effectively zero. As a result, I can reject the null hypothesis and conclude that, in the population, the distributions of annual incomes of defaulters and non-defaulters are different.
Since income is a feature that separates defaulters from non-defaulters, I want to see if there is a relationship between income and the amounts clients request for loans. To do this, I plotted income against loan amount and fitted a regression line. From the graph, we can see a somewhat positive relationship between the two variables: as income goes up, so does the amount requested.
Lending Club grades its loans on two scales. The first, grade, ranges from A to G, which in this dataset I have coded 0 to 6. Each grade is further broken down into several smaller bins; these are captured in the sub_grade feature, which has been coded similarly from 0 to n. The distribution of the sub grades has a long tail, with most of the loans between sub grades 3 and 15.
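As an illustration, such a coding could be produced from the raw letter grades along these lines (a hypothetical sketch: the grade_old and sub_grade_old column names follow this notebook's _old lookup convention but are my assumption, as is the A1–G5 sub-grade layout):
# Map letter grades ('A'..'G') and sub grades ('A1'..'G5') to integer codes
grade_order = list('ABCDEFG')
sub_order = [g + str(n) for g in grade_order for n in range(1, 6)]
df['grade'] = df['grade_old'].map({g: i for i, g in enumerate(grade_order)})
df['sub_grade'] = df['sub_grade_old'].map({s: i for i, s in enumerate(sub_order)})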
In this dataset, I have coded good loans as 0 and bad loans as 1. On average, good-loan clients have higher incomes, for each grade category they fall into, than clients who tend to default.
Next, consider loans by grade and by the purpose for which the loan was taken. When a loan is taken to consolidate debt, we see that as the grades progress from low risk to high risk, clients generally request larger loan amounts.
Once I had a good idea of the underlying characteristics of my data, I moved on to understanding the most important features. I trained a Random Forest classifier and plotted its feature importances. From this graph, the most important features were those derived from the dates on which the last loan payments were made. The interest rate, the sub grade of the loan, and the debt-to-income ratio of the client were all important. It turns out income is not as important in determining whether a client will default.
In [36]:
good = data['loan_rank'] == 0
bad = data['loan_rank'] == 1
In [37]:
data_good = data.loc[good, :]
data_bad = data.loc[bad, :]
data_good.reset_index(inplace=True)
data_bad.reset_index(inplace=True)
In [38]:
sample_size = 1000
In [39]:
the_index = sample_data(sample_size, data_good)
data_good_ = data_good.iloc[the_index, :]
the_index = sample_data(sample_size, data_bad)
data_bad_ = data_bad.iloc[the_index, :]
In [40]:
data_ = pd.concat([data_good_, data_bad_])
data_.reset_index(inplace=True)
In [41]:
sns.distplot(data_.loan_rank, bins=10, kde=False)
pyplt.title('Loan Status: Classes Balanced')
x = [0.05, 0.2, 0.4, 0.6, 0.8, 0.95]
labels = ["Good Loans", "", "", "", "", "Bad Loans"]
pyplt.xticks(x, labels)
pyplt.grid(False)
pyplt.savefig('report/figures/balanced_loans.png', format='png', dpi=200)
pyplt.close();
In [42]:
good = data_['loan_rank'] == 0
bad = data_['loan_rank'] == 1
df_good = pd.DataFrame()
df_bad = pd.DataFrame()
df_good['good_loans'] = data_.loc[good, 'annual_inc']
df_bad['bad_loans'] = data_.loc[bad, 'annual_inc']
In [43]:
income_bin = np.arange(2e+04, 25e+04, 2e+04)
In [44]:
df_bad['bad_loans'].plot.hist(bins=income_bin, density=True, alpha=0.8)
df_good['good_loans'].plot.hist(bins=income_bin, density=True, alpha=0.8)
pyplt.ylabel('percent per dollar')
pyplt.xlabel('Annual Income, USD')
pyplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
pyplt.suptitle('Annual Incomes of Good and Bad Loan Clients', fontsize=12)
pyplt.savefig('report/figures/good_bad.png', format='png', bbox_inches='tight', dpi=200);
pyplt.close();
In [45]:
a = df_bad['bad_loans'].values
b = df_good['good_loans'].values
In [46]:
import scipy.stats as st
# Two-sided test: are the two income distributions different?
statistic, pvalue = st.mannwhitneyu(a, b, alternative='two-sided')
print('P-value: %.2f' % pvalue)
In [47]:
g = sns.jointplot(data_['loan_amnt'], data_['annual_inc'], kind="reg", size=5, space=0)
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('Loan Amount by Income', fontsize=14)
pyplt.savefig('report/figures/loan_inc.png', bbox_inches='tight', format='png', dpi=200)
pyplt.close();
In [48]:
g = sns.jointplot(data_['annual_inc'], data_['int_rate'], kind="reg", size=5, space=0)
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('Income by Interest Rate', fontsize=14)
pyplt.savefig('report/figures/inc_int_rate.png', bbox_inches='tight', format='png', dpi=200)
pyplt.close();
In [49]:
g = sns.jointplot(data_['annual_inc'], data_['sub_grade'], kind="reg", size=5, space=0)
g.fig.suptitle('Income by Loan Sub Grade', fontsize=14)
pyplt.savefig('report/figures/inc_sub_grade.png', bbox_inches='tight', format='png', dpi=200)
pyplt.close();
In [50]:
g = sns.jointplot(data_['int_rate'], data_['sub_grade'], kind="kde", size=5, space=0)
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('Interest Rate by Loan Sub Grade', fontsize=14)
pyplt.savefig('report/figures/sub_grade_int_rate.png', bbox_inches='tight', format='png', dpi=200)
pyplt.close();
In [51]:
data_[cat_vars].describe()
Out[51]:
In [52]:
pyplt.rcParams['figure.figsize'] = (12, 6)
sns.stripplot(x='sub_grade', y='loan_amnt', hue='loan_rank', data=data_);
pyplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);
pyplt.title('Bad Loans By Amount and Sub Grade', fontsize=14)
pyplt.savefig('report/figures/bad_loan_sub_grade.png',bbox_inches='tight', format='png', dpi=200)
pyplt.close();
# reset figure size
pyplt.rcParams['figure.figsize'] = (6, 4)
In [53]:
pyplt.rcParams['figure.figsize'] = (10, 4)
sns.countplot(x='sub_grade', data=data, hue='grade');
pyplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);
pyplt.title('Loan Sub Grades', fontsize=14)
pyplt.savefig('report/figures/sub_grade.png',bbox_inches='tight', format='png', dpi=200)
pyplt.close();
# reset figure size
pyplt.rcParams['figure.figsize'] = (6, 4)
In [54]:
sns.factorplot(x='grade', y='annual_inc', data=data_, hue='loan_rank');
pyplt.title('Annual income by Grade', fontsize=14)
pyplt.savefig('report/figures/inc_grade.png', format='png', dpi=200)
pyplt.close();
In [55]:
g = sns.factorplot(x='grade', y='loan_amnt', data=data_, hue='loan_rank',
col='purpose_old', col_wrap=4, kind='box');
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('Loans By Grade and Purpose', fontsize=20)
pyplt.savefig('report/figures/loan_grade_purpose.png', bbox_inches='tight', format='png', dpi=200)
pyplt.close();
In [56]:
g = sns.factorplot(x='grade', y='loan_amnt', data=data_, hue='loan_rank',
col='home_ownership_old', col_wrap=4, kind='box');
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Loans By Home Ownership', fontsize=15)
pyplt.savefig('report/figures/loan_home.png', format='png', dpi=200)
pyplt.close();
In [57]:
pyplt.rcParams['figure.figsize'] = (12, 8)
sns.stripplot(x='addr_state_old', y='loan_amnt', hue='loan_rank', data=data_, size=4, jitter=True);
# reset figure size
pyplt.rcParams['figure.figsize'] = (6, 4)
pyplt.close();
In [62]:
y = data_['loan_rank']
print ("\nLabel values:-")
y.head()
Out[62]:
In [76]:
features = cat_vars + ['loan_amnt', 'installment','annual_inc','revol_bal',
'out_prncp','total_pymnt','recoveries','collection_recovery_fee','last_pymnt_amnt'] + ['int_rate','delinq_2yrs',
'dti','inq_last_6mths','total_acc','revol_util'] + ['accept_d_months',
'accept_d_days',
'earliest_cr_line_months',
'earliest_cr_line_days',
'last_pymnt_d_months',
'last_pymnt_d_days',
'accept_d_num_day',
'accept_d_week_of_year',
'list_d_num_day',
'list_d_week_of_year',
'exp_d_num_day',
'exp_d_week_of_year',
'issue_d_num_day',
'issue_d_week_of_year',
'last_pymnt_d_num_day',
'last_pymnt_d_week_of_year',
'last_credit_pull_d_num_day',
'last_credit_pull_d_week_of_year']
In [79]:
X = data_[features].copy()
X.drop(['loan_rank'], axis=1, inplace=True)  # loan_rank is the target, not a feature
print ("\nFeature values:-")
X.head()
Out[79]:
In [93]:
from time import time, gmtime, strftime
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.preprocessing import RobustScaler
from sklearn.calibration import CalibratedClassifierCV
In [88]:
from sklearn.model_selection import train_test_split
def shuffle_split_data(X, y):
    """Shuffle and split the data into 1,500 training and 500 testing rows
    (a 75/25 split of the 2,000-row balanced sample), then return the subsets."""
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=1500, random_state=42)
    # Return the training and testing data subsets
    return X_train, y_train, X_test, y_test
X_train, y_train, X_test, y_test = shuffle_split_data(X, y)
In [95]:
models = {
    'RandomForest': RandomForestClassifier(n_estimators=10, n_jobs=-1),
}
scaler = RobustScaler()
X_train_transform = scaler.fit_transform(X_train)
X_test_transform = scaler.transform(X_test)  # transform only: never fit the scaler on test data
print(X_train_transform.shape)
print(X_test_transform.shape)
In [96]:
print('CLASSIFICATION RESULTS OF BASELINE CLASSIFIERS\n')
print('{:20}{:^15}{:^10}{:^10}'.format('CLASSIFIER', 'MEAN SCORE %', 'STD DEV %', 'TIME'))
for clf_name, clf in models.items():
    t0 = time()
    results = cross_val_score(clf, X_train_transform, y_train, cv=5)
    t1 = time() - t0
    print('{:20}{:^15.2f}{:^10.2f}{:>10.2f}secs'.format(clf_name, results.mean()*100, results.std()*100, t1))
In [105]:
t0 = time()
clf = RandomForestClassifier(n_estimators=10, n_jobs=-1)
calibrated_clf = CalibratedClassifierCV(clf, method='sigmoid', cv=5)
calibrated_clf.fit(X_train_transform, y_train)
final_preds = calibrated_clf.predict(X_test_transform)
precision, recall, fbeta_score, support = score(y_test, final_preds)
print ("Precision:{:10.3f}\nRecall: {:^10.3f}\nF Score{:^10.3f}".format(precision.mean()*100,
recall.mean()*100, fbeta_score.mean()*100))
In [100]:
clf.fit(X_train_transform, y_train)
importances = clf.feature_importances_
In [109]:
pyplt.rcParams['figure.figsize'] = (8, 16)
importance_frame = pd.DataFrame({'Importance': importances, 'Feature': list(X.columns)})
importance_frame.sort_values(by='Importance', inplace=True)
ax = importance_frame.plot(kind='barh', x='Feature', color='deepskyblue')
pyplt.savefig('report/figures/feature_imp.png', bbox_inches='tight', format='png', dpi=200)
# reset figure size
pyplt.rcParams['figure.figsize'] = (6, 4)
pyplt.close();