In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pylab as pl
%matplotlib inline
loan_status -- Current status of the loan
loan_amnt -- The listed amount of the loan applied for by the borrower. If at some point in time, the credit department reduces the loan amount, then it will be reflected in this value.
int_rate -- interest rate of the loan
sub_grade -- LC assigned loan subgrade -- dummy (grade -- LC assigned loan grade -- dummy)
purpose -- A category provided by the borrower for the loan request. -- dummy
annual_inc -- The self-reported annual income provided by the borrower during registration.
emp_length -- Employment length in years. Possible values are between 0 and 10, where 0 means less than one year and 10 means ten or more years. -- dummy
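fico_range_low -- The lower boundary of the range the borrower's FICO score belongs to
fico_range_high -- The upper boundary of the range the borrower's FICO score belongs to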
home_ownership -- The home ownership status provided by the borrower during registration or obtained from the credit report. Our values are: RENT, OWN, MORTGAGE, OTHER -- dummy
tot_cur_bal -- Total current balance of all accounts
num_actv_bc_tl -- Number of currently active bankcard accounts (avg_cur_bal -- Average current balance of all accounts)
mort_acc -- number of mortgage accounts
num_actv_rev_tl -- Number of currently active revolving trades
dti -- A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower’s self-reported monthly income.
pub_rec_bankruptcies -- Number of public record bankruptcies
In [1]:
# !wget https://www.dropbox.com/s/0o28v7qkjzdprb8/RejectStats2015.csv.zip?dl=0/RejectStats2015.csv.zip
In [3]:
# Load the 2015 rejected-application statistics straight from the zipped CSV.
# header=1: the column names are taken from the second line of the file.
df_unapp_2015 = pd.read_csv(
    'Data/RejectStats2015.csv.zip',
    compression='zip',
    header=1,
    low_memory=False,
)
In [18]:
# Keep only the rejected applications that have a value in every column.
df_unapp_15 = df_unapp_2015.dropna(axis=0, how='any')
In [19]:
# Preview the first 10 rows of the rejected applications that survived dropna().
df_unapp_15.head(10)
Out[19]:
In [20]:
# Number of rejected applications left after dropping rows with missing values.
len(df_unapp_15)
Out[20]:
In [125]:
## pick study columns in common in both approved and unapproved datasets
## (.copy() so the conversion below edits an independent frame instead of a slice of df_unapp_15)
df_unapp_test = df_unapp_15[['Amount Requested', 'Risk_Score', 'Debt-To-Income Ratio']].copy()
## remove the trailing percentage sign and convert to numeric
## (plain column assignment replaces the column, so the result is a float column)
df_unapp_test['Debt-To-Income Ratio'] = (
    df_unapp_test['Debt-To-Income Ratio'].str.rstrip('%').astype(float)
)
### using "Outlier Boundary" found in data preprocessing for loan amount to exclude the potential outliers
outlier_bound = 33600.0
df_unapp_test = df_unapp_test[df_unapp_test['Amount Requested'] <= outlier_bound]  # unapproved loans
In [126]:
# Number of unapproved applications kept after the column selection and the
# outlier filter (mirrors the len(df_app_test) check on the approved side).
len(df_unapp_test)
Out[126]:
In [127]:
# Last 3 rows of the filtered unapproved-loan study frame.
df_unapp_test.tail(3)
Out[127]:
In [119]:
# Read the cleaned 2015 approved-loan file, then keep the columns from 'loan_amnt' onward.
df_app_2015 = pd.read_csv('Data/approved_loan_2015_clean.csv', low_memory=False)
df_app_2015 = df_app_2015.loc[:, 'loan_amnt':]
#df_app_2015 = pd.read_csv('Data/LoanStats3d_securev1.csv.zip', compression='zip',header=1, low_memory=False)
In [121]:
# First 3 rows of the approved-loan frame.
df_app_2015.head(3)
Out[121]:
In [212]:
## Approved loans that were fully paid (Target == 1)
df_app_2015_paid = df_app_2015.loc[df_app_2015['Target'].eq(1)]
## Approved loans that defaulted (Target == 0)
df_app_2015_default = df_app_2015.loc[df_app_2015['Target'].eq(0)]
In [213]:
## Keep the three study columns and discard rows with a missing value in any of them
df_app_15 = df_app_2015.loc[:, ['loan_amnt', 'fico_range_low', 'dti']].dropna(how='any')
df_app_15_paid = df_app_2015_paid.loc[:, ['loan_amnt', 'fico_range_low', 'dti']].dropna(how='any')
df_app_15_default = df_app_2015_default.loc[:, ['loan_amnt', 'fico_range_low', 'dti']].dropna(how='any')
In [214]:
# (rows, columns) of the approved-loan study frame after dropna().
df_app_15.shape
Out[214]:
In [215]:
# (rows, columns) of the fully paid subset after dropna().
df_app_15_paid.shape
Out[215]:
In [216]:
# (rows, columns) of the default subset after dropna().
df_app_15_default.shape
Out[216]:
In [128]:
# First 3 rows of the approved-loan study frame.
df_app_15.head(3)
Out[128]:
In [217]:
### Exclude potential outliers: keep loans whose amount is at or below the pre-set "Outlier Boundary"
outlier_bound = 33600.0
# approved loans
df_app_test = df_app_15.loc[df_app_15['loan_amnt'].le(outlier_bound)]
# approved and fully paid loans
df_app_paid_test = df_app_15_paid.loc[df_app_15_paid['loan_amnt'].le(outlier_bound)]
# approved and default loans
df_app_default_test = df_app_15_default.loc[df_app_15_default['loan_amnt'].le(outlier_bound)]
In [218]:
# Number of approved loans at or below the outlier bound.
len(df_app_test)
Out[218]:
In [53]:
## All Data -- might be too large to run
## KS two sample test to compare unapproved and approved loans -- loan amounts
# stats.ks_2samp(df_unapp_test['Amount Requested'], df_app_test['loan_amnt'])
## KS two sample test to compare unapproved and approved loans -- credit scores
#stats.ks_2samp(df_unapp_test['Risk_Score'], df_app_test['fico_range_low'])
In [210]:
### TEST with samples drawn under different random states
## The random states come from a seeded generator, so re-running the cell
## reproduces exactly the same samples and the same printed KS results.
random_states = np.random.RandomState(2015).randint(1, 1000, 10)
## (label, unapproved column, approved column) for every feature compared
feature_pairs = [
    ('loan amounts', 'Amount Requested', 'loan_amnt'),
    ('credit scores', 'Risk_Score', 'fico_range_low'),
    ('dti', 'Debt-To-Income Ratio', 'dti'),
]
for rs in random_states:
    df_unapp_test_s = df_unapp_test.sample(n=10000, random_state=int(rs))
    df_app_test_s = df_app_test.sample(n=10000, random_state=int(rs))
    # Single-argument print(...) gives the same output on Python 2 and Python 3.
    print('\nrandom_state = {}'.format(rs))
    for label, col_unapp, col_app in feature_pairs:
        ## KS two sample test to compare unapproved and approved loan samples
        ## (ks_2samp orders its inputs itself, so no pre-sorting is needed)
        ks_result = stats.ks_2samp(df_unapp_test_s[col_unapp].values,
                                   df_app_test_s[col_app].values)
        print('KS result on {}:\n{}'.format(label, ks_result))
In [132]:
## Risk_Score summary on the whole clean data -- the 75% row is the top-25% lower bound
df_unapp_test['Risk_Score'].describe()
Out[132]:
In [133]:
## Top25% scores lower bound -- sample data
## (df_unapp_test_s is the sample left over from the last sampling loop that ran.)
## The 75th percentile is requested by value; picking row 6 of describe() relied
## on positional indexing into a string-labelled Series.
df_unapp_test_s['Risk_Score'].quantile(0.75)
Out[133]:
In [134]:
## fico_range_low summary on the whole clean data -- the 25% row is the bottom-25% upper bound
df_app_test['fico_range_low'].describe()
Out[134]:
In [135]:
## Bottom 25% scores upper bound -- sample data
## (df_app_test_s is the sample left over from the last sampling loop that ran.)
## The 25th percentile is requested by value; picking row 4 of describe() relied
## on positional indexing into a string-labelled Series.
df_app_test_s['fico_range_low'].quantile(0.25)
Out[135]:
In [208]:
### TEST with samples drawn under different random states
## The random states come from a seeded generator, so re-running the cell
## reproduces exactly the same samples and the same printed KS results.
random_states = np.random.RandomState(2015).randint(1, 100, 5)
## (label, unapproved column, approved column) for every feature compared
feature_pairs = [
    ('loan amounts', 'Amount Requested', 'loan_amnt'),
    ('credit scores', 'Risk_Score', 'fico_range_low'),
    ('dti', 'Debt-To-Income Ratio', 'dti'),
]
for rs in random_states:
    # Sampling
    df_unapp_test_s = df_unapp_test.sample(n=10000, random_state=int(rs))
    df_app_test_s = df_app_test.sample(n=10000, random_state=int(rs))
    ### Select applications based on top 25% and bottom 25% credit score boundaries.
    ### The percentiles are requested by value rather than by row position in describe().
    top25_bound_unapp = df_unapp_test_s['Risk_Score'].quantile(0.75)  ## Top25% scores lower bound -- sample data
    bot25_bound_app = df_app_test_s['fico_range_low'].quantile(0.25)  ## Bottom 25% scores upper bound -- sample data
    df_unapp_top25 = df_unapp_test_s[df_unapp_test_s['Risk_Score'] >= top25_bound_unapp]  # unapproved loans
    df_app_bot25 = df_app_test_s[df_app_test_s['fico_range_low'] <= bot25_bound_app]  # approved loans
    # Single-argument print(...) gives the same output on Python 2 and Python 3.
    print('\nrandom_state = {}'.format(rs))
    for label, col_unapp, col_app in feature_pairs:
        ## KS two sample test to compare the unapproved top-25% and approved bottom-25% samples
        ## (ks_2samp orders its inputs itself, so no pre-sorting is needed)
        ks_result = stats.ks_2samp(df_unapp_top25[col_unapp].values,
                                   df_app_bot25[col_app].values)
        print('KS result on {}:\n{}'.format(label, ks_result))
In [220]:
## KS two sample test to compare unapproved and fully paid loans
## Draw ONE seeded sample of unapproved loans, sized to match the fully paid set
## (capped at the number of unapproved rows, since sample() raises beyond that),
## so all three tests look at the same applications and the output is reproducible.
df_unapp_paid_s = df_unapp_test.sample(
    n=min(len(df_unapp_test), len(df_app_paid_test)), random_state=22)
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print('\nKS result on loan amounts:\n{}'.format(
    stats.ks_2samp(df_unapp_paid_s['Amount Requested'].values,
                   df_app_paid_test['loan_amnt'].values)))
print('KS result on credit scores:\n{}'.format(
    stats.ks_2samp(df_unapp_paid_s['Risk_Score'].values,
                   df_app_paid_test['fico_range_low'].values)))
print('KS result on dti:\n{}'.format(
    stats.ks_2samp(df_unapp_paid_s['Debt-To-Income Ratio'].values,
                   df_app_paid_test['dti'].values)))
In [221]:
## KS two sample test to compare unapproved and default loans
## Draw ONE seeded sample of unapproved loans, sized to match the default set
## (capped at the number of unapproved rows, since sample() raises beyond that),
## so all three tests look at the same applications and the output is reproducible.
df_unapp_default_s = df_unapp_test.sample(
    n=min(len(df_unapp_test), len(df_app_default_test)), random_state=22)
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print('\nKS result on loan amounts:\n{}'.format(
    stats.ks_2samp(df_unapp_default_s['Amount Requested'].values,
                   df_app_default_test['loan_amnt'].values)))
print('KS result on credit scores:\n{}'.format(
    stats.ks_2samp(df_unapp_default_s['Risk_Score'].values,
                   df_app_default_test['fico_range_low'].values)))
print('KS result on dti:\n{}'.format(
    stats.ks_2samp(df_unapp_default_s['Debt-To-Income Ratio'].values,
                   df_app_default_test['dti'].values)))
In [ ]:
In [236]:
# make plots for distributions of loan features
def plot_feature_dist(df_unapp, col_unapp, df_app, col_app, ylabel=None, heading=None,
                      n_samples=10000, random_state=22):
    """Plot the sorted values of one feature for unapproved vs approved loans.

    Both frames are sampled, then narrowed to the unapproved applications in
    the top 25% of 'Risk_Score' and the approved loans in the bottom 25% of
    'fico_range_low'. The selection always uses those two score columns,
    whatever feature is plotted. The sorted values of `col_unapp` (red) and
    `col_app` (green) are drawn against their rank, and the x axis is cut at
    the size of the smaller group.

    Parameters
    ----------
    df_unapp : DataFrame with a 'Risk_Score' column and `col_unapp`.
    col_unapp : column of `df_unapp` to plot.
    df_app : DataFrame with a 'fico_range_low' column and `col_app`.
    col_app : column of `df_app` to plot.
    ylabel : y-axis label; skipped when None.
    heading : figure title; skipped when None.
    n_samples : rows drawn from each frame (capped at the frame's length).
    random_state : seed passed to DataFrame.sample for both draws.

    Returns
    -------
    None. The figure is created on the current matplotlib backend.
    """
    # Sampling -- capped at the frame size because DataFrame.sample raises
    # when asked for more rows than the frame holds.
    df_unapp_test_s = df_unapp.sample(n=min(n_samples, len(df_unapp)), random_state=random_state)
    df_app_test_s = df_app.sample(n=min(n_samples, len(df_app)), random_state=random_state)

    ### Select applications based on top 25% and bottom 25% credit score boundaries.
    ### Percentiles are requested by value, not by row position in describe().
    top25_bound_unapp = df_unapp_test_s['Risk_Score'].quantile(0.75)  ## Top25% scores lower bound -- sample data
    bot25_bound_app = df_app_test_s['fico_range_low'].quantile(0.25)  ## Bottom 25% scores upper bound -- sample data
    df_unapp_top25 = df_unapp_test_s[df_unapp_test_s['Risk_Score'] >= top25_bound_unapp]  # unapproved loans
    df_app_bot25 = df_app_test_s[df_app_test_s['fico_range_low'] <= bot25_bound_app]  # approved loans

    ### Plot ###
    fig = pl.figure(figsize=(8, 6))
    pl.plot(np.arange(1, len(df_unapp_top25) + 1), np.sort(df_unapp_top25[col_unapp].values),
            'r.', alpha=0.6, label='unapproved loans')
    pl.plot(np.arange(1, len(df_app_bot25) + 1), np.sort(df_app_bot25[col_app].values),
            'g.', alpha=0.6, label='approved loans')

    # Show only the ranks both groups share, so the two curves are compared over the same span.
    num = min(len(df_unapp_top25), len(df_app_bot25))
    pl.xlim(1, num + 1)
    ticks = np.arange(1, num + 1, 1000)
    pl.xticks(ticks, ticks)
    pl.xlabel('Loan Applications', size=15)
    # Skip missing labels instead of handing None to matplotlib.
    if ylabel is not None:
        pl.ylabel(ylabel, size=15)
    if heading is not None:
        pl.title(heading, size=18)
    pl.legend(loc='best', fontsize='xx-large')

    # footnote
    fig.text(0.99, 0.01, 'data source: Lending Club', ha='right',
             va='bottom', fontsize=10, color='#999999')
    pl.tight_layout()
In [237]:
## Credit Score -- unapproved vs all approved loans
plot_feature_dist(
    df_unapp_test, 'Risk_Score',
    df_app_test, 'fico_range_low',
    ylabel='Credit Score',
    heading='Unapproved vs Approved Loans-Credit Score',
)
In [238]:
## Loan Amounts -- unapproved vs all approved loans
plot_feature_dist(
    df_unapp_test, 'Amount Requested',
    df_app_test, 'loan_amnt',
    ylabel='Loan Amount',
    heading='Unapproved vs Approved Loans-Loan Amount',
)
In [239]:
## Debt-To-Income Ratio -- unapproved vs all approved loans
plot_feature_dist(
    df_unapp_test, 'Debt-To-Income Ratio',
    df_app_test, 'dti',
    ylabel='Debt-To-Income Ratio',
    heading='Unapproved vs Approved Loans-Debt-To-Income Ratio',
)
In [240]:
## Credit Score -- unapproved vs fully paid loans
plot_feature_dist(
    df_unapp_test, 'Risk_Score',
    df_app_paid_test, 'fico_range_low',
    ylabel='Credit Score',
    heading='Unapproved vs Fully Paid Loans-Credit Score',
)
In [241]:
## Loan Amounts -- unapproved vs fully paid loans
plot_feature_dist(
    df_unapp_test, 'Amount Requested',
    df_app_paid_test, 'loan_amnt',
    ylabel='Loan Amount',
    heading='Unapproved vs Fully Paid Loans-Loan Amount',
)
In [242]:
## Debt-To-Income Ratio -- unapproved vs fully paid loans
plot_feature_dist(
    df_unapp_test, 'Debt-To-Income Ratio',
    df_app_paid_test, 'dti',
    ylabel='Debt-To-Income Ratio',
    heading='Unapproved vs Fully Paid Loans-Debt-To-Income Ratio',
)
In [243]:
## Credit Score -- unapproved vs default loans
plot_feature_dist(
    df_unapp_test, 'Risk_Score',
    df_app_default_test, 'fico_range_low',
    ylabel='Credit Score',
    heading='Unapproved vs Default Loans-Credit Score',
)
In [244]:
## Loan Amounts -- unapproved vs default loans
plot_feature_dist(
    df_unapp_test, 'Amount Requested',
    df_app_default_test, 'loan_amnt',
    ylabel='Loan Amount',
    heading='Unapproved vs Default Loans-Loan Amount',
)
In [245]:
## Debt-To-Income Ratio -- unapproved vs default loans
plot_feature_dist(
    df_unapp_test, 'Debt-To-Income Ratio',
    df_app_default_test, 'dti',
    ylabel='Debt-To-Income Ratio',
    heading='Unapproved vs Default Loans-Debt-To-Income Ratio',
)
In [ ]:
In [ ]: