In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.svm import LinearSVC
from sklearn.svm import LinearSVR
import seaborn as sns
import matplotlib.pylab as pl
%matplotlib inline
#import matplotlib.pyplot as plt
loan_status -- Current status of the loan
loan_amnt -- The listed amount of the loan applied for by the borrower. If at some point in time, the credit department reduces the loan amount, then it will be reflected in this value.
int_rate -- interest rate of the loan
grade -- LC assigned loan grade
sub_grade -- LC assigned sub loan grade
purpose -- A category provided by the borrower for the loan request.
-- dummy
annual_inc -- The self-reported annual income provided by the borrower during registration.
emp_length -- Employment length in years. Possible values are between 0 and 10 where 0 means less than one year and 10 means ten or more years.
-- dummie
fico_range_low fico_range_high
home_ownership -- The home ownership status provided by the borrower during registration or obtained from the credit report. Our values are: RENT, OWN, MORTGAGE, OTHER
tot_cur_bal -- Total current balance of all accounts
num_actv_bc_tl -- number of active bank accounts
(avg_cur_bal -- average current balance of all accounts )
mort_acc -- number of mortgage accounts
num_actv_rev_tl -- Number of currently active revolving trades
dti -- A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower’s self-reported monthly income.
pub_rec_bankruptcies - Number of public record bankruptcies
delinq_amnt --
title --
mths_since_last_delinq -- The number of months since the borrower's last delinquency.
mths_since_recent_revol_delinq -- Months since most recent revolving delinquency.
total_cu_tl -- Number of finance trades
last_credit_pull_d -- The most recent month LC pulled credit for this loan
In [76]:
## 2015
df_app_2015 = pd.read_csv('data/LoanStats3d_securev1.csv.zip', compression='zip', low_memory=False,\
header=1)
In [77]:
df_app_2015.loan_status.unique()
Out[77]:
In [78]:
df_app_2015.head(5)
Out[78]:
In [92]:
df_app_2015['delinq_amnt'].unique()
Out[92]:
In [32]:
df_app_2015.info(max_cols=111)
In [38]:
df_app_2015.groupby('title').loan_amnt.mean()
Out[38]:
In [39]:
df_app_2015.groupby('purpose').loan_amnt.mean()
Out[39]:
In [7]:
df_app_2015['emp_length'].unique()
Out[7]:
In [82]:
## selected columns
df = df_app_2015.ix[:, ['loan_status','loan_amnt', 'int_rate', 'grade', 'sub_grade',\
'purpose',\
'annual_inc', 'emp_length', 'home_ownership',\
'fico_range_low','fico_range_high',\
'num_actv_bc_tl', 'tot_cur_bal', 'mort_acc','num_actv_rev_tl',\
'pub_rec_bankruptcies','dti' ]]
In [83]:
df.head(3)
Out[83]:
In [91]:
len(df.dropna())
Out[91]:
In [85]:
df.shape
Out[85]:
In [86]:
df.loan_status.unique()
Out[86]:
In [87]:
len(df[df['loan_status']=='Fully Paid'])
Out[87]:
In [88]:
len(df[df['loan_status']=='Default'])
Out[88]:
In [89]:
len(df[df['loan_status']=='Charged Off'])
Out[89]:
In [90]:
len(df[df['loan_status']=='Late (31-120 days)'])
Out[90]:
In [65]:
df.info()
In [29]:
df.loan_status.unique()
Out[29]:
In [30]:
## Convert applicable fields to numeric (I only select "Interest Rate" to use for this analysis)
df.ix[:,'int_rate'] = df.ix[:,['int_rate']]\
.applymap(lambda e: pd.to_numeric(str(e).rstrip()[:-1], errors='coerce'))
df.info()
In [57]:
df = df.rename(columns={"int_rate": "int_rate(%)"})
In [58]:
df.head(3)
Out[58]:
In [7]:
#len(df.dropna(thresh= , axis=1).columns)
Out[7]:
In [59]:
df.describe()
Out[59]:
In [168]:
# 1. Loan Amount distribution
# # create plots and histogram to visualize total loan amounts
fig = pl.figure(figsize=(8,10))
ax1 = fig.add_subplot(211)
ax1.plot(range(len(df)), sorted(df.loan_amnt), '.', color='purple')
ax1.set_xlabel('Loan Applicant Count')
ax1.set_ylabel('Loan Amount ($)')
ax1.set_title('Fig 1a - Sorted Issued Loan Amount (2015)', size=15)
# all_ histogram
# pick upper bound 900 to exclude too large numbers
ax2 = fig.add_subplot(212)
ax2.hist(df.loan_amnt, range=(df.loan_amnt.min(), 36000), color='purple')
ax2.set_xlabel('Loan Amount -$', size=12)
ax2.set_ylabel('Counts',size=12)
ax2.set_title('Fig 1b - Sorted Issued Loan Amount (2015)', size=15)
Out[168]:
Fig 1a shows the sorted issued loan amounts from low to high.
Fig 2c is a histogram showing the distribution of the issued loan amounts.
Obeservation
The Loan amounts vary from $1000 to $35,000, and the most frequent loan amounts issued are around $10,000.
In [81]:
inc_75 = df.describe().loc['75%', 'annual_inc']
count_75 = int(len(df)*0.75)
Out[81]:
In [165]:
# 2. Applicant Anual Income Distribution
fig = pl.figure(figsize=(8,16))
ax0 = fig.add_subplot(311)
ax0.plot(range(len(df.annual_inc)), sorted(df.annual_inc), '.', color='blue')
ax0.set_xlabel('Loan Applicant Count')
ax0.set_ylabel('Applicant Annual Income ($)')
ax0.set_title('Fig 2a - Sorted Applicant Annual Income-all ($) (2015)', size=15)
# use 75% quantile to plot the graph and histograms -- excluding extreme values
inc_75 = df.describe().loc['75%', 'annual_inc']
inc_below75 = df.annual_inc[df.annual_inc <= inc_75]
count_75 = int(len(df)*0.75)
ax1 = fig.add_subplot(312)
ax1.plot(range(count_75), sorted(df.annual_inc)[:count_75], '.', color='blue')
ax1.set_xlabel('Loan Applicant Count')
ax1.set_ylabel('Applicant Annual Income ($)')
ax1.set_title('Fig 2b - Sorted Applicant Annual Income-75% ($) (2015)',size=15)
# all_ histogram
# pick upper bound 900 to exclude too large numbers
ax2 = fig.add_subplot(313)
ax2.hist(df.annual_inc, range=(df.annual_inc.min(), inc_75), color='blue')
ax2.set_xlabel('Applicant Annual Income -$', size=12)
ax2.set_ylabel('Counts',size=12)
ax2.set_title('Fig 2c - Sorted Applicant Income-75% ($) (2015)',size=15)
Out[165]:
Fig 2a and Fig 2b both show the sorted applicant annual income from low to high. The former indicates extreme values, and the latter plots only those values below the 75% quantile, which looks more sensible.
Fig 2c is a histogram showing the distribution of the applicants' income (below 75% quantile).
Obeservation The most frequent annual income amounts of ths applicants are between $40,000 and below $60,000.
In [69]:
4.600000e+04
Out[69]:
In [94]:
# 3. Loan amount and Applicant Annual Income
# View all
pl.figure(figsize=(6,4))
pl.plot(df.annual_inc, df.loan_amnt, '.')
pl.ylim(0, 40000)
pl.xlim(0, 0.2e7) # df.annual_inc.max()
pl.title('Fig 3a - Loan Amount VS Applicant Annual Income_all', size=15)
pl.ylabel('Loan Amount ($)', size=15)
pl.xlabel('Applicant Annual Income ($)', size=15)
Out[94]:
Fig 3a shows the approved loan amount against the applicants' annual income.
Oberservation:
We can see that there are a few people with self-reported income that is very high, while majority of the applicants are with income less than $100,000. These extreme values indicate a possibility of outliers.
Method to deal with Outliers
Locate Outliers using Median-Absolute-Deviation (MAD) test and remove them for further analysis
Pick samples to set outlier range using the mean of the outlier boundries-- the method could be improved by using ramdom sampling
In [102]:
# 3b
pl.figure(figsize=(6,4))
pl.plot(df.annual_inc, df.loan_amnt, '.')
pl.ylim(0, 40000)
pl.xlim(0, inc_75)
pl.title('Fig 3b - Loan Amount VS Applicant Annual Income_75%', size=15)
pl.ylabel('Loan Amount ($)', size=15)
pl.xlabel('Applicant Annual Income ($)', size=15)
Out[102]:
Fig 3b is plot of the loan amount VS applicant annual income with all extreme income amounts being excluded.
Observation:
Now it is clearer to see that there is quite "rigid" standard to determine loan amounts based on income, however, there are still exceptions (sparse points above the "division line".
In [91]:
pl.plot(np.log(df.annual_inc), np.log(df.loan_amnt), '.')
Out[91]:
In [ ]:
In [116]:
# 4. Average loan amount groupby grade
mean_loan_grade = df.groupby('grade')['loan_amnt'].mean()
mean_loan_grade
Out[116]:
In [117]:
sum_loan_grade = df.groupby('grade')['loan_amnt'].sum()
sum_loan_grade
Out[117]:
In [169]:
fig = pl.figure(figsize=(8,12)) #16,5
ax0 = fig.add_subplot(211)
ax0.plot(range(len(mean_loan_grade)), mean_loan_grade, 'o', color='blue')
ax0.set_ylim(0, 23000)
ax0.set_xlim(-0.5, len(mean_loan_grade))
ax0.set_xticks(range(len(mean_loan_grade)))
ax0.set_xticklabels(('A','B','C','D','E','F','G'))
ax0.set_xlabel('Grade')
ax0.set_ylabel('Average Loan Amount ($)')
ax0.set_title('Fig 4a - Average Loan Amount by Grade ($) (2015)', size=15)
ax1 = fig.add_subplot(212)
ax1.plot(range(len(sum_loan_grade)), sum_loan_grade, 'o', color='brown')
ax1.set_ylim(0, 2.3e9)
ax1.set_xlim(-0.5, len(sum_loan_grade))
ax1.set_xticks(range(len(sum_loan_grade)))
ax1.set_xticklabels(('A','B','C','D','E','F','G'))
ax1.set_xlabel('Grade')
ax1.set_ylabel('Total Loan Amount ($)')
ax1.set_title('Fig 4b - Total Loan Amount by Grade ($) (2015)', size=15)
Out[169]:
Fig 4a shows the avereage approved loan amounts corresponded to the grades determined by the Lending Club.
Fig 4b shows the total approved loan amounts corresponded to the grades determined by the Lending Club.
Oberservation:
It is interesting to see that the points in these two charts have different trends-- the total loan amount gets higher from grade A to C, and then fall to a very low level; the average loan amount falls a little from grade A to grade B, and then gradually increases as the grade goes from B to G (increased by more than $5,000 from B to G).
In [ ]: