Lending Club Data



In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.feature_selection import RFE

from sklearn.svm import SVR
from sklearn.svm import LinearSVC
from sklearn.svm import LinearSVR

import seaborn as sns
import matplotlib.pylab as pl
%matplotlib inline
#import matplotlib.pyplot as plt

Columns Interested

loan_status -- Current status of the loan

loan_amnt -- The listed amount of the loan applied for by the borrower. If at some point in time, the credit department reduces the loan amount, then it will be reflected in this value.
int_rate -- interest rate of the loan
grade -- LC assigned loan grade
sub_grade -- LC assigned sub loan grade

purpose -- A category provided by the borrower for the loan request.
-- dummy

annual_inc -- The self-reported annual income provided by the borrower during registration.
emp_length -- Employment length in years. Possible values are between 0 and 10 where 0 means less than one year and 10 means ten or more years.
-- dummie

fico_range_low fico_range_high

home_ownership -- The home ownership status provided by the borrower during registration or obtained from the credit report. Our values are: RENT, OWN, MORTGAGE, OTHER
tot_cur_bal -- Total current balance of all accounts
num_actv_bc_tl -- number of active bank accounts
(avg_cur_bal -- average current balance of all accounts )

mort_acc -- number of mortgage accounts

num_actv_rev_tl -- Number of currently active revolving trades
dti -- A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower’s self-reported monthly income.
pub_rec_bankruptcies - Number of public record bankruptcies

delinq_amnt --


title -- mths_since_last_delinq -- The number of months since the borrower's last delinquency.
mths_since_recent_revol_delinq -- Months since most recent revolving delinquency.
total_cu_tl -- Number of finance trades
last_credit_pull_d -- The most recent month LC pulled credit for this loan


In [76]:
## 2015
df_app_2015 = pd.read_csv('data/LoanStats3d_securev1.csv.zip', compression='zip', low_memory=False,\
                         header=1)

In [77]:
df_app_2015.loan_status.unique()


Out[77]:
array(['Current', 'Fully Paid', 'Default', 'Charged Off',
       'Late (16-30 days)', 'Late (31-120 days)', 'In Grace Period', nan], dtype=object)

In [78]:
df_app_2015.head(5)


Out[78]:
id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade ... num_tl_90g_dpd_24m num_tl_op_past_12m pct_tl_nvr_dlq percent_bc_gt_75 pub_rec_bankruptcies tax_liens tot_hi_cred_lim total_bal_ex_mort total_bc_limit total_il_high_credit_limit
0 68615169 NaN 16000.0 16000.0 16000.0 60 months 8.49% 328.19 B B1 ... 0.0 3.0 87.5 50.0 0.0 0.0 307343.0 64371.0 23500.0 59526.0
1 68537564 NaN 8000.0 8000.0 8000.0 36 months 10.78% 261.08 B B4 ... 0.0 4.0 97.1 100.0 0.0 0.0 168819.0 148154.0 5800.0 136996.0
2 68536799 NaN 10000.0 10000.0 10000.0 36 months 10.78% 326.35 B B4 ... 0.0 2.0 100.0 33.3 0.0 0.0 190694.0 40681.0 13900.0 29512.0
3 68355089 NaN 24700.0 24700.0 24700.0 36 months 11.99% 820.28 C C1 ... 0.0 2.0 97.4 7.7 0.0 0.0 314017.0 39475.0 79300.0 24667.0
4 68506885 NaN 10000.0 10000.0 10000.0 60 months 11.99% 222.40 C C1 ... 0.0 1.0 100.0 50.0 0.0 0.0 61054.0 41166.0 13100.0 46854.0

5 rows × 115 columns


In [92]:
df_app_2015['delinq_amnt'].unique()


Out[92]:
array([  0.00000000e+00,   6.87000000e+02,   4.00000000e+01,
         3.60000000e+01,   6.00000000e+00,   6.20000000e+01,
         3.30000000e+01,   1.62000000e+02,   1.39400000e+03,
         7.84900000e+03,   8.96000000e+02,   3.76000000e+02,
         1.49200000e+03,   1.40000000e+01,   1.00000000e+00,
         3.03000000e+02,   1.96100000e+03,   6.02000000e+02,
         2.73000000e+03,   3.86650000e+04,   2.59000000e+02,
         1.65000000e+02,   3.39000000e+02,   2.49460000e+04,
         2.89000000e+02,   3.42000000e+02,   2.48000000e+02,
         3.50000000e+01,   5.30000000e+01,   6.50000000e+04,
         1.56000000e+02,   3.00000000e+01,   3.30000000e+02,
         3.43000000e+02,   1.35330000e+04,   7.87000000e+02,
         4.59000000e+03,   1.84000000e+02,   2.00000000e+00,
         1.80000000e+01,   3.34300000e+03,   9.41000000e+02,
         1.33400000e+03,   9.40000000e+01,   1.04300000e+03,
         1.89200000e+03,   4.90000000e+02,   1.26000000e+02,
         2.07080000e+04,   2.58000000e+02,   6.10800000e+03,
         1.14600000e+03,   7.10000000e+01,   2.53000000e+02,
         9.17000000e+02,   3.70000000e+02,   8.82160000e+04,
         1.93000000e+02,   1.64000000e+02,   9.60000000e+01,
         4.40000000e+01,   1.26600000e+03,   4.80000000e+01,
         5.40000000e+01,   1.01000000e+02,   8.90000000e+01,
         4.90000000e+01,   5.10000000e+01,   3.77000000e+02,
         2.20000000e+01,   8.15000000e+02,   1.39200000e+03,
         7.50000000e+01,   4.06200000e+03,   6.27100000e+03,
         5.60000000e+01,   2.52000000e+02,   7.62000000e+02,
         2.49000000e+02,   7.65000000e+02,   1.83000000e+02,
         2.33100000e+03,   5.71000000e+02,   7.38000000e+02,
         1.77520000e+04,   1.85100000e+03,   2.99100000e+03,
         3.02000000e+02,   2.72000000e+02,   7.05000000e+02,
         4.57000000e+02,   6.67000000e+02,   6.81000000e+02,
         1.20000000e+01,   1.00000000e+01,   4.80000000e+02,
         2.67000000e+02,   3.40000000e+01,   1.54000000e+02,
         2.29000000e+02,   4.15000000e+02,   2.90000000e+01,
         1.24000000e+02,   1.26100000e+03,   1.73000000e+02,
         3.84000000e+02,   7.60000000e+01,   5.78000000e+02,
         3.56000000e+02,   8.20000000e+01,   2.46600000e+03,
         2.01000000e+02,   8.14000000e+02,   1.48000000e+02,
         4.29100000e+03,   5.20000000e+01,   1.19000000e+02,
         1.59000000e+02,   7.49000000e+02,   3.16000000e+02,
         5.85000000e+02,   4.91850000e+04,   1.79000000e+02,
         5.70000000e+01,   1.69000000e+02,   1.32000000e+02,
         2.70000000e+01,   1.24800000e+03,   5.42000000e+02,
         2.60000000e+02,   9.66000000e+02,   2.47000000e+02,
         1.90000000e+01,   8.56000000e+02,   8.60000000e+01,
         6.76700000e+03,   7.00000000e+01,   6.00000000e+01,
         1.10000000e+01,   2.00000000e+01,   6.42600000e+03,
         4.79000000e+02,   8.40000000e+01,   1.08000000e+02,
         2.70000000e+02,   7.20000000e+01,   3.33000000e+02,
         5.27100000e+03,   1.71000000e+03,   4.37000000e+02,
         6.65000000e+02,   2.50000000e+01,   1.99000000e+02,
         2.59170000e+04,   1.27000000e+02,   8.42800000e+03,
         4.20000000e+01,   2.93000000e+02,   5.50100000e+03,
         6.36000000e+02,   1.00870000e+04,   1.03800000e+03,
         4.65000000e+02,   9.70000000e+01,   1.74000000e+02,
         4.23660000e+04,   1.22100000e+03,   7.43800000e+03,
         1.04900000e+03,   6.10000000e+01,   7.01200000e+03,
         9.00000000e+00,   9.90000000e+01,   4.84000000e+02,
         2.95000000e+02,   1.09000000e+02,   2.94600000e+03,
         2.32000000e+02,   8.36000000e+02,   3.88100000e+03,
         1.99100000e+03,   1.32200000e+03,   5.61000000e+02,
         1.30000000e+01,   2.34000000e+02,   7.40000000e+01,
         6.38000000e+02,   1.16000000e+02,   4.34000000e+02,
         7.59000000e+02,   1.17000000e+02,   4.89000000e+02,
         2.52400000e+03,   3.50600000e+03,   1.31000000e+02,
         3.15500000e+03,   1.39000000e+02,   2.40000000e+02,
         1.77000000e+02,   1.28710000e+04,   1.43000000e+02,
         5.00000000e+01,   6.59000000e+02,   3.23400000e+03,
         1.31100000e+03,   2.46200000e+03,   2.28000000e+02,
         3.71000000e+02,   3.80000000e+01,   1.38000000e+02,
         3.62400000e+03,   2.40000000e+01,   3.67500000e+03,
         3.24100000e+03,   1.57000000e+02,   1.30000000e+02,
         5.90000000e+01,   2.03000000e+02,   4.56800000e+03,
         8.34000000e+03,   9.56000000e+02,   6.82000000e+02,
         8.00000000e+01,   3.91600000e+03,   1.66000000e+02,
         3.59600000e+03,   8.50000000e+01,   4.97000000e+02,
         5.00000000e+00,   2.15000000e+02,   1.02000000e+02,
         1.29700000e+03,   7.70000000e+01,   5.51000000e+02,
         1.49000000e+02,   1.01900000e+03,   5.27000000e+02,
         1.45000000e+02,   1.28400000e+03,   1.85000000e+02,
         1.06000000e+02,   1.96000000e+02,   2.31000000e+02,
         2.75400000e+03,   2.56900000e+04,   8.10000000e+01,
         5.93000000e+02,   2.07000000e+02,   2.69800000e+03,
         2.11000000e+02,   2.75000000e+02,   3.96000000e+02,
         3.10500000e+03,   1.28000000e+02,   5.80000000e+01,
         6.58400000e+03,   5.28000000e+02,   4.63000000e+02,
         4.10000000e+02,   2.12000000e+02,   1.37700000e+03,
         4.25300000e+03,   2.75700000e+03,   4.18000000e+02,
         5.97000000e+02,   7.80000000e+01,   2.91700000e+03,
         2.48190000e+04,   1.47000000e+02,   1.55600000e+03,
         1.50800000e+03,   1.33180000e+04,   2.00900000e+03,
         2.62000000e+02,   8.00000000e+00,   2.64000000e+02,
         2.08000000e+02,   6.18900000e+03,   1.70100000e+03,
         4.50000000e+01,   3.58500000e+03,   9.93400000e+03,
         1.83400000e+03,   1.73300000e+03,   6.40000000e+01,
         1.44000000e+02,   6.15600000e+03,   4.87000000e+02,
         7.02000000e+02,   4.61000000e+02,   5.25000000e+02,
         4.13000000e+02,   2.08600000e+03,   1.63500000e+03,
         6.05400000e+03,   9.36300000e+03,   1.53000000e+02,
         7.00000000e+02,   9.62000000e+02,   7.41000000e+02,
         4.02000000e+02,   5.29000000e+02,   1.01950000e+04,
         2.10000000e+01,   7.94000000e+02,   3.59000000e+02,
         1.13880000e+04,   1.53140000e+04,   2.16000000e+02,
         2.24500000e+03,   8.11000000e+02,   7.47000000e+02,
         3.43000000e+03,   1.05000000e+02,   1.71000000e+02,
         2.33000000e+02,   1.31500000e+03,   7.69000000e+02,
         1.95500000e+03,   3.20000000e+01,   5.00000000e+02,
         1.63000000e+02,   5.67080000e+04,   4.25000000e+02,
         1.78000000e+02,   1.35000000e+02,   7.98000000e+02,
         1.70000000e+01,   4.03000000e+02,   3.96600000e+03,
         6.33000000e+02,   4.37840000e+04,   1.92000000e+02,
         2.44330000e+04,   1.00000000e+02,   2.39000000e+02,
         7.50000000e+02,   7.57000000e+02,   2.41100000e+03,
         5.33000000e+02,   6.49000000e+02,   1.40400000e+03,
         1.05100000e+03,   1.13700000e+03,   5.48000000e+02,
         1.25200000e+03,   2.13000000e+02,   3.10000000e+01,
         1.22000000e+02,   2.17000000e+02,   3.68000000e+02,
         9.10000000e+01,   3.26200000e+03,   4.24000000e+02,
         1.59177000e+05,   2.96000000e+02,   1.10100000e+03,
         2.27000000e+02,   1.95000000e+02,   9.20000000e+01,
         1.81800000e+03,   1.87000000e+02,   1.55000000e+03,
         2.80000000e+01,   5.50000000e+01,   2.92400000e+03,
         1.50000000e+01,   2.50000000e+02,   3.87000000e+03,
         2.98920000e+04,   4.46000000e+03,   9.91000000e+02,
         8.16300000e+03,   6.91000000e+02,   3.54000000e+02,
         2.43000000e+02,   5.09000000e+02,   1.19590000e+04,
         6.30000000e+01,   2.84790000e+04,   1.00100000e+03,
         2.42200000e+03,   6.45000000e+02,   8.12000000e+02,
         3.47000000e+02,   6.58000000e+02,   1.13660000e+04,
         6.70000000e+01,   7.67350000e+04,   2.81200000e+03,
         2.48410000e+04,   2.27700000e+03,   1.15400000e+03,
         1.13000000e+02,   1.62100000e+03,   1.55000000e+02,
         2.30000000e+01,   3.92000000e+02,   4.61700000e+03,
         8.30000000e+01,   3.24000000e+02,   1.41300000e+03,
         5.34000000e+02,   4.81000000e+02,   8.64000000e+02,
         9.71000000e+02,   7.45000000e+02,   3.10000000e+02,
         1.03000000e+02,   3.62320000e+04,   1.42700000e+03,
         5.06000000e+02,   3.03900000e+03,   8.80000000e+01,
         4.78000000e+02,   5.36190000e+04,   4.70000000e+01,
         5.88450000e+04,   1.08500000e+03,   2.35000000e+02,
         1.33000000e+02,   1.44000000e+03,   3.12500000e+03,
         1.57700000e+03,   1.37100000e+03,   5.35800000e+03,
         5.68000000e+02,   1.30990000e+04,   4.22000000e+02,
         3.22900000e+03,   1.36000000e+02,   3.25180000e+04,
         4.02610000e+04,   1.17000000e+03,   2.97000000e+02,
         9.82000000e+02,   1.87000000e+03,   3.00000000e+00,
         1.72900000e+03,   7.30000000e+01,   1.72100000e+03,
         9.92000000e+02,   9.28000000e+02,   7.80000000e+02,
         9.88000000e+02,   3.90000000e+01,   1.68000000e+02,
         9.02000000e+02,   2.73390000e+04,   1.37600000e+03,
         3.44900000e+03,   1.02300000e+03,   1.85800000e+03,
         1.90440000e+04,   3.03500000e+03,   2.93500000e+03,
         9.57000000e+02,   4.29040000e+04,   2.56000000e+02,
         2.66900000e+03,   1.51800000e+03,   1.91000000e+02,
         3.20000000e+02,   3.18000000e+02,   1.86000000e+02,
         4.35800000e+03,   3.88000000e+02,   3.04000000e+02,
         2.20000000e+02,   2.14000000e+02,   2.50000000e+03,
         4.60000000e+01,   1.22700000e+03,   1.11800000e+03,
         1.09360000e+04,   3.73100000e+03,   4.26000000e+02,
         1.61000000e+02,   5.32020000e+04,   6.50000000e+01,
         1.69690000e+04,   9.44000000e+02,   1.60000000e+01,
         4.42000000e+02,   4.38000000e+02,   6.24000000e+02,
         5.38000000e+02,   6.68000000e+02,   5.22100000e+03,
         9.79000000e+02,   1.97000000e+02,   6.04000000e+02,
         7.00000000e+00,   1.10000000e+02,   1.69000000e+04,
         1.20000000e+02,   3.80000000e+02,   6.80000000e+02,
         1.49700000e+03,   3.28600000e+03,   4.81900000e+03,
         6.18000000e+02,   4.10000000e+01,   1.87300000e+04,
         1.31400000e+03,   5.26500000e+03,   2.38200000e+03,
         3.13100000e+03,   2.95500000e+03,   1.44700000e+03,
         5.46000000e+02,   3.05000000e+02,   1.18000000e+02,
         9.55000000e+02,   8.20000000e+02,   3.17000000e+02,
         2.47900000e+04,   4.59000000e+02,   4.14000000e+02,
         8.35000000e+02,   1.04000000e+02,   1.07000000e+02,
         2.41000000e+02,   5.59000000e+02,   5.39000000e+02,
         9.12000000e+02,   3.48000000e+02,   2.06000000e+02,
         2.95200000e+03,   2.30000000e+02,   2.47400000e+03,
         3.69000000e+02,   1.10300000e+03,   5.16900000e+03,
         2.61000000e+02,   2.66000000e+02,   2.63300000e+03,
         4.08200000e+03,   2.51000000e+02,   1.51000000e+02,
         1.85000000e+03,   3.68600000e+03,   4.23000000e+02,
         1.32600000e+03,   1.42000000e+03,   9.70000000e+02,
         4.69000000e+02,   1.93300000e+03,   1.00800000e+03,
         7.17000000e+02,   9.30000000e+01,   4.74000000e+02,
         2.88600000e+03,   2.21400000e+03,   1.67800000e+03,
         4.20000000e+02,   3.25300000e+03,   3.50000000e+02,
         3.60400000e+03,   1.46000000e+02,   2.30720000e+04,
         5.69000000e+02,   5.65000000e+02,   1.75000000e+03,
         1.09000000e+03,   1.67900000e+03,   3.66900000e+03,
         3.51000000e+02,   1.60000000e+03,   2.55200000e+03,
         8.03000000e+02,   6.72000000e+02,   1.40600000e+03,
         6.41000000e+02,   7.22420000e+04,   4.36000000e+02,
         1.10950000e+04,   1.02130000e+04,   5.87930000e+04,
         3.78000000e+02,   2.91000000e+02,   1.23000000e+02,
         8.90000000e+02,   1.04800000e+04,   2.73000000e+02,
         6.02100000e+03,   8.70000000e+01,   6.97700000e+03,
         3.61000000e+02,   4.55400000e+03,   9.45210000e+04,
         6.90000000e+01,   3.98000000e+02,   6.77000000e+02,
         5.88800000e+03,   1.13750000e+04,   4.12000000e+02,
         6.66600000e+03,   2.26000000e+02,   6.95000000e+02,
         4.45000000e+02,   2.71630000e+04,   4.85000000e+02,
         2.17460000e+04,   5.92000000e+02,   3.00700000e+03,
         3.97000000e+02,   5.12000000e+02,   8.53000000e+03,
         6.93000000e+02,   1.37000000e+02,   9.46000000e+02,
         2.94000000e+02,   6.17000000e+02,   2.42000000e+02,
         1.98000000e+02,   1.36470000e+04,   4.29300000e+03,
         3.73000000e+02,   2.00500000e+03,   2.68000000e+02,
         5.03000000e+02,   1.75300000e+03,   1.86000000e+03,
         8.87800000e+03,   4.48000000e+02,   2.76600000e+03,
         1.02470000e+04,   4.40130000e+04,   2.74000000e+02,
         8.01000000e+02,   4.40000000e+02,   8.77000000e+02,
         5.24200000e+03,   1.42000000e+02,   1.91600000e+03,
         7.92000000e+02,   2.54000000e+02,   4.49000000e+02,
         5.99000000e+02,   5.15940000e+04,   4.72510000e+04,
         1.58480000e+04,   6.27000000e+02,   1.88000000e+02,
         1.83900000e+04,   2.00000000e+02,   4.30000000e+01,
         6.73200000e+03,   1.52000000e+02,   2.00000000e+03,
         2.55000000e+02,   4.93950000e+04,   1.20830000e+04,
         4.19000000e+02,   3.76200000e+03,   1.60000000e+02,
         2.67100000e+03,   1.25400000e+03,   2.10000000e+02,
         3.67000000e+02,   3.93000000e+02,   3.40000000e+03,
         4.60000000e+02,   8.80000000e+02,   1.46800000e+03,
         2.79000000e+02,   9.73000000e+02,   1.50700000e+03,
         9.00000000e+01,   1.14000000e+02,   3.76000000e+03,
         5.63380000e+04,   1.58300000e+03,   4.88000000e+02,
         2.00440000e+04,   2.00300000e+03,   4.29260000e+04,
         2.14500000e+03,   4.94000000e+02,   2.19000000e+02,
         3.70000000e+01,   2.12400000e+03,   3.79800000e+03,
         6.30400000e+03,   6.53000000e+02,   6.61000000e+02,
         1.21000000e+02,   1.95300000e+03,   6.60000000e+01,
         2.92000000e+02,   4.67930000e+04,   1.77100000e+03,
         4.30000000e+02,   3.90600000e+03,   5.14200000e+03,
         2.88000000e+02,   3.30700000e+03,   7.51400000e+03,
         1.85780000e+04,   6.97000000e+02,   1.97850000e+04,
         6.64970000e+04,   4.53000000e+02,   5.19000000e+02,
         3.12000000e+02,   1.05900000e+03,   2.45000000e+02,
         1.56100000e+03,   1.94000000e+02,   8.90000000e+03,
         2.09000000e+02,   4.59450000e+04,   1.64700000e+03,
         3.63900000e+03,   2.69200000e+03,   9.31000000e+02,
         4.51000000e+02,   3.36000000e+02,   2.65480000e+04,
         1.16300000e+03,   3.93200000e+03,   1.22900000e+03,
         1.91000000e+03,   8.57000000e+02,   1.00000000e+03,
         6.38670000e+04,   4.47000000e+02,   2.40500000e+04,
         3.60780000e+04,   5.79000000e+02,   6.51000000e+02,
         3.53000000e+02,   5.81000000e+02,   1.00400000e+03,
         2.44000000e+02,   7.68000000e+02,   3.75000000e+02,
         4.09000000e+02,   2.83000000e+02,   1.34000000e+02,
         3.65000000e+02,   1.76000000e+02,   2.99000000e+02,
         1.40180000e+04,   5.56000000e+02,   4.81940000e+04,
         1.94500000e+03,   2.41700000e+04,   1.57400000e+03,
         2.59200000e+03,   3.32000000e+02,   6.47000000e+02,
         1.25000000e+02,   1.72000000e+03,   3.56700000e+03,
         2.13000000e+03,   1.86900000e+03,   3.38000000e+02,
         1.85900000e+03,   4.16000000e+02,   3.21000000e+02,
         2.01900000e+03,   3.08000000e+02,   5.52000000e+02,
         3.86000000e+02,   1.33600000e+03,   4.00000000e+00,
         3.60000000e+02,   2.21000000e+02,   1.90800000e+03,
         2.75000000e+03,   3.94000000e+02,   6.98000000e+02,
         2.89200000e+03,   3.82200000e+03,   1.76100000e+03,
         3.63000000e+02,   9.14000000e+02,   7.36000000e+02,
         3.15000000e+02,   4.96000000e+02,   1.80000000e+02,
         8.26000000e+02,   8.63990000e+04,   4.62000000e+02,
         8.19000000e+02,   3.24400000e+03,   6.80000000e+01,
         6.30800000e+03,   4.82000000e+02,   3.79000000e+03,
         1.81700000e+03,   1.50000000e+02,   4.11500000e+03,
         3.34000000e+02,   2.87000000e+02,   8.38000000e+02,
         2.91990000e+04,   1.75230000e+04,   2.22000000e+02,
         9.90300000e+03,   5.84000000e+02,   1.80700000e+03,
         4.27100000e+03,   1.39600000e+03,   5.66000000e+02,
         2.25700000e+03,   7.32000000e+02,   2.90400000e+03,
         2.84300000e+03,   1.06200000e+03,   2.59550000e+04,
         2.53200000e+03,   5.26000000e+02,   1.67100000e+03,
         3.49300000e+03,   1.97300000e+03,   1.81000000e+02,
         5.44000000e+02,   2.77000000e+02,   3.00000000e+02,
         1.25800000e+03,   1.03700000e+03,   1.25000000e+03,
         3.28000000e+02,   1.47400000e+03,   1.44300000e+03,
         9.24000000e+02,   1.34800000e+03,   1.71400000e+03,
         3.79000000e+02,   9.84900000e+03,   9.64000000e+02,
         4.94300000e+03,   3.81000000e+02,   4.17000000e+02,
         2.98000000e+02,   7.54800000e+03,   7.67000000e+02,
         1.45000000e+03,   6.06480000e+04,   1.51040000e+04,
         3.97800000e+03,   7.88000000e+02,   3.01000000e+02,
         1.47460000e+04,   1.34000000e+03,   2.46000000e+02,
         4.05000000e+02,   8.60000000e+02,   3.37000000e+02,
         3.44000000e+02,   2.34000000e+03,   1.93700000e+03,
         7.10000000e+02,   7.90000000e+02,   5.55800000e+03,
         2.71000000e+02,   1.06700000e+04,   3.84300000e+03,
         7.31000000e+03,   5.32700000e+03,   7.73000000e+02,
         1.79000000e+03,   9.50000000e+01,   4.88600000e+03,
         2.27300000e+03,   3.95000000e+02,   3.86500000e+03,
         2.00570000e+04,   1.87200000e+03,   6.64000000e+02,
         1.07100000e+03,   7.35000000e+02,   3.22200000e+03,
         1.43300000e+03,   4.10200000e+03,   3.27000000e+02,
         5.58000000e+03,   2.63000000e+02,   4.41000000e+02,
         4.99000000e+02,   4.27260000e+04,   1.82000000e+02,
         8.49000000e+02,   1.00500000e+03,   1.75000000e+02,
         1.19000000e+03,   3.06000000e+02,   3.55000000e+02,
         8.95000000e+02,   5.98000000e+02,   1.37900000e+03,
         4.66000000e+02,   3.72000000e+02,   1.39300000e+03,
         5.28000000e+03,   2.74800000e+03,   2.18230000e+04,
         2.84000000e+02,   4.32800000e+03,   1.74600000e+03,
         1.00300000e+03,   2.85700000e+03,   5.96000000e+02,
         2.57000000e+02,   4.48900000e+03,   1.38430000e+04,
         2.65000000e+02,   7.19000000e+02,   2.95000000e+03,
         8.25000000e+02,   6.23590000e+04,   6.16000000e+02,
         1.12350000e+04,   5.08000000e+02,   6.39000000e+02,
         5.74000000e+02,   4.83000000e+02,   2.77720000e+04,
         1.15500000e+03,   1.81900000e+03,   2.36200000e+03,
         2.13500000e+03,   1.12720000e+04,   2.51230000e+04,
                    nan])

In [32]:
df_app_2015.info(max_cols=111)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421097 entries, 0 to 421096
Data columns (total 111 columns):
id                                2 non-null object
member_id                         0 non-null float64
loan_amnt                         421095 non-null float64
funded_amnt                       421095 non-null float64
funded_amnt_inv                   421095 non-null float64
term                              421095 non-null object
int_rate                          421095 non-null object
installment                       421095 non-null float64
grade                             421095 non-null object
sub_grade                         421095 non-null object
emp_title                         397221 non-null object
emp_length                        421095 non-null object
home_ownership                    421095 non-null object
annual_inc                        421095 non-null float64
verification_status               421095 non-null object
issue_d                           421095 non-null object
loan_status                       421095 non-null object
pymnt_plan                        421095 non-null object
url                               0 non-null float64
desc                              45 non-null object
purpose                           421095 non-null object
title                             420963 non-null object
zip_code                          421095 non-null object
addr_state                        421095 non-null object
dti                               421095 non-null float64
delinq_2yrs                       421095 non-null float64
earliest_cr_line                  421095 non-null object
inq_last_6mths                    421095 non-null float64
mths_since_last_delinq            217133 non-null float64
mths_since_last_record            74415 non-null float64
open_acc                          421095 non-null float64
pub_rec                           421095 non-null float64
revol_bal                         421095 non-null float64
revol_util                        420933 non-null object
total_acc                         421095 non-null float64
initial_list_status               421095 non-null object
out_prncp                         421095 non-null float64
out_prncp_inv                     421095 non-null float64
total_pymnt                       421095 non-null float64
total_pymnt_inv                   421095 non-null float64
total_rec_prncp                   421095 non-null float64
total_rec_int                     421095 non-null float64
total_rec_late_fee                421095 non-null float64
recoveries                        421095 non-null float64
collection_recovery_fee           421095 non-null float64
last_pymnt_d                      420801 non-null object
last_pymnt_amnt                   421095 non-null float64
next_pymnt_d                      292711 non-null object
last_credit_pull_d                421087 non-null object
collections_12_mths_ex_med        421095 non-null float64
mths_since_last_major_derog       122729 non-null float64
policy_code                       421095 non-null float64
application_type                  421095 non-null object
annual_inc_joint                  511 non-null float64
dti_joint                         509 non-null float64
verification_status_joint         511 non-null object
acc_now_delinq                    421095 non-null float64
tot_coll_amt                      421095 non-null float64
tot_cur_bal                       421095 non-null float64
open_acc_6m                       21372 non-null float64
open_il_6m                        21372 non-null float64
open_il_12m                       21372 non-null float64
open_il_24m                       21372 non-null float64
mths_since_rcnt_il                20810 non-null float64
total_bal_il                      21372 non-null float64
il_util                           18617 non-null float64
open_rv_12m                       21372 non-null float64
open_rv_24m                       21372 non-null float64
max_bal_bc                        21372 non-null float64
all_util                          21372 non-null float64
total_rev_hi_lim                  421095 non-null float64
inq_fi                            21372 non-null float64
total_cu_tl                       21372 non-null float64
inq_last_12m                      21372 non-null float64
acc_open_past_24mths              421095 non-null float64
avg_cur_bal                       421095 non-null float64
bc_open_to_buy                    417132 non-null float64
bc_util                           416868 non-null float64
chargeoff_within_12_mths          421095 non-null float64
delinq_amnt                       421095 non-null float64
mo_sin_old_il_acct                408841 non-null float64
mo_sin_old_rev_tl_op              421095 non-null float64
mo_sin_rcnt_rev_tl_op             421095 non-null float64
mo_sin_rcnt_tl                    421095 non-null float64
mort_acc                          421095 non-null float64
mths_since_recent_bc              417297 non-null float64
mths_since_recent_bc_dlq          108600 non-null float64
mths_since_recent_inq             376496 non-null float64
mths_since_recent_revol_delinq    151737 non-null float64
num_accts_ever_120_pd             421095 non-null float64
num_actv_bc_tl                    421095 non-null float64
num_actv_rev_tl                   421095 non-null float64
num_bc_sats                       421095 non-null float64
num_bc_tl                         421095 non-null float64
num_il_tl                         421095 non-null float64
num_op_rev_tl                     421095 non-null float64
num_rev_accts                     421094 non-null float64
num_rev_tl_bal_gt_0               421095 non-null float64
num_sats                          421095 non-null float64
num_tl_120dpd_2m                  401865 non-null float64
num_tl_30dpd                      421095 non-null float64
num_tl_90g_dpd_24m                421095 non-null float64
num_tl_op_past_12m                421095 non-null float64
pct_tl_nvr_dlq                    421095 non-null float64
percent_bc_gt_75                  416856 non-null float64
pub_rec_bankruptcies              421095 non-null float64
tax_liens                         421095 non-null float64
tot_hi_cred_lim                   421095 non-null float64
total_bal_ex_mort                 421095 non-null float64
total_bc_limit                    421095 non-null float64
total_il_high_credit_limit        421095 non-null float64
dtypes: float64(86), object(25)
memory usage: 356.6+ MB

In [38]:
df_app_2015.groupby('title').loan_amnt.mean()


Out[38]:
title
Business                                   15607.365207
Car financing                              10082.201439
Credit Card/Auto Repair                    10950.000000
Credit card refinancing                    15953.246747
Debt consolidation                         15755.709990
DebtC                                      20000.000000
Green loan                                  9670.964126
Home buying                                14547.170139
Home improvement                           14748.540413
Learning and training                       2200.000000
Major purchase                             13055.713613
Medical expenses                            9200.547631
Moving and relocation                       8315.978530
New Baby and New House (CC Consolidate)     8000.000000
Other                                      10340.992752
Pay off Lowes Card                         10000.000000
Paying off higher interest cards & auto    21000.000000
Prescription Drug and Medical Costs         8000.000000
SAVE                                       12000.000000
Simple Loan Until Contract Is Completed    25000.000000
Student Loan                               10800.000000
Trying to come back to reality!             6000.000000
Vacation                                    6526.302315
considerate                                25000.000000
new day                                    23200.000000
new kitchen for momma!                      8400.000000
odymeds                                     6400.000000
Name: loan_amnt, dtype: float64

In [39]:
df_app_2015.groupby('purpose').loan_amnt.mean()


Out[39]:
purpose
car                   10066.423832
credit_card           15951.617741
debt_consolidation    15756.119210
educational            2200.000000
home_improvement      14751.434191
house                 14542.715577
major_purchase        13058.913948
medical                9215.217115
moving                 8314.121901
other                 10345.197615
renewable_energy       9824.441964
small_business        15642.798751
vacation               6524.455313
wedding                8825.000000
Name: loan_amnt, dtype: float64

In [7]:
df_app_2015['emp_length'].unique()


Out[7]:
array(['5 years', '< 1 year', '3 years', '10+ years', '2 years', 'n/a',
       '4 years', '8 years', '6 years', '9 years', '1 year', '7 years', nan], dtype=object)

Decriptive Analyss

  1. Annual income distribution
  2. Total loan amount groupby interest rate chunks
  3. Average loan amount groupby grade
  4. Average loan amount groupby

In [82]:
## selected columns
df = df_app_2015.ix[:, ['loan_status','loan_amnt', 'int_rate', 'grade', 'sub_grade',\
                        'purpose',\
                        'annual_inc', 'emp_length', 'home_ownership',\
                        'fico_range_low','fico_range_high',\
                        'num_actv_bc_tl', 'tot_cur_bal', 'mort_acc','num_actv_rev_tl',\
                        'pub_rec_bankruptcies','dti' ]]

In [83]:
df.head(3)


Out[83]:
loan_status loan_amnt int_rate grade sub_grade purpose annual_inc emp_length home_ownership fico_range_low fico_range_high num_actv_bc_tl tot_cur_bal mort_acc num_actv_rev_tl pub_rec_bankruptcies dti
0 Current 16000.0 8.49% B B1 credit_card 62000.0 5 years MORTGAGE 720.0 724.0 4.0 227708.0 3.0 5.0 0.0 28.92
1 Current 8000.0 10.78% B B4 debt_consolidation 45000.0 < 1 year MORTGAGE 670.0 674.0 3.0 148154.0 2.0 11.0 0.0 21.23
2 Current 10000.0 10.78% B B4 debt_consolidation 41600.0 3 years MORTGAGE 695.0 699.0 2.0 168304.0 2.0 5.0 0.0 15.78

In [91]:
len(df.dropna())


Out[91]:
421095

In [85]:
df.shape


Out[85]:
(421097, 17)

In [86]:
df.loan_status.unique()


Out[86]:
array(['Current', 'Fully Paid', 'Default', 'Charged Off',
       'Late (16-30 days)', 'Late (31-120 days)', 'In Grace Period', nan], dtype=object)

In [87]:
len(df[df['loan_status']=='Fully Paid'])


Out[87]:
94968

In [88]:
len(df[df['loan_status']=='Default'])


Out[88]:
260

In [89]:
len(df[df['loan_status']=='Charged Off'])


Out[89]:
33416

In [90]:
len(df[df['loan_status']=='Late (31-120 days)'])


Out[90]:
8626

In [65]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421097 entries, 0 to 421096
Data columns (total 15 columns):
loan_status             421095 non-null object
loan_amnt               421095 non-null float64
int_rate                421095 non-null object
grade                   421095 non-null object
sub_grade               421095 non-null object
purpose                 421095 non-null object
title                   420963 non-null object
annual_inc              421095 non-null float64
emp_length              421095 non-null object
home_ownership          421095 non-null object
num_actv_bc_tl          421095 non-null float64
tot_cur_bal             421095 non-null float64
mort_acc                421095 non-null float64
num_actv_rev_tl         421095 non-null float64
pub_rec_bankruptcies    421095 non-null float64
dtypes: float64(7), object(8)
memory usage: 48.2+ MB

In [29]:
df.loan_status.unique()


Out[29]:
array(['Current', 'Fully Paid', 'Default', 'Charged Off',
       'Late (16-30 days)', 'Late (31-120 days)', 'In Grace Period', nan], dtype=object)

In [30]:
## Convert applicable fields to numeric (I only select "Interest Rate" to use for this analysis)
df.ix[:,'int_rate'] = df.ix[:,['int_rate']]\
                        .applymap(lambda e: pd.to_numeric(str(e).rstrip()[:-1], errors='coerce'))
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421097 entries, 0 to 421096
Data columns (total 18 columns):
loan_status                       421095 non-null object
loan_amnt                         421095 non-null float64
int_rate                          421095 non-null float64
grade                             421095 non-null object
sub_grade                         421095 non-null object
purpose                           421095 non-null object
annual_inc                        421095 non-null float64
emp_length                        421095 non-null object
home_ownership                    421095 non-null object
num_actv_bc_tl                    421095 non-null float64
tot_cur_bal                       421095 non-null float64
mort_acc                          421095 non-null float64
num_actv_rev_tl                   421095 non-null float64
total_cu_tl                       21372 non-null float64
pub_rec_bankruptcies              421095 non-null float64
mths_since_last_delinq            217133 non-null float64
mths_since_recent_bc_dlq          108600 non-null float64
mths_since_recent_revol_delinq    151737 non-null float64
dtypes: float64(12), object(6)
memory usage: 57.8+ MB

In [57]:
df = df.rename(columns={"int_rate": "int_rate(%)"})

In [58]:
df.head(3)


Out[58]:
loan_status loan_amnt int_rate(%) grade sub_grade annual_inc num_actv_bc_tl avg_cur_bal mort_acc
0 Current 16000.0 8.49 B B1 62000.0 4.0 22771.0 3.0
1 Current 8000.0 10.78 B B4 45000.0 3.0 5926.0 2.0
2 Current 10000.0 10.78 B B4 41600.0 2.0 9350.0 2.0

In [7]:
#len(df.dropna(thresh=  , axis=1).columns)


Out[7]:
52

In [59]:
df.describe()


Out[59]:
loan_amnt int_rate(%) annual_inc num_actv_bc_tl avg_cur_bal mort_acc
count 421095.000000 421095.000000 4.210950e+05 421095.000000 421095.000000 421095.000000
mean 15240.285862 12.600374 7.696561e+04 3.740966 13104.784721 1.685261
std 8571.325901 4.318474 7.394996e+04 2.304542 15671.803897 1.990160
min 1000.000000 5.320000 0.000000e+00 0.000000 0.000000 0.000000
25% 8500.000000 9.170000 4.600000e+04 2.000000 3199.000000 0.000000
50% 14000.000000 12.290000 6.500000e+04 3.000000 7262.000000 1.000000
75% 20000.000000 15.590000 9.169050e+04 5.000000 17974.500000 3.000000
max 35000.000000 28.990000 9.500000e+06 32.000000 555925.000000 52.000000

In [168]:
# 1. Loan Amount distribution
# # create plots and histogram to visualize total loan amounts 
fig = pl.figure(figsize=(8,10))
ax1 = fig.add_subplot(211)
ax1.plot(range(len(df)), sorted(df.loan_amnt), '.', color='purple')
ax1.set_xlabel('Loan Applicant Count')
ax1.set_ylabel('Loan Amount ($)')
ax1.set_title('Fig 1a - Sorted Issued Loan Amount (2015)', size=15)

# all_ histogram
# pick upper bound 900 to exclude too large numbers
ax2 = fig.add_subplot(212)
ax2.hist(df.loan_amnt, range=(df.loan_amnt.min(), 36000), color='purple')
ax2.set_xlabel('Loan Amount -$', size=12)
ax2.set_ylabel('Counts',size=12)
ax2.set_title('Fig 1b - Sorted Issued Loan Amount (2015)', size=15)


Out[168]:
<matplotlib.text.Text at 0x1b84f3750>

Fig 1a shows the sorted issued loan amounts from low to high.
Fig 2c is a histogram showing the distribution of the issued loan amounts.

Obeservation
The Loan amounts vary from $1000 to $35,000, and the most frequent loan amounts issued are around $10,000.


In [81]:
inc_75 = df.describe().loc['75%', 'annual_inc']
count_75 = int(len(df)*0.75)


Out[81]:
315822

In [165]:
# 2. Applicant Anual Income Distribution

fig = pl.figure(figsize=(8,16))

ax0 = fig.add_subplot(311)
ax0.plot(range(len(df.annual_inc)), sorted(df.annual_inc), '.', color='blue')
ax0.set_xlabel('Loan Applicant Count')
ax0.set_ylabel('Applicant Annual Income ($)')
ax0.set_title('Fig 2a - Sorted Applicant Annual Income-all ($) (2015)', size=15)

# use 75% quantile to plot the graph and histograms -- excluding extreme values
inc_75 = df.describe().loc['75%', 'annual_inc']
inc_below75 = df.annual_inc[df.annual_inc <= inc_75]
count_75 = int(len(df)*0.75)

ax1 = fig.add_subplot(312)
ax1.plot(range(count_75), sorted(df.annual_inc)[:count_75], '.', color='blue')
ax1.set_xlabel('Loan Applicant Count')
ax1.set_ylabel('Applicant Annual Income ($)')
ax1.set_title('Fig 2b - Sorted Applicant Annual Income-75% ($) (2015)',size=15)

# all_ histogram
# pick upper bound 900 to exclude too large numbers
ax2 = fig.add_subplot(313)
ax2.hist(df.annual_inc, range=(df.annual_inc.min(), inc_75), color='blue')
ax2.set_xlabel('Applicant Annual Income -$', size=12)
ax2.set_ylabel('Counts',size=12)
ax2.set_title('Fig 2c - Sorted Applicant Income-75% ($) (2015)',size=15)


Out[165]:
<matplotlib.text.Text at 0x1afc72fd0>

Fig 2a and Fig 2b both show the sorted applicant annual income from low to high. The former indicates extreme values, and the latter plots only those values below the 75% quantile, which looks more sensible.
Fig 2c is a histogram showing the distribution of the applicants' income (below 75% quantile).

Obeservation The most frequent annual income amounts of ths applicants are between $40,000 and below $60,000.


In [69]:
4.600000e+04


Out[69]:
46000.0

In [94]:
# 3. Loan amount and Applicant Annual Income
# View all
pl.figure(figsize=(6,4))
pl.plot(df.annual_inc, df.loan_amnt, '.')
pl.ylim(0, 40000)
pl.xlim(0, 0.2e7)   # df.annual_inc.max()
pl.title('Fig 3a - Loan Amount VS Applicant Annual Income_all', size=15)
pl.ylabel('Loan Amount ($)', size=15)
pl.xlabel('Applicant Annual Income ($)', size=15)


Out[94]:
<matplotlib.text.Text at 0x128701910>

Fig 3a shows the approved loan amount against the applicants' annual income.

Oberservation:
We can see that there are a few people with self-reported income that is very high, while majority of the applicants are with income less than $100,000. These extreme values indicate a possibility of outliers.

Method to deal with Outliers
Locate Outliers using Median-Absolute-Deviation (MAD) test and remove them for further analysis

Pick samples to set outlier range using the mean of the outlier boundries-- the method could be improved by using ramdom sampling


In [102]:
# 3b
pl.figure(figsize=(6,4))
pl.plot(df.annual_inc, df.loan_amnt, '.')
pl.ylim(0, 40000)
pl.xlim(0, inc_75)
pl.title('Fig 3b - Loan Amount VS Applicant Annual Income_75%', size=15)
pl.ylabel('Loan Amount ($)', size=15)
pl.xlabel('Applicant Annual Income ($)', size=15)


Out[102]:
<matplotlib.text.Text at 0x14d90a750>

Fig 3b is plot of the loan amount VS applicant annual income with all extreme income amounts being excluded.

Observation:
Now it is clearer to see that there is quite "rigid" standard to determine loan amounts based on income, however, there are still exceptions (sparse points above the "division line".


In [91]:
pl.plot(np.log(df.annual_inc), np.log(df.loan_amnt), '.')


/Users/fatkay22/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:1: RuntimeWarning: divide by zero encountered in log
  if __name__ == '__main__':
Out[91]:
[<matplotlib.lines.Line2D at 0x139cd3c90>]

In [ ]:


In [116]:
# 4. Average loan amount groupby grade
mean_loan_grade = df.groupby('grade')['loan_amnt'].mean()
mean_loan_grade


Out[116]:
grade
A    14691.902340
B    14251.806455
C    14745.592285
D    15947.183739
E    18472.726622
F    20090.274524
G    20427.838025
Name: loan_amnt, dtype: float64

In [117]:
sum_loan_grade = df.groupby('grade')['loan_amnt'].sum()
sum_loan_grade


Out[117]:
grade
A    1.077445e+09
B    1.676098e+09
C    1.777832e+09
D    9.991548e+08
E    6.455848e+08
F    1.972262e+08
G    4.426712e+07
Name: loan_amnt, dtype: float64

In [169]:
fig = pl.figure(figsize=(8,12))  #16,5

ax0 = fig.add_subplot(211)
ax0.plot(range(len(mean_loan_grade)), mean_loan_grade, 'o', color='blue')

ax0.set_ylim(0, 23000)
ax0.set_xlim(-0.5, len(mean_loan_grade))

ax0.set_xticks(range(len(mean_loan_grade)))
ax0.set_xticklabels(('A','B','C','D','E','F','G'))
ax0.set_xlabel('Grade')
ax0.set_ylabel('Average Loan Amount ($)')
ax0.set_title('Fig 4a - Average Loan Amount by Grade ($) (2015)', size=15)


ax1 = fig.add_subplot(212)
ax1.plot(range(len(sum_loan_grade)), sum_loan_grade, 'o', color='brown')

ax1.set_ylim(0, 2.3e9)
ax1.set_xlim(-0.5, len(sum_loan_grade))

ax1.set_xticks(range(len(sum_loan_grade)))
ax1.set_xticklabels(('A','B','C','D','E','F','G'))
ax1.set_xlabel('Grade')
ax1.set_ylabel('Total Loan Amount ($)')
ax1.set_title('Fig 4b - Total Loan Amount by Grade ($) (2015)', size=15)


Out[169]:
<matplotlib.text.Text at 0x15e56b510>

Fig 4a shows the avereage approved loan amounts corresponded to the grades determined by the Lending Club.
Fig 4b shows the total approved loan amounts corresponded to the grades determined by the Lending Club.

Oberservation:
It is interesting to see that the points in these two charts have different trends-- the total loan amount gets higher from grade A to C, and then fall to a very low level; the average loan amount falls a little from grade A to grade B, and then gradually increases as the grade goes from B to G (increased by more than $5,000 from B to G).


In [ ]: