In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import cross_val_score
In [13]:
df = pd.read_csv('LoanStats3d_securev1.csv.zip', compression='zip',header=1)
In [5]:
df.head()
Out[5]:
id
member_id
loan_amnt
funded_amnt
funded_amnt_inv
term
int_rate
installment
grade
sub_grade
...
num_tl_90g_dpd_24m
num_tl_op_past_12m
pct_tl_nvr_dlq
percent_bc_gt_75
pub_rec_bankruptcies
tax_liens
tot_hi_cred_lim
total_bal_ex_mort
total_bc_limit
total_il_high_credit_limit
0
68615169
NaN
16000.0
16000.0
16000.0
60 months
8.49%
328.19
B
B1
...
0.0
3.0
87.5
50.0
0.0
0.0
307343.0
64371.0
23500.0
59526.0
1
68537564
NaN
8000.0
8000.0
8000.0
36 months
10.78%
261.08
B
B4
...
0.0
4.0
97.1
100.0
0.0
0.0
168819.0
148154.0
5800.0
136996.0
2
68536799
NaN
10000.0
10000.0
10000.0
36 months
10.78%
326.35
B
B4
...
0.0
2.0
100.0
33.3
0.0
0.0
190694.0
40681.0
13900.0
29512.0
3
68355089
NaN
24700.0
24700.0
24700.0
36 months
11.99%
820.28
C
C1
...
0.0
2.0
97.4
7.7
0.0
0.0
314017.0
39475.0
79300.0
24667.0
4
68506885
NaN
10000.0
10000.0
10000.0
60 months
11.99%
222.40
C
C1
...
0.0
1.0
100.0
50.0
0.0
0.0
61054.0
41166.0
13100.0
46854.0
5 rows × 115 columns
In [14]:
defaulters=['Default','Charged Off', 'Late (31-120 days)']
non_defaulters=['Fully Paid']
uncertain = ['Current','Late (16-30 days)','In Grace Period', 'none']
In [15]:
df.loan_status.unique()
Out[15]:
array(['Current', 'Fully Paid', 'Default', 'Charged Off',
'Late (16-30 days)', 'Late (31-120 days)', 'In Grace Period', nan], dtype=object)
In [20]:
df['Target']= 2 ## uncertain
df.loc[df.loan_status.isin(defaulters),'Target'] = 0 ## defaulters
df.loc[df.loan_status.isin(non_defaulters),'Target'] = 1 ## paid -- (and to whom to issue the loan)
In [21]:
df.head()
Out[21]:
id
member_id
loan_amnt
funded_amnt
funded_amnt_inv
term
int_rate
installment
grade
sub_grade
...
num_tl_op_past_12m
pct_tl_nvr_dlq
percent_bc_gt_75
pub_rec_bankruptcies
tax_liens
tot_hi_cred_lim
total_bal_ex_mort
total_bc_limit
total_il_high_credit_limit
Target
0
68615169
NaN
16000.0
16000.0
16000.0
60 months
8.49%
328.19
B
B1
...
3.0
87.5
50.0
0.0
0.0
307343.0
64371.0
23500.0
59526.0
2
1
68537564
NaN
8000.0
8000.0
8000.0
36 months
10.78%
261.08
B
B4
...
4.0
97.1
100.0
0.0
0.0
168819.0
148154.0
5800.0
136996.0
2
2
68536799
NaN
10000.0
10000.0
10000.0
36 months
10.78%
326.35
B
B4
...
2.0
100.0
33.3
0.0
0.0
190694.0
40681.0
13900.0
29512.0
2
3
68355089
NaN
24700.0
24700.0
24700.0
36 months
11.99%
820.28
C
C1
...
2.0
97.4
7.7
0.0
0.0
314017.0
39475.0
79300.0
24667.0
1
4
68506885
NaN
10000.0
10000.0
10000.0
60 months
11.99%
222.40
C
C1
...
1.0
100.0
50.0
0.0
0.0
61054.0
41166.0
13100.0
46854.0
2
5 rows × 116 columns
In [22]:
df.loc[df['Target'] == 0].describe()
Out[22]:
member_id
loan_amnt
funded_amnt
funded_amnt_inv
installment
annual_inc
dti
delinq_2yrs
fico_range_low
fico_range_high
...
num_tl_op_past_12m
pct_tl_nvr_dlq
percent_bc_gt_75
pub_rec_bankruptcies
tax_liens
tot_hi_cred_lim
total_bal_ex_mort
total_bc_limit
total_il_high_credit_limit
Target
count
0.0
42302.000000
42302.000000
42302.000000
42302.000000
4.230200e+04
42302.000000
42302.000000
42302.000000
42302.000000
...
42302.000000
42302.000000
41790.000000
42302.000000
42302.000000
4.230200e+04
42302.000000
42302.000000
42302.000000
42302.0
mean
NaN
15658.597702
15658.597702
15650.406461
461.664684
7.023571e+04
21.319409
0.376294
685.131436
689.131483
...
2.714080
93.982022
52.155906
0.157321
0.066545
1.429021e+05
51433.688738
17917.678242
43445.640135
0.0
std
NaN
8555.295964
8555.295964
8550.764991
250.791773
8.017445e+04
49.315800
0.982413
23.979229
23.979545
...
2.067767
8.685771
35.785969
0.417973
0.576476
1.443945e+05
44800.858670
18031.898258
41308.078088
0.0
min
NaN
1000.000000
1000.000000
900.000000
30.540000
0.000000e+00
0.000000
0.000000
660.000000
664.000000
...
0.000000
12.500000
0.000000
0.000000
0.000000
2.584000e+03
0.000000
0.000000
0.000000
0.0
25%
NaN
9500.000000
9500.000000
9500.000000
279.720000
4.300000e+04
14.470000
0.000000
665.000000
669.000000
...
1.000000
90.900000
22.200000
0.000000
0.000000
4.567700e+04
23353.250000
6500.000000
16575.500000
0.0
50%
NaN
14475.000000
14475.000000
14450.000000
403.035000
6.000000e+04
20.940000
0.000000
680.000000
684.000000
...
2.000000
97.300000
50.000000
0.000000
0.000000
8.670500e+04
40000.000000
12500.000000
33714.000000
0.0
75%
NaN
20400.000000
20400.000000
20400.000000
596.340000
8.400000e+04
27.740000
0.000000
695.000000
699.000000
...
4.000000
100.000000
83.300000
0.000000
0.000000
1.991648e+05
65288.250000
23000.000000
58323.750000
0.0
max
NaN
35000.000000
35000.000000
35000.000000
1445.460000
8.900060e+06
9999.000000
27.000000
845.000000
850.000000
...
22.000000
100.000000
100.000000
8.000000
85.000000
2.319146e+06
893784.000000
326300.000000
757807.000000
0.0
8 rows × 90 columns
In [23]:
df.loc[df['Target'] == 1].describe()
Out[23]:
member_id
loan_amnt
funded_amnt
funded_amnt_inv
installment
annual_inc
dti
delinq_2yrs
fico_range_low
fico_range_high
...
num_tl_op_past_12m
pct_tl_nvr_dlq
percent_bc_gt_75
pub_rec_bankruptcies
tax_liens
tot_hi_cred_lim
total_bal_ex_mort
total_bc_limit
total_il_high_credit_limit
Target
count
0.0
94968.000000
94968.000000
94968.000000
94968.000000
9.496800e+04
94968.000000
94968.000000
94968.000000
94968.000000
...
94968.000000
94968.000000
93915.000000
94968.000000
94968.000000
9.496800e+04
9.496800e+04
94968.000000
9.496800e+04
94968.0
mean
NaN
14678.662813
14678.662813
14673.312853
437.941518
7.944094e+04
17.906022
0.322066
696.294278
700.294457
...
2.412507
94.198152
42.624899
0.155926
0.050175
1.864274e+05
5.150747e+04
22252.687547
4.336753e+04
1.0
std
NaN
8680.966801
8680.966801
8677.636591
254.684005
6.976620e+04
8.553780
0.903981
32.486469
32.487292
...
1.949052
8.496625
35.786696
0.409449
0.342618
1.824717e+05
4.883338e+04
22438.665164
4.387513e+04
0.0
min
NaN
1000.000000
1000.000000
975.000000
30.540000
5.000000e+03
0.000000
0.000000
660.000000
664.000000
...
0.000000
16.700000
0.000000
0.000000
0.000000
2.500000e+03
0.000000e+00
0.000000
0.000000e+00
1.0
25%
NaN
8000.000000
8000.000000
8000.000000
251.360000
4.880000e+04
11.550000
0.000000
670.000000
674.000000
...
1.000000
91.300000
0.000000
0.000000
0.000000
5.399600e+04
2.213300e+04
7800.000000
1.579275e+04
1.0
50%
NaN
12525.000000
12525.000000
12525.000000
377.040000
6.800000e+04
17.300000
0.000000
690.000000
694.000000
...
2.000000
97.600000
40.000000
0.000000
0.000000
1.268690e+05
3.898000e+04
15300.000000
3.312550e+04
1.0
75%
NaN
20000.000000
20000.000000
20000.000000
583.500000
9.500000e+04
23.700000
0.000000
710.000000
714.000000
...
3.000000
100.000000
66.700000
0.000000
0.000000
2.693190e+05
6.492800e+04
28900.000000
5.819925e+04
1.0
max
NaN
35000.000000
35000.000000
35000.000000
1409.990000
7.600000e+06
380.530000
39.000000
845.000000
850.000000
...
25.000000
100.000000
100.000000
7.000000
15.000000
4.214831e+06
1.684313e+06
684000.000000
2.101913e+06
1.0
8 rows × 90 columns
In [30]:
df.loc[df['Target'] == 0][['funded_amnt','total_pymnt']].describe()
Out[30]:
funded_amnt
total_pymnt
count
42302.000000
42302.000000
mean
15658.597702
5534.268757
std
8555.295964
3859.052983
min
1000.000000
0.000000
25%
9500.000000
2781.667500
50%
14475.000000
4660.085000
75%
20400.000000
7337.890000
max
35000.000000
48307.240000
In [26]:
df.loc[df['Target'] == 1][['funded_amnt','total_pymnt']].describe()
Out[26]:
funded_amnt
total_pymnt
count
94968.000000
94968.000000
mean
14678.662813
16161.004698
std
8680.966801
9740.942874
min
1000.000000
1000.010000
25%
8000.000000
8569.170000
50%
12525.000000
13994.355000
75%
20000.000000
22034.527500
max
35000.000000
50312.207800
In [29]:
df.loc[df['Target'] == 0][['funded_amnt','total_pymnt']].head(10)
Out[29]:
funded_amnt
total_pymnt
14
27300.0
8727.52
21
23975.0
3928.86
28
23100.0
5305.12
50
16000.0
4382.17
101
24250.0
4124.42
113
7200.0
3043.61
136
16000.0
3109.00
139
5000.0
1320.07
143
8000.0
3449.23
179
25825.0
3028.09
In [31]:
C=0.0820849986238988
In [ ]:
Content source: kayzhou22/DSBiz_Project_LendingClub
Similar notebooks: