In [11]:
df = pd.read_csv('lc_dataframe(cleaning).csv')
df.tail()
Out[11]:
loan_amnt
int_rate
emp_title
emp_length
home_ownership
annual_inc
verification_status
issue_d
loan_status
desc
purpose
dti
delinq_2yrs
inq_last_6mths
pub_rec
revol_bal
revol_util
total_acc
initial_list_status
268131
31050
21.99
1
10
1
875000.0
1
12
1
0
3
9.66
1
0
0
25770
79.3
13
0
268132
10800
7.89
1
8
1
92400.0
1
12
1
0
2
19.62
1
0
0
9760
68.7
36
1
268133
9000
9.17
1
1
1
80000.0
1
12
1
0
3
3.97
1
0
0
6320
51.8
17
0
268134
14400
25.99
0
11
4
62000.0
1
12
1
0
3
16.88
0
1
1
5677
45.1
30
0
268135
8000
12.59
1
4
3
45000.0
1
12
1
0
3
26.21
0
0
0
9097
50.8
47
1
In [12]:
dfX = df.copy()
del dfX['loan_status']
dfy = pd.DataFrame(df['loan_status'], columns=['loan_status'])
In [14]:
from sklearn.preprocessing import scale, robust_scale
dfX_s = dfX.copy()
dfX_s['loan_amnt'] = scale(dfX_s['loan_amnt'].reshape(-1, 1))
dfX_s['annual_inc'] = scale(dfX_s['annual_inc'].reshape(-1, 1))
dfX_s['revol_bal'] = scale(dfX_s['revol_bal'].reshape(-1, 1))
dfX_s.tail()
/home/dockeruser/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:3: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
app.launch_new_instance()
/home/dockeruser/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:4: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
/home/dockeruser/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:5: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
Out[14]:
loan_amnt
int_rate
emp_title
emp_length
home_ownership
annual_inc
verification_status
issue_d
desc
purpose
dti
delinq_2yrs
inq_last_6mths
pub_rec
revol_bal
revol_util
total_acc
initial_list_status
268131
2.127000
21.99
1
10
1
13.782613
1
12
0
3
9.66
1
0
0
0.561689
79.3
13
0
268132
-0.352019
7.89
1
8
1
0.343283
1
12
0
2
19.62
1
0
0
-0.287872
68.7
36
1
268133
-0.572376
9.17
1
1
1
0.130342
1
12
0
3
3.97
1
0
0
-0.470413
51.8
17
0
268134
0.088696
25.99
0
11
4
-0.178766
1
12
0
3
16.88
0
1
1
-0.504533
45.1
30
0
268135
-0.694797
12.59
1
4
3
-0.470701
1
12
0
3
26.21
0
0
0
-0.323053
50.8
47
1
In [15]:
from sklearn.decomposition import PCA
pca = PCA(n_components=None, whiten=False).fit(dfX_s)
var = pca.explained_variance_
plt.bar(np.arange(1, len(var)+1), var/np.sum(var), align='center')
plt.step(np.arange(1,len(var)+1), np.cumsum(var)/np.sum(var), where="mid")
plt.show()
In [16]:
from sklearn.metrics import classification_report
In [17]:
from sklearn.cross_validation import train_test_split
In [18]:
dfX_s.tail()
Out[18]:
loan_amnt
int_rate
emp_title
emp_length
home_ownership
annual_inc
verification_status
issue_d
desc
purpose
dti
delinq_2yrs
inq_last_6mths
pub_rec
revol_bal
revol_util
total_acc
initial_list_status
268131
2.127000
21.99
1
10
1
13.782613
1
12
0
3
9.66
1
0
0
0.561689
79.3
13
0
268132
-0.352019
7.89
1
8
1
0.343283
1
12
0
2
19.62
1
0
0
-0.287872
68.7
36
1
268133
-0.572376
9.17
1
1
1
0.130342
1
12
0
3
3.97
1
0
0
-0.470413
51.8
17
0
268134
0.088696
25.99
0
11
4
-0.178766
1
12
0
3
16.88
0
1
1
-0.504533
45.1
30
0
268135
-0.694797
12.59
1
4
3
-0.470701
1
12
0
3
26.21
0
0
0
-0.323053
50.8
47
1
In [19]:
dfX_test = dfX_s.ix[:, 0:3]
dfX_test.tail()
Out[19]:
loan_amnt
int_rate
emp_title
268131
2.127000
21.99
1
268132
-0.352019
7.89
1
268133
-0.572376
9.17
1
268134
0.088696
25.99
0
268135
-0.694797
12.59
1
In [100]:
dfX_s.tail()
Out[100]:
loan_amnt
int_rate
emp_title
emp_length
home_ownership
annual_inc
verification_status
issue_d
desc
purpose
dti
delinq_2yrs
inq_last_6mths
pub_rec
revol_bal
revol_util
total_acc
initial_list_status
268131
2.127000
21.99
1
10
1
13.782613
1
12
0
3
9.66
1
0
0
0.561689
79.3
13
0
268132
-0.352019
7.89
1
8
1
0.343283
1
12
0
2
19.62
1
0
0
-0.287872
68.7
36
1
268133
-0.572376
9.17
1
1
1
0.130342
1
12
0
3
3.97
1
0
0
-0.470413
51.8
17
0
268134
0.088696
25.99
0
11
4
-0.178766
1
12
0
3
16.88
0
1
1
-0.504533
45.1
30
0
268135
-0.694797
12.59
1
4
3
-0.470701
1
12
0
3
26.21
0
0
0
-0.323053
50.8
47
1
In [23]:
data = dfX_s.copy()
data['purpose'].value_counts()
Out[23]:
3 158081
2 52785
5 15775
10 15073
7 6537
12 5005
1 3741
8 3011
9 2152
14 1963
6 1735
13 1677
4 325
11 276
Name: purpose, dtype: int64
In [25]:
purpose = data['purpose']
purpose.max()
Out[25]:
14
In [26]:
purpose = purpose.replace([1,4,5,6,7,8,9,11,12,13,14], 0)
In [30]:
data.tail()
Out[30]:
loan_amnt
int_rate
emp_title
emp_length
home_ownership
annual_inc
verification_status
issue_d
desc
purpose
dti
delinq_2yrs
inq_last_6mths
pub_rec
revol_bal
revol_util
total_acc
initial_list_status
268131
2.127000
21.99
1
10
1
13.782613
1
12
0
3
9.66
1
0
0
0.561689
79.3
13
0
268132
-0.352019
7.89
1
8
1
0.343283
1
12
0
2
19.62
1
0
0
-0.287872
68.7
36
1
268133
-0.572376
9.17
1
1
1
0.130342
1
12
0
3
3.97
1
0
0
-0.470413
51.8
17
0
268134
0.088696
25.99
0
11
4
-0.178766
1
12
0
3
16.88
0
1
1
-0.504533
45.1
30
0
268135
-0.694797
12.59
1
4
3
-0.470701
1
12
0
3
26.21
0
0
0
-0.323053
50.8
47
1
In [31]:
ohe= OneHotEncoder()
purpose = ohe.fit_transform(data['purpose'].reshape(-1,1)).toarray()
/home/dockeruser/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:2: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
from ipykernel import kernelapp as app
In [33]:
ohe.active_features_
Out[33]:
array([ 0, 2, 3, 10])
In [34]:
purpose
Out[34]:
array([[ 0., 0., 1., 0.],
[ 1., 0., 0., 0.],
[ 0., 0., 1., 0.],
...,
[ 0., 0., 1., 0.],
[ 0., 0., 1., 0.],
[ 0., 0., 1., 0.]])
In [35]:
purpose = pd.DataFrame(purpose, columns = ['purpose_other', 'purpose_2', 'purpose_3', 'purpose_10'])
purpose.tail()
Out[35]:
purpose_other
purpose_2
purpose_3
purpose_10
268131
0.0
0.0
1.0
0.0
268132
0.0
1.0
0.0
0.0
268133
0.0
0.0
1.0
0.0
268134
0.0
0.0
1.0
0.0
268135
0.0
0.0
1.0
0.0
In [36]:
data.columns
Out[36]:
Index([u'loan_amnt', u'int_rate', u'emp_title', u'emp_length',
u'home_ownership', u'annual_inc', u'verification_status', u'issue_d',
u'desc', u'purpose', u'dti', u'delinq_2yrs', u'inq_last_6mths',
u'pub_rec', u'revol_bal', u'revol_util', u'total_acc',
u'initial_list_status'],
dtype='object')
In [37]:
data_test = pd.concat([data['loan_amnt'], data['int_rate'], data['emp_title'], data['annual_inc'], data['desc'], purpose, data['dti'], data['delinq_2yrs'], data['verification_status']], axis=1)
In [171]:
data_test
Out[171]:
loan_amnt
int_rate
emp_title
annual_inc
desc
purpose_other
purpose_2
purpose_3
purpose_10
dti
delinq_2yrs
verification_status
0
-0.756007
13.75
1
-0.865673
91
0.0
0.0
1.0
0.0
14.29
1
0
1
-1.245690
10.28
1
-0.900018
239
1.0
0.0
0.0
0.0
1.50
0
0
2
-0.970243
7.43
1
0.903112
41
0.0
0.0
1.0
0.0
0.27
0
0
3
-1.062059
7.43
1
-0.556565
845
1.0
0.0
0.0
0.0
2.55
0
0
4
-1.527257
11.54
1
-0.900018
649
0.0
1.0
0.0
0.0
2.04
0
0
5
-0.174509
10.59
1
-0.213111
751
1.0
0.0
0.0
0.0
17.12
1
0
6
-1.343626
15.96
1
-0.347058
27
0.0
0.0
1.0
0.0
12.57
0
0
7
-1.062059
9.01
1
3.049696
22
0.0
0.0
0.0
1.0
10.00
1
0
8
-1.245690
9.96
1
-1.011640
536
1.0
0.0
0.0
0.0
16.44
0
0
9
-1.062059
7.43
1
1.332429
103
1.0
0.0
0.0
0.0
0.00
0
0
10
-1.062059
7.43
1
0.387932
214
1.0
0.0
0.0
0.0
3.83
0
0
11
1.386355
10.59
1
-0.419183
598
1.0
0.0
0.0
0.0
4.05
0
0
12
-0.449955
10.28
1
-0.773352
740
1.0
0.0
0.0
0.0
10.70
0
0
13
-1.062059
9.01
1
0.010134
28
0.0
1.0
0.0
0.0
9.96
0
0
14
-1.049817
8.38
1
-0.693946
84
1.0
0.0
0.0
0.0
14.78
0
0
15
-1.013090
7.75
0
-1.106090
351
1.0
0.0
0.0
0.0
3.00
0
0
16
-1.025332
8.38
1
0.027306
88
0.0
1.0
0.0
0.0
14.37
0
0
17
-1.062059
11.22
1
-0.728291
260
0.0
1.0
0.0
0.0
18.64
0
0
18
-1.306900
10.28
1
2.191062
485
1.0
0.0
0.0
0.0
0.00
1
0
19
-1.306900
8.70
0
-0.814155
527
1.0
0.0
0.0
0.0
14.54
0
0
20
-1.062059
7.43
1
0.817249
68
0.0
0.0
1.0
0.0
2.29
0
0
21
-1.062059
8.07
0
1.847609
33
1.0
0.0
0.0
0.0
5.55
0
0
22
-1.062059
9.33
1
1.847609
41
1.0
0.0
0.0
0.0
11.93
0
0
23
-1.062059
9.96
1
-0.419183
104
0.0
0.0
1.0
0.0
8.03
0
0
24
-1.062059
11.22
1
0.130342
0
0.0
1.0
0.0
0.0
1.21
0
0
25
-1.062059
7.43
1
0.216206
0
0.0
1.0
0.0
0.0
0.31
0
0
26
-1.062059
8.70
1
0.044479
72
0.0
1.0
0.0
0.0
15.55
0
0
27
-1.368110
8.07
1
0.645522
148
1.0
0.0
0.0
0.0
11.33
0
0
28
-0.633586
10.28
1
-0.934363
406
0.0
1.0
0.0
0.0
6.40
1
0
29
-1.062059
8.07
0
0.473796
0
0.0
0.0
1.0
0.0
2.30
0
0
...
...
...
...
...
...
...
...
...
...
...
...
...
268106
0.284569
16.99
1
-0.470701
0
1.0
0.0
0.0
0.0
17.17
1
1
268107
0.174390
28.99
1
-0.522219
0
0.0
0.0
1.0
0.0
27.49
0
1
268108
-1.062059
11.49
1
-0.384838
0
0.0
1.0
0.0
0.0
8.52
0
1
268109
-1.306900
11.49
1
-0.693946
0
0.0
0.0
1.0
0.0
9.79
0
1
268110
1.508776
19.99
1
0.336414
0
0.0
1.0
0.0
0.0
19.59
0
1
268111
1.753617
18.99
1
2.294098
0
0.0
0.0
1.0
0.0
11.88
0
1
268112
-1.062059
10.99
1
-0.470701
0
1.0
0.0
0.0
0.0
21.95
0
1
268113
0.162148
9.99
1
-0.213111
0
0.0
0.0
1.0
0.0
9.86
0
0
268114
-1.123269
9.17
1
-0.814155
0
0.0
0.0
1.0
0.0
31.20
0
0
268115
-1.429321
12.05
0
-0.137551
0
0.0
0.0
1.0
0.0
22.29
0
1
268116
0.039727
10.99
1
1.160702
0
0.0
1.0
0.0
0.0
7.32
0
1
268117
-1.107966
15.41
0
-0.728291
0
0.0
0.0
1.0
0.0
16.28
0
1
268118
0.529410
15.41
1
-0.110075
0
0.0
0.0
1.0
0.0
21.92
0
1
268119
2.610562
16.99
1
0.319242
0
0.0
0.0
1.0
0.0
26.72
0
1
268120
0.162148
12.05
1
0.130342
0
0.0
0.0
0.0
1.0
14.75
0
1
268121
2.610562
13.67
1
0.319242
0
1.0
0.0
0.0
0.0
3.81
0
1
268122
1.998458
7.26
1
1.847609
0
1.0
0.0
0.0
0.0
12.39
0
1
268123
-0.272445
16.55
1
-0.213111
0
0.0
0.0
0.0
1.0
12.60
0
1
268124
-1.062059
13.67
1
-0.106641
0
0.0
0.0
0.0
1.0
19.56
0
1
268125
2.610562
9.17
1
1.783211
0
0.0
1.0
0.0
0.0
22.68
0
1
268126
-0.817217
12.59
1
-0.630407
0
0.0
0.0
1.0
0.0
25.51
0
1
268127
0.162148
17.57
1
-0.419183
0
0.0
0.0
1.0
0.0
27.12
0
0
268128
1.925006
17.86
1
0.559659
0
0.0
0.0
1.0
0.0
30.19
0
1
268129
0.162148
18.49
1
0.302069
0
0.0
0.0
1.0
0.0
25.13
1
0
268130
0.774252
6.24
1
0.903112
0
0.0
1.0
0.0
0.0
14.20
1
1
268131
2.127000
21.99
1
13.782613
0
0.0
0.0
1.0
0.0
9.66
1
1
268132
-0.352019
7.89
1
0.343283
0
0.0
1.0
0.0
0.0
19.62
1
1
268133
-0.572376
9.17
1
0.130342
0
0.0
0.0
1.0
0.0
3.97
1
1
268134
0.088696
25.99
0
-0.178766
0
0.0
0.0
1.0
0.0
16.88
0
1
268135
-0.694797
12.59
1
-0.470701
0
0.0
0.0
1.0
0.0
26.21
0
1
268136 rows × 12 columns
In [70]:
from sklearn.cross_validation import train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
In [82]:
def stratified_cross_val(data_test, dfy, n_iter):
cv = StratifiedKFold(dfy['loan_status'], n_folds=n_iter, random_state=3)
for train_index, test_index in cv:
X_train = data_test.ix[train_index, :]
y_train = dfy.ix[train_index,:]
X_test = data_test.ix[test_index, :]
y_test = dfy.ix[test_index, :]
model = LogisticRegression().fit(X_train, y_train)
print(classification_report(y_test, model.predict(X_test)))
print('='*80)
In [51]:
def sm_logit(data_test, dfy):
model = sm.Logit(dfy, sm.add_constant(data_test)).fit()
print(model.summary())
In [52]:
sm_logit(data_test2, dfy)
Optimization terminated successfully.
Current function value: 0.491694
Iterations 6
Logit Regression Results
==============================================================================
Dep. Variable: loan_status No. Observations: 268136
Model: Logit Df Residuals: 268127
Method: MLE Df Model: 8
Date: Tue, 14 Mar 2017 Pseudo R-squ.: 0.07845
Time: 13:12:33 Log-Likelihood: -1.3184e+05
converged: True LL-Null: -1.4306e+05
LLR p-value: 0.000
=======================================================================================
coef std err z P>|z| [0.025 0.975]
---------------------------------------------------------------------------------------
const 3.1833 0.028 112.009 0.000 3.128 3.239
loan_amnt -0.1295 0.006 -22.154 0.000 -0.141 -0.118
int_rate -0.1251 0.001 -105.980 0.000 -0.127 -0.123
emp_title 0.4080 0.020 20.667 0.000 0.369 0.447
annual_inc 0.3000 0.009 32.345 0.000 0.282 0.318
desc 0.0004 2.79e-05 13.942 0.000 0.000 0.000
dti -0.0249 0.001 -38.919 0.000 -0.026 -0.024
delinq_2yrs -0.1040 0.013 -8.198 0.000 -0.129 -0.079
verification_status -0.0900 0.012 -7.667 0.000 -0.113 -0.067
=======================================================================================
In [54]:
dfX_s.tail()
Out[54]:
loan_amnt
int_rate
emp_title
emp_length
home_ownership
annual_inc
verification_status
issue_d
desc
purpose
dti
delinq_2yrs
inq_last_6mths
pub_rec
revol_bal
revol_util
total_acc
initial_list_status
268131
2.127000
21.99
1
10
1
13.782613
1
12
0
3
9.66
1
0
0
0.561689
79.3
13
0
268132
-0.352019
7.89
1
8
1
0.343283
1
12
0
2
19.62
1
0
0
-0.287872
68.7
36
1
268133
-0.572376
9.17
1
1
1
0.130342
1
12
0
3
3.97
1
0
0
-0.470413
51.8
17
0
268134
0.088696
25.99
0
11
4
-0.178766
1
12
0
3
16.88
0
1
1
-0.504533
45.1
30
0
268135
-0.694797
12.59
1
4
3
-0.470701
1
12
0
3
26.21
0
0
0
-0.323053
50.8
47
1
In [55]:
dfX_s['pub_rec'].value_counts()
Out[55]:
0 234589
1 33547
Name: pub_rec, dtype: int64
In [56]:
data_test.tail()
Out[56]:
loan_amnt
int_rate
emp_title
annual_inc
desc
purpose_other
purpose_2
purpose_3
purpose_10
dti
delinq_2yrs
verification_status
268131
2.127000
21.99
1
13.782613
0
0.0
0.0
1.0
0.0
9.66
1
1
268132
-0.352019
7.89
1
0.343283
0
0.0
1.0
0.0
0.0
19.62
1
1
268133
-0.572376
9.17
1
0.130342
0
0.0
0.0
1.0
0.0
3.97
1
1
268134
0.088696
25.99
0
-0.178766
0
0.0
0.0
1.0
0.0
16.88
0
1
268135
-0.694797
12.59
1
-0.470701
0
0.0
0.0
1.0
0.0
26.21
0
1
In [57]:
del data_test['purpose_other']
del data_test['purpose_2']
del data_test['purpose_3']
In [58]:
data_test.tail()
Out[58]:
loan_amnt
int_rate
emp_title
annual_inc
desc
purpose_10
dti
delinq_2yrs
verification_status
268131
2.127000
21.99
1
13.782613
0
0.0
9.66
1
1
268132
-0.352019
7.89
1
0.343283
0
0.0
19.62
1
1
268133
-0.572376
9.17
1
0.130342
0
0.0
3.97
1
1
268134
0.088696
25.99
0
-0.178766
0
0.0
16.88
0
1
268135
-0.694797
12.59
1
-0.470701
0
0.0
26.21
0
1
In [59]:
sm_logit(data_test, dfy)
Optimization terminated successfully.
Current function value: 0.491682
Iterations 6
Logit Regression Results
==============================================================================
Dep. Variable: loan_status No. Observations: 268136
Model: Logit Df Residuals: 268126
Method: MLE Df Model: 9
Date: Tue, 14 Mar 2017 Pseudo R-squ.: 0.07847
Time: 13:17:15 Log-Likelihood: -1.3184e+05
converged: True LL-Null: -1.4306e+05
LLR p-value: 0.000
=======================================================================================
coef std err z P>|z| [0.025 0.975]
---------------------------------------------------------------------------------------
const 3.1829 0.028 112.002 0.000 3.127 3.239
loan_amnt -0.1317 0.006 -22.278 0.000 -0.143 -0.120
int_rate -0.1247 0.001 -104.923 0.000 -0.127 -0.122
emp_title 0.4076 0.020 20.644 0.000 0.369 0.446
annual_inc 0.3010 0.009 32.418 0.000 0.283 0.319
desc 0.0004 2.79e-05 13.956 0.000 0.000 0.000
purpose_10 -0.0520 0.021 -2.492 0.013 -0.093 -0.011
dti -0.0249 0.001 -38.992 0.000 -0.026 -0.024
delinq_2yrs -0.1042 0.013 -8.214 0.000 -0.129 -0.079
verification_status -0.0901 0.012 -7.675 0.000 -0.113 -0.067
=======================================================================================
In [83]:
stratified_cross_val(data_test, dfy, 10)
precision recall f1-score support
0 1.00 0.00 0.01 6042
1 0.78 1.00 0.87 20773
avg / total 0.83 0.78 0.68 26815
================================================================================
precision recall f1-score support
0 0.87 0.03 0.06 6042
1 0.78 1.00 0.88 20772
avg / total 0.80 0.78 0.69 26814
================================================================================
precision recall f1-score support
0 0.63 0.06 0.11 6042
1 0.78 0.99 0.87 20772
avg / total 0.75 0.78 0.70 26814
================================================================================
precision recall f1-score support
0 0.62 0.09 0.16 6042
1 0.79 0.98 0.88 20772
avg / total 0.75 0.78 0.71 26814
================================================================================
precision recall f1-score support
0 0.54 0.11 0.19 6042
1 0.79 0.97 0.87 20772
avg / total 0.73 0.78 0.72 26814
================================================================================
precision recall f1-score support
0 0.42 0.10 0.16 6041
1 0.79 0.96 0.86 20772
avg / total 0.70 0.77 0.71 26813
================================================================================
precision recall f1-score support
0 0.44 0.09 0.14 6041
1 0.78 0.97 0.87 20772
avg / total 0.71 0.77 0.70 26813
================================================================================
precision recall f1-score support
0 0.56 0.11 0.18 6041
1 0.79 0.97 0.87 20772
avg / total 0.74 0.78 0.72 26813
================================================================================
precision recall f1-score support
0 0.50 0.13 0.20 6041
1 0.79 0.96 0.87 20772
avg / total 0.73 0.77 0.72 26813
================================================================================
precision recall f1-score support
0 0.43 0.12 0.19 6041
1 0.79 0.95 0.86 20772
avg / total 0.71 0.77 0.71 26813
================================================================================
In [ ]:
Content source: shinys825/lc_project
Similar notebooks: