In [11]:
    
# Load the loan dataset from CSV and preview its last rows.
df = pd.read_csv('lc_dataframe(cleaning).csv')
df.tail()
    
    Out[11]:
  
    
       
      loan_amnt 
      int_rate 
      emp_title 
      emp_length 
      home_ownership 
      annual_inc 
      verification_status 
      issue_d 
      loan_status 
      desc 
      purpose 
      dti 
      delinq_2yrs 
      inq_last_6mths 
      pub_rec 
      revol_bal 
      revol_util 
      total_acc 
      initial_list_status 
     
  
  
    
      268131 
      31050 
      21.99 
      1 
      10 
      1 
      875000.0 
      1 
      12 
      1 
      0 
      3 
      9.66 
      1 
      0 
      0 
      25770 
      79.3 
      13 
      0 
     
    
      268132 
      10800 
      7.89 
      1 
      8 
      1 
      92400.0 
      1 
      12 
      1 
      0 
      2 
      19.62 
      1 
      0 
      0 
      9760 
      68.7 
      36 
      1 
     
    
      268133 
      9000 
      9.17 
      1 
      1 
      1 
      80000.0 
      1 
      12 
      1 
      0 
      3 
      3.97 
      1 
      0 
      0 
      6320 
      51.8 
      17 
      0 
     
    
      268134 
      14400 
      25.99 
      0 
      11 
      4 
      62000.0 
      1 
      12 
      1 
      0 
      3 
      16.88 
      0 
      1 
      1 
      5677 
      45.1 
      30 
      0 
     
    
      268135 
      8000 
      12.59 
      1 
      4 
      3 
      45000.0 
      1 
      12 
      1 
      0 
      3 
      26.21 
      0 
      0 
      0 
      9097 
      50.8 
      47 
      1 
     
  
In [12]:
    
# Split the frame into the feature matrix (everything except the label)
# and a one-column target frame holding loan_status.
dfX = df.drop('loan_status', axis=1)
dfy = df[['loan_status']].copy()
    
In [14]:
    
from sklearn.preprocessing import scale, robust_scale

# Standardise the three large-magnitude monetary columns on a copy of the
# feature frame; every other column is left as loaded.
dfX_s = dfX.copy()
for col in ('loan_amnt', 'annual_inc', 'revol_bal'):
    # Series.reshape is deprecated, so reshape the underlying ndarray into
    # the (n_samples, 1) layout scale() expects, then flatten the result
    # back to one dimension for the column assignment.
    dfX_s[col] = scale(dfX_s[col].values.reshape(-1, 1)).ravel()
dfX_s.tail()
    
    
/home/dockeruser/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:3: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
  app.launch_new_instance()
/home/dockeruser/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:4: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
/home/dockeruser/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:5: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
    Out[14]:
  
    
       
      loan_amnt 
      int_rate 
      emp_title 
      emp_length 
      home_ownership 
      annual_inc 
      verification_status 
      issue_d 
      desc 
      purpose 
      dti 
      delinq_2yrs 
      inq_last_6mths 
      pub_rec 
      revol_bal 
      revol_util 
      total_acc 
      initial_list_status 
     
  
  
    
      268131 
      2.127000 
      21.99 
      1 
      10 
      1 
      13.782613 
      1 
      12 
      0 
      3 
      9.66 
      1 
      0 
      0 
      0.561689 
      79.3 
      13 
      0 
     
    
      268132 
      -0.352019 
      7.89 
      1 
      8 
      1 
      0.343283 
      1 
      12 
      0 
      2 
      19.62 
      1 
      0 
      0 
      -0.287872 
      68.7 
      36 
      1 
     
    
      268133 
      -0.572376 
      9.17 
      1 
      1 
      1 
      0.130342 
      1 
      12 
      0 
      3 
      3.97 
      1 
      0 
      0 
      -0.470413 
      51.8 
      17 
      0 
     
    
      268134 
      0.088696 
      25.99 
      0 
      11 
      4 
      -0.178766 
      1 
      12 
      0 
      3 
      16.88 
      0 
      1 
      1 
      -0.504533 
      45.1 
      30 
      0 
     
    
      268135 
      -0.694797 
      12.59 
      1 
      4 
      3 
      -0.470701 
      1 
      12 
      0 
      3 
      26.21 
      0 
      0 
      0 
      -0.323053 
      50.8 
      47 
      1 
     
  
In [15]:
    
from sklearn.decomposition import PCA

# Fit a PCA that keeps every component, then plot each component's share of
# the total variance (bars) together with the cumulative share (step line).
pca = PCA(n_components=None, whiten=False)
pca.fit(dfX_s)
var = pca.explained_variance_
total_var = np.sum(var)
component_no = np.arange(1, len(var) + 1)
plt.bar(component_no, var / total_var, align='center')
plt.step(component_no, np.cumsum(var) / total_var, where="mid")
plt.show()
    
    
 
In [16]:
    
from sklearn.metrics import classification_report
    
In [17]:
    
from sklearn.cross_validation import train_test_split
    
In [18]:
    
# Preview the tail of the scaled feature frame.
dfX_s.tail()
    
    Out[18]:
  
    
       
      loan_amnt 
      int_rate 
      emp_title 
      emp_length 
      home_ownership 
      annual_inc 
      verification_status 
      issue_d 
      desc 
      purpose 
      dti 
      delinq_2yrs 
      inq_last_6mths 
      pub_rec 
      revol_bal 
      revol_util 
      total_acc 
      initial_list_status 
     
  
  
    
      268131 
      2.127000 
      21.99 
      1 
      10 
      1 
      13.782613 
      1 
      12 
      0 
      3 
      9.66 
      1 
      0 
      0 
      0.561689 
      79.3 
      13 
      0 
     
    
      268132 
      -0.352019 
      7.89 
      1 
      8 
      1 
      0.343283 
      1 
      12 
      0 
      2 
      19.62 
      1 
      0 
      0 
      -0.287872 
      68.7 
      36 
      1 
     
    
      268133 
      -0.572376 
      9.17 
      1 
      1 
      1 
      0.130342 
      1 
      12 
      0 
      3 
      3.97 
      1 
      0 
      0 
      -0.470413 
      51.8 
      17 
      0 
     
    
      268134 
      0.088696 
      25.99 
      0 
      11 
      4 
      -0.178766 
      1 
      12 
      0 
      3 
      16.88 
      0 
      1 
      1 
      -0.504533 
      45.1 
      30 
      0 
     
    
      268135 
      -0.694797 
      12.59 
      1 
      4 
      3 
      -0.470701 
      1 
      12 
      0 
      3 
      26.21 
      0 
      0 
      0 
      -0.323053 
      50.8 
      47 
      1 
     
  
In [19]:
    
# Keep only the first three feature columns, selected by position.
# .iloc replaces the deprecated .ix indexer for purely positional slicing.
dfX_test = dfX_s.iloc[:, 0:3]
dfX_test.tail()
    
    Out[19]:
  
    
       
      loan_amnt 
      int_rate 
      emp_title 
     
  
  
    
      268131 
      2.127000 
      21.99 
      1 
     
    
      268132 
      -0.352019 
      7.89 
      1 
     
    
      268133 
      -0.572376 
      9.17 
      1 
     
    
      268134 
      0.088696 
      25.99 
      0 
     
    
      268135 
      -0.694797 
      12.59 
      1 
     
  
In [100]:
    
# Preview the tail of the scaled feature frame again.
dfX_s.tail()
    
    Out[100]:
  
    
       
      loan_amnt 
      int_rate 
      emp_title 
      emp_length 
      home_ownership 
      annual_inc 
      verification_status 
      issue_d 
      desc 
      purpose 
      dti 
      delinq_2yrs 
      inq_last_6mths 
      pub_rec 
      revol_bal 
      revol_util 
      total_acc 
      initial_list_status 
     
  
  
    
      268131 
      2.127000 
      21.99 
      1 
      10 
      1 
      13.782613 
      1 
      12 
      0 
      3 
      9.66 
      1 
      0 
      0 
      0.561689 
      79.3 
      13 
      0 
     
    
      268132 
      -0.352019 
      7.89 
      1 
      8 
      1 
      0.343283 
      1 
      12 
      0 
      2 
      19.62 
      1 
      0 
      0 
      -0.287872 
      68.7 
      36 
      1 
     
    
      268133 
      -0.572376 
      9.17 
      1 
      1 
      1 
      0.130342 
      1 
      12 
      0 
      3 
      3.97 
      1 
      0 
      0 
      -0.470413 
      51.8 
      17 
      0 
     
    
      268134 
      0.088696 
      25.99 
      0 
      11 
      4 
      -0.178766 
      1 
      12 
      0 
      3 
      16.88 
      0 
      1 
      1 
      -0.504533 
      45.1 
      30 
      0 
     
    
      268135 
      -0.694797 
      12.59 
      1 
      4 
      3 
      -0.470701 
      1 
      12 
      0 
      3 
      26.21 
      0 
      0 
      0 
      -0.323053 
      50.8 
      47 
      1 
     
  
In [23]:
    
# Work on a copy so the scaled frame dfX_s is left untouched, then
# tabulate how often each integer purpose code occurs.
data = dfX_s.copy()
data['purpose'].value_counts()
    
    Out[23]:
3     158081
2      52785
5      15775
10     15073
7       6537
12      5005
1       3741
8       3011
9       2152
14      1963
6       1735
13      1677
4        325
11       276
Name: purpose, dtype: int64
In [25]:
    
# Grab the purpose column and check its largest code.
purpose = data['purpose']
purpose.max()
    
    Out[25]:
14
In [26]:
    
# Fold every purpose code other than 2, 3 and 10 into a single "other"
# bucket (0). Series.replace returns a new Series, so the result must be
# written back to `data`; otherwise the later one-hot encoding of
# data['purpose'] would still see all of the original codes.
# NOTE(review): code 5 is more frequent than code 10 in the value counts
# but is still folded into 0 — confirm this is intended.
purpose = purpose.replace([1, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14], 0)
data['purpose'] = purpose
    
In [30]:
    
# Preview the tail of the working frame.
data.tail()
    
    Out[30]:
  
    
       
      loan_amnt 
      int_rate 
      emp_title 
      emp_length 
      home_ownership 
      annual_inc 
      verification_status 
      issue_d 
      desc 
      purpose 
      dti 
      delinq_2yrs 
      inq_last_6mths 
      pub_rec 
      revol_bal 
      revol_util 
      total_acc 
      initial_list_status 
     
  
  
    
      268131 
      2.127000 
      21.99 
      1 
      10 
      1 
      13.782613 
      1 
      12 
      0 
      3 
      9.66 
      1 
      0 
      0 
      0.561689 
      79.3 
      13 
      0 
     
    
      268132 
      -0.352019 
      7.89 
      1 
      8 
      1 
      0.343283 
      1 
      12 
      0 
      2 
      19.62 
      1 
      0 
      0 
      -0.287872 
      68.7 
      36 
      1 
     
    
      268133 
      -0.572376 
      9.17 
      1 
      1 
      1 
      0.130342 
      1 
      12 
      0 
      3 
      3.97 
      1 
      0 
      0 
      -0.470413 
      51.8 
      17 
      0 
     
    
      268134 
      0.088696 
      25.99 
      0 
      11 
      4 
      -0.178766 
      1 
      12 
      0 
      3 
      16.88 
      0 
      1 
      1 
      -0.504533 
      45.1 
      30 
      0 
     
    
      268135 
      -0.694797 
      12.59 
      1 
      4 
      3 
      -0.470701 
      1 
      12 
      0 
      3 
      26.21 
      0 
      0 
      0 
      -0.323053 
      50.8 
      47 
      1 
     
  
In [31]:
    
# One-hot encode the integer purpose codes. Series.reshape is deprecated,
# so reshape the underlying ndarray into a single-feature column instead;
# .toarray() turns the encoder's sparse output into a dense ndarray.
ohe = OneHotEncoder()
purpose = ohe.fit_transform(data['purpose'].values.reshape(-1, 1)).toarray()
    
    
/home/dockeruser/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:2: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
  from ipykernel import kernelapp as app
In [33]:
    
# Inspect the encoder's active_features_ (presumably the purpose codes seen
# during fit — confirm against the sklearn version in use).
ohe.active_features_
    
    Out[33]:
array([ 0,  2,  3, 10])
In [34]:
    
# Display the dense one-hot matrix produced by the encoder.
purpose
    
    Out[34]:
array([[ 0.,  0.,  1.,  0.],
       [ 1.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.],
       ..., 
       [ 0.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  0.]])
In [35]:
    
# Wrap the one-hot matrix in a labelled frame. The names are listed in the
# same order as the matrix columns (codes 0, 2, 3, 10), with code 0 being
# the collapsed "other" bucket.
purpose_cols = ['purpose_other', 'purpose_2', 'purpose_3', 'purpose_10']
purpose = pd.DataFrame(purpose, columns=purpose_cols)
purpose.tail()
    
    Out[35]:
  
    
       
      purpose_other 
      purpose_2 
      purpose_3 
      purpose_10 
     
  
  
    
      268131 
      0.0 
      0.0 
      1.0 
      0.0 
     
    
      268132 
      0.0 
      1.0 
      0.0 
      0.0 
     
    
      268133 
      0.0 
      0.0 
      1.0 
      0.0 
     
    
      268134 
      0.0 
      0.0 
      1.0 
      0.0 
     
    
      268135 
      0.0 
      0.0 
      1.0 
      0.0 
     
  
In [36]:
    
# List the columns available in the working frame.
data.columns
    
    Out[36]:
Index([u'loan_amnt', u'int_rate', u'emp_title', u'emp_length',
       u'home_ownership', u'annual_inc', u'verification_status', u'issue_d',
       u'desc', u'purpose', u'dti', u'delinq_2yrs', u'inq_last_6mths',
       u'pub_rec', u'revol_bal', u'revol_util', u'total_acc',
       u'initial_list_status'],
      dtype='object')
In [37]:
    
# Assemble the modelling frame column-wise: selected numeric/encoded
# features from `data`, with the purpose dummy columns inserted between
# `desc` and `dti`.
data_test = pd.concat(
    [
        data[['loan_amnt', 'int_rate', 'emp_title', 'annual_inc', 'desc']],
        purpose,
        data[['dti', 'delinq_2yrs', 'verification_status']],
    ],
    axis=1,
)
    
In [171]:
    
# Display the assembled modelling frame.
# NOTE(review): this echoes the whole frame rather than a head()/tail() preview.
data_test
    
    Out[171]:
  
    
       
      loan_amnt 
      int_rate 
      emp_title 
      annual_inc 
      desc 
      purpose_other 
      purpose_2 
      purpose_3 
      purpose_10 
      dti 
      delinq_2yrs 
      verification_status 
     
  
  
    
      0 
      -0.756007 
      13.75 
      1 
      -0.865673 
      91 
      0.0 
      0.0 
      1.0 
      0.0 
      14.29 
      1 
      0 
     
    
      1 
      -1.245690 
      10.28 
      1 
      -0.900018 
      239 
      1.0 
      0.0 
      0.0 
      0.0 
      1.50 
      0 
      0 
     
    
      2 
      -0.970243 
      7.43 
      1 
      0.903112 
      41 
      0.0 
      0.0 
      1.0 
      0.0 
      0.27 
      0 
      0 
     
    
      3 
      -1.062059 
      7.43 
      1 
      -0.556565 
      845 
      1.0 
      0.0 
      0.0 
      0.0 
      2.55 
      0 
      0 
     
    
      4 
      -1.527257 
      11.54 
      1 
      -0.900018 
      649 
      0.0 
      1.0 
      0.0 
      0.0 
      2.04 
      0 
      0 
     
    
      5 
      -0.174509 
      10.59 
      1 
      -0.213111 
      751 
      1.0 
      0.0 
      0.0 
      0.0 
      17.12 
      1 
      0 
     
    
      6 
      -1.343626 
      15.96 
      1 
      -0.347058 
      27 
      0.0 
      0.0 
      1.0 
      0.0 
      12.57 
      0 
      0 
     
    
      7 
      -1.062059 
      9.01 
      1 
      3.049696 
      22 
      0.0 
      0.0 
      0.0 
      1.0 
      10.00 
      1 
      0 
     
    
      8 
      -1.245690 
      9.96 
      1 
      -1.011640 
      536 
      1.0 
      0.0 
      0.0 
      0.0 
      16.44 
      0 
      0 
     
    
      9 
      -1.062059 
      7.43 
      1 
      1.332429 
      103 
      1.0 
      0.0 
      0.0 
      0.0 
      0.00 
      0 
      0 
     
    
      10 
      -1.062059 
      7.43 
      1 
      0.387932 
      214 
      1.0 
      0.0 
      0.0 
      0.0 
      3.83 
      0 
      0 
     
    
      11 
      1.386355 
      10.59 
      1 
      -0.419183 
      598 
      1.0 
      0.0 
      0.0 
      0.0 
      4.05 
      0 
      0 
     
    
      12 
      -0.449955 
      10.28 
      1 
      -0.773352 
      740 
      1.0 
      0.0 
      0.0 
      0.0 
      10.70 
      0 
      0 
     
    
      13 
      -1.062059 
      9.01 
      1 
      0.010134 
      28 
      0.0 
      1.0 
      0.0 
      0.0 
      9.96 
      0 
      0 
     
    
      14 
      -1.049817 
      8.38 
      1 
      -0.693946 
      84 
      1.0 
      0.0 
      0.0 
      0.0 
      14.78 
      0 
      0 
     
    
      15 
      -1.013090 
      7.75 
      0 
      -1.106090 
      351 
      1.0 
      0.0 
      0.0 
      0.0 
      3.00 
      0 
      0 
     
    
      16 
      -1.025332 
      8.38 
      1 
      0.027306 
      88 
      0.0 
      1.0 
      0.0 
      0.0 
      14.37 
      0 
      0 
     
    
      17 
      -1.062059 
      11.22 
      1 
      -0.728291 
      260 
      0.0 
      1.0 
      0.0 
      0.0 
      18.64 
      0 
      0 
     
    
      18 
      -1.306900 
      10.28 
      1 
      2.191062 
      485 
      1.0 
      0.0 
      0.0 
      0.0 
      0.00 
      1 
      0 
     
    
      19 
      -1.306900 
      8.70 
      0 
      -0.814155 
      527 
      1.0 
      0.0 
      0.0 
      0.0 
      14.54 
      0 
      0 
     
    
      20 
      -1.062059 
      7.43 
      1 
      0.817249 
      68 
      0.0 
      0.0 
      1.0 
      0.0 
      2.29 
      0 
      0 
     
    
      21 
      -1.062059 
      8.07 
      0 
      1.847609 
      33 
      1.0 
      0.0 
      0.0 
      0.0 
      5.55 
      0 
      0 
     
    
      22 
      -1.062059 
      9.33 
      1 
      1.847609 
      41 
      1.0 
      0.0 
      0.0 
      0.0 
      11.93 
      0 
      0 
     
    
      23 
      -1.062059 
      9.96 
      1 
      -0.419183 
      104 
      0.0 
      0.0 
      1.0 
      0.0 
      8.03 
      0 
      0 
     
    
      24 
      -1.062059 
      11.22 
      1 
      0.130342 
      0 
      0.0 
      1.0 
      0.0 
      0.0 
      1.21 
      0 
      0 
     
    
      25 
      -1.062059 
      7.43 
      1 
      0.216206 
      0 
      0.0 
      1.0 
      0.0 
      0.0 
      0.31 
      0 
      0 
     
    
      26 
      -1.062059 
      8.70 
      1 
      0.044479 
      72 
      0.0 
      1.0 
      0.0 
      0.0 
      15.55 
      0 
      0 
     
    
      27 
      -1.368110 
      8.07 
      1 
      0.645522 
      148 
      1.0 
      0.0 
      0.0 
      0.0 
      11.33 
      0 
      0 
     
    
      28 
      -0.633586 
      10.28 
      1 
      -0.934363 
      406 
      0.0 
      1.0 
      0.0 
      0.0 
      6.40 
      1 
      0 
     
    
      29 
      -1.062059 
      8.07 
      0 
      0.473796 
      0 
      0.0 
      0.0 
      1.0 
      0.0 
      2.30 
      0 
      0 
     
    
      ... 
      ... 
      ... 
      ... 
      ... 
      ... 
      ... 
      ... 
      ... 
      ... 
      ... 
      ... 
      ... 
     
    
      268106 
      0.284569 
      16.99 
      1 
      -0.470701 
      0 
      1.0 
      0.0 
      0.0 
      0.0 
      17.17 
      1 
      1 
     
    
      268107 
      0.174390 
      28.99 
      1 
      -0.522219 
      0 
      0.0 
      0.0 
      1.0 
      0.0 
      27.49 
      0 
      1 
     
    
      268108 
      -1.062059 
      11.49 
      1 
      -0.384838 
      0 
      0.0 
      1.0 
      0.0 
      0.0 
      8.52 
      0 
      1 
     
    
      268109 
      -1.306900 
      11.49 
      1 
      -0.693946 
      0 
      0.0 
      0.0 
      1.0 
      0.0 
      9.79 
      0 
      1 
     
    
      268110 
      1.508776 
      19.99 
      1 
      0.336414 
      0 
      0.0 
      1.0 
      0.0 
      0.0 
      19.59 
      0 
      1 
     
    
      268111 
      1.753617 
      18.99 
      1 
      2.294098 
      0 
      0.0 
      0.0 
      1.0 
      0.0 
      11.88 
      0 
      1 
     
    
      268112 
      -1.062059 
      10.99 
      1 
      -0.470701 
      0 
      1.0 
      0.0 
      0.0 
      0.0 
      21.95 
      0 
      1 
     
    
      268113 
      0.162148 
      9.99 
      1 
      -0.213111 
      0 
      0.0 
      0.0 
      1.0 
      0.0 
      9.86 
      0 
      0 
     
    
      268114 
      -1.123269 
      9.17 
      1 
      -0.814155 
      0 
      0.0 
      0.0 
      1.0 
      0.0 
      31.20 
      0 
      0 
     
    
      268115 
      -1.429321 
      12.05 
      0 
      -0.137551 
      0 
      0.0 
      0.0 
      1.0 
      0.0 
      22.29 
      0 
      1 
     
    
      268116 
      0.039727 
      10.99 
      1 
      1.160702 
      0 
      0.0 
      1.0 
      0.0 
      0.0 
      7.32 
      0 
      1 
     
    
      268117 
      -1.107966 
      15.41 
      0 
      -0.728291 
      0 
      0.0 
      0.0 
      1.0 
      0.0 
      16.28 
      0 
      1 
     
    
      268118 
      0.529410 
      15.41 
      1 
      -0.110075 
      0 
      0.0 
      0.0 
      1.0 
      0.0 
      21.92 
      0 
      1 
     
    
      268119 
      2.610562 
      16.99 
      1 
      0.319242 
      0 
      0.0 
      0.0 
      1.0 
      0.0 
      26.72 
      0 
      1 
     
    
      268120 
      0.162148 
      12.05 
      1 
      0.130342 
      0 
      0.0 
      0.0 
      0.0 
      1.0 
      14.75 
      0 
      1 
     
    
      268121 
      2.610562 
      13.67 
      1 
      0.319242 
      0 
      1.0 
      0.0 
      0.0 
      0.0 
      3.81 
      0 
      1 
     
    
      268122 
      1.998458 
      7.26 
      1 
      1.847609 
      0 
      1.0 
      0.0 
      0.0 
      0.0 
      12.39 
      0 
      1 
     
    
      268123 
      -0.272445 
      16.55 
      1 
      -0.213111 
      0 
      0.0 
      0.0 
      0.0 
      1.0 
      12.60 
      0 
      1 
     
    
      268124 
      -1.062059 
      13.67 
      1 
      -0.106641 
      0 
      0.0 
      0.0 
      0.0 
      1.0 
      19.56 
      0 
      1 
     
    
      268125 
      2.610562 
      9.17 
      1 
      1.783211 
      0 
      0.0 
      1.0 
      0.0 
      0.0 
      22.68 
      0 
      1 
     
    
      268126 
      -0.817217 
      12.59 
      1 
      -0.630407 
      0 
      0.0 
      0.0 
      1.0 
      0.0 
      25.51 
      0 
      1 
     
    
      268127 
      0.162148 
      17.57 
      1 
      -0.419183 
      0 
      0.0 
      0.0 
      1.0 
      0.0 
      27.12 
      0 
      0 
     
    
      268128 
      1.925006 
      17.86 
      1 
      0.559659 
      0 
      0.0 
      0.0 
      1.0 
      0.0 
      30.19 
      0 
      1 
     
    
      268129 
      0.162148 
      18.49 
      1 
      0.302069 
      0 
      0.0 
      0.0 
      1.0 
      0.0 
      25.13 
      1 
      0 
     
    
      268130 
      0.774252 
      6.24 
      1 
      0.903112 
      0 
      0.0 
      1.0 
      0.0 
      0.0 
      14.20 
      1 
      1 
     
    
      268131 
      2.127000 
      21.99 
      1 
      13.782613 
      0 
      0.0 
      0.0 
      1.0 
      0.0 
      9.66 
      1 
      1 
     
    
      268132 
      -0.352019 
      7.89 
      1 
      0.343283 
      0 
      0.0 
      1.0 
      0.0 
      0.0 
      19.62 
      1 
      1 
     
    
      268133 
      -0.572376 
      9.17 
      1 
      0.130342 
      0 
      0.0 
      0.0 
      1.0 
      0.0 
      3.97 
      1 
      1 
     
    
      268134 
      0.088696 
      25.99 
      0 
      -0.178766 
      0 
      0.0 
      0.0 
      1.0 
      0.0 
      16.88 
      0 
      1 
     
    
      268135 
      -0.694797 
      12.59 
      1 
      -0.470701 
      0 
      0.0 
      0.0 
      1.0 
      0.0 
      26.21 
      0 
      1 
     
  
268136 rows × 12 columns
In [70]:
    
from sklearn.cross_validation import train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
    
In [82]:
    
def stratified_cross_val(data_test, dfy, n_iter):
    """Print a per-fold classification report for a logistic regression.

    Parameters
    ----------
    data_test : DataFrame
        Feature matrix; rows are selected by position for each fold.
    dfy : DataFrame
        Target frame; only its 'loan_status' column is used.
    n_iter : int
        Number of stratified folds (passed to StratifiedKFold as n_folds).

    Returns
    -------
    None
        The reports are printed, one per fold, separated by a rule of '='.
    """
    labels = dfy['loan_status']
    # NOTE(review): shuffle is left at its default, so random_state
    # presumably has no effect — confirm against the sklearn version in use.
    cv = StratifiedKFold(labels, n_folds=n_iter, random_state=3)
    for train_index, test_index in cv:
        # The fold indices are positions, so select with .iloc rather than
        # the deprecated .ix, which is label-based on an integer index.
        X_train = data_test.iloc[train_index, :]
        X_test = data_test.iloc[test_index, :]
        # Hand the estimator a 1-D target instead of a one-column frame.
        y_train = labels.iloc[train_index]
        y_test = labels.iloc[test_index]
        model = LogisticRegression().fit(X_train, y_train)
        print(classification_report(y_test, model.predict(X_test)))
        print('=' * 80)
    
In [51]:
    
def sm_logit(data_test, dfy):
    """Fit a statsmodels Logit of dfy on data_test plus a constant column and print the fit summary."""
    design = sm.add_constant(data_test)
    fitted = sm.Logit(dfy, design).fit()
    print(fitted.summary())
    
In [52]:
    
# data_test2 was never defined in the notebook (it only existed as leftover
# kernel state). Rebuild it as data_test without the purpose dummy columns,
# which yields the eight regressors listed in the recorded summary.
data_test2 = data_test.drop(
    ['purpose_other', 'purpose_2', 'purpose_3', 'purpose_10'], axis=1)
sm_logit(data_test2, dfy)
    
    
Optimization terminated successfully.
         Current function value: 0.491694
         Iterations 6
                           Logit Regression Results                           
==============================================================================
Dep. Variable:            loan_status   No. Observations:               268136
Model:                          Logit   Df Residuals:                   268127
Method:                           MLE   Df Model:                            8
Date:                Tue, 14 Mar 2017   Pseudo R-squ.:                 0.07845
Time:                        13:12:33   Log-Likelihood:            -1.3184e+05
converged:                       True   LL-Null:                   -1.4306e+05
                                        LLR p-value:                     0.000
=======================================================================================
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   3.1833      0.028    112.009      0.000       3.128       3.239
loan_amnt              -0.1295      0.006    -22.154      0.000      -0.141      -0.118
int_rate               -0.1251      0.001   -105.980      0.000      -0.127      -0.123
emp_title               0.4080      0.020     20.667      0.000       0.369       0.447
annual_inc              0.3000      0.009     32.345      0.000       0.282       0.318
desc                    0.0004   2.79e-05     13.942      0.000       0.000       0.000
dti                    -0.0249      0.001    -38.919      0.000      -0.026      -0.024
delinq_2yrs            -0.1040      0.013     -8.198      0.000      -0.129      -0.079
verification_status    -0.0900      0.012     -7.667      0.000      -0.113      -0.067
=======================================================================================
In [54]:
    
# Preview the tail of the scaled feature frame once more.
dfX_s.tail()
    
    Out[54]:
  
    
       
      loan_amnt 
      int_rate 
      emp_title 
      emp_length 
      home_ownership 
      annual_inc 
      verification_status 
      issue_d 
      desc 
      purpose 
      dti 
      delinq_2yrs 
      inq_last_6mths 
      pub_rec 
      revol_bal 
      revol_util 
      total_acc 
      initial_list_status 
     
  
  
    
      268131 
      2.127000 
      21.99 
      1 
      10 
      1 
      13.782613 
      1 
      12 
      0 
      3 
      9.66 
      1 
      0 
      0 
      0.561689 
      79.3 
      13 
      0 
     
    
      268132 
      -0.352019 
      7.89 
      1 
      8 
      1 
      0.343283 
      1 
      12 
      0 
      2 
      19.62 
      1 
      0 
      0 
      -0.287872 
      68.7 
      36 
      1 
     
    
      268133 
      -0.572376 
      9.17 
      1 
      1 
      1 
      0.130342 
      1 
      12 
      0 
      3 
      3.97 
      1 
      0 
      0 
      -0.470413 
      51.8 
      17 
      0 
     
    
      268134 
      0.088696 
      25.99 
      0 
      11 
      4 
      -0.178766 
      1 
      12 
      0 
      3 
      16.88 
      0 
      1 
      1 
      -0.504533 
      45.1 
      30 
      0 
     
    
      268135 
      -0.694797 
      12.59 
      1 
      4 
      3 
      -0.470701 
      1 
      12 
      0 
      3 
      26.21 
      0 
      0 
      0 
      -0.323053 
      50.8 
      47 
      1 
     
  
In [55]:
    
# Count the occurrences of each pub_rec value.
dfX_s['pub_rec'].value_counts()
    
    Out[55]:
0    234589
1     33547
Name: pub_rec, dtype: int64
In [56]:
    
# Preview the tail of the modelling frame.
data_test.tail()
    
    Out[56]:
  
    
       
      loan_amnt 
      int_rate 
      emp_title 
      annual_inc 
      desc 
      purpose_other 
      purpose_2 
      purpose_3 
      purpose_10 
      dti 
      delinq_2yrs 
      verification_status 
     
  
  
    
      268131 
      2.127000 
      21.99 
      1 
      13.782613 
      0 
      0.0 
      0.0 
      1.0 
      0.0 
      9.66 
      1 
      1 
     
    
      268132 
      -0.352019 
      7.89 
      1 
      0.343283 
      0 
      0.0 
      1.0 
      0.0 
      0.0 
      19.62 
      1 
      1 
     
    
      268133 
      -0.572376 
      9.17 
      1 
      0.130342 
      0 
      0.0 
      0.0 
      1.0 
      0.0 
      3.97 
      1 
      1 
     
    
      268134 
      0.088696 
      25.99 
      0 
      -0.178766 
      0 
      0.0 
      0.0 
      1.0 
      0.0 
      16.88 
      0 
      1 
     
    
      268135 
      -0.694797 
      12.59 
      1 
      -0.470701 
      0 
      0.0 
      0.0 
      1.0 
      0.0 
      26.21 
      0 
      1 
     
  
In [57]:
    
# Remove three of the four purpose dummy columns from data_test in place,
# leaving purpose_10 as the only purpose indicator.
data_test.drop(['purpose_other', 'purpose_2', 'purpose_3'], axis=1, inplace=True)
    
In [58]:
    
# Preview the tail of the modelling frame after the column removal.
data_test.tail()
    
    Out[58]:
  
    
       
      loan_amnt 
      int_rate 
      emp_title 
      annual_inc 
      desc 
      purpose_10 
      dti 
      delinq_2yrs 
      verification_status 
     
  
  
    
      268131 
      2.127000 
      21.99 
      1 
      13.782613 
      0 
      0.0 
      9.66 
      1 
      1 
     
    
      268132 
      -0.352019 
      7.89 
      1 
      0.343283 
      0 
      0.0 
      19.62 
      1 
      1 
     
    
      268133 
      -0.572376 
      9.17 
      1 
      0.130342 
      0 
      0.0 
      3.97 
      1 
      1 
     
    
      268134 
      0.088696 
      25.99 
      0 
      -0.178766 
      0 
      0.0 
      16.88 
      0 
      1 
     
    
      268135 
      -0.694797 
      12.59 
      1 
      -0.470701 
      0 
      0.0 
      26.21 
      0 
      1 
     
  
In [59]:
    
# Fit and summarise the logit on the current data_test columns.
sm_logit(data_test, dfy)
    
    
Optimization terminated successfully.
         Current function value: 0.491682
         Iterations 6
                           Logit Regression Results                           
==============================================================================
Dep. Variable:            loan_status   No. Observations:               268136
Model:                          Logit   Df Residuals:                   268126
Method:                           MLE   Df Model:                            9
Date:                Tue, 14 Mar 2017   Pseudo R-squ.:                 0.07847
Time:                        13:17:15   Log-Likelihood:            -1.3184e+05
converged:                       True   LL-Null:                   -1.4306e+05
                                        LLR p-value:                     0.000
=======================================================================================
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   3.1829      0.028    112.002      0.000       3.127       3.239
loan_amnt              -0.1317      0.006    -22.278      0.000      -0.143      -0.120
int_rate               -0.1247      0.001   -104.923      0.000      -0.127      -0.122
emp_title               0.4076      0.020     20.644      0.000       0.369       0.446
annual_inc              0.3010      0.009     32.418      0.000       0.283       0.319
desc                    0.0004   2.79e-05     13.956      0.000       0.000       0.000
purpose_10             -0.0520      0.021     -2.492      0.013      -0.093      -0.011
dti                    -0.0249      0.001    -38.992      0.000      -0.026      -0.024
delinq_2yrs            -0.1042      0.013     -8.214      0.000      -0.129      -0.079
verification_status    -0.0901      0.012     -7.675      0.000      -0.113      -0.067
=======================================================================================
In [83]:
    
# Run the cross-validation helper with 10 stratified folds.
stratified_cross_val(data_test, dfy, 10)
    
    
             precision    recall  f1-score   support
          0       1.00      0.00      0.01      6042
          1       0.78      1.00      0.87     20773
avg / total       0.83      0.78      0.68     26815
================================================================================
             precision    recall  f1-score   support
          0       0.87      0.03      0.06      6042
          1       0.78      1.00      0.88     20772
avg / total       0.80      0.78      0.69     26814
================================================================================
             precision    recall  f1-score   support
          0       0.63      0.06      0.11      6042
          1       0.78      0.99      0.87     20772
avg / total       0.75      0.78      0.70     26814
================================================================================
             precision    recall  f1-score   support
          0       0.62      0.09      0.16      6042
          1       0.79      0.98      0.88     20772
avg / total       0.75      0.78      0.71     26814
================================================================================
             precision    recall  f1-score   support
          0       0.54      0.11      0.19      6042
          1       0.79      0.97      0.87     20772
avg / total       0.73      0.78      0.72     26814
================================================================================
             precision    recall  f1-score   support
          0       0.42      0.10      0.16      6041
          1       0.79      0.96      0.86     20772
avg / total       0.70      0.77      0.71     26813
================================================================================
             precision    recall  f1-score   support
          0       0.44      0.09      0.14      6041
          1       0.78      0.97      0.87     20772
avg / total       0.71      0.77      0.70     26813
================================================================================
             precision    recall  f1-score   support
          0       0.56      0.11      0.18      6041
          1       0.79      0.97      0.87     20772
avg / total       0.74      0.78      0.72     26813
================================================================================
             precision    recall  f1-score   support
          0       0.50      0.13      0.20      6041
          1       0.79      0.96      0.87     20772
avg / total       0.73      0.77      0.72     26813
================================================================================
             precision    recall  f1-score   support
          0       0.43      0.12      0.19      6041
          1       0.79      0.95      0.86     20772
avg / total       0.71      0.77      0.71     26813
================================================================================
In [ ]:
    
    
Content source: shinys825/lc_project
Similar notebooks: