notebook.community

Edit and run



In [44]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from sklearn import metrics
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer

pd.set_option('display.max_columns',1000)



In [45]:

    
from xgboost import XGBClassifier,plot_importance



In [46]:

    
def plot_value_labels(axis,format='{:.2f}'):
    """Write each bar's height next to the bar on ``axis``.

    Parameters
    ----------
    axis : object with ``patches`` and ``annotate``
        Typically the matplotlib Axes holding a bar chart. Every entry of
        ``axis.patches`` must provide ``get_height()``, ``get_x()`` and
        ``get_width()``.
    format : str, optional
        ``str.format`` template applied to the bar height, e.g. ``'{:.3f}'``.
        An empty or ``None`` value falls back to ``'{:.2f}'``.

    Returns
    -------
    None
        The labels are added to ``axis`` as annotations.
    """
    template = format or '{:.2f}'

    # Gap, in points, between the end of a bar and its label.
    gap_points = 2

    for rect in axis.patches:
        # The label is centred horizontally on the bar and anchored at its end.
        y_value = rect.get_height()
        x_value = rect.get_x() + rect.get_width() / 2

        if y_value < 0:
            # Negative bar: push the label below the bar end and hang it
            # from its top edge so it does not overlap the bar.
            offset = -gap_points
            va = 'top'
        else:
            offset = gap_points
            va = 'bottom'

        axis.annotate(template.format(y_value), (x_value, y_value),
                      xytext=(0, offset),
                      textcoords="offset points",
                      ha='center',
                      rotation=45,
                      va=va)



In [47]:

    
# Fix the seed of NumPy's global random number generator.
np.random.seed(1234)



In [48]:

    
# Load the credit-card default spreadsheet (path is relative to the working directory).
df = pd.read_excel('data/credit-card-default/data.xls')



In [49]:

    
# Preview the first five rows of the freshly loaded sheet.
df.head(5)









    Out[49]:







  
    
      
      X1
      X2
      X3
      X4
      X5
      X6
      X7
      X8
      X9
      X10
      X11
      X12
      X13
      X14
      X15
      X16
      X17
      X18
      X19
      X20
      X21
      X22
      X23
      Y
    
  
  
    
      ID
      LIMIT_BAL
      SEX
      EDUCATION
      MARRIAGE
      AGE
      PAY_0
      PAY_2
      PAY_3
      PAY_4
      PAY_5
      PAY_6
      BILL_AMT1
      BILL_AMT2
      BILL_AMT3
      BILL_AMT4
      BILL_AMT5
      BILL_AMT6
      PAY_AMT1
      PAY_AMT2
      PAY_AMT3
      PAY_AMT4
      PAY_AMT5
      PAY_AMT6
      default payment next month
    
    
      1
      20000
      2
      2
      1
      24
      2
      2
      -1
      -1
      -2
      -2
      3913
      3102
      689
      0
      0
      0
      0
      689
      0
      0
      0
      0
      1
    
    
      2
      120000
      2
      2
      2
      26
      -1
      2
      0
      0
      0
      2
      2682
      1725
      2682
      3272
      3455
      3261
      0
      1000
      1000
      1000
      0
      2000
      1
    
    
      3
      90000
      2
      2
      2
      34
      0
      0
      0
      0
      0
      0
      29239
      14027
      13559
      14331
      14948
      15549
      1518
      1500
      1000
      1000
      1000
      5000
      0
    
    
      4
      50000
      2
      2
      1
      37
      0
      0
      0
      0
      0
      0
      46990
      48233
      49291
      28314
      28959
      29547
      2000
      2019
      1200
      1100
      1069
      1000
      0



In [50]:

    
# Billing months in the order the sheet lists them (newest first). Each of the
# three six-column families below follows this order.
months = ['200509', '200508', '200507', '200506', '200505', '200504']

column_names = {
    'X1': 'limit',
    'X2': 'sex',
    'X3': 'education',
    'X4': 'marriage',
    'X5': 'age',
    'Y': 'default',
}

# X6-X11 -> status_*, X12-X17 -> amount_charged_*, X18-X23 -> amount_paid_*
for first_x, family in ((6, 'status'), (12, 'amount_charged'), (18, 'amount_paid')):
    for position, month in enumerate(months):
        column_names['X{}'.format(first_x + position)] = '{}_{}'.format(family, month)

df = df.rename(columns=column_names)

# reset_index() moves the current index into an 'index' column. Row 0 then
# holds the sheet's second header line (ID, LIMIT_BAL, ...) and is dropped,
# after which the 'index' column itself is removed.
df = df.reset_index()
df = df.drop([0])
df = df.drop('index', axis=1)



In [51]:

    
# Inspect 15 randomly chosen rows after the rename.
df.sample(15)









    Out[51]:







  
    
      
      limit
      sex
      education
      marriage
      age
      status_200509
      status_200508
      status_200507
      status_200506
      status_200505
      status_200504
      amount_charged_200509
      amount_charged_200508
      amount_charged_200507
      amount_charged_200506
      amount_charged_200505
      amount_charged_200504
      amount_paid_200509
      amount_paid_200508
      amount_paid_200507
      amount_paid_200506
      amount_paid_200505
      amount_paid_200504
      default
    
  
  
    
      13126
      400000
      1
      1
      1
      34
      -2
      -2
      -2
      -2
      -2
      -2
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      14636
      80000
      1
      2
      2
      34
      0
      0
      0
      0
      0
      0
      66122
      92131
      47655
      43182
      44332
      45440
      2600
      4300
      2000
      2000
      2000
      2000
      0
    
    
      19430
      200000
      2
      3
      1
      49
      1
      -2
      -1
      -1
      -1
      -1
      0
      0
      2317
      7588
      7606
      14053
      0
      2317
      7588
      7614
      14053
      0
      0
    
    
      4382
      20000
      2
      2
      1
      41
      -1
      -1
      -1
      -1
      -1
      -1
      2468
      1077
      1140
      0
      7014
      7696
      1087
      1140
      0
      7014
      800
      0
      0
    
    
      7660
      70000
      2
      1
      1
      36
      2
      0
      0
      0
      0
      0
      81719
      85389
      86287
      65287
      35345
      9360
      5000
      3000
      2000
      3000
      5000
      0
      0
    
    
      10638
      50000
      1
      2
      2
      24
      0
      0
      0
      0
      0
      0
      23553
      21507
      17827
      7655
      7881
      8248
      1700
      2000
      500
      500
      500
      1000
      0
    
    
      17046
      20000
      2
      1
      2
      25
      -1
      -1
      -1
      -1
      0
      0
      4541
      0
      724
      18589
      18985
      19531
      0
      724
      18589
      1000
      1000
      1000
      1
    
    
      24979
      350000
      1
      1
      2
      33
      -2
      -2
      -2
      -1
      0
      0
      6015
      8751
      686
      430667
      161089
      119339
      8797
      691
      238241
      1009
      596
      10356
      0
    
    
      13640
      20000
      2
      2
      2
      23
      2
      0
      0
      0
      0
      0
      11995
      13324
      14029
      14308
      14625
      18245
      1528
      1233
      512
      547
      3880
      0
      1
    
    
      9469
      200000
      2
      1
      1
      47
      -1
      -1
      -1
      -1
      -1
      -1
      935
      1418
      -2
      120
      150
      928
      1418
      0
      122
      300
      928
      464
      1
    
    
      18652
      100000
      2
      2
      1
      29
      0
      0
      0
      0
      0
      0
      118008
      116686
      113939
      111079
      107257
      105377
      5614
      5600
      5500
      4500
      5000
      6000
      0
    
    
      12360
      20000
      1
      1
      2
      40
      0
      0
      0
      0
      0
      0
      13639
      14711
      15462
      16004
      16203
      16916
      1600
      1300
      800
      605
      1000
      2000
      0
    
    
      54
      180000
      2
      1
      2
      25
      1
      2
      0
      0
      0
      0
      41402
      41742
      42758
      43510
      44420
      45319
      1300
      2010
      1762
      1762
      1790
      1622
      0
    
    
      7013
      180000
      1
      3
      2
      29
      -2
      -2
      -2
      -2
      -2
      -1
      4358
      3160
      2611
      8710
      5127
      8758
      3169
      2618
      8736
      5142
      8766
      15040
      0
    
    
      5223
      400000
      1
      3
      2
      49
      0
      0
      -2
      -2
      -1
      0
      36560
      0
      0
      0
      1757
      21667
      0
      0
      0
      1757
      20000
      10158
      0



In [52]:

    
# Convert every column to a numeric dtype, one column at a time.
df = df.apply(pd.to_numeric)



In [53]:

    
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()









    Out[53]:







  
    
      
      limit
      sex
      education
      marriage
      age
      status_200509
      status_200508
      status_200507
      status_200506
      status_200505
      status_200504
      amount_charged_200509
      amount_charged_200508
      amount_charged_200507
      amount_charged_200506
      amount_charged_200505
      amount_charged_200504
      amount_paid_200509
      amount_paid_200508
      amount_paid_200507
      amount_paid_200506
      amount_paid_200505
      amount_paid_200504
      default
    
  
  
    
      count
      30000.000000
      30000.000000
      30000.000000
      30000.000000
      30000.000000
      30000.000000
      30000.000000
      30000.000000
      30000.000000
      30000.000000
      30000.000000
      30000.000000
      30000.000000
      3.000000e+04
      30000.000000
      30000.000000
      30000.000000
      30000.000000
      3.000000e+04
      30000.00000
      30000.000000
      30000.000000
      30000.000000
      30000.000000
    
    
      mean
      167484.322667
      1.603733
      1.853133
      1.551867
      35.485500
      -0.016700
      -0.133767
      -0.166200
      -0.220667
      -0.266200
      -0.291100
      51223.330900
      49179.075167
      4.701315e+04
      43262.948967
      40311.400967
      38871.760400
      5663.580500
      5.921163e+03
      5225.68150
      4826.076867
      4799.387633
      5215.502567
      0.221200
    
    
      std
      129747.661567
      0.489129
      0.790349
      0.521970
      9.217904
      1.123802
      1.197186
      1.196868
      1.169139
      1.133187
      1.149988
      73635.860576
      71173.768783
      6.934939e+04
      64332.856134
      60797.155770
      59554.107537
      16563.280354
      2.304087e+04
      17606.96147
      15666.159744
      15278.305679
      17777.465775
      0.415062
    
    
      min
      10000.000000
      1.000000
      0.000000
      0.000000
      21.000000
      -2.000000
      -2.000000
      -2.000000
      -2.000000
      -2.000000
      -2.000000
      -165580.000000
      -69777.000000
      -1.572640e+05
      -170000.000000
      -81334.000000
      -339603.000000
      0.000000
      0.000000e+00
      0.00000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      50000.000000
      1.000000
      1.000000
      1.000000
      28.000000
      -1.000000
      -1.000000
      -1.000000
      -1.000000
      -1.000000
      -1.000000
      3558.750000
      2984.750000
      2.666250e+03
      2326.750000
      1763.000000
      1256.000000
      1000.000000
      8.330000e+02
      390.00000
      296.000000
      252.500000
      117.750000
      0.000000
    
    
      50%
      140000.000000
      2.000000
      2.000000
      2.000000
      34.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      22381.500000
      21200.000000
      2.008850e+04
      19052.000000
      18104.500000
      17071.000000
      2100.000000
      2.009000e+03
      1800.00000
      1500.000000
      1500.000000
      1500.000000
      0.000000
    
    
      75%
      240000.000000
      2.000000
      2.000000
      2.000000
      41.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      67091.000000
      64006.250000
      6.016475e+04
      54506.000000
      50190.500000
      49198.250000
      5006.000000
      5.000000e+03
      4505.00000
      4013.250000
      4031.500000
      4000.000000
      0.000000
    
    
      max
      1000000.000000
      2.000000
      6.000000
      3.000000
      79.000000
      8.000000
      8.000000
      8.000000
      8.000000
      8.000000
      8.000000
      964511.000000
      983931.000000
      1.664089e+06
      891586.000000
      927171.000000
      961664.000000
      873552.000000
      1.684259e+06
      896040.00000
      621000.000000
      426529.000000
      528666.000000
      1.000000



In [54]:

    
# Mean of the 0/1 'default' column, i.e. the share of rows flagged as default.
df['default'].mean()









    Out[54]:





0.22120000000000001

turn payment status into categories

The dataset author clarified the codes used in the status_* columns (PAY_* in the original sheet):

-2: No consumption; -1: Paid in full; 0: The use of revolving credit; 1 = payment delay for one month; 2 = payment delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above.

So let's use these categories:

-2 => category 'no_consumption'
-1 => category 'paid_full'
0 => category 'revolving'
1 and 2 => category 'delay_2_mths'
3 to 9 => category 'delay_3+_mths'



In [55]:

    
def fix_status(current_value):
    """Map a raw repayment-status code to a coarse category label.

    -2 -> 'no_consumption', -1 -> 'paid_full', 0 -> 'revolving',
    1 or 2 -> 'delay_2_mths', 3 through 9 -> 'delay_3+_mths'.
    Any other value yields 'other'.
    """
    buckets = (
        ((-2,), 'no_consumption'),
        ((-1,), 'paid_full'),
        ((0,), 'revolving'),
        ((1, 2), 'delay_2_mths'),
        ((3, 4, 5, 6, 7, 8, 9), 'delay_3+_mths'),
    )
    # Buckets are disjoint, so the first one containing the code decides.
    for codes, label in buckets:
        if current_value in codes:
            return label
    return 'other'

# Replace the numeric codes in every status_* column by their category label,
# stored as strings.
status_columns = [name for name in df.columns if name.startswith('status')]
for column_name in status_columns:
    labelled = df[column_name].map(fix_status)
    df[column_name] = labelled.astype(str)

one hot encoding where needed



In [56]:

    
# Columns to one-hot encode: the three demographic codes first, then every
# status_* column in its current order. Processing them in this order keeps
# the indicator columns appended in the same sequence.
categorical_columns = ['sex', 'education', 'marriage']
categorical_columns += [name for name in df.columns if name.startswith('status')]

for column_name in categorical_columns:
    # Append one indicator column per distinct value, named '<column>_<value>',
    # then remove the column that was encoded.
    indicators = pd.get_dummies(df[column_name], prefix=column_name)
    df = pd.concat([df, indicators], axis=1)
    df = df.drop([column_name], axis=1)



In [57]:

    
# Spot-check 10 random rows of the one-hot encoded frame.
df.sample(10)









    Out[57]:







  
    
      
      limit
      age
      amount_charged_200509
      amount_charged_200508
      amount_charged_200507
      amount_charged_200506
      amount_charged_200505
      amount_charged_200504
      amount_paid_200509
      amount_paid_200508
      amount_paid_200507
      amount_paid_200506
      amount_paid_200505
      amount_paid_200504
      default
      sex_1
      sex_2
      education_0
      education_1
      education_2
      education_3
      education_4
      education_5
      education_6
      marriage_0
      marriage_1
      marriage_2
      marriage_3
      status_200509_delay_2_mths
      status_200509_delay_3+_mths
      status_200509_no_consumption
      status_200509_paid_full
      status_200509_revolving
      status_200508_delay_2_mths
      status_200508_delay_3+_mths
      status_200508_no_consumption
      status_200508_paid_full
      status_200508_revolving
      status_200507_delay_2_mths
      status_200507_delay_3+_mths
      status_200507_no_consumption
      status_200507_paid_full
      status_200507_revolving
      status_200506_delay_2_mths
      status_200506_delay_3+_mths
      status_200506_no_consumption
      status_200506_paid_full
      status_200506_revolving
      status_200505_delay_2_mths
      status_200505_delay_3+_mths
      status_200505_no_consumption
      status_200505_paid_full
      status_200505_revolving
      status_200504_delay_2_mths
      status_200504_delay_3+_mths
      status_200504_no_consumption
      status_200504_paid_full
      status_200504_revolving
    
  
  
    
      28088
      120000
      33
      508
      607
      611
      593
      653
      587
      700
      611
      593
      653
      587
      916
      0
      0
      1
      0
      0
      1
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
    
    
      1361
      240000
      30
      7818
      9363
      0
      0
      0
      0
      9434
      0
      0
      0
      0
      178
      0
      0
      1
      0
      0
      1
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
    
    
      25412
      30000
      24
      25616
      27526
      11458
      7500
      7000
      5192
      31348
      1328
      1000
      7000
      5192
      0
      0
      0
      1
      0
      1
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      1
      0
    
    
      19795
      180000
      39
      2569
      11662
      11867
      8111
      6312
      3739
      11662
      6003
      8111
      273
      3739
      6547
      0
      0
      1
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
    
    
      17523
      120000
      44
      16735
      18150
      19528
      19065
      18617
      21381
      2000
      2000
      2000
      1000
      3000
      2000
      0
      0
      1
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
    
    
      17800
      50000
      23
      18666
      15554
      17159
      8402
      6646
      7534
      2000
      2000
      1500
      500
      1000
      2000
      0
      1
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
    
    
      13470
      240000
      46
      456
      0
      0
      2240
      1681
      2267
      0
      0
      2240
      0
      2267
      3074
      0
      0
      1
      0
      1
      0
      0
      0
      0
      0
      0
      0
      1
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      1
      0
    
    
      2660
      50000
      41
      26184
      29261
      28444
      28878
      27655
      24480
      3500
      0
      1100
      1100
      2000
      1000
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
    
    
      22442
      330000
      36
      257156
      257576
      256014
      245244
      239441
      242025
      10037
      10109
      10053
      10014
      9001
      9001
      0
      0
      1
      0
      0
      1
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
    
    
      18143
      300000
      32
      8077
      10566
      11293
      12000
      3458
      56338
      2800
      1200
      1200
      9427
      55000
      2000
      1
      1
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1



In [58]:

    
# Features are every column except the label; the label is the 'default' column.
data = df.drop('default',axis=1)
target = df['default']



In [59]:

    
# Preview the first rows of the feature frame.
data.head()









    Out[59]:







  
    
      
      limit
      age
      amount_charged_200509
      amount_charged_200508
      amount_charged_200507
      amount_charged_200506
      amount_charged_200505
      amount_charged_200504
      amount_paid_200509
      amount_paid_200508
      amount_paid_200507
      amount_paid_200506
      amount_paid_200505
      amount_paid_200504
      sex_1
      sex_2
      education_0
      education_1
      education_2
      education_3
      education_4
      education_5
      education_6
      marriage_0
      marriage_1
      marriage_2
      marriage_3
      status_200509_delay_2_mths
      status_200509_delay_3+_mths
      status_200509_no_consumption
      status_200509_paid_full
      status_200509_revolving
      status_200508_delay_2_mths
      status_200508_delay_3+_mths
      status_200508_no_consumption
      status_200508_paid_full
      status_200508_revolving
      status_200507_delay_2_mths
      status_200507_delay_3+_mths
      status_200507_no_consumption
      status_200507_paid_full
      status_200507_revolving
      status_200506_delay_2_mths
      status_200506_delay_3+_mths
      status_200506_no_consumption
      status_200506_paid_full
      status_200506_revolving
      status_200505_delay_2_mths
      status_200505_delay_3+_mths
      status_200505_no_consumption
      status_200505_paid_full
      status_200505_revolving
      status_200504_delay_2_mths
      status_200504_delay_3+_mths
      status_200504_no_consumption
      status_200504_paid_full
      status_200504_revolving
    
  
  
    
      1
      20000
      24
      3913
      3102
      689
      0
      0
      0
      0
      689
      0
      0
      0
      0
      0
      1
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
    
    
      2
      120000
      26
      2682
      1725
      2682
      3272
      3455
      3261
      0
      1000
      1000
      1000
      0
      2000
      0
      1
      0
      0
      1
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      1
      0
      0
      0
      0
    
    
      3
      90000
      34
      29239
      14027
      13559
      14331
      14948
      15549
      1518
      1500
      1000
      1000
      1000
      5000
      0
      1
      0
      0
      1
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
    
    
      4
      50000
      37
      46990
      48233
      49291
      28314
      28959
      29547
      2000
      2019
      1200
      1100
      1069
      1000
      0
      1
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
    
    
      5
      50000
      57
      8617
      5670
      35835
      20940
      19146
      19131
      2000
      36681
      10000
      9000
      689
      679
      1
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1



In [60]:

    
# Hold out 25% of the rows for evaluation. No random_state is passed, so the
# split presumably depends on NumPy's global RNG state (seeded earlier in the
# notebook) -- confirm against the scikit-learn documentation.
X_train, X_test, y_train, y_test = train_test_split(
    data.values, 
    target.values, 
    test_size=0.25)



In [61]:

    
# Fit an XGBoost classifier with its default parameters on the training part.
# ravel() passes the labels as a flat 1-D array.
clf = XGBClassifier()
clf.fit(X_train, y_train.ravel())









    Out[61]:





XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)



In [62]:

    
# predict_proba returns one score column per class; the second column is the
# score for the positive class, which is what the ROC curve needs.
y_preds = clf.predict_proba(X_test)
preds = y_preds[:, 1]

# False-positive rate and true-positive rate per threshold (the thresholds
# themselves are not needed).
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
auc_score = metrics.auc(fpr, tpr)

roc_label = 'AUC = {:.3f}'.format(auc_score)
padded_unit_range = [-0.1, 1.1]

plt.title('ROC Curve')
plt.plot(fpr, tpr, label=roc_label)

# Dashed red diagonal: the performance of a coin flip, for reference.
plt.plot([0, 1], [0, 1], 'r--')

plt.xlim(padded_unit_range)
plt.ylim(padded_unit_range)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

plt.legend(loc='lower right')
plt.show()



In [63]:

    
# Upper bound on the number of bars shown in the importance chart.
n_features = 43

plt.clf()

# Importance score per feature column, keyed by column name.
d = dict(zip(data.columns, clf.feature_importances_))

# Fold the one-hot indicator columns back into their source feature so each
# categorical variable is reported once. Members are found by prefix rather
# than by a hard-coded code range, so every level present in the data is
# included (a fixed range 0..2 would miss 'marriage_3').
for group in ('marriage', 'sex', 'education'):
    prefix = '{}_'.format(group)
    members = [name for name in d if name.startswith(prefix)]
    d[group] = sum(d.pop(name) for name in members)

# Never ask the chart for more bars than there are aggregated features.
n_features = min(n_features, len(d))



In [64]:

    
# Feature names ordered from most to least important; keep the leading
# n_features of them.
ss = sorted(d.keys(), key=lambda name: d[name], reverse=True)
top_names = ss[:n_features]

bar_positions = range(n_features)
bar_heights = [d[name] for name in top_names]

plt.title("Feature importances")
plt.bar(bar_positions, bar_heights, color="r", align="center")
plt.xlim(-1, n_features)
plt.xticks(bar_positions, top_names, rotation='vertical')
plt.yticks(np.arange(0, 0.12, 0.005))

# Write each bar's value next to it.
plot_value_labels(plt.gca(), format='{:.3f}')

plt.gcf().set_size_inches(10, 6)
plt.ylim(0.0, 0.11)
plt.tight_layout()
plt.show()

can we do better by training a different model by subpopulation?



In [65]:

    
# Distribution of customer age.
df['age'].describe()









    Out[65]:





count    30000.000000
mean        35.485500
std          9.217904
min         21.000000
25%         28.000000
50%         34.000000
75%         41.000000
max         79.000000
Name: age, dtype: float64

young people (age<=30)



In [66]:

    
# Keep only customers aged 30 or younger.
is_young = df['age'] <= 30
data = df.loc[is_young].drop('default', axis=1)
target = df.loc[is_young, 'default']

# Hold out a quarter of the subset for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    data.values, target.values, test_size=0.25)

# Default-parameter XGBoost classifier trained on this subset only.
clf = XGBClassifier()
clf.fit(X_train, y_train.ravel())

# The second predict_proba column is the score for the positive class.
y_preds = clf.predict_proba(X_test)
preds = y_preds[:, 1]

# False-positive rate / true-positive rate per threshold, and the area under
# the resulting curve.
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
auc_score = metrics.auc(fpr, tpr)

roc_label = 'AUC = {:.3f}'.format(auc_score)
padded_unit_range = [-0.1, 1.1]

plt.title('ROC Curve')
plt.plot(fpr, tpr, label=roc_label)

# Dashed red diagonal marks chance-level performance.
plt.plot([0, 1], [0, 1], 'r--')

plt.xlim(padded_unit_range)
plt.ylim(padded_unit_range)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

plt.legend(loc='lower right')
plt.show()

middle age (30 < age <=50)



In [67]:

    
# Keep only customers aged 31 through 50 (both ends included).
is_middle_aged = df['age'].between(31, 50)
data = df.loc[is_middle_aged].drop('default', axis=1)
target = df.loc[is_middle_aged, 'default']

# Hold out a quarter of the subset for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    data.values, target.values, test_size=0.25)

# Default-parameter XGBoost classifier trained on this subset only.
clf = XGBClassifier()
clf.fit(X_train, y_train.ravel())

# The second predict_proba column is the score for the positive class.
y_preds = clf.predict_proba(X_test)
preds = y_preds[:, 1]

# False-positive rate / true-positive rate per threshold, and the area under
# the resulting curve.
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
auc_score = metrics.auc(fpr, tpr)

roc_label = 'AUC = {:.3f}'.format(auc_score)
padded_unit_range = [-0.1, 1.1]

plt.title('ROC Curve')
plt.plot(fpr, tpr, label=roc_label)

# Dashed red diagonal marks chance-level performance.
plt.plot([0, 1], [0, 1], 'r--')

plt.xlim(padded_unit_range)
plt.ylim(padded_unit_range)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

plt.legend(loc='lower right')
plt.show()

older people (age > 50)



In [68]:

    
# Keep only customers older than 50.
is_older = df['age'] > 50
data = df.loc[is_older].drop('default', axis=1)
target = df.loc[is_older, 'default']

# Hold out a quarter of the subset for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    data.values, target.values, test_size=0.25)

# Default-parameter XGBoost classifier trained on this subset only.
clf = XGBClassifier()
clf.fit(X_train, y_train.ravel())

# The second predict_proba column is the score for the positive class.
y_preds = clf.predict_proba(X_test)
preds = y_preds[:, 1]

# False-positive rate / true-positive rate per threshold, and the area under
# the resulting curve.
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
auc_score = metrics.auc(fpr, tpr)

roc_label = 'AUC = {:.3f}'.format(auc_score)
padded_unit_range = [-0.1, 1.1]

plt.title('ROC Curve')
plt.plot(fpr, tpr, label=roc_label)

# Dashed red diagonal marks chance-level performance.
plt.plot([0, 1], [0, 1], 'r--')

plt.xlim(padded_unit_range)
plt.ylim(padded_unit_range)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

plt.legend(loc='lower right')
plt.show()

men only



In [70]:

    
# Keep only the rows whose sex_1 indicator is set.
has_sex_1 = df['sex_1'] == 1
data = df.loc[has_sex_1].drop('default', axis=1)
target = df.loc[has_sex_1, 'default']

# Hold out a quarter of the subset for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    data.values, target.values, test_size=0.25)

# Default-parameter XGBoost classifier trained on this subset only.
clf = XGBClassifier()
clf.fit(X_train, y_train.ravel())

# The second predict_proba column is the score for the positive class.
y_preds = clf.predict_proba(X_test)
preds = y_preds[:, 1]

# False-positive rate / true-positive rate per threshold, and the area under
# the resulting curve.
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
auc_score = metrics.auc(fpr, tpr)

roc_label = 'AUC = {:.3f}'.format(auc_score)
padded_unit_range = [-0.1, 1.1]

plt.title('ROC Curve')
plt.plot(fpr, tpr, label=roc_label)

# Dashed red diagonal marks chance-level performance.
plt.plot([0, 1], [0, 1], 'r--')

plt.xlim(padded_unit_range)
plt.ylim(padded_unit_range)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

plt.legend(loc='lower right')
plt.show()

women only



In [71]:

    
# Keep only the rows whose sex_2 indicator is set.
has_sex_2 = df['sex_2'] == 1
data = df.loc[has_sex_2].drop('default', axis=1)
target = df.loc[has_sex_2, 'default']

# Hold out a quarter of the subset for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    data.values, target.values, test_size=0.25)

# Default-parameter XGBoost classifier trained on this subset only.
clf = XGBClassifier()
clf.fit(X_train, y_train.ravel())

# The second predict_proba column is the score for the positive class.
y_preds = clf.predict_proba(X_test)
preds = y_preds[:, 1]

# False-positive rate / true-positive rate per threshold, and the area under
# the resulting curve.
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
auc_score = metrics.auc(fpr, tpr)

roc_label = 'AUC = {:.3f}'.format(auc_score)
padded_unit_range = [-0.1, 1.1]

plt.title('ROC Curve')
plt.plot(fpr, tpr, label=roc_label)

# Dashed red diagonal marks chance-level performance.
plt.plot([0, 1], [0, 1], 'r--')

plt.xlim(padded_unit_range)
plt.ylim(padded_unit_range)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

plt.legend(loc='lower right')
plt.show()

	X1	X2	X3	X4	X5	X6	X7	X8	X9	X10	X11	X12	X13	X14	X15	X16	X17	X18	X19	X20	X21	X22	X23	Y
ID	LIMIT_BAL	SEX	EDUCATION	MARRIAGE	AGE	PAY_0	PAY_2	PAY_3	PAY_4	PAY_5	PAY_6	BILL_AMT1	BILL_AMT2	BILL_AMT3	BILL_AMT4	BILL_AMT5	BILL_AMT6	PAY_AMT1	PAY_AMT2	PAY_AMT3	PAY_AMT4	PAY_AMT5	PAY_AMT6	default payment next month
1	20000	2	2	1	24	2	2	-1	-1	-2	-2	3913	3102	689	0	0	0	0	689	0	0	0	0	1
2	120000	2	2	2	26	-1	2	0	0	0	2	2682	1725	2682	3272	3455	3261	0	1000	1000	1000	0	2000	1
3	90000	2	2	2	34	0	0	0	0	0	0	29239	14027	13559	14331	14948	15549	1518	1500	1000	1000	1000	5000	0
4	50000	2	2	1	37	0	0	0	0	0	0	46990	48233	49291	28314	28959	29547	2000	2019	1200	1100	1069	1000	0

	limit	sex	education	marriage	age	status_200509	status_200508	status_200507	status_200506	status_200505	status_200504	amount_charged_200509	amount_charged_200508	amount_charged_200507	amount_charged_200506	amount_charged_200505	amount_charged_200504	amount_paid_200509	amount_paid_200508	amount_paid_200507	amount_paid_200506	amount_paid_200505	amount_paid_200504	default
13126	400000	1	1	1	34	-2	-2	-2	-2	-2	-2	0	0	0	0	0	0	0	0	0	0	0	0	0
14636	80000	1	2	2	34	0	0	0	0	0	0	66122	92131	47655	43182	44332	45440	2600	4300	2000	2000	2000	2000	0
19430	200000	2	3	1	49	1	-2	-1	-1	-1	-1	0	0	2317	7588	7606	14053	0	2317	7588	7614	14053	0	0
4382	20000	2	2	1	41	-1	-1	-1	-1	-1	-1	2468	1077	1140	0	7014	7696	1087	1140	0	7014	800	0	0
7660	70000	2	1	1	36	2	0	0	0	0	0	81719	85389	86287	65287	35345	9360	5000	3000	2000	3000	5000	0	0
10638	50000	1	2	2	24	0	0	0	0	0	0	23553	21507	17827	7655	7881	8248	1700	2000	500	500	500	1000	0
17046	20000	2	1	2	25	-1	-1	-1	-1	0	0	4541	0	724	18589	18985	19531	0	724	18589	1000	1000	1000	1
24979	350000	1	1	2	33	-2	-2	-2	-1	0	0	6015	8751	686	430667	161089	119339	8797	691	238241	1009	596	10356	0
13640	20000	2	2	2	23	2	0	0	0	0	0	11995	13324	14029	14308	14625	18245	1528	1233	512	547	3880	0	1
9469	200000	2	1	1	47	-1	-1	-1	-1	-1	-1	935	1418	-2	120	150	928	1418	0	122	300	928	464	1
18652	100000	2	2	1	29	0	0	0	0	0	0	118008	116686	113939	111079	107257	105377	5614	5600	5500	4500	5000	6000	0
12360	20000	1	1	2	40	0	0	0	0	0	0	13639	14711	15462	16004	16203	16916	1600	1300	800	605	1000	2000	0
54	180000	2	1	2	25	1	2	0	0	0	0	41402	41742	42758	43510	44420	45319	1300	2010	1762	1762	1790	1622	0
7013	180000	1	3	2	29	-2	-2	-2	-2	-2	-1	4358	3160	2611	8710	5127	8758	3169	2618	8736	5142	8766	15040	0
5223	400000	1	3	2	49	0	0	-2	-2	-1	0	36560	0	0	0	1757	21667	0	0	0	1757	20000	10158	0

	limit	sex	education	marriage	age	status_200509	status_200508	status_200507	status_200506	status_200505	status_200504	amount_charged_200509	amount_charged_200508	amount_charged_200507	amount_charged_200506	amount_charged_200505	amount_charged_200504	amount_paid_200509	amount_paid_200508	amount_paid_200507	amount_paid_200506	amount_paid_200505	amount_paid_200504	default
count	30000.000000	30000.000000	30000.000000	30000.000000	30000.000000	30000.000000	30000.000000	30000.000000	30000.000000	30000.000000	30000.000000	30000.000000	30000.000000	3.000000e+04	30000.000000	30000.000000	30000.000000	30000.000000	3.000000e+04	30000.00000	30000.000000	30000.000000	30000.000000	30000.000000
mean	167484.322667	1.603733	1.853133	1.551867	35.485500	-0.016700	-0.133767	-0.166200	-0.220667	-0.266200	-0.291100	51223.330900	49179.075167	4.701315e+04	43262.948967	40311.400967	38871.760400	5663.580500	5.921163e+03	5225.68150	4826.076867	4799.387633	5215.502567	0.221200
std	129747.661567	0.489129	0.790349	0.521970	9.217904	1.123802	1.197186	1.196868	1.169139	1.133187	1.149988	73635.860576	71173.768783	6.934939e+04	64332.856134	60797.155770	59554.107537	16563.280354	2.304087e+04	17606.96147	15666.159744	15278.305679	17777.465775	0.415062
min	10000.000000	1.000000	0.000000	0.000000	21.000000	-2.000000	-2.000000	-2.000000	-2.000000	-2.000000	-2.000000	-165580.000000	-69777.000000	-1.572640e+05	-170000.000000	-81334.000000	-339603.000000	0.000000	0.000000e+00	0.00000	0.000000	0.000000	0.000000	0.000000
25%	50000.000000	1.000000	1.000000	1.000000	28.000000	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	3558.750000	2984.750000	2.666250e+03	2326.750000	1763.000000	1256.000000	1000.000000	8.330000e+02	390.00000	296.000000	252.500000	117.750000	0.000000
50%	140000.000000	2.000000	2.000000	2.000000	34.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	22381.500000	21200.000000	2.008850e+04	19052.000000	18104.500000	17071.000000	2100.000000	2.009000e+03	1800.00000	1500.000000	1500.000000	1500.000000	0.000000
75%	240000.000000	2.000000	2.000000	2.000000	41.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	67091.000000	64006.250000	6.016475e+04	54506.000000	50190.500000	49198.250000	5006.000000	5.000000e+03	4505.00000	4013.250000	4031.500000	4000.000000	0.000000
max	1000000.000000	2.000000	6.000000	3.000000	79.000000	8.000000	8.000000	8.000000	8.000000	8.000000	8.000000	964511.000000	983931.000000	1.664089e+06	891586.000000	927171.000000	961664.000000	873552.000000	1.684259e+06	896040.00000	621000.000000	426529.000000	528666.000000	1.000000

	limit	age	amount_charged_200509	amount_charged_200508	amount_charged_200507	amount_charged_200506	amount_charged_200505	amount_charged_200504	amount_paid_200509	amount_paid_200508	amount_paid_200507	amount_paid_200506	amount_paid_200505	amount_paid_200504	default	sex_1	sex_2	education_1	education_2	education_3	marriage_1	marriage_2	status_200509_delay_2_mths	status_200509_paid_full	status_200509_revolving	status_200508_delay_2_mths	status_200508_paid_full	status_200508_revolving	status_200507_delay_2_mths	status_200507_no_consumption	status_200507_paid_full	status_200507_revolving	status_200506_delay_2_mths	status_200506_no_consumption	status_200506_paid_full	status_200506_revolving	status_200505_no_consumption	status_200505_paid_full	status_200505_revolving	status_200504_no_consumption	status_200504_paid_full	status_200504_revolving
28088	120000	33	508	607	611	593	653	587	700	611	593	653	587	916	0	0	1	0	1	0	0	1	0	1	0	0	1	0	0	0	1	0	0	0	1	0	0	1	0	0	1	0
1361	240000	30	7818	9363	0	0	0	0	9434	0	0	0	0	178	0	0	1	0	1	0	0	1	0	0	1	0	1	0	0	0	1	0	0	1	0	0	1	0	0	1	0	0
25412	30000	24	25616	27526	11458	7500	7000	5192	31348	1328	1000	7000	5192	0	0	0	1	1	0	0	0	1	0	1	0	0	1	0	0	0	0	1	0	0	0	1	0	0	1	0	1	0
19795	180000	39	2569	11662	11867	8111	6312	3739	11662	6003	8111	273	3739	6547	0	0	1	0	1	0	1	0	0	1	0	0	1	0	0	0	0	1	0	0	1	0	0	0	1	0	0	1
17523	120000	44	16735	18150	19528	19065	18617	21381	2000	2000	2000	1000	3000	2000	0	0	1	0	0	1	1	0	0	0	1	0	0	1	0	0	0	1	0	0	0	1	0	0	1	0	0	1
17800	50000	23	18666	15554	17159	8402	6646	7534	2000	2000	1500	500	1000	2000	0	1	0	1	0	0	0	1	0	0	1	0	0	1	0	0	0	1	0	0	0	1	0	0	1	0	0	1
13470	240000	46	456	0	0	2240	1681	2267	0	0	2240	0	2267	3074	0	0	1	1	0	0	0	1	1	0	0	1	0	0	0	1	0	0	0	0	1	0	0	0	1	0	1	0
2660	50000	41	26184	29261	28444	28878	27655	24480	3500	0	1100	1100	2000	1000	0	1	0	0	0	1	1	0	1	0	0	1	0	0	1	0	0	0	0	0	0	1	0	0	1	0	0	1
22442	330000	36	257156	257576	256014	245244	239441	242025	10037	10109	10053	10014	9001	9001	0	0	1	0	1	0	0	1	0	0	1	0	0	1	0	0	0	1	0	0	0	1	0	0	1	0	0	1
18143	300000	32	8077	10566	11293	12000	3458	56338	2800	1200	1200	9427	55000	2000	1	1	0	1	0	0	0	1	0	0	1	0	0	1	1	0	0	0	1	0	0	0	0	1	0	0	0	1