In [44]:
    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from sklearn import metrics
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
pd.set_option('display.max_columns',1000)
    
In [45]:
    
from xgboost import XGBClassifier,plot_importance
    
In [46]:
    
def plot_value_labels(axis, format='{:.2f}'):
    """Annotate every bar of a bar chart with its height.

    Parameters
    ----------
    axis : matplotlib Axes (or any object exposing ``patches`` and
        ``annotate``) whose bar patches should be labelled.
    format : str, optional
        ``str.format`` template applied to each bar height.  The name
        shadows the builtin but is kept so existing keyword calls
        (``format='{:.3f}'``) continue to work.

    Returns
    -------
    None.  The labels are drawn onto ``axis``.
    """
    for rect in axis.patches:
        # Anchor the label at the horizontal centre of the bar, at its tip.
        y_value = rect.get_height()
        x_value = rect.get_x() + rect.get_width() / 2
        label = format.format(y_value)

        # Distance (in points) between the bar tip and the label.  This has
        # to be (re)initialised per bar: it was previously never assigned,
        # so the first negative bar raised UnboundLocalError.
        space = 2
        va = 'bottom'
        if y_value < 0:
            # Negative bar: put the label underneath and hang it from its top.
            space *= -1
            va = 'top'

        axis.annotate(label, (x_value, y_value),
                      xytext=(0, space),
                      textcoords="offset points",
                      ha='center',
                      rotation=45,
                      va=va)
    
In [47]:
    
# Seed NumPy's global random generator so NumPy-driven randomness is repeatable.
# NOTE(review): the later train_test_split / df.sample calls pass no
# random_state, so they presumably draw from this global state; results then
# depend on running every cell in order from a fresh kernel - confirm.
np.random.seed(1234)
    
In [48]:
    
# Load the raw spreadsheet into a DataFrame (path is relative to the notebook's working directory).
df = pd.read_excel('data/credit-card-default/data.xls')
    
In [49]:
    
# Preview the first five rows of the freshly loaded frame.
df.head(5)
    
    Out[49]:
In [50]:
    
# Replace the generic X1..X23 / Y headers with descriptive names, then tidy
# the rows: reset_index() moves the current index into an 'index' column,
# drop([0]) removes the row labelled 0, and the helper 'index' column is
# discarded again.
# NOTE(review): row 0 is presumably a second header row from the spreadsheet - confirm.
df = (
    df.rename(columns={
        # customer attributes
        'X1': 'limit',
        'X2': 'sex',
        'X3': 'education',
        'X4': 'marriage',
        'X5': 'age',
        # repayment status, one column per month
        'X6': 'status_200509',
        'X7': 'status_200508',
        'X8': 'status_200507',
        'X9': 'status_200506',
        'X10': 'status_200505',
        'X11': 'status_200504',
        # amount charged, one column per month
        'X12': 'amount_charged_200509',
        'X13': 'amount_charged_200508',
        'X14': 'amount_charged_200507',
        'X15': 'amount_charged_200506',
        'X16': 'amount_charged_200505',
        'X17': 'amount_charged_200504',
        # amount paid, one column per month
        'X18': 'amount_paid_200509',
        'X19': 'amount_paid_200508',
        'X20': 'amount_paid_200507',
        'X21': 'amount_paid_200506',
        'X22': 'amount_paid_200505',
        'X23': 'amount_paid_200504',
        # label
        'Y': 'default',
    })
    .reset_index()
    .drop([0])
    .drop('index', axis=1)
)
    
In [51]:
    
# Show 15 randomly chosen rows to eyeball the renamed columns.
# NOTE(review): no random_state is passed, so this presumably consumes the
# global NumPy random state seeded earlier - confirm.
df.sample(15)
    
    Out[51]:
In [52]:
    
# Convert every column to a numeric dtype, one column at a time.
df = df.apply(pd.to_numeric)
    
In [53]:
    
# Summary statistics for the columns of the frame.
df.describe()
    
    Out[53]:
In [54]:
    
# Mean of the label column; if 'default' is a 0/1 flag this is the share of defaulting rows.
df['default'].mean()
    
    Out[54]:
The dataset author clarified the codes used in the `status_*` (payment status) columns:
-2: no consumption; -1: paid in full; 0: use of revolving credit; 1: payment delay of one month; 2: payment delay of two months; ...; 8: payment delay of eight months; 9: payment delay of nine months or more.
So let's group the codes into these categories:
In [55]:
    
def fix_status(current_value):
    """Translate a numeric repayment-status code into a category label.

    Codes -2, -1 and 0 each get their own label, 1-2 and 3-9 are bucketed
    together, and anything else is labelled 'other'.
    """
    buckets = (
        ((-2,), 'no_consumption'),
        ((-1,), 'paid_full'),
        ((0,), 'revolving'),
        ((1, 2), 'delay_2_mths'),
        ((3, 4, 5, 6, 7, 8, 9), 'delay_3+_mths'),
    )
    # Return the label of the first bucket that contains the code.
    for codes, label in buckets:
        if current_value in codes:
            return label
    return 'other'
# Replace the numeric codes in every status column by their category label.
for column_name in df.columns:
    if not column_name.startswith('status'):
        continue
    df[column_name] = df[column_name].map(fix_status).astype(str)
    
In [56]:
    
# One-hot encode the demographic columns: append the indicator columns and
# remove the source column.
for categorical in ('sex', 'education', 'marriage'):
    indicators = pd.get_dummies(df[categorical], prefix=categorical)
    df = pd.concat([df, indicators], axis=1).drop([categorical], axis=1)

# Same treatment for every status column.  The column list is read once when
# the loop starts, so the indicator columns added along the way are not revisited.
for column_name in df.columns:
    if not column_name.startswith('status'):
        continue
    indicators = pd.get_dummies(df[column_name], prefix=column_name)
    df = pd.concat([df, indicators], axis=1).drop([column_name], axis=1)
    
In [57]:
    
# Show 10 random rows to check the one-hot encoded columns.
# NOTE(review): no random_state is passed, so this presumably consumes the
# global NumPy random state seeded earlier - confirm.
df.sample(10)
    
    Out[57]:
In [58]:
    
# Separate the label column from the predictor columns.
target = df['default']
data = df.drop(['default'], axis=1)
    
In [59]:
    
# Preview the first rows of the predictor frame (label column removed).
data.head()
    
    Out[59]:
In [60]:
    
# Hold out 25% of the rows for evaluation; the model only ever sees plain arrays.
X_train, X_test, y_train, y_test = train_test_split(data.values, target.values, test_size=0.25)
    
In [61]:
    
# Train an XGBoost classifier with the library's default hyper-parameters.
clf = XGBClassifier()
# ravel() hands the labels to fit() as a flat 1-D array.
clf.fit(X_train, y_train.ravel())
    
    Out[61]:
In [62]:
    
# predict_proba yields one score column per class; keep the second column
# (index 1) and ignore the scores for the 0 class.
y_preds = clf.predict_proba(X_test)
preds = y_preds[:, 1]

# ROC points (false-positive rate vs. true-positive rate) and the area under them.
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
auc_score = metrics.auc(fpr, tpr)

plt.title('ROC Curve')
plt.plot(fpr, tpr, label='AUC = {:.3f}'.format(auc_score))
# Dashed diagonal: the line a coin-flip classifier would follow.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim(-0.1, 1.1)
plt.ylim(-0.1, 1.1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()
    
    
In [63]:
    
# Maximum number of features shown in the importance chart.
n_features=43
plt.clf()
# Pair every model input column with its importance score.
d = dict(zip(data.columns, clf.feature_importances_))
# Fold the one-hot indicator columns of each categorical variable back into
# a single entry by summing their importances.  The indicators are found by
# prefix instead of fixed numeric ranges: the previous ranges raised KeyError
# when an expected indicator was missing and left any indicator outside them
# (e.g. a 'marriage_3' column, if present) out of the total.
for group in ('marriage', 'sex', 'education'):
    indicator_names = [name for name in data.columns
                       if name.startswith(group + '_')]
    d[group] = sum(d.pop(name) for name in indicator_names)
    
In [64]:
    
# Rank the feature names by importance, largest first, and keep the top ones.
ss = sorted(d, key=d.get, reverse=True)
top_names = ss[0:n_features]
# Fewer than n_features entries may exist; size the chart by what is actually
# available so the bar positions, heights and tick labels always line up.
n_shown = len(top_names)
plt.title("Feature importances")
plt.bar(range(n_shown), [d[i] for i in top_names], color="r", align="center")
plt.xlim(-1, n_shown)
plt.xticks(range(n_shown), top_names, rotation='vertical')
plt.yticks(np.arange(0, 0.12, 0.005))
# Write each bar's value above it.
plot_value_labels(plt.gca(),format='{:.3f}')
plt.gcf().set_size_inches(10,6)
plt.ylim(0.0,0.11)
plt.tight_layout()
plt.show()
    
    
In [65]:
    
# Distribution of the age column; the cells below split the data on age thresholds.
df['age'].describe()
    
    Out[65]:
In [66]:
    
# Keep only the rows with age <= 30 and split them into predictors and label.
data = df.loc[df['age'] <= 30].drop('default', axis=1)
target = df.loc[df['age'] <= 30, 'default']

# Hold out 25% of this subset for evaluation.
X_train, X_test, y_train, y_test = train_test_split(data.values, target.values, test_size=0.25)

# Train a fresh default-parameter classifier on this subset only.
clf = XGBClassifier().fit(X_train, y_train.ravel())

# predict_proba yields one score column per class; keep the second column
# (index 1) and ignore the scores for the 0 class.
y_preds = clf.predict_proba(X_test)
preds = y_preds[:, 1]

# ROC points (false-positive rate vs. true-positive rate) and the area under them.
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
auc_score = metrics.auc(fpr, tpr)

plt.title('ROC Curve')
plt.plot(fpr, tpr, label='AUC = {:.3f}'.format(auc_score))
# Dashed diagonal: the line a coin-flip classifier would follow.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim(-0.1, 1.1)
plt.ylim(-0.1, 1.1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()
    
    
Solid gains here compared with the model trained on the full dataset.
In [67]:
    
# Keep only the rows with 31 <= age <= 50 (both ends included) and split them
# into predictors and label.
data = df.loc[(df['age'] >= 31) & (df['age'] <= 50)].drop('default', axis=1)
target = df.loc[(df['age'] >= 31) & (df['age'] <= 50), 'default']

# Hold out 25% of this subset for evaluation.
X_train, X_test, y_train, y_test = train_test_split(data.values, target.values, test_size=0.25)

# Train a fresh default-parameter classifier on this subset only.
clf = XGBClassifier().fit(X_train, y_train.ravel())

# predict_proba yields one score column per class; keep the second column
# (index 1) and ignore the scores for the 0 class.
y_preds = clf.predict_proba(X_test)
preds = y_preds[:, 1]

# ROC points (false-positive rate vs. true-positive rate) and the area under them.
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
auc_score = metrics.auc(fpr, tpr)

plt.title('ROC Curve')
plt.plot(fpr, tpr, label='AUC = {:.3f}'.format(auc_score))
# Dashed diagonal: the line a coin-flip classifier would follow.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim(-0.1, 1.1)
plt.ylim(-0.1, 1.1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()
    
    
In [68]:
    
# Keep only the rows with age > 50 and split them into predictors and label.
data = df.loc[df['age'] > 50].drop('default', axis=1)
target = df.loc[df['age'] > 50, 'default']

# Hold out 25% of this subset for evaluation.
X_train, X_test, y_train, y_test = train_test_split(data.values, target.values, test_size=0.25)

# Train a fresh default-parameter classifier on this subset only.
clf = XGBClassifier().fit(X_train, y_train.ravel())

# predict_proba yields one score column per class; keep the second column
# (index 1) and ignore the scores for the 0 class.
y_preds = clf.predict_proba(X_test)
preds = y_preds[:, 1]

# ROC points (false-positive rate vs. true-positive rate) and the area under them.
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
auc_score = metrics.auc(fpr, tpr)

plt.title('ROC Curve')
plt.plot(fpr, tpr, label='AUC = {:.3f}'.format(auc_score))
# Dashed diagonal: the line a coin-flip classifier would follow.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim(-0.1, 1.1)
plt.ylim(-0.1, 1.1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()
    
    
In [70]:
    
# Keep only the rows whose sex_1 indicator is set and split them into
# predictors and label.
data = df.loc[df['sex_1'] == 1].drop('default', axis=1)
target = df.loc[df['sex_1'] == 1, 'default']

# Hold out 25% of this subset for evaluation.
X_train, X_test, y_train, y_test = train_test_split(data.values, target.values, test_size=0.25)

# Train a fresh default-parameter classifier on this subset only.
clf = XGBClassifier().fit(X_train, y_train.ravel())

# predict_proba yields one score column per class; keep the second column
# (index 1) and ignore the scores for the 0 class.
y_preds = clf.predict_proba(X_test)
preds = y_preds[:, 1]

# ROC points (false-positive rate vs. true-positive rate) and the area under them.
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
auc_score = metrics.auc(fpr, tpr)

plt.title('ROC Curve')
plt.plot(fpr, tpr, label='AUC = {:.3f}'.format(auc_score))
# Dashed diagonal: the line a coin-flip classifier would follow.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim(-0.1, 1.1)
plt.ylim(-0.1, 1.1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()
    
    
In [71]:
    
# Keep only the rows whose sex_2 indicator is set and split them into
# predictors and label.
data = df.loc[df['sex_2'] == 1].drop('default', axis=1)
target = df.loc[df['sex_2'] == 1, 'default']

# Hold out 25% of this subset for evaluation.
X_train, X_test, y_train, y_test = train_test_split(data.values, target.values, test_size=0.25)

# Train a fresh default-parameter classifier on this subset only.
clf = XGBClassifier().fit(X_train, y_train.ravel())

# predict_proba yields one score column per class; keep the second column
# (index 1) and ignore the scores for the 0 class.
y_preds = clf.predict_proba(X_test)
preds = y_preds[:, 1]

# ROC points (false-positive rate vs. true-positive rate) and the area under them.
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
auc_score = metrics.auc(fpr, tpr)

plt.title('ROC Curve')
plt.plot(fpr, tpr, label='AUC = {:.3f}'.format(auc_score))
# Dashed diagonal: the line a coin-flip classifier would follow.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim(-0.1, 1.1)
plt.ylim(-0.1, 1.1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()