In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from sklearn import metrics
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
pd.set_option('display.max_columns',1000)
In [45]:
from xgboost import XGBClassifier,plot_importance
In [46]:
def plot_value_labels(axis, format='{:.2f}'):
    """Annotate every bar on ``axis`` with its height.

    Parameters
    ----------
    axis : object exposing ``patches`` and ``annotate``
        Typically a matplotlib Axes holding a bar chart. Each patch must
        provide ``get_height()``, ``get_x()`` and ``get_width()``.
    format : str, optional
        ``str.format`` template applied to each bar height. Defaults to
        ``'{:.2f}'``.

    Each label is centred horizontally on its bar, rotated 45 degrees and
    offset 2 points away from the bar end: above the bar for non-negative
    heights, below it for negative heights.
    """
    for rect in axis.patches:
        # Anchor the label at the horizontal centre of the bar, at its tip.
        y_value = rect.get_height()
        x_value = rect.get_x() + rect.get_width() / 2

        # Use the caller-supplied template instead of a hard-coded one.
        label = format.format(y_value)

        # Offset (in points) between the bar tip and the label.
        space = 2
        va = 'bottom'
        if y_value < 0:
            # Negative bar: flip the offset and hang the label underneath.
            space = -space
            va = 'top'

        axis.annotate(label, (x_value, y_value),
                      xytext=(0, space),
                      textcoords="offset points",
                      ha='center',
                      rotation=45,
                      va=va)
In [47]:
# Seed NumPy's global random generator so NumPy-driven randomness is repeatable.
RANDOM_SEED = 1234
np.random.seed(RANDOM_SEED)
In [48]:
# Read the credit-card default spreadsheet into a DataFrame.
DATA_PATH = 'data/credit-card-default/data.xls'
df = pd.read_excel(DATA_PATH)
In [49]:
# Display the first five rows of the freshly loaded frame.
df.head(5)
Out[49]:
In [50]:
# Build the mapping from the spreadsheet's generic X1..X23 / Y headers to
# descriptive names. The three monthly groups (status, amount charged,
# amount paid) each span six columns with suffixes 200509 down to 200504.
month_suffixes = ['200509', '200508', '200507', '200506', '200505', '200504']
column_names = {
    'X1': 'limit',
    'X2': 'sex',
    'X3': 'education',
    'X4': 'marriage',
    'X5': 'age',
}
for first_index, group in ((6, 'status'), (12, 'amount_charged'), (18, 'amount_paid')):
    for position, suffix in enumerate(month_suffixes):
        column_names['X{}'.format(first_index + position)] = '{}_{}'.format(group, suffix)
column_names['Y'] = 'default'

df = df.rename(columns=column_names)
# Move the existing index into an 'index' column, remove the row labelled 0
# (presumably the spreadsheet's second header row -- TODO confirm), then
# discard the 'index' column again.
df = df.reset_index()
df = df.drop([0])
df = df.drop('index', axis=1)
In [51]:
# Display 15 randomly sampled rows of the renamed frame.
df.sample(15)
Out[51]:
In [52]:
# Convert every column to a numeric dtype, column by column.
df = df.apply(pd.to_numeric)
In [53]:
# Show summary statistics for every column now that they are numeric.
df.describe()
Out[53]:
In [54]:
# Mean of the `default` column (the default rate, assuming it is coded 0/1 -- TODO confirm).
df['default'].mean()
Out[54]:
The dataset author clarified the codes used in the payment-status columns (named `status_*` in this notebook):
-2 = no consumption; -1 = paid in full; 0 = use of revolving credit; 1 = payment delay for one month; 2 = payment delay for two months; …; 8 = payment delay for eight months; 9 = payment delay for nine months and above.
So let's collapse these codes into the following categories:
In [55]:
def fix_status(current_value):
    """Translate a numeric payment-status code into a coarse category label.

    -2 -> 'no_consumption', -1 -> 'paid_full', 0 -> 'revolving',
    1 or 2 -> 'delay_2_mths', 3 through 9 -> 'delay_3+_mths',
    anything else -> 'other'.
    """
    if current_value == -2:
        return 'no_consumption'
    if current_value == -1:
        return 'paid_full'
    if current_value == 0:
        return 'revolving'
    if current_value in (1, 2):
        return 'delay_2_mths'
    if current_value in (3, 4, 5, 6, 7, 8, 9):
        return 'delay_3+_mths'
    return 'other'
# Replace the numeric codes in every column whose name starts with 'status'
# by the string category returned from fix_status.
status_columns = [name for name in df.columns if name.startswith('status')]
for name in status_columns:
    df[name] = df[name].map(fix_status).astype(str)
In [56]:
# One-hot encode the categorical columns: sex, education and marriage first,
# then every status column in its current column order. For each one the
# indicator columns are appended on the right and the source column removed.
categorical_columns = ['sex', 'education', 'marriage']
categorical_columns += [name for name in df.columns if name.startswith('status')]

for name in categorical_columns:
    indicators = pd.get_dummies(df[name], prefix=name)
    df = pd.concat([df, indicators], axis=1)
    df = df.drop([name], axis=1)
In [57]:
# Display 10 randomly sampled rows of the one-hot encoded frame.
df.sample(10)
Out[57]:
In [58]:
# Separate the label column from the predictor columns.
target = df['default']
data = df.drop(['default'], axis=1)
In [59]:
# Display the first rows of the predictor frame (label column removed).
data.head()
Out[59]:
In [60]:
# Hold out 25% of the rows for evaluation; the rest is used for training.
TEST_FRACTION = 0.25
feature_matrix = data.values
label_vector = target.values
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix, label_vector, test_size=TEST_FRACTION)
In [61]:
# Train an XGBoost classifier with its default settings on the training split.
clf = XGBClassifier()
train_labels = y_train.ravel()  # flatten the labels to one dimension
clf.fit(X_train, train_labels)
Out[61]:
In [62]:
# Score the held-out rows. predict_proba returns one column per class;
# column 1 holds the score for class 1.
y_preds = clf.predict_proba(X_test)
preds = y_preds[:, 1]

# ROC curve (false-positive rate vs. true-positive rate) and its area.
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
auc_score = metrics.auc(fpr, tpr)
curve_label = 'AUC = {:.3f}'.format(auc_score)

plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot(fpr, tpr, label=curve_label)
# Dashed diagonal: where a coin-flip classifier would land.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.legend(loc='lower right')
plt.show()
In [63]:
# Maximum number of features shown in the importance chart.
n_features = 43
plt.clf()

# Importance of every model input, keyed by column name.
d = dict(zip(data.columns, clf.feature_importances_))

# Fold the one-hot indicator columns of each categorical variable back into
# a single entry. Members are found by prefix rather than by hard-coded
# level ranges, so every level present in the data is included and an
# absent level cannot raise KeyError.
for group in ('marriage', 'sex', 'education'):
    prefix = group + '_'
    members = [name for name in d if name.startswith(prefix)]
    d[group] = sum(d.pop(name) for name in members)
In [64]:
# Rank features by importance, highest first, and keep at most n_features.
ss = sorted(d, key=d.get, reverse=True)
top_names = ss[0:n_features]
# Use the number of names actually available: if d holds fewer than
# n_features entries, bar positions and heights must still match in length.
n_shown = len(top_names)
heights = [d[name] for name in top_names]

plt.title("Feature importances")
plt.bar(range(n_shown), heights, color="r", align="center")
plt.xlim(-1, n_shown)
plt.xticks(range(n_shown), top_names, rotation='vertical')
plt.yticks(np.arange(0, 0.12, 0.005))
# Write each bar's value next to it.
plot_value_labels(plt.gca(), format='{:.3f}')
plt.gcf().set_size_inches(10, 6)
plt.ylim(0.0, 0.11)
plt.tight_layout()
plt.show()
In [65]:
# Summary statistics of the `age` column.
df['age'].describe()
Out[65]:
In [66]:
# Repeat the train / evaluate cycle on rows whose age is at most 30.
subset = df[df['age'] <= 30]
data = subset.drop('default', axis=1)
target = subset['default']

X_train, X_test, y_train, y_test = train_test_split(
    data.values, target.values, test_size=0.25)

clf = XGBClassifier()
clf.fit(X_train, y_train.ravel())

# Column 1 of predict_proba holds the score for class 1.
y_preds = clf.predict_proba(X_test)
preds = y_preds[:, 1]

# False-positive rate, true-positive rate and the area under the curve.
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
auc_score = metrics.auc(fpr, tpr)
curve_label = 'AUC = {:.3f}'.format(auc_score)

plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot(fpr, tpr, label=curve_label)
# Dashed diagonal marks chance-level performance.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.legend(loc='lower right')
plt.show()
In [67]:
# Repeat the train / evaluate cycle on rows whose age lies in 31..50 (inclusive).
subset = df[df['age'].between(31, 50)]
data = subset.drop('default', axis=1)
target = subset['default']

X_train, X_test, y_train, y_test = train_test_split(
    data.values, target.values, test_size=0.25)

clf = XGBClassifier()
clf.fit(X_train, y_train.ravel())

# Column 1 of predict_proba holds the score for class 1.
y_preds = clf.predict_proba(X_test)
preds = y_preds[:, 1]

# False-positive rate, true-positive rate and the area under the curve.
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
auc_score = metrics.auc(fpr, tpr)
curve_label = 'AUC = {:.3f}'.format(auc_score)

plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot(fpr, tpr, label=curve_label)
# Dashed diagonal marks chance-level performance.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.legend(loc='lower right')
plt.show()
In [68]:
# Repeat the train / evaluate cycle on rows whose age is above 50.
subset = df[df['age'] > 50]
data = subset.drop('default', axis=1)
target = subset['default']

X_train, X_test, y_train, y_test = train_test_split(
    data.values, target.values, test_size=0.25)

clf = XGBClassifier()
clf.fit(X_train, y_train.ravel())

# Column 1 of predict_proba holds the score for class 1.
y_preds = clf.predict_proba(X_test)
preds = y_preds[:, 1]

# False-positive rate, true-positive rate and the area under the curve.
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
auc_score = metrics.auc(fpr, tpr)
curve_label = 'AUC = {:.3f}'.format(auc_score)

plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot(fpr, tpr, label=curve_label)
# Dashed diagonal marks chance-level performance.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.legend(loc='lower right')
plt.show()
In [70]:
# Repeat the train / evaluate cycle on rows where the sex_1 indicator is set.
subset = df[df['sex_1'] == 1]
data = subset.drop('default', axis=1)
target = subset['default']

X_train, X_test, y_train, y_test = train_test_split(
    data.values, target.values, test_size=0.25)

clf = XGBClassifier()
clf.fit(X_train, y_train.ravel())

# Column 1 of predict_proba holds the score for class 1.
y_preds = clf.predict_proba(X_test)
preds = y_preds[:, 1]

# False-positive rate, true-positive rate and the area under the curve.
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
auc_score = metrics.auc(fpr, tpr)
curve_label = 'AUC = {:.3f}'.format(auc_score)

plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot(fpr, tpr, label=curve_label)
# Dashed diagonal marks chance-level performance.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.legend(loc='lower right')
plt.show()
In [71]:
# Repeat the train / evaluate cycle on rows where the sex_2 indicator is set.
subset = df[df['sex_2'] == 1]
data = subset.drop('default', axis=1)
target = subset['default']

X_train, X_test, y_train, y_test = train_test_split(
    data.values, target.values, test_size=0.25)

clf = XGBClassifier()
clf.fit(X_train, y_train.ravel())

# Column 1 of predict_proba holds the score for class 1.
y_preds = clf.predict_proba(X_test)
preds = y_preds[:, 1]

# False-positive rate, true-positive rate and the area under the curve.
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
auc_score = metrics.auc(fpr, tpr)
curve_label = 'AUC = {:.3f}'.format(auc_score)

plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot(fpr, tpr, label=curve_label)
# Dashed diagonal marks chance-level performance.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.legend(loc='lower right')
plt.show()