Data Mining course project
PESC - Programa de Engenharia de Sistemas e Computação
COPPE / UFRJ
Authors: Bernardo Souza and Rafael Lopes Conde dos Reis
GitHub: https://github.com/condereis/credit-card-default/
In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import randint, uniform
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import auc, confusion_matrix, roc_curve
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical
%matplotlib inline
In [2]:
train = pd.read_csv('../data/processed/train.csv', index_col=0)
train.index.name = None  # drop the index name left over from the CSV
X = train.drop('default.payment.next.month', axis=1)
y = train['default.payment.next.month']
train.head()
Out[2]:
The accuracy obtained by always predicting the most frequent class is 77.92%. This shows that this is an imbalanced-class problem, so accuracy is not the best metric to use. We therefore chose AUC (area under the curve) as the metric, which is the area under the ROC curve. The ROC curve is simply a curve in which each point corresponds to the detection probability and the false alarm probability for a given classification threshold.
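To make the metric concrete, the ROC curve can be traced by hand with a small threshold sweep. The sketch below is an illustration added for clarity (not part of the original analysis); the toy scores and labels are invented for the example.
In [ ]:
# Toy scores and labels, invented purely for illustration.
y_true = np.array([0, 0, 1, 1, 0, 1, 0, 1])
y_score = np.array([0.10, 0.40, 0.35, 0.80, 0.20, 0.70, 0.55, 0.90])
# Sweep the classification threshold and record one ROC point per threshold.
for t in np.linspace(0, 1, 5):
    pred = (y_score >= t).astype(int)
    tpr = (pred[y_true == 1] == 1).mean()  # detection probability (true positive rate)
    fpr = (pred[y_true == 0] == 1).mean()  # false alarm probability (false positive rate)
    print('threshold=%.2f  FPR=%.2f  TPR=%.2f' % (t, fpr, tpr))
Plotting TPR against FPR over all thresholds yields the ROC curve; the AUC is the area under it.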
In [74]:
print('Accuracy:', max(y.mean(), 1 - y.mean()))
sns.countplot(y)
Out[74]:
In [75]:
model = GaussianNB()
scores = cross_val_score(model, X, y, cv=5, n_jobs=-1, scoring='roc_auc')
print('Naive Bayes (AUC): %f +- %f' % (np.mean(scores), np.std(scores)))
scores = cross_val_score(model, X, y, cv=5, n_jobs=-1)
print('Naive Bayes (accuracy): %f +- %f' % (np.mean(scores), np.std(scores)))
In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
model = GaussianNB()
y_pred = model.fit(X_train,y_train).predict(X_test)
labels = ['Paid', 'Default']
mtx = confusion_matrix(y_test, y_pred)
mtx = [x / float(sum(x)) for x in mtx]  # normalize each row to per-class rates
sns.heatmap(pd.DataFrame(mtx, columns=labels, index=labels), annot=True, fmt=".2f", linewidths=.5)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
Out[5]:
In [6]:
y_pred = model.fit(X_train, y_train).predict_proba(X_test)
y_pred = np.array([x[1] for x in y_pred])  # keep the predicted probability of the positive class
fpr, tpr, thresholds = roc_curve(y_test + 1, y_pred, pos_label=2)  # labels shifted to {1, 2}, with 2 = default
roc_auc = auc(fpr, tpr)
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
In [6]:
score_list = []
error_list = []
n_dims = range(1,train.shape[1])
model = KNeighborsClassifier()
for dim in n_dims:
    reduced_data = PCA(n_components=dim).fit_transform(X)
    scores = cross_val_score(model, reduced_data, y,
                             cv=5, n_jobs=-1, scoring='roc_auc')
    score_list.append(np.mean(scores))
    error_list.append(np.std(scores))
plt.errorbar(n_dims, score_list, yerr=error_list)
Out[6]:
In [10]:
X_red = PCA(n_components=10).fit_transform(X)
We chose a number of neighbors around the knee of the curve, which occurred near 50. A finer sweep led to the value of 62 neighbors.
In [16]:
k_list = [1, 10, 50, 100, 150, 200]
score_list = []
error_list = []
for k in k_list:
    model = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(model, X_red, y, cv=5, n_jobs=-1, scoring='roc_auc')
    score_list.append(np.mean(scores))
    error_list.append(np.std(scores))
plt.errorbar(k_list, score_list, yerr=error_list)
Out[16]:
In [20]:
k_list = range(45,70,2)
score_list = []
error_list = []
for k in k_list:
    model = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(model, X_red, y, cv=5, n_jobs=-1, scoring='roc_auc')
    score_list.append(np.mean(scores))
    error_list.append(np.std(scores))
plt.errorbar(k_list, score_list, yerr=error_list)
Out[20]:
In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
model = KNeighborsClassifier(n_neighbors=62)
y_pred = model.fit(X_train, y_train).predict_proba(X_test)
y_pred = np.array([x[1] for x in y_pred])
fpr, tpr, thresholds = roc_curve(y_test+1, y_pred, pos_label=2)
roc_auc = auc(fpr, tpr)
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
In [27]:
score_list = []
error_list = []
n_dims = range(1,train.shape[1])
model = LogisticRegression()
for dim in n_dims:
    reduced_data = PCA(n_components=dim).fit_transform(X)
    scores = cross_val_score(model, reduced_data, y,
                             cv=5, n_jobs=-1, scoring='roc_auc')
    score_list.append(np.mean(scores))
    error_list.append(np.std(scores))
plt.errorbar(n_dims, score_list, yerr=error_list)
Out[27]:
The regularization parameter C was set to C = 1, since it was one of the values that produced the highest AUC and it lies close to the knee of the curve.
In [36]:
c_list = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
score_list = []
error_list = []
for c in c_list:
    model = LogisticRegression(C=c)
    scores = cross_val_score(model, X, y, cv=5, n_jobs=-1, scoring='roc_auc')
    score_list.append(np.mean(scores))
    error_list.append(np.std(scores))
plt.errorbar(c_list, score_list, yerr=error_list)
plt.xscale('log')
In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
model = LogisticRegression(C=0.1)
y_pred = model.fit(X_train, y_train).predict_proba(X_test)
y_pred = np.array([x[1] for x in y_pred])
fpr, tpr, thresholds = roc_curve(y_test+1, y_pred, pos_label=2)
roc_auc = auc(fpr, tpr)
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
Reducing the number of dimensions with PCA did not improve the AUC. However, the AUC score at 15 dimensions is quite close to the maximum found and could be used to build a computationally lighter model.
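As an added illustration of that lighter model (a sketch, not part of the original analysis, assuming the X and y defined above), the 15-component PCA and the classifier can be chained in a scikit-learn Pipeline, so the projection is refit inside each cross-validation fold; C=0.1 reuses the regularization value from the ROC cell above.
In [ ]:
from sklearn.pipeline import Pipeline

# PCA down to 15 dimensions followed by logistic regression.
light_model = Pipeline([('pca', PCA(n_components=15)),
                        ('clf', LogisticRegression(C=0.1))])
scores = cross_val_score(light_model, X, y, cv=5, n_jobs=-1, scoring='roc_auc')
print('PCA(15) + logistic regression (AUC): %f +- %f' % (np.mean(scores), np.std(scores)))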
In [15]:
score_list = []
error_list = []
n_dims = range(1,train.shape[1])
model = GradientBoostingClassifier()
for dim in n_dims:
    reduced_data = PCA(n_components=dim).fit_transform(X)
    scores = cross_val_score(model, reduced_data, y,
                             cv=5, n_jobs=-1, scoring='roc_auc')
    score_list.append(np.mean(scores))
    error_list.append(np.std(scores))
plt.errorbar(n_dims, score_list, yerr=error_list)
Out[15]:
In [52]:
score_list = []
error_list = []
learning_rates = np.logspace(-4,1,20)
for learning_rate in learning_rates:
    model = GradientBoostingClassifier(learning_rate=learning_rate)
    scores = cross_val_score(model, X, y, cv=5, n_jobs=-1, scoring='roc_auc')
    score_list.append(np.mean(scores))
    error_list.append(np.std(scores))
plt.errorbar(learning_rates, score_list, yerr=error_list)
plt.xscale('log')
The subsample parameter is the fraction of samples used to fit each classifier in the ensemble. Values smaller than 1 tend to reduce the variance of the model at the cost of higher bias. No significant improvement was observed when varying the subsampling rate, so we kept it equal to 1.
In [62]:
score_list = []
error_list = []
subsamples = np.linspace(0.1,1,10)
for subsample in subsamples:
    model = GradientBoostingClassifier(subsample=subsample)
    scores = cross_val_score(model, X, y, cv=5, n_jobs=-1, scoring='roc_auc')
    score_list.append(np.mean(scores))
    error_list.append(np.std(scores))
plt.errorbar(subsamples, score_list, yerr=error_list)
Out[62]:
Using fewer features than the total usually reduces the variance of the model in exchange for an increase in bias. When varying this rate, no value was statistically superior to the others, given the large error bars. We chose the rate with the highest mean AUC, in this case 0.2.
In [63]:
score_list = []
error_list = []
max_features_list = np.linspace(0.1,1,10)
for max_features in max_features_list:
    model = GradientBoostingClassifier(max_features=max_features)
    scores = cross_val_score(model, X, y, cv=5, n_jobs=-1, scoring='roc_auc')
    score_list.append(np.mean(scores))
    error_list.append(np.std(scores))
plt.errorbar(max_features_list, score_list, yerr=error_list)
Out[63]:
In [68]:
score_list = []
error_list = []
max_depths = [1,2,3,4,5,6]
for max_depth in max_depths:
    model = GradientBoostingClassifier(max_features=0.2, max_depth=max_depth)
    scores = cross_val_score(model, X, y, cv=5, n_jobs=-1, scoring='roc_auc')
    score_list.append(np.mean(scores))
    error_list.append(np.std(scores))
plt.errorbar(max_depths, score_list, yerr=error_list)
Out[68]:
In [71]:
score_list = []
error_list = []
min_samples_splits = np.logspace(-5,0,10)
for min_samples_split in min_samples_splits:
    model = GradientBoostingClassifier(max_features=0.2, min_samples_split=min_samples_split)
    scores = cross_val_score(model, X, y, cv=5, n_jobs=-1, scoring='roc_auc')
    score_list.append(np.mean(scores))
    error_list.append(np.std(scores))
plt.errorbar(min_samples_splits, score_list, yerr=error_list)
plt.xscale('log')
In [72]:
score_list = []
error_list = []
min_samples_splits = [2,3,4,5,6,7,8,9,10]
for min_samples_split in min_samples_splits:
    model = GradientBoostingClassifier(max_features=0.2, min_samples_split=min_samples_split)
    scores = cross_val_score(model, X, y, cv=5, n_jobs=-1, scoring='roc_auc')
    score_list.append(np.mean(scores))
    error_list.append(np.std(scores))
plt.errorbar(min_samples_splits, score_list, yerr=error_list)
Out[72]:
In [83]:
score_list = []
error_list = []
min_samples_leafs = [1,10,20,30,40,50,60,70]
for min_samples_leaf in min_samples_leafs:
    model = GradientBoostingClassifier(max_features=0.2, min_samples_leaf=min_samples_leaf)
    scores = cross_val_score(model, X, y, cv=5, n_jobs=-1, scoring='roc_auc')
    score_list.append(np.mean(scores))
    error_list.append(np.std(scores))
plt.errorbar(min_samples_leafs, score_list, yerr=error_list)
Out[83]:
The parameter search so far has treated the parameters as independent. Searching for the best combination of all the parameters tested would be computationally infeasible. We therefore compare the best model obtained from the independence-assuming search against a model obtained by searching over random combinations of parameters.
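A back-of-the-envelope count shows why the exhaustive search is infeasible. Assuming, purely for illustration, 10 candidate values for each of the 6 numeric parameters below, 2 loss functions, and 5-fold cross-validation:
In [ ]:
# Hypothetical grid: 10 values per numeric parameter, 2 losses, 5 CV folds.
n_fits = (10 ** 6) * 2 * 5
print('full grid search would require %d model fits' % n_fits)  # 10,000,000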
In [108]:
param_dist = {"max_depth": randint(1, 11),
              "max_features": randint(1, 11),
              "min_samples_split": randint(2, 11),
              "min_samples_leaf": randint(1, 11),
              "subsample": uniform(0.1, 0.9),
              "learning_rate": uniform(0.001, 0.9),
              "loss": ['deviance', 'exponential']}
# run randomized search
model = GradientBoostingClassifier()
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=50)
random_search.fit(X, y)
Out[108]:
In [133]:
model = GradientBoostingClassifier(max_features=0.2)
scores = cross_val_score(model, X, y, cv=5, n_jobs=-1, scoring='roc_auc')
print('Independent search: %f +- %f' % (np.mean(scores), np.std(scores)))
model = random_search.best_estimator_
scores = cross_val_score(model, X, y, cv=5, n_jobs=-1, scoring='roc_auc')
print('Random search: %f +- %f' % (np.mean(scores), np.std(scores)))
In [101]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
model = random_search.best_estimator_
y_pred = model.fit(X_train, y_train).predict_proba(X_test)
y_pred = np.array([x[1] for x in y_pred])
fpr, tpr, thresholds = roc_curve(y_test+1, y_pred, pos_label=2)
roc_auc = auc(fpr, tpr)
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
In [ ]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
y_pred = model.fit(X_train,y_train).predict(X_test)
labels = ['Paid', 'Default']
mtx = confusion_matrix(y_test, y_pred)
mtx = [x / float(sum(x)) for x in mtx]  # normalize each row to per-class rates
sns.heatmap(pd.DataFrame(mtx, columns=labels, index=labels), annot=True, fmt=".2f", linewidths=.5)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
with tf.Graph().as_default():
    net = tflearn.input_data(shape=[None, X.shape[1]])
    net = tflearn.fully_connected(net, 100, activation='relu')
    net = tflearn.dropout(net, 0.5)
    net = tflearn.fully_connected(net, 100, activation='relu')
    net = tflearn.dropout(net, 0.5)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
    Y = np.array([[float(a), float(1 - a)] for a in y_train])  # one-hot targets: column 0 = default, column 1 = paid
    lm = tflearn.DNN(net)
    lm.fit(X_train.values, Y, validation_set=0.3, show_metric=True, batch_size=200, n_epoch=50, snapshot_epoch=False)
In [61]:
y_pred = lm.predict(X_test)
y_pred = [a[0] for a in y_pred]  # probability assigned to the 'default' class (first output column)
fpr, tpr, thresholds = roc_curve(y_test + 1, y_pred, pos_label=2)
roc_auc = auc(fpr, tpr)
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()