In [72]:
import pandas as pd
import numpy as np

import os
import sys
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import time

sns.set(style="darkgrid")

In [77]:
cc_data = pd.read_csv('CredCard_with_above50.csv')
cc_data.set_index(['ID'], inplace=True)
cc_data.columns


Out[77]:
Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month', 'above50k'],
      dtype='object')

Using only the numerical features


In [91]:
cc_num_data = cc_data[['LIMIT_BAL', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4',
                       'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2',
                       'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']]

# min-max normalize each column to the [0, 1] range
cc_data_norm = (cc_num_data - cc_num_data.min()) / (cc_num_data.max() - cc_num_data.min())
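
The same min-max scaling can also be done with scikit-learn; a quick equivalence check (a sketch, not part of the original run):


In [ ]:
# Hypothetical check: MinMaxScaler applies the same (x - min) / (max - min) mapping
from sklearn.preprocessing import MinMaxScaler

print(np.allclose(MinMaxScaler().fit_transform(cc_num_data), cc_data_norm.values))  # expect True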

In [93]:
cc_data_norm['above50k'] = cc_data.loc[:,'above50k']
cc_data_norm.columns


Out[93]:
Index(['LIMIT_BAL', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4',
       'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
       'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'above50k'],
      dtype='object')

In [94]:
mdata = cc_data_norm.values  # .as_matrix() is deprecated in newer pandas
mdata.shape


Out[94]:
(30000, 14)

Grouping the data by default status


In [96]:
mdata_default = mdata[cc_data['default payment next month'] == 1]
print(mdata_default.shape)
mdata_not_default = mdata[cc_data['default payment next month'] == 0]
print(mdata_not_default.shape)


(6636, 14)
(23364, 14)

Estimating the posterior distribution of each group

A Variational Bayesian Gaussian Mixture Model (VB-GMM) is fit to each group, estimating posterior distributions over the mixture parameters $\lambda_d$ (default group) and $\lambda_{nd}$ (not-default group).


In [98]:
from sklearn.mixture import BayesianGaussianMixture

model_d = BayesianGaussianMixture(weight_concentration_prior_type="dirichlet_process",
                                  weight_concentration_prior=0.1,
                                  n_components=int(np.sqrt(len(mdata_default))),
                                  reg_covar=0, init_params='random', max_iter=1500,
                                  mean_precision_prior=.1, random_state=2)
print('Estimating posterior distribution for the default group...')
model_d.fit(mdata_default)

model_nd = BayesianGaussianMixture(weight_concentration_prior_type="dirichlet_process",
                                   weight_concentration_prior=0.1,
                                   n_components=int(np.sqrt(len(mdata_not_default))),
                                   reg_covar=0, init_params='random', max_iter=1500,
                                   mean_precision_prior=.1, random_state=2)
print('Estimating posterior distribution for the not default group...')
model_nd.fit(mdata_not_default)


Estimating posterior distribution for the default group...
Estimating posterior distribution for the not default group...
Out[98]:
BayesianGaussianMixture(covariance_prior=None, covariance_type='full',
            degrees_of_freedom_prior=None, init_params='random',
            max_iter=1500, mean_precision_prior=0.1, mean_prior=None,
            n_components=152, n_init=1, random_state=2, reg_covar=0,
            tol=0.001, verbose=0, verbose_interval=10, warm_start=False,
            weight_concentration_prior=0.1,
            weight_concentration_prior_type='dirichlet_process')
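
Because the Dirichlet-process prior prunes components it does not need, the number of effectively active components is usually far below the n_components cap. A quick check of the fitted weights (a sketch, not in the original run; the 1e-3 threshold is an assumption):


In [ ]:
# Count mixture components whose posterior weight is non-negligible
print('active components (default):    ', np.sum(model_d.weights_ > 1e-3))
print('active components (not default):', np.sum(model_nd.weights_ > 1e-3))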

Computing the log posterior probabilities

Now we compute the per-sample log-likelihoods $\log p(x \mid \lambda_d)$ and $\log p(x \mid \lambda_{nd})$; with the class priors added, comparing them is equivalent to comparing the log posteriors $\log p(\lambda_d \mid x)$ and $\log p(\lambda_{nd} \mid x)$.


In [99]:
# Per-sample log-likelihoods under each fitted model
# (predict() would return component labels, not probabilities)
log_lik_d = model_d.score_samples(mdata)
log_lik_nd = model_nd.score_samples(mdata)

# Persist the fitted models and reload them to verify the round trip
import pickle
pickle.dump(model_d, open('default_model.mdl', 'wb'))
pickle.dump(model_nd, open('not_default_model.mdl', 'wb'))

loaded_model_d = pickle.load(open('default_model.mdl', 'rb'))
loaded_model_nd = pickle.load(open('not_default_model.mdl', 'rb'))

log_lik_d = loaded_model_d.score_samples(mdata)
log_lik_nd = loaded_model_nd.score_samples(mdata)

print(log_lik_nd > log_lik_d)

plt.plot(log_lik_d[:100])   # blue: default model
plt.plot(log_lik_nd[:100])  # green: not-default model
plt.show()

plt.plot(log_lik_nd[:100] > log_lik_d[:100])
plt.show()


[ True  True  True ...,  True  True  True]
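
To turn this comparison into a classifier, one can add each group's empirical log prior to its log-likelihood and score the decision rule against the true labels. A minimal sketch (assuming the cells above have run; not part of the original notebook):


In [ ]:
# Bayes decision rule: pick the class with the larger log prior + log-likelihood
from sklearn.metrics import accuracy_score

log_prior_d = np.log(len(mdata_default) / len(mdata))
log_prior_nd = np.log(len(mdata_not_default) / len(mdata))

pred = (log_prior_d + loaded_model_d.score_samples(mdata) >
        log_prior_nd + loaded_model_nd.score_samples(mdata)).astype(int)
print(accuracy_score(cc_data['default payment next month'], pred))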

Using t-SNE as a feature extractor


In [71]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=3, init='random',  # 3 components to match the 3-D scatter below
            random_state=2, perplexity=5)
mdata_tsne_transform = tsne.fit_transform(mdata)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-71-e27a31cb19fe> in <module>()
----> 5 mdata_tsne_transform = tsne.fit_transform(mdata)

KeyboardInterrupt: 
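
Barnes-Hut t-SNE on all 30,000 rows is slow, which is why the run above was interrupted. Fitting on a random subsample keeps it tractable; a sketch (the subsample size is an assumption, and the scatter below would then need the matching labels, cc_data['default payment next month'].values[idx]):


In [ ]:
# Fit t-SNE on a random subsample instead of the full data set
rng = np.random.RandomState(2)
idx = rng.choice(len(mdata), size=3000, replace=False)  # 3000 is an assumed subsample size
tsne = TSNE(n_components=3, init='random', random_state=2, perplexity=5)
mdata_tsne_transform = tsne.fit_transform(mdata[idx])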

In [70]:
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(20,20))
ax = fig.add_subplot(111, projection='3d')
color_dict = {0: 'b', 1: 'r'}  # blue = no default, red = default

ax.scatter(mdata_tsne_transform[:,0], mdata_tsne_transform[:,1], mdata_tsne_transform[:,2], 
           c=[color_dict[label] for label in cc_data['default payment next month'].values])
ax.set_xlabel('Embedded dim 1')
ax.set_ylabel('Embedded dim 2')
ax.set_zlabel('Embedded dim 3')
plt.show()


