In [1]:
import pandas as pd
import numpy as np

import os
import sys
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import time

from sklearn.cluster import KMeans

# Use seaborn's "darkgrid" style for the plots drawn in this notebook.
sns.set(style="darkgrid")

In [2]:
# Load the credit-card data set. The first spreadsheet row is skipped
# (skiprows=[0]) so that the next row supplies the column names, and the
# 'ID' column becomes the index.
cc_data = pd.read_excel('default of credit card clients.xls', skiprows=[0]).set_index('ID')

# Rename the last column to 'default'. The names are copied into a plain
# list first: `cc_data.columns.values` is the array backing the immutable
# column Index, so assigning into it would modify that Index in place.
col_names = cc_data.columns.tolist()
col_names[-1] = 'default'
cc_data.columns = col_names

print(cc_data.shape)
cc_data.dtypes


(30000, 24)
Out[2]:
LIMIT_BAL    int64
SEX          int64
EDUCATION    int64
MARRIAGE     int64
AGE          int64
PAY_0        int64
PAY_2        int64
PAY_3        int64
PAY_4        int64
PAY_5        int64
PAY_6        int64
BILL_AMT1    int64
BILL_AMT2    int64
BILL_AMT3    int64
BILL_AMT4    int64
BILL_AMT5    int64
BILL_AMT6    int64
PAY_AMT1     int64
PAY_AMT2     int64
PAY_AMT3     int64
PAY_AMT4     int64
PAY_AMT5     int64
PAY_AMT6     int64
default      int64
dtype: object

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
cc_data.describe()


Out[3]:
LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 ... BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 default
count 30000.000000 30000.000000 30000.000000 30000.000000 30000.000000 30000.000000 30000.000000 30000.000000 30000.000000 30000.000000 ... 30000.000000 30000.000000 30000.000000 30000.000000 3.000000e+04 30000.00000 30000.000000 30000.000000 30000.000000 30000.000000
mean 167484.322667 1.603733 1.853133 1.551867 35.485500 -0.016700 -0.133767 -0.166200 -0.220667 -0.266200 ... 43262.948967 40311.400967 38871.760400 5663.580500 5.921163e+03 5225.68150 4826.076867 4799.387633 5215.502567 0.221200
std 129747.661567 0.489129 0.790349 0.521970 9.217904 1.123802 1.197186 1.196868 1.169139 1.133187 ... 64332.856134 60797.155770 59554.107537 16563.280354 2.304087e+04 17606.96147 15666.159744 15278.305679 17777.465775 0.415062
min 10000.000000 1.000000 0.000000 0.000000 21.000000 -2.000000 -2.000000 -2.000000 -2.000000 -2.000000 ... -170000.000000 -81334.000000 -339603.000000 0.000000 0.000000e+00 0.00000 0.000000 0.000000 0.000000 0.000000
25% 50000.000000 1.000000 1.000000 1.000000 28.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 ... 2326.750000 1763.000000 1256.000000 1000.000000 8.330000e+02 390.00000 296.000000 252.500000 117.750000 0.000000
50% 140000.000000 2.000000 2.000000 2.000000 34.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 19052.000000 18104.500000 17071.000000 2100.000000 2.009000e+03 1800.00000 1500.000000 1500.000000 1500.000000 0.000000
75% 240000.000000 2.000000 2.000000 2.000000 41.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 54506.000000 50190.500000 49198.250000 5006.000000 5.000000e+03 4505.00000 4013.250000 4031.500000 4000.000000 0.000000
max 1000000.000000 2.000000 6.000000 3.000000 79.000000 8.000000 8.000000 8.000000 8.000000 8.000000 ... 891586.000000 927171.000000 961664.000000 873552.000000 1.684259e+06 896040.00000 621000.000000 426529.000000 528666.000000 1.000000

8 rows × 24 columns


In [4]:
# Min-max scaling: map every column onto the [0, 1] interval.
# A column whose minimum equals its maximum has zero range and becomes NaN.
col_min = cc_data.min()
col_range = cc_data.max() - col_min
cc_data = (cc_data - col_min) / col_range

Pay Amount transform


In [6]:
# The six PAY_AMT columns, listed from PAY_AMT6 down to PAY_AMT1.
col_names = ['PAY_AMT%d' % idx for idx in range(6, 0, -1)]
# Take an independent copy so later edits do not touch cc_data.
pay_amnt_data = cc_data.loc[:, col_names].copy()

In [10]:
# Summary statistics for the selected PAY_AMT columns only.
pay_amnt_data.describe()


Out[10]:
PAY_AMT6 PAY_AMT5 PAY_AMT4 PAY_AMT3 PAY_AMT2 PAY_AMT1
count 30000.000000 30000.000000 30000.000000 30000.00000 3.000000e+04 30000.000000
mean 5215.502567 4799.387633 4826.076867 5225.68150 5.921163e+03 5663.580500
std 17777.465775 15278.305679 15666.159744 17606.96147 2.304087e+04 16563.280354
min 0.000000 0.000000 0.000000 0.00000 0.000000e+00 0.000000
25% 117.750000 252.500000 296.000000 390.00000 8.330000e+02 1000.000000
50% 1500.000000 1500.000000 1500.000000 1800.00000 2.009000e+03 2100.000000
75% 4000.000000 4031.500000 4013.250000 4505.00000 5.000000e+03 5006.000000
max 528666.000000 426529.000000 621000.000000 896040.00000 1.684259e+06 873552.000000

In [11]:
# Separate copy that receives the derived columns; pay_amnt_data itself is left unchanged.
pay_amnt_data_temp = pay_amnt_data.copy()

In [14]:
# Row-wise total of the columns named in col_names.
# Built-in sum() starts at 0 and adds the columns one by one in list order.
pay_amnt_data_temp['pay_amt_sum'] = sum(pay_amnt_data_temp[name] for name in col_names)

In [15]:
# Indicator columns (original name + '_'): 1 where the amount is positive, else 0.
# NOTE(review): PAY_AMT1 is not flagged here although col_names contains it - confirm this is intended.
for amt_col in ("PAY_AMT2", 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'):
    is_positive = pay_amnt_data[amt_col] > 0
    pay_amnt_data_temp[amt_col + '_'] = is_positive.astype(int)

In [16]:
# Per-row count of positive payments: add up the 0/1 indicator columns.
flag_cols = [amt_col + '_' for amt_col in ("PAY_AMT2", 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6')]
pay_amnt_data_temp['pay_amt_qty'] = sum(pay_amnt_data_temp[flag] for flag in flag_cols)

In [17]:
# Show 10 randomly drawn rows; no random_state is passed, so each run picks different rows.
pay_amnt_data_temp.sample(10)


Out[17]:
PAY_AMT6 PAY_AMT5 PAY_AMT4 PAY_AMT3 PAY_AMT2 PAY_AMT1 pay_amt_sum PAY_AMT2_ PAY_AMT3_ PAY_AMT4_ PAY_AMT5_ PAY_AMT6_ pay_amt_qty
ID
9998 4200 10017 27080 10000 5000 6437 62734 1 1 1 1 1 5
15935 0 525 10650 4495 2021 1041 18732 1 1 1 1 0 4
23708 0 0 0 3364 17 15506 18887 1 1 0 0 0 2
13019 39777 3044 1020 5625 2684 4013 56163 1 1 1 1 1 5
4504 0 0 0 0 3516 207 3723 1 0 0 0 0 1
13329 5000 6341 39604 6000 7508 12009 76462 1 1 1 1 1 5
24278 0 0 0 0 3000 2500 5500 1 0 0 0 0 1
3704 780 390 390 390 390 390 2730 1 1 1 1 1 5
26462 9192 6349 141246 1075 0 1025 158887 0 1 1 1 1 4
6613 291 291 291 291 291 291 1746 1 1 1 1 1 5

Clustering

k-means


In [ ]:
# Feature matrix for clustering, as a plain NumPy array.
# DataFrame.as_matrix() was deprecated and then removed in pandas 1.0;
# .values yields the same array and is available in every pandas version.
mdata = pay_amnt_data.values
mdata.shape

In [ ]:
def explore_labels(labels_list):
    """Print each cluster's share of the samples and plot the cluster sizes.

    Parameters
    ----------
    labels_list : array-like of non-negative ints
        One cluster label per sample (passed straight to ``np.bincount``).

    Only labels that occur at least once are printed and plotted.
    """
    counts = np.bincount(labels_list)
    present = np.nonzero(counts)[0]     # label values with at least one sample
    total = np.sum(counts)
    for label in present:
        # Look the count up by the label value itself; indexing by the
        # position inside `present` is wrong as soon as a label is missing.
        print ("  cluster %s: %s %%"%(str(label), str(counts[label]/total*100)))

    tick_labels = [str(label) for label in present]
    pos = np.arange(len(tick_labels))
    width = 1.0     # gives histogram aspect to the bar diagram
    ax = plt.axes()
    # Bars are explicitly centred on `pos`, so the ticks belong at `pos` too.
    ax.set_xticks(pos)
    ax.set_xticklabels(tick_labels)
    plt.bar(pos, counts[present], width, color='b', align='center')
    plt.show()

In [ ]:
number_of_clusters = 10
inertia_list = []
# Fit k-means for k = 1 .. number_of_clusters; for each k, record the
# inertia and report how the samples are spread over the clusters.
for k in range(1, number_of_clusters + 1):
    kmeans = KMeans(n_clusters=k, random_state=0).fit(mdata)
    print("%s Clusters" % k)
    inertia_list.append(kmeans.inertia_)
    explore_labels(kmeans.labels_)

Inertia plot


In [ ]:
# Inertia against the number of clusters (1 .. len(inertia_list)).
cluster_counts = range(1, len(inertia_list) + 1)
plt.plot(cluster_counts, inertia_list, 'go-', label='inertia', linewidth=2)
plt.show()

In [ ]:


In [ ]:


In [ ]:


In [ ]: