In [1]:
import pandas as pd
import numpy as np
import os
import sys
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import time
from sklearn.cluster import KMeans
# Use seaborn's "darkgrid" style for the figures drawn later in the notebook.
sns.set(style="darkgrid")
In [2]:
# Load the spreadsheet. skiprows=[0] drops the first sheet row so that the
# following row is used as the header.
cc_data = pd.read_excel('default of credit card clients.xls', skiprows=[0])
cc_data = cc_data.set_index('ID')

# Rename the last column to 'default' by position. A fresh list of labels is
# built and assigned, rather than writing into `cc_data.columns.values`: that
# array is the backing store of the column Index, which pandas treats as
# immutable, so modifying it in place bypasses that guarantee.
col_names = cc_data.columns.tolist()
col_names[-1] = 'default'
cc_data.columns = col_names

print(cc_data.shape)
cc_data.dtypes
Out[2]:
In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
cc_data.describe()
Out[3]:
In [4]:
# Min-max scaling, column by column:
#   (value - column minimum) / (column maximum - column minimum)
col_min = cc_data.min()
col_span = cc_data.max() - col_min
cc_data = (cc_data - col_min) / col_span
In [6]:
# The six payment-amount columns, listed from PAY_AMT6 down to PAY_AMT1.
col_names = [
    'PAY_AMT6',
    'PAY_AMT5',
    'PAY_AMT4',
    'PAY_AMT3',
    'PAY_AMT2',
    'PAY_AMT1',
]
# Take an independent copy so later changes do not write back into cc_data.
pay_amnt_data = cc_data.loc[:, col_names].copy()
In [10]:
# Summary statistics of the selected payment-amount columns.
pay_amnt_data.describe()
Out[10]:
In [11]:
# Separate working copy: the derived columns created below are added to this
# frame, leaving pay_amnt_data itself unchanged.
pay_amnt_data_temp = pay_amnt_data.copy()
In [14]:
# Row-wise total of the columns named in col_names, computed with a single
# vectorized call instead of accumulating column by column in a Python loop.
# skipna=False keeps the loop's behavior: a NaN in any input yields NaN.
# NOTE(review): pay_amnt_data was sliced from cc_data after the min-max scaling
# cell, so this adds scaled values rather than raw amounts - confirm intended.
pay_amnt_data_temp['pay_amt_sum'] = pay_amnt_data_temp[col_names].sum(axis=1, skipna=False)
In [15]:
# For each listed column add a 0/1 column named '<column>_' that is 1 where the
# value is greater than zero and 0 otherwise.
# NOTE(review): PAY_AMT1 is not in this list although it is part of
# pay_amt_sum - confirm the omission is intended.
for amt_col in ['PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']:
    is_positive = pay_amnt_data[amt_col] > 0
    pay_amnt_data_temp[amt_col + '_'] = is_positive.values.astype(int)
In [16]:
# Number of flag columns ('PAY_AMT2_' .. 'PAY_AMT6_') set to 1 in each row,
# computed as one vectorized row sum instead of a column-by-column Python loop.
flag_cols = [name + '_' for name in ['PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']]
pay_amnt_data_temp['pay_amt_qty'] = pay_amnt_data_temp[flag_cols].sum(axis=1)
In [17]:
# Show 10 random rows as a spot check of the derived columns. The seed is fixed
# so that re-running the notebook displays the same rows.
pay_amnt_data_temp.sample(10, random_state=0)
Out[17]:
In [ ]:
# NumPy array of the payment-amount columns, used as the clustering input.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# `.values` returns the same array and exists in every pandas version
# (`.to_numpy()` is the equivalent on pandas >= 0.24).
mdata = pay_amnt_data.values
mdata.shape
In [ ]:
def explore_labels(labels_list):
    """Print each cluster's share of the samples and draw a bar chart of cluster sizes.

    Parameters
    ----------
    labels_list : array-like of non-negative ints
        One cluster label per sample (it is called with ``kmeans.labels_``).

    Returns
    -------
    None
        The percentages are printed and the chart is displayed via ``plt.show()``.
    """
    counts = np.bincount(labels_list)
    # Label values that actually occur; unused values in between have count 0.
    present = np.nonzero(counts)[0]
    total = counts.sum()

    for label in present:
        # Look the count up by the label itself (not by its position in
        # `present`) so the share is right even when some labels are unused.
        share = counts[label] / total * 100
        print(" cluster %s: %s %%" % (str(label), str(share)))

    tick_labels = [str(label) for label in present]
    pos = np.arange(len(tick_labels))
    width = 1.0  # bars touch each other, giving the chart a histogram look

    fig, ax = plt.subplots()
    # Bars are centred on `pos` (stated explicitly), so ticks go on `pos` too.
    ax.bar(pos, counts[present], width, color='b', align='center')
    ax.set_xticks(pos)
    ax.set_xticklabels(tick_labels)
    plt.show()
In [ ]:
# Fit k-means for k = 1 .. number_of_clusters, keeping each model's inertia and
# reporting how the samples are distributed over the clusters.
inertia_list = []
number_of_clusters = 10
for k in range(1, number_of_clusters + 1):
    kmeans = KMeans(n_clusters=k, random_state=0).fit(mdata)
    print("%s Clusters" % str(k))
    inertia_list.append(kmeans.inertia_)
    explore_labels(kmeans.labels_)
In [ ]:
# Inertia against the number of clusters: entry i of inertia_list belongs to
# the model fitted with i + 1 clusters, hence the x values 1..len(inertia_list).
fig, ax = plt.subplots()
ax.plot(range(1, len(inertia_list) + 1), inertia_list, 'go-', label='inertia', linewidth=2)
ax.set_xlabel('number of clusters')
ax.set_ylabel('inertia')
ax.set_title('K-means inertia by number of clusters')
ax.legend()  # the line carries a label, so actually draw the legend
plt.show()
In [ ]:
In [ ]:
In [ ]:
In [ ]: