IMPORT LIBRARIES


In [ ]:
import pandas as pd
import numpy as np
import pylab as pl
from sklearn.neighbors import KNeighborsClassifier

%matplotlib inline

LOAD DATA

Read CSV & list columns


In [ ]:
# Read the raw credit-card-default table, then show its column names as the
# cell output so the available fields are visible before any selection.
ccdefault = pd.read_csv('~/Desktop/ccdefault.csv')
ccdefault.columns.tolist()

Create new dataset without ID variable


In [ ]:
# Keep every column except the first one; the selection is positional, so it
# assumes the identifier is the leading column of the file.
ccd = ccdefault.loc[:, ccdefault.columns[1:]]
ccd.columns.tolist()

Split data


In [ ]:
# Random hold-out split: roughly one third of the rows are marked for testing
# and the remaining two thirds are used for training.
#
# A dedicated, seeded generator keeps the split identical across
# "Restart Kernel -> Run All" without touching NumPy's global random state.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)

# Boolean mask, True for rows that belong to the TEST set (~33.3% of rows).
test_idx = split_rng.uniform(0, 1, len(ccd)) <= .333

# The mask marks test rows, so training data is its complement. (Previously
# the True rows were assigned to `train`, leaving only ~1/3 of the data for
# fitting and ~2/3 for evaluation.)
train = ccd[~test_idx]
test = ccd[test_idx]

In [ ]:
# Preview the first five rows of the training partition.
train.head(5)

CLASSIFICATION

Specify features used to predict


In [ ]:
# Predictor columns fed to the classifier.
#
# The label column 'DEFAULT' is deliberately NOT listed here: it is the value
# being predicted, and including it among the inputs leaks the answer into
# the model and makes the measured accuracy meaningless.
features = [
    'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
]

Choose n & conduct kNN classification


In [ ]:
n = 5  # Neighbors

# Not used by the cells visible here; kept because later cells may rely on
# this name existing -- TODO confirm and remove if it is dead.
results = []

# The label must never be among the inputs: fitting on a matrix that contains
# 'DEFAULT' lets the model read the answer directly. Filter it out here so
# this cell is safe regardless of what the feature list contains.
predictors = [col for col in features if col != 'DEFAULT']

clf = KNeighborsClassifier(n_neighbors=n)
clf.fit(train[predictors], train['DEFAULT'])
preds = clf.predict(test[predictors])

# Percentage of test rows whose predicted label equals the true label.
accuracy = (preds == test['DEFAULT'].values).mean() * 100

In [ ]:
# Report the neighbour count and the accuracy with two decimal places.
# The call form of print is valid on both Python 2 and Python 3, and %.2f
# keeps the decimals that the former %2d conversion silently truncated.
print("Neighbors: %d, Accuracy: %.2f%%" % (n, accuracy))

In [ ]: