In [1]:
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
import warnings
from matplotlib import style
from collections import Counter
import pandas as pd
import random
from sklearn.model_selection import train_test_split
style.use('fivethirtyeight')

import pickle
%matplotlib notebook

In [2]:
# Importing the dataset.
df = pd.read_csv('breastdata.txt')
df.replace('?', -99999, inplace=True)

# Dropping features that do not help in classification.
df.drop(['id'], axis=1, inplace=True)

# Establishing the objective: classification is done on this label.
label = 'class'

In [3]:
# Building the feature matrix and label vector.
# X holds every column except the label; y holds only the labels.
X = np.array(df.drop([label], axis=1))
y = np.array(df[label])
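
A quick shape check (a sketch, assuming the standard Wisconsin breast cancer file) confirms the feature/label split:

In [ ]:
# Sketch: sanity-check the feature/label split.
# For the Wisconsin breast cancer data this should be roughly (699, 9) and (699,).
print(X.shape, y.shape)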

In [4]:
def k_nearest_neighbors(data, predict, k=5):
    if len(data) >= k:
        warnings.warn('K is set to a value less than or equal to the number of voting groups!')

    distances = []

    # Compute the distance from the point to predict to every training point.
    for group in data:
        for features in data[group]:
            features = list(map(int, features))
            # Euclidean distance; equivalent to
            # np.sqrt(np.sum((np.array(features) - np.array(predict))**2))
            euclidean_distance = np.linalg.norm(np.array(features) - np.array(predict))
            distances.append([euclidean_distance, group])

    # Take the k smallest distances and their group (class) labels.
    votes = [i[1] for i in sorted(distances)[:k]]

    # Count how many of the k nearest neighbours fall in each group.
    vote_counts = Counter(votes)

    # The winning group is the one with the most of the k nearest neighbours.
    vote_result = vote_counts.most_common(1)[0][0]

    # Confidence = fraction of the k neighbours that voted for the winner.
    confidence = vote_counts.most_common(1)[0][1] / k

    return vote_result, confidence
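
A minimal sketch of how the function is meant to be called (toy data, not from the dataset above): the training data is a dictionary mapping each class label to a list of feature vectors, and `predict` is a single feature vector.

In [ ]:
# Toy example (illustrative only): two made-up groups in the expected dict format.
toy_data = {'k': [[1, 2], [2, 3], [3, 1]],
            'r': [[6, 5], [7, 7], [8, 6]]}
toy_point = [5, 7]
k_nearest_neighbors(toy_data, toy_point, k=3)
# All three nearest neighbours belong to 'r', so this returns ('r', 1.0).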

In [5]:
def fit(X_train, y_train):
    # Group the training rows by class label into a dictionary:
    # {label: [feature_row, feature_row, ...]}
    X_train_dict = {}
    for idx, lbl in enumerate(y_train):
        if lbl not in X_train_dict:
            X_train_dict[lbl] = list()
        X_train_dict[lbl].append(X_train[idx])
    return X_train_dict
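
For illustration, a tiny call shows the structure fit builds: one dictionary key per class label, each holding that class's feature rows.

In [ ]:
# Illustrative only: rows are grouped under their label.
# Key 2 ends up holding [1, 2] and [5, 6]; key 4 holds [3, 4].
fit(np.array([[1, 2], [3, 4], [5, 6]]), np.array([2, 4, 2]))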

In [6]:
def score(X_train_dict,X_test,y_test):
    correct = 0
    total = 0
    for idx,each in enumerate(X_test):
        each = list(map(int,each))
        vote, confidence = k_nearest_neighbors(X_train_dict, each, k=5)
        if y_test[idx] == vote:
            correct += 1
        total += 1

    return (correct/total)

In [7]:
def predict(X_train_dict,example_measures):
    prediction = []
    for idx,each in enumerate(example_measures):
        each = list(map(int,each))
        vote, confidence = k_nearest_neighbors(X_train_dict, each, k=5)
        prediction.append((vote,confidence))
    return prediction

In [195]:
## Testing on the real-life breast cancer dataset
accuracies = []
n_repeats = 25
for i in range(n_repeats):
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
    X_train_dict = fit(X_train, y_train)
    accuracy = score(X_train_dict,X_test,y_test)
    accuracies.append(accuracy)
    
print("Accuracy : " , sum(accuracies)/len(accuracies))


Accuracy :  0.9708571428571426
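
As a sanity-check sketch (not part of the from-scratch pipeline above), scikit-learn's built-in KNeighborsClassifier can be scored on the same kind of split; its accuracy should land in the same range.

In [ ]:
from sklearn.neighbors import KNeighborsClassifier

# Sketch: compare against scikit-learn's KNN on one 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)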

In [9]:
# Predicting on a few example measurements.
example_measures = np.array([[9,2,2,3,2,2,5,4,2], [4,2,1,1,1,2,3,2,1], [3,2,2,5,2,2,5,4,2]])
# example_measures = example_measures.reshape(len(example_measures),-1)

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
X_train_dict = fit(X_train, y_train)
prediction = predict(X_train_dict,example_measures)

# Printing
for idx, x in enumerate(prediction):
    print(example_measures[idx], end=', ')
    if x[0] == 4:
        print("Malignant", "Confidence : %f" % (x[1]))
    elif x[0] == 2:
        print("Benign", "Confidence : %f" % (x[1]))


[9 2 2 3 2 2 5 4 2], Malignant Confidence : 0.600000
[4 2 1 1 1 2 3 2 1], Benign Confidence : 1.000000
[3 2 2 5 2 2 5 4 2], Benign Confidence : 0.600000

In [204]:
## Testing on the real-life breast cancer dataset (dictionary-based split, larger test fraction)
accuracies = []
for i in range(25):
    df = pd.read_csv('breastdata.txt')
    df.replace('?', -99999, inplace=True)
    df.drop(['id'], axis=1, inplace=True)
    full_data = df.astype(float).values.tolist()
    random.shuffle(full_data)
    test_size = 0.4
    train_set = {2: [], 4: []}
    test_set = {2: [], 4: []}
    train_data = full_data[:-int(test_size*len(full_data))]
    test_data = full_data[-int(test_size*len(full_data)):]

    for row in train_data:
        train_set[row[-1]].append(row[:-1])

    for row in test_data:
        test_set[row[-1]].append(row[:-1])

    correct = 0
    total = 0

    for group in test_set:
        for data in test_set[group]:
            vote, confidence = k_nearest_neighbors(train_set, data, k=5)
            if group == vote:
                correct += 1
            total += 1
#     print("Accuracy:",correct/total)
    accuracies.append(correct/total)
print("Accuracy : " , sum(accuracies)/len(accuracies))


Accuracy :  0.9693189964157706
