In [10]:
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
import warnings
from matplotlib import style
from collections import Counter
import pandas as pd
import random
from sklearn.model_selection import train_test_split
style.use('fivethirtyeight')
import pickle
%matplotlib notebook
In [2]:
# Importing Datasets.
# Reads the raw table from a CSV-formatted text file in the working directory.
df = pd.read_csv('breastdata.txt')
# '?' entries are swapped for a numeric sentinel so every remaining cell can
# later be converted to a number (presumably '?' marks a missing value -
# confirm against the dataset description).
df = df.replace('?', -99999)
# Dropping features that do not help in clustering.
# `columns=` is used instead of a positional axis: since pandas 2.0 the `axis`
# argument of DataFrame.drop is keyword-only and `drop(['id'], 1)` raises.
df = df.drop(columns=['id'])
# Establishing Objective. Clustering done based on the label.
label = 'class'
In [3]:
# Fitting.
# X holds every column except the label column; y holds only the label column.
# `columns=` replaces the positional axis argument, which pandas 2.0+ rejects.
X = np.array(df.drop(columns=[label]))
y = np.array(df[label])
In [4]:
def k_nearest_neighbors(data, predict, k=5):
    """Classify ``predict`` by majority vote among its nearest training points.

    Parameters
    ----------
    data : dict
        Maps each group label to an iterable of feature vectors. Every feature
        value is passed through ``int`` before the distance is computed.
    predict : sequence of numbers
        Feature vector to classify. It is used as given (no ``int``
        conversion is applied to it here).
    k : int, default 5
        Maximum number of nearest neighbours that take part in the vote.

    Returns
    -------
    tuple
        ``(label, confidence)`` where ``label`` is the most common group among
        the nearest neighbours and ``confidence`` is the fraction of the
        neighbours that actually voted which chose that label.

    Raises
    ------
    IndexError
        If no neighbour is available to vote (e.g. ``data`` holds no points).

    Notes
    -----
    A warning is emitted when the number of groups in ``data`` is greater than
    or equal to ``k``.
    """
    if len(data) >= k:
        warnings.warn('K is set to a value less than total voting groups')

    # Convert the query point once instead of once per training point.
    target = np.array(predict)

    # Euclidean distance from the query point to every training point,
    # paired with the group that training point belongs to.
    distances = []
    for group in data:
        for features in data[group]:
            point = np.array(list(map(int, features)))
            euclidean_distance = np.linalg.norm(point - target)
            distances.append([euclidean_distance, group])

    # Group labels of the k closest points (fewer if fewer points exist).
    votes = [entry[1] for entry in sorted(distances)[:k]]

    # Single tally gives both the winning group and how many votes it got.
    vote_result, winning_votes = Counter(votes).most_common(1)[0]

    # Divide by the number of votes actually cast, not by k: with fewer than k
    # training points, dividing by k would understate the confidence.
    confidence = winning_votes / len(votes)
    return vote_result, confidence
In [5]:
def fit(X_train, y_train):
    """Bucket the training rows by their label.

    Walks ``y_train`` in order and, for each position, files the row at the
    same position of ``X_train`` under that label.

    Returns a dict mapping each label to the list of its rows, with labels in
    order of first appearance.
    """
    grouped = {}
    for position, target in enumerate(y_train):
        # setdefault creates the bucket the first time a label is seen.
        grouped.setdefault(target, []).append(X_train[position])
    return grouped
In [6]:
def score(X_train_dict, X_test, y_test, k=5):
    """Return the fraction of test samples that are classified correctly.

    Parameters
    ----------
    X_train_dict : dict
        Training data grouped by label, as produced by ``fit``.
    X_test : sequence of feature vectors
        Samples to classify. Each value is converted with ``int`` first.
    y_test : sequence
        Expected label for the sample at the same position in ``X_test``.
    k : int, default 5
        Number of neighbours forwarded to ``k_nearest_neighbors``.

    Returns
    -------
    float
        ``correct / total`` over all samples in ``X_test``.

    Raises
    ------
    ZeroDivisionError
        If ``X_test`` is empty.
    """
    correct = 0
    total = 0
    for idx, sample in enumerate(X_test):
        sample = list(map(int, sample))
        # Only the predicted label matters for accuracy; confidence is unused.
        vote, _ = k_nearest_neighbors(X_train_dict, sample, k=k)
        if y_test[idx] == vote:
            correct += 1
        total += 1
    return correct / total
In [7]:
def predict(X_train_dict, example_measures, k=5):
    """Classify every sample in ``example_measures``.

    Parameters
    ----------
    X_train_dict : dict
        Training data grouped by label, as produced by ``fit``.
    example_measures : iterable of feature vectors
        Samples to classify. Each value is converted with ``int`` first.
    k : int, default 5
        Number of neighbours forwarded to ``k_nearest_neighbors``.

    Returns
    -------
    list of tuple
        One ``(label, confidence)`` pair per sample, in input order.
    """
    prediction = []
    for sample in example_measures:
        sample = list(map(int, sample))
        vote, confidence = k_nearest_neighbors(X_train_dict, sample, k=k)
        prediction.append((vote, confidence))
    return prediction
In [195]:
## Testing Reallife Breast Cancer Dataset
# Average the accuracy over several independent random 80/20 splits so a
# single lucky or unlucky split does not dominate the reported number.
n_repeats = 25
accuracies = []
for i in range(n_repeats):
    # Fresh random split on every repetition.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    # Group the training rows by label, then measure accuracy on the held-out rows.
    X_train_dict = fit(X_train, y_train)
    accuracies.append(score(X_train_dict, X_test, y_test))
mean_accuracy = sum(accuracies) / len(accuracies)
print("Accuracy : " , mean_accuracy)
In [9]:
# Scoring and Predicting
# Three hand-written samples with nine feature values each.
example_measures = np.array([[9,2,2,3,2,2,5,4,2], [4,2,1,1,1,2,3,2,1], [3,2,2,5,2,2,5,4,2]])
# Train on a random 80% of the data; the held-out 20% is not used in this cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train_dict = fit(X_train, y_train)
prediction = predict(X_train_dict, example_measures)
# Printing
# Class code -> readable name. Any other label falls back to the raw value so
# the output line is always terminated.
class_names = {2: "Benign", 4: "Malignant"}
for idx, (vote, confidence) in enumerate(prediction):
    print(example_measures[idx], end=', ')
    # "%.2f" prints two decimals; the previous "%2f" only set a minimum width.
    print(class_names.get(vote, vote), "Confidence : %.2f" % confidence)
In [204]:
## Testing Reallife Breast Cancer Dataset
# Load and clean the data once: none of this depends on the repetition, so it
# does not belong inside the loop.
df = pd.read_csv('breastdata.txt')
df = df.replace('?', -99999)
# `columns=` instead of a positional axis, which pandas 2.0+ rejects.
df = df.drop(columns=['id'])
all_rows = df.astype(float).values.tolist()

test_size = 0.4
# Number of rows held out for testing in every repetition.
n_test = int(test_size * len(all_rows))

accuracies = []
for _ in range(25):
    # Shuffle a copy so every repetition starts from the same row order.
    full_data = list(all_rows)
    random.shuffle(full_data)

    train_data = full_data[:-n_test]
    test_data = full_data[-n_test:]

    # Rows are bucketed by their last value (the class); the remaining values
    # are the features. The float class values 2.0 / 4.0 hash and compare
    # equal to the int keys 2 / 4, so the lookups below succeed.
    train_set = {2: [], 4: []}
    test_set = {2: [], 4: []}
    for row in train_data:
        train_set[row[-1]].append(row[:-1])
    for row in test_data:
        test_set[row[-1]].append(row[:-1])

    correct = 0
    total = 0
    for group in test_set:
        for data in test_set[group]:
            vote, confidence = k_nearest_neighbors(train_set, data, k=5)
            if group == vote:
                correct += 1
            total += 1
    accuracies.append(correct / total)
print("Accuracy : " , sum(accuracies)/len(accuracies))
In [ ]: