In [1]:
import numpy as np
from sklearn import neighbors, preprocessing
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib import style
%matplotlib notebook
# Load the dataset and prepare it in a single chain so that re-running this
# cell always rebuilds `df` from the file instead of mutating an earlier copy.
#   * '?' entries are replaced by the numeric sentinel -99999.
#   * The 'id' column is dropped so it is not used as a feature.
#     `columns=` is used because DataFrame.drop no longer accepts the axis as
#     a positional argument (removed in pandas 2.0).
df = (
    pd.read_csv('breastdata.txt')
    .replace('?', -99999)
    .drop(columns=['id'])
)
# Name of the target column; every other remaining column is treated as a feature.
label = 'class'
In [2]:
# Fitting.
# X holds every column except the label; y holds only the label column.
# `columns=` replaces the positional axis argument, which DataFrame.drop
# no longer accepts (removed in pandas 2.0).
X = np.array(df.drop(columns=[label]))
y = np.array(df[label])

# Colour maps for plotting; only cmap_bold is used in this cell.
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#00AAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#00AAFF'])

# Overlay several pairs of feature columns on one figure, coloured by label.
# Each entry is (x column index, y column index, marker size); pairs are drawn
# in this order, from the largest marker to the smallest.
feature_pairs = [(7, 8, 200), (6, 7, 150), (3, 4, 100), (1, 2, 50), (0, 1, 10)]
for x_col, y_col, marker_size in feature_pairs:
    plt.scatter(X[:, x_col], X[:, y_col], s=marker_size, c=y, cmap=cmap_bold)
plt.xlim(0, 20)
plt.ylim(0, 20)
plt.title("Data Points(2 features)")

# Train Test Split: hold out 20% of the rows for scoring.
# A fixed random_state makes the split, and therefore the printed accuracy,
# reproducible across "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Actual K Nearest Neighbor Classifier Training (default hyper-parameters).
clf = neighbors.KNeighborsClassifier()
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print(accuracy)

plt.show()
In [3]:
# Scoring and Predicting
# Three hand-written samples, each with nine feature values in the same column
# order as the training matrix.
example_measures = np.array([
    [9, 2, 2, 3, 2, 2, 5, 4, 2],
    [4, 2, 1, 1, 1, 2, 3, 2, 1],
    [3, 2, 2, 5, 2, 2, 5, 4, 2],
])
prediction = clf.predict(example_measures)

# Printing
# Class codes used by this notebook: 4 is reported as malignant, 2 as benign.
CLASS_NAMES = {4: "Malignant", 2: "Benign"}
for measures, predicted_class in zip(example_measures, prediction):
    # Any other code is reported explicitly so the output line is always
    # terminated instead of being left dangling after the separator.
    verdict = CLASS_NAMES.get(predicted_class, "Unknown class {}".format(predicted_class))
    print(measures, verdict, sep=', ')
In [ ]: