In [ ]:
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

Working with built-in datasets


In [ ]:
# Load the bundled iris dataset and unpack its features and labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [ ]:
# Dimensions of the iris feature array.
print(X.shape)

In [ ]:
# Dimensions of the iris label array.
print(y.shape)

In [ ]:
# Names of the feature columns, in the same order as the columns of iris.data.
print(iris.feature_names)

Simple Classification: Learning and Prediction


In [ ]:
# Fit a KNN classifier with default settings on the entire iris dataset.
knn = KNeighborsClassifier()
knn.fit(X, y)

In [ ]:
# Predict labels for the same samples the model was fitted on.
prediction = knn.predict(X)

In [ ]:
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true classes and columns the predicted ones, so the true labels go first.
confusion_matrix(y, prediction)

In [ ]:
# accuracy_score returns a fraction in [0, 1]; scale it by 100 so the number
# matches the "%" sign printed after it. Arguments follow (y_true, y_pred).
print("Accuracy: %f%%" % (100 * accuracy_score(y, prediction)))

Q: What's wrong with this approach?


In [ ]:
# Sequential split (no shuffling): the first 100 rows train the model and
# the remaining rows are held out for testing.
x_train, x_test = X[:100], X[100:]
y_train, y_test = y[:100], y[100:]

In [ ]:
# Fit a default KNN classifier on the training rows, then label the test rows.
knn = KNeighborsClassifier()
prediction = knn.fit(x_train, y_train).predict(x_test)

In [ ]:
print("Confusion Matrix:")
# confusion_matrix expects (y_true, y_pred): rows = true, columns = predicted.
print(confusion_matrix(y_test, prediction))
# accuracy_score returns a fraction in [0, 1]; scale it to a percentage so it
# agrees with the "%" sign in the output.
print("Accuracy: %f%%" % (100 * accuracy_score(y_test, prediction)))

Q: What went wrong?


In [ ]:
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# train_test_split shuffles the samples by default; the fixed random_state
# makes the split (and the scores computed from it) reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [ ]:
# Fit a default KNN classifier on the training split, then label the test split.
knn = KNeighborsClassifier()
prediction = knn.fit(x_train, y_train).predict(x_test)

In [ ]:
print("Confusion Matrix:")
# confusion_matrix expects (y_true, y_pred): rows = true, columns = predicted.
print(confusion_matrix(y_test, prediction))
# accuracy_score returns a fraction in [0, 1]; scale it to a percentage so it
# agrees with the "%" sign in the output.
print("Accuracy: %f%%" % (100 * accuracy_score(y_test, prediction)))

Using KNN on Image Data (scikit-learn's 8x8 handwritten digits, an MNIST-like dataset)


In [ ]:
# Load the bundled handwritten-digits dataset and unpack features and labels.
digits = datasets.load_digits()
X, y = digits.data, digits.target

In [ ]:
# Dimensions of the digits feature array.
print(X.shape)

In [ ]:
# Dimensions of the digits label array.
print(y.shape)

In [ ]:
# Show the first 25 digit images in a 5x5 grid.
fig, ax = plt.subplots(nrows=5, ncols=5)
# ax.flat walks the 5x5 array of axes row by row, so idx equals row * 5 + col.
for idx, axis in enumerate(ax.flat):
    # Each row of X is a flattened image; reshape it to 8x8 for display.
    axis.imshow(X[idx].reshape(8, 8), cmap=plt.cm.gray)
    # grid() with no argument only toggles the current state, so the result
    # depends on the active style; request "off" explicitly instead.
    axis.grid(False)
    axis.set_xticks([])
    axis.set_yticks([])

In [ ]:
# Train a KNN classifier on a random split of the digits data.
# The fixed random_state keeps the split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Use a fresh estimator so this cell does not depend on one left over from
# an earlier section of the notebook.
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)

In [ ]:
# Predict the first 25 test images in a single call instead of one call per
# image inside the plotting loop.
predictions = knn.predict(x_test[:25])

# Show those 25 test images in a 5x5 grid, each titled with its predicted digit.
fig, ax = plt.subplots(nrows=5, ncols=5)
# ax.flat walks the 5x5 array of axes row by row, so idx equals row * 5 + col.
for idx, axis in enumerate(ax.flat):
    axis.imshow(x_test[idx].reshape(8, 8), cmap=plt.cm.gray)
    # grid() with no argument only toggles the current state; turn it off
    # explicitly so the result does not depend on the active style.
    axis.grid(False)
    axis.set_xticks([])
    axis.set_yticks([])
    # Index out a scalar label: formatting a 1-element array with "%d" relies
    # on implicit array-to-scalar conversion, deprecated since NumPy 1.25.
    axis.set_title("%d" % predictions[idx])
plt.tight_layout()

Exercise: Use KNN to classify the breast cancer dataset, then print the confusion matrix and the accuracy score.


In [ ]:
# Load the bundled breast cancer dataset and unpack its features and labels.
bc = datasets.load_breast_cancer()
X, y = bc.data, bc.target

In [ ]:
# enter code here