In [ ]:

    
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
# Apply the 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')
%matplotlib inline

Working with built-in datasets



In [ ]:

    
# Load the bundled iris dataset, then pull out the feature matrix (X)
# and the target labels (y) that the following cells work with.
iris = datasets.load_iris()
X, y = iris.data, iris.target



In [ ]:

    
# Print the dimensions of the feature matrix X.
print(X.shape)



In [ ]:

    
# Print the dimensions of the target vector y.
print(y.shape)



In [ ]:

    
# List the names of the feature columns shipped with the dataset.
print(iris.feature_names)

Simple Classification: Learning and Prediction



In [ ]:

    
# Build a k-nearest-neighbours classifier with default settings and fit it on
# the whole dataset; fit() returns the estimator, so the calls can be chained.
knn = KNeighborsClassifier().fit(X, y)
# Trailing expression keeps the fitted estimator as the cell's displayed output.
knn



In [ ]:

    
# Predict labels for X, the same samples that were passed to knn.fit above.
prediction = knn.predict(X)



In [ ]:

    
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true classes and columns the predicted ones, so the ground truth goes first.
confusion_matrix(y, prediction)



In [ ]:

    
# accuracy_score returns a fraction in [0, 1]; scale it by 100 so the number
# agrees with the '%' sign in the message. Arguments follow (y_true, y_pred).
print("Accuracy: %f%%" % (100 * accuracy_score(y, prediction)))

Q: What's wrong with this approach?



In [ ]:

    
# Hold-out split by position: the first n_train rows are used for training
# and every remaining row for testing (no shuffling is applied).
n_train = 100
x_train, x_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]



In [ ]:

    
# Fit a fresh default classifier on the training portion only, then
# label the held-out samples.
knn = KNeighborsClassifier().fit(x_train, y_train)
prediction = knn.predict(x_test)



In [ ]:

    
# Report how the classifier did on the held-out samples.
# Both metrics take (y_true, y_pred): with that order the confusion matrix has
# true classes on the rows and predicted classes on the columns.
print("Confusion Matrix:")
print(confusion_matrix(y_test, prediction))
# accuracy_score yields a fraction in [0, 1]; multiply by 100 to print a
# genuine percentage next to the '%' sign.
print("Accuracy: %f%%" % (100 * accuracy_score(y_test, prediction)))

Q: What went wrong?



In [ ]:

    
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

# train_test_split shuffles the samples before splitting by default.
# A fixed random_state makes the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)



In [ ]:

    
# Train a new default k-NN model on the training subset and use it to
# classify the test subset.
knn = KNeighborsClassifier().fit(x_train, y_train)
prediction = knn.predict(x_test)



In [ ]:

    
# Evaluate on the test subset.
# Pass the ground truth first, as (y_true, y_pred), so that confusion-matrix
# rows correspond to true classes and columns to predicted classes.
print("Confusion Matrix:")
print(confusion_matrix(y_test, prediction))
# accuracy_score returns a fraction in [0, 1]; convert it to a percentage
# so the printed value is consistent with the trailing '%'.
print("Accuracy: %f%%" % (100 * accuracy_score(y_test, prediction)))

Using KNN on Image Data (scikit-learn digits: 8x8 handwritten digits, MNIST-like)



In [ ]:

    
# Load the bundled digits dataset and rebind X / y to its feature matrix
# and target labels for the cells that follow.
digits = datasets.load_digits()
X, y = digits.data, digits.target



In [ ]:

    
# Print the dimensions of the digits feature matrix X.
print(X.shape)



In [ ]:

    
# Print the dimensions of the digits target vector y.
print(y.shape)



In [ ]:

    
# Preview the first 25 samples on a 5x5 grid. Each row of X is reshaped to an
# 8x8 array and drawn as a grayscale image.
fig, ax = plt.subplots(nrows=5, ncols=5)
# ax.flat walks the axes row by row, matching the original i * 5 + j indexing;
# zip stops after the 25 available axes.
for axis, pixels in zip(ax.flat, X):
    axis.imshow(pixels.reshape(8, 8), cmap=plt.cm.gray)
    # grid() without arguments only *toggles* the grid, so its effect depends
    # on the active style; switch the grid off explicitly instead.
    axis.grid(False)
    axis.set_xticks([])
    axis.set_yticks([])



In [ ]:

    
# Training and predicting
# Split the digits data with a fixed seed so the run is reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Create a fresh classifier rather than re-fitting the instance left over from
# an earlier section, so this cell does not depend on that earlier state.
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)



In [ ]:

    
# Show the first 25 test samples on a 5x5 grid, each titled with the label
# the classifier predicts for it.
n_preview = 25
# One batched predict() call replaces 25 separate single-sample calls.
digit_predictions = knn.predict(x_test[:n_preview])
fig, ax = plt.subplots(nrows=5, ncols=5)
# ax.flat walks the axes row by row, matching the original i * 5 + j indexing.
for axis, pixels, label in zip(ax.flat, x_test, digit_predictions):
    axis.imshow(pixels.reshape(8, 8), cmap=plt.cm.gray)
    axis.grid(False)  # explicit: a bare grid() merely toggles the current state
    axis.set_xticks([])
    axis.set_yticks([])
    # Format a scalar label: "%d" % <1-element array> relies on implicit
    # array-to-scalar conversion, which NumPy has deprecated.
    axis.set_title("%d" % label)
plt.tight_layout()

Exercise: Use KNN to classify the breast cancer dataset, then print the confusion matrix and the accuracy score.



In [ ]:

    
# Load the bundled breast cancer dataset and rebind X / y to its feature
# matrix and target labels for the exercise below.
bc = datasets.load_breast_cancer()
X, y = bc.data, bc.target



In [ ]:

    
# enter code here