In [1]:
#@title Agreement
# Copyright (c) 2021 Kevin P. Murphy (murphyk@gmail.com) and Mahmoud Soliman (mjs@aucegypt.edu)
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
In [2]:
#@title Attribution
# This notebook is based on the following:
# https://github.com/probml/pyprobml/blob/master/scripts/knn_classify_demo.py
In [3]:
#@title Imports
from tensorflow.python.client import device_lib
from psutil import virtual_memory
import cv2
from google.colab.patches import cv2_imshow
%tensorflow_version 2.x
import tensorflow as tf
import os
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import cross_val_score
# make_blobs is imported from its public location: the private
# sklearn.datasets.samples_generator module was deprecated in scikit-learn 0.22
# and removed in 0.24, while this path works on both old and new releases.
from sklearn.datasets import make_blobs
from IPython import display
from matplotlib import pyplot as plt
import numpy as np
import pathlib
import shutil
import tempfile
from tqdm import tqdm
In [4]:
#@title Hardware check
def find_accelerator():
    """Describe the attached accelerator (if any) and the machine's physical RAM.

    Returns:
        A ``(device, RAM)`` pair. ``device`` is a one-element list naming the
        TPU workers when a TPU cluster resolver can be created; otherwise it
        is the list of ``physical_device_desc`` strings of the local GPU
        devices, or ``None`` when no GPU is listed. ``RAM`` is a string of the
        form ``"Physical RAM: <x.xx> GB"``.
    """
    bytes_per_gib = 1024 * 1024 * 1024
    total_bytes = virtual_memory().total
    local_devices = device_lib.list_local_devices()
    RAM = "Physical RAM: {:.2f} GB".format(total_bytes / bytes_per_gib)
    try:
        # Constructing the resolver raises ValueError when no TPU is reachable.
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
        workers = resolver.cluster_spec().as_dict()['worker']
        device = ["TPU at " + str(workers)]
    except ValueError:
        # No TPU: fall back to whatever GPUs TensorFlow can see.
        device = [
            dev.physical_device_desc
            for dev in local_devices
            if dev.device_type == "GPU"
        ]
    # An empty device list is reported as None.
    return (device or None), RAM
# Probe the runtime once, then print the accelerator description and RAM summary.
(a, r) = find_accelerator()
print("Please make sure that the statement below says Accelerator found")
print("Accelerator found:", a, r)
In [5]:
#@title Install the extra required packages if any
In [6]:
#@title Clone PyProbML repo and set environment variables
# Fetch the PyProbML sources with git (-q keeps the clone output quiet); the
# repository is created as ./pyprobml relative to the current working directory.
!git clone https://github.com/probml/pyprobml/ -q
# Record the checkout location in the PYPROBML environment variable.
# NOTE(review): the path assumes the working directory is /content (the Colab
# default) — confirm before running this notebook elsewhere.
os.environ["PYPROBML"]='/content/pyprobml/'
In [7]:
# This notebook walks through k-nearest-neighbours (KNN) classification.
# make_blobs from sklearn draws 2-D points from three isotropic Gaussian blobs;
# the first `ntrain` points form the training set and the rest the test set.
ntrain = 100
X, y = make_blobs(
    n_samples=1000,
    centers=3,
    n_features=2,
    cluster_std=6,
    random_state=42,
)
x_train, x_test = X[:ntrain], X[ntrain:]
y_train, y_test = y[:ntrain], y[ntrain:]
In [8]:
# Scatter the training points, giving every class its own marker and colour.
plt.figure()
y_unique = np.unique(y_train)
markers = '*x+'
colors = 'bgr'
for idx, label in enumerate(y_unique):
    in_class = y_train == label  # boolean mask selecting this class
    plt.scatter(
        x_train[in_class, 0],
        x_train[in_class, 1],
        marker=markers[idx],
        c=colors[idx],
    )
plt.title('train')
plt.show()
In [9]:
# Scatter the held-out test points, reusing the per-class styling
# (`y_unique`, `markers`, `colors`) set up for the training plot.
plt.figure()
for idx, label in enumerate(y_unique):
    in_class = y_test == label  # boolean mask selecting this class
    plt.scatter(
        x_test[in_class, 0],
        x_test[in_class, 1],
        marker=markers[idx],
        c=colors[idx],
    )
plt.title('test')
plt.show()
In [10]:
# Build a dense evaluation grid spanning the range of the test points. The axis
# vectors get their own names so that the class labels `y` returned by
# make_blobs are not overwritten with grid coordinates.
grid_x = np.linspace(np.min(x_test[:, 0]), np.max(x_test[:, 0]), 200)
grid_y = np.linspace(np.min(x_test[:, 1]), np.max(x_test[:, 1]), 200)
xx, yy = np.meshgrid(grid_x, grid_y)
# One row per grid point: column 0 is the first feature, column 1 the second.
xy = np.c_[xx.ravel(), yy.ravel()]
# For each k: fit a KNN classifier on the training set, colour the grid by the
# predicted class, and overlay the training points.
for k in [1, 2, 5]:
    knn = KNN(n_neighbors=k)
    knn.fit(x_train, y_train)
    plt.figure()
    y_predicted = knn.predict(xy)
    # Reshape to the grid's own shape instead of a hard-coded (200, 200), so the
    # resolution only has to be changed in one place.
    plt.pcolormesh(xx, yy, y_predicted.reshape(xx.shape), cmap='jet', alpha=0.2)
    for i in range(len(y_unique)):
        plt.scatter(x_train[y_train == y_unique[i], 0],
                    x_train[y_train == y_unique[i], 1],
                    marker=markers[i],
                    c=colors[i])
    plt.title('k=%s' % (k))
    plt.show()
In [11]:
# Compare the training and test misclassification rates over a range of
# neighbourhood sizes k.
ks = [1, 5, 10, 20, 50, 70, 79]
train_errs, test_errs = [], []
for k in ks:
    knn = KNN(n_neighbors=k).fit(x_train, y_train)
    # score() returns accuracy, so the error rate is its complement.
    train_errs.append(1 - knn.score(x_train, y_train))
    test_errs.append(1 - knn.score(x_test, y_test))

fig, ax = plt.subplots()
ax.plot(ks, train_errs, 'bs:', label='train')
ax.plot(ks, test_errs, 'rx-', label='test')
ax.legend()
ax.set_xlabel('k')
ax.set_ylabel('misclassification rate')
plt.show()
In [12]:
# Estimate the misclassification rate for every k in `ks` with k-fold
# cross-validation on the training set.
n_folds = 5
scores = []
for k in ks:
    knn = KNN(n_neighbors=k)
    score = cross_val_score(knn, x_train, y_train, cv=n_folds)
    # cross_val_score returns one accuracy per fold; store the mean error rate.
    scores.append(1 - score.mean())
plt.figure()
plt.plot(ks, scores, 'ko-')
# Mark the k with the lowest cross-validated error with a vertical line.
min_k = ks[np.argmin(scores)]
plt.plot([min_k, min_k], [0, 1.0], 'b-')
plt.xlabel('k')
plt.ylabel('misclassification rate')
# Take the fold count and training-set size from the data; the title previously
# hard-coded "n-train = 200", which did not match the training set in use.
plt.title('%d-fold cross validation, n-train = %d' % (n_folds, len(x_train)))
# Render the figure explicitly so the cell does not end on the Text repr
# returned by plt.title.
plt.show()
Out[12]:
In [13]:
# Draw a heat map of the predicted probability of each class over the grid.
knn = KNN(n_neighbors=10)
knn.fit(x_train, y_train)
# One row per grid point in `xy`, one column per class (ordered as knn.classes_).
xy_predic = knn.predict_proba(xy)
levels = np.arange(0, 1.01, 0.1)
# Iterate over the classes the model actually learned rather than assuming 3.
for i, label in enumerate(knn.classes_):
    plt.figure()
    # Pass the grid coordinates so the axes show feature values (matching the
    # decision-region plots) instead of grid indices, and reshape to the grid's
    # own shape rather than a hard-coded (200, 200).
    plt.contourf(xx, yy, xy_predic[:, i].reshape(xx.shape), levels)
    plt.colorbar()
    # Read k from the fitted model so the title cannot drift from the code.
    plt.title('p(y=%s | data, k=%s)' % (label, knn.n_neighbors))
plt.show()