by David Taylor, www.prooffreader.com (blog) www.dtdata.io (hire me!)
For links to more material including a slideshow explaining all this stuff in further detail, please see the front page of this GitHub repo.
This is notebook 4 of 8. The next notebook is: [05. Classification with other algorithms]
[01] [02] [03] [04] [05] [06] [07] [08]
In the previous notebook, we finished our look at Unsupervised Learning, a.k.a. clustering; now we'll start Supervised Learning, with the k-Nearest Neighbors (k-NN) algorithm. It's not used much in practice, but it's very intuitive, which makes it a good first algorithm for beginners.
Again, we will confine ourselves to the normalized sweetness and acidity columns for ease and convenience.
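To make the idea concrete before we bring in scikit-learn, here is a minimal sketch on made-up points (not our fruit data, and not part of the original notebook): k-NN measures the distance from a query point to every training point, takes the k closest, and lets them vote on the label.

import numpy as np
from collections import Counter

def knn_predict(X_train, y_train, query, k=3):
    """Classify `query` by majority vote among its k nearest training points (Euclidean distance)."""
    dists = np.linalg.norm(X_train - query, axis=1)  # distance from the query to every training point
    nearest = np.argsort(dists)[:k]                  # indices of the k closest training points
    return Counter(y_train[nearest]).most_common(1)[0][0]

X_train = np.array([[0.0, 0.0], [0.1, 0.2], [1.0, 1.1], [0.9, 1.0]])
y_train = np.array([1, 1, 2, 2])
print(knn_predict(X_train, y_train, np.array([0.2, 0.1]), k=3))  # -> 1 (two of the three nearest points have label 1)

scikit-learn's KNeighborsClassifier does essentially this (with smarter data structures for the neighbor search), as we'll see below.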
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.colors import ListedColormap
from sklearn import neighbors
from random import uniform as random_uniform
df = pd.read_csv('fruit.csv')
fruitnames = {1: 'Orange', 2: 'Pear', 3: 'Apple'}
colors = {1: '#e09028', 2: '#55aa33', 3: '#cc3333'}
# normalize sweetness and acidity
df['sweetness_normal'] = ( df.sweetness - df.sweetness.mean() ) / df.sweetness.std()
df['acidity_normal'] = ( df.acidity - df.acidity.mean() ) / df.acidity.std()
# Create numpy arrays for classifier
X = []
y = []
for i, row in df.iterrows():
    X.append([row.sweetness_normal, row.acidity_normal])
    y.append(row.fruit_id)
X = np.array(X)
y = np.array(y)
Let's have a look at the numpy arrays; they are usually referred to as X and y (case-sensitive) in scikit-learn.
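(A quick sanity check, not in the original notebook: scikit-learn expects X to be a 2-D array of shape (n_samples, n_features) and y a 1-D array of shape (n_samples,).)

print(X.shape)  # (n_samples, 2): one row per fruit, the columns are normalized sweetness and acidity
print(y.shape)  # (n_samples,): one fruit_id label per row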
In [2]:
print(X[:10,:]) # first 10 rows of X: normalized sweetness and acidity
print(" ... etc.")
print(y) # fruit labels, 1 to 3
In [3]:
# adapted from http://scikit-learn.org/0.11/auto_examples/neighbors/plot_classification.html
for k in [1, 3, 5, 7, 9, 11, 99, 131, 177]:
    h = 0.02  # step size in the mesh
    # Create color maps
    cmap_light = ListedColormap(['#ffde9e', '#b8ea9d', '#ffbaba'])
    cmap_bold = ListedColormap(['#ff8c28', '#11bb11', '#ff0000'])
    # Create an instance of KNeighborsClassifier and fit the data
    clf = neighbors.KNeighborsClassifier(k)
    clf.fit(X, y)
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max] x [y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    # Plot the data points
    plt.scatter(X[:, 0], X[:, 1], s=40, c=y, cmap=cmap_bold)
    plt.xlim(-2.2, 2.2)  # hardcoded
    plt.ylim(-2, 3.4)
    plt.title("K-nearest neighbors of entire dataset (k = %i)" % (k))
    plt.show()
In [4]:
# Divide 70:30 train:test
def kNN_test(k=15, reps=1, prop_test=0.3, plot=True):
    """Run k-nearest neighbors on df with the given value of k, repeated 'reps' times.
    prop_test: the proportion of instances to sequester for the test set; all the rest form the training set.
    plot: if True, outputs a chart per repetition. If False, returns a tuple of
          (training set error rate, test set error rate) after the first repetition.
    Only the test set is shown on the chart, with misclassified instances drawn larger."""
    assert 0 < prop_test < 1
    for rep in range(reps):
        df['is_train'] = np.random.uniform(0, 1, len(df)) <= (1 - prop_test)
        train, test = df[df['is_train']].copy(), df[~df['is_train']].copy()
        X = []
        y = []
        for i, row in train.iterrows():
            X.append([row.sweetness_normal, row.acidity_normal])
            y.append(row.fruit_id)
        X = np.array(X)
        y = np.array(y)
        h = 0.02  # step size in the mesh
        # Create color maps
        cmap_light = ListedColormap(['#ffde9e', '#b8ea9d', '#ffbaba'])
        cmap_bold = ListedColormap(['#ff8c28', '#11bb11', '#ff0000'])
        # Create an instance of KNeighborsClassifier and fit the training data
        clf = neighbors.KNeighborsClassifier(k)
        clf.fit(X, y)
        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max] x [y_min, y_max].
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        # Evaluate the training points
        train['pred'] = clf.predict(train[['sweetness_normal', 'acidity_normal']])
        train['correct'] = train.fruit_id == train.pred
        # Predict on the testing points
        test['pred'] = clf.predict(test[['sweetness_normal', 'acidity_normal']])
        test['correct'] = test.fruit_id == test.pred
        if plot:
            plt.figure()
            plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
            plt.scatter(train.sweetness_normal, train.acidity_normal, s=1, c="#ff0000")
            plt.scatter(test[test.correct].sweetness_normal, test[test.correct].acidity_normal,
                        s=20, c=test[test.correct].pred, cmap=cmap_bold, linewidth=0)
            plt.scatter(test[~test.correct].sweetness_normal, test[~test.correct].acidity_normal,
                        s=90, c="#ffffff", linewidth=3)
            # plt.xlim(xx.min(), xx.max())  # hardcoded instead to be prettier
            # plt.ylim(yy.min(), yy.max())
            plt.xlim(-2.2, 2.2)  # hardcoded
            plt.ylim(-2, 3.4)
            if reps == 1:
                plt.title("K-Nearest Neighbors, k = {}\nTraining set: small black points\n"
                          "Correctly classified test set: colored circles\nMisclassified test set: "
                          "large black/white circles".format(k))
            else:
                plt.title("K-Nearest Neighbors, k = {}\nTraining set: small black points\n"
                          "Correctly classified test set: colored circles\nMisclassified test set: "
                          "large black/white circles\nrepetition {} of {}".format(k, rep+1, reps))
            plt.show()
        else:
            train_error_rate = len(train[~train.correct]) / len(train)
            test_error_rate = len(test[~test.correct]) / len(test)
            return train_error_rate, test_error_rate
In [6]:
kNN_test(k=1, reps=2, plot=True)
kNN_test(k=9, reps=2, plot=True)
kNN_test(k=99, reps=2, plot=True)
In [5]:
%%time
import csv
with open('kNN_test_train.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',')
    for k in [1, 3, 5, 7, 9, 11, 15, 19, 23, 27, 31, 37, 41, 47, 51, 57, 61, 71, 81, 91, 111]:
        for i in range(20):
            scores = kNN_test(k=k, reps=1, plot=False)
            csvwriter.writerow([k, scores[0], scores[1]])
Load the CSV file created above and plot how the training and test error rates vary with k.
In [5]:
dftest = pd.read_csv('kNN_test_train.csv', names=['k', 'train_error', 'test_error'])
ks = sorted(dftest.k.unique())
xs = []
ys_train = []
ys_test = []
for i, k in enumerate(ks):
    xs.append(len(ks) - i)  # plot large k on the left so model complexity increases to the right
    ys_train.append(dftest[dftest.k==k].train_error.mean())
    ys_test.append(dftest[dftest.k==k].test_error.mean())
fig = plt.figure(figsize=(12,5))
ax = fig.add_subplot(111)
ax.set_title('Testing and training error rate, average of 20 repetitions')
ax.plot(xs, ys_train, 'b-', label="training set")
ax.plot(xs, ys_test, 'r-', label="test set")
ax.set_xlim(0, 22)
ax.set_ylabel('Error rate')
ax.set_xlabel('k')
ax.set_xticks(range(1, 22))
ax.set_xticklabels(ks[::-1])  # reverse the labels to match the reversed plotting order above
ax.legend()
Out[5]: