Nearest neighbor classification

Arguably the simplest classification method.

We are given example input vectors $x_i$ and corresponding class labels $c_i$ for $i=1,\dots, N$.

The collection of pairs $\{(x_i, c_i)\}$ for $i=1,\dots,N$ is called a data set.

Just store the data set; for a new observed point $x$, find its nearest neighbor $i^*$ and report its label $c_{i^*}$:

$$ i^* = \arg\min_{i=1\dots N} D(x_i, x) $$
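
As a minimal code sketch (using the squared Euclidean distance; the function name is only illustrative), this rule is a single argmin over the stored examples:

In [ ]:
import numpy as np

def nn_classify(X, c, x):
    '''Return the label of the stored example closest to the query point x.'''
    d = np.sum((np.asarray(X) - np.asarray(x))**2, axis=1)  # squared distance to every row of X
    return c[np.argmin(d)]                                   # label of the nearest neighbor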

KNN: $k$ nearest neighbors

Find the $k$ nearest neighbors of $x$ and take a majority vote among their class labels (a code sketch of this rule appears after the 1-NN prediction example below).

Below we load the Iris data set; the columns are sepal length (sl), sepal width (sw), petal length (pl), petal width (pw) and the class label (c).


In [35]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
df = pd.read_csv('data/iris.txt', sep=' ')
df


Out[35]:
sl sw pl pw c
0 5.1 3.5 1.4 0.2 1
1 4.9 3.0 1.4 0.2 1
2 4.7 3.2 1.3 0.2 1
3 4.6 3.1 1.5 0.2 1
4 5.0 3.6 1.4 0.2 1
5 5.4 3.9 1.7 0.4 1
6 4.6 3.4 1.4 0.3 1
7 5.0 3.4 1.5 0.2 1
8 4.4 2.9 1.4 0.2 1
9 4.9 3.1 1.5 0.1 1
10 5.4 3.7 1.5 0.2 1
11 4.8 3.4 1.6 0.2 1
12 4.8 3.0 1.4 0.1 1
13 4.3 3.0 1.1 0.1 1
14 5.8 4.0 1.2 0.2 1
15 5.7 4.4 1.5 0.4 1
16 5.4 3.9 1.3 0.4 1
17 5.1 3.5 1.4 0.3 1
18 5.7 3.8 1.7 0.3 1
19 5.1 3.8 1.5 0.3 1
20 5.4 3.4 1.7 0.2 1
21 5.1 3.7 1.5 0.4 1
22 4.6 3.6 1.0 0.2 1
23 5.1 3.3 1.7 0.5 1
24 4.8 3.4 1.9 0.2 1
25 5.0 3.0 1.6 0.2 1
26 5.0 3.4 1.6 0.4 1
27 5.2 3.5 1.5 0.2 1
28 5.2 3.4 1.4 0.2 1
29 4.7 3.2 1.6 0.2 1
... ... ... ... ... ...
120 6.9 3.2 5.7 2.3 3
121 5.6 2.8 4.9 2.0 3
122 7.7 2.8 6.7 2.0 3
123 6.3 2.7 4.9 1.8 3
124 6.7 3.3 5.7 2.1 3
125 7.2 3.2 6.0 1.8 3
126 6.2 2.8 4.8 1.8 3
127 6.1 3.0 4.9 1.8 3
128 6.4 2.8 5.6 2.1 3
129 7.2 3.0 5.8 1.6 3
130 7.4 2.8 6.1 1.9 3
131 7.9 3.8 6.4 2.0 3
132 6.4 2.8 5.6 2.2 3
133 6.3 2.8 5.1 1.5 3
134 6.1 2.6 5.6 1.4 3
135 7.7 3.0 6.1 2.3 3
136 6.3 3.4 5.6 2.4 3
137 6.4 3.1 5.5 1.8 3
138 6.0 3.0 4.8 1.8 3
139 6.9 3.1 5.4 2.1 3
140 6.7 3.1 5.6 2.4 3
141 6.9 3.1 5.1 2.3 3
142 5.8 2.7 5.1 1.9 3
143 6.8 3.2 5.9 2.3 3
144 6.7 3.3 5.7 2.5 3
145 6.7 3.0 5.2 2.3 3
146 6.3 2.5 5.0 1.9 3
147 6.5 3.0 5.2 2.0 3
148 6.2 3.4 5.4 2.3 3
149 5.9 3.0 5.1 1.8 3

150 rows × 5 columns


In [36]:
X = np.hstack([
        np.matrix(df.sl).T,
        np.matrix(df.sw).T,
        np.matrix(df.pl).T,
        np.matrix(df.pw).T])

print(X[:5])  # first five feature rows

c = np.matrix(df.c).T
print(c[:5])  # first five class labels


[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]]
[[1]
 [1]
 [1]
 [1]
 [1]]

The choice of the distance function (divergence) can be important. In practice, a popular choice is the Euclidean distance, but it is by no means the only one.


In [37]:
def Divergence(x, y, p=2.):
    '''Divergence between x and y.
       If p is a scalar, return the Minkowski-style divergence sum(|x - y|**p);
       if p is a weight matrix W, return the quadratic form (x - y) W (x - y)^T.'''
    e = np.array(x) - np.array(y)
    if np.isscalar(p):
        return np.sum(np.abs(e)**p)
    else:
        return np.sum(np.matrix(e)*p*np.matrix(e).T)

Divergence([0, 0], [1, 1], p=2)
W = np.matrix(np.diag([2, 1]))
Divergence([0, 0], [1, 1], p=W)
W = np.matrix([[2, 1], [1, 2]])
Divergence([0, 0], [1, 1], p=W)


Out[37]:
6
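
For reference, SciPy (if available) provides the same families of distances. Note that Divergence omits the final $1/p$ root, i.e. it returns $D^p$ (or the quadratic form $e^\top W e$ when a weight matrix is supplied); this monotone transform does not change which neighbor is nearest:

In [ ]:
import numpy as np
from scipy.spatial import distance

distance.minkowski([0, 0], [1, 1], 2)**2     # 2.0, matches Divergence(..., p=2)
distance.cityblock([0, 0], [1, 1])           # 2,   matches Divergence(..., p=1)
V = np.array([[2., 1.], [1., 2.]])
# mahalanobis expects an inverse covariance matrix; here V is used simply as a weight matrix
distance.mahalanobis([0, 0], [1, 1], V)**2   # 6.0, matches Divergence(..., p=W) above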

Equal distance contours


In [27]:
%run plot_normballs.py
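
The script plot_normballs.py is not reproduced here; below is an illustrative sketch of what such a figure typically shows, drawing the unit balls $\{x : D(0, x) \le 1\}$ of the scalar-$p$ divergence for a few values of $p$ (the layout is an assumption, not the actual script):

In [ ]:
import numpy as np
import matplotlib.pyplot as plt

xx, yy = np.meshgrid(np.linspace(-1.5, 1.5, 300), np.linspace(-1.5, 1.5, 300))
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
for ax, p in zip(axes, [1, 2, 4]):
    Z = np.abs(xx)**p + np.abs(yy)**p      # D(0, (x, y)) with the scalar-p divergence
    ax.contour(xx, yy, Z, levels=[1.0])    # the equal-distance contour D = 1
    ax.set_title('p = %g' % p)
    ax.set_aspect('equal')
plt.show()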



In [39]:
def nearest(A, x, p=2):
    '''A: NxD data matrix, N - number of samples, D - number of features
       x: test vector

       returns the index of the nearest neighbor of x in A
    '''
    N = A.shape[0]
    d = np.zeros((N, 1))

    md = np.inf

    for i in range(N):
        d[i] = Divergence(A[i, :], x, p)
        if d[i] < md:
            md = d[i]
            min_idx = i

    return min_idx

def predict(A, c, X, p=2):
    '''Predict the class of every row of X by the label of its nearest neighbor in A.'''
    L = X.shape[0]
    return [int(c[nearest(A, X[i, :], p=p)]) for i in range(L)]

x_test = np.mat('[3.3, 2.5, 5.5, 1.7]')

cc = predict(X, c, x_test)

print(cc)


[3]
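
The nearest/predict pair above implements only the $k=1$ case. A sketch of the $k$-nearest-neighbor rule with majority voting, reusing Divergence (the function name and tie-breaking are illustrative, not part of the original notebook):

In [ ]:
def knn_predict(A, c, x, k=3, p=2):
    '''Classify x by a majority vote among its k nearest rows of A.'''
    N = A.shape[0]
    d = np.array([Divergence(A[i, :], x, p) for i in range(N)])  # distance to every stored example
    idx = np.argsort(d)[:k]                                      # indices of the k closest examples
    labels = np.asarray(c)[idx].ravel()                          # their class labels
    values, counts = np.unique(labels, return_counts=True)
    return values[np.argmax(counts)]                             # the most frequent label wins

knn_predict(X, c, x_test, k=3)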

In [40]:
def leave_one_out(A, c, p=2):
    '''Leave-one-out accuracy of the 1-NN rule on the data set (A, c).'''
    N = A.shape[0]
    correct = 0
    for j in range(N):                       # hold out example j
        md = np.inf
        for i in range(N):
            if i != j:                       # search among the remaining examples
                d = Divergence(A[i, :], A[j, :], p=p)
                if d < md:
                    md = d
                    min_idx = i
        if c[min_idx] == c[j]:               # does the nearest neighbor have the correct label?
            correct += 1

    accuracy = 1.*correct/N
    return accuracy

In [52]:
leave_one_out(X, c, p=np.diag([1,1,1,1]))  # identity weight matrix, i.e. squared Euclidean distance


Out[52]:
0.96
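
For comparison, the same leave-one-out protocol can be run with scikit-learn's built-in tools (a sketch assuming scikit-learn is installed; with the plain Euclidean distance it should give essentially the same accuracy, up to tie-breaking):

In [ ]:
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=1)        # the 1-NN classifier
scores = cross_val_score(knn, np.asarray(X), np.asarray(c).ravel(),
                         cv=LeaveOneOut())       # one held-out example per fold
scores.mean()                                    # leave-one-out accuracy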

In [3]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets

n_neighbors = 3

# load the Iris data; we keep only the first two features so that the
# decision regions can be plotted in 2D, and add a small amount of noise
iris = datasets.load_iris()
X = iris.data[:, :2] + 0.02*np.random.randn(150, 2)
y = iris.target

h = .02 # step size in the mesh

# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

weights='uniform'
# we create an instance of Neighbours Classifier and fit the data.
clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
clf.fit(X, y)

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(figsize=(8,8))
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("3-Class classification (k = %i, weights = '%s')"
          % (n_neighbors, weights))
plt.axis('equal')
plt.show()