In [1]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import scipy
import scipy.spatial
import math

# Seed NumPy's global random number generator so everyone generates the exact same dataset
np.random.seed( seed=2503865 )

In [2]:
# Two Gaussian blobs of points. They are called "clumps" rather than "clusters"
# to avoid confusion later.
sigma = 0.05  # standard deviation used for every coordinate of both clumps

clump1_N = 100
clump2_N = clump1_N  # both clumps hold the same number of points

# The four coordinate lists are drawn strictly in the order x1, y1, x2, y2.
# They all consume the same global random stream, so reordering these lines
# would change the generated dataset.

# Clump 1 is centered on (0.25, 0.75)
clump1_x = [ np.random.normal(0.25, sigma) for pt in range(clump1_N) ]
clump1_y = [ np.random.normal(0.75, sigma) for pt in range(clump1_N) ]

# Clump 2 is centered on (0.75, 0.25)
clump2_x = [ np.random.normal(0.75, sigma) for pt in range(clump2_N) ]
clump2_y = [ np.random.normal(0.25, sigma) for pt in range(clump2_N) ]

In [3]:
# Scatter plot of the two clumps, each point colored by the clump it came from.
clump1_color, clump2_color = 0, 1
clump_area = 75  # passed to scatter as the `s` argument for every point

# Clump-1 points come first, then clump-2 points; colors/areas follow the same order
points_x = clump1_x + clump2_x
points_y = clump1_y + clump2_y

colors = [ clump1_color ] * clump1_N + [ clump2_color ] * clump2_N
areas = [ clump_area ] * (clump1_N + clump2_N)

plt.scatter( points_x, points_y, c=colors, s=areas )


Out[3]:
<matplotlib.collections.PathCollection at 0x1060c5710>
/Users/junwang/miniconda3/envs/py2/lib/python2.7/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

In [4]:
# A small third clump centered on (0.5, 0.5); later cells use these points as kNN queries.
clump3_N = 5
# All x coordinates are drawn before any y coordinate (same order of random draws as before)
clump3_x = [ np.random.normal(0.5, sigma) for pt in range(clump3_N) ]
clump3_y = [ np.random.normal(0.5, sigma) for pt in range(clump3_N) ]

In [5]:
# Scatter plot of all three clumps; clump 3 gets a color value halfway between the other two.
clump1_color, clump2_color, clump3_color = 0, 1, 0.5
clump_area = 75  # passed to scatter as the `s` argument for every point

# Point order is clump 1, clump 2, clump 3; colors/areas follow the same order
points_x = clump1_x + clump2_x + clump3_x
points_y = clump1_y + clump2_y + clump3_y

colors = [ clump1_color ] * clump1_N + [ clump2_color ] * clump2_N + [ clump3_color ] * clump3_N
areas = [ clump_area ] * (clump1_N + clump2_N + clump3_N)

plt.scatter( points_x, points_y, c=colors, s=areas )
# Uncomment to also write the figure to disk:
#plt.savefig('../images/instance_based_learning_001.png')


Out[5]:
<matplotlib.collections.PathCollection at 0x106252610>

In [6]:
# Exhaustive (brute-force) search for kNN
# Method:
# 1. Set up the dataset of labelled points (clump 1 followed by clump 2)
# 2. Calculate the distance from every labelled point to each of the 5 clump-3 points
# 3. For each of these 5 points, find the 10 labelled points with the smallest distance (k = 10)
# 4. Let those neighbors vote for the class

# Set up the dataset for exhaustive searching
points_x = clump1_x + clump2_x
points_y = clump1_y + clump2_y
clump_N = clump1_N + clump2_N

# distance[i][j] is the Euclidean distance from clump-3 point i to labelled point j
distance = [
    [ math.sqrt(pow(qx - px, 2) + pow(qy - py, 2)) for px, py in zip(points_x, points_y) ]
    for qx, qy in zip(clump3_x, clump3_y)
]

# Find the k nearest labelled points for each of the 5 points, k = 10
k = 10
# new_distance[i] holds the distances of clump-3 point i in ascending order
new_distance = [ sorted(row) for row in distance ]
# index_book[i] holds the indices of the k nearest labelled points, nearest first.
# The indices themselves are sorted by distance (a stable sort) instead of looking
# each sorted distance up with list.index(): that lookup returns the same index
# twice when two points are equally far away, and rescans the list per neighbor.
index_book = [ sorted(range(clump_N), key=row.__getitem__)[:k] for row in distance ]

# Vote for the classes for these 5 points.
# Indices below clump1_N are clump-1 points, so knn_vote[i] counts the clump-1 neighbors.
# Built as a list (not by assigning into range(...)) so it also works on Python 3.
knn_vote = [ sum(1 for idx in neighbors if idx < clump1_N) for neighbors in index_book ]

# Draw the scatter picture (copied from the plotting code used elsewhere, slightly modified)
points_x = clump1_x + clump2_x + clump3_x
points_y = clump1_y + clump2_y + clump3_y
clump1_color = 0
clump2_color = 1
clump3_color = 0.5
clump_area = 75
colors = [ clump1_color ] * clump1_N
colors += [ clump2_color ] * clump2_N
# Clump 1 needs a strict majority of the k votes; a tie goes to clump 2.
# `2 * votes > k` is the division-free form of `votes > k / 2`.
colors += [ clump1_color if ( 2 * votes > k ) else clump2_color for votes in knn_vote ]
areas = [ clump_area ] * (clump_N + clump3_N)

plt.scatter( points_x, points_y, c=colors, s=areas )


Out[6]:
<matplotlib.collections.PathCollection at 0x106378150>

In [7]:
# Classify the single query point (0.5, 0.5) with a k-d tree built from the labelled points.
points_x = clump1_x + clump2_x
points_y = clump1_y + clump2_y

# One (x, y) row per labelled point. list(...) materializes the pairs so this also
# works where zip returns an iterator (Python 3); a plain ndarray is used, matching
# the batched query cell.
dataset = np.array( list(zip(points_x, points_y)) )

kdtree = scipy.spatial.KDTree( dataset )

# query_result[1] holds the indices (rows of `dataset`) of the 10 nearest neighbors
query_result = kdtree.query( [0.5, 0.5], k=10 )

# Rows 0 .. clump1_N-1 are clump-1 points and row clump1_N is already the first
# clump-2 point, so the comparison must be strict (`<`, not `<=`).
clump1_vote = int( np.sum( np.asarray(query_result[1]) < clump1_N ) )

In [8]:
# Classify every clump-3 point with a k-d tree built from the labelled points
# (clump 1 followed by clump 2).
# list(zip(...)) materializes the (x, y) pairs: on Python 3 zip returns an iterator,
# which np.array would wrap as a single object instead of a 2-column array.
dataset = np.array( list(zip((clump1_x + clump2_x), (clump1_y + clump2_y))) )
kdtree = scipy.spatial.KDTree( dataset )

kNN_k = 11  # odd, so a two-class vote cannot tie

query_dataset = np.array( list(zip(clump3_x, clump3_y)) )
query_result = kdtree.query( query_dataset, k=kNN_k )

# query_result[1] has one row of neighbor indices per query point; indices below
# clump1_N are clump-1 points, so each entry counts the clump-1 neighbors.
# np.sum of an empty list is the float 0.0, which is why a query point with no
# clump-1 neighbor shows up as 0.0 rather than 0.
query_votes = [ np.sum( [ 1 for nbr_idx in row_result if nbr_idx < clump1_N ] ) for row_result in query_result[1] ]
query_votes


Out[8]:
[0.0, 10, 2, 11, 11]

In [9]:
# Scatter plot of all three clumps, with every clump-3 point colored by the class
# that its kNN_k nearest neighbors voted for.
clump1_color, clump2_color, clump3_color = 0, 1, 0.5
clump_area = 75  # passed to scatter as the `s` argument for every point

# Point order is clump 1, clump 2, clump 3; colors/areas follow the same order
points_x = clump1_x + clump2_x + clump3_x
points_y = clump1_y + clump2_y + clump3_y

colors = [ clump1_color ] * clump1_N + [ clump2_color ] * clump2_N
for idx in range(clump3_N):
    # query_votes[idx] counts clump-1 neighbors; clump 1 wins with more than half of the kNN_k votes
    if query_votes[idx] * 2 > kNN_k:
        colors.append( clump1_color )
    else:
        colors.append( clump2_color )
areas = [ clump_area ] * (clump1_N + clump2_N + clump3_N)

plt.scatter( points_x, points_y, c=colors, s=areas )
# Uncomment to also write the figure to disk:
#plt.savefig('../images/instance_based_learning_002.png')


Out[9]:
<matplotlib.collections.PathCollection at 0x1065bbe50>

In [58]:
# Taken from some code for plotting a KD-tree in Python available here:
# https://salzis.wordpress.com/2014/06/28/kd-tree-and-nearest-neighbor-nn-search-2d-case/

from collections import namedtuple
from operator import itemgetter
from pprint import pformat

class Node(namedtuple('Node', 'location left_child right_child')):
    """ K-D tree node: a point (location) plus its left and right subtrees """

    def __repr__(self):
        # Show the node as a plain (location, left_child, right_child) tuple,
        # formatted by pprint, instead of the default Node(...) representation.
        as_tuple = tuple(self)
        return pformat(as_tuple)
 
def viz_kdtree(point_list, depth=0):
    """ build K-D tree
    :param point_list iterable of input points (a list, or any iterator such as zip(xs, ys));
                      it is copied, so the caller's sequence is left untouched
    :param depth      current tree's depth (selects the splitting axis)
    :return tree node (Node), or None when point_list is empty
    """

    # Work on a private copy: sorting below must not reorder the caller's list,
    # and an iterator (e.g. Python 3's zip) can be neither indexed nor sorted.
    points = list(point_list)
    if not points:
        return None

    # assumes all points have the same dimension
    k = len(points[0])

    # Select axis based on depth so that axis cycles through
    # all valid values
    axis = depth % k

    # Sort the points along that axis and choose the median as pivot element
    points.sort(key=itemgetter(axis))
    median = len(points) // 2

    # Create node and construct subtrees: points before the median go left,
    # points after it go right
    return Node(
        location=points[median],
        left_child=viz_kdtree(points[:median], depth + 1),
        right_child=viz_kdtree(points[median + 1:], depth + 1)
    )

 
def plot_tree(tree, min_x, max_x, min_y, max_y, prev_node, branch, depth=0):
    """ plot K-D tree
    Draws, with plt.plot, the splitting line and the point of every node, recursing
    into both subtrees. Line widths are read from the module-level list line_width,
    indexed by depth (the last entry is reused for deeper levels).

    :param tree      input tree to be plotted; None (an empty tree) draws nothing
    :param min_x     smallest x used for horizontal splitting lines
    :param max_x     largest x used for horizontal splitting lines
    :param min_y     smallest y used for vertical splitting lines
    :param max_y     largest y used for vertical splitting lines
    :param prev_node parent's node (None for the root)
    :param branch    True if left, False if right (None for the root)
    :param depth     tree's depth
    :return None
    """

    # viz_kdtree returns None for an empty point list; there is nothing to draw then
    if tree is None:
        return

    cur_node = tree.location         # current tree's node
    left_branch = tree.left_child    # its left branch
    right_branch = tree.right_child  # its right branch

    # set line's width depending on tree's depth; levels deeper than the list
    # reuse its last entry
    ln_width = line_width[min(depth, len(line_width) - 1)]

    k = len(cur_node)
    axis = depth % k

    # The root is called with prev_node and branch set to None and keeps the full extent
    has_parent = branch is not None and prev_node is not None

    # draw a vertical splitting line
    if axis == 0:

        # clip the line at the parent's y: a left child stops below it,
        # a right child starts at it
        if has_parent:
            if branch:
                max_y = prev_node[1]
            else:
                min_y = prev_node[1]

        plt.plot([cur_node[0],cur_node[0]], [min_y,max_y], linestyle='-', color='red', linewidth=ln_width)

    # draw a horizontal splitting line
    elif axis == 1:

        # clip the line at the parent's x: a left child stops before it,
        # a right child starts at it
        if has_parent:
            if branch:
                max_x = prev_node[0]
            else:
                min_x = prev_node[0]

        plt.plot([min_x,max_x], [cur_node[1],cur_node[1]], linestyle='-', color='blue', linewidth=ln_width)

    # draw the current node
    plt.plot(cur_node[0], cur_node[1], 'ko')

    # draw left and right branches of the current node, passing on the clipped extent
    if left_branch is not None:
        plot_tree(left_branch, min_x, max_x, min_y, max_y, cur_node, True, depth+1)

    if right_branch is not None:
        plot_tree(right_branch, min_x, max_x, min_y, max_y, cur_node, False, depth+1)

In [65]:
# A wider clump of a few points, used to show how the k-d tree splits the plane.
# NOTE: this rebinds the notebook-wide `sigma` (set to 0.05 for the first clumps);
# re-running the earlier clump cells after this one would pick up 0.25.
sigma = 0.25

clump4_N = 10  # alternative value left by the author: 500
clump4_x = [ np.random.normal(loc=0.5,scale=sigma) for i in range(clump4_N) ]
clump4_y = [ np.random.normal(loc=0.5,scale=sigma) for i in range(clump4_N) ]

# list(...) because viz_kdtree indexes and sorts its input, which the iterator
# returned by zip on Python 3 does not support
vkdtree = viz_kdtree( list(zip(clump4_x, clump4_y)) )

# line width for visualization of K-D tree (read by plot_tree as a global, one entry per depth)
line_width = [4., 3.5, 3., 2.5, 2., 1.5, 1., .5, 0.3]

min_val = 0
max_val = 1
delta = 0

plt.figure("K-d Tree", figsize=(10., 10.))
plt.axis( [min_val-delta, max_val+delta, min_val-delta, max_val+delta] )

# The on/off flag is passed positionally: its keyword was renamed from `b` to
# `visible` in newer Matplotlib releases, so `b=True` no longer works everywhere.
plt.grid(True, which='major', color='0.75', linestyle='--')
# NOTE(review): range() excludes its stop value, so with min_val=0 and max_val=1
# this yields a single tick at 0 -- confirm that is intended.
plt.xticks(list(range(min_val-delta, max_val+delta, 1)))
plt.yticks(list(range(min_val-delta, max_val+delta, 1)))

# draw the tree
plot_tree(vkdtree, min_val-delta, max_val+delta, min_val-delta, max_val+delta, None, None)

plt.title('K-D Tree')
#plt.show()
plt.savefig('../images/instance_based_learning_003.png')



In [66]:
# K-d tree over the labelled points (clump 1 followed by clump 2).
# list(...) because viz_kdtree indexes and sorts its input, which the iterator
# returned by zip on Python 3 does not support
vkdtree = viz_kdtree( list(zip((clump1_x + clump2_x), (clump1_y + clump2_y))) )

# line width for visualization of K-D tree (read by plot_tree as a global, one entry per depth)
line_width = [4., 3.5, 3., 2.5, 2., 1.5, 1., .5, 0.3]

min_val = 0
max_val = 1
delta = 0

plt.figure("K-d Tree", figsize=(10., 10.))
plt.axis( [min_val-delta, max_val+delta, min_val-delta, max_val+delta] )

# The on/off flag is passed positionally: its keyword was renamed from `b` to
# `visible` in newer Matplotlib releases, so `b=True` no longer works everywhere.
plt.grid(True, which='major', color='0.75', linestyle='--')
# NOTE(review): range() excludes its stop value, so with min_val=0 and max_val=1
# this yields a single tick at 0 -- confirm that is intended.
plt.xticks(list(range(min_val-delta, max_val+delta, 1)))
plt.yticks(list(range(min_val-delta, max_val+delta, 1)))

# draw the tree
plot_tree(vkdtree, min_val-delta, max_val+delta, min_val-delta, max_val+delta, None, None)

plt.title('K-D Tree')
#plt.show()
plt.savefig('../images/instance_based_learning_004.png')



In [ ]: