In [1]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.datasets import load_iris
from sklearn.metrics import euclidean_distances
from numba import jit

In [2629]:
iris = load_iris()
data = iris.data

In [2631]:
data = np.random.randint(0, 20000, size=(100000, 2))

In [2632]:
findNemo = NearestNeighbors(n_neighbors=16)
%time findNemo.fit(data)
%time coassoc, neighbors = findNemo.kneighbors(data, n_neighbors=15, return_distance=True)


CPU times: user 82.7 ms, sys: 0 ns, total: 82.7 ms
Wall time: 81.8 ms
CPU times: user 640 ms, sys: 7.3 ms, total: 647 ms
Wall time: 647 ms

In [2633]:
#remove first col
coassoc = coassoc[:,1:]
neighbors = neighbors[:,1:]

In [2635]:
Z = np.empty((data.shape[0] - 1, 3), dtype = coassoc.dtype)
Z_pointer = 0
n_samples = coassoc.shape[0]
track = np.arange(data.shape[0], dtype = np.int32)

In [71]:
a_min = coassoc.argmin()
pattern, neigh = a_min // 4, a_min % 4

In [ ]:
%time knn_slhac_fast(coassoc, neighbors, Z)

In [ ]:
def knn_slhac(weights, neighbors, Z):
    n_samples, n_neighbors = weights.shape
    
    track = np.arange(data.shape[0], dtype = np.int32)

    Z_pointer = 0
    while Z_pointer != n_samples - 1:
        a_min = weights.argmin()
        pattern, neigh_idx = a_min // n_neighbors, a_min % n_neighbors
        
        # get neighbor
        neigh = neighbors[pattern, neigh_idx]

        pattern_track = track[pattern]
        neigh_track = track[neigh]        
        
        # unconnected clusters
        # pick any two different clusters and cluster them
        if weights[pattern, neigh_idx] == np.inf:
            clust1 = track[0]
            for i in range(1, n_samples):
                clust2 = track[i]
                if clust1 != clust2:
                    # update the clusters of the samples in track
                    track[track == clust1] = n_samples + Z_pointer
                    track[track == clust2] = n_samples + Z_pointer

                    # add cluster to Z
                    Z[Z_pointer, 0] = pattern_track
                    Z[Z_pointer, 1] = neigh_track
                    Z[Z_pointer, 2] = np.inf
                    Z_pointer += 1
                    break
            continue

        # check if patterns belong to same cluster
        if pattern_track != neigh_track:

            # update the clusters of the samples in track
            track[track == pattern_track] = n_samples + Z_pointer
            track[track == neigh_track] = n_samples + Z_pointer

            # add cluster to Z
            Z[Z_pointer, 0] = pattern_track
            Z[Z_pointer, 1] = neigh_track
            Z[Z_pointer, 2] = links[pattern, neigh_idx]
            Z_pointer += 1

        # remove distance in coassoc
        weights[pattern, neigh_idx] = np.inf
        
@jit(nopython=True)
def knn_slhac_fast(weights, neighbors, Z):
    n_samples, n_neighbors = coassoc.shape
    
    track = np.empty(n_samples, dtype = np.int32)
    for i in range(n_samples):
        track[i] = i

    Z_pointer = 0
    while Z_pointer != n_samples - 1:
        a_min = weights.argmin()
        pattern = a_min // n_neighbors
        neigh_idx = a_min % n_neighbors

        # get neighbor
        neigh = neighbors[pattern, neigh_idx]

        # get clusters of origin and destination
        pattern_track = track[pattern]
        neigh_track = track[neigh]        
        
        # unconnected clusters
        # pick any two different clusters and cluster them
        if weights[pattern, neigh_idx] == np.inf:
            clust1 = track[0]
            for i in range(1, n_samples):
                clust2 = track[i]
                if clust1 != clust2:
                    new_clust = n_samples + Z_pointer
                    # update the clusters of the samples in track
                    for i in range(n_samples):
                        i_clust = track[i]
                        if i_clust == pattern_track or i_clust == neigh_track:
                            track[i] = new_clust

                    # add cluster to Z
                    Z[Z_pointer, 0] = pattern_track
                    Z[Z_pointer, 1] = neigh_track
                    Z[Z_pointer, 2] = np.inf
                    Z_pointer += 1
                    break
            continue

        # check if patterns belong to same cluster
        if pattern_track != neigh_track:

            new_clust = n_samples + Z_pointer
            # update the clusters of the samples in track
            for i in range(n_samples):
                i_clust = track[i]
                if i_clust == pattern_track or i_clust == neigh_track:
                    track[i] = new_clust

            # add cluster to Z
            Z[Z_pointer, 0] = pattern_track
            Z[Z_pointer, 1] = neigh_track
            Z[Z_pointer, 2] = weights[pattern, neigh_idx]
            Z_pointer += 1

        # remove distance in coassoc
        weights[pattern, neigh_idx] = np.inf

In [2623]:
%time knn_slhac(coassoc, neighbors, Z)


CPU times: user 58.5 ms, sys: 0 ns, total: 58.5 ms
Wall time: 59.5 ms

In [2624]:
Z


Out[2624]:
array([[  9.00000000e+00,   3.70000000e+01,   0.00000000e+00],
       [  1.50000000e+02,   3.40000000e+01,   0.00000000e+00],
       [  1.01000000e+02,   1.42000000e+02,   0.00000000e+00],
       [  7.00000000e+00,   3.90000000e+01,   1.00000000e-01],
       [  0.00000000e+00,   1.70000000e+01,   1.00000000e-01],
       [  1.28000000e+02,   1.32000000e+02,   1.00000000e-01],
       [  1.00000000e+01,   4.80000000e+01,   1.00000000e-01],
       [  1.54000000e+02,   4.00000000e+01,   1.41421356e-01],
       [  1.90000000e+01,   2.10000000e+01,   1.41421356e-01],
       [  1.57000000e+02,   4.00000000e+00,   1.41421356e-01],
       [  2.90000000e+01,   3.00000000e+01,   1.41421356e-01],
       [  5.70000000e+01,   9.30000000e+01,   1.41421356e-01],
       [  8.00000000e+01,   8.10000000e+01,   1.41421356e-01],
       [  1.16000000e+02,   1.37000000e+02,   1.41421356e-01],
       [  8.00000000e+00,   3.80000000e+01,   1.41421356e-01],
       [  1.58000000e+02,   4.60000000e+01,   1.41421356e-01],
       [  1.59000000e+02,   1.53000000e+02,   1.41421356e-01],
       [  3.00000000e+00,   4.70000000e+01,   1.41421356e-01],
       [  1.66000000e+02,   4.90000000e+01,   1.41421356e-01],
       [  2.70000000e+01,   2.80000000e+01,   1.41421356e-01],
       [  8.20000000e+01,   9.20000000e+01,   1.41421356e-01],
       [  9.50000000e+01,   9.60000000e+01,   1.41421356e-01],
       [  1.27000000e+02,   1.38000000e+02,   1.41421356e-01],
       [  2.00000000e+00,   1.67000000e+02,   1.41421356e-01],
       [  1.00000000e+00,   4.50000000e+01,   1.41421356e-01],
       [  1.74000000e+02,   1.20000000e+01,   1.41421356e-01],
       [  1.68000000e+02,   1.69000000e+02,   1.41421356e-01],
       [  6.30000000e+01,   9.10000000e+01,   1.41421356e-01],
       [  6.50000000e+01,   7.50000000e+01,   1.41421356e-01],
       [  1.71000000e+02,   9.90000000e+01,   1.41421356e-01],
       [  6.90000000e+01,   1.62000000e+02,   1.73205081e-01],
       [  1.23000000e+02,   1.26000000e+02,   1.73205081e-01],
       [  1.75000000e+02,   1.51000000e+02,   1.73205081e-01],
       [  1.12000000e+02,   1.39000000e+02,   1.73205081e-01],
       [  1.82000000e+02,   1.60000000e+02,   1.73205081e-01],
       [  1.73000000e+02,   1.84000000e+02,   1.73205081e-01],
       [  9.40000000e+01,   1.79000000e+02,   1.73205081e-01],
       [  8.80000000e+01,   1.86000000e+02,   1.73205081e-01],
       [  6.60000000e+01,   8.40000000e+01,   2.00000000e-01],
       [  7.80000000e+01,   1.77000000e+02,   2.00000000e-01],
       [  2.30000000e+01,   2.60000000e+01,   2.00000000e-01],
       [  1.85000000e+02,   2.50000000e+01,   2.00000000e-01],
       [  1.64000000e+02,   4.20000000e+01,   2.00000000e-01],
       [  5.30000000e+01,   8.90000000e+01,   2.00000000e-01],
       [  7.40000000e+01,   9.70000000e+01,   2.00000000e-01],
       [  1.92000000e+02,   1.91000000e+02,   2.23606798e-01],
       [  1.10000000e+01,   1.95000000e+02,   2.23606798e-01],
       [  6.00000000e+00,   1.96000000e+02,   2.23606798e-01],
       [  3.50000000e+01,   1.76000000e+02,   2.23606798e-01],
       [  1.90000000e+02,   4.30000000e+01,   2.23606798e-01],
       [  1.98000000e+02,   1.56000000e+02,   2.23606798e-01],
       [  1.89000000e+02,   7.30000000e+01,   2.23606798e-01],
       [  7.00000000e+01,   1.72000000e+02,   2.23606798e-01],
       [  2.00000000e+02,   1.99000000e+02,   2.23606798e-01],
       [  2.03000000e+02,   1.97000000e+02,   2.23606798e-01],
       [  1.10000000e+02,   1.47000000e+02,   2.23606798e-01],
       [  1.20000000e+02,   1.43000000e+02,   2.23606798e-01],
       [  1.36000000e+02,   1.48000000e+02,   2.44948974e-01],
       [  5.80000000e+01,   1.78000000e+02,   2.44948974e-01],
       [  5.40000000e+01,   2.08000000e+02,   2.44948974e-01],
       [  6.70000000e+01,   1.70000000e+02,   2.44948974e-01],
       [  1.80000000e+02,   1.93000000e+02,   2.44948974e-01],
       [  1.81000000e+02,   1.46000000e+02,   2.44948974e-01],
       [  1.65000000e+02,   2.04000000e+02,   2.44948974e-01],
       [  1.03000000e+02,   1.63000000e+02,   2.44948974e-01],
       [  1.40000000e+02,   1.44000000e+02,   2.44948974e-01],
       [  1.30000000e+01,   2.13000000e+02,   2.44948974e-01],
       [  1.41000000e+02,   1.45000000e+02,   2.44948974e-01],
       [  2.12000000e+02,   2.02000000e+02,   2.44948974e-01],
       [  6.80000000e+01,   8.70000000e+01,   2.64575131e-01],
       [  2.10000000e+02,   1.87000000e+02,   2.64575131e-01],
       [  1.94000000e+02,   2.09000000e+02,   2.64575131e-01],
       [  1.52000000e+02,   1.13000000e+02,   2.64575131e-01],
       [  5.00000000e+01,   5.20000000e+01,   2.64575131e-01],
       [  9.00000000e+01,   2.20000000e+02,   2.64575131e-01],
       [  2.11000000e+02,   2.24000000e+02,   2.64575131e-01],
       [  5.10000000e+01,   5.60000000e+01,   2.64575131e-01],
       [  1.07000000e+02,   1.30000000e+02,   2.64575131e-01],
       [  2.06000000e+02,   2.15000000e+02,   2.64575131e-01],
       [  1.05000000e+02,   1.22000000e+02,   2.64575131e-01],
       [  2.18000000e+02,   1.49000000e+02,   2.82842712e-01],
       [  2.00000000e+01,   3.10000000e+01,   2.82842712e-01],
       [  2.23000000e+02,   8.60000000e+01,   2.82842712e-01],
       [  2.16000000e+02,   2.40000000e+01,   3.00000000e-01],
       [  2.28000000e+02,   1.24000000e+02,   3.00000000e-01],
       [  2.33000000e+02,   3.60000000e+01,   3.00000000e-01],
       [  2.31000000e+02,   2.35000000e+02,   3.00000000e-01],
       [  6.10000000e+01,   2.25000000e+02,   3.00000000e-01],
       [  1.15000000e+02,   2.07000000e+02,   3.00000000e-01],
       [  1.04000000e+02,   1.55000000e+02,   3.00000000e-01],
       [  5.50000000e+01,   1.88000000e+02,   3.00000000e-01],
       [  2.26000000e+02,   2.21000000e+02,   3.16227766e-01],
       [  2.22000000e+02,   1.21000000e+02,   3.16227766e-01],
       [  2.40000000e+02,   2.37000000e+02,   3.16227766e-01],
       [  2.41000000e+02,   2.32000000e+02,   3.16227766e-01],
       [  2.44000000e+02,   7.70000000e+01,   3.16227766e-01],
       [  2.45000000e+02,   7.60000000e+01,   3.16227766e-01],
       [  2.43000000e+02,   2.01000000e+02,   3.31662479e-01],
       [  8.30000000e+01,   1.33000000e+02,   3.31662479e-01],
       [  5.00000000e+00,   1.80000000e+01,   3.31662479e-01],
       [  2.42000000e+02,   2.30000000e+02,   3.31662479e-01],
       [  7.10000000e+01,   2.46000000e+02,   3.31662479e-01],
       [  2.14000000e+02,   2.39000000e+02,   3.31662479e-01],
       [  1.11000000e+02,   2.05000000e+02,   3.46410162e-01],
       [  1.83000000e+02,   2.34000000e+02,   3.46410162e-01],
       [  2.49000000e+02,   2.36000000e+02,   3.46410162e-01],
       [  2.55000000e+02,   1.60000000e+01,   3.46410162e-01],
       [  2.51000000e+02,   2.47000000e+02,   3.46410162e-01],
       [  3.20000000e+01,   3.30000000e+01,   3.46410162e-01],
       [  2.58000000e+02,   2.56000000e+02,   3.46410162e-01],
       [  1.25000000e+02,   1.29000000e+02,   3.46410162e-01],
       [  7.90000000e+01,   2.57000000e+02,   3.46410162e-01],
       [  7.20000000e+01,   2.48000000e+02,   3.60555128e-01],
       [  4.40000000e+01,   2.59000000e+02,   3.60555128e-01],
       [  2.52000000e+02,   2.53000000e+02,   3.60555128e-01],
       [  6.00000000e+01,   1.61000000e+02,   3.60555128e-01],
       [  2.62000000e+02,   2.50000000e+02,   3.60555128e-01],
       [  2.17000000e+02,   2.64000000e+02,   3.60555128e-01],
       [  2.67000000e+02,   2.54000000e+02,   3.60555128e-01],
       [  1.50000000e+01,   2.63000000e+02,   3.60555128e-01],
       [  2.68000000e+02,   2.38000000e+02,   3.74165739e-01],
       [  2.61000000e+02,   8.50000000e+01,   3.74165739e-01],
       [  2.70000000e+02,   2.66000000e+02,   3.74165739e-01],
       [  2.65000000e+02,   9.80000000e+01,   3.87298335e-01],
       [  5.90000000e+01,   2.71000000e+02,   3.87298335e-01],
       [  1.02000000e+02,   2.60000000e+02,   3.87298335e-01],
       [  2.75000000e+02,   2.72000000e+02,   4.00000000e-01],
       [  1.18000000e+02,   2.29000000e+02,   4.12310563e-01],
       [  1.40000000e+01,   2.69000000e+02,   4.12310563e-01],
       [  1.17000000e+02,   1.31000000e+02,   4.12310563e-01],
       [  2.74000000e+02,   2.76000000e+02,   4.12310563e-01],
       [  6.40000000e+01,   2.80000000e+02,   4.24264069e-01],
       [  1.00000000e+02,   2.81000000e+02,   4.24264069e-01],
       [  2.82000000e+02,   1.19000000e+02,   4.35889894e-01],
       [  2.27000000e+02,   2.83000000e+02,   4.35889894e-01],
       [  2.78000000e+02,   2.20000000e+01,   4.58257569e-01],
       [  1.14000000e+02,   2.84000000e+02,   4.89897949e-01],
       [  6.20000000e+01,   2.86000000e+02,   4.89897949e-01],
       [  2.19000000e+02,   2.87000000e+02,   5.09901951e-01],
       [  2.77000000e+02,   2.88000000e+02,   5.29150262e-01],
       [  2.89000000e+02,   1.35000000e+02,   5.38516481e-01],
       [  2.90000000e+02,   1.34000000e+02,   5.38516481e-01],
       [  1.08000000e+02,   2.91000000e+02,   5.56776436e-01],
       [  4.10000000e+01,   2.85000000e+02,   6.24499800e-01],
       [  1.09000000e+02,   2.92000000e+02,   6.32455532e-01],
       [  2.94000000e+02,   2.73000000e+02,   6.48074070e-01],
       [  1.06000000e+02,   2.95000000e+02,   7.34846923e-01],
       [  2.96000000e+02,   2.79000000e+02,   8.18535277e-01],
       [  2.93000000e+02,   2.93000000e+02,              inf]])

In [8]:
n_samples, n_neighbors = neighbors.shape

In [26]:
print pattern, neigh_idx


142 0

In [2558]:
a_min = coassoc.argmin()
pattern, neigh_idx = a_min // n_neighbors, a_min % n_neighbors

# unconnected clusters
if coassoc[pattern, neigh_idx] == np.inf:
    clust1 = track[0]
    for i in range(1, n_samples):
        clust2 = track[i]
        if clust1 != clust2:
            # update the clusters of the samples in track
            track[track == clust1] = n_samples + Z_pointer
            track[track == clust2] = n_samples + Z_pointer

            # add cluster to Z
            Z[Z_pointer, 0] = pattern_track
            Z[Z_pointer, 1] = neigh_track
            Z[Z_pointer, 2] = np.inf
            Z_pointer += 1
else:


    # get neighbor
    neigh = neighbors[pattern, neigh_idx]

    pattern_track = track[pattern]
    neigh_track = track[neigh]

    # check if patterns belong to same cluster
    if pattern_track != neigh_track:

        # update the clusters of the samples in track
        track[track == pattern_track] = n_samples + Z_pointer
        track[track == neigh_track] = n_samples + Z_pointer

        # add cluster to Z
        Z[Z_pointer, 0] = pattern_track
        Z[Z_pointer, 1] = neigh_track
        Z[Z_pointer, 2] = coassoc[pattern, neigh_idx]
        Z_pointer += 1

    # remove distance in coassoc
    coassoc[pattern, neigh_idx] = np.inf
    
print "Z_pointer=", Z_pointer
print "cluster=", Z[Z_pointer-1]
print track


Z_pointer= 34
cluster= [ 112.          139.            0.17320508]
[176 182 173 173 176   5   6 176 164 182 156  11 182  13  14  15  16 176
  18 165  20 165  22  23  24  25  26 176 176 160 160  31  32  33 182  35
  36 182 164 176 176  41  42  43  44 182 165 173 156 176  50  51  52  53
  54  55  56 161  58  59  60  61  62 177  64 178  66  67  68 180  70  71
  72  73  74 178  76  77  78  79 180 180 170  83  84  85  86  87  88  89
  90 177 170 161  94 179 179  97  98 179 100 152 102 103 104 105 106 107
 108 109 110 111 183 113 114 115 163 117 118 119 120 121 122 181 124 125
 181 172 155 129 130 131 155 133 134 135 136 163 172 183 140 141 152 143
 144 145 146 147 148 149]

In [2454]:
np.where(coassoc==np.inf)[0].size


Out[2454]:
2100

In [1121]:
np.where(coassoc == 0)


Out[1121]:
(array([], dtype=int64), array([], dtype=int64))