In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy import stats

%matplotlib inline
%load_ext autoreload
%autoreload 2

np.set_printoptions(precision=3)

A = np.arange(12).reshape((4,3))
x = np.arange(9)
l = [1,3,2,4]
d = {'a': 1, 'b': 2, 'c': 3}

In [2]:
t = np.linspace(np.pi, 2*np.pi, 100)
y = (1 + np.cos(t)) / 2.
x = t / np.pi - 1
plt.plot(x, y)
plt.plot(x, y*y)
plt.plot(x, y*y * y*y)
plt.plot(x, y*y * y*y * y*y)
plt.show()



In [3]:
# alright, so above is incorrect; this is actually what you get
# when you do signed random projections LSH
t = np.linspace(-1, 1, 100)
y = 1 - np.arccos(t) / np.pi
x = t
# 1 bit to 8 bits
y2 = y*y
y4 = y2 * y2
y6 = y4 * y2
y8 = y4 * y4
y16 = y8 * y8

colors = 'bgrcmykw'

plt.figure(figsize=(8, 7))
yvals = [y, y2, y4, y8, y16]
powers = [1, 2, 4, 8, 16]
for i, yval in enumerate(yvals):
    plt.plot(x, yval, color=colors[i])
    plt.plot(x, np.minimum(1, yval * (2 ** powers[i]) / 2 ), '--', color=colors[i])
#     plt.plot(x, np.minimum(1, yval * 2), '--', color=colors[i])

labels = [(str(s), str(s)) for s in powers]  # second entry of each pair labels the dashed (rescaled) curve
labels = reduce(lambda l1, l2: l1 + l2, labels)
plt.legend(labels, loc='upper left')
plt.show()

# okay, so it looks like using 4 signed random projections should be good;
# even just 2 might work; could also use two hash vals for the query to
# get double the collision probability (roughly) and it would also be fine
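
A quick empirical check of the p = 1 - arccos(sim)/pi collision probability for signed random projections used above; the dimension, number of hyperplanes, and the particular pair of test vectors below are arbitrary illustrative choices, not taken from the cells above.

dim = 32
num_planes = 2000
planes = np.random.randn(num_planes, dim)
u = np.random.randn(dim)
v = u + .8 * np.random.randn(dim)  # a vector correlated with u, at some intermediate angle
cos_sim = u.dot(v) / (np.linalg.norm(u) * np.linalg.norm(v))
same_sign = np.sign(planes.dot(u)) == np.sign(planes.dot(v))
print "cosine sim:\t\t", cos_sim
print "1 - arccos(sim)/pi:\t", 1 - np.arccos(cos_sim) / np.pi
print "empirical collision p:\t", np.mean(same_sign)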



In [9]:
def amp(sims, power, scale=1):
    # AND-amplified collision prob for signed random projections, scaled by the number of hashes
    sims = np.clip(sims, -1, 1)  # keep sims inside arccos's domain
    p = 1 - np.arccos(sims) / np.pi
    activation = np.power(p, power) * scale
    return activation

mu = 0
sigma = .15
sims = np.random.randn(int(1e5)) * sigma + mu
# sims = np.sqrt(np.abs(sims)) * np.sign(sims)
# sims = .5 + .5 * sims
# sims = .7 + .3 * sims

bits = 4
num_hashes = 64 * 8 / bits
amped = amp(sims, bits, num_hashes)
# amped /= num_hashes
plt.figure(figsize=(12, 5))
plt.hist(sims, 50)
plt.xlim((0, 1))
# ax = plt.subplots(1)
plt.figure(figsize=(12, 5))
# plt.xlim((0, np.max(amped)))
plt.xlim((0, num_hashes))
# plt.xlim((0, 1))
plt.hist(amped, 50)

# what if we replace sims with probs? (note: from here on, sims and the
# "orig" numbers below are 1-bit collision probs, not raw cosine sims)
sims = amp(sims, 1)

top_k_orig = np.sort(sims)[-10:][::-1]
top_k_amped = np.sort(amped)[-10:][::-1]
# print "orig mean: ", np.mean(sims)
# print "amped mean: ", np.mean(amped)
print "orig best sims: ", top_k_orig
# print "orig best probs: ", 1 - np.arccos(top_k_orig) / np.pi
print "amped best sims: ", top_k_amped

# compute std of how many collisions there will actually be
# sketch_variances_amped = top_k_amped * (2.**-bits) * (1 - 2.**-bits)
sketch_p = top_k_amped / num_hashes
sketch_variances_amped = num_hashes * sketch_p * (1 - sketch_p)
sketch_stds_amped = np.sqrt(sketch_variances_amped) # sqrt(np(1-p))
print "sketch stds:\t", sketch_stds_amped
print "sketch_p: ", sketch_p
#print
#print "sketch stds amped: ", sketch_stds_amped
#print "lower 95% ci amped: ", top_k_amped - 1.96 * sketch_stds_amped

# print "\nstds: orig = {:.4}, amped = {:.4}\n".format(std_orig, std_amped)

# rel contrast is a weird measure if sims are 0-centered cuz
# tends to become enormous as mean approaches 0
# print "orig rel contrasts: ", top_k_orig / np.mean(sims)
# print "amped rel contrasts: ", top_k_amped / np.mean(amped)

gaps_orig = top_k_orig - np.mean(sims)
gaps_amped = top_k_amped - np.mean(amped)
std_orig = np.std(sims)
std_amped = np.std(amped)
normed_gaps_orig = gaps_orig / std_orig
normed_gaps_amped = gaps_amped / std_amped
# print "orig gaps: ", gaps_orig
# print "amped gaps: ", gaps_amped
print "orig sigma gaps:\t", normed_gaps_orig
print "amped sigma gaps:\t", normed_gaps_amped
print "ratio:\t\t\t", normed_gaps_amped / normed_gaps_orig

# so what if we take into account the variance of the binomial
# describing how many collisions we actually get? Because if this
# variance is high, we'll need to look at stuff with many fewer
# than the expected number of collisions to avoid false negatives
#
# okay, it appears that with 64B of 4-bit hashes (num_hashes = 128), this std is pretty small
#
# sketch_stds_over_sigma = sketch_stds_amped / std_amped
# sketch_lower_68_ci = normed_gaps_amped - 1 * sketch_stds_over_sigma
# sketch_lower_95_ci = normed_gaps_amped - 1.96 * sketch_stds_over_sigma
#
# EDIT: let's try this again...
sketch_lower_68_ci = (gaps_amped - 1 * sketch_stds_amped)
sketch_lower_95_ci = (gaps_amped - 1.96 * sketch_stds_amped)
sketch_lower_68_ci = sketch_lower_68_ci / std_amped
sketch_lower_95_ci = sketch_lower_95_ci / std_amped
print
print "sketch lower 95% ci:\t", sketch_lower_95_ci

distro = stats.norm()
# distro = stats.norm(loc=np.mean(amped), scale=std_amped)

N = int(1e6)
print "\nExpected number of points this extreme for N = {:g}".format(N)
print "orig:\t\t", N * distro.sf(normed_gaps_orig)
print "amped:\t\t", N * distro.sf(normed_gaps_amped)
print "amped 68% ci:\t", N * distro.sf(sketch_lower_68_ci)
print "amped 95% ci:\t", N * distro.sf(sketch_lower_95_ci)

# plt.plot(t, amped)
# plt.plot(amped)
plt.show()


orig best sims:  [ 0.709  0.707  0.705  0.703  0.702  0.7    0.7    0.699  0.698  0.697]
amped best sims:  [ 32.351  32.043  31.69   31.25   31.11   30.789  30.654  30.478  30.444
  30.156]
sketch stds:	[ 4.917  4.901  4.883  4.86   4.853  4.836  4.828  4.819  4.817  4.801]
sketch_p:  [ 0.253  0.25   0.248  0.244  0.243  0.241  0.239  0.238  0.238  0.236]
orig sigma gaps:	[ 4.324  4.289  4.248  4.197  4.181  4.143  4.127  4.106  4.102  4.068]
amped sigma gaps:	[ 7.346  7.251  7.143  7.007  6.964  6.866  6.824  6.77   6.76   6.671]
ratio:			[ 1.699  1.691  1.681  1.67   1.666  1.657  1.653  1.649  1.648  1.64 ]

sketch lower 95% ci:	[ 4.383  4.298  4.2    4.079  4.04   3.951  3.914  3.866  3.857  3.778]

Expected number of points this extreme for N = 1e+06
orig:		[  7.674   8.993  10.782  13.516  14.525  17.128  18.359  20.086  20.44
  23.703]
amped:		[  1.021e-07   2.065e-07   4.575e-07   1.213e-06   1.649e-06   3.308e-06
   4.422e-06   6.427e-06   6.908e-06   1.270e-05]
amped 68% ci:	[ 0.003  0.005  0.008  0.018  0.022  0.037  0.047  0.062  0.065  0.102]
amped 95% ci:	[  5.854   8.637  13.351  22.663  26.739  38.833  45.329  55.28   57.437
  79.18 ]
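
A quick simulation backing up the sqrt(n*p*(1-p)) std used for the sketch above; p_match ~ .25 and n_hashes = 128 mirror the sketch_p and 64*8/4 values printed above, and the trial count is arbitrary.

p_match = .25          # roughly the sketch_p of the top points above
n_hashes = 64 * 8 / 4  # = 128, as above
counts = np.random.binomial(n_hashes, p_match, size=int(1e5))
print "empirical std of collision counts:\t", np.std(counts)
print "sqrt(n p (1-p)):\t\t\t", np.sqrt(n_hashes * p_match * (1 - p_match))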

In [10]:
# note the increasing ratios; moving from 4 sigma to 5 sigma prunes
# a higher fraction of points than moving from 2 sigma to 3 sigma
print "2:\t", N * distro.sf(2)
print "2.5:\t", N * distro.sf(2.5)
print "3:\t", N * distro.sf(3)
print "3.5:\t", N * distro.sf(3.5)
print "4:\t", N * distro.sf(4)
print "4.5:\t", N * distro.sf(4.5)
print "5:\t", N * distro.sf(5)


2:	22750.1319482
2.5:	6209.66532578
3:	1349.89803163
3.5:	232.629079036
4:	31.6712418331
4.5:	3.39767312473
5:	0.286651571879
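
Same point expressed as ratios, using the numbers just printed: each additional sigma prunes a growing multiple of what remains.

print "sf(2) / sf(3):\t", distro.sf(2) / distro.sf(3)  # ~17x fewer points survive past 3 sigma than 2
print "sf(4) / sf(5):\t", distro.sf(4) / distro.sf(5)  # ~110x fewer survive past 5 sigma than 4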

In [11]:
p = np.linspace(0, 1, 100)
plt.plot(p, p / (1-p))
plt.show()



In [12]:
# ya, so a convex combo of sims with a constant doesn't change the number of
# sigma the top points are from the mean at all: an affine map y = a + b*x
# scales both the gaps and the std by b, so the z-scores are unchanged
augmented_sims = .5 + .5 * sims
std_augmented = np.std(augmented_sims)
top_k_augmented = .5 + .5 * top_k_orig
gaps_augmented = top_k_augmented - np.mean(augmented_sims)
normed_gaps_augmented = gaps_augmented / std_augmented
print "orig normed gaps:\t", normed_gaps_orig
print "augmented normed gaps:\t", normed_gaps_augmented


orig normed gaps:	[ 4.324  4.289  4.248  4.197  4.181  4.143  4.127  4.106  4.102  4.068]
augmented normed gaps:	[ 4.324  4.289  4.248  4.197  4.181  4.143  4.127  4.106  4.102  4.068]

In [13]:
# alright, let's see what combinations of high and low probs actually
# do better with amp codes (slash, whether *any* do)
#
# okay, so it appears that amp codes are basically never better given
# that they take more bits per hash; if we pretend they still just take
# one bit, they're *slightly* better when sims are above .5, and a lot
# better when they're close to 1
#
# ah, so I think a thing that's happening is that this is worse because
# we're using the pooled std, while the above is just looking at
# the 95% conf interval around the higher prob's mean, so it caps at
# using two of the higher sigma; if we do this 4 or 8 bits becomes better
# for high probs (.8 or more) vs lower probs (like .3 less or more)
#
# also, if we just use the 2 sigma conf interval and also pretend that
# n doesn't have to be lower, then amp codes wreck; what we really need
# is a way to not have n go down.
#   -maybe bootstrap resampling? implemented by shifting stuff to look
#   at more groups of 4 or whatever? (rough numpy sketch after this
#   cell's output)
#     -easy to implement; just repeatedly shift mask, cmpeq, add
#
# I think the reason the above simulations do better is that we happen
# to have high enough probs that it hits the regime where amp codes 
# are better; it's much worse if the best probs are below like .7.

# default_probs = np.arange(.01, .99, .01)
# default_probs = np.arange(.5, .99, .01)
default_probs = np.arange(.4, .99, .01)

def compute_gaps(probs, n):
    probs = np.asarray(probs)
    num_probs = len(probs)
    means = n * probs
    variances = means * (1 - probs)
    
    # okay, ya, it does seem to be computing stuff right
#     print "probs: ", probs
#     print "means: ", means
#     print "variances: ", variances
#     print "sum variance: ", np.sum(variances)
#     print "sum sqrt: ", np.sqrt(np.sum(variances))
#     mean_gap = means[1] - means[0]
#     print "mean gap: ", mean_gap
#     print "sigma gap: ", mean_gap / np.sqrt(np.sum(variances))
    
    gaps = np.zeros((num_probs, num_probs))
    for i in range(num_probs):
#         end = min(num_probs, i + 60)
        end = num_probs
        for j in range(i, end):
            sum_variances = variances[i] + variances[j]
            sum_std = np.sqrt(sum_variances)
            diff = means[j] - means[i]
            diff -= 1.96 * np.sqrt(variances[j])
#             diff -= 1 * np.sqrt(variances[j])
            diff = np.maximum(0, diff)
            gap = diff / np.sqrt(variances[i])
            
#             gap = diff / sum_std
            # even just using one is worse
#             gap = diff / np.sqrt(variances[j])
#             gap = diff / np.sqrt(variances[i])
    
            gaps[i, j] = gaps[j, i] = gap
            
    return gaps

def gaps_for_nbits(n, nbits, p=default_probs):
    p = np.asarray(p)
    hash_p = p ** nbits
#     num_hashes = n / nbits 
    # note that if we ignore increased bit costs (below) and
    # also just look at 95% conf interval, amp codes wreck
    num_hashes = n
#     num_hashes = n / np.sqrt(nbits)
    return compute_gaps(probs=hash_p, n=num_hashes)

# print gaps_for_nbits(4, 1, p=[.4, .5])
# print
# print gaps_for_nbits(4, 2, p=[.4, .5])
# # import sys; sys.exit()
# return

n0 = 16 * 8
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes = axes.ravel()
all_gaps = []
bit_counts = [1, 2, 3, 4]
for nbits in bit_counts:
    gaps = gaps_for_nbits(n0, nbits, p=default_probs)
    all_gaps.append(gaps)

for i, gaps in enumerate(all_gaps[1:]): # subtract off gaps using one bit
    gaps -= all_gaps[0]
#     all_gaps[i + 1] = np.maximum(0, gaps)
    all_gaps[i + 1] = gaps

# max_val = np.max([np.max(gaps) for gaps in all_gaps])
# max_val = min(4, max_val)
max_val = 5
for i, nbits in enumerate(bit_counts):
    gaps = all_gaps[i]
    gaps = np.minimum(gaps, max_val)
    gaps = np.maximum(gaps, -max_val)
    gaps[0, 0] = max_val  # so all will have the same scale
    gaps[-1, -1] = -max_val  # so 0 is in the middle
    axes[i].set_title(nbits)
#     im = axes[i].imshow(gaps, cmap='seismic')
#     im = axes[i].imshow(gaps, cmap='gist_ncar')
    min_p, max_p = np.min(default_probs), np.max(default_probs)
    extent = [min_p, max_p, max_p, min_p]
    # note: 'none' (the string) disables interpolation; interpolation=None just means the rc default
    im = axes[i].imshow(gaps, cmap='CMRmap', interpolation='none', extent=extent)

plt.tight_layout() # must be before we add the colorbar

fig.subplots_adjust(right=0.8)
cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
fig.colorbar(im, cax=cbar_ax)
    
plt.show()


//anaconda/lib/python2.7/site-packages/matplotlib/collections.py:571: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):
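
A minimal numpy sketch of the "shift, mask, cmpeq, add" bootstrap idea from the comments above: count collisions over every length-b window of the raw bits instead of only the aligned groups, so the effective n doesn't have to shrink as much. The helper name and the plain Python loop are illustrative only; a real version would operate on packed bytes.

def overlapping_group_collisions(q_bits, x_bits, b):
    # q_bits, x_bits: 1-D boolean arrays of raw 1-bit hash values;
    # a window "collides" iff all b consecutive bits agree
    agree = (q_bits == x_bits)
    count = 0
    for start in range(len(agree) - b + 1):  # shift by one bit per iteration
        count += np.all(agree[start:start + b])
    return count

q_bits = np.random.rand(128) < .5
x_bits = np.where(np.random.rand(128) < .9, q_bits, ~q_bits)  # mostly agrees with q_bits
print overlapping_group_collisions(q_bits, x_bits, 4)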

In [14]:
# alright, so let's see if we can OR together bit vects and get
# a probabilistic lower bound to let us look at 4 of them at once

N = int(1e4)
D = 64
Q = 4

bits = np.random.rand(N, D) < .5
queries = np.random.rand(Q, D) < .5

row_sums = np.sum(bits, axis=1)
print np.mean(row_sums)
print np.std(row_sums)
print

# idxs = np.random.randint(N, size=100)
# idxs = np.random.randint(N, size=3)

def top_k_idxs(elements, k, smaller_better=False):
    # indices of the k best elements (smallest or largest), best first
    if smaller_better:
        which_nn = np.arange(k)
        return np.argpartition(elements, kth=which_nn)[:k]
    else:
        which_nn = len(elements) - 1 - np.arange(k)
        return np.argpartition(elements, kth=which_nn)[-k:][::-1]

def hamming_dist(v1, v2):
    return np.count_nonzero(v1 != v2)

which_nn = N - 1 - np.arange(10)
# number of neighbors to choose from, number of vects in cluster
c = 4
k = c + 2
for q in queries:
    print
#     sims = bits.dot(q)
#     print sims.shape
#     print sims[:20]
#     sims = bits.dot(q.reshape((-1, 1)))
#     sims = np.sum(bits * q, axis=1)
#     print "{} +/- {}".format(np.mean(sims), np.std(sims))
    diffs = q != bits
    dists = np.array([np.count_nonzero(row) for row in diffs])
    print "{} +/- {}".format(np.mean(dists), np.std(dists))
    nn_idxs = top_k_idxs(dists, k, smaller_better=True) # 6nn
    print nn_idxs
    print dists[nn_idxs]
    
    tiebreaker = np.zeros(D, dtype=np.int)
    for omit_which1 in range(k):
        if c % 2 == 0:  # even number -> need a tiebreaker
            tiebreaker_idx = nn_idxs[omit_which1]
            tiebreaker = bits[tiebreaker_idx]
        
        for omit_which2 in range(omit_which1 + 1, k):
            use_which_idxs = list(nn_idxs)
            
            del use_which_idxs[omit_which1]
            del use_which_idxs[omit_which2 - 1]
            
            use_rows = bits[use_which_idxs]
            assert(len(use_rows) == c)
            sums = np.sum(use_rows, axis=0) + tiebreaker
            cluster_bits = sums > (c // 2)
            
            cluster_dists = [hamming_dist(r, cluster_bits) for r in use_rows]
            mean_dist = np.mean(cluster_dists)
            std_dists = np.std(cluster_dists)
            
            unanimity_sums = np.sum(use_rows, axis=0)
            unanimous_idxs = (unanimity_sums == 0) + (unanimity_sums == c)
            num_unanimous_idxs = np.sum(unanimous_idxs)
            
            q_dist = hamming_dist(q, cluster_bits)
            
            print "{}: {:.3f} + {:.3f} =\t{:.3f} -> {}, {}".format(
                use_which_idxs,
                mean_dist, std_dists, mean_dist + std_dists,
                num_unanimous_idxs, q_dist)
            

            
#             if c == 4 or c == 5:
#                 agreeing_idxs = (sums <= 1) + (sums >= 4)
#             elif c == 3:
#                 agreeing_idxs = (sums == 0) + (sums == 3)
            
#             print np.sum(agreeing_idxs), np.count_nonzero(cluster_bits != q)


31.9922
4.01884798916


31.938 +/- 3.99891935403
[ 776 1485 1674 2710 4926 6621]
[19 19 19 19 19 19]
[1674, 2710, 4926, 6621]: 16.000 + 4.243 =	20.243 -> 17, 13
[1485, 2710, 4926, 6621]: 15.500 + 0.866 =	16.366 -> 21, 10
[1485, 1674, 4926, 6621]: 17.000 + 3.317 =	20.317 -> 14, 11
[1485, 1674, 2710, 6621]: 17.000 + 2.828 =	19.828 -> 19, 8
[1485, 1674, 2710, 4926]: 15.500 + 2.598 =	18.098 -> 15, 14
[776, 2710, 4926, 6621]: 15.000 + 0.000 =	15.000 -> 21, 10
[776, 1674, 4926, 6621]: 17.500 + 2.958 =	20.458 -> 14, 11
[776, 1674, 2710, 6621]: 16.500 + 2.958 =	19.458 -> 17, 8
[776, 1674, 2710, 4926]: 15.000 + 2.000 =	17.000 -> 18, 14
[776, 1485, 4926, 6621]: 15.500 + 1.658 =	17.158 -> 16, 11
[776, 1485, 2710, 6621]: 15.500 + 1.658 =	17.158 -> 20, 8
[776, 1485, 2710, 4926]: 15.500 + 2.598 =	18.098 -> 18, 14
[776, 1485, 1674, 6621]: 17.500 + 2.179 =	19.679 -> 12, 8
[776, 1485, 1674, 4926]: 16.500 + 2.179 =	18.679 -> 15, 14
[776, 1485, 1674, 2710]: 16.500 + 2.179 =	18.679 -> 13, 14

31.9468 +/- 3.93842224247
[2328 4895  600 9113 9587  802]
[18 18 19 19 19 20]
[600, 9113, 9587, 802]: 14.750 + 1.785 =	16.535 -> 17, 12
[4895, 9113, 9587, 802]: 14.500 + 1.803 =	16.303 -> 20, 12
[4895, 600, 9587, 802]: 14.500 + 3.041 =	17.541 -> 21, 10
[4895, 600, 9113, 802]: 14.500 + 2.062 =	16.562 -> 18, 12
[4895, 600, 9113, 9587]: 14.750 + 3.562 =	18.312 -> 21, 11
[2328, 9113, 9587, 802]: 16.000 + 1.581 =	17.581 -> 16, 12
[2328, 600, 9587, 802]: 16.000 + 2.915 =	18.915 -> 18, 10
[2328, 600, 9113, 802]: 15.500 + 3.202 =	18.702 -> 16, 12
[2328, 600, 9113, 9587]: 16.750 + 3.112 =	19.862 -> 21, 11
[2328, 4895, 9587, 802]: 14.750 + 3.269 =	18.019 -> 18, 10
[2328, 4895, 9113, 802]: 15.250 + 3.112 =	18.362 -> 17, 12
[2328, 4895, 9113, 9587]: 15.500 + 4.031 =	19.531 -> 20, 11
[2328, 4895, 600, 802]: 16.250 + 2.861 =	19.111 -> 18, 12
[2328, 4895, 600, 9587]: 14.500 + 3.202 =	17.702 -> 22, 11
[2328, 4895, 600, 9113]: 16.500 + 3.500 =	20.000 -> 20, 11

31.9624 +/- 3.98308752603
[1216 1899 6141 3306 6151  316]
[18 18 18 19 19 20]
[6141, 3306, 6151, 316]: 15.500 + 1.803 =	17.303 -> 17, 8
[1899, 3306, 6151, 316]: 15.500 + 4.031 =	19.531 -> 17, 9
[1899, 6141, 6151, 316]: 17.250 + 2.165 =	19.415 -> 17, 4
[1899, 6141, 3306, 316]: 16.250 + 1.920 =	18.170 -> 14, 7
[1899, 6141, 3306, 6151]: 14.500 + 1.803 =	16.303 -> 20, 9
[1216, 3306, 6151, 316]: 16.500 + 4.717 =	21.217 -> 15, 9
[1216, 6141, 6151, 316]: 17.250 + 2.165 =	19.415 -> 11, 4
[1216, 6141, 3306, 316]: 16.750 + 2.278 =	19.028 -> 13, 7
[1216, 6141, 3306, 6151]: 16.000 + 3.391 =	19.391 -> 14, 9
[1216, 1899, 6151, 316]: 18.250 + 1.090 =	19.340 -> 13, 4
[1216, 1899, 3306, 316]: 17.250 + 2.046 =	19.296 -> 13, 7
[1216, 1899, 3306, 6151]: 15.500 + 3.354 =	18.854 -> 17, 9
[1216, 1899, 6141, 316]: 17.500 + 1.658 =	19.158 -> 12, 7
[1216, 1899, 6141, 6151]: 16.750 + 2.681 =	19.431 -> 15, 9
[1216, 1899, 6141, 3306]: 16.250 + 3.269 =	19.519 -> 16, 9

31.9296 +/- 4.00462780293
[4197 6630 6706 8119 1721 3339]
[17 18 18 18 19 19]
[6706, 8119, 1721, 3339]: 15.500 + 1.118 =	16.618 -> 19, 10
[6630, 8119, 1721, 3339]: 17.000 + 1.581 =	18.581 -> 15, 8
[6630, 6706, 1721, 3339]: 16.500 + 1.500 =	18.000 -> 15, 7
[6630, 6706, 8119, 3339]: 15.750 + 1.920 =	17.670 -> 13, 5
[6630, 6706, 8119, 1721]: 16.250 + 0.829 =	17.079 -> 15, 9
[4197, 8119, 1721, 3339]: 16.250 + 1.639 =	17.889 -> 19, 8
[4197, 6706, 1721, 3339]: 16.750 + 1.639 =	18.389 -> 15, 7
[4197, 6706, 8119, 3339]: 16.500 + 2.693 =	19.193 -> 17, 5
[4197, 6706, 8119, 1721]: 16.000 + 0.707 =	16.707 -> 19, 9
[4197, 6630, 1721, 3339]: 16.750 + 1.639 =	18.389 -> 13, 7
[4197, 6630, 8119, 3339]: 17.000 + 2.550 =	19.550 -> 14, 5
[4197, 6630, 8119, 1721]: 16.000 + 0.707 =	16.707 -> 17, 9
[4197, 6630, 6706, 3339]: 17.500 + 1.803 =	19.303 -> 12, 5
[4197, 6630, 6706, 1721]: 16.500 + 0.500 =	17.000 -> 13, 9
[4197, 6630, 6706, 8119]: 16.250 + 0.829 =	17.079 -> 16, 9
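
Side note: the per-row Hamming-distance loops in this cell (and the next one) can be written as a single vectorized reduction with the same result; hamming_dists_to_all is just an illustrative helper name.

def hamming_dists_to_all(v, X):
    # Hamming distance from one bit vector to every row of a boolean matrix
    return np.sum(v != X, axis=1)

print hamming_dists_to_all(queries[0], bits)[:10]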

In [15]:
def pick_cluster(q, neighbors):
    k = len(neighbors)
    c = k - 1  # cluster is q plus c-1 neighbors; q plus all k neighbors is c + 2 candidates
    D = len(q)
    even_cluster_size = (c % 2) == 0
    
    
    use_rows = np.empty((c, D))
    use_rows[0] = np.copy(q)
    
    # TODO systematically compare this to the fancy approach;
    # based on informal experimentation, no discernible difference
    use_rows[1:] = neighbors[:-2]
    centroid = np.sum(use_rows, axis=0)
    if even_cluster_size:
        centroid += neighbors[k-1]
    centroid = centroid > (c // 2)
    return centroid, np.arange(k-1)
    
    # everything below is unreachable because of the early return above; it's the
    # exhaustive leave-two-out search, kept for reference (note it still reads the
    # module-level nn_idxs / bits for the tiebreaker)
    best_loss = np.inf
    best_centroid = None
    best_neighbor_idxs = None
    for omit_which1 in range(k):
        if even_cluster_size:
            tiebreaker_idx = nn_idxs[omit_which1]
            tiebreaker = bits[tiebreaker_idx]
        
        for omit_which2 in range(omit_which1 + 1, k):
            # compute which neighbors we're using; note that we
            # always use the query
            use_which_idxs = list(range(k))
            del use_which_idxs[omit_which1]
            del use_which_idxs[omit_which2 - 1]
            assert(len(use_which_idxs) == c - 1)
            use_rows[1:, :] = neighbors[use_which_idxs]
            
            # now compute the cluster centroid, which is a bitstring
            sums = np.sum(use_rows, axis=0)
            if even_cluster_size:
                sums += tiebreaker
            centroid = sums > (c // 2)
        
            # compute loss associated with this centroid
            cluster_dists = [hamming_dist(r, centroid) for r in use_rows]
            mean_dist = np.mean(cluster_dists)
#             std_dists = np.std(cluster_dists)
#             loss = mean_dist + std_dists
            loss = mean_dist
    
            if loss < best_loss:
                best_loss = loss
                best_centroid = centroid
                best_neighbor_idxs = use_which_idxs
    
#             print "{}: {:.3f}".format(use_which_idxs, loss)
    
    return best_centroid, np.array(best_neighbor_idxs)
    
# so it appears that c = 4 yields consistently lower dists to
# query than c = 3; the former has 15 as the biggest I saw in like 100
# queries, while latter has 15+ all the time; c = 6 seems to also
# be about the same; c = 8 has even lower dists to query...wat?

N = int(1e4)
D = 64
Q = 4

bits = np.random.rand(N, D) < .5
queries = np.random.rand(Q, D) < .5

c = 8
k = c + 2

# okay, so it appears that if the seed used to pick what's in the
# cluster is q's nn, then the seed's neighbors are almost all just
# q's neighbors and the centroid of the cluster is almost identical
# to both the seed and q
#
# what makes absolutely no sense is that, as c increases, q's dist
# to the seed's centroid keeps going down, while the seed's dist
# to its centroid approaches the avg (min?) dist to one of its neighbors

for q in queries:
    print '------------------------'
    print "q:\t\t", q[:20].astype(np.int)
    diffs = q != bits
    dists = np.array([np.count_nonzero(row) for row in diffs])
    print "{} +/- {}".format(np.mean(dists), np.std(dists))
    q_nn_idxs = top_k_idxs(dists, k + (not k % 2), smaller_better=True)
    print dists[q_nn_idxs][:20]
    
    # find nearest neighbors of q's nearest neighbor (the 'seed')
    # EDIT: actually, just pick a pretty close neighbor, not closest
#     seed_idx = np.argmin(dists)
    seed_idx = q_nn_idxs[np.random.randint(k)]
    seed_dist = dists[seed_idx]
    seed = bits[seed_idx]
    diffs = seed != bits
    seed_dists = np.array([np.count_nonzero(row) for row in diffs])
    nn_idxs = top_k_idxs(seed_dists, c + 2, smaller_better=True)
    nn_idxs = nn_idxs[1:]  # seed will find itself as 1nn
    nn_idxs = np.random.randint(N, size=len(nn_idxs)); seed = bits[np.random.randint(N)]  # TODO rm
    seed_neighbors = bits[nn_idxs]
#     print "seed nn idxs: ", nn_idxs
#     print "q dist to seed: ", seed_dist
    
#     q_neighbors = bits[q_nn_idxs]
#     q_neighbor_sums = np.sum(q_neighbors, axis=0)
# #     q_neighbor_sums += q
#     q_centroid = q_neighbor_sums > (len(q_neighbors) // 2)
#     print "q to its centroid:\t", hamming_dist(q, q_centroid)
#     print "seed to its centroid: ", hamming_dist(seed, q_centroid)
    
#     q_seed_sums = np.sum(seed_neighbors, axis=0) + seed
#     q_seed_centroid = q_seed_sums > (len(seed_neighbors) // 2)
#     print "q to seed centroid:\t", hamming_dist(q, q_seed_centroid)
#     print "seed to seed centroid:\t", hamming_dist(seed, q_seed_centroid)
    
#     break
    
    # hmm; replacing nn_idxs with random ones sends dists back up to noise levels...
#     nn_idxs = np.random.randint(N, size=len(nn_idxs))
    

    
#     print "seed dists: ", seed_dists[nn_idxs[:20]]
#     print "min q to seed neighbor: ", np.min([hamming_dist(q, row) for row in seed_neighbors])
#     print
    
    # build the cluster the seed would be in
#     print "q is seed? ", q is seed
    centroid, used_idxs = pick_cluster(seed, seed_neighbors)    
#     print "centroid:\t", centroid[:20].astype(np.int)
    
    # alright, so it has a reasonable dist when we don't use real cluster
#     used_idxs = np.array([0])
#     centroid = seed_neighbors[np.random.randint(c)]

    # compare dists to this cluster and the rows within it
    q_dists = [seed_dist] + [hamming_dist(q, row) for row in seed_neighbors]
    used_q_dists = np.array(q_dists)[used_idxs + 1]
    used_q_dists = [seed_dist] + list(used_q_dists)
    seed_centroid_dist = hamming_dist(seed, centroid)
#     print "q is centroid? ", q is centroid
#     print "q:\t\t", q[:20].astype(np.int)
#     print "centroid:\t", centroid[:20].astype(np.int)
    q_centroid_dist = hamming_dist(q, centroid)
#     print q[:10]
#     print centroid[:10]
#     print "seed dist to centroid: ", hamming_dist(centroid, seed)
#     print "{} ({}) {}".format(q_dists, used_q_dists, q_centroid_dist)
#     print "{} {}".format(used_q_dists, q_centroid_dist)
#     print
    print "{}: seed, query dist to centroid: {}, {}".format(
        used_q_dists, seed_centroid_dist, q_centroid_dist * 64 / D)


------------------------
q:		[1 0 1 1 1 0 0 1 1 0 0 1 1 0 0 0 0 1 1 1]
32.0159 +/- 3.98045816333
[17 18 18 18 18 18 19 20 20 20 20]
[20, 28, 30, 31, 41, 32, 31, 35, 37]: seed, query dist to centroid: 18, 34
------------------------
q:		[0 1 0 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 0 1]
32.0577 +/- 4.00491831502
[18 18 18 18 18 19 19 19 19 19 19]
[18, 36, 31, 27, 27, 32, 28, 29, 38]: seed, query dist to centroid: 25, 32
------------------------
q:		[0 1 1 1 0 0 1 0 1 0 0 0 0 0 1 1 0 1 0 1]
32.0145 +/- 4.02170234478
[16 18 19 19 19 19 20 20 20 20 20]
[19, 32, 33, 29, 27, 37, 36, 31, 32]: seed, query dist to centroid: 22, 35
------------------------
q:		[1 1 0 0 1 1 1 1 1 0 1 1 1 0 0 0 1 0 0 1]
32.0349 +/- 3.98033691916
[18 19 19 20 20 20 20 20 21 21 21]
[21, 28, 27, 32, 39, 33, 37, 37, 35]: seed, query dist to centroid: 13, 33

In [15]:
# let's see how well points match the centroids of their knn in euclidean space
#
# So it appears that there's a positive correlation between dist to centroid of
# the nearest neighbors and dist to the seed, but it's not super high for iid
# gaussian data.
#
# Also, even dist to 1nn appears to be not super correlated with true dist,
# which bodes poorly for quantization attempts

import pandas as pd

def dists_sq(X, q):
    diffs = X - q
    return np.sum(diffs * diffs, axis=(len(X.shape) - 1)) # handle 1D X

def randwalk(*args):
    ret = np.random.randn(*args)
    ret = np.cumsum(ret, axis=-1)
    return ret / np.linalg.norm(ret) * ret.shape[-1]

N = int(2e4)
D = 128
Q = 200

data_func = np.random.randn
# data_func = np.random.rand  # much better corr with dist to orig point (.7 vs .62 in 128D)
# data_func = randwalk
X = data_func(N, D)
queries = data_func(Q, D)

# np.set_printoptions(precision=3)


# TODO self try various weighting schemes to come up with centroid

def try_query(X, q, k):
    D = X.shape[-1]
    seed = data_func(D)
    dists = dists_sq(X, seed)
    knn_idxs = top_k_idxs(dists, k, smaller_better=True)
    neighbors = X[knn_idxs]
    
#     weights = 1. / np.arange(1, k + 1)
    weights = 1. / dists[knn_idxs]
#     weights = np.exp(-dists[knn_idxs])
    
    weights = (weights / np.sum(weights)).reshape((-1, 1))
    nn_centroid = np.sum(neighbors * weights, axis=0)
    
    # multiplying by 2 to un-shrink it (away from 0) seems to help
#     nn_centroid *= np.sqrt(k)
    nn_centroid *= 2
    
    centroid_dist = dists_sq(nn_centroid, q) / D
    seed_dist = dists_sq(seed, q) / D

    
#     print "seed dists to its nn:\t\t", dists[knn_idxs][:10] / D
    seed_centroid_dist = dists_sq(nn_centroid, seed) / D
#     print "seed to nn centroid, zeros:\t{:.3f}, {:.3f}".format(seed_centroid_dist, np.var(seed))
#     print "q to seed, centroid, zeros:\t{:.3f}, {:.3f}, {:.3f}".format(seed_dist, centroid_dist, np.var(q))
    
    return seed_dist, centroid_dist, seed_centroid_dist

k = 32
q_stats = []
for q in queries:
    q_stats.append(try_query(X, q, k))

q_stats = np.array(q_stats)
df = pd.DataFrame(q_stats, columns=['q_seed', 'q_centroid', 'seed_centroid'])
# print stats
x, y = df['q_seed'], df['q_centroid']
# x, y = np.sqrt(x), np.sqrt(y)
plt.scatter(x, y)
plt.show()
print stats.pearsonr(x, y)
print np.sum(df['q_centroid'] < 1.1) / float(Q)


(0.77656734491610468, 1.3663625479250007e-41)
0.095
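
As an extra check on the correlation printed above (same df as in the cell), the rank correlation doesn't assume linearity; stats.spearmanr returns (rho, p-value).

rho, pval = stats.spearmanr(df['q_seed'], df['q_centroid'])
print "spearman rho, p-value:\t", rho, pval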

In [226]:
walk = np.cumsum(np.random.randn(128))
plt.plot(walk, lw=4)
plt.axis('off')
# plt.savefig('/Users/davis/Desktop/tmp3.pdf')
plt.show()



In [3]:
# can we construct a butterfly curve for probs by ANDing and then ORing to get 1 bit?

def hamming_dist(v1, v2):
    return np.count_nonzero(v1 != v2)

def and_or_curve(p, b=2, h=2):
    p = np.asarray(p)
    
    total_bits = b * h
    possible_events = np.arange(2**total_bits, dtype=np.uint8)
    bit_combos = np.unpackbits(possible_events.reshape((-1, 1)), axis=1)
    bit_combos = bit_combos[:, -total_bits:]
#     print bit_combos
    # AND together bits within each hash
    hash_vals = bit_combos.reshape((-1, h, b)).min(axis=2)
    # actually, try taking majority vote
#     hash_vals = (bit_combos.reshape((-1, h, b)).mean(axis=2) > .5).astype(np.int)
#     print hash_vals
    # OR together the hashes
#     bit_vals = hash_vals.max(axis=1)
    bit_vals = (hash_vals.mean(axis=1) >= .5).astype(np.int)
#     print bit_vals

    # for each possible set of 1-bit hash func values the
    # first obj could have (all of which are equiprobable),
    # compute the prob of each possible set of bits for the
    # second obj, given that corresponding bits agree with
    # prob p; since every bit combo for the first obj is
    # equiprobable, what we actually want is just the overall
    # prob of the reduced bits agreeing vs disagreeing
    p_eq = np.zeros_like(p)
    p_neq = np.zeros_like(p)
    for i, bits0 in enumerate(bit_combos):
        for j, bits1 in enumerate(bit_combos):
            num_diffs = hamming_dist(bits0, bits1)
            prob = p**(total_bits - num_diffs) * (1-p)**(num_diffs)
            
            if bit_vals[i] == bit_vals[j]:
                p_eq += prob
            else:
                p_neq += prob

    denom = 2. ** total_bits
    return p_eq / denom, p_neq / denom

p = np.array([.001, .3, .4, .5, .6, .7, .8, .9, 1.]) - .001 # 3 dec places for printing
p_eq, p_neq = and_or_curve(p)
print "p, p after hash func:"
print p
print p_eq
plt.figure(figsize=(4, 4))
plt.plot(p, p_eq)
plt.plot(p, p, 'k--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()
# print p_neq


p, p after hash func:
[ 0.     0.299  0.399  0.499  0.599  0.699  0.799  0.899  0.999]
[ 0.375  0.421  0.458  0.507  0.57   0.649  0.745  0.861  0.999]
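
A Monte Carlo version of the same AND-within-hash, majority-across-hashes reduction, as a check on the exact enumeration above; the trial count is arbitrary.

def and_or_curve_mc(p, b=2, h=2, trials=200000):
    # simulate pairs of raw bit vectors whose corresponding bits agree w.p. p,
    # then apply the same reduction as and_or_curve and measure agreement
    total_bits = b * h
    bits0 = np.random.rand(trials, total_bits) < .5
    agree = np.random.rand(trials, total_bits) < p
    bits1 = np.where(agree, bits0, ~bits0)
    def reduce_bits(bits):
        hash_vals = bits.reshape((-1, h, b)).min(axis=2)  # AND within each hash
        return hash_vals.mean(axis=1) >= .5               # majority vote across hashes
    return np.mean(reduce_bits(bits0) == reduce_bits(bits1))

print [round(and_or_curve_mc(pp), 3) for pp in (.3, .5, .7, .9)]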

In [4]:
# let's try / analyze hypersphere hashing
def p_ones(p0, cdfd, k, approximate=False):
    p0_bar = 1. - p0
    if approximate:
        term1 = 2. * (p0_bar ** k)
        term2 = ((1. - p0 * cdfd) * p0_bar) ** k
        return 1. - term1 + term2
    # looks like this is actually more convex, which is good;
    # seemingly not quite monotonic for k = 1 though
    cdf_diff = cdfd - cdfd*cdfd
    left_parens = 2. - ((1. - p0*cdfd) ** k)
    right_parens = p0_bar + p0*p0*cdf_diff
    return 1. - left_parens * right_parens ** k

def p_zeros(p0, cdfd, k):
    p_d = 1. - cdfd
    return (1. - 2*p0 + p0*p_d)**k

def plot_collision_probs(alphas, cdfs, ks, xvals=None, xlabel=None):
    xvals = xvals if xvals is not None else 1. - cdfs
    xlabel = xlabel or "Fraction of volume shared (cdf)"
    
    for alpha in alphas:
        fig, axes = plt.subplots(1, 3, figsize=(14*.8, 4*.8))
        fig.suptitle("alpha = " + str(alpha))
        for k in ks:
            p0_bar = alpha ** (1. / k)
            p0 = 1. - p0_bar
            p_1s = p_ones(p0, cdfd=cdfs, k=k)
            p_0s = p_zeros(p0, cdfd=cdfs, k=k)
            
            axes[0].plot(xvals, p_0s)
            axes[0].set_ylabel("p(both zeros)")
            
            axes[1].plot(xvals, p_1s)
            axes[1].set_ylabel("p(both ones)")
    
            axes[2].plot(xvals, p_0s + p_1s)
            axes[2].set_ylabel("p(same)")
            axes[2].set_ylim([0, 1])
            
            for ax in axes:
                ax.set_ylim([0, 1])
                ax.legend(["k=" + str(s) for s in ks], loc='best')
                ax.set_xlabel(xlabel)

from scipy.special import erf
def cdf_for_dist(dist, dims): # assumes 0 <= dist < 1
    return erf(dist * np.sqrt((dims + 2) / 2.))

cdfs = np.arange(0, 1.1, .1)
ks = [1, 2, 4, 8]
alphas = [.25, .5, .75]
plot_collision_probs(alphas, cdfs, ks)



In [5]:
def plot_collision_probs_vs_dist(ks, Ds):
    ks, Ds = np.asarray(ks), np.asarray(Ds)
#     alphas = [.5, .6] # .6 has noise collision prob of like .55, not .5; still pretty darn good
    alphas = [.5]
    dists = np.linspace(0, 1, 20)
    for D in Ds:
        # larger k -> smaller circles for a given p0; this is the coefficient
        # a such that k(ar)^d = r^d, where d is just Ds[0] here as a "reference" dimensionality
        # basically, it yields the same total volume of all the hyperspheres
        #
        # EDIT: actually, can't do this because it's different for different k values
        #
#         radius_shrinkage = 1. / ks ** (1. / D / Ds[0])
#         dists = np.minimum(dists_orig * radius_shrinkage, 1.)
        
        cdfs = cdf_for_dist(dists, D)
        plot_collision_probs(alphas, cdfs, ks, xvals=dists, xlabel='Distance')
        plt.suptitle('D = ' + str(D))

ks = [1, 2, 4, 8]
Ds = [4, 16, 64, 256]
plot_collision_probs_vs_dist(ks, Ds)



In [10]:
dists = np.linspace(0, 1, 20)
for D in Ds:
    r = 1.  # arbitrary value
    volumes = (r * dists) ** D
    plt.plot(dists, volumes / np.max(volumes))
plt.legend(["D=" + str(s) for s in Ds], loc='best')
plt.show()



In [11]:
for row in np.arange(5).reshape((1, -1)):
    print row


[0 1 2 3 4]