In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline
%load_ext autoreload
%autoreload 2
np.set_printoptions(precision=3)
A = np.arange(12).reshape((4,3))
x = np.arange(9)
l = [1,3,2,4]
d = {'a': 1, 'b': 2, 'c': 3}
In [2]:
t = np.linspace(np.pi, 2*np.pi, 100)
y = (1 + np.cos(t)) / 2.
x = t / np.pi - 1
plt.plot(x, y)
plt.plot(x, y*y)
plt.plot(x, y*y * y*y)
plt.plot(x, y*y * y*y * y*y)
plt.show()
In [3]:
# alright, so above is incorrect; this is actually what you get
# when you do signed random projections LSH
t = np.linspace(-1, 1, 100)
y = 1 - np.arccos(t) / np.pi
x = t
# 1 bit to 8 bits
y2 = y*y
y4 = y2 * y2
y6 = y4 * y2
y8 = y4 * y4
y16 = y8 * y8
colors = 'bgrcmykw'
plt.figure(figsize=(8, 7))
yvals = [y, y2, y4, y8, y16]
powers = [1, 2, 4, 8, 16]
for i, yval in enumerate(yvals):
plt.plot(x, yval, color=colors[i])
plt.plot(x, np.minimum(1, yval * (2 ** powers[i]) / 2 ), '--', color=colors[i])
# plt.plot(x, np.minimum(1, yval * 2), '--', color=colors[i])
labels = [(str(s), str(s)) for s in powers] # second entry in each pair labels the scaled (dashed) curve
labels = reduce(lambda l1, l2: l1 + l2, labels)
plt.legend(labels, loc='upper left')
plt.show()
# okay, so it looks like using 4 signed random projections should be good;
# even just 2 might work; could also use two hash vals for the query to
# get double the collision probability (roughly) and it would also be fine
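# Quick empirical check (my addition, not part of the original analysis): for
# signed random projections, two vectors with cosine similarity s should agree
# on a hash bit with probability 1 - arccos(s)/pi. This draws random
# hyperplanes and compares the measured agreement rate to that curve; the
# helper name and parameters below are placeholders I picked.
def srp_collision_rate(u, v, n_planes=10000):
    planes = np.random.randn(n_planes, len(u))
    return np.mean((planes.dot(u) > 0) == (planes.dot(v) > 0))

u = np.random.randn(32)
u /= np.linalg.norm(u)
v = np.random.randn(32)
v /= np.linalg.norm(v)
s = u.dot(v)
print "cos sim:                  {:.3f}".format(s)
print "empirical collision rate: {:.3f}".format(srp_collision_rate(u, v))
print "1 - arccos(s)/pi:         {:.3f}".format(1 - np.arccos(s) / np.pi)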
In [9]:
def amp(sims, power, scale=1):
sims = np.minimum(1, sims)
p = 1 - np.arccos(sims) / np.pi
activation = np.power(p, power) * scale
return activation
mu = 0
sigma = .15
sims = np.random.randn(int(1e5)) * sigma + mu
# sims = np.sqrt(np.abs(sims)) * np.sign(sims)
# sims = .5 + .5 * sims
# sims = .7 + .3 * sims
bits = 4
num_hashes = 64 * 8 / bits
amped = amp(sims, bits, num_hashes)
# amped /= num_hashes
plt.figure(figsize=(12, 5))
plt.hist(sims, 50)
plt.xlim((0, 1))
# ax = plt.subplots(1)
plt.figure(figsize=(12, 5))
# plt.xlim((0, np.max(amped)))
plt.xlim((0, num_hashes))
# plt.xlim((0, 1))
plt.hist(amped, 50)
# what if we replace sims with probs?
sims = amp(sims, 1)
top_k_orig = np.sort(sims)[-10:][::-1]
top_k_amped = np.sort(amped)[-10:][::-1]
# print "orig mean: ", np.mean(sims)
# print "amped mean: ", np.mean(amped)
print "orig best sims: ", top_k_orig
# print "orig best probs: ", 1 - np.arccos(top_k_orig) / np.pi
print "amped best sims: ", top_k_amped
# compute std of how many collisions there will actually be
# sketch_variances_amped = top_k_amped * (2.**-bits) * (1 - 2.**-bits)
sketch_p = top_k_amped / num_hashes
sketch_variances_amped = num_hashes * sketch_p * (1 - sketch_p)
sketch_stds_amped = np.sqrt(sketch_variances_amped) # sqrt(np(1-p))
print "sketch stds:\t", sketch_stds_amped
print "sketch_p: ", sketch_p
#print
#print "sketch stds amped: ", sketch_stds_amped
#print "lower 95% ci amped: ", top_k_amped - 1.96 * sketch_stds_amped
# print "\nstds: orig = {:.4}, amped = {:.4}\n".format(std_orig, std_amped)
# rel contrast is a weird measure if sims are 0-centered cuz it
# tends to become enormous as mean approaches 0
# print "orig rel contrasts: ", top_k_orig / np.mean(sims)
# print "amped rel contrasts: ", top_k_amped / np.mean(amped)
gaps_orig = top_k_orig - np.mean(sims)
gaps_amped = top_k_amped - np.mean(amped)
std_orig = np.std(sims)
std_amped = np.std(amped)
normed_gaps_orig = gaps_orig / std_orig
normed_gaps_amped = gaps_amped / std_amped
# print "orig gaps: ", gaps_orig
# print "amped gaps: ", gaps_amped
print "orig sigma gaps:\t", normed_gaps_orig
print "amped sigma gaps:\t", normed_gaps_amped
print "ratio:\t\t\t", normed_gaps_amped / normed_gaps_orig
# so what if we take into account the variance of the binomial
# describing how many collisions we actually get? Because if this
# variance is high, we'll need to look at stuff with many fewer
# than the expected number of collisions to avoid false negatives
#
# okay, it appears that with 32B of 4bit hashes, this std is pretty small
#
# sketch_stds_over_sigma = sketch_stds_amped / std_amped
# sketch_lower_68_ci = normed_gaps_amped - 1 * sketch_stds_over_sigma
# sketch_lower_95_ci = normed_gaps_amped - 1.96 * sketch_stds_over_sigma
#
# EDIT: let's try this again...
sketch_lower_68_ci = (gaps_amped - 1 * sketch_stds_amped)
sketch_lower_95_ci = (gaps_amped - 1.96 * sketch_stds_amped)
sketch_lower_68_ci = sketch_lower_68_ci / std_amped
sketch_lower_95_ci = sketch_lower_95_ci / std_amped
print
print "sketch lower 95% ci:\t", sketch_lower_95_ci
distro = stats.norm()
# distro = stats.norm(loc=np.mean(amped), scale=std_amped)
N = int(1e6)
print "\nExpected number of points this extreme for N = {:g}".format(N)
print "orig:\t\t", N * distro.sf(normed_gaps_orig)
print "amped:\t\t", N * distro.sf(normed_gaps_amped)
print "amped 68% ci:\t", N * distro.sf(sketch_lower_68_ci)
print "amped 95% ci:\t", N * distro.sf(sketch_lower_95_ci)
# plt.plot(t, amped)
# plt.plot(amped)
plt.show()
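# Sanity check (my addition): the cell above models the number of colliding
# hashes as Binomial(num_hashes, p), with std sqrt(n*p*(1-p)). This simulates
# that binomial for one hypothetical collision prob and compares the sample
# std to the formula; the value p_collide = .3 is arbitrary.
n_trials = 10000
p_collide = .3
counts = np.random.binomial(int(num_hashes), p_collide, size=n_trials)
print "sample std:    {:.3f}".format(np.std(counts))
print "sqrt(np(1-p)): {:.3f}".format(np.sqrt(num_hashes * p_collide * (1 - p_collide)))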
In [10]:
# note the increasing ratios; moving from 4 sigma to 5 sigma prunes
# a higher fraction of points than moving from 2 sigma to 3 sigma
print "2:\t", N * distro.sf(2)
print "2.5:\t", N * distro.sf(2.5)
print "3:\t", N * distro.sf(3)
print "3.5:\t", N * distro.sf(3.5)
print "4:\t", N * distro.sf(4)
print "4.5:\t", N * distro.sf(4.5)
print "5:\t", N * distro.sf(5)
In [11]:
p = np.linspace(0, 1, 100)
plt.plot(p, p / (1-p))
plt.show()
In [12]:
# ya, so convex combo of sims with a constant doesn't mess
# with number of sigma the top points are from the mean at all
augmented_sims = .5 + .5 * sims
std_augmented = np.std(augmented_sims)
top_k_augmented = .5 + .5 * top_k_orig
gaps_augmented = top_k_augmented - np.mean(augmented_sims)
normed_gaps_augmented = gaps_augmented / std_augmented
print "orig normed gaps:\t", normed_gaps_orig
print "augmented normed gaps:\t", normed_gaps_augmented
In [13]:
# alright, let's see what combinations of high and low probs actually
# do better with amp codes (slash, whether *any* do)
#
# okay, so it appears that amp codes are basically never better given
# that they take more bits per hash; if we pretend they still just take
# one bit, they're *slightly* better when sims are above .5, and a lot
# better when they're close to 1
#
# ah, so I think a thing that's happening is that this is worse because
# we're using the pooled std, while the above is just looking at
# the 95% conf interval around the higher prob's mean, so it caps at
# using two of the higher sigma; if we do this 4 or 8 bits becomes better
# for high probs (.8 or more) vs lower probs (roughly .3, give or take)
#
# also, if we just use the 2 sigma conf interval and also pretend that
# n doesn't have to be lower, then amp codes wreck; what we really need
# is a way to not have n go down.
# -maybe bootstrap resampling? implemented by shifting stuff to look
# at more groups of 4 or whatever?
# -easy to implement; just repeatedly shift, mask, cmpeq, add (see the sketch after this cell)
#
# I think the reason the above simulations do better is that we happen
# to have high enough probs that it hits the regime where amp codes
# are better; it's much worse if the best probs are below like .7.
# default_probs = np.arange(.01, .99, .01)
# default_probs = np.arange(.5, .99, .01)
default_probs = np.arange(.4, .99, .01)
def compute_gaps(probs, n):
probs = np.asarray(probs)
num_probs = len(probs)
means = n * probs
variances = means * (1 - probs)
# okay, ya, it does seem to be computing stuff right
# print "probs: ", probs
# print "means: ", means
# print "variances: ", variances
# print "sum variance: ", np.sum(variances)
# print "sum sqrt: ", np.sqrt(np.sum(variances))
# mean_gap = means[1] - means[0]
# print "mean gap: ", mean_gap
# print "sigma gap: ", mean_gap / np.sqrt(np.sum(variances))
gaps = np.zeros((num_probs, num_probs))
for i in range(num_probs):
# end = min(num_probs, i + 60)
end = num_probs
for j in range(i, end):
sum_variances = variances[i] + variances[j]
sum_std = np.sqrt(sum_variances)
diff = means[j] - means[i]
diff -= 1.96 * np.sqrt(variances[j])
# diff -= 1 * np.sqrt(variances[j])
diff = np.maximum(0, diff)
gap = diff / np.sqrt(variances[i])
# gap = diff / sum_std
# even just using one is worse
# gap = diff / np.sqrt(variances[j])
# gap = diff / np.sqrt(variances[i])
gaps[i, j] = gaps[j, i] = gap
return gaps
def gaps_for_nbits(n, nbits, p=default_probs):
p = np.asarray(p)
hash_p = p ** nbits
# num_hashes = n / nbits
# note that if we ignore increased bit costs (below) and
# also just look at 95% conf interval, amp codes wreck
num_hashes = n
# num_hashes = n / np.sqrt(nbits)
return compute_gaps(probs=hash_p, n=num_hashes)
# print gaps_for_nbits(4, 1, p=[.4, .5])
# print
# print gaps_for_nbits(4, 2, p=[.4, .5])
# # import sys; sys.exit()
# return
n0 = 16 * 8
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes = axes.ravel()
all_gaps = []
bit_counts = [1, 2, 3, 4]
for nbits in bit_counts:
gaps = gaps_for_nbits(n0, nbits, p=default_probs)
all_gaps.append(gaps)
for i, gaps in enumerate(all_gaps[1:]): # subtract off gaps using one bit
gaps -= all_gaps[0]
# all_gaps[i + 1] = np.maximum(0, gaps)
all_gaps[i + 1] = gaps
# max_val = np.max([np.max(gaps) for gaps in all_gaps])
# max_val = min(4, max_val)
max_val = 5
for i, nbits in enumerate(bit_counts):
gaps = all_gaps[i]
gaps = np.minimum(gaps, max_val)
gaps = np.maximum(gaps, -max_val)
gaps[0, 0] = max_val # so all will have the same scale
gaps[-1, -1] = -max_val # so 0 is in the middle
axes[i].set_title(nbits)
# im = axes[i].imshow(gaps, cmap='seismic')
# im = axes[i].imshow(gaps, cmap='gist_ncar')
min_p, max_p = np.min(default_probs), np.max(default_probs)
extent = [min_p, max_p, max_p, min_p]
im = axes[i].imshow(gaps, cmap='CMRmap', interpolation='none', extent=extent)
plt.tight_layout() # must be before we add the colorbar
fig.subplots_adjust(right=0.8)
cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
fig.colorbar(im, cax=cbar_ax)
plt.show()
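# Sketch of the "shift, mask, cmpeq, add" idea from the comment block at the
# top of this cell (my interpretation, untested as a real kernel): instead of
# comparing only n/nbits disjoint nbits-bit groups, also compare shifted
# (overlapping) groups so the number of comparisons doesn't drop. Here the
# codes are plain boolean arrays; a real implementation would do this on
# packed words with bitwise shifts and masks. All names below are mine.
def count_group_matches(code_a, code_b, nbits, shifts=None):
    n = len(code_a)
    shifts = range(nbits) if shifts is None else shifts
    total = 0
    for s in shifts:
        usable = ((n - s) // nbits) * nbits  # largest multiple of nbits that fits
        a = code_a[s:s + usable].reshape((-1, nbits))
        b = code_b[s:s + usable].reshape((-1, nbits))
        total += np.sum(np.all(a == b, axis=1))  # "cmpeq" whole groups, then add
    return total

code_a = np.random.rand(128) < .5
code_b = code_a.copy()
flip = np.random.rand(128) < .2  # ~80% per-bit agreement
code_b[flip] = ~code_b[flip]
print "group matches (with shifts):", count_group_matches(code_a, code_b, nbits=4)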
In [14]:
# alright, so let's see if we can OR together bit vects and get
# a probabilistic lower bound to let us look at 4 of them at once
# (see the note on the unanimity lower bound after this cell)
N = int(1e4)
D = 64
Q = 4
bits = np.random.rand(N, D) < .5
queries = np.random.rand(Q, D) < .5
row_sums = np.sum(bits, axis=1)
print np.mean(row_sums)
print np.std(row_sums)
print
# idxs = np.random.randint(N, size=100)
# idxs = np.random.randint(N, size=3)
def top_k_idxs(elements, k, smaller_better=False):
if smaller_better:
which_nn = np.arange(k)
return np.argpartition(elements, kth=which_nn)[:k]
else:
which_nn = len(elements) - 1 - np.arange(k)
return np.argpartition(elements, kth=which_nn)[-k:][::-1]
def hamming_dist(v1, v2):
return np.count_nonzero(v1 != v2)
which_nn = N - 1 - np.arange(10)
# number of neighbors to choose from, number of vects in cluster
c = 4
k = c + 2
for q in queries:
print
# sims = bits.dot(q)
# print sims.shape
# print sims[:20]
# sims = bits.dot(q.reshape((-1, 1)))
# sims = np.sum(bits * q, axis=1)
# print "{} +/- {}".format(np.mean(sims), np.std(sims))
diffs = q != bits
dists = np.array([np.count_nonzero(row) for row in diffs])
print "{} +/- {}".format(np.mean(dists), np.std(dists))
nn_idxs = top_k_idxs(dists, k, smaller_better=True) # 6nn
print nn_idxs
print dists[nn_idxs]
tiebreaker = np.zeros(D, dtype=np.int)
for omit_which1 in range(k):
if c % 2 == 0: # even number -> need a tiebreaker
tiebreaker_idx = nn_idxs[omit_which1]
tiebreaker = bits[tiebreaker_idx]
for omit_which2 in range(omit_which1 + 1, k):
use_which_idxs = list(nn_idxs)
del use_which_idxs[omit_which1]
del use_which_idxs[omit_which2 - 1]
use_rows = bits[use_which_idxs]
assert(len(use_rows) == c)
sums = np.sum(use_rows, axis=0) + tiebreaker
cluster_bits = sums > (c // 2)
cluster_dists = [hamming_dist(r, cluster_bits) for r in use_rows]
mean_dist = np.mean(cluster_dists)
std_dists = np.std(cluster_dists)
unanimity_sums = np.sum(use_rows, axis=0)
unanimous_idxs = (unanimity_sums == 0) + (unanimity_sums == c)
num_unanimous_idxs = np.sum(unanimous_idxs)
q_dist = hamming_dist(q, cluster_bits)
print "{}: {:.3f} + {:.3f} =\t{:.3f} -> {}, {}".format(
use_which_idxs,
mean_dist, std_dists, mean_dist + std_dists,
num_unanimous_idxs, q_dist)
# if c == 4 or c == 5:
# agreeing_idxs = (sums <= 1) + (sums >= 4)
# elif c == 3:
# agreeing_idxs = (sums == 0) + (sums == 3)
# print np.sum(agreeing_idxs), np.count_nonzero(cluster_bits != q)
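# Note on the lower bound idea (my addition): wherever all cluster members
# agree on a bit and q disagrees with that shared value, q differs from
# *every* member at that position, so the count of such positions is a hard
# lower bound on q's hamming distance to any member of the cluster. Quick
# check with a random cluster; the variable names here are mine.
cluster = np.random.rand(4, D) < .5
q_check = np.random.rand(D) < .5
col_sums = np.sum(cluster, axis=0)
unanimous = (col_sums == 0) | (col_sums == len(cluster))
shared_val = col_sums[unanimous] > 0  # the bit value all members share there
lower_bound = np.sum(q_check[unanimous] != shared_val)
true_min = min(hamming_dist(q_check, row) for row in cluster)
print "lower bound: {}, true min dist to a member: {}".format(lower_bound, true_min)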
In [15]:
def pick_cluster(q, neighbors):
k = len(neighbors)
c = k + 1 - 2  # q together with its neighbors is 2 more than c
D = len(q)
even_cluster_size = (c % 2) == 0
use_rows = np.empty((c, D))
use_rows[0] = np.copy(q)
# TODO systematically compare this to the fancy approach;
# based on informal experimentation, no discernible difference
use_rows[1:] = neighbors[:-2]
centroid = np.sum(use_rows, axis=0)
if even_cluster_size:
centroid += neighbors[k-1]
centroid = centroid > (c // 2)
return centroid, np.arange(k-1)
best_loss = np.inf
best_centroid = None
best_neighbor_idxs = None
for omit_which1 in range(k):
if even_cluster_size:
tiebreaker_idx = nn_idxs[omit_which1]
tiebreaker = bits[tiebreaker_idx]
for omit_which2 in range(omit_which1 + 1, k):
# compute which neighbors we're using; note that we
# always use the query
use_which_idxs = list(range(k))
del use_which_idxs[omit_which1]
del use_which_idxs[omit_which2 - 1]
assert(len(use_which_idxs) == c - 1)
use_rows[1:, :] = neighbors[use_which_idxs]
# now compute the cluster centroid, which is a bitstring
sums = np.sum(use_rows, axis=0)
if even_cluster_size:
sums += tiebreaker
centroid = sums > (c // 2)
# compute loss associated with this centroid
cluster_dists = [hamming_dist(r, centroid) for r in use_rows]
mean_dist = np.mean(cluster_dists)
# std_dists = np.std(cluster_dists)
# loss = mean_dist + std_dists
loss = mean_dist
if loss < best_loss:
best_loss = loss
best_centroid = centroid
best_neighbor_idxs = use_which_idxs
# print "{}: {:.3f}".format(use_which_idxs, loss)
return best_centroid, np.array(best_neighbor_idxs)
# so it appears that c = 4 yields consistently lower dists to
# query than c = 3; former has 15 of biggest I saw in like 100
# queries, while latter has 15+ all the time; c = 6 seems to also
# be about the same; c = 8 has even lower dists to query...wat?
N = int(1e4)
D = 64
Q = 4
bits = np.random.rand(N, D) < .5
queries = np.random.rand(Q, D) < .5
c = 8
k = c + 2
# okay, so it appears that if the seed used to pick what's in the
# cluster is q's nn, then the seed's neighbors are almost all just
# q's neighbors and the centroid of the cluster is almost identical
# to both the seed and q
#
# what makes absolutely no sense is that, as c increases, q's dist
# to the seed's centroid keeps going down, while the seed's dist
# to its centroid approaches the avg (min?) dist to one of its neighbors
for q in queries:
print '------------------------'
print "q:\t\t", q[:20].astype(np.int)
diffs = q != bits
dists = np.array([np.count_nonzero(row) for row in diffs])
print "{} +/- {}".format(np.mean(dists), np.std(dists))
q_nn_idxs = top_k_idxs(dists, k + (not k % 2), smaller_better=True)
print dists[q_nn_idxs][:20]
# find nearest neighbors of q's nearest neighbor (the 'seed')
# EDIT: actually, just pick a pretty close neighbor, not closest
# seed_idx = np.argmin(dists)
seed_idx = q_nn_idxs[np.random.randint(k)]
seed_dist = dists[seed_idx]
seed = bits[seed_idx]
diffs = seed != bits
seed_dists = np.array([np.count_nonzero(row) for row in diffs])
nn_idxs = top_k_idxs(seed_dists, c + 2, smaller_better=True)
nn_idxs = nn_idxs[1:] # seed will find itself as 1nn
nn_idxs = np.random.randint(N, size=len(nn_idxs)); seed = bits[np.random.randint(N)] # TODO rm
seed_neighbors = bits[nn_idxs]
# print "seed nn idxs: ", nn_idxs
# print "q dist to seed: ", seed_dist
# q_neighbors = bits[q_nn_idxs]
# q_neighbor_sums = np.sum(q_neighbors, axis=0)
# # q_neighbor_sums += q
# q_centroid = q_neighbor_sums > (len(q_neighbors) // 2)
# print "q to its centroid:\t", hamming_dist(q, q_centroid)
# print "seed to its centroid: ", hamming_dist(seed, q_centroid)
# q_seed_sums = np.sum(seed_neighbors, axis=0) + seed
# q_seed_centroid = q_seed_sums > (len(seed_neighbors) // 2)
# print "q to seed centroid:\t", hamming_dist(q, q_seed_centroid)
# print "seed to seed centroid:\t", hamming_dist(seed, q_seed_centroid)
# break
# hmm; replacing nn_idxs with random ones sends dists back up to noise levels...
# nn_idxs = np.random.randint(N, size=len(nn_idxs))
# print "seed dists: ", seed_dists[nn_idxs[:20]]
# print "min q to seed neighbor: ", np.min([hamming_dist(q, row) for row in seed_neighbors])
# print
# build the cluster the seed would be in
# print "q is seed? ", q is seed
centroid, used_idxs = pick_cluster(seed, seed_neighbors)
# print "centroid:\t", centroid[:20].astype(np.int)i
# alright, so it has a reasonable dist when we don't use real cluster
# used_idxs = np.array([0])
# centroid = seed_neighbors[np.random.randint(c)]
# compare dists to this cluster and the rows within it
q_dists = [seed_dist] + [hamming_dist(q, row) for row in seed_neighbors]
used_q_dists = np.array(q_dists)[used_idxs + 1]
used_q_dists = [seed_dist] + list(used_q_dists)
seed_centroid_dist = hamming_dist(seed, centroid)
# print "q is centroid? ", q is centroid
# print "q:\t\t", q[:20].astype(np.int)
# print "centroid:\t", centroid[:20].astype(np.int)
q_centroid_dist = hamming_dist(q, centroid)
# print q[:10]
# print centroid[:10]
# print "seed dist to centroid: ", hamming_dist(centroid, seed)
# print "{} ({}) {}".format(q_dists, used_q_dists, q_centroid_dist)
# print "{} {}".format(used_q_dists, q_centroid_dist)
# print
print "{}: seed, query dist to centroid: {}, {}".format(
used_q_dists, seed_centroid_dist, q_centroid_dist * 64 / D)
In [15]:
# let's see how well points match the centroids of their knn in euclidean space
#
# So it appears that there's a positive correlation between dist to centroid of
# the nearest neighbors and dist to the seed, but it's not super high for iid
# gaussian data.
#
# Also, even dist to 1nn appears to be not super correlated with true dist,
# which bodes poorly for quantization attempts
import pandas as pd
def dists_sq(X, q):
diffs = X - q
return np.sum(diffs * diffs, axis=(len(X.shape) - 1)) # handle 1D X
def randwalk(*args):
ret = np.random.randn(*args)
ret = np.cumsum(ret, axis=-1)
return ret / np.linalg.norm(ret) * ret.shape[-1]
N = int(2e4)
D = 128
Q = 200
data_func = np.random.randn
# data_func = np.random.rand # much better corr with dist to orig point (.7 vs .62 in 128D)
# data_func = randwalk
X = data_func(N, D)
queries = data_func(Q, D)
# np.set_printoptions(precision=3)
# TODO self try various weighting schemes to come up with centroid
def try_query(X, q, k):
D = X.shape[-1]
seed = data_func(D)
dists = dists_sq(X, seed)
knn_idxs = top_k_idxs(dists, k, smaller_better=True)
neighbors = X[knn_idxs]
# weights = 1. / np.arange(1, k + 1)
weights = 1. / dists[knn_idxs]
# weights = np.exp(-dists[knn_idxs])
weights = (weights / np.sum(weights)).reshape((-1, 1))
nn_centroid = np.sum(neighbors * weights, axis=0)
# multiplying by 2 to un-shrink it (away from 0) seems to help
# nn_centroid *= np.sqrt(k)
nn_centroid *= 2
centroid_dist = dists_sq(nn_centroid, q) / D
seed_dist = dists_sq(seed, q) / D
# print "seed dists to its nn:\t\t", dists[knn_idxs][:10] / D
seed_centroid_dist = dists_sq(nn_centroid, seed) / D
# print "seed to nn centroid, zeros:\t{:.3f}, {:.3f}".format(seed_centroid_dist, np.var(seed))
# print "q to seed, centroid, zeros:\t{:.3f}, {:.3f}, {:.3f}".format(seed_dist, centroid_dist, np.var(q))
return seed_dist, centroid_dist, seed_centroid_dist
k = 32
q_stats = []
for q in queries:
q_stats.append(try_query(X, q, k))
q_stats = np.array(q_stats)
df = pd.DataFrame(q_stats, columns=['q_seed', 'q_centroid', 'seed_centroid'])
# print stats
x, y = df['q_seed'], df['q_centroid']
# x, y = np.sqrt(x), np.sqrt(y)
plt.scatter(x, y)
plt.show()
print stats.pearsonr(x, y)
print np.sum(df['q_centroid'] < 1.1) / float(Q)
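# On the "*= 2 to un-shrink" line in try_query (my note): if the neighbors
# were roughly independent zero-mean points and the weights sum to 1, each
# coordinate of the weighted mean has variance ~ sum(w_i^2) times the
# per-point variance, so the centroid shrinks toward 0 by about
# sqrt(sum(w_i^2)); under that (idealized, iid) assumption the un-shrink
# factor would be 1/sqrt(sum(w_i^2)) rather than a fixed 2. The real
# neighbors are correlated, so treat this as a rough guide only.
k_chk = 32
pts = np.random.randn(k_chk, D)
w = np.ones(k_chk) / k_chk
centroid_chk = np.sum(pts * w.reshape((-1, 1)), axis=0)
print "per-coord variance of centroid: {:.4f}".format(np.var(centroid_chk))
print "sum(w^2):                       {:.4f}".format(np.sum(w * w))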
In [226]:
walk = np.cumsum(np.random.randn(128))
plt.plot(walk, lw=4)
plt.axis('off')
# plt.savefig('/Users/davis/Desktop/tmp3.pdf')
plt.show()
In [3]:
# can we construct a butterfly curve for probs by ANDing and then ORing to get 1 bit?
def hamming_dist(v1, v2):
return np.count_nonzero(v1 != v2)
def and_or_curve(p, b=2, h=2):
p = np.asarray(p)
total_bits = b * h
possible_events = np.arange(2**total_bits, dtype=np.uint8)
bit_combos = np.unpackbits(possible_events.reshape((-1, 1)), axis=1)
bit_combos = bit_combos[:, -total_bits:]
# print bit_combos
# AND together bits within each hash
hash_vals = bit_combos.reshape((-1, h, b)).min(axis=2)
# actually, try taking majority vote
# hash_vals = (bit_combos.reshape((-1, h, b)).mean(axis=2) > .5).astype(np.int)
# print hash_vals
# OR together the hashes
# bit_vals = hash_vals.max(axis=1)
bit_vals = (hash_vals.mean(axis=1) >= .5).astype(np.int)
# print bit_vals
# for each possible set of 1-bit hash func values the
# first obj could have (all of which are equiprobable),
# compute prob of each possible set of bits for this
# obj, given that they agree with prob p
# wait, no, each combo of bits will be equiprobable cuz
# will match one bit combo for first obj and differ in
# the same way from the rest of its bit combos; so what
# we actually want is the overall prob of them agreeing
# vs disagreeing
p_eq = np.zeros_like(p)
p_neq = np.zeros_like(p)
for i, bits0 in enumerate(bit_combos):
for j, bits1 in enumerate(bit_combos):
num_diffs = hamming_dist(bits0, bits1)
prob = p**(total_bits - num_diffs) * (1-p)**(num_diffs)
if bit_vals[i] == bit_vals[j]:
p_eq += prob
else:
p_neq += prob
denom = 2. ** total_bits
return p_eq / denom, p_neq / denom
p = np.array([.001, .3, .4, .5, .6, .7, .8, .9, 1.]) - .001 # 3 dec places for printing
p_eq, p_neq = and_or_curve(p)
print "p, p after hash func:"
print p
print p_eq
plt.figure(figsize=(4, 4))
plt.plot(p, p_eq)
plt.plot(p, p, 'k--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()
# print p_neq
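# For comparison (my addition): the textbook AND-then-OR amplification has a
# closed form. If one bit collides with prob p, AND-ing b bits collides with
# prob p**b, and OR-ing h such hashes collides with prob 1 - (1 - p**b)**h.
# The enumeration above takes a majority vote rather than a strict OR and
# tracks p(equal) rather than p(collide), so the curves won't match exactly;
# this is just the familiar S-curve for reference.
p_dense = np.linspace(0, 1, 100)
b_and, h_or = 2, 2
p_and_or = 1. - (1. - p_dense**b_and)**h_or
plt.figure(figsize=(4, 4))
plt.plot(p_dense, p_and_or)
plt.plot(p_dense, p_dense, 'k--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()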
In [4]:
# let's try / analyze hypersphere hashing
def p_ones(p0, cdfd, k, approximate=False):
p0_bar = 1. - p0
if approximate:
term1 = 2. * (p0_bar ** k)
term2 = ((1. - p0 * cdfd) * p0_bar) ** k
return 1. - term1 + term2
# looks like this is actually more convex, which is good;
# seemingly not quite monotonic for k = 1 though
cdf_diff = cdfd - cdfd*cdfd
left_parens = 2. - ((1. - p0*cdfd) ** k)
right_parens = p0_bar + p0*p0*cdf_diff
return 1. - left_parens * right_parens ** k
def p_zeros(p0, cdfd, k):
p_d = 1. - cdfd
return (1. - 2*p0 + p0*p_d)**k
def plot_collision_probs(alphas, cdfs, ks, xvals=None, xlabel=None):
xvals = xvals if xvals is not None else 1. - cdfs
xlabel = xlabel or "Fraction of volume shared (cdf)"
for alpha in alphas:
fig, axes = plt.subplots(1, 3, figsize=(14*.8, 4*.8))
fig.suptitle("alpha = " + str(alpha))
for k in ks:
p0_bar = alpha ** (1. / k)
p0 = 1. - p0_bar
p_1s = p_ones(p0, cdfd=cdfs, k=k)
p_0s = p_zeros(p0, cdfd=cdfs, k=k)
axes[0].plot(xvals, p_0s)
axes[0].set_ylabel("p(both zeros)")
axes[1].plot(xvals, p_1s)
axes[1].set_ylabel("p(both ones)")
axes[2].plot(xvals, p_0s + p_1s)
axes[2].set_ylabel("p(same)")
axes[2].set_ylim([0, 1])
for ax in axes:
ax.set_ylim([0, 1])
ax.legend(["k=" + str(s) for s in ks], loc='best')
ax.set_xlabel(xlabel)
from scipy.special import erf
def cdf_for_dist(dist, dims): # assumes 0 <= dist < 1
return erf(dist * np.sqrt((dims + 2) / 2.))
cdfs = np.arange(0, 1.1, .1)
ks = [1, 2, 4, 8]
alphas = [.25, .5, .75]
plot_collision_probs(alphas, cdfs, ks)
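# Quick probe (my addition): p_ones above has both an exact and an
# approximate branch; this just measures how far apart they are over the
# same cdf grid and k values used in the plots (alpha = .5), to get a feel
# for when the approximation is safe.
for k in ks:
    p0 = 1. - .5 ** (1. / k)  # same p0 as plot_collision_probs with alpha = .5
    exact = p_ones(p0, cdfd=cdfs, k=k)
    approx = p_ones(p0, cdfd=cdfs, k=k, approximate=True)
    print "k = {}: max |exact - approx| = {:.4f}".format(k, np.max(np.abs(exact - approx)))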
In [5]:
def plot_collision_probs_vs_dist(ks, Ds):
ks, Ds = np.asarray(ks), np.asarray(Ds)
# alphas = [.5, .6] # .6 has noise collision prob of like .55, not .5; still pretty darn good
alphas = [.5]
dists = np.linspace(0, 1, 20)
for D in Ds:
# larger k -> smaller circles for a given p0; this is the coefficient
# a such that k(ar)^d = r^d, where d is just Ds[0] here as a "reference" dimensionality
# basically, it yields the same total volume of all the hyperspheres
#
# EDIT: actually, can't do this because it's different for different k values
#
# radius_shrinkage = 1. / ks ** (1. / D / Ds[0])
# dists = np.minimum(dists_orig * radius_shrinkage, 1.)
cdfs = cdf_for_dist(dists, D)
plot_collision_probs(alphas, cdfs, ks, xvals=dists, xlabel='Distance')
plt.suptitle('D = ' + str(D))
ks = [1, 2, 4, 8]
Ds = [4, 16, 64, 256]
plot_collision_probs_vs_dist(ks, Ds)
In [10]:
dists = np.linspace(0, 1, 20)
for D in Ds:
r = 1. # arbitrary value
volumes = (r * dists) ** D
plt.plot(dists, volumes / np.max(volumes))
plt.legend(["D=" + str(s) for s in Ds], loc='best')
plt.show()
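# Worked number for the plot above (my addition): the fraction of a D-ball's
# volume inside radius fraction t is t**D, so the radius fraction enclosing
# half the volume is .5**(1/D) -- it crawls toward 1 as D grows, i.e. nearly
# all of the volume sits in a thin shell near the surface.
for D in Ds:
    print "D = {:3d}: half the volume lies beyond radius fraction {:.3f}".format(D, .5 ** (1. / D))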
In [11]:
for row in np.arange(5).reshape((1, -1)):
print row