In [21]:
# NOTE: most of the commented-out code was executed in a terminal and/or 
# datasets.py and subsequently pasted in here; consequently, it probably
# won't work out of the box.

import os
import numpy as np
from scipy.io import loadmat

%load_ext autoreload

%aimport utils
# 1 = autoreload only modules imported with %aimport
%autoreload 1
# %autoreload 2

DATA_DIR = os.path.expanduser("~/Desktop/datasets/nn-search/")

contig = np.ascontiguousarray
join = os.path.join

def save_to_data_dir(relative_path, X):
    np.save(join(DATA_DIR, relative_path), np.ascontiguousarray(X))

def load_from_data_dir(relative_path):
    return np.load(join(DATA_DIR, relative_path))

def loadmat_from_data_dir(relative_path):
    return loadmat(join(DATA_DIR, relative_path))

def read_yael_vecs(path, c_contiguous=True, limit_rows=-1, dtype=None):
    """Reads Yael-format .fvecs/.ivecs/.bvecs files, in which each row is
    stored as a 4-byte int32 length header followed by the vector itself."""
    # the first int32 in the file is the vector dimensionality
    dim = np.fromfile(path, dtype=np.int32, count=1)[0]
    print "vector length = {}".format(dim)

    if dtype is None:
        if 'fvecs' in path:
            dtype = np.float32
        elif 'ivecs' in path:
            dtype = np.int32
        elif 'bvecs' in path:
            dtype = np.uint8
        else:
            raise ValueError("couldn't infer dtype from path {}".format(path))
    itemsize = np.dtype(dtype).itemsize

    assert dim > 0
    assert itemsize in (1, 2, 4)

    # each row is a 4-byte header plus dim elements of size itemsize
    cols_for_dim = 4 // itemsize  # number of elements the header occupies
    row_size_bytes = 4 + dim * itemsize
    row_size_elems = row_size_bytes // itemsize
    limit = int(limit_rows) * row_size_elems if limit_rows > 0 else -1

    fv = np.fromfile(path, dtype=dtype, count=limit)
    fv = fv.reshape((-1, row_size_elems))

    # sanity check: every row's header should hold the same dimensionality
    if not np.all(fv.view(np.int32)[:, 0] == dim):
        raise IOError("Non-uniform vector sizes in " + path)

    fv = fv[:, cols_for_dim:]  # strip the length headers

    if c_contiguous:
        fv = fv.copy()
    return fv


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
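
read_yael_vecs assumes the layout described in its docstring: every row is a 4-byte int32 dimension header followed by the vector itself. A cheap way to sanity-check the reader is to round-trip a toy array through that layout; a minimal sketch (the toy data and temp-file path are made up for illustration):

In [ ]:
import tempfile

# write a tiny .fvecs file by hand: int32 dim header, then the float32 row
rng = np.random.RandomState(0)
X_toy = rng.randn(5, 3).astype(np.float32)
toy_path = join(tempfile.gettempdir(), 'toy.fvecs')
with open(toy_path, 'wb') as f:
    for row in X_toy:
        f.write(np.int32(row.shape[0]).tobytes())  # 4-byte dim header
        f.write(row.tobytes())

# reading it back should recover the array exactly
assert np.array_equal(read_yael_vecs(toy_path), X_toy)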

Gist1M


In [6]:
# # ------------------------ clean up gist (now broken)
# path = Paths.DATA_DIR + '/gist/gist_base.fvecs'
# path = Paths.DATA_DIR + '/gist/gist_learn.fvecs'
# path = Paths.DATA_DIR + '/gist/gist_queries.fvecs'
# path = Paths.DATA_DIR + '/gist/gist_groundtruth.ivecs'
# out_path = Paths.GIST_100
# out_path = Paths.GIST_200
# out_path = Paths.GIST_TRAIN
# out_path = Paths.GIST_QUERIES
# out_path = Paths.GIST_TRUTH
# X = read_yael_vecs(path)[:100000]
# X = read_yael_vecs(path)[:200000]
# X = read_yael_vecs(path)
# X = read_yael_vecs(path, dtype=np.int32)
# print X[:2]
# print X.shape
# np.save(out_path, X)
# truth_dir = data_dir + 'gnd/'
# # truth_idxs_files = ['idx_1M', 'idx_10M', 'idx_100M']
# truth_idxs_files = ['idx_1000M']
# for f in truth_idxs_files:
#     path = truth_dir + f + '.ivecs'
#     out_path = out_dir + f + '.npy'
#     print "unpacking {} to {}".format(path, out_path)
#     X = read_yael_vecs(path)
#     print X.shape
#     np.save(out_path, X)

Sift1M


In [7]:
# ------------------------ clean up sift1m
# data_dir = '/Volumes/MacHDD/datasets/sift1m/'
# out_dir = '/Volumes/MacSSD_OS/Users/davis/Desktop/datasets/sift1m/'
# for fname in os.listdir(data_dir):
#     in_path = data_dir + fname
#     out_path = out_dir + fname.split('.')[0] + '.npy'
#     print "unpacking {} to {}".format(in_path, out_path)
#     X = read_yael_vecs(in_path)
#     print X.shape
#     np.save(out_path, X)

Deep1M


In [54]:
# # ------------------------ clean up Deep1M
# data_dir = os.path.expanduser('~/Desktop/datasets/nn-search/deep1M-raw/')
# out_dir = os.path.expanduser('~/Desktop/datasets/nn-search/deep1M/')
# print "in dir, out dir:", data_dir, out_dir
# for fname in os.listdir(data_dir):
#     in_path = data_dir + fname
#     out_path = out_dir + fname.split('.')[0] + '.npy'
#     print "unpacking {} to {}".format(in_path, out_path)
#     X = read_yael_vecs(in_path)
#     print X.shape
#     np.save(out_path, X)

X_train = load_from_data_dir('deep1m/deep1M_learn.npy')
print X_train.shape
print np.isfortran(X_train) # false

Q = load_from_data_dir('deep1m/deep1M_queries.npy')
print Q.shape
print np.isfortran(Q) # false


(300000, 256)
False
(1000, 256)
False

In [56]:
truth_train = utils.compute_true_knn(X_train, Q)
print truth_train.shape
print truth_train.dtype


computing top k for query block 0 (queries 0-128)...
computing top k for query block 5 (queries 640-768)...
done
(1000, 1000)
int32
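
utils.compute_true_knn isn't shown in this notebook; judging by its log output, it brute-forces the exact top k in blocks of queries so the distance matrix stays small. A rough sketch of an equivalent (the block size and the dropped-constant trick are assumptions about utils, not its actual code):

In [ ]:
def compute_true_knn_sketch(X, Q, k=1000, block_sz=128):
    truth = np.empty((Q.shape[0], k), dtype=np.int32)
    X_norms = (X * X).sum(axis=1)  # ||x||^2 for every database row
    for start in range(0, Q.shape[0], block_sz):
        Q_block = Q[start:start + block_sz]
        # ||x - q||^2 = ||x||^2 - 2 x.q + ||q||^2; the ||q||^2 term is the
        # same for every x, so it can't change the ranking and is dropped
        dists = X_norms - 2 * np.dot(Q_block, X.T)
        # argpartition finds the k smallest per row in O(n); argsort then
        # orders just those k by distance
        knn = np.argpartition(dists, k, axis=1)[:, :k]
        for i in range(knn.shape[0]):
            truth[start + i] = knn[i][np.argsort(dists[i, knn[i]])]
    return truth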

In [57]:
save_to_data_dir('deep1m/deep1M_truth_train.npy', truth_train.astype(np.int32))

In [74]:
X_test = load_from_data_dir('deep1m/deep1M_base.npy')
print X_test.shape
print np.isfortran(X_test) # false


(1000000, 256)
False

In [63]:
truth_test = utils.compute_true_knn(X_test, Q)
print truth_test.shape
print truth_test.dtype


computing top k for query block 0 (queries 0-128)...
computing top k for query block 5 (queries 640-768)...
done
(1000, 1000)
int32

In [65]:
save_to_data_dir('deep1m/deep1M_truth_test.npy', truth_test.astype(np.int32))

In [75]:
save_to_data_dir('deep1m/deep1m_test_100k.npy', X_test[:100000, :])

Convnet 1M


In [24]:
# # ------------------------ clean up Convnet1M
# import numpy as np
# from scipy.io import loadmat
# d = loadmat('features_m_128.mat')
# contig = np.ascontiguousarray
# savedir = '../convnet1m/'
# np.save(savedir + 'convnet_train.npy', contig(d['feats_m_128_train']))
# np.save(savedir + 'convnet_test.npy', contig(d['feats_m_128_test']))
# np.save(savedir + 'convnet_base.npy', contig(d['feats_m_128_base']))

X_train = load_from_data_dir('convnet1m/convnet_train.npy')
print np.isfortran(X_train) # false
print X_train.shape
# if np.isfortran(X_train):
#     save_to_data_dir('convnet1m/convnet_train.npy', X_train)


False
(100000, 128)

In [6]:
Q = load_from_data_dir('convnet1m/convnet_queries.npy')
print Q.shape
print np.isfortran(Q)
# save_to_data_dir('convnet1m/convnet_queries.npy', Q)


(10000, 128)
False

In [11]:
# print Q[:20, :20]  # yep, looks like relu activations
dists = utils.sq_dists_to_vectors(Q, X_train)  # needed by the next cell
# print dists.shape
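
utils.sq_dists_to_vectors isn't shown either; presumably it's the standard vectorized expansion of squared Euclidean distance over all pairs. A sketch under that assumption:

In [ ]:
# ||q - x||^2 = ||q||^2 - 2 q.x + ||x||^2, computed for all pairs at once
def sq_dists_to_vectors_sketch(Q, X):
    Q_norms = (Q * Q).sum(axis=1)[:, np.newaxis]    # (nqueries, 1)
    X_norms = (X * X).sum(axis=1)[np.newaxis, :]    # (1, nrows)
    return Q_norms - 2 * np.dot(Q, X.T) + X_norms   # (nqueries, nrows)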

In [71]:
nqueries = Q.shape[0]
k = 1000
truth_train = np.full((nqueries, k), -999, dtype=np.int32)
print nqueries
# for i in range(100):
for i in range(nqueries):
    if i % 500 == 0:
        print "computing top k for query {}...".format(i)
    truth_train[i, :] = utils.top_k_idxs(dists[i, :], k)
print "done"


10000
computing top k for query 0...
computing top k for query 500...
computing top k for query 1000...
computing top k for query 1500...
computing top k for query 2000...
computing top k for query 2500...
computing top k for query 3000...
computing top k for query 3500...
computing top k for query 4000...
computing top k for query 4500...
computing top k for query 5000...
computing top k for query 5500...
computing top k for query 6000...
computing top k for query 6500...
computing top k for query 7000...
computing top k for query 7500...
computing top k for query 8000...
computing top k for query 8500...
computing top k for query 9000...
computing top k for query 9500...
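
utils.top_k_idxs is also unshown; for nearest neighbors (smallest distances) it is presumably the usual argpartition-then-argsort idiom, roughly:

In [ ]:
# indices of the k smallest entries of a 1-D array, in ascending order
def top_k_idxs_sketch(dists, k):
    idxs = np.argpartition(dists, k)[:k]  # k smallest, in arbitrary order
    return idxs[np.argsort(dists[idxs])]  # order those k by distance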

In [74]:
assert not np.any(truth_train == -999)
save_to_data_dir('convnet1m/truth_train.npy', truth_train.astype(np.int32))

In [71]:
X_test = load_from_data_dir('convnet1m/convnet_test.npy')
print X_test.shape
print np.isfortran(X_test)
# if np.isfortran(X_test):
#     save_to_data_dir('convnet1m/convnet_test.npy', X_test)


(1000000, 128)
False

In [37]:
# truth_test = utils.compute_true_knn(X_test[:1000], Q[:130], block_sz=128)
truth_test = utils.compute_true_knn(X_test, Q)
print truth_test.shape
print truth_test.dtype


WARNING: sq_dists_to_vectors: attempting to create a matrix of size 128000000
[the same warning repeats before every block; repetitions omitted]
computing top k for query block 0 (queries 0-128)...
computing top k for query block 5 (queries 640-768)...
computing top k for query block 10 (queries 1280-1408)...
computing top k for query block 15 (queries 1920-2048)...
computing top k for query block 20 (queries 2560-2688)...
computing top k for query block 25 (queries 3200-3328)...
computing top k for query block 30 (queries 3840-3968)...
computing top k for query block 35 (queries 4480-4608)...
computing top k for query block 40 (queries 5120-5248)...
computing top k for query block 45 (queries 5760-5888)...
computing top k for query block 50 (queries 6400-6528)...
computing top k for query block 55 (queries 7040-7168)...
computing top k for query block 60 (queries 7680-7808)...
computing top k for query block 65 (queries 8320-8448)...
computing top k for query block 70 (queries 8960-9088)...
computing top k for query block 75 (queries 9600-9728)...
done
(10000, 1000)
int32

In [38]:
assert not np.any(truth_test == -999)
save_to_data_dir('convnet1m/truth_test.npy', truth_test)

In [73]:
save_to_data_dir('convnet1m/convnet_test_100k.npy', X_test[:100000, :])

Deep1B


In [10]:
# ------------------------ clean up deep1b
# data_dir = '/Volumes/MacHDD/datasets/deep1b/'
# out_dir = '/Volumes/MacSSD_OS/Users/davis/Desktop/datasets/deep1b/'

# # expected_cols = 96
# # equivalent_elements_in_first_1M = int(1e6) * (1 + expected_cols)
# arrays = []
# # arrays.append(('deep1B_queries.fvecs', 'deep_queries.npy', -1))
# # arrays.append(('deep1B_groundtruth.ivecs', 'deep_true_nn_idxs.npy', -1))
# # arrays.append(('deep10M.fvecs', 'deep_1M.npy', 1e6))
# arrays.append(('deep10M.fvecs', 'deep_10M.npy', -1))
# for in_file, out_file, limit in arrays:
#     in_path = data_dir + in_file
#     out_path = out_dir + out_file
#     X = read_yael_vecs(in_path, limit_rows=limit)
#     print "unpacking {} to {}".format(in_path, out_path)
#     print X.shape
#     np.save(out_path, X)

LabelMe


In [2]:
# ------------------------ clean up LabelMe
# >>> from scipy.io import loadmat
# >>> d = loadmat('LabelMe_gist.mat')
# >>> for k, v in d.iteritems():
# ...     if hasattr(v, 'shape'):
# ...             print k, v.shape
# ...
# gist (22019, 512)
# img (32, 32, 3, 22019)
# nmat (1000, 1000, 20)
# param (1, 1)
# seg (32, 32, 22019)
# names (1, 3597)
# DistLM (22019, 22019)
# ndxtrain (1, 20019)
# ndxtest (1, 2000)
#
# okay, no idea what most of these are even with the readme...
#
# >>> np.save('labelme_train_idxs', d['ndxtrain']) # training data idxs
# >>> np.save('labelme_test_idxs', d['ndxtest'])   # test data idxs
# >>> np.save('labelme_all_gists', d['gist'])     # actual gist descriptors

X = load_from_data_dir('labelme/labelme_all_gists.npy')
np.isfortran(X)  # True; suggests the other arrays from the .mat file are also F-ordered


Out[2]:
True
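
save_to_data_dir routes everything through np.ascontiguousarray, which is what converts these Fortran-ordered arrays to C order before saving; a quick demonstration:

In [ ]:
A = np.asfortranarray(np.arange(6).reshape(2, 3))
print np.isfortran(A)                        # True
print np.isfortran(np.ascontiguousarray(A))  # False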

In [26]:
train_idxs = load_from_data_dir('labelme/labelme_train_idxs.npy').ravel() - 1 # one-indexed
X_train = X[train_idxs, :]
# save_to_data_dir('labelme/labelme_train.npy', X_train)

test_idxs = load_from_data_dir('labelme/labelme_test_idxs.npy').ravel() - 1 # one-indexed
X_test = X[test_idxs, :]
# save_to_data_dir('labelme/labelme_test.npy', X_test)

print train_idxs.shape
print test_idxs.shape

train_set = set(list(train_idxs))
test_set = set(list(test_idxs))
assert len(train_set.intersection(test_set)) == 0
assert len(train_set) + len(test_set) == len(X)


(20019,)
(2000,)

In [3]:
d = loadmat_from_data_dir('labelme/LabelMe_gist.mat')

In [4]:
print d.keys()
print d['gist'].shape
print d['seg'].shape
print d['nmat'].shape
print d['DistLM'].shape
print (d['DistLM'] != 0).sum() / 22019  # integer division; avg ~836 nonzero dists per row

# alright, so there don't seem to be predefined queries or ground-truth
# neighbors for this dataset


['gist', 'img', 'nmat', '__header__', 'param', '__globals__', 'seg', 'names', 'DistLM', '__version__', 'ndxtrain', 'ndxtest']
(22019, 512)
(32, 32, 22019)
(1000, 1000, 20)
(22019, 22019)
836

In [47]:
# dists = utils.sq_dists_to_vectors(X, X)

In [48]:
# truth = np.argsort(dists, axis=1)[:, :1000]
# save_to_data_dir('labelme/labelme_truth.npy', truth)

In [22]:
X_train = load_from_data_dir('labelme/labelme_train.npy')
print X_train.shape
print np.isfortran(X_train) # false


(20019, 512)
False

In [23]:
X_test = load_from_data_dir('labelme/labelme_test.npy')
print X_test.shape
print np.isfortran(X_test)  # false


(2000, 512)
False

In [24]:
truth = utils.compute_true_knn(X_train, X_test)


computing top k for query block 0 (queries 0-128)...
computing top k for query block 5 (queries 640-768)...
computing top k for query block 10 (queries 1280-1408)...
computing top k for query block 15 (queries 1920-2000)...
done

In [29]:
nrows = 5
X = X_train
Q = X_test
print X[:5, :5]
print Q[:5, :5]
for i in range(nrows):
    dists = utils.dists_sq(X, Q[i, :])
    print np.argsort(dists, axis=-1)[:10]
    print truth[i, :10]


[[ 0.04996838  0.08774966  0.04675926  0.06643171  0.01966016]
 [ 0.0075171   0.0136047   0.02036818  0.01539837  0.00822605]
 [ 0.01428436  0.04946869  0.03160411  0.05185791  0.00135769]
 [ 0.00696543  0.0505463   0.09298885  0.04827282  0.00205458]
 [ 0.01164107  0.04647382  0.06512821  0.03711114  0.00478815]]
[[ 0.03001729  0.03533581  0.02367221  0.00753544  0.00722851]
 [ 0.05340568  0.08936567  0.12642182  0.07891258  0.11755066]
 [ 0.05266207  0.06291605  0.04678894  0.05438183  0.00601782]
 [ 0.03347877  0.07613105  0.14790234  0.008575    0.04197339]
 [ 0.04045155  0.0408119   0.07632685  0.10935131  0.02766959]]
[ 9259  4870 13950 14336  8944 10479  1577 14127 10323  3627]
[ 9259  4870 13950 14336  8944 10479  1577 14127 10323  3627]
[ 8889  8046 11823  4267 16831 14420 12037  2076  8762  9254]
[ 8889  8046 11823  4267 16831 14420 12037  2076  8762  9254]
[ 1143  1136  1138  9057 16789  1146  2066  1148  1649  2062]
[ 1143  1136  1138  9057 16789  1146  2066  1148  1649  2062]
[ 4439  6094  4343 18569 18608 18654  4435  1055  6451  4446]
[ 4439  6094  4343 18569 18608 18654  4435  1055  6451  4446]
[16583  6164 16613 14143 16943  7424 10617 10078 10656 13140]
[16583  6164 16613 14143 16943  7424 10617 10078 10656 13140]

In [19]:
print truth.shape
print np.isfortran(truth)
save_to_data_dir('labelme/labelme_truth.npy', truth)


(2000, 1000)
False

In [20]:
print truth[0, :20]
dists = utils.dists_sq(X_train, X_test[0])
print np.argsort(dists)[:20]


[ 9259  4870 13950 14336  8944 10479  1577 14127 10323  3627 13380 10476
 14664 13382 17015  1023 15942 10431 10440  8320]
[ 9259  4870 13950 14336  8944 10479  1577 14127 10323  3627 13380 10476
 14664 13382 17015  1023 15942 10431 10440  8320]

MNIST


In [42]:
# ------------------------ clean up mnist
import mnist
loader = mnist.MNIST(DATA_DIR + 'mnist/')
X_train, Y_train = loader.load_training()

In [43]:
X_train_2d = np.concatenate(X_train, axis=0)
X_train_2d = X_train_2d.reshape((-1, 784))
print mnist.MNIST.display(X_train[3])
print Y_train[3]


............................
............................
............................
............................
............................
...................@@.......
..................@@@.......
..................@@@.......
.................@@@........
................@@@.........
................@@..........
...............@@@..........
..............@@@...........
..............@@............
.............@@.............
............@@@.............
............@@@.............
...........@@@..............
..........@@@...............
..........@@@...............
..........@@@...............
.........@@@................
.........@@@................
.........@@@................
..........@@................
............................
............................
............................
1

In [49]:
print type(Y_train)
save_to_data_dir('mnist/X_train.npy', X_train_2d.astype(np.float32))
save_to_data_dir('mnist/Y_train.npy', Y_train)


<type 'array.array'>

In [46]:
X_test, Y_test = loader.load_testing()
X_test_2d = np.concatenate(X_test, axis=0)
X_test_2d = X_test_2d.reshape((-1, 784))

In [50]:
save_to_data_dir('mnist/X_test.npy', X_test_2d.astype(np.float32))
save_to_data_dir('mnist/Y_test.npy', Y_test)

In [52]:
print X_test_2d.shape
print np.min(X_test)
print np.max(X_test)


(10000, 784)
0
255

In [27]:
# following other papers, treat the test set as queries and the training set as the database
truth = utils.compute_true_knn(X_train_2d, X_test_2d)


computing true euclidean distances...
computing top k for query 0...
computing top k for query 1000...
computing top k for query 2000...
computing top k for query 3000...
computing top k for query 4000...
computing top k for query 5000...
computing top k for query 6000...
computing top k for query 7000...
computing top k for query 8000...
computing top k for query 9000...
done

In [28]:
# save_to_data_dir('mnist/truth_Q=train_X=test.npy', truth)
save_to_data_dir('mnist/truth_Q=test_X=train.npy', truth)

Glove


In [68]:
X = np.loadtxt(DATA_DIR + 'glove/glove.txt')
print X.shape
print np.isfortran(X) # false

Q, X_test = X[:10000], X[10000:]
save_to_data_dir('glove/glove_queries.npy', Q)
save_to_data_dir('glove/glove_test.npy', X_test)


(1193514, 100)
False

In [69]:
truth = utils.compute_true_knn(X_test, Q)
print truth.dtype
print truth.shape


computing top k for query block 0 (queries 0-128)...
computing top k for query block 5 (queries 640-768)...
computing top k for query block 10 (queries 1280-1408)...
computing top k for query block 15 (queries 1920-2048)...
computing top k for query block 20 (queries 2560-2688)...
computing top k for query block 25 (queries 3200-3328)...
computing top k for query block 30 (queries 3840-3968)...
computing top k for query block 35 (queries 4480-4608)...
computing top k for query block 40 (queries 5120-5248)...
computing top k for query block 45 (queries 5760-5888)...
computing top k for query block 50 (queries 6400-6528)...
computing top k for query block 55 (queries 7040-7168)...
computing top k for query block 60 (queries 7680-7808)...
computing top k for query block 65 (queries 8320-8448)...
computing top k for query block 70 (queries 8960-9088)...
computing top k for query block 75 (queries 9600-9728)...
done
int32
(10000, 1000)

In [70]:
save_to_data_dir('glove/truth.npy', truth)