In [21]:
# NOTE: most of the commented-out code was executed in a terminal and/or
# datasets.py and subsequently pasted in here; consequently, it probably
# won't work out of the box.
import os
import numpy as np
from scipy.io import loadmat
%load_ext autoreload
%aimport utils
%autoreload 1 # 1 = autoreload only modules imported with %aimport
# %autoreload 2
# Short aliases used throughout the notebook.
join = os.path.join
contig = np.ascontiguousarray

# Root directory that every relative dataset path below is resolved against.
DATA_DIR = os.path.expanduser("~/Desktop/datasets/nn-search/")
def save_to_data_dir(relative_path, X):
    """Save ``X`` with ``np.save`` at ``relative_path`` below ``DATA_DIR``.

    The array is passed through ``np.ascontiguousarray`` before being written.
    """
    destination = join(DATA_DIR, relative_path)
    np.save(destination, np.ascontiguousarray(X))
def load_from_data_dir(relative_path):
    """Return the result of ``np.load`` on ``relative_path`` below ``DATA_DIR``."""
    source = join(DATA_DIR, relative_path)
    return np.load(source)
def loadmat_from_data_dir(relative_path):
    """Return the result of ``scipy.io.loadmat`` on ``relative_path`` below ``DATA_DIR``."""
    source = join(DATA_DIR, relative_path)
    return loadmat(source)
def read_yael_vecs(path, c_contiguous=True, limit_rows=-1, dtype=None):
    """Read an fvecs / ivecs / bvecs style file into a 2D array.

    The file is parsed as a sequence of equally sized records, each made of
    a 4-byte int32 vector length followed by that many elements of ``dtype``.

    Args:
        path: File to read. When ``dtype`` is None the element type is
            inferred from the path: 'fvecs' -> float32, 'ivecs' -> int32,
            'bvecs' -> uint8.
        c_contiguous: If True, return a contiguous copy; otherwise return a
            view into the raw buffer that skips each row's length header.
        limit_rows: If > 0, read at most this many rows; otherwise read the
            whole file.
        dtype: Element type. Its itemsize must be 1, 2 or 4 bytes.

    Returns:
        Array with one row per record and ``dim`` columns.

    Raises:
        ValueError: If ``dtype`` is None and cannot be inferred from ``path``
            (numpy also raises ValueError when the data does not divide
            evenly into rows).
        IOError: If the file holds no complete length header, or if the rows
            do not all declare the same vector length.
    """
    # Only the first int32 is needed to learn the vector length.
    header = np.fromfile(path, dtype=np.int32, count=1)
    if header.size == 0:
        raise IOError("No vectors found in " + path)
    dim = header[0]
    print("vector length = {}".format(dim))

    if dtype is None:
        if 'fvecs' in path:
            dtype = np.float32
        elif 'ivecs' in path:
            dtype = np.int32
        elif 'bvecs' in path:
            dtype = np.uint8
        else:
            raise ValueError("couldn't infer dtype from path {}".format(path))

    itemsize = np.dtype(dtype).itemsize
    assert dim > 0
    assert itemsize in (1, 2, 4)

    # Number of dtype-sized columns occupied by the 4-byte length header.
    cols_for_dim = 4 // itemsize
    row_size_bytes = 4 + dim * itemsize
    row_size_elems = row_size_bytes // itemsize

    limit = int(limit_rows) * row_size_elems if limit_rows > 0 else -1
    fv = np.fromfile(path, dtype=dtype, count=limit)
    fv = fv.reshape((-1, row_size_elems))

    # Reinterpret only the header columns as int32. Viewing the whole row
    # fails for 1- and 2-byte dtypes whenever the row's byte length is not a
    # multiple of 4 (e.g. a bvecs file with dim == 5).
    row_dims = np.ascontiguousarray(fv[:, :cols_for_dim]).view(np.int32)
    if not np.all(row_dims == dim):
        raise IOError("Non-uniform vector sizes in " + path)

    fv = fv[:, cols_for_dim:]
    if c_contiguous:
        fv = fv.copy()
    return fv
In [6]:
# # ------------------------ clean up gist (now broken)
# path = Paths.DATA_DIR + '/gist/gist_base.fvecs'
# path = Paths.DATA_DIR + '/gist/gist_learn.fvecs'
# path = Paths.DATA_DIR + '/gist/gist_queries.fvecs'
# path = Paths.DATA_DIR + '/gist/gist_groundtruth.ivecs'
# out_path = Paths.GIST_100
# out_path = Paths.GIST_200
# out_path = Paths.GIST_TRAIN
# out_path = Paths.GIST_QUERIES
# out_path = Paths.GIST_TRUTH
# X = read_yael_vecs(path)[:100000]
# X = read_yael_vecs(path)[:200000]
# X = read_yael_vecs(path)
# X = read_yael_vecs(path, dtype=np.int32)
# print X[:2]
# print X.shape
# np.save(out_path, X)
# truth_dir = data_dir + 'gnd/'
# # truth_idxs_files = ['idx_1M', 'idx_10M', 'idx_100M']
# truth_idxs_files = ['idx_1000M']
# for f in truth_idxs_files:
# path = truth_dir + f + '.ivecs'
# out_path = out_dir + f + '.npy'
# print "unpacking {} to {}".format(path, out_path)
# X = read_yael_vecs(path)
# print X.shape
# np.save(out_path, X)
In [7]:
# ------------------------ clean up sift1m
# data_dir = '/Volumes/MacHDD/datasets/sift1m/'
# out_dir = '/Volumes/MacSSD_OS/Users/davis/Desktop/datasets/sift1m/'
# for fname in os.listdir(data_dir):
# in_path = data_dir + fname
# out_path = out_dir + fname.split('.')[0] + '.npy'
# print "unpacking {} to {}".format(in_path, out_path)
# X = read_yael_vecs(in_path)
# print X.shape
# np.save(out_path, X)
In [54]:
# # ------------------------ clean up Deep1M
# data_dir = os.path.expanduser('~/Desktop/datasets/nn-search/deep1M-raw/')
# out_dir = os.path.expanduser('~/Desktop/datasets/nn-search/deep1M/')
# print "in dir, out dir:", data_dir, out_dir
# for fname in os.listdir(data_dir):
# in_path = data_dir + fname
# out_path = out_dir + fname.split('.')[0] + '.npy'
# print "unpacking {} to {}".format(in_path, out_path)
# X = read_yael_vecs(in_path)
# print X.shape
# np.save(out_path, X)
# Deep1M training vectors.
X_train = load_from_data_dir('deep1m/deep1M_learn.npy')
print(X_train.shape)
print(np.isfortran(X_train))  # false

# Deep1M query vectors.
Q = load_from_data_dir('deep1m/deep1M_queries.npy')
print(Q.shape)
print(np.isfortran(Q))  # false
In [56]:
# Neighbors of each query within the Deep1M training set.
truth_train = utils.compute_true_knn(X_train, Q)
print(truth_train.shape)
print(truth_train.dtype)
In [57]:
# Persist the training-set neighbors, cast to int32.
save_to_data_dir(
    'deep1m/deep1M_truth_train.npy',
    truth_train.astype(np.int32),
)
In [74]:
# Deep1M base vectors, used here as the test set.
X_test = load_from_data_dir('deep1m/deep1M_base.npy')
print(X_test.shape)
print(np.isfortran(X_test))  # false
In [63]:
# Neighbors of each query within the Deep1M test set.
truth_test = utils.compute_true_knn(X_test, Q)
print(truth_test.shape)
print(truth_test.dtype)
In [65]:
# Persist the test-set neighbors, cast to int32.
save_to_data_dir(
    'deep1m/deep1M_truth_test.npy',
    truth_test.astype(np.int32),
)
In [75]:
# Keep a subset made of the first 100000 test rows.
save_to_data_dir(
    'deep1m/deep1m_test_100k.npy',
    X_test[:100000, :],
)
In [24]:
# # ------------------------ clean up Convnet1M
# import numpy as np
# from scipy.io import loadmat
# d = loadmat('features_m_128.mat')
# contig = np.ascontiguousarray
# savedir = '../convnet1m/'
# np.save(savedir + 'convnet_train.npy', contig(d['feats_m_128_train']))
# np.save(savedir + 'convnet_test.npy', contig(d['feats_m_128_test']))
# np.save(savedir + 'convnet_base.npy', contig(d['feats_m_128_base']))
# Convnet1M training features (this rebinds X_train).
X_train = load_from_data_dir('convnet1m/convnet_train.npy')
print(np.isfortran(X_train))  # false
print(X_train.shape)
# if np.isfortran(X_train):
# save_to_data_dir('convnet1m/convnet_train.npy', X_train)
In [6]:
# Convnet1M query features (this rebinds Q).
Q = load_from_data_dir('convnet1m/convnet_queries.npy')
print(Q.shape)
print(np.isfortran(Q))
# save_to_data_dir('convnet1m/convnet_queries.npy', Q)
In [11]:
# print Q[:20, :20] yep, looks like relu activations
# dists = utils.sq_dists_to_vectors(Q, X_train)
# print dists.shape
In [71]:
# Distances between the queries and the training vectors. The only
# assignment of `dists` used to live in a commented-out line of an earlier
# cell, so running the notebook from a fresh kernel failed with a NameError
# in the loop below.
# NOTE(review): presumably one row per query, since the loop indexes
# dists[i, :] for each query i -- confirm against utils.sq_dists_to_vectors.
dists = utils.sq_dists_to_vectors(Q, X_train)

nqueries = Q.shape[0]
k = 1000
# -999 marks rows that have not been filled in yet.
truth_train = np.full((nqueries, k), -999, dtype=np.int32)
print(nqueries)
for i in range(nqueries):
    if i % 1000 == 0:
        print("computing top k for query {}...".format(i))
    truth_train[i, :] = utils.top_k_idxs(dists[i, :], k)
print("done")
In [74]:
# Every row must have been overwritten; -999 is the unfilled marker.
assert not np.any(truth_train == -999)
save_to_data_dir(
    'convnet1m/truth_train.npy',
    truth_train.astype(np.int32),
)
In [71]:
# Convnet1M test features (this rebinds X_test).
X_test = load_from_data_dir('convnet1m/convnet_test.npy')
print(X_test.shape)
print(np.isfortran(X_test))
# if np.isfortran(X_test):
# save_to_data_dir('convnet1m/convnet_test.npy', X_test)
In [37]:
# # truth_test = utils.compute_true_knn(X_test[:1000], Q[:130], block_sz=128)
# truth_test = utils.compute_true_knn(X_test, Q)
# print truth_test.shape
# print truth_test.dtype
In [38]:
# Recompute the neighbors from the Convnet1M arrays loaded above. Without
# this, a top-to-bottom run reaches this cell with `truth_test` still bound
# to the Deep1M result and would save that under the convnet path.
truth_test = utils.compute_true_knn(X_test, Q)
assert not np.any(truth_test == -999)
save_to_data_dir('convnet1m/truth_test.npy', truth_test)
In [73]:
# Keep a subset made of the first 100000 test rows.
save_to_data_dir(
    'convnet1m/convnet_test_100k.npy',
    X_test[:100000, :],
)
In [10]:
# ------------------------ clean up deep1b
# data_dir = '/Volumes/MacHDD/datasets/deep1b/'
# out_dir = '/Volumes/MacSSD_OS/Users/davis/Desktop/datasets/deep1b/'
# # expected_cols = 96
# # equivalent_elements_in_first_1M = int(1e6) * (1 + expected_cols)
# arrays = []
# # arrays.append(('deep1B_queries.fvecs', 'deep_queries.npy', -1))
# # arrays.append(('deep1B_groundtruth.ivecs', 'deep_true_nn_idxs.npy', -1))
# # arrays.append(('deep10M.fvecs', 'deep_1M.npy', 1e6))
# arrays.append(('deep10M.fvecs', 'deep_10M.npy', -1))
# for in_file, out_file, limit in arrays:
# in_path = data_dir + in_file
# out_path = out_dir + out_file
# X = read_yael_vecs(in_path, limit_rows=limit)
# print "unpacking {} to {}".format(in_path, out_path)
# print X.shape
# np.save(out_path, X)
In [2]:
# ------------------------ clean up LabelMe
# >>> from scipy.io import loadmat
# >>> d = loadmat('LabelMe_gist.mat')
# >>> for k, v in d.iteritems():
# ... try:
# ... print k, v.shape
# ... except:
# ... pass
# ...
# gist (22019, 512)
# img (32, 32, 3, 22019)
# nmat (1000, 1000, 20)
# __header__ param (1, 1)
# __globals__ seg (32, 32, 22019)
# names (1, 3597)
# DistLM (22019, 22019)
# __version__ ndxtrain (1, 20019)
# ndxtest (1, 2000)
#
# okay, no idea what most of these are even with the readme...
#
# >>> np.save('labelme_train_idxs', d['ndxtrain']) # training data idxs
# >>> np.save('labelme_test_idxs', d['ndxtest']) # test data idxs
# >>> np.save('labelme_all_gists', d['gist']) # actual gist descriptors
# All LabelMe gist descriptors, as saved from the .mat file.
X = load_from_data_dir('labelme/labelme_all_gists.npy')
# Left as a bare expression so the notebook displays the result.
np.isfortran(X)  # this is true; suggests lots of other stuff also F order
Out[2]:
In [26]:
# The stored indices are one-indexed, hence the "- 1".
train_idxs = load_from_data_dir('labelme/labelme_train_idxs.npy').ravel() - 1
X_train = X[train_idxs, :]
# save_to_data_dir('labelme/labelme_train.npy', X_train)

test_idxs = load_from_data_dir('labelme/labelme_test_idxs.npy').ravel() - 1
X_test = X[test_idxs, :]
# save_to_data_dir('labelme/labelme_test.npy', X_test)

print(train_idxs.shape)
print(test_idxs.shape)

# The two index sets must not overlap and must together account for every row.
train_set = set(train_idxs)
test_set = set(test_idxs)
assert train_set.isdisjoint(test_set)
assert len(X) == len(train_set) + len(test_set)
In [3]:
# Raw LabelMe .mat contents, keyed by variable name.
d = loadmat_from_data_dir(
    'labelme/LabelMe_gist.mat'
)
In [4]:
print(d.keys())
for key in ('gist', 'seg', 'nmat', 'DistLM'):
    print(d[key].shape)
# Count of nonzero DistLM entries divided by 22019. `//` spells out the
# floor division that `/` performs on integers under Python 2.
print((d['DistLM'] != 0).sum() // 22019)
# alright, ya, there don't seem to be predefined queries or true neighbors for them
In [47]:
# dists = utils.sq_dists_to_vectors(X, X)
In [48]:
# truth = np.argsort(dists, axis=1)[:1000]
# save_to_data_dir('labelme/labelme_truth.npy', truth)
In [22]:
# LabelMe training descriptors as saved to disk.
X_train = load_from_data_dir('labelme/labelme_train.npy')
print(X_train.shape)
print(np.isfortran(X_train))  # false
In [23]:
# LabelMe test descriptors as saved to disk.
X_test = load_from_data_dir('labelme/labelme_test.npy')
print(X_test.shape)
print(np.isfortran(X_test))  # false
In [24]:
# Test rows act as queries against the training rows.
truth = utils.compute_true_knn(
    X_train,
    X_test,
)
In [29]:
# Compare brute-force neighbor orderings with the stored truth for the
# first few queries.
nrows = 5
X, Q = X_train, X_test
print(X[:5, :5])
print(Q[:5, :5])
for i in range(nrows):
    dists = utils.dists_sq(X, Q[i, :])
    print(np.argsort(dists, axis=-1)[:10])
    print(truth[i, :10])
In [19]:
# Report shape and memory order, then persist the neighbor indices.
print(truth.shape)
print(np.isfortran(truth))
save_to_data_dir(
    'labelme/labelme_truth.npy',
    truth,
)
In [20]:
# Stored neighbors of the first query next to a brute-force ordering.
print(truth[0, :20])
dists = utils.dists_sq(X_train, X_test[0])
print(np.argsort(dists)[:20])
In [42]:
# ------------------------ clean up mnist
import mnist

# Loader rooted at the mnist/ folder inside DATA_DIR.
mnist_dir = DATA_DIR + 'mnist/'
loader = mnist.MNIST(mnist_dir)
X_train, Y_train = loader.load_training()
In [43]:
# Concatenate the loaded images and reshape them into rows of 784 values.
X_train_2d = np.concatenate(X_train, axis=0)
X_train_2d = X_train_2d.reshape((-1, 784))

# Spot-check one example: render an image and print the label of that same
# image. Previously the image index (3) and the label index (0) differed.
sample_idx = 3
print(mnist.MNIST.display(X_train[sample_idx]))
print(Y_train[sample_idx])
In [49]:
print(type(Y_train))
# Images are stored as float32; labels are stored as loaded.
save_to_data_dir(
    'mnist/X_train.npy',
    X_train_2d.astype(np.float32),
)
save_to_data_dir('mnist/Y_train.npy', Y_train)
In [19]:
In [46]:
X_test, Y_test = loader.load_testing()
# Concatenate the loaded images and reshape them into rows of 784 values.
X_test_2d = np.concatenate(X_test, axis=0).reshape((-1, 784))
In [50]:
# Images are stored as float32; labels are stored as loaded.
save_to_data_dir(
    'mnist/X_test.npy',
    X_test_2d.astype(np.float32),
)
save_to_data_dir('mnist/Y_test.npy', Y_test)
In [52]:
# Shape of the flattened test array, then the min and max of the test data.
print(X_test_2d.shape)
for reduce_fn in (np.min, np.max):
    print(reduce_fn(X_test))
In [27]:
# like other papers, treat test set as queries, train as database
truth = utils.compute_true_knn(
    X_train_2d,
    X_test_2d,
)
In [28]:
# save_to_data_dir('mnist/truth_Q=train_X=test.npy', truth)
# File name records the roles: queries = test set, database = train set.
save_to_data_dir(
    'mnist/truth_Q=test_X=train.npy',
    truth,
)
In [68]:
# Parse the GloVe text file into one array.
glove_path = DATA_DIR + 'glove/glove.txt'
X = np.loadtxt(glove_path)
print(X.shape)
print(np.isfortran(X))  # false

# The first 10000 rows become the queries; the remaining rows the test set.
Q = X[:10000]
X_test = X[10000:]
save_to_data_dir('glove/glove_queries.npy', Q)
save_to_data_dir('glove/glove_test.npy', X_test)
# Q = load_from_data_dir('deep1m/deep1M_queries.npy')
# print Q.shape
# print np.isfortran(Q) # false
In [69]:
# Neighbors of each GloVe query within the GloVe test set.
truth = utils.compute_true_knn(X_test, Q)
print(truth.dtype)
print(truth.shape)
In [70]:
# Persist the GloVe neighbor indices.
save_to_data_dir(
    'glove/truth.npy',
    truth,
)