In [1]:
import pickle
import faiss


Loading faiss.

In [2]:
def load_data(path='movies.pickle'):
    """Load the pickled movie dataset from ``path``.

    Args:
        path: Location of the pickle file to read. Defaults to
            ``'movies.pickle'`` (resolved against the current working
            directory), which is the file this notebook originally
            hard-coded.

    Returns:
        The object that was pickled into the file, unchanged.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file,
    # so only point this at pickle files that come from a trusted source.
    with open(path, 'rb') as f:
        return pickle.load(f)

# Load the dataset once and keep it in `data` for the cells below.
# Per the recorded output, this is a dict with a 'name' array (movie titles)
# and a float32 'vector' array (one row per movie).
data = load_data()
# Bare trailing expression: the notebook renders the loaded object as the cell output.
data


Out[2]:
{'name': array(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,
        'Sliding Doors (1998)', 'You So Crazy (1994)',
        'Scream of Stone (Schrei aus Stein) (1991)'], dtype=object),
 'vector': array([[-0.01780608, -0.14265831,  0.10308606, ...,  0.09659795,
         -0.17529577, -0.03061521],
        [-0.03357764,  0.16418771,  0.21801303, ...,  0.16502103,
         -0.09166156,  0.05047869],
        [-0.2761452 , -0.01991325, -0.04969981, ...,  0.0258275 ,
         -0.08328608, -0.0152858 ],
        ...,
        [ 0.05142734, -0.01683608, -0.20441587, ...,  0.00045828,
          0.14679626,  0.2462584 ],
        [ 0.04491899, -0.02819411, -0.09472758, ..., -0.02152078,
          0.16223577,  0.19897607],
        [ 0.02531924,  0.03099714,  0.06437534, ..., -0.07260127,
          0.0467432 ,  0.07893164]], dtype=float32)}

In [3]:
# class FalconIndex():
#     def __init__(self, vectors, labels):
#         self.dimention = vectors.shape[1]
#         self.vectors = vectors.astype('float32')
#         self.labels = labels


#     def build(self, number_of_partition=8, search_in_x_partitions=2, subvector_size=8):
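#         quantizer = faiss.IndexFlatL2(self.dimention)
#         # NOTE: faiss.IndexIVFPQ takes (quantizer, d, nlist, M, nbits): the 4th argument is the number of
#         # PQ sub-quantizers and the 5th is the number of bits per sub-code. The number of partitions
#         # searched at query time is set separately via `self.index.nprobe`, so the parameter names
#         # `search_in_x_partitions` and `subvector_size` do not describe what they control here.
#         self.index = faiss.IndexIVFPQ(quantizer, self.dimention, number_of_partition, search_in_x_partitions, subvector_size)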
#         self.index.train(self.vectors)
#         self.index.add(self.vectors)
        
#     def query(self, vectors, k=10):
#         distances, indices = self.index.search(vectors, k) 
#         return [self.labels[i] for i in indices[0]]
# # https://github.com/erikbern/ann-benchmarks/commit/ecc56def165234fbec830fd1eed44396a1a52c49
# https://github.com/nmslib/nmslib/tree/master/python_bindings

In [4]:
# index = FalconIndex(data["vector"], data["name"])
# index.build()

In [5]:
# movie_vector, movie_name = data['vector'][90:91], data['name'][90]
# simlar_movies_names = '\n* '.join(index.query(movie_vector))
# print(f"The most similar movies to {movie_name} are:\n* {simlar_movies_names}")


The most similar movies to Nightmare Before Christmas, The (1993) are:
* Nightmare Before Christmas, The (1993)
* Fantasia (1940)
* Brazil (1985)
* Monty Python's Life of Brian (1979)
* This Is Spinal Tap (1984)
* Hunt for Red October, The (1990)
* Sneakers (1992)
* Lion King, The (1994)
* Clockwork Orange, A (1971)
* Full Metal Jacket (1987)

In [ ]: