In [1]:
import pickle
import faiss


Loading faiss.

In [3]:
def load_data():
    with open('movies.pickle', 'rb') as f:
        data = pickle.load(f)
    return data

data = load_data()
data


Out[3]:
{'name': array(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,
        'Sliding Doors (1998)', 'You So Crazy (1994)',
        'Scream of Stone (Schrei aus Stein) (1991)'], dtype=object),
 'vector': array([[-0.01780608, -0.14265831,  0.10308606, ...,  0.09659795,
         -0.17529577, -0.03061521],
        [-0.03357764,  0.16418771,  0.21801303, ...,  0.16502103,
         -0.09166156,  0.05047869],
        [-0.2761452 , -0.01991325, -0.04969981, ...,  0.0258275 ,
         -0.08328608, -0.0152858 ],
        ...,
        [ 0.05142734, -0.01683608, -0.20441587, ...,  0.00045828,
          0.14679626,  0.2462584 ],
        [ 0.04491899, -0.02819411, -0.09472758, ..., -0.02152078,
          0.16223577,  0.19897607],
        [ 0.02531924,  0.03099714,  0.06437534, ..., -0.07260127,
          0.0467432 ,  0.07893164]], dtype=float32)}

IVPQ


In [5]:
class IVPQIndex():
    def __init__(self, vectors, labels):
        self.dimention = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels


    def build(self, number_of_partition=8, search_in_x_partitions=2, subvector_size=8):
        quantizer = faiss.IndexFlatL2(self.dimention)
        self.index = faiss.IndexIVFPQ(quantizer, 
                                      self.dimention, 
                                      number_of_partition, 
                                      search_in_x_partitions, 
                                      subvector_size)
        self.index.train(self.vectors)
        self.index.add(self.vectors)
        
    def query(self, vectors, k=10):
        distances, indices = self.index.search(vectors, k) 
        return [self.labels[i] for i in indices[0]]

In [13]:
index = IVPQIndex(data["vector"], data["name"])
index.build()

In [14]:
movie_index = 90
movie_vector = data['vector'][movie_index:movie_index+1]
print(f"The most simillar movies to {data['name'][movie_index]} are:")
index.query(movie_vector)


The most simillar movies to Nightmare Before Christmas, The (1993) are:
Out[14]:
['Nightmare Before Christmas, The (1993)',
 'Fantasia (1940)',
 'Brazil (1985)',
 "Monty Python's Life of Brian (1979)",
 'This Is Spinal Tap (1984)',
 'Hunt for Red October, The (1990)',
 'Sneakers (1992)',
 'Lion King, The (1994)',
 'Clockwork Orange, A (1971)',
 'Full Metal Jacket (1987)']