In [1]:

    
import pickle
import faiss









    



Loading faiss.



In [3]:

    
def load_data():
    with open('movies.pickle', 'rb') as f:
        data = pickle.load(f)
    return data

data = load_data()
data









    Out[3]:





{'name': array(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,
        'Sliding Doors (1998)', 'You So Crazy (1994)',
        'Scream of Stone (Schrei aus Stein) (1991)'], dtype=object),
 'vector': array([[-0.01780608, -0.14265831,  0.10308606, ...,  0.09659795,
         -0.17529577, -0.03061521],
        [-0.03357764,  0.16418771,  0.21801303, ...,  0.16502103,
         -0.09166156,  0.05047869],
        [-0.2761452 , -0.01991325, -0.04969981, ...,  0.0258275 ,
         -0.08328608, -0.0152858 ],
        ...,
        [ 0.05142734, -0.01683608, -0.20441587, ...,  0.00045828,
          0.14679626,  0.2462584 ],
        [ 0.04491899, -0.02819411, -0.09472758, ..., -0.02152078,
          0.16223577,  0.19897607],
        [ 0.02531924,  0.03099714,  0.06437534, ..., -0.07260127,
          0.0467432 ,  0.07893164]], dtype=float32)}

IVPQ



In [5]:

    
class IVPQIndex():
    def __init__(self, vectors, labels):
        self.dimention = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels


    def build(self, number_of_partition=8, search_in_x_partitions=2, subvector_size=8):
        quantizer = faiss.IndexFlatL2(self.dimention)
        self.index = faiss.IndexIVFPQ(quantizer, 
                                      self.dimention, 
                                      number_of_partition, 
                                      search_in_x_partitions, 
                                      subvector_size)
        self.index.train(self.vectors)
        self.index.add(self.vectors)
        
    def query(self, vectors, k=10):
        distances, indices = self.index.search(vectors, k) 
        return [self.labels[i] for i in indices[0]]



In [13]:

    
index = IVPQIndex(data["vector"], data["name"])
index.build()



In [14]:

    
movie_index = 90
movie_vector = data['vector'][movie_index:movie_index+1]
print(f"The most simillar movies to {data['name'][movie_index]} are:")
index.query(movie_vector)









    



The most simillar movies to Nightmare Before Christmas, The (1993) are:






    Out[14]:





['Nightmare Before Christmas, The (1993)',
 'Fantasia (1940)',
 'Brazil (1985)',
 "Monty Python's Life of Brian (1979)",
 'This Is Spinal Tap (1984)',
 'Hunt for Red October, The (1990)',
 'Sneakers (1992)',
 'Lion King, The (1994)',
 'Clockwork Orange, A (1971)',
 'Full Metal Jacket (1987)']