Nearest Neighbor Search with TensorFlow and GloVe Embeddings

This example features:

  • Using GloVe embeddings to turn IMDB words into vectors
  • Using a simple TF model to compute representations of IMDB reviews by averaging word vectors
  • Using the annoy library to build an index of word vectors
  • Searching for similar reviews via their embeddings and the annoy index

Versions:

  • Python: 2.7
  • TF: {1.14.0, 1.15.2}

In [1]:
import six

import json

import numpy as np
import pandas as pd
import tensorflow as tf
import annoy

from verta import Client
from verta.utils import ModelAPI, TFSavedModel

In [2]:
REPRESENTATION_LENGTH = 25
MAX_INPUT_LENGTH = 50

HOST = 'app.verta.ai'
DATA_DIR = ''
DATA_FILE = DATA_DIR + 'imdb_master.csv'
EMBEDDING_FILE = DATA_DIR + 'glove.twitter.27B/glove.twitter.27B.25d.txt'
SAVED_MODEL_DIR = 'saved-model'
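
# NOTE: the GloVe Twitter embeddings can be downloaded from
# https://nlp.stanford.edu/projects/glove/ ; imdb_master.csv is the
# labeled IMDB review dump commonly distributed via Kaggle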

Load the GloVe embeddings


In [3]:
# Load the GloVe embeddings into a word -> vector dictionary
embeddings_index = dict()
with open(EMBEDDING_FILE) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
# use an all-zeros vector for unknown and padding tokens
embeddings_index['UNK'] = np.zeros(REPRESENTATION_LENGTH, dtype='float32')
print('Loaded %s word vectors.' % len(embeddings_index))
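
# Optional check: every vector should have REPRESENTATION_LENGTH dimensions
assert len(embeddings_index['UNK']) == REPRESENTATION_LENGTH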

In [4]:
# pack the vectors into a matrix, mapping each word to its row index
embedding_matrix = np.zeros((len(embeddings_index), REPRESENTATION_LENGTH))
word_to_index = {}
UNK_INDEX = -1
for ctr, (word, embedding) in enumerate(embeddings_index.items()):
    if word == 'UNK':
        UNK_INDEX = ctr
    word_to_index[word] = ctr
    embedding_matrix[ctr, :] = embedding
VOCABULARY_SIZE = embedding_matrix.shape[0]
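
# Optional check: a word's matrix row should match its GloVe vector,
# and UNK_INDEX should agree with the lookup table
assert np.allclose(embedding_matrix[word_to_index['UNK']], embeddings_index['UNK'])
assert UNK_INDEX == word_to_index['UNK']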

In [5]:
# load the raw IMDB reviews and keep the first 1,000 as sample inputs
reviews = pd.read_csv(DATA_FILE)['review'].values.tolist()

input_data = reviews[:1000]
input_data[0]

Build a Simple TF Model


In [6]:
import shutil; shutil.rmtree(SAVED_MODEL_DIR, ignore_errors=True)

batch_indexes = tf.placeholder(tf.int32, shape=[MAX_INPUT_LENGTH], name="input")

# the embedding table is a non-trainable variable filled in via a
# placeholder assign, so the large matrix is not baked into the graph
tf_embedding = tf.Variable(
    tf.constant(0.0, shape=[VOCABULARY_SIZE, REPRESENTATION_LENGTH]),
    trainable=False,
    name="Embedding",
)

tf_embedding_placeholder = tf.placeholder(
    tf.float32,
    [VOCABULARY_SIZE, REPRESENTATION_LENGTH]
)
tf_embedding_init = tf_embedding.assign(tf_embedding_placeholder)

# look up each word's vector, then average the word vectors into a
# single review-level representation
embedding_list = tf.nn.embedding_lookup(
    params=tf_embedding, ids=batch_indexes)

embedding = tf.reduce_mean(embedding_list, axis=0, name="output")

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    _ = sess.run(
        tf_embedding_init,
        feed_dict={
            tf_embedding_placeholder: embedding_matrix
        }
    )

    # save the model for use later
    tf.saved_model.simple_save(
        sess,
        SAVED_MODEL_DIR,
        {'batch_indexes': batch_indexes},
        {'embedding': embedding}
    )
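
# Optional check: the placeholder and output tensors are addressable by
# name; the deployed model below feeds "input:0" and fetches "output:0"
print("%s -> %s" % (batch_indexes.name, embedding.name))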

Build the Annoy Index


In [7]:
from annoy import AnnoyIndex

model = TFSavedModel(SAVED_MODEL_DIR)
t = AnnoyIndex(REPRESENTATION_LENGTH, 'angular')  # Length of item vector that will be indexed
for i, review in enumerate(reviews):
    # map the first MAX_INPUT_LENGTH words to vocabulary indexes,
    # padding short reviews with the UNK index
    words = review.split()[:MAX_INPUT_LENGTH]
    batch_indexes = [word_to_index.get(w.lower(), UNK_INDEX) for w in words]
    batch_indexes += [UNK_INDEX] * (MAX_INPUT_LENGTH - len(batch_indexes))

    # calculate the review embedding with TF and add it to the index
    embedding = model.predict(batch_indexes=batch_indexes)['embedding']
    t.add_item(i, embedding)

t.build(10) # 10 trees
t.save('reviews.ann')
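
# Optional check: each indexed review's nearest neighbor should be itself,
# since the review is in the index at distance zero
print(t.get_nns_by_item(0, 2))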

Instantiate Client


In [8]:
client = Client(HOST)
client.set_project("TF")
client.set_experiment("SavedModel")
run = client.set_experiment_run()

Log Artifacts


In [9]:
run.log_artifact("saved_model", SAVED_MODEL_DIR)

In [10]:
with open("word_to_index.json", 'w') as f:
    json.dump(word_to_index, f)
run.log_artifact("word_to_index", "word_to_index.json")

In [11]:
run.log_artifact("reviews_index", "reviews.ann")

Build a Nearest-Neighbor Search Class


In [12]:
class TextNNSearch(object):
    def __init__(self, artifacts):
        self.session = tf.Session()
        tf.compat.v1.saved_model.load(self.session, ['serve'], artifacts['saved_model'])
        
        with open(artifacts['word_to_index'], 'r') as f:
            self.word_to_index = json.load(f)
        
        self.index = annoy.AnnoyIndex(REPRESENTATION_LENGTH, "angular")
        self.index.load(artifacts['reviews_index'])

    def predict(self, input_strs):
        predictions = []
        for input_str in input_strs:
            # convert words to vocabulary indexes, padding short inputs with UNK
            words = input_str.split()[:MAX_INPUT_LENGTH]
            unk_index = self.word_to_index['UNK']
            batch_indexes = [self.word_to_index.get(w.lower(), unk_index) for w in words]
            batch_indexes += [unk_index] * (MAX_INPUT_LENGTH - len(batch_indexes))

            # calculate the review embedding with TF
            embedding = self.session.run("output:0", {"input:0": batch_indexes})

            # find the indexes of the ten nearest neighbor reviews
            predictions.append({
                input_str: self.index.get_nns_by_vector(embedding, 10)
            })
        return predictions

Run some simple tests


In [13]:
artifacts = run.fetch_artifacts(["saved_model", "word_to_index", "reviews_index"])

In [14]:
model = TextNNSearch(artifacts)

prediction = model.predict(["omg I love this film"])
similar_reviews = [reviews[i] for i in list(prediction[0].values())[0]]
print(similar_reviews[0])

Log the Model for Deployment


In [15]:
run.log_model(
    TextNNSearch,
    custom_modules=[],
    model_api=ModelAPI(input_data, model.predict(input_data)),
    artifacts=["saved_model", "word_to_index", "reviews_index"],
)
run.log_requirements(["tensorflow", "annoy==1.15.2"])

In [16]:
run

Deploy the model and make predictions


In [17]:
# remove reviews that cannot be decoded as UTF-8
bad_reviews = []
for i, review in enumerate(input_data):
    try:
        unicode(review, 'utf-8')  # Python 2 decode check
    except UnicodeDecodeError:
        bad_reviews.append(i)
# delete from the end so the remaining indexes stay valid
for i in sorted(bad_reviews, reverse=True):
    del input_data[i]

In [18]:
from verta.deployment import DeployedModel

predictions = DeployedModel(HOST, run.id).predict(input_data[:1000], compress=True)

predictions[0]