Nearest Neighbor Search with TensorFlow and GloVe Embeddings

This example features:

  • Using GloVe embeddings to turn IMDB words into vectors
  • Using a simple TF model to compute representations of IMDB reviews by averaging word vectors
  • Using the annoy library to build an index of word vectors
  • Searching for similar reviews via their embeddings and the annoy index

Versions:

  • Python: 2.7
  • TF: {1.14.0, 1.15.2}

In [1]:
import six

import json

import numpy as np
import pandas as pd
import tensorflow as tf
import annoy

from verta import Client
from verta.utils import ModelAPI, TFSavedModel

In [2]:
REPRESENTATION_LENGTH = 25
MAX_INPUT_LENGTH = 50

HOST = 'app.verta.ai'
DATA_DIR = ''
DATA_FILE = DATA_DIR + 'imdb_master.csv'
EMBEDDING_FILE = DATA_DIR + 'glove.twitter.27B/glove.twitter.27B.25d.txt'
SAVED_MODEL_DIR = 'saved-model'
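
# NOTE: the GloVe Twitter embeddings can be downloaded from
# https://nlp.stanford.edu/projects/glove/ ; imdb_master.csv is the
# labeled IMDB review dump commonly distributed via Kaggle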

Load the GloVe embeddings


In [3]:
# Load the GloVe embeddings into a word -> vector dictionary
embeddings_index = dict()
with open(EMBEDDING_FILE) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
# use an all-zeros vector for unknown and padding tokens
embeddings_index['UNK'] = np.zeros(REPRESENTATION_LENGTH, dtype='float32')
print('Loaded %s word vectors.' % len(embeddings_index))
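
# Optional check: every vector should have REPRESENTATION_LENGTH dimensions
assert len(embeddings_index['UNK']) == REPRESENTATION_LENGTH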

In [4]:
# pack the vectors into a matrix, mapping each word to its row index
embedding_matrix = np.zeros((len(embeddings_index), REPRESENTATION_LENGTH))
word_to_index = {}
UNK_INDEX = -1
for ctr, (word, embedding) in enumerate(embeddings_index.items()):
    if word == 'UNK':
        UNK_INDEX = ctr
    word_to_index[word] = ctr
    embedding_matrix[ctr, :] = embedding
VOCABULARY_SIZE = embedding_matrix.shape[0]
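
# Optional check: a word's matrix row should match its GloVe vector,
# and UNK_INDEX should agree with the lookup table
assert np.allclose(embedding_matrix[word_to_index['UNK']], embeddings_index['UNK'])
assert UNK_INDEX == word_to_index['UNK']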

In [5]:
# load the raw IMDB reviews and keep the first 1,000 as sample inputs
reviews = pd.read_csv(DATA_FILE)['review'].values.tolist()

input_data = reviews[:1000]
input_data[0]

Build a Simple TF Model


In [6]:
import shutil; shutil.rmtree(SAVED_MODEL_DIR, ignore_errors=True)

batch_indexes = tf.placeholder(tf.int32, shape=[MAX_INPUT_LENGTH], name="input")

# the embedding table is a non-trainable variable filled in via a
# placeholder assign, so the large matrix is not baked into the graph
tf_embedding = tf.Variable(
    tf.constant(0.0, shape=[VOCABULARY_SIZE, REPRESENTATION_LENGTH]),
    trainable=False,
    name="Embedding",
)

tf_embedding_placeholder = tf.placeholder(
    tf.float32,
    [VOCABULARY_SIZE, REPRESENTATION_LENGTH]
)
tf_embedding_init = tf_embedding.assign(tf_embedding_placeholder)

# look up each word's vector, then average the word vectors into a
# single review-level representation
embedding_list = tf.nn.embedding_lookup(
    params=tf_embedding, ids=batch_indexes)

embedding = tf.reduce_mean(embedding_list, axis=0, name="output")

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    _ = sess.run(
        tf_embedding_init,
        feed_dict={
            tf_embedding_placeholder: embedding_matrix
        }
    )

    # save the model for use later
    tf.saved_model.simple_save(
        sess,
        SAVED_MODEL_DIR,
        {'batch_indexes': batch_indexes},
        {'embedding': embedding}
    )
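
# Optional check: the placeholder and output tensors are addressable by
# name; the deployed model below feeds "input:0" and fetches "output:0"
print("%s -> %s" % (batch_indexes.name, embedding.name))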

Build the Annoy Index


In [7]:
from annoy import AnnoyIndex

model = TFSavedModel(SAVED_MODEL_DIR)
t = AnnoyIndex(REPRESENTATION_LENGTH, 'angular')  # Length of item vector that will be indexed
for i, review in enumerate(reviews):
    # map the first MAX_INPUT_LENGTH words to vocabulary indexes,
    # padding short reviews with the UNK index
    words = review.split()[:MAX_INPUT_LENGTH]
    batch_indexes = [word_to_index.get(w.lower(), UNK_INDEX) for w in words]
    batch_indexes += [UNK_INDEX] * (MAX_INPUT_LENGTH - len(batch_indexes))

    # calculate the review embedding with TF and add it to the index
    embedding = model.predict(batch_indexes=batch_indexes)['embedding']
    t.add_item(i, embedding)

t.build(10) # 10 trees
t.save('reviews.ann')
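
# Optional check: each indexed review's nearest neighbor should be itself,
# since the review is in the index at distance zero
print(t.get_nns_by_item(0, 2))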

Instantiate Client


In [8]:
client = Client(HOST)
client.set_project("TF")
client.set_experiment("SavedModel")
run = client.set_experiment_run()

Log Artifacts


In [9]:
run.log_artifact("saved_model", SAVED_MODEL_DIR)

In [10]:
with open("word_to_index.json", 'w') as f:
    json.dump(word_to_index, f)
run.log_artifact("word_to_index", "word_to_index.json")

In [11]:
run.log_artifact("reviews_index", "reviews.ann")

Build a Nearest-Neighbor Search Class


In [12]:
class TextNNSearch(object):
    def __init__(self, artifacts):
        self.session = tf.Session()
        tf.compat.v1.saved_model.load(self.session, ['serve'], artifacts['saved_model'])
        
        with open(artifacts['word_to_index'], 'r') as f:
            self.word_to_index = json.load(f)
        
        self.index = annoy.AnnoyIndex(REPRESENTATION_LENGTH, "angular")
        self.index.load(artifacts['reviews_index'])

    def predict(self, input_strs):
        predictions = []
        for input_str in input_strs:
            # convert words to vocabulary indexes, padding short inputs with UNK
            words = input_str.split()[:MAX_INPUT_LENGTH]
            unk_index = self.word_to_index['UNK']
            batch_indexes = [self.word_to_index.get(w.lower(), unk_index) for w in words]
            batch_indexes += [unk_index] * (MAX_INPUT_LENGTH - len(batch_indexes))

            # calculate the review embedding with TF
            embedding = self.session.run("output:0", {"input:0": batch_indexes})

            # find the indexes of the ten nearest neighbor reviews
            predictions.append({
                input_str: self.index.get_nns_by_vector(embedding, 10)
            })
        return predictions

Run some simple tests


In [13]:
artifacts = run.fetch_artifacts(["saved_model", "word_to_index", "reviews_index"])

In [14]:
model = TextNNSearch(artifacts)

prediction = model.predict(["omg I love this film"])
similar_reviews = [reviews[i] for i in list(prediction[0].values())[0]]
print(similar_reviews[0])

Log the Model for Deployment


In [15]:
run.log_model(
    TextNNSearch,
    custom_modules=[],
    model_api=ModelAPI(input_data, model.predict(input_data)),
    artifacts=["saved_model", "word_to_index", "reviews_index"],
)
run.log_requirements(["tensorflow", "annoy==1.15.2"])

In [16]:
run

Deploy the model and make predictions


In [17]:
# remove reviews that cannot be decoded as UTF-8
bad_reviews = []
for i, review in enumerate(input_data):
    try:
        unicode(review, 'utf-8')  # Python 2 decode check
    except UnicodeDecodeError:
        bad_reviews.append(i)
# delete from the end so the remaining indexes stay valid
for i in sorted(bad_reviews, reverse=True):
    del input_data[i]

In [18]:
from verta.deployment import DeployedModel

predictions = DeployedModel(HOST, run.id).predict(input_data[:1000], compress=True)

predictions[0]