This example builds a TensorFlow SavedModel that embeds movie reviews using pretrained GloVe vectors, indexes the review embeddings with Annoy for nearest-neighbor search, and logs, tests, and deploys the model with Verta.
Versions: TensorFlow 1.x APIs (`tf.placeholder`, `tf.Session`) are used; pinned runtime requirements are logged below (e.g. `annoy==1.15.2`).
In [1]:
import io
import json

import six
import annoy
import numpy as np
import pandas as pd
import tensorflow as tf

from verta import Client
from verta.utils import ModelAPI, TFSavedModel
In [2]:
# Dimensionality of each GloVe word vector (the "25d" twitter embeddings).
REPRESENTATION_LENGTH = 25
# Reviews are truncated/padded to this many tokens before embedding.
MAX_INPUT_LENGTH = 50
# Verta backend host used for experiment tracking and deployment.
HOST = 'app.verta.ai'
# Input data locations; DATA_DIR is empty, so paths are relative to the notebook.
DATA_DIR = ''
DATA_FILE = DATA_DIR + 'imdb_master.csv'
EMBEDDING_FILE = DATA_DIR + 'glove.twitter.27B/glove.twitter.27B.25d.txt'
# Directory where the TF SavedModel is exported (created in a later cell).
SAVED_MODEL_DIR = 'saved-model'
In [3]:
# Load the pretrained GloVe embeddings into a word -> vector dict.
# GloVe distributions are UTF-8 text; pass the encoding explicitly via
# io.open (works on both Python 2 and 3) so the read does not depend on
# the platform's default locale encoding.
embeddings_index = dict()
with io.open(EMBEDDING_FILE, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
# Sentinel vector used for out-of-vocabulary words and padding.
embeddings_index['UNK'] = [0.0] * REPRESENTATION_LENGTH
print('Loaded %s word vectors.' % len(embeddings_index))
In [4]:
# Assign every vocabulary word a row in the dense embedding matrix and
# record the word -> row mapping for lookup at inference time.
embedding_matrix = np.zeros((len(embeddings_index), REPRESENTATION_LENGTH))
word_to_index = {}
UNK_INDEX = -1  # row of the 'UNK' sentinel; set by the loop below
for row, (word, vector) in enumerate(embeddings_index.items()):
    if word == 'UNK':
        UNK_INDEX = row
    word_to_index[word] = row
    embedding_matrix[row, :] = vector
VOCABULARY_SIZE = embedding_matrix.shape[0]
In [5]:
# Read the raw review text and keep the first 1000 for experimentation.
reviews_df = pd.read_csv(DATA_FILE)
reviews = list(reviews_df['review'].values)
input_data = reviews[:1000]
# Peek at one example review (rendered as the cell output).
input_data[0]
In [6]:
# Rebuild the export directory from scratch; simple_save fails if it exists.
import shutil; shutil.rmtree(SAVED_MODEL_DIR, ignore_errors=True)

# Graph input: one review as a fixed-length vector of vocabulary indexes.
# Use MAX_INPUT_LENGTH instead of a hard-coded 50 so the placeholder stays
# in sync with the padding logic used elsewhere in this notebook.
batch_indexes = tf.placeholder(tf.int32, shape=[MAX_INPUT_LENGTH], name="input")

# Embedding table held as a non-trainable variable; its value is fed in once
# via a placeholder + assign so the large matrix is not baked into the graph
# definition as a constant.
tf_embedding = tf.Variable(
    tf.constant(0.0, shape=[VOCABULARY_SIZE, REPRESENTATION_LENGTH]),
    trainable=False,
    name="Embedding",
)
tf_embedding_placeholder = tf.placeholder(
    tf.float32,
    [VOCABULARY_SIZE, REPRESENTATION_LENGTH]
)
tf_embedding_init = tf_embedding.assign(tf_embedding_placeholder)

# Look up the embedding vector for each input index.
embedding_list = tf.nn.embedding_lookup(
    params=tf_embedding, ids=batch_indexes)
# NOTE(review): tf.concat on a single tensor appears to be a pass-through
# here; kept as-is to preserve the original graph structure.
concatenated_embedding = tf.concat(embedding_list, -1)
# Average the word vectors into one review embedding, exposed as "output".
embedding = tf.reduce_mean(concatenated_embedding, axis=0, name="output")

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Load the pretrained matrix into the Embedding variable.
    _ = sess.run(
        tf_embedding_init,
        feed_dict={
            tf_embedding_placeholder: embedding_matrix
        }
    )
    # save the model for use later
    tf.saved_model.simple_save(
        sess,
        SAVED_MODEL_DIR,
        {'batch_indexes': batch_indexes},
        {'embedding': embedding}
    )
In [7]:
from annoy import AnnoyIndex

# Embed every review with the exported SavedModel and add it to an Annoy
# index so similar reviews can be retrieved by vector distance.
model = TFSavedModel(SAVED_MODEL_DIR)
t = AnnoyIndex(REPRESENTATION_LENGTH, 'angular')  # Length of item vector that will be indexed
for review_id, review_text in enumerate(reviews):
    tokens = review_text.split()[:MAX_INPUT_LENGTH]
    indexes = [word_to_index.get(token.lower(), word_to_index['UNK']) for token in tokens]
    # pad short reviews up to the fixed input length
    indexes.extend([UNK_INDEX] * (MAX_INPUT_LENGTH - len(indexes)))
    # calculate embedding with TF
    review_embedding = model.predict(batch_indexes=indexes)['embedding']
    t.add_item(review_id, review_embedding)
t.build(10)  # 10 trees
t.save('reviews.ann')
In [8]:
# Connect to the Verta backend and create/reuse the experiment hierarchy.
client = Client(HOST)
client.set_project("TF")
client.set_experiment("SavedModel")
# New run under which the model, artifacts, and requirements are logged.
run = client.set_experiment_run()
In [9]:
run.log_artifact("saved_model", SAVED_MODEL_DIR)
In [10]:
# Persist the vocabulary mapping to disk and attach it to the run.
vocab_json = json.dumps(word_to_index)
with open("word_to_index.json", 'w') as vocab_file:
    vocab_file.write(vocab_json)
run.log_artifact("word_to_index", "word_to_index.json")
In [11]:
run.log_artifact("reviews_index", "reviews.ann")
In [12]:
class TextNNSearch(object):
    """Nearest-neighbor search over review embeddings.

    Deployed via ``run.log_model(..., custom_modules=[])``, so the class must
    be self-contained: the original implementation referenced the notebook
    globals ``UNK_INDEX``, ``MAX_INPUT_LENGTH``, and ``REPRESENTATION_LENGTH``,
    which do not exist in the deployment environment and would raise
    ``NameError`` there. They are now held on the class / derived from the
    logged artifacts instead.
    """

    # Duplicated from the notebook config so the deployed copy stands alone.
    REPRESENTATION_LENGTH = 25
    MAX_INPUT_LENGTH = 50

    def __init__(self, artifacts):
        """Load the TF graph, vocabulary, and Annoy index from `artifacts`.

        `artifacts` maps artifact keys ("saved_model", "word_to_index",
        "reviews_index") to local paths, as produced by run.fetch_artifacts.
        """
        self.session = tf.Session()
        tf.compat.v1.saved_model.load(self.session, ['serve'], artifacts['saved_model'])
        with open(artifacts['word_to_index'], 'r') as f:
            self.word_to_index = json.load(f)
        # Padding index: the row assigned to the 'UNK' sentinel.
        self.unk_index = self.word_to_index['UNK']
        self.index = annoy.AnnoyIndex(self.REPRESENTATION_LENGTH, "angular")
        self.index.load(artifacts['reviews_index'])

    def predict(self, input_strs):
        """Return, for each input string, a dict {input_str: [10 nearest review ids]}."""
        predictions = []
        for input_str in input_strs:
            words = input_str.split()[:self.MAX_INPUT_LENGTH]
            batch_indexes = [self.word_to_index.get(w.lower(), self.unk_index) for w in words]
            # pad short inputs up to the fixed placeholder length
            batch_indexes += [self.unk_index] * (self.MAX_INPUT_LENGTH - len(batch_indexes))
            # calculate embedding with TF
            embedding = self.session.run("output:0", {"input:0": batch_indexes})
            # find ids of the ten nearest neighbors in the Annoy index
            predictions.append({
                input_str: self.index.get_nns_by_vector(embedding, 10)
            })
        return predictions
In [13]:
artifacts = run.fetch_artifacts(["saved_model", "word_to_index", "reviews_index"])
In [14]:
# Smoke-test the model class locally before logging it for deployment.
model = TextNNSearch(artifacts)
prediction = model.predict(["omg I love this film"])
# predict() returns [{input_str: [neighbor ids]}]. dict.values() is a view
# object in Python 3 and does not support indexing (`.values()[0]` raises
# TypeError), so materialize the single value explicitly.
neighbor_ids = next(iter(prediction[0].values()))
similar_reviews = [reviews[i] for i in neighbor_ids]
print(similar_reviews[0])
In [15]:
# Log the model class itself; at deployment the named artifacts are fetched
# by the service and passed to TextNNSearch.__init__ as the `artifacts` dict.
# NOTE(review): building the ModelAPI runs predict() over all of input_data,
# which may be slow for large samples.
run.log_model(
    TextNNSearch,
    custom_modules=[],
    model_api=ModelAPI(input_data, model.predict(input_data)),
    artifacts=["saved_model", "word_to_index", "reviews_index"],
)
# Pin the runtime dependencies the deployed model needs.
run.log_requirements(["tensorflow", "annoy==1.15.2"])
In [16]:
run
In [17]:
# remove reviews with weird bytes (not valid UTF-8) before sending them to
# the deployed endpoint.
# The original used the Python 2-only `unicode()` builtin, which raises
# NameError on Python 3; this form is equivalent on Python 2 (where str is
# bytes) and also correct on Python 3.
bad_reviews = []
for i, review in enumerate(input_data):
    try:
        if isinstance(review, bytes):
            review.decode('utf-8')
    except UnicodeDecodeError:
        bad_reviews.append(i)

# Delete from the highest index down so earlier indices remain valid.
for i in sorted(bad_reviews, reverse=True):
    del input_data[i]
In [18]:
# Query the live deployed endpoint with a batch of reviews over HTTP.
from verta.deployment import DeployedModel
embeddings = DeployedModel(HOST, run.id).predict(input_data[:1000], compress=True)
# Show the first prediction (per predict(), a dict of input -> neighbor ids).
embeddings[0]