In [1]:
try:
    import verta
except ImportError:
    !pip install verta
This example features:
- a class as a model, to be instantiated at deployment time
In [2]:
HOST = "app.verta.ai"
PROJECT_NAME = "Film Review Embeddings"
EXPERIMENT_NAME = "TF Hub and Annoy"
In [3]:
# import os
# os.environ['VERTA_EMAIL'] =
# os.environ['VERTA_DEV_KEY'] =
In [4]:
from __future__ import print_function
import os
import time
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import annoy
In [5]:
try:
    import wget
except ImportError:
    !pip install wget  # you may need pip3
    import wget
In [6]:
train_data_url = "http://s3.amazonaws.com/verta-starter/imdb_master.csv"
train_data_filename = wget.detect_filename(train_data_url)
if not os.path.isfile(train_data_filename):
    wget.download(train_data_url)
In [7]:
all_reviews = pd.read_csv(train_data_filename, encoding='latin')['review'].values.tolist()
reviews = all_reviews[:2000] # just a subset for this example
reviews[0]
In [8]:
from verta import Client
from verta.utils import ModelAPI
client = Client(HOST)
proj = client.set_project(PROJECT_NAME)
expt = client.set_experiment(EXPERIMENT_NAME)
run = client.set_experiment_run()
In [9]:
EMBEDDING_LENGTH = 512
NN_INDEX_FILENAME = "reviews.ann"
In [10]:
os.environ["TFHUB_CACHE_DIR"] = "tf_cache_dir"
# define graph
g = tf.Graph()
with g.as_default():
    text_input = tf.placeholder(dtype=tf.string, shape=[None])
    encoder = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-large/3")
    embed = encoder(text_input)
    init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
g.finalize()
# initialize session
sess = tf.Session(graph=g)
sess.run(init_op)
# build and save embedding index
t = annoy.AnnoyIndex(EMBEDDING_LENGTH, 'angular') # Length of item vector that will be indexed
for i, review in enumerate(reviews):
    # produce embedding with TF
    embedding = sess.run(embed, feed_dict={text_input: [review]})
    t.add_item(i, embedding[0])
t.build(10) # 10 trees
t.save(NN_INDEX_FILENAME)
In [11]:
run.log_artifact("nn_index", open(NN_INDEX_FILENAME, 'rb'))
A TensorFlow model—particularly one using TensorFlow Hub and a pre-built Annoy index—will require some setup at deployment time.
To support this, the Verta platform allows a model to be defined as a class
that will be instantiated when it's deployed.
This class should provide the following interface:
- __init__(self, artifacts), where artifacts is a mapping of artifact keys to filepaths. This will be explained below, but Verta will provide this mapping so you can open these artifact files and set up your model. Any other initialization steps belong in this method as well.
- predict(self, data), where data, as in other custom Verta models, is a list of input values for the model.
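As a rough sketch, the expected shape of such a class is shown below; the class name and placeholder logic are hypothetical, and the real model for this example follows in the next cell.

class SketchModel:
    def __init__(self, artifacts):
        # `artifacts` maps artifact keys (e.g. "nn_index") to local filepaths
        self.index_filepath = artifacts["nn_index"]

    def predict(self, data):
        # `data` is a list of inputs; return one output per input
        return [None for _ in data]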
In [12]:
class EmbeddingAndLookupModel:
    def __init__(self, artifacts):
        """
        Parameters
        ----------
        artifacts
            Mapping of Experiment Run artifact keys to filepaths.
            This is provided by ``run.fetch_artifacts(artifact_keys)``.

        """
        # get artifact filepath from `artifacts` mapping
        annoy_index_filepath = artifacts['nn_index']

        # load embedding index
        self.index = annoy.AnnoyIndex(EMBEDDING_LENGTH, "angular")
        self.index.load(annoy_index_filepath)

        os.environ["TFHUB_CACHE_DIR"] = "tf_cache_dir"

        # define graph
        g = tf.Graph()
        with g.as_default():
            self.text_input = tf.placeholder(dtype=tf.string, shape=[None])
            self.encoder = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-large/3")
            self.embed = self.encoder(self.text_input)
            init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
        g.finalize()
        self.graph = g

        # initialize session
        self.session = tf.Session(graph=self.graph)
        self.session.run(init_op)

    def predict(self, data):
        predictions = []
        for review in data:
            # embed sentence
            embedding = self.session.run(self.embed, feed_dict={self.text_input: [review]})
            # find closest
            predictions.append({
                review: self.index.get_nns_by_vector(embedding[0], 10)
            })
        return predictions
Earlier, we logged an artifact with the key "nn_index". You can obtain the artifacts mapping mentioned above using run.fetch_artifacts(keys) to work with it locally. A similar mapping, which works identically, will be passed into __init__() when the model is deployed.
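For illustration, the fetched mapping is simply a dict from artifact key to a local filepath, roughly like the following; the variable name and path shown here are hypothetical, since the actual location depends on where fetch_artifacts downloads the file.

example_artifacts = {"nn_index": "artifacts/reviews.ann"}  # shape only; the real path will differ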
In [13]:
artifacts = run.fetch_artifacts(["nn_index"])
model = EmbeddingAndLookupModel(artifacts=artifacts)
In [14]:
model.predict(["Good film.", "Bad film!"])
The keys expected in the artifacts mapping mentioned above must be passed into run.log_model() to be available during deployment!
In [15]:
run.log_model(
    model=EmbeddingAndLookupModel,
    artifacts=['nn_index'],
)
We also have to specify every package the model depends on.
In [16]:
run.log_requirements([
"annoy==1.16.2",
"tensorflow",
"tensorflow_hub",
])
Access the Experiment Run through the Verta Web App and deploy it!
In [17]:
run  # display this Experiment Run's details
In [18]:
reviews = all_reviews[-2000:]  # use a different subset of reviews to query the deployed model
In [19]:
from verta.deployment import DeployedModel
deployed_model = DeployedModel(HOST, run.id)
In [20]:
for review in reviews:
    print(deployed_model.predict([review]))
    time.sleep(.5)