This walkthrough is based on spaCy's official text-classification training example.
Train a convolutional neural network text classifier on the
IMDB dataset, using spaCy's TextCategorizer component. The dataset will be loaded
automatically via Thinc's built-in dataset loader. The classifier is added to
the spaCy processing pipeline, and predictions are available via doc.cats.
This notebook has been tested with the following package versions:
(you may need to change pip to pip3, depending on your own Python environment)
In [1]:
# Python >3.5
!pip install verta
!pip install spacy==2.1.6
!python -m spacy download en
In [2]:
HOST = 'app.verta.ai'  # Verta backend the client connects to
PROJECT_NAME = 'Film Review Classification'  # groups related experiments
EXPERIMENT_NAME = 'spaCy CNN'  # groups runs within the project
In [3]:
# Credentials are read from environment variables so they are never stored
# in the notebook itself. Uncomment and fill in if they are not already set:
# import os
# os.environ['VERTA_EMAIL'] =
# os.environ['VERTA_DEV_KEY'] =
In [4]:
from verta import Client
from verta.utils import ModelAPI

# Connect to Verta and create (or retrieve) the project / experiment / run
# hierarchy that all later logging calls write into.
client = Client(HOST, use_git=False)
proj = client.set_project(PROJECT_NAME)
expt = client.set_experiment(EXPERIMENT_NAME)
run = client.set_experiment_run()
In [5]:
from __future__ import print_function
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import random
import six
import numpy as np
import thinc.extra.datasets
import spacy
from spacy.util import minibatch, compounding
In [6]:
def load_data(limit=0, split=0.8):
    """Load and partition the IMDB movie-review dataset.

    Args:
        limit: number of examples to keep after shuffling; 0 keeps them all.
        split: fraction of the kept examples used for training.

    Returns:
        ((train_texts, train_cats), (dev_texts, dev_cats)) where each cats
        entry is {"POSITIVE": bool, "NEGATIVE": bool}.
    """
    # Thinc's loader returns (train, test); only the train portion is used,
    # and the dev split is carved out of it below.
    examples, _ = thinc.extra.datasets.imdb()
    random.shuffle(examples)
    # With limit=0 the slice [-0:] covers the whole list, i.e. "no limit".
    examples = examples[-limit:]
    texts, labels = zip(*examples)
    cats = []
    for y in labels:
        is_positive = bool(y)
        cats.append({"POSITIVE": is_positive, "NEGATIVE": not is_positive})
    cut = int(len(examples) * split)
    return (texts[:cut], cats[:cut]), (texts[cut:], cats[cut:])
In [7]:
def evaluate(tokenizer, textcat, texts, cats):
    """Score the text categorizer on held-out data.

    Args:
        tokenizer: callable turning a raw text into a Doc for the pipeline.
        textcat: trained TextCategorizer; its .pipe() yields scored docs.
        texts: iterable of evaluation texts.
        cats: gold labels, one {"POSITIVE": ..., "NEGATIVE": ...} dict per text.

    Returns:
        dict with precision ("textcat_p"), recall ("textcat_r") and
        F-score ("textcat_f") for the POSITIVE label.
    """
    docs = (tokenizer(text) for text in texts)
    tp, tn = 0.0, 0.0
    # Tiny epsilon keeps the precision/recall divisions from hitting zero.
    fp, fn = 1e-8, 1e-8
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            # Only POSITIVE contributes; NEGATIVE mirrors it under
            # exclusive classes, and unknown labels are skipped.
            if label not in gold or label == "NEGATIVE":
                continue
            predicted_pos = score >= 0.5
            actual_pos = gold[label] >= 0.5
            if predicted_pos and actual_pos:
                tp += 1.0
            elif predicted_pos and not actual_pos:
                fp += 1.0
            elif not predicted_pos and not actual_pos:
                tn += 1.0
            else:
                fn += 1.0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    denominator = precision + recall
    f_score = 0.0 if denominator == 0 else 2 * precision * recall / denominator
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}
In [8]:
# Hyperparameters for this run, logged to Verta so runs are comparable.
# NOTE(review): 'n_texts' is not referenced later in the notebook;
# 'num_samples' is what actually controls the dataset size — confirm intent.
hyperparams = {
    'model':'en',
    'n_iter': 2,  # epochs
    'n_texts': 500,  # num of training samples
    'architecture': 'simple_cnn',
    'num_samples': 1000,
    'train_test_split': 0.8,
    'dropout': 0.2
}
run.log_hyperparameters(hyperparams)
In [9]:
# using the basic en model
# Load the pretrained English model if it is installed; otherwise fall back
# to a blank English pipeline so the notebook still runs end to end.
try:
    nlp = spacy.load(hyperparams['model'])  # load existing spaCy model
except OSError:
    nlp = spacy.blank(hyperparams['model'])  # create blank Language class
    print("Created blank '{}' model".format(hyperparams['model']))
else:
    print("Loaded model '{}'".format(nlp))

# add the text classifier to the pipeline if it doesn't exist
if "textcat" not in nlp.pipe_names:
    textcat = nlp.create_pipe(
        "textcat",
        config={
            "exclusive_classes": True,  # POSITIVE/NEGATIVE are mutually exclusive
            "architecture": hyperparams['architecture'],
        }
    )
    nlp.add_pipe(textcat, last=True)
# otherwise, get it, so we can add labels to it
else:
    textcat = nlp.get_pipe("textcat")

# add label to text classifier
_= textcat.add_label("POSITIVE")
_= textcat.add_label("NEGATIVE")
In [10]:
# load the IMDB dataset
print("Loading IMDB data...")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=hyperparams['num_samples'],
                                                             split=hyperparams['train_test_split'])
print(
    "Using {} examples ({} training, {} evaluation)".format(
        hyperparams['num_samples'], len(train_texts), len(dev_texts)
    )
)
# Pair each text with its annotations in the format nlp.update() expects.
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
In [11]:
# sample train data
train_data[:1]  # one (text, {"cats": {...}}) training pair, rich-displayed
In [12]:
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
print("other pipes:", other_pipes)
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    print("Training the model...")
    print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
    # Batch size grows geometrically from 4 toward 32 (factor 1.001 per batch).
    batch_sizes = compounding(4.0, 32.0, 1.001)
    for i in range(hyperparams['n_iter']):
        losses = {}
        # batch up the examples using spaCy's minibatch
        random.shuffle(train_data)
        batches = minibatch(train_data, size=batch_sizes)
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=hyperparams['dropout'], losses=losses)
        # Evaluate with the averaged parameters for a more stable score.
        with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev data split off in load_data()
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print(
            "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                losses["textcat"],
                scores["textcat_p"],
                scores["textcat_r"],
                scores["textcat_f"],
            )
        )
        # Per-epoch metrics logged as observations (time series) on the run.
        run.log_observation('loss', losses['textcat'])
        run.log_observation('precision', scores['textcat_p'])
        run.log_observation('recall', scores['textcat_r'])
        run.log_observation('f_score', scores['textcat_f'])
In [13]:
class TextClassifier:
    """Thin wrapper that exposes the spaCy pipeline through the
    batch-predict interface Verta expects for deployment."""

    def __init__(self, nlp):
        # Trained spaCy pipeline; calling it on a text yields a doc with .cats.
        self.nlp = nlp

    def predict(self, input_list):  # param must be a list/batch of inputs
        """Return the higher-scoring label ("POSITIVE"/"NEGATIVE") per text."""
        labels = [
            "POSITIVE" if cats["POSITIVE"] > cats["NEGATIVE"] else "NEGATIVE"
            for cats in (self.nlp(text).cats for text in input_list)
        ]
        return np.array(labels)  # response currently must be a NumPy array
In [14]:
# Sanity-check the wrapper on a couple of clearly negative reviews.
input_list = [
    "This movie was subpar at best.",
    "Plot didn't make sense."
]

model = TextClassifier(nlp)
model.predict(input_list)
What do its inputs and outputs look like?
In [15]:
from verta.utils import ModelAPI  # Verta-provided utility class

# Infer the model's input/output schema from concrete examples; the deployed
# endpoint uses this schema to validate prediction requests.
model_api = ModelAPI(
    input_list,  # example inputs
    model.predict(input_list),  # example outputs
)
What PyPI-installable packages (with version numbers) are required to deserialize and run the model?
In [16]:
# Packages needed to deserialize and run the model at deployment time.
# NOTE(review): versions are not pinned here — presumably Verta captures the
# versions installed in this environment; confirm they match the deployment.
requirements = ["numpy", "spacy", "thinc"]
# this could also have been a path to a requirements.txt file on disk
run.log_requirements(requirements)
In [17]:
# test the trained model
test_text = 'The Lion King was very entertaining. The movie was visually spectacular.'
doc = nlp(test_text)
print(test_text)
print(doc.cats)  # per-label scores, e.g. {'POSITIVE': ..., 'NEGATIVE': ...}
In [18]:
# Serialize and upload the wrapper model together with its API schema so it
# can be deployed from the Verta Web App.
run.log_model(
    model,
    model_api=model_api,
)
In [19]:
run  # rich repr renders a link to this Experiment Run in the Verta Web App
Click the link above to view your Experiment Run in the Verta Web App, and deploy it.
Once it's ready, you can make predictions against the deployed model.
In [20]:
from verta._demo_utils import DeployedModel

# Client for the endpoint deployed from the Web App, addressed by the run ID.
deployed_model = DeployedModel(HOST, run.id)
In [21]:
deployed_model.predict(["I would definitely watch this again!"])  # round-trip prediction against the live endpoint