In [1]:
#!/usr/bin/env python
# coding: utf8
In [2]:
# Bot Training and Testing
# https://spacy.io/usage/training#example-new-entity-type
# For our example travel-conversation NLP task, here we will see how to train
# new entity types — a. source, b. destination, c. travelDate — on the training data
# Then update the model and see how we can test it on new data
In [3]:
# Importing required modules
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
In [4]:
# Importing spacy, it may take some time to load
import spacy
# Importing displacy for visualization
from spacy import displacy
In [11]:
# Data preparation
# Training examples: each item is (text, {'entities': [(start, end, label), ...]})
# where start/end are 0-based character offsets into the text (end exclusive).
# E.g. in the first sentence, chars 22-27 are "Delhi" and 31-40 are "Bangalore".
TRAIN_DATA = [
('I want to travel from Delhi to Bangalore', {'entities': [(22, 27, 'source'), (31, 40, 'destination')]}),
('Book a ticket for Bangalore tomorrow', {'entities': [(18, 27, 'destination'), (28, 36, 'travelDate')]}),
('i am planning to travel on 15/07/18', {'entities': [(27, 35, 'travelDate')]})
]
# number of training iterations (epochs); also try with 10 or 5
n_iter = 8
# Unlabelled sentences used to eyeball the trained model's predictions
TEST_DATA = ['I want to travel from Hyderabad to Kochi today',
'It is going to rain today in Mumbai',
'Delhi is capital of India']
In [6]:
nlp = spacy.blank('en') # create blank Language class which will be trained
In [7]:
# Add an entity recognizer to the model if it's not already in the pipeline.
# nlp.create_pipe works for built-ins that are registered with spaCy.
if 'ner' not in nlp.pipe_names:
    print('Named Entity Recognition(NER) is not there in NLP pipe, creating and adding a new pipe with "ner"')
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
else:
    # Bug fix: previously `ner` was left undefined when the pipe already
    # existed (e.g. on a re-run with a loaded model), so the label-adding
    # cell below would raise NameError. Fetch the existing component instead.
    ner = nlp.get_pipe('ner')
# Type of ner is EntityRecognizer
print('Type of ner - ', type(ner))
In [8]:
# Adding the entity labels from the training data to the recognizer.
# Collect them into a set first so each label is added (and reported)
# exactly once, even when it appears in several training examples
# (resolves the earlier TODO about unique labels).
labels = {label
          for _, annotations in TRAIN_DATA
          for _start, _end, label in annotations.get('entities', [])}
# sorted() gives a deterministic, readable order for the log output
for label in sorted(labels):
    print('Adding label - "', label, '"')
    ner.add_label(label)
In [12]:
"""
Observation:
--
1.)
If we have less number of "sentences for training", with more "number of iterations"
the losses will be less making the Model to be biased!!, so make sure we have
more number of "training data", if we have more
--
TODO: test other scenarios, with the parameters to tune for model.
2.) More training data
--
Depending upon the use case requirement make sure the model meets the Confusion matrix.
"""
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes): # only train NER
# Strart training here
# optimizer function to update the model's weights.
optimizer = nlp.begin_training()
for itn in range(n_iter):
# At each iteration, the training data is shuffled to ensure the model
# doesn't make any generalisations based on the order of examples
random.shuffle(TRAIN_DATA)
losses = {}
for text, annotations in TRAIN_DATA:
# "drop" is to improve the learning results, rate at which to randomly
# "drop" individual features and representations, making the model to
# memorise the training data
# sgd = Stochastic Gradient Descent, see https://en.wikipedia.org/wiki/Stochastic_gradient_descent
nlp.update([text], [annotations], sgd=optimizer, drop=0.35,losses=losses)
print('losses -', losses)
In [13]:
print('Train Data Updated Entities')
# Render the entities the freshly trained model finds in each training
# sentence (displacy draws them inline in the notebook).
for sentence, _annotations in TRAIN_DATA:
    displacy.render(nlp(sentence), style='ent', jupyter=True)
In [14]:
print('\nTest Data Updated Entities')
# Render the entities the trained model predicts on unseen test sentences.
for sentence in TEST_DATA:
    displacy.render(nlp(sentence), style='ent', jupyter=True)
In [15]:
# Now let's see how the original (pre-trained) model does the tagging.
# Loading the English language model may take some time depending on
# the model size - small, medium or large.
# 'en' is a spaCy v2 shortcut link; it only works when the link was created
# and was removed entirely in spaCy v3. Fall back to the full package name
# so this cell works whether or not the shortcut exists.
try:
    org_nlp = spacy.load('en')
except (OSError, IOError):
    org_nlp = spacy.load('en_core_web_sm')
In [16]:
print('Train Data Original Entities')
# Show what entities the stock English model already recognises in the
# training sentences (for comparison with our custom-trained model).
for sentence, _annotations in TRAIN_DATA:
    displacy.render(org_nlp(sentence), style='ent', jupyter=True)
In [17]:
print('\nTest Data Original Entities')
# Show what entities the stock English model recognises in the test
# sentences (for comparison with our custom-trained model).
for sentence in TEST_DATA:
    displacy.render(org_nlp(sentence), style='ent', jupyter=True)
In [ ]: