In [1]:
#!/usr/bin/env python
# coding: utf8

In [2]:
# Bot Training and Testing
# https://spacy.io/usage/training#example-new-entity-type
# For our example travel-conversation NLP, we will see how to train
# new entity types (a. source, b. destination, c. travelDate) on the training data,
# then update the model and see how we can test it on new data

In [3]:
# Importing required modules
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path

In [4]:
# Importing spacy, it may take sometime to load
import spacy
# Importing displacy for visualization
from spacy import displacy

In [11]:
# Data preparation
# Each training example is (text, {'entities': [(start_char, end_char, label), ...]}),
# where text[start_char:end_char] is the annotated span.
TRAIN_DATA = [
    (
        'I want to travel from Delhi to Bangalore',
        {'entities': [
            (22, 27, 'source'),        # "Delhi"
            (31, 40, 'destination'),   # "Bangalore"
        ]},
    ),
    (
        'Book a ticket for Bangalore tomorrow',
        {'entities': [
            (18, 27, 'destination'),   # "Bangalore"
            (28, 36, 'travelDate'),    # "tomorrow"
        ]},
    ),
    (
        'i am planning to travel on 15/07/18',
        {'entities': [
            (27, 35, 'travelDate'),    # "15/07/18"
        ]},
    ),
]

# Number of training iterations; also worth trying with 10 or 5
n_iter = 8

# Sentences the model is not trained on, used to test it afterwards
TEST_DATA = [
    'I want to travel from Hyderabad to Kochi today',
    'It is going to rain today in Mumbai',
    'Delhi is capital of India',
]

In [6]:
# Start from a blank Language object for this language code; it is the
# pipeline that gets trained in the cells below
BLANK_LANG_CODE = 'en'
nlp = spacy.blank(BLANK_LANG_CODE)

In [7]:
# Make sure the pipeline has an entity recognizer and bind it to `ner`.
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp.pipe_names:
    print('Named Entity Recognition(NER) is not there in NLP pipe, creating and adding a new pipe with "ner"')
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
else:
    # The pipeline already contains a recognizer: fetch it so that `ner` is
    # always defined for the print below and for the cells that follow
    # (previously this path raised NameError on a fresh kernel).
    ner = nlp.get_pipe('ner')
# Type of ner is EntityRecognizer
print('Type of ner - ', type(ner))


Named Entity Recognition(NER) is not there in NLP pipe, creating and adding a new pipe with "ner"
Type of ner -  <class 'spacy.pipeline.EntityRecognizer'>

In [8]:
# Registering the entity labels with the recognizer
# TODO: Can we add it to the nlp.vocab while creating the pipe
# TODO: Need to add only unique labels
# Each annotation is (start char, end char, label); only the label is
# needed here, so flatten the training data down to its labels, in order.
annotated_labels = (
    ent_label
    for _text, annotation in TRAIN_DATA
    for _start, _end, ent_label in annotation.get('entities', [])
)
for label in annotated_labels:
    print('Adding label - "', label, '"')
    ner.add_label(label)


Adding label - " source "
Adding label - " destination "
Adding label - " destination "
Adding label - " travelDate "
Adding label - " travelDate "

In [12]:
"""
Observations:

--
1.)
With only a few "sentences for training" and a large "number of iterations",
the losses drop towards zero and the model overfits (memorises) the training data.
So make sure there is more "training data" before increasing the number of iterations.
--
TODO: test other scenarios, with the parameters to tune for model.
2.) More training data
--
Depending upon the use case requirements, make sure the model meets the expected confusion matrix.
"""

# Train only the NER component: every other pipe is disabled for the
# duration of the `with` block.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

# Shuffle a private copy of the examples. Shuffling TRAIN_DATA itself would
# permanently reorder the module-level list that later cells iterate over.
train_examples = list(TRAIN_DATA)

with nlp.disable_pipes(*other_pipes):  # only train NER
    # Start training here
    # begin_training() returns the optimizer that is handed to nlp.update as
    # `sgd` to update the model's weights.
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        # At each iteration, the training data is shuffled to ensure the model
        # doesn't make any generalisations based on the order of examples
        random.shuffle(train_examples)
        losses = {}  # nlp.update accumulates the per-pipe loss into this dict
        for text, annotations in train_examples:
            # "drop" is the dropout rate: the rate at which individual features
            # and representations are randomly "dropped", which makes it harder
            # for the model to simply memorise the training data
            # sgd = Stochastic Gradient Descent, see https://en.wikipedia.org/wiki/Stochastic_gradient_descent
            nlp.update([text], [annotations], sgd=optimizer, drop=0.35, losses=losses)
        print('losses -', losses)


losses - {'ner': 8.5897110183867689}
losses - {'ner': 4.0043320339549489}
losses - {'ner': 3.5416068638551419}
losses - {'ner': 2.3882739497774605}
losses - {'ner': 3.093733938831531}
losses - {'ner': 0.078656306189116357}
losses - {'ner': 0.021017901426065407}
losses - {'ner': 0.00065465895895030644}
losses - {'ner': 2.8846165625929623e-10}
losses - {'ner': 5.4703440470535987e-06}

In [13]:
print('Train Data Updated Entities')
# Run the trained pipeline over every training sentence (annotations are
# ignored here) and let displacy draw the entities it predicts
for sentence, _annotations in TRAIN_DATA:
    displacy.render(nlp(sentence), style='ent', jupyter=True)


Train Data Updated Entities
Book a ticket for Bangalore destination tomorrow travelDate
i am planning to travel on 15/07/18 travelDate
I want to travel from Delhi source to Bangalore destination

In [14]:
print('\nTest Data Updated Entities')
# Same visualisation as for the training data, this time on the sentences
# in TEST_DATA
for sentence in TEST_DATA:
    displacy.render(nlp(sentence), style='ent', jupyter=True)


Test Data Updated Entities
I want to travel from Hyderabad source to Kochi destination today travelDate
It is going to rain today in Mumbai destination
Delhi is capital of India travelDate

In [15]:
# For comparison, let's see how the original model tags the same sentences.

# Load the original English language model by name; this may take some time
# depending upon the model type - small, medium or large
ORIGINAL_MODEL_NAME = 'en'
org_nlp = spacy.load(ORIGINAL_MODEL_NAME)

In [16]:
print('Train Data Original Entities')
# Pass each training sentence (annotations ignored) through the original
# model and display the entities it already identifies
for sentence, _annotations in TRAIN_DATA:
    displacy.render(org_nlp(sentence), style='ent', jupyter=True)


Train Data Original Entities
Book a ticket for Bangalore GPE tomorrow DATE
i am planning to travel on 15/07/18 GPE
I want to travel from Delhi GPE to Bangalore GPE

In [17]:
print('\nTest Data Original Entities')
# Pass each test sentence through the original model and display the
# entities it already identifies
for sentence in TEST_DATA:
    displacy.render(org_nlp(sentence), style='ent', jupyter=True)


Test Data Original Entities
I want to travel from Hyderabad GPE to Kochi GPE today DATE
It is going to rain today DATE in Mumbai GPE
Delhi GPE is capital of India GPE

In [ ]: