In [1]:
#!/usr/bin/env python
# coding: utf8
In [2]:
# Bot Training and Testing
# https://spacy.io/usage/training#example-new-entity-type
# For our example travel-conversation NLP task, here we will see how to train
# new entity types — a. source, b. destination, c. travelDate — on the training data
# Then update the model and see how we can test it on new data
In [3]:
# Importing required modules
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
In [4]:
# Importing spacy, it may take some time to load
import spacy
# Importing displacy for visualization
from spacy import displacy
In [11]:
# Data preparation
# Training examples: each item is (text, {'entities': [(start, end, label), ...]})
# where start/end are 0-based character offsets into the text (end exclusive).
# E.g. in the first sentence, chars 22-27 are "Delhi" and 31-40 are "Bangalore".
TRAIN_DATA = [
('I want to travel from Delhi to Bangalore', {'entities': [(22, 27, 'source'), (31, 40, 'destination')]}),
('Book a ticket for Bangalore tomorrow', {'entities': [(18, 27, 'destination'), (28, 36, 'travelDate')]}),
('i am planning to travel on 15/07/18', {'entities': [(27, 35, 'travelDate')]})
]
# number of training iterations (epochs); also try with 10 or 5
n_iter = 8
# Unlabelled sentences used to eyeball the trained model's predictions
TEST_DATA = ['I want to travel from Hyderabad to Kochi today',
'It is going to rain today in Mumbai',
'Delhi is capital of India']
In [6]:
nlp = spacy.blank('en') # create blank Language class which will be trained
In [7]:
# Add an entity recognizer to the model if it's not already in the pipeline.
# nlp.create_pipe works for built-ins that are registered with spaCy.
if 'ner' not in nlp.pipe_names:
    print('Named Entity Recognition(NER) is not there in NLP pipe, creating and adding a new pipe with "ner"')
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
else:
    # Bug fix: previously `ner` was left undefined when the pipe already
    # existed (e.g. on a re-run with a loaded model), so the label-adding
    # cell below would raise NameError. Fetch the existing component instead.
    ner = nlp.get_pipe('ner')
# Type of ner is EntityRecognizer
print('Type of ner - ', type(ner))
In [8]:
# Adding the entity labels from the training data to the recognizer.
# Collect them into a set first so each label is added (and reported)
# exactly once, even when it appears in several training examples
# (resolves the earlier TODO about unique labels).
labels = {label
          for _, annotations in TRAIN_DATA
          for _start, _end, label in annotations.get('entities', [])}
# sorted() gives a deterministic, readable order for the log output
for label in sorted(labels):
    print('Adding label - "', label, '"')
    ner.add_label(label)
In [12]:
"""
Observation:
--
1.)
If we have less number of "sentences for training", with more "number of iterations"
the losses will be less making the Model to be biased!!, so make sure we have
more number of "training data", if we have more
--
TODO: test other scenarios, with the parameters to tune for model.
2.) More training data
--
Depending upon the use case requirement make sure the model meets the Confusion matrix.
"""
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes): # only train NER
# Strart training here
# optimizer function to update the model's weights.
optimizer = nlp.begin_training()
for itn in range(n_iter):
# At each iteration, the training data is shuffled to ensure the model
# doesn't make any generalisations based on the order of examples
random.shuffle(TRAIN_DATA)
losses = {}
for text, annotations in TRAIN_DATA:
# "drop" is to improve the learning results, rate at which to randomly
# "drop" individual features and representations, making the model to
# memorise the training data
# sgd = Stochastic Gradient Descent, see https://en.wikipedia.org/wiki/Stochastic_gradient_descent
nlp.update([text], [annotations], sgd=optimizer, drop=0.35,losses=losses)
print('losses -', losses)
In [13]:
print('Train Data Updated Entities')
# Render the entities the freshly trained model finds in each training
# sentence (displacy draws them inline in the notebook).
for sentence, _annotations in TRAIN_DATA:
    displacy.render(nlp(sentence), style='ent', jupyter=True)
In [14]:
print('\nTest Data Updated Entities')
# Render the entities the trained model predicts on unseen test sentences.
for sentence in TEST_DATA:
    displacy.render(nlp(sentence), style='ent', jupyter=True)
In [15]:
# Now let's see how the original (pre-trained) model does the tagging.
# Loading the English language model may take some time depending on
# the model size - small, medium or large.
# 'en' is a spaCy v2 shortcut link; it only works when the link was created
# and was removed entirely in spaCy v3. Fall back to the full package name
# so this cell works whether or not the shortcut exists.
try:
    org_nlp = spacy.load('en')
except (OSError, IOError):
    org_nlp = spacy.load('en_core_web_sm')
In [16]:
print('Train Data Original Entities')
# Show what entities the stock English model already recognises in the
# training sentences (for comparison with our custom-trained model).
for sentence, _annotations in TRAIN_DATA:
    displacy.render(org_nlp(sentence), style='ent', jupyter=True)
In [17]:
print('\nTest Data Original Entities')
# Show what entities the stock English model recognises in the test
# sentences (for comparison with our custom-trained model).
for sentence in TEST_DATA:
    displacy.render(org_nlp(sentence), style='ent', jupyter=True)
In [ ]: