Rule-based Matching in Linguistic Features

Link here


In [1]:
# Adding Patterns https://spacy.io/usage/linguistic-features#adding-patterns

# Importing spaCy
import spacy

In [ ]:
# Load the small English model by its full package name.
# The 'en' shortcut link only exists if it was created in this environment
# and is no longer supported by newer spaCy releases; the package name is
# accepted everywhere the model is installed.
nlp = spacy.load('en_core_web_sm')

In [ ]:
# Token-pattern matching with spaCy's Matcher
from spacy.matcher import Matcher

# One dict per token: "hello" in any casing, then any punctuation token,
# then "world" in any casing.
pattern = [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}]

# A matcher has to be built on the same vocab as the documents it is run on.
matcher = Matcher(nlp.vocab)
# Register the pattern under the match ID "HelloWorld". The second argument is
# the optional "on_match" callback; None means nothing is invoked on a match.
matcher.add('HelloWorld', None, pattern)

# Running the matcher only reports matches; it does not merge entities or
# assign labels. Custom logic like that goes into an on_match callback
# passed to add().
doc = nlp(u'Hello, world! Hello world!')
matches = matcher(doc)

# TODO: the spacy.io documentation shows the result as [('HelloWorld', 0, 2)],
# but the ID printed here is an integer. It seems to be the 64-bit hash of the
# key; the StringStore (nlp.vocab.strings) should map it back to the original
# string - confirm.
print('first sentence match -', matches)
# Exactly one match is expected: the pattern requires a punctuation token
# between the two words, which "Hello, world" has and "Hello world" lacks.
assert len(matches) == 1

# A sentence the pattern does not fit must produce no matches.
doc = nlp(u'another arbitary sentence')
matches = matcher(doc)
assert len(matches) == 0

In [ ]:
# Adding Phrase Patterns
# For large terminology lists, PhraseMatcher is the preferred tool.
from spacy.matcher import PhraseMatcher

# The terms to look for, turned into Doc objects to serve as phrase patterns.
terminology_list = ['Barack Obama', 'Angela Merkel', 'Washington, D.C.']
patterns = list(map(nlp, terminology_list))
print('Patterns - ', [phrase.text for phrase in patterns])

# Build the phrase matcher on the shared vocab and register every pattern
# under the single key 'TerminologyList' (no on_match callback).
matcher = PhraseMatcher(nlp.vocab)
matcher.add('TerminologyList', None, *patterns)

doc = nlp(u"German Chancellor Angela Merkel and US President Barack Obama "
          u"converse in the Oval Office inside the White House in Washington, D.C.")
matches = matcher(doc)
# All matches carry the same ID because all three phrases were added under
# one key above; use a separate add() call per key to tell them apart.
print('Matches ', matches)

In [8]:
# Adding on_match rules
# https://spacy.io/usage/linguistic-features#on_match

# Load the small English model by its full package name. The 'en' shortcut
# link only exists if it was created in this environment and is no longer
# supported by newer spaCy releases.
# The model is loaded again here so this cell also works when run on its own.
nlp = spacy.load('en_core_web_sm')

# displacy renders the recognised entities inline in the notebook
from spacy import displacy

doc = nlp('Google I/O event on next Friday')

print('Entities Without matcher')
# Baseline without any matcher: "Google I/O" is not recognised as an entity.
# Open question: is that because the small model is loaded? Check with the
# medium and large models.
for ent in doc.ents:
    print('ent - "', ent,'"')

# "next Friday" is the only entity, labelled DATE
displacy.render(doc, style='ent', jupyter=True)


Entities Without matcher
ent - " next Friday "
Google I/O event on next Friday DATE

In [9]:
# Goal: match every mention of "Google I/O".
# A second pattern with an extra {IS_DIGIT: True} token makes sure mentions
# such as "Google I/O 2017" are matched as well.

from spacy.matcher import Matcher

# Integer ID of the 'EVENT' entity type; entities are set using this ID.
EVENT = nlp.vocab.strings['EVENT']

# Matcher built on the vocab of the pipeline that produces the documents.
matcher = Matcher(nlp.vocab)

# on_match callback function
def add_event_ent(matcher, doc, i, matches, label=None):
    """Register the i-th match as an entity on ``doc``.

    The matcher calls this once per match. Several patterns can match
    overlapping token ranges (e.g. "Google I/O" and "Google I/O 2018"), and
    newer spaCy releases refuse to store overlapping spans in ``doc.ents``.
    Existing entities that overlap the new span are therefore replaced, and a
    match that lies inside an already registered entity of the same label is
    skipped so the longest mention wins regardless of match order.

    Args:
        matcher: the matcher that produced the match (unused).
        doc: the document being matched; its ``ents`` are updated in place.
        i: index of the current match in ``matches``.
        matches: list of ``(match_id, start, end)`` tuples.
        label: entity label ID to assign; defaults to the module-level EVENT.
    """
    if label is None:
        label = EVENT
    _, start, end = matches[i]

    # A longer mention with the same label already covers this span: keep it.
    for ent in doc.ents:
        if ent.label == label and ent.start <= start and end <= ent.end:
            return

    # Keep only the entities that do not share any token with the new span,
    # then append the new one. (Extend doc.ents, never discard unrelated ones.)
    untouched = tuple(ent for ent in doc.ents
                      if ent.end <= start or ent.start >= end)
    doc.ents = untouched + ((label, start, end),)

# Two token patterns registered under the 'GoogleIO' key, both wired to the
# add_event_ent callback: the plain mention, and the same tokens followed by
# a digit token.
google_io = [{'ORTH': 'Google'}, {'ORTH': 'I'}, {'ORTH': '/'}, {'ORTH': 'O'}]
google_io_with_digits = google_io + [{'IS_DIGIT': True}]
matcher.add('GoogleIO', add_event_ent, google_io, google_io_with_digits)

doc = nlp('Google I/O event on next Friday')

# Running the matcher fires the callback for each match, which adds the
# matched span to the document's entities.
matches = matcher(doc)

print('Now new entity With matcher')
# Observation: "Google I/O" now shows up among the entities
for ent in doc.ents:
    print('ent -"', ent,'"')

# The rendering marks "Google I/O" as an EVENT entity
displacy.render(doc, style='ent', jupyter=True)


Now new entity With matcher
ent -" Google I/O "
ent -" next Friday "
Google I/O EVENT event on next Friday DATE

In [10]:
# Same check for a mention that ends with a digit token (a year in this case)
doc = nlp('Google I/O 2018 event on next Friday')

# The matcher's on_match callback updates doc.ents as a side effect
matches = matcher(doc)

for ent in doc.ents:
    print('ent -"', ent,'"')

# "Google I/O 2018" is now rendered as an EVENT entity
displacy.render(doc, style='ent', jupyter=True)


ent -" Google I/O 2018 "
ent -" next Friday "
Google I/O 2018 EVENT event on next Friday DATE

In [ ]: