In [1]:
# Adding Patterns https://spacy.io/usage/linguistic-features#adding-patterns
# Importing spaCy
import spacy
In [ ]:
# Load the small English language model
nlp = spacy.load('en_core_web_sm')
In [ ]:
# Importing Matcher from spacy
from spacy.matcher import Matcher
# The matcher must always share the same vocab with the documents it will operate on.
matcher = Matcher(nlp.vocab)
# add match ID "HelloWorld" with no callback and one pattern
pattern = [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}]
# The second argument lets you pass in an optional callback function "on_match" to invoke on a successful match.
matcher.add('HelloWorld', None, pattern)
# Getting the document from the sentence
doc = nlp(u'Hello, world! Hello world!')
# the matcher will only return the matches and not do anything else, like merge entities or assign labels
# we can implement custom logic by passing in a callback function as the on_match argument on add()
matches = matcher(doc)
# Note: the spacy.io documentation shows the result as [('HelloWorld', 0, 2)],
# but here the first element of each match tuple is the 64-bit hash of the match ID string
print('first sentence match -', matches)
# Only "Hello, world" (with a punctuation token between the two words) satisfies the pattern,
# so exactly one match is returned - the plain "Hello world" does not match
assert len(matches) == 1
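# The match ID is stored as a 64-bit hash; the original string can be recovered
# from the shared StringStore via nlp.vocab.strings
for match_id, start, end in matches:
    print(nlp.vocab.strings[match_id], '->', doc[start:end].text)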
# A new sentence which doesn't match the pattern
doc = nlp(u'another arbitrary sentence')
matches = matcher(doc)
assert len(matches) == 0
In [ ]:
# Adding Phrase Patterns
# If you need to match large terminology lists, prefer to use PhraseMatcher
# Importing PhraseMatcher from spacy
from spacy.matcher import PhraseMatcher
# Creating new PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
# Preparing terminology list
terminology_list = ['Barack Obama', 'Angela Merkel', 'Washington, D.C.']
# Preparing the patterns
patterns = [nlp(text) for text in terminology_list]
print('Patterns - ', [patrn.text for patrn in patterns])
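# Note: for a large terminology list it is usually enough (and faster) to tokenize the
# patterns with nlp.make_doc instead of running the full pipeline, e.g.
#   patterns = [nlp.make_doc(text) for text in terminology_list]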
# Adding to matcher
matcher.add('TerminologyList', None, *patterns)
doc = nlp(u"German Chancellor Angela Merkel and US President Barack Obama "
u"converse in the Oval Office inside the White House in Washington, D.C.")
matches = matcher(doc)
# All matches share the same match ID because every pattern was added under the single
# 'TerminologyList' key; the ID is the 64-bit hash of that key string
print('Matches ', matches)
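# Print just the matched spans to confirm all three names were found
print('Matched spans -', [doc[start:end].text for match_id, start, end in matches])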
In [8]:
# Adding on_match rules
# https://spacy.io/usage/linguistic-features#on_match
# Loading the small English language model
nlp = spacy.load('en_core_web_sm')
# Loading displacy to visualize
from spacy import displacy
doc = nlp('Google I/O event on next Friday')
print('Entities without matcher')
# Without the matcher, observe that "Google I/O" is not recognised as an entity
# (this may also depend on the model size - worth checking with the medium and large models)
for ent in doc.ents:
    print('ent - "', ent, '"')
# "next Friday" is recognised as a DATE entity
displacy.render(doc, style='ent', jupyter=True)
In [9]:
# We want to match all mentions of "Google I/O"
# A second pattern with an added {'IS_DIGIT': True} token makes sure "Google I/O 2017" is matched as well
# Loading Matcher
from spacy.matcher import Matcher
# Creating Matcher
matcher = Matcher(nlp.vocab)
# Get the ID of the 'EVENT' entity type. This is required to set an entity.
EVENT = nlp.vocab.strings['EVENT']
# on_match callback function
def add_event_ent(matcher, doc, i, matches):
    # Get the current match and unpack its match ID, start and end token indices.
    # Append the new entity to the doc's entities. (Don't overwrite doc.ents!)
    match_id, start, end = matches[i]
    # Adding the EVENT entity to the document entities
    doc.ents += ((EVENT, start, end),)

matcher.add('GoogleIO', add_event_ent,
            [{'ORTH': 'Google'}, {'ORTH': 'I'}, {'ORTH': '/'}, {'ORTH': 'O'}],
            [{'ORTH': 'Google'}, {'ORTH': 'I'}, {'ORTH': '/'}, {'ORTH': 'O'}, {'IS_DIGIT': True}])
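# Quick sanity check: the matcher should now contain the 'GoogleIO' rule
assert 'GoogleIO' in matcher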
doc = nlp('Google I/O event on next Friday')
matches = matcher(doc)
print('Entities with matcher')
# Observation - "Google I/O" is now matched as an entity
for ent in doc.ents:
    print('ent - "', ent, '"')
# Now we can see that "Google I/O" is marked as entity EVENT
displacy.render(doc, style='ent', jupyter=True)
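# Confirm the label assigned by the callback - the new span should carry the EVENT label
print([(ent.text, ent.label_) for ent in doc.ents])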
In [10]:
# Matching a pattern that ends with a digit (the year, in this case)
doc = nlp('Google I/O 2018 event on next Friday')
matches = matcher(doc)
for ent in doc.ents:
    print('ent - "', ent, '"')
# Now we can see that "Google I/O 2018" is marked as entity EVENT
displacy.render(doc, style='ent', jupyter=True)