In [1]:
# Importing spacy
import spacy
In [2]:
# Loading the English language model
nlp = spacy.load('en')
In [3]:
# Adding special case Tokenization rules
# https://spacy.io/usage/linguistic-features#special-cases
# Importing necessary symbols from spacy
from spacy.symbols import ORTH, LEMMA, POS, TAG
# Sentence we want to tokenize
doc = nlp(u'gimme that')
assert [w.text for w in doc] == [u'gimme', u'that'] # default tokenization produces only 2 tokens
# Add a special case rule
special_case = [{ORTH: u'gim', LEMMA: u'give', POS: u'VERB'}, {ORTH: u'me'}]
# Adding the special case to the tokenizer; it takes effect for all subsequent parses
nlp.tokenizer.add_special_case(u'gimme', special_case)
assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that'] # after customization we get 3 tokens
# Pronoun lemma is returned as -PRON-!
assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'-PRON-', u'that']
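If you are on spaCy 2.2 or newer, tokenizer.explain() shows which rule or special case produced each token; this optional check is my addition, not part of the original walkthrough.
In [ ]:
# Optional (spaCy >= 2.2): show which tokenizer rule produced each token
for rule, token_text in nlp.tokenizer.explain(u'gimme that'):
    print(rule, token_text)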
In [4]:
# The special case doesn't have to match an entire whitespace-delimited substring.
# The tokenizer will incrementally split off punctuation, and keep looking up the remaining substring
# 'gimme!' is split into 3 tokens: gim, me, ! with lemmas give, -PRON-, !
assert 'gimme' not in [w.text for w in nlp(u'gimme!')] # 'gimme' should not appear in the token texts
assert [w.lemma_ for w in nlp(u'gimme!')] == [u'give', u'-PRON-', u'!'] # lemmas should be give, -PRON-, !
# Tokenization still works when the string is wrapped in periods and other punctuation
assert 'gimme' not in [w.text for w in nlp(u'("...gimme...?")')]
# Asserting lemmas for '...gimme...?'
assert [w.lemma_ for w in nlp(u'...gimme...?')] == [u'...', u'give', u'-PRON-', u'...', u'?']
# Adding another special case that maps the whole "...gimme...?" to a single token
special_case = [{ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}]
nlp.tokenizer.add_special_case(u'...gimme...?', special_case)
# the whole string should now be a single token
assert len(nlp(u'...gimme...?')) == 1
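As a quick sanity check (my addition, not in the original notebook), printing the token's text, lemma and tag confirms the whole string now maps to one token.
In [ ]:
# Inspecting the single token produced by the '...gimme...?' special case
print([(t.text, t.lemma_, t.tag_) for t in nlp(u'...gimme...?')])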
In [5]:
# Reloading the model to reset the tokenizer
nlp = spacy.load('en')
# Observation: with the default tokenizer, |prefix_and_suffix| is kept as a single token
doc = nlp(u'testing the |prefix_and_suffix| here')
print([token.text for token in doc])
In [6]:
# Importing necessary modules
import regex as re
from spacy.tokenizer import Tokenizer
# Creating the regular expressions
prefix_re = re.compile(r'''^[\[\("']''')
# Note that | (pipe) has been added as a suffix character
suffix_re = re.compile(r'''[\]\)"|']$''')
infix_re = re.compile(r'''[-~]''')
simple_url_re = re.compile(r'''^https?://''')
# TODO: test other scenarios for infix and token_match (see the check after the custom tokenizer is assigned below)
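Before wiring these into the Tokenizer, here is a small sanity check of what each expression matches; the sample strings are my own, and the exact Match objects printed depend on your regex version.
In [ ]:
# Sanity-checking the raw expressions on small sample strings
print(prefix_re.search(u'("hello'))              # should match the leading (
print(suffix_re.search(u'prefix_and_suffix|'))   # should match the trailing |
print(infix_re.findall(u'well-known'))           # should find the - infix
print(simple_url_re.match(u'https://spacy.io'))  # should match the URL scheme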
In [7]:
# Customizing spaCy's Tokenizer class
# Defining the custom tokenizer factory
def custom_tokenizer(nlp):
    return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=simple_url_re.match)
# Updating the tokenizer
nlp.tokenizer = custom_tokenizer(nlp)
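To follow up on the earlier TODO, here is a quick check of the infix and token_match rules with the new tokenizer; this cell is my addition, and the expected splits are an assumption based on the regexes above.
In [ ]:
# Testing infix splitting and token_match with the custom tokenizer
doc = nlp(u'a well-known site is https://spacy.io right')
# infix_re should split 'well-known' on the hyphen, while token_match should keep the URL whole
print([token.text for token in doc])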
In [8]:
doc = nlp(u'testing the new |prefix_and_suffix| here')
# Observation: the trailing | is now split off as its own token
# The leading | is not split because prefix_re only lists [, (, " and '; the pipe appears only in suffix_re
# (see the sketch below for extending spaCy's default rules instead)
print([token.text for token in doc])
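One way to get the leading | split as well (a sketch, assuming a spaCy 2.x install where nlp.Defaults exposes the default rules) is to extend the default prefixes and suffixes with the pipe instead of replacing them wholesale.
In [ ]:
# Extending spaCy's default rules with '|' rather than replacing them
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex

def extended_tokenizer(nlp):
    prefixes = tuple(nlp.Defaults.prefixes) + (r'\|',)   # add '|' as a prefix
    suffixes = tuple(nlp.Defaults.suffixes) + (r'\|',)   # add '|' as a suffix
    prefix_regex = compile_prefix_regex(prefixes)
    suffix_regex = compile_suffix_regex(suffixes)
    infix_regex = compile_infix_regex(nlp.Defaults.infixes)
    return Tokenizer(nlp.vocab, rules=nlp.Defaults.tokenizer_exceptions,
                     prefix_search=prefix_regex.search,
                     suffix_search=suffix_regex.search,
                     infix_finditer=infix_regex.finditer,
                     token_match=nlp.Defaults.token_match)

nlp.tokenizer = extended_tokenizer(nlp)
# Both the leading and the trailing | should now come out as separate tokens
print([token.text for token in nlp(u'testing the new |prefix_and_suffix| here')])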