Tokenization under Linguistic Features



In [1]:
# Importing spaCy
import spacy

In [2]:
# Loading the English language model ('en' is the spaCy 2.x shortcut link)
nlp = spacy.load('en')

In [3]:
# Adding special-case tokenization rules
# https://spacy.io/usage/linguistic-features#special-cases

# Importing necessary symbols from spacy
from spacy.symbols import ORTH, LEMMA, POS, TAG

# New sentence we want to tokenize
doc = nlp(u'gimme that')
assert [w.text for w in doc] == [u'gimme', u'that'] # current tokenization has only 2 tokens

# add special case rule
special_case = [{ORTH: u'gim', LEMMA: u'give', POS: u'VERB'}, {ORTH: u'me'}]

# Adding the special case to the tokenizer; it takes effect for the texts parsed from here on
nlp.tokenizer.add_special_case(u'gimme', special_case)

assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that'] # After customization we get 3 tokens
# Pronoun lemma is returned as -PRON-!
assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'-PRON-', u'that']
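
A quick way to see what the special case produced is to print each token's text and lemma, matching the assertions above (a minimal check, assuming the spaCy 2.x API used here):

In [ ]:
# Sanity check: inspect text and lemma of every token after adding the special case
for token in nlp(u'gimme that'):
    print(token.text, token.lemma_)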

In [4]:
# The special case doesn't have to match an entire whitespace-delimited substring. 
# The tokenizer will incrementally split off punctuation, and keep looking up the remaining substring

# 'gimme!' is split into 3 tokens: gim, me, ! with lemmas give, -PRON-, !
assert 'gimme' not in [w.text for w in nlp(u'gimme!')] # gimme should not be there in token texts
assert [w.lemma_ for w in nlp(u'gimme!')] == [u'give', u'-PRON-', u'!'] # lemmas should be give, -PRON-, !

# Tokenization still works when the string is embedded in periods and other punctuation
assert 'gimme' not in [w.text for w in nlp(u'("...gimme...?")')]
# Asserting lemmas for '...gimme...?'
assert [w.lemma_ for w in nlp(u'...gimme...?')] == [u'...', u'give', u'-PRON-', u'...', u'?'] 

# Adding another special case that maps the whole "...gimme...?" string to a single token
special_case = [{ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}]
nlp.tokenizer.add_special_case(u'...gimme...?', special_case)

# The doc should now contain exactly one token
assert len(nlp(u'...gimme...?')) == 1
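
Newer spaCy 2.x releases (around v2.2.3+) also offer nlp.tokenizer.explain(), which reports which rule (SPECIAL, PREFIX, SUFFIX, ...) produced each piece. A small sketch, assuming that method is available in the installed version:

In [ ]:
# Debugging aid: show which tokenizer rule produced each token
# Note: Tokenizer.explain is not present in older spaCy 2.x releases
for rule, text in nlp.tokenizer.explain(u'("...gimme...?")'):
    print(rule, text)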

In [5]:
# Reloading the model to get a fresh default tokenizer (drops the special cases added above)
nlp = spacy.load('en')

# Observation: |prefix_and_suffix| is kept as a single token by the default tokenizer
doc = nlp(u'testing the |prefix_and_suffix| here')
print([token.text for token in doc])


['testing', 'the', '|prefix_and_suffix|', 'here']
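
Before replacing the tokenizer it helps to look at the defaults it is built from. A sketch using the spaCy 2.x helpers that compile the default prefix and suffix patterns; both searches are expected to print None here, which is why the pipes are not split off above:

In [ ]:
# Inspecting the default punctuation rules the English tokenizer is built from
from spacy.util import compile_prefix_regex, compile_suffix_regex

default_prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
default_suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

# The pipe is not among the default prefixes/suffixes, so both should print None
print(default_prefix_re.search(u'|prefix_and_suffix|'))
print(default_suffix_re.search(u'|prefix_and_suffix|'))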

In [6]:
# Importing necessary modules
import regex as re
from spacy.tokenizer import Tokenizer

# Creating regular expressions for custom prefix, suffix, infix and token-match rules
prefix_re = re.compile(r'''^[\[\("']''')
# Note that | (pipe) has been added as a suffix
suffix_re = re.compile(r'''[\]\)"|']$''')
infix_re = re.compile(r'''[-~]''')
simple_url_re = re.compile(r'''^https?://''')
# TODO: Test other infix and "token_match" scenarios (exercised briefly after the custom tokenizer is built below)
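
As a quick sanity check of these patterns before wiring them into a tokenizer (the sample strings are made up for illustration):

In [ ]:
# The custom suffix pattern matches a trailing pipe; the prefix pattern does not match a leading one
print(prefix_re.search(u'|prefix_and_suffix|'))     # None: | is not in the prefix pattern
print(suffix_re.search(u'|prefix_and_suffix|'))     # match object for the trailing |
print(infix_re.findall(u'hello-world~foo'))         # ['-', '~']
print(simple_url_re.match(u'https://example.com'))  # match object: URLs will stay whole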

In [7]:
# Customizing spaCy's Tokenizer class

# Defining a function that builds the custom Tokenizer
def custom_tokenizer(nlp):
    return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
                                suffix_search=suffix_re.search,
                                infix_finditer=infix_re.finditer,
                                token_match=simple_url_re.match)

# Updating the tokenizer
nlp.tokenizer = custom_tokenizer(nlp)
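
The infix and token_match settings from the TODO above can be exercised right away (the example sentence is made up; the expected split assumes token_match applies once no prefix or suffix can be consumed):

In [ ]:
# Infixes split on - and ~, while token_match keeps URLs as single tokens
doc = nlp(u'hello-world visit https://example.com today')
print([token.text for token in doc])
# Expected: ['hello', '-', 'world', 'visit', 'https://example.com', 'today']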

In [8]:
doc = nlp(u'testing the new |prefix_and_suffix| here')
# Observation: the trailing | is now split off as a separate token
# The leading | is not split because prefix_re does not include the pipe character (see the fix sketched below)
print([token.text for token in doc])


['testing', 'the', 'new', '|prefix_and_suffix', '|', 'here']
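
The leading pipe stays attached because prefix_re only lists [, (, " and '. A sketch that adds | to the prefix character class and rebuilds the tokenizer:

In [ ]:
# Adding | to the prefix pattern and rebuilding the tokenizer
prefix_re = re.compile(r'''^[\[\("'|]''')
nlp.tokenizer = Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
                          suffix_search=suffix_re.search,
                          infix_finditer=infix_re.finditer,
                          token_match=simple_url_re.match)

doc = nlp(u'testing the new |prefix_and_suffix| here')
print([token.text for token in doc])
# Expected: ['testing', 'the', 'new', '|', 'prefix_and_suffix', '|', 'here']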