In [1]:
# Perform standard imports and load the 'en_core_web_sm' pipeline package.
# Every later cell reads from (and some mutate) this single `nlp` object.
import spacy
nlp = spacy.load('en_core_web_sm')
In [2]:
# Print the set of spaCy's default stop words (remember that sets are unordered,
# so the order of the printed words carries no meaning):
print(nlp.Defaults.stop_words)
In [3]:
# Number of entries currently in the default stop-word set
len(nlp.Defaults.stop_words)
Out[3]:
In [4]:
# Check whether the vocab lexeme for 'myself' carries the stop-word flag
nlp.vocab['myself'].is_stop
Out[4]:
In [5]:
# Check whether the vocab lexeme for 'mystery' carries the stop-word flag
nlp.vocab['mystery'].is_stop
Out[5]:
In [6]:
# Add the word to the set of stop words. Use lowercase!
# set.add() is a no-op when the word is already present, so re-running is safe.
nlp.Defaults.stop_words.add('btw')
# Set the stop_word tag on the lexeme as well: the set membership and the
# per-lexeme flag are updated separately in this notebook.
nlp.vocab['btw'].is_stop = True
In [7]:
# Re-check the size of the stop-word set now that 'btw' has been added
len(nlp.Defaults.stop_words)
Out[7]:
In [8]:
# Confirm the 'btw' lexeme is now flagged as a stop word
nlp.vocab['btw'].is_stop
Out[8]:
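When adding stop words, always use lowercase. In current spaCy releases the default `is_stop` check lowercases a token's text before looking it up in the stop-word set, so an entry containing uppercase letters would never be matched. (The **vocab** itself stores each lexeme under its original text.)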
In [9]:
# Remove the word from the set of stop words.
# discard() is used instead of remove(): remove() raises KeyError when the word
# is already absent, which would make this cell fail if it is run a second time
# in the same kernel. discard() leaves the set in the same final state.
nlp.Defaults.stop_words.discard('beyond')
# Remove the stop_word tag from the lexeme
nlp.vocab['beyond'].is_stop = False
In [10]:
# Re-check the size of the stop-word set now that 'beyond' has been removed
len(nlp.Defaults.stop_words)
Out[10]:
In [11]:
# Confirm the 'beyond' lexeme is no longer flagged as a stop word
nlp.vocab['beyond'].is_stop
Out[11]: