In [1]:
# Perform standard imports
import spacy
nlp = spacy.load('en_core_web_sm')
In [2]:
# From Spacy Basics:
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')
for sent in doc.sents:
    print(sent)
In [3]:
print(doc[1])
In [4]:
print(doc.sents[1])
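`doc.sents` is a generator, so indexing it directly raises a `TypeError`. If you only need one sentence and don't want to build a full list, one option (a sketch, using the standard library's `itertools.islice`) is to step through the generator lazily:

from itertools import islice

# Pull just the second sentence out of the generator without materializing a list
second_sent = next(islice(doc.sents, 1, 2))
print(second_sent)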
However, you can build a sentence collection by running `doc.sents` and saving the result to a list:
In [5]:
doc_sents = [sent for sent in doc.sents]
doc_sents
Out[5]:
[This is the first sentence., This is another sentence., This is the last sentence.]
**NOTE**: `list(doc.sents)` also works. We show a list comprehension as it allows you to pass in conditionals.
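For example, a conditional inside the comprehension lets you keep only sentences that pass some test (a sketch; the length threshold here is arbitrary):

# Keep only the sentences longer than four tokens
long_sents = [sent for sent in doc.sents if len(sent) > 4]
print(long_sents)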
In [6]:
# Now you can access individual sentences:
print(doc_sents[1])
In [7]:
type(doc_sents[1])
Out[7]:
spacy.tokens.span.Span
In [8]:
print(doc_sents[1].start, doc_sents[1].end)
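Since `.start` and `.end` are token offsets into the original Doc, slicing the Doc with them recovers the same span (a quick sketch):

start, end = doc_sents[1].start, doc_sents[1].end
print(doc[start:end])                               # This is another sentence.
print(doc[start:end].text == doc_sents[1].text)     # True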
In [9]:
# Sentence-start tokens are assigned during the nlp pipeline run
doc2 = nlp(u'This is a sentence. This is a sentence. This is a sentence.')
for token in doc2:
    print(token.is_sent_start, ' '+token.text)
Notice we haven't run `doc2.sents`, and yet `token.is_sent_start` was set to True on two tokens in the Doc.
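As a quick check (a sketch; unflagged tokens report None here, so we test for True explicitly), you can count how many tokens were flagged as sentence starts:

# Count the tokens explicitly flagged as sentence starts
print(sum(token.is_sent_start is True for token in doc2))   # 2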
Let's add a semicolon rule to our existing segmentation. That is, whenever the sentencizer encounters a semicolon, the next token should start a new segment.
In [2]:
# SPACY'S DEFAULT BEHAVIOR
doc3 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')
for sent in doc3.sents:
    print(sent)
In [2]:
# ADD A NEW RULE TO THE PIPELINE
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i+1].is_sent_start = True
    return doc

nlp.add_pipe(set_custom_boundaries, before='parser')
nlp.pipe_names
Out[2]:
['tagger', 'set_custom_boundaries', 'parser', 'ner']
The new rule has to run before the document is parsed. Here we can either pass the argument `before='parser'` or `first=True`.
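Note that this `add_pipe` call uses the spaCy 2.x API. If you're running spaCy 3.x instead (an assumption; check your installed version), custom components must be registered under a string name first, roughly like this:

from spacy.language import Language

@Language.component('set_custom_boundaries')   # spaCy 3.x: register under a string name
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i+1].is_sent_start = True
    return doc

nlp.add_pipe('set_custom_boundaries', before='parser')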
In [3]:
# Re-run the Doc object creation:
doc4 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')
for sent in doc4.sents:
    print(sent)
In [13]:
# And yet the new rule doesn't apply to the older Doc object:
for sent in doc3.sents:
    print(sent)
In [4]:
# Find the token we want to change:
doc3[7]
Out[4]:
leadership
In [5]:
# Try to change the .is_sent_start attribute:
doc3[7].is_sent_start = True
This raises a `ValueError`: spaCy refuses to change the tag after the document has been parsed, to prevent inconsistencies in the data.
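If you'd rather inspect the error than let it halt the notebook, you can wrap the assignment (a sketch; spaCy 2.x raises a ValueError here):

try:
    doc3[7].is_sent_start = True
except ValueError as e:
    print(e)   # explains that the document is already parsed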
In [24]:
nlp = spacy.load('en_core_web_sm') # reset to the original
mystring = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."
# SPACY DEFAULT BEHAVIOR:
doc = nlp(mystring)
for sent in doc.sents:
    print([token.text for token in sent])
In [25]:
# CHANGING THE RULES
from spacy.pipeline import SentenceSegmenter

def split_on_newlines(doc):
    start = 0
    seen_newline = False
    for word in doc:
        if seen_newline:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith('\n'):   # handles multiple occurrences
            seen_newline = True
    yield doc[start:]                      # handles the last group of tokens

sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
nlp.add_pipe(sbd)
The function `split_on_newlines` can be named anything we want, and so can the variable that holds it; spaCy registers the SentenceSegmenter in the pipeline under the component's own name, not the variable name.
In [26]:
doc = nlp(mystring)
for sent in doc.sents:
    print([token.text for token in sent])
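If you later want the default segmentation back, you can remove the custom component by its pipeline name (a sketch; check `nlp.pipe_names` for the exact registered name, which for SentenceSegmenter is typically 'sbd'):

print(nlp.pipe_names)     # confirm the segmenter's registered name
nlp.remove_pipe('sbd')    # assumes it was registered as 'sbd'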