For this assessment we'll be using the short story "An Occurrence at Owl Creek Bridge" by Ambrose Bierce (1890).
The story is in the public domain; the text file was obtained from Project Gutenberg.
In [1]:
# RUN THIS CELL to perform standard imports:
import spacy
nlp = spacy.load('en_core_web_sm')
1. Create a Doc object from the file owlcreek.txt
HINT: Use
with open('../TextFiles/owlcreek.txt') as f:
In [7]:
# Enter your code here:
with open('../TextFiles/owlcreek.txt') as f:
    doc = nlp(f.read())
type(doc)
Out[7]:
spacy.tokens.doc.Doc
In [8]:
# Run this cell to verify it worked:
doc[:36]
Out[8]:
2. How many tokens are contained in the file?
In [10]:
len(doc)
Out[10]:
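As a variation (not what the question asks for), you could count only word tokens by filtering out punctuation and whitespace with the token attributes spaCy provides:
In [ ]:
# Count only word tokens, skipping punctuation and whitespace tokens:
len([token for token in doc if not token.is_punct and not token.is_space])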
3. How many sentences are contained in the file?
HINT: You'll want to build a list first!
In [19]:
s = [sent for sent in doc.sents]
len(s)
Out[19]:
4. Print the second sentence in the document
HINT: Indexing starts at zero, and the title counts as the first sentence.
In [21]:
print(s[1].text)
5. For each token in the sentence above, print its text, POS tag, dep tag and lemma
CHALLENGE: Have values line up in columns in the print output.
In [30]:
# NORMAL SOLUTION:
for token in s[1]:
    print(token.text, token.pos_, token.dep_, token.lemma_)
In [31]:
# CHALLENGE SOLUTION:
for token in s[1]:
    print(f'{token.text:{15}} {token.pos_:{5}} {token.dep_:{10}} {token.lemma_:{15}}')
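If you'd rather not guess at fixed column widths, one possible variation is to size each column from the longest value it has to hold:
In [ ]:
# Compute each column's width from its longest entry, then format:
widths = [max(len(t.text) for t in s[1]),
          max(len(t.pos_) for t in s[1]),
          max(len(t.dep_) for t in s[1]),
          max(len(t.lemma_) for t in s[1])]
for token in s[1]:
    print(f'{token.text:{widths[0]}} {token.pos_:{widths[1]}} {token.dep_:{widths[2]}} {token.lemma_:{widths[3]}}')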
6. Write a matcher called 'Swimming' that finds both occurrences of the phrase "swimming vigorously" in the text
HINT: You should include an 'IS_SPACE': True pattern between the two words!
In [32]:
# Import the Matcher class:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
In [34]:
# Create a pattern and add it to matcher:
pattern = [{'LOWER':'swimming'},{'IS_SPACE':True},{'LOWER':'vigorously'}]
matcher.add('Swimming', None, pattern)   # spaCy v3+ uses matcher.add('Swimming', [pattern])
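The IS_SPACE entry is needed because at least one occurrence of the phrase is broken across a line in the text file, leaving a whitespace token between the two words. If you'd rather not assume that, a more flexible pattern (shown here only as a sketch) adds 'OP': '*' so the whitespace token may match zero or more times:
In [ ]:
# Matches the phrase whether or not a whitespace token sits between the words:
flexible_pattern = [{'LOWER':'swimming'},{'IS_SPACE':True,'OP':'*'},{'LOWER':'vigorously'}]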
In [35]:
# Create a list of matches called "found_matches" and print the list:
found_matches = matcher(doc)
print(found_matches)
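Each match is a (match_id, start, end) tuple of token offsets. If you want something more readable than the raw tuples, the match_id can be converted back to its string name and the offsets sliced out of the Doc:
In [ ]:
# Show the pattern name, the token offsets, and the matched span text:
for match_id, start, end in found_matches:
    print(nlp.vocab.strings[match_id], start, end, doc[start:end].text)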
7. Print the text surrounding each found match
In [45]:
print(doc[1265:1290])   # token slice chosen to show context around the first match
In [47]:
print(doc[3600:3617])   # token slice chosen to show context around the second match
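The slice bounds above were presumably picked by hand from the match offsets. A sketch of a more general approach (the ten-token padding is an arbitrary choice) widens each match and clamps at the document boundaries:
In [ ]:
# Print each match with up to ten tokens of context on either side:
for match_id, start, end in found_matches:
    print(doc[max(start - 10, 0):min(end + 10, len(doc))])
    print('---')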
EXTRA CREDIT:
Print the sentence that contains each found match
In [55]:
sents = [sent for sent in doc.sents]
for sent in sents:
    if found_matches[0][1] < sent.end:
        print(sent)
        break
In [56]:
for sent in sents:
    if found_matches[1][1] < sent.end:
        print(sent)
        break
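An alternative sketch: because the pipeline's parser has set sentence boundaries, every token already knows its containing sentence via Token.sent, so each match's start token can hand back the sentence directly:
In [ ]:
# Print the sentence containing each match using the start token's .sent:
for match_id, start, end in found_matches:
    print(doc[start].sent)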