For this assessment we'll be using the short story "An Occurrence at Owl Creek Bridge" by Ambrose Bierce (1890).
The story is in the public domain; the text file was obtained from Project Gutenberg.
In [1]:
# RUN THIS CELL to perform standard imports:
import spacy
# Load the 'en_core_web_sm' pipeline once; later cells call `nlp` on the story
# text and hand `nlp.vocab` to the Matcher.
nlp = spacy.load('en_core_web_sm')
1. Create a Doc object from the file owlcreek.txt
HINT: Use
with open('../TextFiles/owlcreek.txt') as f:
In [2]:
# Read the whole story from disk first, then run it through the pipeline
# to obtain the Doc used by every later cell.
with open('../TextFiles/owlcreek.txt') as story_file:
    story_text = story_file.read()
doc = nlp(story_text)
In [3]:
# Run this cell to verify it worked:
# Slicing the Doc displays its first 36 tokens as a quick sanity check.
doc[:36]
Out[3]:
2. How many tokens are contained in the file?
In [4]:
# Total number of tokens in the parsed Doc.
len(doc)
Out[4]:
3. How many sentences are contained in the file?
HINT: You'll want to build a list first!
In [5]:
# doc.sents is a generator; materialise it so the sentences can be counted
# here and indexed in later cells.
sents = list(doc.sents)
len(sents)
Out[5]:
4. Print the second sentence in the document
HINT: Indexing starts at zero, and the title counts as the first sentence.
In [6]:
# sents[1] is the second sentence: indexing is zero-based and the title is sents[0].
print(sents[1].text)
5. For each token in the sentence above, print its text, POS tag, dep tag and lemma
CHALLENGE: Have values line up in columns in the print output.
In [7]:
# NORMAL SOLUTION:
# One output line per token of the second sentence:
# text, POS tag, dependency label, lemma (space-separated).
for tok in sents[1]:
    fields = (tok.text, tok.pos_, tok.dep_, tok.lemma_)
    print(*fields)
In [8]:
# CHALLENGE SOLUTION:
# Same four attributes, each padded to a fixed column width (15 / 5 / 10 / 15)
# so the values line up vertically.
for tok in sents[1]:
    row = f'{tok.text:15} {tok.pos_:5} {tok.dep_:10} {tok.lemma_:15}'
    print(row)
6. Write a matcher called 'Swimming' that finds both occurrences of the phrase "swimming vigorously" in the text
HINT: You should include an 'IS_SPACE': True pattern between the two words!
In [9]:
# Import the Matcher library:
from spacy.matcher import Matcher
# Build the matcher on the vocabulary of the pipeline loaded earlier (nlp.vocab).
matcher = Matcher(nlp.vocab)
In [10]:
# Create a pattern and add it to matcher:
# Token sequence: "swimming", then zero or more whitespace tokens ('OP': '*'),
# then "vigorously". 'LOWER' compares against the lowercased token text.
pattern = [{'LOWER': 'swimming'}, {'IS_SPACE': True, 'OP':'*'}, {'LOWER': 'vigorously'}]
# Pass the patterns as a list: spaCy v3 removed the old
# add(key, on_match, *patterns) form, so the previous `None` placeholder
# raises an error there. The list form is also accepted by spaCy >= 2.2.2.
matcher.add('Swimming', [pattern])
In [11]:
# Create a list of matches called "found_matches" and print the list:
# Per the spaCy Matcher API, each entry is a (match_id, start, end) tuple whose
# start/end are token indices into `doc`; later cells read the start via [i][1].
found_matches = matcher(doc)
print(found_matches)
7. Print the text surrounding each found match
In [12]:
# NOTE(review): 1265:1290 is a hard-coded token window, presumably chosen by hand
# around the first match's start index in found_matches — confirm it still
# brackets the match if the model or tokenization changes.
print(doc[1265:1290])
In [13]:
# NOTE(review): 3600:3615 is a hard-coded token window, presumably chosen by hand
# around the second match's start index in found_matches — confirm it still
# brackets the match if the model or tokenization changes.
print(doc[3600:3615])
EXTRA CREDIT:
Print the sentence that contains each found match
In [18]:
# Sentences are in document order, so the first one whose end index lies past
# the first match's start token is the sentence containing that match.
first_hit_sentence = next(
    (candidate for candidate in sents if found_matches[0][1] < candidate.end),
    None,
)
if first_hit_sentence is not None:
    print(first_hit_sentence)
In [19]:
# Sentences are in document order, so the first one whose end index lies past
# the second match's start token is the sentence containing that match.
second_hit_sentence = next(
    (candidate for candidate in sents if found_matches[1][1] < candidate.end),
    None,
)
if second_hit_sentence is not None:
    print(second_hit_sentence)