For this assessment we'll be using the short story The Tale of Peter Rabbit by Beatrix Potter (1902).
The story is in the public domain; the text file was obtained from Project Gutenberg.
In [3]:
# RUN THIS CELL to perform standard imports:
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy import displacy
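If spacy.load raises an OSError, the small English model probably isn't installed yet; in a standard pip environment it can be downloaded from inside the notebook (an optional aside, not part of the assessment):
# Download the small English model if it isn't already installed:
!python -m spacy download en_core_web_sm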
1. Create a Doc object from the file peterrabbit.txt
HINT: Use
with open('../TextFiles/peterrabbit.txt') as f:
In [4]:
with open('../TextFiles/peterrabbit.txt') as f:
    doc = nlp(f.read())
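As a quick optional sanity check (assuming the file loaded correctly), you can print the opening tokens and the total token count:
# Optional check: the Doc should begin with the story's title
print(doc[:6])
print(f'Total tokens: {len(doc)}')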
2. For every token in the third sentence, print the token text, the coarse POS tag, the fine-grained TAG, and the description of the fine-grained tag.
In [16]:
# Enter your code here:
for token in list(doc.sents)[2]:
    print(f'{token.text:{15}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}')
3. Provide a frequency list of POS tags from the entire document
In [22]:
POS_counts = doc.count_by(spacy.attrs.POS)
for k, v in sorted(POS_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{10}} {v}')
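The same count_by pattern works for the fine-grained tags; a minimal sketch using spacy.attrs.TAG (the keys here are string-store hashes, so they print as large integers):
# Frequency list of fine-grained TAG values, built the same way:
TAG_counts = doc.count_by(spacy.attrs.TAG)
for k, v in sorted(TAG_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{10}} {v}')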
4. CHALLENGE: What percentage of tokens are nouns?
HINT: the attribute ID for 'NOUN' is 91
In [27]:
total_tokens = len(doc)
noun_tokens = len([token for token in doc if token.pos_ == 'NOUN'])
(noun_tokens / total_tokens) * 100
Out[27]:
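Equivalently, you can reuse POS_counts from problem 3. Resolving the ID through the string store avoids hard-coding 91, which is version-dependent (a sketch, assuming POS_counts is still in scope):
# Same percentage via the POS frequency dict; look up NOUN's ID
# instead of hard-coding it:
noun_id = doc.vocab.strings['NOUN']
print(f'{100 * POS_counts[noun_id] / len(doc):.2f}%')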
5. Display the Dependency Parse for the third sentence
In [32]:
displacy.render(list(doc.sents)[2], style='dep', jupyter=True, options={'distance': 50})
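If the rendered parse is too wide to read comfortably, displacy also accepts a 'compact' option that draws square arcs (optional):
# Optional: a more compact rendering of the same parse
displacy.render(list(doc.sents)[2], style='dep', jupyter=True,
                options={'distance': 50, 'compact': True})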
6. Show the first three named entities from the document, with their labels and label descriptions
In [34]:
for ent in doc.ents[:3]:
    print(f'{ent.text} - {ent.label_} - {spacy.explain(ent.label_)}')
7. How many sentences are contained in The Tale of Peter Rabbit?
In [35]:
len(list(doc.sents))
Out[35]:
8. CHALLENGE: How many sentences contain named entities?
In [36]:
list_of_sents = [nlp(sent.text) for sent in doc.sents]
list_of_ners = [sent_doc for sent_doc in list_of_sents if sent_doc.ents]
len(list_of_ners)
Out[36]:
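Re-parsing every sentence with nlp() works but is avoidable: sentence Spans expose their own .ents attribute, so the count can be computed directly from the original Doc (a minimal equivalent sketch):
# Equivalent count without re-running the pipeline:
# each sentence Span already carries its entities.
print(len([sent for sent in doc.sents if sent.ents]))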
9. CHALLENGE: Display the named entity visualization for list_of_sents[0] from the previous problem
In [37]:
displacy.render(list_of_sents[0], style='ent', jupyter=True)
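The entity view also takes an options dict; for example, 'ents' restricts which labels are drawn (a hedged sketch; PERSON may or may not occur in this particular sentence):
# Optional: render only PERSON entities from the same sentence
displacy.render(list_of_sents[0], style='ent', jupyter=True,
                options={'ents': ['PERSON']})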