Exercise solution for POS Tagging

EX1: Print the most frequent nouns, adjectives, and verbs in the sentence. EX2: Compare the most frequent parts of speech used in two of the texts in our data folder.


In [ ]:
import nltk
from nltk import word_tokenize

In [ ]:
#EX1: Print the most frequent nouns, adjectives, and verbs in the sentence
#First recreate what we did in class: tokenize the sentence, tag each token
#with its Penn Treebank part-of-speech tag, then pull out the words belonging
#to each part-of-speech family.

sentence="For me it has to do with the work that gets done at the crossroads of \
digital media and traditional humanistic study. And that happens in two different ways. \
On the one hand, it's bringing the tools and techniques of digital media to bear \
on traditional humanistic questions; on the other, it's also bringing humanistic modes \
of inquiry to bear on digital media."

#Tokenize, then tag: pos_tag returns a list of (word, tag) pairs
sentence_tokens = word_tokenize(sentence)
tagged_sentence_tokens = nltk.pos_tag(sentence_tokens)

#Adjective tags: JJ (base), JJR (comparative), JJS (superlative)
adjectives = [word for (word, pos) in tagged_sentence_tokens if pos in ('JJ', 'JJR', 'JJS')]
#Noun tags: NN (singular), NNS (plural)
nouns = [word for (word, pos) in tagged_sentence_tokens if pos in ('NN', 'NNS')]
#Verb tags: all six Penn Treebank verb forms
verbs = [word for (word, pos) in tagged_sentence_tokens if pos in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')]

In [ ]:
#Print the most frequent nouns, adjectives, and verbs
#BUG FIX: all three FreqDist objects were previously built from
#tagged_sentence_tokens (the full list of (word, tag) pairs), so every
#printout showed the identical counts instead of per-part-of-speech counts.
#Each distribution is now built from its matching word list from the
#previous cell.
nouns_frequency = nltk.FreqDist(nouns)
print("Most Frequent Nouns:")
print(nouns_frequency.most_common())

adj_frequency = nltk.FreqDist(adjectives)
print()
print("Most Frequent Adjectives:")
print(adj_frequency.most_common())

verbs_frequency = nltk.FreqDist(verbs)
print()
print("Most Frequent Verbs:")
print(verbs_frequency.most_common())

In [ ]:
#EX2: Compare the most frequent part-of-speech used in two of the texts in our data folder

#Open and read the novels, saving each as a string.
#FIX: use `with` so the file handles are closed deterministically
#(the original `open(...).read()` left them open).
with open('../Data/Austen_PrideAndPrejudice.txt', encoding='utf-8') as austen_file:
    austen_string = austen_file.read()
with open('../Data/Alcott_GarlandForGirls.txt', encoding='utf-8') as alcott_file:
    alcott_string = alcott_file.read()

#Tokenize, then tag each word with its Penn Treebank part-of-speech tag
austen_tokens = word_tokenize(austen_string)
alcott_tokens = word_tokenize(alcott_string)
austen_tokens_tagged = nltk.pos_tag(austen_tokens)
alcott_tokens_tagged = nltk.pos_tag(alcott_tokens)

#Create a frequency distribution over the POS tags themselves (not the words)
austen_tag_freq = nltk.FreqDist(tag for (word, tag) in austen_tokens_tagged)
alcott_tag_freq = nltk.FreqDist(tag for (word, tag) in alcott_tokens_tagged)

#Print the ten most frequent tags for each author, side by side for comparison
print("Frequent Part of Speech tags in Austen")
print(austen_tag_freq.most_common(10))

print("Frequent Part of Speech tags in Alcott")
print(alcott_tag_freq.most_common(10))