In [1]:
# The idea for extracting the more informative words of a book:
# 1. Build the frequency distribution (FD) of words for one book (e.g. Alice in Wonderland) and take its 100 most common words (CW) - FD1_100
# 2. Build the FD of words for the other book (e.g. Moby Dick) and take its 100 most common words - FD2_100
# 3. Keep only the words from both most-common lists, dropping the counts
# 4. Take the set difference of one list with respect to the other; the frequent words unique to one book
#    are the informative words for that book (a reusable helper wrapping these steps appears at the end)
In [2]:
import nltk
In [3]:
# Loading the words of the Alice in Wonderland book
alice = nltk.corpus.gutenberg.words('carroll-alice.txt')
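words() returns a flat, list-like sequence of tokens (words and punctuation). A quick sanity check, sketched without assuming any particular output:

len(alice)   # total number of tokens in the text
alice[:10]   # the first few tokens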
In [4]:
# Creating the frequency distribution
alice_fd = nltk.FreqDist(alice)
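A FreqDist behaves like a dictionary mapping each token to its count, so individual words can be looked up directly; a minimal sketch:

alice_fd['Alice']   # how often the token 'Alice' occurs
alice_fd.N()        # total number of token occurrences counted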
In [5]:
# Getting the 100 most common words from the frequency distribution
alice_fd_mc_100 = alice_fd.most_common(100)
In [6]:
# Now performing the same steps for the Moby Dick book
moby = nltk.corpus.gutenberg.words('melville-moby_dick.txt')
moby_fd = nltk.FreqDist(moby)
moby_fd_mc_100 = moby_fd.most_common(100)
In [7]:
# Extracting only the words from the (word, count) pairs
# [('Turtle', 59), ('quite', 53)] => ['Turtle', 'quite']
alice_words_100 = [word for word, count in alice_fd_mc_100]
moby_words_100 = [word for word, count in moby_fd_mc_100]
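Note that both top-100 lists are dominated by punctuation and stopwords ('the', 'and', ','), which is exactly why the set difference in the next step cancels them out. If you preferred to drop them up front instead, a hedged variation (assuming the NLTK stopwords corpus has been downloaded via nltk.download('stopwords')):

from nltk.corpus import stopwords

stops = set(stopwords.words('english'))
# keep only alphabetic tokens that are not common English stopwords
alice_content_fd = nltk.FreqDist(
    w for w in alice if w.isalpha() and w.lower() not in stops)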
In [8]:
# Now getting the informative words for each book:
# Alice -> (alice_words - moby_words)
# Moby  -> (moby_words - alice_words)
alice_informative_words = set(alice_words_100) - set(moby_words_100)
moby_informative_words = set(moby_words_100) - set(alice_words_100)
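Set difference is directional, so the two lines above give different results; a tiny illustration with toy sets:

a = {'the', 'Alice', 'Queen'}
b = {'the', 'whale', 'ship'}
a - b   # {'Alice', 'Queen'} - frequent in a, absent from b
b - a   # {'whale', 'ship'}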
In [9]:
# In the set of words below, observe terms like 'Alice', 'Gryphon', 'King', 'Queen',
# which clearly point to Alice in Wonderland
alice_informative_words
Out[9]:
In [10]:
# In the set of words below, observe terms like 'Ahab', 'boat', 'sea', 'ship', 'whale',
# which clearly point to Moby Dick
moby_informative_words
Out[10]:
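The whole comparison generalizes to any pair of Gutenberg texts. A minimal sketch wrapping the steps above into a reusable helper (the function name informative_words is my own, not part of NLTK):

import nltk

def informative_words(fileid_a, fileid_b, n=100):
    """Words in fileid_a's top-n most common that are absent from fileid_b's top-n."""
    def top(fid):
        fd = nltk.FreqDist(nltk.corpus.gutenberg.words(fid))
        return {w for w, count in fd.most_common(n)}
    return top(fileid_a) - top(fileid_b)

# Reproducing the comparison above:
informative_words('carroll-alice.txt', 'melville-moby_dick.txt')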