In [1]:
# The idea for extracting the more informative words of a book is as follows:
# Get the frequency distribution (FD) of words for a book (e.g. Alice in Wonderland) and take its 100 most common words (CW) - FD1_100
# Get the FD of words for another book (e.g. Moby Dick) and take its 100 most common words (CW) - FD2_100
# Extract only the words (dropping the counts) from the CW lists of both books
# Take the set difference of one list from the other, which leaves the words that are most common in one book but not in the other;
# these are the informative words for that book.

In [2]:
import nltk

In [3]:
# Read the word tokens of "Alice in Wonderland" from NLTK's Gutenberg corpus
alice = nltk.corpus.gutenberg.words(fileids='carroll-alice.txt')

In [4]:
# Count how many times each token occurs in the book
alice_fd = nltk.FreqDist(samples=alice)

In [5]:
# Keep the 100 highest-count (token, count) pairs of the distribution
alice_fd_mc_100 = alice_fd.most_common(n=100)

In [6]:
# Repeat the same three steps for "Moby Dick":
# load the tokens, count them, and keep the 100 most common (token, count) pairs
moby = nltk.corpus.gutenberg.words(fileids='melville-moby_dick.txt')
moby_fd = nltk.FreqDist(samples=moby)
moby_fd_mc_100 = moby_fd.most_common(n=100)

In [7]:
# Drop the counts and keep just the tokens of each top-100 list
# e.g. [('Turtle', 59), ('quite', 53)] => ['Turtle', 'quite']
alice_words_100 = [token for token, _count in alice_fd_mc_100]
moby_words_100 = [token for token, _count in moby_fd_mc_100]

In [8]:
# The informative words of a book are its top-100 tokens that are absent
# from the other book's top-100 tokens:
#   Alice -> (alice_words - moby_words)
#   Moby  -> (moby_words - alice_words)
alice_informative_words = set(alice_words_100).difference(moby_words_100)
moby_informative_words = set(moby_words_100).difference(alice_words_100)

In [9]:
# Display the tokens that are in Alice's top 100 but not in Moby Dick's top 100.
# In the set below, observe names such as 'Alice', 'Gryphon', 'King' and 'Queen',
# which are characteristic of Alice in Wonderland.
# NOTE: tokens are compared exactly as they appear in the corpus (nothing is
# filtered or lower-cased above), so punctuation tokens and ordinary words
# such as 'said' or 'could' also show up in the result.
alice_informative_words


Out[9]:
{"!'",
 '*',
 ",'",
 ".'",
 ':',
 "?'",
 'Alice',
 'Gryphon',
 'Hatter',
 'King',
 'Mock',
 'Queen',
 'Turtle',
 'again',
 'began',
 'can',
 'could',
 'did',
 'do',
 'herself',
 'know',
 'little',
 'll',
 'm',
 'much',
 'off',
 'quite',
 'said',
 'say',
 'see',
 'she',
 't',
 'thought',
 'way',
 'went',
 'your'}

In [10]:
# Display the tokens that are in Moby Dick's top 100 but not in Alice's top 100.
# In the set below, observe words such as 'Ahab', 'boat', 'sea', 'ship' and 'whale',
# which are characteristic of Moby Dick.
# NOTE: tokens are compared exactly as they appear in the corpus (nothing is
# filtered or lower-cased above), so punctuation tokens and ordinary words
# such as 'But' or 'which' also show up in the result.
moby_informative_words


Out[10]:
{'!"',
 '"',
 '."',
 '?',
 'Ahab',
 'But',
 'any',
 'are',
 'been',
 'boat',
 'from',
 'head',
 'him',
 'long',
 'man',
 'more',
 'now',
 'old',
 'only',
 'other',
 'over',
 'sea',
 'ship',
 'some',
 'such',
 'than',
 'their',
 'these',
 'though',
 'upon',
 'we',
 'whale',
 'which',
 'who',
 'will',
 'ye'}