In [1]:
# Importing NLTK
import nltk

In [2]:
# NLTK comes with prepackaged text data
# Project Gutenberg is a group that digitizes books and literature that are mostly in the public domain
# The main page is below
# http://www.gutenberg.org/wiki/Main_Page

In [3]:
# Listing the files requires the 'gutenberg' corpus to be available locally.
# Look it up first and download it only when it is missing, so this cell also
# works on a fresh environment (Restart Kernel -> Run All) instead of raising
# a LookupError.
try:
    nltk.data.find('corpora/gutenberg')
except LookupError:
    nltk.download('gutenberg')

nltk.corpus.gutenberg.fileids()
# If the output above shows entries like u'austen-emma.txt', the u prefix means
# the string is unicode as opposed to a byte string.


Out[3]:
['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [4]:
# Get all the words (tokens) of the Moby Dick book from the Gutenberg corpus
md = nltk.corpus.gutenberg.words('melville-moby_dick.txt')

In [5]:
# md holds the words of the book and can be sliced like a list; show the first eight.
# Note that punctuation such as '[' and ']' comes back as separate tokens.
md[:8]


Out[5]:
['[', 'Moby', 'Dick', 'by', 'Herman', 'Melville', '1851', ']']

In [6]:
# To count how many times a particular string appears in the book, call .count('str')
md.count('Moby')


Out[6]:
84

In [7]:
# To see how many words there are in total in the book, just call len() on md.
# Punctuation tokens (e.g. '[' and ']' among the first few tokens) are part of this total.
len(md)


Out[7]:
260819

In [8]:
# To find the number of unique words in the book, convert the words into a
# set - a set keeps only one copy of each distinct element.
# Words that differ only in letter case remain distinct elements of the set.
md_set = set(md)

In [9]:
# The number of elements in the set is the number of unique words in the book
len(md_set)


Out[9]:
19317

In [10]:
# Retrieve the sentences of the book, so that they can be counted next
md_sents = nltk.corpus.gutenberg.sents('melville-moby_dick.txt')

In [11]:
# Number of sentences in the book
len(md_sents)


Out[11]:
10059

In [12]:
# Average number of words per sentence: the total word count divided by the
# total sentence count, which comes out at roughly 26.
token_total = len(md)
sentence_total = len(md_sents)
token_total / sentence_total


Out[12]:
25.928919375683467