In [1]:
# Importing NLTK
import nltk
In [2]:
# NLTK comes with prepackaged text data.
# Project Gutenberg is a group that digitizes books and literature that are mostly in the public domain.
# Its main page is:
# http://www.gutenberg.org/wiki/Main_Page
In [3]:
# List the file ids of the texts available in NLTK's Gutenberg corpus.
# The corpus data is a separate download, so fetch it on first use instead of
# letting this cell fail with a LookupError on a fresh environment.
try:
    nltk.data.find('corpora/gutenberg')
except LookupError:
    nltk.download('gutenberg')
nltk.corpus.gutenberg.fileids()
# If the ids print like u'austen-emma.txt' (Python 2), the u prefix marks a unicode string as opposed to a byte string.
Out[3]:
In [4]:
# Get all the words of the Moby Dick book from the Gutenberg corpus
md = nltk.corpus.gutenberg.words('melville-moby_dick.txt')
In [5]:
# md holds the words of the book and can be sliced like a list; show the first 8 words
md[:8]
Out[5]:
In [6]:
# Count how many times a particular word appears in the book with .count('str')
md.count('Moby')
Out[6]:
In [7]:
# Total number of words in the book; md supports len() just like a list
len(md)
Out[7]:
In [8]:
# To count the distinct words in the book, first convert md into a set;
# a set keeps only one copy of each element.
# Note: strings compare case-sensitively, so differently-cased spellings remain separate entries.
md_set = set(md)
In [9]:
# The size of the set is the number of unique words in the book
len(md_set)
Out[9]:
In [10]:
# Load the same book split into sentences so the sentences can be counted
md_sents = nltk.corpus.gutenberg.sents('melville-moby_dick.txt')
In [11]:
# Number of sentences in the book
len(md_sents)
Out[11]:
In [12]:
# Average number of words per sentence (roughly 26 for this book).
# float() forces true division so the fractional part is kept even on Python 2,
# where int / int would silently truncate the average.
len(md) / float(len(md_sents))
Out[12]: