notebook.community

Edit and run



In [1]:

    
# Importing NLTK
import nltk



In [2]:

    
# NLTK comes with prepackaged text data.
# Project Gutenberg is a group that digitizes books and literature that are mostly in the public domain.
# The main page is below
# http://www.gutenberg.org/wiki/Main_Page



In [3]:

    
# List the file ids of the Project Gutenberg sample that NLTK provides.
# The corpus data is not installed together with the nltk package, so fetch
# it on first use: nltk.data.find raises LookupError when the resource is
# missing, and only then is the download triggered.
try:
    nltk.data.find('corpora/gutenberg')
except LookupError:
    nltk.download('gutenberg')

# On Python 2 the ids are displayed with a u'' prefix (e.g. u'austen-emma.txt'),
# which marks them as unicode strings rather than byte strings.
nltk.corpus.gutenberg.fileids()









    Out[3]:





['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']



In [4]:

    
# Load every token of the Moby Dick text from the Gutenberg corpus.
# Note: the result holds punctuation tokens as well as words.
md = nltk.corpus.gutenberg.words('melville-moby_dick.txt')



In [5]:

    
# md behaves like a list of tokens, so it can be sliced; show the first eight.
# Punctuation such as '[' and ']' appears as separate tokens.
md[:8]









    Out[5]:





['[', 'Moby', 'Dick', 'by', 'Herman', 'Melville', '1851', ']']



In [6]:

    
# Count how many times a given token occurs in the book with .count(token).
# The match is exact and case-sensitive: 'Moby' does not match 'moby' or 'MOBY'.
md.count('Moby')









    Out[6]:





84



In [7]:

    
# Total number of tokens in the book (words plus punctuation).
# md is list-like, so len() works on it directly.
len(md)









    Out[7]:





260819



In [8]:

    
# To count the distinct tokens, convert the sequence into a set, which keeps
# a single copy of each element. The comparison is case-sensitive and
# punctuation tokens are included, so 'The' and 'the' are two separate entries.
md_set = set(md)



In [9]:

    
# The size of the set is the number of distinct tokens in the book.
len(md_set)









    Out[9]:





19317



In [10]:

    
# Load the same book split into sentences, so that the sentences can be counted.
md_sents = nltk.corpus.gutenberg.sents('melville-moby_dick.txt')



In [11]:

    
# Number of sentences in the book.
len(md_sents)









    Out[11]:





10059



In [12]:

    
# Average number of tokens per sentence: roughly 26.
# `/` is true division on Python 3; on Python 2 it would truncate to an integer
# unless `from __future__ import division` is in effect.
len(md) / len(md_sents)









    Out[12]:





25.928919375683467